├── .gitattributes ├── .github ├── FUNDING.yml └── workflows │ └── publish.yml ├── .gitignore ├── LICENSE ├── __init__.py ├── cogvideo_controlnet.py ├── cogvideox_fun └── utils.py ├── configs ├── scheduler_config_2b.json ├── scheduler_config_5b.json ├── transformer_config_2b.json ├── transformer_config_5b.json ├── transformer_config_I2V_5b.json └── vae_config.json ├── context.py ├── custom_cogvideox_transformer_3d.py ├── embeddings.py ├── enhance_a_video ├── __init__.py ├── enhance.py └── globals.py ├── example_workflows ├── cogvideox_1.0_5b_vid2vid_02.json ├── cogvideox_1_0_2b_controlnet_02.json ├── cogvideox_1_0_5b_I2V_02.json ├── cogvideox_1_0_5b_I2V_Tora_02.json ├── cogvideox_1_0_5b_I2V_noise_warp_01.json ├── cogvideox_1_0_5b_T2V_02.json ├── cogvideox_1_0_5b_interpolation_02.json ├── cogvideox_1_0_5b_vid2vid_02.json ├── cogvideox_1_5_5b_I2V_01.json ├── cogvideox_Fun_180_orbit_02.json ├── cogvideox_Fun_I2V_02.json ├── cogvideox_Fun_I2V_Tora.json ├── cogvideox_Fun_pose_02.json ├── cut_and_drag_for_noisewarp_01.json └── noise_warp_example_input_video.mp4 ├── fp8_optimization.py ├── lora_utils.py ├── model_loading.py ├── mz_enable_vae_encode_tiling.py ├── mz_gguf_loader.py ├── nodes.py ├── pipeline_cogvideox.py ├── pyproject.toml ├── readme.md ├── requirements.txt ├── tora ├── traj_module.py └── traj_utils.py └── utils.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [kijai] 2 | custom: ["https://www.paypal.me/kijaidesign"] 3 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Comfy registry 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | paths: 9 | - "pyproject.toml" 10 | 11 | jobs: 12 | publish-node: 13 | name: Publish Custom Node to registry 14 | runs-on: ubuntu-latest 15 | # if this is a forked repository. Skipping the workflow. 16 | if: github.event.repository.fork == false 17 | steps: 18 | - name: Check out code 19 | uses: actions/checkout@v4 20 | - name: Publish Custom Node 21 | uses: Comfy-Org/publish-node-action@main 22 | with: 23 | ## Add your own personal access token to your Github Repository secrets and reference it here. 24 | personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | output/ 2 | *__pycache__/ 3 | samples*/ 4 | runs/ 5 | checkpoints/ 6 | master_ip 7 | logs/ 8 | *.DS_Store 9 | .idea 10 | *.pt 11 | tools/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .nodes import NODE_CLASS_MAPPINGS as NODES_CLASS, NODE_DISPLAY_NAME_MAPPINGS as NODES_DISPLAY 2 | from .model_loading import NODE_CLASS_MAPPINGS as MODEL_CLASS, NODE_DISPLAY_NAME_MAPPINGS as MODEL_DISPLAY 3 | 4 | NODE_CLASS_MAPPINGS = {**NODES_CLASS, **MODEL_CLASS} 5 | NODE_DISPLAY_NAME_MAPPINGS = {**NODES_DISPLAY, **MODEL_DISPLAY} 6 | 7 | __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"] -------------------------------------------------------------------------------- /cogvideo_controlnet.py: -------------------------------------------------------------------------------- 1 | # https://github.com/TheDenk/cogvideox-controlnet/blob/main/cogvideo_controlnet.py 2 | from typing import Any, Dict, Optional, Tuple, Union 3 | 4 | import torch 5 | from torch import nn 6 | from einops import rearrange 7 | import torch.nn.functional as F 8 | from .custom_cogvideox_transformer_3d import Transformer2DModelOutput, CogVideoXBlock 9 | from diffusers.utils import is_torch_version 10 | from diffusers.loaders import PeftAdapterMixin 11 | from diffusers.models.embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps 12 | from diffusers.models.modeling_utils import ModelMixin 13 | from diffusers.configuration_utils import ConfigMixin, register_to_config 14 | 15 | 16 | class CogVideoXControlnet(ModelMixin, ConfigMixin, PeftAdapterMixin): 17 | _supports_gradient_checkpointing = True 18 | 19 | @register_to_config 20 | def __init__( 21 | self, 22 | num_attention_heads: int = 30, 23 | attention_head_dim: int = 64, 24 | vae_channels: int = 16, 25 | in_channels: int = 3, 26 | downscale_coef: int = 8, 27 | flip_sin_to_cos: bool = True, 28 | freq_shift: int = 0, 29 | time_embed_dim: int = 512, 30 | num_layers: int = 8, 31 | dropout: float = 0.0, 32 | attention_bias: bool = True, 33 | sample_width: int = 90, 34 | sample_height: int = 60, 35 | sample_frames: int = 49, 36 | patch_size: int = 2, 37 | temporal_compression_ratio: int = 4, 38 | max_text_seq_length: int = 226, 39 | activation_fn: str = "gelu-approximate", 40 | timestep_activation_fn: str = "silu", 41 | norm_elementwise_affine: bool = True, 42 | norm_eps: float = 1e-5, 43 | spatial_interpolation_scale: float = 1.875, 44 | temporal_interpolation_scale: float = 1.0, 45 | use_rotary_positional_embeddings: bool = False, 46 | use_learned_positional_embeddings: bool = False, 47 | out_proj_dim = None, 48 | ): 49 | super().__init__() 50 | inner_dim = num_attention_heads * attention_head_dim 51 | 52 | if not use_rotary_positional_embeddings and use_learned_positional_embeddings: 53 | raise ValueError( 54 | "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional " 55 | "embeddings. If you're using a custom model and/or believe this should be supported, please open an " 56 | "issue at https://github.com/huggingface/diffusers/issues." 
57 | ) 58 | 59 | start_channels = in_channels * (downscale_coef ** 2) 60 | input_channels = [start_channels, start_channels // 2, start_channels // 4] 61 | self.unshuffle = nn.PixelUnshuffle(downscale_coef) 62 | 63 | self.controlnet_encode_first = nn.Sequential( 64 | nn.Conv2d(input_channels[0], input_channels[1], kernel_size=1, stride=1, padding=0), 65 | nn.GroupNorm(2, input_channels[1]), 66 | nn.ReLU(), 67 | ) 68 | 69 | self.controlnet_encode_second = nn.Sequential( 70 | nn.Conv2d(input_channels[1], input_channels[2], kernel_size=1, stride=1, padding=0), 71 | nn.GroupNorm(2, input_channels[2]), 72 | nn.ReLU(), 73 | ) 74 | 75 | # 1. Patch embedding 76 | self.patch_embed = CogVideoXPatchEmbed( 77 | patch_size=patch_size, 78 | in_channels=vae_channels + input_channels[2], 79 | embed_dim=inner_dim, 80 | bias=True, 81 | sample_width=sample_width, 82 | sample_height=sample_height, 83 | sample_frames=sample_frames, 84 | temporal_compression_ratio=temporal_compression_ratio, 85 | spatial_interpolation_scale=spatial_interpolation_scale, 86 | temporal_interpolation_scale=temporal_interpolation_scale, 87 | use_positional_embeddings=not use_rotary_positional_embeddings, 88 | use_learned_positional_embeddings=use_learned_positional_embeddings, 89 | ) 90 | 91 | self.embedding_dropout = nn.Dropout(dropout) 92 | 93 | # 2. Time embeddings 94 | self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift) 95 | self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn) 96 | 97 | # 3. Define spatio-temporal transformers blocks 98 | self.transformer_blocks = nn.ModuleList( 99 | [ 100 | CogVideoXBlock( 101 | dim=inner_dim, 102 | num_attention_heads=num_attention_heads, 103 | attention_head_dim=attention_head_dim, 104 | time_embed_dim=time_embed_dim, 105 | dropout=dropout, 106 | activation_fn=activation_fn, 107 | attention_bias=attention_bias, 108 | norm_elementwise_affine=norm_elementwise_affine, 109 | norm_eps=norm_eps, 110 | ) 111 | for _ in range(num_layers) 112 | ] 113 | ) 114 | 115 | self.out_projectors = None 116 | if out_proj_dim is not None: 117 | self.out_projectors = nn.ModuleList( 118 | [nn.Linear(inner_dim, out_proj_dim) for _ in range(num_layers)] 119 | ) 120 | 121 | self.gradient_checkpointing = False 122 | 123 | def _set_gradient_checkpointing(self, module, value=False): 124 | self.gradient_checkpointing = value 125 | 126 | def compress_time(self, x, num_frames): 127 | x = rearrange(x, '(b f) c h w -> b f c h w', f=num_frames) 128 | batch_size, frames, channels, height, width = x.shape 129 | x = rearrange(x, 'b f c h w -> (b h w) c f') 130 | 131 | if x.shape[-1] % 2 == 1: 132 | x_first, x_rest = x[..., 0], x[..., 1:] 133 | if x_rest.shape[-1] > 0: 134 | x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2) 135 | 136 | x = torch.cat([x_first[..., None], x_rest], dim=-1) 137 | else: 138 | x = F.avg_pool1d(x, kernel_size=2, stride=2) 139 | x = rearrange(x, '(b h w) c f -> (b f) c h w', b=batch_size, h=height, w=width) 140 | return x 141 | 142 | def forward( 143 | self, 144 | hidden_states: torch.Tensor, 145 | encoder_hidden_states: torch.Tensor, 146 | controlnet_states: torch.Tensor, 147 | timestep: Union[int, float, torch.LongTensor], 148 | image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, 149 | timestep_cond: Optional[torch.Tensor] = None, 150 | return_dict: bool = True, 151 | ): 152 | batch_size, num_frames, channels, height, width = controlnet_states.shape 153 | # 0. 
Controlnet encoder 154 | controlnet_states = rearrange(controlnet_states, 'b f c h w -> (b f) c h w') 155 | controlnet_states = self.unshuffle(controlnet_states) 156 | controlnet_states = self.controlnet_encode_first(controlnet_states) 157 | controlnet_states = self.compress_time(controlnet_states, num_frames=num_frames) 158 | num_frames = controlnet_states.shape[0] // batch_size 159 | 160 | controlnet_states = self.controlnet_encode_second(controlnet_states) 161 | controlnet_states = self.compress_time(controlnet_states, num_frames=num_frames) 162 | controlnet_states = rearrange(controlnet_states, '(b f) c h w -> b f c h w', b=batch_size) 163 | 164 | hidden_states = torch.cat([hidden_states, controlnet_states], dim=2) 165 | # controlnet_states = self.controlnext_encoder(controlnet_states, timestep=timestep) 166 | # 1. Time embedding 167 | timesteps = timestep 168 | t_emb = self.time_proj(timesteps) 169 | 170 | # timesteps does not contain any weights and will always return f32 tensors 171 | # but time_embedding might actually be running in fp16. so we need to cast here. 172 | # there might be better ways to encapsulate this. 173 | t_emb = t_emb.to(dtype=hidden_states.dtype) 174 | emb = self.time_embedding(t_emb, timestep_cond) 175 | 176 | hidden_states = self.patch_embed(encoder_hidden_states, hidden_states) 177 | hidden_states = self.embedding_dropout(hidden_states) 178 | 179 | 180 | text_seq_length = encoder_hidden_states.shape[1] 181 | encoder_hidden_states = hidden_states[:, :text_seq_length] 182 | hidden_states = hidden_states[:, text_seq_length:] 183 | 184 | 185 | controlnet_hidden_states = () 186 | # 3. Transformer blocks 187 | for i, block in enumerate(self.transformer_blocks): 188 | if self.training and self.gradient_checkpointing: 189 | 190 | def create_custom_forward(module): 191 | def custom_forward(*inputs): 192 | return module(*inputs) 193 | 194 | return custom_forward 195 | 196 | ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} 197 | hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint( 198 | create_custom_forward(block), 199 | hidden_states, 200 | encoder_hidden_states, 201 | emb, 202 | image_rotary_emb, 203 | **ckpt_kwargs, 204 | ) 205 | else: 206 | hidden_states, encoder_hidden_states = block( 207 | hidden_states=hidden_states, 208 | encoder_hidden_states=encoder_hidden_states, 209 | temb=emb, 210 | image_rotary_emb=image_rotary_emb, 211 | ) 212 | 213 | if self.out_projectors is not None: 214 | controlnet_hidden_states += (self.out_projectors[i](hidden_states),) 215 | else: 216 | controlnet_hidden_states += (hidden_states,) 217 | 218 | if not return_dict: 219 | return (controlnet_hidden_states,) 220 | return Transformer2DModelOutput(sample=controlnet_hidden_states) -------------------------------------------------------------------------------- /cogvideox_fun/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | 4 | ASPECT_RATIO_512 = { 5 | '0.25': [256.0, 1024.0], '0.26': [256.0, 992.0], '0.27': [256.0, 960.0], '0.28': [256.0, 928.0], 6 | '0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0], 7 | '0.42': [320.0, 768.0], '0.48': [352.0, 736.0], '0.5': [352.0, 704.0], '0.52': [352.0, 672.0], 8 | '0.57': [384.0, 672.0], '0.6': [384.0, 640.0], '0.68': [416.0, 608.0], '0.72': [416.0, 576.0], 9 | '0.78': [448.0, 576.0], '0.82': [448.0, 544.0], '0.88': [480.0, 544.0], '0.94': 
[480.0, 512.0], 10 | '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], '1.13': [544.0, 480.0], '1.21': [544.0, 448.0], 11 | '1.29': [576.0, 448.0], '1.38': [576.0, 416.0], '1.46': [608.0, 416.0], '1.67': [640.0, 384.0], 12 | '1.75': [672.0, 384.0], '2.0': [704.0, 352.0], '2.09': [736.0, 352.0], '2.4': [768.0, 320.0], 13 | '2.5': [800.0, 320.0], '2.89': [832.0, 288.0], '3.0': [864.0, 288.0], '3.11': [896.0, 288.0], 14 | '3.62': [928.0, 256.0], '3.75': [960.0, 256.0], '3.88': [992.0, 256.0], '4.0': [1024.0, 256.0] 15 | } 16 | ASPECT_RATIO_RANDOM_CROP_512 = { 17 | '0.42': [320.0, 768.0], '0.5': [352.0, 704.0], 18 | '0.57': [384.0, 672.0], '0.68': [416.0, 608.0], '0.78': [448.0, 576.0], '0.88': [480.0, 544.0], 19 | '0.94': [480.0, 512.0], '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], 20 | '1.13': [544.0, 480.0], '1.29': [576.0, 448.0], '1.46': [608.0, 416.0], '1.75': [672.0, 384.0], 21 | '2.0': [704.0, 352.0], '2.4': [768.0, 320.0] 22 | } 23 | ASPECT_RATIO_RANDOM_CROP_PROB = [ 24 | 1, 2, 25 | 4, 4, 4, 4, 26 | 8, 8, 8, 27 | 4, 4, 4, 4, 28 | 2, 1 29 | ] 30 | ASPECT_RATIO_RANDOM_CROP_PROB = np.array(ASPECT_RATIO_RANDOM_CROP_PROB) / sum(ASPECT_RATIO_RANDOM_CROP_PROB) 31 | 32 | def get_closest_ratio(height: float, width: float, ratios: dict = ASPECT_RATIO_512): 33 | aspect_ratio = height / width 34 | closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - aspect_ratio)) 35 | return ratios[closest_ratio], float(closest_ratio) 36 | 37 | def get_width_and_height_from_image_and_base_resolution(image, base_resolution): 38 | target_pixels = int(base_resolution) * int(base_resolution) 39 | original_width, original_height = Image.open(image).size 40 | ratio = (target_pixels / (original_width * original_height)) ** 0.5 41 | width_slider = round(original_width * ratio) 42 | height_slider = round(original_height * ratio) 43 | return height_slider, width_slider -------------------------------------------------------------------------------- /configs/scheduler_config_2b.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "CogVideoXDDIMScheduler", 3 | "_diffusers_version": "0.30.0.dev0", 4 | "beta_end": 0.012, 5 | "beta_schedule": "scaled_linear", 6 | "beta_start": 0.00085, 7 | "clip_sample": false, 8 | "clip_sample_range": 1.0, 9 | "num_train_timesteps": 1000, 10 | "prediction_type": "v_prediction", 11 | "rescale_betas_zero_snr": true, 12 | "sample_max_value": 1.0, 13 | "set_alpha_to_one": true, 14 | "snr_shift_scale": 3.0, 15 | "steps_offset": 0, 16 | "timestep_spacing": "trailing", 17 | "trained_betas": null 18 | } 19 | -------------------------------------------------------------------------------- /configs/scheduler_config_5b.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "CogVideoXDDIMScheduler", 3 | "_diffusers_version": "0.31.0.dev0", 4 | "beta_end": 0.012, 5 | "beta_schedule": "scaled_linear", 6 | "beta_start": 0.00085, 7 | "clip_sample": false, 8 | "clip_sample_range": 1.0, 9 | "num_train_timesteps": 1000, 10 | "prediction_type": "v_prediction", 11 | "rescale_betas_zero_snr": true, 12 | "sample_max_value": 1.0, 13 | "set_alpha_to_one": true, 14 | "snr_shift_scale": 1.0, 15 | "steps_offset": 0, 16 | "timestep_spacing": "trailing", 17 | "trained_betas": null 18 | } -------------------------------------------------------------------------------- /configs/transformer_config_2b.json: -------------------------------------------------------------------------------- 1 | { 2 
| "activation_fn": "gelu-approximate", 3 | "attention_bias": true, 4 | "attention_head_dim": 64, 5 | "dropout": 0.0, 6 | "flip_sin_to_cos": true, 7 | "freq_shift": 0, 8 | "in_channels": 16, 9 | "max_text_seq_length": 226, 10 | "norm_elementwise_affine": true, 11 | "norm_eps": 1e-05, 12 | "num_attention_heads": 30, 13 | "num_layers": 30, 14 | "out_channels": 16, 15 | "patch_size": 2, 16 | "sample_frames": 49, 17 | "sample_height": 60, 18 | "sample_width": 90, 19 | "spatial_interpolation_scale": 1.875, 20 | "temporal_compression_ratio": 4, 21 | "temporal_interpolation_scale": 1.0, 22 | "text_embed_dim": 4096, 23 | "time_embed_dim": 512, 24 | "timestep_activation_fn": "silu", 25 | "use_rotary_positional_embeddings": false 26 | } -------------------------------------------------------------------------------- /configs/transformer_config_5b.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_fn": "gelu-approximate", 3 | "attention_bias": true, 4 | "attention_head_dim": 64, 5 | "dropout": 0.0, 6 | "flip_sin_to_cos": true, 7 | "freq_shift": 0, 8 | "in_channels": 16, 9 | "max_text_seq_length": 226, 10 | "norm_elementwise_affine": true, 11 | "norm_eps": 1e-05, 12 | "num_attention_heads": 48, 13 | "num_layers": 42, 14 | "out_channels": 16, 15 | "patch_size": 2, 16 | "sample_frames": 49, 17 | "sample_height": 60, 18 | "sample_width": 90, 19 | "spatial_interpolation_scale": 1.875, 20 | "temporal_compression_ratio": 4, 21 | "temporal_interpolation_scale": 1.0, 22 | "text_embed_dim": 4096, 23 | "time_embed_dim": 512, 24 | "timestep_activation_fn": "silu", 25 | "use_rotary_positional_embeddings": true 26 | } -------------------------------------------------------------------------------- /configs/transformer_config_I2V_5b.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_fn": "gelu-approximate", 3 | "attention_bias": true, 4 | "attention_head_dim": 64, 5 | "dropout": 0.0, 6 | "flip_sin_to_cos": true, 7 | "freq_shift": 0, 8 | "in_channels": 32, 9 | "max_text_seq_length": 226, 10 | "norm_elementwise_affine": true, 11 | "norm_eps": 1e-05, 12 | "num_attention_heads": 48, 13 | "num_layers": 42, 14 | "out_channels": 16, 15 | "patch_size": 2, 16 | "sample_frames": 49, 17 | "sample_height": 60, 18 | "sample_width": 90, 19 | "spatial_interpolation_scale": 1.875, 20 | "temporal_compression_ratio": 4, 21 | "temporal_interpolation_scale": 1.0, 22 | "text_embed_dim": 4096, 23 | "time_embed_dim": 512, 24 | "timestep_activation_fn": "silu", 25 | "use_learned_positional_embeddings": true, 26 | "use_rotary_positional_embeddings": true 27 | } -------------------------------------------------------------------------------- /configs/vae_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "AutoencoderKLCogVideoX", 3 | "_diffusers_version": "0.31.0.dev0", 4 | "act_fn": "silu", 5 | "block_out_channels": [ 6 | 128, 7 | 256, 8 | 256, 9 | 512 10 | ], 11 | "down_block_types": [ 12 | "CogVideoXDownBlock3D", 13 | "CogVideoXDownBlock3D", 14 | "CogVideoXDownBlock3D", 15 | "CogVideoXDownBlock3D" 16 | ], 17 | "force_upcast": true, 18 | "in_channels": 3, 19 | "latent_channels": 16, 20 | "latents_mean": null, 21 | "latents_std": null, 22 | "layers_per_block": 3, 23 | "norm_eps": 1e-06, 24 | "norm_num_groups": 32, 25 | "out_channels": 3, 26 | "sample_height": 480, 27 | "sample_width": 720, 28 | "scaling_factor": 0.7, 29 | "shift_factor": null, 30 | 
"temporal_compression_ratio": 4, 31 | "up_block_types": [ 32 | "CogVideoXUpBlock3D", 33 | "CogVideoXUpBlock3D", 34 | "CogVideoXUpBlock3D", 35 | "CogVideoXUpBlock3D" 36 | ], 37 | "use_post_quant_conv": false, 38 | "use_quant_conv": false 39 | } 40 | -------------------------------------------------------------------------------- /context.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import Callable, Optional, List 3 | 4 | 5 | def ordered_halving(val): 6 | bin_str = f"{val:064b}" 7 | bin_flip = bin_str[::-1] 8 | as_int = int(bin_flip, 2) 9 | 10 | return as_int / (1 << 64) 11 | 12 | def does_window_roll_over(window: list[int], num_frames: int) -> tuple[bool, int]: 13 | prev_val = -1 14 | for i, val in enumerate(window): 15 | val = val % num_frames 16 | if val < prev_val: 17 | return True, i 18 | prev_val = val 19 | return False, -1 20 | 21 | def shift_window_to_start(window: list[int], num_frames: int): 22 | start_val = window[0] 23 | for i in range(len(window)): 24 | # 1) subtract each element by start_val to move vals relative to the start of all frames 25 | # 2) add num_frames and take modulus to get adjusted vals 26 | window[i] = ((window[i] - start_val) + num_frames) % num_frames 27 | 28 | def shift_window_to_end(window: list[int], num_frames: int): 29 | # 1) shift window to start 30 | shift_window_to_start(window, num_frames) 31 | end_val = window[-1] 32 | end_delta = num_frames - end_val - 1 33 | for i in range(len(window)): 34 | # 2) add end_delta to each val to slide windows to end 35 | window[i] = window[i] + end_delta 36 | 37 | def get_missing_indexes(windows: list[list[int]], num_frames: int) -> list[int]: 38 | all_indexes = list(range(num_frames)) 39 | for w in windows: 40 | for val in w: 41 | try: 42 | all_indexes.remove(val) 43 | except ValueError: 44 | pass 45 | return all_indexes 46 | 47 | def uniform_looped( 48 | step: int = ..., 49 | num_steps: Optional[int] = None, 50 | num_frames: int = ..., 51 | context_size: Optional[int] = None, 52 | context_stride: int = 3, 53 | context_overlap: int = 4, 54 | closed_loop: bool = True, 55 | ): 56 | if num_frames <= context_size: 57 | yield list(range(num_frames)) 58 | return 59 | 60 | context_stride = min(context_stride, int(np.ceil(np.log2(num_frames / context_size))) + 1) 61 | 62 | for context_step in 1 << np.arange(context_stride): 63 | pad = int(round(num_frames * ordered_halving(step))) 64 | for j in range( 65 | int(ordered_halving(step) * context_step) + pad, 66 | num_frames + pad + (0 if closed_loop else -context_overlap), 67 | (context_size * context_step - context_overlap), 68 | ): 69 | yield [e % num_frames for e in range(j, j + context_size * context_step, context_step)] 70 | 71 | #from AnimateDiff-Evolved by Kosinkadink (https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved) 72 | def uniform_standard( 73 | step: int = ..., 74 | num_steps: Optional[int] = None, 75 | num_frames: int = ..., 76 | context_size: Optional[int] = None, 77 | context_stride: int = 3, 78 | context_overlap: int = 4, 79 | closed_loop: bool = True, 80 | ): 81 | windows = [] 82 | if num_frames <= context_size: 83 | windows.append(list(range(num_frames))) 84 | return windows 85 | 86 | context_stride = min(context_stride, int(np.ceil(np.log2(num_frames / context_size))) + 1) 87 | 88 | for context_step in 1 << np.arange(context_stride): 89 | pad = int(round(num_frames * ordered_halving(step))) 90 | for j in range( 91 | int(ordered_halving(step) * context_step) + pad, 92 | 
num_frames + pad + (0 if closed_loop else -context_overlap), 93 | (context_size * context_step - context_overlap), 94 | ): 95 | windows.append([e % num_frames for e in range(j, j + context_size * context_step, context_step)]) 96 | 97 | # now that windows are created, shift any windows that loop, and delete duplicate windows 98 | delete_idxs = [] 99 | win_i = 0 100 | while win_i < len(windows): 101 | # if the window rolls over itself, it needs to be shifted 102 | is_roll, roll_idx = does_window_roll_over(windows[win_i], num_frames) 103 | if is_roll: 104 | roll_val = windows[win_i][roll_idx] # roll_val might not be 0 for windows of higher strides 105 | shift_window_to_end(windows[win_i], num_frames=num_frames) 106 | # check if next window (cyclical) is missing roll_val 107 | if roll_val not in windows[(win_i+1) % len(windows)]: 108 | # need to insert new window here - just insert window starting at roll_val 109 | windows.insert(win_i+1, list(range(roll_val, roll_val + context_size))) 110 | # delete window if it's not unique 111 | for pre_i in range(0, win_i): 112 | if windows[win_i] == windows[pre_i]: 113 | delete_idxs.append(win_i) 114 | break 115 | win_i += 1 116 | 117 | # reverse delete_idxs so that they will be deleted in an order that doesn't break idx correlation 118 | delete_idxs.reverse() 119 | for i in delete_idxs: 120 | windows.pop(i) 121 | return windows 122 | 123 | def static_standard( 124 | step: int = ..., 125 | num_steps: Optional[int] = None, 126 | num_frames: int = ..., 127 | context_size: Optional[int] = None, 128 | context_stride: int = 3, 129 | context_overlap: int = 4, 130 | closed_loop: bool = True, 131 | ): 132 | windows = [] 133 | if num_frames <= context_size: 134 | windows.append(list(range(num_frames))) 135 | return windows 136 | # always return the same set of windows 137 | delta = context_size - context_overlap 138 | for start_idx in range(0, num_frames, delta): 139 | # if past the end of frames, move start_idx back to keep the same context_size 140 | ending = start_idx + context_size 141 | if ending >= num_frames: 142 | final_delta = ending - num_frames 143 | final_start_idx = start_idx - final_delta 144 | windows.append(list(range(final_start_idx, final_start_idx + context_size))) 145 | break 146 | windows.append(list(range(start_idx, start_idx + context_size))) 147 | return windows 148 | 149 | def get_context_scheduler(name: str) -> Callable: 150 | if name == "uniform_looped": 151 | return uniform_looped 152 | elif name == "uniform_standard": 153 | return uniform_standard 154 | elif name == "static_standard": 155 | return static_standard 156 | else: 157 | raise ValueError(f"Unknown context schedule {name}") 158 | 159 | 160 | def get_total_steps( 161 | scheduler, 162 | timesteps: List[int], 163 | num_steps: Optional[int] = None, 164 | num_frames: int = ..., 165 | context_size: Optional[int] = None, 166 | context_stride: int = 3, 167 | context_overlap: int = 4, 168 | closed_loop: bool = True, 169 | ): 170 | return sum( 171 | len( 172 | list( 173 | scheduler( 174 | i, 175 | num_steps, 176 | num_frames, 177 | context_size, 178 | context_stride, 179 | context_overlap, 180 | ) 181 | ) 182 | ) 183 | for i in range(len(timesteps)) 184 | ) 185 | -------------------------------------------------------------------------------- /embeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from typing import Tuple, Union, Optional 5 | from diffusers.models.embeddings 
import get_3d_sincos_pos_embed, get_1d_rotary_pos_embed 6 | 7 | 8 | class CogVideoXPatchEmbed(nn.Module): 9 | def __init__( 10 | self, 11 | patch_size: int = 2, 12 | patch_size_t: Optional[int] = None, 13 | in_channels: int = 16, 14 | embed_dim: int = 1920, 15 | text_embed_dim: int = 4096, 16 | bias: bool = True, 17 | sample_width: int = 90, 18 | sample_height: int = 60, 19 | sample_frames: int = 49, 20 | temporal_compression_ratio: int = 4, 21 | max_text_seq_length: int = 226, 22 | spatial_interpolation_scale: float = 1.875, 23 | temporal_interpolation_scale: float = 1.0, 24 | use_positional_embeddings: bool = True, 25 | use_learned_positional_embeddings: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | 29 | self.patch_size = patch_size 30 | self.patch_size_t = patch_size_t 31 | self.embed_dim = embed_dim 32 | self.sample_height = sample_height 33 | self.sample_width = sample_width 34 | self.sample_frames = sample_frames 35 | self.temporal_compression_ratio = temporal_compression_ratio 36 | self.max_text_seq_length = max_text_seq_length 37 | self.spatial_interpolation_scale = spatial_interpolation_scale 38 | self.temporal_interpolation_scale = temporal_interpolation_scale 39 | self.use_positional_embeddings = use_positional_embeddings 40 | self.use_learned_positional_embeddings = use_learned_positional_embeddings 41 | 42 | if patch_size_t is None: 43 | # CogVideoX 1.0 checkpoints 44 | self.proj = nn.Conv2d( 45 | in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias 46 | ) 47 | else: 48 | # CogVideoX 1.5 checkpoints 49 | self.proj = nn.Linear(in_channels * patch_size * patch_size * patch_size_t, embed_dim) 50 | 51 | self.text_proj = nn.Linear(text_embed_dim, embed_dim) 52 | 53 | if use_positional_embeddings or use_learned_positional_embeddings: 54 | persistent = use_learned_positional_embeddings 55 | pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames) 56 | self.register_buffer("pos_embedding", pos_embedding, persistent=persistent) 57 | 58 | def _get_positional_embeddings(self, sample_height: int, sample_width: int, sample_frames: int) -> torch.Tensor: 59 | post_patch_height = sample_height // self.patch_size 60 | post_patch_width = sample_width // self.patch_size 61 | post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1 62 | num_patches = post_patch_height * post_patch_width * post_time_compression_frames 63 | 64 | pos_embedding = get_3d_sincos_pos_embed( 65 | self.embed_dim, 66 | (post_patch_width, post_patch_height), 67 | post_time_compression_frames, 68 | self.spatial_interpolation_scale, 69 | self.temporal_interpolation_scale, 70 | ) 71 | pos_embedding = torch.from_numpy(pos_embedding).flatten(0, 1) 72 | joint_pos_embedding = torch.zeros( 73 | 1, self.max_text_seq_length + num_patches, self.embed_dim, requires_grad=False 74 | ) 75 | joint_pos_embedding.data[:, self.max_text_seq_length :].copy_(pos_embedding) 76 | 77 | return joint_pos_embedding 78 | 79 | def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor): 80 | r""" 81 | Args: 82 | text_embeds (`torch.Tensor`): 83 | Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim). 84 | image_embeds (`torch.Tensor`): 85 | Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width). 
86 | """ 87 | text_embeds = self.text_proj(text_embeds) 88 | 89 | batch_size, num_frames, channels, height, width = image_embeds.shape 90 | 91 | if self.patch_size_t is None: 92 | image_embeds = image_embeds.reshape(-1, channels, height, width) 93 | image_embeds = self.proj(image_embeds) 94 | image_embeds = image_embeds.view(batch_size, num_frames, *image_embeds.shape[1:]) 95 | image_embeds = image_embeds.flatten(3).transpose(2, 3) # [batch, num_frames, height x width, channels] 96 | image_embeds = image_embeds.flatten(1, 2) # [batch, num_frames x height x width, channels] 97 | else: 98 | p = self.patch_size 99 | p_t = self.patch_size_t 100 | 101 | image_embeds = image_embeds.permute(0, 1, 3, 4, 2) 102 | image_embeds = image_embeds.reshape( 103 | batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels 104 | ) 105 | image_embeds = image_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3) 106 | image_embeds = self.proj(image_embeds) 107 | 108 | embeds = torch.cat( 109 | [text_embeds, image_embeds], dim=1 110 | ).contiguous() # [batch, seq_length + num_frames x height x width, channels] 111 | 112 | if self.use_positional_embeddings or self.use_learned_positional_embeddings: 113 | if self.use_learned_positional_embeddings and (self.sample_width != width or self.sample_height != height): 114 | raise ValueError( 115 | "It is currently not possible to generate videos at a different resolution than the defaults. This should only be the case with 'THUDM/CogVideoX-5b-I2V'. " 116 | "If you think this is incorrect, please open an issue at https://github.com/huggingface/diffusers/issues." 117 | ) 118 | 119 | pre_time_compression_frames = (num_frames - 1) * self.temporal_compression_ratio + 1 120 | 121 | if ( 122 | self.sample_height != height 123 | or self.sample_width != width 124 | or self.sample_frames != pre_time_compression_frames 125 | ): 126 | pos_embedding = self._get_positional_embeddings(height, width, pre_time_compression_frames) 127 | pos_embedding = pos_embedding.to(embeds.device, dtype=embeds.dtype) 128 | else: 129 | pos_embedding = self.pos_embedding 130 | 131 | embeds = embeds + pos_embedding 132 | 133 | return embeds 134 | 135 | def get_3d_rotary_pos_embed( 136 | embed_dim, 137 | crops_coords, 138 | grid_size, 139 | temporal_size, 140 | theta: int = 10000, 141 | use_real: bool = True, 142 | grid_type: str = "linspace", 143 | max_size: Optional[Tuple[int, int]] = None, 144 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 145 | """ 146 | RoPE for video tokens with 3D structure. 147 | 148 | Args: 149 | embed_dim (`int`): 150 | The embedding dimension size, corresponding to hidden_size_head. 151 | crops_coords (`Tuple[int]`): 152 | The top-left and bottom-right coordinates of the crop. 153 | grid_size (`Tuple[int]`): 154 | The grid size of the spatial positional embedding (height, width). 155 | temporal_size (`int`): 156 | The size of the temporal dimension. 157 | theta (`float`): 158 | Scaling factor for frequency computation. 159 | grid_type (`str`): 160 | Whether to use "linspace" or "slice" to compute grids. 161 | 162 | Returns: 163 | `torch.Tensor`: positional embedding with shape `(temporal_size * grid_size[0] * grid_size[1], embed_dim/2)`. 164 | """
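    # A worked example of the axis split below (assuming embed_dim = 64, the attention_head_dim used
    # by the bundled transformer configs): dim_t = 64 // 4 = 16 and dim_h = dim_w = 64 // 8 * 3 = 24,
    # so the per-axis rotary frequencies concatenate back to dim_t + dim_h + dim_w = 64 channels for
    # every (t, h, w) position in the grid.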
165 | if use_real is not True: 166 | raise ValueError(" `use_real = False` is not currently supported for get_3d_rotary_pos_embed") 167 | 168 | if grid_type == "linspace": 169 | start, stop = crops_coords 170 | grid_size_h, grid_size_w = grid_size 171 | grid_h = np.linspace(start[0], stop[0], grid_size_h, endpoint=False, dtype=np.float32) 172 | grid_w = np.linspace(start[1], stop[1], grid_size_w, endpoint=False, dtype=np.float32) 173 | grid_t = np.arange(temporal_size, dtype=np.float32) 174 | grid_t = np.linspace(0, temporal_size, temporal_size, endpoint=False, dtype=np.float32) 175 | elif grid_type == "slice": 176 | max_h, max_w = max_size 177 | grid_size_h, grid_size_w = grid_size 178 | grid_h = np.arange(max_h, dtype=np.float32) 179 | grid_w = np.arange(max_w, dtype=np.float32) 180 | grid_t = np.arange(temporal_size, dtype=np.float32) 181 | else: 182 | raise ValueError("Invalid value passed for `grid_type`.") 183 | 184 | # Compute dimensions for each axis 185 | dim_t = embed_dim // 4 186 | dim_h = embed_dim // 8 * 3 187 | dim_w = embed_dim // 8 * 3 188 | 189 | # Temporal frequencies 190 | freqs_t = get_1d_rotary_pos_embed(dim_t, grid_t, use_real=True) 191 | # Spatial frequencies for height and width 192 | freqs_h = get_1d_rotary_pos_embed(dim_h, grid_h, use_real=True) 193 | freqs_w = get_1d_rotary_pos_embed(dim_w, grid_w, use_real=True) 194 | 195 | # Broadcast and concatenate the temporal and spatial frequencies (height and width) into a 3D tensor 196 | def combine_time_height_width(freqs_t, freqs_h, freqs_w): 197 | freqs_t = freqs_t[:, None, None, :].expand( 198 | -1, grid_size_h, grid_size_w, -1 199 | ) # temporal_size, grid_size_h, grid_size_w, dim_t 200 | freqs_h = freqs_h[None, :, None, :].expand( 201 | temporal_size, -1, grid_size_w, -1 202 | ) # temporal_size, grid_size_h, grid_size_w, dim_h 203 | freqs_w = freqs_w[None, None, :, :].expand( 204 | temporal_size, grid_size_h, -1, -1 205 | ) # temporal_size, grid_size_h, grid_size_w, dim_w 206 | 207 | freqs = torch.cat( 208 | [freqs_t, freqs_h, freqs_w], dim=-1 209 | ) # temporal_size, grid_size_h, grid_size_w, (dim_t + dim_h + dim_w) 210 | freqs = freqs.view( 211 | temporal_size * grid_size_h * grid_size_w, -1 212 | ) # (temporal_size * grid_size_h * grid_size_w), (dim_t + dim_h + dim_w) 213 | return freqs 214 | 215 | t_cos, t_sin = freqs_t # both t_cos and t_sin have shape: temporal_size, dim_t 216 | h_cos, h_sin = freqs_h # both h_cos and h_sin have shape: grid_size_h, dim_h 217 | w_cos, w_sin = freqs_w # both w_cos and w_sin have shape: grid_size_w, dim_w 218 | 219 | if grid_type == "slice": 220 | t_cos, t_sin = t_cos[:temporal_size], t_sin[:temporal_size] 221 | h_cos, h_sin = h_cos[:grid_size_h], h_sin[:grid_size_h] 222 | w_cos, w_sin = w_cos[:grid_size_w], w_sin[:grid_size_w] 223 | 224 | cos = combine_time_height_width(t_cos, h_cos, w_cos) 225 | sin = combine_time_height_width(t_sin, h_sin, w_sin) 226 | return cos, sin -------------------------------------------------------------------------------- /enhance_a_video/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kijai/ComfyUI-CogVideoXWrapper/dbc63f622dd095391335612d0c7d7bbff8745cc8/enhance_a_video/__init__.py -------------------------------------------------------------------------------- /enhance_a_video/enhance.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from einops import rearrange 3 | from diffusers.models.attention import 
Attention 4 | from .globals import get_enhance_weight, get_num_frames 5 | 6 | # def get_feta_scores(query, key): 7 | # img_q, img_k = query, key 8 | 9 | # num_frames = get_num_frames() 10 | 11 | # B, S, N, C = img_q.shape 12 | 13 | # # Calculate spatial dimension 14 | # spatial_dim = S // num_frames 15 | 16 | # # Add time dimension between spatial and head dims 17 | # query_image = img_q.reshape(B, spatial_dim, num_frames, N, C) 18 | # key_image = img_k.reshape(B, spatial_dim, num_frames, N, C) 19 | 20 | # # Expand time dimension 21 | # query_image = query_image.expand(-1, -1, num_frames, -1, -1) # [B, S, T, N, C] 22 | # key_image = key_image.expand(-1, -1, num_frames, -1, -1) # [B, S, T, N, C] 23 | 24 | # # Reshape to match feta_score input format: [(B S) N T C] 25 | # query_image = rearrange(query_image, "b s t n c -> (b s) n t c") #torch.Size([3200, 24, 5, 128]) 26 | # key_image = rearrange(key_image, "b s t n c -> (b s) n t c") 27 | 28 | # return feta_score(query_image, key_image, C, num_frames) 29 | 30 | def get_feta_scores( 31 | attn: Attention, 32 | query: torch.Tensor, 33 | key: torch.Tensor, 34 | head_dim: int, 35 | text_seq_length: int, 36 | ) -> torch.Tensor: 37 | num_frames = get_num_frames() 38 | spatial_dim = int((query.shape[2] - text_seq_length) / num_frames) 39 | 40 | query_image = rearrange( 41 | query[:, :, text_seq_length:], 42 | "B N (T S) C -> (B S) N T C", 43 | N=attn.heads, 44 | T=num_frames, 45 | S=spatial_dim, 46 | C=head_dim, 47 | ) 48 | key_image = rearrange( 49 | key[:, :, text_seq_length:], 50 | "B N (T S) C -> (B S) N T C", 51 | N=attn.heads, 52 | T=num_frames, 53 | S=spatial_dim, 54 | C=head_dim, 55 | ) 56 | return feta_score(query_image, key_image, head_dim, num_frames) 57 | 58 | def feta_score(query_image, key_image, head_dim, num_frames): 59 | scale = head_dim**-0.5 60 | query_image = query_image * scale 61 | attn_temp = query_image @ key_image.transpose(-2, -1) # translate attn to float32 62 | attn_temp = attn_temp.to(torch.float32) 63 | attn_temp = attn_temp.softmax(dim=-1) 64 | 65 | # Reshape to [batch_size * num_tokens, num_frames, num_frames] 66 | attn_temp = attn_temp.reshape(-1, num_frames, num_frames) 67 | 68 | # Create a mask for diagonal elements 69 | diag_mask = torch.eye(num_frames, device=attn_temp.device).bool() 70 | diag_mask = diag_mask.unsqueeze(0).expand(attn_temp.shape[0], -1, -1) 71 | 72 | # Zero out diagonal elements 73 | attn_wo_diag = attn_temp.masked_fill(diag_mask, 0) 74 | 75 | # Calculate mean for each token's attention matrix 76 | # Number of off-diagonal elements per matrix is n*n - n 77 | num_off_diag = num_frames * num_frames - num_frames 78 | mean_scores = attn_wo_diag.sum(dim=(1, 2)) / num_off_diag 79 | 80 | enhance_scores = mean_scores.mean() * (num_frames + get_enhance_weight()) 81 | enhance_scores = enhance_scores.clamp(min=1) 82 | return enhance_scores 83 | -------------------------------------------------------------------------------- /enhance_a_video/globals.py: -------------------------------------------------------------------------------- 1 | NUM_FRAMES = None 2 | FETA_WEIGHT = None 3 | ENABLE_FETA = False 4 | 5 | def set_num_frames(num_frames: int): 6 | global NUM_FRAMES 7 | NUM_FRAMES = num_frames 8 | 9 | 10 | def get_num_frames() -> int: 11 | return NUM_FRAMES 12 | 13 | 14 | def enable_enhance(): 15 | global ENABLE_FETA 16 | ENABLE_FETA = True 17 | 18 | def disable_enhance(): 19 | global ENABLE_FETA 20 | ENABLE_FETA = False 21 | 22 | def is_enhance_enabled() -> bool: 23 | return ENABLE_FETA 24 | 25 | def 
set_enhance_weight(feta_weight: float): 26 | global FETA_WEIGHT 27 | FETA_WEIGHT = feta_weight 28 | 29 | 30 | def get_enhance_weight() -> float: 31 | return FETA_WEIGHT 32 | -------------------------------------------------------------------------------- /example_workflows/cogvideox_1_0_2b_controlnet_02.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 48, 3 | "last_link_id": 90, 4 | "nodes": [ 5 | { 6 | "id": 41, 7 | "type": "HEDPreprocessor", 8 | "pos": { 9 | "0": -570, 10 | "1": -76 11 | }, 12 | "size": { 13 | "0": 315, 14 | "1": 82 15 | }, 16 | "flags": {}, 17 | "order": 4, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "image", 22 | "type": "IMAGE", 23 | "link": 73 24 | } 25 | ], 26 | "outputs": [ 27 | { 28 | "name": "IMAGE", 29 | "type": "IMAGE", 30 | "links": [ 31 | 74 32 | ], 33 | "slot_index": 0 34 | } 35 | ], 36 | "properties": { 37 | "Node name for S&R": "HEDPreprocessor" 38 | }, 39 | "widgets_values": [ 40 | "enable", 41 | 768 42 | ] 43 | }, 44 | { 45 | "id": 38, 46 | "type": "VHS_LoadVideo", 47 | "pos": { 48 | "0": -847, 49 | "1": -78 50 | }, 51 | "size": [ 52 | 247.455078125, 53 | 427.63671875 54 | ], 55 | "flags": {}, 56 | "order": 0, 57 | "mode": 0, 58 | "inputs": [ 59 | { 60 | "name": "meta_batch", 61 | "type": "VHS_BatchManager", 62 | "link": null, 63 | "shape": 7 64 | }, 65 | { 66 | "name": "vae", 67 | "type": "VAE", 68 | "link": null, 69 | "shape": 7 70 | } 71 | ], 72 | "outputs": [ 73 | { 74 | "name": "IMAGE", 75 | "type": "IMAGE", 76 | "links": [ 77 | 73 78 | ], 79 | "slot_index": 0 80 | }, 81 | { 82 | "name": "frame_count", 83 | "type": "INT", 84 | "links": null 85 | }, 86 | { 87 | "name": "audio", 88 | "type": "AUDIO", 89 | "links": null 90 | }, 91 | { 92 | "name": "video_info", 93 | "type": "VHS_VIDEOINFO", 94 | "links": null 95 | } 96 | ], 97 | "properties": { 98 | "Node name for S&R": "VHS_LoadVideo" 99 | }, 100 | "widgets_values": { 101 | "video": "car.mp4", 102 | "force_rate": 0, 103 | "force_size": "Disabled", 104 | "custom_width": 512, 105 | "custom_height": 512, 106 | "frame_load_cap": 49, 107 | "skip_first_frames": 0, 108 | "select_every_nth": 1, 109 | "choose video to upload": "image", 110 | "videopreview": { 111 | "hidden": false, 112 | "paused": false, 113 | "params": { 114 | "frame_load_cap": 49, 115 | "skip_first_frames": 0, 116 | "force_rate": 0, 117 | "filename": "car.mp4", 118 | "type": "input", 119 | "format": "video/mp4", 120 | "select_every_nth": 1 121 | }, 122 | "muted": false 123 | } 124 | } 125 | }, 126 | { 127 | "id": 39, 128 | "type": "ImageResizeKJ", 129 | "pos": { 130 | "0": -563, 131 | "1": 63 132 | }, 133 | "size": { 134 | "0": 315, 135 | "1": 266 136 | }, 137 | "flags": {}, 138 | "order": 6, 139 | "mode": 0, 140 | "inputs": [ 141 | { 142 | "name": "image", 143 | "type": "IMAGE", 144 | "link": 74 145 | }, 146 | { 147 | "name": "get_image_size", 148 | "type": "IMAGE", 149 | "link": null, 150 | "shape": 7 151 | }, 152 | { 153 | "name": "width_input", 154 | "type": "INT", 155 | "link": null, 156 | "widget": { 157 | "name": "width_input" 158 | }, 159 | "shape": 7 160 | }, 161 | { 162 | "name": "height_input", 163 | "type": "INT", 164 | "link": null, 165 | "widget": { 166 | "name": "height_input" 167 | }, 168 | "shape": 7 169 | } 170 | ], 171 | "outputs": [ 172 | { 173 | "name": "IMAGE", 174 | "type": "IMAGE", 175 | "links": [ 176 | 71 177 | ], 178 | "slot_index": 0 179 | }, 180 | { 181 | "name": "width", 182 | "type": "INT", 183 | "links": null 184 | }, 185 | { 186 | 
"name": "height", 187 | "type": "INT", 188 | "links": null 189 | } 190 | ], 191 | "properties": { 192 | "Node name for S&R": "ImageResizeKJ" 193 | }, 194 | "widgets_values": [ 195 | 720, 196 | 480, 197 | "lanczos", 198 | false, 199 | 2, 200 | 0, 201 | 0, 202 | "disabled" 203 | ] 204 | }, 205 | { 206 | "id": 30, 207 | "type": "CogVideoTextEncode", 208 | "pos": { 209 | "0": 130, 210 | "1": 350 211 | }, 212 | "size": { 213 | "0": 475.7875061035156, 214 | "1": 231.29896545410156 215 | }, 216 | "flags": {}, 217 | "order": 5, 218 | "mode": 0, 219 | "inputs": [ 220 | { 221 | "name": "clip", 222 | "type": "CLIP", 223 | "link": 54 224 | } 225 | ], 226 | "outputs": [ 227 | { 228 | "name": "conditioning", 229 | "type": "CONDITIONING", 230 | "links": [ 231 | 84 232 | ], 233 | "slot_index": 0, 234 | "shape": 3 235 | }, 236 | { 237 | "name": "clip", 238 | "type": "CLIP", 239 | "links": [ 240 | 78 241 | ], 242 | "slot_index": 1 243 | } 244 | ], 245 | "properties": { 246 | "Node name for S&R": "CogVideoTextEncode" 247 | }, 248 | "widgets_values": [ 249 | "car is moving among mountains", 250 | 1, 251 | false 252 | ] 253 | }, 254 | { 255 | "id": 31, 256 | "type": "CogVideoTextEncode", 257 | "pos": { 258 | "0": 139, 259 | "1": 643 260 | }, 261 | "size": { 262 | "0": 463.01251220703125, 263 | "1": 144 264 | }, 265 | "flags": {}, 266 | "order": 7, 267 | "mode": 0, 268 | "inputs": [ 269 | { 270 | "name": "clip", 271 | "type": "CLIP", 272 | "link": 78 273 | } 274 | ], 275 | "outputs": [ 276 | { 277 | "name": "conditioning", 278 | "type": "CONDITIONING", 279 | "links": [ 280 | 85 281 | ], 282 | "slot_index": 0, 283 | "shape": 3 284 | }, 285 | { 286 | "name": "clip", 287 | "type": "CLIP", 288 | "links": null 289 | } 290 | ], 291 | "properties": { 292 | "Node name for S&R": "CogVideoTextEncode" 293 | }, 294 | "widgets_values": [ 295 | "", 296 | 1, 297 | true 298 | ] 299 | }, 300 | { 301 | "id": 44, 302 | "type": "DownloadAndLoadCogVideoModel", 303 | "pos": { 304 | "0": 326, 305 | "1": -319 306 | }, 307 | "size": { 308 | "0": 315, 309 | "1": 218 310 | }, 311 | "flags": {}, 312 | "order": 1, 313 | "mode": 0, 314 | "inputs": [ 315 | { 316 | "name": "block_edit", 317 | "type": "TRANSFORMERBLOCKS", 318 | "link": null, 319 | "shape": 7 320 | }, 321 | { 322 | "name": "lora", 323 | "type": "COGLORA", 324 | "link": null, 325 | "shape": 7 326 | }, 327 | { 328 | "name": "compile_args", 329 | "type": "COMPILEARGS", 330 | "link": null, 331 | "shape": 7 332 | } 333 | ], 334 | "outputs": [ 335 | { 336 | "name": "model", 337 | "type": "COGVIDEOMODEL", 338 | "links": [ 339 | 83 340 | ] 341 | }, 342 | { 343 | "name": "vae", 344 | "type": "VAE", 345 | "links": [ 346 | 82 347 | ], 348 | "slot_index": 1 349 | } 350 | ], 351 | "properties": { 352 | "Node name for S&R": "DownloadAndLoadCogVideoModel" 353 | }, 354 | "widgets_values": [ 355 | "THUDM/CogVideoX-2b", 356 | "bf16", 357 | "disabled", 358 | false, 359 | "sdpa", 360 | "main_device" 361 | ] 362 | }, 363 | { 364 | "id": 20, 365 | "type": "CLIPLoader", 366 | "pos": { 367 | "0": -175, 368 | "1": -317 369 | }, 370 | "size": { 371 | "0": 452.912353515625, 372 | "1": 82 373 | }, 374 | "flags": {}, 375 | "order": 2, 376 | "mode": 0, 377 | "inputs": [], 378 | "outputs": [ 379 | { 380 | "name": "CLIP", 381 | "type": "CLIP", 382 | "links": [ 383 | 54 384 | ], 385 | "slot_index": 0, 386 | "shape": 3 387 | } 388 | ], 389 | "properties": { 390 | "Node name for S&R": "CLIPLoader" 391 | }, 392 | "widgets_values": [ 393 | "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", 394 | "sd3" 
395 | ] 396 | }, 397 | { 398 | "id": 35, 399 | "type": "DownloadAndLoadCogVideoControlNet", 400 | "pos": { 401 | "0": -105, 402 | "1": -182 403 | }, 404 | "size": { 405 | "0": 378, 406 | "1": 58 407 | }, 408 | "flags": {}, 409 | "order": 3, 410 | "mode": 0, 411 | "inputs": [], 412 | "outputs": [ 413 | { 414 | "name": "cogvideo_controlnet", 415 | "type": "COGVIDECONTROLNETMODEL", 416 | "links": [ 417 | 67 418 | ] 419 | } 420 | ], 421 | "properties": { 422 | "Node name for S&R": "DownloadAndLoadCogVideoControlNet" 423 | }, 424 | "widgets_values": [ 425 | "TheDenk/cogvideox-2b-controlnet-hed-v1" 426 | ] 427 | }, 428 | { 429 | "id": 37, 430 | "type": "CogVideoControlNet", 431 | "pos": { 432 | "0": 220, 433 | "1": 155 434 | }, 435 | "size": { 436 | "0": 367.79998779296875, 437 | "1": 126 438 | }, 439 | "flags": {}, 440 | "order": 9, 441 | "mode": 0, 442 | "inputs": [ 443 | { 444 | "name": "controlnet", 445 | "type": "COGVIDECONTROLNETMODEL", 446 | "link": 67 447 | }, 448 | { 449 | "name": "images", 450 | "type": "IMAGE", 451 | "link": 72 452 | } 453 | ], 454 | "outputs": [ 455 | { 456 | "name": "cogvideo_controlnet", 457 | "type": "COGVIDECONTROLNET", 458 | "links": [ 459 | 86 460 | ], 461 | "slot_index": 0 462 | } 463 | ], 464 | "properties": { 465 | "Node name for S&R": "CogVideoControlNet" 466 | }, 467 | "widgets_values": [ 468 | 1, 469 | 0, 470 | 1 471 | ] 472 | }, 473 | { 474 | "id": 40, 475 | "type": "GetImageSizeAndCount", 476 | "pos": { 477 | "0": -123, 478 | "1": -34 479 | }, 480 | "size": { 481 | "0": 277.20001220703125, 482 | "1": 86 483 | }, 484 | "flags": {}, 485 | "order": 8, 486 | "mode": 0, 487 | "inputs": [ 488 | { 489 | "name": "image", 490 | "type": "IMAGE", 491 | "link": 71 492 | } 493 | ], 494 | "outputs": [ 495 | { 496 | "name": "image", 497 | "type": "IMAGE", 498 | "links": [ 499 | 72, 500 | 75 501 | ], 502 | "slot_index": 0 503 | }, 504 | { 505 | "name": "720 width", 506 | "type": "INT", 507 | "links": [ 508 | 89 509 | ] 510 | }, 511 | { 512 | "name": "480 height", 513 | "type": "INT", 514 | "links": [ 515 | 90 516 | ], 517 | "slot_index": 2 518 | }, 519 | { 520 | "name": "49 count", 521 | "type": "INT", 522 | "links": null 523 | } 524 | ], 525 | "properties": { 526 | "Node name for S&R": "GetImageSizeAndCount" 527 | }, 528 | "widgets_values": [] 529 | }, 530 | { 531 | "id": 47, 532 | "type": "EmptyLatentImage", 533 | "pos": { 534 | "0": 409, 535 | "1": 77 536 | }, 537 | "size": { 538 | "0": 315, 539 | "1": 106 540 | }, 541 | "flags": { 542 | "collapsed": true 543 | }, 544 | "order": 10, 545 | "mode": 0, 546 | "inputs": [ 547 | { 548 | "name": "width", 549 | "type": "INT", 550 | "link": 89, 551 | "widget": { 552 | "name": "width" 553 | } 554 | }, 555 | { 556 | "name": "height", 557 | "type": "INT", 558 | "link": 90, 559 | "widget": { 560 | "name": "height" 561 | } 562 | } 563 | ], 564 | "outputs": [ 565 | { 566 | "name": "LATENT", 567 | "type": "LATENT", 568 | "links": [ 569 | 88 570 | ] 571 | } 572 | ], 573 | "properties": { 574 | "Node name for S&R": "EmptyLatentImage" 575 | }, 576 | "widgets_values": [ 577 | 720, 578 | 480, 579 | 1 580 | ] 581 | }, 582 | { 583 | "id": 46, 584 | "type": "CogVideoSampler", 585 | "pos": { 586 | "0": 743, 587 | "1": 49 588 | }, 589 | "size": { 590 | "0": 330, 591 | "1": 574 592 | }, 593 | "flags": {}, 594 | "order": 11, 595 | "mode": 0, 596 | "inputs": [ 597 | { 598 | "name": "model", 599 | "type": "COGVIDEOMODEL", 600 | "link": 83 601 | }, 602 | { 603 | "name": "positive", 604 | "type": "CONDITIONING", 605 | "link": 84 606 | }, 607 | { 
608 | "name": "negative", 609 | "type": "CONDITIONING", 610 | "link": 85 611 | }, 612 | { 613 | "name": "samples", 614 | "type": "LATENT", 615 | "link": 88, 616 | "shape": 7 617 | }, 618 | { 619 | "name": "image_cond_latents", 620 | "type": "LATENT", 621 | "link": null, 622 | "shape": 7 623 | }, 624 | { 625 | "name": "context_options", 626 | "type": "COGCONTEXT", 627 | "link": null, 628 | "shape": 7 629 | }, 630 | { 631 | "name": "controlnet", 632 | "type": "COGVIDECONTROLNET", 633 | "link": 86, 634 | "shape": 7 635 | }, 636 | { 637 | "name": "tora_trajectory", 638 | "type": "TORAFEATURES", 639 | "link": null, 640 | "shape": 7 641 | }, 642 | { 643 | "name": "fastercache", 644 | "type": "FASTERCACHEARGS", 645 | "link": null, 646 | "shape": 7 647 | } 648 | ], 649 | "outputs": [ 650 | { 651 | "name": "samples", 652 | "type": "LATENT", 653 | "links": [ 654 | 87 655 | ] 656 | } 657 | ], 658 | "properties": { 659 | "Node name for S&R": "CogVideoSampler" 660 | }, 661 | "widgets_values": [ 662 | 49, 663 | 40, 664 | 6, 665 | 0, 666 | "fixed", 667 | "CogVideoXDDIM", 668 | 1 669 | ] 670 | }, 671 | { 672 | "id": 45, 673 | "type": "CogVideoDecode", 674 | "pos": { 675 | "0": 758, 676 | "1": 685 677 | }, 678 | "size": { 679 | "0": 315, 680 | "1": 198 681 | }, 682 | "flags": {}, 683 | "order": 12, 684 | "mode": 0, 685 | "inputs": [ 686 | { 687 | "name": "vae", 688 | "type": "VAE", 689 | "link": 82 690 | }, 691 | { 692 | "name": "samples", 693 | "type": "LATENT", 694 | "link": 87 695 | } 696 | ], 697 | "outputs": [ 698 | { 699 | "name": "images", 700 | "type": "IMAGE", 701 | "links": [ 702 | 81 703 | ] 704 | } 705 | ], 706 | "properties": { 707 | "Node name for S&R": "CogVideoDecode" 708 | }, 709 | "widgets_values": [ 710 | true, 711 | 240, 712 | 360, 713 | 0.2, 714 | 0.2, 715 | true 716 | ] 717 | }, 718 | { 719 | "id": 42, 720 | "type": "ImageConcatMulti", 721 | "pos": { 722 | "0": 1145, 723 | "1": -24 724 | }, 725 | "size": { 726 | "0": 210, 727 | "1": 150 728 | }, 729 | "flags": {}, 730 | "order": 13, 731 | "mode": 0, 732 | "inputs": [ 733 | { 734 | "name": "image_1", 735 | "type": "IMAGE", 736 | "link": 75 737 | }, 738 | { 739 | "name": "image_2", 740 | "type": "IMAGE", 741 | "link": 81 742 | } 743 | ], 744 | "outputs": [ 745 | { 746 | "name": "images", 747 | "type": "IMAGE", 748 | "links": [ 749 | 77 750 | ], 751 | "slot_index": 0 752 | } 753 | ], 754 | "properties": {}, 755 | "widgets_values": [ 756 | 2, 757 | "right", 758 | false, 759 | null 760 | ] 761 | }, 762 | { 763 | "id": 43, 764 | "type": "VHS_VideoCombine", 765 | "pos": { 766 | "0": 1154, 767 | "1": 202 768 | }, 769 | "size": [ 770 | 778.7022705078125, 771 | 576.9007568359375 772 | ], 773 | "flags": {}, 774 | "order": 14, 775 | "mode": 0, 776 | "inputs": [ 777 | { 778 | "name": "images", 779 | "type": "IMAGE", 780 | "link": 77 781 | }, 782 | { 783 | "name": "audio", 784 | "type": "AUDIO", 785 | "link": null, 786 | "shape": 7 787 | }, 788 | { 789 | "name": "meta_batch", 790 | "type": "VHS_BatchManager", 791 | "link": null, 792 | "shape": 7 793 | }, 794 | { 795 | "name": "vae", 796 | "type": "VAE", 797 | "link": null, 798 | "shape": 7 799 | } 800 | ], 801 | "outputs": [ 802 | { 803 | "name": "Filenames", 804 | "type": "VHS_FILENAMES", 805 | "links": null, 806 | "shape": 3 807 | } 808 | ], 809 | "properties": { 810 | "Node name for S&R": "VHS_VideoCombine" 811 | }, 812 | "widgets_values": { 813 | "frame_rate": 8, 814 | "loop_count": 0, 815 | "filename_prefix": "CogVideoX_2b_controlnet", 816 | "format": "video/h264-mp4", 817 | "pix_fmt": 
"yuv420p", 818 | "crf": 19, 819 | "save_metadata": true, 820 | "pingpong": false, 821 | "save_output": true, 822 | "videopreview": { 823 | "hidden": false, 824 | "paused": false, 825 | "params": { 826 | "filename": "CogVideoX2B_controlnet_00003.mp4", 827 | "subfolder": "", 828 | "type": "temp", 829 | "format": "video/h264-mp4", 830 | "frame_rate": 8 831 | }, 832 | "muted": false 833 | } 834 | } 835 | } 836 | ], 837 | "links": [ 838 | [ 839 | 54, 840 | 20, 841 | 0, 842 | 30, 843 | 0, 844 | "CLIP" 845 | ], 846 | [ 847 | 67, 848 | 35, 849 | 0, 850 | 37, 851 | 0, 852 | "COGVIDECONTROLNETMODEL" 853 | ], 854 | [ 855 | 71, 856 | 39, 857 | 0, 858 | 40, 859 | 0, 860 | "IMAGE" 861 | ], 862 | [ 863 | 72, 864 | 40, 865 | 0, 866 | 37, 867 | 1, 868 | "IMAGE" 869 | ], 870 | [ 871 | 73, 872 | 38, 873 | 0, 874 | 41, 875 | 0, 876 | "IMAGE" 877 | ], 878 | [ 879 | 74, 880 | 41, 881 | 0, 882 | 39, 883 | 0, 884 | "IMAGE" 885 | ], 886 | [ 887 | 75, 888 | 40, 889 | 0, 890 | 42, 891 | 0, 892 | "IMAGE" 893 | ], 894 | [ 895 | 77, 896 | 42, 897 | 0, 898 | 43, 899 | 0, 900 | "IMAGE" 901 | ], 902 | [ 903 | 78, 904 | 30, 905 | 1, 906 | 31, 907 | 0, 908 | "CLIP" 909 | ], 910 | [ 911 | 81, 912 | 45, 913 | 0, 914 | 42, 915 | 1, 916 | "IMAGE" 917 | ], 918 | [ 919 | 82, 920 | 44, 921 | 1, 922 | 45, 923 | 0, 924 | "VAE" 925 | ], 926 | [ 927 | 83, 928 | 44, 929 | 0, 930 | 46, 931 | 0, 932 | "COGVIDEOMODEL" 933 | ], 934 | [ 935 | 84, 936 | 30, 937 | 0, 938 | 46, 939 | 1, 940 | "CONDITIONING" 941 | ], 942 | [ 943 | 85, 944 | 31, 945 | 0, 946 | 46, 947 | 2, 948 | "CONDITIONING" 949 | ], 950 | [ 951 | 86, 952 | 37, 953 | 0, 954 | 46, 955 | 6, 956 | "COGVIDECONTROLNET" 957 | ], 958 | [ 959 | 87, 960 | 46, 961 | 0, 962 | 45, 963 | 1, 964 | "LATENT" 965 | ], 966 | [ 967 | 88, 968 | 47, 969 | 0, 970 | 46, 971 | 3, 972 | "LATENT" 973 | ], 974 | [ 975 | 89, 976 | 40, 977 | 1, 978 | 47, 979 | 0, 980 | "INT" 981 | ], 982 | [ 983 | 90, 984 | 40, 985 | 2, 986 | 47, 987 | 1, 988 | "INT" 989 | ] 990 | ], 991 | "groups": [], 992 | "config": {}, 993 | "extra": { 994 | "ds": { 995 | "scale": 0.7627768444387069, 996 | "offset": [ 997 | 1075.4957551311677, 998 | 398.4420252790512 999 | ] 1000 | } 1001 | }, 1002 | "version": 0.4 1003 | } -------------------------------------------------------------------------------- /example_workflows/cogvideox_1_0_5b_I2V_02.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 63, 3 | "last_link_id": 149, 4 | "nodes": [ 5 | { 6 | "id": 31, 7 | "type": "CogVideoTextEncode", 8 | "pos": { 9 | "0": 497, 10 | "1": 520 11 | }, 12 | "size": { 13 | "0": 463.01251220703125, 14 | "1": 144 15 | }, 16 | "flags": {}, 17 | "order": 6, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "clip", 22 | "type": "CLIP", 23 | "link": 149 24 | } 25 | ], 26 | "outputs": [ 27 | { 28 | "name": "conditioning", 29 | "type": "CONDITIONING", 30 | "links": [ 31 | 146 32 | ], 33 | "slot_index": 0, 34 | "shape": 3 35 | }, 36 | { 37 | "name": "clip", 38 | "type": "CLIP", 39 | "links": null 40 | } 41 | ], 42 | "properties": { 43 | "Node name for S&R": "CogVideoTextEncode" 44 | }, 45 | "widgets_values": [ 46 | "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. 
", 47 | 1, 48 | true 49 | ] 50 | }, 51 | { 52 | "id": 63, 53 | "type": "CogVideoSampler", 54 | "pos": { 55 | "0": 1142, 56 | "1": 74 57 | }, 58 | "size": [ 59 | 330, 60 | 574 61 | ], 62 | "flags": {}, 63 | "order": 7, 64 | "mode": 0, 65 | "inputs": [ 66 | { 67 | "name": "model", 68 | "type": "COGVIDEOMODEL", 69 | "link": 144 70 | }, 71 | { 72 | "name": "positive", 73 | "type": "CONDITIONING", 74 | "link": 145 75 | }, 76 | { 77 | "name": "negative", 78 | "type": "CONDITIONING", 79 | "link": 146 80 | }, 81 | { 82 | "name": "samples", 83 | "type": "LATENT", 84 | "link": null, 85 | "shape": 7 86 | }, 87 | { 88 | "name": "image_cond_latents", 89 | "type": "LATENT", 90 | "link": 147, 91 | "shape": 7 92 | }, 93 | { 94 | "name": "context_options", 95 | "type": "COGCONTEXT", 96 | "link": null, 97 | "shape": 7 98 | }, 99 | { 100 | "name": "controlnet", 101 | "type": "COGVIDECONTROLNET", 102 | "link": null, 103 | "shape": 7 104 | }, 105 | { 106 | "name": "tora_trajectory", 107 | "type": "TORAFEATURES", 108 | "link": null, 109 | "shape": 7 110 | }, 111 | { 112 | "name": "fastercache", 113 | "type": "FASTERCACHEARGS", 114 | "link": null, 115 | "shape": 7 116 | } 117 | ], 118 | "outputs": [ 119 | { 120 | "name": "samples", 121 | "type": "LATENT", 122 | "links": [ 123 | 148 124 | ] 125 | } 126 | ], 127 | "properties": { 128 | "Node name for S&R": "CogVideoSampler" 129 | }, 130 | "widgets_values": [ 131 | 49, 132 | 25, 133 | 6, 134 | 0, 135 | "fixed", 136 | "CogVideoXDDIM", 137 | 1 138 | ] 139 | }, 140 | { 141 | "id": 62, 142 | "type": "CogVideoImageEncode", 143 | "pos": { 144 | "0": 1149, 145 | "1": 711 146 | }, 147 | "size": { 148 | "0": 315, 149 | "1": 122 150 | }, 151 | "flags": {}, 152 | "order": 5, 153 | "mode": 0, 154 | "inputs": [ 155 | { 156 | "name": "vae", 157 | "type": "VAE", 158 | "link": 141 159 | }, 160 | { 161 | "name": "start_image", 162 | "type": "IMAGE", 163 | "link": 142 164 | }, 165 | { 166 | "name": "end_image", 167 | "type": "IMAGE", 168 | "link": null, 169 | "shape": 7 170 | } 171 | ], 172 | "outputs": [ 173 | { 174 | "name": "samples", 175 | "type": "LATENT", 176 | "links": [ 177 | 147 178 | ] 179 | } 180 | ], 181 | "properties": { 182 | "Node name for S&R": "CogVideoImageEncode" 183 | }, 184 | "widgets_values": [ 185 | false, 186 | 0 187 | ] 188 | }, 189 | { 190 | "id": 59, 191 | "type": "DownloadAndLoadCogVideoModel", 192 | "pos": { 193 | "0": 622, 194 | "1": -25 195 | }, 196 | "size": { 197 | "0": 315, 198 | "1": 218 199 | }, 200 | "flags": {}, 201 | "order": 0, 202 | "mode": 0, 203 | "inputs": [ 204 | { 205 | "name": "block_edit", 206 | "type": "TRANSFORMERBLOCKS", 207 | "link": null, 208 | "shape": 7 209 | }, 210 | { 211 | "name": "lora", 212 | "type": "COGLORA", 213 | "link": null, 214 | "shape": 7 215 | }, 216 | { 217 | "name": "compile_args", 218 | "type": "COMPILEARGS", 219 | "link": null, 220 | "shape": 7 221 | } 222 | ], 223 | "outputs": [ 224 | { 225 | "name": "model", 226 | "type": "COGVIDEOMODEL", 227 | "links": [ 228 | 144 229 | ] 230 | }, 231 | { 232 | "name": "vae", 233 | "type": "VAE", 234 | "links": [ 235 | 132, 236 | 141 237 | ], 238 | "slot_index": 1 239 | } 240 | ], 241 | "properties": { 242 | "Node name for S&R": "DownloadAndLoadCogVideoModel" 243 | }, 244 | "widgets_values": [ 245 | "THUDM/CogVideoX-5b-I2V", 246 | "bf16", 247 | "disabled", 248 | false, 249 | "sdpa", 250 | "main_device" 251 | ] 252 | }, 253 | { 254 | "id": 30, 255 | "type": "CogVideoTextEncode", 256 | "pos": { 257 | "0": 493, 258 | "1": 303 259 | }, 260 | "size": { 261 | "0": 
471.90142822265625, 262 | "1": 168.08047485351562 263 | }, 264 | "flags": {}, 265 | "order": 4, 266 | "mode": 0, 267 | "inputs": [ 268 | { 269 | "name": "clip", 270 | "type": "CLIP", 271 | "link": 54 272 | } 273 | ], 274 | "outputs": [ 275 | { 276 | "name": "conditioning", 277 | "type": "CONDITIONING", 278 | "links": [ 279 | 145 280 | ], 281 | "slot_index": 0, 282 | "shape": 3 283 | }, 284 | { 285 | "name": "clip", 286 | "type": "CLIP", 287 | "links": [ 288 | 149 289 | ], 290 | "slot_index": 1 291 | } 292 | ], 293 | "properties": { 294 | "Node name for S&R": "CogVideoTextEncode" 295 | }, 296 | "widgets_values": [ 297 | "a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees", 298 | 1, 299 | false 300 | ] 301 | }, 302 | { 303 | "id": 37, 304 | "type": "ImageResizeKJ", 305 | "pos": { 306 | "0": 784, 307 | "1": 731 308 | }, 309 | "size": { 310 | "0": 315, 311 | "1": 266 312 | }, 313 | "flags": {}, 314 | "order": 3, 315 | "mode": 0, 316 | "inputs": [ 317 | { 318 | "name": "image", 319 | "type": "IMAGE", 320 | "link": 71 321 | }, 322 | { 323 | "name": "get_image_size", 324 | "type": "IMAGE", 325 | "link": null, 326 | "shape": 7 327 | }, 328 | { 329 | "name": "width_input", 330 | "type": "INT", 331 | "link": null, 332 | "widget": { 333 | "name": "width_input" 334 | } 335 | }, 336 | { 337 | "name": "height_input", 338 | "type": "INT", 339 | "link": null, 340 | "widget": { 341 | "name": "height_input" 342 | } 343 | } 344 | ], 345 | "outputs": [ 346 | { 347 | "name": "IMAGE", 348 | "type": "IMAGE", 349 | "links": [ 350 | 142 351 | ], 352 | "slot_index": 0, 353 | "shape": 3 354 | }, 355 | { 356 | "name": "width", 357 | "type": "INT", 358 | "links": null, 359 | "shape": 3 360 | }, 361 | { 362 | "name": "height", 363 | "type": "INT", 364 | "links": null, 365 | "shape": 3 366 | } 367 | ], 368 | "properties": { 369 | "Node name for S&R": "ImageResizeKJ" 370 | }, 371 | "widgets_values": [ 372 | 720, 373 | 480, 374 | "lanczos", 375 | false, 376 | 16, 377 | 0, 378 | 0, 379 | "disabled" 380 | ] 381 | }, 382 | { 383 | "id": 36, 384 | "type": "LoadImage", 385 | "pos": { 386 | "0": 335, 387 | "1": 731 388 | }, 389 | "size": { 390 | "0": 402.06353759765625, 391 | "1": 396.6225891113281 392 | }, 393 | "flags": {}, 394 | "order": 1, 395 | "mode": 0, 396 | "inputs": [], 397 | "outputs": [ 398 | { 399 | "name": "IMAGE", 400 | "type": "IMAGE", 401 | "links": [ 402 | 71 403 | ], 404 | "slot_index": 0, 405 | "shape": 3 406 | }, 407 | { 408 | "name": "MASK", 409 | "type": "MASK", 410 | "links": null, 411 | "shape": 3 412 | } 413 | ], 414 | "properties": { 415 | "Node name for S&R": "LoadImage" 416 | }, 417 | "widgets_values": [ 418 | "sd3stag.png", 419 | "image" 420 | ] 421 | }, 422 | { 423 | "id": 20, 424 | "type": "CLIPLoader", 425 | "pos": { 426 | "0": -2, 427 | "1": 304 428 | }, 429 | "size": { 430 | "0": 451.30548095703125, 431 | "1": 82 432 | }, 433 | "flags": {}, 434 | "order": 2, 435 | "mode": 0, 436 | "inputs": [], 437 | "outputs": [ 438 | { 439 | "name": "CLIP", 440 | "type": "CLIP", 441 | "links": [ 442 | 54 443 | ], 444 | "slot_index": 0, 445 | "shape": 3 446 | } 447 | ], 448 | "properties": { 449 | "Node name for S&R": "CLIPLoader" 450 | }, 451 | "widgets_values": [ 452 | "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", 453 | "sd3" 454 | ] 455 | }, 456 | { 457 | "id": 60, 458 | "type": "CogVideoDecode", 459 | "pos": { 460 | "0": 1523, 461 | "1": -6 462 | }, 463 | "size": { 464 | "0": 315, 465 | "1": 198 466 | }, 467 | "flags": {}, 468 | "order": 8, 
469 | "mode": 0, 470 | "inputs": [ 471 | { 472 | "name": "vae", 473 | "type": "VAE", 474 | "link": 132 475 | }, 476 | { 477 | "name": "samples", 478 | "type": "LATENT", 479 | "link": 148 480 | } 481 | ], 482 | "outputs": [ 483 | { 484 | "name": "images", 485 | "type": "IMAGE", 486 | "links": [ 487 | 134 488 | ] 489 | } 490 | ], 491 | "properties": { 492 | "Node name for S&R": "CogVideoDecode" 493 | }, 494 | "widgets_values": [ 495 | true, 496 | 240, 497 | 360, 498 | 0.2, 499 | 0.2, 500 | true 501 | ] 502 | }, 503 | { 504 | "id": 44, 505 | "type": "VHS_VideoCombine", 506 | "pos": { 507 | "0": 1884, 508 | "1": -6 509 | }, 510 | "size": [ 511 | 605.3909912109375, 512 | 714.2606608072917 513 | ], 514 | "flags": {}, 515 | "order": 9, 516 | "mode": 0, 517 | "inputs": [ 518 | { 519 | "name": "images", 520 | "type": "IMAGE", 521 | "link": 134 522 | }, 523 | { 524 | "name": "audio", 525 | "type": "AUDIO", 526 | "link": null, 527 | "shape": 7 528 | }, 529 | { 530 | "name": "meta_batch", 531 | "type": "VHS_BatchManager", 532 | "link": null, 533 | "shape": 7 534 | }, 535 | { 536 | "name": "vae", 537 | "type": "VAE", 538 | "link": null, 539 | "shape": 7 540 | } 541 | ], 542 | "outputs": [ 543 | { 544 | "name": "Filenames", 545 | "type": "VHS_FILENAMES", 546 | "links": null, 547 | "shape": 3 548 | } 549 | ], 550 | "properties": { 551 | "Node name for S&R": "VHS_VideoCombine" 552 | }, 553 | "widgets_values": { 554 | "frame_rate": 8, 555 | "loop_count": 0, 556 | "filename_prefix": "CogVideoX-I2V", 557 | "format": "video/h264-mp4", 558 | "pix_fmt": "yuv420p", 559 | "crf": 19, 560 | "save_metadata": true, 561 | "pingpong": false, 562 | "save_output": true, 563 | "videopreview": { 564 | "hidden": false, 565 | "paused": false, 566 | "params": { 567 | "filename": "CogVideoX-I2V_00001.mp4", 568 | "subfolder": "", 569 | "type": "temp", 570 | "format": "video/h264-mp4", 571 | "frame_rate": 8 572 | }, 573 | "muted": false 574 | } 575 | } 576 | } 577 | ], 578 | "links": [ 579 | [ 580 | 54, 581 | 20, 582 | 0, 583 | 30, 584 | 0, 585 | "CLIP" 586 | ], 587 | [ 588 | 71, 589 | 36, 590 | 0, 591 | 37, 592 | 0, 593 | "IMAGE" 594 | ], 595 | [ 596 | 132, 597 | 59, 598 | 1, 599 | 60, 600 | 0, 601 | "VAE" 602 | ], 603 | [ 604 | 134, 605 | 60, 606 | 0, 607 | 44, 608 | 0, 609 | "IMAGE" 610 | ], 611 | [ 612 | 141, 613 | 59, 614 | 1, 615 | 62, 616 | 0, 617 | "VAE" 618 | ], 619 | [ 620 | 142, 621 | 37, 622 | 0, 623 | 62, 624 | 1, 625 | "IMAGE" 626 | ], 627 | [ 628 | 144, 629 | 59, 630 | 0, 631 | 63, 632 | 0, 633 | "COGVIDEOMODEL" 634 | ], 635 | [ 636 | 145, 637 | 30, 638 | 0, 639 | 63, 640 | 1, 641 | "CONDITIONING" 642 | ], 643 | [ 644 | 146, 645 | 31, 646 | 0, 647 | 63, 648 | 2, 649 | "CONDITIONING" 650 | ], 651 | [ 652 | 147, 653 | 62, 654 | 0, 655 | 63, 656 | 4, 657 | "LATENT" 658 | ], 659 | [ 660 | 148, 661 | 63, 662 | 0, 663 | 60, 664 | 1, 665 | "LATENT" 666 | ], 667 | [ 668 | 149, 669 | 30, 670 | 1, 671 | 31, 672 | 0, 673 | "CLIP" 674 | ] 675 | ], 676 | "groups": [], 677 | "config": {}, 678 | "extra": { 679 | "ds": { 680 | "scale": 0.7627768444387059, 681 | "offset": [ 682 | 648.7113591814891, 683 | 185.9907078691075 684 | ] 685 | } 686 | }, 687 | "version": 0.4 688 | } -------------------------------------------------------------------------------- /example_workflows/cogvideox_1_0_5b_T2V_02.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 37, 3 | "last_link_id": 72, 4 | "nodes": [ 5 | { 6 | "id": 30, 7 | "type": "CogVideoTextEncode", 8 | "pos": { 9 | "0": 500, 
10 | "1": 308 11 | }, 12 | "size": [ 13 | 470.99399664051055, 14 | 237.5088638951354 15 | ], 16 | "flags": {}, 17 | "order": 3, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "clip", 22 | "type": "CLIP", 23 | "link": 54 24 | } 25 | ], 26 | "outputs": [ 27 | { 28 | "name": "conditioning", 29 | "type": "CONDITIONING", 30 | "links": [ 31 | 67 32 | ], 33 | "slot_index": 0, 34 | "shape": 3 35 | }, 36 | { 37 | "name": "clip", 38 | "type": "CLIP", 39 | "links": [ 40 | 65 41 | ], 42 | "slot_index": 1 43 | } 44 | ], 45 | "properties": { 46 | "Node name for S&R": "CogVideoTextEncode" 47 | }, 48 | "widgets_values": [ 49 | "A golden retriever, sporting sleek black sunglasses, with its lengthy fur flowing in the breeze, sprints playfully across a rooftop terrace, recently refreshed by a light rain. The scene unfolds from a distance, the dog's energetic bounds growing larger as it approaches the camera, its tail wagging with unrestrained joy, while droplets of water glisten on the concrete behind it. The overcast sky provides a dramatic backdrop, emphasizing the vibrant golden coat of the canine as it dashes towards the viewer.\n\n", 50 | 1, 51 | false 52 | ] 53 | }, 54 | { 55 | "id": 31, 56 | "type": "CogVideoTextEncode", 57 | "pos": { 58 | "0": 503, 59 | "1": 602 60 | }, 61 | "size": [ 62 | 464.4980515341475, 63 | 169.87479027400514 64 | ], 65 | "flags": {}, 66 | "order": 4, 67 | "mode": 0, 68 | "inputs": [ 69 | { 70 | "name": "clip", 71 | "type": "CLIP", 72 | "link": 65 73 | } 74 | ], 75 | "outputs": [ 76 | { 77 | "name": "conditioning", 78 | "type": "CONDITIONING", 79 | "links": [ 80 | 68 81 | ], 82 | "slot_index": 0, 83 | "shape": 3 84 | }, 85 | { 86 | "name": "clip", 87 | "type": "CLIP", 88 | "links": null 89 | } 90 | ], 91 | "properties": { 92 | "Node name for S&R": "CogVideoTextEncode" 93 | }, 94 | "widgets_values": [ 95 | "", 96 | 1, 97 | true 98 | ] 99 | }, 100 | { 101 | "id": 11, 102 | "type": "CogVideoDecode", 103 | "pos": { 104 | "0": 1416, 105 | "1": 40 106 | }, 107 | "size": { 108 | "0": 300.396484375, 109 | "1": 198 110 | }, 111 | "flags": {}, 112 | "order": 6, 113 | "mode": 0, 114 | "inputs": [ 115 | { 116 | "name": "vae", 117 | "type": "VAE", 118 | "link": 71 119 | }, 120 | { 121 | "name": "samples", 122 | "type": "LATENT", 123 | "link": 69 124 | } 125 | ], 126 | "outputs": [ 127 | { 128 | "name": "images", 129 | "type": "IMAGE", 130 | "links": [ 131 | 59 132 | ], 133 | "slot_index": 0, 134 | "shape": 3 135 | } 136 | ], 137 | "properties": { 138 | "Node name for S&R": "CogVideoDecode" 139 | }, 140 | "widgets_values": [ 141 | false, 142 | 240, 143 | 360, 144 | 0.2, 145 | 0.2, 146 | true 147 | ] 148 | }, 149 | { 150 | "id": 36, 151 | "type": "DownloadAndLoadCogVideoModel", 152 | "pos": { 153 | "0": 645, 154 | "1": 17 155 | }, 156 | "size": { 157 | "0": 315, 158 | "1": 218 159 | }, 160 | "flags": {}, 161 | "order": 0, 162 | "mode": 0, 163 | "inputs": [ 164 | { 165 | "name": "block_edit", 166 | "type": "TRANSFORMERBLOCKS", 167 | "link": null, 168 | "shape": 7 169 | }, 170 | { 171 | "name": "lora", 172 | "type": "COGLORA", 173 | "link": null, 174 | "shape": 7 175 | }, 176 | { 177 | "name": "compile_args", 178 | "type": "COMPILEARGS", 179 | "link": null, 180 | "shape": 7 181 | } 182 | ], 183 | "outputs": [ 184 | { 185 | "name": "model", 186 | "type": "COGVIDEOMODEL", 187 | "links": [ 188 | 70 189 | ] 190 | }, 191 | { 192 | "name": "vae", 193 | "type": "VAE", 194 | "links": [ 195 | 71 196 | ], 197 | "slot_index": 1 198 | } 199 | ], 200 | "properties": { 201 | "Node name for S&R": 
"DownloadAndLoadCogVideoModel" 202 | }, 203 | "widgets_values": [ 204 | "THUDM/CogVideoX-5b", 205 | "bf16", 206 | "disabled", 207 | false, 208 | "sdpa", 209 | "main_device" 210 | ] 211 | }, 212 | { 213 | "id": 20, 214 | "type": "CLIPLoader", 215 | "pos": { 216 | "0": 5, 217 | "1": 308 218 | }, 219 | "size": { 220 | "0": 451.30548095703125, 221 | "1": 82 222 | }, 223 | "flags": {}, 224 | "order": 1, 225 | "mode": 0, 226 | "inputs": [], 227 | "outputs": [ 228 | { 229 | "name": "CLIP", 230 | "type": "CLIP", 231 | "links": [ 232 | 54 233 | ], 234 | "slot_index": 0, 235 | "shape": 3 236 | } 237 | ], 238 | "properties": { 239 | "Node name for S&R": "CLIPLoader" 240 | }, 241 | "widgets_values": [ 242 | "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", 243 | "sd3" 244 | ] 245 | }, 246 | { 247 | "id": 37, 248 | "type": "EmptyLatentImage", 249 | "pos": { 250 | "0": 643, 251 | "1": 827 252 | }, 253 | "size": { 254 | "0": 315, 255 | "1": 106 256 | }, 257 | "flags": {}, 258 | "order": 2, 259 | "mode": 0, 260 | "inputs": [], 261 | "outputs": [ 262 | { 263 | "name": "LATENT", 264 | "type": "LATENT", 265 | "links": [ 266 | 72 267 | ] 268 | } 269 | ], 270 | "properties": { 271 | "Node name for S&R": "EmptyLatentImage" 272 | }, 273 | "widgets_values": [ 274 | 720, 275 | 480, 276 | 1 277 | ] 278 | }, 279 | { 280 | "id": 35, 281 | "type": "CogVideoSampler", 282 | "pos": { 283 | "0": 1042, 284 | "1": 291 285 | }, 286 | "size": [ 287 | 330, 288 | 574 289 | ], 290 | "flags": {}, 291 | "order": 5, 292 | "mode": 0, 293 | "inputs": [ 294 | { 295 | "name": "model", 296 | "type": "COGVIDEOMODEL", 297 | "link": 70 298 | }, 299 | { 300 | "name": "positive", 301 | "type": "CONDITIONING", 302 | "link": 67 303 | }, 304 | { 305 | "name": "negative", 306 | "type": "CONDITIONING", 307 | "link": 68 308 | }, 309 | { 310 | "name": "samples", 311 | "type": "LATENT", 312 | "link": 72, 313 | "shape": 7 314 | }, 315 | { 316 | "name": "image_cond_latents", 317 | "type": "LATENT", 318 | "link": null, 319 | "shape": 7 320 | }, 321 | { 322 | "name": "context_options", 323 | "type": "COGCONTEXT", 324 | "link": null, 325 | "shape": 7 326 | }, 327 | { 328 | "name": "controlnet", 329 | "type": "COGVIDECONTROLNET", 330 | "link": null, 331 | "shape": 7 332 | }, 333 | { 334 | "name": "tora_trajectory", 335 | "type": "TORAFEATURES", 336 | "link": null, 337 | "shape": 7 338 | }, 339 | { 340 | "name": "fastercache", 341 | "type": "FASTERCACHEARGS", 342 | "link": null, 343 | "shape": 7 344 | } 345 | ], 346 | "outputs": [ 347 | { 348 | "name": "samples", 349 | "type": "LATENT", 350 | "links": [ 351 | 69 352 | ] 353 | } 354 | ], 355 | "properties": { 356 | "Node name for S&R": "CogVideoSampler" 357 | }, 358 | "widgets_values": [ 359 | 49, 360 | 50, 361 | 6, 362 | 0, 363 | "fixed", 364 | "CogVideoXDDIM", 365 | 1 366 | ] 367 | }, 368 | { 369 | "id": 33, 370 | "type": "VHS_VideoCombine", 371 | "pos": { 372 | "0": 1767, 373 | "1": 39 374 | }, 375 | "size": [ 376 | 778.7022705078125, 377 | 829.801513671875 378 | ], 379 | "flags": {}, 380 | "order": 7, 381 | "mode": 0, 382 | "inputs": [ 383 | { 384 | "name": "images", 385 | "type": "IMAGE", 386 | "link": 59 387 | }, 388 | { 389 | "name": "audio", 390 | "type": "AUDIO", 391 | "link": null, 392 | "shape": 7 393 | }, 394 | { 395 | "name": "meta_batch", 396 | "type": "VHS_BatchManager", 397 | "link": null, 398 | "shape": 7 399 | }, 400 | { 401 | "name": "vae", 402 | "type": "VAE", 403 | "link": null, 404 | "shape": 7 405 | } 406 | ], 407 | "outputs": [ 408 | { 409 | "name": "Filenames", 410 | 
"type": "VHS_FILENAMES", 411 | "links": null, 412 | "shape": 3 413 | } 414 | ], 415 | "properties": { 416 | "Node name for S&R": "VHS_VideoCombine" 417 | }, 418 | "widgets_values": { 419 | "frame_rate": 8, 420 | "loop_count": 0, 421 | "filename_prefix": "CogVideoX5B-T2V", 422 | "format": "video/h264-mp4", 423 | "pix_fmt": "yuv420p", 424 | "crf": 19, 425 | "save_metadata": true, 426 | "pingpong": false, 427 | "save_output": false, 428 | "videopreview": { 429 | "hidden": false, 430 | "paused": false, 431 | "params": { 432 | "filename": "CogVideoX5B_00001.mp4", 433 | "subfolder": "", 434 | "type": "temp", 435 | "format": "video/h264-mp4", 436 | "frame_rate": 8 437 | }, 438 | "muted": false 439 | } 440 | } 441 | } 442 | ], 443 | "links": [ 444 | [ 445 | 54, 446 | 20, 447 | 0, 448 | 30, 449 | 0, 450 | "CLIP" 451 | ], 452 | [ 453 | 59, 454 | 11, 455 | 0, 456 | 33, 457 | 0, 458 | "IMAGE" 459 | ], 460 | [ 461 | 65, 462 | 30, 463 | 1, 464 | 31, 465 | 0, 466 | "CLIP" 467 | ], 468 | [ 469 | 67, 470 | 30, 471 | 0, 472 | 35, 473 | 1, 474 | "CONDITIONING" 475 | ], 476 | [ 477 | 68, 478 | 31, 479 | 0, 480 | 35, 481 | 2, 482 | "CONDITIONING" 483 | ], 484 | [ 485 | 69, 486 | 35, 487 | 0, 488 | 11, 489 | 1, 490 | "LATENT" 491 | ], 492 | [ 493 | 70, 494 | 36, 495 | 0, 496 | 35, 497 | 0, 498 | "COGVIDEOMODEL" 499 | ], 500 | [ 501 | 71, 502 | 36, 503 | 1, 504 | 11, 505 | 0, 506 | "VAE" 507 | ], 508 | [ 509 | 72, 510 | 37, 511 | 0, 512 | 35, 513 | 3, 514 | "LATENT" 515 | ] 516 | ], 517 | "groups": [], 518 | "config": {}, 519 | "extra": { 520 | "ds": { 521 | "scale": 0.7627768444387061, 522 | "offset": [ 523 | 734.1791945221892, 524 | 237.29437844909364 525 | ] 526 | } 527 | }, 528 | "version": 0.4 529 | } -------------------------------------------------------------------------------- /example_workflows/cogvideox_1_0_5b_interpolation_02.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 68, 3 | "last_link_id": 155, 4 | "nodes": [ 5 | { 6 | "id": 31, 7 | "type": "CogVideoTextEncode", 8 | "pos": { 9 | "0": 497, 10 | "1": 520 11 | }, 12 | "size": { 13 | "0": 463.01251220703125, 14 | "1": 144 15 | }, 16 | "flags": {}, 17 | "order": 6, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "clip", 22 | "type": "CLIP", 23 | "link": 149 24 | } 25 | ], 26 | "outputs": [ 27 | { 28 | "name": "conditioning", 29 | "type": "CONDITIONING", 30 | "links": [ 31 | 146 32 | ], 33 | "slot_index": 0, 34 | "shape": 3 35 | }, 36 | { 37 | "name": "clip", 38 | "type": "CLIP", 39 | "links": null 40 | } 41 | ], 42 | "properties": { 43 | "Node name for S&R": "CogVideoTextEncode" 44 | }, 45 | "widgets_values": [ 46 | "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. 
", 47 | 1, 48 | true 49 | ] 50 | }, 51 | { 52 | "id": 63, 53 | "type": "CogVideoSampler", 54 | "pos": { 55 | "0": 1142, 56 | "1": 74 57 | }, 58 | "size": [ 59 | 330, 60 | 574 61 | ], 62 | "flags": {}, 63 | "order": 9, 64 | "mode": 0, 65 | "inputs": [ 66 | { 67 | "name": "model", 68 | "type": "COGVIDEOMODEL", 69 | "link": 144 70 | }, 71 | { 72 | "name": "positive", 73 | "type": "CONDITIONING", 74 | "link": 145 75 | }, 76 | { 77 | "name": "negative", 78 | "type": "CONDITIONING", 79 | "link": 146 80 | }, 81 | { 82 | "name": "samples", 83 | "type": "LATENT", 84 | "link": null, 85 | "shape": 7 86 | }, 87 | { 88 | "name": "image_cond_latents", 89 | "type": "LATENT", 90 | "link": 147, 91 | "shape": 7 92 | }, 93 | { 94 | "name": "context_options", 95 | "type": "COGCONTEXT", 96 | "link": null, 97 | "shape": 7 98 | }, 99 | { 100 | "name": "controlnet", 101 | "type": "COGVIDECONTROLNET", 102 | "link": null, 103 | "shape": 7 104 | }, 105 | { 106 | "name": "tora_trajectory", 107 | "type": "TORAFEATURES", 108 | "link": null, 109 | "shape": 7 110 | }, 111 | { 112 | "name": "fastercache", 113 | "type": "FASTERCACHEARGS", 114 | "link": null, 115 | "shape": 7 116 | } 117 | ], 118 | "outputs": [ 119 | { 120 | "name": "samples", 121 | "type": "LATENT", 122 | "links": [ 123 | 148 124 | ] 125 | } 126 | ], 127 | "properties": { 128 | "Node name for S&R": "CogVideoSampler" 129 | }, 130 | "widgets_values": [ 131 | 49, 132 | 25, 133 | 6, 134 | 0, 135 | "fixed", 136 | "CogVideoXDDIM", 137 | 1 138 | ] 139 | }, 140 | { 141 | "id": 30, 142 | "type": "CogVideoTextEncode", 143 | "pos": { 144 | "0": 493, 145 | "1": 303 146 | }, 147 | "size": { 148 | "0": 471.90142822265625, 149 | "1": 168.08047485351562 150 | }, 151 | "flags": {}, 152 | "order": 4, 153 | "mode": 0, 154 | "inputs": [ 155 | { 156 | "name": "clip", 157 | "type": "CLIP", 158 | "link": 54 159 | } 160 | ], 161 | "outputs": [ 162 | { 163 | "name": "conditioning", 164 | "type": "CONDITIONING", 165 | "links": [ 166 | 145 167 | ], 168 | "slot_index": 0, 169 | "shape": 3 170 | }, 171 | { 172 | "name": "clip", 173 | "type": "CLIP", 174 | "links": [ 175 | 149 176 | ], 177 | "slot_index": 1 178 | } 179 | ], 180 | "properties": { 181 | "Node name for S&R": "CogVideoTextEncode" 182 | }, 183 | "widgets_values": [ 184 | "a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees", 185 | 1, 186 | false 187 | ] 188 | }, 189 | { 190 | "id": 20, 191 | "type": "CLIPLoader", 192 | "pos": { 193 | "0": -2, 194 | "1": 304 195 | }, 196 | "size": { 197 | "0": 451.30548095703125, 198 | "1": 82 199 | }, 200 | "flags": {}, 201 | "order": 0, 202 | "mode": 0, 203 | "inputs": [], 204 | "outputs": [ 205 | { 206 | "name": "CLIP", 207 | "type": "CLIP", 208 | "links": [ 209 | 54 210 | ], 211 | "slot_index": 0, 212 | "shape": 3 213 | } 214 | ], 215 | "properties": { 216 | "Node name for S&R": "CLIPLoader" 217 | }, 218 | "widgets_values": [ 219 | "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", 220 | "sd3" 221 | ] 222 | }, 223 | { 224 | "id": 36, 225 | "type": "LoadImage", 226 | "pos": { 227 | "0": 105, 228 | "1": 732 229 | }, 230 | "size": { 231 | "0": 402.06353759765625, 232 | "1": 396.6225891113281 233 | }, 234 | "flags": {}, 235 | "order": 1, 236 | "mode": 0, 237 | "inputs": [], 238 | "outputs": [ 239 | { 240 | "name": "IMAGE", 241 | "type": "IMAGE", 242 | "links": [ 243 | 71 244 | ], 245 | "slot_index": 0, 246 | "shape": 3 247 | }, 248 | { 249 | "name": "MASK", 250 | "type": "MASK", 251 | "links": null, 252 | "shape": 3 253 | } 254 | ], 
255 | "properties": { 256 | "Node name for S&R": "LoadImage" 257 | }, 258 | "widgets_values": [ 259 | "sd3stag.png", 260 | "image" 261 | ] 262 | }, 263 | { 264 | "id": 64, 265 | "type": "LoadImage", 266 | "pos": { 267 | "0": 105, 268 | "1": 1189 269 | }, 270 | "size": { 271 | "0": 402.06353759765625, 272 | "1": 396.6225891113281 273 | }, 274 | "flags": {}, 275 | "order": 2, 276 | "mode": 0, 277 | "inputs": [], 278 | "outputs": [ 279 | { 280 | "name": "IMAGE", 281 | "type": "IMAGE", 282 | "links": [ 283 | 151 284 | ], 285 | "slot_index": 0, 286 | "shape": 3 287 | }, 288 | { 289 | "name": "MASK", 290 | "type": "MASK", 291 | "links": null, 292 | "shape": 3 293 | } 294 | ], 295 | "properties": { 296 | "Node name for S&R": "LoadImage" 297 | }, 298 | "widgets_values": [ 299 | "sd3stag.png", 300 | "image" 301 | ] 302 | }, 303 | { 304 | "id": 65, 305 | "type": "ImageResizeKJ", 306 | "pos": { 307 | "0": 607, 308 | "1": 1188 309 | }, 310 | "size": [ 311 | 315, 312 | 266 313 | ], 314 | "flags": {}, 315 | "order": 7, 316 | "mode": 0, 317 | "inputs": [ 318 | { 319 | "name": "image", 320 | "type": "IMAGE", 321 | "link": 151 322 | }, 323 | { 324 | "name": "get_image_size", 325 | "type": "IMAGE", 326 | "link": null, 327 | "shape": 7 328 | }, 329 | { 330 | "name": "width_input", 331 | "type": "INT", 332 | "link": null, 333 | "widget": { 334 | "name": "width_input" 335 | }, 336 | "shape": 7 337 | }, 338 | { 339 | "name": "height_input", 340 | "type": "INT", 341 | "link": null, 342 | "widget": { 343 | "name": "height_input" 344 | }, 345 | "shape": 7 346 | }, 347 | { 348 | "name": "width", 349 | "type": "INT", 350 | "link": 152, 351 | "widget": { 352 | "name": "width" 353 | } 354 | }, 355 | { 356 | "name": "height", 357 | "type": "INT", 358 | "link": 153, 359 | "widget": { 360 | "name": "height" 361 | } 362 | } 363 | ], 364 | "outputs": [ 365 | { 366 | "name": "IMAGE", 367 | "type": "IMAGE", 368 | "links": [ 369 | 155 370 | ], 371 | "slot_index": 0, 372 | "shape": 3 373 | }, 374 | { 375 | "name": "width", 376 | "type": "INT", 377 | "links": null, 378 | "shape": 3 379 | }, 380 | { 381 | "name": "height", 382 | "type": "INT", 383 | "links": null, 384 | "shape": 3 385 | } 386 | ], 387 | "properties": { 388 | "Node name for S&R": "ImageResizeKJ" 389 | }, 390 | "widgets_values": [ 391 | 720, 392 | 480, 393 | "lanczos", 394 | false, 395 | 16, 396 | 0, 397 | 0, 398 | "disabled" 399 | ] 400 | }, 401 | { 402 | "id": 37, 403 | "type": "ImageResizeKJ", 404 | "pos": { 405 | "0": 593, 406 | "1": 731 407 | }, 408 | "size": { 409 | "0": 315, 410 | "1": 266 411 | }, 412 | "flags": {}, 413 | "order": 5, 414 | "mode": 0, 415 | "inputs": [ 416 | { 417 | "name": "image", 418 | "type": "IMAGE", 419 | "link": 71 420 | }, 421 | { 422 | "name": "get_image_size", 423 | "type": "IMAGE", 424 | "link": null, 425 | "shape": 7 426 | }, 427 | { 428 | "name": "width_input", 429 | "type": "INT", 430 | "link": null, 431 | "widget": { 432 | "name": "width_input" 433 | } 434 | }, 435 | { 436 | "name": "height_input", 437 | "type": "INT", 438 | "link": null, 439 | "widget": { 440 | "name": "height_input" 441 | } 442 | } 443 | ], 444 | "outputs": [ 445 | { 446 | "name": "IMAGE", 447 | "type": "IMAGE", 448 | "links": [ 449 | 142 450 | ], 451 | "slot_index": 0, 452 | "shape": 3 453 | }, 454 | { 455 | "name": "width", 456 | "type": "INT", 457 | "links": [ 458 | 152 459 | ], 460 | "shape": 3, 461 | "slot_index": 1 462 | }, 463 | { 464 | "name": "height", 465 | "type": "INT", 466 | "links": [ 467 | 153 468 | ], 469 | "shape": 3, 470 | "slot_index": 2 
471 | } 472 | ], 473 | "properties": { 474 | "Node name for S&R": "ImageResizeKJ" 475 | }, 476 | "widgets_values": [ 477 | 720, 478 | 480, 479 | "lanczos", 480 | false, 481 | 16, 482 | 0, 483 | 0, 484 | "disabled" 485 | ] 486 | }, 487 | { 488 | "id": 60, 489 | "type": "CogVideoDecode", 490 | "pos": { 491 | "0": 1526, 492 | "1": -4 493 | }, 494 | "size": { 495 | "0": 315, 496 | "1": 198 497 | }, 498 | "flags": {}, 499 | "order": 10, 500 | "mode": 0, 501 | "inputs": [ 502 | { 503 | "name": "vae", 504 | "type": "VAE", 505 | "link": 132 506 | }, 507 | { 508 | "name": "samples", 509 | "type": "LATENT", 510 | "link": 148 511 | } 512 | ], 513 | "outputs": [ 514 | { 515 | "name": "images", 516 | "type": "IMAGE", 517 | "links": [ 518 | 134 519 | ] 520 | } 521 | ], 522 | "properties": { 523 | "Node name for S&R": "CogVideoDecode" 524 | }, 525 | "widgets_values": [ 526 | true, 527 | 240, 528 | 360, 529 | 0.2, 530 | 0.2, 531 | true 532 | ] 533 | }, 534 | { 535 | "id": 62, 536 | "type": "CogVideoImageEncode", 537 | "pos": { 538 | "0": 1152, 539 | "1": 706 540 | }, 541 | "size": { 542 | "0": 315, 543 | "1": 122 544 | }, 545 | "flags": {}, 546 | "order": 8, 547 | "mode": 0, 548 | "inputs": [ 549 | { 550 | "name": "vae", 551 | "type": "VAE", 552 | "link": 141 553 | }, 554 | { 555 | "name": "start_image", 556 | "type": "IMAGE", 557 | "link": 142 558 | }, 559 | { 560 | "name": "end_image", 561 | "type": "IMAGE", 562 | "link": 155, 563 | "shape": 7 564 | } 565 | ], 566 | "outputs": [ 567 | { 568 | "name": "samples", 569 | "type": "LATENT", 570 | "links": [ 571 | 147 572 | ] 573 | } 574 | ], 575 | "properties": { 576 | "Node name for S&R": "CogVideoImageEncode" 577 | }, 578 | "widgets_values": [ 579 | false, 580 | 0 581 | ] 582 | }, 583 | { 584 | "id": 44, 585 | "type": "VHS_VideoCombine", 586 | "pos": { 587 | "0": 1884, 588 | "1": -3 589 | }, 590 | "size": [ 591 | 605.3909912109375, 592 | 714.2606608072917 593 | ], 594 | "flags": {}, 595 | "order": 11, 596 | "mode": 0, 597 | "inputs": [ 598 | { 599 | "name": "images", 600 | "type": "IMAGE", 601 | "link": 134 602 | }, 603 | { 604 | "name": "audio", 605 | "type": "AUDIO", 606 | "link": null, 607 | "shape": 7 608 | }, 609 | { 610 | "name": "meta_batch", 611 | "type": "VHS_BatchManager", 612 | "link": null, 613 | "shape": 7 614 | }, 615 | { 616 | "name": "vae", 617 | "type": "VAE", 618 | "link": null, 619 | "shape": 7 620 | } 621 | ], 622 | "outputs": [ 623 | { 624 | "name": "Filenames", 625 | "type": "VHS_FILENAMES", 626 | "links": null, 627 | "shape": 3 628 | } 629 | ], 630 | "properties": { 631 | "Node name for S&R": "VHS_VideoCombine" 632 | }, 633 | "widgets_values": { 634 | "frame_rate": 8, 635 | "loop_count": 0, 636 | "filename_prefix": "CogVideoX-Interpolation", 637 | "format": "video/h264-mp4", 638 | "pix_fmt": "yuv420p", 639 | "crf": 19, 640 | "save_metadata": true, 641 | "pingpong": false, 642 | "save_output": true, 643 | "videopreview": { 644 | "hidden": false, 645 | "paused": false, 646 | "params": { 647 | "filename": "CogVideoX-I2V_00003.mp4", 648 | "subfolder": "", 649 | "type": "temp", 650 | "format": "video/h264-mp4", 651 | "frame_rate": 8 652 | }, 653 | "muted": false 654 | } 655 | } 656 | }, 657 | { 658 | "id": 59, 659 | "type": "DownloadAndLoadCogVideoModel", 660 | "pos": { 661 | "0": 622, 662 | "1": -25 663 | }, 664 | "size": [ 665 | 347.24594407027485, 666 | 218 667 | ], 668 | "flags": {}, 669 | "order": 3, 670 | "mode": 0, 671 | "inputs": [ 672 | { 673 | "name": "block_edit", 674 | "type": "TRANSFORMERBLOCKS", 675 | "link": null, 676 | 
"shape": 7 677 | }, 678 | { 679 | "name": "lora", 680 | "type": "COGLORA", 681 | "link": null, 682 | "shape": 7 683 | }, 684 | { 685 | "name": "compile_args", 686 | "type": "COMPILEARGS", 687 | "link": null, 688 | "shape": 7 689 | } 690 | ], 691 | "outputs": [ 692 | { 693 | "name": "model", 694 | "type": "COGVIDEOMODEL", 695 | "links": [ 696 | 144 697 | ] 698 | }, 699 | { 700 | "name": "vae", 701 | "type": "VAE", 702 | "links": [ 703 | 132, 704 | 141 705 | ], 706 | "slot_index": 1 707 | } 708 | ], 709 | "properties": { 710 | "Node name for S&R": "DownloadAndLoadCogVideoModel" 711 | }, 712 | "widgets_values": [ 713 | "feizhengcong/CogvideoX-Interpolation", 714 | "bf16", 715 | "disabled", 716 | false, 717 | "sdpa", 718 | "main_device" 719 | ] 720 | } 721 | ], 722 | "links": [ 723 | [ 724 | 54, 725 | 20, 726 | 0, 727 | 30, 728 | 0, 729 | "CLIP" 730 | ], 731 | [ 732 | 71, 733 | 36, 734 | 0, 735 | 37, 736 | 0, 737 | "IMAGE" 738 | ], 739 | [ 740 | 132, 741 | 59, 742 | 1, 743 | 60, 744 | 0, 745 | "VAE" 746 | ], 747 | [ 748 | 134, 749 | 60, 750 | 0, 751 | 44, 752 | 0, 753 | "IMAGE" 754 | ], 755 | [ 756 | 141, 757 | 59, 758 | 1, 759 | 62, 760 | 0, 761 | "VAE" 762 | ], 763 | [ 764 | 142, 765 | 37, 766 | 0, 767 | 62, 768 | 1, 769 | "IMAGE" 770 | ], 771 | [ 772 | 144, 773 | 59, 774 | 0, 775 | 63, 776 | 0, 777 | "COGVIDEOMODEL" 778 | ], 779 | [ 780 | 145, 781 | 30, 782 | 0, 783 | 63, 784 | 1, 785 | "CONDITIONING" 786 | ], 787 | [ 788 | 146, 789 | 31, 790 | 0, 791 | 63, 792 | 2, 793 | "CONDITIONING" 794 | ], 795 | [ 796 | 147, 797 | 62, 798 | 0, 799 | 63, 800 | 4, 801 | "LATENT" 802 | ], 803 | [ 804 | 148, 805 | 63, 806 | 0, 807 | 60, 808 | 1, 809 | "LATENT" 810 | ], 811 | [ 812 | 149, 813 | 30, 814 | 1, 815 | 31, 816 | 0, 817 | "CLIP" 818 | ], 819 | [ 820 | 151, 821 | 64, 822 | 0, 823 | 65, 824 | 0, 825 | "IMAGE" 826 | ], 827 | [ 828 | 152, 829 | 37, 830 | 1, 831 | 65, 832 | 4, 833 | "INT" 834 | ], 835 | [ 836 | 153, 837 | 37, 838 | 2, 839 | 65, 840 | 5, 841 | "INT" 842 | ], 843 | [ 844 | 155, 845 | 65, 846 | 0, 847 | 62, 848 | 2, 849 | "IMAGE" 850 | ] 851 | ], 852 | "groups": [], 853 | "config": {}, 854 | "extra": { 855 | "ds": { 856 | "scale": 0.7627768444387061, 857 | "offset": [ 858 | 630.1733472923837, 859 | 148.14641794691272 860 | ] 861 | } 862 | }, 863 | "version": 0.4 864 | } -------------------------------------------------------------------------------- /example_workflows/cogvideox_1_5_5b_I2V_01.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 64, 3 | "last_link_id": 149, 4 | "nodes": [ 5 | { 6 | "id": 63, 7 | "type": "CogVideoSampler", 8 | "pos": { 9 | "0": 1142, 10 | "1": 74 11 | }, 12 | "size": { 13 | "0": 330, 14 | "1": 574 15 | }, 16 | "flags": {}, 17 | "order": 7, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "model", 22 | "type": "COGVIDEOMODEL", 23 | "link": 144 24 | }, 25 | { 26 | "name": "positive", 27 | "type": "CONDITIONING", 28 | "link": 145 29 | }, 30 | { 31 | "name": "negative", 32 | "type": "CONDITIONING", 33 | "link": 146 34 | }, 35 | { 36 | "name": "samples", 37 | "type": "LATENT", 38 | "link": null, 39 | "shape": 7 40 | }, 41 | { 42 | "name": "image_cond_latents", 43 | "type": "LATENT", 44 | "link": 147, 45 | "shape": 7 46 | }, 47 | { 48 | "name": "context_options", 49 | "type": "COGCONTEXT", 50 | "link": null, 51 | "shape": 7 52 | }, 53 | { 54 | "name": "controlnet", 55 | "type": "COGVIDECONTROLNET", 56 | "link": null, 57 | "shape": 7 58 | }, 59 | { 60 | "name": "tora_trajectory", 61 | "type": 
"TORAFEATURES", 62 | "link": null, 63 | "shape": 7 64 | }, 65 | { 66 | "name": "fastercache", 67 | "type": "FASTERCACHEARGS", 68 | "link": null, 69 | "shape": 7 70 | } 71 | ], 72 | "outputs": [ 73 | { 74 | "name": "samples", 75 | "type": "LATENT", 76 | "links": [ 77 | 148 78 | ] 79 | } 80 | ], 81 | "properties": { 82 | "Node name for S&R": "CogVideoSampler" 83 | }, 84 | "widgets_values": [ 85 | 49, 86 | 25, 87 | 6, 88 | 0, 89 | "fixed", 90 | "CogVideoXDDIM", 91 | 1 92 | ] 93 | }, 94 | { 95 | "id": 62, 96 | "type": "CogVideoImageEncode", 97 | "pos": { 98 | "0": 1149, 99 | "1": 711 100 | }, 101 | "size": { 102 | "0": 315, 103 | "1": 122 104 | }, 105 | "flags": {}, 106 | "order": 5, 107 | "mode": 0, 108 | "inputs": [ 109 | { 110 | "name": "vae", 111 | "type": "VAE", 112 | "link": 141 113 | }, 114 | { 115 | "name": "start_image", 116 | "type": "IMAGE", 117 | "link": 142 118 | }, 119 | { 120 | "name": "end_image", 121 | "type": "IMAGE", 122 | "link": null, 123 | "shape": 7 124 | } 125 | ], 126 | "outputs": [ 127 | { 128 | "name": "samples", 129 | "type": "LATENT", 130 | "links": [ 131 | 147 132 | ] 133 | } 134 | ], 135 | "properties": { 136 | "Node name for S&R": "CogVideoImageEncode" 137 | }, 138 | "widgets_values": [ 139 | false, 140 | 0 141 | ] 142 | }, 143 | { 144 | "id": 30, 145 | "type": "CogVideoTextEncode", 146 | "pos": { 147 | "0": 493, 148 | "1": 303 149 | }, 150 | "size": { 151 | "0": 471.90142822265625, 152 | "1": 168.08047485351562 153 | }, 154 | "flags": {}, 155 | "order": 4, 156 | "mode": 0, 157 | "inputs": [ 158 | { 159 | "name": "clip", 160 | "type": "CLIP", 161 | "link": 54 162 | } 163 | ], 164 | "outputs": [ 165 | { 166 | "name": "conditioning", 167 | "type": "CONDITIONING", 168 | "links": [ 169 | 145 170 | ], 171 | "slot_index": 0, 172 | "shape": 3 173 | }, 174 | { 175 | "name": "clip", 176 | "type": "CLIP", 177 | "links": [ 178 | 149 179 | ], 180 | "slot_index": 1 181 | } 182 | ], 183 | "properties": { 184 | "Node name for S&R": "CogVideoTextEncode" 185 | }, 186 | "widgets_values": [ 187 | "a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees", 188 | 1, 189 | false 190 | ] 191 | }, 192 | { 193 | "id": 36, 194 | "type": "LoadImage", 195 | "pos": { 196 | "0": 335, 197 | "1": 731 198 | }, 199 | "size": { 200 | "0": 402.06353759765625, 201 | "1": 396.6225891113281 202 | }, 203 | "flags": {}, 204 | "order": 0, 205 | "mode": 0, 206 | "inputs": [], 207 | "outputs": [ 208 | { 209 | "name": "IMAGE", 210 | "type": "IMAGE", 211 | "links": [ 212 | 71 213 | ], 214 | "slot_index": 0, 215 | "shape": 3 216 | }, 217 | { 218 | "name": "MASK", 219 | "type": "MASK", 220 | "links": null, 221 | "shape": 3 222 | } 223 | ], 224 | "properties": { 225 | "Node name for S&R": "LoadImage" 226 | }, 227 | "widgets_values": [ 228 | "sd3stag.png", 229 | "image" 230 | ] 231 | }, 232 | { 233 | "id": 20, 234 | "type": "CLIPLoader", 235 | "pos": { 236 | "0": -2, 237 | "1": 304 238 | }, 239 | "size": { 240 | "0": 451.30548095703125, 241 | "1": 82 242 | }, 243 | "flags": {}, 244 | "order": 1, 245 | "mode": 0, 246 | "inputs": [], 247 | "outputs": [ 248 | { 249 | "name": "CLIP", 250 | "type": "CLIP", 251 | "links": [ 252 | 54 253 | ], 254 | "slot_index": 0, 255 | "shape": 3 256 | } 257 | ], 258 | "properties": { 259 | "Node name for S&R": "CLIPLoader" 260 | }, 261 | "widgets_values": [ 262 | "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", 263 | "sd3" 264 | ] 265 | }, 266 | { 267 | "id": 60, 268 | "type": "CogVideoDecode", 269 | "pos": { 270 | "0": 1523, 
271 | "1": -6 272 | }, 273 | "size": { 274 | "0": 315, 275 | "1": 198 276 | }, 277 | "flags": {}, 278 | "order": 8, 279 | "mode": 0, 280 | "inputs": [ 281 | { 282 | "name": "vae", 283 | "type": "VAE", 284 | "link": 132 285 | }, 286 | { 287 | "name": "samples", 288 | "type": "LATENT", 289 | "link": 148 290 | } 291 | ], 292 | "outputs": [ 293 | { 294 | "name": "images", 295 | "type": "IMAGE", 296 | "links": [ 297 | 134 298 | ] 299 | } 300 | ], 301 | "properties": { 302 | "Node name for S&R": "CogVideoDecode" 303 | }, 304 | "widgets_values": [ 305 | true, 306 | 240, 307 | 360, 308 | 0.2, 309 | 0.2, 310 | true 311 | ] 312 | }, 313 | { 314 | "id": 37, 315 | "type": "ImageResizeKJ", 316 | "pos": { 317 | "0": 784, 318 | "1": 731 319 | }, 320 | "size": { 321 | "0": 315, 322 | "1": 266 323 | }, 324 | "flags": {}, 325 | "order": 3, 326 | "mode": 0, 327 | "inputs": [ 328 | { 329 | "name": "image", 330 | "type": "IMAGE", 331 | "link": 71 332 | }, 333 | { 334 | "name": "get_image_size", 335 | "type": "IMAGE", 336 | "link": null, 337 | "shape": 7 338 | }, 339 | { 340 | "name": "width_input", 341 | "type": "INT", 342 | "link": null, 343 | "widget": { 344 | "name": "width_input" 345 | } 346 | }, 347 | { 348 | "name": "height_input", 349 | "type": "INT", 350 | "link": null, 351 | "widget": { 352 | "name": "height_input" 353 | } 354 | } 355 | ], 356 | "outputs": [ 357 | { 358 | "name": "IMAGE", 359 | "type": "IMAGE", 360 | "links": [ 361 | 142 362 | ], 363 | "slot_index": 0, 364 | "shape": 3 365 | }, 366 | { 367 | "name": "width", 368 | "type": "INT", 369 | "links": null, 370 | "shape": 3 371 | }, 372 | { 373 | "name": "height", 374 | "type": "INT", 375 | "links": null, 376 | "shape": 3 377 | } 378 | ], 379 | "properties": { 380 | "Node name for S&R": "ImageResizeKJ" 381 | }, 382 | "widgets_values": [ 383 | 1360, 384 | 768, 385 | "lanczos", 386 | false, 387 | 16, 388 | 0, 389 | 0, 390 | "disabled" 391 | ] 392 | }, 393 | { 394 | "id": 31, 395 | "type": "CogVideoTextEncode", 396 | "pos": { 397 | "0": 497, 398 | "1": 520 399 | }, 400 | "size": { 401 | "0": 463.01251220703125, 402 | "1": 144 403 | }, 404 | "flags": {}, 405 | "order": 6, 406 | "mode": 0, 407 | "inputs": [ 408 | { 409 | "name": "clip", 410 | "type": "CLIP", 411 | "link": 149 412 | } 413 | ], 414 | "outputs": [ 415 | { 416 | "name": "conditioning", 417 | "type": "CONDITIONING", 418 | "links": [ 419 | 146 420 | ], 421 | "slot_index": 0, 422 | "shape": 3 423 | }, 424 | { 425 | "name": "clip", 426 | "type": "CLIP", 427 | "links": null 428 | } 429 | ], 430 | "properties": { 431 | "Node name for S&R": "CogVideoTextEncode" 432 | }, 433 | "widgets_values": [ 434 | "", 435 | 1, 436 | true 437 | ] 438 | }, 439 | { 440 | "id": 59, 441 | "type": "DownloadAndLoadCogVideoModel", 442 | "pos": { 443 | "0": 622, 444 | "1": -25 445 | }, 446 | "size": { 447 | "0": 315, 448 | "1": 218 449 | }, 450 | "flags": {}, 451 | "order": 2, 452 | "mode": 0, 453 | "inputs": [ 454 | { 455 | "name": "block_edit", 456 | "type": "TRANSFORMERBLOCKS", 457 | "link": null, 458 | "shape": 7 459 | }, 460 | { 461 | "name": "lora", 462 | "type": "COGLORA", 463 | "link": null, 464 | "shape": 7 465 | }, 466 | { 467 | "name": "compile_args", 468 | "type": "COMPILEARGS", 469 | "link": null, 470 | "shape": 7 471 | } 472 | ], 473 | "outputs": [ 474 | { 475 | "name": "model", 476 | "type": "COGVIDEOMODEL", 477 | "links": [ 478 | 144 479 | ] 480 | }, 481 | { 482 | "name": "vae", 483 | "type": "VAE", 484 | "links": [ 485 | 132, 486 | 141 487 | ], 488 | "slot_index": 1 489 | } 490 | ], 491 | 
"properties": { 492 | "Node name for S&R": "DownloadAndLoadCogVideoModel" 493 | }, 494 | "widgets_values": [ 495 | "kijai/CogVideoX-5b-1.5-I2V", 496 | "bf16", 497 | "disabled", 498 | false, 499 | "sdpa", 500 | "main_device" 501 | ] 502 | }, 503 | { 504 | "id": 44, 505 | "type": "VHS_VideoCombine", 506 | "pos": { 507 | "0": 1884, 508 | "1": -6 509 | }, 510 | "size": [ 511 | 605.3909912109375, 512 | 310 513 | ], 514 | "flags": {}, 515 | "order": 9, 516 | "mode": 0, 517 | "inputs": [ 518 | { 519 | "name": "images", 520 | "type": "IMAGE", 521 | "link": 134 522 | }, 523 | { 524 | "name": "audio", 525 | "type": "AUDIO", 526 | "link": null, 527 | "shape": 7 528 | }, 529 | { 530 | "name": "meta_batch", 531 | "type": "VHS_BatchManager", 532 | "link": null, 533 | "shape": 7 534 | }, 535 | { 536 | "name": "vae", 537 | "type": "VAE", 538 | "link": null, 539 | "shape": 7 540 | } 541 | ], 542 | "outputs": [ 543 | { 544 | "name": "Filenames", 545 | "type": "VHS_FILENAMES", 546 | "links": null, 547 | "shape": 3 548 | } 549 | ], 550 | "properties": { 551 | "Node name for S&R": "VHS_VideoCombine" 552 | }, 553 | "widgets_values": { 554 | "frame_rate": 16, 555 | "loop_count": 0, 556 | "filename_prefix": "CogVideoX_1_5_I2V", 557 | "format": "video/h264-mp4", 558 | "pix_fmt": "yuv420p", 559 | "crf": 19, 560 | "save_metadata": true, 561 | "pingpong": false, 562 | "save_output": true, 563 | "videopreview": { 564 | "hidden": false, 565 | "paused": false, 566 | "params": { 567 | "filename": "CogVideoX-I2V_00004.mp4", 568 | "subfolder": "", 569 | "type": "temp", 570 | "format": "video/h264-mp4", 571 | "frame_rate": 8 572 | }, 573 | "muted": false 574 | } 575 | } 576 | } 577 | ], 578 | "links": [ 579 | [ 580 | 54, 581 | 20, 582 | 0, 583 | 30, 584 | 0, 585 | "CLIP" 586 | ], 587 | [ 588 | 71, 589 | 36, 590 | 0, 591 | 37, 592 | 0, 593 | "IMAGE" 594 | ], 595 | [ 596 | 132, 597 | 59, 598 | 1, 599 | 60, 600 | 0, 601 | "VAE" 602 | ], 603 | [ 604 | 134, 605 | 60, 606 | 0, 607 | 44, 608 | 0, 609 | "IMAGE" 610 | ], 611 | [ 612 | 141, 613 | 59, 614 | 1, 615 | 62, 616 | 0, 617 | "VAE" 618 | ], 619 | [ 620 | 142, 621 | 37, 622 | 0, 623 | 62, 624 | 1, 625 | "IMAGE" 626 | ], 627 | [ 628 | 144, 629 | 59, 630 | 0, 631 | 63, 632 | 0, 633 | "COGVIDEOMODEL" 634 | ], 635 | [ 636 | 145, 637 | 30, 638 | 0, 639 | 63, 640 | 1, 641 | "CONDITIONING" 642 | ], 643 | [ 644 | 146, 645 | 31, 646 | 0, 647 | 63, 648 | 2, 649 | "CONDITIONING" 650 | ], 651 | [ 652 | 147, 653 | 62, 654 | 0, 655 | 63, 656 | 4, 657 | "LATENT" 658 | ], 659 | [ 660 | 148, 661 | 63, 662 | 0, 663 | 60, 664 | 1, 665 | "LATENT" 666 | ], 667 | [ 668 | 149, 669 | 30, 670 | 1, 671 | 31, 672 | 0, 673 | "CLIP" 674 | ] 675 | ], 676 | "groups": [], 677 | "config": {}, 678 | "extra": { 679 | "ds": { 680 | "scale": 0.7627768444387097, 681 | "offset": [ 682 | 716.7143770104391, 683 | 291.75859557289965 684 | ] 685 | } 686 | }, 687 | "version": 0.4 688 | } -------------------------------------------------------------------------------- /example_workflows/cogvideox_Fun_I2V_02.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 51, 3 | "last_link_id": 123, 4 | "nodes": [ 5 | { 6 | "id": 48, 7 | "type": "CogVideoSampler", 8 | "pos": { 9 | "0": 1200, 10 | "1": 124 11 | }, 12 | "size": [ 13 | 330, 14 | 574 15 | ], 16 | "flags": {}, 17 | "order": 7, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "model", 22 | "type": "COGVIDEOMODEL", 23 | "link": 114 24 | }, 25 | { 26 | "name": "positive", 27 | "type": "CONDITIONING", 28 | 
"link": 116 29 | }, 30 | { 31 | "name": "negative", 32 | "type": "CONDITIONING", 33 | "link": 117 34 | }, 35 | { 36 | "name": "samples", 37 | "type": "LATENT", 38 | "link": null, 39 | "shape": 7 40 | }, 41 | { 42 | "name": "image_cond_latents", 43 | "type": "LATENT", 44 | "link": 120, 45 | "shape": 7 46 | }, 47 | { 48 | "name": "context_options", 49 | "type": "COGCONTEXT", 50 | "link": null, 51 | "shape": 7 52 | }, 53 | { 54 | "name": "controlnet", 55 | "type": "COGVIDECONTROLNET", 56 | "link": null, 57 | "shape": 7 58 | }, 59 | { 60 | "name": "tora_trajectory", 61 | "type": "TORAFEATURES", 62 | "link": null, 63 | "shape": 7 64 | }, 65 | { 66 | "name": "fastercache", 67 | "type": "FASTERCACHEARGS", 68 | "link": null, 69 | "shape": 7 70 | } 71 | ], 72 | "outputs": [ 73 | { 74 | "name": "samples", 75 | "type": "LATENT", 76 | "links": [ 77 | 123 78 | ], 79 | "slot_index": 0 80 | } 81 | ], 82 | "properties": { 83 | "Node name for S&R": "CogVideoSampler" 84 | }, 85 | "widgets_values": [ 86 | 49, 87 | 25, 88 | 6, 89 | 458091243358272, 90 | "randomize", 91 | "CogVideoXDDIM", 92 | 1 93 | ] 94 | }, 95 | { 96 | "id": 30, 97 | "type": "CogVideoTextEncode", 98 | "pos": { 99 | "0": 490, 100 | "1": 146 101 | }, 102 | "size": { 103 | "0": 471.90142822265625, 104 | "1": 168.08047485351562 105 | }, 106 | "flags": {}, 107 | "order": 3, 108 | "mode": 0, 109 | "inputs": [ 110 | { 111 | "name": "clip", 112 | "type": "CLIP", 113 | "link": 54 114 | } 115 | ], 116 | "outputs": [ 117 | { 118 | "name": "conditioning", 119 | "type": "CONDITIONING", 120 | "links": [ 121 | 116 122 | ], 123 | "slot_index": 0, 124 | "shape": 3 125 | }, 126 | { 127 | "name": "clip", 128 | "type": "CLIP", 129 | "links": [ 130 | 110 131 | ], 132 | "slot_index": 1 133 | } 134 | ], 135 | "properties": { 136 | "Node name for S&R": "CogVideoTextEncode" 137 | }, 138 | "widgets_values": [ 139 | "fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.", 140 | 1, 141 | false 142 | ] 143 | }, 144 | { 145 | "id": 31, 146 | "type": "CogVideoTextEncode", 147 | "pos": { 148 | "0": 497, 149 | "1": 365 150 | }, 151 | "size": { 152 | "0": 463.01251220703125, 153 | "1": 144 154 | }, 155 | "flags": {}, 156 | "order": 5, 157 | "mode": 0, 158 | "inputs": [ 159 | { 160 | "name": "clip", 161 | "type": "CLIP", 162 | "link": 110 163 | } 164 | ], 165 | "outputs": [ 166 | { 167 | "name": "conditioning", 168 | "type": "CONDITIONING", 169 | "links": [ 170 | 117 171 | ], 172 | "slot_index": 0, 173 | "shape": 3 174 | }, 175 | { 176 | "name": "clip", 177 | "type": "CLIP", 178 | "links": null 179 | } 180 | ], 181 | "properties": { 182 | "Node name for S&R": "CogVideoTextEncode" 183 | }, 184 | "widgets_values": [ 185 | "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. 
", 186 | 1, 187 | true 188 | ] 189 | }, 190 | { 191 | "id": 20, 192 | "type": "CLIPLoader", 193 | "pos": { 194 | "0": -7, 195 | "1": -37 196 | }, 197 | "size": { 198 | "0": 451.30548095703125, 199 | "1": 82 200 | }, 201 | "flags": {}, 202 | "order": 0, 203 | "mode": 0, 204 | "inputs": [], 205 | "outputs": [ 206 | { 207 | "name": "CLIP", 208 | "type": "CLIP", 209 | "links": [ 210 | 54 211 | ], 212 | "slot_index": 0, 213 | "shape": 3 214 | } 215 | ], 216 | "properties": { 217 | "Node name for S&R": "CLIPLoader" 218 | }, 219 | "widgets_values": [ 220 | "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", 221 | "sd3" 222 | ] 223 | }, 224 | { 225 | "id": 50, 226 | "type": "CogVideoImageEncodeFunInP", 227 | "pos": { 228 | "0": 865, 229 | "1": 567 230 | }, 231 | "size": [ 232 | 253.60000610351562, 233 | 146 234 | ], 235 | "flags": {}, 236 | "order": 6, 237 | "mode": 0, 238 | "inputs": [ 239 | { 240 | "name": "vae", 241 | "type": "VAE", 242 | "link": 119 243 | }, 244 | { 245 | "name": "start_image", 246 | "type": "IMAGE", 247 | "link": 118 248 | }, 249 | { 250 | "name": "end_image", 251 | "type": "IMAGE", 252 | "link": null, 253 | "shape": 7 254 | } 255 | ], 256 | "outputs": [ 257 | { 258 | "name": "image_cond_latents", 259 | "type": "LATENT", 260 | "links": [ 261 | 120 262 | ], 263 | "slot_index": 0 264 | } 265 | ], 266 | "properties": { 267 | "Node name for S&R": "CogVideoImageEncodeFunInP" 268 | }, 269 | "widgets_values": [ 270 | 49, 271 | true, 272 | 0 273 | ] 274 | }, 275 | { 276 | "id": 37, 277 | "type": "ImageResizeKJ", 278 | "pos": { 279 | "0": 499, 280 | "1": 587 281 | }, 282 | "size": { 283 | "0": 315, 284 | "1": 266 285 | }, 286 | "flags": {}, 287 | "order": 4, 288 | "mode": 0, 289 | "inputs": [ 290 | { 291 | "name": "image", 292 | "type": "IMAGE", 293 | "link": 71 294 | }, 295 | { 296 | "name": "get_image_size", 297 | "type": "IMAGE", 298 | "link": null, 299 | "shape": 7 300 | }, 301 | { 302 | "name": "width_input", 303 | "type": "INT", 304 | "link": null, 305 | "widget": { 306 | "name": "width_input" 307 | } 308 | }, 309 | { 310 | "name": "height_input", 311 | "type": "INT", 312 | "link": null, 313 | "widget": { 314 | "name": "height_input" 315 | } 316 | } 317 | ], 318 | "outputs": [ 319 | { 320 | "name": "IMAGE", 321 | "type": "IMAGE", 322 | "links": [ 323 | 118 324 | ], 325 | "slot_index": 0, 326 | "shape": 3 327 | }, 328 | { 329 | "name": "width", 330 | "type": "INT", 331 | "links": null, 332 | "shape": 3 333 | }, 334 | { 335 | "name": "height", 336 | "type": "INT", 337 | "links": null, 338 | "shape": 3 339 | } 340 | ], 341 | "properties": { 342 | "Node name for S&R": "ImageResizeKJ" 343 | }, 344 | "widgets_values": [ 345 | 720, 346 | 480, 347 | "lanczos", 348 | false, 349 | 2, 350 | 0, 351 | 0, 352 | "disabled" 353 | ] 354 | }, 355 | { 356 | "id": 36, 357 | "type": "LoadImage", 358 | "pos": { 359 | "0": 43, 360 | "1": 587 361 | }, 362 | "size": [ 363 | 405.2986131072541, 364 | 477.48971409949377 365 | ], 366 | "flags": {}, 367 | "order": 1, 368 | "mode": 0, 369 | "inputs": [], 370 | "outputs": [ 371 | { 372 | "name": "IMAGE", 373 | "type": "IMAGE", 374 | "links": [ 375 | 71 376 | ], 377 | "slot_index": 0, 378 | "shape": 3 379 | }, 380 | { 381 | "name": "MASK", 382 | "type": "MASK", 383 | "links": null, 384 | "shape": 3 385 | } 386 | ], 387 | "properties": { 388 | "Node name for S&R": "LoadImage" 389 | }, 390 | "widgets_values": [ 391 | "6e1a7befce6daa63fc01cb66c1a22ed0.jpg", 392 | "image" 393 | ] 394 | }, 395 | { 396 | "id": 51, 397 | "type": "CogVideoDecode", 398 | "pos": 
{ 399 | "0": 1219, 400 | "1": -134 401 | }, 402 | "size": { 403 | "0": 315, 404 | "1": 198 405 | }, 406 | "flags": {}, 407 | "order": 8, 408 | "mode": 0, 409 | "inputs": [ 410 | { 411 | "name": "vae", 412 | "type": "VAE", 413 | "link": 122 414 | }, 415 | { 416 | "name": "samples", 417 | "type": "LATENT", 418 | "link": 123 419 | } 420 | ], 421 | "outputs": [ 422 | { 423 | "name": "images", 424 | "type": "IMAGE", 425 | "links": [ 426 | 121 427 | ] 428 | } 429 | ], 430 | "properties": { 431 | "Node name for S&R": "CogVideoDecode" 432 | }, 433 | "widgets_values": [ 434 | true, 435 | 240, 436 | 360, 437 | 0.2, 438 | 0.2, 439 | true 440 | ] 441 | }, 442 | { 443 | "id": 44, 444 | "type": "VHS_VideoCombine", 445 | "pos": { 446 | "0": 1602, 447 | "1": -131 448 | }, 449 | "size": [ 450 | 767.7372279260157, 451 | 822.491455078125 452 | ], 453 | "flags": {}, 454 | "order": 9, 455 | "mode": 0, 456 | "inputs": [ 457 | { 458 | "name": "images", 459 | "type": "IMAGE", 460 | "link": 121 461 | }, 462 | { 463 | "name": "audio", 464 | "type": "AUDIO", 465 | "link": null, 466 | "shape": 7 467 | }, 468 | { 469 | "name": "meta_batch", 470 | "type": "VHS_BatchManager", 471 | "link": null, 472 | "shape": 7 473 | }, 474 | { 475 | "name": "vae", 476 | "type": "VAE", 477 | "link": null, 478 | "shape": 7 479 | } 480 | ], 481 | "outputs": [ 482 | { 483 | "name": "Filenames", 484 | "type": "VHS_FILENAMES", 485 | "links": null, 486 | "shape": 3 487 | } 488 | ], 489 | "properties": { 490 | "Node name for S&R": "VHS_VideoCombine" 491 | }, 492 | "widgets_values": { 493 | "frame_rate": 8, 494 | "loop_count": 0, 495 | "filename_prefix": "CogVideoX_Fun", 496 | "format": "video/h264-mp4", 497 | "pix_fmt": "yuv420p", 498 | "crf": 19, 499 | "save_metadata": true, 500 | "pingpong": false, 501 | "save_output": true, 502 | "videopreview": { 503 | "hidden": false, 504 | "paused": false, 505 | "params": { 506 | "filename": "CogVideoX_Fun_00002.mp4", 507 | "subfolder": "", 508 | "type": "temp", 509 | "format": "video/h264-mp4", 510 | "frame_rate": 8 511 | }, 512 | "muted": false 513 | } 514 | } 515 | }, 516 | { 517 | "id": 49, 518 | "type": "DownloadAndLoadCogVideoModel", 519 | "pos": { 520 | "0": 491, 521 | "1": -167 522 | }, 523 | "size": { 524 | "0": 362.1656799316406, 525 | "1": 218 526 | }, 527 | "flags": {}, 528 | "order": 2, 529 | "mode": 0, 530 | "inputs": [ 531 | { 532 | "name": "block_edit", 533 | "type": "TRANSFORMERBLOCKS", 534 | "link": null, 535 | "shape": 7 536 | }, 537 | { 538 | "name": "lora", 539 | "type": "COGLORA", 540 | "link": null, 541 | "shape": 7 542 | }, 543 | { 544 | "name": "compile_args", 545 | "type": "COMPILEARGS", 546 | "link": null, 547 | "shape": 7 548 | } 549 | ], 550 | "outputs": [ 551 | { 552 | "name": "model", 553 | "type": "COGVIDEOMODEL", 554 | "links": [ 555 | 114 556 | ] 557 | }, 558 | { 559 | "name": "vae", 560 | "type": "VAE", 561 | "links": [ 562 | 119, 563 | 122 564 | ], 565 | "slot_index": 1 566 | } 567 | ], 568 | "properties": { 569 | "Node name for S&R": "DownloadAndLoadCogVideoModel" 570 | }, 571 | "widgets_values": [ 572 | "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP", 573 | "bf16", 574 | "disabled", 575 | false, 576 | "sdpa", 577 | "main_device" 578 | ] 579 | } 580 | ], 581 | "links": [ 582 | [ 583 | 54, 584 | 20, 585 | 0, 586 | 30, 587 | 0, 588 | "CLIP" 589 | ], 590 | [ 591 | 71, 592 | 36, 593 | 0, 594 | 37, 595 | 0, 596 | "IMAGE" 597 | ], 598 | [ 599 | 110, 600 | 30, 601 | 1, 602 | 31, 603 | 0, 604 | "CLIP" 605 | ], 606 | [ 607 | 114, 608 | 49, 609 | 0, 610 | 48, 611 | 0, 612 | 
"COGVIDEOMODEL" 613 | ], 614 | [ 615 | 116, 616 | 30, 617 | 0, 618 | 48, 619 | 1, 620 | "CONDITIONING" 621 | ], 622 | [ 623 | 117, 624 | 31, 625 | 0, 626 | 48, 627 | 2, 628 | "CONDITIONING" 629 | ], 630 | [ 631 | 118, 632 | 37, 633 | 0, 634 | 50, 635 | 1, 636 | "IMAGE" 637 | ], 638 | [ 639 | 119, 640 | 49, 641 | 1, 642 | 50, 643 | 0, 644 | "VAE" 645 | ], 646 | [ 647 | 120, 648 | 50, 649 | 0, 650 | 48, 651 | 4, 652 | "LATENT" 653 | ], 654 | [ 655 | 121, 656 | 51, 657 | 0, 658 | 44, 659 | 0, 660 | "IMAGE" 661 | ], 662 | [ 663 | 122, 664 | 49, 665 | 1, 666 | 51, 667 | 0, 668 | "VAE" 669 | ], 670 | [ 671 | 123, 672 | 48, 673 | 0, 674 | 51, 675 | 1, 676 | "LATENT" 677 | ] 678 | ], 679 | "groups": [], 680 | "config": {}, 681 | "extra": { 682 | "ds": { 683 | "scale": 0.693433494944278, 684 | "offset": [ 685 | 416.0091223165226, 686 | 378.00843746369645 687 | ] 688 | } 689 | }, 690 | "version": 0.4 691 | } -------------------------------------------------------------------------------- /example_workflows/noise_warp_example_input_video.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kijai/ComfyUI-CogVideoXWrapper/dbc63f622dd095391335612d0c7d7bbff8745cc8/example_workflows/noise_warp_example_input_video.mp4 -------------------------------------------------------------------------------- /fp8_optimization.py: -------------------------------------------------------------------------------- 1 | #based on ComfyUI's and MinusZoneAI's fp8_linear optimization 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | def fp8_linear_forward(cls, original_dtype, input): 7 | weight_dtype = cls.weight.dtype 8 | if weight_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]: 9 | if len(input.shape) == 3: 10 | if weight_dtype == torch.float8_e4m3fn: 11 | inn = input.reshape(-1, input.shape[2]).to(torch.float8_e5m2) 12 | else: 13 | inn = input.reshape(-1, input.shape[2]).to(torch.float8_e4m3fn) 14 | w = cls.weight.t() 15 | 16 | scale_weight = torch.ones((1), device=input.device, dtype=torch.float32) 17 | scale_input = scale_weight 18 | 19 | bias = cls.bias.to(original_dtype) if cls.bias is not None else None 20 | out_dtype = original_dtype 21 | 22 | if bias is not None: 23 | o = torch._scaled_mm(inn, w, out_dtype=out_dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight) 24 | else: 25 | o = torch._scaled_mm(inn, w, out_dtype=out_dtype, scale_a=scale_input, scale_b=scale_weight) 26 | 27 | if isinstance(o, tuple): 28 | o = o[0] 29 | 30 | return o.reshape((-1, input.shape[1], cls.weight.shape[0])) 31 | else: 32 | cls.to(original_dtype) 33 | out = cls.original_forward(input.to(original_dtype)) 34 | cls.to(original_dtype) 35 | return out 36 | else: 37 | return cls.original_forward(input) 38 | 39 | def convert_fp8_linear(module, original_dtype, params_to_keep={}): 40 | setattr(module, "fp8_matmul_enabled", True) 41 | 42 | for name, module in module.named_modules(): 43 | if not any(keyword in name for keyword in params_to_keep): 44 | if isinstance(module, nn.Linear): 45 | original_forward = module.forward 46 | setattr(module, "original_forward", original_forward) 47 | setattr(module, "forward", lambda input, m=module: fp8_linear_forward(m, original_dtype, input)) 48 | -------------------------------------------------------------------------------- /lora_utils.py: -------------------------------------------------------------------------------- 1 | # LoRA network module 2 | # reference: 3 | # https://github.com/microsoft/LoRA/blob/main/loralib/layers.py 
4 | # https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py 5 | # https://github.com/bmaltais/kohya_ss 6 | 7 | import hashlib 8 | import math 9 | import os 10 | from collections import defaultdict 11 | from io import BytesIO 12 | from typing import List, Optional, Type, Union 13 | 14 | import safetensors.torch 15 | import torch 16 | import torch.utils.checkpoint 17 | from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear 18 | from safetensors.torch import load_file 19 | from transformers import T5EncoderModel 20 | 21 | 22 | class LoRAModule(torch.nn.Module): 23 | """ 24 | replaces forward method of the original Linear, instead of replacing the original Linear module. 25 | """ 26 | 27 | def __init__( 28 | self, 29 | lora_name, 30 | org_module: torch.nn.Module, 31 | multiplier=1.0, 32 | lora_dim=4, 33 | alpha=1, 34 | dropout=None, 35 | rank_dropout=None, 36 | module_dropout=None, 37 | ): 38 | """if alpha == 0 or None, alpha is rank (no scaling).""" 39 | super().__init__() 40 | self.lora_name = lora_name 41 | 42 | if org_module.__class__.__name__ == "Conv2d": 43 | in_dim = org_module.in_channels 44 | out_dim = org_module.out_channels 45 | else: 46 | in_dim = org_module.in_features 47 | out_dim = org_module.out_features 48 | 49 | self.lora_dim = lora_dim 50 | if org_module.__class__.__name__ == "Conv2d": 51 | kernel_size = org_module.kernel_size 52 | stride = org_module.stride 53 | padding = org_module.padding 54 | self.lora_down = torch.nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False) 55 | self.lora_up = torch.nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False) 56 | else: 57 | self.lora_down = torch.nn.Linear(in_dim, self.lora_dim, bias=False) 58 | self.lora_up = torch.nn.Linear(self.lora_dim, out_dim, bias=False) 59 | 60 | if type(alpha) == torch.Tensor: 61 | alpha = alpha.detach().float().numpy() # without casting, bf16 causes error 62 | alpha = self.lora_dim if alpha is None or alpha == 0 else alpha 63 | self.scale = alpha / self.lora_dim 64 | self.register_buffer("alpha", torch.tensor(alpha)) 65 | 66 | # same as microsoft's 67 | torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5)) 68 | torch.nn.init.zeros_(self.lora_up.weight) 69 | 70 | self.multiplier = multiplier 71 | self.org_module = org_module # remove in applying 72 | self.dropout = dropout 73 | self.rank_dropout = rank_dropout 74 | self.module_dropout = module_dropout 75 | 76 | def apply_to(self): 77 | self.org_forward = self.org_module.forward 78 | self.org_module.forward = self.forward 79 | del self.org_module 80 | 81 | def forward(self, x, *args, **kwargs): 82 | weight_dtype = x.dtype 83 | org_forwarded = self.org_forward(x) 84 | 85 | # module dropout 86 | if self.module_dropout is not None and self.training: 87 | if torch.rand(1) < self.module_dropout: 88 | return org_forwarded 89 | 90 | lx = self.lora_down(x.to(self.lora_down.weight.dtype)) 91 | 92 | # normal dropout 93 | if self.dropout is not None and self.training: 94 | lx = torch.nn.functional.dropout(lx, p=self.dropout) 95 | 96 | # rank dropout 97 | if self.rank_dropout is not None and self.training: 98 | mask = torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout 99 | if len(lx.size()) == 3: 100 | mask = mask.unsqueeze(1) # for Text Encoder 101 | elif len(lx.size()) == 4: 102 | mask = mask.unsqueeze(-1).unsqueeze(-1) # for Conv2d 103 | lx = lx * mask 104 | 105 | # scaling for rank dropout: treat as if the rank is changed 106 | scale = self.scale * (1.0 / 
(1.0 - self.rank_dropout)) # redundant for readability 107 | else: 108 | scale = self.scale 109 | 110 | lx = self.lora_up(lx) 111 | 112 | return org_forwarded.to(weight_dtype) + lx.to(weight_dtype) * self.multiplier * scale 113 | 114 | 115 | def addnet_hash_legacy(b): 116 | """Old model hash used by sd-webui-additional-networks for .safetensors format files""" 117 | m = hashlib.sha256() 118 | 119 | b.seek(0x100000) 120 | m.update(b.read(0x10000)) 121 | return m.hexdigest()[0:8] 122 | 123 | 124 | def addnet_hash_safetensors(b): 125 | """New model hash used by sd-webui-additional-networks for .safetensors format files""" 126 | hash_sha256 = hashlib.sha256() 127 | blksize = 1024 * 1024 128 | 129 | b.seek(0) 130 | header = b.read(8) 131 | n = int.from_bytes(header, "little") 132 | 133 | offset = n + 8 134 | b.seek(offset) 135 | for chunk in iter(lambda: b.read(blksize), b""): 136 | hash_sha256.update(chunk) 137 | 138 | return hash_sha256.hexdigest() 139 | 140 | 141 | def precalculate_safetensors_hashes(tensors, metadata): 142 | """Precalculate the model hashes needed by sd-webui-additional-networks to 143 | save time on indexing the model later.""" 144 | 145 | # Because writing user metadata to the file can change the result of 146 | # sd_models.model_hash(), only retain the training metadata for purposes of 147 | # calculating the hash, as they are meant to be immutable 148 | metadata = {k: v for k, v in metadata.items() if k.startswith("ss_")} 149 | 150 | bytes = safetensors.torch.save(tensors, metadata) 151 | b = BytesIO(bytes) 152 | 153 | model_hash = addnet_hash_safetensors(b) 154 | legacy_hash = addnet_hash_legacy(b) 155 | return model_hash, legacy_hash 156 | 157 | 158 | class LoRANetwork(torch.nn.Module): 159 | TRANSFORMER_TARGET_REPLACE_MODULE = ["CogVideoXTransformer3DModel"] 160 | TEXT_ENCODER_TARGET_REPLACE_MODULE = ["T5LayerSelfAttention", "T5LayerFF", "BertEncoder"] 161 | LORA_PREFIX_TRANSFORMER = "lora_unet" 162 | LORA_PREFIX_TEXT_ENCODER = "lora_te" 163 | def __init__( 164 | self, 165 | text_encoder: Union[List[T5EncoderModel], T5EncoderModel], 166 | unet, 167 | multiplier: float = 1.0, 168 | lora_dim: int = 4, 169 | alpha: float = 1, 170 | dropout: Optional[float] = None, 171 | module_class: Type[object] = LoRAModule, 172 | add_lora_in_attn_temporal: bool = False, 173 | varbose: Optional[bool] = False, 174 | ) -> None: 175 | super().__init__() 176 | self.multiplier = multiplier 177 | 178 | self.lora_dim = lora_dim 179 | self.alpha = alpha 180 | self.dropout = dropout 181 | 182 | print(f"create LoRA network. 
base dim (rank): {lora_dim}, alpha: {alpha}") 183 | print(f"neuron dropout: p={self.dropout}") 184 | 185 | # create module instances 186 | def create_modules( 187 | is_unet: bool, 188 | root_module: torch.nn.Module, 189 | target_replace_modules: List[torch.nn.Module], 190 | ) -> List[LoRAModule]: 191 | prefix = ( 192 | self.LORA_PREFIX_TRANSFORMER 193 | if is_unet 194 | else self.LORA_PREFIX_TEXT_ENCODER 195 | ) 196 | loras = [] 197 | skipped = [] 198 | for name, module in root_module.named_modules(): 199 | if module.__class__.__name__ in target_replace_modules: 200 | for child_name, child_module in module.named_modules(): 201 | is_linear = child_module.__class__.__name__ == "Linear" or child_module.__class__.__name__ == "LoRACompatibleLinear" 202 | is_conv2d = child_module.__class__.__name__ == "Conv2d" or child_module.__class__.__name__ == "LoRACompatibleConv" 203 | is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1) 204 | 205 | if not add_lora_in_attn_temporal: 206 | if "attn_temporal" in child_name: 207 | continue 208 | 209 | if is_linear or is_conv2d: 210 | lora_name = prefix + "." + name + "." + child_name 211 | lora_name = lora_name.replace(".", "_") 212 | 213 | dim = None 214 | alpha = None 215 | 216 | if is_linear or is_conv2d_1x1: 217 | dim = self.lora_dim 218 | alpha = self.alpha 219 | 220 | if dim is None or dim == 0: 221 | if is_linear or is_conv2d_1x1: 222 | skipped.append(lora_name) 223 | continue 224 | 225 | lora = module_class( 226 | lora_name, 227 | child_module, 228 | self.multiplier, 229 | dim, 230 | alpha, 231 | dropout=dropout, 232 | ) 233 | loras.append(lora) 234 | return loras, skipped 235 | 236 | text_encoders = text_encoder if type(text_encoder) == list else [text_encoder] 237 | 238 | self.text_encoder_loras = [] 239 | skipped_te = [] 240 | for i, text_encoder in enumerate(text_encoders): 241 | if text_encoder is not None: 242 | text_encoder_loras, skipped = create_modules(False, text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE) 243 | self.text_encoder_loras.extend(text_encoder_loras) 244 | skipped_te += skipped 245 | print(f"create LoRA for Text Encoder: {len(self.text_encoder_loras)} modules.") 246 | 247 | self.unet_loras, skipped_un = create_modules(True, unet, LoRANetwork.TRANSFORMER_TARGET_REPLACE_MODULE) 248 | print(f"create LoRA for U-Net: {len(self.unet_loras)} modules.") 249 | 250 | # assertion 251 | names = set() 252 | for lora in self.text_encoder_loras + self.unet_loras: 253 | assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}" 254 | names.add(lora.lora_name) 255 | 256 | def apply_to(self, text_encoder, unet, apply_text_encoder=True, apply_unet=True): 257 | if apply_text_encoder: 258 | print("enable LoRA for text encoder") 259 | else: 260 | self.text_encoder_loras = [] 261 | 262 | if apply_unet: 263 | print("enable LoRA for U-Net") 264 | else: 265 | self.unet_loras = [] 266 | 267 | for lora in self.text_encoder_loras + self.unet_loras: 268 | lora.apply_to() 269 | self.add_module(lora.lora_name, lora) 270 | 271 | def set_multiplier(self, multiplier): 272 | self.multiplier = multiplier 273 | for lora in self.text_encoder_loras + self.unet_loras: 274 | lora.multiplier = self.multiplier 275 | 276 | def load_weights(self, file): 277 | if os.path.splitext(file)[1] == ".safetensors": 278 | from safetensors.torch import load_file 279 | 280 | weights_sd = load_file(file) 281 | else: 282 | weights_sd = torch.load(file, map_location="cpu") 283 | info = self.load_state_dict(weights_sd, False) 284 | return info 285 
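    # Illustrative note (added, not part of the original file): LoRAModule.forward above
    # and merge_lora() further below apply the same low-rank update. With
    # scale = alpha / lora_dim, the effective weight is approximately
    #     W_eff = W_org + multiplier * scale * (lora_up.weight @ lora_down.weight)
    # merge_lora() bakes this product directly into the layer weights, while the
    # module-based path keeps the two factors separate at runtime.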
| 286 | def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr): 287 | self.requires_grad_(True) 288 | all_params = [] 289 | 290 | def enumerate_params(loras): 291 | params = [] 292 | for lora in loras: 293 | params.extend(lora.parameters()) 294 | return params 295 | 296 | if self.text_encoder_loras: 297 | param_data = {"params": enumerate_params(self.text_encoder_loras)} 298 | if text_encoder_lr is not None: 299 | param_data["lr"] = text_encoder_lr 300 | all_params.append(param_data) 301 | 302 | if self.unet_loras: 303 | param_data = {"params": enumerate_params(self.unet_loras)} 304 | if unet_lr is not None: 305 | param_data["lr"] = unet_lr 306 | all_params.append(param_data) 307 | 308 | return all_params 309 | 310 | def enable_gradient_checkpointing(self): 311 | pass 312 | 313 | def get_trainable_params(self): 314 | return self.parameters() 315 | 316 | def save_weights(self, file, dtype, metadata): 317 | if metadata is not None and len(metadata) == 0: 318 | metadata = None 319 | 320 | state_dict = self.state_dict() 321 | 322 | if dtype is not None: 323 | for key in list(state_dict.keys()): 324 | v = state_dict[key] 325 | v = v.detach().clone().to("cpu").to(dtype) 326 | state_dict[key] = v 327 | 328 | if os.path.splitext(file)[1] == ".safetensors": 329 | from safetensors.torch import save_file 330 | 331 | # Precalculate model hashes to save time on indexing 332 | if metadata is None: 333 | metadata = {} 334 | model_hash, legacy_hash = precalculate_safetensors_hashes(state_dict, metadata) 335 | metadata["sshs_model_hash"] = model_hash 336 | metadata["sshs_legacy_hash"] = legacy_hash 337 | 338 | save_file(state_dict, file, metadata) 339 | else: 340 | torch.save(state_dict, file) 341 | 342 | def create_network( 343 | multiplier: float, 344 | network_dim: Optional[int], 345 | network_alpha: Optional[float], 346 | text_encoder: Union[T5EncoderModel, List[T5EncoderModel]], 347 | transformer, 348 | neuron_dropout: Optional[float] = None, 349 | add_lora_in_attn_temporal: bool = False, 350 | **kwargs, 351 | ): 352 | if network_dim is None: 353 | network_dim = 4 # default 354 | if network_alpha is None: 355 | network_alpha = 1.0 356 | 357 | network = LoRANetwork( 358 | text_encoder, 359 | transformer, 360 | multiplier=multiplier, 361 | lora_dim=network_dim, 362 | alpha=network_alpha, 363 | dropout=neuron_dropout, 364 | add_lora_in_attn_temporal=add_lora_in_attn_temporal, 365 | varbose=True, 366 | ) 367 | return network 368 | 369 | def merge_lora(transformer, lora_path, multiplier, device='cpu', dtype=torch.float32, state_dict=None): 370 | LORA_PREFIX_TRANSFORMER = "lora_unet" 371 | LORA_PREFIX_TEXT_ENCODER = "lora_te" 372 | if state_dict is None: 373 | state_dict = load_file(lora_path, device=device) 374 | else: 375 | state_dict = state_dict 376 | updates = defaultdict(dict) 377 | for key, value in state_dict.items(): 378 | layer, elem = key.split('.', 1) 379 | updates[layer][elem] = value 380 | 381 | for layer, elems in updates.items(): 382 | 383 | # if "lora_te" in layer: 384 | # if transformer_only: 385 | # continue 386 | # else: 387 | # layer_infos = layer.split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_") 388 | # curr_layer = pipeline.text_encoder 389 | #else: 390 | layer_infos = layer.split(LORA_PREFIX_TRANSFORMER + "_")[-1].split("_") 391 | curr_layer = transformer 392 | 393 | temp_name = layer_infos.pop(0) 394 | while len(layer_infos) > -1: 395 | try: 396 | curr_layer = curr_layer.__getattr__(temp_name) 397 | if len(layer_infos) > 0: 398 | temp_name = layer_infos.pop(0) 
399 | elif len(layer_infos) == 0: 400 | break 401 | except Exception: 402 | if len(layer_infos) == 0: 403 | print('Error loading layer') 404 | if len(temp_name) > 0: 405 | temp_name += "_" + layer_infos.pop(0) 406 | else: 407 | temp_name = layer_infos.pop(0) 408 | 409 | weight_up = elems['lora_up.weight'].to(dtype).to(device) 410 | weight_down = elems['lora_down.weight'].to(dtype).to(device) 411 | if 'alpha' in elems.keys(): 412 | alpha = elems['alpha'].item() / weight_up.shape[1] 413 | else: 414 | alpha = 1.0 415 | 416 | curr_layer.weight.data = curr_layer.weight.data.to(device) 417 | try: 418 | if len(weight_up.shape) == 4: 419 | curr_layer.weight.data += multiplier * alpha * torch.mm(weight_up.squeeze(3).squeeze(2), 420 | weight_down.squeeze(3).squeeze(2)).unsqueeze( 421 | 2).unsqueeze(3) 422 | else: 423 | curr_layer.weight.data += multiplier * alpha * torch.mm(weight_up, weight_down) 424 | except: 425 | print(f"Could not apply LoRA weight in layer {layer}") 426 | 427 | return transformer 428 | 429 | # TODO: Refactor with merge_lora. 430 | def unmerge_lora(pipeline, lora_path, multiplier=1, device="cpu", dtype=torch.float32): 431 | """Unmerge state_dict in LoRANetwork from the pipeline in diffusers.""" 432 | LORA_PREFIX_UNET = "lora_unet" 433 | LORA_PREFIX_TEXT_ENCODER = "lora_te" 434 | state_dict = load_file(lora_path, device=device) 435 | 436 | updates = defaultdict(dict) 437 | for key, value in state_dict.items(): 438 | layer, elem = key.split('.', 1) 439 | updates[layer][elem] = value 440 | 441 | for layer, elems in updates.items(): 442 | 443 | if "lora_te" in layer: 444 | layer_infos = layer.split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_") 445 | curr_layer = pipeline.text_encoder 446 | else: 447 | layer_infos = layer.split(LORA_PREFIX_UNET + "_")[-1].split("_") 448 | curr_layer = pipeline.transformer 449 | 450 | temp_name = layer_infos.pop(0) 451 | while len(layer_infos) > -1: 452 | try: 453 | curr_layer = curr_layer.__getattr__(temp_name) 454 | if len(layer_infos) > 0: 455 | temp_name = layer_infos.pop(0) 456 | elif len(layer_infos) == 0: 457 | break 458 | except Exception: 459 | if len(layer_infos) == 0: 460 | print('Error loading layer') 461 | if len(temp_name) > 0: 462 | temp_name += "_" + layer_infos.pop(0) 463 | else: 464 | temp_name = layer_infos.pop(0) 465 | 466 | weight_up = elems['lora_up.weight'].to(dtype) 467 | weight_down = elems['lora_down.weight'].to(dtype) 468 | if 'alpha' in elems.keys(): 469 | alpha = elems['alpha'].item() / weight_up.shape[1] 470 | else: 471 | alpha = 1.0 472 | 473 | curr_layer.weight.data = curr_layer.weight.data.to(device) 474 | if len(weight_up.shape) == 4: 475 | curr_layer.weight.data -= multiplier * alpha * torch.mm(weight_up.squeeze(3).squeeze(2), 476 | weight_down.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) 477 | else: 478 | curr_layer.weight.data -= multiplier * alpha * torch.mm(weight_up, weight_down) 479 | 480 | return pipeline 481 | 482 | def load_lora_into_transformer(lora, transformer): 483 | from peft import LoraConfig, set_peft_model_state_dict 484 | from peft.mapping import PEFT_TYPE_TO_TUNER_MAPPING 485 | from peft.tuners.tuners_utils import BaseTunerLayer 486 | from diffusers.utils.peft_utils import get_peft_kwargs 487 | from diffusers.utils.import_utils import is_peft_version 488 | from diffusers.utils.state_dict_utils import convert_unet_state_dict_to_peft 489 | 490 | state_dict_list = [] 491 | adapter_name_list = [] 492 | strength_list = [] 493 | lora_config_list = [] 494 | 495 | for l in lora: 496 | state_dict = 
load_file(l["path"]) 497 | adapter_name_list.append(l["name"]) 498 | strength_list.append(l["strength"]) 499 | 500 | keys = list(state_dict.keys()) 501 | transformer_keys = [k for k in keys if k.startswith("transformer")] 502 | state_dict = { 503 | k.replace(f"transformer.", ""): v for k, v in state_dict.items() if k in transformer_keys 504 | } 505 | 506 | # check with first key if is not in peft format 507 | first_key = next(iter(state_dict.keys())) 508 | if "lora_A" not in first_key: 509 | state_dict = convert_unet_state_dict_to_peft(state_dict) 510 | 511 | rank = {} 512 | for key, val in state_dict.items(): 513 | if "lora_B" in key: 514 | rank[key] = val.shape[1] 515 | lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=None, peft_state_dict=state_dict) 516 | if "use_dora" in lora_config_kwargs: 517 | if lora_config_kwargs["use_dora"] and is_peft_version("<", "0.9.0"): 518 | raise ValueError( 519 | "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`." 520 | ) 521 | else: 522 | lora_config_kwargs.pop("use_dora") 523 | 524 | lora_config_list.append(LoraConfig(**lora_config_kwargs)) 525 | state_dict_list.append(state_dict) 526 | 527 | 528 | peft_models = [] 529 | 530 | for i in range(len(lora_config_list)): 531 | tuner_cls = PEFT_TYPE_TO_TUNER_MAPPING[lora_config_list[i].peft_type] 532 | peft_model = tuner_cls(transformer, lora_config_list[i], adapter_name=adapter_name_list[i]) 533 | incompatible_keys = set_peft_model_state_dict(peft_model.model, state_dict_list[i], adapter_name_list[i]) 534 | 535 | if incompatible_keys is not None: 536 | # check only for unexpected keys 537 | unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) 538 | if unexpected_keys: 539 | print( 540 | f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " 541 | f" {unexpected_keys}. " 542 | ) 543 | 544 | peft_models.append(peft_model) 545 | 546 | if len(peft_models) > 1: 547 | peft_models[0].add_weighted_adapter( 548 | adapters=adapter_name_list, 549 | weights=strength_list, 550 | combination_type="linear", 551 | adapter_name="combined_adapter" 552 | ) 553 | peft_models[0].set_adapter("combined_adapter") 554 | else: 555 | if strength_list[0] != 1.0: 556 | for module in transformer.modules(): 557 | if isinstance(module, BaseTunerLayer): 558 | #print(f"Setting strength for {module}") 559 | module.scale_layer(strength_list[0]) 560 | return peft_model.model -------------------------------------------------------------------------------- /mz_enable_vae_encode_tiling.py: -------------------------------------------------------------------------------- 1 | # thanks to MinusZoneAI: https://github.com/MinusZoneAI/ComfyUI-CogVideoX-MZ/blob/b98b98bd04621e4c85547866c12de2ec723ae98a/mz_enable_vae_encode_tiling.py 2 | from typing import Optional 3 | import torch 4 | from diffusers.utils.accelerate_utils import apply_forward_hook 5 | from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution 6 | from diffusers.models.modeling_outputs import AutoencoderKLOutput 7 | 8 | 9 | @apply_forward_hook 10 | def encode( 11 | self, x: torch.Tensor, return_dict: bool = True 12 | ): 13 | """ 14 | Encode a batch of images into latents. 15 | Args: 16 | x (`torch.Tensor`): Input batch of images. 17 | return_dict (`bool`, *optional*, defaults to `True`): 18 | Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. 
19 | Returns: 20 | The latent representations of the encoded videos. If `return_dict` is True, a 21 | [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned. 22 | """ 23 | if self.use_slicing and x.shape[0] > 1: 24 | encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)] 25 | h = torch.cat(encoded_slices) 26 | else: 27 | h = self._encode(x) 28 | posterior = DiagonalGaussianDistribution(h) 29 | 30 | if not return_dict: 31 | return (posterior,) 32 | return AutoencoderKLOutput(latent_dist=posterior) 33 | 34 | 35 | def tiled_encode(self, x: torch.Tensor) -> torch.Tensor: 36 | r"""Encode a batch of images using a tiled encoder. 37 | When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several 38 | steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is 39 | different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the 40 | tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the 41 | output, but they should be much less noticeable. 42 | Args: 43 | x (`torch.Tensor`): Input batch of videos. 44 | Returns: 45 | `torch.Tensor`: 46 | The latent representation of the encoded videos. 47 | """ 48 | # For a rough memory estimate, take a look at the `tiled_decode` method. 49 | batch_size, num_channels, num_frames, height, width = x.shape 50 | overlap_height = int(self.tile_sample_min_height * 51 | (1 - self.tile_overlap_factor_height)) 52 | overlap_width = int(self.tile_sample_min_width * 53 | (1 - self.tile_overlap_factor_width)) 54 | blend_extent_height = int( 55 | self.tile_latent_min_height * self.tile_overlap_factor_height) 56 | blend_extent_width = int( 57 | self.tile_latent_min_width * self.tile_overlap_factor_width) 58 | row_limit_height = self.tile_latent_min_height - blend_extent_height 59 | row_limit_width = self.tile_latent_min_width - blend_extent_width 60 | frame_batch_size = 4 61 | # Split x into overlapping tiles and encode them separately. 62 | # The tiles have an overlap to avoid seams between tiles. 63 | rows = [] 64 | for i in range(0, height, overlap_height): 65 | row = [] 66 | for j in range(0, width, overlap_width): 67 | # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k. 
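            # Illustrative worked example (added note, not in the original source): with
            # num_frames = 49 and frame_batch_size = 4, num_batches = 12 and
            # remaining_frames = 1, so the first temporal chunk covers frames 0..4
            # (5 frames) and each of the remaining 11 chunks covers 4 frames,
            # i.e. 5 + 11 * 4 = 49 frames in total.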
68 | num_batches = num_frames // frame_batch_size if num_frames > 1 else 1 69 | time = [] 70 | for k in range(num_batches): 71 | remaining_frames = num_frames % frame_batch_size 72 | start_frame = frame_batch_size * k + \ 73 | (0 if k == 0 else remaining_frames) 74 | end_frame = frame_batch_size * (k + 1) + remaining_frames 75 | tile = x[ 76 | :, 77 | :, 78 | start_frame:end_frame, 79 | i: i + self.tile_sample_min_height, 80 | j: j + self.tile_sample_min_width, 81 | ] 82 | 83 | tile = self.encoder(tile) 84 | if not isinstance(tile, tuple): 85 | tile = (tile,) 86 | if self.quant_conv is not None: 87 | tile = self.quant_conv(tile) 88 | time.append(tile[0]) 89 | try: 90 | self._clear_fake_context_parallel_cache() 91 | except: 92 | pass 93 | row.append(torch.cat(time, dim=2)) 94 | rows.append(row) 95 | result_rows = [] 96 | for i, row in enumerate(rows): 97 | result_row = [] 98 | for j, tile in enumerate(row): 99 | # blend the above tile and the left tile 100 | # to the current tile and add the current tile to the result row 101 | if i > 0: 102 | tile = self.blend_v( 103 | rows[i - 1][j], tile, blend_extent_height) 104 | if j > 0: 105 | tile = self.blend_h(row[j - 1], tile, blend_extent_width) 106 | result_row.append( 107 | tile[:, :, :, :row_limit_height, :row_limit_width]) 108 | result_rows.append(torch.cat(result_row, dim=4)) 109 | enc = torch.cat(result_rows, dim=3) 110 | return enc 111 | 112 | 113 | def _encode( 114 | self, x: torch.Tensor, return_dict: bool = True 115 | ): 116 | batch_size, num_channels, num_frames, height, width = x.shape 117 | 118 | if self.use_encode_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height): 119 | return self.tiled_encode(x) 120 | 121 | if num_frames == 1: 122 | h = self.encoder(x) 123 | if self.quant_conv is not None: 124 | h = self.quant_conv(h) 125 | posterior = DiagonalGaussianDistribution(h) 126 | else: 127 | frame_batch_size = 4 128 | h = [] 129 | for i in range(num_frames // frame_batch_size): 130 | remaining_frames = num_frames % frame_batch_size 131 | start_frame = frame_batch_size * i + \ 132 | (0 if i == 0 else remaining_frames) 133 | end_frame = frame_batch_size * (i + 1) + remaining_frames 134 | z_intermediate = x[:, :, start_frame:end_frame] 135 | z_intermediate = self.encoder(z_intermediate) 136 | if self.quant_conv is not None: 137 | z_intermediate = self.quant_conv(z_intermediate) 138 | h.append(z_intermediate) 139 | try: 140 | self._clear_fake_context_parallel_cache() 141 | except: 142 | pass 143 | h = torch.cat(h, dim=2) 144 | return h 145 | 146 | 147 | def enable_encode_tiling( 148 | self, 149 | tile_sample_min_height: Optional[int] = None, 150 | tile_sample_min_width: Optional[int] = None, 151 | tile_overlap_factor_height: Optional[float] = None, 152 | tile_overlap_factor_width: Optional[float] = None, 153 | ) -> None: 154 | r""" 155 | Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to 156 | compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow 157 | processing larger images. 158 | 159 | Args: 160 | tile_sample_min_height (`int`, *optional*): 161 | The minimum height required for a sample to be separated into tiles across the height dimension. 162 | tile_sample_min_width (`int`, *optional*): 163 | The minimum width required for a sample to be separated into tiles across the width dimension. 
164 | tile_overlap_factor_height (`int`, *optional*): 165 | The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are 166 | no tiling artifacts produced across the height dimension. Must be between 0 and 1. Setting a higher 167 | value might cause more tiles to be processed leading to slow down of the decoding process. 168 | tile_overlap_factor_width (`int`, *optional*): 169 | The minimum amount of overlap between two consecutive horizontal tiles. This is to ensure that there 170 | are no tiling artifacts produced across the width dimension. Must be between 0 and 1. Setting a higher 171 | value might cause more tiles to be processed leading to slow down of the decoding process. 172 | """ 173 | self.use_encode_tiling = True 174 | self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height 175 | self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width 176 | self.tile_latent_min_height = int( 177 | self.tile_sample_min_height / 178 | (2 ** (len(self.config.block_out_channels) - 1)) 179 | ) 180 | self.tile_latent_min_width = int( 181 | self.tile_sample_min_width / (2 ** (len(self.config.block_out_channels) - 1))) 182 | self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height 183 | self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width 184 | 185 | 186 | from types import MethodType 187 | 188 | 189 | def enable_vae_encode_tiling(vae): 190 | vae.encode = MethodType(encode, vae) 191 | setattr(vae, "_encode", MethodType(_encode, vae)) 192 | setattr(vae, "tiled_encode", MethodType(tiled_encode, vae)) 193 | setattr(vae, "use_encode_tiling", True) 194 | 195 | setattr(vae, "enable_encode_tiling", MethodType(enable_encode_tiling, vae)) 196 | vae.enable_encode_tiling() 197 | return vae 198 | -------------------------------------------------------------------------------- /mz_gguf_loader.py: -------------------------------------------------------------------------------- 1 | # https://github.com/MinusZoneAI/ComfyUI-CogVideoX-MZ/blob/9616415220fd09388622f40f6609e4ed81f048a5/mz_gguf_loader.py 2 | 3 | import torch 4 | import torch.nn as nn 5 | import gc 6 | 7 | 8 | class quantize_lazy_load(): 9 | def __init__(self): 10 | self.device = None 11 | 12 | def __enter__(self): 13 | self.device = torch.device("meta") 14 | self.device.__enter__() 15 | return self 16 | 17 | def __exit__(self, exc_type, exc_value, traceback): 18 | self.device.__exit__(exc_type, exc_value, traceback) 19 | 20 | 21 | def quantize_load_state_dict(model, state_dict, device="cpu"): 22 | quant_keys = [] 23 | for key in state_dict.keys(): 24 | if key.endswith(".Q4_0_qweight"): 25 | quant_keys.append(key.replace(".Q4_0_qweight", "")) 26 | qtype = "Q4_0" 27 | elif key.endswith(".Q8_0_qweight"): 28 | quant_keys.append(key.replace(".Q8_0_qweight", "")) 29 | qtype = "Q8_0" 30 | 31 | for name, module in model.named_modules(): 32 | if name in quant_keys: 33 | q_linear = WQLinear_GGUF.from_linear( 34 | linear=module, 35 | device=device, 36 | qtype=qtype, 37 | ) 38 | set_op_by_name(model, name, q_linear) 39 | 40 | model.to_empty(device=device) 41 | model.load_state_dict(state_dict, strict=False) 42 | model.to(device) 43 | return model 44 | 45 | 46 | def set_op_by_name(layer, name, new_module): 47 | levels = name.split(".") 48 | if len(levels) > 1: 49 | mod_ = layer 50 | for l_idx in range(len(levels) - 1): 51 | if levels[l_idx].isdigit(): 52 | mod_ = mod_[int(levels[l_idx])] 53 | else: 
54 | mod_ = getattr(mod_, levels[l_idx]) 55 | setattr(mod_, levels[-1], new_module) 56 | else: 57 | setattr(layer, name, new_module) 58 | 59 | 60 | import torch.nn.functional as F 61 | 62 | 63 | class WQLinear_GGUF(nn.Module): 64 | def __init__( 65 | self, in_features, out_features, bias, dev, qtype="Q4_0" 66 | ): 67 | super().__init__() 68 | 69 | self.in_features = in_features 70 | self.out_features = out_features 71 | self.qtype = qtype 72 | 73 | qweight_shape = quant_shape_to_byte_shape( 74 | (out_features, in_features), qtype 75 | ) 76 | self.register_buffer( 77 | f"{qtype}_qweight", 78 | torch.zeros( 79 | qweight_shape, 80 | dtype=torch.uint8, 81 | device=dev, 82 | ), 83 | ) 84 | if bias: 85 | self.register_buffer( 86 | "bias", 87 | torch.zeros( 88 | (out_features), 89 | dtype=torch.float16, 90 | device=dev, 91 | ), 92 | ) 93 | else: 94 | self.bias = None 95 | 96 | @classmethod 97 | def from_linear( 98 | cls, linear, 99 | device="cpu", 100 | qtype="Q4_0", 101 | ): 102 | q_linear = cls( 103 | linear.in_features, 104 | linear.out_features, 105 | linear.bias is not None, 106 | device, 107 | qtype=qtype, 108 | ) 109 | return q_linear 110 | 111 | def extra_repr(self) -> str: 112 | return ( 113 | "in_features={}, out_features={}, bias={}, w_bit={}, group_size={}".format( 114 | self.in_features, 115 | self.out_features, 116 | self.bias is not None, 117 | self.w_bit, 118 | self.group_size, 119 | ) 120 | ) 121 | 122 | @torch.no_grad() 123 | def forward(self, x): 124 | if self.qtype == "Q4_0": 125 | dequant = dequantize_blocks_Q4_0(self.Q4_0_qweight, x.dtype) 126 | elif self.qtype == "Q8_0": 127 | dequant = dequantize_blocks_Q8_0(self.Q8_0_qweight, x.dtype) 128 | else: 129 | raise ValueError(f"Unknown qtype: {self.qtype}") 130 | 131 | return F.linear(x, dequant, bias=self.bias.to(x.dtype) if self.bias is not None else None) 132 | 133 | 134 | def split_block_dims(blocks, *args): 135 | n_max = blocks.shape[1] 136 | dims = list(args) + [n_max - sum(args)] 137 | return torch.split(blocks, dims, dim=1) 138 | 139 | 140 | def quant_shape_to_byte_shape(shape, qtype) -> tuple[int, ...]: 141 | # shape = shape[::-1] 142 | block_size, type_size = GGML_QUANT_SIZES[qtype] 143 | if shape[-1] % block_size != 0: 144 | raise ValueError( 145 | f"Quantized tensor row size ({shape[-1]}) is not a multiple of Q4_0 block size ({block_size})") 146 | return (*shape[:-1], shape[-1] // block_size * type_size) 147 | 148 | 149 | def quant_shape_from_byte_shape(shape, qtype) -> tuple[int, ...]: 150 | # shape = shape[::-1] 151 | block_size, type_size = GGML_QUANT_SIZES[qtype] 152 | if shape[-1] % type_size != 0: 153 | raise ValueError( 154 | f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of Q4_0 type size ({type_size})") 155 | return (*shape[:-1], shape[-1] // type_size * block_size) 156 | 157 | 158 | GGML_QUANT_SIZES = { 159 | "Q4_0": (32, 2 + 16), 160 | "Q8_0": (32, 2 + 32), 161 | } 162 | 163 | 164 | def dequantize_blocks_Q4_0(data, dtype=torch.float16): 165 | block_size, type_size = GGML_QUANT_SIZES["Q4_0"] 166 | 167 | data = data.to(torch.uint8) 168 | shape = data.shape 169 | 170 | rows = data.reshape( 171 | (-1, data.shape[-1]) 172 | ).view(torch.uint8) 173 | 174 | n_blocks = rows.numel() // type_size 175 | blocks = data.reshape((n_blocks, type_size)) 176 | 177 | n_blocks = blocks.shape[0] 178 | 179 | d, qs = split_block_dims(blocks, 2) 180 | d = d.view(torch.float16) 181 | 182 | qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor( 183 | [0, 4], device=d.device, 
dtype=torch.uint8).reshape((1, 1, 2, 1)) 184 | qs = (qs & 0x0F).reshape((n_blocks, -1)).to(torch.int8) - 8 185 | 186 | out = (d * qs) 187 | 188 | out = out.reshape(quant_shape_from_byte_shape( 189 | shape, 190 | qtype="Q4_0", 191 | )).to(dtype) 192 | return out 193 | 194 | def dequantize_blocks_Q8_0(data, dtype=torch.float16): 195 | block_size, type_size = GGML_QUANT_SIZES["Q8_0"] 196 | 197 | data = data.to(torch.uint8) 198 | shape = data.shape 199 | 200 | rows = data.reshape( 201 | (-1, data.shape[-1]) 202 | ).view(torch.uint8) 203 | 204 | n_blocks = rows.numel() // type_size 205 | blocks = data.reshape((n_blocks, type_size)) 206 | 207 | n_blocks = blocks.shape[0] 208 | 209 | d, qs = split_block_dims(blocks, 2) 210 | d = d.view(torch.float16).to(torch.float32) 211 | 212 | qs = qs.view(torch.int8).to(torch.float32) 213 | 214 | out = (d * qs) 215 | 216 | out = out.reshape(quant_shape_from_byte_shape( 217 | shape, 218 | qtype="Q8_0", 219 | )).to(dtype) 220 | return out 221 | 222 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "comfyui-cogvideoxwrapper" 3 | description = "Diffusers wrapper for CogVideoX -models: https://github.com/THUDM/CogVideo" 4 | version = "1.5.1" 5 | license = {file = "LICENSE"} 6 | dependencies = ["huggingface_hub", "diffusers>=0.31.0", "accelerate>=0.33.0"] 7 | 8 | [project.urls] 9 | Repository = "https://github.com/kijai/ComfyUI-CogVideoXWrapper" 10 | # Used by Comfy Registry https://comfyregistry.org 11 | 12 | [tool.comfy] 13 | PublisherId = "kijai" 14 | DisplayName = "ComfyUI-CogVideoXWrapper" 15 | Icon = "" 16 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # WORK IN PROGRESS 2 | 3 | Spreadsheet (WIP) of supported models and their supported features: https://docs.google.com/spreadsheets/d/16eA6mSL8XkTcu9fSWkPSHfRIqyAKJbR1O99xnuGdCKY/edit?usp=sharing 4 | 5 | ## Update 9 6 | Added preliminary support for [Go-with-the-Flow](https://github.com/VGenAI-Netflix-Eyeline-Research/Go-with-the-Flow) 7 | 8 | This uses LoRA weights available here: https://huggingface.co/Eyeline-Research/Go-with-the-Flow/tree/main 9 | 10 | To create the input videos for the NoiseWarp process, I've added a node to KJNodes that works alongside my SplineEditor, and either [comfyui-inpaint-nodes](https://github.com/Acly/comfyui-inpaint-nodes) or just cv2 inpainting to create the cut and drag input videos. 11 | 12 | The workflows are in the example_workflows -folder. 13 | 14 | Quick video to showcase: First mask the subject, then use the cut and drag -workflow to create a video as seen here, then that video is used as input to the NoiseWarp node in the main workflow. 15 | 16 | https://github.com/user-attachments/assets/112706b0-a38b-4c3c-b779-deba0827af4f 17 | 18 | ## BREAKING Update8 19 | 20 | This is big one, and unfortunately to do the necessary cleanup and refactoring this will break every old workflow as they are. 21 | I apologize for the inconvenience, if I don't do this now I'll keep making it worse until maintaining becomes too much of a chore, so from my pov there was no choice. 
22 | 23 | *Please either use the new workflows or fix the nodes in your old ones before posting issue reports!* 24 | 25 | The old version will be kept in a legacy branch, but not maintained 26 | 27 | - Support CogVideoX 1.5 models 28 | - Major code cleanup (it was bad, still isn't great, wip) 29 | - Merge Fun -model functionality into main pipeline: 30 | - All Fun specific nodes, besides the image encode node for Fun -InP models, are gone 31 | - Main CogVideo Sampler works with Fun models 32 | - DimensionX LoRAs now work with Fun models as well 33 | 34 | - Remove width/height from the sampler widgets and detect them from the input instead, which means text2vid now requires using empty latents 35 | - Separate the VAE from the model, allow using an fp32 VAE 36 | - Add the ability to load some of the non-GGUF models as single files (only a few available for now: https://huggingface.co/Kijai/CogVideoX-comfy) 37 | - Add some torchao quantizations as options 38 | - Add interpolation as an option for the main encode node, the old interpolation-specific node is gone 39 | - torch.compile optimizations 40 | - Remove PAB in favor of FasterCache and cleaner code 41 | - other smaller things I forgot about at this point 42 | 43 | For Fun -model based workflows it's a more drastic change; for others, migrating generally means re-setting many of the nodes. 44 | 45 | ## Update7 46 | 47 | - Refactored the Fun version's sampler to accept any resolution, which should make it a lot simpler to use with Tora. **BREAKS OLD WORKFLOWS**, old FunSampler nodes need to be remade. 48 | - The old bucket resizing is now on its own node (CogVideoXFunResizeToClosestBucket) to keep the functionality; I honestly don't know if it matters at all, but just in case. 49 | - The Fun version's vid2vid is now also in the same node, the old vid2vid node is deprecated. 50 | - Added support for FasterCache, which trades more VRAM use for speed with a slight quality hit, similar to PAB: https://github.com/Vchitect/FasterCache 51 | - Improved torch.compile support, it actually works now 52 | 53 | ## Update6 54 | 55 | Initial support for Tora (https://github.com/alibaba/Tora) 56 | 57 | Converted model (included in the autodownload node): 58 | 59 | https://huggingface.co/Kijai/CogVideoX-5b-Tora/tree/main 60 | 61 | 62 | https://github.com/user-attachments/assets/d5334237-03dc-48f5-8bec-3ae5998660c6 63 | 64 | 65 | ## Update5 66 | This week there have been some bigger updates that will most likely affect some old workflows; the sampler node especially will probably need to be refreshed (re-created) if it errors out!
67 | 68 | New features: 69 | - Initial context windowing with FreeNoise noise shuffling, mainly for vid2vid and pose2vid pipelines for longer generations; haven't figured it out for img2vid yet 70 | - GGUF models and tiled encoding for I2V and pose pipelines (thanks to MinusZoneAI) 71 | - [sageattention](https://github.com/thu-ml/SageAttention) support (Linux only) for a speed boost, I experienced a ~20-30% increase with it, stacks with fp8 fast mode, doesn't need compiling 72 | - Support CogVideoX-Fun 1.1 and its pose models with additional control strength and application step settings; this model's input does NOT have to be just dwpose skeletons, just about anything can work 73 | - Support LoRAs 74 | 75 | https://github.com/user-attachments/assets/ddeb8f38-a647-42b3-a4b1-c6936f961deb 76 | 77 | https://github.com/user-attachments/assets/c78b2832-9571-4941-8c97-fbcc1a4cc23d 78 | 79 | https://github.com/user-attachments/assets/d9ed98b1-f917-432b-a16e-e01e87efb1f9 80 | 81 | 82 | 83 | ## Update4 84 | Initial support for the official I2V version of CogVideoX: https://huggingface.co/THUDM/CogVideoX-5b-I2V 85 | 86 | **Also needs diffusers 0.30.3** 87 | 88 | https://github.com/user-attachments/assets/c672d0af-a676-495d-a42c-7e3dd802b4b0 89 | 90 | 91 | 92 | ## Update3 93 | 94 | Added initial support for CogVideoX-Fun: https://github.com/aigc-apps/CogVideoX-Fun 95 | 96 | Note that while this one can do image2vid, this is NOT the official I2V model yet, though it should also be released very soon. 97 | 98 | https://github.com/user-attachments/assets/68f9ed16-ee53-4955-b931-1799461ac561 99 | 100 | 101 | ## Update2 102 | 103 | Added **experimental** support for onediff; this reduced sampling time by ~40% for me, reaching 4.23 s/it on a 4090 with 49 frames. 104 | This requires Linux, torch 2.4.0, and an onediff and nexfort installation: 105 | 106 | `pip install --pre onediff onediffx` 107 | 108 | `pip install nexfort` 109 | 110 | The first run will take around 5 mins for the compilation. 111 | 112 | ## Update 113 | The 5b model is now also supported for basic text2vid: https://huggingface.co/THUDM/CogVideoX-5b 114 | 115 | It is also autodownloaded to `ComfyUI/models/CogVideo/CogVideoX-5b`; the text encoder is not needed as we use the ComfyUI T5. 116 | 117 | https://github.com/user-attachments/assets/991205cc-826e-4f93-831a-c10441f0f2ce 118 | 119 | Requires diffusers 0.30.1 (this is specified in requirements.txt) 120 | 121 | Uses the same T5 model as SD3 and Flux; fp8 works fine too. Memory requirements depend mostly on the video length. 122 | VAE decoding seems to be the only big step that takes a lot of VRAM when everything is offloaded; it peaks at around 13-14GB momentarily at that stage. 123 | Sampling itself takes only maybe 5-6GB. 124 | 125 | 126 | Hacked in img2img to attempt a vid2vid workflow, it works interestingly with some inputs, highly experimental.
127 | 128 | https://github.com/user-attachments/assets/e6951ef4-ea7a-4752-94f6-cf24f2503d83 129 | 130 | https://github.com/user-attachments/assets/9e41f37b-2bb3-411c-81fa-e91b80da2559 131 | 132 | Also added temporal tiling as means of generating endless videos: 133 | 134 | https://github.com/kijai/ComfyUI-CogVideoXWrapper 135 | 136 | https://github.com/user-attachments/assets/ecdac8b8-d434-48b6-abd6-90755b6b552d 137 | 138 | 139 | 140 | Original repo: 141 | https://github.com/THUDM/CogVideo 142 | 143 | CogVideoX-Fun: 144 | https://github.com/aigc-apps/CogVideoX-Fun 145 | 146 | Controlnet: 147 | https://github.com/TheDenk/cogvideox-controlnet 148 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | huggingface_hub 2 | diffusers>=0.31.0 3 | accelerate>=0.33.0 4 | einops 5 | peft 6 | opencv-python -------------------------------------------------------------------------------- /tora/traj_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from einops import rearrange, reduce 5 | 6 | 7 | def avg_pool_nd(dims, *args, **kwargs): 8 | """ 9 | Create a 1D, 2D, or 3D average pooling module. 10 | """ 11 | if dims == 1: 12 | return nn.AvgPool1d(*args, **kwargs) 13 | elif dims == 2: 14 | return nn.AvgPool2d(*args, **kwargs) 15 | elif dims == 3: 16 | return nn.AvgPool3d(*args, **kwargs) 17 | raise ValueError(f"unsupported dimensions: {dims}") 18 | 19 | 20 | def conv_nd(dims, *args, **kwargs): 21 | """ 22 | Create a 1D, 2D, or 3D convolution module. 23 | """ 24 | if dims == 1: 25 | return nn.Conv1d(*args, **kwargs) 26 | elif dims == 2: 27 | return nn.Conv2d(*args, **kwargs) 28 | elif dims == 3: 29 | return nn.Conv3d(*args, **kwargs) 30 | raise ValueError(f"unsupported dimensions: {dims}") 31 | 32 | 33 | class Downsample(nn.Module): 34 | """ 35 | A downsampling layer with an optional convolution. 36 | :param channels: channels in the inputs and outputs. 37 | :param use_conv: a bool determining if a convolution is applied. 38 | :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then 39 | downsampling occurs in the inner-two dimensions. 
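    Illustrative example (added note, not in the original docstring): with
    channels=64, use_conv=True and dims=2, this layer applies a 3x3 convolution
    with stride 2, roughly halving the spatial resolution while keeping 64 channels.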
40 | """ 41 | 42 | def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1): 43 | super().__init__() 44 | self.channels = channels 45 | self.out_channels = out_channels or channels 46 | self.use_conv = use_conv 47 | self.dims = dims 48 | stride = 2 if dims != 3 else (1, 2, 2) 49 | if use_conv: 50 | self.op = conv_nd( 51 | dims, 52 | self.channels, 53 | self.out_channels, 54 | 3, 55 | stride=stride, 56 | padding=padding, 57 | ) 58 | else: 59 | assert self.channels == self.out_channels 60 | self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) 61 | 62 | def forward(self, x): 63 | assert x.shape[1] == self.channels 64 | return self.op(x) 65 | 66 | 67 | class ResnetBlock(nn.Module): 68 | def __init__(self, in_c, out_c, down, ksize=3, sk=False, use_conv=True): 69 | super().__init__() 70 | ps = ksize // 2 71 | if in_c != out_c or sk == False: 72 | self.in_conv = nn.Conv2d(in_c, out_c, ksize, 1, ps) 73 | else: 74 | # print('n_in') 75 | self.in_conv = None 76 | self.block1 = nn.Conv2d(out_c, out_c, 3, 1, 1) 77 | self.act = nn.ReLU() 78 | self.block2 = nn.Conv2d(out_c, out_c, ksize, 1, ps) 79 | self.bn1 = nn.BatchNorm2d(out_c) 80 | self.bn2 = nn.BatchNorm2d(out_c) 81 | if sk == False: 82 | # self.skep = nn.Conv2d(in_c, out_c, ksize, 1, ps) # edit by zhouxiawang 83 | self.skep = nn.Conv2d(out_c, out_c, ksize, 1, ps) 84 | else: 85 | self.skep = None 86 | 87 | self.down = down 88 | if self.down == True: 89 | self.down_opt = Downsample(in_c, use_conv=use_conv) 90 | 91 | def forward(self, x): 92 | if self.down == True: 93 | x = self.down_opt(x) 94 | if self.in_conv is not None: # edit 95 | x = self.in_conv(x) 96 | 97 | h = self.bn1(x) 98 | h = self.act(h) 99 | h = self.block1(h) 100 | h = self.bn2(h) 101 | h = self.act(h) 102 | h = self.block2(h) 103 | if self.skep is not None: 104 | return h + self.skep(x) 105 | else: 106 | return h + x 107 | 108 | 109 | class VAESpatialEmulator(nn.Module): 110 | def __init__(self, kernel_size=(8, 8)): 111 | super().__init__() 112 | self.kernel_size = kernel_size 113 | 114 | def forward(self, x): 115 | """ 116 | x: torch.Tensor: shape [B C T H W] 117 | """ 118 | Hp, Wp = self.kernel_size 119 | H, W = x.shape[-2], x.shape[-1] 120 | valid_h = H - H % Hp 121 | valid_w = W - W % Wp 122 | x = x[..., :valid_h, :valid_w] 123 | x = rearrange( 124 | x, 125 | "B C T (Nh Hp) (Nw Wp) -> B (Hp Wp C) T Nh Nw", 126 | Hp=Hp, 127 | Wp=Wp, 128 | ) 129 | return x 130 | 131 | 132 | class VAETemporalEmulator(nn.Module): 133 | def __init__(self, micro_frame_size, kernel_size=4): 134 | super().__init__() 135 | self.micro_frame_size = micro_frame_size 136 | self.kernel_size = kernel_size 137 | 138 | def forward(self, x_z): 139 | """ 140 | x_z: torch.Tensor: shape [B C T H W] 141 | """ 142 | 143 | z_list = [] 144 | for i in range(0, x_z.shape[2], self.micro_frame_size): 145 | x_z_bs = x_z[:, :, i : i + self.micro_frame_size] 146 | z_list.append(x_z_bs[:, :, 0:1]) 147 | x_z_bs = x_z_bs[:, :, 1:] 148 | t_valid = x_z_bs.shape[2] - x_z_bs.shape[2] % self.kernel_size 149 | x_z_bs = x_z_bs[:, :, :t_valid] 150 | x_z_bs = reduce(x_z_bs, "B C (T n) H W -> B C T H W", n=self.kernel_size, reduction="mean") 151 | z_list.append(x_z_bs) 152 | z = torch.cat(z_list, dim=2) 153 | return z 154 | 155 | 156 | class TrajExtractor(nn.Module): 157 | def __init__( 158 | self, 159 | vae_downsize=(4, 8, 8), 160 | patch_size=2, 161 | channels=[320, 640, 1280, 1280], 162 | nums_rb=3, 163 | cin=2, 164 | ksize=3, 165 | sk=False, 166 | use_conv=True, 167 | ): 168 | super(TrajExtractor, 
self).__init__() 169 | self.vae_downsize = vae_downsize 170 | # self.vae_spatial_emulator = VAESpatialEmulator(kernel_size=vae_downsize[-2:]) 171 | self.downsize_patchify = nn.PixelUnshuffle(patch_size) 172 | self.patch_size = (1, patch_size, patch_size) 173 | self.channels = channels 174 | self.nums_rb = nums_rb 175 | self.body = [] 176 | for i in range(len(channels)): 177 | for j in range(nums_rb): 178 | if (i != 0) and (j == 0): 179 | self.body.append( 180 | ResnetBlock( 181 | channels[i - 1], 182 | channels[i], 183 | down=False, 184 | ksize=ksize, 185 | sk=sk, 186 | use_conv=use_conv, 187 | ) 188 | ) 189 | else: 190 | self.body.append( 191 | ResnetBlock( 192 | channels[i], 193 | channels[i], 194 | down=False, 195 | ksize=ksize, 196 | sk=sk, 197 | use_conv=use_conv, 198 | ) 199 | ) 200 | self.body = nn.ModuleList(self.body) 201 | cin_ = cin * patch_size**2 202 | self.conv_in = nn.Conv2d(cin_, channels[0], 3, 1, 1) 203 | 204 | # Initialize weights 205 | def conv_init(module): 206 | if isinstance(module, (nn.Conv2d, nn.Conv1d)): 207 | nn.init.kaiming_normal_(module.weight, nonlinearity="relu") 208 | if module.bias is not None: 209 | nn.init.constant_(module.bias, 0) 210 | 211 | self.apply(conv_init) 212 | 213 | def forward(self, x): 214 | """ 215 | x: torch.Tensor: shape [B C T H W] 216 | """ 217 | # downsize 218 | T, H, W = x.shape[-3:] 219 | if W % self.patch_size[2] != 0: 220 | x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2])) 221 | if H % self.patch_size[1] != 0: 222 | x = F.pad(x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1])) 223 | if T % self.patch_size[0] != 0: 224 | x = F.pad( 225 | x, 226 | (0, 0, 0, 0, 0, self.patch_size[0] - T % self.patch_size[0]), 227 | ) 228 | x = rearrange(x, "B C T H W -> (B T) C H W") 229 | x = self.downsize_patchify(x) 230 | 231 | # extract features 232 | features = [] 233 | x = self.conv_in(x) 234 | for i in range(len(self.channels)): 235 | for j in range(self.nums_rb): 236 | idx = i * self.nums_rb + j 237 | x = self.body[idx](x) 238 | features.append(x) 239 | 240 | return features 241 | 242 | 243 | class FloatGroupNorm(nn.GroupNorm): 244 | def forward(self, x): 245 | return super().forward(x.to(self.bias.dtype)).type(x.dtype) 246 | 247 | 248 | def zero_module(module): 249 | """ 250 | Zero out the parameters of a module and return it. 
251 |     """ 252 |     for p in module.parameters(): 253 |         p.detach().zero_() 254 |     return module 255 | 256 | 257 | class MGF(nn.Module): 258 |     def __init__(self, flow_in_channel=128, out_channels=1152): 259 |         super().__init__() 260 |         self.out_channels = out_channels 261 |         self.flow_gamma_spatial = nn.Conv2d(flow_in_channel, self.out_channels // 4, 3, padding=1) 262 |         self.flow_gamma_temporal = zero_module( 263 |             nn.Conv1d( 264 |                 self.out_channels // 4, 265 |                 self.out_channels, 266 |                 kernel_size=3, 267 |                 stride=1, 268 |                 padding=1, 269 |                 padding_mode="replicate", 270 |             ) 271 |         ) 272 |         self.flow_beta_spatial = nn.Conv2d(flow_in_channel, self.out_channels // 4, 3, padding=1) 273 |         self.flow_beta_temporal = zero_module( 274 |             nn.Conv1d( 275 |                 self.out_channels // 4, 276 |                 self.out_channels, 277 |                 kernel_size=3, 278 |                 stride=1, 279 |                 padding=1, 280 |                 padding_mode="replicate", 281 |             ) 282 |         ) 283 |         self.flow_cond_norm = FloatGroupNorm(32, self.out_channels) 284 | 285 |     def forward(self, h, flow, T): 286 |         if flow is not None: 287 |             gamma_flow = self.flow_gamma_spatial(flow) 288 |             beta_flow = self.flow_beta_spatial(flow) 289 |             _, _, hh, wh = beta_flow.shape 290 | 291 |             if gamma_flow.shape[0] == 1: # Check if batch size is 1 292 |                 gamma_flow = rearrange(gamma_flow, "b c h w -> b c (h w)") 293 |                 beta_flow = rearrange(beta_flow, "b c h w -> b c (h w)") 294 |                 gamma_flow = self.flow_gamma_temporal(gamma_flow) 295 |                 beta_flow = self.flow_beta_temporal(beta_flow) 296 |                 gamma_flow = rearrange(gamma_flow, "b c (h w) -> b c h w", h=hh, w=wh) 297 |                 beta_flow = rearrange(beta_flow, "b c (h w) -> b c h w", h=hh, w=wh) 298 |             else: 299 |                 gamma_flow = rearrange(gamma_flow, "(b f) c h w -> (b h w) c f", f=T) 300 |                 beta_flow = rearrange(beta_flow, "(b f) c h w -> (b h w) c f", f=T) 301 |                 gamma_flow = self.flow_gamma_temporal(gamma_flow) 302 |                 beta_flow = self.flow_beta_temporal(beta_flow) 303 |                 gamma_flow = rearrange(gamma_flow, "(b h w) c f -> (b f) c h w", h=hh, w=wh) 304 |                 beta_flow = rearrange(beta_flow, "(b h w) c f -> (b f) c h w", h=hh, w=wh) 305 | 306 |             h = h + self.flow_cond_norm(h) * gamma_flow + beta_flow 307 |         return h 308 | -------------------------------------------------------------------------------- /tora/traj_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import torch 4 | 5 | # Note that the coordinates passed to the model must not exceed 256. 6 | # xy range 256 7 | 8 | def pdf2(sigma_matrix, grid): 9 |     """Calculate PDF of the bivariate Gaussian distribution. 10 |     Args: 11 |         sigma_matrix (ndarray): with the shape (2, 2) 12 |         grid (ndarray): generated by :func:`mesh_grid`, 13 |             with the shape (K, K, 2), K is the kernel size. 14 |     Returns: 15 |         kernel (ndarray): un-normalized kernel. 16 |     """ 17 |     inverse_sigma = np.linalg.inv(sigma_matrix) 18 |     kernel = np.exp(-0.5 * np.sum(np.dot(grid, inverse_sigma) * grid, 2)) 19 |     return kernel 20 | 21 | 22 | def mesh_grid(kernel_size): 23 |     """Generate the mesh grid, centering at zero.
24 |     Args: 25 |         kernel_size (int): 26 |     Returns: 27 |         xy (ndarray): with the shape (kernel_size, kernel_size, 2) 28 |         xx (ndarray): with the shape (kernel_size, kernel_size) 29 |         yy (ndarray): with the shape (kernel_size, kernel_size) 30 |     """ 31 |     ax = np.arange(-kernel_size // 2 + 1.0, kernel_size // 2 + 1.0) 32 |     xx, yy = np.meshgrid(ax, ax) 33 |     xy = np.hstack( 34 |         ( 35 |             xx.reshape((kernel_size * kernel_size, 1)), 36 |             yy.reshape(kernel_size * kernel_size, 1), 37 |         ) 38 |     ).reshape(kernel_size, kernel_size, 2) 39 |     return xy, xx, yy 40 | 41 | 42 | def sigma_matrix2(sig_x, sig_y, theta): 43 |     """Calculate the rotated sigma matrix (two dimensional matrix). 44 |     Args: 45 |         sig_x (float): 46 |         sig_y (float): 47 |         theta (float): Radian measurement. 48 |     Returns: 49 |         ndarray: Rotated sigma matrix. 50 |     """ 51 |     d_matrix = np.array([[sig_x**2, 0], [0, sig_y**2]]) 52 |     u_matrix = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) 53 |     return np.dot(u_matrix, np.dot(d_matrix, u_matrix.T)) 54 | 55 | 56 | def bivariate_Gaussian(kernel_size, sig_x, sig_y, theta, grid=None, isotropic=True): 57 |     """Generate a bivariate isotropic or anisotropic Gaussian kernel. 58 |     In the isotropic mode, only `sig_x` is used. `sig_y` and `theta` are ignored. 59 |     Args: 60 |         kernel_size (int): 61 |         sig_x (float): 62 |         sig_y (float): 63 |         theta (float): Radian measurement. 64 |         grid (ndarray, optional): generated by :func:`mesh_grid`, 65 |             with the shape (K, K, 2), K is the kernel size. Default: None 66 |         isotropic (bool): 67 |     Returns: 68 |         kernel (ndarray): normalized kernel. 69 |     """ 70 |     if grid is None: 71 |         grid, _, _ = mesh_grid(kernel_size) 72 |     if isotropic: 73 |         sigma_matrix = np.array([[sig_x**2, 0], [0, sig_x**2]]) 74 |     else: 75 |         sigma_matrix = sigma_matrix2(sig_x, sig_y, theta) 76 |     kernel = pdf2(sigma_matrix, grid) 77 |     kernel = kernel / np.sum(kernel) 78 |     return kernel 79 | 80 | size = 99 81 | sigma = 10 82 | blur_kernel = bivariate_Gaussian(size, sigma, sigma, 0, grid=None, isotropic=True) 83 | blur_kernel = blur_kernel / blur_kernel[size // 2, size // 2] 84 | 85 | canvas_width, canvas_height = 256, 256 86 | 87 | def get_flow(points, optical_flow, video_len): 88 |     for i in range(video_len - 1): 89 |         p = points[i] 90 |         p1 = points[i + 1] 91 |         optical_flow[i + 1, p[1], p[0], 0] = p1[0] - p[0] 92 |         optical_flow[i + 1, p[1], p[0], 1] = p1[1] - p[1] 93 | 94 |     return optical_flow 95 | 96 | 97 | def process_points(points, frames=49): 98 |     default_points = [[128, 128]] * frames 99 | 100 |     if len(points) < 2: 101 |         return default_points 102 | 103 |     elif len(points) >= frames: 104 |         skip = len(points) // frames 105 |         return points[::skip][: frames - 1] + points[-1:] 106 |     else: 107 |         insert_num = frames - len(points) 108 |         insert_num_dict = {} 109 |         interval = len(points) - 1 110 |         n = insert_num // interval 111 |         m = insert_num % interval 112 |         for i in range(interval): 113 |             insert_num_dict[i] = n 114 |         for i in range(m): 115 |             insert_num_dict[i] += 1 116 | 117 |         res = [] 118 |         for i in range(interval): 119 |             insert_points = [] 120 |             x0, y0 = points[i] 121 |             x1, y1 = points[i + 1] 122 | 123 |             delta_x = x1 - x0 124 |             delta_y = y1 - y0 125 |             for j in range(insert_num_dict[i]): 126 |                 x = x0 + (j + 1) / (insert_num_dict[i] + 1) * delta_x 127 |                 y = y0 + (j + 1) / (insert_num_dict[i] + 1) * delta_y 128 |                 insert_points.append([int(x), int(y)]) 129 | 130 |             res += points[i : i + 1] + insert_points 131 |         res += points[-1:] 132 |         return res 133 | 134 | 135 | def
read_points_from_list(traj_list, video_len=16, reverse=False): 136 |     points = [] 137 |     for point in traj_list: 138 |         if isinstance(point, str): 139 |             x, y = point.strip().split(",") 140 |         else: 141 |             x, y = point[0], point[1] 142 |         points.append((int(x), int(y))) 143 |     if reverse: 144 |         points = points[::-1] 145 | 146 |     if len(points) > video_len: 147 |         skip = len(points) // video_len 148 |         points = points[::skip] 149 |         points = points[:video_len] 150 | 151 |     return points 152 | 153 | 154 | def read_points_from_file(file, video_len=16, reverse=False): 155 |     with open(file, "r") as f: 156 |         lines = f.readlines() 157 |     points = [] 158 |     for line in lines: 159 |         x, y = line.strip().split(",") 160 |         points.append((int(x), int(y))) 161 |     if reverse: 162 |         points = points[::-1] 163 | 164 |     if len(points) > video_len: 165 |         skip = len(points) // video_len 166 |         points = points[::skip] 167 |         points = points[:video_len] 168 | 169 |     return points 170 | 171 | 172 | def process_traj(trajs_list, num_frames, video_size, device="cpu"): 173 |     if trajs_list and trajs_list[0] and (not isinstance(trajs_list[0][0], (list, tuple))): 174 |         tmp = trajs_list 175 |         trajs_list = [tmp] 176 | 177 |     optical_flow = np.zeros((num_frames, video_size[0], video_size[1], 2), dtype=np.float32) 178 |     processed_points = [] 179 |     for traj_list in trajs_list: 180 |         points = read_points_from_list(traj_list, video_len=num_frames) 181 |         xy_range = 256 182 |         h, w = video_size 183 |         points = process_points(points, num_frames) 184 |         points = [[int(w * x / xy_range), int(h * y / xy_range)] for x, y in points] 185 |         optical_flow = get_flow(points, optical_flow, video_len=num_frames) 186 |         processed_points.append(points) 187 | 188 |     print(f"received {len(trajs_list)} trajectory(ies)") 189 | 190 |     for i in range(1, num_frames): 191 |         optical_flow[i] = cv2.filter2D(optical_flow[i], -1, blur_kernel) 192 | 193 |     optical_flow = torch.tensor(optical_flow).to(device) 194 | 195 |     return optical_flow, processed_points 196 | 197 | 198 | def add_provided_traj(traj_name): 199 |     global traj_list 200 |     traj_list = PROVIDED_TRAJS[traj_name] 201 |     traj_str = [f"{traj}" for traj in traj_list] 202 |     return ", ".join(traj_str) 203 | 204 | 205 | def scale_traj_list_to_256(traj_list, canvas_width, canvas_height): 206 |     scale_x = 256 / canvas_width 207 |     scale_y = 256 / canvas_height 208 |     scaled_traj_list = [[int(x * scale_x), int(y * scale_y)] for x, y in traj_list] 209 |     return scaled_traj_list -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | import torch 3 | import logging 4 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 5 | log = logging.getLogger(__name__) 6 | 7 | def check_diffusers_version(): 8 |     try: 9 |         version = importlib.metadata.version('diffusers') 10 |         required_version = '0.31.0' 11 |         if version < required_version: 12 |             raise AssertionError(f"diffusers version {version} is installed, but version {required_version} or higher is required.") 13 |     except importlib.metadata.PackageNotFoundError: 14 |         raise AssertionError("diffusers is not installed.") 15 | 16 | def remove_specific_blocks(model, block_indices_to_remove): 17 |     import torch.nn as nn 18 |     transformer_blocks = model.transformer_blocks 19 |     new_blocks = [block for i, block in enumerate(transformer_blocks) if i not in block_indices_to_remove] 20 |     model.transformer_blocks =
nn.ModuleList(new_blocks) 21 | 22 | return model 23 | 24 | def print_memory(device): 25 | memory = torch.cuda.memory_allocated(device) / 1024**3 26 | max_memory = torch.cuda.max_memory_allocated(device) / 1024**3 27 | max_reserved = torch.cuda.max_memory_reserved(device) / 1024**3 28 | log.info(f"Allocated memory: {memory=:.3f} GB") 29 | log.info(f"Max allocated memory: {max_memory=:.3f} GB") 30 | log.info(f"Max reserved memory: {max_reserved=:.3f} GB") --------------------------------------------------------------------------------
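A minimal usage sketch (not one of the repository files above) showing how the trajectory helpers in tora/traj_utils.py might be driven directly to build the kind of dense optical-flow conditioning tensor used for Tora trajectory control. The import path, frame count, and flow-map size below are assumptions for illustration; the coordinates stay inside the 0-255 canvas range noted at the top of traj_utils.py.

```python
# Illustrative sketch only. Assumptions: the repo root is on sys.path so the
# module is importable as tora.traj_utils; 49 frames and a 480x720 flow map
# are arbitrary example values.
from tora.traj_utils import process_traj, scale_traj_list_to_256

# A trajectory drawn on a 512x512 canvas, rescaled into the 256x256 coordinate
# space the helpers expect (coordinates must not exceed 256).
raw_points = [[50, 50], [150, 120], [260, 200], [380, 260], [500, 330]]
points_256 = scale_traj_list_to_256(raw_points, canvas_width=512, canvas_height=512)

# process_traj interpolates the points to one per frame, writes the per-frame
# displacements into a [num_frames, H, W, 2] flow map, and blurs every frame
# with the module-level 99x99 Gaussian kernel.
optical_flow, processed_points = process_traj(
    [points_256],            # a list of trajectories; a single one here
    num_frames=49,
    video_size=(480, 720),   # (H, W) of the flow map, example values
    device="cpu",
)
print(optical_flow.shape)        # torch.Size([49, 480, 720, 2])
print(len(processed_points[0]))  # 49 interpolated points, one per frame
```

The heavy Gaussian blur is what spreads a handful of single-pixel displacements into a smooth, dense motion field, which is also why blur_kernel is rescaled so its center value is 1 rather than being left sum-normalized.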