├── .gitignore
├── LICENSE
├── README.md
├── cogvideo_controlnet.py
├── cogvideo_transformer.py
├── controlnet_img2vid_pipeline.py
├── controlnet_pipeline.py
├── inference
    ├── cli_demo.py
    ├── controlnet_img2vid.ipynb
    ├── controlnet_inference.ipynb
    └── gradio_web_demo.py
├── requirements.txt
├── resources
    ├── car.mp4
    ├── ship.mp4
    ├── stacked_car.mp4
    └── stacked_ship.mp4
└── training
    ├── accelerate_config_machine_single.yaml
    ├── controlnet_datasets.py
    ├── finetune_single_rank.sh
    └── train_controlnet.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |   Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2024 CogVideo Model Team @ Zhipu AI
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # CogvideoX Controlnet Extension
  2 | 
  3 | https://github.com/user-attachments/assets/d3cd3cc4-de95-453f-bbf7-ccbe1711fc3c
  4 | 
  5 | This repo contains the code for a simple Controlnet module for the CogvideoX model.  
  6 | ### ComfyUI
  7 | <a href="https://github.com/kijai/ComfyUI-CogVideoXWrapper">ComfyUI-CogVideoXWrapper
  8 | </a> supports controlnet pipeline. See an <a href="https://github.com/kijai/ComfyUI-CogVideoXWrapper/blob/main/examples/cogvideox_2b_controlnet_example_01.json">example
  9 | </a> file.  
 10 |   
 11 | ### Models  
 12 | Supported models for 5B:
 13 | - Canny (<a href="https://huggingface.co/TheDenk/cogvideox-5b-controlnet-canny-v1">HF Model Link</a>) 
 14 | - Hed (<a href="https://huggingface.co/TheDenk/cogvideox-5b-controlnet-hed-v1">HF Model Link</a>)  
 15 | 
 16 |   
 17 | Supported models for 2B:
 18 | - Canny (<a href="https://huggingface.co/TheDenk/cogvideox-2b-controlnet-canny-v1">HF Model Link</a>) 
 19 | - Hed (<a href="https://huggingface.co/TheDenk/cogvideox-2b-controlnet-hed-v1">HF Model Link</a>) 
 20 |   
 21 | ### How to
 22 | Clone repo 
 23 | ```bash
 24 | git clone https://github.com/TheDenk/cogvideox-controlnet.git
 25 | cd cogvideox-controlnet
 26 | ```
 27 |   
 28 | Create venv  
 29 | ```bash
 30 | python -m venv venv
 31 | source venv/bin/activate
 32 | ```
 33 |   
 34 | Install requirements
 35 | ```bash
 36 | pip install -r requirements.txt
 37 | ```
 38 |   
 39 | ### Simple examples
 40 | #### Inference with cli
 41 | ```bash
 42 | python -m inference.cli_demo \
 43 |     --video_path "resources/car.mp4" \
 44 |     --prompt "The camera follows behind red car. Car is surrounded by a panoramic view of the vast, azure ocean. Seagulls soar overhead, and in the distance, a lighthouse stands sentinel, its beam cutting through the twilight. The scene captures a perfect blend of adventure and serenity, with the car symbolizing freedom on the open sea." \
 45 |     --controlnet_type "canny" \
 46 |     --base_model_path THUDM/CogVideoX-5b \
 47 |     --controlnet_model_path TheDenk/cogvideox-5b-controlnet-canny-v1
 48 | ```
 49 | 
 50 | #### Inference with Gradio
 51 | ```bash
 52 | python -m inference.gradio_web_demo \
 53 |     --controlnet_type "canny" \
 54 |     --base_model_path THUDM/CogVideoX-5b \
 55 |     --controlnet_model_path TheDenk/cogvideox-5b-controlnet-canny-v1
 56 | ```
 57 | 
 58 | ### Detailed inference
 59 | ```bash
 60 | CUDA_VISIBLE_DEVICES=0 python -m inference.cli_demo \
 61 |     --video_path "resources/car.mp4" \
 62 |     --prompt "The camera follows behind red car. Car is surrounded by a panoramic view of the vast, azure ocean. Seagulls soar overhead, and in the distance, a lighthouse stands sentinel, its beam cutting through the twilight. The scene captures a perfect blend of adventure and serenity, with the car symbolizing freedom on the open sea." \
 63 |     --controlnet_type "canny" \
 64 |     --base_model_path THUDM/CogVideoX-5b \
 65 |     --controlnet_model_path TheDenk/cogvideox-5b-controlnet-canny-v1 \
 66 |     --num_inference_steps 50 \
 67 |     --guidance_scale 6.0 \
 68 |     --controlnet_weights 1.0 \
 69 |     --controlnet_guidance_start 0.0 \
 70 |     --controlnet_guidance_end 0.5 \
 71 |     --output_path "./output.mp4" \
 72 |     --seed 42
 73 | ```
 74 | 
 75 | ## Training
 76 | The 2B model requires 48 GB of VRAM (for example, an A6000) and the 5B model requires 80 GB. The exact requirement depends on the number of transformer blocks, which defaults to 8 (the `controlnet_transformer_num_layers` parameter in the config).
 77 | 
 78 | #### Dataset
 79 | The <a href="https://huggingface.co/datasets/nkp37/OpenVid-1M">OpenVid-1M</a> dataset was taken as the base variant. You can find the CSV files for the dataset <a href="https://huggingface.co/datasets/nkp37/OpenVid-1M/tree/main/data/train">here</a>.
 80 | 
 81 | #### Train script
 82 | To start training, you need to fill in the config files `accelerate_config_machine_single.yaml` and `finetune_single_rank.sh`.  
 83 | In `accelerate_config_machine_single.yaml`, set the parameter `num_processes: 1` to your GPU count.  
 84 | In `finetune_single_rank.sh`:  
 85 | 1. Set `MODEL_PATH` for the base CogVideoX model. Default is THUDM/CogVideoX-2b.  
 86 | 2. Set `CUDA_VISIBLE_DEVICES` (Default is 0).  
 87 | 3. (For OpenVid dataset) Set `video_root_dir` to directory with video files and `csv_path`.  
 88 | 
 89 | Run training
 90 | ```
 91 | cd training
 92 | bash finetune_single_rank.sh
 93 | ```
 94 | 
 95 | ## Acknowledgements
 96 | Original code and models [CogVideoX](https://github.com/THUDM/CogVideo/tree/main).  
 97 | 
 98 | ## Contacts
 99 | <p>Issues should be raised directly in the repository. For professional support and recommendations, please contact <a href="mailto:welcomedenk@gmail.com">welcomedenk@gmail.com</a>.</p>
100 | 


--------------------------------------------------------------------------------
/cogvideo_controlnet.py:
--------------------------------------------------------------------------------
  1 | from typing import Any, Dict, Optional, Tuple, Union
  2 | 
  3 | import torch
  4 | from torch import nn
  5 | from einops import rearrange
  6 | import torch.nn.functional as F
  7 | from diffusers.models.transformers.cogvideox_transformer_3d import Transformer2DModelOutput, CogVideoXBlock
  8 | from diffusers.utils import is_torch_version
  9 | from diffusers.loaders import  PeftAdapterMixin
 10 | from diffusers.utils.torch_utils import maybe_allow_in_graph
 11 | from diffusers.models.embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps, get_3d_sincos_pos_embed
 12 | from diffusers.models.modeling_utils import ModelMixin
 13 | from diffusers.models.attention import Attention, FeedForward
 14 | from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor2_0
 15 | from diffusers.models.normalization import AdaLayerNorm, CogVideoXLayerNormZero, AdaLayerNormZeroSingle
 16 | from diffusers.configuration_utils import ConfigMixin, register_to_config
 17 | 
 18 | 
 19 | class CogVideoXControlnet(ModelMixin, ConfigMixin, PeftAdapterMixin):
 20 |     _supports_gradient_checkpointing = True
 21 |     
 22 |     @register_to_config
 23 |     def __init__(
 24 |         self,
 25 |         num_attention_heads: int = 30,
 26 |         attention_head_dim: int = 64,
 27 |         vae_channels: int = 16,
 28 |         in_channels: int = 3,
 29 |         downscale_coef: int = 8,
 30 |         flip_sin_to_cos: bool = True,
 31 |         freq_shift: int = 0,
 32 |         time_embed_dim: int = 512,
 33 |         num_layers: int = 8,
 34 |         dropout: float = 0.0,
 35 |         attention_bias: bool = True,
 36 |         sample_width: int = 90,
 37 |         sample_height: int = 60,
 38 |         sample_frames: int = 49,
 39 |         patch_size: int = 2,
 40 |         temporal_compression_ratio: int = 4,
 41 |         max_text_seq_length: int = 226,
 42 |         activation_fn: str = "gelu-approximate",
 43 |         timestep_activation_fn: str = "silu",
 44 |         norm_elementwise_affine: bool = True,
 45 |         norm_eps: float = 1e-5,
 46 |         spatial_interpolation_scale: float = 1.875,
 47 |         temporal_interpolation_scale: float = 1.0,
 48 |         use_rotary_positional_embeddings: bool = False,
 49 |         use_learned_positional_embeddings: bool = False,
 50 |         out_proj_dim = None,
 51 |     ):
 52 |         super().__init__()
 53 |         inner_dim = num_attention_heads * attention_head_dim
 54 | 
 55 |         if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
 56 |             raise ValueError(
 57 |                 "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional "
 58 |                 "embeddings. If you're using a custom model and/or believe this should be supported, please open an "
 59 |                 "issue at https://github.com/huggingface/diffusers/issues."
 60 |             )
 61 |             
 62 |         start_channels = in_channels * (downscale_coef ** 2)
 63 |         input_channels = [start_channels, start_channels // 2, start_channels // 4]
 64 |         self.unshuffle = nn.PixelUnshuffle(downscale_coef)
 65 |         
 66 |         self.controlnet_encode_first = nn.Sequential(
 67 |             nn.Conv2d(input_channels[0], input_channels[1], kernel_size=1, stride=1, padding=0),
 68 |             nn.GroupNorm(2, input_channels[1]),
 69 |             nn.ReLU(),
 70 |         )
 71 | 
 72 |         self.controlnet_encode_second = nn.Sequential(
 73 |             nn.Conv2d(input_channels[1], input_channels[2], kernel_size=1, stride=1, padding=0),
 74 |             nn.GroupNorm(2, input_channels[2]),
 75 |             nn.ReLU(),
 76 |         )
 77 |         
 78 |         # 1. Patch embedding
 79 |         self.patch_embed = CogVideoXPatchEmbed(
 80 |             patch_size=patch_size,
 81 |             in_channels=vae_channels + input_channels[2],
 82 |             embed_dim=inner_dim,
 83 |             bias=True,
 84 |             sample_width=sample_width,
 85 |             sample_height=sample_height,
 86 |             sample_frames=sample_frames,
 87 |             temporal_compression_ratio=temporal_compression_ratio,
 88 |             spatial_interpolation_scale=spatial_interpolation_scale,
 89 |             temporal_interpolation_scale=temporal_interpolation_scale,
 90 |             use_positional_embeddings=not use_rotary_positional_embeddings,
 91 |             use_learned_positional_embeddings=use_learned_positional_embeddings,
 92 |         )
 93 |         
 94 |         self.embedding_dropout = nn.Dropout(dropout)
 95 | 
 96 |         # 2. Time embeddings
 97 |         self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
 98 |         self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)
 99 | 
100 |         # 3. Define spatio-temporal transformers blocks
101 |         self.transformer_blocks = nn.ModuleList(
102 |             [
103 |                 CogVideoXBlock(
104 |                     dim=inner_dim,
105 |                     num_attention_heads=num_attention_heads,
106 |                     attention_head_dim=attention_head_dim,
107 |                     time_embed_dim=time_embed_dim,
108 |                     dropout=dropout,
109 |                     activation_fn=activation_fn,
110 |                     attention_bias=attention_bias,
111 |                     norm_elementwise_affine=norm_elementwise_affine,
112 |                     norm_eps=norm_eps,
113 |                 )
114 |                 for _ in range(num_layers)
115 |             ]
116 |         )
117 | 
118 |         self.out_projectors = None
119 |         if out_proj_dim is not None:
120 |             self.out_projectors = nn.ModuleList(
121 |                 [nn.Linear(inner_dim, out_proj_dim) for _ in range(num_layers)]
122 |             )
123 |             
124 |         self.gradient_checkpointing = False
125 |         
126 |     def _set_gradient_checkpointing(self, module, value=False):
127 |         self.gradient_checkpointing = value
128 | 
129 |     def compress_time(self, x, num_frames):
130 |         x = rearrange(x, '(b f) c h w -> b f c h w', f=num_frames)
131 |         batch_size, frames, channels, height, width = x.shape
132 |         x = rearrange(x, 'b f c h w -> (b h w) c f')
133 |         
134 |         if x.shape[-1] % 2 == 1:
135 |             x_first, x_rest = x[..., 0], x[..., 1:]
136 |             if x_rest.shape[-1] > 0:
137 |                 x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2)
138 | 
139 |             x = torch.cat([x_first[..., None], x_rest], dim=-1)
140 |         else:
141 |             x = F.avg_pool1d(x, kernel_size=2, stride=2)
142 |         x = rearrange(x, '(b h w) c f -> (b f) c h w', b=batch_size, h=height, w=width)
143 |         return x
144 |         
145 |     def forward(
146 |         self,
147 |         hidden_states: torch.Tensor,
148 |         encoder_hidden_states: torch.Tensor,
149 |         controlnet_states: torch.Tensor,
150 |         timestep: Union[int, float, torch.LongTensor],
151 |         image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
152 |         timestep_cond: Optional[torch.Tensor] = None,
153 |         return_dict: bool = True,
154 |     ):
155 |         batch_size, num_frames, channels, height, width = controlnet_states.shape
156 |         # 0. Controlnet encoder
157 |         controlnet_states = rearrange(controlnet_states, 'b f c h w -> (b f) c h w')
158 |         controlnet_states = self.unshuffle(controlnet_states)
159 |         controlnet_states = self.controlnet_encode_first(controlnet_states)
160 |         controlnet_states = self.compress_time(controlnet_states, num_frames=num_frames) 
161 |         num_frames = controlnet_states.shape[0] // batch_size
162 | 
163 |         controlnet_states = self.controlnet_encode_second(controlnet_states)
164 |         controlnet_states = self.compress_time(controlnet_states, num_frames=num_frames) 
165 |         controlnet_states = rearrange(controlnet_states, '(b f) c h w -> b f c h w', b=batch_size)
166 | 
167 |         hidden_states = torch.cat([hidden_states, controlnet_states], dim=2)
168 |         # controlnet_states = self.controlnext_encoder(controlnet_states, timestep=timestep)
169 |         # 1. Time embedding
170 |         timesteps = timestep
171 |         t_emb = self.time_proj(timesteps)
172 | 
173 |         # timesteps does not contain any weights and will always return f32 tensors
174 |         # but time_embedding might actually be running in fp16. so we need to cast here.
175 |         # there might be better ways to encapsulate this.
176 |         t_emb = t_emb.to(dtype=hidden_states.dtype)
177 |         emb = self.time_embedding(t_emb, timestep_cond)
178 |         
179 |         hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
180 |         hidden_states = self.embedding_dropout(hidden_states)
181 | 
182 | 
183 |         text_seq_length = encoder_hidden_states.shape[1]
184 |         encoder_hidden_states = hidden_states[:, :text_seq_length]
185 |         hidden_states = hidden_states[:, text_seq_length:]
186 | 
187 |         
188 |         controlnet_hidden_states = ()
189 |         # 3. Transformer blocks
190 |         for i, block in enumerate(self.transformer_blocks):
191 |             if self.training and self.gradient_checkpointing:
192 | 
193 |                 def create_custom_forward(module):
194 |                     def custom_forward(*inputs):
195 |                         return module(*inputs)
196 | 
197 |                     return custom_forward
198 | 
199 |                 ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
200 |                 hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
201 |                     create_custom_forward(block),
202 |                     hidden_states,
203 |                     encoder_hidden_states,
204 |                     emb,
205 |                     image_rotary_emb,
206 |                     **ckpt_kwargs,
207 |                 )
208 |             else:
209 |                 hidden_states, encoder_hidden_states = block(
210 |                     hidden_states=hidden_states,
211 |                     encoder_hidden_states=encoder_hidden_states,
212 |                     temb=emb,
213 |                     image_rotary_emb=image_rotary_emb,
214 |                 )
215 |                 
216 |             if self.out_projectors is not None:
217 |                 controlnet_hidden_states += (self.out_projectors[i](hidden_states),)
218 |             else:
219 |                 controlnet_hidden_states += (hidden_states,)
220 |             
221 |         if not return_dict:
222 |             return (controlnet_hidden_states,)
223 |         return Transformer2DModelOutput(sample=controlnet_hidden_states)


--------------------------------------------------------------------------------
/cogvideo_transformer.py:
--------------------------------------------------------------------------------
  1 | from typing import Any, Dict, Optional, Tuple, Union
  2 | 
  3 | import torch
  4 | import numpy as np
  5 | from diffusers.utils import is_torch_version
  6 | from diffusers.models.transformers.cogvideox_transformer_3d import CogVideoXTransformer3DModel, Transformer2DModelOutput
  7 | 
  8 | 
  9 | class CustomCogVideoXTransformer3DModel(CogVideoXTransformer3DModel):        
 10 |     def forward(
 11 |         self,
 12 |         hidden_states: torch.Tensor,
 13 |         encoder_hidden_states: torch.Tensor,
 14 |         timestep: Union[int, float, torch.LongTensor],
 15 |         start_frame = None,
 16 |         timestep_cond: Optional[torch.Tensor] = None,
 17 |         image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
 18 |         controlnet_states: torch.Tensor = None,
 19 |         controlnet_weights: Optional[Union[float, int, list, np.ndarray, torch.FloatTensor]] = 1.0,
 20 |         return_dict: bool = True,
 21 |     ):
 22 |         batch_size, num_frames, channels, height, width = hidden_states.shape
 23 | 
 24 |         if start_frame is not None:
 25 |             hidden_states = torch.cat([start_frame, hidden_states], dim=2)
 26 |         # 1. Time embedding
 27 |         timesteps = timestep
 28 |         t_emb = self.time_proj(timesteps)
 29 | 
 30 |         # timesteps does not contain any weights and will always return f32 tensors
 31 |         # but time_embedding might actually be running in fp16. so we need to cast here.
 32 |         # there might be better ways to encapsulate this.
 33 |         t_emb = t_emb.to(dtype=hidden_states.dtype)
 34 |         emb = self.time_embedding(t_emb, timestep_cond)
 35 | 
 36 |         # 2. Patch embedding
 37 |         hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
 38 |         hidden_states = self.embedding_dropout(hidden_states)
 39 | 
 40 |         text_seq_length = encoder_hidden_states.shape[1]
 41 |         encoder_hidden_states = hidden_states[:, :text_seq_length]
 42 |         hidden_states = hidden_states[:, text_seq_length:]
 43 | 
 44 |         # 3. Transformer blocks
 45 |         for i, block in enumerate(self.transformer_blocks):
 46 |             if self.training and self.gradient_checkpointing:
 47 | 
 48 |                 def create_custom_forward(module):
 49 |                     def custom_forward(*inputs):
 50 |                         return module(*inputs)
 51 | 
 52 |                     return custom_forward
 53 | 
 54 |                 ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
 55 |                 hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
 56 |                     create_custom_forward(block),
 57 |                     hidden_states,
 58 |                     encoder_hidden_states,
 59 |                     emb,
 60 |                     image_rotary_emb,
 61 |                     **ckpt_kwargs,
 62 |                 )
 63 |             else:
 64 |                 hidden_states, encoder_hidden_states = block(
 65 |                     hidden_states=hidden_states,
 66 |                     encoder_hidden_states=encoder_hidden_states,
 67 |                     temb=emb,
 68 |                     image_rotary_emb=image_rotary_emb,
 69 |                 )
 70 | 
 71 |             if (controlnet_states is not None) and (i < len(controlnet_states)):
 72 |                 controlnet_states_block = controlnet_states[i]
 73 |                 controlnet_block_weight = 1.0
 74 |                 if isinstance(controlnet_weights, (list, np.ndarray)) or torch.is_tensor(controlnet_weights):
 75 |                     controlnet_block_weight = controlnet_weights[i]
 76 |                 elif isinstance(controlnet_weights, (float, int)):
 77 |                     controlnet_block_weight = controlnet_weights
 78 |                 
 79 |                 hidden_states = hidden_states + controlnet_states_block * controlnet_block_weight
 80 | 
 81 |         if not self.config.use_rotary_positional_embeddings:
 82 |             # CogVideoX-2B
 83 |             hidden_states = self.norm_final(hidden_states)
 84 |         else:
 85 |             # CogVideoX-5B
 86 |             hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
 87 |             hidden_states = self.norm_final(hidden_states)
 88 |             hidden_states = hidden_states[:, text_seq_length:]
 89 | 
 90 |         # 4. Final block
 91 |         hidden_states = self.norm_out(hidden_states, temb=emb)
 92 |         hidden_states = self.proj_out(hidden_states)
 93 | 
 94 |         # 5. Unpatchify
 95 |         p = self.config.patch_size
 96 |         p_t = self.config.patch_size_t
 97 | 
 98 |         if p_t is None:
 99 |             output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
100 |             output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
101 |         else:
102 |             output = hidden_states.reshape(
103 |                 batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
104 |             )
105 |             output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
106 | 
107 |         if not return_dict:
108 |             return (output,)
109 |         return Transformer2DModelOutput(sample=output)


--------------------------------------------------------------------------------
/controlnet_img2vid_pipeline.py:
--------------------------------------------------------------------------------
  1 | import inspect
  2 | import math
  3 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union
  4 | 
  5 | import torch
  6 | import numpy as np
  7 | import PIL
  8 | from PIL import Image
  9 | from torchvision import transforms
 10 | from einops import rearrange, repeat
 11 | from transformers import T5EncoderModel, T5Tokenizer
 12 | from diffusers.video_processor import VideoProcessor
 13 | from diffusers.utils.torch_utils import randn_tensor
 14 | from diffusers.models.embeddings import get_3d_rotary_pos_embed
 15 | from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 16 | from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 17 | from diffusers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler, CogVideoXImageToVideoPipeline
 18 | from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
 19 | from diffusers.pipelines.cogvideo.pipeline_cogvideox import CogVideoXPipelineOutput, CogVideoXLoraLoaderMixin
 20 | 
 21 | from cogvideo_controlnet import CogVideoXControlnet
 22 | 
 23 | 
 24 | def resize_for_crop(image, crop_h, crop_w):
 25 |     img_h, img_w = image.shape[-2:]
 26 |     if img_h >= crop_h and img_w >= crop_w:
 27 |         coef = max(crop_h / img_h, crop_w / img_w)
 28 |     elif img_h <= crop_h and img_w <= crop_w:
 29 |         coef = max(crop_h / img_h, crop_w / img_w)
 30 |     else:
 31 |         coef = crop_h / img_h if crop_h > img_h else crop_w / img_w 
 32 |     out_h, out_w = int(img_h * coef), int(img_w * coef)
 33 |     resized_image = transforms.functional.resize(image, (out_h, out_w), antialias=True)
 34 |     return resized_image
 35 | 
 36 | 
 37 | def prepare_frames(input_images, video_size, do_resize=True, do_crop=True):
 38 |     input_images = np.stack([np.array(x) for x in input_images])
 39 |     images_tensor = torch.from_numpy(input_images).permute(0, 3, 1, 2) / 127.5 - 1
 40 |     if do_resize:
 41 |         images_tensor = [resize_for_crop(x, crop_h=video_size[0], crop_w=video_size[1]) for x in images_tensor]
 42 |     if do_crop:
 43 |         images_tensor = [transforms.functional.center_crop(x, video_size) for x in images_tensor]
 44 |     if isinstance(images_tensor, list):
 45 |         images_tensor = torch.stack(images_tensor)
 46 |     return images_tensor.unsqueeze(0) 
 47 | 
 48 | 
 49 | def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
 50 |     tw = tgt_width
 51 |     th = tgt_height
 52 |     h, w = src
 53 |     r = h / w
 54 |     if r > (th / tw):
 55 |         resize_height = th
 56 |         resize_width = int(round(th / h * w))
 57 |     else:
 58 |         resize_width = tw
 59 |         resize_height = int(round(tw / w * h))
 60 | 
 61 |     crop_top = int(round((th - resize_height) / 2.0))
 62 |     crop_left = int(round((tw - resize_width) / 2.0))
 63 | 
 64 |     return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
 65 | 
 66 | 
 67 | # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 68 | def retrieve_timesteps(
 69 |     scheduler,
 70 |     num_inference_steps: Optional[int] = None,
 71 |     device: Optional[Union[str, torch.device]] = None,
 72 |     timesteps: Optional[List[int]] = None,
 73 |     sigmas: Optional[List[float]] = None,
 74 |     **kwargs,
 75 | ):
 76 |     """
 77 |     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
 78 |     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 79 | 
 80 |     Args:
 81 |         scheduler (`SchedulerMixin`):
 82 |             The scheduler to get timesteps from.
 83 |         num_inference_steps (`int`):
 84 |             The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
 85 |             must be `None`.
 86 |         device (`str` or `torch.device`, *optional*):
 87 |             The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
 88 |         timesteps (`List[int]`, *optional*):
 89 |             Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
 90 |             `num_inference_steps` and `sigmas` must be `None`.
 91 |         sigmas (`List[float]`, *optional*):
 92 |             Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
 93 |             `num_inference_steps` and `timesteps` must be `None`.
 94 | 
 95 |     Returns:
 96 |         `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
 97 |         second element is the number of inference steps.
 98 |     """
 99 |     if timesteps is not None and sigmas is not None:
100 |         raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
101 |     if timesteps is not None:
102 |         accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
103 |         if not accepts_timesteps:
104 |             raise ValueError(
105 |                 f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
106 |                 f" timestep schedules. Please check whether you are using the correct scheduler."
107 |             )
108 |         scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
109 |         timesteps = scheduler.timesteps
110 |         num_inference_steps = len(timesteps)
111 |     elif sigmas is not None:
112 |         accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
113 |         if not accept_sigmas:
114 |             raise ValueError(
115 |                 f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
116 |                 f" sigmas schedules. Please check whether you are using the correct scheduler."
117 |             )
118 |         scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
119 |         timesteps = scheduler.timesteps
120 |         num_inference_steps = len(timesteps)
121 |     else:
122 |         scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
123 |         timesteps = scheduler.timesteps
124 |     return timesteps, num_inference_steps
125 |     
126 | 
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """Extract latents from an encoder output object.

    Args:
        encoder_output: Object exposing either ``latent_dist`` or ``latents``.
        generator: Passed to ``latent_dist.sample`` when sampling.
        sample_mode: ``"sample"`` draws from ``latent_dist``; ``"argmax"``
            returns ``latent_dist.mode()``. Any other value skips
            ``latent_dist`` and falls through to ``latents``.

    Returns:
        The sampled, modal, or stored latents.

    Raises:
        AttributeError: If no latents can be obtained from ``encoder_output``.
    """
    if hasattr(encoder_output, "latent_dist"):
        if sample_mode == "sample":
            return encoder_output.latent_dist.sample(generator)
        if sample_mode == "argmax":
            return encoder_output.latent_dist.mode()

    if hasattr(encoder_output, "latents"):
        return encoder_output.latents

    raise AttributeError("Could not access latents of provided encoder_output")
138 | 
139 | 
class CogVideoXImageToVideoControlnetPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
    r"""
    Pipeline for image-to-video generation using CogVideoX, with an additional ControlNet component.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
        text_encoder ([`T5EncoderModel`]):
            Frozen text-encoder. CogVideoX uses
            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
        tokenizer (`T5Tokenizer`):
            Tokenizer of class
            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
        transformer ([`CogVideoXTransformer3DModel`]):
            A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents.
        controlnet ([`CogVideoXControlnet`]):
            ControlNet model registered as a pipeline component next to the transformer.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
    """

    # Empty list: no component is declared optional.
    _optional_components = []
    # Component order string for CPU offloading.
    # NOTE(review): `controlnet` is not listed here - confirm whether that is intended.
    model_cpu_offload_seq = "text_encoder->transformer->vae"

    # Names that `check_inputs` accepts in `callback_on_step_end_tensor_inputs`.
    _callback_tensor_inputs = [
        "latents",
        "prompt_embeds",
        "negative_prompt_embeds",
    ]
171 | 
172 |     def __init__(
173 |         self,
174 |         tokenizer: T5Tokenizer,
175 |         text_encoder: T5EncoderModel,
176 |         vae: AutoencoderKLCogVideoX,
177 |         transformer: CogVideoXTransformer3DModel,
178 |         controlnet: CogVideoXControlnet,
179 |         scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
180 |     ):
181 |         super().__init__()
182 | 
183 |         self.register_modules(
184 |             tokenizer=tokenizer,
185 |             text_encoder=text_encoder,
186 |             vae=vae,
187 |             transformer=transformer,
188 |             controlnet=controlnet,
189 |             scheduler=scheduler,
190 |         )
191 |         self.vae_scale_factor_spatial = (
192 |             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
193 |         )
194 |         self.vae_scale_factor_temporal = (
195 |             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
196 |         )
197 |         self.vae_scaling_factor_image = (
198 |             self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
199 |         )
200 | 
201 |         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
202 | 
203 |     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds
204 |     def _get_t5_prompt_embeds(
205 |         self,
206 |         prompt: Union[str, List[str]] = None,
207 |         num_videos_per_prompt: int = 1,
208 |         max_sequence_length: int = 226,
209 |         device: Optional[torch.device] = None,
210 |         dtype: Optional[torch.dtype] = None,
211 |     ):
212 |         device = device or self._execution_device
213 |         dtype = dtype or self.text_encoder.dtype
214 | 
215 |         prompt = [prompt] if isinstance(prompt, str) else prompt
216 |         batch_size = len(prompt)
217 | 
218 |         text_inputs = self.tokenizer(
219 |             prompt,
220 |             padding="max_length",
221 |             max_length=max_sequence_length,
222 |             truncation=True,
223 |             add_special_tokens=True,
224 |             return_tensors="pt",
225 |         )
226 |         text_input_ids = text_inputs.input_ids
227 |         untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
228 | 
229 |         if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
230 |             removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
231 |             logger.warning(
232 |                 "The following part of your input was truncated because `max_sequence_length` is set to "
233 |                 f" {max_sequence_length} tokens: {removed_text}"
234 |             )
235 | 
236 |         prompt_embeds = self.text_encoder(text_input_ids.to(device))[0]
237 |         prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
238 | 
239 |         # duplicate text embeddings for each generation per prompt, using mps friendly method
240 |         _, seq_len, _ = prompt_embeds.shape
241 |         prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
242 |         prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
243 | 
244 |         return prompt_embeds
245 | 
246 |     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.encode_prompt
247 |     def encode_prompt(
248 |         self,
249 |         prompt: Union[str, List[str]],
250 |         negative_prompt: Optional[Union[str, List[str]]] = None,
251 |         do_classifier_free_guidance: bool = True,
252 |         num_videos_per_prompt: int = 1,
253 |         prompt_embeds: Optional[torch.Tensor] = None,
254 |         negative_prompt_embeds: Optional[torch.Tensor] = None,
255 |         max_sequence_length: int = 226,
256 |         device: Optional[torch.device] = None,
257 |         dtype: Optional[torch.dtype] = None,
258 |     ):
259 |         r"""
260 |         Encodes the prompt into text encoder hidden states.
261 | 
262 |         Args:
263 |             prompt (`str` or `List[str]`, *optional*):
264 |                 prompt to be encoded
265 |             negative_prompt (`str` or `List[str]`, *optional*):
266 |                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
267 |                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
268 |                 less than `1`).
269 |             do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
270 |                 Whether to use classifier free guidance or not.
271 |             num_videos_per_prompt (`int`, *optional*, defaults to 1):
272 |                 Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
273 |             prompt_embeds (`torch.Tensor`, *optional*):
274 |                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
275 |                 provided, text embeddings will be generated from `prompt` input argument.
276 |             negative_prompt_embeds (`torch.Tensor`, *optional*):
277 |                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
278 |                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
279 |                 argument.
280 |             device: (`torch.device`, *optional*):
281 |                 torch device
282 |             dtype: (`torch.dtype`, *optional*):
283 |                 torch dtype
284 |         """
285 |         device = device or self._execution_device
286 | 
287 |         prompt = [prompt] if isinstance(prompt, str) else prompt
288 |         if prompt is not None:
289 |             batch_size = len(prompt)
290 |         else:
291 |             batch_size = prompt_embeds.shape[0]
292 | 
293 |         if prompt_embeds is None:
294 |             prompt_embeds = self._get_t5_prompt_embeds(
295 |                 prompt=prompt,
296 |                 num_videos_per_prompt=num_videos_per_prompt,
297 |                 max_sequence_length=max_sequence_length,
298 |                 device=device,
299 |                 dtype=dtype,
300 |             )
301 | 
302 |         if do_classifier_free_guidance and negative_prompt_embeds is None:
303 |             negative_prompt = negative_prompt or ""
304 |             negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
305 | 
306 |             if prompt is not None and type(prompt) is not type(negative_prompt):
307 |                 raise TypeError(
308 |                     f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
309 |                     f" {type(prompt)}."
310 |                 )
311 |             elif batch_size != len(negative_prompt):
312 |                 raise ValueError(
313 |                     f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
314 |                     f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
315 |                     " the batch size of `prompt`."
316 |                 )
317 | 
318 |             negative_prompt_embeds = self._get_t5_prompt_embeds(
319 |                 prompt=negative_prompt,
320 |                 num_videos_per_prompt=num_videos_per_prompt,
321 |                 max_sequence_length=max_sequence_length,
322 |                 device=device,
323 |                 dtype=dtype,
324 |             )
325 | 
326 |         return prompt_embeds, negative_prompt_embeds
327 | 
328 |     def prepare_latents(
329 |         self,
330 |         image: torch.Tensor,
331 |         batch_size: int = 1,
332 |         num_channels_latents: int = 16,
333 |         num_frames: int = 13,
334 |         height: int = 60,
335 |         width: int = 90,
336 |         dtype: Optional[torch.dtype] = None,
337 |         device: Optional[torch.device] = None,
338 |         generator: Optional[torch.Generator] = None,
339 |         latents: Optional[torch.Tensor] = None,
340 |     ):
341 |         if isinstance(generator, list) and len(generator) != batch_size:
342 |             raise ValueError(
343 |                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
344 |                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
345 |             )
346 | 
347 |         num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
348 |         shape = (
349 |             batch_size,
350 |             num_frames,
351 |             num_channels_latents,
352 |             height // self.vae_scale_factor_spatial,
353 |             width // self.vae_scale_factor_spatial,
354 |         )
355 | 
356 |         # For CogVideoX1.5, the latent should add 1 for padding (Not use)
357 |         if self.transformer.config.patch_size_t is not None:
358 |             shape = shape[:1] + (shape[1] + shape[1] % self.transformer.config.patch_size_t,) + shape[2:]
359 | 
360 |         image = image.unsqueeze(2)  # [B, C, F, H, W]
361 | 
362 |         if isinstance(generator, list):
363 |             image_latents = [
364 |                 retrieve_latents(self.vae.encode(image[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
365 |             ]
366 |         else:
367 |             image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image]
368 | 
369 |         image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
370 | 
371 |         if not self.vae.config.invert_scale_latents:
372 |             image_latents = self.vae_scaling_factor_image * image_latents
373 |         else:
374 |             # This is awkward but required because the CogVideoX team forgot to multiply the
375 |             # scaling factor during training :)
376 |             image_latents = 1 / self.vae_scaling_factor_image * image_latents
377 | 
378 |         padding_shape = (
379 |             batch_size,
380 |             num_frames - 1,
381 |             num_channels_latents,
382 |             height // self.vae_scale_factor_spatial,
383 |             width // self.vae_scale_factor_spatial,
384 |         )
385 | 
386 |         latent_padding = torch.zeros(padding_shape, device=device, dtype=dtype)
387 |         image_latents = torch.cat([image_latents, latent_padding], dim=1)
388 | 
389 |         # Select the first frame along the second dimension
390 |         if self.transformer.config.patch_size_t is not None:
391 |             first_frame = image_latents[:, : image_latents.size(1) % self.transformer.config.patch_size_t, ...]
392 |             image_latents = torch.cat([first_frame, image_latents], dim=1)
393 | 
394 |         if latents is None:
395 |             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
396 |         else:
397 |             latents = latents.to(device)
398 | 
399 |         # scale the initial noise by the standard deviation required by the scheduler
400 |         latents = latents * self.scheduler.init_noise_sigma
401 |         return latents, image_latents
402 | 
403 |     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
404 |     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
405 |         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
406 |         latents = 1 / self.vae_scaling_factor_image * latents
407 | 
408 |         frames = self.vae.decode(latents).sample
409 |         return frames
410 | 
411 |     # Copied from diffusers.pipelines.animatediff.pipeline_animatediff_video2video.AnimateDiffVideoToVideoPipeline.get_timesteps
412 |     def get_timesteps(self, num_inference_steps, timesteps, strength, device):
413 |         # get the original timestep using init_timestep
414 |         init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
415 | 
416 |         t_start = max(num_inference_steps - init_timestep, 0)
417 |         timesteps = timesteps[t_start * self.scheduler.order :]
418 | 
419 |         return timesteps, num_inference_steps - t_start
420 | 
421 |     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
422 |     def prepare_extra_step_kwargs(self, generator, eta):
423 |         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
424 |         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
425 |         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
426 |         # and should be between [0, 1]
427 | 
428 |         accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
429 |         extra_step_kwargs = {}
430 |         if accepts_eta:
431 |             extra_step_kwargs["eta"] = eta
432 | 
433 |         # check if the scheduler accepts generator
434 |         accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
435 |         if accepts_generator:
436 |             extra_step_kwargs["generator"] = generator
437 |         return extra_step_kwargs
438 | 
439 |     def check_inputs(
440 |         self,
441 |         image,
442 |         prompt,
443 |         height,
444 |         width,
445 |         negative_prompt,
446 |         callback_on_step_end_tensor_inputs,
447 |         latents=None,
448 |         prompt_embeds=None,
449 |         negative_prompt_embeds=None,
450 |     ):
451 |         if (
452 |             not isinstance(image, torch.Tensor)
453 |             and not isinstance(image, PIL.Image.Image)
454 |             and not isinstance(image, list)
455 |         ):
456 |             raise ValueError(
457 |                 "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
458 |                 f" {type(image)}"
459 |             )
460 | 
461 |         if height % 8 != 0 or width % 8 != 0:
462 |             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
463 | 
464 |         if callback_on_step_end_tensor_inputs is not None and not all(
465 |             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
466 |         ):
467 |             raise ValueError(
468 |                 f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
469 |             )
470 |         if prompt is not None and prompt_embeds is not None:
471 |             raise ValueError(
472 |                 f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
473 |                 " only forward one of the two."
474 |             )
475 |         elif prompt is None and prompt_embeds is None:
476 |             raise ValueError(
477 |                 "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
478 |             )
479 |         elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
480 |             raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
481 | 
482 |         if prompt is not None and negative_prompt_embeds is not None:
483 |             raise ValueError(
484 |                 f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
485 |                 f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
486 |             )
487 | 
488 |         if negative_prompt is not None and negative_prompt_embeds is not None:
489 |             raise ValueError(
490 |                 f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
491 |                 f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
492 |             )
493 | 
494 |         if prompt_embeds is not None and negative_prompt_embeds is not None:
495 |             if prompt_embeds.shape != negative_prompt_embeds.shape:
496 |                 raise ValueError(
497 |                     "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
498 |                     f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
499 |                     f" {negative_prompt_embeds.shape}."
500 |                 )
501 | 
502 |     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections
    def fuse_qkv_projections(self) -> None:
        r"""Enable fused QKV projections on the transformer.

        Records the request in `self.fusing_transformer` (the flag read back by `unfuse_qkv_projections`) and then
        delegates to `self.transformer.fuse_qkv_projections()`.
        """
        # The flag is set before delegating, so it stays `True` even if the transformer call raises.
        self.fusing_transformer = True
        self.transformer.fuse_qkv_projections()
507 | 
508 |     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.unfuse_qkv_projections
509 |     def unfuse_qkv_projections(self) -> None:
510 |         r"""Disable QKV projection fusion if enabled."""
511 |         if not self.fusing_transformer:
512 |             logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
513 |         else:
514 |             self.transformer.unfuse_qkv_projections()
515 |             self.fusing_transformer = False
516 | 
517 |     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._prepare_rotary_positional_embeddings
518 |     def _prepare_rotary_positional_embeddings(
519 |         self,
520 |         height: int,
521 |         width: int,
522 |         num_frames: int,
523 |         device: torch.device,
524 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
525 |         grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
526 |         grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
527 | 
528 |         p = self.transformer.config.patch_size
529 |         p_t = self.transformer.config.patch_size_t
530 | 
531 |         base_size_width = self.transformer.config.sample_width // p
532 |         base_size_height = self.transformer.config.sample_height // p
533 | 
534 |         if p_t is None:
535 |             # CogVideoX 1.0
536 |             grid_crops_coords = get_resize_crop_region_for_grid(
537 |                 (grid_height, grid_width), base_size_width, base_size_height
538 |             )
539 |             freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
540 |                 embed_dim=self.transformer.config.attention_head_dim,
541 |                 crops_coords=grid_crops_coords,
542 |                 grid_size=(grid_height, grid_width),
543 |                 temporal_size=num_frames,
544 |             )
545 |         else:
546 |             # CogVideoX 1.5
547 |             base_num_frames = (num_frames + p_t - 1) // p_t
548 | 
549 |             freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
550 |                 embed_dim=self.transformer.config.attention_head_dim,
551 |                 crops_coords=None,
552 |                 grid_size=(grid_height, grid_width),
553 |                 temporal_size=base_num_frames,
554 |                 grid_type="slice",
555 |                 max_size=(base_size_height, base_size_width),
556 |             )
557 | 
558 |         freqs_cos = freqs_cos.to(device=device)
559 |         freqs_sin = freqs_sin.to(device=device)
560 |         return freqs_cos, freqs_sin
561 | 
562 |     def prepare_controlnet_frames(self, controlnet_frames, height, width, do_classifier_free_guidance):
563 |         prepared_frames = prepare_frames(controlnet_frames, (height, width))
564 |         controlnet_encoded_frames = prepared_frames.to(dtype=self.vae.dtype, device='cuda')
565 |         controlnet_encoded_frames = torch.cat([controlnet_encoded_frames] * 2) if do_classifier_free_guidance else controlnet_encoded_frames
566 |         return controlnet_encoded_frames.contiguous()
567 |         
    @property
    def guidance_scale(self):
        """Guidance scale currently in effect.

        `__call__` stores its `guidance_scale` argument here and, when `use_dynamic_cfg` is enabled, overwrites it
        at every denoising step.
        """
        return self._guidance_scale
571 | 
    @property
    def num_timesteps(self):
        """Number of scheduler timesteps of the current/last `__call__` (`len(timesteps)`)."""
        return self._num_timesteps
575 | 
    @property
    def attention_kwargs(self):
        """The `attention_kwargs` dictionary that was passed to the current/last `__call__` (may be `None`)."""
        return self._attention_kwargs
579 | 
    @property
    def interrupt(self):
        """Interrupt flag of the denoising loop.

        Reset to `False` at the start of `__call__`; while it is truthy the loop skips the remaining steps.
        """
        return self._interrupt
583 | 
    @torch.no_grad()
    def __call__(
        self,
        image,
        controlnet_frames: List[Image.Image] = None,
        prompt: Optional[Union[str, List[str]]] = None,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_frames: int = 49,
        num_inference_steps: int = 50,
        timesteps: Optional[List[int]] = None,
        guidance_scale: float = 6,
        use_dynamic_cfg: bool = False,
        num_videos_per_prompt: int = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        controlnet_latents: Optional[torch.FloatTensor] = None,
        output_type: str = "pil",
        return_dict: bool = True,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        callback_on_step_end: Optional[
            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        max_sequence_length: int = 226,
        controlnet_weights: Optional[Union[float, list, np.ndarray, torch.FloatTensor]] = 1.0,
        controlnet_guidance_start: float = 0.0,
        controlnet_guidance_end: float = 1.0,
    ) -> Union[CogVideoXPipelineOutput, Tuple]:
        """
        Generate a video from an input image and a prompt, optionally steered by a ControlNet.

        Args:
            image (`PipelineImageInput`):
                The input image to condition the generation on. Must be a `PIL.Image.Image`, a list or a
                `torch.Tensor` (enforced by `check_inputs`).
            controlnet_frames (`List[PIL.Image.Image]`, *optional*):
                Control frames. Only used when `controlnet_latents` is `None`, in which case they are preprocessed
                to `(height, width)` by `prepare_controlnet_frames`. NOTE(review): nothing validates that at least
                one of `controlnet_frames` / `controlnet_latents` is given.
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the video generation. If not defined, one has to pass
                `prompt_embeds` instead.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the video generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
                is not greater than `1`).
            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
                The height in pixels of the generated video. Must be divisible by 8.
            width (`int`, *optional*, defaults to self.transformer.config.sample_width * self.vae_scale_factor_spatial):
                The width in pixels of the generated video. Must be divisible by 8.
            num_frames (`int`, defaults to `49`):
                Number of frames to generate. The number of latent frames is
                `(num_frames - 1) // self.vae_scale_factor_temporal + 1`. A falsy value falls back to
                `self.transformer.config.sample_frames`. For models with a temporal patch size (CogVideoX 1.5) the
                value is increased so that the latent frame count is divisible by `patch_size_t`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to 6):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate videos that are closely linked to the text `prompt`,
                usually at the expense of lower quality.
            use_dynamic_cfg (`bool`, defaults to `False`):
                When `True`, the guidance scale that is actually applied is recomputed at every step from
                `guidance_scale`, `num_inference_steps` and the current timestep using a cosine schedule.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of videos to generate per prompt. NOTE(review): the passed value is currently ignored;
                the method overwrites it with `1`.
            eta (`float`, defaults to `0.0`):
                Forwarded to `prepare_extra_step_kwargs` together with `generator` (presumably the DDIM `eta`;
                confirm against that helper).
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            controlnet_latents (`torch.FloatTensor`, *optional*):
                Already prepared controlnet input. When given, `controlnet_frames` is not used and the tensor is
                passed to the controlnet as is (it is NOT duplicated for classifier-free guidance here).
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video, forwarded to `VideoProcessor.postprocess_video`. With
                `"latent"` the final latents are returned without decoding.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] instead
                of a plain tuple.
            attention_kwargs (`dict`, *optional*):
                Stored on the pipeline and exposed through the `attention_kwargs` property. NOTE(review): it is
                currently not forwarded to the transformer (the corresponding argument is commented out below).
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int`, defaults to `226`):
                Maximum sequence length in encoded prompt. Must be consistent with
                `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.
            controlnet_weights (`float`, `list`, `np.ndarray` or `torch.FloatTensor`, *optional*, defaults to `1.0`):
                Forwarded unchanged to the transformer as `controlnet_weights`.
            controlnet_guidance_start (`float`, defaults to `0.0`):
                Fraction of the sampling process (`step_index / len(timesteps)`) from which the controlnet is run
                (inclusive).
            controlnet_guidance_end (`float`, defaults to `1.0`):
                Fraction of the sampling process at which the controlnet stops being run (exclusive).

        Returns:
            [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] or `tuple`:
            [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is the generated video (or the latents when
            `output_type="latent"`).
        """

        # Callback objects carry their own list of tensor inputs; it takes precedence over the argument.
        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
        num_frames = num_frames or self.transformer.config.sample_frames

        # NOTE(review): this discards whatever the caller passed for `num_videos_per_prompt`.
        num_videos_per_prompt = 1

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            image=image,
            prompt=prompt,
            height=height,
            width=width,
            negative_prompt=negative_prompt,
            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
            latents=latents,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )
        self._guidance_scale = guidance_scale
        self._attention_kwargs = attention_kwargs
        self._interrupt = False

        # 2. Default call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt=prompt,
            negative_prompt=negative_prompt,
            do_classifier_free_guidance=do_classifier_free_guidance,
            num_videos_per_prompt=num_videos_per_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            max_sequence_length=max_sequence_length,
            device=device,
        )
        if do_classifier_free_guidance:
            # Batch layout used everywhere below: [unconditional, conditional].
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)

        # 4. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
        self._num_timesteps = len(timesteps)

        # 5. Prepare latents
        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1

        # For CogVideoX 1.5, the latent frames should be padded to make it divisible by patch_size_t
        patch_size_t = self.transformer.config.patch_size_t
        additional_frames = 0
        if patch_size_t is not None and latent_frames % patch_size_t != 0:
            additional_frames = patch_size_t - latent_frames % patch_size_t
            num_frames += additional_frames * self.vae_scale_factor_temporal

        image = self.video_processor.preprocess(image, height=height, width=width).to(
            device, dtype=prompt_embeds.dtype
        )

        # Half of the transformer input channels are noise latents, the other half the image latents
        # (they are concatenated along dim 2 in the denoising loop).
        latent_channels = self.transformer.config.in_channels // 2
        latents, image_latents = self.prepare_latents(
            image,
            batch_size * num_videos_per_prompt,
            latent_channels,
            num_frames,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 6. Prepare controlnet frames (skipped when the caller supplies `controlnet_latents`)
        if controlnet_latents is None:
            controlnet_latents = self.prepare_controlnet_frames(
                controlnet_frames,
                height, 
                width, 
                do_classifier_free_guidance,
            )

        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 8. Create rotary embeds if required
        image_rotary_emb = (
            self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
            if self.transformer.config.use_rotary_positional_embeddings
            else None
        )

        # 9. Create ofs embeds if required
        # NOTE(review): `ofs_emb` is computed but never passed on (the `ofs=` argument below is commented out).
        ofs_emb = None if self.transformer.config.ofs_embed_dim is None else latents.new_full((1,), fill_value=2.0)

        # 10. Denoising loop
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

        with self.progress_bar(total=num_inference_steps) as progress_bar:
            # for DPM-solver++
            old_pred_original_sample = None
            for i, t in enumerate(timesteps):
                # Once interrupted, the remaining timesteps are skipped without touching the latents.
                if self.interrupt:
                    continue

                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
                latent_image_input = torch.cat([image_latents] * 2) if do_classifier_free_guidance else image_latents
                # Append the image latents to the noisy latents along dim 2.
                latent_model_input = torch.cat([latent_model_input, latent_image_input], dim=2)

                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                timestep = t.expand(latent_model_input.shape[0])

                # Progress through the schedule in [0, 1), used to gate the controlnet.
                current_sampling_percent = i / len(timesteps)

                # Stays an empty list when the controlnet is inactive for this step.
                controlnet_states = []
                if (controlnet_guidance_start <= current_sampling_percent < controlnet_guidance_end):
                    # extract controlnet hidden state
                    # NOTE(review): the controlnet only receives the first 16 entries of dim 2; the 16 is
                    # hard-coded and presumably equals `latent_channels` for the supported checkpoints -- confirm.
                    controlnet_states = self.controlnet(
                        hidden_states=latent_model_input[:, :, :16, :, :],
                        encoder_hidden_states=prompt_embeds,
                        image_rotary_emb=image_rotary_emb,
                        controlnet_states=controlnet_latents,
                        timestep=timestep,
                        return_dict=False,
                    )[0]
                    # Match the transformer dtype, whether one tensor or a sequence of tensors was returned.
                    if isinstance(controlnet_states, (tuple, list)):
                        controlnet_states = [x.to(dtype=self.transformer.dtype) for x in controlnet_states]
                    else:
                        controlnet_states = controlnet_states.to(dtype=self.transformer.dtype)

                # predict noise model_output
                noise_pred = self.transformer(
                    hidden_states=latent_model_input,
                    encoder_hidden_states=prompt_embeds,
                    timestep=timestep,
                    # ofs=ofs_emb,
                    image_rotary_emb=image_rotary_emb,
                    # attention_kwargs=attention_kwargs,
                    controlnet_states=controlnet_states,
                    controlnet_weights=controlnet_weights,
                    return_dict=False,
                )[0]
                noise_pred = noise_pred.float()

                # perform guidance
                if use_dynamic_cfg:
                    # Cosine schedule in the timestep value `t`; replaces the scale read by `self.guidance_scale`.
                    self._guidance_scale = 1 + guidance_scale * (
                        (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
                    )
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                if not isinstance(self.scheduler, CogVideoXDPMScheduler):
                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
                else:
                    # The DPM scheduler additionally takes the previous prediction and the previous timestep.
                    latents, old_pred_original_sample = self.scheduler.step(
                        noise_pred,
                        old_pred_original_sample,
                        t,
                        timesteps[i - 1] if i > 0 else None,
                        latents,
                        **extra_step_kwargs,
                        return_dict=False,
                    )
                latents = latents.to(prompt_embeds.dtype)

                # call the callback, if provided
                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    # The requested tensors are looked up by name among this function's local variables.
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    # The callback may hand back replacements for these tensors.
                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

        if not output_type == "latent":
            # Discard any padding frames that were added for CogVideoX 1.5
            # (drops the first `additional_frames` entries along dim 1).
            latents = latents[:, additional_frames:]
            video = self.decode_latents(latents)
            video = self.video_processor.postprocess_video(video=video, output_type=output_type)
        else:
            video = latents

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (video,)

        return CogVideoXPipelineOutput(frames=video)


--------------------------------------------------------------------------------
/controlnet_pipeline.py:
--------------------------------------------------------------------------------
  1 | import inspect
  2 | import math
  3 | from typing import Callable, Dict, List, Optional, Tuple, Union
  4 | 
  5 | import torch
  6 | import numpy as np
  7 | from PIL import Image
  8 | from torchvision import transforms
  9 | from einops import rearrange, repeat
 10 | from transformers import T5EncoderModel, T5Tokenizer
 11 | from diffusers.video_processor import VideoProcessor
 12 | from diffusers.utils.torch_utils import randn_tensor
 13 | from diffusers.models.embeddings import get_3d_rotary_pos_embed
 14 | from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 15 | from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 16 | from diffusers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
 17 | from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
 18 | from diffusers.pipelines.cogvideo.pipeline_cogvideox import CogVideoXPipelineOutput, CogVideoXLoraLoaderMixin
 19 | 
 20 | from cogvideo_controlnet import CogVideoXControlnet
 21 | 
 22 | 
 23 | def resize_for_crop(image, crop_h, crop_w):
 24 |     img_h, img_w = image.shape[-2:]
 25 |     if img_h >= crop_h and img_w >= crop_w:
 26 |         coef = max(crop_h / img_h, crop_w / img_w)
 27 |     elif img_h <= crop_h and img_w <= crop_w:
 28 |         coef = max(crop_h / img_h, crop_w / img_w)
 29 |     else:
 30 |         coef = crop_h / img_h if crop_h > img_h else crop_w / img_w 
 31 |     out_h, out_w = int(img_h * coef), int(img_w * coef)
 32 |     resized_image = transforms.functional.resize(image, (out_h, out_w), antialias=True)
 33 |     return resized_image
 34 | 
 35 | 
 36 | def prepare_frames(input_images, video_size, do_resize=True, do_crop=True):
 37 |     input_images = np.stack([np.array(x) for x in input_images])
 38 |     images_tensor = torch.from_numpy(input_images).permute(0, 3, 1, 2) / 127.5 - 1
 39 |     if do_resize:
 40 |         images_tensor = [resize_for_crop(x, crop_h=video_size[0], crop_w=video_size[1]) for x in images_tensor]
 41 |     if do_crop:
 42 |         images_tensor = [transforms.functional.center_crop(x, video_size) for x in images_tensor]
 43 |     if isinstance(images_tensor, list):
 44 |         images_tensor = torch.stack(images_tensor)
 45 |     return images_tensor.unsqueeze(0) 
 46 | 
 47 | 
 48 | def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
 49 |     tw = tgt_width
 50 |     th = tgt_height
 51 |     h, w = src
 52 |     r = h / w
 53 |     if r > (th / tw):
 54 |         resize_height = th
 55 |         resize_width = int(round(th / h * w))
 56 |     else:
 57 |         resize_width = tw
 58 |         resize_height = int(round(tw / w * h))
 59 | 
 60 |     crop_top = int(round((th - resize_height) / 2.0))
 61 |     crop_left = int(round((tw - resize_width) / 2.0))
 62 | 
 63 |     return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
 64 | 
 65 | 
 66 | # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 67 | def retrieve_timesteps(
 68 |     scheduler,
 69 |     num_inference_steps: Optional[int] = None,
 70 |     device: Optional[Union[str, torch.device]] = None,
 71 |     timesteps: Optional[List[int]] = None,
 72 |     sigmas: Optional[List[float]] = None,
 73 |     **kwargs,
 74 | ):
 75 |     """
 76 |     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
 77 |     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 78 | 
 79 |     Args:
 80 |         scheduler (`SchedulerMixin`):
 81 |             The scheduler to get timesteps from.
 82 |         num_inference_steps (`int`):
 83 |             The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
 84 |             must be `None`.
 85 |         device (`str` or `torch.device`, *optional*):
 86 |             The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
 87 |         timesteps (`List[int]`, *optional*):
 88 |             Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
 89 |             `num_inference_steps` and `sigmas` must be `None`.
 90 |         sigmas (`List[float]`, *optional*):
 91 |             Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
 92 |             `num_inference_steps` and `timesteps` must be `None`.
 93 | 
 94 |     Returns:
 95 |         `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
 96 |         second element is the number of inference steps.
 97 |     """
 98 |     if timesteps is not None and sigmas is not None:
 99 |         raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
100 |     if timesteps is not None:
101 |         accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
102 |         if not accepts_timesteps:
103 |             raise ValueError(
104 |                 f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
105 |                 f" timestep schedules. Please check whether you are using the correct scheduler."
106 |             )
107 |         scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
108 |         timesteps = scheduler.timesteps
109 |         num_inference_steps = len(timesteps)
110 |     elif sigmas is not None:
111 |         accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
112 |         if not accept_sigmas:
113 |             raise ValueError(
114 |                 f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
115 |                 f" sigmas schedules. Please check whether you are using the correct scheduler."
116 |             )
117 |         scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
118 |         timesteps = scheduler.timesteps
119 |         num_inference_steps = len(timesteps)
120 |     else:
121 |         scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
122 |         timesteps = scheduler.timesteps
123 |     return timesteps, num_inference_steps
124 |     
125 | 
126 | class ControlnetCogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
127 |     _optional_components = []
128 |     model_cpu_offload_seq = "text_encoder->transformer->vae"
129 | 
130 |     _callback_tensor_inputs = [
131 |         "latents",
132 |         "prompt_embeds",
133 |         "negative_prompt_embeds",
134 |     ]
135 |     
136 |     def __init__(
137 |         self,
138 |         tokenizer: T5Tokenizer,
139 |         text_encoder: T5EncoderModel,
140 |         vae: AutoencoderKLCogVideoX,
141 |         transformer: CogVideoXTransformer3DModel,
142 |         scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
143 |         controlnet: CogVideoXControlnet,
144 |     ):
145 |         super().__init__()
146 | 
147 |         self.register_modules(
148 |             tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, controlnet=controlnet, scheduler=scheduler
149 |         )
150 |         self.vae_scale_factor_spatial = (
151 |             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
152 |         )
153 |         self.vae_scale_factor_temporal = (
154 |             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
155 |         )
156 | 
157 |         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
158 |     
159 |     def prepare_controlnet_frames(self, controlnet_frames, height, width, do_classifier_free_guidance):
160 |         prepared_frames = prepare_frames(controlnet_frames, (height, width))
161 |         controlnet_encoded_frames = prepared_frames.to(dtype=self.vae.dtype, device='cuda')
162 |         controlnet_encoded_frames = torch.cat([controlnet_encoded_frames] * 2) if do_classifier_free_guidance else controlnet_encoded_frames
163 |         return controlnet_encoded_frames.contiguous()
164 | 
165 |     def _get_t5_prompt_embeds(
166 |         self,
167 |         prompt: Union[str, List[str]] = None,
168 |         num_videos_per_prompt: int = 1,
169 |         max_sequence_length: int = 226,
170 |         device: Optional[torch.device] = None,
171 |         dtype: Optional[torch.dtype] = None,
172 |     ):
173 |         device = device or self._execution_device
174 |         dtype = dtype or self.text_encoder.dtype
175 | 
176 |         prompt = [prompt] if isinstance(prompt, str) else prompt
177 |         batch_size = len(prompt)
178 | 
179 |         text_inputs = self.tokenizer(
180 |             prompt,
181 |             padding="max_length",
182 |             max_length=max_sequence_length,
183 |             truncation=True,
184 |             add_special_tokens=True,
185 |             return_tensors="pt",
186 |         )
187 |         text_input_ids = text_inputs.input_ids
188 |         untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
189 | 
190 |         if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
191 |             removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
192 |             logger.warning(
193 |                 "The following part of your input was truncated because `max_sequence_length` is set to "
194 |                 f" {max_sequence_length} tokens: {removed_text}"
195 |             )
196 | 
197 |         prompt_embeds = self.text_encoder(text_input_ids.to(device))[0]
198 |         prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
199 | 
200 |         # duplicate text embeddings for each generation per prompt, using mps friendly method
201 |         _, seq_len, _ = prompt_embeds.shape
202 |         prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
203 |         prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
204 | 
205 |         return prompt_embeds
206 | 
207 |     def encode_prompt(
208 |         self,
209 |         prompt: Union[str, List[str]],
210 |         negative_prompt: Optional[Union[str, List[str]]] = None,
211 |         do_classifier_free_guidance: bool = True,
212 |         num_videos_per_prompt: int = 1,
213 |         prompt_embeds: Optional[torch.Tensor] = None,
214 |         negative_prompt_embeds: Optional[torch.Tensor] = None,
215 |         max_sequence_length: int = 226,
216 |         device: Optional[torch.device] = None,
217 |         dtype: Optional[torch.dtype] = None,
218 |     ):
219 |         r"""
220 |         Encodes the prompt into text encoder hidden states.
221 | 
222 |         Args:
223 |             prompt (`str` or `List[str]`, *optional*):
224 |                 prompt to be encoded
225 |             negative_prompt (`str` or `List[str]`, *optional*):
226 |                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
227 |                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
228 |                 less than `1`).
229 |             do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
230 |                 Whether to use classifier free guidance or not.
231 |             num_videos_per_prompt (`int`, *optional*, defaults to 1):
232 |                 Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
233 |             prompt_embeds (`torch.Tensor`, *optional*):
234 |                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
235 |                 provided, text embeddings will be generated from `prompt` input argument.
236 |             negative_prompt_embeds (`torch.Tensor`, *optional*):
237 |                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
238 |                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
239 |                 argument.
240 |             device: (`torch.device`, *optional*):
241 |                 torch device
242 |             dtype: (`torch.dtype`, *optional*):
243 |                 torch dtype
244 |         """
245 |         device = device or self._execution_device
246 | 
247 |         prompt = [prompt] if isinstance(prompt, str) else prompt
248 |         if prompt is not None:
249 |             batch_size = len(prompt)
250 |         else:
251 |             batch_size = prompt_embeds.shape[0]
252 | 
253 |         if prompt_embeds is None:
254 |             prompt_embeds = self._get_t5_prompt_embeds(
255 |                 prompt=prompt,
256 |                 num_videos_per_prompt=num_videos_per_prompt,
257 |                 max_sequence_length=max_sequence_length,
258 |                 device=device,
259 |                 dtype=dtype,
260 |             )
261 | 
262 |         if do_classifier_free_guidance and negative_prompt_embeds is None:
263 |             negative_prompt = negative_prompt or ""
264 |             negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
265 | 
266 |             if prompt is not None and type(prompt) is not type(negative_prompt):
267 |                 raise TypeError(
268 |                     f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
269 |                     f" {type(prompt)}."
270 |                 )
271 |             elif batch_size != len(negative_prompt):
272 |                 raise ValueError(
273 |                     f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
274 |                     f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
275 |                     " the batch size of `prompt`."
276 |                 )
277 | 
278 |             negative_prompt_embeds = self._get_t5_prompt_embeds(
279 |                 prompt=negative_prompt,
280 |                 num_videos_per_prompt=num_videos_per_prompt,
281 |                 max_sequence_length=max_sequence_length,
282 |                 device=device,
283 |                 dtype=dtype,
284 |             )
285 | 
286 |         return prompt_embeds, negative_prompt_embeds
287 | 
288 |     def prepare_latents(
289 |         self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
290 |     ):
291 |         shape = (
292 |             batch_size,
293 |             (num_frames - 1) // self.vae_scale_factor_temporal + 1,
294 |             num_channels_latents,
295 |             height // self.vae_scale_factor_spatial,
296 |             width // self.vae_scale_factor_spatial,
297 |         )
298 |         if isinstance(generator, list) and len(generator) != batch_size:
299 |             raise ValueError(
300 |                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
301 |                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
302 |             )
303 | 
304 |         if latents is None:
305 |             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
306 |         else:
307 |             latents = latents.to(device)
308 | 
309 |         # scale the initial noise by the standard deviation required by the scheduler
310 |         latents = latents * self.scheduler.init_noise_sigma
311 |         return latents
312 | 
313 |     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
314 |         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
315 |         latents = 1 / self.vae.config.scaling_factor * latents
316 | 
317 |         frames = self.vae.decode(latents).sample
318 |         return frames
319 | 
320 |     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
321 |     def prepare_extra_step_kwargs(self, generator, eta):
322 |         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
323 |         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
324 |         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
325 |         # and should be between [0, 1]
326 | 
327 |         accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
328 |         extra_step_kwargs = {}
329 |         if accepts_eta:
330 |             extra_step_kwargs["eta"] = eta
331 | 
332 |         # check if the scheduler accepts generator
333 |         accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
334 |         if accepts_generator:
335 |             extra_step_kwargs["generator"] = generator
336 |         return extra_step_kwargs
337 | 
338 |     # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs
339 |     def check_inputs(
340 |         self,
341 |         prompt,
342 |         height,
343 |         width,
344 |         negative_prompt,
345 |         callback_on_step_end_tensor_inputs,
346 |         controlnet_frames=None,
347 |         controlnet_latents=None,
348 |         prompt_embeds=None,
349 |         negative_prompt_embeds=None,
350 |     ):
351 |         if height % 8 != 0 or width % 8 != 0:
352 |             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
353 | 
354 |         if callback_on_step_end_tensor_inputs is not None and not all(
355 |             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
356 |         ):
357 |             raise ValueError(
358 |                 f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
359 |             )
360 |         if prompt is not None and prompt_embeds is not None:
361 |             raise ValueError(
362 |                 f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
363 |                 " only forward one of the two."
364 |             )
365 |         elif prompt is None and prompt_embeds is None:
366 |             raise ValueError(
367 |                 "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
368 |             )
369 |         elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
370 |             raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
371 | 
372 |         if prompt is not None and negative_prompt_embeds is not None:
373 |             raise ValueError(
374 |                 f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
375 |                 f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
376 |             )
377 | 
378 |         if negative_prompt is not None and negative_prompt_embeds is not None:
379 |             raise ValueError(
380 |                 f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
381 |                 f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
382 |             )
383 | 
384 |         if prompt_embeds is not None and negative_prompt_embeds is not None:
385 |             if prompt_embeds.shape != negative_prompt_embeds.shape:
386 |                 raise ValueError(
387 |                     "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
388 |                     f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
389 |                     f" {negative_prompt_embeds.shape}."
390 |                 )
391 |         if controlnet_frames is not None and controlnet_latents is not None:
392 |             raise ValueError("Only one of `controlnet_frames` or `controlnet_latents` should be provided")
393 |             
    def fuse_qkv_projections(self) -> None:
        r"""Enable fused QKV projections on the transformer.

        Sets `self.fusing_transformer` to `True` (the flag later read by
        `unfuse_qkv_projections`) and then delegates to
        `self.transformer.fuse_qkv_projections()`.
        """
        self.fusing_transformer = True
        self.transformer.fuse_qkv_projections()
398 | 
399 |     def unfuse_qkv_projections(self) -> None:
400 |         r"""Disable QKV projection fusion if enabled."""
401 |         if not self.fusing_transformer:
402 |             logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
403 |         else:
404 |             self.transformer.unfuse_qkv_projections()
405 |             self.fusing_transformer = False
406 | 
407 |     def _prepare_rotary_positional_embeddings(
408 |         self,
409 |         height: int,
410 |         width: int,
411 |         num_frames: int,
412 |         device: torch.device,
413 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
414 |         grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
415 |         grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
416 |         base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
417 |         base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
418 | 
419 |         grid_crops_coords = get_resize_crop_region_for_grid(
420 |             (grid_height, grid_width), base_size_width, base_size_height
421 |         )
422 |         freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
423 |             embed_dim=self.transformer.config.attention_head_dim,
424 |             crops_coords=grid_crops_coords,
425 |             grid_size=(grid_height, grid_width),
426 |             temporal_size=num_frames,
427 |         )
428 | 
429 |         freqs_cos = freqs_cos.to(device=device)
430 |         freqs_sin = freqs_sin.to(device=device)
431 |         return freqs_cos, freqs_sin
432 | 
    @property
    def guidance_scale(self):
        """Guidance scale stored by `__call__`; rewritten on every step when `use_dynamic_cfg` is set."""
        return self._guidance_scale
436 | 
    @property
    def num_timesteps(self):
        """Number of scheduler timesteps recorded by the most recent `__call__`."""
        return self._num_timesteps
440 | 
441 |     @property
442 |     def attention_kwargs(self):
443 |         return self._attention_kwargs
444 | 
    @property
    def interrupt(self):
        """Flag checked at the top of every denoising iteration; `__call__` resets it to `False` on entry."""
        return self._interrupt
448 |         
449 |     @torch.no_grad()
450 |     def __call__(
451 |         self,
452 |         controlnet_frames: List[Image.Image] = None,
453 |         prompt: Optional[Union[str, List[str]]] = None,
454 |         negative_prompt: Optional[Union[str, List[str]]] = None,
455 |         height: int = 480,
456 |         width: int = 720,
457 |         num_frames: int = 49,
458 |         num_inference_steps: int = 50,
459 |         timesteps: Optional[List[int]] = None,
460 |         guidance_scale: float = 6,
461 |         use_dynamic_cfg: bool = False,
462 |         num_videos_per_prompt: int = 1,
463 |         eta: float = 0.0,
464 |         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
465 |         latents: Optional[torch.FloatTensor] = None,
466 |         controlnet_latents: Optional[torch.FloatTensor] = None,
467 |         prompt_embeds: Optional[torch.FloatTensor] = None,
468 |         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
469 |         output_type: str = "pil",
470 |         return_dict: bool = True,
471 |         callback_on_step_end: Optional[
472 |             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
473 |         ] = None,
474 |         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
475 |         max_sequence_length: int = 226,
476 |         controlnet_weights: Optional[Union[float, list, np.ndarray, torch.FloatTensor]] = 1.0,
477 |         controlnet_guidance_start: float = 0.0,
478 |         controlnet_guidance_end: float = 1.0,
479 |     ) -> Union[CogVideoXPipelineOutput, Tuple]:
480 |         if num_frames > 49:
481 |             raise ValueError(
482 |                 "The number of frames must be less than 49 for now due to static positional embeddings. This will be updated in the future to remove this limitation."
483 |             )
484 | 
485 |         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
486 |             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
487 | 
488 |         height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
489 |         width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
490 |         num_videos_per_prompt = 1
491 | 
492 |         # 1. Check inputs. Raise error if not correct
493 |         self.check_inputs(
494 |             prompt,
495 |             height,
496 |             width,
497 |             negative_prompt,
498 |             callback_on_step_end_tensor_inputs,
499 |             controlnet_frames,
500 |             controlnet_latents,
501 |             prompt_embeds,
502 |             negative_prompt_embeds,
503 |         )
504 |         self._guidance_scale = guidance_scale
505 |         self._interrupt = False
506 | 
507 |         # 2. Default call parameters
508 |         if prompt is not None and isinstance(prompt, str):
509 |             batch_size = 1
510 |         elif prompt is not None and isinstance(prompt, list):
511 |             batch_size = len(prompt)
512 |         else:
513 |             batch_size = prompt_embeds.shape[0]
514 | 
515 |         device = self._execution_device
516 | 
517 |         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
518 |         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
519 |         # corresponds to doing no classifier free guidance.
520 |         do_classifier_free_guidance = guidance_scale > 1.0
521 | 
522 |         # 3. Encode input prompt
523 |         prompt_embeds, negative_prompt_embeds = self.encode_prompt(
524 |             prompt,
525 |             negative_prompt,
526 |             do_classifier_free_guidance,
527 |             num_videos_per_prompt=num_videos_per_prompt,
528 |             prompt_embeds=prompt_embeds,
529 |             negative_prompt_embeds=negative_prompt_embeds,
530 |             max_sequence_length=max_sequence_length,
531 |             device=device,
532 |         )
533 |         if do_classifier_free_guidance:
534 |             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
535 | 
536 |         # 4. Prepare timesteps
537 |         timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
538 |         self._num_timesteps = len(timesteps)
539 |         
540 |         # 5. Prepare latents.
541 |         latent_channels = 16 #self.transformer.config.in_channels
542 |         latents = self.prepare_latents(
543 |             batch_size * num_videos_per_prompt,
544 |             latent_channels,
545 |             num_frames,
546 |             height,
547 |             width,
548 |             prompt_embeds.dtype,
549 |             device,
550 |             generator,
551 |             latents,
552 |         )
553 | 
554 |         # 6. Encode controlnet frames
555 |         if controlnet_latents is None:
556 |             duplicate_frames_count = num_frames - len(controlnet_frames)
557 |             if duplicate_frames_count > 0:
558 |                 # Simple duplicate first frame
559 |                 # controlnet_frames = [controlnet_frames[0]] * duplicate_frames_count + controlnet_frames
560 |                 # Or reversed duplicate frames ?
561 |                 reversed_controlnet_frames = list(reversed(controlnet_frames))
562 |                 controlnet_sum_frames = controlnet_frames + reversed_controlnet_frames
563 |                 reversed_chunks_count = num_frames // len(controlnet_sum_frames)
564 |                 controlnet_frames = [*controlnet_sum_frames]
565 |                 for _ in range(reversed_chunks_count):
566 |                     controlnet_frames += controlnet_sum_frames
567 | 
568 |             # If controlnet frames count greater than num_frames parameter
569 |             controlnet_frames = controlnet_frames[:num_frames]
570 |             
571 |             controlnet_latents = self.prepare_controlnet_frames(
572 |                 controlnet_frames,
573 |                 height, 
574 |                 width, 
575 |                 do_classifier_free_guidance,
576 |             )
577 |         
578 |         # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
579 |         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
580 | 
581 |         # 8. Create rotary embeds if required
582 |         image_rotary_emb = (
583 |             self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
584 |             if self.transformer.config.use_rotary_positional_embeddings
585 |             else None
586 |         )
587 | 
588 |         # 9. Denoising loop
589 |         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
590 |         
591 |         with self.progress_bar(total=num_inference_steps) as progress_bar:
592 |             # for DPM-solver++
593 |             old_pred_original_sample = None
594 |             for i, t in enumerate(timesteps):
595 |                 if self.interrupt:
596 |                     continue
597 |                 
598 |                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
599 |                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
600 |                 
601 |                 timestep = t.expand(latent_model_input.shape[0])
602 | 
603 |                 current_sampling_percent = i / len(timesteps)
604 | 
605 |                 latent_model_input = latent_model_input.to(dtype=self.transformer.dtype)
606 |                 prompt_embeds = prompt_embeds.to(dtype=self.transformer.dtype)
607 |                 
608 |                 controlnet_states = None
609 |                 if (controlnet_guidance_start <= current_sampling_percent <= controlnet_guidance_end):
610 |                     # extract controlnet hidden state
611 |                     controlnet_states = self.controlnet(
612 |                         hidden_states=latent_model_input,
613 |                         encoder_hidden_states=prompt_embeds,
614 |                         image_rotary_emb=image_rotary_emb,
615 |                         controlnet_states=controlnet_latents,
616 |                         timestep=timestep,
617 |                         return_dict=False,
618 |                     )[0]
619 |                     if isinstance(controlnet_states, (tuple, list)):
620 |                         controlnet_states = [x.to(dtype=self.transformer.dtype) for x in controlnet_states]
621 |                     else:
622 |                         controlnet_states = controlnet_states.to(dtype=self.transformer.dtype)
623 | 
624 |                 # predict noise model_output
625 |                 noise_pred = self.transformer(
626 |                     hidden_states=latent_model_input,
627 |                     encoder_hidden_states=prompt_embeds,
628 |                     timestep=timestep,
629 |                     image_rotary_emb=image_rotary_emb,
630 |                     controlnet_states=controlnet_states,
631 |                     controlnet_weights=controlnet_weights,
632 |                     return_dict=False,
633 |                 )[0]
634 |                 noise_pred = noise_pred.float()
635 | 
636 |                 # perform guidance
637 |                 if use_dynamic_cfg:
638 |                     self._guidance_scale = 1 + guidance_scale * (
639 |                         (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
640 |                     )
641 |                 if do_classifier_free_guidance:
642 |                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
643 |                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
644 | 
645 |                 # compute the previous noisy sample x_t -> x_t-1
646 |                 if not isinstance(self.scheduler, CogVideoXDPMScheduler):
647 |                     latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
648 |                 else:
649 |                     latents, old_pred_original_sample = self.scheduler.step(
650 |                         noise_pred,
651 |                         old_pred_original_sample,
652 |                         t,
653 |                         timesteps[i - 1] if i > 0 else None,
654 |                         latents,
655 |                         **extra_step_kwargs,
656 |                         return_dict=False,
657 |                     )
658 |                 latents = latents.to(prompt_embeds.dtype)
659 | 
660 |                 # call the callback, if provided
661 |                 if callback_on_step_end is not None:
662 |                     callback_kwargs = {}
663 |                     for k in callback_on_step_end_tensor_inputs:
664 |                         callback_kwargs[k] = locals()[k]
665 |                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
666 | 
667 |                     latents = callback_outputs.pop("latents", latents)
668 |                     prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
669 |                     negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
670 | 
671 |                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
672 |                     progress_bar.update()
673 | 
674 |         if not output_type == "latent":
675 |             video = self.decode_latents(latents)
676 |             video = self.video_processor.postprocess_video(video=video, output_type=output_type)
677 |         else:
678 |             video = latents
679 | 
680 |         # Offload all models
681 |         self.maybe_free_model_hooks()
682 | 
683 |         if not return_dict:
684 |             return (video,)
685 | 
686 |         return CogVideoXPipelineOutput(frames=video)
687 | 


--------------------------------------------------------------------------------
/inference/cli_demo.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Running the Script:
  3 | To run the script, use the following command with appropriate arguments:
  4 | 
  5 | ```bash
  6 | $ python inference/cli_demo.py \
  7 | --video_path "test_video/car.mp4" \
  8 | --prompt "the car is driving on a mountain road" \
  9 | --controlnet_type "hed" \
 10 | --model_path THUDM/CogVideoX-5b \
 11 | --controlnet_path TheDenk/cogvideox-5b-controlnet-hed-v1
 12 | ```
 13 | 
 14 | Additional options are available to specify the guidance scale, number of inference steps, video generation type, and output paths.
 15 | """
 16 | import sys
 17 | sys.path.append('..')
 18 | import argparse
 19 | 
 20 | import torch
 21 | from transformers import T5EncoderModel, T5Tokenizer
 22 | from diffusers import (
 23 |     CogVideoXDDIMScheduler,
 24 |     CogVideoXDPMScheduler,
 25 |     AutoencoderKLCogVideoX
 26 | )
 27 | from diffusers.utils import export_to_video, load_video
 28 | from controlnet_aux import HEDdetector, CannyDetector
 29 | 
 30 | from controlnet_pipeline import ControlnetCogVideoXPipeline
 31 | from cogvideo_transformer import CustomCogVideoXTransformer3DModel
 32 | from cogvideo_controlnet import CogVideoXControlnet
 33 | 
 34 | 
 35 | def init_controlnet_processor(controlnet_type):
 36 |     if controlnet_type in ['canny', 'lineart']:
 37 |         return controlnet_mapping[controlnet_type]()
 38 |     return controlnet_mapping[controlnet_type].from_pretrained('lllyasviel/Annotators').to(device='cuda')
 39 | 
 40 | 
 41 | controlnet_mapping = {
 42 |     'hed': HEDdetector,
 43 |     'canny': CannyDetector,
 44 | }
 45 | 
 46 | 
 47 | @torch.no_grad()
 48 | def generate_video(
 49 |     prompt: str,
 50 |     video_path: str,
 51 |     base_model_path: str,
 52 |     controlnet_model_path: str,
 53 |     controlnet_type: str,
 54 |     controlnet_weights: float = 1.0,
 55 |     controlnet_guidance_start: float = 0.0,
 56 |     controlnet_guidance_end: float = 1.0,
 57 |     lora_path: str = None,
 58 |     lora_rank: int = 128,
 59 |     output_path: str = "./output.mp4",
 60 |     num_inference_steps: int = 50,
 61 |     guidance_scale: float = 6.0,
 62 |     num_videos_per_prompt: int = 1,
 63 |     dtype: torch.dtype = torch.bfloat16,
 64 |     seed: int = 42,
 65 | ):
 66 |     """
 67 |     Generates a video based on the given prompt and saves it to the specified path.
 68 | 
 69 |     Parameters:
 70 |     - prompt (str): The description of the video to be generated.
 71 |     - video_path (str): The video for controlnet processing.
 72 |     - base_model_path (str): The path of the pre-trained model to be used.
 73 |     - controlnet_model_path (str): The path of the pre-trained conrolnet model to be used.
 74 |     - controlnet_type (str): Type of controlnet model (e.g. canny, hed).
 75 |     - controlnet_weights (float): Strenght of controlnet
 76 |     - controlnet_guidance_start (float): The stage when the controlnet starts to be applied
 77 |     - controlnet_guidance_end (float): The stage when the controlnet end to be applied
 78 |     - lora_path (str): The path of the LoRA weights to be used.
 79 |     - lora_rank (int): The rank of the LoRA weights.
 80 |     - output_path (str): The path where the generated video will be saved.
 81 |     - num_inference_steps (int): Number of steps for the inference process. More steps can result in better quality.
 82 |     - guidance_scale (float): The scale for classifier-free guidance. Higher values can lead to better alignment with the prompt.
 83 |     - num_videos_per_prompt (int): Number of videos to generate per prompt.
 84 |     - dtype (torch.dtype): The data type for computation (default is torch.bfloat16).
 85 |     - seed (int): The seed for reproducibility.
 86 |     """
 87 | 
 88 |     # 0. Load selected controlnet processor
 89 |     controlnet_processor = init_controlnet_processor(controlnet_type)
 90 |     # 1.  Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16).
 91 |     tokenizer = T5Tokenizer.from_pretrained(
 92 |         base_model_path, subfolder="tokenizer"
 93 |     )
 94 |     text_encoder = T5EncoderModel.from_pretrained(
 95 |         base_model_path, subfolder="text_encoder"
 96 |     )
 97 |     transformer = CustomCogVideoXTransformer3DModel.from_pretrained(
 98 |         base_model_path, subfolder="transformer"
 99 |     )
100 |     vae = AutoencoderKLCogVideoX.from_pretrained(
101 |         base_model_path, subfolder="vae"
102 |     )
103 |     scheduler = CogVideoXDDIMScheduler.from_pretrained(
104 |         base_model_path, subfolder="scheduler"
105 |     )
106 |     controlnet = CogVideoXControlnet.from_pretrained(
107 |         controlnet_model_path
108 |     )
109 | 
110 |     pipe = ControlnetCogVideoXPipeline(
111 |         tokenizer=tokenizer,
112 |         text_encoder=text_encoder,
113 |         transformer=transformer,
114 |         vae=vae,
115 |         controlnet=controlnet,
116 |         scheduler=scheduler,
117 |     )
118 |     video = load_video(video_path)[:49]
119 |     controlnet_frames = [controlnet_processor(x) for x in video]
120 |     # If you're using with lora, add this code
121 |     if lora_path:
122 |         pipe.load_lora_weights(lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test_1")
123 |         pipe.fuse_lora(lora_scale=1 / lora_rank)
124 | 
125 |     # 2. Set Scheduler.
126 |     # Can be changed to `CogVideoXDPMScheduler` or `CogVideoXDDIMScheduler`.
127 |     # We recommend using `CogVideoXDDIMScheduler` for CogVideoX-2B.
128 |     # using `CogVideoXDPMScheduler` for CogVideoX-5B / CogVideoX-5B-I2V.
129 | 
130 |     # pipe.scheduler = CogVideoXDDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
131 |     pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
132 | 
133 |     # 3. Enable CPU offload for the model.
134 |     # turn off if you have multiple GPUs or enough GPU memory(such as H100) and it will cost less time in inference
135 |     # and enable to("cuda")
136 | 
137 |     # pipe.to("cuda")
138 |     pipe = pipe.to(dtype=dtype)
139 |     pipe.enable_sequential_cpu_offload()
140 | 
141 |     pipe.vae.enable_slicing()
142 |     pipe.vae.enable_tiling()
143 | 
144 |     # 4. Generate the video frames based on the prompt.
145 |     # `num_frames` is the Number of frames to generate.
146 |     # This is the default value for 6 seconds video and 8 fps and will plus 1 frame for the first frame and 49 frames.
147 |     video_generate = pipe(
148 |         prompt=prompt,
149 |         controlnet_frames=controlnet_frames,  # The path of the image to be used as the background of the video
150 |         num_videos_per_prompt=num_videos_per_prompt,  # Number of videos to generate per prompt
151 |         num_inference_steps=num_inference_steps,  # Number of inference steps
152 |         num_frames=49,  # Number of frames to generate，changed to 49 for diffusers version `0.30.3` and after.
153 |         use_dynamic_cfg=True,  # This id used for DPM Sechduler, for DDIM scheduler, it should be False
154 |         guidance_scale=guidance_scale,
155 |         generator=torch.Generator().manual_seed(seed),  # Set the seed for reproducibility
156 |         controlnet_weights=controlnet_weights,
157 |         controlnet_guidance_start=controlnet_guidance_start,
158 |         controlnet_guidance_end=controlnet_guidance_end,
159 |     ).frames[0]
160 | 
161 |     # 5. Export the generated frames to a video file. fps must be 8 for original video.
162 |     export_to_video(video_generate, output_path, fps=8)
163 | 
164 | 
if __name__ == "__main__":
    # Command-line entry point: parse the options below and forward them to `generate_video`.

    # Explicit name -> torch dtype table. Used both for argparse validation and for the
    # lookup, so an unsupported name is rejected instead of silently becoming bfloat16.
    supported_dtypes = {"float16": torch.float16, "bfloat16": torch.bfloat16}

    parser = argparse.ArgumentParser(description="Generate a video from a text prompt using CogVideoX")
    parser.add_argument("--prompt", type=str, required=True, help="The description of the video to be generated")
    parser.add_argument(
        "--video_path",
        type=str,
        required=True,
        help="The path of the video for controlnet processing.",
    )
    parser.add_argument(
        "--base_model_path", type=str, default="THUDM/CogVideoX-5b", help="The path of the pre-trained model to be used"
    )
    parser.add_argument(
        "--controlnet_model_path", type=str, default="TheDenk/cogvideox-5b-controlnet-hed-v1", help="The path of the controlnet pre-trained model to be used"
    )
    parser.add_argument("--controlnet_type", type=str, default='hed', help="Type of controlnet model (e.g. canny, hed)")
    parser.add_argument("--controlnet_weights", type=float, default=0.8, help="Strenght of controlnet")
    parser.add_argument("--controlnet_guidance_start", type=float, default=0.0, help="The stage when the controlnet starts to be applied")
    parser.add_argument("--controlnet_guidance_end", type=float, default=0.5, help="The stage when the controlnet end to be applied")
    parser.add_argument("--lora_path", type=str, default=None, help="The path of the LoRA weights to be used")
    parser.add_argument("--lora_rank", type=int, default=128, help="The rank of the LoRA weights")
    parser.add_argument(
        "--output_path", type=str, default="./output.mp4", help="The path where the generated video will be saved"
    )
    parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance")
    parser.add_argument(
        "--num_inference_steps", type=int, default=50, help="Number of steps for the inference process"
    )
    parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt")
    parser.add_argument(
        "--dtype",
        type=str,
        default="bfloat16",
        choices=sorted(supported_dtypes),
        help="The data type for computation (e.g., 'float16' or 'bfloat16')",
    )
    parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility")

    args = parser.parse_args()
    dtype = supported_dtypes[args.dtype]
    generate_video(
        prompt=args.prompt,
        video_path=args.video_path,
        base_model_path=args.base_model_path,
        controlnet_model_path=args.controlnet_model_path,
        controlnet_type=args.controlnet_type,
        controlnet_weights=args.controlnet_weights,
        controlnet_guidance_start=args.controlnet_guidance_start,
        controlnet_guidance_end=args.controlnet_guidance_end,
        lora_path=args.lora_path,
        lora_rank=args.lora_rank,
        output_path=args.output_path,
        num_inference_steps=args.num_inference_steps,
        guidance_scale=args.guidance_scale,
        num_videos_per_prompt=args.num_videos_per_prompt,
        dtype=dtype,
        seed=args.seed,
    )
219 | 


--------------------------------------------------------------------------------
/inference/gradio_web_demo.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import threading
  4 | import time
  5 | 
  6 | import gradio as gr
  7 | import torch
  8 | from transformers import T5EncoderModel, T5Tokenizer
  9 | from diffusers.utils import export_to_video, load_video
 10 | from diffusers import (
 11 |     CogVideoXDDIMScheduler,
 12 |     CogVideoXDPMScheduler,
 13 |     AutoencoderKLCogVideoX
 14 | )
 15 | from datetime import datetime, timedelta
 16 | import moviepy.editor as mp
 17 | from controlnet_aux import HEDdetector, CannyDetector
 18 | 
 19 | from controlnet_pipeline import ControlnetCogVideoXPipeline
 20 | from cogvideo_transformer import CustomCogVideoXTransformer3DModel
 21 | from cogvideo_controlnet import CogVideoXControlnet
 22 | 
 23 | 
 24 | os.makedirs("./output", exist_ok=True)
 25 | os.makedirs("./gradio_tmp", exist_ok=True)
 26 | 
 27 | controlnet_mapping = {
 28 |     'hed': HEDdetector,
 29 |     'canny': CannyDetector,
 30 | }
 31 | 
 32 | 
 33 | def init_controlnet_processor(controlnet_type):
 34 |     if controlnet_type in ['canny', 'lineart']:
 35 |         return controlnet_mapping[controlnet_type]()
 36 |     return controlnet_mapping[controlnet_type].from_pretrained('lllyasviel/Annotators').to(device='cuda')
 37 | 
 38 | 
 39 | def save_video(tensor):
 40 |     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 41 |     video_path = f"./output/{timestamp}.mp4"
 42 |     os.makedirs(os.path.dirname(video_path), exist_ok=True)
 43 |     export_to_video(tensor, video_path)
 44 |     return video_path
 45 | 
 46 | 
 47 | def convert_to_gif(video_path):
 48 |     clip = mp.VideoFileClip(video_path)
 49 |     clip = clip.set_fps(8)
 50 |     clip = clip.resize(height=240)
 51 |     gif_path = video_path.replace(".mp4", ".gif")
 52 |     clip.write_gif(gif_path, fps=8)
 53 |     return gif_path
 54 | 
 55 | 
 56 | def delete_old_files():
 57 |     while True:
 58 |         now = datetime.now()
 59 |         cutoff = now - timedelta(minutes=10)
 60 |         directories = ["./output", "./gradio_tmp"]
 61 | 
 62 |         for directory in directories:
 63 |             for filename in os.listdir(directory):
 64 |                 file_path = os.path.join(directory, filename)
 65 |                 if os.path.isfile(file_path):
 66 |                     file_mtime = datetime.fromtimestamp(os.path.getmtime(file_path))
 67 |                     if file_mtime < cutoff:
 68 |                         os.remove(file_path)
 69 |         time.sleep(600)
 70 | 
 71 | 
# Start the cleanup loop as soon as the module is loaded; daemon=True so the thread
# never keeps the interpreter alive on exit.
threading.Thread(target=delete_old_files, daemon=True).start()
 73 | 
 74 | def main(args):
 75 |     # 0. Load selected controlnet processor
 76 |     controlnet_processor = init_controlnet_processor(args.controlnet_type)
 77 |     # 1.  Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16).
 78 |     tokenizer = T5Tokenizer.from_pretrained(
 79 |         args.base_model_path, subfolder="tokenizer"
 80 |     )
 81 |     text_encoder = T5EncoderModel.from_pretrained(
 82 |         args.base_model_path, subfolder="text_encoder"
 83 |     )
 84 |     transformer = CustomCogVideoXTransformer3DModel.from_pretrained(
 85 |         args.base_model_path, subfolder="transformer"
 86 |     )
 87 |     vae = AutoencoderKLCogVideoX.from_pretrained(
 88 |         args.base_model_path, subfolder="vae"
 89 |     )
 90 |     scheduler = CogVideoXDDIMScheduler.from_pretrained(
 91 |         args.base_model_path, subfolder="scheduler"
 92 |     )
 93 |     controlnet = CogVideoXControlnet.from_pretrained(
 94 |         args.controlnet_model_path
 95 |     )
 96 | 
 97 |     pipe = ControlnetCogVideoXPipeline(
 98 |         tokenizer=tokenizer,
 99 |         text_encoder=text_encoder,
100 |         transformer=transformer,
101 |         vae=vae,
102 |         controlnet=controlnet,
103 |         scheduler=scheduler,
104 |     )
105 |     
106 |     if args.lora_path:
107 |         pipe.load_lora_weights(args.lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test_1")
108 |         pipe.fuse_lora(lora_scale=1 / args.lora_rank)
109 | 
110 |     # pipe.scheduler = CogVideoXDDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
111 |     pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
112 | 
113 |     dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
114 |     # pipe.to("cuda")
115 |     pipe = pipe.to(dtype=dtype)
116 |     pipe.enable_sequential_cpu_offload()
117 | 
118 |     pipe.vae.enable_slicing()
119 |     pipe.vae.enable_tiling()
120 | 
121 | 
122 |     def infer(prompt: str, controlnet_frames: list, num_inference_steps: int, guidance_scale: float, seed: int, progress=gr.Progress(track_tqdm=True)):
123 |         torch.cuda.empty_cache()
124 |         video = pipe(
125 |             prompt=prompt,
126 |             controlnet_frames=controlnet_frames,
127 |             num_videos_per_prompt=1,
128 |             num_inference_steps=num_inference_steps,
129 |             num_frames=49,
130 |             guidance_scale=guidance_scale,
131 |             generator=torch.Generator().manual_seed(seed),
132 |         ).frames[0]
133 | 
134 |         return video
135 | 
136 |     with gr.Blocks() as demo:
137 |         gr.Markdown("""
138 |             <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 20px;">
139 |                 CogVideoX Controlnet Gradio Simple Space🤗
140 |                 """)
141 | 
142 |         with gr.Row():
143 |             with gr.Column():
144 |                 with gr.Column():
145 |                     video_input = gr.Video(label="Video for controlnet processing", width=720, height=480)
146 |                     with gr.Row():
147 |                         download_video_button = gr.File(label="📥 Download Video", visible=False)
148 |                         download_gif_button = gr.File(label="📥 Download GIF", visible=False)
149 |                 prompt = gr.Textbox(label="Prompt (Less than 200 Words)", placeholder="Enter your prompt here", lines=5)
150 | 
151 |                 with gr.Column():
152 |                     gr.Markdown(
153 |                         "**Optional Parameters** (default values are recommended)<br>"
154 |                         "Increasing the number of inference steps will produce more detailed videos, but it will slow down the process.<br>"
155 |                         "50 steps are recommended for most cases.<br>"
156 |                     )
157 |                     with gr.Row():
158 |                         num_inference_steps = gr.Number(label="Inference Steps", value=50)
159 |                         guidance_scale = gr.Number(label="Guidance Scale", value=6.0)
160 |                         seed = gr.Number(label="Seed", value=42)
161 |                     generate_button = gr.Button("🎬 Generate Video")
162 | 
163 |             with gr.Column():
164 |                 video_output = gr.Video(label="CogVideoX Generate Video", width=720, height=480)
165 |                 with gr.Row():
166 |                     download_video_button = gr.File(label="📥 Download Video", visible=False)
167 |                     download_gif_button = gr.File(label="📥 Download GIF", visible=False)
168 | 
169 |         def generate(prompt, video_input, num_inference_steps, guidance_scale, seed, progress=gr.Progress(track_tqdm=True)):
170 |             video = load_video(video_input)[:49]
171 |             controlnet_frames = [controlnet_processor(x) for x in video]
172 |             tensor = infer(prompt, controlnet_frames, num_inference_steps, guidance_scale, seed, progress=progress)
173 |             video_path = save_video(tensor)
174 |             video_update = gr.update(visible=True, value=video_path)
175 |             gif_path = convert_to_gif(video_path)
176 |             gif_update = gr.update(visible=True, value=gif_path)
177 | 
178 |             return video_path, video_update, gif_update
179 | 
180 |         generate_button.click(
181 |             generate,
182 |             inputs=[prompt, video_input, num_inference_steps, guidance_scale, seed],
183 |             outputs=[video_output, download_video_button, download_gif_button],
184 |         )
185 |     demo.launch()
186 | 
187 | 
if __name__ == "__main__":
    # Command-line entry point of the Gradio demo; the parsed options are handed to `main`.
    parser = argparse.ArgumentParser(description="Generate a video from a text prompt using CogVideoX")
    parser.add_argument(
        "--base_model_path", type=str, default="THUDM/CogVideoX-5b", help="The path of the pre-trained model to be used"
    )
    parser.add_argument(
        "--controlnet_model_path", type=str, default="TheDenk/cogvideox-5b-controlnet-hed-v1", help="The path of the controlnet pre-trained model to be used"
    )
    # Validate against the registered annotators so a typo fails at parse time instead of
    # surfacing as a KeyError inside `init_controlnet_processor`.
    parser.add_argument(
        "--controlnet_type",
        type=str,
        default='hed',
        choices=sorted(controlnet_mapping),
        help="Type of controlnet model (e.g. canny, hed)",
    )
    parser.add_argument("--lora_path", type=str, default=None, help="The path of the LoRA weights to be used")
    parser.add_argument("--lora_rank", type=int, default=128, help="The rank of the LoRA weights")
    # `main` maps anything other than "float16" to bfloat16, so reject unknown names here.
    parser.add_argument(
        "--dtype",
        type=str,
        default="bfloat16",
        choices=["bfloat16", "float16"],
        help="The data type for computation (e.g., 'float16' or 'bfloat16')",
    )
    args = parser.parse_args()
    main(args)


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | spaces>=0.29.3
 2 | safetensors>=0.4.5
 3 | spandrel>=0.4.0
 4 | tqdm>=4.66.5
 5 | scikit-video>=1.1.11
 6 | git+https://github.com/huggingface/diffusers.git@main
 7 | transformers>=4.44.0
 8 | accelerate>=0.34.2
 9 | opencv-python>=4.10.0.84
10 | sentencepiece>=0.2.0
11 | numpy==1.26.0
12 | torch>=2.4.0
13 | torchvision>=0.19.0
14 | gradio>=4.44.0
15 | imageio>=2.34.2
16 | imageio-ffmpeg>=0.5.1
17 | openai>=1.45.0
18 | moviepy>=1.0.3
19 | pillow==9.5.0
20 | denku==0.0.51
21 | controlnet-aux==0.0.9


--------------------------------------------------------------------------------
/resources/car.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheDenk/cogvideox-controlnet/e21a4a3407eee8f372373fa1593297d7526caeeb/resources/car.mp4


--------------------------------------------------------------------------------
/resources/ship.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheDenk/cogvideox-controlnet/e21a4a3407eee8f372373fa1593297d7526caeeb/resources/ship.mp4


--------------------------------------------------------------------------------
/resources/stacked_car.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheDenk/cogvideox-controlnet/e21a4a3407eee8f372373fa1593297d7526caeeb/resources/stacked_car.mp4


--------------------------------------------------------------------------------
/resources/stacked_ship.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheDenk/cogvideox-controlnet/e21a4a3407eee8f372373fa1593297d7526caeeb/resources/stacked_ship.mp4


--------------------------------------------------------------------------------
/training/accelerate_config_machine_single.yaml:
--------------------------------------------------------------------------------
 1 | compute_environment: LOCAL_MACHINE
 2 | main_process_port: 29501
 3 | debug: false
 4 | deepspeed_config:
 5 |   gradient_accumulation_steps: 1
 6 |   gradient_clipping: 1.0
 7 |   offload_optimizer_device: none
 8 |   offload_param_device: none
 9 |   zero3_init_flag: false
10 |   zero_stage: 2
11 | distributed_type: DEEPSPEED
12 | downcast_bf16: 'no'
13 | enable_cpu_affinity: false
14 | machine_rank: 0
15 | main_training_function: main
16 | dynamo_backend: 'no'
17 | mixed_precision: 'no'
18 | num_machines: 1
19 | num_processes: 1
20 | rdzv_backend: static
21 | same_network: true
22 | tpu_env: []
23 | tpu_use_cluster: false
24 | tpu_use_sudo: false
25 | use_cpu: false


--------------------------------------------------------------------------------
/training/controlnet_datasets.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import glob
  3 | import random
  4 | 
  5 | 
  6 | import cv2
  7 | import torch
  8 | import numpy as np
  9 | import pandas as pd
 10 | import torchvision.transforms as transforms
 11 | from PIL import Image
 12 | from decord import VideoReader
 13 | from torch.utils.data.dataset import Dataset
 14 | from controlnet_aux import CannyDetector, HEDdetector
 15 | 
 16 | 
 17 | def unpack_mm_params(p):
 18 |     if isinstance(p, (tuple, list)):
 19 |         return p[0], p[1]
 20 |     elif isinstance(p, (int, float)):
 21 |         return p, p
 22 |     raise Exception(f'Unknown input parameter type.\nParameter: {p}.\nType: {type(p)}')
 23 | 
 24 | 
 25 | def resize_for_crop(image, min_h, min_w):
 26 |     img_h, img_w = image.shape[-2:]
 27 |     
 28 |     if img_h >= min_h and img_w >= min_w:
 29 |         coef = min(min_h / img_h, min_w / img_w)
 30 |     elif img_h <= min_h and img_w <=min_w:
 31 |         coef = max(min_h / img_h, min_w / img_w)
 32 |     else:
 33 |         coef = min_h / img_h if min_h > img_h else min_w / img_w 
 34 | 
 35 |     out_h, out_w = int(img_h * coef), int(img_w * coef)
 36 |     resized_image = transforms.functional.resize(image, (out_h, out_w), antialias=True)
 37 |     return resized_image
 38 | 
 39 | 
 40 | def init_controlnet(controlnet_type):
 41 |     if controlnet_type in ['canny']:
 42 |         return controlnet_mapping[controlnet_type]()
 43 |     return controlnet_mapping[controlnet_type].from_pretrained('lllyasviel/Annotators').to(device='cuda')
 44 | 
 45 | 
 46 | controlnet_mapping = {
 47 |     'canny': CannyDetector,
 48 |     'hed': HEDdetector,
 49 | }
 50 | 
 51 | 
 52 | class BaseClass(Dataset):
 53 |     def __init__(
 54 |             self, 
 55 |             video_root_dir,
 56 |             image_size=(320, 512), 
 57 |             stride=(1, 2), 
 58 |             sample_n_frames=25,
 59 |             hflip_p=0.5,
 60 |             controlnet_type='canny',
 61 |         ):
 62 |         self.height, self.width = unpack_mm_params(image_size)
 63 |         self.stride_min, self.stride_max = unpack_mm_params(stride)
 64 |         self.video_root_dir = video_root_dir
 65 |         self.sample_n_frames = sample_n_frames
 66 |         self.hflip_p = hflip_p
 67 |         
 68 |         self.length = 0
 69 |         
 70 |         self.controlnet_processor = init_controlnet(controlnet_type)
 71 |         
 72 |     def __len__(self):
 73 |         return self.length
 74 |         
 75 |     def load_video_info(self, video_path):
 76 |         video_reader = VideoReader(video_path)
 77 |         fps_original = video_reader.get_avg_fps()
 78 |         video_length = len(video_reader)
 79 |         
 80 |         sample_stride = random.randint(self.stride_min, self.stride_max)
 81 |         clip_length = min(video_length, (self.sample_n_frames - 1) * sample_stride + 1)
 82 |         start_idx   = random.randint(0, video_length - clip_length)
 83 |         batch_index = np.linspace(start_idx, start_idx + clip_length - 1, self.sample_n_frames, dtype=int)
 84 |         np_video = video_reader.get_batch(batch_index).asnumpy()
 85 |         pixel_values = torch.from_numpy(np_video).permute(0, 3, 1, 2).contiguous()
 86 |         pixel_values = pixel_values / 127.5 - 1
 87 |         del video_reader
 88 |         controlnet_video = [self.controlnet_processor(x) for x in np_video]
 89 |         controlnet_video = torch.from_numpy(np.stack(controlnet_video)).permute(0, 3, 1, 2).contiguous()
 90 |         controlnet_video = controlnet_video / 127.5 - 1
 91 |         return pixel_values, controlnet_video
 92 |         
 93 |     def get_batch(self, idx):
 94 |         raise Exception('Get batch method is not realized.')
 95 | 
 96 |     def __getitem__(self, idx):
 97 |         while True:
 98 |             try:
 99 |                 video, caption, controlnet_video = self.get_batch(idx)
100 |                 break
101 |             except Exception as e:
102 |                 print(e)
103 |                 idx = random.randint(0, self.length - 1)
104 | 
105 |         if self.hflip_p > random.random():
106 |             video, controlnet_video = [
107 |                 transforms.functional.hflip(x) for x in [video, controlnet_video]
108 |             ]
109 |             
110 |         video, controlnet_video = [
111 |             resize_for_crop(x, self.height, self.width) for x in [video, controlnet_video]
112 |         ] 
113 |         video, controlnet_video = [
114 |             transforms.functional.center_crop(x, (self.height, self.width)) for x in [video, controlnet_video]
115 |         ]
116 |         data = {
117 |             'video': video, 
118 |             'caption': caption, 
119 |             'controlnet_video': controlnet_video,
120 |         }
121 |         return data
122 | 
123 | 
class CustomControlnetDataset(BaseClass):
    """Dataset over every `*.mp4` file in `video_root_dir`; the file name is the caption."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # `glob` returns files in arbitrary order; sort so an index always refers to
        # the same video across runs and machines.
        self.video_paths = sorted(glob.glob(os.path.join(self.video_root_dir, '*.mp4')))
        self.length = len(self.video_paths)

    def get_batch(self, idx):
        """Return `(pixel_values, caption, controlnet_video)` for the video at `idx`."""
        video_path = self.video_paths[idx]
        # Strip only the extension; `str.replace` would also remove '.mp4' from the
        # middle of a file name.
        caption = os.path.splitext(os.path.basename(video_path))[0]
        pixel_values, controlnet_video = self.load_video_info(video_path)
        return pixel_values, caption, controlnet_video
135 | 
136 | 
class OpenvidControlnetDataset(BaseClass):
    """
    Dataset driven by a CSV file with a 'path' column (video file name) and a 'text'
    column (caption). Only rows whose video exists in `video_root_dir` are kept.
    """

    def __init__(self, csv_path, *args, **kwargs):
        super().__init__(*args, **kwargs)
        videos_paths = glob.glob(os.path.join(self.video_root_dir, '*.mp4'))
        videos_names = {os.path.basename(x) for x in videos_paths}
        df = pd.read_csv(csv_path)
        # Keep only the rows whose file is actually present on disk.
        self.df = df[df['path'].isin(videos_names)]
        self.length = self.df.shape[0]

    def get_batch(self, idx):
        """Return `(pixel_values, caption, controlnet_video)` for the CSV row at position `idx`."""
        item = self.df.iloc[idx]
        caption = item['text']
        video_name = item['path']
        video_path = os.path.join(self.video_root_dir, video_name)
        pixel_values, controlnet_video = self.load_video_info(video_path)
        return pixel_values, caption, controlnet_video
154 | 


--------------------------------------------------------------------------------
/training/finetune_single_rank.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | export MODEL_PATH="THUDM/CogVideoX-2b"
 4 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 5 | export CUDA_VISIBLE_DEVICES=0
 6 | 
 7 | # if you are not using wth 8 gus, change `accelerate_config_machine_single.yaml` num_processes as your gpu number
 8 | accelerate launch --config_file accelerate_config_machine_single.yaml --multi_gpu \
 9 |   train_controlnet.py \
10 |   --tracker_name "cogvideox-controlnet" \
11 |   --gradient_checkpointing \
12 |   --pretrained_model_name_or_path $MODEL_PATH \
13 |   --enable_tiling \
14 |   --enable_slicing \
15 |   --validation_prompt "car is going in the ocean, beautiful waves:::ship in the vulcano" \
16 |   --validation_video "../resources/car.mp4:::../resources/ship.mp4" \
17 |   --validation_prompt_separator ::: \
18 |   --num_inference_steps 28 \
19 |   --num_validation_videos 1 \
20 |   --validation_steps 500 \
21 |   --seed 42 \
22 |   --mixed_precision bf16 \
23 |   --output_dir "cogvideox-controlnet" \
24 |   --height 480 \
25 |   --width 720 \
26 |   --fps 8 \
27 |   --max_num_frames 49 \
28 |   --video_root_dir "set-path-to-video-directory" \
29 |   --csv_path "set-path-to-csv-file" \
30 |   --stride_min 1 \
31 |   --stride_max 3 \
32 |   --hflip_p 0.5 \
33 |   --controlnet_type "canny" \
34 |   --controlnet_transformer_num_layers 8 \
35 |   --controlnet_input_channels 3 \
36 |   --downscale_coef 8 \
37 |   --controlnet_weights 0.5 \
38 |   --init_from_transformer \
39 |   --train_batch_size 1 \
40 |   --dataloader_num_workers 0 \
41 |   --num_train_epochs 1 \
42 |   --checkpointing_steps 1000 \
43 |   --gradient_accumulation_steps 2 \
44 |   --learning_rate 1e-5 \
45 |   --lr_scheduler cosine_with_restarts \
46 |   --lr_warmup_steps 250 \
47 |   --lr_num_cycles 1 \
48 |   --enable_slicing \
49 |   --enable_tiling \
50 |   --gradient_checkpointing \
51 |   --optimizer AdamW \
52 |   --adam_beta1 0.9 \
53 |   --adam_beta2 0.95 \
54 |   --max_grad_norm 1.0 \
55 |   --allow_tf32 
56 |   # --report_to wandb
57 |   # --pretrained_controlnet_path "cogvideox-controlnet-2b/checkpoint-2000.pt" \
58 |     


--------------------------------------------------------------------------------
/training/train_controlnet.py:
--------------------------------------------------------------------------------
   1 | # Copyright 2024 The HuggingFace Team.
   2 | # All rights reserved.
   3 | #
   4 | # Licensed under the Apache License, Version 2.0 (the "License");
   5 | # you may not use this file except in compliance with the License.
   6 | # You may obtain a copy of the License at
   7 | #
   8 | #     http://www.apache.org/licenses/LICENSE-2.0
   9 | #
  10 | # Unless required by applicable law or agreed to in writing, software
  11 | # distributed under the License is distributed on an "AS IS" BASIS,
  12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 | # See the License for the specific language governing permissions and
  14 | # limitations under the License.
  15 | 
  16 | import sys
  17 | sys.path.append('..')
  18 | import argparse
  19 | import logging
  20 | import math
  21 | import os
  22 | import shutil
  23 | from pathlib import Path
  24 | from typing import List, Optional, Tuple, Union
  25 | 
  26 | import torch
  27 | import transformers
  28 | from accelerate import Accelerator
  29 | from accelerate.logging import get_logger
  30 | from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
  31 | from huggingface_hub import create_repo, upload_folder
  32 | from peft import LoraConfig, get_peft_model_state_dict, set_peft_model_state_dict
  33 | from torch.utils.data import DataLoader, Dataset
  34 | from torchvision import transforms
  35 | from tqdm.auto import tqdm
  36 | import numpy as np
  37 | from decord import VideoReader
  38 | from transformers import AutoTokenizer, T5EncoderModel, T5Tokenizer
  39 | 
  40 | import diffusers
  41 | from diffusers import AutoencoderKLCogVideoX, CogVideoXDPMScheduler
  42 | from diffusers.models.embeddings import get_3d_rotary_pos_embed
  43 | from diffusers.optimization import get_scheduler
  44 | from diffusers.pipelines.cogvideo.pipeline_cogvideox import get_resize_crop_region_for_grid
  45 | from diffusers.training_utils import (
  46 |     cast_training_params,
  47 |     clear_objs_and_retain_memory,
  48 | )
  49 | from diffusers.utils import check_min_version, export_to_video, is_wandb_available
  50 | from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
  51 | from diffusers.utils.torch_utils import is_compiled_module
  52 | 
  53 | from controlnet_datasets import OpenvidControlnetDataset
  54 | from controlnet_pipeline import ControlnetCogVideoXPipeline
  55 | from cogvideo_transformer import CustomCogVideoXTransformer3DModel
  56 | from cogvideo_controlnet import CogVideoXControlnet
  57 | 
# wandb is an optional dependency: import it only when diffusers reports it as available.
if is_wandb_available():
    import wandb

# Will error if the minimal version of diffusers is not installed. Remove at your own risk.
check_min_version("0.31.0.dev0")

# Module-level logger obtained from accelerate's `get_logger`.
logger = get_logger(__name__)
  65 | 
  66 | 
  67 | def get_args():
  68 |     parser = argparse.ArgumentParser(description="Simple example of a training script for CogVideoX.")
  69 | 
  70 |     # Model information
  71 |     parser.add_argument(
  72 |         "--pretrained_model_name_or_path",
  73 |         type=str,
  74 |         default=None,
  75 |         required=True,
  76 |         help="Path to pretrained model or model identifier from huggingface.co/models.",
  77 |     )
  78 |     parser.add_argument(
  79 |         "--revision",
  80 |         type=str,
  81 |         default=None,
  82 |         required=False,
  83 |         help="Revision of pretrained model identifier from huggingface.co/models.",
  84 |     )
  85 |     parser.add_argument(
  86 |         "--variant",
  87 |         type=str,
  88 |         default=None,
  89 |         help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
  90 |     )
  91 |     parser.add_argument(
  92 |         "--video_root_dir",
  93 |         type=str,
  94 |         default=None,
  95 |         required=True,
  96 |         help=("A folder containing the training data."),
  97 |     )
  98 |     parser.add_argument(
  99 |         "--csv_path",
 100 |         type=str,
 101 |         default=None,
 102 |         required=True,
 103 |         help=("A path to csv."),
 104 |     )
 105 |     parser.add_argument(
 106 |         "--stride_min",
 107 |         type=int,
 108 |         default=1,
 109 |         required=False,
 110 |         help=("Minimal stride between frames."),
 111 |     )
 112 |     parser.add_argument(
 113 |         "--stride_max",
 114 |         type=int,
 115 |         default=3,
 116 |         required=False,
 117 |         help=("Maximum stride between frames."),
 118 |     )
 119 |     parser.add_argument(
 120 |         "--hflip_p",
 121 |         type=float,
 122 |         default=0.5,
 123 |         required=False,
 124 |         help="Video horizontal flip probability.",
 125 |     )
 126 |     parser.add_argument(
 127 |         "--controlnet_type",
 128 |         type=str,
 129 |         default='canny',
 130 |         required=True,
 131 |         help=("Controlnet type. (canny, hed, etc.)"),
 132 |     )
 133 |     parser.add_argument(
 134 |         "--controlnet_transformer_num_layers",
 135 |         type=int,
 136 |         default=2,
 137 |         required=False,
 138 |         help=("Count of controlnet blocks."),
 139 |     )
 140 |     parser.add_argument(
 141 |         "--downscale_coef",
 142 |         type=int,
 143 |         default=8,
 144 |         required=False,
 145 |         help=("Downscale coef as encoder decreases resolutio before apply transformer."),
 146 |     )
 147 |     parser.add_argument(
 148 |         "--controlnet_input_channels",
 149 |         type=int,
 150 |         default=3,
 151 |         required=False,
 152 |         help=("Controlnet encoder input channels."),
 153 |     )
 154 |     parser.add_argument(
 155 |         "--controlnet_weights",
 156 |         type=float,
 157 |         default=1.0,
 158 |         required=False,
 159 |         help=("Controlnet blocks weight."),
 160 |     )
 161 |     parser.add_argument(
 162 |         "--init_from_transformer",
 163 |         action="store_true",
 164 |         help="Whether or not load start controlnet parameters from transformer model.",
 165 |     )
 166 |     parser.add_argument(
 167 |         "--pretrained_controlnet_path",
 168 |         type=str,
 169 |         default=None,
 170 |         required=False,
 171 |         help=("Path to controlnet .pt checkpoint."),
 172 |     )
 173 |     parser.add_argument(
 174 |         "--dataloader_num_workers",
 175 |         type=int,
 176 |         default=0,
 177 |         help=(
 178 |             "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
 179 |         ),
 180 |     )
 181 |     # Validation
 182 |     parser.add_argument(
 183 |         "--num_inference_steps",
 184 |         type=int,
 185 |         default=50,
 186 |         help=(
 187 |             "Num steps for denoising on validation stage."
 188 |         ),
 189 |     )
 190 |     parser.add_argument(
 191 |         "--validation_prompt",
 192 |         type=str,
 193 |         default=None,
 194 |         help="One or more prompt(s) that is used during validation to verify that the model is learning. Multiple validation prompts should be separated by the '--validation_prompt_seperator' string.",
 195 |     )
 196 |     parser.add_argument(
 197 |         "--validation_video",
 198 |         type=str,
 199 |         default=None,
 200 |         help="Paths to video for falidation.",
 201 |     )
 202 |     parser.add_argument(
 203 |         "--validation_prompt_separator",
 204 |         type=str,
 205 |         default=":::",
 206 |         help="String that separates multiple validation prompts",
 207 |     )
 208 |     parser.add_argument(
 209 |         "--num_validation_videos",
 210 |         type=int,
 211 |         default=1,
 212 |         help="Number of videos that should be generated during validation per `validation_prompt`.",
 213 |     )
 214 |     parser.add_argument(
 215 |         "--validation_steps",
 216 |         type=int,
 217 |         default=50,
 218 |         help=(
 219 |             "Run validation every X steps. Validation consists of running the prompt `args.validation_prompt` multiple times: `args.num_validation_videos`."
 220 |         ),
 221 |     )
 222 |     parser.add_argument(
 223 |         "--guidance_scale",
 224 |         type=float,
 225 |         default=6,
 226 |         help="The guidance scale to use while sampling validation videos.",
 227 |     )
 228 |     parser.add_argument(
 229 |         "--use_dynamic_cfg",
 230 |         action="store_true",
 231 |         default=False,
 232 |         help="Whether or not to use the default cosine dynamic guidance schedule when sampling validation videos.",
 233 |     )
 234 | 
 235 |     # Training information
 236 |     parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
 237 |     parser.add_argument(
 238 |         "--mixed_precision",
 239 |         type=str,
 240 |         default=None,
 241 |         choices=["no", "fp16", "bf16"],
 242 |         help=(
 243 |             "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
 244 |             " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
 245 |             " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
 246 |         ),
 247 |     )
 248 |     parser.add_argument(
 249 |         "--output_dir",
 250 |         type=str,
 251 |         default="cogvideox-controlnet",
 252 |         help="The output directory where the model predictions and checkpoints will be written.",
 253 |     )
 254 |     parser.add_argument(
 255 |         "--height",
 256 |         type=int,
 257 |         default=480,
 258 |         help="All input videos are resized to this height.",
 259 |     )
 260 |     parser.add_argument(
 261 |         "--width",
 262 |         type=int,
 263 |         default=720,
 264 |         help="All input videos are resized to this width.",
 265 |     )
 266 |     parser.add_argument("--fps", type=int, default=8, help="All input videos will be used at this FPS.")
 267 |     parser.add_argument(
 268 |         "--max_num_frames", type=int, default=49, help="All input videos will be truncated to these many frames."
 269 |     )
 270 |     parser.add_argument(
 271 |         "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
 272 |     )
 273 |     parser.add_argument("--num_train_epochs", type=int, default=1)
 274 |     parser.add_argument(
 275 |         "--max_train_steps",
 276 |         type=int,
 277 |         default=None,
 278 |         help="Total number of training steps to perform. If provided, overrides `--num_train_epochs`.",
 279 |     )
 280 |     parser.add_argument(
 281 |         "--checkpointing_steps",
 282 |         type=int,
 283 |         default=500,
 284 |         help=(
 285 |             "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
 286 |             " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
 287 |             " training using `--resume_from_checkpoint`."
 288 |         ),
 289 |     )
 290 |     parser.add_argument(
 291 |         "--checkpoints_total_limit",
 292 |         type=int,
 293 |         default=None,
 294 |         help=("Max number of checkpoints to store."),
 295 |     )
 296 |     parser.add_argument(
 297 |         "--gradient_accumulation_steps",
 298 |         type=int,
 299 |         default=1,
 300 |         help="Number of updates steps to accumulate before performing a backward/update pass.",
 301 |     )
 302 |     parser.add_argument(
 303 |         "--gradient_checkpointing",
 304 |         action="store_true",
 305 |         help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
 306 |     )
 307 |     parser.add_argument(
 308 |         "--learning_rate",
 309 |         type=float,
 310 |         default=1e-4,
 311 |         help="Initial learning rate (after the potential warmup period) to use.",
 312 |     )
 313 |     parser.add_argument(
 314 |         "--scale_lr",
 315 |         action="store_true",
 316 |         default=False,
 317 |         help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
 318 |     )
 319 |     parser.add_argument(
 320 |         "--lr_scheduler",
 321 |         type=str,
 322 |         default="constant",
 323 |         help=(
 324 |             'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
 325 |             ' "constant", "constant_with_warmup"]'
 326 |         ),
 327 |     )
 328 |     parser.add_argument(
 329 |         "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
 330 |     )
 331 |     parser.add_argument(
 332 |         "--lr_num_cycles",
 333 |         type=int,
 334 |         default=1,
 335 |         help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
 336 |     )
 337 |     parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
 338 |     parser.add_argument(
 339 |         "--enable_slicing",
 340 |         action="store_true",
 341 |         default=False,
 342 |         help="Whether or not to use VAE slicing for saving memory.",
 343 |     )
 344 |     parser.add_argument(
 345 |         "--enable_tiling",
 346 |         action="store_true",
 347 |         default=False,
 348 |         help="Whether or not to use VAE tiling for saving memory.",
 349 |     )
 350 | 
 351 |     # Optimizer
 352 |     parser.add_argument(
 353 |         "--optimizer",
 354 |         type=lambda s: s.lower(),
 355 |         default="adam",
 356 |         choices=["adam", "adamw", "prodigy"],
 357 |         help=("The optimizer type to use."),
 358 |     )
 359 |     parser.add_argument(
 360 |         "--use_8bit_adam",
 361 |         action="store_true",
 362 |         help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW",
 363 |     )
 364 |     parser.add_argument(
 365 |         "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers."
 366 |     )
 367 |     parser.add_argument(
 368 |         "--adam_beta2", type=float, default=0.95, help="The beta2 parameter for the Adam and Prodigy optimizers."
 369 |     )
 370 |     parser.add_argument(
 371 |         "--prodigy_beta3",
 372 |         type=float,
 373 |         default=None,
 374 |         help="Coefficients for computing the Prodigy optimizer's stepsize using running averages. If set to None, uses the value of square root of beta2.",
 375 |     )
 376 |     parser.add_argument("--prodigy_decouple", action="store_true", help="Use AdamW style decoupled weight decay")
 377 |     parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params")
 378 |     parser.add_argument(
 379 |         "--adam_epsilon",
 380 |         type=float,
 381 |         default=1e-08,
 382 |         help="Epsilon value for the Adam optimizer and Prodigy optimizers.",
 383 |     )
 384 |     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
 385 |     parser.add_argument("--prodigy_use_bias_correction", action="store_true", help="Turn on Adam's bias correction.")
 386 |     parser.add_argument(
 387 |         "--prodigy_safeguard_warmup",
 388 |         action="store_true",
 389 |         help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage.",
 390 |     )
 391 | 
 392 |     # Other information
 393 |     parser.add_argument("--tracker_name", type=str, default=None, help="Project tracker name")
 394 |     parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
 395 |     parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
 396 |     parser.add_argument(
 397 |         "--hub_model_id",
 398 |         type=str,
 399 |         default=None,
 400 |         help="The name of the repository to keep in sync with the local `output_dir`.",
 401 |     )
 402 |     parser.add_argument(
 403 |         "--logging_dir",
 404 |         type=str,
 405 |         default="logs",
 406 |         help="Directory where logs are stored.",
 407 |     )
 408 |     parser.add_argument(
 409 |         "--allow_tf32",
 410 |         action="store_true",
 411 |         help=(
 412 |             "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
 413 |             " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
 414 |         ),
 415 |     )
 416 |     parser.add_argument(
 417 |         "--report_to",
 418 |         type=str,
 419 |         default=None,
 420 |         help=(
 421 |             'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
 422 |             ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
 423 |         ),
 424 |     )
 425 | 
 426 |     return parser.parse_args()
 427 | 
 428 | 
 429 | def read_video(video_path, start_index=0, frames_count=49, stride=1):
 430 |     video_reader = VideoReader(video_path)
 431 |     end_index = min(start_index + frames_count * stride, len(video_reader)) - 1
 432 |     batch_index = np.linspace(start_index, end_index, frames_count, dtype=int)
 433 |     numpy_video = video_reader.get_batch(batch_index).asnumpy()
 434 |     return numpy_video
 435 |     
 436 | 
 437 | def log_validation(
 438 |     pipe,
 439 |     args,
 440 |     accelerator,
 441 |     pipeline_args,
 442 |     epoch,
 443 |     is_final_validation: bool = False,
 444 | ):
 445 |     logger.info(
 446 |         f"Running validation... \n Generating {args.num_validation_videos} videos with prompt: {pipeline_args['prompt']}."
 447 |     )
 448 |     # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
 449 |     scheduler_args = {}
 450 | 
 451 |     if "variance_type" in pipe.scheduler.config:
 452 |         variance_type = pipe.scheduler.config.variance_type
 453 | 
 454 |         if variance_type in ["learned", "learned_range"]:
 455 |             variance_type = "fixed_small"
 456 | 
 457 |         scheduler_args["variance_type"] = variance_type
 458 | 
 459 |     pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, **scheduler_args)
 460 |     pipe = pipe.to(accelerator.device)
 461 |     # pipe.set_progress_bar_config(disable=True)
 462 | 
 463 |     # run inference
 464 |     generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
 465 | 
 466 |     videos = []
 467 |     for _ in range(args.num_validation_videos):
 468 |         video = pipe(**pipeline_args, generator=generator, output_type="np").frames[0]
 469 |         videos.append(video)
 470 | 
 471 |     for i, video in enumerate(videos):
 472 |         prompt = (
 473 |             pipeline_args["prompt"][:25]
 474 |             .replace(" ", "_")
 475 |             .replace(" ", "_")
 476 |             .replace("'", "_")
 477 |             .replace('"', "_")
 478 |             .replace("/", "_")
 479 |         )
 480 |         filename = os.path.join(args.output_dir, f"{epoch}_video_{i}_{prompt}.mp4")
 481 |         export_to_video(video, filename, fps=8)
 482 | 
 483 |     clear_objs_and_retain_memory([pipe])
 484 | 
 485 |     return videos
 486 | 
 487 | 
 488 | def _get_t5_prompt_embeds(
 489 |     tokenizer: T5Tokenizer,
 490 |     text_encoder: T5EncoderModel,
 491 |     prompt: Union[str, List[str]],
 492 |     num_videos_per_prompt: int = 1,
 493 |     max_sequence_length: int = 226,
 494 |     device: Optional[torch.device] = None,
 495 |     dtype: Optional[torch.dtype] = None,
 496 |     text_input_ids=None,
 497 | ):
 498 |     prompt = [prompt] if isinstance(prompt, str) else prompt
 499 |     batch_size = len(prompt)
 500 | 
 501 |     if tokenizer is not None:
 502 |         text_inputs = tokenizer(
 503 |             prompt,
 504 |             padding="max_length",
 505 |             max_length=max_sequence_length,
 506 |             truncation=True,
 507 |             add_special_tokens=True,
 508 |             return_tensors="pt",
 509 |         )
 510 |         text_input_ids = text_inputs.input_ids
 511 |     else:
 512 |         if text_input_ids is None:
 513 |             raise ValueError("`text_input_ids` must be provided when the tokenizer is not specified.")
 514 | 
 515 |     prompt_embeds = text_encoder(text_input_ids.to(device))[0]
 516 |     prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
 517 | 
 518 |     # duplicate text embeddings for each generation per prompt, using mps friendly method
 519 |     _, seq_len, _ = prompt_embeds.shape
 520 |     prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
 521 |     prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
 522 | 
 523 |     return prompt_embeds
 524 | 
 525 | 
 526 | def encode_prompt(
 527 |     tokenizer: T5Tokenizer,
 528 |     text_encoder: T5EncoderModel,
 529 |     prompt: Union[str, List[str]],
 530 |     num_videos_per_prompt: int = 1,
 531 |     max_sequence_length: int = 226,
 532 |     device: Optional[torch.device] = None,
 533 |     dtype: Optional[torch.dtype] = None,
 534 |     text_input_ids=None,
 535 | ):
 536 |     prompt = [prompt] if isinstance(prompt, str) else prompt
 537 |     prompt_embeds = _get_t5_prompt_embeds(
 538 |         tokenizer,
 539 |         text_encoder,
 540 |         prompt=prompt,
 541 |         num_videos_per_prompt=num_videos_per_prompt,
 542 |         max_sequence_length=max_sequence_length,
 543 |         device=device,
 544 |         dtype=dtype,
 545 |         text_input_ids=text_input_ids,
 546 |     )
 547 |     return prompt_embeds
 548 | 
 549 | 
 550 | def compute_prompt_embeddings(
 551 |     tokenizer, text_encoder, prompt, max_sequence_length, device, dtype, requires_grad: bool = False
 552 | ):
 553 |     if requires_grad:
 554 |         prompt_embeds = encode_prompt(
 555 |             tokenizer,
 556 |             text_encoder,
 557 |             prompt,
 558 |             num_videos_per_prompt=1,
 559 |             max_sequence_length=max_sequence_length,
 560 |             device=device,
 561 |             dtype=dtype,
 562 |         )
 563 |     else:
 564 |         with torch.no_grad():
 565 |             prompt_embeds = encode_prompt(
 566 |                 tokenizer,
 567 |                 text_encoder,
 568 |                 prompt,
 569 |                 num_videos_per_prompt=1,
 570 |                 max_sequence_length=max_sequence_length,
 571 |                 device=device,
 572 |                 dtype=dtype,
 573 |             )
 574 |     return prompt_embeds
 575 | 
 576 | 
 577 | def prepare_rotary_positional_embeddings(
 578 |     height: int,
 579 |     width: int,
 580 |     num_frames: int,
 581 |     vae_scale_factor_spatial: int = 8,
 582 |     patch_size: int = 2,
 583 |     attention_head_dim: int = 64,
 584 |     device: Optional[torch.device] = None,
 585 |     base_height: int = 480,
 586 |     base_width: int = 720,
 587 | ) -> Tuple[torch.Tensor, torch.Tensor]:
 588 |     grid_height = height // (vae_scale_factor_spatial * patch_size)
 589 |     grid_width = width // (vae_scale_factor_spatial * patch_size)
 590 |     base_size_width = base_width // (vae_scale_factor_spatial * patch_size)
 591 |     base_size_height = base_height // (vae_scale_factor_spatial * patch_size)
 592 | 
 593 |     grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size_width, base_size_height)
 594 |     freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
 595 |         embed_dim=attention_head_dim,
 596 |         crops_coords=grid_crops_coords,
 597 |         grid_size=(grid_height, grid_width),
 598 |         temporal_size=num_frames,
 599 |     )
 600 | 
 601 |     freqs_cos = freqs_cos.to(device=device)
 602 |     freqs_sin = freqs_sin.to(device=device)
 603 |     return freqs_cos, freqs_sin
 604 | 
 605 | 
 606 | def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
 607 |     # Use DeepSpeed optimzer
 608 |     if use_deepspeed:
 609 |         from accelerate.utils import DummyOptim
 610 | 
 611 |         return DummyOptim(
 612 |             params_to_optimize,
 613 |             lr=args.learning_rate,
 614 |             betas=(args.adam_beta1, args.adam_beta2),
 615 |             eps=args.adam_epsilon,
 616 |             weight_decay=args.adam_weight_decay,
 617 |         )
 618 | 
 619 |     # Optimizer creation
 620 |     supported_optimizers = ["adam", "adamw", "prodigy"]
 621 |     if args.optimizer not in supported_optimizers:
 622 |         logger.warning(
 623 |             f"Unsupported choice of optimizer: {args.optimizer}. Supported optimizers include {supported_optimizers}. Defaulting to AdamW"
 624 |         )
 625 |         args.optimizer = "adamw"
 626 | 
 627 |     if args.use_8bit_adam and not (args.optimizer.lower() not in ["adam", "adamw"]):
 628 |         logger.warning(
 629 |             f"use_8bit_adam is ignored when optimizer is not set to 'Adam' or 'AdamW'. Optimizer was "
 630 |             f"set to {args.optimizer.lower()}"
 631 |         )
 632 | 
 633 |     if args.use_8bit_adam:
 634 |         try:
 635 |             import bitsandbytes as bnb
 636 |         except ImportError:
 637 |             raise ImportError(
 638 |                 "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
 639 |             )
 640 | 
 641 |     if args.optimizer.lower() == "adamw":
 642 |         optimizer_class = bnb.optim.AdamW8bit if args.use_8bit_adam else torch.optim.AdamW
 643 | 
 644 |         optimizer = optimizer_class(
 645 |             params_to_optimize,
 646 |             betas=(args.adam_beta1, args.adam_beta2),
 647 |             eps=args.adam_epsilon,
 648 |             weight_decay=args.adam_weight_decay,
 649 |         )
 650 |     elif args.optimizer.lower() == "adam":
 651 |         optimizer_class = bnb.optim.Adam8bit if args.use_8bit_adam else torch.optim.Adam
 652 | 
 653 |         optimizer = optimizer_class(
 654 |             params_to_optimize,
 655 |             betas=(args.adam_beta1, args.adam_beta2),
 656 |             eps=args.adam_epsilon,
 657 |             weight_decay=args.adam_weight_decay,
 658 |         )
 659 |     elif args.optimizer.lower() == "prodigy":
 660 |         try:
 661 |             import prodigyopt
 662 |         except ImportError:
 663 |             raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
 664 | 
 665 |         optimizer_class = prodigyopt.Prodigy
 666 | 
 667 |         if args.learning_rate <= 0.1:
 668 |             logger.warning(
 669 |                 "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
 670 |             )
 671 | 
 672 |         optimizer = optimizer_class(
 673 |             params_to_optimize,
 674 |             lr=args.learning_rate,
 675 |             betas=(args.adam_beta1, args.adam_beta2),
 676 |             beta3=args.prodigy_beta3,
 677 |             weight_decay=args.adam_weight_decay,
 678 |             eps=args.adam_epsilon,
 679 |             decouple=args.prodigy_decouple,
 680 |             use_bias_correction=args.prodigy_use_bias_correction,
 681 |             safeguard_warmup=args.prodigy_safeguard_warmup,
 682 |         )
 683 | 
 684 |     return optimizer
 685 | 
 686 | 
 687 | def main(args):
 688 |     if args.report_to == "wandb" and args.hub_token is not None:
 689 |         raise ValueError(
 690 |             "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
 691 |             " Please use `huggingface-cli login` to authenticate with the Hub."
 692 |         )
 693 | 
 694 |     if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
 695 |         # due to pytorch#99272, MPS does not yet support bfloat16.
 696 |         raise ValueError(
 697 |             "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
 698 |         )
 699 | 
 700 |     logging_dir = Path(args.output_dir, args.logging_dir)
 701 | 
 702 |     accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
 703 |     kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
 704 |     accelerator = Accelerator(
 705 |         gradient_accumulation_steps=args.gradient_accumulation_steps,
 706 |         mixed_precision=args.mixed_precision,
 707 |         log_with=args.report_to,
 708 |         project_config=accelerator_project_config,
 709 |         kwargs_handlers=[kwargs],
 710 |     )
 711 | 
 712 |     # Disable AMP for MPS.
 713 |     if torch.backends.mps.is_available():
 714 |         accelerator.native_amp = False
 715 | 
 716 |     if args.report_to == "wandb":
 717 |         if not is_wandb_available():
 718 |             raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
 719 | 
 720 |     # Make one log on every process with the configuration for debugging.
 721 |     logging.basicConfig(
 722 |         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
 723 |         datefmt="%m/%d/%Y %H:%M:%S",
 724 |         level=logging.INFO,
 725 |     )
 726 |     logger.info(accelerator.state, main_process_only=False)
 727 |     if accelerator.is_local_main_process:
 728 |         transformers.utils.logging.set_verbosity_warning()
 729 |         diffusers.utils.logging.set_verbosity_info()
 730 |     else:
 731 |         transformers.utils.logging.set_verbosity_error()
 732 |         diffusers.utils.logging.set_verbosity_error()
 733 | 
 734 |     # If passed along, set the training seed now.
 735 |     if args.seed is not None:
 736 |         set_seed(args.seed)
 737 | 
 738 |     # Handle the repository creation
 739 |     if accelerator.is_main_process:
 740 |         if args.output_dir is not None:
 741 |             os.makedirs(args.output_dir, exist_ok=True)
 742 | 
 743 |         if args.push_to_hub:
 744 |             repo_id = create_repo(
 745 |                 repo_id=args.hub_model_id or Path(args.output_dir).name,
 746 |                 exist_ok=True,
 747 |             ).repo_id
 748 | 
 749 |     # Prepare models and scheduler
 750 |     tokenizer = AutoTokenizer.from_pretrained(
 751 |         args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
 752 |     )
 753 | 
 754 |     text_encoder = T5EncoderModel.from_pretrained(
 755 |         args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
 756 |     )
 757 | 
 758 |     # CogVideoX-2b weights are stored in float16
 759 |     # CogVideoX-5b and CogVideoX-5b-I2V weights are stored in bfloat16
 760 |     load_dtype = torch.bfloat16 if "5b" in args.pretrained_model_name_or_path.lower() else torch.float16
 761 |     transformer = CustomCogVideoXTransformer3DModel.from_pretrained(
 762 |         args.pretrained_model_name_or_path,
 763 |         subfolder="transformer",
 764 |         torch_dtype=load_dtype,
 765 |         revision=args.revision,
 766 |         variant=args.variant,
 767 |     )
 768 | 
 769 |     vae = AutoencoderKLCogVideoX.from_pretrained(
 770 |         args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
 771 |     )
 772 | 
 773 |     controlnet = CogVideoXControlnet(
 774 |         num_layers=args.controlnet_transformer_num_layers,
 775 |         downscale_coef=args.downscale_coef,
 776 |         in_channels=args.controlnet_input_channels,
 777 |         num_attention_heads=48 if "5b" in args.pretrained_model_name_or_path.lower() else 30,
 778 |         
 779 |     )
 780 | 
 781 |     if args.init_from_transformer:
 782 |         controlnet_state_dict = {}
 783 |         for name, params in transformer.state_dict().items():
 784 |             if 'patch_embed.proj.weight' in name:
 785 |                 continue
 786 |             controlnet_state_dict[name] = params
 787 |         m, u = controlnet.load_state_dict(controlnet_state_dict, strict=False)
 788 |         print(f'[ Weights from transformer was loaded into controlnet ] [M: {len(m)} | U: {len(u)}]')
 789 | 
 790 |     if args.pretrained_controlnet_path:
 791 |         ckpt = torch.load(args.pretrained_controlnet_path, map_location='cpu', weights_only=False)
 792 |         controlnet_state_dict = {}
 793 |         for name, params in ckpt['state_dict'].items():
 794 |             controlnet_state_dict[name] = params
 795 |         m, u = controlnet.load_state_dict(controlnet_state_dict, strict=False)
 796 |         print(f'[ Weights from pretrained controlnet was loaded into controlnet ] [M: {len(m)} | U: {len(u)}]')
 797 |     
 798 |     scheduler = CogVideoXDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
 799 | 
 800 |     if args.enable_slicing:
 801 |         vae.enable_slicing()
 802 |     if args.enable_tiling:
 803 |         vae.enable_tiling()
 804 | 
 805 |     # We only train the additional adapter controlnet layers
 806 |     text_encoder.requires_grad_(False)
 807 |     transformer.requires_grad_(False)
 808 |     vae.requires_grad_(False)
 809 |     controlnet.requires_grad_(True)
 810 | 
 811 |     # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
 812 |     # as these weights are only used for inference, keeping weights in full precision is not required.
 813 |     weight_dtype = torch.float32
 814 |     if accelerator.state.deepspeed_plugin:
 815 |         # DeepSpeed is handling precision, use what's in the DeepSpeed config
 816 |         if (
 817 |             "fp16" in accelerator.state.deepspeed_plugin.deepspeed_config
 818 |             and accelerator.state.deepspeed_plugin.deepspeed_config["fp16"]["enabled"]
 819 |         ):
 820 |             weight_dtype = torch.float16
 821 |         if (
 822 |             "bf16" in accelerator.state.deepspeed_plugin.deepspeed_config
 823 |             and accelerator.state.deepspeed_plugin.deepspeed_config["bf16"]["enabled"]
 824 |         ):
 825 |             weight_dtype = torch.float16
 826 |     else:
 827 |         if accelerator.mixed_precision == "fp16":
 828 |             weight_dtype = torch.float16
 829 |         elif accelerator.mixed_precision == "bf16":
 830 |             weight_dtype = torch.bfloat16
 831 | 
 832 |     if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
 833 |         # due to pytorch#99272, MPS does not yet support bfloat16.
 834 |         raise ValueError(
 835 |             "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
 836 |         )
 837 | 
 838 |     text_encoder.to(accelerator.device, dtype=weight_dtype)
 839 |     transformer.to(accelerator.device, dtype=weight_dtype)
 840 |     vae.to(accelerator.device, dtype=weight_dtype)
 841 |     controlnet.to(accelerator.device, dtype=weight_dtype)
 842 | 
 843 |     if args.gradient_checkpointing:
 844 |         transformer.enable_gradient_checkpointing()
 845 |         controlnet.enable_gradient_checkpointing()
 846 | 
 847 |     def unwrap_model(model):
 848 |         model = accelerator.unwrap_model(model)
 849 |         model = model._orig_mod if is_compiled_module(model) else model
 850 |         return model
 851 | 
 852 |     # Enable TF32 for faster training on Ampere GPUs,
 853 |     # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
 854 |     if args.allow_tf32 and torch.cuda.is_available():
 855 |         torch.backends.cuda.matmul.allow_tf32 = True
 856 | 
 857 |     if args.scale_lr:
 858 |         args.learning_rate = (
 859 |             args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
 860 |         )
 861 | 
 862 |     # Make sure the trainable params are in float32.
 863 |     if args.mixed_precision == "fp16":
 864 |         # only upcast trainable parameters into fp32
 865 |         cast_training_params([controlnet], dtype=torch.float32)
 866 | 
 867 |     trainable_parameters = list(filter(lambda p: p.requires_grad, controlnet.parameters()))
 868 | 
 869 |     # Optimization parameters
 870 |     trainable_parameters_with_lr = {"params": trainable_parameters, "lr": args.learning_rate}
 871 |     params_to_optimize = [trainable_parameters_with_lr]
 872 | 
 873 |     use_deepspeed_optimizer = (
 874 |         accelerator.state.deepspeed_plugin is not None
 875 |         and "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config
 876 |     )
 877 |     use_deepspeed_scheduler = (
 878 |         accelerator.state.deepspeed_plugin is not None
 879 |         and "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
 880 |     )
 881 | 
 882 |     optimizer = get_optimizer(args, params_to_optimize, use_deepspeed=use_deepspeed_optimizer)
 883 | 
 884 |     # Dataset and DataLoader
 885 |     train_dataset = OpenvidControlnetDataset(
 886 |         video_root_dir=args.video_root_dir,
 887 |         csv_path=args.csv_path,
 888 |         image_size=(args.height, args.width), 
 889 |         stride=(args.stride_min, args.stride_max),
 890 |         sample_n_frames=args.max_num_frames,
 891 |         hflip_p=args.hflip_p,
 892 |         controlnet_type=args.controlnet_type,
 893 |     )
 894 |         
 895 |     def encode_video(video):
 896 |         video = video.to(accelerator.device, dtype=vae.dtype)
 897 |         video = video.permute(0, 2, 1, 3, 4)  # [B, C, F, H, W]
 898 |         latent_dist = vae.encode(video).latent_dist.sample() * vae.config.scaling_factor
 899 |         return latent_dist.permute(0, 2, 1, 3, 4).to(memory_format=torch.contiguous_format)
 900 |     
 901 |     def collate_fn(examples):
 902 |         videos = [example["video"] for example in examples]
 903 |         prompts = [example["caption"] for example in examples]
 904 |         controlnet_videos = [example["controlnet_video"] for example in examples]
 905 | 
 906 |         videos = torch.stack(videos)
 907 |         videos = videos.to(memory_format=torch.contiguous_format).float()
 908 | 
 909 |         controlnet_videos = torch.stack(controlnet_videos)
 910 |         controlnet_videos = controlnet_videos.to(memory_format=torch.contiguous_format).float()
 911 | 
 912 |         return {
 913 |             "videos": videos,
 914 |             "prompts": prompts,
 915 |             "controlnet_videos": controlnet_videos,
 916 |         }
 917 | 
 918 |     train_dataloader = DataLoader(
 919 |         train_dataset,
 920 |         batch_size=args.train_batch_size,
 921 |         shuffle=True,
 922 |         collate_fn=collate_fn,
 923 |         num_workers=args.dataloader_num_workers,
 924 |     )
 925 | 
 926 |     # Scheduler and math around the number of training steps.
 927 |     overrode_max_train_steps = False
 928 |     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
 929 |     if args.max_train_steps is None:
 930 |         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
 931 |         overrode_max_train_steps = True
 932 | 
 933 |     if use_deepspeed_scheduler:
 934 |         from accelerate.utils import DummyScheduler
 935 | 
 936 |         lr_scheduler = DummyScheduler(
 937 |             name=args.lr_scheduler,
 938 |             optimizer=optimizer,
 939 |             total_num_steps=args.max_train_steps * accelerator.num_processes,
 940 |             num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
 941 |         )
 942 |     else:
 943 |         lr_scheduler = get_scheduler(
 944 |             args.lr_scheduler,
 945 |             optimizer=optimizer,
 946 |             num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
 947 |             num_training_steps=args.max_train_steps * accelerator.num_processes,
 948 |             num_cycles=args.lr_num_cycles,
 949 |             power=args.lr_power,
 950 |         )
 951 | 
 952 |     # Prepare everything with our `accelerator`.
 953 |     controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
 954 |         controlnet, optimizer, train_dataloader, lr_scheduler
 955 |     )
 956 | 
 957 |     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
 958 |     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
 959 |     if overrode_max_train_steps:
 960 |         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
 961 |     # Afterwards we recalculate our number of training epochs
 962 |     args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
 963 | 
 964 |     # We need to initialize the trackers we use, and also store our configuration.
 965 |     # The trackers initializes automatically on the main process.
 966 |     if accelerator.is_main_process:
 967 |         tracker_name = args.tracker_name or "cogvideox-controlnet"
 968 |         accelerator.init_trackers(tracker_name, config=vars(args))
 969 | 
 970 |     # Train!
 971 |     total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
 972 |     num_trainable_parameters = sum(param.numel() for model in params_to_optimize for param in model["params"])
 973 | 
 974 |     logger.info("***** Running training *****")
 975 |     logger.info(f"  Num trainable parameters = {num_trainable_parameters}")
 976 |     logger.info(f"  Num examples = {len(train_dataset)}")
 977 |     logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
 978 |     logger.info(f"  Num epochs = {args.num_train_epochs}")
 979 |     logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
 980 |     logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
 981 |     logger.info(f"  Gradient accumulation steps = {args.gradient_accumulation_steps}")
 982 |     logger.info(f"  Total optimization steps = {args.max_train_steps}")
 983 |     global_step = 0
 984 |     first_epoch = 0
 985 |     initial_global_step = 0
 986 | 
 987 |     progress_bar = tqdm(
 988 |         range(0, args.max_train_steps),
 989 |         initial=initial_global_step,
 990 |         desc="Steps",
 991 |         # Only show the progress bar once on each machine.
 992 |         disable=not accelerator.is_local_main_process,
 993 |     )
 994 |     vae_scale_factor_spatial = 2 ** (len(vae.config.block_out_channels) - 1)
 995 | 
 996 |     # For DeepSpeed training
 997 |     model_config = transformer.module.config if hasattr(transformer, "module") else transformer.config
 998 | 
 999 |     for epoch in range(first_epoch, args.num_train_epochs):
1000 |         controlnet.train()
1001 | 
1002 |         for step, batch in enumerate(train_dataloader):
1003 |             models_to_accumulate = [controlnet]
1004 | 
1005 |             with accelerator.accumulate(models_to_accumulate):
1006 |                 model_input = encode_video(batch["videos"]).to(dtype=weight_dtype)  # [B, F, C, H, W]
1007 |                 controlnet_encoded_frames = batch["controlnet_videos"]
1008 |                 prompts = batch["prompts"]
1009 |                 
1010 |                 # encode prompts
1011 |                 prompt_embeds = compute_prompt_embeddings(
1012 |                     tokenizer,
1013 |                     text_encoder,
1014 |                     prompts,
1015 |                     model_config.max_text_seq_length,
1016 |                     accelerator.device,
1017 |                     weight_dtype,
1018 |                     requires_grad=False,
1019 |                 )
1020 | 
1021 |                 # Sample noise that will be added to the latents
1022 |                 noise = torch.randn_like(model_input)
1023 |                 batch_size, num_frames, num_channels, height, width = model_input.shape
1024 | 
1025 |                 # Sample a random timestep for each image
1026 |                 timesteps = torch.randint(
1027 |                     0, scheduler.config.num_train_timesteps, (batch_size,), device=model_input.device
1028 |                 )
1029 |                 timesteps = timesteps.long()
1030 |         
1031 |                 # Prepare rotary embeds
1032 |                 image_rotary_emb = (
1033 |                     prepare_rotary_positional_embeddings(
1034 |                         height=args.height,
1035 |                         width=args.width,
1036 |                         num_frames=num_frames,
1037 |                         vae_scale_factor_spatial=vae_scale_factor_spatial,
1038 |                         patch_size=model_config.patch_size,
1039 |                         attention_head_dim=model_config.attention_head_dim,
1040 |                         device=accelerator.device,
1041 |                     )
1042 |                     if model_config.use_rotary_positional_embeddings
1043 |                     else None
1044 |                 )
1045 | 
1046 |                 # Add noise to the model input according to the noise magnitude at each timestep
1047 |                 # (this is the forward diffusion process)
1048 |                 noisy_model_input = scheduler.add_noise(model_input, noise, timesteps)
1049 | 
1050 |                 controlnet_states = controlnet(
1051 |                     hidden_states=noisy_model_input,
1052 |                     encoder_hidden_states=prompt_embeds,
1053 |                     image_rotary_emb=image_rotary_emb,
1054 |                     controlnet_states=controlnet_encoded_frames,
1055 |                     timestep=timesteps,
1056 |                     return_dict=False,
1057 |                 )[0]
1058 |                 if isinstance(controlnet_states, (tuple, list)):
1059 |                     controlnet_states = [x.to(dtype=weight_dtype) for x in controlnet_states]
1060 |                 else:
1061 |                     controlnet_states = controlnet_states.to(dtype=weight_dtype)
1062 |                 # Predict the noise residual
1063 |                 model_output = transformer(
1064 |                     hidden_states=noisy_model_input,
1065 |                     encoder_hidden_states=prompt_embeds,
1066 |                     timestep=timesteps,
1067 |                     image_rotary_emb=image_rotary_emb,
1068 |                     controlnet_states=controlnet_states,
1069 |                     controlnet_weights=args.controlnet_weights,
1070 |                     return_dict=False,
1071 |                 )[0]
1072 |                 model_pred = scheduler.get_velocity(model_output, noisy_model_input, timesteps)
1073 | 
1074 |                 alphas_cumprod = scheduler.alphas_cumprod[timesteps]
1075 |                 weights = 1 / (1 - alphas_cumprod)
1076 |                 while len(weights.shape) < len(model_pred.shape):
1077 |                     weights = weights.unsqueeze(-1)
1078 | 
1079 |                 target = model_input
1080 | 
1081 |                 loss = torch.mean((weights * (model_pred - target) ** 2).reshape(batch_size, -1), dim=1)
1082 |                 loss = loss.mean()
1083 |                 accelerator.backward(loss)
1084 | 
1085 |                 if accelerator.sync_gradients:
1086 |                     params_to_clip = controlnet.parameters()
1087 |                     accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
1088 | 
1089 |                 if accelerator.state.deepspeed_plugin is None:
1090 |                     optimizer.step()
1091 |                     optimizer.zero_grad()
1092 | 
1093 |                 lr_scheduler.step()
1094 | 
1095 |             # Checks if the accelerator has performed an optimization step behind the scenes
1096 |             if accelerator.sync_gradients:
1097 |                 progress_bar.update(1)
1098 |                 global_step += 1
1099 | 
1100 |                 if accelerator.is_main_process:
1101 |                     if global_step % args.checkpointing_steps == 0:
1102 |                         save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}.pt")
1103 |                         torch.save({'state_dict': unwrap_model(controlnet).state_dict()}, save_path)
1104 |                         logger.info(f"Saved state to {save_path}")
1105 | 
1106 |             logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
1107 |             progress_bar.set_postfix(**logs)
1108 |             accelerator.log(logs, step=global_step)
1109 | 
1110 |             if global_step >= args.max_train_steps:
1111 |                 break
1112 | 
1113 |             if accelerator.is_main_process:
1114 |                 if args.validation_prompt is not None and (step + 1) % args.validation_steps == 0:
1115 |                     # Create pipeline
1116 |                     pipe = ControlnetCogVideoXPipeline.from_pretrained(
1117 |                         args.pretrained_model_name_or_path,
1118 |                         transformer=unwrap_model(transformer),
1119 |                         text_encoder=unwrap_model(text_encoder),
1120 |                         vae=unwrap_model(vae),
1121 |                         controlnet=unwrap_model(controlnet),
1122 |                         scheduler=scheduler,
1123 |                         torch_dtype=weight_dtype,
1124 |                     )
1125 |     
1126 |                     validation_prompts = args.validation_prompt.split(args.validation_prompt_separator)
1127 |                     validation_videos = args.validation_video.split(args.validation_prompt_separator)
1128 |                     for validation_prompt, validation_video in zip(validation_prompts, validation_videos):
1129 |                         numpy_frames = read_video(validation_video, frames_count=args.max_num_frames)
1130 |                         controlnet_frames = np.stack([train_dataset.controlnet_processor(x) for x in numpy_frames])
1131 |                         pipeline_args = {
1132 |                             "prompt": validation_prompt,
1133 |                             "controlnet_frames": controlnet_frames,
1134 |                             "guidance_scale": args.guidance_scale,
1135 |                             "use_dynamic_cfg": args.use_dynamic_cfg,
1136 |                             "height": args.height,
1137 |                             "width": args.width,
1138 |                             "num_frames": args.max_num_frames,
1139 |                             "num_inference_steps": args.num_inference_steps,
1140 |                             "controlnet_weights": args.controlnet_weights,
1141 |                         }
1142 |     
1143 |                         validation_outputs = log_validation(
1144 |                             pipe=pipe,
1145 |                             args=args,
1146 |                             accelerator=accelerator,
1147 |                             pipeline_args=pipeline_args,
1148 |                             epoch=epoch,
1149 |                         )
1150 |     
1151 |     accelerator.wait_for_everyone()
1152 |     accelerator.end_training()
1153 | 
1154 | 
1155 | if __name__ == "__main__":  # only run training when executed as a script, not on import
1156 |     args = get_args()  # build the argument namespace that main() reads its settings from
1157 |     main(args)  # run the training procedure with those settings


--------------------------------------------------------------------------------