├── .gitignore ├── LICENSE.txt ├── README.md ├── __assets__ ├── animations │ ├── compare │ │ ├── ffmpeg │ │ ├── new_0.gif │ │ ├── new_1.gif │ │ ├── new_2.gif │ │ ├── new_3.gif │ │ ├── old_0.gif │ │ ├── old_1.gif │ │ ├── old_2.gif │ │ └── old_3.gif │ ├── control │ │ ├── canny │ │ │ ├── dance_1girl.gif │ │ │ ├── dance_medival_portrait.gif │ │ │ └── smiling_medival_portrait.gif │ │ ├── depth │ │ │ ├── smiling_1girl.gif │ │ │ ├── smiling_forbidden_castle.gif │ │ │ ├── smiling_halo.gif │ │ │ ├── smiling_medival.gif │ │ │ ├── smiling_realistic_0.gif │ │ │ ├── smiling_realistic_1.gif │ │ │ └── smiling_realistic_2.gif │ │ ├── original │ │ │ ├── dance_original_16_2.gif │ │ │ └── smiling_original_16_2.gif │ │ └── softedge │ │ │ ├── dance_1girl.gif │ │ │ └── smiling_realistic_0.gif │ ├── model_01 │ │ ├── 01.gif │ │ ├── 02.gif │ │ ├── 03.gif │ │ └── 04.gif │ ├── model_02 │ │ ├── 01.gif │ │ ├── 02.gif │ │ ├── 03.gif │ │ └── 04.gif │ ├── model_03 │ │ ├── 01.gif │ │ ├── 02.gif │ │ ├── 03.gif │ │ └── 04.gif │ ├── model_04 │ │ ├── 01.gif │ │ ├── 02.gif │ │ ├── 03.gif │ │ └── 04.gif │ ├── model_05 │ │ ├── 01.gif │ │ ├── 02.gif │ │ ├── 03.gif │ │ └── 04.gif │ ├── model_06 │ │ ├── 01.gif │ │ ├── 02.gif │ │ ├── 03.gif │ │ └── 04.gif │ ├── model_07 │ │ ├── 01.gif │ │ ├── 02.gif │ │ ├── 03.gif │ │ ├── 04.gif │ │ └── init.jpg │ └── model_08 │ │ ├── 01.gif │ │ ├── 02.gif │ │ ├── 03.gif │ │ └── 04.gif └── figs │ └── gradio.jpg ├── animatediff ├── controlnet │ ├── controlnet_module.py │ └── controlnet_processors.py ├── data │ └── dataset.py ├── models │ ├── attention.py │ ├── motion_module.py │ ├── resnet.py │ ├── unet.py │ └── unet_blocks.py ├── pipelines │ └── pipeline_animation.py └── utils │ ├── convert_from_ckpt.py │ ├── convert_lora_safetensor_to_diffusers.py │ └── util.py ├── app.py ├── configs ├── inference │ ├── inference-v1.yaml │ └── inference-v2.yaml ├── prompts │ ├── 1-ToonYou-Controlnet.yaml │ ├── 1-ToonYou.yaml │ ├── 2-Lyriel-Controlnet.yaml │ ├── 2-Lyriel.yaml │ ├── 3-RcnzCartoon-Controlnet.yaml │ ├── 3-RcnzCartoon.yaml │ ├── 4-MajicMix.yaml │ ├── 5-RealisticVision.yaml │ ├── 6-Tusun.yaml │ ├── 7-FilmVelvia.yaml │ ├── 8-GhibliBackground.yaml │ └── v2 │ │ ├── 1-ToonYou-Controlnet.yaml │ │ ├── 2-Lyriel-Controlnet.yaml │ │ ├── 3-RcnzCartoon-Controlnet.yaml │ │ ├── 5-RealisticVision-Controlnet.yaml │ │ └── 5-RealisticVision.yaml └── training │ ├── image_finetune.yaml │ └── training.yaml ├── download_bashscripts ├── 0-MotionModule.sh ├── 1-ToonYou.sh ├── 2-Lyriel.sh ├── 3-RcnzCartoon.sh ├── 4-MajicMix.sh ├── 5-RealisticVision.sh ├── 6-Tusun.sh ├── 7-FilmVelvia.sh ├── 8-GhibliBackground.sh └── 9-Controlnets.sh ├── environment.yaml ├── models ├── Controlnet │ └── Put controlnet models repo here.txt ├── DreamBooth_LoRA │ └── Put personalized T2I checkpoints here.txt ├── Motion_Module │ └── Put motion module checkpoints here.txt └── StableDiffusion │ └── Put diffusers stable-diffusion-v1-5 repo here.txt ├── scripts └── animate.py ├── train.py └── videos ├── Put your short videos here.txt ├── dance.mp4 └── smiling.mp4 /.gitignore: -------------------------------------------------------------------------------- 1 | samples/ 2 | wandb/ 3 | outputs/ 4 | __pycache__/ 5 | 6 | scripts/animate_inter.py 7 | scripts/gradio_app.py 8 | models/Controlnet/* 9 | models/DreamBooth_LoRA/* 10 | models/DreamBooth_LoRA/Put*personalized*T2I*checkpoints*here.txt 11 | models/Motion_Module/* 12 | models/* 13 | *.ipynb 14 | *.safetensors 15 | *.ckpt 16 | .ossutil_checkpoint/ 17 | ossutil_output/ 18 | debugs/ 19 | 
-------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Controled AnimateDiff (V2 is also available) 2 | 3 | This repository is a Controlnet extension of the official implementation of [AnimateDiff](https://arxiv.org/abs/2307.04725). 4 | 5 | **[AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning](https://arxiv.org/abs/2307.04725)** 6 |
7 | Yuwei Guo, 8 | Ceyuan Yang*, 9 | Anyi Rao, 10 | Yaohui Wang, 11 | Yu Qiao, 12 | Dahua Lin, 13 | Bo Dai 14 |

*Corresponding Author

15 | 16 | 17 | [![arXiv](https://img.shields.io/badge/arXiv-2307.04725-b31b1b.svg)](https://arxiv.org/abs/2307.04725) 18 | [![Project Page](https://img.shields.io/badge/Project-Website-green)](https://animatediff.github.io/) 19 | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Masbfca/AnimateDiff) 20 | [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow)](https://huggingface.co/spaces/guoyww/AnimateDiff) 21 | 22 | ***WARNING! This version works as well as the official implementation, but it is not compatible with it due to differences in library versions.*** 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 |
46 | Test video sources: dance and smiling. 47 | 48 | ## Todo 49 | - [x] Add Controlnet to the pipeline. 50 | - [x] Add Controlnet to the Gradio demo. 51 | - [X] Optimize the code in attention-processor style. 52 | 53 | ## Features 54 | - Added Controlnet for video-to-video control. 55 | - GPU memory: ~12-14GB VRAM for inference without Controlnet and ~15-17GB VRAM with Controlnet. 56 | 57 | - **[2023/09/10]** New Motion Module release! `mm_sd_v15_v2.ckpt` was trained at a larger resolution and batch size and shows noticeable quality improvements. Check it out at [Google Drive](https://drive.google.com/drive/folders/1EqLC65eR1-W-sGD0Im7fkED6c8GkiNFI?usp=sharing) / [HuggingFace](https://huggingface.co/guoyww/animatediff) and use it with `configs/inference/inference-v2.yaml`. Example: 58 | ``` 59 | python -m scripts.animate --config configs/prompts/v2/5-RealisticVision.yaml 60 | ``` 61 | Here is a qualitative comparison between `mm_sd_v15.ckpt` (left) and `mm_sd_v15_v2.ckpt` (right): 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 |
74 | - GPU memory optimization: ~12GB VRAM for inference 75 | 76 | - User Interface: [Gradio](#gradio-demo), A1111 WebUI Extension [sd-webui-animatediff](https://github.com/continue-revolution/sd-webui-animatediff) (by [@continue-revolution](https://github.com/continue-revolution)) 77 | - Google Colab: [Colab](https://colab.research.google.com/github/camenduru/AnimateDiff-colab/blob/main/AnimateDiff_colab.ipynb) (by [@camenduru](https://github.com/camenduru)) 78 | 79 | ## Common Issues 80 |
81 | Installation 82 | 83 | Please ensure [xformers](https://github.com/facebookresearch/xformers) is installed; it is used to reduce inference memory. 84 |
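A minimal sketch (not part of this repo; the helper name and the `pipe` object are illustrative) of how one might verify that xformers is available and enable memory-efficient attention on a diffusers-style pipeline:

```python
# Sketch only: check that xformers is importable and, if so, enable
# memory-efficient attention on a diffusers-style pipeline object.
from diffusers.utils.import_utils import is_xformers_available

def maybe_enable_xformers(pipe):
    if is_xformers_available():
        # Reduces attention memory usage at inference time.
        pipe.enable_xformers_memory_efficient_attention()
    else:
        print("xformers not found; install it (e.g. `pip install xformers`) to lower VRAM usage.")
```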
85 | 86 | 87 |
88 | Various resolution or number of frames 89 | Currently, we recommend generating animations with 16 frames at 512 resolution, which matches our training settings. Note that other resolutions or frame counts may affect the quality to some extent. 90 |
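To see why these settings matter, note that the diffusion runs on a 5D latent of shape `(batch, 4, frames, H/8, W/8)` (the same shape as the control latents in `animatediff/controlnet/controlnet_module.py`), so frame count and resolution directly scale compute and VRAM. A small illustration:

```python
# Illustration only: the latent tensor denoised for an animation.
import torch

video_length, height, width = 16, 512, 512  # the recommended settings
latents = torch.randn(1, 4, video_length, height // 8, width // 8)
print(latents.shape)  # torch.Size([1, 4, 16, 64, 64])
```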
91 | 92 | 93 |
94 | How to use it without any coding 95 | 96 | 1) Get LoRA models: train a LoRA model with [A1111](https://github.com/continue-revolution/sd-webui-animatediff) based on a collection of your own favorite images (e.g., tutorials [English](https://www.youtube.com/watch?v=mfaqqL5yOO4), [Japanese](https://www.youtube.com/watch?v=N1tXVR9lplM), [Chinese](https://www.bilibili.com/video/BV1fs4y1x7p2/)) 97 | or download LoRA models from [Civitai](https://civitai.com/). 98 | 99 | 2) Animate LoRA models: use the Gradio interface or A1111 100 | (e.g., tutorials [English](https://github.com/continue-revolution/sd-webui-animatediff), [Japanese](https://www.youtube.com/watch?v=zss3xbtvOWw), [Chinese](https://941ai.com/sd-animatediff-webui-1203.html)) 101 | 102 | 3) Be creative together with other techniques, such as super resolution, frame interpolation, music generation, etc. 103 |
104 | 105 | 106 |
107 | Animating a given image 108 | 109 | We totally agree that animating a given image is an appealing feature, which we will try to support officially in the future. For now, you may try other efforts such as [talesofai](https://github.com/talesofai/AnimateDiff). 110 |
111 | 112 |
113 | Contributions from the community 114 | Contributions are always welcome! The dev branch is for community contributions; as for the main branch, we would like to keep it aligned with the original technical report :)
116 | 117 | 118 | ## Setups for Inference 119 | 120 | ### Prepare Environment 121 | 122 | ``` 123 | git clone https://github.com/guoyww/AnimateDiff.git 124 | cd AnimateDiff 125 | 126 | conda env create -f environment.yaml 127 | conda activate animatediff 128 | ``` 129 | 130 | ### Download Base T2I & Motion Module Checkpoints 131 | We provide two versions of our Motion Module, which are trained on stable-diffusion-v1-4 and finetuned on v1-5 separately. 132 | It's recommended to try both of them for best results. 133 | ``` 134 | git lfs install 135 | git clone https://huggingface.co/runwayml/stable-diffusion-v1-5 models/StableDiffusion/ 136 | 137 | bash download_bashscripts/0-MotionModule.sh 138 | ``` 139 | You may also directly download the motion module checkpoints from [Google Drive](https://drive.google.com/drive/folders/1EqLC65eR1-W-sGD0Im7fkED6c8GkiNFI?usp=sharing) / [HuggingFace](https://huggingface.co/guoyww/animatediff) / [CivitAI](https://civitai.com/models/108836), then put them in the `models/Motion_Module/` folder. 140 | 141 | ### Prepare Personalized T2I 142 | Here we provide inference configs for 6 demo T2I models from CivitAI. 143 | You may run the following bash scripts to download these checkpoints. 144 | ``` 145 | bash download_bashscripts/1-ToonYou.sh 146 | bash download_bashscripts/2-Lyriel.sh 147 | bash download_bashscripts/3-RcnzCartoon.sh 148 | bash download_bashscripts/4-MajicMix.sh 149 | bash download_bashscripts/5-RealisticVision.sh 150 | bash download_bashscripts/6-Tusun.sh 151 | bash download_bashscripts/7-FilmVelvia.sh 152 | bash download_bashscripts/8-GhibliBackground.sh 153 | ``` 154 | 155 | ### Inference 156 | After downloading the above personalized T2I checkpoints, run the following commands to generate animations. The results will automatically be saved to the `samples/` folder. 157 | ``` 158 | python -m scripts.animate --config configs/prompts/1-ToonYou.yaml 159 | python -m scripts.animate --config configs/prompts/2-Lyriel.yaml 160 | python -m scripts.animate --config configs/prompts/3-RcnzCartoon.yaml 161 | python -m scripts.animate --config configs/prompts/4-MajicMix.yaml 162 | python -m scripts.animate --config configs/prompts/5-RealisticVision.yaml 163 | python -m scripts.animate --config configs/prompts/6-Tusun.yaml 164 | python -m scripts.animate --config configs/prompts/7-FilmVelvia.yaml 165 | python -m scripts.animate --config configs/prompts/8-GhibliBackground.yaml 166 | ``` 167 | 168 | To generate animations with a new DreamBooth/LoRA model, you may create a new config `.yaml` file in the following format: 169 | ``` 170 | NewModel: 171 | path: "[path to your DreamBooth/LoRA model .safetensors file]" 172 | base: "[path to LoRA base model .safetensors file, leave it as an empty string if not needed]" 173 | 174 | motion_module: 175 | - "models/Motion_Module/mm_sd_v14.ckpt" 176 | - "models/Motion_Module/mm_sd_v15.ckpt" 177 | 178 | steps: 25 179 | guidance_scale: 7.5 180 | 181 | prompt: 182 | - "[positive prompt]" 183 | 184 | n_prompt: 185 | - "[negative prompt]" 186 | ``` 187 | Then run the following command: 188 | ``` 189 | python -m scripts.animate --config [path to the config file] 190 | ``` 191 | ## Inference with Controlnet 192 | The Controlnet approach uses a video as the source of content. It takes the first `L` (usually 16) frames from the video.
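The frame sampling follows `process_video` in `animatediff/controlnet/controlnet_module.py`: every `get_each`-th frame is kept until `L` frames are collected. A standalone sketch of the same idea (function and parameter names are illustrative):

```python
# Sketch of the Controlnet frame sampling: keep every `get_each`-th frame,
# resized to the target resolution, until `video_length` frames are collected.
import cv2

def sample_frames(video_path, video_length=16, get_each=2, img_w=512, img_h=512):
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_index = 0
    while len(frames) < video_length:
        ret, image = cap.read()
        if not ret or image is None:
            break
        if frame_index % get_each == 0:
            image = cv2.resize(image, (img_w, img_h))
            frames.append(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        frame_index += 1
    cap.release()
    return frames  # RGB uint8 arrays, passed to the chosen Controlnet processor
```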
193 | 194 | Download the controlnet models using the script: 195 | ```bash 196 | bash download_bashscripts/9-Controlnets.sh 197 | ``` 198 | 199 | Run the examples: 200 | ```bash 201 | python -m scripts.animate --config configs/prompts/1-ToonYou-Controlnet.yaml 202 | python -m scripts.animate --config configs/prompts/2-Lyriel-Controlnet.yaml 203 | python -m scripts.animate --config configs/prompts/3-RcnzCartoon-Controlnet.yaml 204 | ``` 205 | 206 | To add controlnet to another config, add a `control` section (see the example in 1-ToonYou-Controlnet.yaml): 207 | ```yaml 208 | control: 209 | video_path: "./videos/smiling.mp4" 210 | get_each: 2 # take every N-th frame from the video 211 | controlnet_processor: "softedge" # softedge, canny, depth 212 | controlnet_pipeline: "models/StableDiffusion/stable-diffusion-v1-5" 213 | controlnet_processor_path: "models/Controlnet/control_v11p_sd15_softedge" # control_v11p_sd15_softedge, control_v11f1p_sd15_depth, control_v11p_sd15_canny 214 | guess_mode: True 215 | ``` 216 | 217 | ## Steps for Training 218 | 219 | ### Dataset 220 | Before training, download the video files and the `.csv` annotations of [WebVid10M](https://maxbain.com/webvid-dataset/) to the local machine. 221 | Note that our example training script requires all the videos to be saved in a single folder. You may change this by modifying `animatediff/data/dataset.py`. 222 | 223 | ### Configuration 224 | After dataset preparation, update the data paths below in the config `.yaml` files in the `configs/training/` folder: 225 | ``` 226 | train_data: 227 | csv_path: [Replace with .csv Annotation File Path] 228 | video_folder: [Replace with Video Folder Path] 229 | sample_size: 256 230 | ``` 231 | Other training parameters (lr, epochs, validation settings, etc.) are also included in the config files. 232 | 233 | ### Training 234 | To train motion modules: 235 | ``` 236 | torchrun --nnodes=1 --nproc_per_node=1 train.py --config configs/training/training.yaml 237 | ``` 238 | 239 | To finetune the UNet's image layers: 240 | ``` 241 | torchrun --nnodes=1 --nproc_per_node=1 train.py --config configs/training/image_finetune.yaml 242 | ``` 243 | 244 | 245 | ## Gradio Demo 246 | We have created a Gradio demo to make AnimateDiff easier to use. To launch the demo, please run the following commands: 247 | ``` 248 | conda activate animatediff 249 | python app.py 250 | ``` 251 | By default, the demo will run at `localhost:7860`. 252 | Make sure imageio with the ffmpeg backend is installed (`pip install imageio[ffmpeg]`). 253 | 254 |
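A quick, optional way to confirm that the ffmpeg backend is present before launching the demo (this assumes `imageio[ffmpeg]` pulls in the `imageio_ffmpeg` package, which it normally does):

```python
# Optional sanity check for the ffmpeg backend used by imageio when saving GIFs/videos.
try:
    import imageio_ffmpeg  # provided by `pip install imageio[ffmpeg]`
    print("ffmpeg found at:", imageio_ffmpeg.get_ffmpeg_exe())
except Exception as exc:
    print("ffmpeg backend missing; run `pip install imageio[ffmpeg]`:", exc)
```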
255 | 256 | ## Gallery 257 | Here we demonstrate several of the best results we found in our experiments. 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 |
267 |

Model: ToonYou

268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 |
277 |

Model: Counterfeit V3.0

278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 |
287 |

Model: Realistic Vision V2.0

288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 |
297 |

Model: majicMIX Realistic

298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 |
307 |

Model: RCNZ Cartoon

308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 |
317 |

Model: FilmVelvia

318 | 319 | #### Community Cases 320 | Here are some samples contributed by the community artists. Create a Pull Request if you would like to show your results here😚. 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 |
331 |

332 | Character Model:Yoimiya 333 | (with an initial reference image, see WIP fork for the extended implementation.) 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 |
344 |

345 | Character Model:Paimon; 346 | Pose Model:Hold Sign

347 | 348 | ## BibTeX 349 | ``` 350 | @article{guo2023animatediff, 351 | title={AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning}, 352 | author={Guo, Yuwei and Yang, Ceyuan and Rao, Anyi and Wang, Yaohui and Qiao, Yu and Lin, Dahua and Dai, Bo}, 353 | journal={arXiv preprint arXiv:2307.04725}, 354 | year={2023} 355 | } 356 | ``` 357 | 358 | ## Contact Us 359 | **Yuwei Guo**: [guoyuwei@pjlab.org.cn](mailto:guoyuwei@pjlab.org.cn) 360 | **Ceyuan Yang**: [yangceyuan@pjlab.org.cn](mailto:yangceyuan@pjlab.org.cn) 361 | **Bo Dai**: [daibo@pjlab.org.cn](mailto:daibo@pjlab.org.cn) 362 | 363 | ## Acknowledgements 364 | Codebase built upon [Tune-a-Video](https://github.com/showlab/Tune-A-Video). 365 | -------------------------------------------------------------------------------- /__assets__/animations/compare/ffmpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/compare/ffmpeg -------------------------------------------------------------------------------- /__assets__/animations/compare/new_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/compare/new_0.gif -------------------------------------------------------------------------------- /__assets__/animations/compare/new_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/compare/new_1.gif -------------------------------------------------------------------------------- /__assets__/animations/compare/new_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/compare/new_2.gif -------------------------------------------------------------------------------- /__assets__/animations/compare/new_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/compare/new_3.gif -------------------------------------------------------------------------------- /__assets__/animations/compare/old_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/compare/old_0.gif -------------------------------------------------------------------------------- /__assets__/animations/compare/old_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/compare/old_1.gif -------------------------------------------------------------------------------- /__assets__/animations/compare/old_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/compare/old_2.gif 
-------------------------------------------------------------------------------- /__assets__/animations/compare/old_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/compare/old_3.gif -------------------------------------------------------------------------------- /__assets__/animations/control/canny/dance_1girl.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/control/canny/dance_1girl.gif -------------------------------------------------------------------------------- /__assets__/animations/control/canny/dance_medival_portrait.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/control/canny/dance_medival_portrait.gif -------------------------------------------------------------------------------- /__assets__/animations/control/canny/smiling_medival_portrait.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/control/canny/smiling_medival_portrait.gif -------------------------------------------------------------------------------- /__assets__/animations/control/depth/smiling_1girl.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/control/depth/smiling_1girl.gif -------------------------------------------------------------------------------- /__assets__/animations/control/depth/smiling_forbidden_castle.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/control/depth/smiling_forbidden_castle.gif -------------------------------------------------------------------------------- /__assets__/animations/control/depth/smiling_halo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/control/depth/smiling_halo.gif -------------------------------------------------------------------------------- /__assets__/animations/control/depth/smiling_medival.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/control/depth/smiling_medival.gif -------------------------------------------------------------------------------- /__assets__/animations/control/depth/smiling_realistic_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/control/depth/smiling_realistic_0.gif -------------------------------------------------------------------------------- 
/__assets__/animations/control/depth/smiling_realistic_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/control/depth/smiling_realistic_1.gif -------------------------------------------------------------------------------- /__assets__/animations/control/depth/smiling_realistic_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/control/depth/smiling_realistic_2.gif -------------------------------------------------------------------------------- /__assets__/animations/control/original/dance_original_16_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/control/original/dance_original_16_2.gif -------------------------------------------------------------------------------- /__assets__/animations/control/original/smiling_original_16_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/control/original/smiling_original_16_2.gif -------------------------------------------------------------------------------- /__assets__/animations/control/softedge/dance_1girl.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/control/softedge/dance_1girl.gif -------------------------------------------------------------------------------- /__assets__/animations/control/softedge/smiling_realistic_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/control/softedge/smiling_realistic_0.gif -------------------------------------------------------------------------------- /__assets__/animations/model_01/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_01/01.gif -------------------------------------------------------------------------------- /__assets__/animations/model_01/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_01/02.gif -------------------------------------------------------------------------------- /__assets__/animations/model_01/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_01/03.gif -------------------------------------------------------------------------------- /__assets__/animations/model_01/04.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_01/04.gif -------------------------------------------------------------------------------- /__assets__/animations/model_02/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_02/01.gif -------------------------------------------------------------------------------- /__assets__/animations/model_02/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_02/02.gif -------------------------------------------------------------------------------- /__assets__/animations/model_02/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_02/03.gif -------------------------------------------------------------------------------- /__assets__/animations/model_02/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_02/04.gif -------------------------------------------------------------------------------- /__assets__/animations/model_03/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_03/01.gif -------------------------------------------------------------------------------- /__assets__/animations/model_03/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_03/02.gif -------------------------------------------------------------------------------- /__assets__/animations/model_03/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_03/03.gif -------------------------------------------------------------------------------- /__assets__/animations/model_03/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_03/04.gif -------------------------------------------------------------------------------- /__assets__/animations/model_04/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_04/01.gif -------------------------------------------------------------------------------- /__assets__/animations/model_04/02.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_04/02.gif -------------------------------------------------------------------------------- /__assets__/animations/model_04/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_04/03.gif -------------------------------------------------------------------------------- /__assets__/animations/model_04/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_04/04.gif -------------------------------------------------------------------------------- /__assets__/animations/model_05/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_05/01.gif -------------------------------------------------------------------------------- /__assets__/animations/model_05/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_05/02.gif -------------------------------------------------------------------------------- /__assets__/animations/model_05/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_05/03.gif -------------------------------------------------------------------------------- /__assets__/animations/model_05/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_05/04.gif -------------------------------------------------------------------------------- /__assets__/animations/model_06/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_06/01.gif -------------------------------------------------------------------------------- /__assets__/animations/model_06/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_06/02.gif -------------------------------------------------------------------------------- /__assets__/animations/model_06/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_06/03.gif -------------------------------------------------------------------------------- /__assets__/animations/model_06/04.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_06/04.gif -------------------------------------------------------------------------------- /__assets__/animations/model_07/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_07/01.gif -------------------------------------------------------------------------------- /__assets__/animations/model_07/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_07/02.gif -------------------------------------------------------------------------------- /__assets__/animations/model_07/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_07/03.gif -------------------------------------------------------------------------------- /__assets__/animations/model_07/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_07/04.gif -------------------------------------------------------------------------------- /__assets__/animations/model_07/init.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_07/init.jpg -------------------------------------------------------------------------------- /__assets__/animations/model_08/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_08/01.gif -------------------------------------------------------------------------------- /__assets__/animations/model_08/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_08/02.gif -------------------------------------------------------------------------------- /__assets__/animations/model_08/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_08/03.gif -------------------------------------------------------------------------------- /__assets__/animations/model_08/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/animations/model_08/04.gif -------------------------------------------------------------------------------- /__assets__/figs/gradio.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/__assets__/figs/gradio.jpg -------------------------------------------------------------------------------- /animatediff/controlnet/controlnet_module.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Any 3 | 4 | import cv2 5 | import torch 6 | from PIL import Image 7 | from tqdm import tqdm 8 | from diffusers import StableDiffusionControlNetPipeline, ControlNetModel 9 | 10 | from .controlnet_processors import CONTROLNET_PROCESSORS 11 | 12 | 13 | def get_video_info(cap): 14 | width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 15 | height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 16 | fps = int(cap.get(cv2.CAP_PROP_FPS)) 17 | frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 18 | return height, width, fps, frame_count 19 | 20 | 21 | class ControlnetModule: 22 | def __init__(self, config): 23 | self.config = config 24 | self.video_length = self.config['video_length'] 25 | self.img_w = self.config['img_w'] 26 | self.img_h = self.config['img_h'] 27 | self.do_cfg = self.config['guidance_scale'] > 1.0 28 | self.num_inference_steps = config['steps'] 29 | self.guess_mode = config['guess_mode'] 30 | self.conditioning_scale = config['conditioning_scale'] 31 | self.device = config['device'] 32 | 33 | controlnet_info = CONTROLNET_PROCESSORS[self.config['controlnet_processor']] 34 | 35 | if ('controlnet_processor_path' not in config) or not len(config['controlnet_processor_path']): 36 | config['controlnet_processor_path'] = controlnet_info['controlnet'] 37 | 38 | controlnet = ControlNetModel.from_pretrained( 39 | controlnet_info['controlnet'], torch_dtype=torch.float16) 40 | 41 | if controlnet_info['is_custom']: 42 | self.processor = controlnet_info['processor']( 43 | **controlnet_info['processor_params']) 44 | else: 45 | self.processor = controlnet_info['processor'].from_pretrained( 46 | 'lllyasviel/Annotators') 47 | 48 | self.controlnet_pipe = StableDiffusionControlNetPipeline.from_pretrained( 49 | config['controlnet_pipeline'], #"runwayml/stable-diffusion-v1-5", 50 | controlnet=controlnet, 51 | torch_dtype=torch.float16 52 | ) 53 | 54 | del self.controlnet_pipe.vae 55 | del self.controlnet_pipe.unet 56 | del self.controlnet_pipe.feature_extractor 57 | 58 | self.controlnet_pipe.to(self.device) 59 | 60 | def process_video(self, video_path): 61 | cap = cv2.VideoCapture(video_path) 62 | orig_height, orig_width, fps, frames_count = get_video_info(cap) 63 | print('| --- START VIDEO PROCESSING --- |') 64 | print(f'| HxW: {orig_height}x{orig_width} | FPS: {fps} | FRAMES COUNT: {frames_count} |') 65 | 66 | get_each = self.config.get('get_each', 1) 67 | processed_images = [] 68 | 69 | for frame_index in tqdm(range(self.config['video_length'] * get_each)): 70 | ret, image = cap.read() 71 | if not ret or image is None: 72 | break 73 | 74 | if frame_index % get_each != 0: 75 | continue 76 | 77 | image = cv2.resize(image, (self.img_w, self.img_h)) 78 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 79 | 80 | condition_image = self.processor(Image.fromarray(image)) 81 | processed_images.append(condition_image) 82 | 83 | return processed_images 84 | 85 | def generate_control_blocks(self, processed_images, prompt, negative_prompt, seed): 86 | print('| --- EXTRACT CONTROLNET FEATURES --- |') 87 | 88 | shape = (1, 4, self.video_length, self.img_h // 8, self.img_w // 8) 89 | generator = 
torch.Generator(device=self.device).manual_seed(seed) 90 | control_latents = torch.randn( 91 | shape, 92 | generator=generator, 93 | device=self.device, 94 | dtype=torch.float16 95 | ) 96 | 97 | prompt_embeds = self.controlnet_pipe._encode_prompt( 98 | prompt, 99 | self.device, 100 | 1, 101 | self.do_cfg, 102 | negative_prompt, 103 | prompt_embeds=None, 104 | negative_prompt_embeds=None, 105 | lora_scale=None, 106 | ) 107 | 108 | self.controlnet_pipe.scheduler.set_timesteps(self.num_inference_steps, device=self.device) 109 | timesteps = self.controlnet_pipe.scheduler.timesteps 110 | 111 | control_blocks = [] 112 | for t in tqdm(timesteps): 113 | down_block_samples = [] 114 | mid_block_samples = [] 115 | 116 | for img_index, image in enumerate(processed_images): 117 | latents = control_latents[:, :, img_index, :, :] 118 | image = self.controlnet_pipe.control_image_processor.preprocess( 119 | image, 120 | height=self.img_h, 121 | width=self.img_w 122 | ).to(dtype=torch.float32) 123 | 124 | image = image.repeat_interleave(1, dim=0) 125 | image = image.to(device=self.device, dtype=torch.float16) 126 | 127 | if self.do_cfg and not self.guess_mode: 128 | image = torch.cat([image] * 2) 129 | 130 | latent_model_input = torch.cat([latents] * 2) if self.do_cfg else latents 131 | latent_model_input = self.controlnet_pipe.scheduler.scale_model_input(latent_model_input, t) 132 | 133 | if self.guess_mode and self.do_cfg: 134 | control_model_input = latents 135 | control_model_input = self.controlnet_pipe.scheduler.scale_model_input(control_model_input, t) 136 | controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] 137 | else: 138 | control_model_input = latent_model_input 139 | controlnet_prompt_embeds = prompt_embeds 140 | 141 | down_block_res_samples, mid_block_res_sample = self.controlnet_pipe.controlnet( 142 | control_model_input.to(self.device), 143 | t, 144 | encoder_hidden_states=controlnet_prompt_embeds.to(self.device), 145 | controlnet_cond=image, 146 | conditioning_scale=self.conditioning_scale, 147 | guess_mode=self.guess_mode, 148 | return_dict=False, 149 | ) 150 | 151 | if self.guess_mode and self.do_cfg: 152 | down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] 153 | mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) 154 | 155 | down_block_samples.append([x.detach().cpu() for x in down_block_res_samples]) 156 | mid_block_samples.append(mid_block_res_sample.detach().cpu()) 157 | 158 | control_blocks.append({ 159 | 'down_block_samples': down_block_samples, 160 | 'mid_block_samples': mid_block_samples, 161 | }) 162 | 163 | return control_blocks 164 | 165 | def resort_features(self, control_blocks): 166 | mid_blocks = [] 167 | down_blocks = [] 168 | 169 | for c_block in control_blocks: 170 | d_blocks = defaultdict(list) 171 | for image_weights in c_block['down_block_samples']: 172 | for b_index, block_weights in enumerate(image_weights): 173 | d_blocks[b_index] += block_weights.unsqueeze(0) 174 | 175 | down_block = [] 176 | for _, value in d_blocks.items(): 177 | down_block.append(torch.stack(value).permute(1, 2, 0, 3, 4)) 178 | 179 | mid_block = torch.stack(c_block['mid_block_samples']).permute(1, 2, 0, 3, 4) 180 | 181 | down_blocks.append(down_block) 182 | mid_blocks.append(mid_block) 183 | 184 | return down_blocks, mid_blocks 185 | 186 | def __call__(self, video_path, prompt, negative_prompt, generator): 187 | processed_images = self.process_video(video_path) 188 | control_blocks = 
self.generate_control_blocks( 189 | processed_images, prompt, negative_prompt, generator) 190 | down_features, mid_features = self.resort_features(control_blocks) 191 | return down_features, mid_features 192 | -------------------------------------------------------------------------------- /animatediff/controlnet/controlnet_processors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import cv2 3 | import numpy as np 4 | from PIL import Image 5 | from transformers import pipeline 6 | from controlnet_aux import HEDdetector, OpenposeDetector, NormalBaeDetector 7 | 8 | 9 | class CannyProcessor: 10 | def __init__(self, t1, t2, **kwargs): 11 | self.t1 = t1 12 | self.t2 = t2 13 | 14 | def __call__(self, input_image): 15 | image = np.array(input_image) 16 | image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) 17 | image = cv2.Canny(image, self.t1, self.t2) 18 | image = image[:, :, None] 19 | image = np.concatenate([image, image, image], axis=2) 20 | control_image = Image.fromarray(image) 21 | return control_image 22 | 23 | 24 | class DepthProcessor: 25 | def __init__(self, **kwargs): 26 | self.depth_estimator = pipeline('depth-estimation') 27 | 28 | def __call__(self, input_image): 29 | image = self.depth_estimator(input_image)['depth'] 30 | image = np.array(image) 31 | image = image[:, :, None] 32 | image = np.concatenate([image, image, image], axis=2) 33 | control_image = Image.fromarray(image) 34 | return control_image 35 | 36 | 37 | CONTROLNET_PROCESSORS = { 38 | 'canny': { 39 | 'controlnet': 'lllyasviel/control_v11p_sd15_canny', 40 | 'processor': CannyProcessor, 41 | 'processor_params': {'t1': 50, 't2': 150}, 42 | 'is_custom': True, 43 | }, 44 | 'depth': { 45 | 'controlnet': 'lllyasviel/control_v11f1p_sd15_depth', 46 | 'processor': DepthProcessor, 47 | 'processor_params': {}, 48 | 'is_custom': True, 49 | }, 50 | 'softedge': { 51 | 'controlnet': 'lllyasviel/control_v11p_sd15_softedge', 52 | 'processor': HEDdetector, # PidiNetDetector 53 | 'processor_params': {}, 54 | 'is_custom': False, 55 | }, 56 | 'pose': { 57 | 'controlnet': 'lllyasviel/sd-controlnet-openpose', 58 | 'processor': OpenposeDetector, 59 | 'processor_params': {}, 60 | 'is_custom': False, 61 | }, 62 | 'norm': { 63 | 'controlnet': 'lllyasviel/control_v11p_sd15_normalbae', 64 | 'processor': NormalBaeDetector, 65 | 'processor_params': {}, 66 | 'is_custom': False, 67 | }, 68 | } 69 | -------------------------------------------------------------------------------- /animatediff/data/dataset.py: -------------------------------------------------------------------------------- 1 | import os, io, csv, math, random 2 | import numpy as np 3 | from einops import rearrange 4 | from decord import VideoReader 5 | 6 | import torch 7 | import torchvision.transforms as transforms 8 | from torch.utils.data.dataset import Dataset 9 | from animatediff.utils.util import zero_rank_print 10 | 11 | 12 | 13 | class WebVid10M(Dataset): 14 | def __init__( 15 | self, 16 | csv_path, video_folder, 17 | sample_size=256, sample_stride=4, sample_n_frames=16, 18 | is_image=False, 19 | ): 20 | zero_rank_print(f"loading annotations from {csv_path} ...") 21 | with open(csv_path, 'r') as csvfile: 22 | self.dataset = list(csv.DictReader(csvfile)) 23 | self.length = len(self.dataset) 24 | zero_rank_print(f"data scale: {self.length}") 25 | 26 | self.video_folder = video_folder 27 | self.sample_stride = sample_stride 28 | self.sample_n_frames = sample_n_frames 29 | self.is_image = is_image 30 | 31 | sample_size 
= tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size) 32 | self.pixel_transforms = transforms.Compose([ 33 | transforms.RandomHorizontalFlip(), 34 | transforms.Resize(sample_size[0]), 35 | transforms.CenterCrop(sample_size), 36 | transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True), 37 | ]) 38 | 39 | def get_batch(self, idx): 40 | video_dict = self.dataset[idx] 41 | videoid, name, page_dir = video_dict['videoid'], video_dict['name'], video_dict['page_dir'] 42 | 43 | video_dir = os.path.join(self.video_folder, f"{videoid}.mp4") 44 | video_reader = VideoReader(video_dir) 45 | video_length = len(video_reader) 46 | 47 | if not self.is_image: 48 | clip_length = min(video_length, (self.sample_n_frames - 1) * self.sample_stride + 1) 49 | start_idx = random.randint(0, video_length - clip_length) 50 | batch_index = np.linspace(start_idx, start_idx + clip_length - 1, self.sample_n_frames, dtype=int) 51 | else: 52 | batch_index = [random.randint(0, video_length - 1)] 53 | 54 | pixel_values = torch.from_numpy(video_reader.get_batch(batch_index).asnumpy()).permute(0, 3, 1, 2).contiguous() 55 | pixel_values = pixel_values / 255. 56 | del video_reader 57 | 58 | if self.is_image: 59 | pixel_values = pixel_values[0] 60 | 61 | return pixel_values, name 62 | 63 | def __len__(self): 64 | return self.length 65 | 66 | def __getitem__(self, idx): 67 | while True: 68 | try: 69 | pixel_values, name = self.get_batch(idx) 70 | break 71 | 72 | except Exception as e: 73 | idx = random.randint(0, self.length-1) 74 | 75 | pixel_values = self.pixel_transforms(pixel_values) 76 | sample = dict(pixel_values=pixel_values, text=name) 77 | return sample 78 | 79 | 80 | 81 | if __name__ == "__main__": 82 | from animatediff.utils.util import save_videos_grid 83 | 84 | dataset = WebVid10M( 85 | csv_path="/mnt/petrelfs/guoyuwei/projects/datasets/webvid/results_2M_val.csv", 86 | video_folder="/mnt/petrelfs/guoyuwei/projects/datasets/webvid/2M_val", 87 | sample_size=256, 88 | sample_stride=4, sample_n_frames=16, 89 | is_image=True, 90 | ) 91 | import pdb 92 | pdb.set_trace() 93 | 94 | dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, num_workers=16,) 95 | for idx, batch in enumerate(dataloader): 96 | print(batch["pixel_values"].shape, len(batch["text"])) 97 | # for i in range(batch["pixel_values"].shape[0]): 98 | # save_videos_grid(batch["pixel_values"][i:i+1].permute(0,2,1,3,4), os.path.join(".", f"{idx}-{i}.mp4"), rescale=True) 99 | -------------------------------------------------------------------------------- /animatediff/models/attention.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py 2 | 3 | from dataclasses import dataclass 4 | from typing import Optional 5 | 6 | import torch 7 | from torch import nn 8 | 9 | from diffusers.configuration_utils import ConfigMixin, register_to_config 10 | from diffusers.models.modeling_utils import ModelMixin 11 | from diffusers.utils import BaseOutput 12 | from diffusers.utils.import_utils import is_xformers_available 13 | from diffusers.models.attention import FeedForward, AdaLayerNorm 14 | from diffusers.models.attention_processor import Attention 15 | 16 | from einops import rearrange, repeat 17 | 18 | 19 | @dataclass 20 | class Transformer3DModelOutput(BaseOutput): 21 | sample: torch.FloatTensor 22 | 23 | 24 | if is_xformers_available(): 25 | import xformers 26 | import 
xformers.ops 27 | else: 28 | xformers = None 29 | 30 | 31 | class Transformer3DModel(ModelMixin, ConfigMixin): 32 | @register_to_config 33 | def __init__( 34 | self, 35 | num_attention_heads: int = 16, 36 | attention_head_dim: int = 88, 37 | in_channels: Optional[int] = None, 38 | num_layers: int = 1, 39 | dropout: float = 0.0, 40 | norm_num_groups: int = 32, 41 | cross_attention_dim: Optional[int] = None, 42 | attention_bias: bool = False, 43 | activation_fn: str = "geglu", 44 | num_embeds_ada_norm: Optional[int] = None, 45 | use_linear_projection: bool = False, 46 | only_cross_attention: bool = False, 47 | upcast_attention: bool = False, 48 | 49 | unet_use_cross_frame_attention=None, 50 | unet_use_temporal_attention=None, 51 | ): 52 | super().__init__() 53 | self.use_linear_projection = use_linear_projection 54 | self.num_attention_heads = num_attention_heads 55 | self.attention_head_dim = attention_head_dim 56 | inner_dim = num_attention_heads * attention_head_dim 57 | 58 | # Define input layers 59 | self.in_channels = in_channels 60 | 61 | self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True) 62 | if use_linear_projection: 63 | self.proj_in = nn.Linear(in_channels, inner_dim) 64 | else: 65 | self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0) 66 | 67 | # Define transformers blocks 68 | self.transformer_blocks = nn.ModuleList( 69 | [ 70 | BasicTransformerBlock( 71 | inner_dim, 72 | num_attention_heads, 73 | attention_head_dim, 74 | dropout=dropout, 75 | cross_attention_dim=cross_attention_dim, 76 | activation_fn=activation_fn, 77 | num_embeds_ada_norm=num_embeds_ada_norm, 78 | attention_bias=attention_bias, 79 | only_cross_attention=only_cross_attention, 80 | upcast_attention=upcast_attention, 81 | 82 | unet_use_cross_frame_attention=unet_use_cross_frame_attention, 83 | unet_use_temporal_attention=unet_use_temporal_attention, 84 | ) 85 | for d in range(num_layers) 86 | ] 87 | ) 88 | 89 | # 4. Define output layers 90 | if use_linear_projection: 91 | self.proj_out = nn.Linear(in_channels, inner_dim) 92 | else: 93 | self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0) 94 | 95 | def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True): 96 | # Input 97 | assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}." 
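        # The video latent arrives as (batch, channels, frames, height, width). The spatial
        # transformer below treats each frame as an independent image: frames are folded into
        # the batch axis, and the text embeddings are repeated once per frame so the pairing is
        # preserved. As a purely illustrative example, a (2, 320, 16, 64, 64) latent becomes
        # (32, 320, 64, 64) after the rearrange, and a (2, 77, 768) prompt embedding becomes
        # (32, 77, 768) after the repeat.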
98 | video_length = hidden_states.shape[2] 99 | hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w") 100 | encoder_hidden_states = repeat(encoder_hidden_states, 'b n c -> (b f) n c', f=video_length) 101 | 102 | batch, channel, height, weight = hidden_states.shape 103 | residual = hidden_states 104 | 105 | hidden_states = self.norm(hidden_states) 106 | if not self.use_linear_projection: 107 | hidden_states = self.proj_in(hidden_states) 108 | inner_dim = hidden_states.shape[1] 109 | hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim) 110 | else: 111 | inner_dim = hidden_states.shape[1] 112 | hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim) 113 | hidden_states = self.proj_in(hidden_states) 114 | 115 | # Blocks 116 | for block in self.transformer_blocks: 117 | hidden_states = block( 118 | hidden_states, 119 | encoder_hidden_states=encoder_hidden_states, 120 | timestep=timestep, 121 | video_length=video_length 122 | ) 123 | 124 | # Output 125 | if not self.use_linear_projection: 126 | hidden_states = ( 127 | hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous() 128 | ) 129 | hidden_states = self.proj_out(hidden_states) 130 | else: 131 | hidden_states = self.proj_out(hidden_states) 132 | hidden_states = ( 133 | hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous() 134 | ) 135 | 136 | output = hidden_states + residual 137 | 138 | output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length) 139 | if not return_dict: 140 | return (output,) 141 | 142 | return Transformer3DModelOutput(sample=output) 143 | 144 | 145 | class BasicTransformerBlock(nn.Module): 146 | def __init__( 147 | self, 148 | dim: int, 149 | num_attention_heads: int, 150 | attention_head_dim: int, 151 | dropout=0.0, 152 | cross_attention_dim: Optional[int] = None, 153 | activation_fn: str = "geglu", 154 | num_embeds_ada_norm: Optional[int] = None, 155 | attention_bias: bool = False, 156 | only_cross_attention: bool = False, 157 | upcast_attention: bool = False, 158 | 159 | unet_use_cross_frame_attention = None, 160 | unet_use_temporal_attention = None, 161 | ): 162 | super().__init__() 163 | self.only_cross_attention = only_cross_attention 164 | self.use_ada_layer_norm = num_embeds_ada_norm is not None 165 | self.unet_use_cross_frame_attention = unet_use_cross_frame_attention 166 | self.unet_use_temporal_attention = unet_use_temporal_attention 167 | 168 | self.attn1 = Attention( 169 | query_dim=dim, 170 | heads=num_attention_heads, 171 | dim_head=attention_head_dim, 172 | dropout=dropout, 173 | bias=attention_bias, 174 | upcast_attention=upcast_attention, 175 | ) 176 | self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim) 177 | 178 | # Cross-Attn 179 | if cross_attention_dim is not None: 180 | self.attn2 = Attention( 181 | query_dim=dim, 182 | cross_attention_dim=cross_attention_dim, 183 | heads=num_attention_heads, 184 | dim_head=attention_head_dim, 185 | dropout=dropout, 186 | bias=attention_bias, 187 | upcast_attention=upcast_attention, 188 | ) 189 | else: 190 | self.attn2 = None 191 | 192 | if cross_attention_dim is not None: 193 | self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim) 194 | else: 195 | self.norm2 = None 196 | 197 | # Feed-forward 198 | self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn) 199 | self.norm3 = 
nn.LayerNorm(dim) 200 | 201 | # Temp-Attn 202 | assert unet_use_temporal_attention is not None 203 | if unet_use_temporal_attention: 204 | self.attn_temp = Attention( 205 | query_dim=dim, 206 | heads=num_attention_heads, 207 | dim_head=attention_head_dim, 208 | dropout=dropout, 209 | bias=attention_bias, 210 | upcast_attention=upcast_attention, 211 | ) 212 | nn.init.zeros_(self.attn_temp.to_out[0].weight.data) 213 | self.norm_temp = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim) 214 | 215 | def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, attention_mask=None, video_length=None): 216 | # SparseCausal-Attention 217 | norm_hidden_states = ( 218 | self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states) 219 | ) 220 | if self.unet_use_cross_frame_attention: 221 | hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask, video_length=video_length) + hidden_states 222 | else: 223 | hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask) + hidden_states 224 | 225 | if self.attn2 is not None: 226 | # Cross-Attention 227 | norm_hidden_states = ( 228 | self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) 229 | ) 230 | hidden_states = ( 231 | self.attn2( 232 | norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask 233 | ) 234 | + hidden_states 235 | ) 236 | 237 | # Feed-forward 238 | hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states 239 | 240 | # Temporal-Attention 241 | if self.unet_use_temporal_attention: 242 | d = hidden_states.shape[1] 243 | hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length) 244 | norm_hidden_states = ( 245 | self.norm_temp(hidden_states, timestep) if self.use_ada_layer_norm else self.norm_temp(hidden_states) 246 | ) 247 | hidden_states = self.attn_temp(norm_hidden_states) + hidden_states 248 | hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d) 249 | 250 | return hidden_states 251 | -------------------------------------------------------------------------------- /animatediff/models/motion_module.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional, Callable 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | from torch import nn 7 | 8 | 9 | from diffusers.utils import BaseOutput 10 | from diffusers.utils.import_utils import is_xformers_available 11 | from diffusers.models.attention import FeedForward 12 | from diffusers.models.attention_processor import Attention, XFormersAttnProcessor, AttnProcessor 13 | 14 | from einops import rearrange, repeat 15 | import math 16 | 17 | 18 | def zero_module(module): 19 | # Zero out the parameters of a module and return it. 
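    # Zero-initializing the motion module's output projection (see `zero_initialize` in
    # VanillaTemporalModule below) makes the temporal branch start as an identity mapping:
    # its residual contribution is exactly zero, so a freshly inserted motion module leaves the
    # pretrained image model's behavior untouched until training updates it. This is the same
    # trick ControlNet uses with its zero convolutions.
    # For example, zero_module(nn.Linear(320, 320)) returns the same Linear instance with its
    # weight and bias zeroed in place.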
20 | for p in module.parameters(): 21 | p.detach().zero_() 22 | return module 23 | 24 | 25 | @dataclass 26 | class TemporalTransformer3DModelOutput(BaseOutput): 27 | sample: torch.FloatTensor 28 | 29 | 30 | if is_xformers_available(): 31 | import xformers 32 | import xformers.ops 33 | else: 34 | xformers = None 35 | 36 | 37 | def get_motion_module( 38 | in_channels, 39 | motion_module_type: str, 40 | motion_module_kwargs: dict 41 | ): 42 | if motion_module_type == "Vanilla": 43 | return VanillaTemporalModule(in_channels=in_channels, **motion_module_kwargs,) 44 | else: 45 | raise ValueError 46 | 47 | 48 | class VanillaTemporalModule(nn.Module): 49 | def __init__( 50 | self, 51 | in_channels, 52 | num_attention_heads = 8, 53 | num_transformer_block = 2, 54 | attention_block_types =( "Temporal_Self", "Temporal_Self" ), 55 | cross_frame_attention_mode = None, 56 | temporal_position_encoding = False, 57 | temporal_position_encoding_max_len = 24, 58 | temporal_attention_dim_div = 1, 59 | zero_initialize = True, 60 | ): 61 | super().__init__() 62 | 63 | self.temporal_transformer = TemporalTransformer3DModel( 64 | in_channels=in_channels, 65 | num_attention_heads=num_attention_heads, 66 | attention_head_dim=in_channels // num_attention_heads // temporal_attention_dim_div, 67 | num_layers=num_transformer_block, 68 | attention_block_types=attention_block_types, 69 | cross_frame_attention_mode=cross_frame_attention_mode, 70 | temporal_position_encoding=temporal_position_encoding, 71 | temporal_position_encoding_max_len=temporal_position_encoding_max_len, 72 | ) 73 | 74 | if zero_initialize: 75 | self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out) 76 | 77 | def forward(self, input_tensor, temb, encoder_hidden_states, attention_mask=None, anchor_frame_idx=None): 78 | hidden_states = input_tensor 79 | hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask) 80 | 81 | output = hidden_states 82 | return output 83 | 84 | 85 | class TemporalTransformer3DModel(nn.Module): 86 | def __init__( 87 | self, 88 | in_channels, 89 | num_attention_heads, 90 | attention_head_dim, 91 | 92 | num_layers, 93 | attention_block_types = ( "Temporal_Self", "Temporal_Self", ), 94 | dropout = 0.0, 95 | norm_num_groups = 32, 96 | cross_attention_dim = 768, 97 | activation_fn = "geglu", 98 | attention_bias = False, 99 | upcast_attention = False, 100 | 101 | cross_frame_attention_mode = None, 102 | temporal_position_encoding = False, 103 | temporal_position_encoding_max_len = 24, 104 | ): 105 | super().__init__() 106 | 107 | inner_dim = num_attention_heads * attention_head_dim 108 | 109 | self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True) 110 | self.proj_in = nn.Linear(in_channels, inner_dim) 111 | 112 | self.transformer_blocks = nn.ModuleList( 113 | [ 114 | TemporalTransformerBlock( 115 | dim=inner_dim, 116 | num_attention_heads=num_attention_heads, 117 | attention_head_dim=attention_head_dim, 118 | attention_block_types=attention_block_types, 119 | dropout=dropout, 120 | norm_num_groups=norm_num_groups, 121 | cross_attention_dim=cross_attention_dim, 122 | activation_fn=activation_fn, 123 | attention_bias=attention_bias, 124 | upcast_attention=upcast_attention, 125 | cross_frame_attention_mode=cross_frame_attention_mode, 126 | temporal_position_encoding=temporal_position_encoding, 127 | temporal_position_encoding_max_len=temporal_position_encoding_max_len, 128 | ) 129 | for d in range(num_layers) 130 | ] 
131 | ) 132 | self.proj_out = nn.Linear(inner_dim, in_channels) 133 | 134 | def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None): 135 | assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}." 136 | video_length = hidden_states.shape[2] 137 | hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w") 138 | 139 | batch, channel, height, weight = hidden_states.shape 140 | residual = hidden_states 141 | 142 | hidden_states = self.norm(hidden_states) 143 | inner_dim = hidden_states.shape[1] 144 | hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim) 145 | hidden_states = self.proj_in(hidden_states) 146 | 147 | # Transformer Blocks 148 | for block in self.transformer_blocks: 149 | hidden_states = block(hidden_states, encoder_hidden_states=encoder_hidden_states, video_length=video_length) 150 | 151 | # output 152 | hidden_states = self.proj_out(hidden_states) 153 | hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous() 154 | 155 | output = hidden_states + residual 156 | output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length) 157 | 158 | return output 159 | 160 | 161 | class TemporalTransformerBlock(nn.Module): 162 | def __init__( 163 | self, 164 | dim, 165 | num_attention_heads, 166 | attention_head_dim, 167 | attention_block_types = ( "Temporal_Self", "Temporal_Self", ), 168 | dropout = 0.0, 169 | norm_num_groups = 32, 170 | cross_attention_dim = 768, 171 | activation_fn = "geglu", 172 | attention_bias = False, 173 | upcast_attention = False, 174 | cross_frame_attention_mode = None, 175 | temporal_position_encoding = False, 176 | temporal_position_encoding_max_len = 24, 177 | ): 178 | super().__init__() 179 | 180 | attention_blocks = [] 181 | norms = [] 182 | 183 | for block_name in attention_block_types: 184 | attention_blocks.append( 185 | VersatileAttention( 186 | attention_mode=block_name.split("_")[0], 187 | cross_attention_dim=cross_attention_dim if block_name.endswith("_Cross") else None, 188 | 189 | query_dim=dim, 190 | heads=num_attention_heads, 191 | dim_head=attention_head_dim, 192 | dropout=dropout, 193 | bias=attention_bias, 194 | upcast_attention=upcast_attention, 195 | 196 | cross_frame_attention_mode=cross_frame_attention_mode, 197 | temporal_position_encoding=temporal_position_encoding, 198 | temporal_position_encoding_max_len=temporal_position_encoding_max_len, 199 | ) 200 | ) 201 | norms.append(nn.LayerNorm(dim)) 202 | 203 | self.attention_blocks = nn.ModuleList(attention_blocks) 204 | self.norms = nn.ModuleList(norms) 205 | 206 | self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn) 207 | self.ff_norm = nn.LayerNorm(dim) 208 | 209 | 210 | def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None): 211 | for attention_block, norm in zip(self.attention_blocks, self.norms): 212 | norm_hidden_states = norm(hidden_states) 213 | hidden_states = attention_block( 214 | norm_hidden_states, 215 | encoder_hidden_states=encoder_hidden_states if attention_block.is_cross_attention else None, 216 | video_length=video_length, 217 | ) + hidden_states 218 | 219 | hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states 220 | 221 | output = hidden_states 222 | return output 223 | 224 | 225 | class PositionalEncoding(nn.Module): 226 | def __init__( 227 | self, 228 | d_model, 229 | dropout = 0., 230 | max_len = 24 231 | ): 232 | 
super().__init__() 233 | self.dropout = nn.Dropout(p=dropout) 234 | position = torch.arange(max_len).unsqueeze(1) 235 | div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)) 236 | pe = torch.zeros(1, max_len, d_model) 237 | pe[0, :, 0::2] = torch.sin(position * div_term) 238 | pe[0, :, 1::2] = torch.cos(position * div_term) 239 | self.register_buffer('pe', pe) 240 | 241 | def forward(self, x): 242 | x = x + self.pe[:, :x.size(1)] 243 | return self.dropout(x) 244 | 245 | 246 | class VersatileAttention(Attention): 247 | def __init__( 248 | self, 249 | attention_mode = None, 250 | cross_frame_attention_mode = None, 251 | temporal_position_encoding = False, 252 | temporal_position_encoding_max_len = 24, 253 | *args, **kwargs 254 | ): 255 | super().__init__(*args, **kwargs) 256 | assert attention_mode == "Temporal" 257 | 258 | self.attention_mode = attention_mode 259 | self.is_cross_attention = kwargs["cross_attention_dim"] is not None 260 | 261 | self.pos_encoder = PositionalEncoding( 262 | kwargs["query_dim"], 263 | dropout=0., 264 | max_len=temporal_position_encoding_max_len 265 | ) if (temporal_position_encoding and attention_mode == "Temporal") else None 266 | 267 | def extra_repr(self): 268 | return f"(Module Info) Attention_Mode: {self.attention_mode}, Is_Cross_Attention: {self.is_cross_attention}" 269 | 270 | def set_use_memory_efficient_attention_xformers( 271 | self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None 272 | ): 273 | if use_memory_efficient_attention_xformers: 274 | if not is_xformers_available(): 275 | raise ModuleNotFoundError( 276 | ( 277 | "Refer to https://github.com/facebookresearch/xformers for more information on how to install" 278 | " xformers" 279 | ), 280 | name="xformers", 281 | ) 282 | elif not torch.cuda.is_available(): 283 | raise ValueError( 284 | "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is" 285 | " only available for GPU " 286 | ) 287 | else: 288 | try: 289 | # Make sure we can run the memory efficient attention 290 | _ = xformers.ops.memory_efficient_attention( 291 | torch.randn((1, 2, 40), device="cuda"), 292 | torch.randn((1, 2, 40), device="cuda"), 293 | torch.randn((1, 2, 40), device="cuda"), 294 | ) 295 | except Exception as e: 296 | raise e 297 | 298 | # XFormersAttnProcessor corrupts video generation and work with Pytorch 1.13. 299 | # Pytorch 2.0.1 AttnProcessor works the same as XFormersAttnProcessor in Pytorch 1.13. 300 | # You don't need XFormersAttnProcessor here. 
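                # Note: the availability and CUDA checks above still run when xformers is
                # requested, so a broken xformers install fails loudly here even though the
                # plain AttnProcessor is used in both branches below.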
301 | # processor = XFormersAttnProcessor( 302 | # attention_op=attention_op, 303 | # ) 304 | processor = AttnProcessor() 305 | else: 306 | processor = AttnProcessor() 307 | 308 | self.set_processor(processor) 309 | 310 | def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None, **cross_attention_kwargs): 311 | if self.attention_mode == "Temporal": 312 | d = hidden_states.shape[1] 313 | hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length) 314 | 315 | if self.pos_encoder is not None: 316 | hidden_states = self.pos_encoder(hidden_states) 317 | 318 | encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b d) n c", d=d) if encoder_hidden_states is not None else encoder_hidden_states 319 | else: 320 | raise NotImplementedError 321 | 322 | hidden_states = self.processor( 323 | self, 324 | hidden_states, 325 | encoder_hidden_states=encoder_hidden_states, 326 | attention_mask=attention_mask, 327 | **cross_attention_kwargs, 328 | ) 329 | 330 | if self.attention_mode == "Temporal": 331 | hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d) 332 | 333 | return hidden_states 334 | 335 | -------------------------------------------------------------------------------- /animatediff/models/resnet.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from einops import rearrange 8 | 9 | 10 | class InflatedConv3d(nn.Conv2d): 11 | def forward(self, x): 12 | video_length = x.shape[2] 13 | 14 | x = rearrange(x, "b c f h w -> (b f) c h w") 15 | x = super().forward(x) 16 | x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length) 17 | 18 | return x 19 | 20 | 21 | class InflatedGroupNorm(nn.GroupNorm): 22 | def forward(self, x): 23 | video_length = x.shape[2] 24 | 25 | x = rearrange(x, "b c f h w -> (b f) c h w") 26 | x = super().forward(x) 27 | x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length) 28 | 29 | return x 30 | 31 | 32 | class Upsample3D(nn.Module): 33 | def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"): 34 | super().__init__() 35 | self.channels = channels 36 | self.out_channels = out_channels or channels 37 | self.use_conv = use_conv 38 | self.use_conv_transpose = use_conv_transpose 39 | self.name = name 40 | 41 | conv = None 42 | if use_conv_transpose: 43 | raise NotImplementedError 44 | elif use_conv: 45 | self.conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1) 46 | 47 | def forward(self, hidden_states, output_size=None): 48 | assert hidden_states.shape[1] == self.channels 49 | 50 | if self.use_conv_transpose: 51 | raise NotImplementedError 52 | 53 | # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 54 | dtype = hidden_states.dtype 55 | if dtype == torch.bfloat16: 56 | hidden_states = hidden_states.to(torch.float32) 57 | 58 | # upsample_nearest_nhwc fails with large batch sizes. 
see https://github.com/huggingface/diffusers/issues/984 59 | if hidden_states.shape[0] >= 64: 60 | hidden_states = hidden_states.contiguous() 61 | 62 | # if `output_size` is passed we force the interpolation output 63 | # size and do not make use of `scale_factor=2` 64 | if output_size is None: 65 | hidden_states = F.interpolate(hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest") 66 | else: 67 | hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest") 68 | 69 | # If the input is bfloat16, we cast back to bfloat16 70 | if dtype == torch.bfloat16: 71 | hidden_states = hidden_states.to(dtype) 72 | 73 | # if self.use_conv: 74 | # if self.name == "conv": 75 | # hidden_states = self.conv(hidden_states) 76 | # else: 77 | # hidden_states = self.Conv2d_0(hidden_states) 78 | hidden_states = self.conv(hidden_states) 79 | 80 | return hidden_states 81 | 82 | 83 | class Downsample3D(nn.Module): 84 | def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"): 85 | super().__init__() 86 | self.channels = channels 87 | self.out_channels = out_channels or channels 88 | self.use_conv = use_conv 89 | self.padding = padding 90 | stride = 2 91 | self.name = name 92 | 93 | if use_conv: 94 | self.conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding) 95 | else: 96 | raise NotImplementedError 97 | 98 | def forward(self, hidden_states): 99 | assert hidden_states.shape[1] == self.channels 100 | if self.use_conv and self.padding == 0: 101 | raise NotImplementedError 102 | 103 | assert hidden_states.shape[1] == self.channels 104 | hidden_states = self.conv(hidden_states) 105 | 106 | return hidden_states 107 | 108 | 109 | class ResnetBlock3D(nn.Module): 110 | def __init__( 111 | self, 112 | *, 113 | in_channels, 114 | out_channels=None, 115 | conv_shortcut=False, 116 | dropout=0.0, 117 | temb_channels=512, 118 | groups=32, 119 | groups_out=None, 120 | pre_norm=True, 121 | eps=1e-6, 122 | non_linearity="swish", 123 | time_embedding_norm="default", 124 | output_scale_factor=1.0, 125 | use_in_shortcut=None, 126 | use_inflated_groupnorm=None, 127 | ): 128 | super().__init__() 129 | self.pre_norm = pre_norm 130 | self.pre_norm = True 131 | self.in_channels = in_channels 132 | out_channels = in_channels if out_channels is None else out_channels 133 | self.out_channels = out_channels 134 | self.use_conv_shortcut = conv_shortcut 135 | self.time_embedding_norm = time_embedding_norm 136 | self.output_scale_factor = output_scale_factor 137 | 138 | if groups_out is None: 139 | groups_out = groups 140 | 141 | assert use_inflated_groupnorm != None 142 | if use_inflated_groupnorm: 143 | self.norm1 = InflatedGroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True) 144 | else: 145 | self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True) 146 | 147 | self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) 148 | 149 | if temb_channels is not None: 150 | if self.time_embedding_norm == "default": 151 | time_emb_proj_out_channels = out_channels 152 | elif self.time_embedding_norm == "scale_shift": 153 | time_emb_proj_out_channels = out_channels * 2 154 | else: 155 | raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") 156 | 157 | self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels) 158 | else: 159 | self.time_emb_proj = None 160 | 161 | if use_inflated_groupnorm: 162 | self.norm2 = 
InflatedGroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True) 163 | else: 164 | self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True) 165 | 166 | self.dropout = torch.nn.Dropout(dropout) 167 | self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) 168 | 169 | if non_linearity == "swish": 170 | self.nonlinearity = lambda x: F.silu(x) 171 | elif non_linearity == "mish": 172 | self.nonlinearity = Mish() 173 | elif non_linearity == "silu": 174 | self.nonlinearity = nn.SiLU() 175 | 176 | self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut 177 | 178 | self.conv_shortcut = None 179 | if self.use_in_shortcut: 180 | self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) 181 | 182 | def forward(self, input_tensor, temb): 183 | hidden_states = input_tensor 184 | 185 | hidden_states = self.norm1(hidden_states) 186 | hidden_states = self.nonlinearity(hidden_states) 187 | 188 | hidden_states = self.conv1(hidden_states) 189 | 190 | if temb is not None: 191 | temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None] 192 | 193 | if temb is not None and self.time_embedding_norm == "default": 194 | hidden_states = hidden_states + temb 195 | 196 | hidden_states = self.norm2(hidden_states) 197 | 198 | if temb is not None and self.time_embedding_norm == "scale_shift": 199 | scale, shift = torch.chunk(temb, 2, dim=1) 200 | hidden_states = hidden_states * (1 + scale) + shift 201 | 202 | hidden_states = self.nonlinearity(hidden_states) 203 | 204 | hidden_states = self.dropout(hidden_states) 205 | hidden_states = self.conv2(hidden_states) 206 | 207 | if self.conv_shortcut is not None: 208 | input_tensor = self.conv_shortcut(input_tensor) 209 | 210 | output_tensor = (input_tensor + hidden_states) / self.output_scale_factor 211 | 212 | return output_tensor 213 | 214 | 215 | class Mish(torch.nn.Module): 216 | def forward(self, hidden_states): 217 | return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states)) -------------------------------------------------------------------------------- /animatediff/pipelines/pipeline_animation.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/showlab/Tune-A-Video/blob/main/tuneavideo/pipelines/pipeline_tuneavideo.py 2 | 3 | import inspect 4 | from typing import Callable, List, Optional, Union 5 | from dataclasses import dataclass 6 | 7 | import numpy as np 8 | import torch 9 | from tqdm import tqdm 10 | 11 | from diffusers.utils import is_accelerate_available 12 | from packaging import version 13 | from transformers import CLIPTextModel, CLIPTokenizer 14 | 15 | from diffusers.configuration_utils import FrozenDict 16 | from diffusers.models import AutoencoderKL 17 | from diffusers.pipeline_utils import DiffusionPipeline 18 | from diffusers.schedulers import ( 19 | DDIMScheduler, 20 | DPMSolverMultistepScheduler, 21 | EulerAncestralDiscreteScheduler, 22 | EulerDiscreteScheduler, 23 | LMSDiscreteScheduler, 24 | PNDMScheduler, 25 | ) 26 | from diffusers.utils import deprecate, logging, BaseOutput 27 | 28 | from einops import rearrange 29 | 30 | from ..models.unet import UNet3DConditionModel 31 | 32 | 33 | logger = logging.get_logger(__name__) # pylint: disable=invalid-name 34 | 35 | 36 | @dataclass 37 | class AnimationPipelineOutput(BaseOutput): 38 | videos: 
Union[torch.Tensor, np.ndarray] 39 | 40 | 41 | class AnimationPipeline(DiffusionPipeline): 42 | _optional_components = [] 43 | 44 | def __init__( 45 | self, 46 | vae: AutoencoderKL, 47 | text_encoder: CLIPTextModel, 48 | tokenizer: CLIPTokenizer, 49 | unet: UNet3DConditionModel, 50 | scheduler: Union[ 51 | DDIMScheduler, 52 | PNDMScheduler, 53 | LMSDiscreteScheduler, 54 | EulerDiscreteScheduler, 55 | EulerAncestralDiscreteScheduler, 56 | DPMSolverMultistepScheduler, 57 | ], 58 | ): 59 | super().__init__() 60 | 61 | if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: 62 | deprecation_message = ( 63 | f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" 64 | f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " 65 | "to update the config accordingly as leaving `steps_offset` might led to incorrect results" 66 | " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," 67 | " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" 68 | " file" 69 | ) 70 | deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) 71 | new_config = dict(scheduler.config) 72 | new_config["steps_offset"] = 1 73 | scheduler._internal_dict = FrozenDict(new_config) 74 | 75 | if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: 76 | deprecation_message = ( 77 | f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." 78 | " `clip_sample` should be set to False in the configuration file. Please make sure to update the" 79 | " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" 80 | " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" 81 | " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" 82 | ) 83 | deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) 84 | new_config = dict(scheduler.config) 85 | new_config["clip_sample"] = False 86 | scheduler._internal_dict = FrozenDict(new_config) 87 | 88 | is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( 89 | version.parse(unet.config._diffusers_version).base_version 90 | ) < version.parse("0.9.0.dev0") 91 | is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 92 | if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: 93 | deprecation_message = ( 94 | "The configuration file of the unet has set the default `sample_size` to smaller than" 95 | " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" 96 | " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" 97 | " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" 98 | " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" 99 | " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" 100 | " in the config might lead to incorrect results in future versions. 
If you have downloaded this" 101 | " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" 102 | " the `unet/config.json` file" 103 | ) 104 | deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) 105 | new_config = dict(unet.config) 106 | new_config["sample_size"] = 64 107 | unet._internal_dict = FrozenDict(new_config) 108 | 109 | self.register_modules( 110 | vae=vae, 111 | text_encoder=text_encoder, 112 | tokenizer=tokenizer, 113 | unet=unet, 114 | scheduler=scheduler, 115 | ) 116 | self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) 117 | 118 | def enable_vae_slicing(self): 119 | self.vae.enable_slicing() 120 | 121 | def disable_vae_slicing(self): 122 | self.vae.disable_slicing() 123 | 124 | def enable_sequential_cpu_offload(self, gpu_id=0): 125 | if is_accelerate_available(): 126 | from accelerate import cpu_offload 127 | else: 128 | raise ImportError("Please install accelerate via `pip install accelerate`") 129 | 130 | device = torch.device(f"cuda:{gpu_id}") 131 | 132 | for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: 133 | if cpu_offloaded_model is not None: 134 | cpu_offload(cpu_offloaded_model, device) 135 | 136 | 137 | @property 138 | def _execution_device(self): 139 | if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"): 140 | return self.device 141 | for module in self.unet.modules(): 142 | if ( 143 | hasattr(module, "_hf_hook") 144 | and hasattr(module._hf_hook, "execution_device") 145 | and module._hf_hook.execution_device is not None 146 | ): 147 | return torch.device(module._hf_hook.execution_device) 148 | return self.device 149 | 150 | def _encode_prompt(self, prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt): 151 | batch_size = len(prompt) if isinstance(prompt, list) else 1 152 | 153 | text_inputs = self.tokenizer( 154 | prompt, 155 | padding="max_length", 156 | max_length=self.tokenizer.model_max_length, 157 | truncation=True, 158 | return_tensors="pt", 159 | ) 160 | text_input_ids = text_inputs.input_ids 161 | untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids 162 | 163 | if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): 164 | removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) 165 | logger.warning( 166 | "The following part of your input was truncated because CLIP can only handle sequences up to" 167 | f" {self.tokenizer.model_max_length} tokens: {removed_text}" 168 | ) 169 | 170 | if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: 171 | attention_mask = text_inputs.attention_mask.to(device) 172 | else: 173 | attention_mask = None 174 | 175 | text_embeddings = self.text_encoder( 176 | text_input_ids.to(device), 177 | attention_mask=attention_mask, 178 | ) 179 | text_embeddings = text_embeddings[0] 180 | 181 | # duplicate text embeddings for each generation per prompt, using mps friendly method 182 | bs_embed, seq_len, _ = text_embeddings.shape 183 | text_embeddings = text_embeddings.repeat(1, num_videos_per_prompt, 1) 184 | text_embeddings = text_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1) 185 | 186 | # get unconditional embeddings for classifier free guidance 187 | if do_classifier_free_guidance: 188 | uncond_tokens: List[str] 189 | if negative_prompt is None: 190 | 
uncond_tokens = [""] * batch_size 191 | elif type(prompt) is not type(negative_prompt): 192 | raise TypeError( 193 | f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" 194 | f" {type(prompt)}." 195 | ) 196 | elif isinstance(negative_prompt, str): 197 | uncond_tokens = [negative_prompt] 198 | elif batch_size != len(negative_prompt): 199 | raise ValueError( 200 | f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" 201 | f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" 202 | " the batch size of `prompt`." 203 | ) 204 | else: 205 | uncond_tokens = negative_prompt 206 | 207 | max_length = text_input_ids.shape[-1] 208 | uncond_input = self.tokenizer( 209 | uncond_tokens, 210 | padding="max_length", 211 | max_length=max_length, 212 | truncation=True, 213 | return_tensors="pt", 214 | ) 215 | 216 | if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: 217 | attention_mask = uncond_input.attention_mask.to(device) 218 | else: 219 | attention_mask = None 220 | 221 | uncond_embeddings = self.text_encoder( 222 | uncond_input.input_ids.to(device), 223 | attention_mask=attention_mask, 224 | ) 225 | uncond_embeddings = uncond_embeddings[0] 226 | 227 | # duplicate unconditional embeddings for each generation per prompt, using mps friendly method 228 | seq_len = uncond_embeddings.shape[1] 229 | uncond_embeddings = uncond_embeddings.repeat(1, num_videos_per_prompt, 1) 230 | uncond_embeddings = uncond_embeddings.view(batch_size * num_videos_per_prompt, seq_len, -1) 231 | 232 | # For classifier free guidance, we need to do two forward passes. 233 | # Here we concatenate the unconditional and text embeddings into a single batch 234 | # to avoid doing two forward passes 235 | text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) 236 | 237 | return text_embeddings 238 | 239 | def decode_latents(self, latents): 240 | video_length = latents.shape[2] 241 | latents = 1 / 0.18215 * latents 242 | latents = rearrange(latents, "b c f h w -> (b f) c h w") 243 | # video = self.vae.decode(latents).sample 244 | video = [] 245 | for frame_idx in tqdm(range(latents.shape[0])): 246 | video.append(self.vae.decode(latents[frame_idx:frame_idx+1]).sample) 247 | video = torch.cat(video) 248 | video = rearrange(video, "(b f) c h w -> b c f h w", f=video_length) 249 | video = (video / 2 + 0.5).clamp(0, 1) 250 | # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 251 | video = video.cpu().float().numpy() 252 | return video 253 | 254 | def prepare_extra_step_kwargs(self, generator, eta): 255 | # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature 256 | # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
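        # eta = 0 gives fully deterministic DDIM sampling, while eta = 1 injects the full
        # DDPM-level noise at every step.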
257 | # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 258 | # and should be between [0, 1] 259 | 260 | accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) 261 | extra_step_kwargs = {} 262 | if accepts_eta: 263 | extra_step_kwargs["eta"] = eta 264 | 265 | # check if the scheduler accepts generator 266 | accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) 267 | if accepts_generator: 268 | extra_step_kwargs["generator"] = generator 269 | return extra_step_kwargs 270 | 271 | def check_inputs(self, prompt, height, width, callback_steps): 272 | if not isinstance(prompt, str) and not isinstance(prompt, list): 273 | raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") 274 | 275 | if height % 8 != 0 or width % 8 != 0: 276 | raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") 277 | 278 | if (callback_steps is None) or ( 279 | callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) 280 | ): 281 | raise ValueError( 282 | f"`callback_steps` has to be a positive integer but is {callback_steps} of type" 283 | f" {type(callback_steps)}." 284 | ) 285 | 286 | def prepare_latents(self, batch_size, num_channels_latents, video_length, height, width, dtype, device, generator, latents=None): 287 | shape = (batch_size, num_channels_latents, video_length, height // self.vae_scale_factor, width // self.vae_scale_factor) 288 | if isinstance(generator, list) and len(generator) != batch_size: 289 | raise ValueError( 290 | f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" 291 | f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
292 | ) 293 | if latents is None: 294 | rand_device = "cpu" if device.type == "mps" else device 295 | 296 | if isinstance(generator, list): 297 | shape = shape 298 | # shape = (1,) + shape[1:] 299 | latents = [ 300 | torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) 301 | for i in range(batch_size) 302 | ] 303 | latents = torch.cat(latents, dim=0).to(device) 304 | else: 305 | latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device) 306 | else: 307 | if latents.shape != shape: 308 | raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") 309 | latents = latents.to(device=device, dtype=dtype) 310 | 311 | # scale the initial noise by the standard deviation required by the scheduler 312 | latents = latents * self.scheduler.init_noise_sigma 313 | return latents 314 | 315 | @torch.no_grad() 316 | def __call__( 317 | self, 318 | prompt: Union[str, List[str]], 319 | video_length: Optional[int], 320 | height: Optional[int] = None, 321 | width: Optional[int] = None, 322 | num_inference_steps: int = 50, 323 | guidance_scale: float = 7.5, 324 | negative_prompt: Optional[Union[str, List[str]]] = None, 325 | num_videos_per_prompt: Optional[int] = 1, 326 | eta: float = 0.0, 327 | generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, 328 | latents: Optional[torch.FloatTensor] = None, 329 | output_type: Optional[str] = "tensor", 330 | return_dict: bool = True, 331 | callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, 332 | callback_steps: Optional[int] = 1, 333 | down_block_control: Optional[List[torch.FloatTensor]] = None, 334 | mid_block_control: Optional[torch.FloatTensor] = None, 335 | **kwargs, 336 | ): 337 | # Default height and width to unet 338 | height = height or self.unet.config.sample_size * self.vae_scale_factor 339 | width = width or self.unet.config.sample_size * self.vae_scale_factor 340 | 341 | # Check inputs. Raise error if not correct 342 | self.check_inputs(prompt, height, width, callback_steps) 343 | 344 | # Define call parameters 345 | # batch_size = 1 if isinstance(prompt, str) else len(prompt) 346 | batch_size = 1 347 | if latents is not None: 348 | batch_size = latents.shape[0] 349 | if isinstance(prompt, list): 350 | batch_size = len(prompt) 351 | 352 | device = self._execution_device 353 | # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) 354 | # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` 355 | # corresponds to doing no classifier free guidance. 
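        # When guidance is enabled, every denoising step below runs the UNet on a doubled
        # batch (unconditional + text-conditioned) and recombines the two predictions as
        # noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond).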
356 | do_classifier_free_guidance = guidance_scale > 1.0 357 | 358 | # Encode input prompt 359 | prompt = prompt if isinstance(prompt, list) else [prompt] * batch_size 360 | if negative_prompt is not None: 361 | negative_prompt = negative_prompt if isinstance(negative_prompt, list) else [negative_prompt] * batch_size 362 | text_embeddings = self._encode_prompt( 363 | prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt 364 | ) 365 | 366 | # Prepare timesteps 367 | self.scheduler.set_timesteps(num_inference_steps, device=device) 368 | timesteps = self.scheduler.timesteps 369 | 370 | # Prepare latent variables 371 | num_channels_latents = self.unet.in_channels 372 | latents = self.prepare_latents( 373 | batch_size * num_videos_per_prompt, 374 | num_channels_latents, 375 | video_length, 376 | height, 377 | width, 378 | text_embeddings.dtype, 379 | device, 380 | generator, 381 | latents, 382 | ) 383 | latents_dtype = latents.dtype 384 | 385 | # Prepare extra step kwargs. 386 | extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) 387 | 388 | # Denoising loop 389 | num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order 390 | with self.progress_bar(total=num_inference_steps) as progress_bar: 391 | for i, t in enumerate(timesteps): 392 | # expand the latents if we are doing classifier free guidance 393 | latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents 394 | latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) 395 | 396 | # predict the noise residual 397 | noise_pred = self.unet( 398 | latent_model_input, 399 | t, 400 | encoder_hidden_states=text_embeddings, 401 | down_block_additional_residuals=[x.to(self.device) for x in down_block_control[i]] if down_block_control is not None else None, 402 | mid_block_additional_residual=mid_block_control[i].to(self.device) if mid_block_control is not None else None, 403 | ).sample.to(dtype=latents_dtype) 404 | # noise_pred = [] 405 | # import pdb 406 | # pdb.set_trace() 407 | # for batch_idx in range(latent_model_input.shape[0]): 408 | # noise_pred_single = self.unet(latent_model_input[batch_idx:batch_idx+1], t, encoder_hidden_states=text_embeddings[batch_idx:batch_idx+1]).sample.to(dtype=latents_dtype) 409 | # noise_pred.append(noise_pred_single) 410 | # noise_pred = torch.cat(noise_pred) 411 | 412 | # perform guidance 413 | if do_classifier_free_guidance: 414 | noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) 415 | noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) 416 | 417 | # compute the previous noisy sample x_t -> x_t-1 418 | latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample 419 | 420 | # call the callback, if provided 421 | if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): 422 | progress_bar.update() 423 | if callback is not None and i % callback_steps == 0: 424 | callback(i, t, latents) 425 | 426 | # Post-processing 427 | video = self.decode_latents(latents) 428 | 429 | # Convert to tensor 430 | if output_type == "tensor": 431 | video = torch.from_numpy(video) 432 | 433 | if not return_dict: 434 | return video 435 | 436 | return AnimationPipelineOutput(videos=video) 437 | -------------------------------------------------------------------------------- /animatediff/utils/convert_lora_safetensor_to_diffusers.py: -------------------------------------------------------------------------------- 
1 | # coding=utf-8 2 | # Copyright 2023, Haofan Wang, Qixun Wang, All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """ Conversion script for the LoRA's safetensors checkpoints. """ 17 | 18 | import argparse 19 | 20 | import torch 21 | from safetensors.torch import load_file 22 | 23 | from diffusers import StableDiffusionPipeline 24 | import pdb 25 | 26 | def convert_lora(pipeline, state_dict, LORA_PREFIX_UNET="lora_unet", LORA_PREFIX_TEXT_ENCODER="lora_te", alpha=0.6): 27 | # load base model 28 | # pipeline = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32) 29 | 30 | # load LoRA weight from .safetensors 31 | # state_dict = load_file(checkpoint_path) 32 | 33 | visited = [] 34 | 35 | # directly update weight in diffusers model 36 | for key in state_dict: 37 | # it is suggested to print out the key, it usually will be something like below 38 | # "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight" 39 | 40 | # as we have set the alpha beforehand, so just skip 41 | if ".alpha" in key or key in visited: 42 | continue 43 | 44 | if "text" in key: 45 | layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_") 46 | curr_layer = pipeline.text_encoder 47 | else: 48 | layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_") 49 | curr_layer = pipeline.unet 50 | 51 | # find the target layer 52 | temp_name = layer_infos.pop(0) 53 | while len(layer_infos) > -1: 54 | try: 55 | curr_layer = curr_layer.__getattr__(temp_name) 56 | if len(layer_infos) > 0: 57 | temp_name = layer_infos.pop(0) 58 | elif len(layer_infos) == 0: 59 | break 60 | except Exception: 61 | if len(temp_name) > 0: 62 | temp_name += "_" + layer_infos.pop(0) 63 | else: 64 | temp_name = layer_infos.pop(0) 65 | 66 | pair_keys = [] 67 | if "lora_down" in key: 68 | pair_keys.append(key.replace("lora_down", "lora_up")) 69 | pair_keys.append(key) 70 | else: 71 | pair_keys.append(key) 72 | pair_keys.append(key.replace("lora_up", "lora_down")) 73 | 74 | # update weight 75 | if len(state_dict[pair_keys[0]].shape) == 4: 76 | weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32) 77 | weight_down = state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32) 78 | curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3).to(curr_layer.weight.data.device) 79 | else: 80 | weight_up = state_dict[pair_keys[0]].to(torch.float32) 81 | weight_down = state_dict[pair_keys[1]].to(torch.float32) 82 | curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device) 83 | 84 | # update visited list 85 | for item in pair_keys: 86 | visited.append(item) 87 | 88 | return pipeline 89 | 90 | 91 | if __name__ == "__main__": 92 | parser = argparse.ArgumentParser() 93 | 94 | parser.add_argument( 95 | "--base_model_path", default=None, type=str, required=True, help="Path to the base model in diffusers format." 
96 | ) 97 | parser.add_argument( 98 | "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." 99 | ) 100 | parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") 101 | parser.add_argument( 102 | "--lora_prefix_unet", default="lora_unet", type=str, help="The prefix of UNet weight in safetensors" 103 | ) 104 | parser.add_argument( 105 | "--lora_prefix_text_encoder", 106 | default="lora_te", 107 | type=str, 108 | help="The prefix of text encoder weight in safetensors", 109 | ) 110 | parser.add_argument("--alpha", default=0.75, type=float, help="The merging ratio in W = W0 + alpha * deltaW") 111 | parser.add_argument( 112 | "--to_safetensors", action="store_true", help="Whether to store pipeline in safetensors format or not." 113 | ) 114 | parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)") 115 | 116 | args = parser.parse_args() 117 | 118 | base_model_path = args.base_model_path 119 | checkpoint_path = args.checkpoint_path 120 | dump_path = args.dump_path 121 | lora_prefix_unet = args.lora_prefix_unet 122 | lora_prefix_text_encoder = args.lora_prefix_text_encoder 123 | alpha = args.alpha 124 | pipe = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32) 125 | pipe = convert_lora(pipe, load_file(checkpoint_path), lora_prefix_unet, lora_prefix_text_encoder, alpha) 126 | 127 | pipe = pipe.to(args.device) 128 | pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors) 129 | -------------------------------------------------------------------------------- /animatediff/utils/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import imageio 3 | import numpy as np 4 | from typing import Union 5 | 6 | import torch 7 | import torchvision 8 | import torch.distributed as dist 9 | 10 | from tqdm import tqdm 11 | from einops import rearrange 12 | 13 | 14 | def zero_rank_print(s): 15 | if (not dist.is_initialized()) or (dist.is_initialized() and dist.get_rank() == 0): print("### " + s) 16 | 17 | 18 | def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8): 19 | videos = rearrange(videos, "b c t h w -> t b c h w") 20 | outputs = [] 21 | for x in videos: 22 | x = torchvision.utils.make_grid(x, nrow=n_rows) 23 | x = x.transpose(0, 1).transpose(1, 2).squeeze(-1) 24 | if rescale: 25 | x = (x + 1.0) / 2.0 # -1,1 -> 0,1 26 | x = (x * 255).numpy().astype(np.uint8) 27 | outputs.append(x) 28 | 29 | os.makedirs(os.path.dirname(path), exist_ok=True) 30 | imageio.mimsave(path, outputs, fps=fps) 31 | 32 | 33 | # DDIM Inversion 34 | @torch.no_grad() 35 | def init_prompt(prompt, pipeline): 36 | uncond_input = pipeline.tokenizer( 37 | [""], padding="max_length", max_length=pipeline.tokenizer.model_max_length, 38 | return_tensors="pt" 39 | ) 40 | uncond_embeddings = pipeline.text_encoder(uncond_input.input_ids.to(pipeline.device))[0] 41 | text_input = pipeline.tokenizer( 42 | [prompt], 43 | padding="max_length", 44 | max_length=pipeline.tokenizer.model_max_length, 45 | truncation=True, 46 | return_tensors="pt", 47 | ) 48 | text_embeddings = pipeline.text_encoder(text_input.input_ids.to(pipeline.device))[0] 49 | context = torch.cat([uncond_embeddings, text_embeddings]) 50 | 51 | return context 52 | 53 | 54 | def next_step(model_output: Union[torch.FloatTensor, np.ndarray], timestep: int, 55 | sample: Union[torch.FloatTensor, np.ndarray], ddim_scheduler): 56 | timestep, next_timestep = min( 57 | timestep - ddim_scheduler.config.num_train_timesteps
// ddim_scheduler.num_inference_steps, 999), timestep 58 | alpha_prod_t = ddim_scheduler.alphas_cumprod[timestep] if timestep >= 0 else ddim_scheduler.final_alpha_cumprod 59 | alpha_prod_t_next = ddim_scheduler.alphas_cumprod[next_timestep] 60 | beta_prod_t = 1 - alpha_prod_t 61 | next_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5 62 | next_sample_direction = (1 - alpha_prod_t_next) ** 0.5 * model_output 63 | next_sample = alpha_prod_t_next ** 0.5 * next_original_sample + next_sample_direction 64 | return next_sample 65 | 66 | 67 | def get_noise_pred_single(latents, t, context, unet): 68 | noise_pred = unet(latents, t, encoder_hidden_states=context)["sample"] 69 | return noise_pred 70 | 71 | 72 | @torch.no_grad() 73 | def ddim_loop(pipeline, ddim_scheduler, latent, num_inv_steps, prompt): 74 | context = init_prompt(prompt, pipeline) 75 | uncond_embeddings, cond_embeddings = context.chunk(2) 76 | all_latent = [latent] 77 | latent = latent.clone().detach() 78 | for i in tqdm(range(num_inv_steps)): 79 | t = ddim_scheduler.timesteps[len(ddim_scheduler.timesteps) - i - 1] 80 | noise_pred = get_noise_pred_single(latent, t, cond_embeddings, pipeline.unet) 81 | latent = next_step(noise_pred, t, latent, ddim_scheduler) 82 | all_latent.append(latent) 83 | return all_latent 84 | 85 | 86 | @torch.no_grad() 87 | def ddim_inversion(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt=""): 88 | ddim_latents = ddim_loop(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt) 89 | return ddim_latents 90 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import json 4 | import torch 5 | import random 6 | 7 | import gradio as gr 8 | from glob import glob 9 | from omegaconf import OmegaConf 10 | from datetime import datetime 11 | from safetensors import safe_open 12 | 13 | from diffusers import AutoencoderKL 14 | from diffusers import DDIMScheduler, EulerDiscreteScheduler, PNDMScheduler 15 | from diffusers.utils.import_utils import is_xformers_available 16 | from transformers import CLIPTextModel, CLIPTokenizer 17 | 18 | from animatediff.models.unet import UNet3DConditionModel 19 | from animatediff.pipelines.pipeline_animation import AnimationPipeline 20 | from animatediff.controlnet.controlnet_module import ControlnetModule 21 | from animatediff.utils.util import save_videos_grid 22 | from animatediff.utils.convert_from_ckpt import convert_ldm_unet_checkpoint, convert_ldm_clip_checkpoint, convert_ldm_vae_checkpoint 23 | from animatediff.utils.convert_lora_safetensor_to_diffusers import convert_lora 24 | 25 | 26 | sample_idx = 0 27 | scheduler_dict = { 28 | "Euler": EulerDiscreteScheduler, 29 | "PNDM": PNDMScheduler, 30 | "DDIM": DDIMScheduler, 31 | } 32 | 33 | css = """ 34 | .toolbutton { 35 | margin-buttom: 0em 0em 0em 0em; 36 | max-width: 2.5em; 37 | min-width: 2.5em !important; 38 | height: 2.5em; 39 | } 40 | """ 41 | 42 | class AnimateController: 43 | def __init__(self): 44 | 45 | # config dirs 46 | self.basedir = os.getcwd() 47 | self.stable_diffusion_dir = os.path.join(self.basedir, "models", "StableDiffusion") 48 | self.motion_module_dir = os.path.join(self.basedir, "models", "Motion_Module") 49 | self.personalized_model_dir = os.path.join(self.basedir, "models", "DreamBooth_LoRA") 50 | self.controlnet_dir = os.path.join(self.basedir, "models", "Controlnet") 51 | self.videos_dir = 
os.path.join(self.basedir, "videos") 52 | self.savedir = os.path.join(self.basedir, "samples", datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S")) 53 | self.savedir_sample = os.path.join(self.savedir, "sample") 54 | os.makedirs(self.savedir, exist_ok=True) 55 | 56 | self.stable_diffusion_list = [] 57 | self.motion_module_list = [] 58 | self.personalized_model_list = [] 59 | self.controlnet_list = [] 60 | self.videos_list = [] 61 | 62 | self.refresh_stable_diffusion() 63 | self.refresh_motion_module() 64 | self.refresh_personalized_model() 65 | self.refresh_controlnet() 66 | self.refresh_videos() 67 | # config models 68 | self.tokenizer = None 69 | self.text_encoder = None 70 | self.vae = None 71 | self.unet = None 72 | self.pipeline = None 73 | self.controlnet = None 74 | self.lora_model_state_dict = {} 75 | 76 | self.inference_config = OmegaConf.load("configs/inference/inference-v1.yaml") 77 | 78 | def refresh_stable_diffusion(self): 79 | self.stable_diffusion_list = glob(os.path.join(self.stable_diffusion_dir, "*/")) 80 | 81 | def refresh_motion_module(self): 82 | motion_module_list = glob(os.path.join(self.motion_module_dir, "*.ckpt")) 83 | self.motion_module_list = [os.path.basename(p) for p in motion_module_list] 84 | 85 | def refresh_personalized_model(self): 86 | personalized_model_list = glob(os.path.join(self.personalized_model_dir, "*.safetensors")) 87 | self.personalized_model_list = [os.path.basename(p) for p in personalized_model_list] 88 | 89 | def refresh_controlnet(self): 90 | self.controlnet_list = glob(os.path.join(self.controlnet_dir, "*/")) 91 | 92 | def refresh_videos(self): 93 | self.videos_list = glob(os.path.join(self.videos_dir, "*.mp4")) 94 | 95 | def update_stable_diffusion(self, stable_diffusion_dropdown): 96 | self.tokenizer = CLIPTokenizer.from_pretrained(stable_diffusion_dropdown, subfolder="tokenizer") 97 | self.text_encoder = CLIPTextModel.from_pretrained(stable_diffusion_dropdown, subfolder="text_encoder").cuda() 98 | self.vae = AutoencoderKL.from_pretrained(stable_diffusion_dropdown, subfolder="vae").cuda() 99 | self.unet = UNet3DConditionModel.from_pretrained_2d(stable_diffusion_dropdown, subfolder="unet", unet_additional_kwargs=OmegaConf.to_container(self.inference_config.unet_additional_kwargs)).cuda() 100 | return gr.Dropdown.update() 101 | 102 | def update_motion_module(self, motion_module_dropdown): 103 | if self.unet is None: 104 | gr.Info(f"Please select a pretrained model path.") 105 | return gr.Dropdown.update(value=None) 106 | else: 107 | motion_module_dropdown = os.path.join(self.motion_module_dir, motion_module_dropdown) 108 | motion_module_state_dict = torch.load(motion_module_dropdown, map_location="cpu") 109 | missing, unexpected = self.unet.load_state_dict(motion_module_state_dict, strict=False) 110 | assert len(unexpected) == 0 111 | return gr.Dropdown.update() 112 | 113 | def update_base_model(self, base_model_dropdown): 114 | if self.unet is None: 115 | gr.Info(f"Please select a pretrained model path.") 116 | return gr.Dropdown.update(value=None) 117 | else: 118 | base_model_dropdown = os.path.join(self.personalized_model_dir, base_model_dropdown) 119 | base_model_state_dict = {} 120 | with safe_open(base_model_dropdown, framework="pt", device="cpu") as f: 121 | for key in f.keys(): 122 | base_model_state_dict[key] = f.get_tensor(key) 123 | 124 | converted_vae_checkpoint = convert_ldm_vae_checkpoint(base_model_state_dict, self.vae.config) 125 | self.vae.load_state_dict(converted_vae_checkpoint) 126 | 127 | converted_unet_checkpoint =
convert_ldm_unet_checkpoint(base_model_state_dict, self.unet.config) 128 | self.unet.load_state_dict(converted_unet_checkpoint, strict=False) 129 | 130 | self.text_encoder = convert_ldm_clip_checkpoint(base_model_state_dict) 131 | return gr.Dropdown.update() 132 | 133 | def update_lora_model(self, lora_model_dropdown): 134 | self.lora_model_state_dict = {} 135 | if lora_model_dropdown == "none": pass 136 | else: 137 | lora_model_dropdown = os.path.join(self.personalized_model_dir, lora_model_dropdown) 138 | with safe_open(lora_model_dropdown, framework="pt", device="cpu") as f: 139 | for key in f.keys(): 140 | self.lora_model_state_dict[key] = f.get_tensor(key) 141 | return gr.Dropdown.update() 142 | 143 | def animate( 144 | self, 145 | stable_diffusion_dropdown, 146 | motion_module_dropdown, 147 | base_model_dropdown, 148 | lora_alpha_slider, 149 | prompt_textbox, 150 | negative_prompt_textbox, 151 | sampler_dropdown, 152 | sample_step_slider, 153 | width_slider, 154 | length_slider, 155 | height_slider, 156 | cfg_scale_slider, 157 | seed_textbox, 158 | videos_path_dropdown, 159 | get_each_slider, 160 | controlnet_processor_name_dropdown, 161 | controlnet_processor_path_dropdown, 162 | controlnet_guess_mode_checkbox, 163 | controlnet_conditioning_scale_slider, 164 | ): 165 | if self.unet is None: 166 | raise gr.Error(f"Please select a pretrained model path.") 167 | if motion_module_dropdown == "": 168 | raise gr.Error(f"Please select a motion module.") 169 | if base_model_dropdown == "": 170 | raise gr.Error(f"Please select a base DreamBooth model.") 171 | 172 | if is_xformers_available(): self.unet.enable_xformers_memory_efficient_attention() 173 | 174 | pipeline = AnimationPipeline( 175 | vae=self.vae, text_encoder=self.text_encoder, tokenizer=self.tokenizer, unet=self.unet, 176 | scheduler=scheduler_dict[sampler_dropdown](**OmegaConf.to_container(self.inference_config.noise_scheduler_kwargs)) 177 | ).to("cuda") 178 | 179 | if self.lora_model_state_dict != {}: 180 | pipeline = convert_lora(pipeline, self.lora_model_state_dict, alpha=lora_alpha_slider) 181 | 182 | pipeline.to("cuda") 183 | 184 | if seed_textbox != -1 and seed_textbox != "": torch.manual_seed(int(seed_textbox)) 185 | else: torch.seed() 186 | seed = torch.initial_seed() 187 | 188 | down_features, mid_features = None, None 189 | controlnet = None 190 | if videos_path_dropdown and videos_path_dropdown != "none": 191 | controlnet_config = { 192 | 'video_length': length_slider, 193 | 'img_h': height_slider, 194 | 'img_w': width_slider, 195 | 'guidance_scale': cfg_scale_slider, 196 | 'steps': sample_step_slider, 197 | 'get_each': get_each_slider, 198 | 'conditioning_scale': controlnet_conditioning_scale_slider, 199 | 'controlnet_processor': controlnet_processor_name_dropdown, 200 | 'controlnet_pipeline': stable_diffusion_dropdown, 201 | 'controlnet_processor_path': controlnet_processor_path_dropdown, 202 | 'guess_mode': controlnet_guess_mode_checkbox, 203 | 'device': 'cuda', 204 | } 205 | controlnet = ControlnetModule(controlnet_config) 206 | down_features, mid_features = controlnet( 207 | videos_path_dropdown, prompt_textbox, negative_prompt_textbox, seed) 208 | 209 | sample = pipeline( 210 | prompt_textbox, 211 | negative_prompt = negative_prompt_textbox, 212 | num_inference_steps = sample_step_slider, 213 | guidance_scale = cfg_scale_slider, 214 | width = width_slider, 215 | height = height_slider, 216 | video_length = length_slider, 217 | down_block_control = down_features, 218 | mid_block_control = mid_features, 219 |
).videos 220 | 221 | save_sample_path = os.path.join(self.savedir_sample, f"{sample_idx}.mp4") 222 | save_videos_grid(sample, save_sample_path) 223 | 224 | sample_config = { 225 | "prompt": prompt_textbox, 226 | "n_prompt": negative_prompt_textbox, 227 | "sampler": sampler_dropdown, 228 | "num_inference_steps": sample_step_slider, 229 | "guidance_scale": cfg_scale_slider, 230 | "width": width_slider, 231 | "height": height_slider, 232 | "video_length": length_slider, 233 | "seed": seed 234 | } 235 | json_str = json.dumps(sample_config, indent=4) 236 | with open(os.path.join(self.savedir, "logs.json"), "a") as f: 237 | f.write(json_str) 238 | f.write("\n\n") 239 | 240 | return gr.Video.update(value=save_sample_path) 241 | 242 | 243 | controller = AnimateController() 244 | 245 | 246 | def ui(): 247 | with gr.Blocks(css=css) as demo: 248 | gr.Markdown( 249 | """ 250 | # [AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning](https://arxiv.org/abs/2307.04725) 251 | Yuwei Guo, Ceyuan Yang*, Anyi Rao, Yaohui Wang, Yu Qiao, Dahua Lin, Bo Dai (*Corresponding Author)
252 | [Arxiv Report](https://arxiv.org/abs/2307.04725) | [Project Page](https://animatediff.github.io/) | [Github](https://github.com/guoyww/animatediff/) 253 | """ 254 | ) 255 | with gr.Column(variant="panel"): 256 | gr.Markdown( 257 | """ 258 | ### 1. Model checkpoints (select pretrained model path first). 259 | """ 260 | ) 261 | with gr.Row(): 262 | stable_diffusion_dropdown = gr.Dropdown( 263 | label="Pretrained Model Path", 264 | choices=controller.stable_diffusion_list, 265 | interactive=True, 266 | ) 267 | stable_diffusion_dropdown.change(fn=controller.update_stable_diffusion, inputs=[stable_diffusion_dropdown], outputs=[stable_diffusion_dropdown]) 268 | 269 | stable_diffusion_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton") 270 | def update_stable_diffusion(): 271 | controller.refresh_stable_diffusion() 272 | return gr.Dropdown.update(choices=controller.stable_diffusion_list) 273 | stable_diffusion_refresh_button.click(fn=update_stable_diffusion, inputs=[], outputs=[stable_diffusion_dropdown]) 274 | 275 | with gr.Row(): 276 | motion_module_dropdown = gr.Dropdown( 277 | label="Select motion module", 278 | choices=controller.motion_module_list, 279 | interactive=True, 280 | ) 281 | motion_module_dropdown.change(fn=controller.update_motion_module, inputs=[motion_module_dropdown], outputs=[motion_module_dropdown]) 282 | 283 | motion_module_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton") 284 | def update_motion_module(): 285 | controller.refresh_motion_module() 286 | return gr.Dropdown.update(choices=controller.motion_module_list) 287 | motion_module_refresh_button.click(fn=update_motion_module, inputs=[], outputs=[motion_module_dropdown]) 288 | 289 | base_model_dropdown = gr.Dropdown( 290 | label="Select base Dreambooth model (required)", 291 | choices=controller.personalized_model_list, 292 | interactive=True, 293 | ) 294 | base_model_dropdown.change(fn=controller.update_base_model, inputs=[base_model_dropdown], outputs=[base_model_dropdown]) 295 | 296 | lora_model_dropdown = gr.Dropdown( 297 | label="Select LoRA model (optional)", 298 | choices=["none"] + controller.personalized_model_list, 299 | value="none", 300 | interactive=True, 301 | ) 302 | lora_model_dropdown.change(fn=controller.update_lora_model, inputs=[lora_model_dropdown], outputs=[lora_model_dropdown]) 303 | 304 | lora_alpha_slider = gr.Slider(label="LoRA alpha", value=0.8, minimum=0, maximum=2, interactive=True) 305 | 306 | personalized_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton") 307 | def update_personalized_model(): 308 | controller.refresh_personalized_model() 309 | return [ 310 | gr.Dropdown.update(choices=controller.personalized_model_list), 311 | gr.Dropdown.update(choices=["none"] + controller.personalized_model_list) 312 | ] 313 | personalized_refresh_button.click(fn=update_personalized_model, inputs=[], outputs=[base_model_dropdown, lora_model_dropdown]) 314 | 315 | with gr.Column(variant="panel"): 316 | gr.Markdown( 317 | """ 318 | ### 2. Configs for AnimateDiff. 319 | """ 320 | ) 321 | 322 | prompt_textbox = gr.Textbox(label="Prompt", lines=2) 323 | negative_prompt_textbox = gr.Textbox(label="Negative prompt", lines=2) 324 | 325 | gr.Markdown( 326 | """ 327 | ### 2.* Controlnet for AnimateDiff (Optional). 
328 | """ 329 | ) 330 | 331 | with gr.Column(visible=False) as controlnet_column: 332 | with gr.Row().style(equal_height=True): 333 | videos_path_dropdown = gr.Dropdown( 334 | label="Select video for applying controlnet (optional)", 335 | choices=["none"] + controller.videos_list, 336 | value="none", 337 | interactive=True, 338 | ) 339 | videos_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton") 340 | def update_videos(): 341 | controller.refresh_videos() 342 | return gr.Dropdown.update(choices=controller.videos_list) 343 | videos_refresh_button.click(fn=update_videos, inputs=[], outputs=[videos_path_dropdown]) 344 | 345 | controlnet_processor_name_dropdown = gr.Dropdown( 346 | label="Select controlnet processor (if video selected)", 347 | choices=["canny", "depth", "softedge", "pose", "norm"], 348 | value="none", 349 | interactive=True, 350 | ) 351 | 352 | controlnet_processor_path_dropdown = gr.Dropdown( 353 | label="Set controlnet processor path (if video selected)", 354 | choices=["none"] + controller.controlnet_list, 355 | value="none", 356 | interactive=True, 357 | ) 358 | controlnet_processor_path_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton") 359 | def update_videos(): 360 | controller.refresh_videos() 361 | return gr.Dropdown.update(choices=controller.controlnet_list) 362 | controlnet_processor_path_refresh_button.click(fn=update_videos, inputs=[], outputs=[controlnet_processor_path_dropdown]) 363 | 364 | with gr.Row().style(equal_height=True): 365 | controlnet_guess_mode_checkbox = gr.Checkbox(value=True, label="Controlnet Guess mode") 366 | get_each_slider = gr.Slider(label="Get Each Frame", value=2, minimum=1, maximum=4, step=1) 367 | controlnet_conditioning_scale_slider = gr.Slider(label="Controlnet strenth", value=0.5, minimum=0.1, maximum=1.0, step=0.1) 368 | 369 | change_visibility = gr.Button(value="SHOW CONTROLNET SETTINGS (OPTIONAL)") 370 | change_visibility.click(lambda :gr.update(visible=True), None, controlnet_column) 371 | 372 | with gr.Row().style(equal_height=False): 373 | with gr.Column(): 374 | with gr.Row(): 375 | sampler_dropdown = gr.Dropdown(label="Sampling method", choices=list(scheduler_dict.keys()), value=list(scheduler_dict.keys())[0]) 376 | sample_step_slider = gr.Slider(label="Sampling steps", value=25, minimum=10, maximum=100, step=1) 377 | 378 | width_slider = gr.Slider(label="Width", value=512, minimum=256, maximum=1024, step=64) 379 | height_slider = gr.Slider(label="Height", value=512, minimum=256, maximum=1024, step=64) 380 | length_slider = gr.Slider(label="Animation length", value=16, minimum=8, maximum=24, step=1) 381 | cfg_scale_slider = gr.Slider(label="CFG Scale", value=7.5, minimum=0, maximum=20) 382 | 383 | with gr.Row(): 384 | seed_textbox = gr.Textbox(label="Seed", value=-1) 385 | seed_button = gr.Button(value="\U0001F3B2", elem_classes="toolbutton") 386 | seed_button.click(fn=lambda: gr.Textbox.update(value=random.randint(1, 1e8)), inputs=[], outputs=[seed_textbox]) 387 | 388 | generate_button = gr.Button(value="Generate", variant='primary') 389 | 390 | result_video = gr.Video(label="Generated Animation", interactive=False) 391 | 392 | generate_button.click( 393 | fn=controller.animate, 394 | inputs=[ 395 | stable_diffusion_dropdown, 396 | motion_module_dropdown, 397 | base_model_dropdown, 398 | lora_alpha_slider, 399 | prompt_textbox, 400 | negative_prompt_textbox, 401 | sampler_dropdown, 402 | sample_step_slider, 403 | width_slider, 404 | length_slider, 405 | height_slider, 406 | 
cfg_scale_slider, 407 | seed_textbox, 408 | videos_path_dropdown, 409 | get_each_slider, 410 | controlnet_processor_name_dropdown, 411 | controlnet_processor_path_dropdown, 412 | controlnet_guess_mode_checkbox, 413 | controlnet_conditioning_scale_slider, 414 | ], 415 | outputs=[result_video] 416 | ) 417 | 418 | return demo 419 | 420 | 421 | if __name__ == "__main__": 422 | demo = ui() 423 | demo.launch(share=True) 424 | -------------------------------------------------------------------------------- /configs/inference/inference-v1.yaml: -------------------------------------------------------------------------------- 1 | unet_additional_kwargs: 2 | unet_use_cross_frame_attention: false 3 | unet_use_temporal_attention: false 4 | use_motion_module: true 5 | motion_module_resolutions: 6 | - 1 7 | - 2 8 | - 4 9 | - 8 10 | motion_module_mid_block: false 11 | motion_module_decoder_only: false 12 | motion_module_type: Vanilla 13 | motion_module_kwargs: 14 | num_attention_heads: 8 15 | num_transformer_block: 1 16 | attention_block_types: 17 | - Temporal_Self 18 | - Temporal_Self 19 | temporal_position_encoding: true 20 | temporal_position_encoding_max_len: 24 21 | temporal_attention_dim_div: 1 22 | 23 | noise_scheduler_kwargs: 24 | beta_start: 0.00085 25 | beta_end: 0.012 26 | beta_schedule: "linear" 27 | -------------------------------------------------------------------------------- /configs/inference/inference-v2.yaml: -------------------------------------------------------------------------------- 1 | unet_additional_kwargs: 2 | use_inflated_groupnorm: true 3 | unet_use_cross_frame_attention: false 4 | unet_use_temporal_attention: false 5 | use_motion_module: true 6 | motion_module_resolutions: 7 | - 1 8 | - 2 9 | - 4 10 | - 8 11 | motion_module_mid_block: true 12 | motion_module_decoder_only: false 13 | motion_module_type: Vanilla 14 | motion_module_kwargs: 15 | num_attention_heads: 8 16 | num_transformer_block: 1 17 | attention_block_types: 18 | - Temporal_Self 19 | - Temporal_Self 20 | temporal_position_encoding: true 21 | temporal_position_encoding_max_len: 32 22 | temporal_attention_dim_div: 1 23 | 24 | noise_scheduler_kwargs: 25 | beta_start: 0.00085 26 | beta_end: 0.012 27 | beta_schedule: "linear" 28 | -------------------------------------------------------------------------------- /configs/prompts/1-ToonYou-Controlnet.yaml: -------------------------------------------------------------------------------- 1 | ToonYou: 2 | base: "" 3 | path: "models/DreamBooth_LoRA/toonyou_beta3.safetensors" 4 | motion_module: 5 | - "models/Motion_Module/mm_sd_v15.ckpt" 6 | 7 | control: 8 | video_path: "./videos/dance.mp4" # smiling, dance or your video 9 | get_each: 2 # get each frame from video 10 | conditioning_scale: 0.75 # controlnet strength 11 | controlnet_processor: "softedge" # softedge, canny, depth 12 | controlnet_pipeline: "models/StableDiffusion/stable-diffusion-v1-5" 13 | controlnet_processor_path: "models/Controlnet/control_v11p_sd15_softedge" # control_v11p_sd15_softedge, control_v11f1p_sd15_depth, control_v11p_sd15_canny 14 | guess_mode: True 15 | 16 | seed: [10788741199826055526, 6520604954829636163, 6519455744612555650, 16372571278361863751] 17 | steps: 25 18 | guidance_scale: 7.5 19 | 20 | prompt: 21 | - "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress" 22 | - "masterpiece, best quality, 1girl, solo, cherry blossoms, hanami, pink flower, white flower, spring season, wisteria, petals, flower, plum blossoms, outdoors, falling 
petals, white hair, black eyes," 23 | - "best quality, masterpiece, 1boy, formal, abstract, looking at viewer, masculine, marble pattern" 24 | - "best quality, masterpiece, 1girl, cloudy sky, dandelion, contrapposto, alternate hairstyle," 25 | 26 | n_prompt: 27 | - "" 28 | - "badhandv4,easynegative,ng_deepnegative_v1_75t,verybadimagenegative_v1.3, bad-artist, bad_prompt_version2-neg, teeth" 29 | - "" 30 | - "" 31 | -------------------------------------------------------------------------------- /configs/prompts/1-ToonYou.yaml: -------------------------------------------------------------------------------- 1 | ToonYou: 2 | base: "" 3 | path: "models/DreamBooth_LoRA/toonyou_beta3.safetensors" 4 | motion_module: 5 | - "models/Motion_Module/mm_sd_v14.ckpt" 6 | - "models/Motion_Module/mm_sd_v15.ckpt" 7 | 8 | seed: [10788741199826055526, 6520604954829636163, 6519455744612555650, 16372571278361863751] 9 | steps: 25 10 | guidance_scale: 7.5 11 | 12 | prompt: 13 | - "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress" 14 | - "masterpiece, best quality, 1girl, solo, cherry blossoms, hanami, pink flower, white flower, spring season, wisteria, petals, flower, plum blossoms, outdoors, falling petals, white hair, black eyes," 15 | - "best quality, masterpiece, 1boy, formal, abstract, looking at viewer, masculine, marble pattern" 16 | - "best quality, masterpiece, 1girl, cloudy sky, dandelion, contrapposto, alternate hairstyle," 17 | 18 | n_prompt: 19 | - "" 20 | - "badhandv4,easynegative,ng_deepnegative_v1_75t,verybadimagenegative_v1.3, bad-artist, bad_prompt_version2-neg, teeth" 21 | - "" 22 | - "" 23 | -------------------------------------------------------------------------------- /configs/prompts/2-Lyriel-Controlnet.yaml: -------------------------------------------------------------------------------- 1 | Lyriel: 2 | base: "" 3 | path: "models/DreamBooth_LoRA/lyriel_v16.safetensors" 4 | motion_module: 5 | # - "models/Motion_Module/mm_sd_v14.ckpt" 6 | - "models/Motion_Module/mm_sd_v15.ckpt" 7 | 8 | control: 9 | video_path: "./videos/smiling.mp4" # smiling, dance or your video 10 | get_each: 2 # get each frame from video 11 | conditioning_scale: 0.75 # controlnet strength 12 | controlnet_processor: "canny" # softedge, canny, depth 13 | controlnet_pipeline: "models/StableDiffusion/stable-diffusion-v1-5" 14 | controlnet_processor_path: "models/Controlnet/control_v11p_sd15_canny" # control_v11p_sd15_softedge, control_v11f1p_sd15_depth, control_v11p_sd15_canny 15 | guess_mode: True 16 | 17 | seed: [10917152860782582783, 6399018107401806238, 15875751942533906793, 6653196880059936551] 18 | steps: 25 19 | guidance_scale: 7.5 20 | 21 | prompt: 22 | - "dark shot, epic realistic, portrait of halo, sunglasses, blue eyes, tartan scarf, white hair by atey ghailan, by greg rutkowski, by greg tocchini, by james gilleard, by joe fenton, by kaethe butcher, gradient yellow, black, brown and magenta color scheme, grunge aesthetic!!! 
graffiti tag wall background, art by greg rutkowski and artgerm, soft cinematic light, adobe lightroom, photolab, hdr, intricate, highly detailed, depth of field, faded, neutral colors, hdr, muted colors, hyperdetailed, artstation, cinematic, warm lights, dramatic light, intricate details, complex background, rutkowski, teal and orange" 23 | - "A forbidden castle high up in the mountains, pixel art, intricate details2, hdr, intricate details, hyperdetailed5, natural skin texture, hyperrealism, soft light, sharp, game art, key visual, surreal" 24 | - "dark theme, medieval portrait of a man sharp features, grim, cold stare, dark colors, Volumetric lighting, baroque oil painting by Greg Rutkowski, Artgerm, WLOP, Alphonse Mucha dynamic lighting hyperdetailed intricately detailed, hdr, muted colors, complex background, hyperrealism, hyperdetailed, amandine van ray" 25 | - "As I have gone alone in there and with my treasures bold, I can keep my secret where and hint of riches new and old. Begin it where warm waters halt and take it in a canyon down, not far but too far to walk, put in below the home of brown." 26 | 27 | n_prompt: 28 | - "3d, cartoon, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, young, loli, elf, 3d, illustration" 29 | - "3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular" 30 | - "dof, grayscale, black and white, bw, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular,badhandsv5-neg, By bad artist -neg 1, monochrome" 31 | - "holding an item, cowboy, hat, cartoon, 3d, disfigured, bad art, deformed,extra limbs,close up,b&w, wierd colors, blurry, duplicate, morbid, mutilated, [out of frame], extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, ugly, blurry, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, out of frame, ugly, extra limbs, bad anatomy, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, mutated hands, fused fingers, too many fingers, long neck, Photoshop, video game, ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, mutation, mutated, extra limbs, extra legs, extra arms, disfigured, deformed, cross-eye, body out of frame, blurry, bad art, bad anatomy, 3d render" 32 | -------------------------------------------------------------------------------- /configs/prompts/2-Lyriel.yaml: -------------------------------------------------------------------------------- 1 | Lyriel: 2 | base: "" 3 | path: "models/DreamBooth_LoRA/lyriel_v16.safetensors" 4 | motion_module: 5 | - "models/Motion_Module/mm_sd_v14.ckpt" 6 | - "models/Motion_Module/mm_sd_v15.ckpt" 7 | 8 | seed: [10917152860782582783, 6399018107401806238, 15875751942533906793, 6653196880059936551] 9 | steps: 25 10 | guidance_scale: 7.5 11 | 12 | prompt: 13 | - "dark shot, epic realistic, portrait of halo, sunglasses, blue eyes, tartan scarf, white hair by atey ghailan, by greg rutkowski, by greg tocchini, by james gilleard, by joe fenton, by kaethe butcher, gradient yellow, black, brown and magenta 
color scheme, grunge aesthetic!!! graffiti tag wall background, art by greg rutkowski and artgerm, soft cinematic light, adobe lightroom, photolab, hdr, intricate, highly detailed, depth of field, faded, neutral colors, hdr, muted colors, hyperdetailed, artstation, cinematic, warm lights, dramatic light, intricate details, complex background, rutkowski, teal and orange" 14 | - "A forbidden castle high up in the mountains, pixel art, intricate details2, hdr, intricate details, hyperdetailed5, natural skin texture, hyperrealism, soft light, sharp, game art, key visual, surreal" 15 | - "dark theme, medieval portrait of a man sharp features, grim, cold stare, dark colors, Volumetric lighting, baroque oil painting by Greg Rutkowski, Artgerm, WLOP, Alphonse Mucha dynamic lighting hyperdetailed intricately detailed, hdr, muted colors, complex background, hyperrealism, hyperdetailed, amandine van ray" 16 | - "As I have gone alone in there and with my treasures bold, I can keep my secret where and hint of riches new and old. Begin it where warm waters halt and take it in a canyon down, not far but too far to walk, put in below the home of brown." 17 | 18 | n_prompt: 19 | - "3d, cartoon, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, young, loli, elf, 3d, illustration" 20 | - "3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular" 21 | - "dof, grayscale, black and white, bw, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular,badhandsv5-neg, By bad artist -neg 1, monochrome" 22 | - "holding an item, cowboy, hat, cartoon, 3d, disfigured, bad art, deformed,extra limbs,close up,b&w, wierd colors, blurry, duplicate, morbid, mutilated, [out of frame], extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, ugly, blurry, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, out of frame, ugly, extra limbs, bad anatomy, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, mutated hands, fused fingers, too many fingers, long neck, Photoshop, video game, ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, mutation, mutated, extra limbs, extra legs, extra arms, disfigured, deformed, cross-eye, body out of frame, blurry, bad art, bad anatomy, 3d render" 23 | -------------------------------------------------------------------------------- /configs/prompts/3-RcnzCartoon-Controlnet.yaml: -------------------------------------------------------------------------------- 1 | RcnzCartoon: 2 | base: "" 3 | path: "models/DreamBooth_LoRA/rcnzCartoon3d_v10.safetensors" 4 | motion_module: 5 | # - "models/Motion_Module/mm_sd_v14.ckpt" 6 | - "models/Motion_Module/mm_sd_v15.ckpt" 7 | 8 | control: 9 | video_path: "./videos/smiling.mp4" # smiling, dance or your video 10 | get_each: 2 # get each frame from video 11 | conditioning_scale: 0.75 # controlnet strength 12 | controlnet_processor: "softedge" # softedge, canny, depth 13 | controlnet_pipeline: "models/StableDiffusion/stable-diffusion-v1-5" 14 | 
controlnet_processor_path: "models/Controlnet/control_v11p_sd15_softedge" # control_v11p_sd15_softedge, control_v11p_sd15_canny, control_v11f1p_sd15_depth 15 | guess_mode: True 16 | 17 | seed: [16931037867122267877, 2094308009433392066, 4292543217695451092, 15572665120852309890] 18 | steps: 25 19 | guidance_scale: 7.5 20 | 21 | prompt: 22 | - "Jane Eyre with headphones, natural skin texture,4mm,k textures, soft cinematic light, adobe lightroom, photolab, hdr, intricate, elegant, highly detailed, sharp focus, cinematic look, soothing tones, insane details, intricate details, hyperdetailed, low contrast, soft cinematic light, dim colors, exposure blend, hdr, faded" 23 | - "close up Portrait photo of muscular bearded guy in a worn mech suit, light bokeh, intricate, steel metal [rust], elegant, sharp focus, photo by greg rutkowski, soft lighting, vibrant colors, masterpiece, streets, detailed face" 24 | - "absurdres, photorealistic, masterpiece, a 30 year old man with gold framed, aviator reading glasses and a black hooded jacket and a beard, professional photo, a character portrait, altermodern, detailed eyes, detailed lips, detailed face, grey eyes" 25 | - "a golden labrador, warm vibrant colours, natural lighting, dappled lighting, diffused lighting, absurdres, highres,k, uhd, hdr, rtx, unreal, octane render, RAW photo, photorealistic, global illumination, subsurface scattering" 26 | 27 | n_prompt: 28 | - "deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, mutated hands and fingers, disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation" 29 | - "nude, cross eyed, tongue, open mouth, inside, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, red eyes, muscular" 30 | - "easynegative, cartoon, anime, sketches, necklace, earrings worst quality, low quality, normal quality, bad anatomy, bad hands, shiny skin, error, missing fingers, extra digit, fewer digits, jpeg artifacts, signature, watermark, username, blurry, chubby, anorectic, bad eyes, old, wrinkled skin, red skin, photograph By bad artist -neg, big eyes, muscular face," 31 | - "beard, EasyNegative, lowres, chromatic aberration, depth of field, motion blur, blurry, bokeh, bad quality, worst quality, multiple arms, badhand" 32 | -------------------------------------------------------------------------------- /configs/prompts/3-RcnzCartoon.yaml: -------------------------------------------------------------------------------- 1 | RcnzCartoon: 2 | base: "" 3 | path: "models/DreamBooth_LoRA/rcnzCartoon3d_v10.safetensors" 4 | motion_module: 5 | - "models/Motion_Module/mm_sd_v14.ckpt" 6 | - "models/Motion_Module/mm_sd_v15.ckpt" 7 | 8 | seed: [16931037867122267877, 2094308009433392066, 4292543217695451092, 15572665120852309890] 9 | steps: 25 10 | guidance_scale: 7.5 11 | 12 | prompt: 13 | - "Jane Eyre with headphones, natural skin texture,4mm,k textures, soft cinematic light, adobe lightroom, photolab, hdr, intricate, elegant, highly detailed, sharp focus, cinematic look, soothing tones, insane details, intricate details, hyperdetailed, low contrast, soft cinematic light, dim colors, exposure blend, hdr, faded" 14 | - "close up Portrait photo of muscular bearded guy in a worn mech suit, light bokeh, intricate, steel metal [rust], elegant, sharp focus, photo by greg rutkowski, soft lighting, vibrant colors, masterpiece, streets, detailed face" 15 
| - "absurdres, photorealistic, masterpiece, a 30 year old man with gold framed, aviator reading glasses and a black hooded jacket and a beard, professional photo, a character portrait, altermodern, detailed eyes, detailed lips, detailed face, grey eyes" 16 | - "a golden labrador, warm vibrant colours, natural lighting, dappled lighting, diffused lighting, absurdres, highres,k, uhd, hdr, rtx, unreal, octane render, RAW photo, photorealistic, global illumination, subsurface scattering" 17 | 18 | n_prompt: 19 | - "deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, mutated hands and fingers, disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation" 20 | - "nude, cross eyed, tongue, open mouth, inside, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, red eyes, muscular" 21 | - "easynegative, cartoon, anime, sketches, necklace, earrings worst quality, low quality, normal quality, bad anatomy, bad hands, shiny skin, error, missing fingers, extra digit, fewer digits, jpeg artifacts, signature, watermark, username, blurry, chubby, anorectic, bad eyes, old, wrinkled skin, red skin, photograph By bad artist -neg, big eyes, muscular face," 22 | - "beard, EasyNegative, lowres, chromatic aberration, depth of field, motion blur, blurry, bokeh, bad quality, worst quality, multiple arms, badhand" 23 | -------------------------------------------------------------------------------- /configs/prompts/4-MajicMix.yaml: -------------------------------------------------------------------------------- 1 | MajicMix: 2 | base: "" 3 | path: "models/DreamBooth_LoRA/majicmixRealistic_v5Preview.safetensors" 4 | motion_module: 5 | - "models/Motion_Module/mm_sd_v14.ckpt" 6 | - "models/Motion_Module/mm_sd_v15.ckpt" 7 | 8 | seed: [1572448948722921032, 1099474677988590681, 6488833139725635347, 18339859844376517918] 9 | steps: 25 10 | guidance_scale: 7.5 11 | 12 | prompt: 13 | - "1girl, offshoulder, light smile, shiny skin best quality, masterpiece, photorealistic" 14 | - "best quality, masterpiece, photorealistic, 1boy, 50 years old beard, dramatic lighting" 15 | - "best quality, masterpiece, photorealistic, 1girl, light smile, shirt with collars, waist up, dramatic lighting, from below" 16 | - "male, man, beard, bodybuilder, skinhead,cold face, tough guy, cowboyshot, tattoo, french windows, luxury hotel masterpiece, best quality, photorealistic" 17 | 18 | n_prompt: 19 | - "ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, watermark, moles" 20 | - "nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome" 21 | - "nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome" 22 | - "nude, nsfw, ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, monochrome, grayscale watermark, moles, people" 23 | -------------------------------------------------------------------------------- /configs/prompts/5-RealisticVision.yaml: -------------------------------------------------------------------------------- 1 | RealisticVision: 2 | base: "" 3 | path: "models/DreamBooth_LoRA/realisticVisionV20_v20.safetensors" 4 | motion_module: 5 | - "models/Motion_Module/mm_sd_v14.ckpt" 6 | - 
"models/Motion_Module/mm_sd_v15.ckpt" 7 | 8 | seed: [5658137986800322009, 12099779162349365895, 10499524853910852697, 16768009035333711932] 9 | steps: 25 10 | guidance_scale: 7.5 11 | 12 | prompt: 13 | - "b&w photo of 42 y.o man in black clothes, bald, face, half body, body, high detailed skin, skin pores, coastline, overcast weather, wind, waves, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 14 | - "close up photo of a rabbit, forest, haze, halation, bloom, dramatic atmosphere, centred, rule of thirds, 200mm 1.4f macro shot" 15 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 16 | - "night, b&w photo of old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain" 17 | 18 | n_prompt: 19 | - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 20 | - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 21 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 22 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, art, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 23 | -------------------------------------------------------------------------------- /configs/prompts/6-Tusun.yaml: -------------------------------------------------------------------------------- 1 | Tusun: 2 | base: "models/DreamBooth_LoRA/moonfilm_reality20.safetensors" 3 | path: "models/DreamBooth_LoRA/TUSUN.safetensors" 4 | motion_module: 5 | - "models/Motion_Module/mm_sd_v14.ckpt" 6 | - "models/Motion_Module/mm_sd_v15.ckpt" 7 | 8 | seed: [10154078483724687116, 2664393535095473805, 4231566096207622938, 1713349740448094493] 9 | steps: 25 10 | guidance_scale: 7.5 11 | lora_alpha: 0.6 12 | 13 | prompt: 14 | - "tusuncub with its mouth open, blurry, open mouth, fangs, photo background, looking at viewer, tongue, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning 
Photography, diffuse reflection, ray tracing" 15 | - "cute tusun with a blurry background, black background, simple background, signature, face, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing" 16 | - "cut tusuncub walking in the snow, blurry, looking at viewer, depth of field, blurry background, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing" 17 | - "character design, cyberpunk tusun kitten wearing astronaut suit, sci-fic, realistic eye color and details, fluffy, big head, science fiction, communist ideology, Cyborg, fantasy, intense angle, soft lighting, photograph, 4k, hyper detailed, portrait wallpaper, realistic, photo-realistic, DSLR, 24 Megapixels, Full Frame, vibrant details, octane render, finely detail, best quality, incredibly absurdres, robotic parts, rim light, vibrant details, luxurious cyberpunk, hyperrealistic, cable electric wires, microchip, full body" 18 | 19 | n_prompt: 20 | - "worst quality, low quality, deformed, distorted, disfigured, bad eyes, bad anatomy, disconnected limbs, wrong body proportions, low quality, worst quality, text, watermark, signatre, logo, illustration, painting, cartoons, ugly, easy_negative" 21 | -------------------------------------------------------------------------------- /configs/prompts/7-FilmVelvia.yaml: -------------------------------------------------------------------------------- 1 | FilmVelvia: 2 | base: "models/DreamBooth_LoRA/majicmixRealistic_v4.safetensors" 3 | path: "models/DreamBooth_LoRA/FilmVelvia2.safetensors" 4 | motion_module: 5 | - "models/Motion_Module/mm_sd_v14.ckpt" 6 | - "models/Motion_Module/mm_sd_v15.ckpt" 7 | 8 | seed: [358675358833372813, 3519455280971923743, 11684545350557985081, 8696855302100399877] 9 | steps: 25 10 | guidance_scale: 7.5 11 | lora_alpha: 0.6 12 | 13 | prompt: 14 | - "a woman standing on the side of a road at night,girl, long hair, motor vehicle, car, looking at viewer, ground vehicle, night, hands in pockets, blurry background, coat, black hair, parted lips, bokeh, jacket, brown hair, outdoors, red lips, upper body, artist name" 15 | - ", dark shot,0mm, portrait quality of a arab man worker,boy, wasteland that stands out vividly against the background of the desert, barren landscape, closeup, moles skin, soft light, sharp, exposure blend, medium shot, bokeh, hdr, high contrast, cinematic, teal and orange5, muted colors, dim colors, soothing tones, low saturation, hyperdetailed, noir" 16 | - "fashion photography portrait of 1girl, offshoulder, fluffy short hair, soft light, rim light, beautiful shadow, low key, photorealistic, raw photo, natural skin texture, realistic eye and face details, hyperrealism, ultra high res, 4K, Best quality, masterpiece, necklace, cleavage, in the dark" 17 | - "In this lighthearted portrait, a woman is dressed as a fierce warrior, armed with an arsenal of paintbrushes and palette knives. Her war paint is composed of thick, vibrant strokes of color, and her armor is made of paint tubes and paint-splattered canvases. 
She stands victoriously atop a mountain of conquered blank canvases, with a beautiful, colorful landscape behind her, symbolizing the power of art and creativity. bust Portrait, close-up, Bright and transparent scene lighting, " 18 | 19 | n_prompt: 20 | - "cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 21 | - "cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 22 | - "wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 23 | - "wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 24 | -------------------------------------------------------------------------------- /configs/prompts/8-GhibliBackground.yaml: -------------------------------------------------------------------------------- 1 | GhibliBackground: 2 | base: "models/DreamBooth_LoRA/CounterfeitV30_25.safetensors" 3 | path: "models/DreamBooth_LoRA/lora_Ghibli_n3.safetensors" 4 | motion_module: 5 | - "models/Motion_Module/mm_sd_v14.ckpt" 6 | - "models/Motion_Module/mm_sd_v15.ckpt" 7 | 8 | seed: [8775748474469046618, 5893874876080607656, 11911465742147695752, 12437784838692000640] 9 | steps: 25 10 | guidance_scale: 7.5 11 | lora_alpha: 1.0 12 | 13 | prompt: 14 | - "best quality,single build,architecture, blue_sky, building,cloudy_sky, day, fantasy, fence, field, house, build,architecture,landscape, moss, outdoors, overgrown, path, river, road, rock, scenery, sky, sword, tower, tree, waterfall" 15 | - "black_border, building, city, day, fantasy, ice, landscape, letterboxed, mountain, ocean, outdoors, planet, scenery, ship, snow, snowing, water, watercraft, waterfall, winter" 16 | - ",mysterious sea area, fantasy,build,concept" 17 | - "Tomb Raider,Scenography,Old building" 18 | 19 | n_prompt: 20 | - "easynegative,bad_construction,bad_structure,bad_wail,bad_windows,blurry,cloned_window,cropped,deformed,disfigured,error,extra_windows,extra_chimney,extra_door,extra_structure,extra_frame,fewer_digits,fused_structure,gross_proportions,jpeg_artifacts,long_roof,low_quality,structure_limbs,missing_windows,missing_doors,missing_roofs,mutated_structure,mutation,normal_quality,out_of_frame,owres,poorly_drawn_structure,poorly_drawn_house,signature,text,too_many_windows,ugly,username,uta,watermark,worst_quality" 21 | -------------------------------------------------------------------------------- /configs/prompts/v2/1-ToonYou-Controlnet.yaml: -------------------------------------------------------------------------------- 1 | ToonYou: 
2 | base: "" 3 | path: "models/DreamBooth_LoRA/toonyou_beta3.safetensors" 4 | inference_config: "configs/inference/inference-v2.yaml" 5 | motion_module: 6 | - "models/Motion_Module/mm_sd_v15_v2.ckpt" 7 | 8 | control: 9 | video_path: "./videos/dance.mp4" # smiling, dance or your video 10 | get_each: 2 # get each frame from video 11 | conditioning_scale: 0.75 # controlnet strength 12 | controlnet_processor: "softedge" # softedge, canny, depth 13 | controlnet_pipeline: "models/StableDiffusion/stable-diffusion-v1-5" 14 | controlnet_processor_path: "models/Controlnet/control_v11p_sd15_softedge" # control_v11p_sd15_softedge, control_v11f1p_sd15_depth, control_v11p_sd15_canny 15 | guess_mode: True 16 | 17 | seed: [10788741199826055526, 6520604954829636163, 6519455744612555650, 16372571278361863751] 18 | steps: 25 19 | guidance_scale: 7.5 20 | 21 | prompt: 22 | - "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress" 23 | - "masterpiece, best quality, 1girl, solo, cherry blossoms, hanami, pink flower, white flower, spring season, wisteria, petals, flower, plum blossoms, outdoors, falling petals, white hair, black eyes," 24 | - "best quality, masterpiece, 1boy, formal, abstract, looking at viewer, masculine, marble pattern" 25 | - "best quality, masterpiece, 1girl, cloudy sky, dandelion, contrapposto, alternate hairstyle," 26 | 27 | n_prompt: 28 | - "" 29 | - "badhandv4,easynegative,ng_deepnegative_v1_75t,verybadimagenegative_v1.3, bad-artist, bad_prompt_version2-neg, teeth" 30 | - "" 31 | - "" 32 | -------------------------------------------------------------------------------- /configs/prompts/v2/2-Lyriel-Controlnet.yaml: -------------------------------------------------------------------------------- 1 | Lyriel: 2 | base: "" 3 | path: "models/DreamBooth_LoRA/lyriel_v16.safetensors" 4 | inference_config: "configs/inference/inference-v2.yaml" 5 | motion_module: 6 | - "models/Motion_Module/mm_sd_v15_v2.ckpt" 7 | 8 | control: 9 | video_path: "./videos/smiling.mp4" # smiling, dance or your video 10 | get_each: 2 # get each frame from video 11 | conditioning_scale: 0.75 # controlnet strength 12 | controlnet_processor: "canny" # softedge, canny, depth 13 | controlnet_pipeline: "models/StableDiffusion/stable-diffusion-v1-5" 14 | controlnet_processor_path: "models/Controlnet/control_v11p_sd15_canny" # control_v11p_sd15_softedge, control_v11f1p_sd15_depth, control_v11p_sd15_canny 15 | guess_mode: True 16 | 17 | seed: [10917152860782582783, 6399018107401806238, 15875751942533906793, 6653196880059936551] 18 | steps: 25 19 | guidance_scale: 7.5 20 | 21 | prompt: 22 | - "dark shot, epic realistic, portrait of halo, sunglasses, blue eyes, tartan scarf, white hair by atey ghailan, by greg rutkowski, by greg tocchini, by james gilleard, by joe fenton, by kaethe butcher, gradient yellow, black, brown and magenta color scheme, grunge aesthetic!!! 
graffiti tag wall background, art by greg rutkowski and artgerm, soft cinematic light, adobe lightroom, photolab, hdr, intricate, highly detailed, depth of field, faded, neutral colors, hdr, muted colors, hyperdetailed, artstation, cinematic, warm lights, dramatic light, intricate details, complex background, rutkowski, teal and orange" 23 | - "A forbidden castle high up in the mountains, pixel art, intricate details2, hdr, intricate details, hyperdetailed5, natural skin texture, hyperrealism, soft light, sharp, game art, key visual, surreal" 24 | - "dark theme, medieval portrait of a man sharp features, grim, cold stare, dark colors, Volumetric lighting, baroque oil painting by Greg Rutkowski, Artgerm, WLOP, Alphonse Mucha dynamic lighting hyperdetailed intricately detailed, hdr, muted colors, complex background, hyperrealism, hyperdetailed, amandine van ray" 25 | - "As I have gone alone in there and with my treasures bold, I can keep my secret where and hint of riches new and old. Begin it where warm waters halt and take it in a canyon down, not far but too far to walk, put in below the home of brown." 26 | 27 | n_prompt: 28 | - "3d, cartoon, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, young, loli, elf, 3d, illustration" 29 | - "3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular" 30 | - "dof, grayscale, black and white, bw, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular,badhandsv5-neg, By bad artist -neg 1, monochrome" 31 | - "holding an item, cowboy, hat, cartoon, 3d, disfigured, bad art, deformed,extra limbs,close up,b&w, wierd colors, blurry, duplicate, morbid, mutilated, [out of frame], extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, ugly, blurry, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, out of frame, ugly, extra limbs, bad anatomy, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, mutated hands, fused fingers, too many fingers, long neck, Photoshop, video game, ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, mutation, mutated, extra limbs, extra legs, extra arms, disfigured, deformed, cross-eye, body out of frame, blurry, bad art, bad anatomy, 3d render" 32 | -------------------------------------------------------------------------------- /configs/prompts/v2/3-RcnzCartoon-Controlnet.yaml: -------------------------------------------------------------------------------- 1 | RcnzCartoon: 2 | base: "" 3 | path: "models/DreamBooth_LoRA/rcnzCartoon3d_v10.safetensors" 4 | inference_config: "configs/inference/inference-v2.yaml" 5 | motion_module: 6 | - "models/Motion_Module/mm_sd_v15_v2.ckpt" 7 | 8 | control: 9 | video_path: "./videos/smiling.mp4" # smiling, dance or your video 10 | get_each: 2 # get each frame from video 11 | conditioning_scale: 0.75 # controlnet strength 12 | controlnet_processor: "softedge" # softedge, canny, depth 13 | controlnet_pipeline: "models/StableDiffusion/stable-diffusion-v1-5" 14 | controlnet_processor_path: 
"models/Controlnet/control_v11p_sd15_softedge" # control_v11p_sd15_softedge, control_v11p_sd15_canny, control_v11f1p_sd15_depth 15 | guess_mode: True 16 | 17 | seed: [16931037867122267877, 2094308009433392066, 4292543217695451092, 15572665120852309890] 18 | steps: 25 19 | guidance_scale: 7.5 20 | 21 | prompt: 22 | - "Jane Eyre with headphones, natural skin texture,4mm,k textures, soft cinematic light, adobe lightroom, photolab, hdr, intricate, elegant, highly detailed, sharp focus, cinematic look, soothing tones, insane details, intricate details, hyperdetailed, low contrast, soft cinematic light, dim colors, exposure blend, hdr, faded" 23 | - "close up Portrait photo of muscular bearded guy in a worn mech suit, light bokeh, intricate, steel metal [rust], elegant, sharp focus, photo by greg rutkowski, soft lighting, vibrant colors, masterpiece, streets, detailed face" 24 | - "absurdres, photorealistic, masterpiece, a 30 year old man with gold framed, aviator reading glasses and a black hooded jacket and a beard, professional photo, a character portrait, altermodern, detailed eyes, detailed lips, detailed face, grey eyes" 25 | - "a golden labrador, warm vibrant colours, natural lighting, dappled lighting, diffused lighting, absurdres, highres,k, uhd, hdr, rtx, unreal, octane render, RAW photo, photorealistic, global illumination, subsurface scattering" 26 | 27 | n_prompt: 28 | - "deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, mutated hands and fingers, disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation" 29 | - "nude, cross eyed, tongue, open mouth, inside, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, red eyes, muscular" 30 | - "easynegative, cartoon, anime, sketches, necklace, earrings worst quality, low quality, normal quality, bad anatomy, bad hands, shiny skin, error, missing fingers, extra digit, fewer digits, jpeg artifacts, signature, watermark, username, blurry, chubby, anorectic, bad eyes, old, wrinkled skin, red skin, photograph By bad artist -neg, big eyes, muscular face," 31 | - "beard, EasyNegative, lowres, chromatic aberration, depth of field, motion blur, blurry, bokeh, bad quality, worst quality, multiple arms, badhand" 32 | -------------------------------------------------------------------------------- /configs/prompts/v2/5-RealisticVision-Controlnet.yaml: -------------------------------------------------------------------------------- 1 | RealisticVision: 2 | base: "" 3 | path: "models/DreamBooth_LoRA/realisticVisionV51_v20Novae.safetensors" 4 | 5 | inference_config: "configs/inference/inference-v2.yaml" 6 | motion_module: 7 | - "models/Motion_Module/mm_sd_v15_v2.ckpt" 8 | 9 | control: 10 | video_path: "./videos/smiling.mp4" # smiling, dance or your video 11 | get_each: 2 # get each frame from video 12 | conditioning_scale: 0.75 # controlnet strength 13 | controlnet_processor: "depth" # softedge, canny, depth 14 | controlnet_pipeline: "models/StableDiffusion/stable-diffusion-v1-5" 15 | controlnet_processor_path: "models/Controlnet/control_v11p_sd15_softedge" # control_v11p_sd15_softedge, control_v11f1p_sd15_depth, control_v11p_sd15_canny 16 | guess_mode: True 17 | 18 | seed: [13100322578370451493, 14752961627088720670, 9329399085567825781, 16987697414827649302] 19 | steps: 25 20 | guidance_scale: 7.5 21 | 22 | prompt: 23 | - "b&w photo of 42 y.o man in black 
clothes, bald, face, half body, body, high detailed skin, skin pores, coastline, overcast weather, wind, waves, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 24 | - "close up photo of a rabbit, forest, haze, halation, bloom, dramatic atmosphere, centred, rule of thirds, 200mm 1.4f macro shot" 25 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 26 | - "night, b&w photo of old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain" 27 | 28 | n_prompt: 29 | - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 30 | - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 31 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 32 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, art, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 33 | -------------------------------------------------------------------------------- /configs/prompts/v2/5-RealisticVision.yaml: -------------------------------------------------------------------------------- 1 | RealisticVision: 2 | base: "" 3 | path: "models/DreamBooth_LoRA/realisticVisionV20_v20.safetensors" 4 | 5 | inference_config: "configs/inference/inference-v2.yaml" 6 | motion_module: 7 | - "models/Motion_Module/mm_sd_v15_v2.ckpt" 8 | 9 | seed: [13100322578370451493, 14752961627088720670, 9329399085567825781, 16987697414827649302] 10 | steps: 25 11 | guidance_scale: 7.5 12 | 13 | prompt: 14 | - "b&w photo of 42 y.o man in black clothes, bald, face, half body, body, high detailed skin, skin pores, coastline, overcast weather, wind, waves, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 15 | - "close up photo of a rabbit, forest, haze, halation, bloom, dramatic atmosphere, centred, rule of thirds, 200mm 1.4f macro shot" 16 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 17 | - "night, b&w photo of old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain" 18 
| 19 | n_prompt: 20 | - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 21 | - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 22 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 23 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, art, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 24 | -------------------------------------------------------------------------------- /configs/training/image_finetune.yaml: -------------------------------------------------------------------------------- 1 | image_finetune: true 2 | 3 | output_dir: "outputs" 4 | pretrained_model_path: "models/StableDiffusion/stable-diffusion-v1-5" 5 | 6 | noise_scheduler_kwargs: 7 | num_train_timesteps: 1000 8 | beta_start: 0.00085 9 | beta_end: 0.012 10 | beta_schedule: "scaled_linear" 11 | steps_offset: 1 12 | clip_sample: false 13 | 14 | train_data: 15 | csv_path: "/mnt/petrelfs/guoyuwei/projects/datasets/webvid/results_2M_val.csv" 16 | video_folder: "/mnt/petrelfs/guoyuwei/projects/datasets/webvid/2M_val" 17 | sample_size: 256 18 | 19 | validation_data: 20 | prompts: 21 | - "Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons." 22 | - "A drone view of celebration with Christmas tree and fireworks, starry sky - background." 23 | - "Robot dancing in times square." 24 | - "Pacific coast, carmel by the sea ocean and waves." 25 | num_inference_steps: 25 26 | guidance_scale: 8. 27 | 28 | trainable_modules: 29 | - "."
30 | 31 | unet_checkpoint_path: "" 32 | 33 | learning_rate: 1.e-5 34 | train_batch_size: 50 35 | 36 | max_train_epoch: -1 37 | max_train_steps: 100 38 | checkpointing_epochs: -1 39 | checkpointing_steps: 60 40 | 41 | validation_steps: 5000 42 | validation_steps_tuple: [2, 50] 43 | 44 | global_seed: 42 45 | mixed_precision_training: true 46 | enable_xformers_memory_efficient_attention: True 47 | 48 | is_debug: False 49 | -------------------------------------------------------------------------------- /configs/training/training.yaml: -------------------------------------------------------------------------------- 1 | image_finetune: false 2 | 3 | output_dir: "outputs" 4 | pretrained_model_path: "models/StableDiffusion/stable-diffusion-v1-5" 5 | 6 | unet_additional_kwargs: 7 | use_motion_module : true 8 | motion_module_resolutions : [ 1,2,4,8 ] 9 | unet_use_cross_frame_attention : false 10 | unet_use_temporal_attention : false 11 | 12 | motion_module_type: Vanilla 13 | motion_module_kwargs: 14 | num_attention_heads : 8 15 | num_transformer_block : 1 16 | attention_block_types : [ "Temporal_Self", "Temporal_Self" ] 17 | temporal_position_encoding : true 18 | temporal_position_encoding_max_len : 24 19 | temporal_attention_dim_div : 1 20 | zero_initialize : true 21 | 22 | noise_scheduler_kwargs: 23 | num_train_timesteps: 1000 24 | beta_start: 0.00085 25 | beta_end: 0.012 26 | beta_schedule: "linear" 27 | steps_offset: 1 28 | clip_sample: false 29 | 30 | train_data: 31 | csv_path: "/mnt/petrelfs/guoyuwei/projects/datasets/webvid/results_2M_val.csv" 32 | video_folder: "/mnt/petrelfs/guoyuwei/projects/datasets/webvid/2M_val" 33 | sample_size: 256 34 | sample_stride: 4 35 | sample_n_frames: 16 36 | 37 | validation_data: 38 | prompts: 39 | - "Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons." 40 | - "A drone view of celebration with Christmas tree and fireworks, starry sky - background." 41 | - "Robot dancing in times square." 42 | - "Pacific coast, carmel by the sea ocean and waves." 43 | num_inference_steps: 25 44 | guidance_scale: 8. 45 | 46 | trainable_modules: 47 | - "motion_modules."
48 | 49 | unet_checkpoint_path: "" 50 | 51 | learning_rate: 1.e-4 52 | train_batch_size: 4 53 | 54 | max_train_epoch: -1 55 | max_train_steps: 100 56 | checkpointing_epochs: -1 57 | checkpointing_steps: 60 58 | 59 | validation_steps: 5000 60 | validation_steps_tuple: [2, 50] 61 | 62 | global_seed: 42 63 | mixed_precision_training: true 64 | enable_xformers_memory_efficient_attention: True 65 | 66 | is_debug: False 67 | -------------------------------------------------------------------------------- /download_bashscripts/0-MotionModule.sh: -------------------------------------------------------------------------------- 1 | gdown 1RqkQuGPaCO5sGZ6V6KZ-jUWmsRu48Kdq -O models/Motion_Module/ 2 | gdown 1ql0g_Ys4UCz2RnokYlBjyOYPbttbIpbu -O models/Motion_Module/ -------------------------------------------------------------------------------- /download_bashscripts/1-ToonYou.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/78775 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /download_bashscripts/2-Lyriel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/72396 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /download_bashscripts/3-RcnzCartoon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/71009 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /download_bashscripts/4-MajicMix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/79068 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /download_bashscripts/5-RealisticVision.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/29460 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /download_bashscripts/6-Tusun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/97261 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 3 | wget https://civitai.com/api/download/models/50705 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 4 | -------------------------------------------------------------------------------- /download_bashscripts/7-FilmVelvia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/90115 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 3 | wget https://civitai.com/api/download/models/92475 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 4 | -------------------------------------------------------------------------------- /download_bashscripts/8-GhibliBackground.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/102828 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 3 | wget https://civitai.com/api/download/models/57618 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 4 | -------------------------------------------------------------------------------- /download_bashscripts/9-Controlnets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | git clone https://huggingface.co/lllyasviel/control_v11p_sd15_softedge models/Controlnet/control_v11p_sd15_softedge 3 | git clone https://huggingface.co/lllyasviel/control_v11f1p_sd15_depth models/Controlnet/control_v11f1p_sd15_depth 4 | git clone https://huggingface.co/lllyasviel/control_v11p_sd15_canny models/Controlnet/control_v11p_sd15_canny -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: animatediff 2 | channels: 3 | - pytorch 4 | - nvidia 5 | dependencies: 6 | - python=3.10 7 | - pytorch=2.0.1 8 | - torchvision=0.15.2 9 | # - torchaudio=0.13.1 10 | # - pytorch-cuda=11.7 11 | - pip 12 | - pip: 13 | - diffusers==0.20.2 14 | - transformers==4.32.1 15 | - xformers==0.0.21 16 | - controlnet-aux==0.0.6 17 | - imageio==2.27.0 18 | - imageio[ffmpeg] 19 | - decord==0.6.0 20 | - gdown 21 | - einops 22 | - omegaconf 23 | - safetensors 24 | - gradio 25 | - wandb 26 | -------------------------------------------------------------------------------- /models/Controlnet/Put controlnet models repo here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/models/Controlnet/Put controlnet models repo here.txt -------------------------------------------------------------------------------- /models/DreamBooth_LoRA/Put personalized T2I checkpoints here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/models/DreamBooth_LoRA/Put personalized T2I checkpoints here.txt -------------------------------------------------------------------------------- /models/Motion_Module/Put motion module checkpoints here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/models/Motion_Module/Put motion module checkpoints here.txt -------------------------------------------------------------------------------- /models/StableDiffusion/Put diffusers stable-diffusion-v1-5 repo here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/models/StableDiffusion/Put diffusers stable-diffusion-v1-5 repo here.txt -------------------------------------------------------------------------------- /scripts/animate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import inspect 4 | import os 5 | 6 | import torch 7 | from omegaconf import OmegaConf 8 | from diffusers import AutoencoderKL, DDIMScheduler 9 | from 
transformers import CLIPTextModel, CLIPTokenizer 10 | 11 | from animatediff.models.unet import UNet3DConditionModel 12 | from animatediff.pipelines.pipeline_animation import AnimationPipeline 13 | from animatediff.utils.util import save_videos_grid 14 | from animatediff.utils.convert_from_ckpt import convert_ldm_unet_checkpoint, convert_ldm_clip_checkpoint, convert_ldm_vae_checkpoint 15 | from animatediff.utils.convert_lora_safetensor_to_diffusers import convert_lora 16 | from animatediff.controlnet.controlnet_module import ControlnetModule 17 | from diffusers.utils.import_utils import is_xformers_available 18 | 19 | from safetensors import safe_open 20 | from pathlib import Path 21 | 22 | 23 | def main(args): 24 | *_, func_args = inspect.getargvalues(inspect.currentframe()) 25 | func_args = dict(func_args) 26 | 27 | time_str = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") 28 | savedir = f"samples/{Path(args.config).stem}-{time_str}" 29 | os.makedirs(savedir) 30 | 31 | config = OmegaConf.load(args.config) 32 | samples = [] 33 | 34 | sample_idx = 0 35 | for model_idx, (config_key, model_config) in enumerate(list(config.items())): 36 | 37 | motion_modules = model_config.motion_module 38 | motion_modules = [motion_modules] if isinstance(motion_modules, str) else list(motion_modules) 39 | for motion_module in motion_modules: 40 | inference_config = OmegaConf.load(model_config.get("inference_config", args.inference_config)) 41 | 42 | ### >>> create validation pipeline >>> ### 43 | tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_path, subfolder="tokenizer") 44 | text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_path, subfolder="text_encoder") 45 | vae = AutoencoderKL.from_pretrained(args.pretrained_model_path, subfolder="vae") 46 | unet = UNet3DConditionModel.from_pretrained_2d(args.pretrained_model_path, subfolder="unet", unet_additional_kwargs=OmegaConf.to_container(inference_config.unet_additional_kwargs)) 47 | 48 | if is_xformers_available(): unet.enable_xformers_memory_efficient_attention() 49 | else: assert False 50 | 51 | pipeline = AnimationPipeline( 52 | vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, 53 | scheduler=DDIMScheduler(**OmegaConf.to_container(inference_config.noise_scheduler_kwargs)), 54 | ).to("cuda") 55 | 56 | # 1. 
unet ckpt 57 | # 1.1 motion module 58 | motion_module_state_dict = torch.load(motion_module, map_location="cpu") 59 | if "global_step" in motion_module_state_dict: func_args.update({"global_step": motion_module_state_dict["global_step"]}) 60 | missing, unexpected = pipeline.unet.load_state_dict(motion_module_state_dict, strict=False) 61 | assert len(unexpected) == 0 62 | 63 | # 1.2 T2I 64 | if model_config.path != "": 65 | if model_config.path.endswith(".ckpt"): 66 | state_dict = torch.load(model_config.path) 67 | pipeline.unet.load_state_dict(state_dict) 68 | 69 | elif model_config.path.endswith(".safetensors"): 70 | state_dict = {} 71 | with safe_open(model_config.path, framework="pt", device="cpu") as f: 72 | for key in f.keys(): 73 | state_dict[key] = f.get_tensor(key) 74 | 75 | is_lora = all("lora" in k for k in state_dict.keys()) 76 | if not is_lora: 77 | base_state_dict = state_dict 78 | else: 79 | base_state_dict = {} 80 | with safe_open(model_config.base, framework="pt", device="cpu") as f: 81 | for key in f.keys(): 82 | base_state_dict[key] = f.get_tensor(key) 83 | 84 | # vae 85 | converted_vae_checkpoint = convert_ldm_vae_checkpoint(base_state_dict, pipeline.vae.config) 86 | pipeline.vae.load_state_dict(converted_vae_checkpoint) 87 | # unet 88 | converted_unet_checkpoint = convert_ldm_unet_checkpoint(base_state_dict, pipeline.unet.config) 89 | pipeline.unet.load_state_dict(converted_unet_checkpoint, strict=False) 90 | # text_model 91 | pipeline.text_encoder = convert_ldm_clip_checkpoint(base_state_dict) 92 | 93 | # import pdb 94 | # pdb.set_trace() 95 | if is_lora: 96 | pipeline = convert_lora(pipeline, state_dict, alpha=model_config.lora_alpha) 97 | 98 | pipeline.to("cuda") 99 | ### <<< create validation pipeline <<< ### 100 | 101 | down_features, mid_features = None, None 102 | controlnet = None 103 | if 'control' in model_config: 104 | controlnet_config = { 105 | 'video_length': args.L, 106 | 'img_h': args.H, 107 | 'img_w': args.W, 108 | 'guidance_scale': model_config.guidance_scale, 109 | 'steps': model_config.steps, 110 | 'device': 'cuda', 111 | **model_config.control 112 | } 113 | controlnet = ControlnetModule(controlnet_config) 114 | 115 | prompts = model_config.prompt 116 | n_prompts = list(model_config.n_prompt) * len(prompts) if len(model_config.n_prompt) == 1 else model_config.n_prompt 117 | 118 | random_seeds = model_config.get("seed", [-1]) 119 | random_seeds = [random_seeds] if isinstance(random_seeds, int) else list(random_seeds) 120 | random_seeds = random_seeds * len(prompts) if len(random_seeds) == 1 else random_seeds 121 | 122 | config[config_key].random_seed = [] 123 | for prompt_idx, (prompt, n_prompt, random_seed) in enumerate(zip(prompts, n_prompts, random_seeds)): 124 | 125 | # manually set random seed for reproduction 126 | if random_seed != -1: torch.manual_seed(random_seed) 127 | else: torch.seed() 128 | config[config_key].random_seed.append(torch.initial_seed()) 129 | 130 | if controlnet is not None: 131 | down_features, mid_features = controlnet(model_config.control.video_path, prompt, n_prompt, random_seed) 132 | 133 | print(f"current seed: {torch.initial_seed()}") 134 | print(f"sampling {prompt} ...") 135 | sample = pipeline( 136 | prompt, 137 | negative_prompt = n_prompt, 138 | num_inference_steps = model_config.steps, 139 | guidance_scale = model_config.guidance_scale, 140 | width = args.W, 141 | height = args.H, 142 | video_length = args.L, 143 | down_block_control = down_features, 144 | mid_block_control = mid_features, 145 | ).videos 146 | 
samples.append(sample) 147 | 148 | prompt = "-".join((prompt.replace("/", "").split(" ")[:10])) 149 | save_videos_grid(sample, f"{savedir}/sample/{sample_idx}-{prompt}.gif") 150 | print(f"save to {savedir}/sample/{sample_idx}-{prompt}.gif") 151 | 152 | sample_idx += 1 153 | 154 | samples = torch.concat(samples) 155 | save_videos_grid(samples, f"{savedir}/sample.gif", n_rows=4) 156 | 157 | OmegaConf.save(config, f"{savedir}/config.yaml") 158 | 159 | 160 | if __name__ == "__main__": 161 | parser = argparse.ArgumentParser() 162 | parser.add_argument("--pretrained_model_path", type=str, default="models/StableDiffusion/stable-diffusion-v1-5",) 163 | parser.add_argument("--inference_config", type=str, default="configs/inference/inference-v1.yaml") 164 | parser.add_argument("--config", type=str, required=True) 165 | 166 | parser.add_argument("--L", type=int, default=16 ) 167 | parser.add_argument("--W", type=int, default=512) 168 | parser.add_argument("--H", type=int, default=512) 169 | 170 | args = parser.parse_args() 171 | main(args) 172 | -------------------------------------------------------------------------------- /videos/Put your short videos here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/videos/Put your short videos here.txt -------------------------------------------------------------------------------- /videos/dance.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/videos/dance.mp4 -------------------------------------------------------------------------------- /videos/smiling.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheDenk/ControledAnimateDiff/8f2aa3ffec2f2dc456537120db78fdb0f36cb5ec/videos/smiling.mp4 --------------------------------------------------------------------------------
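For convenience, the snippet below is a minimal end-to-end usage sketch for the ControlNet-guided v2 config shown above. It only combines pieces already in the repository (environment.yaml, the download scripts, and the argparse defaults of scripts/animate.py); the `python -m scripts.animate` invocation and running everything from the repository root are assumptions based on the package-style imports in the script, and the diffusers stable-diffusion-v1-5 repo is assumed to have been placed under models/StableDiffusion/ as the placeholder file indicates.

# Create and activate the conda environment defined in environment.yaml (env name: animatediff).
conda env create -f environment.yaml
conda activate animatediff

# Fetch the motion modules, the RealisticVision checkpoint, and the three ControlNet repos
# into the default model folders referenced by the configs.
bash download_bashscripts/0-MotionModule.sh
bash download_bashscripts/5-RealisticVision.sh
bash download_bashscripts/9-Controlnets.sh

# ControlNet-guided animation with the v2 motion module; control frames are taken from
# videos/smiling.mp4 as set in the config's control section, and --L/--W/--H restate the defaults.
python -m scripts.animate \
    --config configs/prompts/v2/5-RealisticVision-Controlnet.yaml \
    --L 16 --W 512 --H 512

The control block is optional (scripts/animate.py only builds a ControlnetModule when it is present), so the same command with configs/prompts/v2/5-RealisticVision.yaml should sample without ControlNet guidance.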