├── AnimateDiff ├── .gitignore ├── LICENSE.txt ├── README.md ├── __assets__ │ ├── animations │ │ ├── compare │ │ │ ├── ffmpeg │ │ │ ├── new_0.gif │ │ │ ├── new_1.gif │ │ │ ├── new_2.gif │ │ │ ├── new_3.gif │ │ │ ├── old_0.gif │ │ │ ├── old_1.gif │ │ │ ├── old_2.gif │ │ │ └── old_3.gif │ │ ├── model_01 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ └── 04.gif │ │ ├── model_02 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ └── 04.gif │ │ ├── model_03 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ └── 04.gif │ │ ├── model_04 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ └── 04.gif │ │ ├── model_05 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ └── 04.gif │ │ ├── model_06 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ └── 04.gif │ │ ├── model_07 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ ├── 04.gif │ │ │ └── init.jpg │ │ ├── model_08 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ └── 04.gif │ │ ├── motion_lora │ │ │ ├── model_01 │ │ │ │ ├── 01.gif │ │ │ │ ├── 02.gif │ │ │ │ ├── 03.gif │ │ │ │ ├── 04.gif │ │ │ │ ├── 05.gif │ │ │ │ ├── 06.gif │ │ │ │ ├── 07.gif │ │ │ │ └── 08.gif │ │ │ └── model_02 │ │ │ │ ├── 01.gif │ │ │ │ ├── 02.gif │ │ │ │ ├── 03.gif │ │ │ │ ├── 04.gif │ │ │ │ ├── 05.gif │ │ │ │ ├── 06.gif │ │ │ │ ├── 07.gif │ │ │ │ └── 08.gif │ │ ├── motion_xl │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ └── 03.gif │ │ └── v3 │ │ │ ├── animation_fireworks.gif │ │ │ ├── animation_sunset.gif │ │ │ ├── sketch_boy.gif │ │ │ └── sketch_city.gif │ ├── demos │ │ ├── image │ │ │ ├── RealisticVision_firework.png │ │ │ ├── RealisticVision_sunset.png │ │ │ ├── interpolation_1.png │ │ │ ├── interpolation_2.png │ │ │ ├── low_fps_1.png │ │ │ ├── low_fps_2.png │ │ │ ├── low_fps_3.png │ │ │ ├── low_fps_4.png │ │ │ ├── painting.png │ │ │ ├── prediction_1.png │ │ │ ├── prediction_2.png │ │ │ ├── prediction_3.png │ │ │ └── prediction_4.png │ │ └── scribble │ │ │ ├── scribble_1.png │ │ │ ├── scribble_2_1.png │ │ │ ├── scribble_2_2.png │ │ │ ├── scribble_2_3.png │ │ │ └── scribble_2_readme.png │ ├── docs │ │ ├── animatediff.md │ │ └── gallery.md │ └── figs │ │ ├── adapter_explain.png │ │ └── gradio.jpg ├── animatediff │ ├── data │ │ └── dataset.py │ ├── models │ │ ├── attention.py │ │ ├── motion_module.py │ │ ├── resnet.py │ │ ├── sparse_controlnet.py │ │ ├── unet.py │ │ └── unet_blocks.py │ ├── pipelines │ │ └── pipeline_animation.py │ └── utils │ │ ├── convert_from_ckpt.py │ │ ├── convert_lora_safetensor_to_diffusers.py │ │ └── util.py ├── app.py ├── configs │ ├── inference │ │ ├── inference-v1.yaml │ │ ├── inference-v2.yaml │ │ ├── inference-v3.yaml │ │ └── sparsectrl │ │ │ ├── image_condition.yaml │ │ │ └── latent_condition.yaml │ ├── prompts │ │ ├── v1 │ │ │ ├── v1-1-ToonYou.yaml │ │ │ ├── v1-2-Lyriel.yaml │ │ │ ├── v1-3-RcnzCartoon.yaml │ │ │ ├── v1-4-MajicMix.yaml │ │ │ ├── v1-5-RealisticVision.yaml │ │ │ ├── v1-6-Tusun.yaml │ │ │ ├── v1-7-FilmVelvia.yaml │ │ │ └── v1-8-GhibliBackground.yaml │ │ ├── v2 │ │ │ ├── v2-1-RealisticVision.yaml │ │ │ └── v2-2-RealisticVision-MotionLoRA.yaml │ │ └── v3 │ │ │ ├── v3-1-T2V.yaml │ │ │ ├── v3-2-animation-RealisticVision.yaml │ │ │ └── v3-3-sketch-RealisticVision.yaml │ └── training │ │ └── v1 │ │ ├── image_finetune.yaml │ │ ├── motion_adapter_training.yaml │ │ ├── spatial_adapter_training.yaml │ │ └── training.yaml ├── convert_to_safetensors.py ├── dataset.py ├── download_bashscripts │ ├── 0-MotionModule.sh │ ├── 1-ToonYou.sh │ ├── 2-Lyriel.sh │ ├── 3-RcnzCartoon.sh │ ├── 4-MajicMix.sh │ 
├── 5-RealisticVision.sh │ ├── 6-Tusun.sh │ ├── 7-FilmVelvia.sh │ └── 8-GhibliBackground.sh ├── environment.yaml ├── models │ ├── DreamBooth_LoRA │ │ └── Put personalized T2I checkpoints here.txt │ ├── MotionLoRA │ │ └── Put MotionLoRA checkpoints here.txt │ └── Motion_Module │ │ └── Put motion module checkpoints here.txt ├── scripts │ └── animate.py ├── some_dict.txt ├── train.py ├── train_still_moving.py └── wget-log ├── README.md ├── adapters └── motion_adapter.py ├── results └── 0.gif └── wget-log /AnimateDiff/.gitignore: -------------------------------------------------------------------------------- 1 | wandb/ 2 | *debug* 3 | debugs/ 4 | outputs/ 5 | samples/ 6 | __pycache__/ 7 | ossutil_output/ 8 | .ossutil_checkpoint/ 9 | 10 | scripts/* 11 | !scripts/animate.py 12 | 13 | *.ipynb 14 | *.safetensors 15 | *.ckpt 16 | 17 | models/* 18 | !models/StableDiffusion/ 19 | models/StableDiffusion/* 20 | !models/StableDiffusion/*.txt 21 | !models/Motion_Module/ 22 | !models/Motion_Module/*.txt 23 | !models/DreamBooth_LoRA/ 24 | !models/DreamBooth_LoRA/*.txt 25 | !models/MotionLoRA/ 26 | !models/MotionLoRA/*.txt 27 | outputs 28 | models 29 | dataset 30 | -------------------------------------------------------------------------------- /AnimateDiff/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/ffmpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/ffmpeg -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/new_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/new_0.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/new_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/new_1.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/new_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/new_2.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/new_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/new_3.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/old_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/old_0.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/old_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/old_1.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/old_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/old_2.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/old_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/old_3.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_01/01.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_01/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_01/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_01/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_01/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_01/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_01/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_01/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_02/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_02/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_02/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_02/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_02/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_02/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_02/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_02/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_03/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_03/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_03/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_03/02.gif -------------------------------------------------------------------------------- 
/AnimateDiff/__assets__/animations/model_03/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_03/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_03/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_03/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_04/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_04/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_04/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_04/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_04/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_04/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_04/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_04/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_05/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_05/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_05/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_05/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_05/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_05/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_05/04.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_05/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_06/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_06/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_06/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_06/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_06/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_06/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_06/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_06/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_07/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_07/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_07/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_07/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_07/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_07/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_07/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_07/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_07/init.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_07/init.jpg -------------------------------------------------------------------------------- 
/AnimateDiff/__assets__/animations/model_08/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_08/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_08/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_08/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_08/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_08/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_08/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_08/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/05.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/05.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/06.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/06.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/07.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/07.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/08.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/08.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/05.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/05.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/06.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/06.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/07.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/07.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/08.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/08.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_xl/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_xl/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_xl/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_xl/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_xl/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_xl/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/v3/animation_fireworks.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/v3/animation_fireworks.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/v3/animation_sunset.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/v3/animation_sunset.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/v3/sketch_boy.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/v3/sketch_boy.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/v3/sketch_city.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/v3/sketch_city.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/RealisticVision_firework.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/RealisticVision_firework.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/RealisticVision_sunset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/RealisticVision_sunset.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/interpolation_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/interpolation_1.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/interpolation_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/interpolation_2.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/low_fps_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/low_fps_1.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/low_fps_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/low_fps_2.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/low_fps_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/low_fps_3.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/low_fps_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/low_fps_4.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/painting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/painting.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/prediction_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/prediction_1.png 
-------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/prediction_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/prediction_2.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/prediction_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/prediction_3.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/prediction_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/prediction_4.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/scribble/scribble_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/scribble/scribble_1.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/scribble/scribble_2_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/scribble/scribble_2_1.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/scribble/scribble_2_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/scribble/scribble_2_2.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/scribble/scribble_2_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/scribble/scribble_2_3.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/scribble/scribble_2_readme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/scribble/scribble_2_readme.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/docs/animatediff.md: -------------------------------------------------------------------------------- 1 | # AnimateDiff: training and inference setup 2 | ## Setups for Inference 3 | 4 | ### Prepare Environment 5 | 6 | ***We updated our inference code with xformers and a sequential decoding trick. 
Now AnimateDiff takes only ~12GB of VRAM for inference and runs on a single RTX 3090!*** 7 | 8 | ``` 9 | git clone https://github.com/guoyww/AnimateDiff.git 10 | cd AnimateDiff 11 | 12 | conda env create -f environment.yaml 13 | conda activate animatediff 14 | ``` 15 | 16 | ### Download Base T2I & Motion Module Checkpoints 17 | We provide two versions of our Motion Module, trained on stable-diffusion-v1-4 and finetuned on v1-5, respectively. 18 | We recommend trying both of them for the best results. 19 | ``` 20 | git lfs install 21 | git clone https://huggingface.co/runwayml/stable-diffusion-v1-5 models/StableDiffusion/ 22 | 23 | bash download_bashscripts/0-MotionModule.sh 24 | ``` 25 | You may also download the motion module checkpoints directly from [Google Drive](https://drive.google.com/drive/folders/1EqLC65eR1-W-sGD0Im7fkED6c8GkiNFI?usp=sharing) / [HuggingFace](https://huggingface.co/guoyww/animatediff) / [CivitAI](https://civitai.com/models/108836/animatediff-motion-modules), then put them in the `models/Motion_Module/` folder. 26 | 27 | ### Prepare Personalized T2I 28 | Here we provide inference configs for eight demo personalized T2I models from CivitAI. 29 | You may run the following bash scripts to download these checkpoints. 30 | ``` 31 | bash download_bashscripts/1-ToonYou.sh 32 | bash download_bashscripts/2-Lyriel.sh 33 | bash download_bashscripts/3-RcnzCartoon.sh 34 | bash download_bashscripts/4-MajicMix.sh 35 | bash download_bashscripts/5-RealisticVision.sh 36 | bash download_bashscripts/6-Tusun.sh 37 | bash download_bashscripts/7-FilmVelvia.sh 38 | bash download_bashscripts/8-GhibliBackground.sh 39 | ``` 40 | 41 | ### Inference 42 | After downloading the personalized T2I checkpoints above, run the following commands to generate animations. The results will be saved to the `samples/` folder automatically. 43 | ``` 44 | python -m scripts.animate --config configs/prompts/1-ToonYou.yaml 45 | python -m scripts.animate --config configs/prompts/2-Lyriel.yaml 46 | python -m scripts.animate --config configs/prompts/3-RcnzCartoon.yaml 47 | python -m scripts.animate --config configs/prompts/4-MajicMix.yaml 48 | python -m scripts.animate --config configs/prompts/5-RealisticVision.yaml 49 | python -m scripts.animate --config configs/prompts/6-Tusun.yaml 50 | python -m scripts.animate --config configs/prompts/7-FilmVelvia.yaml 51 | python -m scripts.animate --config configs/prompts/8-GhibliBackground.yaml 52 | ``` 53 | 54 | To generate animations with a new DreamBooth/LoRA model, you may create a new `.yaml` config file in the following format: 55 | ``` 56 | - inference_config: "[path to motion module config file]" 57 | 58 | motion_module: 59 | - "models/Motion_Module/mm_sd_v14.ckpt" 60 | - "models/Motion_Module/mm_sd_v15.ckpt" 61 | 62 | motion_module_lora_configs: 63 | - path: "[path to MotionLoRA model]" 64 | alpha: 1.0 65 | - ... 66 | 67 | dreambooth_path: "[path to your DreamBooth model .safetensors file]" 68 | lora_model_path: "[path to your LoRA model .safetensors file; leave it as an empty string if not needed]" 69 | 70 | steps: 25 71 | guidance_scale: 7.5 72 | 73 | prompt: 74 | - "[positive prompt]" 75 | 76 | n_prompt: 77 | - "[negative prompt]" 78 | ``` 79 | Then run the following command: 80 | ``` 81 | python -m scripts.animate --config [path to the config file] 82 | ``` 83 | 84 |
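Since the prompt config above is a YAML list (each item describes one model setup), it can be inspected programmatically before a run. Below is a minimal sketch assuming the file is parsed with OmegaConf; the path `configs/prompts/my_model.yaml` is purely illustrative, and this is not necessarily how `scripts/animate.py` itself consumes the config:

```python
from omegaconf import OmegaConf

# Load the prompt config; the file is a YAML list, so each item is one run setup.
config = OmegaConf.load("configs/prompts/my_model.yaml")  # hypothetical path

for model_config in config:
    # Each positive prompt is paired with a negative prompt and shares the sampling settings.
    for prompt, n_prompt in zip(model_config.prompt, model_config.n_prompt):
        print(
            f"motion modules: {list(model_config.motion_module)} | "
            f"steps: {model_config.steps}, cfg: {model_config.guidance_scale}"
        )
        print(f"  prompt: {prompt}")
        print(f"  negative: {n_prompt}")
```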
85 | ## Steps for Training 86 | 87 | ### Dataset 88 | Before training, download the video files and the `.csv` annotations of [WebVid10M](https://maxbain.com/webvid-dataset/) to your local machine. 89 | Note that our exemplar training script requires all the videos to be saved in a single folder. You may change this by modifying `animatediff/data/dataset.py`. 90 | 91 | ### Configuration 92 | After preparing the dataset, update the data paths below in the `.yaml` config files in the `configs/training/` folder: 93 | ``` 94 | train_data: 95 | csv_path: [Replace with .csv Annotation File Path] 96 | video_folder: [Replace with Video Folder Path] 97 | sample_size: 256 98 | ``` 99 | Other training parameters (learning rate, epochs, validation settings, etc.) are also included in the config files. 100 | 101 | ### Training 102 | To finetune the UNet's image layers: 103 | ``` 104 | torchrun --nnodes=1 --nproc_per_node=1 train.py --config configs/training/v1/image_finetune.yaml 105 | ``` 106 | 107 | To train the motion modules: 108 | ``` 109 | torchrun --nnodes=1 --nproc_per_node=1 train.py --config configs/training/v1/training.yaml 110 | ``` 111 |
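Before launching a full training run, the `train_data` paths can be sanity-checked by instantiating the dataset class shipped in `animatediff/data/dataset.py` (shown further below in this dump) directly. A short sketch, with the csv/video paths as placeholders for your own WebVid10M download:

```python
import torch
from animatediff.data.dataset import WebVid10M

# Mirror the train_data block of configs/training/v1/training.yaml.
dataset = WebVid10M(
    csv_path="/data/webvid/results_10M_train.csv",  # placeholder: your .csv annotation file
    video_folder="/data/webvid/videos",             # placeholder: folder containing <videoid>.mp4 files
    sample_size=256,
    sample_stride=4,
    sample_n_frames=16,
)

loader = torch.utils.data.DataLoader(dataset, batch_size=2, num_workers=4)
batch = next(iter(loader))
# pixel_values: (batch, frames, channels, height, width), normalized to [-1, 1]
print(batch["pixel_values"].shape, batch["text"][:2])
```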
-------------------------------------------------------------------------------- /AnimateDiff/__assets__/docs/gallery.md: --------------------------------------------------------------------------------
1 | # Gallery
2 | Here we demonstrate several best results we found in our experiments.
(Sample animation grids omitted; captions only.)
Model: ToonYou
Model: Counterfeit V3.0
Model: Realistic Vision V2.0
Model: majicMIX Realistic
Model: RCNZ Cartoon
Model: FilmVelvia
#### Community Cases
Here are some samples contributed by the community artists. Create a Pull Request if you would like to show your results here😚.
Character Model: Yoimiya (with an initial reference image, see WIP fork for the extended implementation.)
Character Model: Paimon; Pose Model: Hold Sign
92 | 93 | 94 | -------------------------------------------------------------------------------- /AnimateDiff/__assets__/figs/adapter_explain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/figs/adapter_explain.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/figs/gradio.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/figs/gradio.jpg -------------------------------------------------------------------------------- /AnimateDiff/animatediff/data/dataset.py: -------------------------------------------------------------------------------- 1 | import os, io, csv, math, random 2 | import numpy as np 3 | from einops import rearrange 4 | from decord import VideoReader 5 | 6 | import torch 7 | import torchvision.transforms as transforms 8 | from torch.utils.data.dataset import Dataset 9 | from animatediff.utils.util import zero_rank_print 10 | 11 | 12 | class WebVid10M(Dataset): 13 | def __init__( 14 | self, 15 | csv_path, 16 | video_folder, 17 | sample_size=256, 18 | sample_stride=4, 19 | sample_n_frames=16, 20 | is_image=False, 21 | frozen_videos=False, 22 | number_of_samples=None, 23 | ): 24 | zero_rank_print(f"loading annotations from {csv_path} ...") 25 | with open(csv_path, "r") as csvfile: 26 | self.dataset = list(csv.DictReader(csvfile)) 27 | 28 | if number_of_samples: 29 | self.dataset = self.dataset[:number_of_samples] 30 | self.length = len(self.dataset) 31 | zero_rank_print(f"data scale: {self.length}") 32 | 33 | self.video_folder = video_folder 34 | self.sample_stride = sample_stride 35 | self.sample_n_frames = sample_n_frames 36 | self.is_image = is_image 37 | self.frozen_videos = frozen_videos 38 | if self.frozen_videos: 39 | print("Training with Frozen videos") 40 | sample_size = ( 41 | tuple(sample_size) 42 | if not isinstance(sample_size, int) 43 | else (sample_size, sample_size) 44 | ) 45 | self.pixel_transforms = transforms.Compose( 46 | [ 47 | transforms.RandomHorizontalFlip(), 48 | transforms.Resize(sample_size[0]), 49 | transforms.CenterCrop(sample_size), 50 | transforms.Normalize( 51 | mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True 52 | ), 53 | ] 54 | ) 55 | 56 | def get_batch(self, idx): 57 | video_dict = self.dataset[idx] 58 | videoid, name, page_dir = ( 59 | video_dict["videoid"], 60 | video_dict["name"], 61 | video_dict["page_dir"], 62 | ) 63 | 64 | video_dir = os.path.join(self.video_folder, f"{videoid}.mp4") 65 | video_reader = VideoReader(video_dir) 66 | video_length = len(video_reader) 67 | if not self.is_image: 68 | clip_length = min( 69 | video_length, (self.sample_n_frames - 1) * self.sample_stride + 1 70 | ) 71 | start_idx = random.randint(0, video_length - clip_length) 72 | batch_index = np.linspace( 73 | start_idx, start_idx + clip_length - 1, self.sample_n_frames, dtype=int 74 | ) 75 | else: 76 | batch_index = [random.randint(0, video_length - 1)] 77 | 78 | if self.frozen_videos: 79 | pixel_values = ( 80 | torch.from_numpy( 81 | np.tile( 82 | np.expand_dims( 83 | video_reader.get_batch(batch_index).asnumpy()[0], axis=0 84 | ), 85 | (len(batch_index), 1, 1, 1), 86 | ) 87 | ) 88 | .permute(0, 3, 1, 2) 89 | .contiguous() 90 | ) 91 | else: 92 | pixel_values = ( 93 | 
torch.from_numpy(video_reader.get_batch(batch_index).asnumpy()) 94 | .permute(0, 3, 1, 2) 95 | .contiguous() 96 | ) 97 | 98 | pixel_values = pixel_values / 255.0 99 | del video_reader 100 | 101 | if self.is_image: 102 | pixel_values = pixel_values[0] 103 | 104 | return pixel_values, name 105 | 106 | def __len__(self): 107 | return self.length 108 | 109 | def __getitem__(self, idx): 110 | while True: 111 | try: 112 | pixel_values, name = self.get_batch(idx) 113 | break 114 | except Exception as e: 115 | idx = random.randint(0, self.length - 1) 116 | 117 | pixel_values = self.pixel_transforms(pixel_values) 118 | sample = dict(pixel_values=pixel_values, text=name) 119 | return sample 120 | 121 | 122 | if __name__ == "__main__": 123 | from animatediff.utils.util import save_videos_grid 124 | 125 | dataset = WebVid10M( 126 | csv_path="/mnt/petrelfs/guoyuwei/projects/datasets/webvid/results_2M_val.csv", 127 | video_folder="/mnt/petrelfs/guoyuwei/projects/datasets/webvid/2M_val", 128 | sample_size=256, 129 | sample_stride=4, 130 | sample_n_frames=16, 131 | is_image=True, 132 | ) 133 | import pdb 134 | 135 | pdb.set_trace() 136 | 137 | dataloader = torch.utils.data.DataLoader( 138 | dataset, 139 | batch_size=4, 140 | num_workers=16, 141 | ) 142 | for idx, batch in enumerate(dataloader): 143 | print(batch["pixel_values"].shape, len(batch["text"])) 144 | # for i in range(batch["pixel_values"].shape[0]): 145 | # save_videos_grid(batch["pixel_values"][i:i+1].permute(0,2,1,3,4), os.path.join(".", f"{idx}-{i}.mp4"), rescale=True) 146 | -------------------------------------------------------------------------------- /AnimateDiff/animatediff/models/attention.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py 2 | 3 | from dataclasses import dataclass 4 | from typing import Optional 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | from torch import nn 9 | 10 | from diffusers.configuration_utils import ConfigMixin, register_to_config 11 | from diffusers.modeling_utils import ModelMixin 12 | from diffusers.utils import BaseOutput 13 | from diffusers.utils.import_utils import is_xformers_available 14 | from diffusers.models.attention import CrossAttention, FeedForward, AdaLayerNorm 15 | 16 | from einops import rearrange, repeat 17 | import pdb 18 | 19 | 20 | @dataclass 21 | class Transformer3DModelOutput(BaseOutput): 22 | sample: torch.FloatTensor 23 | 24 | 25 | if is_xformers_available(): 26 | import xformers 27 | import xformers.ops 28 | else: 29 | xformers = None 30 | 31 | 32 | class SpatialAdapter(nn.Module): 33 | def __init__(self, dim, rank=4): 34 | super().__init__() 35 | self.down = nn.Linear(dim, rank, bias=False) 36 | self.up = nn.Linear(rank, dim, bias=False) 37 | 38 | # Initialize weights 39 | nn.init.normal_(self.down.weight, std=1 / rank) 40 | nn.init.zeros_(self.up.weight) 41 | 42 | def forward(self, x): 43 | out = self.down(x) 44 | out = self.up(out) 45 | return x + out 46 | 47 | 48 | class Transformer3DModel(ModelMixin, ConfigMixin): 49 | @register_to_config 50 | def __init__( 51 | self, 52 | num_attention_heads: int = 16, 53 | attention_head_dim: int = 88, 54 | in_channels: Optional[int] = None, 55 | num_layers: int = 1, 56 | dropout: float = 0.0, 57 | norm_num_groups: int = 32, 58 | cross_attention_dim: Optional[int] = None, 59 | attention_bias: bool = False, 60 | activation_fn: str = "geglu", 61 | num_embeds_ada_norm: Optional[int] = None, 62 
| use_linear_projection: bool = False, 63 | only_cross_attention: bool = False, 64 | upcast_attention: bool = False, 65 | unet_use_cross_frame_attention=None, 66 | unet_use_temporal_attention=None, 67 | use_spatial_adapter=False, 68 | ): 69 | super().__init__() 70 | self.use_linear_projection = use_linear_projection 71 | self.num_attention_heads = num_attention_heads 72 | self.attention_head_dim = attention_head_dim 73 | inner_dim = num_attention_heads * attention_head_dim 74 | 75 | # Define input layers 76 | self.in_channels = in_channels 77 | 78 | self.norm = torch.nn.GroupNorm( 79 | num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True 80 | ) 81 | if use_linear_projection: 82 | self.proj_in = nn.Linear(in_channels, inner_dim) 83 | else: 84 | self.proj_in = nn.Conv2d( 85 | in_channels, inner_dim, kernel_size=1, stride=1, padding=0 86 | ) 87 | 88 | # Define transformers blocks 89 | self.transformer_blocks = nn.ModuleList( 90 | [ 91 | BasicTransformerBlock( 92 | inner_dim, 93 | num_attention_heads, 94 | attention_head_dim, 95 | dropout=dropout, 96 | cross_attention_dim=cross_attention_dim, 97 | activation_fn=activation_fn, 98 | num_embeds_ada_norm=num_embeds_ada_norm, 99 | attention_bias=attention_bias, 100 | only_cross_attention=only_cross_attention, 101 | upcast_attention=upcast_attention, 102 | unet_use_cross_frame_attention=unet_use_cross_frame_attention, 103 | unet_use_temporal_attention=unet_use_temporal_attention, 104 | use_spatial_adapter=use_spatial_adapter, 105 | ) 106 | for d in range(num_layers) 107 | ] 108 | ) 109 | 110 | # 4. Define output layers 111 | if use_linear_projection: 112 | self.proj_out = nn.Linear(in_channels, inner_dim) 113 | else: 114 | self.proj_out = nn.Conv2d( 115 | inner_dim, in_channels, kernel_size=1, stride=1, padding=0 116 | ) 117 | 118 | def forward( 119 | self, 120 | hidden_states, 121 | encoder_hidden_states=None, 122 | timestep=None, 123 | return_dict: bool = True, 124 | ): 125 | # Input 126 | assert ( 127 | hidden_states.dim() == 5 128 | ), f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}." 
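        # The 5D video latent (b, c, f, h, w) is folded frame-wise below so the pretrained 2D
        # attention layers run on each frame independently (e.g. a hypothetical (2, 320, 16, 32, 32)
        # input becomes (32, 320, 32, 32)), and the text embeddings are repeated once per frame so
        # the batch dimensions stay aligned.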
129 | video_length = hidden_states.shape[2] 130 | hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w") 131 | encoder_hidden_states = repeat( 132 | encoder_hidden_states, "b n c -> (b f) n c", f=video_length 133 | ) 134 | 135 | batch, channel, height, weight = hidden_states.shape 136 | residual = hidden_states 137 | 138 | hidden_states = self.norm(hidden_states) 139 | if not self.use_linear_projection: 140 | hidden_states = self.proj_in(hidden_states) 141 | inner_dim = hidden_states.shape[1] 142 | hidden_states = hidden_states.permute(0, 2, 3, 1).reshape( 143 | batch, height * weight, inner_dim 144 | ) 145 | else: 146 | inner_dim = hidden_states.shape[1] 147 | hidden_states = hidden_states.permute(0, 2, 3, 1).reshape( 148 | batch, height * weight, inner_dim 149 | ) 150 | hidden_states = self.proj_in(hidden_states) 151 | 152 | # Blocks 153 | for block in self.transformer_blocks: 154 | hidden_states = block( 155 | hidden_states, 156 | encoder_hidden_states=encoder_hidden_states, 157 | timestep=timestep, 158 | video_length=video_length, 159 | ) 160 | 161 | # Output 162 | if not self.use_linear_projection: 163 | hidden_states = ( 164 | hidden_states.reshape(batch, height, weight, inner_dim) 165 | .permute(0, 3, 1, 2) 166 | .contiguous() 167 | ) 168 | hidden_states = self.proj_out(hidden_states) 169 | else: 170 | hidden_states = self.proj_out(hidden_states) 171 | hidden_states = ( 172 | hidden_states.reshape(batch, height, weight, inner_dim) 173 | .permute(0, 3, 1, 2) 174 | .contiguous() 175 | ) 176 | 177 | output = hidden_states + residual 178 | 179 | output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length) 180 | if not return_dict: 181 | return (output,) 182 | 183 | return Transformer3DModelOutput(sample=output) 184 | 185 | 186 | class BasicTransformerBlock(nn.Module): 187 | def __init__( 188 | self, 189 | dim: int, 190 | num_attention_heads: int, 191 | attention_head_dim: int, 192 | dropout=0.0, 193 | cross_attention_dim: Optional[int] = None, 194 | activation_fn: str = "geglu", 195 | num_embeds_ada_norm: Optional[int] = None, 196 | attention_bias: bool = False, 197 | only_cross_attention: bool = False, 198 | upcast_attention: bool = False, 199 | unet_use_cross_frame_attention=None, 200 | unet_use_temporal_attention=None, 201 | use_spatial_adapter=False, 202 | ): 203 | super().__init__() 204 | self.only_cross_attention = only_cross_attention 205 | self.use_ada_layer_norm = num_embeds_ada_norm is not None 206 | self.unet_use_cross_frame_attention = unet_use_cross_frame_attention 207 | self.unet_use_temporal_attention = unet_use_temporal_attention 208 | self.use_spatial_adapter = use_spatial_adapter 209 | # SC-Attn 210 | assert unet_use_cross_frame_attention is not None 211 | if unet_use_cross_frame_attention: 212 | self.attn1 = SparseCausalAttention2D( 213 | query_dim=dim, 214 | heads=num_attention_heads, 215 | dim_head=attention_head_dim, 216 | dropout=dropout, 217 | bias=attention_bias, 218 | cross_attention_dim=( 219 | cross_attention_dim if only_cross_attention else None 220 | ), 221 | upcast_attention=upcast_attention, 222 | ) 223 | else: 224 | self.attn1 = CrossAttention( 225 | query_dim=dim, 226 | heads=num_attention_heads, 227 | dim_head=attention_head_dim, 228 | dropout=dropout, 229 | bias=attention_bias, 230 | upcast_attention=upcast_attention, 231 | ) 232 | self.norm1 = ( 233 | AdaLayerNorm(dim, num_embeds_ada_norm) 234 | if self.use_ada_layer_norm 235 | else nn.LayerNorm(dim) 236 | ) 237 | if self.use_spatial_adapter: 238 | print("Using Spatial 
Adapter") 239 | self.attn1_lora = SpatialAdapter(dim, dim) 240 | else: 241 | print("Not using Spatial Adapter") 242 | # Cross-Attn 243 | if cross_attention_dim is not None: 244 | self.attn2 = CrossAttention( 245 | query_dim=dim, 246 | cross_attention_dim=cross_attention_dim, 247 | heads=num_attention_heads, 248 | dim_head=attention_head_dim, 249 | dropout=dropout, 250 | bias=attention_bias, 251 | upcast_attention=upcast_attention, 252 | ) 253 | if self.use_spatial_adapter: 254 | self.attn2_lora = SpatialAdapter(dim, dim) 255 | else: 256 | self.attn2 = None 257 | 258 | if cross_attention_dim is not None: 259 | self.norm2 = ( 260 | AdaLayerNorm(dim, num_embeds_ada_norm) 261 | if self.use_ada_layer_norm 262 | else nn.LayerNorm(dim) 263 | ) 264 | else: 265 | self.norm2 = None 266 | 267 | # Feed-forward 268 | self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn) 269 | self.norm3 = nn.LayerNorm(dim) 270 | 271 | # Temp-Attn 272 | assert unet_use_temporal_attention is not None 273 | if unet_use_temporal_attention: 274 | self.attn_temp = CrossAttention( 275 | query_dim=dim, 276 | heads=num_attention_heads, 277 | dim_head=attention_head_dim, 278 | dropout=dropout, 279 | bias=attention_bias, 280 | upcast_attention=upcast_attention, 281 | ) 282 | nn.init.zeros_(self.attn_temp.to_out[0].weight.data) 283 | self.norm_temp = ( 284 | AdaLayerNorm(dim, num_embeds_ada_norm) 285 | if self.use_ada_layer_norm 286 | else nn.LayerNorm(dim) 287 | ) 288 | 289 | def set_lora_scale(self, scale): 290 | self.attn1_lora.scale = scale 291 | if hasattr(self, "attn2_lora"): 292 | self.attn2_lora.scale = scale 293 | 294 | def set_use_memory_efficient_attention_xformers( 295 | self, use_memory_efficient_attention_xformers: bool 296 | ): 297 | if not is_xformers_available(): 298 | print("Here is how to install it") 299 | raise ModuleNotFoundError( 300 | "Refer to https://github.com/facebookresearch/xformers for more information on how to install" 301 | " xformers", 302 | name="xformers", 303 | ) 304 | elif not torch.cuda.is_available(): 305 | raise ValueError( 306 | "torch.cuda.is_available() should be True but is False. 
xformers' memory efficient attention is only" 307 | " available for GPU " 308 | ) 309 | else: 310 | try: 311 | # Make sure we can run the memory efficient attention 312 | _ = xformers.ops.memory_efficient_attention( 313 | torch.randn((1, 2, 40), device="cuda"), 314 | torch.randn((1, 2, 40), device="cuda"), 315 | torch.randn((1, 2, 40), device="cuda"), 316 | ) 317 | except Exception as e: 318 | raise e 319 | self.attn1._use_memory_efficient_attention_xformers = ( 320 | use_memory_efficient_attention_xformers 321 | ) 322 | if self.attn2 is not None: 323 | self.attn2._use_memory_efficient_attention_xformers = ( 324 | use_memory_efficient_attention_xformers 325 | ) 326 | # self.attn_temp._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers 327 | 328 | def forward( 329 | self, 330 | hidden_states, 331 | encoder_hidden_states=None, 332 | timestep=None, 333 | attention_mask=None, 334 | video_length=None, 335 | ): 336 | # SparseCausal-Attention 337 | norm_hidden_states = ( 338 | self.norm1(hidden_states, timestep) 339 | if self.use_ada_layer_norm 340 | else self.norm1(hidden_states) 341 | ) 342 | 343 | # if self.only_cross_attention: 344 | # hidden_states = ( 345 | # self.attn1(norm_hidden_states, encoder_hidden_states, attention_mask=attention_mask) + hidden_states 346 | # ) 347 | # else: 348 | # hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask, video_length=video_length) + hidden_states 349 | 350 | # pdb.set_trace() 351 | if self.unet_use_cross_frame_attention: 352 | attn_output = ( 353 | self.attn1( 354 | norm_hidden_states, 355 | attention_mask=attention_mask, 356 | video_length=video_length, 357 | ) 358 | + hidden_states 359 | ) 360 | else: 361 | attn_output = ( 362 | self.attn1(norm_hidden_states, attention_mask=attention_mask) 363 | + hidden_states 364 | ) 365 | if self.use_spatial_adapter: 366 | att1_lora_output = self.attn1_lora(attn_output) 367 | hidden_states += att1_lora_output 368 | 369 | if self.attn2 is not None: 370 | # Cross-Attention 371 | norm_hidden_states = ( 372 | self.norm2(hidden_states, timestep) 373 | if self.use_ada_layer_norm 374 | else self.norm2(hidden_states) 375 | ) 376 | attn_output = self.attn2( 377 | norm_hidden_states, 378 | encoder_hidden_states=encoder_hidden_states, 379 | attention_mask=attention_mask, 380 | ) 381 | if self.use_spatial_adapter: 382 | att2_lora_output = self.attn2_lora(attn_output) 383 | hidden_states += att2_lora_output 384 | 385 | # Feed-forward 386 | hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states 387 | 388 | # Temporal-Attention 389 | if self.unet_use_temporal_attention: 390 | d = hidden_states.shape[1] 391 | hidden_states = rearrange( 392 | hidden_states, "(b f) d c -> (b d) f c", f=video_length 393 | ) 394 | norm_hidden_states = ( 395 | self.norm_temp(hidden_states, timestep) 396 | if self.use_ada_layer_norm 397 | else self.norm_temp(hidden_states) 398 | ) 399 | hidden_states = self.attn_temp(norm_hidden_states) + hidden_states 400 | hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d) 401 | 402 | return hidden_states 403 | -------------------------------------------------------------------------------- /AnimateDiff/animatediff/models/resnet.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from einops import rearrange 8 
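# The "inflated" layers below reuse pretrained 2D weights on 5D video tensors by folding the frame
# axis into the batch axis, applying the 2D op, then unfolding again. A minimal sketch with made-up
# shapes (not taken from any config):
#   conv = InflatedConv3d(4, 320, kernel_size=3, padding=1)
#   conv(torch.randn(2, 4, 16, 64, 64)).shape   # -> torch.Size([2, 320, 16, 64, 64])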
| 9 | 10 | class InflatedConv3d(nn.Conv2d): 11 | def forward(self, x): 12 | video_length = x.shape[2] 13 | 14 | x = rearrange(x, "b c f h w -> (b f) c h w") 15 | x = super().forward(x) 16 | x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length) 17 | 18 | return x 19 | 20 | 21 | class InflatedGroupNorm(nn.GroupNorm): 22 | def forward(self, x): 23 | video_length = x.shape[2] 24 | 25 | x = rearrange(x, "b c f h w -> (b f) c h w") 26 | x = super().forward(x) 27 | x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length) 28 | 29 | return x 30 | 31 | 32 | class Upsample3D(nn.Module): 33 | def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"): 34 | super().__init__() 35 | self.channels = channels 36 | self.out_channels = out_channels or channels 37 | self.use_conv = use_conv 38 | self.use_conv_transpose = use_conv_transpose 39 | self.name = name 40 | 41 | conv = None 42 | if use_conv_transpose: 43 | raise NotImplementedError 44 | elif use_conv: 45 | self.conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1) 46 | 47 | def forward(self, hidden_states, output_size=None): 48 | assert hidden_states.shape[1] == self.channels 49 | 50 | if self.use_conv_transpose: 51 | raise NotImplementedError 52 | 53 | # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 54 | dtype = hidden_states.dtype 55 | if dtype == torch.bfloat16: 56 | hidden_states = hidden_states.to(torch.float32) 57 | 58 | # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984 59 | if hidden_states.shape[0] >= 64: 60 | hidden_states = hidden_states.contiguous() 61 | 62 | # if `output_size` is passed we force the interpolation output 63 | # size and do not make use of `scale_factor=2` 64 | if output_size is None: 65 | hidden_states = F.interpolate(hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest") 66 | else: 67 | hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest") 68 | 69 | # If the input is bfloat16, we cast back to bfloat16 70 | if dtype == torch.bfloat16: 71 | hidden_states = hidden_states.to(dtype) 72 | 73 | # if self.use_conv: 74 | # if self.name == "conv": 75 | # hidden_states = self.conv(hidden_states) 76 | # else: 77 | # hidden_states = self.Conv2d_0(hidden_states) 78 | hidden_states = self.conv(hidden_states) 79 | 80 | return hidden_states 81 | 82 | 83 | class Downsample3D(nn.Module): 84 | def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"): 85 | super().__init__() 86 | self.channels = channels 87 | self.out_channels = out_channels or channels 88 | self.use_conv = use_conv 89 | self.padding = padding 90 | stride = 2 91 | self.name = name 92 | 93 | if use_conv: 94 | self.conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding) 95 | else: 96 | raise NotImplementedError 97 | 98 | def forward(self, hidden_states): 99 | assert hidden_states.shape[1] == self.channels 100 | if self.use_conv and self.padding == 0: 101 | raise NotImplementedError 102 | 103 | assert hidden_states.shape[1] == self.channels 104 | hidden_states = self.conv(hidden_states) 105 | 106 | return hidden_states 107 | 108 | 109 | class ResnetBlock3D(nn.Module): 110 | def __init__( 111 | self, 112 | *, 113 | in_channels, 114 | out_channels=None, 115 | conv_shortcut=False, 116 | dropout=0.0, 117 | temb_channels=512, 118 | groups=32, 119 | groups_out=None, 120 | pre_norm=True, 121 | eps=1e-6, 122 | 
non_linearity="swish", 123 | time_embedding_norm="default", 124 | output_scale_factor=1.0, 125 | use_in_shortcut=None, 126 | use_inflated_groupnorm=False, 127 | ): 128 | super().__init__() 129 | self.pre_norm = pre_norm 130 | self.pre_norm = True 131 | self.in_channels = in_channels 132 | out_channels = in_channels if out_channels is None else out_channels 133 | self.out_channels = out_channels 134 | self.use_conv_shortcut = conv_shortcut 135 | self.time_embedding_norm = time_embedding_norm 136 | self.output_scale_factor = output_scale_factor 137 | 138 | if groups_out is None: 139 | groups_out = groups 140 | 141 | assert use_inflated_groupnorm != None 142 | if use_inflated_groupnorm: 143 | self.norm1 = InflatedGroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True) 144 | else: 145 | self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True) 146 | 147 | self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) 148 | 149 | if temb_channels is not None: 150 | if self.time_embedding_norm == "default": 151 | time_emb_proj_out_channels = out_channels 152 | elif self.time_embedding_norm == "scale_shift": 153 | time_emb_proj_out_channels = out_channels * 2 154 | else: 155 | raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") 156 | 157 | self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels) 158 | else: 159 | self.time_emb_proj = None 160 | 161 | if use_inflated_groupnorm: 162 | self.norm2 = InflatedGroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True) 163 | else: 164 | self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True) 165 | 166 | self.dropout = torch.nn.Dropout(dropout) 167 | self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) 168 | 169 | if non_linearity == "swish": 170 | self.nonlinearity = lambda x: F.silu(x) 171 | elif non_linearity == "mish": 172 | self.nonlinearity = Mish() 173 | elif non_linearity == "silu": 174 | self.nonlinearity = nn.SiLU() 175 | 176 | self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut 177 | 178 | self.conv_shortcut = None 179 | if self.use_in_shortcut: 180 | self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) 181 | 182 | def forward(self, input_tensor, temb): 183 | hidden_states = input_tensor 184 | 185 | hidden_states = self.norm1(hidden_states) 186 | hidden_states = self.nonlinearity(hidden_states) 187 | 188 | hidden_states = self.conv1(hidden_states) 189 | 190 | if temb is not None: 191 | temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None] 192 | 193 | if temb is not None and self.time_embedding_norm == "default": 194 | hidden_states = hidden_states + temb 195 | 196 | hidden_states = self.norm2(hidden_states) 197 | 198 | if temb is not None and self.time_embedding_norm == "scale_shift": 199 | scale, shift = torch.chunk(temb, 2, dim=1) 200 | hidden_states = hidden_states * (1 + scale) + shift 201 | 202 | hidden_states = self.nonlinearity(hidden_states) 203 | 204 | hidden_states = self.dropout(hidden_states) 205 | hidden_states = self.conv2(hidden_states) 206 | 207 | if self.conv_shortcut is not None: 208 | input_tensor = self.conv_shortcut(input_tensor) 209 | 210 | output_tensor = (input_tensor + hidden_states) / self.output_scale_factor 211 | 212 | return output_tensor 213 | 214 
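# Rough usage sketch for ResnetBlock3D above, with illustrative (not config-derived) sizes:
#   block = ResnetBlock3D(in_channels=320, out_channels=320, temb_channels=1280,
#                         use_inflated_groupnorm=True)
#   out = block(torch.randn(1, 320, 16, 32, 32), torch.randn(1, 1280))
#   out.shape  # -> torch.Size([1, 320, 16, 32, 32]); temb is projected and broadcast over f, h, w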
| 215 | class Mish(torch.nn.Module): 216 | def forward(self, hidden_states): 217 | return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states)) -------------------------------------------------------------------------------- /AnimateDiff/animatediff/utils/convert_lora_safetensor_to_diffusers.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023, Haofan Wang, Qixun Wang, All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # Changes were made to this source code by Yuwei Guo. 17 | """ Conversion script for the LoRA's safetensors checkpoints. """ 18 | 19 | import argparse 20 | 21 | import torch 22 | from safetensors.torch import load_file 23 | 24 | from diffusers import StableDiffusionPipeline 25 | 26 | 27 | def load_diffusers_lora(pipeline, state_dict, alpha=1.0): 28 | # directly update weight in diffusers model 29 | for key in state_dict: 30 | # only process lora down key 31 | if "up." in key: continue 32 | 33 | up_key = key.replace(".down.", ".up.") 34 | model_key = key.replace("processor.", "").replace("_lora", "").replace("down.", "").replace("up.", "") 35 | model_key = model_key.replace("to_out.", "to_out.0.") 36 | layer_infos = model_key.split(".")[:-1] 37 | 38 | curr_layer = pipeline.unet 39 | while len(layer_infos) > 0: 40 | temp_name = layer_infos.pop(0) 41 | curr_layer = curr_layer.__getattr__(temp_name) 42 | 43 | weight_down = state_dict[key] 44 | weight_up = state_dict[up_key] 45 | curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device) 46 | 47 | return pipeline 48 | 49 | 50 | def convert_lora(pipeline, state_dict, LORA_PREFIX_UNET="lora_unet", LORA_PREFIX_TEXT_ENCODER="lora_te", alpha=0.6): 51 | # load base model 52 | # pipeline = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32) 53 | 54 | # load LoRA weight from .safetensors 55 | # state_dict = load_file(checkpoint_path) 56 | 57 | visited = [] 58 | 59 | # directly update weight in diffusers model 60 | for key in state_dict: 61 | # it is suggested to print out the key, it usually will be something like below 62 | # "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight" 63 | 64 | # as we have set the alpha beforehand, so just skip 65 | if ".alpha" in key or key in visited: 66 | continue 67 | 68 | if "text" in key: 69 | layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_") 70 | curr_layer = pipeline.text_encoder 71 | else: 72 | layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_") 73 | curr_layer = pipeline.unet 74 | 75 | # find the target layer 76 | temp_name = layer_infos.pop(0) 77 | while len(layer_infos) > -1: 78 | try: 79 | curr_layer = curr_layer.__getattr__(temp_name) 80 | if len(layer_infos) > 0: 81 | temp_name = layer_infos.pop(0) 82 | elif len(layer_infos) == 0: 83 | break 84 | except Exception: 85 | if len(temp_name) > 
0: 86 | temp_name += "_" + layer_infos.pop(0) 87 | else: 88 | temp_name = layer_infos.pop(0) 89 | 90 | pair_keys = [] 91 | if "lora_down" in key: 92 | pair_keys.append(key.replace("lora_down", "lora_up")) 93 | pair_keys.append(key) 94 | else: 95 | pair_keys.append(key) 96 | pair_keys.append(key.replace("lora_up", "lora_down")) 97 | 98 | # update weight 99 | if len(state_dict[pair_keys[0]].shape) == 4: 100 | weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32) 101 | weight_down = state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32) 102 | curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3).to(curr_layer.weight.data.device) 103 | else: 104 | weight_up = state_dict[pair_keys[0]].to(torch.float32) 105 | weight_down = state_dict[pair_keys[1]].to(torch.float32) 106 | curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device) 107 | 108 | # update visited list 109 | for item in pair_keys: 110 | visited.append(item) 111 | 112 | return pipeline 113 | 114 | 115 | if __name__ == "__main__": 116 | parser = argparse.ArgumentParser() 117 | 118 | parser.add_argument( 119 | "--base_model_path", default=None, type=str, required=True, help="Path to the base model in diffusers format." 120 | ) 121 | parser.add_argument( 122 | "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." 123 | ) 124 | parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") 125 | parser.add_argument( 126 | "--lora_prefix_unet", default="lora_unet", type=str, help="The prefix of UNet weight in safetensors" 127 | ) 128 | parser.add_argument( 129 | "--lora_prefix_text_encoder", 130 | default="lora_te", 131 | type=str, 132 | help="The prefix of text encoder weight in safetensors", 133 | ) 134 | parser.add_argument("--alpha", default=0.75, type=float, help="The merging ratio in W = W0 + alpha * deltaW") 135 | parser.add_argument( 136 | "--to_safetensors", action="store_true", help="Whether to store pipeline in safetensors format or not." 137 | ) 138 | parser.add_argument("--device", type=str, help="Device to use (e.g. 
cpu, cuda:0, cuda:1, etc.)") 139 | 140 | args = parser.parse_args() 141 | 142 | base_model_path = args.base_model_path 143 | checkpoint_path = args.checkpoint_path 144 | dump_path = args.dump_path 145 | lora_prefix_unet = args.lora_prefix_unet 146 | lora_prefix_text_encoder = args.lora_prefix_text_encoder 147 | alpha = args.alpha 148 | 149 | pipe = convert(base_model_path, checkpoint_path, lora_prefix_unet, lora_prefix_text_encoder, alpha) 150 | 151 | pipe = pipe.to(args.device) 152 | pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors) 153 | -------------------------------------------------------------------------------- /AnimateDiff/animatediff/utils/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import imageio 3 | import numpy as np 4 | from typing import Union 5 | 6 | import torch 7 | import torchvision 8 | import torch.distributed as dist 9 | 10 | from safetensors import safe_open 11 | from tqdm import tqdm 12 | from einops import rearrange 13 | from animatediff.utils.convert_from_ckpt import ( 14 | convert_ldm_unet_checkpoint, 15 | convert_ldm_clip_checkpoint, 16 | convert_ldm_vae_checkpoint, 17 | ) 18 | from animatediff.utils.convert_lora_safetensor_to_diffusers import ( 19 | convert_lora, 20 | load_diffusers_lora, 21 | ) 22 | 23 | 24 | def zero_rank_print(s): 25 | if (not dist.is_initialized()) and (dist.is_initialized() and dist.get_rank() == 0): 26 | print("### " + s) 27 | 28 | 29 | def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8): 30 | videos = rearrange(videos, "b c t h w -> t b c h w") 31 | outputs = [] 32 | for x in videos: 33 | x = torchvision.utils.make_grid(x, nrow=n_rows) 34 | x = x.transpose(0, 1).transpose(1, 2).squeeze(-1) 35 | if rescale: 36 | x = (x + 1.0) / 2.0 # -1,1 -> 0,1 37 | x = (x * 255).numpy().astype(np.uint8) 38 | outputs.append(x) 39 | 40 | os.makedirs(os.path.dirname(path), exist_ok=True) 41 | imageio.mimsave(path, outputs, fps=fps) 42 | 43 | 44 | # DDIM Inversion 45 | @torch.no_grad() 46 | def init_prompt(prompt, pipeline): 47 | uncond_input = pipeline.tokenizer( 48 | [""], 49 | padding="max_length", 50 | max_length=pipeline.tokenizer.model_max_length, 51 | return_tensors="pt", 52 | ) 53 | uncond_embeddings = pipeline.text_encoder( 54 | uncond_input.input_ids.to(pipeline.device) 55 | )[0] 56 | text_input = pipeline.tokenizer( 57 | [prompt], 58 | padding="max_length", 59 | max_length=pipeline.tokenizer.model_max_length, 60 | truncation=True, 61 | return_tensors="pt", 62 | ) 63 | text_embeddings = pipeline.text_encoder(text_input.input_ids.to(pipeline.device))[0] 64 | context = torch.cat([uncond_embeddings, text_embeddings]) 65 | 66 | return context 67 | 68 | 69 | def next_step( 70 | model_output: Union[torch.FloatTensor, np.ndarray], 71 | timestep: int, 72 | sample: Union[torch.FloatTensor, np.ndarray], 73 | ddim_scheduler, 74 | ): 75 | timestep, next_timestep = ( 76 | min( 77 | timestep 78 | - ddim_scheduler.config.num_train_timesteps 79 | // ddim_scheduler.num_inference_steps, 80 | 999, 81 | ), 82 | timestep, 83 | ) 84 | alpha_prod_t = ( 85 | ddim_scheduler.alphas_cumprod[timestep] 86 | if timestep >= 0 87 | else ddim_scheduler.final_alpha_cumprod 88 | ) 89 | alpha_prod_t_next = ddim_scheduler.alphas_cumprod[next_timestep] 90 | beta_prod_t = 1 - alpha_prod_t 91 | next_original_sample = ( 92 | sample - beta_prod_t**0.5 * model_output 93 | ) / alpha_prod_t**0.5 94 | next_sample_direction = (1 - alpha_prod_t_next) ** 0.5 * model_output 
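    # next_original_sample above is the predicted clean latent x0; the line below re-noises it by one
    # DDIM step, x_{t+1} = sqrt(alpha_{t+1}) * x0 + sqrt(1 - alpha_{t+1}) * eps, which is what lets
    # ddim_loop walk a real latent back toward noise for inversion.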
95 | next_sample = alpha_prod_t_next**0.5 * next_original_sample + next_sample_direction 96 | return next_sample 97 | 98 | 99 | def get_noise_pred_single(latents, t, context, unet): 100 | noise_pred = unet(latents, t, encoder_hidden_states=context)["sample"] 101 | return noise_pred 102 | 103 | 104 | @torch.no_grad() 105 | def ddim_loop(pipeline, ddim_scheduler, latent, num_inv_steps, prompt): 106 | context = init_prompt(prompt, pipeline) 107 | uncond_embeddings, cond_embeddings = context.chunk(2) 108 | all_latent = [latent] 109 | latent = latent.clone().detach() 110 | for i in tqdm(range(num_inv_steps)): 111 | t = ddim_scheduler.timesteps[len(ddim_scheduler.timesteps) - i - 1] 112 | noise_pred = get_noise_pred_single(latent, t, cond_embeddings, pipeline.unet) 113 | latent = next_step(noise_pred, t, latent, ddim_scheduler) 114 | all_latent.append(latent) 115 | return all_latent 116 | 117 | 118 | @torch.no_grad() 119 | def ddim_inversion(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt=""): 120 | ddim_latents = ddim_loop( 121 | pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt 122 | ) 123 | return ddim_latents 124 | 125 | 126 | def load_weights( 127 | animation_pipeline, 128 | # motion module 129 | motion_module_path="", 130 | motion_module_lora_configs=[], 131 | # domain adapter 132 | adapter_lora_path="", 133 | adapter_lora_scale=1.0, 134 | # image layers 135 | dreambooth_model_path="", 136 | lora_model_path="", 137 | lora_alpha=0.8, 138 | ): 139 | # motion module 140 | unet_state_dict = {} 141 | if motion_module_path != "": 142 | print(f"load motion module from {motion_module_path}") 143 | motion_module_state_dict = torch.load(motion_module_path, map_location="cpu") 144 | motion_module_state_dict = ( 145 | motion_module_state_dict["state_dict"] 146 | if "state_dict" in motion_module_state_dict 147 | else motion_module_state_dict 148 | ) 149 | unet_state_dict.update( 150 | { 151 | name: param 152 | for name, param in motion_module_state_dict.items() 153 | if "motion_modules." in name 154 | } 155 | ) 156 | unet_state_dict.pop("animatediff_config", "") 157 | 158 | missing, unexpected = animation_pipeline.unet.load_state_dict( 159 | unet_state_dict, strict=False 160 | ) 161 | assert len(unexpected) == 0 162 | del unet_state_dict 163 | 164 | # base model 165 | if dreambooth_model_path != "": 166 | print(f"load dreambooth model from {dreambooth_model_path}") 167 | if dreambooth_model_path.endswith(".safetensors"): 168 | dreambooth_state_dict = {} 169 | with safe_open(dreambooth_model_path, framework="pt", device="cpu") as f: 170 | for key in f.keys(): 171 | dreambooth_state_dict[key] = f.get_tensor(key) 172 | elif dreambooth_model_path.endswith(".ckpt"): 173 | dreambooth_state_dict = torch.load( 174 | dreambooth_model_path, map_location="cpu" 175 | ) 176 | 177 | # 1. vae 178 | converted_vae_checkpoint = convert_ldm_vae_checkpoint( 179 | dreambooth_state_dict, animation_pipeline.vae.config 180 | ) 181 | animation_pipeline.vae.load_state_dict(converted_vae_checkpoint) 182 | # 2. unet 183 | converted_unet_checkpoint = convert_ldm_unet_checkpoint( 184 | dreambooth_state_dict, animation_pipeline.unet.config 185 | ) 186 | animation_pipeline.unet.load_state_dict(converted_unet_checkpoint, strict=False) 187 | # 3. 
text_model 188 | animation_pipeline.text_encoder = convert_ldm_clip_checkpoint( 189 | dreambooth_state_dict 190 | ) 191 | del dreambooth_state_dict 192 | 193 | # lora layers 194 | if lora_model_path != "": 195 | print(f"load lora model from {lora_model_path}") 196 | assert lora_model_path.endswith(".safetensors") 197 | lora_state_dict = {} 198 | with safe_open(lora_model_path, framework="pt", device="cpu") as f: 199 | for key in f.keys(): 200 | lora_state_dict[key] = f.get_tensor(key) 201 | 202 | animation_pipeline = convert_lora( 203 | animation_pipeline, lora_state_dict, alpha=lora_alpha 204 | ) 205 | del lora_state_dict 206 | 207 | # domain adapter lora 208 | if adapter_lora_path != "": 209 | print(f"load domain lora from {adapter_lora_path}") 210 | domain_lora_state_dict = torch.load(adapter_lora_path, map_location="cpu") 211 | domain_lora_state_dict = ( 212 | domain_lora_state_dict["state_dict"] 213 | if "state_dict" in domain_lora_state_dict 214 | else domain_lora_state_dict 215 | ) 216 | domain_lora_state_dict.pop("animatediff_config", "") 217 | 218 | animation_pipeline = load_diffusers_lora( 219 | animation_pipeline, domain_lora_state_dict, alpha=adapter_lora_scale 220 | ) 221 | 222 | # motion module lora 223 | for motion_module_lora_config in motion_module_lora_configs: 224 | path, alpha = ( 225 | motion_module_lora_config["path"], 226 | motion_module_lora_config["alpha"], 227 | ) 228 | print(f"load motion LoRA from {path}") 229 | motion_lora_state_dict = torch.load(path, map_location="cpu") 230 | motion_lora_state_dict = ( 231 | motion_lora_state_dict["state_dict"] 232 | if "state_dict" in motion_lora_state_dict 233 | else motion_lora_state_dict 234 | ) 235 | motion_lora_state_dict.pop("animatediff_config", "") 236 | 237 | animation_pipeline = load_diffusers_lora( 238 | animation_pipeline, motion_lora_state_dict, alpha 239 | ) 240 | 241 | return animation_pipeline 242 | -------------------------------------------------------------------------------- /AnimateDiff/app.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import json 4 | import torch 5 | import random 6 | 7 | import gradio as gr 8 | from glob import glob 9 | from omegaconf import OmegaConf 10 | from datetime import datetime 11 | from safetensors import safe_open 12 | 13 | from diffusers import AutoencoderKL 14 | from diffusers import DDIMScheduler, EulerDiscreteScheduler, PNDMScheduler 15 | from diffusers.utils.import_utils import is_xformers_available 16 | from transformers import CLIPTextModel, CLIPTokenizer 17 | 18 | from animatediff.models.unet import UNet3DConditionModel 19 | from animatediff.pipelines.pipeline_animation import AnimationPipeline 20 | from animatediff.utils.util import save_videos_grid 21 | from animatediff.utils.convert_from_ckpt import convert_ldm_unet_checkpoint, convert_ldm_clip_checkpoint, convert_ldm_vae_checkpoint 22 | from animatediff.utils.convert_lora_safetensor_to_diffusers import convert_lora 23 | 24 | 25 | sample_idx = 0 26 | scheduler_dict = { 27 | "Euler": EulerDiscreteScheduler, 28 | "PNDM": PNDMScheduler, 29 | "DDIM": DDIMScheduler, 30 | } 31 | 32 | css = """ 33 | .toolbutton { 34 | margin-buttom: 0em 0em 0em 0em; 35 | max-width: 2.5em; 36 | min-width: 2.5em !important; 37 | height: 2.5em; 38 | } 39 | """ 40 | 41 | class AnimateController: 42 | def __init__(self): 43 | 44 | # config dirs 45 | self.basedir = os.getcwd() 46 | self.stable_diffusion_dir = os.path.join(self.basedir, "models", "StableDiffusion") 47 | 
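        # Checkpoint layout assumed by this demo (see the refresh_* methods below):
        #   models/StableDiffusion/   diffusers-format folders (tokenizer/, text_encoder/, vae/, unet/)
        #   models/Motion_Module/     *.ckpt motion modules
        #   models/DreamBooth_LoRA/   *.safetensors personalized and LoRA checkpoints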
self.motion_module_dir = os.path.join(self.basedir, "models", "Motion_Module") 48 | self.personalized_model_dir = os.path.join(self.basedir, "models", "DreamBooth_LoRA") 49 | self.savedir = os.path.join(self.basedir, "samples", datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S")) 50 | self.savedir_sample = os.path.join(self.savedir, "sample") 51 | os.makedirs(self.savedir, exist_ok=True) 52 | 53 | self.stable_diffusion_list = [] 54 | self.motion_module_list = [] 55 | self.personalized_model_list = [] 56 | 57 | self.refresh_stable_diffusion() 58 | self.refresh_motion_module() 59 | self.refresh_personalized_model() 60 | 61 | # config models 62 | self.tokenizer = None 63 | self.text_encoder = None 64 | self.vae = None 65 | self.unet = None 66 | self.pipeline = None 67 | self.lora_model_state_dict = {} 68 | 69 | self.inference_config = OmegaConf.load("configs/inference/inference.yaml") 70 | 71 | def refresh_stable_diffusion(self): 72 | self.stable_diffusion_list = glob(os.path.join(self.stable_diffusion_dir, "*/")) 73 | 74 | def refresh_motion_module(self): 75 | motion_module_list = glob(os.path.join(self.motion_module_dir, "*.ckpt")) 76 | self.motion_module_list = [os.path.basename(p) for p in motion_module_list] 77 | 78 | def refresh_personalized_model(self): 79 | personalized_model_list = glob(os.path.join(self.personalized_model_dir, "*.safetensors")) 80 | self.personalized_model_list = [os.path.basename(p) for p in personalized_model_list] 81 | 82 | def update_stable_diffusion(self, stable_diffusion_dropdown): 83 | self.tokenizer = CLIPTokenizer.from_pretrained(stable_diffusion_dropdown, subfolder="tokenizer") 84 | self.text_encoder = CLIPTextModel.from_pretrained(stable_diffusion_dropdown, subfolder="text_encoder").cuda() 85 | self.vae = AutoencoderKL.from_pretrained(stable_diffusion_dropdown, subfolder="vae").cuda() 86 | self.unet = UNet3DConditionModel.from_pretrained_2d(stable_diffusion_dropdown, subfolder="unet", unet_additional_kwargs=OmegaConf.to_container(self.inference_config.unet_additional_kwargs)).cuda() 87 | return gr.Dropdown.update() 88 | 89 | def update_motion_module(self, motion_module_dropdown): 90 | if self.unet is None: 91 | gr.Info(f"Please select a pretrained model path.") 92 | return gr.Dropdown.update(value=None) 93 | else: 94 | motion_module_dropdown = os.path.join(self.motion_module_dir, motion_module_dropdown) 95 | motion_module_state_dict = torch.load(motion_module_dropdown, map_location="cpu") 96 | missing, unexpected = self.unet.load_state_dict(motion_module_state_dict, strict=False) 97 | assert len(unexpected) == 0 98 | return gr.Dropdown.update() 99 | 100 | def update_base_model(self, base_model_dropdown): 101 | if self.unet is None: 102 | gr.Info(f"Please select a pretrained model path.") 103 | return gr.Dropdown.update(value=None) 104 | else: 105 | base_model_dropdown = os.path.join(self.personalized_model_dir, base_model_dropdown) 106 | base_model_state_dict = {} 107 | with safe_open(base_model_dropdown, framework="pt", device="cpu") as f: 108 | for key in f.keys(): 109 | base_model_state_dict[key] = f.get_tensor(key) 110 | 111 | converted_vae_checkpoint = convert_ldm_vae_checkpoint(base_model_state_dict, self.vae.config) 112 | self.vae.load_state_dict(converted_vae_checkpoint) 113 | 114 | converted_unet_checkpoint = convert_ldm_unet_checkpoint(base_model_state_dict, self.unet.config) 115 | self.unet.load_state_dict(converted_unet_checkpoint, strict=False) 116 | 117 | self.text_encoder = convert_ldm_clip_checkpoint(base_model_state_dict) 118 | return 
gr.Dropdown.update() 119 | 120 | def update_lora_model(self, lora_model_dropdown): 121 | lora_model_dropdown = os.path.join(self.personalized_model_dir, lora_model_dropdown) 122 | self.lora_model_state_dict = {} 123 | if lora_model_dropdown == "none": pass 124 | else: 125 | with safe_open(lora_model_dropdown, framework="pt", device="cpu") as f: 126 | for key in f.keys(): 127 | self.lora_model_state_dict[key] = f.get_tensor(key) 128 | return gr.Dropdown.update() 129 | 130 | def animate( 131 | self, 132 | stable_diffusion_dropdown, 133 | motion_module_dropdown, 134 | base_model_dropdown, 135 | lora_alpha_slider, 136 | prompt_textbox, 137 | negative_prompt_textbox, 138 | sampler_dropdown, 139 | sample_step_slider, 140 | width_slider, 141 | length_slider, 142 | height_slider, 143 | cfg_scale_slider, 144 | seed_textbox 145 | ): 146 | if self.unet is None: 147 | raise gr.Error(f"Please select a pretrained model path.") 148 | if motion_module_dropdown == "": 149 | raise gr.Error(f"Please select a motion module.") 150 | if base_model_dropdown == "": 151 | raise gr.Error(f"Please select a base DreamBooth model.") 152 | 153 | if is_xformers_available(): self.unet.enable_xformers_memory_efficient_attention() 154 | 155 | pipeline = AnimationPipeline( 156 | vae=self.vae, text_encoder=self.text_encoder, tokenizer=self.tokenizer, unet=self.unet, 157 | scheduler=scheduler_dict[sampler_dropdown](**OmegaConf.to_container(self.inference_config.noise_scheduler_kwargs)) 158 | ).to("cuda") 159 | 160 | if self.lora_model_state_dict != {}: 161 | pipeline = convert_lora(pipeline, self.lora_model_state_dict, alpha=lora_alpha_slider) 162 | 163 | pipeline.to("cuda") 164 | 165 | if seed_textbox != -1 and seed_textbox != "": torch.manual_seed(int(seed_textbox)) 166 | else: torch.seed() 167 | seed = torch.initial_seed() 168 | 169 | sample = pipeline( 170 | prompt_textbox, 171 | negative_prompt = negative_prompt_textbox, 172 | num_inference_steps = sample_step_slider, 173 | guidance_scale = cfg_scale_slider, 174 | width = width_slider, 175 | height = height_slider, 176 | video_length = length_slider, 177 | ).videos 178 | 179 | save_sample_path = os.path.join(self.savedir_sample, f"{sample_idx}.mp4") 180 | save_videos_grid(sample, save_sample_path) 181 | 182 | sample_config = { 183 | "prompt": prompt_textbox, 184 | "n_prompt": negative_prompt_textbox, 185 | "sampler": sampler_dropdown, 186 | "num_inference_steps": sample_step_slider, 187 | "guidance_scale": cfg_scale_slider, 188 | "width": width_slider, 189 | "height": height_slider, 190 | "video_length": length_slider, 191 | "seed": seed 192 | } 193 | json_str = json.dumps(sample_config, indent=4) 194 | with open(os.path.join(self.savedir, "logs.json"), "a") as f: 195 | f.write(json_str) 196 | f.write("\n\n") 197 | 198 | return gr.Video.update(value=save_sample_path) 199 | 200 | 201 | controller = AnimateController() 202 | 203 | 204 | def ui(): 205 | with gr.Blocks(css=css) as demo: 206 | gr.Markdown( 207 | """ 208 | # [AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning](https://arxiv.org/abs/2307.04725) 209 | Yuwei Guo, Ceyuan Yang*, Anyi Rao, Yaohui Wang, Yu Qiao, Dahua Lin, Bo Dai (*Corresponding Author)
210 | [Arxiv Report](https://arxiv.org/abs/2307.04725) | [Project Page](https://animatediff.github.io/) | [Github](https://github.com/guoyww/animatediff/) 211 | """ 212 | ) 213 | with gr.Column(variant="panel"): 214 | gr.Markdown( 215 | """ 216 | ### 1. Model checkpoints (select pretrained model path first). 217 | """ 218 | ) 219 | with gr.Row(): 220 | stable_diffusion_dropdown = gr.Dropdown( 221 | label="Pretrained Model Path", 222 | choices=controller.stable_diffusion_list, 223 | interactive=True, 224 | ) 225 | stable_diffusion_dropdown.change(fn=controller.update_stable_diffusion, inputs=[stable_diffusion_dropdown], outputs=[stable_diffusion_dropdown]) 226 | 227 | stable_diffusion_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton") 228 | def update_stable_diffusion(): 229 | controller.refresh_stable_diffusion() 230 | return gr.Dropdown.update(choices=controller.stable_diffusion_list) 231 | stable_diffusion_refresh_button.click(fn=update_stable_diffusion, inputs=[], outputs=[stable_diffusion_dropdown]) 232 | 233 | with gr.Row(): 234 | motion_module_dropdown = gr.Dropdown( 235 | label="Select motion module", 236 | choices=controller.motion_module_list, 237 | interactive=True, 238 | ) 239 | motion_module_dropdown.change(fn=controller.update_motion_module, inputs=[motion_module_dropdown], outputs=[motion_module_dropdown]) 240 | 241 | motion_module_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton") 242 | def update_motion_module(): 243 | controller.refresh_motion_module() 244 | return gr.Dropdown.update(choices=controller.motion_module_list) 245 | motion_module_refresh_button.click(fn=update_motion_module, inputs=[], outputs=[motion_module_dropdown]) 246 | 247 | base_model_dropdown = gr.Dropdown( 248 | label="Select base Dreambooth model (required)", 249 | choices=controller.personalized_model_list, 250 | interactive=True, 251 | ) 252 | base_model_dropdown.change(fn=controller.update_base_model, inputs=[base_model_dropdown], outputs=[base_model_dropdown]) 253 | 254 | lora_model_dropdown = gr.Dropdown( 255 | label="Select LoRA model (optional)", 256 | choices=["none"] + controller.personalized_model_list, 257 | value="none", 258 | interactive=True, 259 | ) 260 | lora_model_dropdown.change(fn=controller.update_lora_model, inputs=[lora_model_dropdown], outputs=[lora_model_dropdown]) 261 | 262 | lora_alpha_slider = gr.Slider(label="LoRA alpha", value=0.8, minimum=0, maximum=2, interactive=True) 263 | 264 | personalized_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton") 265 | def update_personalized_model(): 266 | controller.refresh_personalized_model() 267 | return [ 268 | gr.Dropdown.update(choices=controller.personalized_model_list), 269 | gr.Dropdown.update(choices=["none"] + controller.personalized_model_list) 270 | ] 271 | personalized_refresh_button.click(fn=update_personalized_model, inputs=[], outputs=[base_model_dropdown, lora_model_dropdown]) 272 | 273 | with gr.Column(variant="panel"): 274 | gr.Markdown( 275 | """ 276 | ### 2. Configs for AnimateDiff. 
277 | """ 278 | ) 279 | 280 | prompt_textbox = gr.Textbox(label="Prompt", lines=2) 281 | negative_prompt_textbox = gr.Textbox(label="Negative prompt", lines=2) 282 | 283 | with gr.Row().style(equal_height=False): 284 | with gr.Column(): 285 | with gr.Row(): 286 | sampler_dropdown = gr.Dropdown(label="Sampling method", choices=list(scheduler_dict.keys()), value=list(scheduler_dict.keys())[0]) 287 | sample_step_slider = gr.Slider(label="Sampling steps", value=25, minimum=10, maximum=100, step=1) 288 | 289 | width_slider = gr.Slider(label="Width", value=512, minimum=256, maximum=1024, step=64) 290 | height_slider = gr.Slider(label="Height", value=512, minimum=256, maximum=1024, step=64) 291 | length_slider = gr.Slider(label="Animation length", value=16, minimum=8, maximum=24, step=1) 292 | cfg_scale_slider = gr.Slider(label="CFG Scale", value=7.5, minimum=0, maximum=20) 293 | 294 | with gr.Row(): 295 | seed_textbox = gr.Textbox(label="Seed", value=-1) 296 | seed_button = gr.Button(value="\U0001F3B2", elem_classes="toolbutton") 297 | seed_button.click(fn=lambda: gr.Textbox.update(value=random.randint(1, 1e8)), inputs=[], outputs=[seed_textbox]) 298 | 299 | generate_button = gr.Button(value="Generate", variant='primary') 300 | 301 | result_video = gr.Video(label="Generated Animation", interactive=False) 302 | 303 | generate_button.click( 304 | fn=controller.animate, 305 | inputs=[ 306 | stable_diffusion_dropdown, 307 | motion_module_dropdown, 308 | base_model_dropdown, 309 | lora_alpha_slider, 310 | prompt_textbox, 311 | negative_prompt_textbox, 312 | sampler_dropdown, 313 | sample_step_slider, 314 | width_slider, 315 | length_slider, 316 | height_slider, 317 | cfg_scale_slider, 318 | seed_textbox, 319 | ], 320 | outputs=[result_video] 321 | ) 322 | 323 | return demo 324 | 325 | 326 | if __name__ == "__main__": 327 | demo = ui() 328 | demo.launch(share=True) 329 | -------------------------------------------------------------------------------- /AnimateDiff/configs/inference/inference-v1.yaml: -------------------------------------------------------------------------------- 1 | unet_additional_kwargs: 2 | unet_use_cross_frame_attention: false 3 | unet_use_temporal_attention: false 4 | use_motion_module: true 5 | motion_module_resolutions: [1,2,4,8] 6 | motion_module_mid_block: false 7 | motion_module_decoder_only: false 8 | motion_module_type: "Vanilla" 9 | use_spatial_adapter: true 10 | 11 | motion_module_kwargs: 12 | num_attention_heads: 8 13 | num_transformer_block: 1 14 | attention_block_types: [ "Temporal_Self", "Temporal_Self" ] 15 | temporal_position_encoding: true 16 | temporal_position_encoding_max_len: 24 17 | temporal_attention_dim_div: 1 18 | use_motion_adapter: 1 19 | motion_adapter_scale: 0 20 | 21 | 22 | noise_scheduler_kwargs: 23 | beta_start: 0.00085 24 | beta_end: 0.012 25 | beta_schedule: "linear" 26 | steps_offset: 1 27 | clip_sample: False 28 | -------------------------------------------------------------------------------- /AnimateDiff/configs/inference/inference-v2.yaml: -------------------------------------------------------------------------------- 1 | unet_additional_kwargs: 2 | use_inflated_groupnorm: true 3 | unet_use_cross_frame_attention: false 4 | unet_use_temporal_attention: false 5 | use_motion_module: true 6 | motion_module_resolutions: [1,2,4,8] 7 | motion_module_mid_block: true 8 | motion_module_decoder_only: false 9 | motion_module_type: "Vanilla" 10 | 11 | motion_module_kwargs: 12 | num_attention_heads: 8 13 | num_transformer_block: 1 14 | 
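    # two temporal self-attention blocks per motion module; temporal_position_encoding_max_len is 32
    # here (24 in the v1 config), i.e. the longest frame window the temporal positional encoding covers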
attention_block_types: [ "Temporal_Self", "Temporal_Self" ] 15 | temporal_position_encoding: true 16 | temporal_position_encoding_max_len: 32 17 | temporal_attention_dim_div: 1 18 | 19 | noise_scheduler_kwargs: 20 | beta_start: 0.00085 21 | beta_end: 0.012 22 | beta_schedule: "linear" 23 | steps_offset: 1 24 | clip_sample: False 25 | -------------------------------------------------------------------------------- /AnimateDiff/configs/inference/inference-v3.yaml: -------------------------------------------------------------------------------- 1 | unet_additional_kwargs: 2 | use_inflated_groupnorm: true 3 | use_motion_module: true 4 | motion_module_resolutions: [1,2,4,8] 5 | motion_module_mid_block: false 6 | motion_module_type: Vanilla 7 | 8 | motion_module_kwargs: 9 | num_attention_heads: 8 10 | num_transformer_block: 1 11 | attention_block_types: [ "Temporal_Self", "Temporal_Self" ] 12 | temporal_position_encoding: true 13 | temporal_position_encoding_max_len: 32 14 | temporal_attention_dim_div: 1 15 | zero_initialize: true 16 | 17 | noise_scheduler_kwargs: 18 | beta_start: 0.00085 19 | beta_end: 0.012 20 | beta_schedule: "linear" 21 | steps_offset: 1 22 | clip_sample: False 23 | -------------------------------------------------------------------------------- /AnimateDiff/configs/inference/sparsectrl/image_condition.yaml: -------------------------------------------------------------------------------- 1 | controlnet_additional_kwargs: 2 | set_noisy_sample_input_to_zero: true 3 | use_simplified_condition_embedding: false 4 | conditioning_channels: 3 5 | 6 | use_motion_module: true 7 | motion_module_resolutions: [1,2,4,8] 8 | motion_module_mid_block: false 9 | motion_module_type: "Vanilla" 10 | 11 | motion_module_kwargs: 12 | num_attention_heads: 8 13 | num_transformer_block: 1 14 | attention_block_types: [ "Temporal_Self" ] 15 | temporal_position_encoding: true 16 | temporal_position_encoding_max_len: 32 17 | temporal_attention_dim_div: 1 18 | -------------------------------------------------------------------------------- /AnimateDiff/configs/inference/sparsectrl/latent_condition.yaml: -------------------------------------------------------------------------------- 1 | controlnet_additional_kwargs: 2 | set_noisy_sample_input_to_zero: true 3 | use_simplified_condition_embedding: true 4 | conditioning_channels: 4 5 | 6 | use_motion_module: true 7 | motion_module_resolutions: [1,2,4,8] 8 | motion_module_mid_block: false 9 | motion_module_type: "Vanilla" 10 | 11 | motion_module_kwargs: 12 | num_attention_heads: 8 13 | num_transformer_block: 1 14 | attention_block_types: [ "Temporal_Self" ] 15 | temporal_position_encoding: true 16 | temporal_position_encoding_max_len: 32 17 | temporal_attention_dim_div: 1 18 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-1-ToonYou.yaml: -------------------------------------------------------------------------------- 1 | # motion module v1_14 2 | - dreambooth_path: "models/DreamBooth_LoRA/toonyou_beta6.safetensors" 3 | lora_model_path: "" 4 | 5 | inference_config: "configs/inference/inference-v1.yaml" 6 | motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 7 | 8 | seed: [10788741199826055526, 6520604954829636163, 6519455744612555650, 16372571278361863751] 9 | steps: 25 10 | guidance_scale: 8 11 | 12 | prompt: 13 | - "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress" 14 | - "masterpiece, best quality, 1girl, solo, cherry blossoms, 
hanami, pink flower, white flower, spring season, wisteria, petals, flower, plum blossoms, outdoors, falling petals, white hair, black eyes," 15 | - "best quality, masterpiece, 1boy, formal, abstract, looking at viewer, masculine, marble pattern" 16 | - "best quality, masterpiece, 1girl, cloudy sky, dandelion, contrapposto, alternate hairstyle," 17 | 18 | n_prompt: 19 | - "worst quality, low quality, letterboxed" 20 | 21 | 22 | # motion module v1_15 23 | - dreambooth_path: "models/DreamBooth_LoRA/toonyou_beta6.safetensors" 24 | lora_model_path: "" 25 | 26 | inference_config: "configs/inference/inference-v1.yaml" 27 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 28 | 29 | seed: [10788741199826055526, 6520604954829636163, 6519455744612555650, 16372571278361863751] 30 | steps: 25 31 | guidance_scale: 8 32 | 33 | prompt: 34 | - "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress" 35 | - "masterpiece, best quality, 1girl, solo, cherry blossoms, hanami, pink flower, white flower, spring season, wisteria, petals, flower, plum blossoms, outdoors, falling petals, white hair, black eyes," 36 | - "best quality, masterpiece, 1boy, formal, abstract, looking at viewer, masculine, marble pattern" 37 | - "best quality, masterpiece, 1girl, cloudy sky, dandelion, contrapposto, alternate hairstyle," 38 | 39 | n_prompt: 40 | - "worst quality, low quality, letterboxed" 41 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-2-Lyriel.yaml: -------------------------------------------------------------------------------- 1 | # motion module v1_14 2 | - dreambooth_path: "models/DreamBooth_LoRA/lyriel_v16.safetensors" 3 | lora_model_path: "" 4 | 5 | inference_config: "configs/inference/inference-v1.yaml" 6 | motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 7 | 8 | seed: [10917152860782582783, 6399018107401806238, 15875751942533906793, 6653196880059936551] 9 | steps: 25 10 | guidance_scale: 8 11 | 12 | prompt: 13 | - "dark shot, epic realistic, portrait of halo, sunglasses, blue eyes, tartan scarf, white hair by atey ghailan, by greg rutkowski, by greg tocchini, by james gilleard, by joe fenton, by kaethe butcher, gradient yellow, black, brown and magenta color scheme, grunge aesthetic!!! graffiti tag wall background, art by greg rutkowski and artgerm, soft cinematic light, adobe lightroom, photolab, hdr, intricate, highly detailed, depth of field, faded, neutral colors, hdr, muted colors, hyperdetailed, artstation, cinematic, warm lights, dramatic light, intricate details, complex background, rutkowski, teal and orange" 14 | - "A forbidden castle high up in the mountains, pixel art, intricate details2, hdr, intricate details, hyperdetailed5, natural skin texture, hyperrealism, soft light, sharp, game art, key visual, surreal" 15 | - "dark theme, medieval portrait of a man sharp features, grim, cold stare, dark colors, Volumetric lighting, baroque oil painting by Greg Rutkowski, Artgerm, WLOP, Alphonse Mucha dynamic lighting hyperdetailed intricately detailed, hdr, muted colors, complex background, hyperrealism, hyperdetailed, amandine van ray" 16 | - "As I have gone alone in there and with my treasures bold, I can keep my secret where and hint of riches new and old. Begin it where warm waters halt and take it in a canyon down, not far but too far to walk, put in below the home of brown." 
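  # the four prompts above, the four seeds, and the four negative prompts below are intended to be
  # paired one-to-one per generated sample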
17 | 18 | n_prompt: 19 | - "3d, cartoon, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, young, loli, elf, 3d, illustration" 20 | - "3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular" 21 | - "dof, grayscale, black and white, bw, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular,badhandsv5-neg, By bad artist -neg 1, monochrome" 22 | - "holding an item, cowboy, hat, cartoon, 3d, disfigured, bad art, deformed,extra limbs,close up,b&w, wierd colors, blurry, duplicate, morbid, mutilated, [out of frame], extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, ugly, blurry, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, out of frame, ugly, extra limbs, bad anatomy, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, mutated hands, fused fingers, too many fingers, long neck, Photoshop, video game, ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, mutation, mutated, extra limbs, extra legs, extra arms, disfigured, deformed, cross-eye, body out of frame, blurry, bad art, bad anatomy, 3d render" 23 | 24 | 25 | # motion module v1_15 26 | - dreambooth_path: "models/DreamBooth_LoRA/lyriel_v16.safetensors" 27 | lora_model_path: "" 28 | 29 | inference_config: "configs/inference/inference-v1.yaml" 30 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 31 | 32 | seed: [10917152860782582783, 6399018107401806238, 15875751942533906793, 6653196880059936551] 33 | steps: 25 34 | guidance_scale: 8 35 | 36 | prompt: 37 | - "dark shot, epic realistic, portrait of halo, sunglasses, blue eyes, tartan scarf, white hair by atey ghailan, by greg rutkowski, by greg tocchini, by james gilleard, by joe fenton, by kaethe butcher, gradient yellow, black, brown and magenta color scheme, grunge aesthetic!!! graffiti tag wall background, art by greg rutkowski and artgerm, soft cinematic light, adobe lightroom, photolab, hdr, intricate, highly detailed, depth of field, faded, neutral colors, hdr, muted colors, hyperdetailed, artstation, cinematic, warm lights, dramatic light, intricate details, complex background, rutkowski, teal and orange" 38 | - "A forbidden castle high up in the mountains, pixel art, intricate details2, hdr, intricate details, hyperdetailed5, natural skin texture, hyperrealism, soft light, sharp, game art, key visual, surreal" 39 | - "dark theme, medieval portrait of a man sharp features, grim, cold stare, dark colors, Volumetric lighting, baroque oil painting by Greg Rutkowski, Artgerm, WLOP, Alphonse Mucha dynamic lighting hyperdetailed intricately detailed, hdr, muted colors, complex background, hyperrealism, hyperdetailed, amandine van ray" 40 | - "As I have gone alone in there and with my treasures bold, I can keep my secret where and hint of riches new and old. Begin it where warm waters halt and take it in a canyon down, not far but too far to walk, put in below the home of brown." 
41 | 42 | n_prompt: 43 | - "3d, cartoon, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, young, loli, elf, 3d, illustration" 44 | - "3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular" 45 | - "dof, grayscale, black and white, bw, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular,badhandsv5-neg, By bad artist -neg 1, monochrome" 46 | - "holding an item, cowboy, hat, cartoon, 3d, disfigured, bad art, deformed,extra limbs,close up,b&w, wierd colors, blurry, duplicate, morbid, mutilated, [out of frame], extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, ugly, blurry, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, out of frame, ugly, extra limbs, bad anatomy, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, mutated hands, fused fingers, too many fingers, long neck, Photoshop, video game, ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, mutation, mutated, extra limbs, extra legs, extra arms, disfigured, deformed, cross-eye, body out of frame, blurry, bad art, bad anatomy, 3d render" 47 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-3-RcnzCartoon.yaml: -------------------------------------------------------------------------------- 1 | # motion module v1_14 2 | - dreambooth_path: "models/DreamBooth_LoRA/rcnzCartoon3d_v10.safetensors" 3 | lora_model_path: "" 4 | 5 | inference_config: "configs/inference/inference-v1.yaml" 6 | motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 7 | 8 | seed: [16931037867122267877, 2094308009433392066, 4292543217695451092, 15572665120852309890] 9 | steps: 25 10 | guidance_scale: 8 11 | 12 | prompt: 13 | - "Jane Eyre with headphones, natural skin texture,4mm,k textures, soft cinematic light, adobe lightroom, photolab, hdr, intricate, elegant, highly detailed, sharp focus, cinematic look, soothing tones, insane details, intricate details, hyperdetailed, low contrast, soft cinematic light, dim colors, exposure blend, hdr, faded" 14 | - "close up Portrait photo of muscular bearded guy in a worn mech suit, light bokeh, intricate, steel metal [rust], elegant, sharp focus, photo by greg rutkowski, soft lighting, vibrant colors, masterpiece, streets, detailed face" 15 | - "absurdres, photorealistic, masterpiece, a 30 year old man with gold framed, aviator reading glasses and a black hooded jacket and a beard, professional photo, a character portrait, altermodern, detailed eyes, detailed lips, detailed face, grey eyes" 16 | - "a golden labrador, warm vibrant colours, natural lighting, dappled lighting, diffused lighting, absurdres, highres,k, uhd, hdr, rtx, unreal, octane render, RAW photo, photorealistic, global illumination, subsurface scattering" 17 | 18 | n_prompt: 19 | - "deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, mutated hands and fingers, disconnected limbs, mutation, mutated, ugly, disgusting, 
blurry, amputation" 20 | - "nude, cross eyed, tongue, open mouth, inside, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, red eyes, muscular" 21 | - "easynegative, cartoon, anime, sketches, necklace, earrings worst quality, low quality, normal quality, bad anatomy, bad hands, shiny skin, error, missing fingers, extra digit, fewer digits, jpeg artifacts, signature, watermark, username, blurry, chubby, anorectic, bad eyes, old, wrinkled skin, red skin, photograph By bad artist -neg, big eyes, muscular face," 22 | - "beard, EasyNegative, lowres, chromatic aberration, depth of field, motion blur, blurry, bokeh, bad quality, worst quality, multiple arms, badhand" 23 | 24 | 25 | # motion module v1_15 26 | - dreambooth_path: "models/DreamBooth_LoRA/rcnzCartoon3d_v10.safetensors" 27 | lora_model_path: "" 28 | 29 | inference_config: "configs/inference/inference-v1.yaml" 30 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 31 | 32 | seed: [16931037867122267877, 2094308009433392066, 4292543217695451092, 15572665120852309890] 33 | steps: 25 34 | guidance_scale: 8 35 | 36 | prompt: 37 | - "Jane Eyre with headphones, natural skin texture,4mm,k textures, soft cinematic light, adobe lightroom, photolab, hdr, intricate, elegant, highly detailed, sharp focus, cinematic look, soothing tones, insane details, intricate details, hyperdetailed, low contrast, soft cinematic light, dim colors, exposure blend, hdr, faded" 38 | - "close up Portrait photo of muscular bearded guy in a worn mech suit, light bokeh, intricate, steel metal [rust], elegant, sharp focus, photo by greg rutkowski, soft lighting, vibrant colors, masterpiece, streets, detailed face" 39 | - "absurdres, photorealistic, masterpiece, a 30 year old man with gold framed, aviator reading glasses and a black hooded jacket and a beard, professional photo, a character portrait, altermodern, detailed eyes, detailed lips, detailed face, grey eyes" 40 | - "a golden labrador, warm vibrant colours, natural lighting, dappled lighting, diffused lighting, absurdres, highres,k, uhd, hdr, rtx, unreal, octane render, RAW photo, photorealistic, global illumination, subsurface scattering" 41 | 42 | n_prompt: 43 | - "deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, mutated hands and fingers, disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation" 44 | - "nude, cross eyed, tongue, open mouth, inside, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, red eyes, muscular" 45 | - "easynegative, cartoon, anime, sketches, necklace, earrings worst quality, low quality, normal quality, bad anatomy, bad hands, shiny skin, error, missing fingers, extra digit, fewer digits, jpeg artifacts, signature, watermark, username, blurry, chubby, anorectic, bad eyes, old, wrinkled skin, red skin, photograph By bad artist -neg, big eyes, muscular face," 46 | - "beard, EasyNegative, lowres, chromatic aberration, depth of field, motion blur, blurry, bokeh, bad quality, worst quality, multiple arms, badhand" 47 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-4-MajicMix.yaml: -------------------------------------------------------------------------------- 1 | # motion module v1_14 2 | - 
dreambooth_path: "models/DreamBooth_LoRA/majicmixRealistic_v5Preview.safetensors" 3 | lora_model_path: "" 4 | 5 | inference_config: "configs/inference/inference-v1.yaml" 6 | motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 7 | 8 | seed: [1572448948722921032, 1099474677988590681, 6488833139725635347, 18339859844376517918] 9 | steps: 25 10 | guidance_scale: 8 11 | 12 | prompt: 13 | - "1girl, offshoulder, light smile, shiny skin best quality, masterpiece, photorealistic" 14 | - "best quality, masterpiece, photorealistic, 1boy, 50 years old beard, dramatic lighting" 15 | - "best quality, masterpiece, photorealistic, 1girl, light smile, shirt with collars, waist up, dramatic lighting, from below" 16 | - "male, man, beard, bodybuilder, skinhead,cold face, tough guy, cowboyshot, tattoo, french windows, luxury hotel masterpiece, best quality, photorealistic" 17 | 18 | n_prompt: 19 | - "ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, watermark, moles" 20 | - "nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome" 21 | - "nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome" 22 | - "nude, nsfw, ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, monochrome, grayscale watermark, moles, people" 23 | 24 | 25 | # motion module v1_15 26 | - dreambooth_path: "models/DreamBooth_LoRA/majicmixRealistic_v5Preview.safetensors" 27 | lora_model_path: "" 28 | 29 | inference_config: "configs/inference/inference-v1.yaml" 30 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 31 | 32 | seed: [1572448948722921032, 1099474677988590681, 6488833139725635347, 18339859844376517918] 33 | steps: 25 34 | guidance_scale: 8 35 | 36 | prompt: 37 | - "1girl, offshoulder, light smile, shiny skin best quality, masterpiece, photorealistic" 38 | - "best quality, masterpiece, photorealistic, 1boy, 50 years old beard, dramatic lighting" 39 | - "best quality, masterpiece, photorealistic, 1girl, light smile, shirt with collars, waist up, dramatic lighting, from below" 40 | - "male, man, beard, bodybuilder, skinhead,cold face, tough guy, cowboyshot, tattoo, french windows, luxury hotel masterpiece, best quality, photorealistic" 41 | 42 | n_prompt: 43 | - "ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, watermark, moles" 44 | - "nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome" 45 | - "nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome" 46 | - "nude, nsfw, ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, monochrome, grayscale watermark, moles, people" 47 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-5-RealisticVision.yaml: -------------------------------------------------------------------------------- 1 | # # motion module v1_14 2 | # - dreambooth_path: "./models/DreamBooth_LoRA/alex.safetensors" 3 | # lora_model_path: "" 4 | 5 | # inference_config: "configs/inference/inference-v1.yaml" 6 | # motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 7 | # motion_adapter_ckpt: "./outputs/motion_adapter_training-2024-07-30T10-47-27/checkpoints/checkpoint-max-steps-4000.ckpt" 8 | # 
spatial_adapter_ckpt: "./outputs/spatial_adapter_training-2024-07-30T12-12-31/checkpoints/checkpoint-epoch-15.ckpt" 9 | 10 | # seed: [5658137986800322009, 12099779162349365895, 10499524853910852697, 16768009035333711932] 11 | # steps: 25 12 | # guidance_scale: 8 13 | 14 | # prompt: 15 | # - "b&w photo of ohwx man in black clothes, bald, face, half body, body, high detailed skin, skin pores, coastline, overcast weather, wind, waves, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 16 | # - "close up photo of a ohwx man, forest, haze, halation, bloom, dramatic atmosphere, centred, rule of thirds, 200mm 1.4f macro shot" 17 | # - "photo of ohwx man, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 18 | # - "night, b&w photo of ohwx man in the old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain" 19 | 20 | # n_prompt: 21 | # - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 22 | # - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 23 | # - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 24 | # - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, art, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 25 | 26 | 27 | # motion module v1_15 28 | - dreambooth_path: "./models/DreamBooth_LoRA/alex.safetensors" 29 | lora_model_path: "" 30 | 31 | inference_config: "configs/inference/inference-v1.yaml" 32 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 33 | motion_adapter_ckpt: "./outputs/motion_adapter_training-2024-07-30T10-47-27/checkpoints/checkpoint-max-steps-4000.ckpt" 34 | spatial_adapter_ckpt: "./outputs/spatial_adapter_training-2024-07-30T12-12-31/checkpoints/checkpoint-epoch-15.ckpt" 35 | 36 | seed: [5658137986800322009, 12099779162349365895, 10499524853910852697, 16768009035333711932] 37 | steps: 25 38 | guidance_scale: 8 39 | 40 | prompt: 41 | - "b&w photo of ohwx man in black clothes, bald, face, half body, body, high detailed skin, skin pores, coastline, overcast weather, wind, waves, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 42 | - "close up 
photo of a ohwx man, forest, haze, halation, bloom, dramatic atmosphere, centred, rule of thirds, 200mm 1.4f macro shot" 43 | - "photo of ohwx man, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 44 | - "night, b&w photo of ohwx man in the old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain" 45 | 46 | n_prompt: 47 | - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 48 | - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 49 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 50 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, art, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 51 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-6-Tusun.yaml: -------------------------------------------------------------------------------- 1 | # motion module v1_14 2 | - dreambooth_path: "models/DreamBooth_LoRA/moonfilm_reality20.safetensors" 3 | lora_model_path: "models/DreamBooth_LoRA/TUSUN.safetensors" 4 | lora_alpha: 0.6 5 | 6 | inference_config: "configs/inference/inference-v1.yaml" 7 | motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 8 | 9 | seed: [10154078483724687116, 2664393535095473805, 4231566096207622938, 1713349740448094493] 10 | steps: 25 11 | guidance_scale: 8 12 | 13 | prompt: 14 | - "tusuncub with its mouth open, blurry, open mouth, fangs, photo background, looking at viewer, tongue, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing" 15 | - "cute tusun with a blurry background, black background, simple background, signature, face, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, 
film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing" 16 | - "cut tusuncub walking in the snow, blurry, looking at viewer, depth of field, blurry background, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing" 17 | - "character design, cyberpunk tusun kitten wearing astronaut suit, sci-fic, realistic eye color and details, fluffy, big head, science fiction, communist ideology, Cyborg, fantasy, intense angle, soft lighting, photograph, 4k, hyper detailed, portrait wallpaper, realistic, photo-realistic, DSLR, 24 Megapixels, Full Frame, vibrant details, octane render, finely detail, best quality, incredibly absurdres, robotic parts, rim light, vibrant details, luxurious cyberpunk, hyperrealistic, cable electric wires, microchip, full body" 18 | 19 | n_prompt: 20 | - "worst quality, low quality, deformed, distorted, disfigured, bad eyes, bad anatomy, disconnected limbs, wrong body proportions, low quality, worst quality, text, watermark, signatre, logo, illustration, painting, cartoons, ugly, easy_negative" 21 | 22 | 23 | # motion module v1_15 24 | - dreambooth_path: "models/DreamBooth_LoRA/moonfilm_reality20.safetensors" 25 | lora_model_path: "models/DreamBooth_LoRA/TUSUN.safetensors" 26 | lora_alpha: 0.6 27 | 28 | inference_config: "configs/inference/inference-v1.yaml" 29 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 30 | 31 | seed: [10154078483724687116, 2664393535095473805, 4231566096207622938, 1713349740448094493] 32 | steps: 25 33 | guidance_scale: 8 34 | 35 | prompt: 36 | - "tusuncub with its mouth open, blurry, open mouth, fangs, photo background, looking at viewer, tongue, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing" 37 | - "cute tusun with a blurry background, black background, simple background, signature, face, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing" 38 | - "cut tusuncub walking in the snow, blurry, looking at viewer, depth of field, blurry background, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing" 39 | - "character design, cyberpunk tusun kitten wearing astronaut suit, sci-fic, realistic eye color and details, fluffy, big head, science fiction, communist ideology, Cyborg, fantasy, intense angle, soft lighting, photograph, 4k, hyper detailed, portrait wallpaper, realistic, photo-realistic, DSLR, 24 Megapixels, Full Frame, vibrant details, octane render, finely detail, best quality, incredibly absurdres, 
robotic parts, rim light, vibrant details, luxurious cyberpunk, hyperrealistic, cable electric wires, microchip, full body" 40 | 41 | n_prompt: 42 | - "worst quality, low quality, deformed, distorted, disfigured, bad eyes, bad anatomy, disconnected limbs, wrong body proportions, low quality, worst quality, text, watermark, signatre, logo, illustration, painting, cartoons, ugly, easy_negative" 43 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-7-FilmVelvia.yaml: -------------------------------------------------------------------------------- 1 | # motion module v1_14 2 | - dreambooth_path: "models/DreamBooth_LoRA/majicmixRealistic_v4.safetensors" 3 | lora_model_path: "models/DreamBooth_LoRA/FilmVelvia2.safetensors" 4 | lora_alpha: 0.6 5 | 6 | inference_config: "configs/inference/inference-v1.yaml" 7 | motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 8 | 9 | seed: [358675358833372813, 3519455280971923743, 11684545350557985081, 8696855302100399877] 10 | steps: 25 11 | guidance_scale: 8 12 | 13 | prompt: 14 | - "a woman standing on the side of a road at night,girl, long hair, motor vehicle, car, looking at viewer, ground vehicle, night, hands in pockets, blurry background, coat, black hair, parted lips, bokeh, jacket, brown hair, outdoors, red lips, upper body, artist name" 15 | - ", dark shot,0mm, portrait quality of a arab man worker,boy, wasteland that stands out vividly against the background of the desert, barren landscape, closeup, moles skin, soft light, sharp, exposure blend, medium shot, bokeh, hdr, high contrast, cinematic, teal and orange5, muted colors, dim colors, soothing tones, low saturation, hyperdetailed, noir" 16 | - "fashion photography portrait of 1girl, offshoulder, fluffy short hair, soft light, rim light, beautiful shadow, low key, photorealistic, raw photo, natural skin texture, realistic eye and face details, hyperrealism, ultra high res, 4K, Best quality, masterpiece, necklace, cleavage, in the dark" 17 | - "In this lighthearted portrait, a woman is dressed as a fierce warrior, armed with an arsenal of paintbrushes and palette knives. Her war paint is composed of thick, vibrant strokes of color, and her armor is made of paint tubes and paint-splattered canvases. She stands victoriously atop a mountain of conquered blank canvases, with a beautiful, colorful landscape behind her, symbolizing the power of art and creativity. 
bust Portrait, close-up, Bright and transparent scene lighting, " 18 | 19 | n_prompt: 20 | - "cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 21 | - "cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 22 | - "wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 23 | - "wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 24 | 25 | 26 | # motion module v1_15 27 | - dreambooth_path: "models/DreamBooth_LoRA/majicmixRealistic_v4.safetensors" 28 | lora_model_path: "models/DreamBooth_LoRA/FilmVelvia2.safetensors" 29 | lora_alpha: 0.6 30 | 31 | inference_config: "configs/inference/inference-v1.yaml" 32 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 33 | 34 | seed: [358675358833372813, 3519455280971923743, 11684545350557985081, 8696855302100399877] 35 | steps: 25 36 | guidance_scale: 8 37 | 38 | prompt: 39 | - "a woman standing on the side of a road at night,girl, long hair, motor vehicle, car, looking at viewer, ground vehicle, night, hands in pockets, blurry background, coat, black hair, parted lips, bokeh, jacket, brown hair, outdoors, red lips, upper body, artist name" 40 | - ", dark shot,0mm, portrait quality of a arab man worker,boy, wasteland that stands out vividly against the background of the desert, barren landscape, closeup, moles skin, soft light, sharp, exposure blend, medium shot, bokeh, hdr, high contrast, cinematic, teal and orange5, muted colors, dim colors, soothing tones, low saturation, hyperdetailed, noir" 41 | - "fashion photography portrait of 1girl, offshoulder, fluffy short hair, soft light, rim light, beautiful shadow, low key, photorealistic, raw photo, natural skin texture, realistic eye and face details, hyperrealism, ultra high res, 4K, Best quality, masterpiece, necklace, cleavage, in the dark" 42 | - "In this lighthearted portrait, a woman is dressed as a fierce warrior, armed with an arsenal of paintbrushes and palette knives. Her war paint is composed of thick, vibrant strokes of color, and her armor is made of paint tubes and paint-splattered canvases. She stands victoriously atop a mountain of conquered blank canvases, with a beautiful, colorful landscape behind her, symbolizing the power of art and creativity. 
bust Portrait, close-up, Bright and transparent scene lighting, " 43 | 44 | n_prompt: 45 | - "cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 46 | - "cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 47 | - "wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 48 | - "wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 49 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-8-GhibliBackground.yaml: -------------------------------------------------------------------------------- 1 | # motion module v1_14 2 | - dreambooth_path: "models/DreamBooth_LoRA/CounterfeitV30_25.safetensors" 3 | lora_model_path: "models/DreamBooth_LoRA/lora_Ghibli_n3.safetensors" 4 | lora_alpha: 1.0 5 | 6 | inference_config: "configs/inference/inference-v1.yaml" 7 | motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 8 | 9 | seed: [8775748474469046618, 5893874876080607656, 11911465742147695752, 12437784838692000640] 10 | steps: 25 11 | guidance_scale: 8 12 | 13 | prompt: 14 | - "best quality,single build,architecture, blue_sky, building,cloudy_sky, day, fantasy, fence, field, house, build,architecture,landscape, moss, outdoors, overgrown, path, river, road, rock, scenery, sky, sword, tower, tree, waterfall" 15 | - "black_border, building, city, day, fantasy, ice, landscape, letterboxed, mountain, ocean, outdoors, planet, scenery, ship, snow, snowing, water, watercraft, waterfall, winter" 16 | - ",mysterious sea area, fantasy,build,concept" 17 | - "Tomb Raider,Scenography,Old building" 18 | 19 | n_prompt: 20 | - "worst quality, low quality, letterboxed" 21 | 22 | 23 | # motion module v1_15 24 | - dreambooth_path: "models/DreamBooth_LoRA/CounterfeitV30_25.safetensors" 25 | lora_model_path: "models/DreamBooth_LoRA/lora_Ghibli_n3.safetensors" 26 | lora_alpha: 1.0 27 | 28 | inference_config: "configs/inference/inference-v1.yaml" 29 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 30 | 31 | seed: [8775748474469046618, 5893874876080607656, 11911465742147695752, 12437784838692000640] 32 | steps: 25 33 | guidance_scale: 8 34 | 35 | prompt: 36 | - "best quality,single build,architecture, blue_sky, building,cloudy_sky, day, fantasy, fence, field, house, build,architecture,landscape, moss, outdoors, overgrown, path, river, road, rock, scenery, sky, sword, tower, tree, waterfall" 37 | - "black_border, building, city, day, fantasy, ice, 
landscape, letterboxed, mountain, ocean, outdoors, planet, scenery, ship, snow, snowing, water, watercraft, waterfall, winter" 38 | - ",mysterious sea area, fantasy,build,concept" 39 | - "Tomb Raider,Scenography,Old building" 40 | 41 | n_prompt: 42 | - "worst quality, low quality, letterboxed" 43 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v2/v2-1-RealisticVision.yaml: -------------------------------------------------------------------------------- 1 | - inference_config: "configs/inference/inference-v2.yaml" 2 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 3 | 4 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 5 | lora_model_path: "" 6 | 7 | seed: [13100322578370451493, 14752961627088720670, 9329399085567825781, 16987697414827649302] 8 | steps: 25 9 | guidance_scale: 7.5 10 | 11 | prompt: 12 | - "b&w photo of 42 y.o man in black clothes, bald, face, half body, body, high detailed skin, skin pores, coastline, overcast weather, wind, waves, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 13 | - "close up photo of a rabbit, forest, haze, halation, bloom, dramatic atmosphere, centred, rule of thirds, 200mm 1.4f macro shot" 14 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 15 | - "night, b&w photo of old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain" 16 | 17 | n_prompt: 18 | - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 19 | - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 20 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 21 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, art, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 22 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v2/v2-2-RealisticVision-MotionLoRA.yaml: -------------------------------------------------------------------------------- 1 | # ZoomIn 2 | - inference_config: 
"configs/inference/inference-v2.yaml" 3 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 4 | 5 | motion_module_lora_configs: 6 | - path: "models/MotionLoRA/v2_lora_ZoomIn.ckpt" 7 | alpha: 1.0 8 | 9 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 10 | lora_model_path: "" 11 | 12 | seed: 45987230 13 | steps: 25 14 | guidance_scale: 7.5 15 | 16 | prompt: 17 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 18 | 19 | n_prompt: 20 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 21 | 22 | 23 | # ZoomOut 24 | - inference_config: "configs/inference/inference-v2.yaml" 25 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 26 | 27 | motion_module_lora_configs: 28 | - path: "models/MotionLoRA/v2_lora_ZoomOut.ckpt" 29 | alpha: 1.0 30 | 31 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 32 | lora_model_path: "" 33 | 34 | seed: 45987230 35 | steps: 25 36 | guidance_scale: 7.5 37 | 38 | prompt: 39 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 40 | 41 | n_prompt: 42 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 43 | 44 | 45 | # PanLeft 46 | - inference_config: "configs/inference/inference-v2.yaml" 47 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 48 | 49 | motion_module_lora_configs: 50 | - path: "models/MotionLoRA/v2_lora_PanLeft.ckpt" 51 | alpha: 1.0 52 | 53 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 54 | lora_model_path: "" 55 | 56 | seed: 45987230 57 | steps: 25 58 | guidance_scale: 7.5 59 | 60 | prompt: 61 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 62 | 63 | n_prompt: 64 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 65 | 66 | 67 | # PanRight 68 | - inference_config: "configs/inference/inference-v2.yaml" 69 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 70 | 71 | motion_module_lora_configs: 72 | - path: "models/MotionLoRA/v2_lora_PanRight.ckpt" 73 | alpha: 1.0 74 | 75 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 76 | lora_model_path: "" 77 | 78 | seed: 45987230 79 | steps: 25 80 | guidance_scale: 7.5 81 | 82 | prompt: 83 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 84 | 85 | n_prompt: 86 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, 
deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 87 | 88 | 89 | # TiltUp 90 | - inference_config: "configs/inference/inference-v2.yaml" 91 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 92 | 93 | motion_module_lora_configs: 94 | - path: "models/MotionLoRA/v2_lora_TiltUp.ckpt" 95 | alpha: 1.0 96 | 97 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 98 | lora_model_path: "" 99 | 100 | seed: 45987230 101 | steps: 25 102 | guidance_scale: 7.5 103 | 104 | prompt: 105 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 106 | 107 | n_prompt: 108 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 109 | 110 | 111 | # TiltDown 112 | - inference_config: "configs/inference/inference-v2.yaml" 113 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 114 | 115 | motion_module_lora_configs: 116 | - path: "models/MotionLoRA/v2_lora_TiltDown.ckpt" 117 | alpha: 1.0 118 | 119 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 120 | lora_model_path: "" 121 | 122 | seed: 45987230 123 | steps: 25 124 | guidance_scale: 7.5 125 | 126 | prompt: 127 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 128 | 129 | n_prompt: 130 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 131 | 132 | 133 | # RollingAnticlockwise 134 | - inference_config: "configs/inference/inference-v2.yaml" 135 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 136 | 137 | motion_module_lora_configs: 138 | - path: "models/MotionLoRA/v2_lora_RollingAnticlockwise.ckpt" 139 | alpha: 1.0 140 | 141 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 142 | lora_model_path: "" 143 | 144 | seed: 45987230 145 | steps: 25 146 | guidance_scale: 7.5 147 | 148 | prompt: 149 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 150 | 151 | n_prompt: 152 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 153 | 154 | 155 | # RollingClockwise 156 | - inference_config: "configs/inference/inference-v2.yaml" 157 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 158 | 159 | motion_module_lora_configs: 160 | - path: "models/MotionLoRA/v2_lora_RollingClockwise.ckpt" 161 | alpha: 1.0 162 | 163 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 164 | lora_model_path: "" 165 | 166 | seed: 45987230 167 | steps: 
25 168 | guidance_scale: 7.5 169 | 170 | prompt: 171 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 172 | 173 | n_prompt: 174 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 175 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v3/v3-1-T2V.yaml: -------------------------------------------------------------------------------- 1 | # 1-animation 2 | - domain_lora_scale: 1.0 3 | adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 4 | dreambooth_path: "" 5 | 6 | inference_config: "configs/inference/inference-v3.yaml" 7 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 8 | 9 | controlnet_config: "configs/inference/sparsectrl/latent_condition.yaml" 10 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt" 11 | 12 | H: 256 13 | W: 384 14 | seed: [123,234] 15 | steps: 25 16 | guidance_scale: 8.5 17 | 18 | controlnet_image_indexs: [0] 19 | controlnet_images: 20 | - "__assets__/demos/image/painting.png" 21 | 22 | prompt: 23 | - an oil painting of a sailboat in the ocean wave 24 | - an oil painting of a sailboat in the ocean wave 25 | n_prompt: 26 | - "worst quality, low quality, letterboxed" 27 | 28 | 29 | # 2-interpolation 30 | - domain_lora_scale: 1.0 31 | adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 32 | dreambooth_path: "" 33 | 34 | inference_config: "configs/inference/inference-v3.yaml" 35 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 36 | 37 | controlnet_config: "configs/inference/sparsectrl/latent_condition.yaml" 38 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt" 39 | 40 | H: 256 41 | W: 384 42 | seed: [123,234] 43 | steps: 25 44 | guidance_scale: 8.5 45 | 46 | controlnet_image_indexs: [0,-1] 47 | controlnet_images: 48 | - "__assets__/demos/image/interpolation_1.png" 49 | - "__assets__/demos/image/interpolation_2.png" 50 | 51 | prompt: 52 | - "aerial view, beautiful forest, autumn, 4k, high quality" 53 | - "aerial view, beautiful forest, autumn, 4k, high quality" 54 | n_prompt: 55 | - "worst quality, low quality, letterboxed" 56 | 57 | 58 | # 3-interpolation 59 | - domain_lora_scale: 1.0 60 | adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 61 | dreambooth_path: "" 62 | 63 | inference_config: "configs/inference/inference-v3.yaml" 64 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 65 | 66 | controlnet_config: "configs/inference/sparsectrl/latent_condition.yaml" 67 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt" 68 | 69 | H: 256 70 | W: 384 71 | seed: [123,234] 72 | steps: 25 73 | guidance_scale: 8.5 74 | 75 | controlnet_image_indexs: [0,5,10,15] 76 | controlnet_images: 77 | - "__assets__/demos/image/low_fps_1.png" 78 | - "__assets__/demos/image/low_fps_2.png" 79 | - "__assets__/demos/image/low_fps_3.png" 80 | - "__assets__/demos/image/low_fps_4.png" 81 | 82 | prompt: 83 | - "two people holding hands in a field with wind turbines in the background" 84 | - "two people holding hands in a field with wind turbines in the background" 85 | n_prompt: 86 | - "worst quality, low quality, letterboxed" 87 | 88 | 89 | # 3-prediction 90 | - domain_lora_scale: 1.0 91 
| adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 92 | dreambooth_path: "" 93 | 94 | inference_config: "configs/inference/inference-v3.yaml" 95 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 96 | 97 | controlnet_config: "configs/inference/sparsectrl/latent_condition.yaml" 98 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt" 99 | 100 | H: 256 101 | W: 384 102 | seed: [123,234] 103 | steps: 25 104 | guidance_scale: 8.5 105 | 106 | controlnet_image_indexs: [0,1,2,3] 107 | controlnet_images: 108 | - "__assets__/demos/image/prediction_1.png" 109 | - "__assets__/demos/image/prediction_2.png" 110 | - "__assets__/demos/image/prediction_3.png" 111 | - "__assets__/demos/image/prediction_4.png" 112 | 113 | prompt: 114 | - "an astronaut is flying in the space, 4k, high resolution" 115 | - "an astronaut is flying in the space, 4k, high resolution" 116 | n_prompt: 117 | - "worst quality, low quality, letterboxed" 118 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v3/v3-2-animation-RealisticVision.yaml: -------------------------------------------------------------------------------- 1 | # animation-1 2 | - domain_lora_scale: 1.0 3 | adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 4 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 5 | 6 | inference_config: "configs/inference/inference-v3.yaml" 7 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 8 | 9 | controlnet_config: "configs/inference/sparsectrl/latent_condition.yaml" 10 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt" 11 | 12 | seed: -1 13 | steps: 25 14 | guidance_scale: 8.5 15 | 16 | controlnet_image_indexs: [0] 17 | controlnet_images: 18 | - "__assets__/demos/image/RealisticVision_firework.png" 19 | 20 | prompt: 21 | - "closeup face photo of man in black clothes, night city street, bokeh, fireworks in background" 22 | - "closeup face photo of man in black clothes, night city street, bokeh, fireworks in background" 23 | n_prompt: 24 | - "worst quality, low quality, letterboxed" 25 | 26 | 27 | # animation-2 28 | - domain_lora_scale: 1.0 29 | adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 30 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 31 | 32 | inference_config: "configs/inference/inference-v3.yaml" 33 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 34 | 35 | controlnet_config: "configs/inference/sparsectrl/latent_condition.yaml" 36 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt" 37 | 38 | seed: -1 39 | steps: 25 40 | guidance_scale: 8.5 41 | 42 | controlnet_image_indexs: [0] 43 | controlnet_images: 44 | - "__assets__/demos/image/RealisticVision_sunset.png" 45 | 46 | prompt: 47 | - "masterpiece, bestquality, highlydetailed, ultradetailed, sunset, orange sky, warm lighting, fishing boats, ocean waves, seagulls, rippling water, wharf, silhouette, serene atmosphere, dusk, evening glow, golden hour, coastal landscape, seaside scenery" 48 | - "masterpiece, bestquality, highlydetailed, ultradetailed, sunset, orange sky, warm lighting, fishing boats, ocean waves, seagulls, rippling water, wharf, silhouette, serene atmosphere, dusk, evening glow, golden hour, coastal landscape, seaside scenery" 49 | n_prompt: 50 | - "worst quality, low quality, letterboxed" 51 | -------------------------------------------------------------------------------- 
/AnimateDiff/configs/prompts/v3/v3-3-sketch-RealisticVision.yaml: -------------------------------------------------------------------------------- 1 | # 1-sketch-to-video 2 | - domain_lora_scale: 1.0 3 | adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 4 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 5 | 6 | inference_config: "configs/inference/inference-v3.yaml" 7 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 8 | 9 | controlnet_config: "configs/inference/sparsectrl/image_condition.yaml" 10 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_scribble.ckpt" 11 | 12 | seed: -1 13 | steps: 25 14 | guidance_scale: 8.5 15 | 16 | controlnet_image_indexs: [0] 17 | controlnet_images: 18 | - "__assets__/demos/scribble/scribble_1.png" 19 | 20 | prompt: 21 | - "a back view of a boy, standing on the ground, looking at the sky, sunlight, masterpieces" 22 | - "a back view of a boy, standing on the ground, looking at the sky, clouds, sunset, orange sky, beautiful sunlight, masterpieces" 23 | n_prompt: 24 | - "worst quality, low quality, letterboxed" 25 | 26 | 27 | # 2-storyboarding 28 | - domain_lora_scale: 1.0 29 | adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 30 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 31 | 32 | inference_config: "configs/inference/inference-v3.yaml" 33 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 34 | 35 | controlnet_config: "configs/inference/sparsectrl/image_condition.yaml" 36 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_scribble.ckpt" 37 | 38 | seed: -1 39 | steps: 25 40 | guidance_scale: 8.5 41 | 42 | controlnet_image_indexs: [0,8,15] 43 | controlnet_images: 44 | - "__assets__/demos/scribble/scribble_2_1.png" 45 | - "__assets__/demos/scribble/scribble_2_2.png" 46 | - "__assets__/demos/scribble/scribble_2_3.png" 47 | 48 | prompt: 49 | - "an aerial view of a modern city, sunlight, day time, masterpiece, high quality" 50 | - "an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality" 51 | n_prompt: 52 | - "worst quality, low quality, letterboxed" 53 | -------------------------------------------------------------------------------- /AnimateDiff/configs/training/v1/image_finetune.yaml: -------------------------------------------------------------------------------- 1 | image_finetune: true 2 | 3 | output_dir: "outputs" 4 | pretrained_model_path: "models/StableDiffusion/stable-diffusion-v1-5" 5 | 6 | noise_scheduler_kwargs: 7 | num_train_timesteps: 1000 8 | beta_start: 0.00085 9 | beta_end: 0.012 10 | beta_schedule: "scaled_linear" 11 | steps_offset: 1 12 | clip_sample: false 13 | 14 | train_data: 15 | csv_path: "/mnt/petrelfs/guoyuwei/projects/datasets/webvid/results_2M_val.csv" 16 | video_folder: "/mnt/petrelfs/guoyuwei/projects/datasets/webvid/2M_val" 17 | sample_size: 256 18 | 19 | validation_data: 20 | prompts: 21 | - "Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons." 22 | - "A drone view of celebration with Christma tree and fireworks, starry sky - background." 23 | - "Robot dancing in times square." 24 | - "Pacific coast, carmel by the sea ocean and waves." 25 | num_inference_steps: 25 26 | guidance_scale: 8. 27 | 28 | trainable_modules: 29 | - "." 
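  # `trainable_modules` selects UNet parameters by name substring; the single "." entry
  # matches every parameter, so this config performs a full image-domain fine-tune rather
  # than training an adapter.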
30 | 31 | unet_checkpoint_path: "" 32 | 33 | learning_rate: 1.e-5 34 | train_batch_size: 50 35 | 36 | max_train_epoch: -1 37 | max_train_steps: 100 38 | checkpointing_epochs: -1 39 | checkpointing_steps: 60 40 | 41 | validation_steps: 5000 42 | validation_steps_tuple: [2, 50] 43 | 44 | global_seed: 42 45 | mixed_precision_training: true 46 | enable_xformers_memory_efficient_attention: True 47 | 48 | is_debug: False 49 | -------------------------------------------------------------------------------- /AnimateDiff/configs/training/v1/motion_adapter_training.yaml: -------------------------------------------------------------------------------- 1 | image_finetune: false 2 | 3 | output_dir: "outputs" 4 | pretrained_model_path: "models/StableDiffusion/stable-diffusion-v1-5" 5 | 6 | unet_additional_kwargs: 7 | use_motion_module : true 8 | motion_module_resolutions : [ 1,2,4,8 ] 9 | unet_use_cross_frame_attention : false 10 | unet_use_temporal_attention : false 11 | use_motion_adapter : true 12 | 13 | motion_module_type: Vanilla 14 | motion_module_kwargs: 15 | num_attention_heads : 8 16 | num_transformer_block : 1 17 | attention_block_types : [ "Temporal_Self", "Temporal_Self" ] 18 | temporal_position_encoding : true 19 | temporal_position_encoding_max_len : 24 20 | temporal_attention_dim_div : 1 21 | zero_initialize : true 22 | use_motion_adapter : true 23 | 24 | noise_scheduler_kwargs: 25 | num_train_timesteps: 1000 26 | beta_start: 0.00085 27 | beta_end: 0.012 28 | beta_schedule: "linear" 29 | steps_offset: 1 30 | clip_sample: false 31 | 32 | train_data: 33 | csv_path: "/home/harshb/workspace/learnings/StillMoving/AnimateDiff/dataset/train.csv" 34 | video_folder: "/home/harshb/workspace/learnings/StillMoving/AnimateDiff/dataset/videos" 35 | sample_size: 256 36 | sample_stride: 4 37 | sample_n_frames: 16 38 | 39 | validation_data: 40 | prompts: 41 | - "Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons." 42 | - "A drone view of celebration with Christma tree and fireworks, starry sky - background." 43 | - "Robot dancing in times square." 44 | - "Pacific coast, carmel by the sea ocean and waves." 45 | num_inference_steps: 25 46 | guidance_scale: 8. 47 | 48 | trainable_modules: 49 | - "q_lora." 50 | - "k_lora." 51 | - "v_lora." 
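  # Only parameters whose names contain q_lora/k_lora/v_lora -- the LoRA projections added
  # when `use_motion_adapter: true` -- are trained in this stage; the rest of the UNet and
  # the pretrained motion-module weights stay frozen.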
52 | 53 | 54 | motion_adapater_ckpt: "./models/Motion_Module/mm_sd_v15.ckpt" 55 | 56 | 57 | unet_checkpoint_path: "" 58 | 59 | learning_rate: 2.e-5 60 | train_batch_size: 1 61 | 62 | max_train_epoch: -1 63 | max_train_steps: 4000 64 | checkpointing_epochs: -1 65 | checkpointing_steps: 4000 66 | 67 | validation_steps: 500 68 | validation_steps_tuple: [2, 50] 69 | 70 | global_seed: 42 71 | mixed_precision_training: true 72 | enable_xformers_memory_efficient_attention: True 73 | 74 | is_debug: False 75 | -------------------------------------------------------------------------------- /AnimateDiff/configs/training/v1/spatial_adapter_training.yaml: -------------------------------------------------------------------------------- 1 | image_finetune: false 2 | 3 | output_dir: "outputs" 4 | pretrained_model_path: "models/StableDiffusion/stable-diffusion-v1-5" 5 | 6 | unet_additional_kwargs: 7 | use_motion_module : true 8 | motion_module_resolutions : [ 1,2,4,8 ] 9 | unet_use_cross_frame_attention : false 10 | unet_use_temporal_attention : false 11 | use_spatial_adapter : true 12 | 13 | motion_module_type: Vanilla 14 | motion_module_kwargs: 15 | num_attention_heads : 8 16 | num_transformer_block : 1 17 | attention_block_types : [ "Temporal_Self", "Temporal_Self" ] 18 | temporal_position_encoding : true 19 | temporal_position_encoding_max_len : 24 20 | temporal_attention_dim_div : 1 21 | zero_initialize : true 22 | use_motion_adapter : true 23 | 24 | noise_scheduler_kwargs: 25 | num_train_timesteps: 1000 26 | beta_start: 0.00085 27 | beta_end: 0.012 28 | beta_schedule: "linear" 29 | steps_offset: 1 30 | clip_sample: false 31 | 32 | train_data: 33 | csv_path: "/home/harshb/workspace/learnings/StillMoving/AnimateDiff/dataset/train.csv" 34 | video_folder: "/home/harshb/workspace/learnings/StillMoving/AnimateDiff/dataset/videos" 35 | sample_size: 256 36 | sample_stride: 4 37 | sample_n_frames: 16 38 | 39 | validation_data: 40 | prompts: 41 | - "ohwx man in the Snow rocky mountains. Snow blanketed rocky mountains surround and shadow deep canyons." 42 | - "ohwx man reading book" 43 | - "ohwx man smiling" 44 | - "ohwx man walking drinking coffee" 45 | num_inference_steps: 25 46 | guidance_scale: 8. 47 | 48 | 49 | 50 | trainable_modules: 51 | - "attn1_lora." 52 | - "attn2_lora." 
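  # Second adapter stage: only the attn1_lora/attn2_lora layers (enabled via
  # `use_spatial_adapter: true`) are trained, while the motion-adapter checkpoint
  # referenced just below is loaded as the starting point and is not itself updated.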
53 | 54 | motion_adapater_ckpt: "./outputs/motion_adapter_training-2024-07-30T10-47-27/checkpoints/checkpoint-max-steps-4000.ckpt" 55 | 56 | 57 | unet_checkpoint_path: "" 58 | 59 | learning_rate: 2.e-5 60 | train_batch_size: 1 61 | 62 | max_train_epoch: -1 63 | max_train_steps: 600 64 | checkpointing_epochs: -1 65 | checkpointing_steps: 600 66 | 67 | validation_steps: 600 68 | validation_steps_tuple: [2, 50] 69 | 70 | global_seed: 42 71 | mixed_precision_training: true 72 | enable_xformers_memory_efficient_attention: True 73 | 74 | is_debug: False 75 | -------------------------------------------------------------------------------- /AnimateDiff/configs/training/v1/training.yaml: -------------------------------------------------------------------------------- 1 | image_finetune: false 2 | 3 | output_dir: "outputs" 4 | pretrained_model_path: "models/StableDiffusion/stable-diffusion-v1-5" 5 | 6 | unet_additional_kwargs: 7 | use_motion_module : true 8 | motion_module_resolutions : [ 1,2,4,8 ] 9 | unet_use_cross_frame_attention : false 10 | unet_use_temporal_attention : false 11 | 12 | motion_module_type: Vanilla 13 | motion_module_kwargs: 14 | num_attention_heads : 8 15 | num_transformer_block : 1 16 | attention_block_types : [ "Temporal_Self", "Temporal_Self" ] 17 | temporal_position_encoding : true 18 | temporal_position_encoding_max_len : 24 19 | temporal_attention_dim_div : 1 20 | zero_initialize : true 21 | 22 | noise_scheduler_kwargs: 23 | num_train_timesteps: 1000 24 | beta_start: 0.00085 25 | beta_end: 0.012 26 | beta_schedule: "linear" 27 | steps_offset: 1 28 | clip_sample: false 29 | 30 | train_data: 31 | csv_path: "/home/harshb/workspace/learnings/StillMoving/AnimateDiff/dataset/train.csv" 32 | video_folder: "/home/harshb/workspace/learnings/StillMoving/AnimateDiff/dataset/videos" 33 | sample_size: 256 34 | sample_stride: 4 35 | sample_n_frames: 16 36 | 37 | validation_data: 38 | prompts: 39 | - "Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons." 40 | - "A drone view of celebration with Christma tree and fireworks, starry sky - background." 41 | - "Robot dancing in times square." 42 | - "Pacific coast, carmel by the sea ocean and waves." 43 | num_inference_steps: 25 44 | guidance_scale: 8. 45 | 46 | # trainable_modules: 47 | # - "q_lora." 48 | # - "k_lora." 49 | # - "v_lora." 50 | 51 | trainable_modules: 52 | - "attn1_lora." 53 | - "attn2_lora." 54 | 55 | motion_adapater_ckpt: "" 56 | 57 | 58 | unet_checkpoint_path: "" 59 | 60 | learning_rate: 2.e-5 61 | train_batch_size: 1 62 | 63 | max_train_epoch: -1 64 | max_train_steps: 4000 65 | checkpointing_epochs: -1 66 | checkpointing_steps: 5 67 | 68 | validation_steps: 5000 69 | validation_steps_tuple: [2, 50] 70 | 71 | global_seed: 42 72 | mixed_precision_training: true 73 | enable_xformers_memory_efficient_attention: True 74 | 75 | is_debug: False 76 | -------------------------------------------------------------------------------- /AnimateDiff/convert_to_safetensors.py: -------------------------------------------------------------------------------- 1 | # Script for converting a HF Diffusers saved pipeline to a Stable Diffusion checkpoint. 2 | # *Only* converts the UNet, VAE, and Text Encoder. 3 | # Does not convert optimizer state or any other thing. 
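# Each conversion table below pairs an original Stable Diffusion key (left) with its
# HF Diffusers counterpart (right); the conversion helpers use these pairs to rename
# Diffusers-layout keys back to the original SD layout before saving.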
4 | 5 | import argparse 6 | import os.path as osp 7 | import re 8 | 9 | import torch 10 | from safetensors.torch import load_file, save_file 11 | 12 | 13 | # =================# 14 | # UNet Conversion # 15 | # =================# 16 | 17 | unet_conversion_map = [ 18 | # (stable-diffusion, HF Diffusers) 19 | ("time_embed.0.weight", "time_embedding.linear_1.weight"), 20 | ("time_embed.0.bias", "time_embedding.linear_1.bias"), 21 | ("time_embed.2.weight", "time_embedding.linear_2.weight"), 22 | ("time_embed.2.bias", "time_embedding.linear_2.bias"), 23 | ("input_blocks.0.0.weight", "conv_in.weight"), 24 | ("input_blocks.0.0.bias", "conv_in.bias"), 25 | ("out.0.weight", "conv_norm_out.weight"), 26 | ("out.0.bias", "conv_norm_out.bias"), 27 | ("out.2.weight", "conv_out.weight"), 28 | ("out.2.bias", "conv_out.bias"), 29 | ] 30 | 31 | unet_conversion_map_resnet = [ 32 | # (stable-diffusion, HF Diffusers) 33 | ("in_layers.0", "norm1"), 34 | ("in_layers.2", "conv1"), 35 | ("out_layers.0", "norm2"), 36 | ("out_layers.3", "conv2"), 37 | ("emb_layers.1", "time_emb_proj"), 38 | ("skip_connection", "conv_shortcut"), 39 | ] 40 | 41 | unet_conversion_map_layer = [] 42 | # hardcoded number of downblocks and resnets/attentions... 43 | # would need smarter logic for other networks. 44 | for i in range(4): 45 | # loop over downblocks/upblocks 46 | 47 | for j in range(2): 48 | # loop over resnets/attentions for downblocks 49 | hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}." 50 | sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0." 51 | unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix)) 52 | 53 | if i < 3: 54 | # no attention layers in down_blocks.3 55 | hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}." 56 | sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1." 57 | unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix)) 58 | 59 | for j in range(3): 60 | # loop over resnets/attentions for upblocks 61 | hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}." 62 | sd_up_res_prefix = f"output_blocks.{3*i + j}.0." 63 | unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix)) 64 | 65 | if i > 0: 66 | # no attention layers in up_blocks.0 67 | hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}." 68 | sd_up_atn_prefix = f"output_blocks.{3*i + j}.1." 69 | unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix)) 70 | 71 | if i < 3: 72 | # no downsample in down_blocks.3 73 | hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv." 74 | sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op." 75 | unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix)) 76 | 77 | # no upsample in up_blocks.3 78 | hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0." 79 | sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}." 80 | unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix)) 81 | 82 | hf_mid_atn_prefix = "mid_block.attentions.0." 83 | sd_mid_atn_prefix = "middle_block.1." 84 | unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix)) 85 | 86 | for j in range(2): 87 | hf_mid_res_prefix = f"mid_block.resnets.{j}." 88 | sd_mid_res_prefix = f"middle_block.{2*j}." 89 | unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix)) 90 | 91 | 92 | def convert_unet_state_dict(unet_state_dict): 93 | # buyer beware: this is a *brittle* function, 94 | # and correct output requires that all of these pieces interact in 95 | # the exact order in which I have arranged them. 
96 | mapping = {k: k for k in unet_state_dict.keys()} 97 | for sd_name, hf_name in unet_conversion_map: 98 | mapping[hf_name] = sd_name 99 | for k, v in mapping.items(): 100 | if "resnets" in k: 101 | for sd_part, hf_part in unet_conversion_map_resnet: 102 | v = v.replace(hf_part, sd_part) 103 | mapping[k] = v 104 | for k, v in mapping.items(): 105 | for sd_part, hf_part in unet_conversion_map_layer: 106 | v = v.replace(hf_part, sd_part) 107 | mapping[k] = v 108 | new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()} 109 | return new_state_dict 110 | 111 | 112 | # ================# 113 | # VAE Conversion # 114 | # ================# 115 | 116 | vae_conversion_map = [ 117 | # (stable-diffusion, HF Diffusers) 118 | ("nin_shortcut", "conv_shortcut"), 119 | ("norm_out", "conv_norm_out"), 120 | ("mid.attn_1.", "mid_block.attentions.0."), 121 | ] 122 | 123 | for i in range(4): 124 | # down_blocks have two resnets 125 | for j in range(2): 126 | hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}." 127 | sd_down_prefix = f"encoder.down.{i}.block.{j}." 128 | vae_conversion_map.append((sd_down_prefix, hf_down_prefix)) 129 | 130 | if i < 3: 131 | hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0." 132 | sd_downsample_prefix = f"down.{i}.downsample." 133 | vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix)) 134 | 135 | hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0." 136 | sd_upsample_prefix = f"up.{3-i}.upsample." 137 | vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix)) 138 | 139 | # up_blocks have three resnets 140 | # also, up blocks in hf are numbered in reverse from sd 141 | for j in range(3): 142 | hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}." 143 | sd_up_prefix = f"decoder.up.{3-i}.block.{j}." 144 | vae_conversion_map.append((sd_up_prefix, hf_up_prefix)) 145 | 146 | # this part accounts for mid blocks in both the encoder and the decoder 147 | for i in range(2): 148 | hf_mid_res_prefix = f"mid_block.resnets.{i}." 149 | sd_mid_res_prefix = f"mid.block_{i+1}." 150 | vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix)) 151 | 152 | 153 | vae_conversion_map_attn = [ 154 | # (stable-diffusion, HF Diffusers) 155 | ("norm.", "group_norm."), 156 | ("q.", "query."), 157 | ("k.", "key."), 158 | ("v.", "value."), 159 | ("proj_out.", "proj_attn."), 160 | ] 161 | 162 | # This is probably not the most ideal solution, but it does work. 
163 | vae_extra_conversion_map = [ 164 | ("to_q", "q"), 165 | ("to_k", "k"), 166 | ("to_v", "v"), 167 | ("to_out.0", "proj_out"), 168 | ] 169 | 170 | 171 | def reshape_weight_for_sd(w): 172 | # convert HF linear weights to SD conv2d weights 173 | if not w.ndim == 1: 174 | return w.reshape(*w.shape, 1, 1) 175 | else: 176 | return w 177 | 178 | 179 | def convert_vae_state_dict(vae_state_dict): 180 | mapping = {k: k for k in vae_state_dict.keys()} 181 | for k, v in mapping.items(): 182 | for sd_part, hf_part in vae_conversion_map: 183 | v = v.replace(hf_part, sd_part) 184 | mapping[k] = v 185 | for k, v in mapping.items(): 186 | if "attentions" in k: 187 | for sd_part, hf_part in vae_conversion_map_attn: 188 | v = v.replace(hf_part, sd_part) 189 | mapping[k] = v 190 | new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()} 191 | weights_to_convert = ["q", "k", "v", "proj_out"] 192 | keys_to_rename = {} 193 | for k, v in new_state_dict.items(): 194 | for weight_name in weights_to_convert: 195 | if f"mid.attn_1.{weight_name}.weight" in k: 196 | print(f"Reshaping {k} for SD format") 197 | new_state_dict[k] = reshape_weight_for_sd(v) 198 | for weight_name, real_weight_name in vae_extra_conversion_map: 199 | if ( 200 | f"mid.attn_1.{weight_name}.weight" in k 201 | or f"mid.attn_1.{weight_name}.bias" in k 202 | ): 203 | keys_to_rename[k] = k.replace(weight_name, real_weight_name) 204 | for k, v in keys_to_rename.items(): 205 | if k in new_state_dict: 206 | print(f"Renaming {k} to {v}") 207 | new_state_dict[v] = reshape_weight_for_sd(new_state_dict[k]) 208 | del new_state_dict[k] 209 | return new_state_dict 210 | 211 | 212 | # =========================# 213 | # Text Encoder Conversion # 214 | # =========================# 215 | 216 | 217 | textenc_conversion_lst = [ 218 | # (stable-diffusion, HF Diffusers) 219 | ("resblocks.", "text_model.encoder.layers."), 220 | ("ln_1", "layer_norm1"), 221 | ("ln_2", "layer_norm2"), 222 | (".c_fc.", ".fc1."), 223 | (".c_proj.", ".fc2."), 224 | (".attn", ".self_attn"), 225 | ("ln_final.", "transformer.text_model.final_layer_norm."), 226 | ( 227 | "token_embedding.weight", 228 | "transformer.text_model.embeddings.token_embedding.weight", 229 | ), 230 | ( 231 | "positional_embedding", 232 | "transformer.text_model.embeddings.position_embedding.weight", 233 | ), 234 | ] 235 | protected = {re.escape(x[1]): x[0] for x in textenc_conversion_lst} 236 | textenc_pattern = re.compile("|".join(protected.keys())) 237 | 238 | # Ordering is from https://github.com/pytorch/pytorch/blob/master/test/cpp/api/modules.cpp 239 | code2idx = {"q": 0, "k": 1, "v": 2} 240 | 241 | 242 | def convert_text_enc_state_dict_v20(text_enc_dict): 243 | new_state_dict = {} 244 | capture_qkv_weight = {} 245 | capture_qkv_bias = {} 246 | for k, v in text_enc_dict.items(): 247 | if ( 248 | k.endswith(".self_attn.q_proj.weight") 249 | or k.endswith(".self_attn.k_proj.weight") 250 | or k.endswith(".self_attn.v_proj.weight") 251 | ): 252 | k_pre = k[: -len(".q_proj.weight")] 253 | k_code = k[-len("q_proj.weight")] 254 | if k_pre not in capture_qkv_weight: 255 | capture_qkv_weight[k_pre] = [None, None, None] 256 | capture_qkv_weight[k_pre][code2idx[k_code]] = v 257 | continue 258 | 259 | if ( 260 | k.endswith(".self_attn.q_proj.bias") 261 | or k.endswith(".self_attn.k_proj.bias") 262 | or k.endswith(".self_attn.v_proj.bias") 263 | ): 264 | k_pre = k[: -len(".q_proj.bias")] 265 | k_code = k[-len("q_proj.bias")] 266 | if k_pre not in capture_qkv_bias: 267 | capture_qkv_bias[k_pre] = [None, None, 
None] 268 | capture_qkv_bias[k_pre][code2idx[k_code]] = v 269 | continue 270 | 271 | relabelled_key = textenc_pattern.sub( 272 | lambda m: protected[re.escape(m.group(0))], k 273 | ) 274 | new_state_dict[relabelled_key] = v 275 | 276 | for k_pre, tensors in capture_qkv_weight.items(): 277 | if None in tensors: 278 | raise Exception( 279 | "CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing" 280 | ) 281 | relabelled_key = textenc_pattern.sub( 282 | lambda m: protected[re.escape(m.group(0))], k_pre 283 | ) 284 | new_state_dict[relabelled_key + ".in_proj_weight"] = torch.cat(tensors) 285 | 286 | for k_pre, tensors in capture_qkv_bias.items(): 287 | if None in tensors: 288 | raise Exception( 289 | "CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing" 290 | ) 291 | relabelled_key = textenc_pattern.sub( 292 | lambda m: protected[re.escape(m.group(0))], k_pre 293 | ) 294 | new_state_dict[relabelled_key + ".in_proj_bias"] = torch.cat(tensors) 295 | 296 | return new_state_dict 297 | 298 | 299 | def convert_text_enc_state_dict(text_enc_dict): 300 | return text_enc_dict 301 | 302 | 303 | if __name__ == "__main__": 304 | parser = argparse.ArgumentParser() 305 | 306 | parser.add_argument( 307 | "--model_path", 308 | default=None, 309 | type=str, 310 | required=True, 311 | help="Path to the model to convert.", 312 | ) 313 | parser.add_argument( 314 | "--checkpoint_path", 315 | default=None, 316 | type=str, 317 | required=True, 318 | help="Path to the output model.", 319 | ) 320 | parser.add_argument( 321 | "--half", action="store_true", help="Save weights in half precision." 322 | ) 323 | parser.add_argument( 324 | "--use_safetensors", 325 | action="store_true", 326 | help="Save weights use safetensors, default is ckpt.", 327 | ) 328 | 329 | args = parser.parse_args() 330 | 331 | assert args.model_path is not None, "Must provide a model path!" 332 | 333 | assert args.checkpoint_path is not None, "Must provide a checkpoint path!" 334 | 335 | # Path for safetensors 336 | unet_path = osp.join(args.model_path, "unet", "diffusion_pytorch_model.safetensors") 337 | vae_path = osp.join(args.model_path, "vae", "diffusion_pytorch_model.safetensors") 338 | text_enc_path = osp.join(args.model_path, "text_encoder", "model.safetensors") 339 | 340 | # Load models from safetensors if it exists, if it doesn't pytorch 341 | if osp.exists(unet_path): 342 | unet_state_dict = load_file(unet_path, device="cpu") 343 | else: 344 | unet_path = osp.join(args.model_path, "unet", "diffusion_pytorch_model.bin") 345 | unet_state_dict = torch.load(unet_path, map_location="cpu") 346 | 347 | if osp.exists(vae_path): 348 | vae_state_dict = load_file(vae_path, device="cpu") 349 | else: 350 | vae_path = osp.join(args.model_path, "vae", "diffusion_pytorch_model.bin") 351 | vae_state_dict = torch.load(vae_path, map_location="cpu") 352 | 353 | if osp.exists(text_enc_path): 354 | text_enc_dict = load_file(text_enc_path, device="cpu") 355 | else: 356 | text_enc_path = osp.join(args.model_path, "text_encoder", "pytorch_model.bin") 357 | text_enc_dict = torch.load(text_enc_path, map_location="cpu") 358 | 359 | # Convert the UNet model 360 | unet_state_dict = convert_unet_state_dict(unet_state_dict) 361 | unet_state_dict = { 362 | "model.diffusion_model." + k: v for k, v in unet_state_dict.items() 363 | } 364 | 365 | # Convert the VAE model 366 | vae_state_dict = convert_vae_state_dict(vae_state_dict) 367 | vae_state_dict = {"first_stage_model." 
+ k: v for k, v in vae_state_dict.items()} 368 | 369 | # Easiest way to identify v2.0 model seems to be that the text encoder (OpenCLIP) is deeper 370 | is_v20_model = "text_model.encoder.layers.22.layer_norm2.bias" in text_enc_dict 371 | 372 | if is_v20_model: 373 | # Need to add the tag 'transformer' in advance so we can knock it out from the final layer-norm 374 | text_enc_dict = {"transformer." + k: v for k, v in text_enc_dict.items()} 375 | text_enc_dict = convert_text_enc_state_dict_v20(text_enc_dict) 376 | text_enc_dict = { 377 | "cond_stage_model.model." + k: v for k, v in text_enc_dict.items() 378 | } 379 | else: 380 | text_enc_dict = convert_text_enc_state_dict(text_enc_dict) 381 | text_enc_dict = { 382 | "cond_stage_model.transformer." + k: v for k, v in text_enc_dict.items() 383 | } 384 | 385 | # Put together new checkpoint 386 | state_dict = {**unet_state_dict, **vae_state_dict, **text_enc_dict} 387 | if args.half: 388 | state_dict = {k: v.half() for k, v in state_dict.items()} 389 | 390 | if args.use_safetensors: 391 | save_file(state_dict, args.checkpoint_path) 392 | else: 393 | state_dict = {"state_dict": state_dict} 394 | torch.save(state_dict, args.checkpoint_path) 395 | -------------------------------------------------------------------------------- /AnimateDiff/dataset.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | ds = load_dataset("TempoFunk/webvid-10M", cache_dir="./dataset") 4 | -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/0-MotionModule.sh: -------------------------------------------------------------------------------- 1 | gdown 1RqkQuGPaCO5sGZ6V6KZ-jUWmsRu48Kdq -O models/Motion_Module/ 2 | gdown 1ql0g_Ys4UCz2RnokYlBjyOYPbttbIpbu -O models/Motion_Module/ -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/1-ToonYou.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/78775 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/2-Lyriel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/72396 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/3-RcnzCartoon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/71009 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/4-MajicMix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/79068 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/5-RealisticVision.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget 
https://civitai.com/api/download/models/130072 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/6-Tusun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/97261 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 3 | wget https://civitai.com/api/download/models/50705 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 4 | -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/7-FilmVelvia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/90115 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 3 | wget https://civitai.com/api/download/models/55911 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 4 | -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/8-GhibliBackground.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/102828 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 3 | wget https://civitai.com/api/download/models/57618 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 4 | -------------------------------------------------------------------------------- /AnimateDiff/environment.yaml: -------------------------------------------------------------------------------- 1 | name: animatediff 2 | channels: 3 | - pytorch 4 | - nvidia 5 | dependencies: 6 | - python=3.10 7 | - pytorch=1.13.1 8 | - torchvision=0.14.1 9 | - torchaudio=0.13.1 10 | - pytorch-cuda=11.7 11 | - pip 12 | - pip: 13 | - diffusers==0.11.1 14 | - transformers==4.25.1 15 | - xformers==0.0.16 16 | - imageio==2.27.0 17 | - decord==0.6.0 18 | - gdown 19 | - einops 20 | - omegaconf 21 | - safetensors 22 | - gradio 23 | - wandb 24 | -------------------------------------------------------------------------------- /AnimateDiff/models/DreamBooth_LoRA/Put personalized T2I checkpoints here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/models/DreamBooth_LoRA/Put personalized T2I checkpoints here.txt -------------------------------------------------------------------------------- /AnimateDiff/models/MotionLoRA/Put MotionLoRA checkpoints here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/models/MotionLoRA/Put MotionLoRA checkpoints here.txt -------------------------------------------------------------------------------- /AnimateDiff/models/Motion_Module/Put motion module checkpoints here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/models/Motion_Module/Put motion module checkpoints here.txt -------------------------------------------------------------------------------- /AnimateDiff/scripts/animate.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import inspect 4 | import os 5 | from omegaconf import OmegaConf 6 | 7 | import torch 8 | import torchvision.transforms as transforms 9 | 10 | import diffusers 11 | from diffusers import AutoencoderKL, DDIMScheduler 12 | 13 | from tqdm.auto import tqdm 14 | from transformers import CLIPTextModel, CLIPTokenizer 15 | 16 | from animatediff.models.unet import UNet3DConditionModel 17 | from animatediff.models.sparse_controlnet import SparseControlNetModel 18 | from animatediff.pipelines.pipeline_animation import AnimationPipeline 19 | from animatediff.utils.util import save_videos_grid 20 | from animatediff.utils.util import load_weights 21 | from diffusers.utils.import_utils import is_xformers_available 22 | 23 | from einops import rearrange, repeat 24 | 25 | import csv, pdb, glob, math 26 | from pathlib import Path 27 | from PIL import Image 28 | import numpy as np 29 | 30 | 31 | def load_motion_adapter_ckpt(unet, motion_adapter_ckpt): 32 | print("Loading Motion Adapter checkpoints") 33 | checkpoint = torch.load( 34 | motion_adapter_ckpt, 35 | map_location="cpu", 36 | ) 37 | # Extract the state dict 38 | if "state_dict" in checkpoint: 39 | state_dict = checkpoint["state_dict"] 40 | else: 41 | raise KeyError("state_dict not found in checkpoint") 42 | 43 | unet_state_dict = unet.state_dict() 44 | motion_adapter_state_dict = { 45 | k: v 46 | for k, v in unet_state_dict.items() 47 | if "q_lora." in k or "k_lora." in k or "v_lora." in k 48 | } 49 | 50 | motion_adapter_state_dict.update( 51 | {k: v for k, v in state_dict.items() if k in motion_adapter_state_dict} 52 | ) 53 | 54 | print(motion_adapter_state_dict) 55 | return motion_adapter_state_dict, unet_state_dict 56 | 57 | 58 | def load_spatial_adapter_ckpt(unet, spatial_adapter_ckpt): 59 | print("Loading Spatial Adapter checkpoints") 60 | checkpoint = torch.load( 61 | spatial_adapter_ckpt, 62 | map_location="cpu", 63 | ) 64 | # Extract the state dict 65 | if "state_dict" in checkpoint: 66 | state_dict = checkpoint["state_dict"] 67 | else: 68 | raise KeyError("state_dict not found in checkpoint") 69 | 70 | unet_state_dict = unet.state_dict() 71 | spatial_adapter_state_dict = { 72 | k: v 73 | for k, v in unet_state_dict.items() 74 | if "attn1_lora." in k or "attn2_lora."
in k 75 | } 76 | 77 | spatial_adapter_state_dict.update( 78 | {k: v for k, v in state_dict.items() if k in spatial_adapter_state_dict} 79 | ) 80 | print(spatial_adapter_state_dict) 81 | return spatial_adapter_state_dict, unet_state_dict 82 | 83 | 84 | @torch.no_grad() 85 | def main(args): 86 | *_, func_args = inspect.getargvalues(inspect.currentframe()) 87 | func_args = dict(func_args) 88 | 89 | time_str = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") 90 | savedir = f"samples/{Path(args.config).stem}-{time_str}" 91 | os.makedirs(savedir) 92 | 93 | config = OmegaConf.load(args.config) 94 | samples = [] 95 | 96 | # create validation pipeline 97 | tokenizer = CLIPTokenizer.from_pretrained( 98 | args.pretrained_model_path, subfolder="tokenizer" 99 | ) 100 | text_encoder = CLIPTextModel.from_pretrained( 101 | args.pretrained_model_path, subfolder="text_encoder" 102 | ).cuda() 103 | vae = AutoencoderKL.from_pretrained( 104 | args.pretrained_model_path, subfolder="vae" 105 | ).cuda() 106 | 107 | sample_idx = 0 108 | for model_idx, model_config in enumerate(config): 109 | model_config.W = model_config.get("W", args.W) 110 | model_config.H = model_config.get("H", args.H) 111 | model_config.L = model_config.get("L", args.L) 112 | 113 | inference_config = OmegaConf.load( 114 | model_config.get("inference_config", args.inference_config) 115 | ) 116 | unet = UNet3DConditionModel.from_pretrained_2d( 117 | args.pretrained_model_path, 118 | subfolder="unet", 119 | unet_additional_kwargs=OmegaConf.to_container( 120 | inference_config.unet_additional_kwargs 121 | ), 122 | ).cuda() 123 | 124 | # load controlnet model 125 | controlnet = controlnet_images = None 126 | if model_config.get("controlnet_path", "") != "": 127 | assert model_config.get("controlnet_images", "") != "" 128 | assert model_config.get("controlnet_config", "") != "" 129 | 130 | unet.config.num_attention_heads = 8 131 | unet.config.projection_class_embeddings_input_dim = None 132 | 133 | controlnet_config = OmegaConf.load(model_config.controlnet_config) 134 | controlnet = SparseControlNetModel.from_unet( 135 | unet, 136 | controlnet_additional_kwargs=controlnet_config.get( 137 | "controlnet_additional_kwargs", {} 138 | ), 139 | ) 140 | 141 | print( 142 | f"loading controlnet checkpoint from {model_config.controlnet_path} ..." 
143 | ) 144 | controlnet_state_dict = torch.load( 145 | model_config.controlnet_path, map_location="cpu" 146 | ) 147 | controlnet_state_dict = ( 148 | controlnet_state_dict["controlnet"] 149 | if "controlnet" in controlnet_state_dict 150 | else controlnet_state_dict 151 | ) 152 | controlnet_state_dict.pop("animatediff_config", "") 153 | controlnet.load_state_dict(controlnet_state_dict) 154 | controlnet.cuda() 155 | 156 | image_paths = model_config.controlnet_images 157 | if isinstance(image_paths, str): 158 | image_paths = [image_paths] 159 | 160 | print(f"controlnet image paths:") 161 | for path in image_paths: 162 | print(path) 163 | assert len(image_paths) <= model_config.L 164 | 165 | image_transforms = transforms.Compose( 166 | [ 167 | transforms.RandomResizedCrop( 168 | (model_config.H, model_config.W), 169 | (1.0, 1.0), 170 | ratio=( 171 | model_config.W / model_config.H, 172 | model_config.W / model_config.H, 173 | ), 174 | ), 175 | transforms.ToTensor(), 176 | ] 177 | ) 178 | 179 | if model_config.get("normalize_condition_images", False): 180 | 181 | def image_norm(image): 182 | image = image.mean(dim=0, keepdim=True).repeat(3, 1, 1) 183 | image -= image.min() 184 | image /= image.max() 185 | return image 186 | 187 | else: 188 | image_norm = lambda x: x 189 | 190 | controlnet_images = [ 191 | image_norm(image_transforms(Image.open(path).convert("RGB"))) 192 | for path in image_paths 193 | ] 194 | 195 | os.makedirs(os.path.join(savedir, "control_images"), exist_ok=True) 196 | for i, image in enumerate(controlnet_images): 197 | Image.fromarray( 198 | (255.0 * (image.numpy().transpose(1, 2, 0))).astype(np.uint8) 199 | ).save(f"{savedir}/control_images/{i}.png") 200 | 201 | controlnet_images = torch.stack(controlnet_images).unsqueeze(0).cuda() 202 | controlnet_images = rearrange(controlnet_images, "b f c h w -> b c f h w") 203 | 204 | if controlnet.use_simplified_condition_embedding: 205 | num_controlnet_images = controlnet_images.shape[2] 206 | controlnet_images = rearrange( 207 | controlnet_images, "b c f h w -> (b f) c h w" 208 | ) 209 | controlnet_images = ( 210 | vae.encode(controlnet_images * 2.0 - 1.0).latent_dist.sample() 211 | * 0.18215 212 | ) 213 | controlnet_images = rearrange( 214 | controlnet_images, 215 | "(b f) c h w -> b c f h w", 216 | f=num_controlnet_images, 217 | ) 218 | 219 | # set xformers 220 | if is_xformers_available() and (not args.without_xformers): 221 | unet.enable_xformers_memory_efficient_attention() 222 | if controlnet is not None: 223 | controlnet.enable_xformers_memory_efficient_attention() 224 | 225 | pipeline = AnimationPipeline( 226 | vae=vae, 227 | text_encoder=text_encoder, 228 | tokenizer=tokenizer, 229 | unet=unet, 230 | controlnet=controlnet, 231 | scheduler=DDIMScheduler( 232 | **OmegaConf.to_container(inference_config.noise_scheduler_kwargs) 233 | ), 234 | ).to("cuda") 235 | 236 | pipeline = load_weights( 237 | pipeline, 238 | # motion module 239 | motion_module_path=model_config.get("motion_module", ""), 240 | motion_module_lora_configs=model_config.get( 241 | "motion_module_lora_configs", [] 242 | ), 243 | # domain adapter 244 | adapter_lora_path=model_config.get("adapter_lora_path", ""), 245 | adapter_lora_scale=model_config.get("adapter_lora_scale", 1.0), 246 | # image layers 247 | dreambooth_model_path=model_config.get("dreambooth_path", ""), 248 | lora_model_path=model_config.get("lora_model_path", ""), 249 | lora_alpha=model_config.get("lora_alpha", 0.8), 250 | ).to("cuda") 251 | 252 | motion_adapter_state_dict, unet_state_dict = 
load_motion_adapter_ckpt( 253 | pipeline.unet, model_config.get("motion_adapter_ckpt", "") 254 | ) 255 | missing, unexpected = pipeline.unet.load_state_dict( 256 | motion_adapter_state_dict, strict=False 257 | ) 258 | assert len(unexpected) == 0 259 | 260 | spatial_adapter_state_dict, unet_state_dict = load_spatial_adapter_ckpt( 261 | pipeline.unet, model_config.get("spatial_adapter_ckpt", "") 262 | ) 263 | missing, unexpected = pipeline.unet.load_state_dict( 264 | spatial_adapter_state_dict, strict=False 265 | ) 266 | assert len(unexpected) == 0 267 | 268 | prompts = model_config.prompt 269 | n_prompts = ( 270 | list(model_config.n_prompt) * len(prompts) 271 | if len(model_config.n_prompt) == 1 272 | else model_config.n_prompt 273 | ) 274 | 275 | random_seeds = model_config.get("seed", [-1]) 276 | random_seeds = ( 277 | [random_seeds] if isinstance(random_seeds, int) else list(random_seeds) 278 | ) 279 | random_seeds = ( 280 | random_seeds * len(prompts) if len(random_seeds) == 1 else random_seeds 281 | ) 282 | 283 | config[model_idx].random_seed = [] 284 | for prompt_idx, (prompt, n_prompt, random_seed) in enumerate( 285 | zip(prompts, n_prompts, random_seeds) 286 | ): 287 | 288 | # manually set random seed for reproduction 289 | if random_seed != -1: 290 | torch.manual_seed(random_seed) 291 | else: 292 | torch.seed() 293 | config[model_idx].random_seed.append(torch.initial_seed()) 294 | 295 | print(f"current seed: {torch.initial_seed()}") 296 | print(f"sampling {prompt} ...") 297 | sample = pipeline( 298 | prompt, 299 | negative_prompt=n_prompt, 300 | num_inference_steps=model_config.steps, 301 | guidance_scale=model_config.guidance_scale, 302 | width=model_config.W, 303 | height=model_config.H, 304 | video_length=model_config.L, 305 | controlnet_images=controlnet_images, 306 | controlnet_image_index=model_config.get("controlnet_image_indexs", [0]), 307 | ).videos 308 | samples.append(sample) 309 | 310 | prompt = "-".join((prompt.replace("/", "").split(" ")[:10])) 311 | save_videos_grid(sample, f"{savedir}/sample/{sample_idx}-{prompt}.gif") 312 | print(f"save to {savedir}/sample/{prompt}.gif") 313 | 314 | sample_idx += 1 315 | 316 | samples = torch.concat(samples) 317 | save_videos_grid(samples, f"{savedir}/sample.gif", n_rows=4) 318 | 319 | OmegaConf.save(config, f"{savedir}/config.yaml") 320 | 321 | 322 | if __name__ == "__main__": 323 | parser = argparse.ArgumentParser() 324 | parser.add_argument( 325 | "--pretrained-model-path", 326 | type=str, 327 | default="models/StableDiffusion/stable-diffusion-v1-5", 328 | ) 329 | parser.add_argument( 330 | "--inference-config", type=str, default="configs/inference/inference-v1.yaml" 331 | ) 332 | parser.add_argument("--config", type=str, required=True) 333 | 334 | parser.add_argument("--L", type=int, default=16) 335 | parser.add_argument("--W", type=int, default=512) 336 | parser.add_argument("--H", type=int, default=512) 337 | 338 | parser.add_argument("--without-xformers", action="store_true") 339 | 340 | args = parser.parse_args() 341 | main(args) 342 | -------------------------------------------------------------------------------- /AnimateDiff/wget-log: -------------------------------------------------------------------------------- 1 | --2024-07-26 18:24:24-- https://secta-models.s3.us-east-1.amazonaws.com/1336237.tar?X-Amz-Algorithm=AWS4-HMAC-SHA256 2 | Resolving secta-models.s3.us-east-1.amazonaws.com (secta-models.s3.us-east-1.amazonaws.com)... 52.217.172.210, 54.231.201.34, 52.217.124.250, ... 
3 | Connecting to secta-models.s3.us-east-1.amazonaws.com (secta-models.s3.us-east-1.amazonaws.com)|52.217.172.210|:443... connected. 4 | HTTP request sent, awaiting response... 400 Bad Request 5 | 2024-07-26 18:24:25 ERROR 400: Bad Request. 6 | 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Still-Moving: Open-Source Implementation 2 | 3 | ## About 4 | 5 | This repository contains an open-source implementation of the "Still-Moving" model, based on the paper "Still-Moving: Customized Video Generation without Customized Video Data" by Chefer et al. 6 | [project page](https://still-moving.github.io/) 7 | 8 | Still-Moving is a novel framework for customizing text-to-video (T2V) generation models without requiring customized video data. It leverages customized text-to-image (T2I) models and adapts them for video generation, combining spatial priors from T2I models with motion priors from T2V models. 9 | 10 | 11 | ## Progress 12 | I trained the Motion Adapter and the Spatial Adapter as described in the paper. 13 | The generated motion is currently too fast, and output quality with a customized DreamBooth model is still poor; the cause has not been identified yet. 14 | ![Sample result](./results/0.gif) 15 | 16 | ## Key Features 17 | 18 | - Customization of T2V models using only still image data 19 | - Support for personalization, stylization, and conditional generation 20 | - Implementation of Motion Adapters and Spatial Adapters 21 | - Compatible with different T2V architectures (e.g., Lumiere, AnimateDiff) 22 | 23 | ## Installation 24 | 25 | [Include installation instructions here] 26 | 27 | ## Usage 28 | 29 | [Provide basic usage examples here] 30 | 31 | ## Implementation Details 32 | 33 | - Motion Adapters: LoRA layers applied to temporal attention blocks (✔️ Done) 34 | - Spatial Adapters: LoRA layers added after injected customized T2I layers (✔️ Done) 35 | - Training process: Two-step training for Motion and Spatial Adapters 36 | - Supported models: [List the T2V models you've implemented] 37 | 38 | 39 | ## Contributing 40 | 41 | We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or proposing new features, your efforts are appreciated. 42 | 43 | Please make sure to update tests as appropriate and adhere to the project's coding standards.
44 | 45 | ### Areas for Contribution 46 | 47 | - Implementing support for additional T2V models 48 | - Optimizing performance and reducing computational requirements 49 | - Improving documentation and adding usage examples 50 | - Creating tools for easier model customization 51 | - Developing a user-friendly interface for video generation 52 | 53 | ## License 54 | 55 | Open to use 56 | 57 | ## Contact 58 | 59 | Harsh Bhatt - harshbhatt7585@gmail.com 60 | -------------------------------------------------------------------------------- /adapters/motion_adapter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class LoRALayer(nn.Module): 7 | def __init__(self, in_features, out_features, rank=4): 8 | super().__init__() 9 | self.down = nn.Linear(in_features, rank, bias=False) 10 | self.up = nn.Linear(rank, out_features, bias=False) 11 | self.scale = 1.0 12 | 13 | nn.init.normal_(self.down.weight, std=1 / rank) 14 | nn.init.zeros_(self.up.weight) 15 | 16 | def forward(self, x): 17 | return self.up(self.down(x)) * self.scale 18 | 19 | 20 | class LoRALinear(nn.Module): 21 | def __init__(self, linear_layer, rank=4): 22 | super().__init__() 23 | self.in_features = linear_layer.in_features 24 | self.out_features = linear_layer.out_features 25 | 26 | self.linear = linear_layer 27 | self.lora = LoRALayer(self.in_features, self.out_features, rank=rank) 28 | 29 | def forward(self, x): 30 | return self.linear(x) + self.lora(x) 31 | -------------------------------------------------------------------------------- /results/0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/results/0.gif -------------------------------------------------------------------------------- /wget-log: -------------------------------------------------------------------------------- 1 | --2024-07-26 18:26:01-- https://secta-models.s3.us-east-1.amazonaws.com/1336237.tar?X-Amz-Algorithm=AWS4-HMAC-SHA256 2 | Resolving secta-models.s3.us-east-1.amazonaws.com (secta-models.s3.us-east-1.amazonaws.com)... 3.5.30.19, 54.231.203.82, 52.216.110.142, ... 3 | Connecting to secta-models.s3.us-east-1.amazonaws.com (secta-models.s3.us-east-1.amazonaws.com)|3.5.30.19|:443... connected. 4 | HTTP request sent, awaiting response... 400 Bad Request 5 | 2024-07-26 18:26:01 ERROR 400: Bad Request. 6 | 7 | --------------------------------------------------------------------------------
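A minimal usage sketch for the `LoRALinear` wrapper defined in `adapters/motion_adapter.py` above, in the spirit of the Motion/Spatial Adapters listed in the README's Implementation Details. This is illustrative only: the projection size, tensor shapes, and the assumption that the repository root is on `PYTHONPATH` are hypothetical, not taken from the repository.

    import torch
    import torch.nn as nn
    from adapters.motion_adapter import LoRALinear  # assumes repo root is on PYTHONPATH

    # Hypothetical attention projection to be adapted (e.g. a temporal-attention query layer).
    q_proj = nn.Linear(320, 320, bias=False)

    # Wrap it: the frozen base output is augmented by a zero-initialized low-rank update.
    q_proj_lora = LoRALinear(q_proj, rank=4)

    x = torch.randn(2, 16, 320)   # (batch, frames, channels) -- illustrative shape
    out = q_proj_lora(x)          # base projection + scaled LoRA delta
    print(out.shape)              # torch.Size([2, 16, 320])

Because `LoRALayer.up` is zero-initialized, the wrapped layer initially reproduces the base projection exactly; training then learns only the low-rank residual.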