├── AnimateDiff ├── .gitignore ├── LICENSE.txt ├── README.md ├── __assets__ │ ├── animations │ │ ├── compare │ │ │ ├── ffmpeg │ │ │ ├── new_0.gif │ │ │ ├── new_1.gif │ │ │ ├── new_2.gif │ │ │ ├── new_3.gif │ │ │ ├── old_0.gif │ │ │ ├── old_1.gif │ │ │ ├── old_2.gif │ │ │ └── old_3.gif │ │ ├── model_01 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ └── 04.gif │ │ ├── model_02 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ └── 04.gif │ │ ├── model_03 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ └── 04.gif │ │ ├── model_04 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ └── 04.gif │ │ ├── model_05 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ └── 04.gif │ │ ├── model_06 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ └── 04.gif │ │ ├── model_07 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ ├── 04.gif │ │ │ └── init.jpg │ │ ├── model_08 │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ ├── 03.gif │ │ │ └── 04.gif │ │ ├── motion_lora │ │ │ ├── model_01 │ │ │ │ ├── 01.gif │ │ │ │ ├── 02.gif │ │ │ │ ├── 03.gif │ │ │ │ ├── 04.gif │ │ │ │ ├── 05.gif │ │ │ │ ├── 06.gif │ │ │ │ ├── 07.gif │ │ │ │ └── 08.gif │ │ │ └── model_02 │ │ │ │ ├── 01.gif │ │ │ │ ├── 02.gif │ │ │ │ ├── 03.gif │ │ │ │ ├── 04.gif │ │ │ │ ├── 05.gif │ │ │ │ ├── 06.gif │ │ │ │ ├── 07.gif │ │ │ │ └── 08.gif │ │ ├── motion_xl │ │ │ ├── 01.gif │ │ │ ├── 02.gif │ │ │ └── 03.gif │ │ └── v3 │ │ │ ├── animation_fireworks.gif │ │ │ ├── animation_sunset.gif │ │ │ ├── sketch_boy.gif │ │ │ └── sketch_city.gif │ ├── demos │ │ ├── image │ │ │ ├── RealisticVision_firework.png │ │ │ ├── RealisticVision_sunset.png │ │ │ ├── interpolation_1.png │ │ │ ├── interpolation_2.png │ │ │ ├── low_fps_1.png │ │ │ ├── low_fps_2.png │ │ │ ├── low_fps_3.png │ │ │ ├── low_fps_4.png │ │ │ ├── painting.png │ │ │ ├── prediction_1.png │ │ │ ├── prediction_2.png │ │ │ ├── prediction_3.png │ │ │ └── prediction_4.png │ │ └── scribble │ │ │ ├── scribble_1.png │ │ │ ├── scribble_2_1.png │ │ │ ├── scribble_2_2.png │ │ │ ├── scribble_2_3.png │ │ │ └── scribble_2_readme.png │ ├── docs │ │ ├── animatediff.md │ │ └── gallery.md │ └── figs │ │ ├── adapter_explain.png │ │ └── gradio.jpg ├── animatediff │ ├── data │ │ └── dataset.py │ ├── models │ │ ├── attention.py │ │ ├── motion_module.py │ │ ├── resnet.py │ │ ├── sparse_controlnet.py │ │ ├── unet.py │ │ └── unet_blocks.py │ ├── pipelines │ │ └── pipeline_animation.py │ └── utils │ │ ├── convert_from_ckpt.py │ │ ├── convert_lora_safetensor_to_diffusers.py │ │ └── util.py ├── app.py ├── configs │ ├── inference │ │ ├── inference-v1.yaml │ │ ├── inference-v2.yaml │ │ ├── inference-v3.yaml │ │ └── sparsectrl │ │ │ ├── image_condition.yaml │ │ │ └── latent_condition.yaml │ ├── prompts │ │ ├── v1 │ │ │ ├── v1-1-ToonYou.yaml │ │ │ ├── v1-2-Lyriel.yaml │ │ │ ├── v1-3-RcnzCartoon.yaml │ │ │ ├── v1-4-MajicMix.yaml │ │ │ ├── v1-5-RealisticVision.yaml │ │ │ ├── v1-6-Tusun.yaml │ │ │ ├── v1-7-FilmVelvia.yaml │ │ │ └── v1-8-GhibliBackground.yaml │ │ ├── v2 │ │ │ ├── v2-1-RealisticVision.yaml │ │ │ └── v2-2-RealisticVision-MotionLoRA.yaml │ │ └── v3 │ │ │ ├── v3-1-T2V.yaml │ │ │ ├── v3-2-animation-RealisticVision.yaml │ │ │ └── v3-3-sketch-RealisticVision.yaml │ └── training │ │ └── v1 │ │ ├── image_finetune.yaml │ │ ├── motion_adapter_training.yaml │ │ ├── spatial_adapter_training.yaml │ │ └── training.yaml ├── convert_to_safetensors.py ├── dataset.py ├── download_bashscripts │ ├── 0-MotionModule.sh │ ├── 1-ToonYou.sh │ ├── 2-Lyriel.sh │ ├── 3-RcnzCartoon.sh │ ├── 4-MajicMix.sh │ 
├── 5-RealisticVision.sh │ ├── 6-Tusun.sh │ ├── 7-FilmVelvia.sh │ └── 8-GhibliBackground.sh ├── environment.yaml ├── models │ ├── DreamBooth_LoRA │ │ └── Put personalized T2I checkpoints here.txt │ ├── MotionLoRA │ │ └── Put MotionLoRA checkpoints here.txt │ └── Motion_Module │ │ └── Put motion module checkpoints here.txt ├── scripts │ └── animate.py ├── some_dict.txt ├── train.py ├── train_still_moving.py └── wget-log ├── README.md ├── adapters └── motion_adapter.py ├── results └── 0.gif └── wget-log /AnimateDiff/.gitignore: -------------------------------------------------------------------------------- 1 | wandb/ 2 | *debug* 3 | debugs/ 4 | outputs/ 5 | samples/ 6 | __pycache__/ 7 | ossutil_output/ 8 | .ossutil_checkpoint/ 9 | 10 | scripts/* 11 | !scripts/animate.py 12 | 13 | *.ipynb 14 | *.safetensors 15 | *.ckpt 16 | 17 | models/* 18 | !models/StableDiffusion/ 19 | models/StableDiffusion/* 20 | !models/StableDiffusion/*.txt 21 | !models/Motion_Module/ 22 | !models/Motion_Module/*.txt 23 | !models/DreamBooth_LoRA/ 24 | !models/DreamBooth_LoRA/*.txt 25 | !models/MotionLoRA/ 26 | !models/MotionLoRA/*.txt 27 | outputs 28 | models 29 | dataset 30 | -------------------------------------------------------------------------------- /AnimateDiff/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/ffmpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/ffmpeg -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/new_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/new_0.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/new_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/new_1.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/new_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/new_2.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/new_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/new_3.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/old_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/old_0.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/old_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/old_1.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/old_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/old_2.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/compare/old_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/compare/old_3.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_01/01.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_01/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_01/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_01/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_01/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_01/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_01/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_01/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_02/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_02/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_02/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_02/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_02/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_02/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_02/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_02/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_03/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_03/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_03/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_03/02.gif -------------------------------------------------------------------------------- 
/AnimateDiff/__assets__/animations/model_03/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_03/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_03/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_03/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_04/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_04/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_04/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_04/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_04/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_04/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_04/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_04/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_05/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_05/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_05/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_05/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_05/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_05/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_05/04.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_05/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_06/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_06/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_06/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_06/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_06/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_06/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_06/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_06/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_07/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_07/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_07/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_07/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_07/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_07/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_07/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_07/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_07/init.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_07/init.jpg -------------------------------------------------------------------------------- 
/AnimateDiff/__assets__/animations/model_08/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_08/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_08/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_08/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_08/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_08/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/model_08/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/model_08/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/05.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/05.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/06.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/06.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/07.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/07.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_01/08.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_01/08.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/04.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/05.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/05.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/06.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/06.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/07.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/07.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_lora/model_02/08.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_lora/model_02/08.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_xl/01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_xl/01.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_xl/02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_xl/02.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/motion_xl/03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/motion_xl/03.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/v3/animation_fireworks.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/v3/animation_fireworks.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/v3/animation_sunset.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/v3/animation_sunset.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/v3/sketch_boy.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/v3/sketch_boy.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/animations/v3/sketch_city.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/animations/v3/sketch_city.gif -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/RealisticVision_firework.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/RealisticVision_firework.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/RealisticVision_sunset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/RealisticVision_sunset.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/interpolation_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/interpolation_1.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/interpolation_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/interpolation_2.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/low_fps_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/low_fps_1.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/low_fps_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/low_fps_2.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/low_fps_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/low_fps_3.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/low_fps_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/low_fps_4.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/painting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/painting.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/prediction_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/prediction_1.png 
-------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/prediction_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/prediction_2.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/prediction_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/prediction_3.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/image/prediction_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/image/prediction_4.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/scribble/scribble_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/scribble/scribble_1.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/scribble/scribble_2_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/scribble/scribble_2_1.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/scribble/scribble_2_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/scribble/scribble_2_2.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/scribble/scribble_2_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/scribble/scribble_2_3.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/demos/scribble/scribble_2_readme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/demos/scribble/scribble_2_readme.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/docs/animatediff.md: -------------------------------------------------------------------------------- 1 | # AnimateDiff: training and inference setup 2 | ## Setups for Inference 3 | 4 | ### Prepare Environment 5 | 6 | ***We updated our inference code with xformers and a sequential decoding trick. 
Now AnimateDiff takes only ~12GB of VRAM for inference and runs on a single RTX 3090!*** 7 | 8 | ``` 9 | git clone https://github.com/guoyww/AnimateDiff.git 10 | cd AnimateDiff 11 | 12 | conda env create -f environment.yaml 13 | conda activate animatediff 14 | ``` 15 | 16 | ### Download Base T2I & Motion Module Checkpoints 17 | We provide two versions of our Motion Module, trained on stable-diffusion-v1-4 and finetuned on v1-5, respectively. 18 | We recommend trying both of them for the best results. 19 | ``` 20 | git lfs install 21 | git clone https://huggingface.co/runwayml/stable-diffusion-v1-5 models/StableDiffusion/ 22 | 23 | bash download_bashscripts/0-MotionModule.sh 24 | ``` 25 | You may also download the motion module checkpoints directly from [Google Drive](https://drive.google.com/drive/folders/1EqLC65eR1-W-sGD0Im7fkED6c8GkiNFI?usp=sharing) / [HuggingFace](https://huggingface.co/guoyww/animatediff) / [CivitAI](https://civitai.com/models/108836/animatediff-motion-modules), then put them in the `models/Motion_Module/` folder. 26 | 27 | ### Prepare Personalized T2I 28 | Here we provide inference configs for eight demo personalized T2I models from CivitAI. 29 | You may run the following bash scripts to download these checkpoints. 30 | ``` 31 | bash download_bashscripts/1-ToonYou.sh 32 | bash download_bashscripts/2-Lyriel.sh 33 | bash download_bashscripts/3-RcnzCartoon.sh 34 | bash download_bashscripts/4-MajicMix.sh 35 | bash download_bashscripts/5-RealisticVision.sh 36 | bash download_bashscripts/6-Tusun.sh 37 | bash download_bashscripts/7-FilmVelvia.sh 38 | bash download_bashscripts/8-GhibliBackground.sh 39 | ``` 40 | 41 | ### Inference 42 | After downloading the personalized T2I checkpoints above, run the following commands to generate animations. The results will be saved to the `samples/` folder automatically. 43 | ``` 44 | python -m scripts.animate --config configs/prompts/1-ToonYou.yaml 45 | python -m scripts.animate --config configs/prompts/2-Lyriel.yaml 46 | python -m scripts.animate --config configs/prompts/3-RcnzCartoon.yaml 47 | python -m scripts.animate --config configs/prompts/4-MajicMix.yaml 48 | python -m scripts.animate --config configs/prompts/5-RealisticVision.yaml 49 | python -m scripts.animate --config configs/prompts/6-Tusun.yaml 50 | python -m scripts.animate --config configs/prompts/7-FilmVelvia.yaml 51 | python -m scripts.animate --config configs/prompts/8-GhibliBackground.yaml 52 | ``` 53 | 54 | To generate animations with a new DreamBooth/LoRA model, you may create a new `.yaml` config file in the following format: 55 | ``` 56 | - inference_config: "[path to motion module config file]" 57 | 58 | motion_module: 59 | - "models/Motion_Module/mm_sd_v14.ckpt" 60 | - "models/Motion_Module/mm_sd_v15.ckpt" 61 | 62 | motion_module_lora_configs: 63 | - path: "[path to MotionLoRA model]" 64 | alpha: 1.0 65 | - ... 66 | 67 | dreambooth_path: "[path to your DreamBooth model .safetensors file]" 68 | lora_model_path: "[path to your LoRA model .safetensors file; leave it as an empty string if not needed]" 69 | 70 | steps: 25 71 | guidance_scale: 7.5 72 | 73 | prompt: 74 | - "[positive prompt]" 75 | 76 | n_prompt: 77 | - "[negative prompt]" 78 | ``` 79 | Then run the following command: 80 | ``` 81 | python -m scripts.animate --config [path to the config file] 82 | ``` 83 | 84 |
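Since the prompt config above is a YAML list (each item describes one model setup), it can be inspected programmatically before a run. Below is a minimal sketch assuming the file is parsed with OmegaConf; the path `configs/prompts/my_model.yaml` is purely illustrative, and this is not necessarily how `scripts/animate.py` itself consumes the config:

```python
from omegaconf import OmegaConf

# Load the prompt config; the file is a YAML list, so each item is one run setup.
config = OmegaConf.load("configs/prompts/my_model.yaml")  # hypothetical path

for model_config in config:
    # Each positive prompt is paired with a negative prompt and shares the sampling settings.
    for prompt, n_prompt in zip(model_config.prompt, model_config.n_prompt):
        print(
            f"motion modules: {list(model_config.motion_module)} | "
            f"steps: {model_config.steps}, cfg: {model_config.guidance_scale}"
        )
        print(f"  prompt: {prompt}")
        print(f"  negative: {n_prompt}")
```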
85 | ## Steps for Training 86 | 87 | ### Dataset 88 | Before training, download the video files and the `.csv` annotations of [WebVid10M](https://maxbain.com/webvid-dataset/) to your local machine. 89 | Note that our exemplar training script requires all the videos to be saved in a single folder. You may change this by modifying `animatediff/data/dataset.py`. 90 | 91 | ### Configuration 92 | After preparing the dataset, update the data paths below in the `.yaml` config files in the `configs/training/` folder: 93 | ``` 94 | train_data: 95 | csv_path: [Replace with .csv Annotation File Path] 96 | video_folder: [Replace with Video Folder Path] 97 | sample_size: 256 98 | ``` 99 | Other training parameters (learning rate, epochs, validation settings, etc.) are also included in the config files. 100 | 101 | ### Training 102 | To finetune the UNet's image layers: 103 | ``` 104 | torchrun --nnodes=1 --nproc_per_node=1 train.py --config configs/training/v1/image_finetune.yaml 105 | ``` 106 | 107 | To train the motion modules: 108 | ``` 109 | torchrun --nnodes=1 --nproc_per_node=1 train.py --config configs/training/v1/training.yaml 110 | ``` 111 |
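Before launching a full training run, the `train_data` paths can be sanity-checked by instantiating the dataset class shipped in `animatediff/data/dataset.py` (shown further below in this dump) directly. A short sketch, with the csv/video paths as placeholders for your own WebVid10M download:

```python
import torch
from animatediff.data.dataset import WebVid10M

# Mirror the train_data block of configs/training/v1/training.yaml.
dataset = WebVid10M(
    csv_path="/data/webvid/results_10M_train.csv",  # placeholder: your .csv annotation file
    video_folder="/data/webvid/videos",             # placeholder: folder containing <videoid>.mp4 files
    sample_size=256,
    sample_stride=4,
    sample_n_frames=16,
)

loader = torch.utils.data.DataLoader(dataset, batch_size=2, num_workers=4)
batch = next(iter(loader))
# pixel_values: (batch, frames, channels, height, width), normalized to [-1, 1]
print(batch["pixel_values"].shape, batch["text"][:2])
```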
-------------------------------------------------------------------------------- /AnimateDiff/__assets__/docs/gallery.md: --------------------------------------------------------------------------------
1 | # Gallery
2 | Here we demonstrate several best results we found in our experiments.
(Sample animation grids omitted; captions only.)
Model: ToonYou
Model: Counterfeit V3.0
Model: Realistic Vision V2.0
Model: majicMIX Realistic
Model: RCNZ Cartoon
Model: FilmVelvia
#### Community Cases
Here are some samples contributed by the community artists. Create a Pull Request if you would like to show your results here😚.
Character Model: Yoimiya (with an initial reference image, see WIP fork for the extended implementation.)
Character Model: Paimon; Pose Model: Hold Sign
92 | 93 | 94 | -------------------------------------------------------------------------------- /AnimateDiff/__assets__/figs/adapter_explain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/figs/adapter_explain.png -------------------------------------------------------------------------------- /AnimateDiff/__assets__/figs/gradio.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/__assets__/figs/gradio.jpg -------------------------------------------------------------------------------- /AnimateDiff/animatediff/data/dataset.py: -------------------------------------------------------------------------------- 1 | import os, io, csv, math, random 2 | import numpy as np 3 | from einops import rearrange 4 | from decord import VideoReader 5 | 6 | import torch 7 | import torchvision.transforms as transforms 8 | from torch.utils.data.dataset import Dataset 9 | from animatediff.utils.util import zero_rank_print 10 | 11 | 12 | class WebVid10M(Dataset): 13 | def __init__( 14 | self, 15 | csv_path, 16 | video_folder, 17 | sample_size=256, 18 | sample_stride=4, 19 | sample_n_frames=16, 20 | is_image=False, 21 | frozen_videos=False, 22 | number_of_samples=None, 23 | ): 24 | zero_rank_print(f"loading annotations from {csv_path} ...") 25 | with open(csv_path, "r") as csvfile: 26 | self.dataset = list(csv.DictReader(csvfile)) 27 | 28 | if number_of_samples: 29 | self.dataset = self.dataset[:number_of_samples] 30 | self.length = len(self.dataset) 31 | zero_rank_print(f"data scale: {self.length}") 32 | 33 | self.video_folder = video_folder 34 | self.sample_stride = sample_stride 35 | self.sample_n_frames = sample_n_frames 36 | self.is_image = is_image 37 | self.frozen_videos = frozen_videos 38 | if self.frozen_videos: 39 | print("Training with Frozen videos") 40 | sample_size = ( 41 | tuple(sample_size) 42 | if not isinstance(sample_size, int) 43 | else (sample_size, sample_size) 44 | ) 45 | self.pixel_transforms = transforms.Compose( 46 | [ 47 | transforms.RandomHorizontalFlip(), 48 | transforms.Resize(sample_size[0]), 49 | transforms.CenterCrop(sample_size), 50 | transforms.Normalize( 51 | mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True 52 | ), 53 | ] 54 | ) 55 | 56 | def get_batch(self, idx): 57 | video_dict = self.dataset[idx] 58 | videoid, name, page_dir = ( 59 | video_dict["videoid"], 60 | video_dict["name"], 61 | video_dict["page_dir"], 62 | ) 63 | 64 | video_dir = os.path.join(self.video_folder, f"{videoid}.mp4") 65 | video_reader = VideoReader(video_dir) 66 | video_length = len(video_reader) 67 | if not self.is_image: 68 | clip_length = min( 69 | video_length, (self.sample_n_frames - 1) * self.sample_stride + 1 70 | ) 71 | start_idx = random.randint(0, video_length - clip_length) 72 | batch_index = np.linspace( 73 | start_idx, start_idx + clip_length - 1, self.sample_n_frames, dtype=int 74 | ) 75 | else: 76 | batch_index = [random.randint(0, video_length - 1)] 77 | 78 | if self.frozen_videos: 79 | pixel_values = ( 80 | torch.from_numpy( 81 | np.tile( 82 | np.expand_dims( 83 | video_reader.get_batch(batch_index).asnumpy()[0], axis=0 84 | ), 85 | (len(batch_index), 1, 1, 1), 86 | ) 87 | ) 88 | .permute(0, 3, 1, 2) 89 | .contiguous() 90 | ) 91 | else: 92 | pixel_values = ( 93 | 
torch.from_numpy(video_reader.get_batch(batch_index).asnumpy()) 94 | .permute(0, 3, 1, 2) 95 | .contiguous() 96 | ) 97 | 98 | pixel_values = pixel_values / 255.0 99 | del video_reader 100 | 101 | if self.is_image: 102 | pixel_values = pixel_values[0] 103 | 104 | return pixel_values, name 105 | 106 | def __len__(self): 107 | return self.length 108 | 109 | def __getitem__(self, idx): 110 | while True: 111 | try: 112 | pixel_values, name = self.get_batch(idx) 113 | break 114 | except Exception as e: 115 | idx = random.randint(0, self.length - 1) 116 | 117 | pixel_values = self.pixel_transforms(pixel_values) 118 | sample = dict(pixel_values=pixel_values, text=name) 119 | return sample 120 | 121 | 122 | if __name__ == "__main__": 123 | from animatediff.utils.util import save_videos_grid 124 | 125 | dataset = WebVid10M( 126 | csv_path="/mnt/petrelfs/guoyuwei/projects/datasets/webvid/results_2M_val.csv", 127 | video_folder="/mnt/petrelfs/guoyuwei/projects/datasets/webvid/2M_val", 128 | sample_size=256, 129 | sample_stride=4, 130 | sample_n_frames=16, 131 | is_image=True, 132 | ) 133 | import pdb 134 | 135 | pdb.set_trace() 136 | 137 | dataloader = torch.utils.data.DataLoader( 138 | dataset, 139 | batch_size=4, 140 | num_workers=16, 141 | ) 142 | for idx, batch in enumerate(dataloader): 143 | print(batch["pixel_values"].shape, len(batch["text"])) 144 | # for i in range(batch["pixel_values"].shape[0]): 145 | # save_videos_grid(batch["pixel_values"][i:i+1].permute(0,2,1,3,4), os.path.join(".", f"{idx}-{i}.mp4"), rescale=True) 146 | -------------------------------------------------------------------------------- /AnimateDiff/animatediff/models/attention.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py 2 | 3 | from dataclasses import dataclass 4 | from typing import Optional 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | from torch import nn 9 | 10 | from diffusers.configuration_utils import ConfigMixin, register_to_config 11 | from diffusers.modeling_utils import ModelMixin 12 | from diffusers.utils import BaseOutput 13 | from diffusers.utils.import_utils import is_xformers_available 14 | from diffusers.models.attention import CrossAttention, FeedForward, AdaLayerNorm 15 | 16 | from einops import rearrange, repeat 17 | import pdb 18 | 19 | 20 | @dataclass 21 | class Transformer3DModelOutput(BaseOutput): 22 | sample: torch.FloatTensor 23 | 24 | 25 | if is_xformers_available(): 26 | import xformers 27 | import xformers.ops 28 | else: 29 | xformers = None 30 | 31 | 32 | class SpatialAdapter(nn.Module): 33 | def __init__(self, dim, rank=4): 34 | super().__init__() 35 | self.down = nn.Linear(dim, rank, bias=False) 36 | self.up = nn.Linear(rank, dim, bias=False) 37 | 38 | # Initialize weights 39 | nn.init.normal_(self.down.weight, std=1 / rank) 40 | nn.init.zeros_(self.up.weight) 41 | 42 | def forward(self, x): 43 | out = self.down(x) 44 | out = self.up(out) 45 | return x + out 46 | 47 | 48 | class Transformer3DModel(ModelMixin, ConfigMixin): 49 | @register_to_config 50 | def __init__( 51 | self, 52 | num_attention_heads: int = 16, 53 | attention_head_dim: int = 88, 54 | in_channels: Optional[int] = None, 55 | num_layers: int = 1, 56 | dropout: float = 0.0, 57 | norm_num_groups: int = 32, 58 | cross_attention_dim: Optional[int] = None, 59 | attention_bias: bool = False, 60 | activation_fn: str = "geglu", 61 | num_embeds_ada_norm: Optional[int] = None, 62 
| use_linear_projection: bool = False, 63 | only_cross_attention: bool = False, 64 | upcast_attention: bool = False, 65 | unet_use_cross_frame_attention=None, 66 | unet_use_temporal_attention=None, 67 | use_spatial_adapter=False, 68 | ): 69 | super().__init__() 70 | self.use_linear_projection = use_linear_projection 71 | self.num_attention_heads = num_attention_heads 72 | self.attention_head_dim = attention_head_dim 73 | inner_dim = num_attention_heads * attention_head_dim 74 | 75 | # Define input layers 76 | self.in_channels = in_channels 77 | 78 | self.norm = torch.nn.GroupNorm( 79 | num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True 80 | ) 81 | if use_linear_projection: 82 | self.proj_in = nn.Linear(in_channels, inner_dim) 83 | else: 84 | self.proj_in = nn.Conv2d( 85 | in_channels, inner_dim, kernel_size=1, stride=1, padding=0 86 | ) 87 | 88 | # Define transformers blocks 89 | self.transformer_blocks = nn.ModuleList( 90 | [ 91 | BasicTransformerBlock( 92 | inner_dim, 93 | num_attention_heads, 94 | attention_head_dim, 95 | dropout=dropout, 96 | cross_attention_dim=cross_attention_dim, 97 | activation_fn=activation_fn, 98 | num_embeds_ada_norm=num_embeds_ada_norm, 99 | attention_bias=attention_bias, 100 | only_cross_attention=only_cross_attention, 101 | upcast_attention=upcast_attention, 102 | unet_use_cross_frame_attention=unet_use_cross_frame_attention, 103 | unet_use_temporal_attention=unet_use_temporal_attention, 104 | use_spatial_adapter=use_spatial_adapter, 105 | ) 106 | for d in range(num_layers) 107 | ] 108 | ) 109 | 110 | # 4. Define output layers 111 | if use_linear_projection: 112 | self.proj_out = nn.Linear(in_channels, inner_dim) 113 | else: 114 | self.proj_out = nn.Conv2d( 115 | inner_dim, in_channels, kernel_size=1, stride=1, padding=0 116 | ) 117 | 118 | def forward( 119 | self, 120 | hidden_states, 121 | encoder_hidden_states=None, 122 | timestep=None, 123 | return_dict: bool = True, 124 | ): 125 | # Input 126 | assert ( 127 | hidden_states.dim() == 5 128 | ), f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}." 
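        # The 5D video latent (b, c, f, h, w) is folded frame-wise below so the pretrained 2D
        # attention layers run on each frame independently (e.g. a hypothetical (2, 320, 16, 32, 32)
        # input becomes (32, 320, 32, 32)), and the text embeddings are repeated once per frame so
        # the batch dimensions stay aligned.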
129 | video_length = hidden_states.shape[2] 130 | hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w") 131 | encoder_hidden_states = repeat( 132 | encoder_hidden_states, "b n c -> (b f) n c", f=video_length 133 | ) 134 | 135 | batch, channel, height, weight = hidden_states.shape 136 | residual = hidden_states 137 | 138 | hidden_states = self.norm(hidden_states) 139 | if not self.use_linear_projection: 140 | hidden_states = self.proj_in(hidden_states) 141 | inner_dim = hidden_states.shape[1] 142 | hidden_states = hidden_states.permute(0, 2, 3, 1).reshape( 143 | batch, height * weight, inner_dim 144 | ) 145 | else: 146 | inner_dim = hidden_states.shape[1] 147 | hidden_states = hidden_states.permute(0, 2, 3, 1).reshape( 148 | batch, height * weight, inner_dim 149 | ) 150 | hidden_states = self.proj_in(hidden_states) 151 | 152 | # Blocks 153 | for block in self.transformer_blocks: 154 | hidden_states = block( 155 | hidden_states, 156 | encoder_hidden_states=encoder_hidden_states, 157 | timestep=timestep, 158 | video_length=video_length, 159 | ) 160 | 161 | # Output 162 | if not self.use_linear_projection: 163 | hidden_states = ( 164 | hidden_states.reshape(batch, height, weight, inner_dim) 165 | .permute(0, 3, 1, 2) 166 | .contiguous() 167 | ) 168 | hidden_states = self.proj_out(hidden_states) 169 | else: 170 | hidden_states = self.proj_out(hidden_states) 171 | hidden_states = ( 172 | hidden_states.reshape(batch, height, weight, inner_dim) 173 | .permute(0, 3, 1, 2) 174 | .contiguous() 175 | ) 176 | 177 | output = hidden_states + residual 178 | 179 | output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length) 180 | if not return_dict: 181 | return (output,) 182 | 183 | return Transformer3DModelOutput(sample=output) 184 | 185 | 186 | class BasicTransformerBlock(nn.Module): 187 | def __init__( 188 | self, 189 | dim: int, 190 | num_attention_heads: int, 191 | attention_head_dim: int, 192 | dropout=0.0, 193 | cross_attention_dim: Optional[int] = None, 194 | activation_fn: str = "geglu", 195 | num_embeds_ada_norm: Optional[int] = None, 196 | attention_bias: bool = False, 197 | only_cross_attention: bool = False, 198 | upcast_attention: bool = False, 199 | unet_use_cross_frame_attention=None, 200 | unet_use_temporal_attention=None, 201 | use_spatial_adapter=False, 202 | ): 203 | super().__init__() 204 | self.only_cross_attention = only_cross_attention 205 | self.use_ada_layer_norm = num_embeds_ada_norm is not None 206 | self.unet_use_cross_frame_attention = unet_use_cross_frame_attention 207 | self.unet_use_temporal_attention = unet_use_temporal_attention 208 | self.use_spatial_adapter = use_spatial_adapter 209 | # SC-Attn 210 | assert unet_use_cross_frame_attention is not None 211 | if unet_use_cross_frame_attention: 212 | self.attn1 = SparseCausalAttention2D( 213 | query_dim=dim, 214 | heads=num_attention_heads, 215 | dim_head=attention_head_dim, 216 | dropout=dropout, 217 | bias=attention_bias, 218 | cross_attention_dim=( 219 | cross_attention_dim if only_cross_attention else None 220 | ), 221 | upcast_attention=upcast_attention, 222 | ) 223 | else: 224 | self.attn1 = CrossAttention( 225 | query_dim=dim, 226 | heads=num_attention_heads, 227 | dim_head=attention_head_dim, 228 | dropout=dropout, 229 | bias=attention_bias, 230 | upcast_attention=upcast_attention, 231 | ) 232 | self.norm1 = ( 233 | AdaLayerNorm(dim, num_embeds_ada_norm) 234 | if self.use_ada_layer_norm 235 | else nn.LayerNorm(dim) 236 | ) 237 | if self.use_spatial_adapter: 238 | print("Using Spatial 
Adapter") 239 | self.attn1_lora = SpatialAdapter(dim, dim) 240 | else: 241 | print("Not using Spatial Adapter") 242 | # Cross-Attn 243 | if cross_attention_dim is not None: 244 | self.attn2 = CrossAttention( 245 | query_dim=dim, 246 | cross_attention_dim=cross_attention_dim, 247 | heads=num_attention_heads, 248 | dim_head=attention_head_dim, 249 | dropout=dropout, 250 | bias=attention_bias, 251 | upcast_attention=upcast_attention, 252 | ) 253 | if self.use_spatial_adapter: 254 | self.attn2_lora = SpatialAdapter(dim, dim) 255 | else: 256 | self.attn2 = None 257 | 258 | if cross_attention_dim is not None: 259 | self.norm2 = ( 260 | AdaLayerNorm(dim, num_embeds_ada_norm) 261 | if self.use_ada_layer_norm 262 | else nn.LayerNorm(dim) 263 | ) 264 | else: 265 | self.norm2 = None 266 | 267 | # Feed-forward 268 | self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn) 269 | self.norm3 = nn.LayerNorm(dim) 270 | 271 | # Temp-Attn 272 | assert unet_use_temporal_attention is not None 273 | if unet_use_temporal_attention: 274 | self.attn_temp = CrossAttention( 275 | query_dim=dim, 276 | heads=num_attention_heads, 277 | dim_head=attention_head_dim, 278 | dropout=dropout, 279 | bias=attention_bias, 280 | upcast_attention=upcast_attention, 281 | ) 282 | nn.init.zeros_(self.attn_temp.to_out[0].weight.data) 283 | self.norm_temp = ( 284 | AdaLayerNorm(dim, num_embeds_ada_norm) 285 | if self.use_ada_layer_norm 286 | else nn.LayerNorm(dim) 287 | ) 288 | 289 | def set_lora_scale(self, scale): 290 | self.attn1_lora.scale = scale 291 | if hasattr(self, "attn2_lora"): 292 | self.attn2_lora.scale = scale 293 | 294 | def set_use_memory_efficient_attention_xformers( 295 | self, use_memory_efficient_attention_xformers: bool 296 | ): 297 | if not is_xformers_available(): 298 | print("Here is how to install it") 299 | raise ModuleNotFoundError( 300 | "Refer to https://github.com/facebookresearch/xformers for more information on how to install" 301 | " xformers", 302 | name="xformers", 303 | ) 304 | elif not torch.cuda.is_available(): 305 | raise ValueError( 306 | "torch.cuda.is_available() should be True but is False. 
xformers' memory efficient attention is only" 307 | " available for GPU " 308 | ) 309 | else: 310 | try: 311 | # Make sure we can run the memory efficient attention 312 | _ = xformers.ops.memory_efficient_attention( 313 | torch.randn((1, 2, 40), device="cuda"), 314 | torch.randn((1, 2, 40), device="cuda"), 315 | torch.randn((1, 2, 40), device="cuda"), 316 | ) 317 | except Exception as e: 318 | raise e 319 | self.attn1._use_memory_efficient_attention_xformers = ( 320 | use_memory_efficient_attention_xformers 321 | ) 322 | if self.attn2 is not None: 323 | self.attn2._use_memory_efficient_attention_xformers = ( 324 | use_memory_efficient_attention_xformers 325 | ) 326 | # self.attn_temp._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers 327 | 328 | def forward( 329 | self, 330 | hidden_states, 331 | encoder_hidden_states=None, 332 | timestep=None, 333 | attention_mask=None, 334 | video_length=None, 335 | ): 336 | # SparseCausal-Attention 337 | norm_hidden_states = ( 338 | self.norm1(hidden_states, timestep) 339 | if self.use_ada_layer_norm 340 | else self.norm1(hidden_states) 341 | ) 342 | 343 | # if self.only_cross_attention: 344 | # hidden_states = ( 345 | # self.attn1(norm_hidden_states, encoder_hidden_states, attention_mask=attention_mask) + hidden_states 346 | # ) 347 | # else: 348 | # hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask, video_length=video_length) + hidden_states 349 | 350 | # pdb.set_trace() 351 | if self.unet_use_cross_frame_attention: 352 | attn_output = ( 353 | self.attn1( 354 | norm_hidden_states, 355 | attention_mask=attention_mask, 356 | video_length=video_length, 357 | ) 358 | + hidden_states 359 | ) 360 | else: 361 | attn_output = ( 362 | self.attn1(norm_hidden_states, attention_mask=attention_mask) 363 | + hidden_states 364 | ) 365 | if self.use_spatial_adapter: 366 | att1_lora_output = self.attn1_lora(attn_output) 367 | hidden_states += att1_lora_output 368 | 369 | if self.attn2 is not None: 370 | # Cross-Attention 371 | norm_hidden_states = ( 372 | self.norm2(hidden_states, timestep) 373 | if self.use_ada_layer_norm 374 | else self.norm2(hidden_states) 375 | ) 376 | attn_output = self.attn2( 377 | norm_hidden_states, 378 | encoder_hidden_states=encoder_hidden_states, 379 | attention_mask=attention_mask, 380 | ) 381 | if self.use_spatial_adapter: 382 | att2_lora_output = self.attn2_lora(attn_output) 383 | hidden_states += att2_lora_output 384 | 385 | # Feed-forward 386 | hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states 387 | 388 | # Temporal-Attention 389 | if self.unet_use_temporal_attention: 390 | d = hidden_states.shape[1] 391 | hidden_states = rearrange( 392 | hidden_states, "(b f) d c -> (b d) f c", f=video_length 393 | ) 394 | norm_hidden_states = ( 395 | self.norm_temp(hidden_states, timestep) 396 | if self.use_ada_layer_norm 397 | else self.norm_temp(hidden_states) 398 | ) 399 | hidden_states = self.attn_temp(norm_hidden_states) + hidden_states 400 | hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d) 401 | 402 | return hidden_states 403 | -------------------------------------------------------------------------------- /AnimateDiff/animatediff/models/resnet.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from einops import rearrange 8 
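# The "inflated" layers below reuse pretrained 2D weights on 5D video tensors by folding the frame
# axis into the batch axis, applying the 2D op, then unfolding again. A minimal sketch with made-up
# shapes (not taken from any config):
#   conv = InflatedConv3d(4, 320, kernel_size=3, padding=1)
#   conv(torch.randn(2, 4, 16, 64, 64)).shape   # -> torch.Size([2, 320, 16, 64, 64])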
| 9 | 10 | class InflatedConv3d(nn.Conv2d): 11 | def forward(self, x): 12 | video_length = x.shape[2] 13 | 14 | x = rearrange(x, "b c f h w -> (b f) c h w") 15 | x = super().forward(x) 16 | x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length) 17 | 18 | return x 19 | 20 | 21 | class InflatedGroupNorm(nn.GroupNorm): 22 | def forward(self, x): 23 | video_length = x.shape[2] 24 | 25 | x = rearrange(x, "b c f h w -> (b f) c h w") 26 | x = super().forward(x) 27 | x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length) 28 | 29 | return x 30 | 31 | 32 | class Upsample3D(nn.Module): 33 | def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"): 34 | super().__init__() 35 | self.channels = channels 36 | self.out_channels = out_channels or channels 37 | self.use_conv = use_conv 38 | self.use_conv_transpose = use_conv_transpose 39 | self.name = name 40 | 41 | conv = None 42 | if use_conv_transpose: 43 | raise NotImplementedError 44 | elif use_conv: 45 | self.conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1) 46 | 47 | def forward(self, hidden_states, output_size=None): 48 | assert hidden_states.shape[1] == self.channels 49 | 50 | if self.use_conv_transpose: 51 | raise NotImplementedError 52 | 53 | # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 54 | dtype = hidden_states.dtype 55 | if dtype == torch.bfloat16: 56 | hidden_states = hidden_states.to(torch.float32) 57 | 58 | # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984 59 | if hidden_states.shape[0] >= 64: 60 | hidden_states = hidden_states.contiguous() 61 | 62 | # if `output_size` is passed we force the interpolation output 63 | # size and do not make use of `scale_factor=2` 64 | if output_size is None: 65 | hidden_states = F.interpolate(hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest") 66 | else: 67 | hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest") 68 | 69 | # If the input is bfloat16, we cast back to bfloat16 70 | if dtype == torch.bfloat16: 71 | hidden_states = hidden_states.to(dtype) 72 | 73 | # if self.use_conv: 74 | # if self.name == "conv": 75 | # hidden_states = self.conv(hidden_states) 76 | # else: 77 | # hidden_states = self.Conv2d_0(hidden_states) 78 | hidden_states = self.conv(hidden_states) 79 | 80 | return hidden_states 81 | 82 | 83 | class Downsample3D(nn.Module): 84 | def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"): 85 | super().__init__() 86 | self.channels = channels 87 | self.out_channels = out_channels or channels 88 | self.use_conv = use_conv 89 | self.padding = padding 90 | stride = 2 91 | self.name = name 92 | 93 | if use_conv: 94 | self.conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding) 95 | else: 96 | raise NotImplementedError 97 | 98 | def forward(self, hidden_states): 99 | assert hidden_states.shape[1] == self.channels 100 | if self.use_conv and self.padding == 0: 101 | raise NotImplementedError 102 | 103 | assert hidden_states.shape[1] == self.channels 104 | hidden_states = self.conv(hidden_states) 105 | 106 | return hidden_states 107 | 108 | 109 | class ResnetBlock3D(nn.Module): 110 | def __init__( 111 | self, 112 | *, 113 | in_channels, 114 | out_channels=None, 115 | conv_shortcut=False, 116 | dropout=0.0, 117 | temb_channels=512, 118 | groups=32, 119 | groups_out=None, 120 | pre_norm=True, 121 | eps=1e-6, 122 | 
non_linearity="swish", 123 | time_embedding_norm="default", 124 | output_scale_factor=1.0, 125 | use_in_shortcut=None, 126 | use_inflated_groupnorm=False, 127 | ): 128 | super().__init__() 129 | self.pre_norm = pre_norm 130 | self.pre_norm = True 131 | self.in_channels = in_channels 132 | out_channels = in_channels if out_channels is None else out_channels 133 | self.out_channels = out_channels 134 | self.use_conv_shortcut = conv_shortcut 135 | self.time_embedding_norm = time_embedding_norm 136 | self.output_scale_factor = output_scale_factor 137 | 138 | if groups_out is None: 139 | groups_out = groups 140 | 141 | assert use_inflated_groupnorm != None 142 | if use_inflated_groupnorm: 143 | self.norm1 = InflatedGroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True) 144 | else: 145 | self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True) 146 | 147 | self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) 148 | 149 | if temb_channels is not None: 150 | if self.time_embedding_norm == "default": 151 | time_emb_proj_out_channels = out_channels 152 | elif self.time_embedding_norm == "scale_shift": 153 | time_emb_proj_out_channels = out_channels * 2 154 | else: 155 | raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") 156 | 157 | self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels) 158 | else: 159 | self.time_emb_proj = None 160 | 161 | if use_inflated_groupnorm: 162 | self.norm2 = InflatedGroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True) 163 | else: 164 | self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True) 165 | 166 | self.dropout = torch.nn.Dropout(dropout) 167 | self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) 168 | 169 | if non_linearity == "swish": 170 | self.nonlinearity = lambda x: F.silu(x) 171 | elif non_linearity == "mish": 172 | self.nonlinearity = Mish() 173 | elif non_linearity == "silu": 174 | self.nonlinearity = nn.SiLU() 175 | 176 | self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut 177 | 178 | self.conv_shortcut = None 179 | if self.use_in_shortcut: 180 | self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) 181 | 182 | def forward(self, input_tensor, temb): 183 | hidden_states = input_tensor 184 | 185 | hidden_states = self.norm1(hidden_states) 186 | hidden_states = self.nonlinearity(hidden_states) 187 | 188 | hidden_states = self.conv1(hidden_states) 189 | 190 | if temb is not None: 191 | temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None] 192 | 193 | if temb is not None and self.time_embedding_norm == "default": 194 | hidden_states = hidden_states + temb 195 | 196 | hidden_states = self.norm2(hidden_states) 197 | 198 | if temb is not None and self.time_embedding_norm == "scale_shift": 199 | scale, shift = torch.chunk(temb, 2, dim=1) 200 | hidden_states = hidden_states * (1 + scale) + shift 201 | 202 | hidden_states = self.nonlinearity(hidden_states) 203 | 204 | hidden_states = self.dropout(hidden_states) 205 | hidden_states = self.conv2(hidden_states) 206 | 207 | if self.conv_shortcut is not None: 208 | input_tensor = self.conv_shortcut(input_tensor) 209 | 210 | output_tensor = (input_tensor + hidden_states) / self.output_scale_factor 211 | 212 | return output_tensor 213 | 214 
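# Rough usage sketch for ResnetBlock3D above, with illustrative (not config-derived) sizes:
#   block = ResnetBlock3D(in_channels=320, out_channels=320, temb_channels=1280,
#                         use_inflated_groupnorm=True)
#   out = block(torch.randn(1, 320, 16, 32, 32), torch.randn(1, 1280))
#   out.shape  # -> torch.Size([1, 320, 16, 32, 32]); temb is projected and broadcast over f, h, w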
| 215 | class Mish(torch.nn.Module): 216 | def forward(self, hidden_states): 217 | return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states)) -------------------------------------------------------------------------------- /AnimateDiff/animatediff/utils/convert_lora_safetensor_to_diffusers.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023, Haofan Wang, Qixun Wang, All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # Changes were made to this source code by Yuwei Guo. 17 | """ Conversion script for the LoRA's safetensors checkpoints. """ 18 | 19 | import argparse 20 | 21 | import torch 22 | from safetensors.torch import load_file 23 | 24 | from diffusers import StableDiffusionPipeline 25 | 26 | 27 | def load_diffusers_lora(pipeline, state_dict, alpha=1.0): 28 | # directly update weight in diffusers model 29 | for key in state_dict: 30 | # only process lora down key 31 | if "up." in key: continue 32 | 33 | up_key = key.replace(".down.", ".up.") 34 | model_key = key.replace("processor.", "").replace("_lora", "").replace("down.", "").replace("up.", "") 35 | model_key = model_key.replace("to_out.", "to_out.0.") 36 | layer_infos = model_key.split(".")[:-1] 37 | 38 | curr_layer = pipeline.unet 39 | while len(layer_infos) > 0: 40 | temp_name = layer_infos.pop(0) 41 | curr_layer = curr_layer.__getattr__(temp_name) 42 | 43 | weight_down = state_dict[key] 44 | weight_up = state_dict[up_key] 45 | curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device) 46 | 47 | return pipeline 48 | 49 | 50 | def convert_lora(pipeline, state_dict, LORA_PREFIX_UNET="lora_unet", LORA_PREFIX_TEXT_ENCODER="lora_te", alpha=0.6): 51 | # load base model 52 | # pipeline = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32) 53 | 54 | # load LoRA weight from .safetensors 55 | # state_dict = load_file(checkpoint_path) 56 | 57 | visited = [] 58 | 59 | # directly update weight in diffusers model 60 | for key in state_dict: 61 | # it is suggested to print out the key, it usually will be something like below 62 | # "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight" 63 | 64 | # as we have set the alpha beforehand, so just skip 65 | if ".alpha" in key or key in visited: 66 | continue 67 | 68 | if "text" in key: 69 | layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_") 70 | curr_layer = pipeline.text_encoder 71 | else: 72 | layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_") 73 | curr_layer = pipeline.unet 74 | 75 | # find the target layer 76 | temp_name = layer_infos.pop(0) 77 | while len(layer_infos) > -1: 78 | try: 79 | curr_layer = curr_layer.__getattr__(temp_name) 80 | if len(layer_infos) > 0: 81 | temp_name = layer_infos.pop(0) 82 | elif len(layer_infos) == 0: 83 | break 84 | except Exception: 85 | if len(temp_name) > 
0: 86 | temp_name += "_" + layer_infos.pop(0) 87 | else: 88 | temp_name = layer_infos.pop(0) 89 | 90 | pair_keys = [] 91 | if "lora_down" in key: 92 | pair_keys.append(key.replace("lora_down", "lora_up")) 93 | pair_keys.append(key) 94 | else: 95 | pair_keys.append(key) 96 | pair_keys.append(key.replace("lora_up", "lora_down")) 97 | 98 | # update weight 99 | if len(state_dict[pair_keys[0]].shape) == 4: 100 | weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32) 101 | weight_down = state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32) 102 | curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3).to(curr_layer.weight.data.device) 103 | else: 104 | weight_up = state_dict[pair_keys[0]].to(torch.float32) 105 | weight_down = state_dict[pair_keys[1]].to(torch.float32) 106 | curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device) 107 | 108 | # update visited list 109 | for item in pair_keys: 110 | visited.append(item) 111 | 112 | return pipeline 113 | 114 | 115 | if __name__ == "__main__": 116 | parser = argparse.ArgumentParser() 117 | 118 | parser.add_argument( 119 | "--base_model_path", default=None, type=str, required=True, help="Path to the base model in diffusers format." 120 | ) 121 | parser.add_argument( 122 | "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." 123 | ) 124 | parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") 125 | parser.add_argument( 126 | "--lora_prefix_unet", default="lora_unet", type=str, help="The prefix of UNet weight in safetensors" 127 | ) 128 | parser.add_argument( 129 | "--lora_prefix_text_encoder", 130 | default="lora_te", 131 | type=str, 132 | help="The prefix of text encoder weight in safetensors", 133 | ) 134 | parser.add_argument("--alpha", default=0.75, type=float, help="The merging ratio in W = W0 + alpha * deltaW") 135 | parser.add_argument( 136 | "--to_safetensors", action="store_true", help="Whether to store pipeline in safetensors format or not." 137 | ) 138 | parser.add_argument("--device", type=str, help="Device to use (e.g. 
cpu, cuda:0, cuda:1, etc.)") 139 | 140 | args = parser.parse_args() 141 | 142 | base_model_path = args.base_model_path 143 | checkpoint_path = args.checkpoint_path 144 | dump_path = args.dump_path 145 | lora_prefix_unet = args.lora_prefix_unet 146 | lora_prefix_text_encoder = args.lora_prefix_text_encoder 147 | alpha = args.alpha 148 | 149 | pipe = convert(base_model_path, checkpoint_path, lora_prefix_unet, lora_prefix_text_encoder, alpha) 150 | 151 | pipe = pipe.to(args.device) 152 | pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors) 153 | -------------------------------------------------------------------------------- /AnimateDiff/animatediff/utils/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import imageio 3 | import numpy as np 4 | from typing import Union 5 | 6 | import torch 7 | import torchvision 8 | import torch.distributed as dist 9 | 10 | from safetensors import safe_open 11 | from tqdm import tqdm 12 | from einops import rearrange 13 | from animatediff.utils.convert_from_ckpt import ( 14 | convert_ldm_unet_checkpoint, 15 | convert_ldm_clip_checkpoint, 16 | convert_ldm_vae_checkpoint, 17 | ) 18 | from animatediff.utils.convert_lora_safetensor_to_diffusers import ( 19 | convert_lora, 20 | load_diffusers_lora, 21 | ) 22 | 23 | 24 | def zero_rank_print(s): 25 | if (not dist.is_initialized()) and (dist.is_initialized() and dist.get_rank() == 0): 26 | print("### " + s) 27 | 28 | 29 | def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8): 30 | videos = rearrange(videos, "b c t h w -> t b c h w") 31 | outputs = [] 32 | for x in videos: 33 | x = torchvision.utils.make_grid(x, nrow=n_rows) 34 | x = x.transpose(0, 1).transpose(1, 2).squeeze(-1) 35 | if rescale: 36 | x = (x + 1.0) / 2.0 # -1,1 -> 0,1 37 | x = (x * 255).numpy().astype(np.uint8) 38 | outputs.append(x) 39 | 40 | os.makedirs(os.path.dirname(path), exist_ok=True) 41 | imageio.mimsave(path, outputs, fps=fps) 42 | 43 | 44 | # DDIM Inversion 45 | @torch.no_grad() 46 | def init_prompt(prompt, pipeline): 47 | uncond_input = pipeline.tokenizer( 48 | [""], 49 | padding="max_length", 50 | max_length=pipeline.tokenizer.model_max_length, 51 | return_tensors="pt", 52 | ) 53 | uncond_embeddings = pipeline.text_encoder( 54 | uncond_input.input_ids.to(pipeline.device) 55 | )[0] 56 | text_input = pipeline.tokenizer( 57 | [prompt], 58 | padding="max_length", 59 | max_length=pipeline.tokenizer.model_max_length, 60 | truncation=True, 61 | return_tensors="pt", 62 | ) 63 | text_embeddings = pipeline.text_encoder(text_input.input_ids.to(pipeline.device))[0] 64 | context = torch.cat([uncond_embeddings, text_embeddings]) 65 | 66 | return context 67 | 68 | 69 | def next_step( 70 | model_output: Union[torch.FloatTensor, np.ndarray], 71 | timestep: int, 72 | sample: Union[torch.FloatTensor, np.ndarray], 73 | ddim_scheduler, 74 | ): 75 | timestep, next_timestep = ( 76 | min( 77 | timestep 78 | - ddim_scheduler.config.num_train_timesteps 79 | // ddim_scheduler.num_inference_steps, 80 | 999, 81 | ), 82 | timestep, 83 | ) 84 | alpha_prod_t = ( 85 | ddim_scheduler.alphas_cumprod[timestep] 86 | if timestep >= 0 87 | else ddim_scheduler.final_alpha_cumprod 88 | ) 89 | alpha_prod_t_next = ddim_scheduler.alphas_cumprod[next_timestep] 90 | beta_prod_t = 1 - alpha_prod_t 91 | next_original_sample = ( 92 | sample - beta_prod_t**0.5 * model_output 93 | ) / alpha_prod_t**0.5 94 | next_sample_direction = (1 - alpha_prod_t_next) ** 0.5 * model_output 
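    # next_original_sample above is the predicted clean latent x0; the line below re-noises it by one
    # DDIM step, x_{t+1} = sqrt(alpha_{t+1}) * x0 + sqrt(1 - alpha_{t+1}) * eps, which is what lets
    # ddim_loop walk a real latent back toward noise for inversion.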
95 | next_sample = alpha_prod_t_next**0.5 * next_original_sample + next_sample_direction 96 | return next_sample 97 | 98 | 99 | def get_noise_pred_single(latents, t, context, unet): 100 | noise_pred = unet(latents, t, encoder_hidden_states=context)["sample"] 101 | return noise_pred 102 | 103 | 104 | @torch.no_grad() 105 | def ddim_loop(pipeline, ddim_scheduler, latent, num_inv_steps, prompt): 106 | context = init_prompt(prompt, pipeline) 107 | uncond_embeddings, cond_embeddings = context.chunk(2) 108 | all_latent = [latent] 109 | latent = latent.clone().detach() 110 | for i in tqdm(range(num_inv_steps)): 111 | t = ddim_scheduler.timesteps[len(ddim_scheduler.timesteps) - i - 1] 112 | noise_pred = get_noise_pred_single(latent, t, cond_embeddings, pipeline.unet) 113 | latent = next_step(noise_pred, t, latent, ddim_scheduler) 114 | all_latent.append(latent) 115 | return all_latent 116 | 117 | 118 | @torch.no_grad() 119 | def ddim_inversion(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt=""): 120 | ddim_latents = ddim_loop( 121 | pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt 122 | ) 123 | return ddim_latents 124 | 125 | 126 | def load_weights( 127 | animation_pipeline, 128 | # motion module 129 | motion_module_path="", 130 | motion_module_lora_configs=[], 131 | # domain adapter 132 | adapter_lora_path="", 133 | adapter_lora_scale=1.0, 134 | # image layers 135 | dreambooth_model_path="", 136 | lora_model_path="", 137 | lora_alpha=0.8, 138 | ): 139 | # motion module 140 | unet_state_dict = {} 141 | if motion_module_path != "": 142 | print(f"load motion module from {motion_module_path}") 143 | motion_module_state_dict = torch.load(motion_module_path, map_location="cpu") 144 | motion_module_state_dict = ( 145 | motion_module_state_dict["state_dict"] 146 | if "state_dict" in motion_module_state_dict 147 | else motion_module_state_dict 148 | ) 149 | unet_state_dict.update( 150 | { 151 | name: param 152 | for name, param in motion_module_state_dict.items() 153 | if "motion_modules." in name 154 | } 155 | ) 156 | unet_state_dict.pop("animatediff_config", "") 157 | 158 | missing, unexpected = animation_pipeline.unet.load_state_dict( 159 | unet_state_dict, strict=False 160 | ) 161 | assert len(unexpected) == 0 162 | del unet_state_dict 163 | 164 | # base model 165 | if dreambooth_model_path != "": 166 | print(f"load dreambooth model from {dreambooth_model_path}") 167 | if dreambooth_model_path.endswith(".safetensors"): 168 | dreambooth_state_dict = {} 169 | with safe_open(dreambooth_model_path, framework="pt", device="cpu") as f: 170 | for key in f.keys(): 171 | dreambooth_state_dict[key] = f.get_tensor(key) 172 | elif dreambooth_model_path.endswith(".ckpt"): 173 | dreambooth_state_dict = torch.load( 174 | dreambooth_model_path, map_location="cpu" 175 | ) 176 | 177 | # 1. vae 178 | converted_vae_checkpoint = convert_ldm_vae_checkpoint( 179 | dreambooth_state_dict, animation_pipeline.vae.config 180 | ) 181 | animation_pipeline.vae.load_state_dict(converted_vae_checkpoint) 182 | # 2. unet 183 | converted_unet_checkpoint = convert_ldm_unet_checkpoint( 184 | dreambooth_state_dict, animation_pipeline.unet.config 185 | ) 186 | animation_pipeline.unet.load_state_dict(converted_unet_checkpoint, strict=False) 187 | # 3. 
text_model 188 | animation_pipeline.text_encoder = convert_ldm_clip_checkpoint( 189 | dreambooth_state_dict 190 | ) 191 | del dreambooth_state_dict 192 | 193 | # lora layers 194 | if lora_model_path != "": 195 | print(f"load lora model from {lora_model_path}") 196 | assert lora_model_path.endswith(".safetensors") 197 | lora_state_dict = {} 198 | with safe_open(lora_model_path, framework="pt", device="cpu") as f: 199 | for key in f.keys(): 200 | lora_state_dict[key] = f.get_tensor(key) 201 | 202 | animation_pipeline = convert_lora( 203 | animation_pipeline, lora_state_dict, alpha=lora_alpha 204 | ) 205 | del lora_state_dict 206 | 207 | # domain adapter lora 208 | if adapter_lora_path != "": 209 | print(f"load domain lora from {adapter_lora_path}") 210 | domain_lora_state_dict = torch.load(adapter_lora_path, map_location="cpu") 211 | domain_lora_state_dict = ( 212 | domain_lora_state_dict["state_dict"] 213 | if "state_dict" in domain_lora_state_dict 214 | else domain_lora_state_dict 215 | ) 216 | domain_lora_state_dict.pop("animatediff_config", "") 217 | 218 | animation_pipeline = load_diffusers_lora( 219 | animation_pipeline, domain_lora_state_dict, alpha=adapter_lora_scale 220 | ) 221 | 222 | # motion module lora 223 | for motion_module_lora_config in motion_module_lora_configs: 224 | path, alpha = ( 225 | motion_module_lora_config["path"], 226 | motion_module_lora_config["alpha"], 227 | ) 228 | print(f"load motion LoRA from {path}") 229 | motion_lora_state_dict = torch.load(path, map_location="cpu") 230 | motion_lora_state_dict = ( 231 | motion_lora_state_dict["state_dict"] 232 | if "state_dict" in motion_lora_state_dict 233 | else motion_lora_state_dict 234 | ) 235 | motion_lora_state_dict.pop("animatediff_config", "") 236 | 237 | animation_pipeline = load_diffusers_lora( 238 | animation_pipeline, motion_lora_state_dict, alpha 239 | ) 240 | 241 | return animation_pipeline 242 | -------------------------------------------------------------------------------- /AnimateDiff/app.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import json 4 | import torch 5 | import random 6 | 7 | import gradio as gr 8 | from glob import glob 9 | from omegaconf import OmegaConf 10 | from datetime import datetime 11 | from safetensors import safe_open 12 | 13 | from diffusers import AutoencoderKL 14 | from diffusers import DDIMScheduler, EulerDiscreteScheduler, PNDMScheduler 15 | from diffusers.utils.import_utils import is_xformers_available 16 | from transformers import CLIPTextModel, CLIPTokenizer 17 | 18 | from animatediff.models.unet import UNet3DConditionModel 19 | from animatediff.pipelines.pipeline_animation import AnimationPipeline 20 | from animatediff.utils.util import save_videos_grid 21 | from animatediff.utils.convert_from_ckpt import convert_ldm_unet_checkpoint, convert_ldm_clip_checkpoint, convert_ldm_vae_checkpoint 22 | from animatediff.utils.convert_lora_safetensor_to_diffusers import convert_lora 23 | 24 | 25 | sample_idx = 0 26 | scheduler_dict = { 27 | "Euler": EulerDiscreteScheduler, 28 | "PNDM": PNDMScheduler, 29 | "DDIM": DDIMScheduler, 30 | } 31 | 32 | css = """ 33 | .toolbutton { 34 | margin-buttom: 0em 0em 0em 0em; 35 | max-width: 2.5em; 36 | min-width: 2.5em !important; 37 | height: 2.5em; 38 | } 39 | """ 40 | 41 | class AnimateController: 42 | def __init__(self): 43 | 44 | # config dirs 45 | self.basedir = os.getcwd() 46 | self.stable_diffusion_dir = os.path.join(self.basedir, "models", "StableDiffusion") 47 | 
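        # Checkpoint layout assumed by this demo (see the refresh_* methods below):
        #   models/StableDiffusion/   diffusers-format folders (tokenizer/, text_encoder/, vae/, unet/)
        #   models/Motion_Module/     *.ckpt motion modules
        #   models/DreamBooth_LoRA/   *.safetensors personalized and LoRA checkpoints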
self.motion_module_dir = os.path.join(self.basedir, "models", "Motion_Module") 48 | self.personalized_model_dir = os.path.join(self.basedir, "models", "DreamBooth_LoRA") 49 | self.savedir = os.path.join(self.basedir, "samples", datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S")) 50 | self.savedir_sample = os.path.join(self.savedir, "sample") 51 | os.makedirs(self.savedir, exist_ok=True) 52 | 53 | self.stable_diffusion_list = [] 54 | self.motion_module_list = [] 55 | self.personalized_model_list = [] 56 | 57 | self.refresh_stable_diffusion() 58 | self.refresh_motion_module() 59 | self.refresh_personalized_model() 60 | 61 | # config models 62 | self.tokenizer = None 63 | self.text_encoder = None 64 | self.vae = None 65 | self.unet = None 66 | self.pipeline = None 67 | self.lora_model_state_dict = {} 68 | 69 | self.inference_config = OmegaConf.load("configs/inference/inference.yaml") 70 | 71 | def refresh_stable_diffusion(self): 72 | self.stable_diffusion_list = glob(os.path.join(self.stable_diffusion_dir, "*/")) 73 | 74 | def refresh_motion_module(self): 75 | motion_module_list = glob(os.path.join(self.motion_module_dir, "*.ckpt")) 76 | self.motion_module_list = [os.path.basename(p) for p in motion_module_list] 77 | 78 | def refresh_personalized_model(self): 79 | personalized_model_list = glob(os.path.join(self.personalized_model_dir, "*.safetensors")) 80 | self.personalized_model_list = [os.path.basename(p) for p in personalized_model_list] 81 | 82 | def update_stable_diffusion(self, stable_diffusion_dropdown): 83 | self.tokenizer = CLIPTokenizer.from_pretrained(stable_diffusion_dropdown, subfolder="tokenizer") 84 | self.text_encoder = CLIPTextModel.from_pretrained(stable_diffusion_dropdown, subfolder="text_encoder").cuda() 85 | self.vae = AutoencoderKL.from_pretrained(stable_diffusion_dropdown, subfolder="vae").cuda() 86 | self.unet = UNet3DConditionModel.from_pretrained_2d(stable_diffusion_dropdown, subfolder="unet", unet_additional_kwargs=OmegaConf.to_container(self.inference_config.unet_additional_kwargs)).cuda() 87 | return gr.Dropdown.update() 88 | 89 | def update_motion_module(self, motion_module_dropdown): 90 | if self.unet is None: 91 | gr.Info(f"Please select a pretrained model path.") 92 | return gr.Dropdown.update(value=None) 93 | else: 94 | motion_module_dropdown = os.path.join(self.motion_module_dir, motion_module_dropdown) 95 | motion_module_state_dict = torch.load(motion_module_dropdown, map_location="cpu") 96 | missing, unexpected = self.unet.load_state_dict(motion_module_state_dict, strict=False) 97 | assert len(unexpected) == 0 98 | return gr.Dropdown.update() 99 | 100 | def update_base_model(self, base_model_dropdown): 101 | if self.unet is None: 102 | gr.Info(f"Please select a pretrained model path.") 103 | return gr.Dropdown.update(value=None) 104 | else: 105 | base_model_dropdown = os.path.join(self.personalized_model_dir, base_model_dropdown) 106 | base_model_state_dict = {} 107 | with safe_open(base_model_dropdown, framework="pt", device="cpu") as f: 108 | for key in f.keys(): 109 | base_model_state_dict[key] = f.get_tensor(key) 110 | 111 | converted_vae_checkpoint = convert_ldm_vae_checkpoint(base_model_state_dict, self.vae.config) 112 | self.vae.load_state_dict(converted_vae_checkpoint) 113 | 114 | converted_unet_checkpoint = convert_ldm_unet_checkpoint(base_model_state_dict, self.unet.config) 115 | self.unet.load_state_dict(converted_unet_checkpoint, strict=False) 116 | 117 | self.text_encoder = convert_ldm_clip_checkpoint(base_model_state_dict) 118 | return 
gr.Dropdown.update() 119 | 120 | def update_lora_model(self, lora_model_dropdown): 121 | lora_model_dropdown = os.path.join(self.personalized_model_dir, lora_model_dropdown) 122 | self.lora_model_state_dict = {} 123 | if lora_model_dropdown == "none": pass 124 | else: 125 | with safe_open(lora_model_dropdown, framework="pt", device="cpu") as f: 126 | for key in f.keys(): 127 | self.lora_model_state_dict[key] = f.get_tensor(key) 128 | return gr.Dropdown.update() 129 | 130 | def animate( 131 | self, 132 | stable_diffusion_dropdown, 133 | motion_module_dropdown, 134 | base_model_dropdown, 135 | lora_alpha_slider, 136 | prompt_textbox, 137 | negative_prompt_textbox, 138 | sampler_dropdown, 139 | sample_step_slider, 140 | width_slider, 141 | length_slider, 142 | height_slider, 143 | cfg_scale_slider, 144 | seed_textbox 145 | ): 146 | if self.unet is None: 147 | raise gr.Error(f"Please select a pretrained model path.") 148 | if motion_module_dropdown == "": 149 | raise gr.Error(f"Please select a motion module.") 150 | if base_model_dropdown == "": 151 | raise gr.Error(f"Please select a base DreamBooth model.") 152 | 153 | if is_xformers_available(): self.unet.enable_xformers_memory_efficient_attention() 154 | 155 | pipeline = AnimationPipeline( 156 | vae=self.vae, text_encoder=self.text_encoder, tokenizer=self.tokenizer, unet=self.unet, 157 | scheduler=scheduler_dict[sampler_dropdown](**OmegaConf.to_container(self.inference_config.noise_scheduler_kwargs)) 158 | ).to("cuda") 159 | 160 | if self.lora_model_state_dict != {}: 161 | pipeline = convert_lora(pipeline, self.lora_model_state_dict, alpha=lora_alpha_slider) 162 | 163 | pipeline.to("cuda") 164 | 165 | if seed_textbox != -1 and seed_textbox != "": torch.manual_seed(int(seed_textbox)) 166 | else: torch.seed() 167 | seed = torch.initial_seed() 168 | 169 | sample = pipeline( 170 | prompt_textbox, 171 | negative_prompt = negative_prompt_textbox, 172 | num_inference_steps = sample_step_slider, 173 | guidance_scale = cfg_scale_slider, 174 | width = width_slider, 175 | height = height_slider, 176 | video_length = length_slider, 177 | ).videos 178 | 179 | save_sample_path = os.path.join(self.savedir_sample, f"{sample_idx}.mp4") 180 | save_videos_grid(sample, save_sample_path) 181 | 182 | sample_config = { 183 | "prompt": prompt_textbox, 184 | "n_prompt": negative_prompt_textbox, 185 | "sampler": sampler_dropdown, 186 | "num_inference_steps": sample_step_slider, 187 | "guidance_scale": cfg_scale_slider, 188 | "width": width_slider, 189 | "height": height_slider, 190 | "video_length": length_slider, 191 | "seed": seed 192 | } 193 | json_str = json.dumps(sample_config, indent=4) 194 | with open(os.path.join(self.savedir, "logs.json"), "a") as f: 195 | f.write(json_str) 196 | f.write("\n\n") 197 | 198 | return gr.Video.update(value=save_sample_path) 199 | 200 | 201 | controller = AnimateController() 202 | 203 | 204 | def ui(): 205 | with gr.Blocks(css=css) as demo: 206 | gr.Markdown( 207 | """ 208 | # [AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning](https://arxiv.org/abs/2307.04725) 209 | Yuwei Guo, Ceyuan Yang*, Anyi Rao, Yaohui Wang, Yu Qiao, Dahua Lin, Bo Dai (*Corresponding Author)
210 | [Arxiv Report](https://arxiv.org/abs/2307.04725) | [Project Page](https://animatediff.github.io/) | [Github](https://github.com/guoyww/animatediff/) 211 | """ 212 | ) 213 | with gr.Column(variant="panel"): 214 | gr.Markdown( 215 | """ 216 | ### 1. Model checkpoints (select pretrained model path first). 217 | """ 218 | ) 219 | with gr.Row(): 220 | stable_diffusion_dropdown = gr.Dropdown( 221 | label="Pretrained Model Path", 222 | choices=controller.stable_diffusion_list, 223 | interactive=True, 224 | ) 225 | stable_diffusion_dropdown.change(fn=controller.update_stable_diffusion, inputs=[stable_diffusion_dropdown], outputs=[stable_diffusion_dropdown]) 226 | 227 | stable_diffusion_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton") 228 | def update_stable_diffusion(): 229 | controller.refresh_stable_diffusion() 230 | return gr.Dropdown.update(choices=controller.stable_diffusion_list) 231 | stable_diffusion_refresh_button.click(fn=update_stable_diffusion, inputs=[], outputs=[stable_diffusion_dropdown]) 232 | 233 | with gr.Row(): 234 | motion_module_dropdown = gr.Dropdown( 235 | label="Select motion module", 236 | choices=controller.motion_module_list, 237 | interactive=True, 238 | ) 239 | motion_module_dropdown.change(fn=controller.update_motion_module, inputs=[motion_module_dropdown], outputs=[motion_module_dropdown]) 240 | 241 | motion_module_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton") 242 | def update_motion_module(): 243 | controller.refresh_motion_module() 244 | return gr.Dropdown.update(choices=controller.motion_module_list) 245 | motion_module_refresh_button.click(fn=update_motion_module, inputs=[], outputs=[motion_module_dropdown]) 246 | 247 | base_model_dropdown = gr.Dropdown( 248 | label="Select base Dreambooth model (required)", 249 | choices=controller.personalized_model_list, 250 | interactive=True, 251 | ) 252 | base_model_dropdown.change(fn=controller.update_base_model, inputs=[base_model_dropdown], outputs=[base_model_dropdown]) 253 | 254 | lora_model_dropdown = gr.Dropdown( 255 | label="Select LoRA model (optional)", 256 | choices=["none"] + controller.personalized_model_list, 257 | value="none", 258 | interactive=True, 259 | ) 260 | lora_model_dropdown.change(fn=controller.update_lora_model, inputs=[lora_model_dropdown], outputs=[lora_model_dropdown]) 261 | 262 | lora_alpha_slider = gr.Slider(label="LoRA alpha", value=0.8, minimum=0, maximum=2, interactive=True) 263 | 264 | personalized_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton") 265 | def update_personalized_model(): 266 | controller.refresh_personalized_model() 267 | return [ 268 | gr.Dropdown.update(choices=controller.personalized_model_list), 269 | gr.Dropdown.update(choices=["none"] + controller.personalized_model_list) 270 | ] 271 | personalized_refresh_button.click(fn=update_personalized_model, inputs=[], outputs=[base_model_dropdown, lora_model_dropdown]) 272 | 273 | with gr.Column(variant="panel"): 274 | gr.Markdown( 275 | """ 276 | ### 2. Configs for AnimateDiff. 
277 | """ 278 | ) 279 | 280 | prompt_textbox = gr.Textbox(label="Prompt", lines=2) 281 | negative_prompt_textbox = gr.Textbox(label="Negative prompt", lines=2) 282 | 283 | with gr.Row().style(equal_height=False): 284 | with gr.Column(): 285 | with gr.Row(): 286 | sampler_dropdown = gr.Dropdown(label="Sampling method", choices=list(scheduler_dict.keys()), value=list(scheduler_dict.keys())[0]) 287 | sample_step_slider = gr.Slider(label="Sampling steps", value=25, minimum=10, maximum=100, step=1) 288 | 289 | width_slider = gr.Slider(label="Width", value=512, minimum=256, maximum=1024, step=64) 290 | height_slider = gr.Slider(label="Height", value=512, minimum=256, maximum=1024, step=64) 291 | length_slider = gr.Slider(label="Animation length", value=16, minimum=8, maximum=24, step=1) 292 | cfg_scale_slider = gr.Slider(label="CFG Scale", value=7.5, minimum=0, maximum=20) 293 | 294 | with gr.Row(): 295 | seed_textbox = gr.Textbox(label="Seed", value=-1) 296 | seed_button = gr.Button(value="\U0001F3B2", elem_classes="toolbutton") 297 | seed_button.click(fn=lambda: gr.Textbox.update(value=random.randint(1, 1e8)), inputs=[], outputs=[seed_textbox]) 298 | 299 | generate_button = gr.Button(value="Generate", variant='primary') 300 | 301 | result_video = gr.Video(label="Generated Animation", interactive=False) 302 | 303 | generate_button.click( 304 | fn=controller.animate, 305 | inputs=[ 306 | stable_diffusion_dropdown, 307 | motion_module_dropdown, 308 | base_model_dropdown, 309 | lora_alpha_slider, 310 | prompt_textbox, 311 | negative_prompt_textbox, 312 | sampler_dropdown, 313 | sample_step_slider, 314 | width_slider, 315 | length_slider, 316 | height_slider, 317 | cfg_scale_slider, 318 | seed_textbox, 319 | ], 320 | outputs=[result_video] 321 | ) 322 | 323 | return demo 324 | 325 | 326 | if __name__ == "__main__": 327 | demo = ui() 328 | demo.launch(share=True) 329 | -------------------------------------------------------------------------------- /AnimateDiff/configs/inference/inference-v1.yaml: -------------------------------------------------------------------------------- 1 | unet_additional_kwargs: 2 | unet_use_cross_frame_attention: false 3 | unet_use_temporal_attention: false 4 | use_motion_module: true 5 | motion_module_resolutions: [1,2,4,8] 6 | motion_module_mid_block: false 7 | motion_module_decoder_only: false 8 | motion_module_type: "Vanilla" 9 | use_spatial_adapter: true 10 | 11 | motion_module_kwargs: 12 | num_attention_heads: 8 13 | num_transformer_block: 1 14 | attention_block_types: [ "Temporal_Self", "Temporal_Self" ] 15 | temporal_position_encoding: true 16 | temporal_position_encoding_max_len: 24 17 | temporal_attention_dim_div: 1 18 | use_motion_adapter: 1 19 | motion_adapter_scale: 0 20 | 21 | 22 | noise_scheduler_kwargs: 23 | beta_start: 0.00085 24 | beta_end: 0.012 25 | beta_schedule: "linear" 26 | steps_offset: 1 27 | clip_sample: False 28 | -------------------------------------------------------------------------------- /AnimateDiff/configs/inference/inference-v2.yaml: -------------------------------------------------------------------------------- 1 | unet_additional_kwargs: 2 | use_inflated_groupnorm: true 3 | unet_use_cross_frame_attention: false 4 | unet_use_temporal_attention: false 5 | use_motion_module: true 6 | motion_module_resolutions: [1,2,4,8] 7 | motion_module_mid_block: true 8 | motion_module_decoder_only: false 9 | motion_module_type: "Vanilla" 10 | 11 | motion_module_kwargs: 12 | num_attention_heads: 8 13 | num_transformer_block: 1 14 | 
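    # two temporal self-attention blocks per motion module; temporal_position_encoding_max_len is 32
    # here (24 in the v1 config), i.e. the longest frame window the temporal positional encoding covers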
attention_block_types: [ "Temporal_Self", "Temporal_Self" ] 15 | temporal_position_encoding: true 16 | temporal_position_encoding_max_len: 32 17 | temporal_attention_dim_div: 1 18 | 19 | noise_scheduler_kwargs: 20 | beta_start: 0.00085 21 | beta_end: 0.012 22 | beta_schedule: "linear" 23 | steps_offset: 1 24 | clip_sample: False 25 | -------------------------------------------------------------------------------- /AnimateDiff/configs/inference/inference-v3.yaml: -------------------------------------------------------------------------------- 1 | unet_additional_kwargs: 2 | use_inflated_groupnorm: true 3 | use_motion_module: true 4 | motion_module_resolutions: [1,2,4,8] 5 | motion_module_mid_block: false 6 | motion_module_type: Vanilla 7 | 8 | motion_module_kwargs: 9 | num_attention_heads: 8 10 | num_transformer_block: 1 11 | attention_block_types: [ "Temporal_Self", "Temporal_Self" ] 12 | temporal_position_encoding: true 13 | temporal_position_encoding_max_len: 32 14 | temporal_attention_dim_div: 1 15 | zero_initialize: true 16 | 17 | noise_scheduler_kwargs: 18 | beta_start: 0.00085 19 | beta_end: 0.012 20 | beta_schedule: "linear" 21 | steps_offset: 1 22 | clip_sample: False 23 | -------------------------------------------------------------------------------- /AnimateDiff/configs/inference/sparsectrl/image_condition.yaml: -------------------------------------------------------------------------------- 1 | controlnet_additional_kwargs: 2 | set_noisy_sample_input_to_zero: true 3 | use_simplified_condition_embedding: false 4 | conditioning_channels: 3 5 | 6 | use_motion_module: true 7 | motion_module_resolutions: [1,2,4,8] 8 | motion_module_mid_block: false 9 | motion_module_type: "Vanilla" 10 | 11 | motion_module_kwargs: 12 | num_attention_heads: 8 13 | num_transformer_block: 1 14 | attention_block_types: [ "Temporal_Self" ] 15 | temporal_position_encoding: true 16 | temporal_position_encoding_max_len: 32 17 | temporal_attention_dim_div: 1 18 | -------------------------------------------------------------------------------- /AnimateDiff/configs/inference/sparsectrl/latent_condition.yaml: -------------------------------------------------------------------------------- 1 | controlnet_additional_kwargs: 2 | set_noisy_sample_input_to_zero: true 3 | use_simplified_condition_embedding: true 4 | conditioning_channels: 4 5 | 6 | use_motion_module: true 7 | motion_module_resolutions: [1,2,4,8] 8 | motion_module_mid_block: false 9 | motion_module_type: "Vanilla" 10 | 11 | motion_module_kwargs: 12 | num_attention_heads: 8 13 | num_transformer_block: 1 14 | attention_block_types: [ "Temporal_Self" ] 15 | temporal_position_encoding: true 16 | temporal_position_encoding_max_len: 32 17 | temporal_attention_dim_div: 1 18 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-1-ToonYou.yaml: -------------------------------------------------------------------------------- 1 | # motion module v1_14 2 | - dreambooth_path: "models/DreamBooth_LoRA/toonyou_beta6.safetensors" 3 | lora_model_path: "" 4 | 5 | inference_config: "configs/inference/inference-v1.yaml" 6 | motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 7 | 8 | seed: [10788741199826055526, 6520604954829636163, 6519455744612555650, 16372571278361863751] 9 | steps: 25 10 | guidance_scale: 8 11 | 12 | prompt: 13 | - "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress" 14 | - "masterpiece, best quality, 1girl, solo, cherry blossoms, 
hanami, pink flower, white flower, spring season, wisteria, petals, flower, plum blossoms, outdoors, falling petals, white hair, black eyes," 15 | - "best quality, masterpiece, 1boy, formal, abstract, looking at viewer, masculine, marble pattern" 16 | - "best quality, masterpiece, 1girl, cloudy sky, dandelion, contrapposto, alternate hairstyle," 17 | 18 | n_prompt: 19 | - "worst quality, low quality, letterboxed" 20 | 21 | 22 | # motion module v1_15 23 | - dreambooth_path: "models/DreamBooth_LoRA/toonyou_beta6.safetensors" 24 | lora_model_path: "" 25 | 26 | inference_config: "configs/inference/inference-v1.yaml" 27 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 28 | 29 | seed: [10788741199826055526, 6520604954829636163, 6519455744612555650, 16372571278361863751] 30 | steps: 25 31 | guidance_scale: 8 32 | 33 | prompt: 34 | - "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress" 35 | - "masterpiece, best quality, 1girl, solo, cherry blossoms, hanami, pink flower, white flower, spring season, wisteria, petals, flower, plum blossoms, outdoors, falling petals, white hair, black eyes," 36 | - "best quality, masterpiece, 1boy, formal, abstract, looking at viewer, masculine, marble pattern" 37 | - "best quality, masterpiece, 1girl, cloudy sky, dandelion, contrapposto, alternate hairstyle," 38 | 39 | n_prompt: 40 | - "worst quality, low quality, letterboxed" 41 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-2-Lyriel.yaml: -------------------------------------------------------------------------------- 1 | # motion module v1_14 2 | - dreambooth_path: "models/DreamBooth_LoRA/lyriel_v16.safetensors" 3 | lora_model_path: "" 4 | 5 | inference_config: "configs/inference/inference-v1.yaml" 6 | motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 7 | 8 | seed: [10917152860782582783, 6399018107401806238, 15875751942533906793, 6653196880059936551] 9 | steps: 25 10 | guidance_scale: 8 11 | 12 | prompt: 13 | - "dark shot, epic realistic, portrait of halo, sunglasses, blue eyes, tartan scarf, white hair by atey ghailan, by greg rutkowski, by greg tocchini, by james gilleard, by joe fenton, by kaethe butcher, gradient yellow, black, brown and magenta color scheme, grunge aesthetic!!! graffiti tag wall background, art by greg rutkowski and artgerm, soft cinematic light, adobe lightroom, photolab, hdr, intricate, highly detailed, depth of field, faded, neutral colors, hdr, muted colors, hyperdetailed, artstation, cinematic, warm lights, dramatic light, intricate details, complex background, rutkowski, teal and orange" 14 | - "A forbidden castle high up in the mountains, pixel art, intricate details2, hdr, intricate details, hyperdetailed5, natural skin texture, hyperrealism, soft light, sharp, game art, key visual, surreal" 15 | - "dark theme, medieval portrait of a man sharp features, grim, cold stare, dark colors, Volumetric lighting, baroque oil painting by Greg Rutkowski, Artgerm, WLOP, Alphonse Mucha dynamic lighting hyperdetailed intricately detailed, hdr, muted colors, complex background, hyperrealism, hyperdetailed, amandine van ray" 16 | - "As I have gone alone in there and with my treasures bold, I can keep my secret where and hint of riches new and old. Begin it where warm waters halt and take it in a canyon down, not far but too far to walk, put in below the home of brown." 
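  # the four prompts above, the four seeds, and the four negative prompts below are intended to be
  # paired one-to-one per generated sample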
17 | 18 | n_prompt: 19 | - "3d, cartoon, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, young, loli, elf, 3d, illustration" 20 | - "3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular" 21 | - "dof, grayscale, black and white, bw, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular,badhandsv5-neg, By bad artist -neg 1, monochrome" 22 | - "holding an item, cowboy, hat, cartoon, 3d, disfigured, bad art, deformed,extra limbs,close up,b&w, wierd colors, blurry, duplicate, morbid, mutilated, [out of frame], extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, ugly, blurry, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, out of frame, ugly, extra limbs, bad anatomy, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, mutated hands, fused fingers, too many fingers, long neck, Photoshop, video game, ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, mutation, mutated, extra limbs, extra legs, extra arms, disfigured, deformed, cross-eye, body out of frame, blurry, bad art, bad anatomy, 3d render" 23 | 24 | 25 | # motion module v1_15 26 | - dreambooth_path: "models/DreamBooth_LoRA/lyriel_v16.safetensors" 27 | lora_model_path: "" 28 | 29 | inference_config: "configs/inference/inference-v1.yaml" 30 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 31 | 32 | seed: [10917152860782582783, 6399018107401806238, 15875751942533906793, 6653196880059936551] 33 | steps: 25 34 | guidance_scale: 8 35 | 36 | prompt: 37 | - "dark shot, epic realistic, portrait of halo, sunglasses, blue eyes, tartan scarf, white hair by atey ghailan, by greg rutkowski, by greg tocchini, by james gilleard, by joe fenton, by kaethe butcher, gradient yellow, black, brown and magenta color scheme, grunge aesthetic!!! graffiti tag wall background, art by greg rutkowski and artgerm, soft cinematic light, adobe lightroom, photolab, hdr, intricate, highly detailed, depth of field, faded, neutral colors, hdr, muted colors, hyperdetailed, artstation, cinematic, warm lights, dramatic light, intricate details, complex background, rutkowski, teal and orange" 38 | - "A forbidden castle high up in the mountains, pixel art, intricate details2, hdr, intricate details, hyperdetailed5, natural skin texture, hyperrealism, soft light, sharp, game art, key visual, surreal" 39 | - "dark theme, medieval portrait of a man sharp features, grim, cold stare, dark colors, Volumetric lighting, baroque oil painting by Greg Rutkowski, Artgerm, WLOP, Alphonse Mucha dynamic lighting hyperdetailed intricately detailed, hdr, muted colors, complex background, hyperrealism, hyperdetailed, amandine van ray" 40 | - "As I have gone alone in there and with my treasures bold, I can keep my secret where and hint of riches new and old. Begin it where warm waters halt and take it in a canyon down, not far but too far to walk, put in below the home of brown." 
41 | 42 | n_prompt: 43 | - "3d, cartoon, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, young, loli, elf, 3d, illustration" 44 | - "3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular" 45 | - "dof, grayscale, black and white, bw, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular,badhandsv5-neg, By bad artist -neg 1, monochrome" 46 | - "holding an item, cowboy, hat, cartoon, 3d, disfigured, bad art, deformed,extra limbs,close up,b&w, wierd colors, blurry, duplicate, morbid, mutilated, [out of frame], extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, ugly, blurry, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, out of frame, ugly, extra limbs, bad anatomy, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, mutated hands, fused fingers, too many fingers, long neck, Photoshop, video game, ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, mutation, mutated, extra limbs, extra legs, extra arms, disfigured, deformed, cross-eye, body out of frame, blurry, bad art, bad anatomy, 3d render" 47 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-3-RcnzCartoon.yaml: -------------------------------------------------------------------------------- 1 | # motion module v1_14 2 | - dreambooth_path: "models/DreamBooth_LoRA/rcnzCartoon3d_v10.safetensors" 3 | lora_model_path: "" 4 | 5 | inference_config: "configs/inference/inference-v1.yaml" 6 | motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 7 | 8 | seed: [16931037867122267877, 2094308009433392066, 4292543217695451092, 15572665120852309890] 9 | steps: 25 10 | guidance_scale: 8 11 | 12 | prompt: 13 | - "Jane Eyre with headphones, natural skin texture,4mm,k textures, soft cinematic light, adobe lightroom, photolab, hdr, intricate, elegant, highly detailed, sharp focus, cinematic look, soothing tones, insane details, intricate details, hyperdetailed, low contrast, soft cinematic light, dim colors, exposure blend, hdr, faded" 14 | - "close up Portrait photo of muscular bearded guy in a worn mech suit, light bokeh, intricate, steel metal [rust], elegant, sharp focus, photo by greg rutkowski, soft lighting, vibrant colors, masterpiece, streets, detailed face" 15 | - "absurdres, photorealistic, masterpiece, a 30 year old man with gold framed, aviator reading glasses and a black hooded jacket and a beard, professional photo, a character portrait, altermodern, detailed eyes, detailed lips, detailed face, grey eyes" 16 | - "a golden labrador, warm vibrant colours, natural lighting, dappled lighting, diffused lighting, absurdres, highres,k, uhd, hdr, rtx, unreal, octane render, RAW photo, photorealistic, global illumination, subsurface scattering" 17 | 18 | n_prompt: 19 | - "deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, mutated hands and fingers, disconnected limbs, mutation, mutated, ugly, disgusting, 
blurry, amputation" 20 | - "nude, cross eyed, tongue, open mouth, inside, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, red eyes, muscular" 21 | - "easynegative, cartoon, anime, sketches, necklace, earrings worst quality, low quality, normal quality, bad anatomy, bad hands, shiny skin, error, missing fingers, extra digit, fewer digits, jpeg artifacts, signature, watermark, username, blurry, chubby, anorectic, bad eyes, old, wrinkled skin, red skin, photograph By bad artist -neg, big eyes, muscular face," 22 | - "beard, EasyNegative, lowres, chromatic aberration, depth of field, motion blur, blurry, bokeh, bad quality, worst quality, multiple arms, badhand" 23 | 24 | 25 | # motion module v1_15 26 | - dreambooth_path: "models/DreamBooth_LoRA/rcnzCartoon3d_v10.safetensors" 27 | lora_model_path: "" 28 | 29 | inference_config: "configs/inference/inference-v1.yaml" 30 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 31 | 32 | seed: [16931037867122267877, 2094308009433392066, 4292543217695451092, 15572665120852309890] 33 | steps: 25 34 | guidance_scale: 8 35 | 36 | prompt: 37 | - "Jane Eyre with headphones, natural skin texture,4mm,k textures, soft cinematic light, adobe lightroom, photolab, hdr, intricate, elegant, highly detailed, sharp focus, cinematic look, soothing tones, insane details, intricate details, hyperdetailed, low contrast, soft cinematic light, dim colors, exposure blend, hdr, faded" 38 | - "close up Portrait photo of muscular bearded guy in a worn mech suit, light bokeh, intricate, steel metal [rust], elegant, sharp focus, photo by greg rutkowski, soft lighting, vibrant colors, masterpiece, streets, detailed face" 39 | - "absurdres, photorealistic, masterpiece, a 30 year old man with gold framed, aviator reading glasses and a black hooded jacket and a beard, professional photo, a character portrait, altermodern, detailed eyes, detailed lips, detailed face, grey eyes" 40 | - "a golden labrador, warm vibrant colours, natural lighting, dappled lighting, diffused lighting, absurdres, highres,k, uhd, hdr, rtx, unreal, octane render, RAW photo, photorealistic, global illumination, subsurface scattering" 41 | 42 | n_prompt: 43 | - "deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, mutated hands and fingers, disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation" 44 | - "nude, cross eyed, tongue, open mouth, inside, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, red eyes, muscular" 45 | - "easynegative, cartoon, anime, sketches, necklace, earrings worst quality, low quality, normal quality, bad anatomy, bad hands, shiny skin, error, missing fingers, extra digit, fewer digits, jpeg artifacts, signature, watermark, username, blurry, chubby, anorectic, bad eyes, old, wrinkled skin, red skin, photograph By bad artist -neg, big eyes, muscular face," 46 | - "beard, EasyNegative, lowres, chromatic aberration, depth of field, motion blur, blurry, bokeh, bad quality, worst quality, multiple arms, badhand" 47 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-4-MajicMix.yaml: -------------------------------------------------------------------------------- 1 | # motion module v1_14 2 | - 
dreambooth_path: "models/DreamBooth_LoRA/majicmixRealistic_v5Preview.safetensors" 3 | lora_model_path: "" 4 | 5 | inference_config: "configs/inference/inference-v1.yaml" 6 | motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 7 | 8 | seed: [1572448948722921032, 1099474677988590681, 6488833139725635347, 18339859844376517918] 9 | steps: 25 10 | guidance_scale: 8 11 | 12 | prompt: 13 | - "1girl, offshoulder, light smile, shiny skin best quality, masterpiece, photorealistic" 14 | - "best quality, masterpiece, photorealistic, 1boy, 50 years old beard, dramatic lighting" 15 | - "best quality, masterpiece, photorealistic, 1girl, light smile, shirt with collars, waist up, dramatic lighting, from below" 16 | - "male, man, beard, bodybuilder, skinhead,cold face, tough guy, cowboyshot, tattoo, french windows, luxury hotel masterpiece, best quality, photorealistic" 17 | 18 | n_prompt: 19 | - "ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, watermark, moles" 20 | - "nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome" 21 | - "nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome" 22 | - "nude, nsfw, ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, monochrome, grayscale watermark, moles, people" 23 | 24 | 25 | # motion module v1_15 26 | - dreambooth_path: "models/DreamBooth_LoRA/majicmixRealistic_v5Preview.safetensors" 27 | lora_model_path: "" 28 | 29 | inference_config: "configs/inference/inference-v1.yaml" 30 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 31 | 32 | seed: [1572448948722921032, 1099474677988590681, 6488833139725635347, 18339859844376517918] 33 | steps: 25 34 | guidance_scale: 8 35 | 36 | prompt: 37 | - "1girl, offshoulder, light smile, shiny skin best quality, masterpiece, photorealistic" 38 | - "best quality, masterpiece, photorealistic, 1boy, 50 years old beard, dramatic lighting" 39 | - "best quality, masterpiece, photorealistic, 1girl, light smile, shirt with collars, waist up, dramatic lighting, from below" 40 | - "male, man, beard, bodybuilder, skinhead,cold face, tough guy, cowboyshot, tattoo, french windows, luxury hotel masterpiece, best quality, photorealistic" 41 | 42 | n_prompt: 43 | - "ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, watermark, moles" 44 | - "nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome" 45 | - "nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome" 46 | - "nude, nsfw, ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, monochrome, grayscale watermark, moles, people" 47 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-5-RealisticVision.yaml: -------------------------------------------------------------------------------- 1 | # # motion module v1_14 2 | # - dreambooth_path: "./models/DreamBooth_LoRA/alex.safetensors" 3 | # lora_model_path: "" 4 | 5 | # inference_config: "configs/inference/inference-v1.yaml" 6 | # motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 7 | # motion_adapter_ckpt: "./outputs/motion_adapter_training-2024-07-30T10-47-27/checkpoints/checkpoint-max-steps-4000.ckpt" 8 | # 
spatial_adapter_ckpt: "./outputs/spatial_adapter_training-2024-07-30T12-12-31/checkpoints/checkpoint-epoch-15.ckpt" 9 | 10 | # seed: [5658137986800322009, 12099779162349365895, 10499524853910852697, 16768009035333711932] 11 | # steps: 25 12 | # guidance_scale: 8 13 | 14 | # prompt: 15 | # - "b&w photo of ohwx man in black clothes, bald, face, half body, body, high detailed skin, skin pores, coastline, overcast weather, wind, waves, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 16 | # - "close up photo of a ohwx man, forest, haze, halation, bloom, dramatic atmosphere, centred, rule of thirds, 200mm 1.4f macro shot" 17 | # - "photo of ohwx man, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 18 | # - "night, b&w photo of ohwx man in the old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain" 19 | 20 | # n_prompt: 21 | # - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 22 | # - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 23 | # - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 24 | # - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, art, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 25 | 26 | 27 | # motion module v1_15 28 | - dreambooth_path: "./models/DreamBooth_LoRA/alex.safetensors" 29 | lora_model_path: "" 30 | 31 | inference_config: "configs/inference/inference-v1.yaml" 32 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 33 | motion_adapter_ckpt: "./outputs/motion_adapter_training-2024-07-30T10-47-27/checkpoints/checkpoint-max-steps-4000.ckpt" 34 | spatial_adapter_ckpt: "./outputs/spatial_adapter_training-2024-07-30T12-12-31/checkpoints/checkpoint-epoch-15.ckpt" 35 | 36 | seed: [5658137986800322009, 12099779162349365895, 10499524853910852697, 16768009035333711932] 37 | steps: 25 38 | guidance_scale: 8 39 | 40 | prompt: 41 | - "b&w photo of ohwx man in black clothes, bald, face, half body, body, high detailed skin, skin pores, coastline, overcast weather, wind, waves, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 42 | - "close up 
photo of a ohwx man, forest, haze, halation, bloom, dramatic atmosphere, centred, rule of thirds, 200mm 1.4f macro shot" 43 | - "photo of ohwx man, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 44 | - "night, b&w photo of ohwx man in the old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain" 45 | 46 | n_prompt: 47 | - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 48 | - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 49 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 50 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, art, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 51 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-6-Tusun.yaml: -------------------------------------------------------------------------------- 1 | # motion module v1_14 2 | - dreambooth_path: "models/DreamBooth_LoRA/moonfilm_reality20.safetensors" 3 | lora_model_path: "models/DreamBooth_LoRA/TUSUN.safetensors" 4 | lora_alpha: 0.6 5 | 6 | inference_config: "configs/inference/inference-v1.yaml" 7 | motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 8 | 9 | seed: [10154078483724687116, 2664393535095473805, 4231566096207622938, 1713349740448094493] 10 | steps: 25 11 | guidance_scale: 8 12 | 13 | prompt: 14 | - "tusuncub with its mouth open, blurry, open mouth, fangs, photo background, looking at viewer, tongue, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing" 15 | - "cute tusun with a blurry background, black background, simple background, signature, face, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, 
film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing" 16 | - "cut tusuncub walking in the snow, blurry, looking at viewer, depth of field, blurry background, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing" 17 | - "character design, cyberpunk tusun kitten wearing astronaut suit, sci-fic, realistic eye color and details, fluffy, big head, science fiction, communist ideology, Cyborg, fantasy, intense angle, soft lighting, photograph, 4k, hyper detailed, portrait wallpaper, realistic, photo-realistic, DSLR, 24 Megapixels, Full Frame, vibrant details, octane render, finely detail, best quality, incredibly absurdres, robotic parts, rim light, vibrant details, luxurious cyberpunk, hyperrealistic, cable electric wires, microchip, full body" 18 | 19 | n_prompt: 20 | - "worst quality, low quality, deformed, distorted, disfigured, bad eyes, bad anatomy, disconnected limbs, wrong body proportions, low quality, worst quality, text, watermark, signatre, logo, illustration, painting, cartoons, ugly, easy_negative" 21 | 22 | 23 | # motion module v1_15 24 | - dreambooth_path: "models/DreamBooth_LoRA/moonfilm_reality20.safetensors" 25 | lora_model_path: "models/DreamBooth_LoRA/TUSUN.safetensors" 26 | lora_alpha: 0.6 27 | 28 | inference_config: "configs/inference/inference-v1.yaml" 29 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 30 | 31 | seed: [10154078483724687116, 2664393535095473805, 4231566096207622938, 1713349740448094493] 32 | steps: 25 33 | guidance_scale: 8 34 | 35 | prompt: 36 | - "tusuncub with its mouth open, blurry, open mouth, fangs, photo background, looking at viewer, tongue, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing" 37 | - "cute tusun with a blurry background, black background, simple background, signature, face, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing" 38 | - "cut tusuncub walking in the snow, blurry, looking at viewer, depth of field, blurry background, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing" 39 | - "character design, cyberpunk tusun kitten wearing astronaut suit, sci-fic, realistic eye color and details, fluffy, big head, science fiction, communist ideology, Cyborg, fantasy, intense angle, soft lighting, photograph, 4k, hyper detailed, portrait wallpaper, realistic, photo-realistic, DSLR, 24 Megapixels, Full Frame, vibrant details, octane render, finely detail, best quality, incredibly absurdres, 
robotic parts, rim light, vibrant details, luxurious cyberpunk, hyperrealistic, cable electric wires, microchip, full body" 40 | 41 | n_prompt: 42 | - "worst quality, low quality, deformed, distorted, disfigured, bad eyes, bad anatomy, disconnected limbs, wrong body proportions, low quality, worst quality, text, watermark, signatre, logo, illustration, painting, cartoons, ugly, easy_negative" 43 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-7-FilmVelvia.yaml: -------------------------------------------------------------------------------- 1 | # motion module v1_14 2 | - dreambooth_path: "models/DreamBooth_LoRA/majicmixRealistic_v4.safetensors" 3 | lora_model_path: "models/DreamBooth_LoRA/FilmVelvia2.safetensors" 4 | lora_alpha: 0.6 5 | 6 | inference_config: "configs/inference/inference-v1.yaml" 7 | motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 8 | 9 | seed: [358675358833372813, 3519455280971923743, 11684545350557985081, 8696855302100399877] 10 | steps: 25 11 | guidance_scale: 8 12 | 13 | prompt: 14 | - "a woman standing on the side of a road at night,girl, long hair, motor vehicle, car, looking at viewer, ground vehicle, night, hands in pockets, blurry background, coat, black hair, parted lips, bokeh, jacket, brown hair, outdoors, red lips, upper body, artist name" 15 | - ", dark shot,0mm, portrait quality of a arab man worker,boy, wasteland that stands out vividly against the background of the desert, barren landscape, closeup, moles skin, soft light, sharp, exposure blend, medium shot, bokeh, hdr, high contrast, cinematic, teal and orange5, muted colors, dim colors, soothing tones, low saturation, hyperdetailed, noir" 16 | - "fashion photography portrait of 1girl, offshoulder, fluffy short hair, soft light, rim light, beautiful shadow, low key, photorealistic, raw photo, natural skin texture, realistic eye and face details, hyperrealism, ultra high res, 4K, Best quality, masterpiece, necklace, cleavage, in the dark" 17 | - "In this lighthearted portrait, a woman is dressed as a fierce warrior, armed with an arsenal of paintbrushes and palette knives. Her war paint is composed of thick, vibrant strokes of color, and her armor is made of paint tubes and paint-splattered canvases. She stands victoriously atop a mountain of conquered blank canvases, with a beautiful, colorful landscape behind her, symbolizing the power of art and creativity. 
bust Portrait, close-up, Bright and transparent scene lighting, " 18 | 19 | n_prompt: 20 | - "cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 21 | - "cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 22 | - "wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 23 | - "wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 24 | 25 | 26 | # motion module v1_15 27 | - dreambooth_path: "models/DreamBooth_LoRA/majicmixRealistic_v4.safetensors" 28 | lora_model_path: "models/DreamBooth_LoRA/FilmVelvia2.safetensors" 29 | lora_alpha: 0.6 30 | 31 | inference_config: "configs/inference/inference-v1.yaml" 32 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 33 | 34 | seed: [358675358833372813, 3519455280971923743, 11684545350557985081, 8696855302100399877] 35 | steps: 25 36 | guidance_scale: 8 37 | 38 | prompt: 39 | - "a woman standing on the side of a road at night,girl, long hair, motor vehicle, car, looking at viewer, ground vehicle, night, hands in pockets, blurry background, coat, black hair, parted lips, bokeh, jacket, brown hair, outdoors, red lips, upper body, artist name" 40 | - ", dark shot,0mm, portrait quality of a arab man worker,boy, wasteland that stands out vividly against the background of the desert, barren landscape, closeup, moles skin, soft light, sharp, exposure blend, medium shot, bokeh, hdr, high contrast, cinematic, teal and orange5, muted colors, dim colors, soothing tones, low saturation, hyperdetailed, noir" 41 | - "fashion photography portrait of 1girl, offshoulder, fluffy short hair, soft light, rim light, beautiful shadow, low key, photorealistic, raw photo, natural skin texture, realistic eye and face details, hyperrealism, ultra high res, 4K, Best quality, masterpiece, necklace, cleavage, in the dark" 42 | - "In this lighthearted portrait, a woman is dressed as a fierce warrior, armed with an arsenal of paintbrushes and palette knives. Her war paint is composed of thick, vibrant strokes of color, and her armor is made of paint tubes and paint-splattered canvases. She stands victoriously atop a mountain of conquered blank canvases, with a beautiful, colorful landscape behind her, symbolizing the power of art and creativity. 
bust Portrait, close-up, Bright and transparent scene lighting, " 43 | 44 | n_prompt: 45 | - "cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 46 | - "cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 47 | - "wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 48 | - "wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" 49 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v1/v1-8-GhibliBackground.yaml: -------------------------------------------------------------------------------- 1 | # motion module v1_14 2 | - dreambooth_path: "models/DreamBooth_LoRA/CounterfeitV30_25.safetensors" 3 | lora_model_path: "models/DreamBooth_LoRA/lora_Ghibli_n3.safetensors" 4 | lora_alpha: 1.0 5 | 6 | inference_config: "configs/inference/inference-v1.yaml" 7 | motion_module: "models/Motion_Module/mm_sd_v14.ckpt" 8 | 9 | seed: [8775748474469046618, 5893874876080607656, 11911465742147695752, 12437784838692000640] 10 | steps: 25 11 | guidance_scale: 8 12 | 13 | prompt: 14 | - "best quality,single build,architecture, blue_sky, building,cloudy_sky, day, fantasy, fence, field, house, build,architecture,landscape, moss, outdoors, overgrown, path, river, road, rock, scenery, sky, sword, tower, tree, waterfall" 15 | - "black_border, building, city, day, fantasy, ice, landscape, letterboxed, mountain, ocean, outdoors, planet, scenery, ship, snow, snowing, water, watercraft, waterfall, winter" 16 | - ",mysterious sea area, fantasy,build,concept" 17 | - "Tomb Raider,Scenography,Old building" 18 | 19 | n_prompt: 20 | - "worst quality, low quality, letterboxed" 21 | 22 | 23 | # motion module v1_15 24 | - dreambooth_path: "models/DreamBooth_LoRA/CounterfeitV30_25.safetensors" 25 | lora_model_path: "models/DreamBooth_LoRA/lora_Ghibli_n3.safetensors" 26 | lora_alpha: 1.0 27 | 28 | inference_config: "configs/inference/inference-v1.yaml" 29 | motion_module: "models/Motion_Module/mm_sd_v15.ckpt" 30 | 31 | seed: [8775748474469046618, 5893874876080607656, 11911465742147695752, 12437784838692000640] 32 | steps: 25 33 | guidance_scale: 8 34 | 35 | prompt: 36 | - "best quality,single build,architecture, blue_sky, building,cloudy_sky, day, fantasy, fence, field, house, build,architecture,landscape, moss, outdoors, overgrown, path, river, road, rock, scenery, sky, sword, tower, tree, waterfall" 37 | - "black_border, building, city, day, fantasy, ice, 
landscape, letterboxed, mountain, ocean, outdoors, planet, scenery, ship, snow, snowing, water, watercraft, waterfall, winter" 38 | - ",mysterious sea area, fantasy,build,concept" 39 | - "Tomb Raider,Scenography,Old building" 40 | 41 | n_prompt: 42 | - "worst quality, low quality, letterboxed" 43 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v2/v2-1-RealisticVision.yaml: -------------------------------------------------------------------------------- 1 | - inference_config: "configs/inference/inference-v2.yaml" 2 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 3 | 4 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 5 | lora_model_path: "" 6 | 7 | seed: [13100322578370451493, 14752961627088720670, 9329399085567825781, 16987697414827649302] 8 | steps: 25 9 | guidance_scale: 7.5 10 | 11 | prompt: 12 | - "b&w photo of 42 y.o man in black clothes, bald, face, half body, body, high detailed skin, skin pores, coastline, overcast weather, wind, waves, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 13 | - "close up photo of a rabbit, forest, haze, halation, bloom, dramatic atmosphere, centred, rule of thirds, 200mm 1.4f macro shot" 14 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 15 | - "night, b&w photo of old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain" 16 | 17 | n_prompt: 18 | - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 19 | - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck" 20 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 21 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, art, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 22 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v2/v2-2-RealisticVision-MotionLoRA.yaml: -------------------------------------------------------------------------------- 1 | # ZoomIn 2 | - inference_config: 
"configs/inference/inference-v2.yaml" 3 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 4 | 5 | motion_module_lora_configs: 6 | - path: "models/MotionLoRA/v2_lora_ZoomIn.ckpt" 7 | alpha: 1.0 8 | 9 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 10 | lora_model_path: "" 11 | 12 | seed: 45987230 13 | steps: 25 14 | guidance_scale: 7.5 15 | 16 | prompt: 17 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 18 | 19 | n_prompt: 20 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 21 | 22 | 23 | # ZoomOut 24 | - inference_config: "configs/inference/inference-v2.yaml" 25 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 26 | 27 | motion_module_lora_configs: 28 | - path: "models/MotionLoRA/v2_lora_ZoomOut.ckpt" 29 | alpha: 1.0 30 | 31 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 32 | lora_model_path: "" 33 | 34 | seed: 45987230 35 | steps: 25 36 | guidance_scale: 7.5 37 | 38 | prompt: 39 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 40 | 41 | n_prompt: 42 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 43 | 44 | 45 | # PanLeft 46 | - inference_config: "configs/inference/inference-v2.yaml" 47 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 48 | 49 | motion_module_lora_configs: 50 | - path: "models/MotionLoRA/v2_lora_PanLeft.ckpt" 51 | alpha: 1.0 52 | 53 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 54 | lora_model_path: "" 55 | 56 | seed: 45987230 57 | steps: 25 58 | guidance_scale: 7.5 59 | 60 | prompt: 61 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 62 | 63 | n_prompt: 64 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 65 | 66 | 67 | # PanRight 68 | - inference_config: "configs/inference/inference-v2.yaml" 69 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 70 | 71 | motion_module_lora_configs: 72 | - path: "models/MotionLoRA/v2_lora_PanRight.ckpt" 73 | alpha: 1.0 74 | 75 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 76 | lora_model_path: "" 77 | 78 | seed: 45987230 79 | steps: 25 80 | guidance_scale: 7.5 81 | 82 | prompt: 83 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 84 | 85 | n_prompt: 86 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, 
deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 87 | 88 | 89 | # TiltUp 90 | - inference_config: "configs/inference/inference-v2.yaml" 91 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 92 | 93 | motion_module_lora_configs: 94 | - path: "models/MotionLoRA/v2_lora_TiltUp.ckpt" 95 | alpha: 1.0 96 | 97 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 98 | lora_model_path: "" 99 | 100 | seed: 45987230 101 | steps: 25 102 | guidance_scale: 7.5 103 | 104 | prompt: 105 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 106 | 107 | n_prompt: 108 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 109 | 110 | 111 | # TiltDown 112 | - inference_config: "configs/inference/inference-v2.yaml" 113 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 114 | 115 | motion_module_lora_configs: 116 | - path: "models/MotionLoRA/v2_lora_TiltDown.ckpt" 117 | alpha: 1.0 118 | 119 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 120 | lora_model_path: "" 121 | 122 | seed: 45987230 123 | steps: 25 124 | guidance_scale: 7.5 125 | 126 | prompt: 127 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 128 | 129 | n_prompt: 130 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 131 | 132 | 133 | # RollingAnticlockwise 134 | - inference_config: "configs/inference/inference-v2.yaml" 135 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 136 | 137 | motion_module_lora_configs: 138 | - path: "models/MotionLoRA/v2_lora_RollingAnticlockwise.ckpt" 139 | alpha: 1.0 140 | 141 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 142 | lora_model_path: "" 143 | 144 | seed: 45987230 145 | steps: 25 146 | guidance_scale: 7.5 147 | 148 | prompt: 149 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 150 | 151 | n_prompt: 152 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 153 | 154 | 155 | # RollingClockwise 156 | - inference_config: "configs/inference/inference-v2.yaml" 157 | motion_module: "models/Motion_Module/mm_sd_v15_v2.ckpt" 158 | 159 | motion_module_lora_configs: 160 | - path: "models/MotionLoRA/v2_lora_RollingClockwise.ckpt" 161 | alpha: 1.0 162 | 163 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 164 | lora_model_path: "" 165 | 166 | seed: 45987230 167 | steps: 
25 168 | guidance_scale: 7.5 169 | 170 | prompt: 171 | - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3" 172 | 173 | n_prompt: 174 | - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" 175 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v3/v3-1-T2V.yaml: -------------------------------------------------------------------------------- 1 | # 1-animation 2 | - domain_lora_scale: 1.0 3 | adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 4 | dreambooth_path: "" 5 | 6 | inference_config: "configs/inference/inference-v3.yaml" 7 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 8 | 9 | controlnet_config: "configs/inference/sparsectrl/latent_condition.yaml" 10 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt" 11 | 12 | H: 256 13 | W: 384 14 | seed: [123,234] 15 | steps: 25 16 | guidance_scale: 8.5 17 | 18 | controlnet_image_indexs: [0] 19 | controlnet_images: 20 | - "__assets__/demos/image/painting.png" 21 | 22 | prompt: 23 | - an oil painting of a sailboat in the ocean wave 24 | - an oil painting of a sailboat in the ocean wave 25 | n_prompt: 26 | - "worst quality, low quality, letterboxed" 27 | 28 | 29 | # 2-interpolation 30 | - domain_lora_scale: 1.0 31 | adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 32 | dreambooth_path: "" 33 | 34 | inference_config: "configs/inference/inference-v3.yaml" 35 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 36 | 37 | controlnet_config: "configs/inference/sparsectrl/latent_condition.yaml" 38 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt" 39 | 40 | H: 256 41 | W: 384 42 | seed: [123,234] 43 | steps: 25 44 | guidance_scale: 8.5 45 | 46 | controlnet_image_indexs: [0,-1] 47 | controlnet_images: 48 | - "__assets__/demos/image/interpolation_1.png" 49 | - "__assets__/demos/image/interpolation_2.png" 50 | 51 | prompt: 52 | - "aerial view, beautiful forest, autumn, 4k, high quality" 53 | - "aerial view, beautiful forest, autumn, 4k, high quality" 54 | n_prompt: 55 | - "worst quality, low quality, letterboxed" 56 | 57 | 58 | # 3-interpolation 59 | - domain_lora_scale: 1.0 60 | adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 61 | dreambooth_path: "" 62 | 63 | inference_config: "configs/inference/inference-v3.yaml" 64 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 65 | 66 | controlnet_config: "configs/inference/sparsectrl/latent_condition.yaml" 67 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt" 68 | 69 | H: 256 70 | W: 384 71 | seed: [123,234] 72 | steps: 25 73 | guidance_scale: 8.5 74 | 75 | controlnet_image_indexs: [0,5,10,15] 76 | controlnet_images: 77 | - "__assets__/demos/image/low_fps_1.png" 78 | - "__assets__/demos/image/low_fps_2.png" 79 | - "__assets__/demos/image/low_fps_3.png" 80 | - "__assets__/demos/image/low_fps_4.png" 81 | 82 | prompt: 83 | - "two people holding hands in a field with wind turbines in the background" 84 | - "two people holding hands in a field with wind turbines in the background" 85 | n_prompt: 86 | - "worst quality, low quality, letterboxed" 87 | 88 | 89 | # 3-prediction 90 | - domain_lora_scale: 1.0 91 
| adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 92 | dreambooth_path: "" 93 | 94 | inference_config: "configs/inference/inference-v3.yaml" 95 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 96 | 97 | controlnet_config: "configs/inference/sparsectrl/latent_condition.yaml" 98 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt" 99 | 100 | H: 256 101 | W: 384 102 | seed: [123,234] 103 | steps: 25 104 | guidance_scale: 8.5 105 | 106 | controlnet_image_indexs: [0,1,2,3] 107 | controlnet_images: 108 | - "__assets__/demos/image/prediction_1.png" 109 | - "__assets__/demos/image/prediction_2.png" 110 | - "__assets__/demos/image/prediction_3.png" 111 | - "__assets__/demos/image/prediction_4.png" 112 | 113 | prompt: 114 | - "an astronaut is flying in the space, 4k, high resolution" 115 | - "an astronaut is flying in the space, 4k, high resolution" 116 | n_prompt: 117 | - "worst quality, low quality, letterboxed" 118 | -------------------------------------------------------------------------------- /AnimateDiff/configs/prompts/v3/v3-2-animation-RealisticVision.yaml: -------------------------------------------------------------------------------- 1 | # animation-1 2 | - domain_lora_scale: 1.0 3 | adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 4 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 5 | 6 | inference_config: "configs/inference/inference-v3.yaml" 7 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 8 | 9 | controlnet_config: "configs/inference/sparsectrl/latent_condition.yaml" 10 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt" 11 | 12 | seed: -1 13 | steps: 25 14 | guidance_scale: 8.5 15 | 16 | controlnet_image_indexs: [0] 17 | controlnet_images: 18 | - "__assets__/demos/image/RealisticVision_firework.png" 19 | 20 | prompt: 21 | - "closeup face photo of man in black clothes, night city street, bokeh, fireworks in background" 22 | - "closeup face photo of man in black clothes, night city street, bokeh, fireworks in background" 23 | n_prompt: 24 | - "worst quality, low quality, letterboxed" 25 | 26 | 27 | # animation-2 28 | - domain_lora_scale: 1.0 29 | adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 30 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 31 | 32 | inference_config: "configs/inference/inference-v3.yaml" 33 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 34 | 35 | controlnet_config: "configs/inference/sparsectrl/latent_condition.yaml" 36 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt" 37 | 38 | seed: -1 39 | steps: 25 40 | guidance_scale: 8.5 41 | 42 | controlnet_image_indexs: [0] 43 | controlnet_images: 44 | - "__assets__/demos/image/RealisticVision_sunset.png" 45 | 46 | prompt: 47 | - "masterpiece, bestquality, highlydetailed, ultradetailed, sunset, orange sky, warm lighting, fishing boats, ocean waves, seagulls, rippling water, wharf, silhouette, serene atmosphere, dusk, evening glow, golden hour, coastal landscape, seaside scenery" 48 | - "masterpiece, bestquality, highlydetailed, ultradetailed, sunset, orange sky, warm lighting, fishing boats, ocean waves, seagulls, rippling water, wharf, silhouette, serene atmosphere, dusk, evening glow, golden hour, coastal landscape, seaside scenery" 49 | n_prompt: 50 | - "worst quality, low quality, letterboxed" 51 | -------------------------------------------------------------------------------- 
/AnimateDiff/configs/prompts/v3/v3-3-sketch-RealisticVision.yaml: -------------------------------------------------------------------------------- 1 | # 1-sketch-to-video 2 | - domain_lora_scale: 1.0 3 | adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 4 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 5 | 6 | inference_config: "configs/inference/inference-v3.yaml" 7 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 8 | 9 | controlnet_config: "configs/inference/sparsectrl/image_condition.yaml" 10 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_scribble.ckpt" 11 | 12 | seed: -1 13 | steps: 25 14 | guidance_scale: 8.5 15 | 16 | controlnet_image_indexs: [0] 17 | controlnet_images: 18 | - "__assets__/demos/scribble/scribble_1.png" 19 | 20 | prompt: 21 | - "a back view of a boy, standing on the ground, looking at the sky, sunlight, masterpieces" 22 | - "a back view of a boy, standing on the ground, looking at the sky, clouds, sunset, orange sky, beautiful sunlight, masterpieces" 23 | n_prompt: 24 | - "worst quality, low quality, letterboxed" 25 | 26 | 27 | # 2-storyboarding 28 | - domain_lora_scale: 1.0 29 | adapter_lora_path: "models/Motion_Module/v3_sd15_adapter.ckpt" 30 | dreambooth_path: "models/DreamBooth_LoRA/realisticVisionV51_v51VAE.safetensors" 31 | 32 | inference_config: "configs/inference/inference-v3.yaml" 33 | motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" 34 | 35 | controlnet_config: "configs/inference/sparsectrl/image_condition.yaml" 36 | controlnet_path: "models/SparseCtrl/v3_sd15_sparsectrl_scribble.ckpt" 37 | 38 | seed: -1 39 | steps: 25 40 | guidance_scale: 8.5 41 | 42 | controlnet_image_indexs: [0,8,15] 43 | controlnet_images: 44 | - "__assets__/demos/scribble/scribble_2_1.png" 45 | - "__assets__/demos/scribble/scribble_2_2.png" 46 | - "__assets__/demos/scribble/scribble_2_3.png" 47 | 48 | prompt: 49 | - "an aerial view of a modern city, sunlight, day time, masterpiece, high quality" 50 | - "an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality" 51 | n_prompt: 52 | - "worst quality, low quality, letterboxed" 53 | -------------------------------------------------------------------------------- /AnimateDiff/configs/training/v1/image_finetune.yaml: -------------------------------------------------------------------------------- 1 | image_finetune: true 2 | 3 | output_dir: "outputs" 4 | pretrained_model_path: "models/StableDiffusion/stable-diffusion-v1-5" 5 | 6 | noise_scheduler_kwargs: 7 | num_train_timesteps: 1000 8 | beta_start: 0.00085 9 | beta_end: 0.012 10 | beta_schedule: "scaled_linear" 11 | steps_offset: 1 12 | clip_sample: false 13 | 14 | train_data: 15 | csv_path: "/mnt/petrelfs/guoyuwei/projects/datasets/webvid/results_2M_val.csv" 16 | video_folder: "/mnt/petrelfs/guoyuwei/projects/datasets/webvid/2M_val" 17 | sample_size: 256 18 | 19 | validation_data: 20 | prompts: 21 | - "Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons." 22 | - "A drone view of celebration with Christma tree and fireworks, starry sky - background." 23 | - "Robot dancing in times square." 24 | - "Pacific coast, carmel by the sea ocean and waves." 25 | num_inference_steps: 25 26 | guidance_scale: 8. 27 | 28 | trainable_modules: 29 | - "." 
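  # `trainable_modules` selects UNet parameters by name substring; the single "." entry
  # matches every parameter, so this config performs a full image-domain fine-tune rather
  # than training an adapter.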
30 | 31 | unet_checkpoint_path: "" 32 | 33 | learning_rate: 1.e-5 34 | train_batch_size: 50 35 | 36 | max_train_epoch: -1 37 | max_train_steps: 100 38 | checkpointing_epochs: -1 39 | checkpointing_steps: 60 40 | 41 | validation_steps: 5000 42 | validation_steps_tuple: [2, 50] 43 | 44 | global_seed: 42 45 | mixed_precision_training: true 46 | enable_xformers_memory_efficient_attention: True 47 | 48 | is_debug: False 49 | -------------------------------------------------------------------------------- /AnimateDiff/configs/training/v1/motion_adapter_training.yaml: -------------------------------------------------------------------------------- 1 | image_finetune: false 2 | 3 | output_dir: "outputs" 4 | pretrained_model_path: "models/StableDiffusion/stable-diffusion-v1-5" 5 | 6 | unet_additional_kwargs: 7 | use_motion_module : true 8 | motion_module_resolutions : [ 1,2,4,8 ] 9 | unet_use_cross_frame_attention : false 10 | unet_use_temporal_attention : false 11 | use_motion_adapter : true 12 | 13 | motion_module_type: Vanilla 14 | motion_module_kwargs: 15 | num_attention_heads : 8 16 | num_transformer_block : 1 17 | attention_block_types : [ "Temporal_Self", "Temporal_Self" ] 18 | temporal_position_encoding : true 19 | temporal_position_encoding_max_len : 24 20 | temporal_attention_dim_div : 1 21 | zero_initialize : true 22 | use_motion_adapter : true 23 | 24 | noise_scheduler_kwargs: 25 | num_train_timesteps: 1000 26 | beta_start: 0.00085 27 | beta_end: 0.012 28 | beta_schedule: "linear" 29 | steps_offset: 1 30 | clip_sample: false 31 | 32 | train_data: 33 | csv_path: "/home/harshb/workspace/learnings/StillMoving/AnimateDiff/dataset/train.csv" 34 | video_folder: "/home/harshb/workspace/learnings/StillMoving/AnimateDiff/dataset/videos" 35 | sample_size: 256 36 | sample_stride: 4 37 | sample_n_frames: 16 38 | 39 | validation_data: 40 | prompts: 41 | - "Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons." 42 | - "A drone view of celebration with Christma tree and fireworks, starry sky - background." 43 | - "Robot dancing in times square." 44 | - "Pacific coast, carmel by the sea ocean and waves." 45 | num_inference_steps: 25 46 | guidance_scale: 8. 47 | 48 | trainable_modules: 49 | - "q_lora." 50 | - "k_lora." 51 | - "v_lora." 
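  # Only parameters whose names contain q_lora/k_lora/v_lora -- the LoRA projections added
  # when `use_motion_adapter: true` -- are trained in this stage; the rest of the UNet and
  # the pretrained motion-module weights stay frozen.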
52 | 53 | 54 | motion_adapater_ckpt: "./models/Motion_Module/mm_sd_v15.ckpt" 55 | 56 | 57 | unet_checkpoint_path: "" 58 | 59 | learning_rate: 2.e-5 60 | train_batch_size: 1 61 | 62 | max_train_epoch: -1 63 | max_train_steps: 4000 64 | checkpointing_epochs: -1 65 | checkpointing_steps: 4000 66 | 67 | validation_steps: 500 68 | validation_steps_tuple: [2, 50] 69 | 70 | global_seed: 42 71 | mixed_precision_training: true 72 | enable_xformers_memory_efficient_attention: True 73 | 74 | is_debug: False 75 | -------------------------------------------------------------------------------- /AnimateDiff/configs/training/v1/spatial_adapter_training.yaml: -------------------------------------------------------------------------------- 1 | image_finetune: false 2 | 3 | output_dir: "outputs" 4 | pretrained_model_path: "models/StableDiffusion/stable-diffusion-v1-5" 5 | 6 | unet_additional_kwargs: 7 | use_motion_module : true 8 | motion_module_resolutions : [ 1,2,4,8 ] 9 | unet_use_cross_frame_attention : false 10 | unet_use_temporal_attention : false 11 | use_spatial_adapter : true 12 | 13 | motion_module_type: Vanilla 14 | motion_module_kwargs: 15 | num_attention_heads : 8 16 | num_transformer_block : 1 17 | attention_block_types : [ "Temporal_Self", "Temporal_Self" ] 18 | temporal_position_encoding : true 19 | temporal_position_encoding_max_len : 24 20 | temporal_attention_dim_div : 1 21 | zero_initialize : true 22 | use_motion_adapter : true 23 | 24 | noise_scheduler_kwargs: 25 | num_train_timesteps: 1000 26 | beta_start: 0.00085 27 | beta_end: 0.012 28 | beta_schedule: "linear" 29 | steps_offset: 1 30 | clip_sample: false 31 | 32 | train_data: 33 | csv_path: "/home/harshb/workspace/learnings/StillMoving/AnimateDiff/dataset/train.csv" 34 | video_folder: "/home/harshb/workspace/learnings/StillMoving/AnimateDiff/dataset/videos" 35 | sample_size: 256 36 | sample_stride: 4 37 | sample_n_frames: 16 38 | 39 | validation_data: 40 | prompts: 41 | - "ohwx man in the Snow rocky mountains. Snow blanketed rocky mountains surround and shadow deep canyons." 42 | - "ohwx man reading book" 43 | - "ohwx man smiling" 44 | - "ohwx man walking drinking coffee" 45 | num_inference_steps: 25 46 | guidance_scale: 8. 47 | 48 | 49 | 50 | trainable_modules: 51 | - "attn1_lora." 52 | - "attn2_lora." 
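  # Second adapter stage: only the attn1_lora/attn2_lora layers (enabled via
  # `use_spatial_adapter: true`) are trained, while the motion-adapter checkpoint
  # referenced just below is loaded as the starting point and is not itself updated.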
53 | 54 | motion_adapater_ckpt: "./outputs/motion_adapter_training-2024-07-30T10-47-27/checkpoints/checkpoint-max-steps-4000.ckpt" 55 | 56 | 57 | unet_checkpoint_path: "" 58 | 59 | learning_rate: 2.e-5 60 | train_batch_size: 1 61 | 62 | max_train_epoch: -1 63 | max_train_steps: 600 64 | checkpointing_epochs: -1 65 | checkpointing_steps: 600 66 | 67 | validation_steps: 600 68 | validation_steps_tuple: [2, 50] 69 | 70 | global_seed: 42 71 | mixed_precision_training: true 72 | enable_xformers_memory_efficient_attention: True 73 | 74 | is_debug: False 75 | -------------------------------------------------------------------------------- /AnimateDiff/configs/training/v1/training.yaml: -------------------------------------------------------------------------------- 1 | image_finetune: false 2 | 3 | output_dir: "outputs" 4 | pretrained_model_path: "models/StableDiffusion/stable-diffusion-v1-5" 5 | 6 | unet_additional_kwargs: 7 | use_motion_module : true 8 | motion_module_resolutions : [ 1,2,4,8 ] 9 | unet_use_cross_frame_attention : false 10 | unet_use_temporal_attention : false 11 | 12 | motion_module_type: Vanilla 13 | motion_module_kwargs: 14 | num_attention_heads : 8 15 | num_transformer_block : 1 16 | attention_block_types : [ "Temporal_Self", "Temporal_Self" ] 17 | temporal_position_encoding : true 18 | temporal_position_encoding_max_len : 24 19 | temporal_attention_dim_div : 1 20 | zero_initialize : true 21 | 22 | noise_scheduler_kwargs: 23 | num_train_timesteps: 1000 24 | beta_start: 0.00085 25 | beta_end: 0.012 26 | beta_schedule: "linear" 27 | steps_offset: 1 28 | clip_sample: false 29 | 30 | train_data: 31 | csv_path: "/home/harshb/workspace/learnings/StillMoving/AnimateDiff/dataset/train.csv" 32 | video_folder: "/home/harshb/workspace/learnings/StillMoving/AnimateDiff/dataset/videos" 33 | sample_size: 256 34 | sample_stride: 4 35 | sample_n_frames: 16 36 | 37 | validation_data: 38 | prompts: 39 | - "Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons." 40 | - "A drone view of celebration with Christma tree and fireworks, starry sky - background." 41 | - "Robot dancing in times square." 42 | - "Pacific coast, carmel by the sea ocean and waves." 43 | num_inference_steps: 25 44 | guidance_scale: 8. 45 | 46 | # trainable_modules: 47 | # - "q_lora." 48 | # - "k_lora." 49 | # - "v_lora." 50 | 51 | trainable_modules: 52 | - "attn1_lora." 53 | - "attn2_lora." 54 | 55 | motion_adapater_ckpt: "" 56 | 57 | 58 | unet_checkpoint_path: "" 59 | 60 | learning_rate: 2.e-5 61 | train_batch_size: 1 62 | 63 | max_train_epoch: -1 64 | max_train_steps: 4000 65 | checkpointing_epochs: -1 66 | checkpointing_steps: 5 67 | 68 | validation_steps: 5000 69 | validation_steps_tuple: [2, 50] 70 | 71 | global_seed: 42 72 | mixed_precision_training: true 73 | enable_xformers_memory_efficient_attention: True 74 | 75 | is_debug: False 76 | -------------------------------------------------------------------------------- /AnimateDiff/convert_to_safetensors.py: -------------------------------------------------------------------------------- 1 | # Script for converting a HF Diffusers saved pipeline to a Stable Diffusion checkpoint. 2 | # *Only* converts the UNet, VAE, and Text Encoder. 3 | # Does not convert optimizer state or any other thing. 
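# Each conversion table below pairs an original Stable Diffusion key (left) with its
# HF Diffusers counterpart (right); the conversion helpers use these pairs to rename
# Diffusers-layout keys back to the original SD layout before saving.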
4 | 5 | import argparse 6 | import os.path as osp 7 | import re 8 | 9 | import torch 10 | from safetensors.torch import load_file, save_file 11 | 12 | 13 | # =================# 14 | # UNet Conversion # 15 | # =================# 16 | 17 | unet_conversion_map = [ 18 | # (stable-diffusion, HF Diffusers) 19 | ("time_embed.0.weight", "time_embedding.linear_1.weight"), 20 | ("time_embed.0.bias", "time_embedding.linear_1.bias"), 21 | ("time_embed.2.weight", "time_embedding.linear_2.weight"), 22 | ("time_embed.2.bias", "time_embedding.linear_2.bias"), 23 | ("input_blocks.0.0.weight", "conv_in.weight"), 24 | ("input_blocks.0.0.bias", "conv_in.bias"), 25 | ("out.0.weight", "conv_norm_out.weight"), 26 | ("out.0.bias", "conv_norm_out.bias"), 27 | ("out.2.weight", "conv_out.weight"), 28 | ("out.2.bias", "conv_out.bias"), 29 | ] 30 | 31 | unet_conversion_map_resnet = [ 32 | # (stable-diffusion, HF Diffusers) 33 | ("in_layers.0", "norm1"), 34 | ("in_layers.2", "conv1"), 35 | ("out_layers.0", "norm2"), 36 | ("out_layers.3", "conv2"), 37 | ("emb_layers.1", "time_emb_proj"), 38 | ("skip_connection", "conv_shortcut"), 39 | ] 40 | 41 | unet_conversion_map_layer = [] 42 | # hardcoded number of downblocks and resnets/attentions... 43 | # would need smarter logic for other networks. 44 | for i in range(4): 45 | # loop over downblocks/upblocks 46 | 47 | for j in range(2): 48 | # loop over resnets/attentions for downblocks 49 | hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}." 50 | sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0." 51 | unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix)) 52 | 53 | if i < 3: 54 | # no attention layers in down_blocks.3 55 | hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}." 56 | sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1." 57 | unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix)) 58 | 59 | for j in range(3): 60 | # loop over resnets/attentions for upblocks 61 | hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}." 62 | sd_up_res_prefix = f"output_blocks.{3*i + j}.0." 63 | unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix)) 64 | 65 | if i > 0: 66 | # no attention layers in up_blocks.0 67 | hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}." 68 | sd_up_atn_prefix = f"output_blocks.{3*i + j}.1." 69 | unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix)) 70 | 71 | if i < 3: 72 | # no downsample in down_blocks.3 73 | hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv." 74 | sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op." 75 | unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix)) 76 | 77 | # no upsample in up_blocks.3 78 | hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0." 79 | sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}." 80 | unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix)) 81 | 82 | hf_mid_atn_prefix = "mid_block.attentions.0." 83 | sd_mid_atn_prefix = "middle_block.1." 84 | unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix)) 85 | 86 | for j in range(2): 87 | hf_mid_res_prefix = f"mid_block.resnets.{j}." 88 | sd_mid_res_prefix = f"middle_block.{2*j}." 89 | unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix)) 90 | 91 | 92 | def convert_unet_state_dict(unet_state_dict): 93 | # buyer beware: this is a *brittle* function, 94 | # and correct output requires that all of these pieces interact in 95 | # the exact order in which I have arranged them. 
96 | mapping = {k: k for k in unet_state_dict.keys()} 97 | for sd_name, hf_name in unet_conversion_map: 98 | mapping[hf_name] = sd_name 99 | for k, v in mapping.items(): 100 | if "resnets" in k: 101 | for sd_part, hf_part in unet_conversion_map_resnet: 102 | v = v.replace(hf_part, sd_part) 103 | mapping[k] = v 104 | for k, v in mapping.items(): 105 | for sd_part, hf_part in unet_conversion_map_layer: 106 | v = v.replace(hf_part, sd_part) 107 | mapping[k] = v 108 | new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()} 109 | return new_state_dict 110 | 111 | 112 | # ================# 113 | # VAE Conversion # 114 | # ================# 115 | 116 | vae_conversion_map = [ 117 | # (stable-diffusion, HF Diffusers) 118 | ("nin_shortcut", "conv_shortcut"), 119 | ("norm_out", "conv_norm_out"), 120 | ("mid.attn_1.", "mid_block.attentions.0."), 121 | ] 122 | 123 | for i in range(4): 124 | # down_blocks have two resnets 125 | for j in range(2): 126 | hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}." 127 | sd_down_prefix = f"encoder.down.{i}.block.{j}." 128 | vae_conversion_map.append((sd_down_prefix, hf_down_prefix)) 129 | 130 | if i < 3: 131 | hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0." 132 | sd_downsample_prefix = f"down.{i}.downsample." 133 | vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix)) 134 | 135 | hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0." 136 | sd_upsample_prefix = f"up.{3-i}.upsample." 137 | vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix)) 138 | 139 | # up_blocks have three resnets 140 | # also, up blocks in hf are numbered in reverse from sd 141 | for j in range(3): 142 | hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}." 143 | sd_up_prefix = f"decoder.up.{3-i}.block.{j}." 144 | vae_conversion_map.append((sd_up_prefix, hf_up_prefix)) 145 | 146 | # this part accounts for mid blocks in both the encoder and the decoder 147 | for i in range(2): 148 | hf_mid_res_prefix = f"mid_block.resnets.{i}." 149 | sd_mid_res_prefix = f"mid.block_{i+1}." 150 | vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix)) 151 | 152 | 153 | vae_conversion_map_attn = [ 154 | # (stable-diffusion, HF Diffusers) 155 | ("norm.", "group_norm."), 156 | ("q.", "query."), 157 | ("k.", "key."), 158 | ("v.", "value."), 159 | ("proj_out.", "proj_attn."), 160 | ] 161 | 162 | # This is probably not the most ideal solution, but it does work. 
163 | vae_extra_conversion_map = [ 164 | ("to_q", "q"), 165 | ("to_k", "k"), 166 | ("to_v", "v"), 167 | ("to_out.0", "proj_out"), 168 | ] 169 | 170 | 171 | def reshape_weight_for_sd(w): 172 | # convert HF linear weights to SD conv2d weights 173 | if not w.ndim == 1: 174 | return w.reshape(*w.shape, 1, 1) 175 | else: 176 | return w 177 | 178 | 179 | def convert_vae_state_dict(vae_state_dict): 180 | mapping = {k: k for k in vae_state_dict.keys()} 181 | for k, v in mapping.items(): 182 | for sd_part, hf_part in vae_conversion_map: 183 | v = v.replace(hf_part, sd_part) 184 | mapping[k] = v 185 | for k, v in mapping.items(): 186 | if "attentions" in k: 187 | for sd_part, hf_part in vae_conversion_map_attn: 188 | v = v.replace(hf_part, sd_part) 189 | mapping[k] = v 190 | new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()} 191 | weights_to_convert = ["q", "k", "v", "proj_out"] 192 | keys_to_rename = {} 193 | for k, v in new_state_dict.items(): 194 | for weight_name in weights_to_convert: 195 | if f"mid.attn_1.{weight_name}.weight" in k: 196 | print(f"Reshaping {k} for SD format") 197 | new_state_dict[k] = reshape_weight_for_sd(v) 198 | for weight_name, real_weight_name in vae_extra_conversion_map: 199 | if ( 200 | f"mid.attn_1.{weight_name}.weight" in k 201 | or f"mid.attn_1.{weight_name}.bias" in k 202 | ): 203 | keys_to_rename[k] = k.replace(weight_name, real_weight_name) 204 | for k, v in keys_to_rename.items(): 205 | if k in new_state_dict: 206 | print(f"Renaming {k} to {v}") 207 | new_state_dict[v] = reshape_weight_for_sd(new_state_dict[k]) 208 | del new_state_dict[k] 209 | return new_state_dict 210 | 211 | 212 | # =========================# 213 | # Text Encoder Conversion # 214 | # =========================# 215 | 216 | 217 | textenc_conversion_lst = [ 218 | # (stable-diffusion, HF Diffusers) 219 | ("resblocks.", "text_model.encoder.layers."), 220 | ("ln_1", "layer_norm1"), 221 | ("ln_2", "layer_norm2"), 222 | (".c_fc.", ".fc1."), 223 | (".c_proj.", ".fc2."), 224 | (".attn", ".self_attn"), 225 | ("ln_final.", "transformer.text_model.final_layer_norm."), 226 | ( 227 | "token_embedding.weight", 228 | "transformer.text_model.embeddings.token_embedding.weight", 229 | ), 230 | ( 231 | "positional_embedding", 232 | "transformer.text_model.embeddings.position_embedding.weight", 233 | ), 234 | ] 235 | protected = {re.escape(x[1]): x[0] for x in textenc_conversion_lst} 236 | textenc_pattern = re.compile("|".join(protected.keys())) 237 | 238 | # Ordering is from https://github.com/pytorch/pytorch/blob/master/test/cpp/api/modules.cpp 239 | code2idx = {"q": 0, "k": 1, "v": 2} 240 | 241 | 242 | def convert_text_enc_state_dict_v20(text_enc_dict): 243 | new_state_dict = {} 244 | capture_qkv_weight = {} 245 | capture_qkv_bias = {} 246 | for k, v in text_enc_dict.items(): 247 | if ( 248 | k.endswith(".self_attn.q_proj.weight") 249 | or k.endswith(".self_attn.k_proj.weight") 250 | or k.endswith(".self_attn.v_proj.weight") 251 | ): 252 | k_pre = k[: -len(".q_proj.weight")] 253 | k_code = k[-len("q_proj.weight")] 254 | if k_pre not in capture_qkv_weight: 255 | capture_qkv_weight[k_pre] = [None, None, None] 256 | capture_qkv_weight[k_pre][code2idx[k_code]] = v 257 | continue 258 | 259 | if ( 260 | k.endswith(".self_attn.q_proj.bias") 261 | or k.endswith(".self_attn.k_proj.bias") 262 | or k.endswith(".self_attn.v_proj.bias") 263 | ): 264 | k_pre = k[: -len(".q_proj.bias")] 265 | k_code = k[-len("q_proj.bias")] 266 | if k_pre not in capture_qkv_bias: 267 | capture_qkv_bias[k_pre] = [None, None, 
None] 268 | capture_qkv_bias[k_pre][code2idx[k_code]] = v 269 | continue 270 | 271 | relabelled_key = textenc_pattern.sub( 272 | lambda m: protected[re.escape(m.group(0))], k 273 | ) 274 | new_state_dict[relabelled_key] = v 275 | 276 | for k_pre, tensors in capture_qkv_weight.items(): 277 | if None in tensors: 278 | raise Exception( 279 | "CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing" 280 | ) 281 | relabelled_key = textenc_pattern.sub( 282 | lambda m: protected[re.escape(m.group(0))], k_pre 283 | ) 284 | new_state_dict[relabelled_key + ".in_proj_weight"] = torch.cat(tensors) 285 | 286 | for k_pre, tensors in capture_qkv_bias.items(): 287 | if None in tensors: 288 | raise Exception( 289 | "CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing" 290 | ) 291 | relabelled_key = textenc_pattern.sub( 292 | lambda m: protected[re.escape(m.group(0))], k_pre 293 | ) 294 | new_state_dict[relabelled_key + ".in_proj_bias"] = torch.cat(tensors) 295 | 296 | return new_state_dict 297 | 298 | 299 | def convert_text_enc_state_dict(text_enc_dict): 300 | return text_enc_dict 301 | 302 | 303 | if __name__ == "__main__": 304 | parser = argparse.ArgumentParser() 305 | 306 | parser.add_argument( 307 | "--model_path", 308 | default=None, 309 | type=str, 310 | required=True, 311 | help="Path to the model to convert.", 312 | ) 313 | parser.add_argument( 314 | "--checkpoint_path", 315 | default=None, 316 | type=str, 317 | required=True, 318 | help="Path to the output model.", 319 | ) 320 | parser.add_argument( 321 | "--half", action="store_true", help="Save weights in half precision." 322 | ) 323 | parser.add_argument( 324 | "--use_safetensors", 325 | action="store_true", 326 | help="Save weights use safetensors, default is ckpt.", 327 | ) 328 | 329 | args = parser.parse_args() 330 | 331 | assert args.model_path is not None, "Must provide a model path!" 332 | 333 | assert args.checkpoint_path is not None, "Must provide a checkpoint path!" 334 | 335 | # Path for safetensors 336 | unet_path = osp.join(args.model_path, "unet", "diffusion_pytorch_model.safetensors") 337 | vae_path = osp.join(args.model_path, "vae", "diffusion_pytorch_model.safetensors") 338 | text_enc_path = osp.join(args.model_path, "text_encoder", "model.safetensors") 339 | 340 | # Load models from safetensors if it exists, if it doesn't pytorch 341 | if osp.exists(unet_path): 342 | unet_state_dict = load_file(unet_path, device="cpu") 343 | else: 344 | unet_path = osp.join(args.model_path, "unet", "diffusion_pytorch_model.bin") 345 | unet_state_dict = torch.load(unet_path, map_location="cpu") 346 | 347 | if osp.exists(vae_path): 348 | vae_state_dict = load_file(vae_path, device="cpu") 349 | else: 350 | vae_path = osp.join(args.model_path, "vae", "diffusion_pytorch_model.bin") 351 | vae_state_dict = torch.load(vae_path, map_location="cpu") 352 | 353 | if osp.exists(text_enc_path): 354 | text_enc_dict = load_file(text_enc_path, device="cpu") 355 | else: 356 | text_enc_path = osp.join(args.model_path, "text_encoder", "pytorch_model.bin") 357 | text_enc_dict = torch.load(text_enc_path, map_location="cpu") 358 | 359 | # Convert the UNet model 360 | unet_state_dict = convert_unet_state_dict(unet_state_dict) 361 | unet_state_dict = { 362 | "model.diffusion_model." + k: v for k, v in unet_state_dict.items() 363 | } 364 | 365 | # Convert the VAE model 366 | vae_state_dict = convert_vae_state_dict(vae_state_dict) 367 | vae_state_dict = {"first_stage_model." 
+ k: v for k, v in vae_state_dict.items()} 368 | 369 | # Easiest way to identify v2.0 model seems to be that the text encoder (OpenCLIP) is deeper 370 | is_v20_model = "text_model.encoder.layers.22.layer_norm2.bias" in text_enc_dict 371 | 372 | if is_v20_model: 373 | # Need to add the tag 'transformer' in advance so we can knock it out from the final layer-norm 374 | text_enc_dict = {"transformer." + k: v for k, v in text_enc_dict.items()} 375 | text_enc_dict = convert_text_enc_state_dict_v20(text_enc_dict) 376 | text_enc_dict = { 377 | "cond_stage_model.model." + k: v for k, v in text_enc_dict.items() 378 | } 379 | else: 380 | text_enc_dict = convert_text_enc_state_dict(text_enc_dict) 381 | text_enc_dict = { 382 | "cond_stage_model.transformer." + k: v for k, v in text_enc_dict.items() 383 | } 384 | 385 | # Put together new checkpoint 386 | state_dict = {**unet_state_dict, **vae_state_dict, **text_enc_dict} 387 | if args.half: 388 | state_dict = {k: v.half() for k, v in state_dict.items()} 389 | 390 | if args.use_safetensors: 391 | save_file(state_dict, args.checkpoint_path) 392 | else: 393 | state_dict = {"state_dict": state_dict} 394 | torch.save(state_dict, args.checkpoint_path) 395 | -------------------------------------------------------------------------------- /AnimateDiff/dataset.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | ds = load_dataset("TempoFunk/webvid-10M", cache_dir="./dataset") 4 | -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/0-MotionModule.sh: -------------------------------------------------------------------------------- 1 | gdown 1RqkQuGPaCO5sGZ6V6KZ-jUWmsRu48Kdq -O models/Motion_Module/ 2 | gdown 1ql0g_Ys4UCz2RnokYlBjyOYPbttbIpbu -O models/Motion_Module/ -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/1-ToonYou.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/78775 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/2-Lyriel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/72396 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/3-RcnzCartoon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/71009 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/4-MajicMix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/79068 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/5-RealisticVision.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget 
https://civitai.com/api/download/models/130072 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/6-Tusun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/97261 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 3 | wget https://civitai.com/api/download/models/50705 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 4 | -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/7-FilmVelvia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/90115 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 3 | wget https://civitai.com/api/download/models/55911 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 4 | -------------------------------------------------------------------------------- /AnimateDiff/download_bashscripts/8-GhibliBackground.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://civitai.com/api/download/models/102828 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 3 | wget https://civitai.com/api/download/models/57618 -P models/DreamBooth_LoRA/ --content-disposition --no-check-certificate 4 | -------------------------------------------------------------------------------- /AnimateDiff/environment.yaml: -------------------------------------------------------------------------------- 1 | name: animatediff 2 | channels: 3 | - pytorch 4 | - nvidia 5 | dependencies: 6 | - python=3.10 7 | - pytorch=1.13.1 8 | - torchvision=0.14.1 9 | - torchaudio=0.13.1 10 | - pytorch-cuda=11.7 11 | - pip 12 | - pip: 13 | - diffusers==0.11.1 14 | - transformers==4.25.1 15 | - xformers==0.0.16 16 | - imageio==2.27.0 17 | - decord==0.6.0 18 | - gdown 19 | - einops 20 | - omegaconf 21 | - safetensors 22 | - gradio 23 | - wandb 24 | -------------------------------------------------------------------------------- /AnimateDiff/models/DreamBooth_LoRA/Put personalized T2I checkpoints here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/models/DreamBooth_LoRA/Put personalized T2I checkpoints here.txt -------------------------------------------------------------------------------- /AnimateDiff/models/MotionLoRA/Put MotionLoRA checkpoints here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/models/MotionLoRA/Put MotionLoRA checkpoints here.txt -------------------------------------------------------------------------------- /AnimateDiff/models/Motion_Module/Put motion module checkpoints here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/AnimateDiff/models/Motion_Module/Put motion module checkpoints here.txt -------------------------------------------------------------------------------- /AnimateDiff/scripts/animate.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import inspect 4 | import os 5 | from omegaconf import OmegaConf 6 | 7 | import torch 8 | import torchvision.transforms as transforms 9 | 10 | import diffusers 11 | from diffusers import AutoencoderKL, DDIMScheduler 12 | 13 | from tqdm.auto import tqdm 14 | from transformers import CLIPTextModel, CLIPTokenizer 15 | 16 | from animatediff.models.unet import UNet3DConditionModel 17 | from animatediff.models.sparse_controlnet import SparseControlNetModel 18 | from animatediff.pipelines.pipeline_animation import AnimationPipeline 19 | from animatediff.utils.util import save_videos_grid 20 | from animatediff.utils.util import load_weights 21 | from diffusers.utils.import_utils import is_xformers_available 22 | 23 | from einops import rearrange, repeat 24 | 25 | import csv, pdb, glob, math 26 | from pathlib import Path 27 | from PIL import Image 28 | import numpy as np 29 | 30 | 31 | def load_motion_adapter_ckpt(unet, motion_adapter_ckpt): 32 | print("Loading Motion Adapter checkpoints") 33 | checkpoint = torch.load( 34 | motion_adapter_ckpt, 35 | map_location="cpu", 36 | ) 37 | # Extract the state dict 38 | if "state_dict" in checkpoint: 39 | state_dict = checkpoint["state_dict"] 40 | else: 41 | raise KeyError("state_dict not found in checkpoint") 42 | 43 | unet_state_dict = unet.state_dict() 44 | motion_adapter_state_dict = { 45 | k: v 46 | for k, v in unet_state_dict.items() 47 | if "q_lora." in k or "k_lora." in k or "v_lora." in k 48 | } 49 | 50 | motion_adapter_state_dict.update( 51 | {k: v for k, v in state_dict.items() if k in motion_adapter_state_dict} 52 | ) 53 | 54 | print(motion_adapter_state_dict) 55 | return motion_adapter_state_dict, unet_state_dict 56 | 57 | 58 | def load_spatial_adapter_ckpt(unet, spatial_adapter_ckpt): 59 | print("Loading Spatial Adapter checkpoints") 60 | checkpoint = torch.load( 61 | spatial_adapter_ckpt, 62 | map_location="cpu", 63 | ) 64 | # Extract the state dict 65 | if "state_dict" in checkpoint: 66 | state_dict = checkpoint["state_dict"] 67 | else: 68 | raise KeyError("state_dict not found in checkpoint") 69 | 70 | unet_state_dict = unet.state_dict() 71 | spatial_adapter_state_dict = { 72 | k: v 73 | for k, v in unet_state_dict.items() 74 | if "attn1_lora." in k or "attn2_lora."
in k 75 | } 76 | 77 | spatial_adapter_state_dict.update( 78 | {k: v for k, v in state_dict.items() if k in spatial_adapter_state_dict} 79 | ) 80 | print(spatial_adapter_state_dict) 81 | return spatial_adapter_state_dict, unet_state_dict 82 | 83 | 84 | @torch.no_grad() 85 | def main(args): 86 | *_, func_args = inspect.getargvalues(inspect.currentframe()) 87 | func_args = dict(func_args) 88 | 89 | time_str = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") 90 | savedir = f"samples/{Path(args.config).stem}-{time_str}" 91 | os.makedirs(savedir) 92 | 93 | config = OmegaConf.load(args.config) 94 | samples = [] 95 | 96 | # create validation pipeline 97 | tokenizer = CLIPTokenizer.from_pretrained( 98 | args.pretrained_model_path, subfolder="tokenizer" 99 | ) 100 | text_encoder = CLIPTextModel.from_pretrained( 101 | args.pretrained_model_path, subfolder="text_encoder" 102 | ).cuda() 103 | vae = AutoencoderKL.from_pretrained( 104 | args.pretrained_model_path, subfolder="vae" 105 | ).cuda() 106 | 107 | sample_idx = 0 108 | for model_idx, model_config in enumerate(config): 109 | model_config.W = model_config.get("W", args.W) 110 | model_config.H = model_config.get("H", args.H) 111 | model_config.L = model_config.get("L", args.L) 112 | 113 | inference_config = OmegaConf.load( 114 | model_config.get("inference_config", args.inference_config) 115 | ) 116 | unet = UNet3DConditionModel.from_pretrained_2d( 117 | args.pretrained_model_path, 118 | subfolder="unet", 119 | unet_additional_kwargs=OmegaConf.to_container( 120 | inference_config.unet_additional_kwargs 121 | ), 122 | ).cuda() 123 | 124 | # load controlnet model 125 | controlnet = controlnet_images = None 126 | if model_config.get("controlnet_path", "") != "": 127 | assert model_config.get("controlnet_images", "") != "" 128 | assert model_config.get("controlnet_config", "") != "" 129 | 130 | unet.config.num_attention_heads = 8 131 | unet.config.projection_class_embeddings_input_dim = None 132 | 133 | controlnet_config = OmegaConf.load(model_config.controlnet_config) 134 | controlnet = SparseControlNetModel.from_unet( 135 | unet, 136 | controlnet_additional_kwargs=controlnet_config.get( 137 | "controlnet_additional_kwargs", {} 138 | ), 139 | ) 140 | 141 | print( 142 | f"loading controlnet checkpoint from {model_config.controlnet_path} ..." 
143 | ) 144 | controlnet_state_dict = torch.load( 145 | model_config.controlnet_path, map_location="cpu" 146 | ) 147 | controlnet_state_dict = ( 148 | controlnet_state_dict["controlnet"] 149 | if "controlnet" in controlnet_state_dict 150 | else controlnet_state_dict 151 | ) 152 | controlnet_state_dict.pop("animatediff_config", "") 153 | controlnet.load_state_dict(controlnet_state_dict) 154 | controlnet.cuda() 155 | 156 | image_paths = model_config.controlnet_images 157 | if isinstance(image_paths, str): 158 | image_paths = [image_paths] 159 | 160 | print(f"controlnet image paths:") 161 | for path in image_paths: 162 | print(path) 163 | assert len(image_paths) <= model_config.L 164 | 165 | image_transforms = transforms.Compose( 166 | [ 167 | transforms.RandomResizedCrop( 168 | (model_config.H, model_config.W), 169 | (1.0, 1.0), 170 | ratio=( 171 | model_config.W / model_config.H, 172 | model_config.W / model_config.H, 173 | ), 174 | ), 175 | transforms.ToTensor(), 176 | ] 177 | ) 178 | 179 | if model_config.get("normalize_condition_images", False): 180 | 181 | def image_norm(image): 182 | image = image.mean(dim=0, keepdim=True).repeat(3, 1, 1) 183 | image -= image.min() 184 | image /= image.max() 185 | return image 186 | 187 | else: 188 | image_norm = lambda x: x 189 | 190 | controlnet_images = [ 191 | image_norm(image_transforms(Image.open(path).convert("RGB"))) 192 | for path in image_paths 193 | ] 194 | 195 | os.makedirs(os.path.join(savedir, "control_images"), exist_ok=True) 196 | for i, image in enumerate(controlnet_images): 197 | Image.fromarray( 198 | (255.0 * (image.numpy().transpose(1, 2, 0))).astype(np.uint8) 199 | ).save(f"{savedir}/control_images/{i}.png") 200 | 201 | controlnet_images = torch.stack(controlnet_images).unsqueeze(0).cuda() 202 | controlnet_images = rearrange(controlnet_images, "b f c h w -> b c f h w") 203 | 204 | if controlnet.use_simplified_condition_embedding: 205 | num_controlnet_images = controlnet_images.shape[2] 206 | controlnet_images = rearrange( 207 | controlnet_images, "b c f h w -> (b f) c h w" 208 | ) 209 | controlnet_images = ( 210 | vae.encode(controlnet_images * 2.0 - 1.0).latent_dist.sample() 211 | * 0.18215 212 | ) 213 | controlnet_images = rearrange( 214 | controlnet_images, 215 | "(b f) c h w -> b c f h w", 216 | f=num_controlnet_images, 217 | ) 218 | 219 | # set xformers 220 | if is_xformers_available() and (not args.without_xformers): 221 | unet.enable_xformers_memory_efficient_attention() 222 | if controlnet is not None: 223 | controlnet.enable_xformers_memory_efficient_attention() 224 | 225 | pipeline = AnimationPipeline( 226 | vae=vae, 227 | text_encoder=text_encoder, 228 | tokenizer=tokenizer, 229 | unet=unet, 230 | controlnet=controlnet, 231 | scheduler=DDIMScheduler( 232 | **OmegaConf.to_container(inference_config.noise_scheduler_kwargs) 233 | ), 234 | ).to("cuda") 235 | 236 | pipeline = load_weights( 237 | pipeline, 238 | # motion module 239 | motion_module_path=model_config.get("motion_module", ""), 240 | motion_module_lora_configs=model_config.get( 241 | "motion_module_lora_configs", [] 242 | ), 243 | # domain adapter 244 | adapter_lora_path=model_config.get("adapter_lora_path", ""), 245 | adapter_lora_scale=model_config.get("adapter_lora_scale", 1.0), 246 | # image layers 247 | dreambooth_model_path=model_config.get("dreambooth_path", ""), 248 | lora_model_path=model_config.get("lora_model_path", ""), 249 | lora_alpha=model_config.get("lora_alpha", 0.8), 250 | ).to("cuda") 251 | 252 | motion_adapter_state_dict, unet_state_dict = 
load_motion_adapter_ckpt( 253 | pipeline.unet, model_config.get("motion_adapter_ckpt", "") 254 | ) 255 | missing, unexpected = pipeline.unet.load_state_dict( 256 | motion_adapter_state_dict, strict=False 257 | ) 258 | assert len(unexpected) == 0 259 | 260 | spatial_adapter_state_dict, unet_state_dict = load_spatial_adapter_ckpt( 261 | pipeline.unet, model_config.get("spatial_adapter_ckpt", "") 262 | ) 263 | missing, unexpected = pipeline.unet.load_state_dict( 264 | spatial_adapter_state_dict, strict=False 265 | ) 266 | assert len(unexpected) == 0 267 | 268 | prompts = model_config.prompt 269 | n_prompts = ( 270 | list(model_config.n_prompt) * len(prompts) 271 | if len(model_config.n_prompt) == 1 272 | else model_config.n_prompt 273 | ) 274 | 275 | random_seeds = model_config.get("seed", [-1]) 276 | random_seeds = ( 277 | [random_seeds] if isinstance(random_seeds, int) else list(random_seeds) 278 | ) 279 | random_seeds = ( 280 | random_seeds * len(prompts) if len(random_seeds) == 1 else random_seeds 281 | ) 282 | 283 | config[model_idx].random_seed = [] 284 | for prompt_idx, (prompt, n_prompt, random_seed) in enumerate( 285 | zip(prompts, n_prompts, random_seeds) 286 | ): 287 | 288 | # manually set random seed for reproduction 289 | if random_seed != -1: 290 | torch.manual_seed(random_seed) 291 | else: 292 | torch.seed() 293 | config[model_idx].random_seed.append(torch.initial_seed()) 294 | 295 | print(f"current seed: {torch.initial_seed()}") 296 | print(f"sampling {prompt} ...") 297 | sample = pipeline( 298 | prompt, 299 | negative_prompt=n_prompt, 300 | num_inference_steps=model_config.steps, 301 | guidance_scale=model_config.guidance_scale, 302 | width=model_config.W, 303 | height=model_config.H, 304 | video_length=model_config.L, 305 | controlnet_images=controlnet_images, 306 | controlnet_image_index=model_config.get("controlnet_image_indexs", [0]), 307 | ).videos 308 | samples.append(sample) 309 | 310 | prompt = "-".join((prompt.replace("/", "").split(" ")[:10])) 311 | save_videos_grid(sample, f"{savedir}/sample/{sample_idx}-{prompt}.gif") 312 | print(f"save to {savedir}/sample/{prompt}.gif") 313 | 314 | sample_idx += 1 315 | 316 | samples = torch.concat(samples) 317 | save_videos_grid(samples, f"{savedir}/sample.gif", n_rows=4) 318 | 319 | OmegaConf.save(config, f"{savedir}/config.yaml") 320 | 321 | 322 | if __name__ == "__main__": 323 | parser = argparse.ArgumentParser() 324 | parser.add_argument( 325 | "--pretrained-model-path", 326 | type=str, 327 | default="models/StableDiffusion/stable-diffusion-v1-5", 328 | ) 329 | parser.add_argument( 330 | "--inference-config", type=str, default="configs/inference/inference-v1.yaml" 331 | ) 332 | parser.add_argument("--config", type=str, required=True) 333 | 334 | parser.add_argument("--L", type=int, default=16) 335 | parser.add_argument("--W", type=int, default=512) 336 | parser.add_argument("--H", type=int, default=512) 337 | 338 | parser.add_argument("--without-xformers", action="store_true") 339 | 340 | args = parser.parse_args() 341 | main(args) 342 | -------------------------------------------------------------------------------- /AnimateDiff/wget-log: -------------------------------------------------------------------------------- 1 | --2024-07-26 18:24:24-- https://secta-models.s3.us-east-1.amazonaws.com/1336237.tar?X-Amz-Algorithm=AWS4-HMAC-SHA256 2 | Resolving secta-models.s3.us-east-1.amazonaws.com (secta-models.s3.us-east-1.amazonaws.com)... 52.217.172.210, 54.231.201.34, 52.217.124.250, ... 
3 | Connecting to secta-models.s3.us-east-1.amazonaws.com (secta-models.s3.us-east-1.amazonaws.com)|52.217.172.210|:443... connected. 4 | HTTP request sent, awaiting response... 400 Bad Request 5 | 2024-07-26 18:24:25 ERROR 400: Bad Request. 6 | 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Still-Moving: Open-Source Implementation 2 | 3 | ## About 4 | 5 | This repository contains an open-source implementation of the "Still-Moving" model, based on the paper "Still-Moving: Customized Video Generation without Customized Video Data" by Chefer et al. 6 | [project page](https://still-moving.github.io/) 7 | 8 | Still-Moving is a novel framework for customizing text-to-video (T2V) generation models without requiring customized video data. It leverages customized text-to-image (T2I) models and adapts them for video generation, combining spatial priors from T2I models with motion priors from T2V models. 9 | 10 | 11 | ## Progress 12 | I trained the Motion Adapter and the Spatial Adapter as described in the paper. 13 | The generated motion is currently too fast, and output quality with a customized DreamBooth model is still poor; the cause has not been identified yet. 14 | ![Sample result](./results/0.gif) 15 | 16 | ## Key Features 17 | 18 | - Customization of T2V models using only still image data 19 | - Support for personalization, stylization, and conditional generation 20 | - Implementation of Motion Adapters and Spatial Adapters 21 | - Compatible with different T2V architectures (e.g., Lumiere, AnimateDiff) 22 | 23 | ## Installation 24 | 25 | [Include installation instructions here] 26 | 27 | ## Usage 28 | 29 | [Provide basic usage examples here] 30 | 31 | ## Implementation Details 32 | 33 | - Motion Adapters: LoRA layers applied to temporal attention blocks (✔️ Done) 34 | - Spatial Adapters: LoRA layers added after injected customized T2I layers (✔️ Done) 35 | - Training process: Two-step training for Motion and Spatial Adapters 36 | - Supported models: [List the T2V models you've implemented] 37 | 38 | 39 | ## Contributing 40 | 41 | We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or proposing new features, your efforts are appreciated. 42 | 43 | Please make sure to update tests as appropriate and adhere to the project's coding standards.
44 | 45 | ### Areas for Contribution 46 | 47 | - Implementing support for additional T2V models 48 | - Optimizing performance and reducing computational requirements 49 | - Improving documentation and adding usage examples 50 | - Creating tools for easier model customization 51 | - Developing a user-friendly interface for video generation 52 | 53 | ## License 54 | 55 | Open to use 56 | 57 | ## Contact 58 | 59 | Harsh Bhatt - harshbhatt7585@gmail.com 60 | -------------------------------------------------------------------------------- /adapters/motion_adapter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class LoRALayer(nn.Module): 7 | def __init__(self, in_features, out_features, rank=4): 8 | super().__init__() 9 | self.down = nn.Linear(in_features, rank, bias=False) 10 | self.up = nn.Linear(rank, out_features, bias=False) 11 | self.scale = 1.0 12 | 13 | nn.init.normal_(self.down.weight, std=1 / rank) 14 | nn.init.zeros_(self.up.weight) 15 | 16 | def forward(self, x): 17 | return self.up(self.down(x)) * self.scale 18 | 19 | 20 | class LoRALinear(nn.Module): 21 | def __init__(self, linear_layer, rank=4): 22 | super().__init__() 23 | self.in_features = linear_layer.in_features 24 | self.out_features = linear_layer.out_features 25 | 26 | self.linear = linear_layer 27 | self.lora = LoRALayer(self.in_features, self.out_features, rank=rank) 28 | 29 | def forward(self, x): 30 | return self.linear(x) + self.lora(x) 31 | -------------------------------------------------------------------------------- /results/0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshbhatt7585/StillMoving/22607c8d52cc2ef6f84734496587e5ce848dba1a/results/0.gif -------------------------------------------------------------------------------- /wget-log: -------------------------------------------------------------------------------- 1 | --2024-07-26 18:26:01-- https://secta-models.s3.us-east-1.amazonaws.com/1336237.tar?X-Amz-Algorithm=AWS4-HMAC-SHA256 2 | Resolving secta-models.s3.us-east-1.amazonaws.com (secta-models.s3.us-east-1.amazonaws.com)... 3.5.30.19, 54.231.203.82, 52.216.110.142, ... 3 | Connecting to secta-models.s3.us-east-1.amazonaws.com (secta-models.s3.us-east-1.amazonaws.com)|3.5.30.19|:443... connected. 4 | HTTP request sent, awaiting response... 400 Bad Request 5 | 2024-07-26 18:26:01 ERROR 400: Bad Request. 6 | 7 | --------------------------------------------------------------------------------
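A minimal usage sketch for the `LoRALinear` wrapper defined in `adapters/motion_adapter.py` above, in the spirit of the Motion/Spatial Adapters listed in the README's Implementation Details. This is illustrative only: the projection size, tensor shapes, and the assumption that the repository root is on `PYTHONPATH` are hypothetical, not taken from the repository.

    import torch
    import torch.nn as nn
    from adapters.motion_adapter import LoRALinear  # assumes repo root is on PYTHONPATH

    # Hypothetical attention projection to be adapted (e.g. a temporal-attention query layer).
    q_proj = nn.Linear(320, 320, bias=False)

    # Wrap it: the frozen base output is augmented by a zero-initialized low-rank update.
    q_proj_lora = LoRALinear(q_proj, rank=4)

    x = torch.randn(2, 16, 320)   # (batch, frames, channels) -- illustrative shape
    out = q_proj_lora(x)          # base projection + scaled LoRA delta
    print(out.shape)              # torch.Size([2, 16, 320])

Because `LoRALayer.up` is zero-initialized, the wrapped layer initially reproduces the base projection exactly; training then learns only the low-rank residual.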