├── .dockerignore ├── .gitattributes ├── .gitignore ├── DiffSynth-Studio ├── DiffSynth_Studio.py ├── LICENSE ├── README.md ├── diffsynth │ ├── __init__.py │ ├── controlnets │ │ ├── __init__.py │ │ ├── controlnet_unit.py │ │ └── processors.py │ ├── data │ │ ├── __init__.py │ │ └── video.py │ ├── extensions │ │ ├── ESRGAN │ │ │ └── __init__.py │ │ ├── FastBlend │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── cupy_kernels.py │ │ │ ├── data.py │ │ │ ├── patch_match.py │ │ │ └── runners │ │ │ │ ├── __init__.py │ │ │ │ ├── accurate.py │ │ │ │ ├── balanced.py │ │ │ │ ├── fast.py │ │ │ │ └── interpolation.py │ │ └── RIFE │ │ │ └── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── downloader.py │ │ ├── hunyuan_dit.py │ │ ├── hunyuan_dit_text_encoder.py │ │ ├── sd_controlnet.py │ │ ├── sd_ipadapter.py │ │ ├── sd_lora.py │ │ ├── sd_motion.py │ │ ├── sd_text_encoder.py │ │ ├── sd_unet.py │ │ ├── sd_vae_decoder.py │ │ ├── sd_vae_encoder.py │ │ ├── sdxl_ipadapter.py │ │ ├── sdxl_motion.py │ │ ├── sdxl_text_encoder.py │ │ ├── sdxl_unet.py │ │ ├── sdxl_vae_decoder.py │ │ ├── sdxl_vae_encoder.py │ │ ├── svd_image_encoder.py │ │ ├── svd_unet.py │ │ ├── svd_vae_decoder.py │ │ ├── svd_vae_encoder.py │ │ └── tiler.py │ ├── pipelines │ │ ├── __init__.py │ │ ├── dancer.py │ │ ├── hunyuan_dit.py │ │ ├── stable_diffusion.py │ │ ├── stable_diffusion_video.py │ │ ├── stable_diffusion_xl.py │ │ ├── stable_diffusion_xl_video.py │ │ └── stable_video_diffusion.py │ ├── processors │ │ ├── FastBlend.py │ │ ├── PILEditor.py │ │ ├── RIFE.py │ │ ├── __init__.py │ │ ├── base.py │ │ └── sequencial_processor.py │ ├── prompts │ │ ├── __init__.py │ │ ├── hunyuan_dit_prompter.py │ │ ├── sd_prompter.py │ │ ├── sdxl_prompter.py │ │ └── utils.py │ ├── schedulers │ │ ├── __init__.py │ │ ├── continuous_ode.py │ │ └── ddim.py │ └── tokenizer_configs │ │ ├── hunyuan_dit │ │ ├── tokenizer │ │ │ ├── special_tokens_map.json │ │ │ ├── tokenizer_config.json │ │ │ ├── vocab.txt │ │ │ └── vocab_org.txt │ │ └── tokenizer_t5 │ │ │ ├── config.json │ │ │ ├── special_tokens_map.json │ │ │ ├── spiece.model │ │ │ └── tokenizer_config.json │ │ ├── stable_diffusion │ │ └── tokenizer │ │ │ ├── merges.txt │ │ │ ├── special_tokens_map.json │ │ │ ├── tokenizer_config.json │ │ │ └── vocab.json │ │ └── stable_diffusion_xl │ │ └── tokenizer_2 │ │ ├── merges.txt │ │ ├── special_tokens_map.json │ │ ├── tokenizer_config.json │ │ └── vocab.json ├── environment.yml ├── examples │ ├── Diffutoon │ │ ├── Diffutoon.ipynb │ │ ├── README.md │ │ ├── diffutoon_toon_shading.py │ │ ├── diffutoon_toon_shading_with_editing_signals.py │ │ └── sd_toon_shading.py │ ├── ExVideo │ │ ├── ExVideo_ema.py │ │ ├── ExVideo_svd_test.py │ │ ├── ExVideo_svd_train.py │ │ └── README.md │ ├── Ip-Adapter │ │ ├── README.md │ │ └── sdxl_ipadapter.py │ ├── diffsynth │ │ ├── README.md │ │ └── sd_video_rerender.py │ ├── hunyuan_dit │ │ ├── README.md │ │ └── train_hunyuan_dit_lora.py │ ├── image_synthesis │ │ ├── README.md │ │ ├── sd_prompt_refining.py │ │ ├── sd_text_to_image.py │ │ ├── sdxl_text_to_image.py │ │ └── sdxl_turbo.py │ └── video_synthesis │ │ ├── README.md │ │ ├── sd_text_to_video.py │ │ ├── sdxl_text_to_video.py │ │ └── svd_text_to_video.py ├── models │ ├── AnimateDiff │ │ └── Put AnimateDiff ckpt files here.txt │ ├── Annotators │ │ └── Put ControlNet annotators here.txt │ ├── BeautifulPrompt │ │ └── Put BeautifulPrompt models here.txt │ ├── ControlNet │ │ └── Put ControlNet pth files here.txt │ ├── HunyuanDiT │ │ └── Put Hunyuan DiT checkpoints here.txt │ ├── 
IpAdapter │ │ └── Put IP-Adapter checkpoints here.txt │ ├── RIFE │ │ └── Put RIFE models here.txt │ ├── lora │ │ └── Put lora files here.txt │ ├── stable_diffusion │ │ └── Put Stable Diffusion checkpoints here.txt │ ├── stable_diffusion_xl │ │ └── Put Stable Diffusion XL checkpoints here.txt │ ├── stable_diffusion_xl_turbo │ │ └── Put Stable Diffusion XL Turbo checkpoints here.txt │ ├── stable_video_diffusion │ │ └── Put Stable Video Diffusion checkpoints here.txt │ ├── textual_inversion │ │ └── Put Textual Inversion files here.txt │ └── translator │ │ └── Put translator models here.txt ├── pages │ ├── 1_Image_Creator.py │ └── 2_Video_Creator.py ├── requirements.txt └── setup.py ├── README.md ├── analyse.py ├── cog.yaml ├── download-weights.py ├── predict.py └── sd_toon_shading.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # The .dockerignore file excludes files from the container build process. 2 | # 3 | # https://docs.docker.com/engine/reference/builder/#dockerignore-file 4 | 5 | # Exclude Git files 6 | .git 7 | .github 8 | .gitignore 9 | 10 | # Exclude Python cache files 11 | __pycache__ 12 | .mypy_cache 13 | .pytest_cache 14 | .ruff_cache 15 | 16 | # Exclude Python virtual environment 17 | /venv 18 | 19 | *.mp4 20 | *.mp3 21 | 22 | analyse.py 23 | download-weights.py -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | env 2 | *.mp4 3 | __pycache__ -------------------------------------------------------------------------------- /DiffSynth-Studio/DiffSynth_Studio.py: -------------------------------------------------------------------------------- 1 | # Set web page format 2 | import streamlit as st 3 | st.set_page_config(layout="wide") 4 | # Disable virtual VRAM on Windows systems 5 | import torch 6 | torch.cuda.set_per_process_memory_fraction(0.999, 0) 7 | 8 | 9 | st.markdown(""" 10 | # DiffSynth Studio 11 | 12 | [Source Code](https://github.com/Artiprocher/DiffSynth-Studio) 13 | 14 | Welcome to DiffSynth Studio. 15 | """) 16 | -------------------------------------------------------------------------------- /DiffSynth-Studio/README.md: -------------------------------------------------------------------------------- 1 | # DiffSynth Studio 2 | 3 | ## Introduction 4 | 5 | DiffSynth Studio is a Diffusion engine. We have restructured architectures including Text Encoder, UNet, VAE, among others, maintaining compatibility with models from the open-source community while enhancing computational performance. We provide many interesting features. Enjoy the magic of Diffusion models! 6 | 7 | ## Roadmap 8 | 9 | * Aug 29, 2023. We propose DiffSynth, a video synthesis framework. 10 | * [Project Page](https://ecnu-cilab.github.io/DiffSynth.github.io/). 11 | * The source codes are released in [EasyNLP](https://github.com/alibaba/EasyNLP/tree/master/diffusion/DiffSynth). 12 | * The technical report (ECML PKDD 2024) is released on [arXiv](https://arxiv.org/abs/2308.03463). 13 | * Oct 1, 2023. We release an early version of this project, namely FastSDXL, an attempt at building a diffusion engine. 
14 | * The source codes are released on [GitHub](https://github.com/Artiprocher/FastSDXL). 15 | * FastSDXL includes a trainable OLSS scheduler for efficiency improvement. 16 | * The original repo of OLSS is [here](https://github.com/alibaba/EasyNLP/tree/master/diffusion/olss_scheduler). 17 | * The technical report (CIKM 2023) is released on [arXiv](https://arxiv.org/abs/2305.14677). 18 | * A demo video is shown on [Bilibili](https://www.bilibili.com/video/BV1w8411y7uj). 19 | * Since OLSS requires additional training, we don't implement it in this project. 20 | * Nov 15, 2023. We propose FastBlend, a powerful video deflickering algorithm. 21 | * The sd-webui extension is released on [GitHub](https://github.com/Artiprocher/sd-webui-fastblend). 22 | * Demo videos are shown on Bilibili, including three tasks. 23 | * [Video deflickering](https://www.bilibili.com/video/BV1d94y1W7PE) 24 | * [Video interpolation](https://www.bilibili.com/video/BV1Lw411m71p) 25 | * [Image-driven video rendering](https://www.bilibili.com/video/BV1RB4y1Z7LF) 26 | * The technical report is released on [arXiv](https://arxiv.org/abs/2311.09265). 27 | * An unofficial ComfyUI extension developed by other users is released on [GitHub](https://github.com/AInseven/ComfyUI-fastblend). 28 | * Dec 8, 2023. We decided to develop a new project, aiming to unleash the potential of diffusion models, especially in video synthesis. The development of this project started. 29 | * Jan 29, 2024. We propose Diffutoon, a fantastic solution for toon shading. 30 | * [Project Page](https://ecnu-cilab.github.io/DiffutoonProjectPage/). 31 | * The source codes are released in this project. 32 | * The technical report (IJCAI 2024) is released on [arXiv](https://arxiv.org/abs/2401.16224). 33 | * June 13, 2024. DiffSynth Studio is transferred to ModelScope. The developers have transitioned from "I" to "we". Of course, I will still participate in development and maintenance. 34 | * June 21, 2024. We propose ExVideo, a post-tuning technique aimed at enhancing the capability of video generation models. We have extended Stable Video Diffusion to generate long videos of up to 128 frames. 35 | * [Project Page](https://ecnu-cilab.github.io/ExVideoProjectPage/). 36 | * Source code is released in this repo. See [`examples/ExVideo`](./examples/ExVideo/). 37 | * Models are released on [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1) and [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-SVD-128f-v1). 38 | * Technical report is released on [arXiv](https://arxiv.org/abs/2406.14130). 39 | * To date, DiffSynth Studio supports the following models: 40 | * [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5) 41 | * [Stable Diffusion XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) 42 | * [ControlNet](https://github.com/lllyasviel/ControlNet) 43 | * [AnimateDiff](https://github.com/guoyww/animatediff/) 44 | * [Ip-Adapter](https://github.com/tencent-ailab/IP-Adapter) 45 | * [ESRGAN](https://github.com/xinntao/ESRGAN) 46 | * [RIFE](https://github.com/hzwer/ECCV2022-RIFE) 47 | * [Hunyuan-DiT](https://github.com/Tencent/HunyuanDiT) 48 | * [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) 49 | * [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1) 50 | 51 | ## Installation 52 | 53 | ``` 54 | git clone https://github.com/modelscope/DiffSynth-Studio.git 55 | cd DiffSynth-Studio 56 | pip install -e . 
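# optional sanity check (not part of the original instructions): confirm the editable install imports
python -c "import diffsynth"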
57 | ``` 58 | 59 | ## Usage (in Python code) 60 | 61 | The Python examples are in [`examples`](./examples/). We provide an overview here. 62 | 63 | ### Long Video Synthesis 64 | 65 | We trained an extended video synthesis model, which can generate 128 frames. [`examples/ExVideo`](./examples/ExVideo/) 66 | 67 | https://github.com/modelscope/DiffSynth-Studio/assets/35051019/d97f6aa9-8064-4b5b-9d49-ed6001bb9acc 68 | 69 | ### Image Synthesis 70 | 71 | Generate high-resolution images by breaking the resolution limitations of diffusion models! [`examples/image_synthesis`](./examples/image_synthesis/) 72 | 73 | |512*512|1024*1024|2048*2048|4096*4096| 74 | |-|-|-|-| 75 | |![512](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/55f679e9-7445-4605-9315-302e93d11370)|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/6fc84611-8da6-4a1f-8fee-9a34eba3b4a5)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/9087a73c-9164-4c58-b2a0-effc694143fb)|![4096](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/edee9e71-fc39-4d1c-9ca9-fa52002c67ac)| 76 | 77 | |1024*1024|2048*2048| 78 | |-|-| 79 | |![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/67687748-e738-438c-aee5-96096f09ac90)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/584186bc-9855-4140-878e-99541f9a757f)| 80 | 81 | ### Toon Shading 82 | 83 | Render realistic videos in a flat, cartoon-like style and enable video editing features. [`examples/Diffutoon`](./examples/Diffutoon/) 84 | 85 | https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/b54c05c5-d747-4709-be5e-b39af82404dd 86 | 87 | https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/20528af5-5100-474a-8cdc-440b9efdd86c 88 | 89 | ### Video Stylization 90 | 91 | Video stylization without video models. [`examples/diffsynth`](./examples/diffsynth/) 92 | 93 | https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-4481-b79f-0c3a7361a1ea 94 | 95 | ### Chinese Models 96 | 97 | Use Hunyuan-DiT to generate images with Chinese prompts. We also support LoRA fine-tuning of this model. 
[`examples/hunyuan_dit`](./examples/hunyuan_dit/) 98 | 99 | Prompt: 少女手捧鲜花,坐在公园的长椅上,夕阳的余晖洒在少女的脸庞,整个画面充满诗意的美感 (a young girl holding flowers, sitting on a park bench, the glow of the setting sun falling on her face, the whole picture full of poetic beauty) 100 | 101 | |1024x1024|2048x2048 (highres-fix)| 102 | |-|-| 103 | |![image_1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/2b6528cf-a229-46e9-b7dd-4a9475b07308)|![image_2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/11d264ec-966b-45c9-9804-74b60428b866)| 104 | 105 | Prompt: 一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉 (a little dog bouncing around, surrounded by brilliantly colored flowers, with mountains in the distance) 106 | 107 | |Without LoRA|With LoRA| 108 | |-|-| 109 | |![image_without_lora](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/1aa21de5-a992-4b66-b14f-caa44e08876e)|![image_with_lora](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/83a0a41a-691f-4610-8e7b-d8e17c50a282)| 110 | 111 | ## Usage (in WebUI) 112 | 113 | ``` 114 | python -m streamlit run DiffSynth_Studio.py 115 | ``` 116 | 117 | https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/93085557-73f3-4eee-a205-9829591ef954 118 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import * 2 | from .models import * 3 | from .prompts import * 4 | from .schedulers import * 5 | from .pipelines import * 6 | from .controlnets import * 7 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/controlnets/__init__.py: -------------------------------------------------------------------------------- 1 | from .controlnet_unit import ControlNetConfigUnit, ControlNetUnit, MultiControlNetManager 2 | from .processors import Annotator 3 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/controlnets/controlnet_unit.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from .processors import Processor_id 4 | 5 | 6 | class ControlNetConfigUnit: 7 | def __init__(self, processor_id: Processor_id, model_path, scale=1.0): 8 | self.processor_id = processor_id 9 | self.model_path = model_path 10 | self.scale = scale 11 | 12 | 13 | class ControlNetUnit: 14 | def __init__(self, processor, model, scale=1.0): 15 | self.processor = processor 16 | self.model = model 17 | self.scale = scale 18 | 19 | 20 | class MultiControlNetManager: 21 | def __init__(self, controlnet_units=[]): 22 | self.processors = [unit.processor for unit in controlnet_units] 23 | self.models = [unit.model for unit in controlnet_units] 24 | self.scales = [unit.scale for unit in controlnet_units] 25 | 26 | def process_image(self, image, processor_id=None): 27 | if processor_id is None: 28 | processed_image = [processor(image) for processor in self.processors] 29 | else: 30 | processed_image = [self.processors[processor_id](image)] 31 | processed_image = torch.concat([ 32 | torch.Tensor(np.array(image_, dtype=np.float32) / 255).permute(2, 0, 1).unsqueeze(0) 33 | for image_ in processed_image 34 | ], dim=0) 35 | return processed_image 36 | 37 | def __call__( 38 | self, 39 | sample, timestep, encoder_hidden_states, conditionings, 40 | tiled=False, tile_size=64, tile_stride=32 41 | ): 42 | res_stack = None 43 | for conditioning, model, scale in zip(conditionings, self.models, self.scales): 44 | res_stack_ = model( 45 | sample, timestep, encoder_hidden_states, conditioning, 46 | tiled=tiled, tile_size=tile_size, tile_stride=tile_stride 47 | ) 48 | res_stack_ = [res 
* scale for res in res_stack_] 49 | if res_stack is None: 50 | res_stack = res_stack_ 51 | else: 52 | res_stack = [i + j for i, j in zip(res_stack, res_stack_)] 53 | return res_stack 54 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/controlnets/processors.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import Literal, TypeAlias 2 | import warnings 3 | with warnings.catch_warnings(): 4 | warnings.simplefilter("ignore") 5 | from controlnet_aux.processor import ( 6 | CannyDetector, MidasDetector, HEDdetector, LineartDetector, LineartAnimeDetector, OpenposeDetector 7 | ) 8 | 9 | 10 | Processor_id: TypeAlias = Literal[ 11 | "canny", "depth", "softedge", "lineart", "lineart_anime", "openpose", "tile" 12 | ] 13 | 14 | class Annotator: 15 | def __init__(self, processor_id: Processor_id, model_path="models/Annotators", detect_resolution=None): 16 | if processor_id == "canny": 17 | self.processor = CannyDetector() 18 | elif processor_id == "depth": 19 | self.processor = MidasDetector.from_pretrained(model_path).to("cuda") 20 | elif processor_id == "softedge": 21 | self.processor = HEDdetector.from_pretrained(model_path).to("cuda") 22 | elif processor_id == "lineart": 23 | self.processor = LineartDetector.from_pretrained(model_path).to("cuda") 24 | elif processor_id == "lineart_anime": 25 | self.processor = LineartAnimeDetector.from_pretrained(model_path).to("cuda") 26 | elif processor_id == "openpose": 27 | self.processor = OpenposeDetector.from_pretrained(model_path).to("cuda") 28 | elif processor_id == "tile": 29 | self.processor = None 30 | else: 31 | raise ValueError(f"Unsupported processor_id: {processor_id}") 32 | 33 | self.processor_id = processor_id 34 | self.detect_resolution = detect_resolution 35 | 36 | def __call__(self, image): 37 | width, height = image.size 38 | if self.processor_id == "openpose": 39 | kwargs = { 40 | "include_body": True, 41 | "include_hand": True, 42 | "include_face": True 43 | } 44 | else: 45 | kwargs = {} 46 | if self.processor is not None: 47 | detect_resolution = self.detect_resolution if self.detect_resolution is not None else min(width, height) 48 | image = self.processor(image, detect_resolution=detect_resolution, image_resolution=min(width, height), **kwargs) 49 | image = image.resize((width, height)) 50 | return image 51 | 52 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .video import VideoData, save_video, save_frames 2 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/data/video.py: -------------------------------------------------------------------------------- 1 | import imageio, os 2 | import numpy as np 3 | from PIL import Image 4 | from tqdm import tqdm 5 | 6 | 7 | class LowMemoryVideo: 8 | def __init__(self, file_name): 9 | self.reader = imageio.get_reader(file_name) 10 | 11 | def __len__(self): 12 | return self.reader.count_frames() 13 | 14 | def __getitem__(self, item): 15 | return Image.fromarray(np.array(self.reader.get_data(item))).convert("RGB") 16 | 17 | def __del__(self): 18 | self.reader.close() 19 | 20 | 21 | def split_file_name(file_name): 22 | result = [] 23 | number = -1 24 | for i in file_name: 25 | if ord(i)>=ord("0") and ord(i)<=ord("9"): 26 | if number == -1: 27 | number = 0 28 
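# accumulate each run of digits into an integer so that numbered frame files sort numerically (e.g. 2.png before 10.png)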
| number = number*10 + ord(i) - ord("0") 29 | else: 30 | if number != -1: 31 | result.append(number) 32 | number = -1 33 | result.append(i) 34 | if number != -1: 35 | result.append(number) 36 | result = tuple(result) 37 | return result 38 | 39 | 40 | def search_for_images(folder): 41 | file_list = [i for i in os.listdir(folder) if i.endswith(".jpg") or i.endswith(".png")] 42 | file_list = [(split_file_name(file_name), file_name) for file_name in file_list] 43 | file_list = [i[1] for i in sorted(file_list)] 44 | file_list = [os.path.join(folder, i) for i in file_list] 45 | return file_list 46 | 47 | 48 | class LowMemoryImageFolder: 49 | def __init__(self, folder, file_list=None): 50 | if file_list is None: 51 | self.file_list = search_for_images(folder) 52 | else: 53 | self.file_list = [os.path.join(folder, file_name) for file_name in file_list] 54 | 55 | def __len__(self): 56 | return len(self.file_list) 57 | 58 | def __getitem__(self, item): 59 | return Image.open(self.file_list[item]).convert("RGB") 60 | 61 | def __del__(self): 62 | pass 63 | 64 | 65 | def crop_and_resize(image, height, width): 66 | image = np.array(image) 67 | image_height, image_width, _ = image.shape 68 | if image_height / image_width < height / width: 69 | croped_width = int(image_height / height * width) 70 | left = (image_width - croped_width) // 2 71 | image = image[:, left: left+croped_width] 72 | image = Image.fromarray(image).resize((width, height)) 73 | else: 74 | croped_height = int(image_width / width * height) 75 | left = (image_height - croped_height) // 2 76 | image = image[left: left+croped_height, :] 77 | image = Image.fromarray(image).resize((width, height)) 78 | return image 79 | 80 | 81 | class VideoData: 82 | def __init__(self, video_file=None, image_folder=None, height=None, width=None, **kwargs): 83 | if video_file is not None: 84 | self.data_type = "video" 85 | self.data = LowMemoryVideo(video_file, **kwargs) 86 | elif image_folder is not None: 87 | self.data_type = "images" 88 | self.data = LowMemoryImageFolder(image_folder, **kwargs) 89 | else: 90 | raise ValueError("Cannot open video or image folder") 91 | self.length = None 92 | self.set_shape(height, width) 93 | 94 | def raw_data(self): 95 | frames = [] 96 | for i in range(self.__len__()): 97 | frames.append(self.__getitem__(i)) 98 | return frames 99 | 100 | def set_length(self, length): 101 | self.length = length 102 | 103 | def set_shape(self, height, width): 104 | self.height = height 105 | self.width = width 106 | 107 | def __len__(self): 108 | if self.length is None: 109 | return len(self.data) 110 | else: 111 | return self.length 112 | 113 | def shape(self): 114 | if self.height is not None and self.width is not None: 115 | return self.height, self.width 116 | else: 117 | height, width, _ = self.__getitem__(0).shape 118 | return height, width 119 | 120 | def __getitem__(self, item): 121 | frame = self.data.__getitem__(item) 122 | width, height = frame.size 123 | if self.height is not None and self.width is not None: 124 | if self.height != height or self.width != width: 125 | frame = crop_and_resize(frame, self.height, self.width) 126 | return frame 127 | 128 | def __del__(self): 129 | pass 130 | 131 | def save_images(self, folder): 132 | os.makedirs(folder, exist_ok=True) 133 | for i in tqdm(range(self.__len__()), desc="Saving images"): 134 | frame = self.__getitem__(i) 135 | frame.save(os.path.join(folder, f"{i}.png")) 136 | 137 | 138 | def save_video(frames, save_path, fps, quality=9): 139 | writer = imageio.get_writer(save_path, 
fps=fps, quality=quality) 140 | for frame in tqdm(frames, desc="Saving video"): 141 | frame = np.array(frame) 142 | writer.append_data(frame) 143 | writer.close() 144 | 145 | def save_frames(frames, save_path): 146 | os.makedirs(save_path, exist_ok=True) 147 | for i, frame in enumerate(tqdm(frames, desc="Saving images")): 148 | frame.save(os.path.join(save_path, f"{i}.png")) 149 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/extensions/ESRGAN/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from einops import repeat 3 | from PIL import Image 4 | import numpy as np 5 | 6 | 7 | class ResidualDenseBlock(torch.nn.Module): 8 | 9 | def __init__(self, num_feat=64, num_grow_ch=32): 10 | super(ResidualDenseBlock, self).__init__() 11 | self.conv1 = torch.nn.Conv2d(num_feat, num_grow_ch, 3, 1, 1) 12 | self.conv2 = torch.nn.Conv2d(num_feat + num_grow_ch, num_grow_ch, 3, 1, 1) 13 | self.conv3 = torch.nn.Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, 3, 1, 1) 14 | self.conv4 = torch.nn.Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, 3, 1, 1) 15 | self.conv5 = torch.nn.Conv2d(num_feat + 4 * num_grow_ch, num_feat, 3, 1, 1) 16 | self.lrelu = torch.nn.LeakyReLU(negative_slope=0.2, inplace=True) 17 | 18 | def forward(self, x): 19 | x1 = self.lrelu(self.conv1(x)) 20 | x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1))) 21 | x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1))) 22 | x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1))) 23 | x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1)) 24 | return x5 * 0.2 + x 25 | 26 | 27 | class RRDB(torch.nn.Module): 28 | 29 | def __init__(self, num_feat, num_grow_ch=32): 30 | super(RRDB, self).__init__() 31 | self.rdb1 = ResidualDenseBlock(num_feat, num_grow_ch) 32 | self.rdb2 = ResidualDenseBlock(num_feat, num_grow_ch) 33 | self.rdb3 = ResidualDenseBlock(num_feat, num_grow_ch) 34 | 35 | def forward(self, x): 36 | out = self.rdb1(x) 37 | out = self.rdb2(out) 38 | out = self.rdb3(out) 39 | return out * 0.2 + x 40 | 41 | 42 | class RRDBNet(torch.nn.Module): 43 | 44 | def __init__(self, num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32): 45 | super(RRDBNet, self).__init__() 46 | self.conv_first = torch.nn.Conv2d(num_in_ch, num_feat, 3, 1, 1) 47 | self.body = torch.torch.nn.Sequential(*[RRDB(num_feat=num_feat, num_grow_ch=num_grow_ch) for _ in range(num_block)]) 48 | self.conv_body = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1) 49 | # upsample 50 | self.conv_up1 = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1) 51 | self.conv_up2 = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1) 52 | self.conv_hr = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1) 53 | self.conv_last = torch.nn.Conv2d(num_feat, num_out_ch, 3, 1, 1) 54 | self.lrelu = torch.nn.LeakyReLU(negative_slope=0.2, inplace=True) 55 | 56 | def forward(self, x): 57 | feat = x 58 | feat = self.conv_first(feat) 59 | body_feat = self.conv_body(self.body(feat)) 60 | feat = feat + body_feat 61 | # upsample 62 | feat = repeat(feat, "B C H W -> B C (H 2) (W 2)") 63 | feat = self.lrelu(self.conv_up1(feat)) 64 | feat = repeat(feat, "B C H W -> B C (H 2) (W 2)") 65 | feat = self.lrelu(self.conv_up2(feat)) 66 | out = self.conv_last(self.lrelu(self.conv_hr(feat))) 67 | return out 68 | 69 | 70 | class ESRGAN(torch.nn.Module): 71 | def __init__(self, model): 72 | super().__init__() 73 | self.model = model 74 | 75 | @staticmethod 76 | def from_pretrained(model_path): 77 | model = RRDBNet() 
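# the checkpoint loaded below is expected to store its EMA weights under the "params_ema" key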
78 | state_dict = torch.load(model_path, map_location="cpu")["params_ema"] 79 | model.load_state_dict(state_dict) 80 | model.eval() 81 | return ESRGAN(model) 82 | 83 | def process_image(self, image): 84 | image = torch.Tensor(np.array(image, dtype=np.float32) / 255).permute(2, 0, 1) 85 | return image 86 | 87 | def process_images(self, images): 88 | images = [self.process_image(image) for image in images] 89 | images = torch.stack(images) 90 | return images 91 | 92 | def decode_images(self, images): 93 | images = (images.permute(0, 2, 3, 1) * 255).clip(0, 255).numpy().astype(np.uint8) 94 | images = [Image.fromarray(image) for image in images] 95 | return images 96 | 97 | @torch.no_grad() 98 | def upscale(self, images, batch_size=4, progress_bar=lambda x:x): 99 | # Preprocess 100 | input_tensor = self.process_images(images) 101 | 102 | # Interpolate 103 | output_tensor = [] 104 | for batch_id in progress_bar(range(0, input_tensor.shape[0], batch_size)): 105 | batch_id_ = min(batch_id + batch_size, input_tensor.shape[0]) 106 | batch_input_tensor = input_tensor[batch_id: batch_id_] 107 | batch_input_tensor = batch_input_tensor.to( 108 | device=self.model.conv_first.weight.device, 109 | dtype=self.model.conv_first.weight.dtype) 110 | batch_output_tensor = self.model(batch_input_tensor) 111 | output_tensor.append(batch_output_tensor.cpu()) 112 | 113 | # Output 114 | output_tensor = torch.concat(output_tensor, dim=0) 115 | 116 | # To images 117 | output_images = self.decode_images(output_tensor) 118 | return output_images 119 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/extensions/FastBlend/__init__.py: -------------------------------------------------------------------------------- 1 | from .runners.fast import TableManager, PyramidPatchMatcher 2 | from PIL import Image 3 | import numpy as np 4 | import cupy as cp 5 | 6 | 7 | class FastBlendSmoother: 8 | def __init__(self): 9 | self.batch_size = 8 10 | self.window_size = 64 11 | self.ebsynth_config = { 12 | "minimum_patch_size": 5, 13 | "threads_per_block": 8, 14 | "num_iter": 5, 15 | "gpu_id": 0, 16 | "guide_weight": 10.0, 17 | "initialize": "identity", 18 | "tracking_window_size": 0, 19 | } 20 | 21 | @staticmethod 22 | def from_model_manager(model_manager): 23 | # TODO: fetch GPU ID from model_manager 24 | return FastBlendSmoother() 25 | 26 | def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config): 27 | frames_guide = [np.array(frame) for frame in frames_guide] 28 | frames_style = [np.array(frame) for frame in frames_style] 29 | table_manager = TableManager() 30 | patch_match_engine = PyramidPatchMatcher( 31 | image_height=frames_style[0].shape[0], 32 | image_width=frames_style[0].shape[1], 33 | channel=3, 34 | **ebsynth_config 35 | ) 36 | # left part 37 | table_l = table_manager.build_remapping_table(frames_guide, frames_style, patch_match_engine, batch_size, desc="FastBlend Step 1/4") 38 | table_l = table_manager.remapping_table_to_blending_table(table_l) 39 | table_l = table_manager.process_window_sum(frames_guide, table_l, patch_match_engine, window_size, batch_size, desc="FastBlend Step 2/4") 40 | # right part 41 | table_r = table_manager.build_remapping_table(frames_guide[::-1], frames_style[::-1], patch_match_engine, batch_size, desc="FastBlend Step 3/4") 42 | table_r = table_manager.remapping_table_to_blending_table(table_r) 43 | table_r = table_manager.process_window_sum(frames_guide[::-1], table_r, patch_match_engine, window_size, batch_size, 
desc="FastBlend Step 4/4")[::-1] 44 | # merge 45 | frames = [] 46 | for (frame_l, weight_l), frame_m, (frame_r, weight_r) in zip(table_l, frames_style, table_r): 47 | weight_m = -1 48 | weight = weight_l + weight_m + weight_r 49 | frame = frame_l * (weight_l / weight) + frame_m * (weight_m / weight) + frame_r * (weight_r / weight) 50 | frames.append(frame) 51 | frames = [Image.fromarray(frame.clip(0, 255).astype("uint8")) for frame in frames] 52 | return frames 53 | 54 | def __call__(self, rendered_frames, original_frames=None, **kwargs): 55 | frames = self.run( 56 | original_frames, rendered_frames, 57 | self.batch_size, self.window_size, self.ebsynth_config 58 | ) 59 | mempool = cp.get_default_memory_pool() 60 | pinned_mempool = cp.get_default_pinned_memory_pool() 61 | mempool.free_all_blocks() 62 | pinned_mempool.free_all_blocks() 63 | return frames -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/extensions/FastBlend/cupy_kernels.py: -------------------------------------------------------------------------------- 1 | import cupy as cp 2 | 3 | remapping_kernel = cp.RawKernel(r''' 4 | extern "C" __global__ 5 | void remap( 6 | const int height, 7 | const int width, 8 | const int channel, 9 | const int patch_size, 10 | const int pad_size, 11 | const float* source_style, 12 | const int* nnf, 13 | float* target_style 14 | ) { 15 | const int r = (patch_size - 1) / 2; 16 | const int x = blockDim.x * blockIdx.x + threadIdx.x; 17 | const int y = blockDim.y * blockIdx.y + threadIdx.y; 18 | if (x >= height or y >= width) return; 19 | const int z = blockIdx.z * (height + pad_size * 2) * (width + pad_size * 2) * channel; 20 | const int pid = (x + pad_size) * (width + pad_size * 2) + (y + pad_size); 21 | const int min_px = x < r ? -x : -r; 22 | const int max_px = x + r > height - 1 ? height - 1 - x : r; 23 | const int min_py = y < r ? -y : -r; 24 | const int max_py = y + r > width - 1 ? 
width - 1 - y : r; 25 | int num = 0; 26 | for (int px = min_px; px <= max_px; px++){ 27 | for (int py = min_py; py <= max_py; py++){ 28 | const int nid = (x + px) * width + y + py; 29 | const int x_ = nnf[blockIdx.z * height * width * 2 + nid*2 + 0] - px; 30 | const int y_ = nnf[blockIdx.z * height * width * 2 + nid*2 + 1] - py; 31 | if (x_ < 0 or y_ < 0 or x_ >= height or y_ >= width)continue; 32 | const int pid_ = (x_ + pad_size) * (width + pad_size * 2) + (y_ + pad_size); 33 | num++; 34 | for (int c = 0; c < channel; c++){ 35 | target_style[z + pid * channel + c] += source_style[z + pid_ * channel + c]; 36 | } 37 | } 38 | } 39 | for (int c = 0; c < channel; c++){ 40 | target_style[z + pid * channel + c] /= num; 41 | } 42 | } 43 | ''', 'remap') 44 | 45 | 46 | patch_error_kernel = cp.RawKernel(r''' 47 | extern "C" __global__ 48 | void patch_error( 49 | const int height, 50 | const int width, 51 | const int channel, 52 | const int patch_size, 53 | const int pad_size, 54 | const float* source, 55 | const int* nnf, 56 | const float* target, 57 | float* error 58 | ) { 59 | const int r = (patch_size - 1) / 2; 60 | const int x = blockDim.x * blockIdx.x + threadIdx.x; 61 | const int y = blockDim.y * blockIdx.y + threadIdx.y; 62 | const int z = blockIdx.z * (height + pad_size * 2) * (width + pad_size * 2) * channel; 63 | if (x >= height or y >= width) return; 64 | const int x_ = nnf[blockIdx.z * height * width * 2 + (x * width + y)*2 + 0]; 65 | const int y_ = nnf[blockIdx.z * height * width * 2 + (x * width + y)*2 + 1]; 66 | float e = 0; 67 | for (int px = -r; px <= r; px++){ 68 | for (int py = -r; py <= r; py++){ 69 | const int pid = (x + pad_size + px) * (width + pad_size * 2) + y + pad_size + py; 70 | const int pid_ = (x_ + pad_size + px) * (width + pad_size * 2) + y_ + pad_size + py; 71 | for (int c = 0; c < channel; c++){ 72 | const float diff = target[z + pid * channel + c] - source[z + pid_ * channel + c]; 73 | e += diff * diff; 74 | } 75 | } 76 | } 77 | error[blockIdx.z * height * width + x * width + y] = e; 78 | } 79 | ''', 'patch_error') 80 | 81 | 82 | pairwise_patch_error_kernel = cp.RawKernel(r''' 83 | extern "C" __global__ 84 | void pairwise_patch_error( 85 | const int height, 86 | const int width, 87 | const int channel, 88 | const int patch_size, 89 | const int pad_size, 90 | const float* source_a, 91 | const int* nnf_a, 92 | const float* source_b, 93 | const int* nnf_b, 94 | float* error 95 | ) { 96 | const int r = (patch_size - 1) / 2; 97 | const int x = blockDim.x * blockIdx.x + threadIdx.x; 98 | const int y = blockDim.y * blockIdx.y + threadIdx.y; 99 | const int z = blockIdx.z * (height + pad_size * 2) * (width + pad_size * 2) * channel; 100 | if (x >= height or y >= width) return; 101 | const int z_nnf = blockIdx.z * height * width * 2 + (x * width + y) * 2; 102 | const int x_a = nnf_a[z_nnf + 0]; 103 | const int y_a = nnf_a[z_nnf + 1]; 104 | const int x_b = nnf_b[z_nnf + 0]; 105 | const int y_b = nnf_b[z_nnf + 1]; 106 | float e = 0; 107 | for (int px = -r; px <= r; px++){ 108 | for (int py = -r; py <= r; py++){ 109 | const int pid_a = (x_a + pad_size + px) * (width + pad_size * 2) + y_a + pad_size + py; 110 | const int pid_b = (x_b + pad_size + px) * (width + pad_size * 2) + y_b + pad_size + py; 111 | for (int c = 0; c < channel; c++){ 112 | const float diff = source_a[z + pid_a * channel + c] - source_b[z + pid_b * channel + c]; 113 | e += diff * diff; 114 | } 115 | } 116 | } 117 | error[blockIdx.z * height * width + x * width + y] = e; 118 | } 119 | ''', 
'pairwise_patch_error') 120 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/extensions/FastBlend/data.py: -------------------------------------------------------------------------------- 1 | import imageio, os 2 | import numpy as np 3 | from PIL import Image 4 | 5 | 6 | def read_video(file_name): 7 | reader = imageio.get_reader(file_name) 8 | video = [] 9 | for frame in reader: 10 | frame = np.array(frame) 11 | video.append(frame) 12 | reader.close() 13 | return video 14 | 15 | 16 | def get_video_fps(file_name): 17 | reader = imageio.get_reader(file_name) 18 | fps = reader.get_meta_data()["fps"] 19 | reader.close() 20 | return fps 21 | 22 | 23 | def save_video(frames_path, video_path, num_frames, fps): 24 | writer = imageio.get_writer(video_path, fps=fps, quality=9) 25 | for i in range(num_frames): 26 | frame = np.array(Image.open(os.path.join(frames_path, "%05d.png" % i))) 27 | writer.append_data(frame) 28 | writer.close() 29 | return video_path 30 | 31 | 32 | class LowMemoryVideo: 33 | def __init__(self, file_name): 34 | self.reader = imageio.get_reader(file_name) 35 | 36 | def __len__(self): 37 | return self.reader.count_frames() 38 | 39 | def __getitem__(self, item): 40 | return np.array(self.reader.get_data(item)) 41 | 42 | def __del__(self): 43 | self.reader.close() 44 | 45 | 46 | def split_file_name(file_name): 47 | result = [] 48 | number = -1 49 | for i in file_name: 50 | if ord(i)>=ord("0") and ord(i)<=ord("9"): 51 | if number == -1: 52 | number = 0 53 | number = number*10 + ord(i) - ord("0") 54 | else: 55 | if number != -1: 56 | result.append(number) 57 | number = -1 58 | result.append(i) 59 | if number != -1: 60 | result.append(number) 61 | result = tuple(result) 62 | return result 63 | 64 | 65 | def search_for_images(folder): 66 | file_list = [i for i in os.listdir(folder) if i.endswith(".jpg") or i.endswith(".png")] 67 | file_list = [(split_file_name(file_name), file_name) for file_name in file_list] 68 | file_list = [i[1] for i in sorted(file_list)] 69 | file_list = [os.path.join(folder, i) for i in file_list] 70 | return file_list 71 | 72 | 73 | def read_images(folder): 74 | file_list = search_for_images(folder) 75 | frames = [np.array(Image.open(i)) for i in file_list] 76 | return frames 77 | 78 | 79 | class LowMemoryImageFolder: 80 | def __init__(self, folder, file_list=None): 81 | if file_list is None: 82 | self.file_list = search_for_images(folder) 83 | else: 84 | self.file_list = [os.path.join(folder, file_name) for file_name in file_list] 85 | 86 | def __len__(self): 87 | return len(self.file_list) 88 | 89 | def __getitem__(self, item): 90 | return np.array(Image.open(self.file_list[item])) 91 | 92 | def __del__(self): 93 | pass 94 | 95 | 96 | class VideoData: 97 | def __init__(self, video_file, image_folder, **kwargs): 98 | if video_file is not None: 99 | self.data_type = "video" 100 | self.data = LowMemoryVideo(video_file, **kwargs) 101 | elif image_folder is not None: 102 | self.data_type = "images" 103 | self.data = LowMemoryImageFolder(image_folder, **kwargs) 104 | else: 105 | raise ValueError("Cannot open video or image folder") 106 | self.length = None 107 | self.height = None 108 | self.width = None 109 | 110 | def raw_data(self): 111 | frames = [] 112 | for i in range(self.__len__()): 113 | frames.append(self.__getitem__(i)) 114 | return frames 115 | 116 | def set_length(self, length): 117 | self.length = length 118 | 119 | def set_shape(self, height, width): 120 | self.height = height 121 | 
self.width = width 122 | 123 | def __len__(self): 124 | if self.length is None: 125 | return len(self.data) 126 | else: 127 | return self.length 128 | 129 | def shape(self): 130 | if self.height is not None and self.width is not None: 131 | return self.height, self.width 132 | else: 133 | height, width, _ = self.__getitem__(0).shape 134 | return height, width 135 | 136 | def __getitem__(self, item): 137 | frame = self.data.__getitem__(item) 138 | height, width, _ = frame.shape 139 | if self.height is not None and self.width is not None: 140 | if self.height != height or self.width != width: 141 | frame = Image.fromarray(frame).resize((self.width, self.height)) 142 | frame = np.array(frame) 143 | return frame 144 | 145 | def __del__(self): 146 | pass 147 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/extensions/FastBlend/runners/__init__.py: -------------------------------------------------------------------------------- 1 | from .accurate import AccurateModeRunner 2 | from .fast import FastModeRunner 3 | from .balanced import BalancedModeRunner 4 | from .interpolation import InterpolationModeRunner, InterpolationModeSingleFrameRunner 5 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/extensions/FastBlend/runners/accurate.py: -------------------------------------------------------------------------------- 1 | from ..patch_match import PyramidPatchMatcher 2 | import os 3 | import numpy as np 4 | from PIL import Image 5 | from tqdm import tqdm 6 | 7 | 8 | class AccurateModeRunner: 9 | def __init__(self): 10 | pass 11 | 12 | def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config, desc="Accurate Mode", save_path=None): 13 | patch_match_engine = PyramidPatchMatcher( 14 | image_height=frames_style[0].shape[0], 15 | image_width=frames_style[0].shape[1], 16 | channel=3, 17 | use_mean_target_style=True, 18 | **ebsynth_config 19 | ) 20 | # run 21 | n = len(frames_style) 22 | for target in tqdm(range(n), desc=desc): 23 | l, r = max(target - window_size, 0), min(target + window_size + 1, n) 24 | remapped_frames = [] 25 | for i in range(l, r, batch_size): 26 | j = min(i + batch_size, r) 27 | source_guide = np.stack([frames_guide[source] for source in range(i, j)]) 28 | target_guide = np.stack([frames_guide[target]] * (j - i)) 29 | source_style = np.stack([frames_style[source] for source in range(i, j)]) 30 | _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style) 31 | remapped_frames.append(target_style) 32 | frame = np.concatenate(remapped_frames, axis=0).mean(axis=0) 33 | frame = frame.clip(0, 255).astype("uint8") 34 | if save_path is not None: 35 | Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % target)) -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/extensions/FastBlend/runners/balanced.py: -------------------------------------------------------------------------------- 1 | from ..patch_match import PyramidPatchMatcher 2 | import os 3 | import numpy as np 4 | from PIL import Image 5 | from tqdm import tqdm 6 | 7 | 8 | class BalancedModeRunner: 9 | def __init__(self): 10 | pass 11 | 12 | def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config, desc="Balanced Mode", save_path=None): 13 | patch_match_engine = PyramidPatchMatcher( 14 | image_height=frames_style[0].shape[0], 15 | 
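# the patch matcher is built at the native resolution of the style frames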
image_width=frames_style[0].shape[1], 16 | channel=3, 17 | **ebsynth_config 18 | ) 19 | # tasks 20 | n = len(frames_style) 21 | tasks = [] 22 | for target in range(n): 23 | for source in range(target - window_size, target + window_size + 1): 24 | if source >= 0 and source < n and source != target: 25 | tasks.append((source, target)) 26 | # run 27 | frames = [(None, 1) for i in range(n)] 28 | for batch_id in tqdm(range(0, len(tasks), batch_size), desc=desc): 29 | tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))] 30 | source_guide = np.stack([frames_guide[source] for source, target in tasks_batch]) 31 | target_guide = np.stack([frames_guide[target] for source, target in tasks_batch]) 32 | source_style = np.stack([frames_style[source] for source, target in tasks_batch]) 33 | _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style) 34 | for (source, target), result in zip(tasks_batch, target_style): 35 | frame, weight = frames[target] 36 | if frame is None: 37 | frame = frames_style[target] 38 | frames[target] = ( 39 | frame * (weight / (weight + 1)) + result / (weight + 1), 40 | weight + 1 41 | ) 42 | if weight + 1 == min(n, target + window_size + 1) - max(0, target - window_size): 43 | frame = frame.clip(0, 255).astype("uint8") 44 | if save_path is not None: 45 | Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % target)) 46 | frames[target] = (None, 1) 47 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/extensions/FastBlend/runners/fast.py: -------------------------------------------------------------------------------- 1 | from ..patch_match import PyramidPatchMatcher 2 | import functools, os 3 | import numpy as np 4 | from PIL import Image 5 | from tqdm import tqdm 6 | 7 | 8 | class TableManager: 9 | def __init__(self): 10 | pass 11 | 12 | def task_list(self, n): 13 | tasks = [] 14 | max_level = 1 15 | while (1<<max_level)<=n: 16 | max_level += 1 17 | for i in range(n): 18 | j = i 19 | for level in range(max_level): 20 | if i&(1<<level): 21 | continue 22 | j |= 1<<level 23 | if j>=n: 24 | break 25 | meta_data = { 26 | "source": i, 27 | "target": j, 28 | "level": level + 1 29 | } 30 | tasks.append(meta_data) 31 | tasks.sort(key=functools.cmp_to_key(lambda u, v: u["level"]-v["level"])) 32 | return tasks 33 | 34 | def build_remapping_table(self, frames_guide, frames_style, patch_match_engine, batch_size, desc=""): 35 | n = len(frames_guide) 36 | tasks = self.task_list(n) 37 | remapping_table = [[(frames_style[i], 1)] for i in range(n)] 38 | for batch_id in tqdm(range(0, len(tasks), batch_size), desc=desc): 39 | tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))] 40 | source_guide = np.stack([frames_guide[task["source"]] for task in tasks_batch]) 41 | target_guide = np.stack([frames_guide[task["target"]] for task in tasks_batch]) 42 | source_style = np.stack([frames_style[task["source"]] for task in tasks_batch]) 43 | _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style) 44 | for task, result in zip(tasks_batch, target_style): 45 | target, level = task["target"], task["level"] 46 | if len(remapping_table[target])==level: 47 | remapping_table[target].append((result, 1)) 48 | else: 49 | frame, weight = remapping_table[target][level] 50 | remapping_table[target][level] = ( 51 | frame * (weight / (weight + 1)) + result / (weight + 1), 52 | weight + 1 53 | ) 54 | return remapping_table 55 | 56 | def remapping_table_to_blending_table(self, table): 57 | for i in range(len(table)): 58 | for j in range(1, len(table[i])): 59 | frame_1, weight_1 = table[i][j-1] 60 | frame_2, weight_2 = 
table[i][j] 61 | frame = (frame_1 + frame_2) / 2 62 | weight = weight_1 + weight_2 63 | table[i][j] = (frame, weight) 64 | return table 65 | 66 | def tree_query(self, leftbound, rightbound): 67 | node_list = [] 68 | node_index = rightbound 69 | while node_index>=leftbound: 70 | node_level = 0 71 | while (1<<node_level)&node_index and node_index-(1<<node_level+1)+1>=leftbound: 72 | node_level += 1 73 | node_list.append((node_index, node_level)) 74 | node_index -= 1<<node_level 75 | return node_list -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/extensions/FastBlend/runners/interpolation.py: -------------------------------------------------------------------------------- 30 | if index_style[0]>0: 31 | tasks = [] 32 | for m in range(index_style[0]): 33 | tasks.append((index_style[0], m, index_style[0])) 34 | task_group.append(tasks) 35 | # middle frames 36 | for l, r in zip(index_style[:-1], index_style[1:]): 37 | tasks = [] 38 | for m in range(l, r): 39 | tasks.append((l, m, r)) 40 | task_group.append(tasks) 41 | # last frame 42 | tasks = [] 43 | for m in range(index_style[-1], n): 44 | tasks.append((index_style[-1], m, index_style[-1])) 45 | task_group.append(tasks) 46 | return task_group 47 | 48 | def run(self, frames_guide, frames_style, index_style, batch_size, ebsynth_config, save_path=None): 49 | patch_match_engine = PyramidPatchMatcher( 50 | image_height=frames_style[0].shape[0], 51 | image_width=frames_style[0].shape[1], 52 | channel=3, 53 | use_mean_target_style=False, 54 | use_pairwise_patch_error=True, 55 | **ebsynth_config 56 | ) 57 | # task 58 | index_dict = self.get_index_dict(index_style) 59 | task_group = self.get_task_group(index_style, len(frames_guide)) 60 | # run 61 | for tasks in task_group: 62 | index_start, index_end = min([i[1] for i in tasks]), max([i[1] for i in tasks]) 63 | for batch_id in tqdm(range(0, len(tasks), batch_size), desc=f"Rendering frames {index_start}...{index_end}"): 64 | tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))] 65 | source_guide, target_guide, source_style = [], [], [] 66 | for l, m, r in tasks_batch: 67 | # l -> m 68 | source_guide.append(frames_guide[l]) 69 | target_guide.append(frames_guide[m]) 70 | source_style.append(frames_style[index_dict[l]]) 71 | # r -> m 72 | source_guide.append(frames_guide[r]) 73 | target_guide.append(frames_guide[m]) 74 | source_style.append(frames_style[index_dict[r]]) 75 | source_guide = np.stack(source_guide) 76 | target_guide = np.stack(target_guide) 77 | source_style = np.stack(source_style) 78 | _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style) 79 | if save_path is not None: 80 | for frame_l, frame_r, (l, m, r) in zip(target_style[0::2], target_style[1::2], tasks_batch): 81 | weight_l, weight_r = self.get_weight(l, m, r) 82 | frame = frame_l * weight_l + frame_r * weight_r 83 | frame = frame.clip(0, 255).astype("uint8") 84 | Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % m)) 85 | 86 | 87 | class InterpolationModeSingleFrameRunner: 88 | def __init__(self): 89 | pass 90 | 91 | def run(self, frames_guide, frames_style, index_style, batch_size, ebsynth_config, save_path=None): 92 | # check input 93 | tracking_window_size = ebsynth_config["tracking_window_size"] 94 | if tracking_window_size * 2 >= batch_size: 95 | raise ValueError("batch_size should be larger than track_window_size * 2") 96 | frame_style = frames_style[0] 97 | frame_guide = frames_guide[index_style[0]] 98 | patch_match_engine = PyramidPatchMatcher( 99 | image_height=frame_style.shape[0], 100 | image_width=frame_style.shape[1], 101 | channel=3, 102 | **ebsynth_config 103 | ) 104 | # run 105 | frame_id, n = 0, len(frames_guide) 106 | for i in tqdm(range(0, n, batch_size - tracking_window_size * 2), desc=f"Rendering frames 0...{n}"): 107 | if i + 
batch_size > n: 108 | l, r = max(n - batch_size, 0), n 109 | else: 110 | l, r = i, i + batch_size 111 | source_guide = np.stack([frame_guide] * (r-l)) 112 | target_guide = np.stack([frames_guide[i] for i in range(l, r)]) 113 | source_style = np.stack([frame_style] * (r-l)) 114 | _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style) 115 | for i, frame in zip(range(l, r), target_style): 116 | if i==frame_id: 117 | frame = frame.clip(0, 255).astype("uint8") 118 | Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % frame_id)) 119 | frame_id += 1 120 | if r < n and r-frame_id <= tracking_window_size: 121 | break 122 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/models/attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from einops import rearrange 3 | 4 | 5 | def low_version_attention(query, key, value, attn_bias=None): 6 | scale = 1 / query.shape[-1] ** 0.5 7 | query = query * scale 8 | attn = torch.matmul(query, key.transpose(-2, -1)) 9 | if attn_bias is not None: 10 | attn = attn + attn_bias 11 | attn = attn.softmax(-1) 12 | return attn @ value 13 | 14 | 15 | class Attention(torch.nn.Module): 16 | 17 | def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_kv=False, bias_out=False): 18 | super().__init__() 19 | dim_inner = head_dim * num_heads 20 | kv_dim = kv_dim if kv_dim is not None else q_dim 21 | self.num_heads = num_heads 22 | self.head_dim = head_dim 23 | 24 | self.to_q = torch.nn.Linear(q_dim, dim_inner, bias=bias_q) 25 | self.to_k = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv) 26 | self.to_v = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv) 27 | self.to_out = torch.nn.Linear(dim_inner, q_dim, bias=bias_out) 28 | 29 | def interact_with_ipadapter(self, hidden_states, q, ip_k, ip_v, scale=1.0): 30 | batch_size = q.shape[0] 31 | ip_k = ip_k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2) 32 | ip_v = ip_v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2) 33 | ip_hidden_states = torch.nn.functional.scaled_dot_product_attention(q, ip_k, ip_v) 34 | hidden_states = hidden_states + scale * ip_hidden_states 35 | return hidden_states 36 | 37 | def torch_forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None, ipadapter_kwargs=None, qkv_preprocessor=None): 38 | if encoder_hidden_states is None: 39 | encoder_hidden_states = hidden_states 40 | 41 | batch_size = encoder_hidden_states.shape[0] 42 | 43 | q = self.to_q(hidden_states) 44 | k = self.to_k(encoder_hidden_states) 45 | v = self.to_v(encoder_hidden_states) 46 | 47 | q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2) 48 | k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2) 49 | v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2) 50 | 51 | if qkv_preprocessor is not None: 52 | q, k, v = qkv_preprocessor(q, k, v) 53 | 54 | hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask) 55 | if ipadapter_kwargs is not None: 56 | hidden_states = self.interact_with_ipadapter(hidden_states, q, **ipadapter_kwargs) 57 | hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim) 58 | hidden_states = hidden_states.to(q.dtype) 59 | 60 | hidden_states = self.to_out(hidden_states) 61 | 62 | return hidden_states 63 | 64 | def xformers_forward(self, 
hidden_states, encoder_hidden_states=None, attn_mask=None): 65 | if encoder_hidden_states is None: 66 | encoder_hidden_states = hidden_states 67 | 68 | q = self.to_q(hidden_states) 69 | k = self.to_k(encoder_hidden_states) 70 | v = self.to_v(encoder_hidden_states) 71 | 72 | q = rearrange(q, "b f (n d) -> (b n) f d", n=self.num_heads) 73 | k = rearrange(k, "b f (n d) -> (b n) f d", n=self.num_heads) 74 | v = rearrange(v, "b f (n d) -> (b n) f d", n=self.num_heads) 75 | 76 | if attn_mask is not None: 77 | hidden_states = low_version_attention(q, k, v, attn_bias=attn_mask) 78 | else: 79 | import xformers.ops as xops 80 | hidden_states = xops.memory_efficient_attention(q, k, v) 81 | hidden_states = rearrange(hidden_states, "(b n) f d -> b f (n d)", n=self.num_heads) 82 | 83 | hidden_states = hidden_states.to(q.dtype) 84 | hidden_states = self.to_out(hidden_states) 85 | 86 | return hidden_states 87 | 88 | def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None, ipadapter_kwargs=None, qkv_preprocessor=None): 89 | return self.torch_forward(hidden_states, encoder_hidden_states=encoder_hidden_states, attn_mask=attn_mask, ipadapter_kwargs=ipadapter_kwargs, qkv_preprocessor=qkv_preprocessor) -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/models/hunyuan_dit_text_encoder.py: -------------------------------------------------------------------------------- 1 | from transformers import BertModel, BertConfig, T5EncoderModel, T5Config 2 | import torch 3 | 4 | 5 | 6 | class HunyuanDiTCLIPTextEncoder(BertModel): 7 | def __init__(self): 8 | config = BertConfig( 9 | _name_or_path = "", 10 | architectures = ["BertModel"], 11 | attention_probs_dropout_prob = 0.1, 12 | bos_token_id = 0, 13 | classifier_dropout = None, 14 | directionality = "bidi", 15 | eos_token_id = 2, 16 | hidden_act = "gelu", 17 | hidden_dropout_prob = 0.1, 18 | hidden_size = 1024, 19 | initializer_range = 0.02, 20 | intermediate_size = 4096, 21 | layer_norm_eps = 1e-12, 22 | max_position_embeddings = 512, 23 | model_type = "bert", 24 | num_attention_heads = 16, 25 | num_hidden_layers = 24, 26 | output_past = True, 27 | pad_token_id = 0, 28 | pooler_fc_size = 768, 29 | pooler_num_attention_heads = 12, 30 | pooler_num_fc_layers = 3, 31 | pooler_size_per_head = 128, 32 | pooler_type = "first_token_transform", 33 | position_embedding_type = "absolute", 34 | torch_dtype = "float32", 35 | transformers_version = "4.37.2", 36 | type_vocab_size = 2, 37 | use_cache = True, 38 | vocab_size = 47020 39 | ) 40 | super().__init__(config, add_pooling_layer=False) 41 | self.eval() 42 | 43 | def forward(self, input_ids, attention_mask, clip_skip=1): 44 | input_shape = input_ids.size() 45 | 46 | batch_size, seq_length = input_shape 47 | device = input_ids.device 48 | 49 | past_key_values_length = 0 50 | 51 | if attention_mask is None: 52 | attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) 53 | 54 | extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) 55 | 56 | embedding_output = self.embeddings( 57 | input_ids=input_ids, 58 | position_ids=None, 59 | token_type_ids=None, 60 | inputs_embeds=None, 61 | past_key_values_length=0, 62 | ) 63 | encoder_outputs = self.encoder( 64 | embedding_output, 65 | attention_mask=extended_attention_mask, 66 | head_mask=None, 67 | encoder_hidden_states=None, 68 | encoder_attention_mask=None, 69 | past_key_values=None, 70 | use_cache=False, 71 | 
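# keep all hidden states so that clip_skip can select an earlier layer's output below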
output_attentions=False, 72 | output_hidden_states=True, 73 | return_dict=True, 74 | ) 75 | all_hidden_states = encoder_outputs.hidden_states 76 | prompt_emb = all_hidden_states[-clip_skip] 77 | if clip_skip > 1: 78 | mean, std = all_hidden_states[-1].mean(), all_hidden_states[-1].std() 79 | prompt_emb = (prompt_emb - prompt_emb.mean()) / prompt_emb.std() * std + mean 80 | return prompt_emb 81 | 82 | def state_dict_converter(self): 83 | return HunyuanDiTCLIPTextEncoderStateDictConverter() 84 | 85 | 86 | 87 | class HunyuanDiTT5TextEncoder(T5EncoderModel): 88 | def __init__(self): 89 | config = T5Config( 90 | _name_or_path = "../HunyuanDiT/t2i/mt5", 91 | architectures = ["MT5ForConditionalGeneration"], 92 | classifier_dropout = 0.0, 93 | d_ff = 5120, 94 | d_kv = 64, 95 | d_model = 2048, 96 | decoder_start_token_id = 0, 97 | dense_act_fn = "gelu_new", 98 | dropout_rate = 0.1, 99 | eos_token_id = 1, 100 | feed_forward_proj = "gated-gelu", 101 | initializer_factor = 1.0, 102 | is_encoder_decoder = True, 103 | is_gated_act = True, 104 | layer_norm_epsilon = 1e-06, 105 | model_type = "t5", 106 | num_decoder_layers = 24, 107 | num_heads = 32, 108 | num_layers = 24, 109 | output_past = True, 110 | pad_token_id = 0, 111 | relative_attention_max_distance = 128, 112 | relative_attention_num_buckets = 32, 113 | tie_word_embeddings = False, 114 | tokenizer_class = "T5Tokenizer", 115 | transformers_version = "4.37.2", 116 | use_cache = True, 117 | vocab_size = 250112 118 | ) 119 | super().__init__(config) 120 | self.eval() 121 | 122 | def forward(self, input_ids, attention_mask, clip_skip=1): 123 | outputs = super().forward( 124 | input_ids=input_ids, 125 | attention_mask=attention_mask, 126 | output_hidden_states=True, 127 | ) 128 | prompt_emb = outputs.hidden_states[-clip_skip] 129 | if clip_skip > 1: 130 | mean, std = outputs.hidden_states[-1].mean(), outputs.hidden_states[-1].std() 131 | prompt_emb = (prompt_emb - prompt_emb.mean()) / prompt_emb.std() * std + mean 132 | return prompt_emb 133 | 134 | def state_dict_converter(self): 135 | return HunyuanDiTT5TextEncoderStateDictConverter() 136 | 137 | 138 | 139 | class HunyuanDiTCLIPTextEncoderStateDictConverter(): 140 | def __init__(self): 141 | pass 142 | 143 | def from_diffusers(self, state_dict): 144 | state_dict_ = {name[5:]: param for name, param in state_dict.items() if name.startswith("bert.")} 145 | return state_dict_ 146 | 147 | def from_civitai(self, state_dict): 148 | return self.from_diffusers(state_dict) 149 | 150 | 151 | class HunyuanDiTT5TextEncoderStateDictConverter(): 152 | def __init__(self): 153 | pass 154 | 155 | def from_diffusers(self, state_dict): 156 | state_dict_ = {name: param for name, param in state_dict.items() if name.startswith("encoder.")} 157 | state_dict_["shared.weight"] = state_dict["shared.weight"] 158 | return state_dict_ 159 | 160 | def from_civitai(self, state_dict): 161 | return self.from_diffusers(state_dict) 162 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/models/sd_ipadapter.py: -------------------------------------------------------------------------------- 1 | from .svd_image_encoder import SVDImageEncoder 2 | from .sdxl_ipadapter import IpAdapterImageProjModel, IpAdapterModule, SDXLIpAdapterStateDictConverter 3 | from transformers import CLIPImageProcessor 4 | import torch 5 | 6 | 7 | class IpAdapterCLIPImageEmbedder(SVDImageEncoder): 8 | def __init__(self): 9 | super().__init__() 10 | self.image_processor = CLIPImageProcessor() 11 | 12 | 
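# Preprocess the input image(s) with CLIPImageProcessor, match the encoder's device and dtype, then encode them with the underlying SVD image encoder.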
def forward(self, image): 13 | pixel_values = self.image_processor(images=image, return_tensors="pt").pixel_values 14 | pixel_values = pixel_values.to(device=self.embeddings.class_embedding.device, dtype=self.embeddings.class_embedding.dtype) 15 | return super().forward(pixel_values) 16 | 17 | 18 | class SDIpAdapter(torch.nn.Module): 19 | def __init__(self): 20 | super().__init__() 21 | shape_list = [(768, 320)] * 2 + [(768, 640)] * 2 + [(768, 1280)] * 5 + [(768, 640)] * 3 + [(768, 320)] * 3 + [(768, 1280)] * 1 22 | self.ipadapter_modules = torch.nn.ModuleList([IpAdapterModule(*shape) for shape in shape_list]) 23 | self.image_proj = IpAdapterImageProjModel(cross_attention_dim=768, clip_embeddings_dim=1024, clip_extra_context_tokens=4) 24 | self.set_full_adapter() 25 | 26 | def set_full_adapter(self): 27 | block_ids = [1, 4, 9, 12, 17, 20, 40, 43, 46, 50, 53, 56, 60, 63, 66, 29] 28 | self.call_block_id = {(i, 0): j for j, i in enumerate(block_ids)} 29 | 30 | def set_less_adapter(self): 31 | # IP-Adapter for SD v1.5 doesn't support this feature. 32 | self.set_full_adapter() 33 | 34 | def forward(self, hidden_states, scale=1.0): 35 | hidden_states = self.image_proj(hidden_states) 36 | hidden_states = hidden_states.view(1, -1, hidden_states.shape[-1]) 37 | ip_kv_dict = {} 38 | for (block_id, transformer_id) in self.call_block_id: 39 | ipadapter_id = self.call_block_id[(block_id, transformer_id)] 40 | ip_k, ip_v = self.ipadapter_modules[ipadapter_id](hidden_states) 41 | if block_id not in ip_kv_dict: 42 | ip_kv_dict[block_id] = {} 43 | ip_kv_dict[block_id][transformer_id] = { 44 | "ip_k": ip_k, 45 | "ip_v": ip_v, 46 | "scale": scale 47 | } 48 | return ip_kv_dict 49 | 50 | def state_dict_converter(self): 51 | return SDIpAdapterStateDictConverter() 52 | 53 | 54 | class SDIpAdapterStateDictConverter(SDXLIpAdapterStateDictConverter): 55 | def __init__(self): 56 | pass 57 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/models/sd_lora.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .sd_unet import SDUNetStateDictConverter, SDUNet 3 | from .sd_text_encoder import SDTextEncoderStateDictConverter, SDTextEncoder 4 | 5 | 6 | class SDLoRA: 7 | def __init__(self): 8 | pass 9 | 10 | def convert_state_dict(self, state_dict, lora_prefix="lora_unet_", alpha=1.0, device="cuda"): 11 | special_keys = { 12 | "down.blocks": "down_blocks", 13 | "up.blocks": "up_blocks", 14 | "mid.block": "mid_block", 15 | "proj.in": "proj_in", 16 | "proj.out": "proj_out", 17 | "transformer.blocks": "transformer_blocks", 18 | "to.q": "to_q", 19 | "to.k": "to_k", 20 | "to.v": "to_v", 21 | "to.out": "to_out", 22 | } 23 | state_dict_ = {} 24 | for key in state_dict: 25 | if ".lora_up" not in key: 26 | continue 27 | if not key.startswith(lora_prefix): 28 | continue 29 | weight_up = state_dict[key].to(device=device, dtype=torch.float16) 30 | weight_down = state_dict[key.replace(".lora_up", ".lora_down")].to(device=device, dtype=torch.float16) 31 | if len(weight_up.shape) == 4: 32 | weight_up = weight_up.squeeze(3).squeeze(2).to(torch.float32) 33 | weight_down = weight_down.squeeze(3).squeeze(2).to(torch.float32) 34 | lora_weight = alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3) 35 | else: 36 | lora_weight = alpha * torch.mm(weight_up, weight_down) 37 | target_name = key.split(".")[0].replace("_", ".")[len(lora_prefix):] + ".weight" 38 | for special_key in special_keys: 39 | target_name =
target_name.replace(special_key, special_keys[special_key]) 40 | state_dict_[target_name] = lora_weight.cpu() 41 | return state_dict_ 42 | 43 | def add_lora_to_unet(self, unet: SDUNet, state_dict_lora, alpha=1.0, device="cuda"): 44 | state_dict_unet = unet.state_dict() 45 | state_dict_lora = self.convert_state_dict(state_dict_lora, lora_prefix="lora_unet_", alpha=alpha, device=device) 46 | state_dict_lora = SDUNetStateDictConverter().from_diffusers(state_dict_lora) 47 | if len(state_dict_lora) > 0: 48 | for name in state_dict_lora: 49 | state_dict_unet[name] += state_dict_lora[name].to(device=device) 50 | unet.load_state_dict(state_dict_unet) 51 | 52 | def add_lora_to_text_encoder(self, text_encoder: SDTextEncoder, state_dict_lora, alpha=1.0, device="cuda"): 53 | state_dict_text_encoder = text_encoder.state_dict() 54 | state_dict_lora = self.convert_state_dict(state_dict_lora, lora_prefix="lora_te_", alpha=alpha, device=device) 55 | state_dict_lora = SDTextEncoderStateDictConverter().from_diffusers(state_dict_lora) 56 | if len(state_dict_lora) > 0: 57 | for name in state_dict_lora: 58 | state_dict_text_encoder[name] += state_dict_lora[name].to(device=device) 59 | text_encoder.load_state_dict(state_dict_text_encoder) 60 | 61 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/models/sdxl_ipadapter.py: -------------------------------------------------------------------------------- 1 | from .svd_image_encoder import SVDImageEncoder 2 | from transformers import CLIPImageProcessor 3 | import torch 4 | 5 | 6 | class IpAdapterXLCLIPImageEmbedder(SVDImageEncoder): 7 | def __init__(self): 8 | super().__init__(embed_dim=1664, encoder_intermediate_size=8192, projection_dim=1280, num_encoder_layers=48, num_heads=16, head_dim=104) 9 | self.image_processor = CLIPImageProcessor() 10 | 11 | def forward(self, image): 12 | pixel_values = self.image_processor(images=image, return_tensors="pt").pixel_values 13 | pixel_values = pixel_values.to(device=self.embeddings.class_embedding.device, dtype=self.embeddings.class_embedding.dtype) 14 | return super().forward(pixel_values) 15 | 16 | 17 | class IpAdapterImageProjModel(torch.nn.Module): 18 | def __init__(self, cross_attention_dim=2048, clip_embeddings_dim=1280, clip_extra_context_tokens=4): 19 | super().__init__() 20 | self.cross_attention_dim = cross_attention_dim 21 | self.clip_extra_context_tokens = clip_extra_context_tokens 22 | self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim) 23 | self.norm = torch.nn.LayerNorm(cross_attention_dim) 24 | 25 | def forward(self, image_embeds): 26 | clip_extra_context_tokens = self.proj(image_embeds).reshape(-1, self.clip_extra_context_tokens, self.cross_attention_dim) 27 | clip_extra_context_tokens = self.norm(clip_extra_context_tokens) 28 | return clip_extra_context_tokens 29 | 30 | 31 | class IpAdapterModule(torch.nn.Module): 32 | def __init__(self, input_dim, output_dim): 33 | super().__init__() 34 | self.to_k_ip = torch.nn.Linear(input_dim, output_dim, bias=False) 35 | self.to_v_ip = torch.nn.Linear(input_dim, output_dim, bias=False) 36 | 37 | def forward(self, hidden_states): 38 | ip_k = self.to_k_ip(hidden_states) 39 | ip_v = self.to_v_ip(hidden_states) 40 | return ip_k, ip_v 41 | 42 | 43 | class SDXLIpAdapter(torch.nn.Module): 44 | def __init__(self): 45 | super().__init__() 46 | shape_list = [(2048, 640)] * 4 + [(2048, 1280)] * 50 + [(2048, 640)] * 6 + [(2048, 1280)] * 10 47 | self.ipadapter_modules = 
torch.nn.ModuleList([IpAdapterModule(*shape) for shape in shape_list]) 48 | self.image_proj = IpAdapterImageProjModel() 49 | self.set_full_adapter() 50 | 51 | def set_full_adapter(self): 52 | map_list = sum([ 53 | [(7, i) for i in range(2)], 54 | [(10, i) for i in range(2)], 55 | [(15, i) for i in range(10)], 56 | [(18, i) for i in range(10)], 57 | [(25, i) for i in range(10)], 58 | [(28, i) for i in range(10)], 59 | [(31, i) for i in range(10)], 60 | [(35, i) for i in range(2)], 61 | [(38, i) for i in range(2)], 62 | [(41, i) for i in range(2)], 63 | [(21, i) for i in range(10)], 64 | ], []) 65 | self.call_block_id = {i: j for j, i in enumerate(map_list)} 66 | 67 | def set_less_adapter(self): 68 | map_list = sum([ 69 | [(7, i) for i in range(2)], 70 | [(10, i) for i in range(2)], 71 | [(15, i) for i in range(10)], 72 | [(18, i) for i in range(10)], 73 | [(25, i) for i in range(10)], 74 | [(28, i) for i in range(10)], 75 | [(31, i) for i in range(10)], 76 | [(35, i) for i in range(2)], 77 | [(38, i) for i in range(2)], 78 | [(41, i) for i in range(2)], 79 | [(21, i) for i in range(10)], 80 | ], []) 81 | self.call_block_id = {i: j for j, i in enumerate(map_list) if j>=34 and j<44} 82 | 83 | def forward(self, hidden_states, scale=1.0): 84 | hidden_states = self.image_proj(hidden_states) 85 | hidden_states = hidden_states.view(1, -1, hidden_states.shape[-1]) 86 | ip_kv_dict = {} 87 | for (block_id, transformer_id) in self.call_block_id: 88 | ipadapter_id = self.call_block_id[(block_id, transformer_id)] 89 | ip_k, ip_v = self.ipadapter_modules[ipadapter_id](hidden_states) 90 | if block_id not in ip_kv_dict: 91 | ip_kv_dict[block_id] = {} 92 | ip_kv_dict[block_id][transformer_id] = { 93 | "ip_k": ip_k, 94 | "ip_v": ip_v, 95 | "scale": scale 96 | } 97 | return ip_kv_dict 98 | 99 | def state_dict_converter(self): 100 | return SDXLIpAdapterStateDictConverter() 101 | 102 | 103 | class SDXLIpAdapterStateDictConverter: 104 | def __init__(self): 105 | pass 106 | 107 | def from_diffusers(self, state_dict): 108 | state_dict_ = {} 109 | for name in state_dict["ip_adapter"]: 110 | names = name.split(".") 111 | layer_id = str(int(names[0]) // 2) 112 | name_ = ".".join(["ipadapter_modules"] + [layer_id] + names[1:]) 113 | state_dict_[name_] = state_dict["ip_adapter"][name] 114 | for name in state_dict["image_proj"]: 115 | name_ = "image_proj." 
+ name 116 | state_dict_[name_] = state_dict["image_proj"][name] 117 | return state_dict_ 118 | 119 | def from_civitai(self, state_dict): 120 | return self.from_diffusers(state_dict) 121 | 122 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/models/sdxl_motion.py: -------------------------------------------------------------------------------- 1 | from .sd_motion import TemporalBlock 2 | import torch 3 | 4 | 5 | 6 | class SDXLMotionModel(torch.nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | self.motion_modules = torch.nn.ModuleList([ 10 | TemporalBlock(8, 320//8, 320, eps=1e-6), 11 | TemporalBlock(8, 320//8, 320, eps=1e-6), 12 | 13 | TemporalBlock(8, 640//8, 640, eps=1e-6), 14 | TemporalBlock(8, 640//8, 640, eps=1e-6), 15 | 16 | TemporalBlock(8, 1280//8, 1280, eps=1e-6), 17 | TemporalBlock(8, 1280//8, 1280, eps=1e-6), 18 | 19 | TemporalBlock(8, 1280//8, 1280, eps=1e-6), 20 | TemporalBlock(8, 1280//8, 1280, eps=1e-6), 21 | TemporalBlock(8, 1280//8, 1280, eps=1e-6), 22 | 23 | TemporalBlock(8, 640//8, 640, eps=1e-6), 24 | TemporalBlock(8, 640//8, 640, eps=1e-6), 25 | TemporalBlock(8, 640//8, 640, eps=1e-6), 26 | 27 | TemporalBlock(8, 320//8, 320, eps=1e-6), 28 | TemporalBlock(8, 320//8, 320, eps=1e-6), 29 | TemporalBlock(8, 320//8, 320, eps=1e-6), 30 | ]) 31 | self.call_block_id = { 32 | 0: 0, 33 | 2: 1, 34 | 7: 2, 35 | 10: 3, 36 | 15: 4, 37 | 18: 5, 38 | 25: 6, 39 | 28: 7, 40 | 31: 8, 41 | 35: 9, 42 | 38: 10, 43 | 41: 11, 44 | 44: 12, 45 | 46: 13, 46 | 48: 14, 47 | } 48 | 49 | def forward(self): 50 | pass 51 | 52 | def state_dict_converter(self): 53 | return SDMotionModelStateDictConverter() 54 | 55 | 56 | class SDMotionModelStateDictConverter: 57 | def __init__(self): 58 | pass 59 | 60 | def from_diffusers(self, state_dict): 61 | rename_dict = { 62 | "norm": "norm", 63 | "proj_in": "proj_in", 64 | "transformer_blocks.0.attention_blocks.0.to_q": "transformer_blocks.0.attn1.to_q", 65 | "transformer_blocks.0.attention_blocks.0.to_k": "transformer_blocks.0.attn1.to_k", 66 | "transformer_blocks.0.attention_blocks.0.to_v": "transformer_blocks.0.attn1.to_v", 67 | "transformer_blocks.0.attention_blocks.0.to_out.0": "transformer_blocks.0.attn1.to_out", 68 | "transformer_blocks.0.attention_blocks.0.pos_encoder": "transformer_blocks.0.pe1", 69 | "transformer_blocks.0.attention_blocks.1.to_q": "transformer_blocks.0.attn2.to_q", 70 | "transformer_blocks.0.attention_blocks.1.to_k": "transformer_blocks.0.attn2.to_k", 71 | "transformer_blocks.0.attention_blocks.1.to_v": "transformer_blocks.0.attn2.to_v", 72 | "transformer_blocks.0.attention_blocks.1.to_out.0": "transformer_blocks.0.attn2.to_out", 73 | "transformer_blocks.0.attention_blocks.1.pos_encoder": "transformer_blocks.0.pe2", 74 | "transformer_blocks.0.norms.0": "transformer_blocks.0.norm1", 75 | "transformer_blocks.0.norms.1": "transformer_blocks.0.norm2", 76 | "transformer_blocks.0.ff.net.0.proj": "transformer_blocks.0.act_fn.proj", 77 | "transformer_blocks.0.ff.net.2": "transformer_blocks.0.ff", 78 | "transformer_blocks.0.ff_norm": "transformer_blocks.0.norm3", 79 | "proj_out": "proj_out", 80 | } 81 | name_list = sorted([i for i in state_dict if i.startswith("down_blocks.")]) 82 | name_list += sorted([i for i in state_dict if i.startswith("mid_block.")]) 83 | name_list += sorted([i for i in state_dict if i.startswith("up_blocks.")]) 84 | state_dict_ = {} 85 | last_prefix, module_id = "", -1 86 | for name in name_list: 87 | names = name.split(".") 88 | prefix_index = 
names.index("temporal_transformer") + 1 89 | prefix = ".".join(names[:prefix_index]) 90 | if prefix != last_prefix: 91 | last_prefix = prefix 92 | module_id += 1 93 | middle_name = ".".join(names[prefix_index:-1]) 94 | suffix = names[-1] 95 | if "pos_encoder" in names: 96 | rename = ".".join(["motion_modules", str(module_id), rename_dict[middle_name]]) 97 | else: 98 | rename = ".".join(["motion_modules", str(module_id), rename_dict[middle_name], suffix]) 99 | state_dict_[rename] = state_dict[name] 100 | return state_dict_ 101 | 102 | def from_civitai(self, state_dict): 103 | return self.from_diffusers(state_dict) 104 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/models/sdxl_vae_decoder.py: -------------------------------------------------------------------------------- 1 | from .sd_vae_decoder import SDVAEDecoder, SDVAEDecoderStateDictConverter 2 | 3 | 4 | class SDXLVAEDecoder(SDVAEDecoder): 5 | def __init__(self): 6 | super().__init__() 7 | self.scaling_factor = 0.13025 8 | 9 | def state_dict_converter(self): 10 | return SDXLVAEDecoderStateDictConverter() 11 | 12 | 13 | class SDXLVAEDecoderStateDictConverter(SDVAEDecoderStateDictConverter): 14 | def __init__(self): 15 | super().__init__() 16 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/models/sdxl_vae_encoder.py: -------------------------------------------------------------------------------- 1 | from .sd_vae_encoder import SDVAEEncoderStateDictConverter, SDVAEEncoder 2 | 3 | 4 | class SDXLVAEEncoder(SDVAEEncoder): 5 | def __init__(self): 6 | super().__init__() 7 | self.scaling_factor = 0.13025 8 | 9 | def state_dict_converter(self): 10 | return SDXLVAEEncoderStateDictConverter() 11 | 12 | 13 | class SDXLVAEEncoderStateDictConverter(SDVAEEncoderStateDictConverter): 14 | def __init__(self): 15 | super().__init__() 16 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/models/tiler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from einops import rearrange, repeat 3 | 4 | 5 | class TileWorker: 6 | def __init__(self): 7 | pass 8 | 9 | 10 | def mask(self, height, width, border_width): 11 | # Create a mask with shape (height, width). 12 | # The centre area is filled with 1, and the border line is filled with values in range (0, 1]. 
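# i.e. mask[i, j] = min(i + 1, height - i, j + 1, width - j) / border_width, clipped to [0, 1],
# so the weight ramps up linearly from each tile edge and overlapping tiles blend smoothly in untile.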
13 | x = torch.arange(height).repeat(width, 1).T 14 | y = torch.arange(width).repeat(height, 1) 15 | mask = torch.stack([x + 1, height - x, y + 1, width - y]).min(dim=0).values 16 | mask = (mask / border_width).clip(0, 1) 17 | return mask 18 | 19 | 20 | def tile(self, model_input, tile_size, tile_stride, tile_device, tile_dtype): 21 | # Convert a tensor (b, c, h, w) to (b, c, tile_size, tile_size, tile_num) 22 | batch_size, channel, _, _ = model_input.shape 23 | model_input = model_input.to(device=tile_device, dtype=tile_dtype) 24 | unfold_operator = torch.nn.Unfold( 25 | kernel_size=(tile_size, tile_size), 26 | stride=(tile_stride, tile_stride) 27 | ) 28 | model_input = unfold_operator(model_input) 29 | model_input = model_input.view((batch_size, channel, tile_size, tile_size, -1)) 30 | 31 | return model_input 32 | 33 | 34 | def tiled_inference(self, forward_fn, model_input, tile_batch_size, inference_device, inference_dtype, tile_device, tile_dtype): 35 | # Call y=forward_fn(x) for each tile 36 | tile_num = model_input.shape[-1] 37 | model_output_stack = [] 38 | 39 | for tile_id in range(0, tile_num, tile_batch_size): 40 | 41 | # process input 42 | tile_id_ = min(tile_id + tile_batch_size, tile_num) 43 | x = model_input[:, :, :, :, tile_id: tile_id_] 44 | x = x.to(device=inference_device, dtype=inference_dtype) 45 | x = rearrange(x, "b c h w n -> (n b) c h w") 46 | 47 | # process output 48 | y = forward_fn(x) 49 | y = rearrange(y, "(n b) c h w -> b c h w n", n=tile_id_-tile_id) 50 | y = y.to(device=tile_device, dtype=tile_dtype) 51 | model_output_stack.append(y) 52 | 53 | model_output = torch.concat(model_output_stack, dim=-1) 54 | return model_output 55 | 56 | 57 | def io_scale(self, model_output, tile_size): 58 | # Determine the size modification that happened in forward_fn 59 | # We only consider the same scale on height and width.
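# e.g. a forward_fn that upsamples each tile 8x (like a VAE decoder) gives io_scale == 8, while one that downsamples 8x gives io_scale == 0.125.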
60 | io_scale = model_output.shape[2] / tile_size 61 | return io_scale 62 | 63 | 64 | def untile(self, model_output, height, width, tile_size, tile_stride, border_width, tile_device, tile_dtype): 65 | # The reversed function of tile 66 | mask = self.mask(tile_size, tile_size, border_width) 67 | mask = mask.to(device=tile_device, dtype=tile_dtype) 68 | mask = rearrange(mask, "h w -> 1 1 h w 1") 69 | model_output = model_output * mask 70 | 71 | fold_operator = torch.nn.Fold( 72 | output_size=(height, width), 73 | kernel_size=(tile_size, tile_size), 74 | stride=(tile_stride, tile_stride) 75 | ) 76 | mask = repeat(mask[0, 0, :, :, 0], "h w -> 1 (h w) n", n=model_output.shape[-1]) 77 | model_output = rearrange(model_output, "b c h w n -> b (c h w) n") 78 | model_output = fold_operator(model_output) / fold_operator(mask) 79 | 80 | return model_output 81 | 82 | 83 | def tiled_forward(self, forward_fn, model_input, tile_size, tile_stride, tile_batch_size=1, tile_device="cpu", tile_dtype=torch.float32, border_width=None): 84 | # Prepare 85 | inference_device, inference_dtype = model_input.device, model_input.dtype 86 | height, width = model_input.shape[2], model_input.shape[3] 87 | border_width = int(tile_stride*0.5) if border_width is None else border_width 88 | 89 | # tile 90 | model_input = self.tile(model_input, tile_size, tile_stride, tile_device, tile_dtype) 91 | 92 | # inference 93 | model_output = self.tiled_inference(forward_fn, model_input, tile_batch_size, inference_device, inference_dtype, tile_device, tile_dtype) 94 | 95 | # resize 96 | io_scale = self.io_scale(model_output, tile_size) 97 | height, width = int(height*io_scale), int(width*io_scale) 98 | tile_size, tile_stride = int(tile_size*io_scale), int(tile_stride*io_scale) 99 | border_width = int(border_width*io_scale) 100 | 101 | # untile 102 | model_output = self.untile(model_output, height, width, tile_size, tile_stride, border_width, tile_device, tile_dtype) 103 | 104 | # Done! 
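# Move the stitched result back to the original device and dtype of model_input before returning.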
105 | model_output = model_output.to(device=inference_device, dtype=inference_dtype) 106 | return model_output -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .stable_diffusion import SDImagePipeline 2 | from .stable_diffusion_xl import SDXLImagePipeline 3 | from .stable_diffusion_video import SDVideoPipeline, SDVideoPipelineRunner 4 | from .stable_diffusion_xl_video import SDXLVideoPipeline 5 | from .stable_video_diffusion import SVDVideoPipeline 6 | from .hunyuan_dit import HunyuanDiTImagePipeline 7 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/pipelines/dancer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from ..models import SDUNet, SDMotionModel, SDXLUNet, SDXLMotionModel 3 | from ..models.sd_unet import PushBlock, PopBlock 4 | from ..controlnets import MultiControlNetManager 5 | 6 | 7 | def lets_dance( 8 | unet: SDUNet, 9 | motion_modules: SDMotionModel = None, 10 | controlnet: MultiControlNetManager = None, 11 | sample = None, 12 | timestep = None, 13 | encoder_hidden_states = None, 14 | ipadapter_kwargs_list = {}, 15 | controlnet_frames = None, 16 | unet_batch_size = 1, 17 | controlnet_batch_size = 1, 18 | cross_frame_attention = False, 19 | tiled=False, 20 | tile_size=64, 21 | tile_stride=32, 22 | device = "cuda", 23 | vram_limit_level = 0, 24 | ): 25 | # 1. ControlNet 26 | # This part will be repeated on overlapping frames if animatediff_batch_size > animatediff_stride. 27 | # I leave it here because I intend to do something interesting on the ControlNets. 28 | controlnet_insert_block_id = 30 29 | if controlnet is not None and controlnet_frames is not None: 30 | res_stacks = [] 31 | # process controlnet frames with batch 32 | for batch_id in range(0, sample.shape[0], controlnet_batch_size): 33 | batch_id_ = min(batch_id + controlnet_batch_size, sample.shape[0]) 34 | res_stack = controlnet( 35 | sample[batch_id: batch_id_], 36 | timestep, 37 | encoder_hidden_states[batch_id: batch_id_], 38 | controlnet_frames[:, batch_id: batch_id_], 39 | tiled=tiled, tile_size=tile_size, tile_stride=tile_stride 40 | ) 41 | if vram_limit_level >= 1: 42 | res_stack = [res.cpu() for res in res_stack] 43 | res_stacks.append(res_stack) 44 | # concat the residual 45 | additional_res_stack = [] 46 | for i in range(len(res_stacks[0])): 47 | res = torch.concat([res_stack[i] for res_stack in res_stacks], dim=0) 48 | additional_res_stack.append(res) 49 | else: 50 | additional_res_stack = None 51 | 52 | # 2. time 53 | time_emb = unet.time_proj(timestep[None]).to(sample.dtype) 54 | time_emb = unet.time_embedding(time_emb) 55 | 56 | # 3. pre-process 57 | height, width = sample.shape[2], sample.shape[3] 58 | hidden_states = unet.conv_in(sample) 59 | text_emb = encoder_hidden_states 60 | res_stack = [hidden_states.cpu() if vram_limit_level>=1 else hidden_states] 61 | 62 | # 4. 
blocks 63 | for block_id, block in enumerate(unet.blocks): 64 | # 4.1 UNet 65 | if isinstance(block, PushBlock): 66 | hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack) 67 | if vram_limit_level>=1: 68 | res_stack[-1] = res_stack[-1].cpu() 69 | elif isinstance(block, PopBlock): 70 | if vram_limit_level>=1: 71 | res_stack[-1] = res_stack[-1].to(device) 72 | hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack) 73 | else: 74 | hidden_states_input = hidden_states 75 | hidden_states_output = [] 76 | for batch_id in range(0, sample.shape[0], unet_batch_size): 77 | batch_id_ = min(batch_id + unet_batch_size, sample.shape[0]) 78 | hidden_states, _, _, _ = block( 79 | hidden_states_input[batch_id: batch_id_], 80 | time_emb, 81 | text_emb[batch_id: batch_id_], 82 | res_stack, 83 | cross_frame_attention=cross_frame_attention, 84 | ipadapter_kwargs_list=ipadapter_kwargs_list.get(block_id, {}), 85 | tiled=tiled, tile_size=tile_size, tile_stride=tile_stride 86 | ) 87 | hidden_states_output.append(hidden_states) 88 | hidden_states = torch.concat(hidden_states_output, dim=0) 89 | # 4.2 AnimateDiff 90 | if motion_modules is not None: 91 | if block_id in motion_modules.call_block_id: 92 | motion_module_id = motion_modules.call_block_id[block_id] 93 | hidden_states, time_emb, text_emb, res_stack = motion_modules.motion_modules[motion_module_id]( 94 | hidden_states, time_emb, text_emb, res_stack, 95 | batch_size=1 96 | ) 97 | # 4.3 ControlNet 98 | if block_id == controlnet_insert_block_id and additional_res_stack is not None: 99 | hidden_states += additional_res_stack.pop().to(device) 100 | if vram_limit_level>=1: 101 | res_stack = [(res.to(device) + additional_res.to(device)).cpu() for res, additional_res in zip(res_stack, additional_res_stack)] 102 | else: 103 | res_stack = [res + additional_res for res, additional_res in zip(res_stack, additional_res_stack)] 104 | 105 | # 5. output 106 | hidden_states = unet.conv_norm_out(hidden_states) 107 | hidden_states = unet.conv_act(hidden_states) 108 | hidden_states = unet.conv_out(hidden_states) 109 | 110 | return hidden_states 111 | 112 | 113 | 114 | 115 | def lets_dance_xl( 116 | unet: SDXLUNet, 117 | motion_modules: SDXLMotionModel = None, 118 | controlnet: MultiControlNetManager = None, 119 | sample = None, 120 | add_time_id = None, 121 | add_text_embeds = None, 122 | timestep = None, 123 | encoder_hidden_states = None, 124 | ipadapter_kwargs_list = {}, 125 | controlnet_frames = None, 126 | unet_batch_size = 1, 127 | controlnet_batch_size = 1, 128 | cross_frame_attention = False, 129 | tiled=False, 130 | tile_size=64, 131 | tile_stride=32, 132 | device = "cuda", 133 | vram_limit_level = 0, 134 | ): 135 | # 2. time 136 | t_emb = unet.time_proj(timestep[None]).to(sample.dtype) 137 | t_emb = unet.time_embedding(t_emb) 138 | 139 | time_embeds = unet.add_time_proj(add_time_id) 140 | time_embeds = time_embeds.reshape((add_text_embeds.shape[0], -1)) 141 | add_embeds = torch.concat([add_text_embeds, time_embeds], dim=-1) 142 | add_embeds = add_embeds.to(sample.dtype) 143 | add_embeds = unet.add_time_embedding(add_embeds) 144 | 145 | time_emb = t_emb + add_embeds 146 | 147 | # 3. pre-process 148 | height, width = sample.shape[2], sample.shape[3] 149 | hidden_states = unet.conv_in(sample) 150 | text_emb = encoder_hidden_states 151 | res_stack = [hidden_states] 152 | 153 | # 4. 
blocks 154 | for block_id, block in enumerate(unet.blocks): 155 | hidden_states, time_emb, text_emb, res_stack = block( 156 | hidden_states, time_emb, text_emb, res_stack, 157 | tiled=tiled, tile_size=tile_size, tile_stride=tile_stride, 158 | ipadapter_kwargs_list=ipadapter_kwargs_list.get(block_id, {}) 159 | ) 160 | # 4.2 AnimateDiff 161 | if motion_modules is not None: 162 | if block_id in motion_modules.call_block_id: 163 | motion_module_id = motion_modules.call_block_id[block_id] 164 | hidden_states, time_emb, text_emb, res_stack = motion_modules.motion_modules[motion_module_id]( 165 | hidden_states, time_emb, text_emb, res_stack, 166 | batch_size=1 167 | ) 168 | 169 | # 5. output 170 | hidden_states = unet.conv_norm_out(hidden_states) 171 | hidden_states = unet.conv_act(hidden_states) 172 | hidden_states = unet.conv_out(hidden_states) 173 | 174 | return hidden_states -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/pipelines/stable_diffusion.py: -------------------------------------------------------------------------------- 1 | from ..models import ModelManager, SDTextEncoder, SDUNet, SDVAEDecoder, SDVAEEncoder, SDIpAdapter, IpAdapterCLIPImageEmbedder 2 | from ..controlnets import MultiControlNetManager, ControlNetUnit, ControlNetConfigUnit, Annotator 3 | from ..prompts import SDPrompter 4 | from ..schedulers import EnhancedDDIMScheduler 5 | from .dancer import lets_dance 6 | from typing import List 7 | import torch 8 | from tqdm import tqdm 9 | from PIL import Image 10 | import numpy as np 11 | 12 | 13 | class SDImagePipeline(torch.nn.Module): 14 | 15 | def __init__(self, device="cuda", torch_dtype=torch.float16): 16 | super().__init__() 17 | self.scheduler = EnhancedDDIMScheduler() 18 | self.prompter = SDPrompter() 19 | self.device = device 20 | self.torch_dtype = torch_dtype 21 | # models 22 | self.text_encoder: SDTextEncoder = None 23 | self.unet: SDUNet = None 24 | self.vae_decoder: SDVAEDecoder = None 25 | self.vae_encoder: SDVAEEncoder = None 26 | self.controlnet: MultiControlNetManager = None 27 | self.ipadapter_image_encoder: IpAdapterCLIPImageEmbedder = None 28 | self.ipadapter: SDIpAdapter = None 29 | 30 | 31 | def fetch_main_models(self, model_manager: ModelManager): 32 | self.text_encoder = model_manager.text_encoder 33 | self.unet = model_manager.unet 34 | self.vae_decoder = model_manager.vae_decoder 35 | self.vae_encoder = model_manager.vae_encoder 36 | 37 | 38 | def fetch_controlnet_models(self, model_manager: ModelManager, controlnet_config_units: List[ControlNetConfigUnit]=[]): 39 | controlnet_units = [] 40 | for config in controlnet_config_units: 41 | controlnet_unit = ControlNetUnit( 42 | Annotator(config.processor_id), 43 | model_manager.get_model_with_model_path(config.model_path), 44 | config.scale 45 | ) 46 | controlnet_units.append(controlnet_unit) 47 | self.controlnet = MultiControlNetManager(controlnet_units) 48 | 49 | 50 | def fetch_ipadapter(self, model_manager: ModelManager): 51 | if "ipadapter" in model_manager.model: 52 | self.ipadapter = model_manager.ipadapter 53 | if "ipadapter_image_encoder" in model_manager.model: 54 | self.ipadapter_image_encoder = model_manager.ipadapter_image_encoder 55 | 56 | 57 | def fetch_prompter(self, model_manager: ModelManager): 58 | self.prompter.load_from_model_manager(model_manager) 59 | 60 | 61 | @staticmethod 62 | def from_model_manager(model_manager: ModelManager, controlnet_config_units: List[ControlNetConfigUnit]=[]): 63 | pipe = SDImagePipeline( 64 | 
device=model_manager.device, 65 | torch_dtype=model_manager.torch_dtype, 66 | ) 67 | pipe.fetch_main_models(model_manager) 68 | pipe.fetch_prompter(model_manager) 69 | pipe.fetch_controlnet_models(model_manager, controlnet_config_units) 70 | pipe.fetch_ipadapter(model_manager) 71 | return pipe 72 | 73 | 74 | def preprocess_image(self, image): 75 | image = torch.Tensor(np.array(image, dtype=np.float32) * (2 / 255) - 1).permute(2, 0, 1).unsqueeze(0) 76 | return image 77 | 78 | 79 | def decode_image(self, latent, tiled=False, tile_size=64, tile_stride=32): 80 | image = self.vae_decoder(latent.to(self.device), tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)[0] 81 | image = image.cpu().permute(1, 2, 0).numpy() 82 | image = Image.fromarray(((image / 2 + 0.5).clip(0, 1) * 255).astype("uint8")) 83 | return image 84 | 85 | 86 | @torch.no_grad() 87 | def __call__( 88 | self, 89 | prompt, 90 | negative_prompt="", 91 | cfg_scale=7.5, 92 | clip_skip=1, 93 | input_image=None, 94 | ipadapter_images=None, 95 | ipadapter_scale=1.0, 96 | controlnet_image=None, 97 | denoising_strength=1.0, 98 | height=512, 99 | width=512, 100 | num_inference_steps=20, 101 | tiled=False, 102 | tile_size=64, 103 | tile_stride=32, 104 | progress_bar_cmd=tqdm, 105 | progress_bar_st=None, 106 | ): 107 | # Prepare scheduler 108 | self.scheduler.set_timesteps(num_inference_steps, denoising_strength) 109 | 110 | # Prepare latent tensors 111 | if input_image is not None: 112 | image = self.preprocess_image(input_image).to(device=self.device, dtype=self.torch_dtype) 113 | latents = self.vae_encoder(image, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride) 114 | noise = torch.randn((1, 4, height//8, width//8), device=self.device, dtype=self.torch_dtype) 115 | latents = self.scheduler.add_noise(latents, noise, timestep=self.scheduler.timesteps[0]) 116 | else: 117 | latents = torch.randn((1, 4, height//8, width//8), device=self.device, dtype=self.torch_dtype) 118 | 119 | # Encode prompts 120 | prompt_emb_posi = self.prompter.encode_prompt(self.text_encoder, prompt, clip_skip=clip_skip, device=self.device, positive=True) 121 | prompt_emb_nega = self.prompter.encode_prompt(self.text_encoder, negative_prompt, clip_skip=clip_skip, device=self.device, positive=False) 122 | 123 | # IP-Adapter 124 | if ipadapter_images is not None: 125 | ipadapter_image_encoding = self.ipadapter_image_encoder(ipadapter_images) 126 | ipadapter_kwargs_list_posi = self.ipadapter(ipadapter_image_encoding, scale=ipadapter_scale) 127 | ipadapter_kwargs_list_nega = self.ipadapter(torch.zeros_like(ipadapter_image_encoding)) 128 | else: 129 | ipadapter_kwargs_list_posi, ipadapter_kwargs_list_nega = {}, {} 130 | 131 | # Prepare ControlNets 132 | if controlnet_image is not None: 133 | controlnet_image = self.controlnet.process_image(controlnet_image).to(device=self.device, dtype=self.torch_dtype) 134 | controlnet_image = controlnet_image.unsqueeze(1) 135 | 136 | # Denoise 137 | for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)): 138 | timestep = torch.IntTensor((timestep,))[0].to(self.device) 139 | 140 | # Classifier-free guidance 141 | noise_pred_posi = lets_dance( 142 | self.unet, motion_modules=None, controlnet=self.controlnet, 143 | sample=latents, timestep=timestep, encoder_hidden_states=prompt_emb_posi, controlnet_frames=controlnet_image, 144 | tiled=tiled, tile_size=tile_size, tile_stride=tile_stride, 145 | ipadapter_kwargs_list=ipadapter_kwargs_list_posi, 146 | device=self.device, vram_limit_level=0 147 | ) 148 | 
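# Unconditional (negative-prompt) branch; merged below as noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega).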
noise_pred_nega = lets_dance( 149 | self.unet, motion_modules=None, controlnet=self.controlnet, 150 | sample=latents, timestep=timestep, encoder_hidden_states=prompt_emb_nega, controlnet_frames=controlnet_image, 151 | tiled=tiled, tile_size=tile_size, tile_stride=tile_stride, 152 | ipadapter_kwargs_list=ipadapter_kwargs_list_nega, 153 | device=self.device, vram_limit_level=0 154 | ) 155 | noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega) 156 | 157 | # DDIM 158 | latents = self.scheduler.step(noise_pred, timestep, latents) 159 | 160 | # UI 161 | if progress_bar_st is not None: 162 | progress_bar_st.progress(progress_id / len(self.scheduler.timesteps)) 163 | 164 | # Decode image 165 | image = self.decode_image(latents, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride) 166 | 167 | return image 168 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/pipelines/stable_diffusion_xl.py: -------------------------------------------------------------------------------- 1 | from ..models import ModelManager, SDXLTextEncoder, SDXLTextEncoder2, SDXLUNet, SDXLVAEDecoder, SDXLVAEEncoder, SDXLIpAdapter, IpAdapterXLCLIPImageEmbedder 2 | # TODO: SDXL ControlNet 3 | from ..prompts import SDXLPrompter 4 | from ..schedulers import EnhancedDDIMScheduler 5 | from .dancer import lets_dance_xl 6 | import torch 7 | from tqdm import tqdm 8 | from PIL import Image 9 | import numpy as np 10 | 11 | 12 | class SDXLImagePipeline(torch.nn.Module): 13 | 14 | def __init__(self, device="cuda", torch_dtype=torch.float16): 15 | super().__init__() 16 | self.scheduler = EnhancedDDIMScheduler() 17 | self.prompter = SDXLPrompter() 18 | self.device = device 19 | self.torch_dtype = torch_dtype 20 | # models 21 | self.text_encoder: SDXLTextEncoder = None 22 | self.text_encoder_2: SDXLTextEncoder2 = None 23 | self.unet: SDXLUNet = None 24 | self.vae_decoder: SDXLVAEDecoder = None 25 | self.vae_encoder: SDXLVAEEncoder = None 26 | self.ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder = None 27 | self.ipadapter: SDXLIpAdapter = None 28 | # TODO: SDXL ControlNet 29 | 30 | def fetch_main_models(self, model_manager: ModelManager): 31 | self.text_encoder = model_manager.text_encoder 32 | self.text_encoder_2 = model_manager.text_encoder_2 33 | self.unet = model_manager.unet 34 | self.vae_decoder = model_manager.vae_decoder 35 | self.vae_encoder = model_manager.vae_encoder 36 | 37 | 38 | def fetch_controlnet_models(self, model_manager: ModelManager, **kwargs): 39 | # TODO: SDXL ControlNet 40 | pass 41 | 42 | 43 | def fetch_ipadapter(self, model_manager: ModelManager): 44 | if "ipadapter_xl" in model_manager.model: 45 | self.ipadapter = model_manager.ipadapter_xl 46 | if "ipadapter_xl_image_encoder" in model_manager.model: 47 | self.ipadapter_image_encoder = model_manager.ipadapter_xl_image_encoder 48 | 49 | 50 | def fetch_prompter(self, model_manager: ModelManager): 51 | self.prompter.load_from_model_manager(model_manager) 52 | 53 | 54 | @staticmethod 55 | def from_model_manager(model_manager: ModelManager, controlnet_config_units = [], **kwargs): 56 | pipe = SDXLImagePipeline( 57 | device=model_manager.device, 58 | torch_dtype=model_manager.torch_dtype, 59 | ) 60 | pipe.fetch_main_models(model_manager) 61 | pipe.fetch_prompter(model_manager) 62 | pipe.fetch_controlnet_models(model_manager, controlnet_config_units=controlnet_config_units) 63 | pipe.fetch_ipadapter(model_manager) 64 | return pipe 65 | 66 | 67 | def preprocess_image(self, image): 
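# Map a PIL image with pixel values in [0, 255] to a (1, C, H, W) float tensor in [-1, 1].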
68 | image = torch.Tensor(np.array(image, dtype=np.float32) * (2 / 255) - 1).permute(2, 0, 1).unsqueeze(0) 69 | return image 70 | 71 | 72 | def decode_image(self, latent, tiled=False, tile_size=64, tile_stride=32): 73 | image = self.vae_decoder(latent.to(self.device), tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)[0] 74 | image = image.cpu().permute(1, 2, 0).numpy() 75 | image = Image.fromarray(((image / 2 + 0.5).clip(0, 1) * 255).astype("uint8")) 76 | return image 77 | 78 | 79 | @torch.no_grad() 80 | def __call__( 81 | self, 82 | prompt, 83 | negative_prompt="", 84 | cfg_scale=7.5, 85 | clip_skip=1, 86 | clip_skip_2=2, 87 | input_image=None, 88 | ipadapter_images=None, 89 | ipadapter_scale=1.0, 90 | controlnet_image=None, 91 | denoising_strength=1.0, 92 | height=1024, 93 | width=1024, 94 | num_inference_steps=20, 95 | tiled=False, 96 | tile_size=64, 97 | tile_stride=32, 98 | progress_bar_cmd=tqdm, 99 | progress_bar_st=None, 100 | ): 101 | # Prepare scheduler 102 | self.scheduler.set_timesteps(num_inference_steps, denoising_strength) 103 | 104 | # Prepare latent tensors 105 | if input_image is not None: 106 | image = self.preprocess_image(input_image).to(device=self.device, dtype=self.torch_dtype) 107 | latents = self.vae_encoder(image.to(torch.float32), tiled=tiled, tile_size=tile_size, tile_stride=tile_stride).to(self.torch_dtype) 108 | noise = torch.randn((1, 4, height//8, width//8), device=self.device, dtype=self.torch_dtype) 109 | latents = self.scheduler.add_noise(latents, noise, timestep=self.scheduler.timesteps[0]) 110 | else: 111 | latents = torch.randn((1, 4, height//8, width//8), device=self.device, dtype=self.torch_dtype) 112 | 113 | # Encode prompts 114 | add_prompt_emb_posi, prompt_emb_posi = self.prompter.encode_prompt( 115 | self.text_encoder, 116 | self.text_encoder_2, 117 | prompt, 118 | clip_skip=clip_skip, clip_skip_2=clip_skip_2, 119 | device=self.device, 120 | positive=True, 121 | ) 122 | if cfg_scale != 1.0: 123 | add_prompt_emb_nega, prompt_emb_nega = self.prompter.encode_prompt( 124 | self.text_encoder, 125 | self.text_encoder_2, 126 | negative_prompt, 127 | clip_skip=clip_skip, clip_skip_2=clip_skip_2, 128 | device=self.device, 129 | positive=False, 130 | ) 131 | 132 | # Prepare positional id 133 | add_time_id = torch.tensor([height, width, 0, 0, height, width], device=self.device) 134 | 135 | # IP-Adapter 136 | if ipadapter_images is not None: 137 | ipadapter_image_encoding = self.ipadapter_image_encoder(ipadapter_images) 138 | ipadapter_kwargs_list_posi = self.ipadapter(ipadapter_image_encoding, scale=ipadapter_scale) 139 | ipadapter_kwargs_list_nega = self.ipadapter(torch.zeros_like(ipadapter_image_encoding)) 140 | else: 141 | ipadapter_kwargs_list_posi, ipadapter_kwargs_list_nega = {}, {} 142 | 143 | # Denoise 144 | for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)): 145 | timestep = torch.IntTensor((timestep,))[0].to(self.device) 146 | 147 | # Classifier-free guidance 148 | noise_pred_posi = lets_dance_xl( 149 | self.unet, 150 | sample=latents, timestep=timestep, encoder_hidden_states=prompt_emb_posi, 151 | add_time_id=add_time_id, add_text_embeds=add_prompt_emb_posi, 152 | tiled=tiled, tile_size=tile_size, tile_stride=tile_stride, 153 | ipadapter_kwargs_list=ipadapter_kwargs_list_posi, 154 | ) 155 | if cfg_scale != 1.0: 156 | noise_pred_nega = lets_dance_xl( 157 | self.unet, 158 | sample=latents, timestep=timestep, encoder_hidden_states=prompt_emb_nega, 159 | add_time_id=add_time_id, 
add_text_embeds=add_prompt_emb_nega, 160 | tiled=tiled, tile_size=tile_size, tile_stride=tile_stride, 161 | ipadapter_kwargs_list=ipadapter_kwargs_list_nega, 162 | ) 163 | noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega) 164 | else: 165 | noise_pred = noise_pred_posi 166 | 167 | latents = self.scheduler.step(noise_pred, timestep, latents) 168 | 169 | if progress_bar_st is not None: 170 | progress_bar_st.progress(progress_id / len(self.scheduler.timesteps)) 171 | 172 | # Decode image 173 | image = self.decode_image(latents.to(torch.float32), tiled=tiled, tile_size=tile_size, tile_stride=tile_stride) 174 | 175 | return image 176 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/pipelines/stable_diffusion_xl_video.py: -------------------------------------------------------------------------------- 1 | from ..models import ModelManager, SDXLTextEncoder, SDXLTextEncoder2, SDXLUNet, SDXLVAEDecoder, SDXLVAEEncoder, SDXLMotionModel 2 | from .dancer import lets_dance_xl 3 | # TODO: SDXL ControlNet 4 | from ..prompts import SDXLPrompter 5 | from ..schedulers import EnhancedDDIMScheduler 6 | import torch 7 | from tqdm import tqdm 8 | from PIL import Image 9 | import numpy as np 10 | 11 | 12 | class SDXLVideoPipeline(torch.nn.Module): 13 | 14 | def __init__(self, device="cuda", torch_dtype=torch.float16, use_animatediff=True): 15 | super().__init__() 16 | self.scheduler = EnhancedDDIMScheduler(beta_schedule="linear" if use_animatediff else "scaled_linear") 17 | self.prompter = SDXLPrompter() 18 | self.device = device 19 | self.torch_dtype = torch_dtype 20 | # models 21 | self.text_encoder: SDXLTextEncoder = None 22 | self.text_encoder_2: SDXLTextEncoder2 = None 23 | self.unet: SDXLUNet = None 24 | self.vae_decoder: SDXLVAEDecoder = None 25 | self.vae_encoder: SDXLVAEEncoder = None 26 | # TODO: SDXL ControlNet 27 | self.motion_modules: SDXLMotionModel = None 28 | 29 | 30 | def fetch_main_models(self, model_manager: ModelManager): 31 | self.text_encoder = model_manager.text_encoder 32 | self.text_encoder_2 = model_manager.text_encoder_2 33 | self.unet = model_manager.unet 34 | self.vae_decoder = model_manager.vae_decoder 35 | self.vae_encoder = model_manager.vae_encoder 36 | 37 | 38 | def fetch_controlnet_models(self, model_manager: ModelManager, **kwargs): 39 | # TODO: SDXL ControlNet 40 | pass 41 | 42 | 43 | def fetch_motion_modules(self, model_manager: ModelManager): 44 | if "motion_modules_xl" in model_manager.model: 45 | self.motion_modules = model_manager.motion_modules_xl 46 | 47 | 48 | def fetch_prompter(self, model_manager: ModelManager): 49 | self.prompter.load_from_model_manager(model_manager) 50 | 51 | 52 | @staticmethod 53 | def from_model_manager(model_manager: ModelManager, controlnet_config_units = [], **kwargs): 54 | pipe = SDXLVideoPipeline( 55 | device=model_manager.device, 56 | torch_dtype=model_manager.torch_dtype, 57 | use_animatediff="motion_modules_xl" in model_manager.model 58 | ) 59 | pipe.fetch_main_models(model_manager) 60 | pipe.fetch_motion_modules(model_manager) 61 | pipe.fetch_prompter(model_manager) 62 | pipe.fetch_controlnet_models(model_manager, controlnet_config_units=controlnet_config_units) 63 | return pipe 64 | 65 | 66 | def preprocess_image(self, image): 67 | image = torch.Tensor(np.array(image, dtype=np.float32) * (2 / 255) - 1).permute(2, 0, 1).unsqueeze(0) 68 | return image 69 | 70 | 71 | def decode_image(self, latent, tiled=False, tile_size=64, tile_stride=32): 72 | 
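# Decode a single latent with the VAE decoder, rescale from [-1, 1] to [0, 255], and return a PIL image.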
image = self.vae_decoder(latent.to(self.device), tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)[0] 73 | image = image.cpu().permute(1, 2, 0).numpy() 74 | image = Image.fromarray(((image / 2 + 0.5).clip(0, 1) * 255).astype("uint8")) 75 | return image 76 | 77 | 78 | def decode_images(self, latents, tiled=False, tile_size=64, tile_stride=32): 79 | images = [ 80 | self.decode_image(latents[frame_id: frame_id+1], tiled=tiled, tile_size=tile_size, tile_stride=tile_stride) 81 | for frame_id in range(latents.shape[0]) 82 | ] 83 | return images 84 | 85 | 86 | def encode_images(self, processed_images, tiled=False, tile_size=64, tile_stride=32): 87 | latents = [] 88 | for image in processed_images: 89 | image = self.preprocess_image(image).to(device=self.device, dtype=self.torch_dtype) 90 | latent = self.vae_encoder(image, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride).cpu() 91 | latents.append(latent) 92 | latents = torch.concat(latents, dim=0) 93 | return latents 94 | 95 | 96 | @torch.no_grad() 97 | def __call__( 98 | self, 99 | prompt, 100 | negative_prompt="", 101 | cfg_scale=7.5, 102 | clip_skip=1, 103 | clip_skip_2=2, 104 | num_frames=None, 105 | input_frames=None, 106 | controlnet_frames=None, 107 | denoising_strength=1.0, 108 | height=512, 109 | width=512, 110 | num_inference_steps=20, 111 | animatediff_batch_size = 16, 112 | animatediff_stride = 8, 113 | unet_batch_size = 1, 114 | controlnet_batch_size = 1, 115 | cross_frame_attention = False, 116 | smoother=None, 117 | smoother_progress_ids=[], 118 | vram_limit_level=0, 119 | progress_bar_cmd=tqdm, 120 | progress_bar_st=None, 121 | ): 122 | # Prepare scheduler 123 | self.scheduler.set_timesteps(num_inference_steps, denoising_strength) 124 | 125 | # Prepare latent tensors 126 | if self.motion_modules is None: 127 | noise = torch.randn((1, 4, height//8, width//8), device="cpu", dtype=self.torch_dtype).repeat(num_frames, 1, 1, 1) 128 | else: 129 | noise = torch.randn((num_frames, 4, height//8, width//8), device="cuda", dtype=self.torch_dtype) 130 | if input_frames is None or denoising_strength == 1.0: 131 | latents = noise 132 | else: 133 | latents = self.encode_images(input_frames) 134 | latents = self.scheduler.add_noise(latents, noise, timestep=self.scheduler.timesteps[0]) 135 | 136 | # Encode prompts 137 | add_prompt_emb_posi, prompt_emb_posi = self.prompter.encode_prompt( 138 | self.text_encoder, 139 | self.text_encoder_2, 140 | prompt, 141 | clip_skip=clip_skip, clip_skip_2=clip_skip_2, 142 | device=self.device, 143 | positive=True, 144 | ) 145 | if cfg_scale != 1.0: 146 | add_prompt_emb_nega, prompt_emb_nega = self.prompter.encode_prompt( 147 | self.text_encoder, 148 | self.text_encoder_2, 149 | negative_prompt, 150 | clip_skip=clip_skip, clip_skip_2=clip_skip_2, 151 | device=self.device, 152 | positive=False, 153 | ) 154 | 155 | # Prepare positional id 156 | add_time_id = torch.tensor([height, width, 0, 0, height, width], device=self.device) 157 | 158 | # Denoise 159 | for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)): 160 | timestep = torch.IntTensor((timestep,))[0].to(self.device) 161 | 162 | # Classifier-free guidance 163 | noise_pred_posi = lets_dance_xl( 164 | self.unet, motion_modules=self.motion_modules, controlnet=None, 165 | sample=latents, add_time_id=add_time_id, add_text_embeds=add_prompt_emb_posi, 166 | timestep=timestep, encoder_hidden_states=prompt_emb_posi, controlnet_frames=controlnet_frames, 167 | cross_frame_attention=cross_frame_attention, 168 | 
device=self.device, vram_limit_level=vram_limit_level 169 | ) 170 | if cfg_scale != 1.0: 171 | noise_pred_nega = lets_dance_xl( 172 | self.unet, motion_modules=self.motion_modules, controlnet=None, 173 | sample=latents, add_time_id=add_time_id, add_text_embeds=add_prompt_emb_nega, 174 | timestep=timestep, encoder_hidden_states=prompt_emb_nega, controlnet_frames=controlnet_frames, 175 | cross_frame_attention=cross_frame_attention, 176 | device=self.device, vram_limit_level=vram_limit_level 177 | ) 178 | noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega) 179 | else: 180 | noise_pred = noise_pred_posi 181 | 182 | latents = self.scheduler.step(noise_pred, timestep, latents) 183 | 184 | if progress_bar_st is not None: 185 | progress_bar_st.progress(progress_id / len(self.scheduler.timesteps)) 186 | 187 | # Decode image 188 | image = self.decode_images(latents.to(torch.float32)) 189 | 190 | return image 191 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/processors/FastBlend.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import cupy as cp 3 | import numpy as np 4 | from tqdm import tqdm 5 | from ..extensions.FastBlend.patch_match import PyramidPatchMatcher 6 | from ..extensions.FastBlend.runners.fast import TableManager 7 | from .base import VideoProcessor 8 | 9 | 10 | class FastBlendSmoother(VideoProcessor): 11 | def __init__( 12 | self, 13 | inference_mode="fast", batch_size=8, window_size=60, 14 | minimum_patch_size=5, threads_per_block=8, num_iter=5, gpu_id=0, guide_weight=10.0, initialize="identity", tracking_window_size=0 15 | ): 16 | self.inference_mode = inference_mode 17 | self.batch_size = batch_size 18 | self.window_size = window_size 19 | self.ebsynth_config = { 20 | "minimum_patch_size": minimum_patch_size, 21 | "threads_per_block": threads_per_block, 22 | "num_iter": num_iter, 23 | "gpu_id": gpu_id, 24 | "guide_weight": guide_weight, 25 | "initialize": initialize, 26 | "tracking_window_size": tracking_window_size 27 | } 28 | 29 | @staticmethod 30 | def from_model_manager(model_manager, **kwargs): 31 | # TODO: fetch GPU ID from model_manager 32 | return FastBlendSmoother(**kwargs) 33 | 34 | def inference_fast(self, frames_guide, frames_style): 35 | table_manager = TableManager() 36 | patch_match_engine = PyramidPatchMatcher( 37 | image_height=frames_style[0].shape[0], 38 | image_width=frames_style[0].shape[1], 39 | channel=3, 40 | **self.ebsynth_config 41 | ) 42 | # left part 43 | table_l = table_manager.build_remapping_table(frames_guide, frames_style, patch_match_engine, self.batch_size, desc="Fast Mode Step 1/4") 44 | table_l = table_manager.remapping_table_to_blending_table(table_l) 45 | table_l = table_manager.process_window_sum(frames_guide, table_l, patch_match_engine, self.window_size, self.batch_size, desc="Fast Mode Step 2/4") 46 | # right part 47 | table_r = table_manager.build_remapping_table(frames_guide[::-1], frames_style[::-1], patch_match_engine, self.batch_size, desc="Fast Mode Step 3/4") 48 | table_r = table_manager.remapping_table_to_blending_table(table_r) 49 | table_r = table_manager.process_window_sum(frames_guide[::-1], table_r, patch_match_engine, self.window_size, self.batch_size, desc="Fast Mode Step 4/4")[::-1] 50 | # merge 51 | frames = [] 52 | for (frame_l, weight_l), frame_m, (frame_r, weight_r) in zip(table_l, frames_style, table_r): 53 | weight_m = -1 54 | weight = weight_l + weight_m + 
weight_r 55 | frame = frame_l * (weight_l / weight) + frame_m * (weight_m / weight) + frame_r * (weight_r / weight) 56 | frames.append(frame) 57 | frames = [frame.clip(0, 255).astype("uint8") for frame in frames] 58 | frames = [Image.fromarray(frame) for frame in frames] 59 | return frames 60 | 61 | def inference_balanced(self, frames_guide, frames_style): 62 | patch_match_engine = PyramidPatchMatcher( 63 | image_height=frames_style[0].shape[0], 64 | image_width=frames_style[0].shape[1], 65 | channel=3, 66 | **self.ebsynth_config 67 | ) 68 | output_frames = [] 69 | # tasks 70 | n = len(frames_style) 71 | tasks = [] 72 | for target in range(n): 73 | for source in range(target - self.window_size, target + self.window_size + 1): 74 | if source >= 0 and source < n and source != target: 75 | tasks.append((source, target)) 76 | # run 77 | frames = [(None, 1) for i in range(n)] 78 | for batch_id in tqdm(range(0, len(tasks), self.batch_size), desc="Balanced Mode"): 79 | tasks_batch = tasks[batch_id: min(batch_id+self.batch_size, len(tasks))] 80 | source_guide = np.stack([frames_guide[source] for source, target in tasks_batch]) 81 | target_guide = np.stack([frames_guide[target] for source, target in tasks_batch]) 82 | source_style = np.stack([frames_style[source] for source, target in tasks_batch]) 83 | _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style) 84 | for (source, target), result in zip(tasks_batch, target_style): 85 | frame, weight = frames[target] 86 | if frame is None: 87 | frame = frames_style[target] 88 | frames[target] = ( 89 | frame * (weight / (weight + 1)) + result / (weight + 1), 90 | weight + 1 91 | ) 92 | if weight + 1 == min(n, target + self.window_size + 1) - max(0, target - self.window_size): 93 | frame = frame.clip(0, 255).astype("uint8") 94 | output_frames.append(Image.fromarray(frame)) 95 | frames[target] = (None, 1) 96 | return output_frames 97 | 98 | def inference_accurate(self, frames_guide, frames_style): 99 | patch_match_engine = PyramidPatchMatcher( 100 | image_height=frames_style[0].shape[0], 101 | image_width=frames_style[0].shape[1], 102 | channel=3, 103 | use_mean_target_style=True, 104 | **self.ebsynth_config 105 | ) 106 | output_frames = [] 107 | # run 108 | n = len(frames_style) 109 | for target in tqdm(range(n), desc="Accurate Mode"): 110 | l, r = max(target - self.window_size, 0), min(target + self.window_size + 1, n) 111 | remapped_frames = [] 112 | for i in range(l, r, self.batch_size): 113 | j = min(i + self.batch_size, r) 114 | source_guide = np.stack([frames_guide[source] for source in range(i, j)]) 115 | target_guide = np.stack([frames_guide[target]] * (j - i)) 116 | source_style = np.stack([frames_style[source] for source in range(i, j)]) 117 | _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style) 118 | remapped_frames.append(target_style) 119 | frame = np.concatenate(remapped_frames, axis=0).mean(axis=0) 120 | frame = frame.clip(0, 255).astype("uint8") 121 | output_frames.append(Image.fromarray(frame)) 122 | return output_frames 123 | 124 | def release_vram(self): 125 | mempool = cp.get_default_memory_pool() 126 | pinned_mempool = cp.get_default_pinned_memory_pool() 127 | mempool.free_all_blocks() 128 | pinned_mempool.free_all_blocks() 129 | 130 | def __call__(self, rendered_frames, original_frames=None, **kwargs): 131 | rendered_frames = [np.array(frame) for frame in rendered_frames] 132 | original_frames = [np.array(frame) for frame in original_frames] 133 | if 
self.inference_mode == "fast": 134 | output_frames = self.inference_fast(original_frames, rendered_frames) 135 | elif self.inference_mode == "balanced": 136 | output_frames = self.inference_balanced(original_frames, rendered_frames) 137 | elif self.inference_mode == "accurate": 138 | output_frames = self.inference_accurate(original_frames, rendered_frames) 139 | else: 140 | raise ValueError("inference_mode must be fast, balanced or accurate") 141 | self.release_vram() 142 | return output_frames 143 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/processors/PILEditor.py: -------------------------------------------------------------------------------- 1 | from PIL import ImageEnhance 2 | from .base import VideoProcessor 3 | 4 | 5 | class ContrastEditor(VideoProcessor): 6 | def __init__(self, rate=1.5): 7 | self.rate = rate 8 | 9 | @staticmethod 10 | def from_model_manager(model_manager, **kwargs): 11 | return ContrastEditor(**kwargs) 12 | 13 | def __call__(self, rendered_frames, **kwargs): 14 | rendered_frames = [ImageEnhance.Contrast(i).enhance(self.rate) for i in rendered_frames] 15 | return rendered_frames 16 | 17 | 18 | class SharpnessEditor(VideoProcessor): 19 | def __init__(self, rate=1.5): 20 | self.rate = rate 21 | 22 | @staticmethod 23 | def from_model_manager(model_manager, **kwargs): 24 | return SharpnessEditor(**kwargs) 25 | 26 | def __call__(self, rendered_frames, **kwargs): 27 | rendered_frames = [ImageEnhance.Sharpness(i).enhance(self.rate) for i in rendered_frames] 28 | return rendered_frames 29 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/processors/RIFE.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from PIL import Image 4 | from .base import VideoProcessor 5 | 6 | 7 | class RIFESmoother(VideoProcessor): 8 | def __init__(self, model, device="cuda", scale=1.0, batch_size=4, interpolate=True): 9 | self.model = model 10 | self.device = device 11 | 12 | # IFNet only does not support float16 13 | self.torch_dtype = torch.float32 14 | 15 | # Other parameters 16 | self.scale = scale 17 | self.batch_size = batch_size 18 | self.interpolate = interpolate 19 | 20 | @staticmethod 21 | def from_model_manager(model_manager, **kwargs): 22 | return RIFESmoother(model_manager.RIFE, device=model_manager.device, **kwargs) 23 | 24 | def process_image(self, image): 25 | width, height = image.size 26 | if width % 32 != 0 or height % 32 != 0: 27 | width = (width + 31) // 32 28 | height = (height + 31) // 32 29 | image = image.resize((width, height)) 30 | image = torch.Tensor(np.array(image, dtype=np.float32)[:, :, [2,1,0]] / 255).permute(2, 0, 1) 31 | return image 32 | 33 | def process_images(self, images): 34 | images = [self.process_image(image) for image in images] 35 | images = torch.stack(images) 36 | return images 37 | 38 | def decode_images(self, images): 39 | images = (images[:, [2,1,0]].permute(0, 2, 3, 1) * 255).clip(0, 255).numpy().astype(np.uint8) 40 | images = [Image.fromarray(image) for image in images] 41 | return images 42 | 43 | def process_tensors(self, input_tensor, scale=1.0, batch_size=4): 44 | output_tensor = [] 45 | for batch_id in range(0, input_tensor.shape[0], batch_size): 46 | batch_id_ = min(batch_id + batch_size, input_tensor.shape[0]) 47 | batch_input_tensor = input_tensor[batch_id: batch_id_] 48 | batch_input_tensor = batch_input_tensor.to(device=self.device, 
dtype=self.torch_dtype) 49 | flow, mask, merged = self.model(batch_input_tensor, [4/scale, 2/scale, 1/scale]) 50 | output_tensor.append(merged[2].cpu()) 51 | output_tensor = torch.concat(output_tensor, dim=0) 52 | return output_tensor 53 | 54 | @torch.no_grad() 55 | def __call__(self, rendered_frames, **kwargs): 56 | # Preprocess 57 | processed_images = self.process_images(rendered_frames) 58 | 59 | # Input 60 | input_tensor = torch.cat((processed_images[:-2], processed_images[2:]), dim=1) 61 | 62 | # Interpolate 63 | output_tensor = self.process_tensors(input_tensor, scale=self.scale, batch_size=self.batch_size) 64 | 65 | if self.interpolate: 66 | # Blend 67 | input_tensor = torch.cat((processed_images[1:-1], output_tensor), dim=1) 68 | output_tensor = self.process_tensors(input_tensor, scale=self.scale, batch_size=self.batch_size) 69 | processed_images[1:-1] = output_tensor 70 | else: 71 | processed_images[1:-1] = (processed_images[1:-1] + output_tensor) / 2 72 | 73 | # To images 74 | output_images = self.decode_images(processed_images) 75 | if output_images[0].size != rendered_frames[0].size: 76 | output_images = [image.resize(rendered_frames[0].size) for image in output_images] 77 | return output_images 78 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/processors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/diffsynth/processors/__init__.py -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/processors/base.py: -------------------------------------------------------------------------------- 1 | class VideoProcessor: 2 | def __init__(self): 3 | pass 4 | 5 | def __call__(self): 6 | raise NotImplementedError 7 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/processors/sequencial_processor.py: -------------------------------------------------------------------------------- 1 | from .base import VideoProcessor 2 | 3 | 4 | class AutoVideoProcessor(VideoProcessor): 5 | def __init__(self): 6 | pass 7 | 8 | @staticmethod 9 | def from_model_manager(model_manager, processor_type, **kwargs): 10 | if processor_type == "FastBlend": 11 | from .FastBlend import FastBlendSmoother 12 | return FastBlendSmoother.from_model_manager(model_manager, **kwargs) 13 | elif processor_type == "Contrast": 14 | from .PILEditor import ContrastEditor 15 | return ContrastEditor.from_model_manager(model_manager, **kwargs) 16 | elif processor_type == "Sharpness": 17 | from .PILEditor import SharpnessEditor 18 | return SharpnessEditor.from_model_manager(model_manager, **kwargs) 19 | elif processor_type == "RIFE": 20 | from .RIFE import RIFESmoother 21 | return RIFESmoother.from_model_manager(model_manager, **kwargs) 22 | else: 23 | raise ValueError(f"invalid processor_type: {processor_type}") 24 | 25 | 26 | class SequencialProcessor(VideoProcessor): 27 | def __init__(self, processors=[]): 28 | self.processors = processors 29 | 30 | @staticmethod 31 | def from_model_manager(model_manager, configs): 32 | processors = [ 33 | AutoVideoProcessor.from_model_manager(model_manager, config["processor_type"], **config["config"]) 34 | for config in configs 35 | ] 36 | return SequencialProcessor(processors) 37 | 38 | def __call__(self, rendered_frames, **kwargs): 39 | for processor in 
self.processors: 40 | rendered_frames = processor(rendered_frames, **kwargs) 41 | return rendered_frames 42 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | from .sd_prompter import SDPrompter 2 | from .sdxl_prompter import SDXLPrompter 3 | from .hunyuan_dit_prompter import HunyuanDiTPrompter 4 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/prompts/hunyuan_dit_prompter.py: -------------------------------------------------------------------------------- 1 | from .utils import Prompter 2 | from transformers import BertModel, T5EncoderModel, BertTokenizer, AutoTokenizer 3 | import warnings, os 4 | 5 | 6 | class HunyuanDiTPrompter(Prompter): 7 | def __init__( 8 | self, 9 | tokenizer_path=None, 10 | tokenizer_t5_path=None 11 | ): 12 | if tokenizer_path is None: 13 | base_path = os.path.dirname(os.path.dirname(__file__)) 14 | tokenizer_path = os.path.join(base_path, "tokenizer_configs/hunyuan_dit/tokenizer") 15 | if tokenizer_t5_path is None: 16 | base_path = os.path.dirname(os.path.dirname(__file__)) 17 | tokenizer_t5_path = os.path.join(base_path, "tokenizer_configs/hunyuan_dit/tokenizer_t5") 18 | super().__init__() 19 | self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path) 20 | with warnings.catch_warnings(): 21 | warnings.simplefilter("ignore") 22 | self.tokenizer_t5 = AutoTokenizer.from_pretrained(tokenizer_t5_path) 23 | 24 | 25 | def encode_prompt_using_signle_model(self, prompt, text_encoder, tokenizer, max_length, clip_skip, device): 26 | text_inputs = tokenizer( 27 | prompt, 28 | padding="max_length", 29 | max_length=max_length, 30 | truncation=True, 31 | return_attention_mask=True, 32 | return_tensors="pt", 33 | ) 34 | text_input_ids = text_inputs.input_ids 35 | attention_mask = text_inputs.attention_mask.to(device) 36 | prompt_embeds = text_encoder( 37 | text_input_ids.to(device), 38 | attention_mask=attention_mask, 39 | clip_skip=clip_skip 40 | ) 41 | return prompt_embeds, attention_mask 42 | 43 | 44 | def encode_prompt( 45 | self, 46 | text_encoder: BertModel, 47 | text_encoder_t5: T5EncoderModel, 48 | prompt, 49 | clip_skip=1, 50 | clip_skip_2=1, 51 | positive=True, 52 | device="cuda" 53 | ): 54 | prompt = self.process_prompt(prompt, positive=positive) 55 | 56 | # CLIP 57 | prompt_emb, attention_mask = self.encode_prompt_using_signle_model(prompt, text_encoder, self.tokenizer, self.tokenizer.model_max_length, clip_skip, device) 58 | 59 | # T5 60 | prompt_emb_t5, attention_mask_t5 = self.encode_prompt_using_signle_model(prompt, text_encoder_t5, self.tokenizer_t5, self.tokenizer_t5.model_max_length, clip_skip_2, device) 61 | 62 | return prompt_emb, attention_mask, prompt_emb_t5, attention_mask_t5 63 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/prompts/sd_prompter.py: -------------------------------------------------------------------------------- 1 | from .utils import Prompter, tokenize_long_prompt 2 | from transformers import CLIPTokenizer 3 | from ..models import SDTextEncoder 4 | import os 5 | 6 | 7 | class SDPrompter(Prompter): 8 | def __init__(self, tokenizer_path=None): 9 | if tokenizer_path is None: 10 | base_path = os.path.dirname(os.path.dirname(__file__)) 11 | tokenizer_path = os.path.join(base_path, "tokenizer_configs/stable_diffusion/tokenizer") 12 | super().__init__() 
13 | self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer_path) 14 | 15 | def encode_prompt(self, text_encoder: SDTextEncoder, prompt, clip_skip=1, device="cuda", positive=True): 16 | prompt = self.process_prompt(prompt, positive=positive) 17 | input_ids = tokenize_long_prompt(self.tokenizer, prompt).to(device) 18 | prompt_emb = text_encoder(input_ids, clip_skip=clip_skip) 19 | prompt_emb = prompt_emb.reshape((1, prompt_emb.shape[0]*prompt_emb.shape[1], -1)) 20 | 21 | return prompt_emb -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/prompts/sdxl_prompter.py: -------------------------------------------------------------------------------- 1 | from .utils import Prompter, tokenize_long_prompt 2 | from transformers import CLIPTokenizer 3 | from ..models import SDXLTextEncoder, SDXLTextEncoder2 4 | import torch, os 5 | 6 | 7 | class SDXLPrompter(Prompter): 8 | def __init__( 9 | self, 10 | tokenizer_path=None, 11 | tokenizer_2_path=None 12 | ): 13 | if tokenizer_path is None: 14 | base_path = os.path.dirname(os.path.dirname(__file__)) 15 | tokenizer_path = os.path.join(base_path, "tokenizer_configs/stable_diffusion/tokenizer") 16 | if tokenizer_2_path is None: 17 | base_path = os.path.dirname(os.path.dirname(__file__)) 18 | tokenizer_2_path = os.path.join(base_path, "tokenizer_configs/stable_diffusion_xl/tokenizer_2") 19 | super().__init__() 20 | self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer_path) 21 | self.tokenizer_2 = CLIPTokenizer.from_pretrained(tokenizer_2_path) 22 | 23 | def encode_prompt( 24 | self, 25 | text_encoder: SDXLTextEncoder, 26 | text_encoder_2: SDXLTextEncoder2, 27 | prompt, 28 | clip_skip=1, 29 | clip_skip_2=2, 30 | positive=True, 31 | device="cuda" 32 | ): 33 | prompt = self.process_prompt(prompt, positive=positive) 34 | 35 | # 1 36 | input_ids = tokenize_long_prompt(self.tokenizer, prompt).to(device) 37 | prompt_emb_1 = text_encoder(input_ids, clip_skip=clip_skip) 38 | 39 | # 2 40 | input_ids_2 = tokenize_long_prompt(self.tokenizer_2, prompt).to(device) 41 | add_text_embeds, prompt_emb_2 = text_encoder_2(input_ids_2, clip_skip=clip_skip_2) 42 | 43 | # Merge 44 | prompt_emb = torch.concatenate([prompt_emb_1, prompt_emb_2], dim=-1) 45 | 46 | # For very long prompt, we only use the first 77 tokens to compute `add_text_embeds`. 47 | add_text_embeds = add_text_embeds[0:1] 48 | prompt_emb = prompt_emb.reshape((1, prompt_emb.shape[0]*prompt_emb.shape[1], -1)) 49 | return add_text_embeds, prompt_emb 50 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/prompts/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import CLIPTokenizer, AutoTokenizer 2 | from ..models import ModelManager 3 | import os 4 | 5 | 6 | def tokenize_long_prompt(tokenizer, prompt): 7 | # Get model_max_length from self.tokenizer 8 | length = tokenizer.model_max_length 9 | 10 | # To avoid the warning. set self.tokenizer.model_max_length to +oo. 11 | tokenizer.model_max_length = 99999999 12 | 13 | # Tokenize it! 14 | input_ids = tokenizer(prompt, return_tensors="pt").input_ids 15 | 16 | # Determine the real length. 17 | max_length = (input_ids.shape[1] + length - 1) // length * length 18 | 19 | # Restore tokenizer.model_max_length 20 | tokenizer.model_max_length = length 21 | 22 | # Tokenize it again with fixed length. 
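# For example, with CLIP's model_max_length of 77, a prompt that tokenizes to 100 tokens
# is padded to max_length = (100 + 77 - 1) // 77 * 77 = 154 and reshaped below into
# shape (2, 77), i.e. two 77-token sequences stacked along the batch dimension.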
23 | input_ids = tokenizer( 24 | prompt, 25 | return_tensors="pt", 26 | padding="max_length", 27 | max_length=max_length, 28 | truncation=True 29 | ).input_ids 30 | 31 | # Reshape input_ids to fit the text encoder. 32 | num_sentence = input_ids.shape[1] // length 33 | input_ids = input_ids.reshape((num_sentence, length)) 34 | 35 | return input_ids 36 | 37 | 38 | class BeautifulPrompt: 39 | def __init__(self, tokenizer_path=None, model=None): 40 | self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) 41 | self.model = model 42 | self.template = 'Instruction: Give a simple description of the image to generate a drawing prompt.\nInput: {raw_prompt}\nOutput:' 43 | 44 | def __call__(self, raw_prompt): 45 | model_input = self.template.format(raw_prompt=raw_prompt) 46 | input_ids = self.tokenizer.encode(model_input, return_tensors='pt').to(self.model.device) 47 | outputs = self.model.generate( 48 | input_ids, 49 | max_new_tokens=384, 50 | do_sample=True, 51 | temperature=0.9, 52 | top_k=50, 53 | top_p=0.95, 54 | repetition_penalty=1.1, 55 | num_return_sequences=1 56 | ) 57 | prompt = raw_prompt + ", " + self.tokenizer.batch_decode( 58 | outputs[:, input_ids.size(1):], 59 | skip_special_tokens=True 60 | )[0].strip() 61 | return prompt 62 | 63 | 64 | class Translator: 65 | def __init__(self, tokenizer_path=None, model=None): 66 | self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) 67 | self.model = model 68 | 69 | def __call__(self, prompt): 70 | input_ids = self.tokenizer.encode(prompt, return_tensors='pt').to(self.model.device) 71 | output_ids = self.model.generate(input_ids) 72 | prompt = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 73 | return prompt 74 | 75 | 76 | class Prompter: 77 | def __init__(self): 78 | self.tokenizer: CLIPTokenizer = None 79 | self.keyword_dict = {} 80 | self.translator: Translator = None 81 | self.beautiful_prompt: BeautifulPrompt = None 82 | 83 | def load_textual_inversion(self, textual_inversion_dict): 84 | self.keyword_dict = {} 85 | additional_tokens = [] 86 | for keyword in textual_inversion_dict: 87 | tokens, _ = textual_inversion_dict[keyword] 88 | additional_tokens += tokens 89 | self.keyword_dict[keyword] = " " + " ".join(tokens) + " " 90 | self.tokenizer.add_tokens(additional_tokens) 91 | 92 | def load_beautiful_prompt(self, model, model_path): 93 | model_folder = os.path.dirname(model_path) 94 | self.beautiful_prompt = BeautifulPrompt(tokenizer_path=model_folder, model=model) 95 | if model_folder.endswith("v2"): 96 | self.beautiful_prompt.template = """Converts a simple image description into a prompt. \ 97 | Prompts are formatted as multiple related tags separated by commas, plus you can use () to increase the weight, [] to decrease the weight, \ 98 | or use a number to specify the weight. 
You should add appropriate words to make the images described in the prompt more aesthetically pleasing, \ 99 | but make sure there is a correlation between the input and output.\n\ 100 | ### Input: {raw_prompt}\n### Output:""" 101 | 102 | def load_translator(self, model, model_path): 103 | model_folder = os.path.dirname(model_path) 104 | self.translator = Translator(tokenizer_path=model_folder, model=model) 105 | 106 | def load_from_model_manager(self, model_manager: ModelManager): 107 | self.load_textual_inversion(model_manager.textual_inversion_dict) 108 | if "translator" in model_manager.model: 109 | self.load_translator(model_manager.model["translator"], model_manager.model_path["translator"]) 110 | if "beautiful_prompt" in model_manager.model: 111 | self.load_beautiful_prompt(model_manager.model["beautiful_prompt"], model_manager.model_path["beautiful_prompt"]) 112 | 113 | def process_prompt(self, prompt, positive=True): 114 | for keyword in self.keyword_dict: 115 | if keyword in prompt: 116 | prompt = prompt.replace(keyword, self.keyword_dict[keyword]) 117 | if positive and self.translator is not None: 118 | prompt = self.translator(prompt) 119 | print(f"Your prompt is translated: \"{prompt}\"") 120 | if positive and self.beautiful_prompt is not None: 121 | prompt = self.beautiful_prompt(prompt) 122 | print(f"Your prompt is refined by BeautifulPrompt: \"{prompt}\"") 123 | return prompt 124 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | from .ddim import EnhancedDDIMScheduler 2 | from .continuous_ode import ContinuousODEScheduler 3 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/schedulers/continuous_ode.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class ContinuousODEScheduler(): 5 | 6 | def __init__(self, num_inference_steps=100, sigma_max=700.0, sigma_min=0.002, rho=7.0): 7 | self.sigma_max = sigma_max 8 | self.sigma_min = sigma_min 9 | self.rho = rho 10 | self.set_timesteps(num_inference_steps) 11 | 12 | 13 | def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0): 14 | ramp = torch.linspace(1-denoising_strength, 1, num_inference_steps) 15 | min_inv_rho = torch.pow(torch.tensor((self.sigma_min,)), (1 / self.rho)) 16 | max_inv_rho = torch.pow(torch.tensor((self.sigma_max,)), (1 / self.rho)) 17 | self.sigmas = torch.pow(max_inv_rho + ramp * (min_inv_rho - max_inv_rho), self.rho) 18 | self.timesteps = torch.log(self.sigmas) * 0.25 19 | 20 | 21 | def step(self, model_output, timestep, sample, to_final=False): 22 | timestep_id = torch.argmin((self.timesteps - timestep).abs()) 23 | sigma = self.sigmas[timestep_id] 24 | sample *= (sigma*sigma + 1).sqrt() 25 | estimated_sample = -sigma / (sigma*sigma + 1).sqrt() * model_output + 1 / (sigma*sigma + 1) * sample 26 | if to_final or timestep_id + 1 >= len(self.timesteps): 27 | prev_sample = estimated_sample 28 | else: 29 | sigma_ = self.sigmas[timestep_id + 1] 30 | derivative = 1 / sigma * (sample - estimated_sample) 31 | prev_sample = sample + derivative * (sigma_ - sigma) 32 | prev_sample /= (sigma_*sigma_ + 1).sqrt() 33 | return prev_sample 34 | 35 | 36 | def return_to_timestep(self, timestep, sample, sample_stablized): 37 | # This scheduler doesn't support this function. 
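# Calling it is therefore a no-op (it returns None); the DDIM scheduler in this package does implement it.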
38 | pass 39 | 40 | 41 | def add_noise(self, original_samples, noise, timestep): 42 | timestep_id = torch.argmin((self.timesteps - timestep).abs()) 43 | sigma = self.sigmas[timestep_id] 44 | sample = (original_samples + noise * sigma) / (sigma*sigma + 1).sqrt() 45 | return sample 46 | 47 | 48 | def training_target(self, sample, noise, timestep): 49 | timestep_id = torch.argmin((self.timesteps - timestep).abs()) 50 | sigma = self.sigmas[timestep_id] 51 | target = (-(sigma*sigma + 1).sqrt() / sigma + 1 / (sigma*sigma + 1).sqrt() / sigma) * sample + 1 / (sigma*sigma + 1).sqrt() * noise 52 | return target 53 | 54 | 55 | def training_weight(self, timestep): 56 | timestep_id = torch.argmin((self.timesteps - timestep).abs()) 57 | sigma = self.sigmas[timestep_id] 58 | weight = (1 + sigma*sigma).sqrt() / sigma 59 | return weight 60 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/schedulers/ddim.py: -------------------------------------------------------------------------------- 1 | import torch, math 2 | 3 | 4 | class EnhancedDDIMScheduler(): 5 | 6 | def __init__(self, num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", prediction_type="epsilon"): 7 | self.num_train_timesteps = num_train_timesteps 8 | if beta_schedule == "scaled_linear": 9 | betas = torch.square(torch.linspace(math.sqrt(beta_start), math.sqrt(beta_end), num_train_timesteps, dtype=torch.float32)) 10 | elif beta_schedule == "linear": 11 | betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) 12 | else: 13 | raise NotImplementedError(f"{beta_schedule} is not implemented") 14 | self.alphas_cumprod = torch.cumprod(1.0 - betas, dim=0).tolist() 15 | self.set_timesteps(10) 16 | self.prediction_type = prediction_type 17 | 18 | 19 | def set_timesteps(self, num_inference_steps, denoising_strength=1.0): 20 | # The timesteps are aligned to 999...0, which is different from other implementations, 21 | # but I think this implementation is more reasonable in theory. 
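# For example, with num_train_timesteps=1000, denoising_strength=1.0 and
# num_inference_steps=10, this yields timesteps [999, 888, 777, ..., 111, 0].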
22 | max_timestep = max(round(self.num_train_timesteps * denoising_strength) - 1, 0) 23 | num_inference_steps = min(num_inference_steps, max_timestep + 1) 24 | if num_inference_steps == 1: 25 | self.timesteps = [max_timestep] 26 | else: 27 | step_length = max_timestep / (num_inference_steps - 1) 28 | self.timesteps = [round(max_timestep - i*step_length) for i in range(num_inference_steps)] 29 | 30 | 31 | def denoise(self, model_output, sample, alpha_prod_t, alpha_prod_t_prev): 32 | if self.prediction_type == "epsilon": 33 | weight_e = math.sqrt(1 - alpha_prod_t_prev) - math.sqrt(alpha_prod_t_prev * (1 - alpha_prod_t) / alpha_prod_t) 34 | weight_x = math.sqrt(alpha_prod_t_prev / alpha_prod_t) 35 | prev_sample = sample * weight_x + model_output * weight_e 36 | elif self.prediction_type == "v_prediction": 37 | weight_e = -math.sqrt(alpha_prod_t_prev * (1 - alpha_prod_t)) + math.sqrt(alpha_prod_t * (1 - alpha_prod_t_prev)) 38 | weight_x = math.sqrt(alpha_prod_t * alpha_prod_t_prev) + math.sqrt((1 - alpha_prod_t) * (1 - alpha_prod_t_prev)) 39 | prev_sample = sample * weight_x + model_output * weight_e 40 | else: 41 | raise NotImplementedError(f"{self.prediction_type} is not implemented") 42 | return prev_sample 43 | 44 | 45 | def step(self, model_output, timestep, sample, to_final=False): 46 | alpha_prod_t = self.alphas_cumprod[timestep] 47 | timestep_id = self.timesteps.index(timestep) 48 | if to_final or timestep_id + 1 >= len(self.timesteps): 49 | alpha_prod_t_prev = 1.0 50 | else: 51 | timestep_prev = self.timesteps[timestep_id + 1] 52 | alpha_prod_t_prev = self.alphas_cumprod[timestep_prev] 53 | 54 | return self.denoise(model_output, sample, alpha_prod_t, alpha_prod_t_prev) 55 | 56 | 57 | def return_to_timestep(self, timestep, sample, sample_stablized): 58 | alpha_prod_t = self.alphas_cumprod[timestep] 59 | noise_pred = (sample - math.sqrt(alpha_prod_t) * sample_stablized) / math.sqrt(1 - alpha_prod_t) 60 | return noise_pred 61 | 62 | 63 | def add_noise(self, original_samples, noise, timestep): 64 | sqrt_alpha_prod = math.sqrt(self.alphas_cumprod[timestep]) 65 | sqrt_one_minus_alpha_prod = math.sqrt(1 - self.alphas_cumprod[timestep]) 66 | noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise 67 | return noisy_samples 68 | 69 | def training_target(self, sample, noise, timestep): 70 | sqrt_alpha_prod = math.sqrt(self.alphas_cumprod[timestep]) 71 | sqrt_one_minus_alpha_prod = math.sqrt(1 - self.alphas_cumprod[timestep]) 72 | target = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample 73 | return target 74 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "mask_token": "[MASK]", 4 | "pad_token": "[PAD]", 5 | "sep_token": "[SEP]", 6 | "unk_token": "[UNK]" 7 | } 8 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "do_basic_tokenize": true, 4 | "do_lower_case": true, 5 | "mask_token": "[MASK]", 6 | "name_or_path": "hfl/chinese-roberta-wwm-ext", 7 | "never_split": null, 8 | "pad_token": "[PAD]", 9 | "sep_token": "[SEP]", 10 | "special_tokens_map_file": 
"/home/chenweifeng/.cache/huggingface/hub/models--hfl--chinese-roberta-wwm-ext/snapshots/5c58d0b8ec1d9014354d691c538661bf00bfdb44/special_tokens_map.json", 11 | "strip_accents": null, 12 | "tokenize_chinese_chars": true, 13 | "tokenizer_class": "BertTokenizer", 14 | "unk_token": "[UNK]", 15 | "model_max_length": 77 16 | } 17 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/patrick/t5/mt5-xl", 3 | "architectures": [ 4 | "MT5ForConditionalGeneration" 5 | ], 6 | "d_ff": 5120, 7 | "d_kv": 64, 8 | "d_model": 2048, 9 | "decoder_start_token_id": 0, 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "gated-gelu", 13 | "initializer_factor": 1.0, 14 | "is_encoder_decoder": true, 15 | "layer_norm_epsilon": 1e-06, 16 | "model_type": "mt5", 17 | "num_decoder_layers": 24, 18 | "num_heads": 32, 19 | "num_layers": 24, 20 | "output_past": true, 21 | "pad_token_id": 0, 22 | "relative_attention_num_buckets": 32, 23 | "tie_word_embeddings": false, 24 | "tokenizer_class": "T5Tokenizer", 25 | "transformers_version": "4.10.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 250112 28 | } 29 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"eos_token": "", "unk_token": "", "pad_token": ""} -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/spiece.model -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"eos_token": "", "unk_token": "", "pad_token": "", "extra_ids": 0, "additional_special_tokens": null, "special_tokens_map_file": "", "tokenizer_file": null, "name_or_path": "google/mt5-small", "model_max_length": 256, "legacy": true} -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": { 3 | "content": "<|startoftext|>", 4 | "lstrip": false, 5 | "normalized": true, 6 | "rstrip": false, 7 | "single_word": false 8 | }, 9 | "eos_token": { 10 | "content": "<|endoftext|>", 11 | "lstrip": false, 12 | "normalized": true, 13 | "rstrip": false, 14 | "single_word": false 15 | }, 16 | "pad_token": "<|endoftext|>", 17 | "unk_token": { 18 | "content": "<|endoftext|>", 19 | "lstrip": false, 20 | "normalized": true, 21 | "rstrip": false, 22 | "single_word": false 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/tokenizer_config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "bos_token": { 4 | "__type": "AddedToken", 5 | "content": "<|startoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false 10 | }, 11 | "do_lower_case": true, 12 | "eos_token": { 13 | "__type": "AddedToken", 14 | "content": "<|endoftext|>", 15 | "lstrip": false, 16 | "normalized": true, 17 | "rstrip": false, 18 | "single_word": false 19 | }, 20 | "errors": "replace", 21 | "model_max_length": 77, 22 | "name_or_path": "openai/clip-vit-large-patch14", 23 | "pad_token": "<|endoftext|>", 24 | "special_tokens_map_file": "./special_tokens_map.json", 25 | "tokenizer_class": "CLIPTokenizer", 26 | "unk_token": { 27 | "__type": "AddedToken", 28 | "content": "<|endoftext|>", 29 | "lstrip": false, 30 | "normalized": true, 31 | "rstrip": false, 32 | "single_word": false 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": { 3 | "content": "<|startoftext|>", 4 | "lstrip": false, 5 | "normalized": true, 6 | "rstrip": false, 7 | "single_word": false 8 | }, 9 | "eos_token": { 10 | "content": "<|endoftext|>", 11 | "lstrip": false, 12 | "normalized": true, 13 | "rstrip": false, 14 | "single_word": false 15 | }, 16 | "pad_token": "!", 17 | "unk_token": { 18 | "content": "<|endoftext|>", 19 | "lstrip": false, 20 | "normalized": true, 21 | "rstrip": false, 22 | "single_word": false 23 | } 24 | } -------------------------------------------------------------------------------- /DiffSynth-Studio/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "added_tokens_decoder": { 4 | "0": { 5 | "content": "!", 6 | "lstrip": false, 7 | "normalized": false, 8 | "rstrip": false, 9 | "single_word": false, 10 | "special": true 11 | }, 12 | "49406": { 13 | "content": "<|startoftext|>", 14 | "lstrip": false, 15 | "normalized": true, 16 | "rstrip": false, 17 | "single_word": false, 18 | "special": true 19 | }, 20 | "49407": { 21 | "content": "<|endoftext|>", 22 | "lstrip": false, 23 | "normalized": true, 24 | "rstrip": false, 25 | "single_word": false, 26 | "special": true 27 | } 28 | }, 29 | "bos_token": "<|startoftext|>", 30 | "clean_up_tokenization_spaces": true, 31 | "do_lower_case": true, 32 | "eos_token": "<|endoftext|>", 33 | "errors": "replace", 34 | "model_max_length": 77, 35 | "pad_token": "!", 36 | "tokenizer_class": "CLIPTokenizer", 37 | "unk_token": "<|endoftext|>" 38 | } -------------------------------------------------------------------------------- /DiffSynth-Studio/environment.yml: -------------------------------------------------------------------------------- 1 | name: DiffSynthStudio 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - defaults 6 | dependencies: 7 | - python=3.9.16 8 | - pip=23.0.1 9 | - cudatoolkit 10 | - pytorch 11 | - cupy 12 | - pip: 13 | - transformers 14 | - controlnet-aux==0.0.7 15 | - streamlit 16 | - streamlit-drawable-canvas 17 | - imageio 18 | - imageio[ffmpeg] 19 | - safetensors 20 | - einops 21 | - sentencepiece 22 | -------------------------------------------------------------------------------- 
/DiffSynth-Studio/examples/Diffutoon/README.md: -------------------------------------------------------------------------------- 1 | # Diffutoon 2 | 3 | [Diffutoon](https://arxiv.org/abs/2401.16224) is a toon shading approach. This approach is adept for rendering high-resoluton videos with rapid motion. 4 | 5 | ## Example: Toon Shading (Diffutoon) 6 | 7 | Directly render realistic videos in a flatten style. In this example, you can easily modify the parameters in the config dict. See [`diffutoon_toon_shading.py`](./diffutoon_toon_shading.py). We also provide [an example on Colab](https://colab.research.google.com/github/Artiprocher/DiffSynth-Studio/blob/main/examples/Diffutoon.ipynb). 8 | 9 | https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/b54c05c5-d747-4709-be5e-b39af82404dd 10 | 11 | ## Example: Toon Shading with Editing Signals (Diffutoon) 12 | 13 | This example supports video editing signals. See [`diffutoon_toon_shading_with_editing_signals.py`](./diffutoon_toon_shading_with_editing_signals.py). The editing feature is also supported in the [Colab example](https://colab.research.google.com/github/Artiprocher/DiffSynth-Studio/blob/main/examples/Diffutoon/Diffutoon.ipynb). 14 | 15 | https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/20528af5-5100-474a-8cdc-440b9efdd86c 16 | 17 | ## Example: Toon Shading (in native Python code) 18 | 19 | This example is provided for developers. If you don't want to use the config to manage parameters, you can see [`sd_toon_shading.py`](./sd_toon_shading.py) to learn how to use it in native Python code. 20 | 21 | https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/607c199b-6140-410b-a111-3e4ffb01142c 22 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/Diffutoon/diffutoon_toon_shading.py: -------------------------------------------------------------------------------- 1 | from diffsynth import SDVideoPipelineRunner 2 | 3 | 4 | # Download models 5 | # `models/stable_diffusion/aingdiffusion_v12.safetensors`: [link](https://civitai.com/api/download/models/229575) 6 | # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt) 7 | # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth) 8 | # `models/ControlNet/control_v11f1e_sd15_tile.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1e_sd15_tile.pth) 9 | # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth) 10 | # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth) 11 | # `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16) 12 | 13 | # The original video in the example is https://www.bilibili.com/video/BV1iG411a7sQ/. 
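# Overview of the config below: frames 0-29 of the input video are rendered at 1536x1536
# with AnimateDiff (batch size 16, stride 8) and two ControlNet units (tile + lineart,
# scale 0.5 each); the result is written to the output folder given in the "data" section.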
14 | 15 | config = { 16 | "models": { 17 | "model_list": [ 18 | "models/stable_diffusion/aingdiffusion_v12.safetensors", 19 | "models/AnimateDiff/mm_sd_v15_v2.ckpt", 20 | "models/ControlNet/control_v11f1e_sd15_tile.pth", 21 | "models/ControlNet/control_v11p_sd15_lineart.pth" 22 | ], 23 | "textual_inversion_folder": "models/textual_inversion", 24 | "device": "cuda", 25 | "lora_alphas": [], 26 | "controlnet_units": [ 27 | { 28 | "processor_id": "tile", 29 | "model_path": "models/ControlNet/control_v11f1e_sd15_tile.pth", 30 | "scale": 0.5 31 | }, 32 | { 33 | "processor_id": "lineart", 34 | "model_path": "models/ControlNet/control_v11p_sd15_lineart.pth", 35 | "scale": 0.5 36 | } 37 | ] 38 | }, 39 | "data": { 40 | "input_frames": { 41 | "video_file": "data/examples/diffutoon/input_video.mp4", 42 | "image_folder": None, 43 | "height": 1536, 44 | "width": 1536, 45 | "start_frame_id": 0, 46 | "end_frame_id": 30 47 | }, 48 | "controlnet_frames": [ 49 | { 50 | "video_file": "data/examples/diffutoon/input_video.mp4", 51 | "image_folder": None, 52 | "height": 1536, 53 | "width": 1536, 54 | "start_frame_id": 0, 55 | "end_frame_id": 30 56 | }, 57 | { 58 | "video_file": "data/examples/diffutoon/input_video.mp4", 59 | "image_folder": None, 60 | "height": 1536, 61 | "width": 1536, 62 | "start_frame_id": 0, 63 | "end_frame_id": 30 64 | } 65 | ], 66 | "output_folder": "data/examples/diffutoon/output", 67 | "fps": 30 68 | }, 69 | "pipeline": { 70 | "seed": 0, 71 | "pipeline_inputs": { 72 | "prompt": "best quality, perfect anime illustration, light, a girl is dancing, smile, solo", 73 | "negative_prompt": "verybadimagenegative_v1.3", 74 | "cfg_scale": 7.0, 75 | "clip_skip": 2, 76 | "denoising_strength": 1.0, 77 | "num_inference_steps": 10, 78 | "animatediff_batch_size": 16, 79 | "animatediff_stride": 8, 80 | "unet_batch_size": 1, 81 | "controlnet_batch_size": 1, 82 | "cross_frame_attention": False, 83 | # The following parameters will be overwritten. You don't need to modify them. 
84 | "input_frames": [], 85 | "num_frames": 30, 86 | "width": 1536, 87 | "height": 1536, 88 | "controlnet_frames": [] 89 | } 90 | } 91 | } 92 | 93 | runner = SDVideoPipelineRunner() 94 | runner.run(config) 95 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/Diffutoon/diffutoon_toon_shading_with_editing_signals.py: -------------------------------------------------------------------------------- 1 | from diffsynth import SDVideoPipelineRunner 2 | import os 3 | 4 | 5 | # Download models 6 | # `models/stable_diffusion/aingdiffusion_v12.safetensors`: [link](https://civitai.com/api/download/models/229575) 7 | # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt) 8 | # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth) 9 | # `models/ControlNet/control_v11f1e_sd15_tile.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1e_sd15_tile.pth) 10 | # `models/ControlNet/control_v11f1p_sd15_depth.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1p_sd15_depth.pth) 11 | # `models/ControlNet/control_v11p_sd15_softedge.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_softedge.pth) 12 | # `models/Annotators/dpt_hybrid-midas-501f0c75.pt`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/dpt_hybrid-midas-501f0c75.pt) 13 | # `models/Annotators/ControlNetHED.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/ControlNetHED.pth) 14 | # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth) 15 | # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth) 16 | # `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16) 17 | 18 | # The original video in the example is https://www.bilibili.com/video/BV1zu4y1s7Ec/. 
19 | 20 | config_stage_1 = { 21 | "models": { 22 | "model_list": [ 23 | "models/stable_diffusion/aingdiffusion_v12.safetensors", 24 | "models/ControlNet/control_v11p_sd15_softedge.pth", 25 | "models/ControlNet/control_v11f1p_sd15_depth.pth" 26 | ], 27 | "textual_inversion_folder": "models/textual_inversion", 28 | "device": "cuda", 29 | "lora_alphas": [], 30 | "controlnet_units": [ 31 | { 32 | "processor_id": "softedge", 33 | "model_path": "models/ControlNet/control_v11p_sd15_softedge.pth", 34 | "scale": 0.5 35 | }, 36 | { 37 | "processor_id": "depth", 38 | "model_path": "models/ControlNet/control_v11f1p_sd15_depth.pth", 39 | "scale": 0.5 40 | } 41 | ] 42 | }, 43 | "data": { 44 | "input_frames": { 45 | "video_file": "data/examples/diffutoon_edit/input_video.mp4", 46 | "image_folder": None, 47 | "height": 512, 48 | "width": 512, 49 | "start_frame_id": 0, 50 | "end_frame_id": 30 51 | }, 52 | "controlnet_frames": [ 53 | { 54 | "video_file": "data/examples/diffutoon_edit/input_video.mp4", 55 | "image_folder": None, 56 | "height": 512, 57 | "width": 512, 58 | "start_frame_id": 0, 59 | "end_frame_id": 30 60 | }, 61 | { 62 | "video_file": "data/examples/diffutoon_edit/input_video.mp4", 63 | "image_folder": None, 64 | "height": 512, 65 | "width": 512, 66 | "start_frame_id": 0, 67 | "end_frame_id": 30 68 | } 69 | ], 70 | "output_folder": "data/examples/diffutoon_edit/color_video", 71 | "fps": 25 72 | }, 73 | "smoother_configs": [ 74 | { 75 | "processor_type": "FastBlend", 76 | "config": {} 77 | } 78 | ], 79 | "pipeline": { 80 | "seed": 0, 81 | "pipeline_inputs": { 82 | "prompt": "best quality, perfect anime illustration, orange clothes, night, a girl is dancing, smile, solo, black silk stockings", 83 | "negative_prompt": "verybadimagenegative_v1.3", 84 | "cfg_scale": 7.0, 85 | "clip_skip": 1, 86 | "denoising_strength": 0.9, 87 | "num_inference_steps": 20, 88 | "animatediff_batch_size": 8, 89 | "animatediff_stride": 4, 90 | "unet_batch_size": 8, 91 | "controlnet_batch_size": 8, 92 | "cross_frame_attention": True, 93 | "smoother_progress_ids": [-1], 94 | # The following parameters will be overwritten. You don't need to modify them. 
95 | "input_frames": [], 96 | "num_frames": 30, 97 | "width": 512, 98 | "height": 512, 99 | "controlnet_frames": [] 100 | } 101 | } 102 | } 103 | 104 | 105 | config_stage_2 = { 106 | "models": { 107 | "model_list": [ 108 | "models/stable_diffusion/aingdiffusion_v12.safetensors", 109 | "models/AnimateDiff/mm_sd_v15_v2.ckpt", 110 | "models/ControlNet/control_v11f1e_sd15_tile.pth", 111 | "models/ControlNet/control_v11p_sd15_lineart.pth" 112 | ], 113 | "textual_inversion_folder": "models/textual_inversion", 114 | "device": "cuda", 115 | "lora_alphas": [], 116 | "controlnet_units": [ 117 | { 118 | "processor_id": "tile", 119 | "model_path": "models/ControlNet/control_v11f1e_sd15_tile.pth", 120 | "scale": 0.5 121 | }, 122 | { 123 | "processor_id": "lineart", 124 | "model_path": "models/ControlNet/control_v11p_sd15_lineart.pth", 125 | "scale": 0.5 126 | } 127 | ] 128 | }, 129 | "data": { 130 | "input_frames": { 131 | "video_file": "data/examples/diffutoon_edit/input_video.mp4", 132 | "image_folder": None, 133 | "height": 1536, 134 | "width": 1536, 135 | "start_frame_id": 0, 136 | "end_frame_id": 30 137 | }, 138 | "controlnet_frames": [ 139 | { 140 | "video_file": "data/examples/diffutoon_edit/input_video.mp4", 141 | "image_folder": None, 142 | "height": 1536, 143 | "width": 1536, 144 | "start_frame_id": 0, 145 | "end_frame_id": 30 146 | }, 147 | { 148 | "video_file": "data/examples/diffutoon_edit/input_video.mp4", 149 | "image_folder": None, 150 | "height": 1536, 151 | "width": 1536, 152 | "start_frame_id": 0, 153 | "end_frame_id": 30 154 | } 155 | ], 156 | "output_folder": "data/examples/diffutoon_edit/output", 157 | "fps": 30 158 | }, 159 | "pipeline": { 160 | "seed": 0, 161 | "pipeline_inputs": { 162 | "prompt": "best quality, perfect anime illustration, light, a girl is dancing, smile, solo", 163 | "negative_prompt": "verybadimagenegative_v1.3", 164 | "cfg_scale": 7.0, 165 | "clip_skip": 2, 166 | "denoising_strength": 1.0, 167 | "num_inference_steps": 10, 168 | "animatediff_batch_size": 16, 169 | "animatediff_stride": 8, 170 | "unet_batch_size": 1, 171 | "controlnet_batch_size": 1, 172 | "cross_frame_attention": False, 173 | # The following parameters will be overwritten. You don't need to modify them. 
174 | "input_frames": [], 175 | "num_frames": 30, 176 | "width": 1536, 177 | "height": 1536, 178 | "controlnet_frames": [] 179 | } 180 | } 181 | } 182 | 183 | 184 | runner = SDVideoPipelineRunner() 185 | runner.run(config_stage_1) 186 | 187 | # Replace the color video with the synthesized video 188 | config_stage_2["data"]["controlnet_frames"][0] = { 189 | "video_file": os.path.join(config_stage_1["data"]["output_folder"], "video.mp4"), 190 | "image_folder": None, 191 | "height": config_stage_2["data"]["input_frames"]["height"], 192 | "width": config_stage_2["data"]["input_frames"]["width"], 193 | "start_frame_id": None, 194 | "end_frame_id": None 195 | } 196 | runner.run(config_stage_2) 197 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/Diffutoon/sd_toon_shading.py: -------------------------------------------------------------------------------- 1 | from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video, save_frames 2 | from diffsynth.extensions.RIFE import RIFESmoother 3 | import torch 4 | 5 | 6 | # Download models 7 | # `models/stable_diffusion/flat2DAnimerge_v45Sharp.safetensors`: [link](https://civitai.com/api/download/models/266360?type=Model&format=SafeTensor&size=pruned&fp=fp16) 8 | # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt) 9 | # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth) 10 | # `models/ControlNet/control_v11f1e_sd15_tile.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1e_sd15_tile.pth) 11 | # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth) 12 | # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth) 13 | # `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16) 14 | # `models/RIFE/flownet.pkl`: [link](https://drive.google.com/file/d/1APIzVeI-4ZZCEuIRE1m6WYfSCaOsi_7_/view?usp=sharing) 15 | 16 | 17 | # Load models 18 | model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") 19 | model_manager.load_textual_inversions("models/textual_inversion") 20 | model_manager.load_models([ 21 | "models/stable_diffusion/flat2DAnimerge_v45Sharp.safetensors", 22 | "models/AnimateDiff/mm_sd_v15_v2.ckpt", 23 | "models/ControlNet/control_v11p_sd15_lineart.pth", 24 | "models/ControlNet/control_v11f1e_sd15_tile.pth", 25 | "models/RIFE/flownet.pkl" 26 | ]) 27 | pipe = SDVideoPipeline.from_model_manager( 28 | model_manager, 29 | [ 30 | ControlNetConfigUnit( 31 | processor_id="lineart", 32 | model_path="models/ControlNet/control_v11p_sd15_lineart.pth", 33 | scale=0.5 34 | ), 35 | ControlNetConfigUnit( 36 | processor_id="tile", 37 | model_path="models/ControlNet/control_v11f1e_sd15_tile.pth", 38 | scale=0.5 39 | ) 40 | ] 41 | ) 42 | smoother = RIFESmoother.from_model_manager(model_manager) 43 | 44 | # Load video (we only use 60 frames for quick testing) 45 | # The original video is here: https://www.bilibili.com/video/BV19w411A7YJ/ 46 | video = VideoData( 47 | video_file="data/bilibili_videos/៸៸᳐_⩊_៸៸᳐ 66 微笑调查队🌻/៸៸᳐_⩊_៸៸᳐ 66 微笑调查队🌻 - 1.66 微笑调查队🌻(Av278681824,P1).mp4", 48 | height=1024, width=1024) 49 | input_video = [video[i] for i in 
range(40*60, 41*60)] 50 | 51 | # Toon shading (20G VRAM) 52 | torch.manual_seed(0) 53 | output_video = pipe( 54 | prompt="best quality, perfect anime illustration, light, a girl is dancing, smile, solo", 55 | negative_prompt="verybadimagenegative_v1.3", 56 | cfg_scale=3, clip_skip=2, 57 | controlnet_frames=input_video, num_frames=len(input_video), 58 | num_inference_steps=10, height=1024, width=1024, 59 | animatediff_batch_size=32, animatediff_stride=16, 60 | vram_limit_level=0, 61 | ) 62 | output_video = smoother(output_video) 63 | 64 | # Save video 65 | save_video(output_video, "output_video.mp4", fps=60) 66 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/ExVideo/ExVideo_ema.py: -------------------------------------------------------------------------------- 1 | import torch, os, argparse 2 | from safetensors.torch import save_file 3 | 4 | 5 | def load_pl_state_dict(file_path): 6 | print(f"loading {file_path}") 7 | state_dict = torch.load(file_path, map_location="cpu") 8 | trainable_param_names = set(state_dict["trainable_param_names"]) 9 | if "module" in state_dict: 10 | state_dict = state_dict["module"] 11 | if "state_dict" in state_dict: 12 | state_dict = state_dict["state_dict"] 13 | state_dict_ = {} 14 | for name, param in state_dict.items(): 15 | if name.startswith("_forward_module."): 16 | name = name[len("_forward_module."):] 17 | if name.startswith("unet."): 18 | name = name[len("unet."):] 19 | if name in trainable_param_names: 20 | state_dict_[name] = param 21 | return state_dict_ 22 | 23 | 24 | def ckpt_to_epochs(ckpt_name): 25 | return int(ckpt_name.split("=")[1].split("-")[0]) 26 | 27 | 28 | def parse_args(): 29 | parser = argparse.ArgumentParser(description="Simple example of a training script.") 30 | parser.add_argument( 31 | "--output_path", 32 | type=str, 33 | default="./", 34 | help="Path to save the model.", 35 | ) 36 | parser.add_argument( 37 | "--gamma", 38 | type=float, 39 | default=0.9, 40 | help="Gamma in EMA.", 41 | ) 42 | args = parser.parse_args() 43 | return args 44 | 45 | 46 | if __name__ == '__main__': 47 | # args 48 | args = parse_args() 49 | folder = args.output_path 50 | gamma = args.gamma 51 | 52 | # EMA 53 | ckpt_list = sorted([(ckpt_to_epochs(ckpt_name), ckpt_name) for ckpt_name in os.listdir(folder) if os.path.isdir(f"{folder}/{ckpt_name}")]) 54 | state_dict_ema = None 55 | for epochs, ckpt_name in ckpt_list: 56 | state_dict = load_pl_state_dict(f"{folder}/{ckpt_name}/checkpoint/mp_rank_00_model_states.pt") 57 | if state_dict_ema is None: 58 | state_dict_ema = {name: param.float() for name, param in state_dict.items()} 59 | else: 60 | for name, param in state_dict.items(): 61 | state_dict_ema[name] = state_dict_ema[name] * gamma + param.float() * (1 - gamma) 62 | save_path = ckpt_name.replace(".ckpt", "-ema.safetensors") 63 | print(f"save to {folder}/{save_path}") 64 | save_file(state_dict_ema, f"{folder}/{save_path}") 65 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/ExVideo/ExVideo_svd_test.py: -------------------------------------------------------------------------------- 1 | from diffsynth import save_video, ModelManager, SVDVideoPipeline, HunyuanDiTImagePipeline 2 | from diffsynth import ModelManager 3 | import torch, os 4 | 5 | # The models will be downloaded automatically. 6 | # You can also use the following urls to download them manually. 
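# This script runs three steps: generate_image() creates a first frame with Hunyuan DiT,
# generate_video() extends it into a 128-frame 512x512 video with the ExVideo-extended
# SVD model, and upscale_video() re-renders that video at 1024x1024 with
# denoising_strength=0.5. The VRAM required by each step is noted near the bottom.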
7 | 8 | # Download models (from Huggingface) 9 | # Text-to-image model: 10 | # `models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/clip_text_encoder/pytorch_model.bin) 11 | # `models/HunyuanDiT/t2i/mt5/pytorch_model.bin`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/mt5/pytorch_model.bin) 12 | # `models/HunyuanDiT/t2i/model/pytorch_model_ema.pt`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/model/pytorch_model_ema.pt) 13 | # `models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin) 14 | # Stable Video Diffusion model: 15 | # `models/stable_video_diffusion/svd_xt.safetensors`: [link](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/resolve/main/svd_xt.safetensors) 16 | # ExVideo extension blocks: 17 | # `models/stable_video_diffusion/model.fp16.safetensors`: [link](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1/resolve/main/model.fp16.safetensors) 18 | 19 | # Download models (from Modelscope) 20 | # Text-to-image model: 21 | # `models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin`: [link](https://www.modelscope.cn/api/v1/models/modelscope/HunyuanDiT/repo?Revision=master&FilePath=t2i%2Fclip_text_encoder%2Fpytorch_model.bin) 22 | # `models/HunyuanDiT/t2i/mt5/pytorch_model.bin`: [link](https://www.modelscope.cn/api/v1/models/modelscope/HunyuanDiT/repo?Revision=master&FilePath=t2i%2Fmt5%2Fpytorch_model.bin) 23 | # `models/HunyuanDiT/t2i/model/pytorch_model_ema.pt`: [link](https://www.modelscope.cn/api/v1/models/modelscope/HunyuanDiT/repo?Revision=master&FilePath=t2i%2Fmodel%2Fpytorch_model_ema.pt) 24 | # `models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin`: [link](https://www.modelscope.cn/api/v1/models/modelscope/HunyuanDiT/repo?Revision=master&FilePath=t2i%2Fsdxl-vae-fp16-fix%2Fdiffusion_pytorch_model.bin) 25 | # Stable Video Diffusion model: 26 | # `models/stable_video_diffusion/svd_xt.safetensors`: [link](https://www.modelscope.cn/api/v1/models/AI-ModelScope/stable-video-diffusion-img2vid-xt/repo?Revision=master&FilePath=svd_xt.safetensors) 27 | # ExVideo extension blocks: 28 | # `models/stable_video_diffusion/model.fp16.safetensors`: [link](https://modelscope.cn/api/v1/models/ECNU-CILab/ExVideo-SVD-128f-v1/repo?Revision=master&FilePath=model.fp16.safetensors) 29 | 30 | 31 | def generate_image(): 32 | # Load models 33 | os.environ["TOKENIZERS_PARALLELISM"] = "True" 34 | model_manager = ModelManager(torch_dtype=torch.float16, device="cuda", model_id_list=["HunyuanDiT"]) 35 | pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager) 36 | 37 | # Generate an image 38 | torch.manual_seed(0) 39 | image = pipe( 40 | prompt="bonfire, on the stone", 41 | negative_prompt="错误的眼睛,糟糕的人脸,毁容,糟糕的艺术,变形,多余的肢体,模糊的颜色,模糊,重复,病态,残缺,", 42 | num_inference_steps=50, height=1024, width=1024, 43 | ) 44 | model_manager.to("cpu") 45 | return image 46 | 47 | 48 | def generate_video(image): 49 | # Load models 50 | model_manager = ModelManager(torch_dtype=torch.float16, device="cuda", model_id_list=["stable-video-diffusion-img2vid-xt", "ExVideo-SVD-128f-v1"]) 51 | pipe = SVDVideoPipeline.from_model_manager(model_manager) 52 | 53 | # Generate a video 54 | torch.manual_seed(1) 55 | video = pipe( 56 | input_image=image.resize((512, 512)), 57 | num_frames=128, fps=30, height=512, width=512, 58 | 
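# motion_bucket_id controls how much motion SVD adds (higher = more motion); 127 is the common default.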
motion_bucket_id=127, 59 | num_inference_steps=50, 60 | min_cfg_scale=2, max_cfg_scale=2, contrast_enhance_scale=1.2 61 | ) 62 | model_manager.to("cpu") 63 | return video 64 | 65 | 66 | def upscale_video(image, video): 67 | # Load models 68 | model_manager = ModelManager(torch_dtype=torch.float16, device="cuda", model_id_list=["stable-video-diffusion-img2vid-xt", "ExVideo-SVD-128f-v1"]) 69 | pipe = SVDVideoPipeline.from_model_manager(model_manager) 70 | 71 | # Generate a video 72 | torch.manual_seed(2) 73 | video = pipe( 74 | input_image=image.resize((1024, 1024)), 75 | input_video=[frame.resize((1024, 1024)) for frame in video], denoising_strength=0.5, 76 | num_frames=128, fps=30, height=1024, width=1024, 77 | motion_bucket_id=127, 78 | num_inference_steps=25, 79 | min_cfg_scale=2, max_cfg_scale=2, contrast_enhance_scale=1.2 80 | ) 81 | model_manager.to("cpu") 82 | return video 83 | 84 | 85 | # We use Hunyuan DiT to generate the first frame. 10GB VRAM is required. 86 | # If you want to use your own image, 87 | # please use `image = Image.open("your_image_file.png")` to replace the following code. 88 | image = generate_image() 89 | image.save("image.png") 90 | 91 | # Now, generate a video with resolution of 512. 20GB VRAM is required. 92 | video = generate_video(image) 93 | save_video(video, "video_512.mp4", fps=30) 94 | 95 | # Upscale the video. 52GB VRAM is required. 96 | video = upscale_video(image, video) 97 | save_video(video, "video_1024.mp4", fps=30) 98 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/ExVideo/README.md: -------------------------------------------------------------------------------- 1 | # ExVideo 2 | 3 | ExVideo is a post-tuning technique aimed at enhancing the capability of video generation models. We have extended Stable Video Diffusion to achieve the generation of long videos up to 128 frames. 4 | 5 | * [Project Page](https://ecnu-cilab.github.io/ExVideoProjectPage/) 6 | * [Technical report](https://arxiv.org/abs/2406.14130) 7 | * Extended models 8 | * [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1) 9 | * [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-SVD-128f-v1) 10 | 11 | ## Example: Text-to-video via extended Stable Video Diffusion 12 | 13 | Generate a video using a text-to-image model and our image-to-video model. See [ExVideo_svd_test.py](./ExVideo_svd_test.py). 14 | 15 | https://github.com/modelscope/DiffSynth-Studio/assets/35051019/d97f6aa9-8064-4b5b-9d49-ed6001bb9acc 16 | 17 | ## Train 18 | 19 | * Step 1: Install additional packages 20 | 21 | ``` 22 | pip install lightning deepspeed 23 | ``` 24 | 25 | * Step 2: Download base model (from [HuggingFace](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/resolve/main/svd_xt.safetensors) or [ModelScope](https://www.modelscope.cn/api/v1/models/AI-ModelScope/stable-video-diffusion-img2vid-xt/repo?Revision=master&FilePath=svd_xt.safetensors)) to `models/stable_video_diffusion/svd_xt.safetensors`. 
26 | 27 | * Step 3: Prepare datasets 28 | 29 | ``` 30 | path/to/your/dataset 31 | ├── metadata.json 32 | └── videos 33 | ├── video_1.mp4 34 | ├── video_2.mp4 35 | └── video_3.mp4 36 | ``` 37 | 38 | where the `metadata.json` is 39 | 40 | ``` 41 | [ 42 | { 43 | "path": "videos/video_1.mp4" 44 | }, 45 | { 46 | "path": "videos/video_2.mp4" 47 | }, 48 | { 49 | "path": "videos/video_3.mp4" 50 | } 51 | ] 52 | ``` 53 | 54 | * Step 4: Run 55 | 56 | ``` 57 | CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" python -u ExVideo_svd_train.py \ 58 | --pretrained_path "models/stable_video_diffusion/svd_xt.safetensors" \ 59 | --dataset_path "path/to/your/dataset" \ 60 | --output_path "path/to/save/models" \ 61 | --steps_per_epoch 8000 \ 62 | --num_frames 128 \ 63 | --height 512 \ 64 | --width 512 \ 65 | --dataloader_num_workers 2 \ 66 | --learning_rate 1e-5 \ 67 | --max_epochs 100 68 | ``` 69 | 70 | * Step 5: Post-process checkpoints 71 | 72 | Calculate the Exponential Moving Average (EMA) of the checkpoints and package it using `safetensors`. 73 | 74 | ``` 75 | python ExVideo_ema.py --output_path "path/to/save/models/lightning_logs/version_xx" --gamma 0.9 76 | ``` 77 | 78 | * Step 6: Enjoy your model 79 | 80 | The EMA model is at `path/to/save/models/lightning_logs/version_xx/checkpoints/epoch=xx-step=yyy-ema.safetensors`. Load it in [ExVideo_svd_test.py](./ExVideo_svd_test.py) and then enjoy your model. 81 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/Ip-Adapter/README.md: -------------------------------------------------------------------------------- 1 | # IP-Adapter 2 | 3 | The IP-Adapter features in DiffSynth Studio are not complete yet. Please wait for us. 4 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/Ip-Adapter/sdxl_ipadapter.py: -------------------------------------------------------------------------------- 1 | from diffsynth import ModelManager, SDXLImagePipeline 2 | import torch 3 | 4 | 5 | # Download models 6 | # `models/stable_diffusion_xl/sd_xl_base_1.0.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) 7 | # `models/IpAdapter/image_encoder/model.safetensors`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/image_encoder/model.safetensors) 8 | # `models/IpAdapter/ip-adapter_sdxl.bin`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/ip-adapter_sdxl.bin) 9 | 10 | # Load models 11 | model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") 12 | model_manager.load_models([ 13 | "models/stable_diffusion_xl/sd_xl_base_1.0.safetensors", 14 | "models/IpAdapter/image_encoder/model.safetensors", 15 | "models/IpAdapter/ip-adapter_sdxl.bin" 16 | ]) 17 | pipe = SDXLImagePipeline.from_model_manager(model_manager) 18 | pipe.ipadapter.set_less_adapter() 19 | 20 | torch.manual_seed(0) 21 | style_image = pipe( 22 | prompt="Starry Night, blue sky, by van Gogh", 23 | negative_prompt="dark, gray", 24 | cfg_scale=5, 25 | height=1024, width=1024, num_inference_steps=30, 26 | ) 27 | style_image.save("style_image.jpg") 28 | 29 | image = pipe( 30 | prompt="a cat", 31 | negative_prompt="", 32 | cfg_scale=5, 33 | height=1024, width=1024, num_inference_steps=30, 34 | ipadapter_images=[style_image] 35 | ) 36 | image.save("transferred_image.jpg") 37 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/diffsynth/README.md:
-------------------------------------------------------------------------------- 1 | # DiffSynth 2 | 3 | DiffSynth is the initial version of our video synthesis framework. In this framework, you can apply video deflickering algorithms to the latent space of diffusion models. You can refer to the [original repo](https://github.com/alibaba/EasyNLP/tree/master/diffusion/DiffSynth) for more details. 4 | 5 | We provide an example for video stylization. In this pipeline, the rendered video is completely different from the original video, thus we need a powerful deflickering algorithm. We use FastBlend to implement the deflickering module. Please see [`sd_video_rerender.py`](./sd_video_rerender.py). 6 | 7 | https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-4481-b79f-0c3a7361a1ea 8 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/diffsynth/sd_video_rerender.py: -------------------------------------------------------------------------------- 1 | from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video 2 | from diffsynth.processors.FastBlend import FastBlendSmoother 3 | from diffsynth.processors.PILEditor import ContrastEditor, SharpnessEditor 4 | from diffsynth.processors.sequencial_processor import SequencialProcessor 5 | import torch 6 | 7 | 8 | # Download models 9 | # `models/stable_diffusion/dreamshaper_8.safetensors`: [link](https://civitai.com/api/download/models/128713?type=Model&format=SafeTensor&size=pruned&fp=fp16) 10 | # `models/ControlNet/control_v11f1p_sd15_depth.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1p_sd15_depth.pth) 11 | # `models/ControlNet/control_v11p_sd15_softedge.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_softedge.pth) 12 | # `models/Annotators/dpt_hybrid-midas-501f0c75.pt`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/dpt_hybrid-midas-501f0c75.pt) 13 | # `models/Annotators/ControlNetHED.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/ControlNetHED.pth) 14 | 15 | # Load models 16 | model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") 17 | model_manager.load_models([ 18 | "models/stable_diffusion/dreamshaper_8.safetensors", 19 | "models/ControlNet/control_v11f1p_sd15_depth.pth", 20 | "models/ControlNet/control_v11p_sd15_softedge.pth" 21 | ]) 22 | pipe = SDVideoPipeline.from_model_manager( 23 | model_manager, 24 | [ 25 | ControlNetConfigUnit( 26 | processor_id="depth", 27 | model_path=rf"models/ControlNet/control_v11f1p_sd15_depth.pth", 28 | scale=0.5 29 | ), 30 | ControlNetConfigUnit( 31 | processor_id="softedge", 32 | model_path=rf"models/ControlNet/control_v11p_sd15_softedge.pth", 33 | scale=0.5 34 | ) 35 | ] 36 | ) 37 | smoother = SequencialProcessor([FastBlendSmoother(), ContrastEditor(rate=1.1), SharpnessEditor(rate=1.1)]) 38 | 39 | # Load video 40 | # Original video: https://pixabay.com/videos/flow-rocks-water-fluent-stones-159627/ 41 | video = VideoData(video_file="data/pixabay100/159627 (1080p).mp4", height=512, width=768) 42 | input_video = [video[i] for i in range(128)] 43 | 44 | # Rerender 45 | torch.manual_seed(0) 46 | output_video = pipe( 47 | prompt="winter, ice, snow, water, river", 48 | negative_prompt="", cfg_scale=7, 49 | input_frames=input_video, controlnet_frames=input_video, num_frames=len(input_video), 50 | num_inference_steps=20, height=512, width=768, 51 | 
animatediff_batch_size=8, animatediff_stride=4, unet_batch_size=8, 52 | cross_frame_attention=True, 53 | smoother=smoother, smoother_progress_ids=[4, 9, 14, 19] 54 | ) 55 | 56 | # Save video 57 | save_video(output_video, "output_video.mp4", fps=30) 58 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/image_synthesis/README.md: -------------------------------------------------------------------------------- 1 | # Image Synthesis 2 | 3 | Image synthesis is the core feature of DiffSynth Studio. 4 | 5 | ### Example: Stable Diffusion 6 | 7 | We can generate images at very high resolution. Please see [`sd_text_to_image.py`](./sd_text_to_image.py) for more details. 8 | 9 | |512*512|1024*1024|2048*2048|4096*4096| 10 | |-|-|-|-| 11 | |![512](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/55f679e9-7445-4605-9315-302e93d11370)|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/6fc84611-8da6-4a1f-8fee-9a34eba3b4a5)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/9087a73c-9164-4c58-b2a0-effc694143fb)|![4096](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/edee9e71-fc39-4d1c-9ca9-fa52002c67ac)| 12 | 13 | ### Example: Stable Diffusion XL 14 | 15 | Generate images with Stable Diffusion XL. Please see [`sdxl_text_to_image.py`](./sdxl_text_to_image.py) for more details. 16 | 17 | |1024*1024|2048*2048| 18 | |-|-| 19 | |![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/67687748-e738-438c-aee5-96096f09ac90)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/584186bc-9855-4140-878e-99541f9a757f)| 20 | 21 | ### Example: Stable Diffusion XL Turbo 22 | 23 | Generate images with Stable Diffusion XL Turbo. You can see [`sdxl_turbo.py`](./sdxl_turbo.py) for more details, but we highly recommend using it in the WebUI. 24 | 25 | |"black car"|"red car"| 26 | |-|-| 27 | |![black_car](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/7fbfd803-68d4-44f3-8713-8c925fec47d0)|![black_car_to_red_car](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/aaf886e4-c33c-4fd8-98e2-29eef117ba00)| 28 | 29 | ### Example: Prompt Processing 30 | 31 | If you are not a native English speaker, we provide a translation service for you. Our prompter can translate prompts in other languages into English and refine them using "BeautifulPrompt" models. Please see [`sd_prompt_refining.py`](./sd_prompt_refining.py) for more details. 32 | 33 | Prompt: "一个漂亮的女孩". The [translation model](https://huggingface.co/Helsinki-NLP/opus-mt-zh-en) will translate it to English. 34 | 35 | |seed=0|seed=1|seed=2|seed=3| 36 | |-|-|-|-| 37 | |![0_](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/ebb25ca8-7ce1-4d9e-8081-59a867c70c4d)|![1_](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/a7e79853-3c1a-471a-9c58-c209ec4b76dd)|![2_](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/a292b959-a121-481f-b79c-61cc3346f810)|![3_](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/1c19b54e-5a6f-4d48-960b-a7b2b149bb4c)| 38 | 39 | Prompt: "一个漂亮的女孩". The [translation model](https://huggingface.co/Helsinki-NLP/opus-mt-zh-en) will translate it to English. Then the [refining model](https://huggingface.co/alibaba-pai/pai-bloom-1b1-text2prompt-sd) will refine the translated prompt for better visual quality.
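The same two-stage prompt processing can be reproduced directly with `transformers` outside the DiffSynth prompter. Below is a minimal sketch; the BeautifulPrompt instruction template in the second stage is an assumption (check the model card for the exact format), and DiffSynth wires these models in automatically when they are loaded into the `ModelManager`.

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM

# Stage 1: translate the non-English prompt into English with the Marian translator.
trans_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
trans_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
inputs = trans_tokenizer("一个漂亮的女孩", return_tensors="pt")
english_prompt = trans_tokenizer.decode(
    trans_model.generate(**inputs, max_new_tokens=64)[0], skip_special_tokens=True
)

# Stage 2: expand the short English prompt with BeautifulPrompt (a BLOOM-based causal LM).
# The instruction template below is an assumption about the expected input format.
bp_tokenizer = AutoTokenizer.from_pretrained("alibaba-pai/pai-bloom-1b1-text2prompt-sd")
bp_model = AutoModelForCausalLM.from_pretrained("alibaba-pai/pai-bloom-1b1-text2prompt-sd")
template = f"Instruction: Give a simple description of the image to generate a detailed prompt: {english_prompt}\nOutput:"
input_ids = bp_tokenizer(template, return_tensors="pt").input_ids
refined_prompt = bp_tokenizer.decode(
    bp_model.generate(input_ids, max_new_tokens=128, do_sample=True)[0][input_ids.shape[1]:],
    skip_special_tokens=True,
)
print(refined_prompt)  # a refined English prompt ready for the Stable Diffusion pipeline
```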
40 | 41 | |seed=0|seed=1|seed=2|seed=3| 42 | |-|-|-|-| 43 | |![0](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/778b1bd9-44e0-46ac-a99c-712b3fc9aaa4)|![1](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/c03479b8-2082-4c6e-8e1c-3582b98686f6)|![2](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/edb33d21-3288-4a55-96ca-a4bfe1b50b00)|![3](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/7848cfc1-cad5-4848-8373-41d24e98e584)| 44 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/image_synthesis/sd_prompt_refining.py: -------------------------------------------------------------------------------- 1 | from diffsynth import ModelManager, SDXLImagePipeline 2 | import torch 3 | 4 | 5 | # Download models 6 | # `models/stable_diffusion_xl/sd_xl_base_1.0.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) 7 | # `models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd/`: [link](https://huggingface.co/alibaba-pai/pai-bloom-1b1-text2prompt-sd) 8 | # `models/translator/opus-mt-zh-en/`: [link](https://huggingface.co/Helsinki-NLP/opus-mt-zh-en) 9 | 10 | 11 | # Load models 12 | model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") 13 | model_manager.load_textual_inversions("models/textual_inversion") 14 | model_manager.load_models([ 15 | "models/stable_diffusion_xl/sd_xl_base_1.0.safetensors", 16 | "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd/model.safetensors", 17 | "models/translator/opus-mt-zh-en/pytorch_model.bin" 18 | ]) 19 | pipe = SDXLImagePipeline.from_model_manager(model_manager) 20 | 21 | prompt = "一个漂亮的女孩" 22 | negative_prompt = "" 23 | 24 | for seed in range(4): 25 | torch.manual_seed(seed) 26 | image = pipe( 27 | prompt=prompt, negative_prompt=negative_prompt, 28 | height=1024, width=1024, 29 | num_inference_steps=30 30 | ) 31 | image.save(f"{seed}.jpg") 32 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/image_synthesis/sd_text_to_image.py: -------------------------------------------------------------------------------- 1 | from diffsynth import ModelManager, SDImagePipeline, ControlNetConfigUnit 2 | import torch 3 | 4 | 5 | # Download models 6 | # `models/stable_diffusion/aingdiffusion_v12.safetensors`: [link](https://civitai.com/api/download/models/229575?type=Model&format=SafeTensor&size=full&fp=fp16) 7 | # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth) 8 | # `models/ControlNet/control_v11f1e_sd15_tile.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1e_sd15_tile.pth) 9 | # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth) 10 | # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth) 11 | 12 | 13 | # Load models 14 | model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") 15 | model_manager.load_textual_inversions("models/textual_inversion") 16 | model_manager.load_models([ 17 | "models/stable_diffusion/aingdiffusion_v12.safetensors", 18 | "models/ControlNet/control_v11f1e_sd15_tile.pth", 19 | "models/ControlNet/control_v11p_sd15_lineart.pth" 20 | ]) 21 | pipe = SDImagePipeline.from_model_manager( 22 | model_manager, 23 | [ 24 |
ControlNetConfigUnit( 25 | processor_id="tile", 26 | model_path=rf"models/ControlNet/control_v11f1e_sd15_tile.pth", 27 | scale=0.5 28 | ), 29 | ControlNetConfigUnit( 30 | processor_id="lineart", 31 | model_path=rf"models/ControlNet/control_v11p_sd15_lineart.pth", 32 | scale=0.7 33 | ), 34 | ] 35 | ) 36 | 37 | prompt = "masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait," 38 | negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw," 39 | 40 | torch.manual_seed(0) 41 | image = pipe( 42 | prompt=prompt, 43 | negative_prompt=negative_prompt, 44 | cfg_scale=7.5, clip_skip=1, 45 | height=512, width=512, num_inference_steps=80, 46 | ) 47 | image.save("512.jpg") 48 | 49 | image = pipe( 50 | prompt=prompt, 51 | negative_prompt=negative_prompt, 52 | cfg_scale=7.5, clip_skip=1, 53 | input_image=image.resize((1024, 1024)), controlnet_image=image.resize((1024, 1024)), 54 | height=1024, width=1024, num_inference_steps=40, denoising_strength=0.7, 55 | ) 56 | image.save("1024.jpg") 57 | 58 | image = pipe( 59 | prompt=prompt, 60 | negative_prompt=negative_prompt, 61 | cfg_scale=7.5, clip_skip=1, 62 | input_image=image.resize((2048, 2048)), controlnet_image=image.resize((2048, 2048)), 63 | height=2048, width=2048, num_inference_steps=20, denoising_strength=0.7, 64 | ) 65 | image.save("2048.jpg") 66 | 67 | image = pipe( 68 | prompt=prompt, 69 | negative_prompt=negative_prompt, 70 | cfg_scale=7.5, clip_skip=1, 71 | input_image=image.resize((4096, 4096)), controlnet_image=image.resize((4096, 4096)), 72 | height=4096, width=4096, num_inference_steps=10, denoising_strength=0.5, 73 | tiled=True, tile_size=128, tile_stride=64 74 | ) 75 | image.save("4096.jpg") 76 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/image_synthesis/sdxl_text_to_image.py: -------------------------------------------------------------------------------- 1 | from diffsynth import ModelManager, SDXLImagePipeline 2 | import torch 3 | 4 | 5 | # Download models 6 | # `models/stable_diffusion_xl/bluePencilXL_v200.safetensors`: [link](https://civitai.com/api/download/models/245614?type=Model&format=SafeTensor&size=pruned&fp=fp16) 7 | 8 | 9 | # Load models 10 | model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") 11 | model_manager.load_models(["models/stable_diffusion_xl/bluePencilXL_v200.safetensors"]) 12 | pipe = SDXLImagePipeline.from_model_manager(model_manager) 13 | 14 | prompt = "masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait," 15 | negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw," 16 | 17 | torch.manual_seed(0) 18 | image = pipe( 19 | prompt=prompt, 20 | negative_prompt=negative_prompt, 21 | cfg_scale=6, 22 | height=1024, width=1024, num_inference_steps=60, 23 | ) 24 | image.save("1024.jpg") 25 | 26 | image = pipe( 27 | prompt=prompt, 28 | negative_prompt=negative_prompt, 29 | cfg_scale=6, 30 | input_image=image.resize((2048, 2048)), 31 | height=2048, width=2048, num_inference_steps=60, denoising_strength=0.5 32 | ) 33 | image.save("2048.jpg") 34 | 35 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/image_synthesis/sdxl_turbo.py: 
-------------------------------------------------------------------------------- 1 | from diffsynth import ModelManager, SDXLImagePipeline 2 | import torch 3 | 4 | 5 | # Download models 6 | # `models/stable_diffusion_xl_turbo/sd_xl_turbo_1.0_fp16.safetensors`: [link](https://huggingface.co/stabilityai/sdxl-turbo/resolve/main/sd_xl_turbo_1.0_fp16.safetensors) 7 | 8 | 9 | # Load models 10 | model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") 11 | model_manager.load_models(["models/stable_diffusion_xl_turbo/sd_xl_turbo_1.0_fp16.safetensors"]) 12 | pipe = SDXLImagePipeline.from_model_manager(model_manager) 13 | 14 | # Text to image 15 | torch.manual_seed(0) 16 | image = pipe( 17 | prompt="black car", 18 | # Do not modify the following parameters! 19 | cfg_scale=1, height=512, width=512, num_inference_steps=1, progress_bar_cmd=lambda x:x 20 | ) 21 | image.save("black_car.jpg") 22 | 23 | # Image to image 24 | torch.manual_seed(0) 25 | image = pipe( 26 | prompt="red car", 27 | input_image=image, denoising_strength=0.7, 28 | # Do not modify the following parameters! 29 | cfg_scale=1, height=512, width=512, num_inference_steps=1, progress_bar_cmd=lambda x:x 30 | ) 31 | image.save("black_car_to_red_car.jpg") 32 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/video_synthesis/README.md: -------------------------------------------------------------------------------- 1 | # Text to Video 2 | 3 | In DiffSynth Studio, we can use AnimateDiff and SVD to generate videos. However, these models usually generate poor content, so we do not recommend using them until a more powerful video model emerges. 4 | 5 | ### Example: Text to Video 6 | 7 | Generate a video using a Stable Diffusion model and an AnimateDiff model. We can break the limit on the number of frames! See [sd_text_to_video.py](./sd_text_to_video.py). 8 | 9 | https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/8f556355-4079-4445-9b48-e9da77699437 10 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/video_synthesis/sd_text_to_video.py: -------------------------------------------------------------------------------- 1 | from diffsynth import ModelManager, SDImagePipeline, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video, save_frames 2 | from diffsynth.extensions.RIFE import RIFEInterpolater 3 | import torch 4 | 5 | 6 | # Download models 7 | # `models/stable_diffusion/dreamshaper_8.safetensors`: [link](https://civitai.com/api/download/models/128713?type=Model&format=SafeTensor&size=pruned&fp=fp16) 8 | # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt) 9 | # `models/RIFE/flownet.pkl`: [link](https://drive.google.com/file/d/1APIzVeI-4ZZCEuIRE1m6WYfSCaOsi_7_/view?usp=sharing) 10 | 11 | 12 | # Load models 13 | model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") 14 | model_manager.load_models([ 15 | "models/stable_diffusion/dreamshaper_8.safetensors", 16 | "models/AnimateDiff/mm_sd_v15_v2.ckpt", 17 | "models/RIFE/flownet.pkl" 18 | ]) 19 | 20 | # Text -> Image 21 | pipe_image = SDImagePipeline.from_model_manager(model_manager) 22 | torch.manual_seed(0) 23 | image = pipe_image( 24 | prompt = "lightning storm, sea", 25 | negative_prompt = "", 26 | cfg_scale=7.5, 27 | num_inference_steps=30, height=512, width=768, 28 | ) 29 | 30 | # Text + Image -> Video (6GB VRAM is enough!)
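# (Note on the call below: AnimateDiff is applied over sliding windows of
# `animatediff_batch_size` frames that advance by `animatediff_stride`, so the clip
# length is not limited to the 16 frames the motion module was trained on.)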
31 | pipe = SDVideoPipeline.from_model_manager(model_manager) 32 | output_video = pipe( 33 | prompt = "lightning storm, sea", 34 | negative_prompt = "", 35 | cfg_scale=7.5, 36 | num_frames=64, 37 | num_inference_steps=10, height=512, width=768, 38 | animatediff_batch_size=16, animatediff_stride=1, input_frames=[image]*64, denoising_strength=0.9, 39 | vram_limit_level=0, 40 | ) 41 | 42 | # Video -> Video with high fps 43 | interpolater = RIFEInterpolater.from_model_manager(model_manager) 44 | output_video = interpolater.interpolate(output_video, num_iter=3) 45 | 46 | # Save images and video 47 | save_video(output_video, "output_video.mp4", fps=120) 48 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/video_synthesis/sdxl_text_to_video.py: -------------------------------------------------------------------------------- 1 | from diffsynth import ModelManager, SDXLVideoPipeline, save_video 2 | import torch 3 | 4 | 5 | # Download models 6 | # `models/stable_diffusion_xl/sd_xl_base_1.0.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) 7 | # `models/AnimateDiff/mm_sdxl_v10_beta.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sdxl_v10_beta.ckpt) 8 | 9 | 10 | model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") 11 | model_manager.load_models([ 12 | "models/stable_diffusion_xl/sd_xl_base_1.0.safetensors", 13 | "models/AnimateDiff/mm_sdxl_v10_beta.ckpt" 14 | ]) 15 | pipe = SDXLVideoPipeline.from_model_manager(model_manager) 16 | 17 | prompt = "A panda standing on a surfboard in the ocean in sunset, 4k, high resolution.Realistic, Cinematic, high resolution" 18 | negative_prompt = "" 19 | 20 | torch.manual_seed(0) 21 | video = pipe( 22 | prompt=prompt, 23 | negative_prompt=negative_prompt, 24 | cfg_scale=8.5, 25 | height=1024, width=1024, num_frames=16, 26 | num_inference_steps=100, 27 | ) 28 | save_video(video, "video.mp4", fps=16) 29 | -------------------------------------------------------------------------------- /DiffSynth-Studio/examples/video_synthesis/svd_text_to_video.py: -------------------------------------------------------------------------------- 1 | from diffsynth import save_video, SDXLImagePipeline, ModelManager, SVDVideoPipeline 2 | from diffsynth import ModelManager 3 | import torch 4 | 5 | 6 | # Download models 7 | # `models/stable_diffusion_xl/sd_xl_base_1.0.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) 8 | # `models/stable_video_diffusion/svd_xt.safetensors`: [link](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/resolve/main/svd_xt.safetensors) 9 | 10 | 11 | prompt = "cloud, wind" 12 | torch.manual_seed(0) 13 | 14 | # 1. Text-to-image using SD-XL 15 | model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") 16 | model_manager.load_models(["models/stable_diffusion_xl/sd_xl_base_1.0.safetensors"]) 17 | pipe = SDXLImagePipeline.from_model_manager(model_manager) 18 | image = pipe( 19 | prompt=prompt, 20 | negative_prompt="", 21 | cfg_scale=6, 22 | height=1024, width=1024, num_inference_steps=50, 23 | ) 24 | pipe.to("cpu") 25 | torch.cuda.empty_cache() 26 | 27 | # 2. 
Image-to-video using SVD 28 | model_manager = ModelManager() 29 | model_manager.load_models(["models/stable_video_diffusion/svd_xt.safetensors"]) 30 | pipe = SVDVideoPipeline.from_model_manager(model_manager) 31 | video = pipe( 32 | input_image=image, 33 | num_frames=25, fps=15, height=1024, width=1024, 34 | motion_bucket_id=127, 35 | num_inference_steps=50 36 | ) 37 | save_video(video, "video.mp4", fps=15) 38 | -------------------------------------------------------------------------------- /DiffSynth-Studio/models/AnimateDiff/Put AnimateDiff ckpt files here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/models/AnimateDiff/Put AnimateDiff ckpt files here.txt -------------------------------------------------------------------------------- /DiffSynth-Studio/models/Annotators/Put ControlNet annotators here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/models/Annotators/Put ControlNet annotators here.txt -------------------------------------------------------------------------------- /DiffSynth-Studio/models/BeautifulPrompt/Put BeautifulPrompt models here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/models/BeautifulPrompt/Put BeautifulPrompt models here.txt -------------------------------------------------------------------------------- /DiffSynth-Studio/models/ControlNet/Put ControlNet pth files here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/models/ControlNet/Put ControlNet pth files here.txt -------------------------------------------------------------------------------- /DiffSynth-Studio/models/HunyuanDiT/Put Hunyuan DiT checkpoints here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/models/HunyuanDiT/Put Hunyuan DiT checkpoints here.txt -------------------------------------------------------------------------------- /DiffSynth-Studio/models/IpAdapter/Put IP-Adapter checkpoints here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/models/IpAdapter/Put IP-Adapter checkpoints here.txt -------------------------------------------------------------------------------- /DiffSynth-Studio/models/RIFE/Put RIFE models here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/models/RIFE/Put RIFE models here.txt -------------------------------------------------------------------------------- /DiffSynth-Studio/models/lora/Put lora files here.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/models/lora/Put lora files here.txt -------------------------------------------------------------------------------- /DiffSynth-Studio/models/stable_diffusion/Put Stable Diffusion checkpoints here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/models/stable_diffusion/Put Stable Diffusion checkpoints here.txt -------------------------------------------------------------------------------- /DiffSynth-Studio/models/stable_diffusion_xl/Put Stable Diffusion XL checkpoints here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/models/stable_diffusion_xl/Put Stable Diffusion XL checkpoints here.txt -------------------------------------------------------------------------------- /DiffSynth-Studio/models/stable_diffusion_xl_turbo/Put Stable Diffusion XL Turbo checkpoints here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/models/stable_diffusion_xl_turbo/Put Stable Diffusion XL Turbo checkpoints here.txt -------------------------------------------------------------------------------- /DiffSynth-Studio/models/stable_video_diffusion/Put Stable Video Diffusion checkpoints here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/models/stable_video_diffusion/Put Stable Video Diffusion checkpoints here.txt -------------------------------------------------------------------------------- /DiffSynth-Studio/models/textual_inversion/Put Textual Inversion files here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/models/textual_inversion/Put Textual Inversion files here.txt -------------------------------------------------------------------------------- /DiffSynth-Studio/models/translator/Put translator models here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philz1337x/video-style-transfer/f124bd28360b958ed72df0481fbd625d24667a25/DiffSynth-Studio/models/translator/Put translator models here.txt -------------------------------------------------------------------------------- /DiffSynth-Studio/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=2.0.0 2 | cupy-cuda12x 3 | pip 4 | transformers 5 | controlnet-aux==0.0.7 6 | streamlit 7 | streamlit-drawable-canvas 8 | imageio 9 | imageio[ffmpeg] 10 | safetensors 11 | einops 12 | sentencepiece 13 | -------------------------------------------------------------------------------- /DiffSynth-Studio/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pkg_resources 3 | from setuptools import setup, find_packages 4 | 5 | 6 | setup( 7 | name="diffsynth", 8 | 
py_modules=["diffsynth"], 9 | version="1.0.0", 10 | description="", 11 | author="Artiprocher", 12 | packages=find_packages(), 13 | install_requires=[ 14 | str(r) 15 | for r in pkg_resources.parse_requirements( 16 | open(os.path.join(os.path.dirname(__file__), "requirements.txt")) 17 | ) 18 | ], 19 | include_package_data=True 20 | ) 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

Video-Style-Transfer

4 | 5 | [![App](https://img.shields.io/badge/App-ClarityAI.co-blueviolet)](https://ClarityAI.co) 6 | 7 | [![Replicate](https://img.shields.io/badge/Demo-Replicate-purple)](https://replicate.com/philz1337x/video-style-transfer) 8 | 9 | [![GitHub Repo](https://img.shields.io/badge/GitHub-video--style--transfer-blue?logo=github)](https://github.com/philz1337x/video-style-transfer) 10 | 11 | [![Twitter Follow](https://img.shields.io/twitter/follow/philz1337x?style=social)](https://twitter.com/philz1337x) 12 | ![GitHub stars](https://img.shields.io/github/stars/philz1337x/video-style-transfer?style=social&label=Star) 13 | 14 |
15 | 16 | # 👋 Hello 17 | 18 | I build open-source AI apps. To finance my work, I also build paid versions of my code, but feel free to use the free code. I post features and new projects on https://twitter.com/philz1337x 19 | 20 | # 🗞️ Updates 21 | 22 | - 07/23/2024: Code release 23 | 24 | # 🚀 Options to use Video-Style-Transfer 25 | 26 | ## 🧑‍💻 App 27 | 28 | The simplest option is to use it with my app at [ClarityAI.co](https://ClarityAI.co) 29 | 30 | ## API: Run on Replicate 31 | 32 | Use my public Replicate model at [replicate.com/philz1337x/video-style-transfer](https://replicate.com/philz1337x/video-style-transfer) 33 | 34 | ## Advanced: Deploy and run with cog (locally or in the cloud) 35 | 36 | If you are not familiar with cog, read the [cog docs](https://github.com/replicate/cog) 37 | 38 | - Run `download-weights.py` to download the model weights 39 | 40 | - Predict with cog: 41 | 42 | ```sh 43 | cog predict -i video="link-to-video" 44 | ``` 45 | -------------------------------------------------------------------------------- /analyse.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | 3 | def get_video_info(video_path): 4 | cap = cv2.VideoCapture(video_path) 5 | total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 6 | fps = int(cap.get(cv2.CAP_PROP_FPS)) 7 | duration_seconds = total_frames / fps 8 | duration_minutes = duration_seconds / 60 9 | duration_hours = duration_minutes / 60 10 | cap.release() 11 | return total_frames, fps, duration_seconds, duration_minutes, duration_hours 12 | 13 | video_path = 'a.mp4' # Specify the path to your MP4 file here 14 | frame_count, fps, duration_seconds, duration_minutes, duration_hours = get_video_info(video_path) 15 | 16 | print(f"The video has {frame_count} frames.") 17 | print(f"The video's frame rate is {fps} FPS.") 18 | print(f"The total duration of the video is {duration_seconds:.2f} seconds.") 19 | print(f"That corresponds to {duration_minutes:.2f} minutes or {duration_hours:.2f} hours.") 20 | -------------------------------------------------------------------------------- /cog.yaml: -------------------------------------------------------------------------------- 1 | build: 2 | gpu: true 3 | system_packages: 4 | - "libgl1-mesa-glx" 5 | - "libglib2.0-0" 6 | python_version: "3.10.4" 7 | python_packages: 8 | - "torch==2.0.1" 9 | - "torchvision" 10 | - "opencv-contrib-python==4.10.0.84" 11 | - "imageio==2.34.2" 12 | - "imageio-ffmpeg==0.5.1" 13 | - "tqdm==4.66.4" 14 | - "safetensors==0.4.3" 15 | - "huggingface-hub==0.23.4" 16 | - "einops==0.8.0" 17 | - "transformers==4.41.2" 18 | - "controlnet-aux==0.0.9" 19 | - "numpy==1.26.4" 20 | - "moviepy==1.0.3" 21 | predict: "predict.py:Predictor" 22 | -------------------------------------------------------------------------------- /download-weights.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import shutil 4 | 5 | def download_file(url, folder_path, filename, auth=None): 6 | if not os.path.exists(folder_path): 7 | os.makedirs(folder_path) 8 | file_path = os.path.join(folder_path, filename) 9 | 10 | if os.path.isfile(file_path): 11 | print(f"File already exists: {file_path}") 12 | else: 13 | headers = {} 14 | if auth: 15 | headers['Authorization'] = f'token {auth}' # Insert a token here if needed 16 | 17 | try: 18 | response = requests.get(url, headers=headers, stream=True) 19 | if response.status_code == 200: 20 | with open(file_path, 'wb') as file: 21 | for chunk in response.iter_content(chunk_size=1024): 22 | file.write(chunk) 23 | print(f"File successfully downloaded and saved: {file_path}")
24 | else: 25 | print(f"Error downloading the file. Status code: {response.status_code}") 26 | except requests.exceptions.RequestException as e: 27 | print(f"Error downloading the file: {e}") 28 | 29 | # Download models 30 | download_file( 31 | "https://civitai.com/api/download/models/266360?type=Model&format=SafeTensor&size=pruned&fp=fp16", 32 | "models/stable_diffusion", 33 | "flat2DAnimerge_v45Sharp.safetensors" 34 | ) 35 | 36 | download_file( 37 | "https://huggingface.co/philz1337x/rv60b1/resolve/main/realisticVisionV60B1_v60B1VAE.safetensors?download=true", 38 | "models/stable_diffusion", 39 | "realisticVisionV60B1_v60B1VAE.safetensors" 40 | ) 41 | 42 | download_file( 43 | "https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt", 44 | "models/AnimateDiff", 45 | "mm_sd_v15_v2.ckpt" 46 | ) 47 | 48 | download_file( 49 | "https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth", 50 | "models/ControlNet", 51 | "control_v11p_sd15_lineart.pth" 52 | ) 53 | 54 | download_file( 55 | "https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1e_sd15_tile.pth", 56 | "models/ControlNet", 57 | "control_v11f1e_sd15_tile.pth" 58 | ) 59 | 60 | download_file( 61 | "https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth", 62 | "models/Annotators", 63 | "sk_model.pth" 64 | ) 65 | 66 | download_file( 67 | "https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth", 68 | "models/Annotators", 69 | "sk_model2.pth" 70 | ) 71 | 72 | download_file( 73 | "https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16", 74 | "models/textual_inversion", 75 | "verybadimagenegative_v1.3.pt" 76 | ) 77 | 78 | download_file( 79 | "https://huggingface.co/philz1337x/test/resolve/main/flownet.pkl?download=true", 80 | "models/RIFE", 81 | "flownet.pkl" 82 | ) 83 | 84 | -------------------------------------------------------------------------------- /sd_toon_shading.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, 'DiffSynth-Studio') 3 | 4 | import cv2 5 | 6 | from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video 7 | from diffsynth.extensions.RIFE import RIFESmoother 8 | import torch 9 | 10 | # Download models 11 | # `models/stable_diffusion/flat2DAnimerge_v45Sharp.safetensors`: [link](https://civitai.com/api/download/models/266360?type=Model&format=SafeTensor&size=pruned&fp=fp16) 12 | # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt) 13 | # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth) 14 | # `models/ControlNet/control_v11f1e_sd15_tile.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1e_sd15_tile.pth) 15 | # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth) 16 | # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth) 17 | # `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16) 18 | # `models/RIFE/flownet.pkl`: [link](https://drive.google.com/file/d/1APIzVeI-4ZZCEuIRE1m6WYfSCaOsi_7_/view?usp=sharing) 19 | 20 | # Load models 21 | 
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") 22 | model_manager.load_textual_inversions("models/textual_inversion") 23 | model_manager.load_models([ 24 | "models/stable_diffusion/flat2DAnimerge_v45Sharp.safetensors", 25 | "models/AnimateDiff/mm_sd_v15_v2.ckpt", 26 | "models/ControlNet/control_v11p_sd15_lineart.pth", 27 | "models/ControlNet/control_v11f1e_sd15_tile.pth", 28 | "models/RIFE/flownet.pkl" 29 | ]) 30 | pipe = SDVideoPipeline.from_model_manager( 31 | model_manager, 32 | [ 33 | ControlNetConfigUnit( 34 | processor_id="lineart", 35 | model_path="models/ControlNet/control_v11p_sd15_lineart.pth", 36 | scale=0.5 37 | ), 38 | ControlNetConfigUnit( 39 | processor_id="tile", 40 | model_path="models/ControlNet/control_v11f1e_sd15_tile.pth", 41 | scale=0.5 42 | ) 43 | ] 44 | ) 45 | smoother = RIFESmoother.from_model_manager(model_manager) 46 | 47 | # Load video 48 | def count_frames(video_path): 49 | cap = cv2.VideoCapture(video_path) 50 | total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 51 | cap.release() 52 | return total_frames 53 | 54 | def get_framerate(video_path): 55 | cap = cv2.VideoCapture(video_path) 56 | fps = int(cap.get(cv2.CAP_PROP_FPS)) 57 | cap.release() 58 | return fps 59 | 60 | def load_video(video_file, input_framerate, start_frame=None, end_frame=None, target_fps=None): 61 | video = VideoData( 62 | video_file=video_file, 63 | height=1024, width=1024 64 | ) 65 | 66 | # Calculate frame range 67 | start_frame = start_frame or 0 68 | end_frame = end_frame or len(video) 69 | target_fps = target_fps or input_framerate # Use video's FPS if not specified 70 | 71 | frame_rate = get_framerate(video_file) 72 | # Select frames based on start_frame and end_frame 73 | selected_frames = [] 74 | for i in range(start_frame, min(end_frame, len(video))): 75 | if i % max(1, frame_rate // target_fps) == 0: # max(1, ...) avoids a zero step if target_fps exceeds the source frame rate 76 | selected_frames.append(video[i]) 77 | 78 | if not selected_frames: 79 | raise ValueError("No frames selected. Check start_frame, end_frame, and target_fps settings.") 80 | 81 | return selected_frames 82 | 83 | video_path = 'a.mp4' # Specify the path to your MP4 file here 84 | frame_count = count_frames(video_path) 85 | input_framerate = get_framerate(video_path) 86 | print(f"The video has {frame_count} frames.") 87 | 88 | # Load video with optional frame range and target FPS 89 | input_video = load_video(video_path, input_framerate, start_frame=1, end_frame=60) 90 | 91 | if input_video is None or len(input_video) == 0: 92 | raise ValueError("Input video data is empty or not initialized.") 93 | 94 | # Toon shading (20GB VRAM) 95 | torch.manual_seed(0) 96 | output_video = pipe( 97 | prompt="best quality, perfect anime illustration, light, a girl is dancing, smile, solo", 98 | negative_prompt="verybadimagenegative_v1.3", 99 | cfg_scale=3, clip_skip=2, 100 | controlnet_frames=input_video, num_frames=len(input_video), 101 | num_inference_steps=10, height=1024, width=1024, 102 | animatediff_batch_size=32, animatediff_stride=16, 103 | vram_limit_level=0, 104 | ) 105 | output_video = smoother(output_video) 106 | 107 | # Save video 108 | save_video(output_video, "output_video.mp4", fps=input_framerate) 109 | --------------------------------------------------------------------------------
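`cog.yaml` above points at `predict.py:Predictor`, but `predict.py` itself is not part of this listing. Below is a minimal sketch of what such a predictor could look like, assuming it simply copies the uploaded clip to the `a.mp4` path that `sd_toon_shading.py` reads and then runs that script; this is an assumption for illustration, not the repository's actual implementation.

```python
# predict.py (hypothetical sketch; not the actual file from this repository)
import shutil
import subprocess
from cog import BasePredictor, Input, Path


class Predictor(BasePredictor):
    def setup(self):
        # Model weights are expected to be present already (see download-weights.py).
        pass

    def predict(self, video: Path = Input(description="Input video to stylize")) -> Path:
        # sd_toon_shading.py reads its input from 'a.mp4' and writes 'output_video.mp4'.
        shutil.copy(str(video), "a.mp4")
        subprocess.run(["python", "sd_toon_shading.py"], check=True)
        return Path("output_video.mp4")
```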