├── .gitmodules ├── entrypoints ├── __init__.py └── curl.sh ├── xfuser ├── ray │ ├── __init__.py │ ├── pipeline │ │ ├── __init__.py │ │ └── base_executor.py │ └── worker │ │ ├── __init__.py │ │ ├── utils.py │ │ └── worker_wrappers.py ├── model_executor │ ├── __init__.py │ ├── patch │ │ ├── __init__.py │ │ └── unet_patch.py │ ├── models │ │ ├── customized │ │ │ ├── __init__.py │ │ │ └── step_video_t2v │ │ │ │ ├── __init__.py │ │ │ │ ├── linear.py │ │ │ │ ├── attentions.py │ │ │ │ └── rope.py │ │ ├── __init__.py │ │ └── transformers │ │ │ ├── __init__.py │ │ │ └── register.py │ ├── cache │ │ ├── __init__.py │ │ └── diffusers_adapters │ │ │ ├── registry.py │ │ │ ├── __init__.py │ │ │ └── flux.py │ ├── layers │ │ ├── __init__.py │ │ ├── base_layer.py │ │ ├── register.py │ │ └── feedforward.py │ ├── schedulers │ │ ├── __init__.py │ │ ├── base_scheduler.py │ │ ├── register.py │ │ ├── scheduling_ddpm.py │ │ ├── scheduling_dpm_cogvideox.py │ │ ├── scheduling_ddim_cogvideox.py │ │ └── scheduling_ddim.py │ ├── pipelines │ │ ├── __init__.py │ │ ├── pipeline_stable_diffusion_xl.py │ │ └── register.py │ └── base_wrapper.py ├── __version__.py ├── core │ ├── utils │ │ ├── __init__.py │ │ └── timer.py │ ├── cache_manager │ │ └── __init__.py │ ├── long_ctx_attention │ │ ├── ring │ │ │ └── __init__.py │ │ ├── __init__.py │ │ └── hybrid │ │ │ └── __init__.py │ ├── __init__.py │ ├── fast_attention │ │ └── __init__.py │ └── distributed │ │ └── __init__.py ├── config │ ├── __init__.py │ └── diffusers.py ├── __init__.py ├── parallel.py └── logger.py ├── .gitignore ├── pytest.ini ├── docs ├── methods │ ├── cfg_parallel_zh.md │ ├── cfg_parallel.md │ ├── ditfastattn_zh.md │ ├── ditfastattn.md │ ├── parallel_vae.md │ ├── usp.md │ ├── hybrid_zh.md │ ├── pipefusion.md │ └── hybrid.md ├── performance │ ├── latte_zh.md │ ├── sana_zh.md │ ├── sd3_zh.md │ ├── latte.md │ ├── pixart_alpha_legacy.md │ ├── stepvideo_zh.md │ ├── sana.md │ ├── sd3.md │ ├── hunyuanvideo.md │ ├── stepvideo.md │ ├── consisid_zh.md │ ├── hunyuandit_zh.md │ ├── cogvideo_zh.md │ └── consisid.md ├── developer │ ├── Http_Service.md │ └── adding_models │ │ ├── adding_model_cfg_usp.md │ │ ├── readme.md │ │ └── adding_model_cfg.py └── fid │ └── FID.md ├── .pre-commit-config.yaml ├── tests ├── context_parallel │ ├── debug_tests.py │ └── debug_flux_usp_example.py ├── parallel_test.py ├── core │ └── test_envs.py └── layers │ └── feedforward_test.py ├── benchmark ├── run.sh ├── fid │ ├── generate.sh │ ├── README.md │ ├── pixartalpha_generate.py │ ├── flux_generate.py │ └── compute_fid.py └── usp_latency_test.py ├── docker └── Dockerfile ├── examples ├── run_cogvideo.sh ├── run_hunyuan_video_usp.sh ├── ray │ ├── README.md │ ├── ray_pixartsigma_example.py │ ├── ray_pixartalpha_example.py │ ├── ray_hunyuandit_example.py │ ├── ray_run.sh │ ├── ray_flux_example.py │ └── ray_sd3_example.py ├── run_multinodes.sh ├── run_consisid.sh ├── run_consisid_usp.sh ├── run_service.sh ├── run_fastditattn.sh ├── run.sh ├── latte_example.py ├── sana_sprint_example.py ├── pixartsigma_example.py ├── pixartalpha_example.py ├── sdxl_example.py ├── sd3_example.py ├── cogvideox_example.py ├── hunyuandit_example.py ├── zimage_example.py └── sana_example.py ├── .github └── workflows │ └── python-publish.yml └── setup.py /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /entrypoints/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xfuser/ray/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xfuser/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xfuser/ray/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xfuser/ray/worker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xfuser/model_executor/patch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xfuser/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.4.5" 2 | -------------------------------------------------------------------------------- /xfuser/model_executor/models/customized/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xfuser/model_executor/models/customized/step_video_t2v/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xfuser/core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .timer import gpu_timer_decorator 2 | -------------------------------------------------------------------------------- /xfuser/core/cache_manager/__init__.py: -------------------------------------------------------------------------------- 1 | from .cache_manager import CacheManager 2 | 3 | __all__ = [ 4 | "CacheManager", 5 | ] 6 | -------------------------------------------------------------------------------- /xfuser/model_executor/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_model import xFuserModelBaseWrapper 2 | 3 | __all__ = [ 4 | "xFuserModelBaseWrapper" 5 | ] -------------------------------------------------------------------------------- /xfuser/model_executor/cache/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | adapted from https://github.com/ali-vilab/TeaCache.git 3 | adapted from https://github.com/chengzeyi/ParaAttention.git 4 | """ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | build 4 | __pycache__ 5 | *.log 6 | *.txt 7 | results/ 8 | profile/ 9 | .vscode/ 10 | xfuser.egg-info/ 11 | dist/* 12 | *.mp4 13 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | log_format = %(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s 3 | 
log_date_format = %Y-%m-%d %H:%M:%S 4 | log_cli = true 5 | log_level = INFO 6 | addopts = --capture=tee-sys --verbose --color=auto --durations=0 -------------------------------------------------------------------------------- /xfuser/core/long_ctx_attention/ring/__init__.py: -------------------------------------------------------------------------------- 1 | from .ring_flash_attn import ( 2 | xdit_ring_flash_attn_func, 3 | xdit_sana_ring_flash_attn_func, 4 | ) 5 | 6 | __all__ = [ 7 | "xdit_ring_flash_attn_func", 8 | "xdit_sana_ring_flash_attn_func", 9 | ] 10 | -------------------------------------------------------------------------------- /xfuser/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .cache_manager import CacheManager 2 | from .long_ctx_attention import xFuserLongContextAttention 3 | from .utils import gpu_timer_decorator 4 | 5 | __all__ = [ 6 | "CacheManager", 7 | "xFuserLongContextAttention", 8 | "gpu_timer_decorator", 9 | ] 10 | -------------------------------------------------------------------------------- /xfuser/core/long_ctx_attention/__init__.py: -------------------------------------------------------------------------------- 1 | from .hybrid import ( 2 | xFuserLongContextAttention, 3 | xFuserSanaLinearLongContextAttention, 4 | AttnType,) 5 | 6 | __all__ = [ 7 | "xFuserLongContextAttention", 8 | "xFuserSanaLinearLongContextAttention", 9 | "AttnType", 10 | ] 11 | -------------------------------------------------------------------------------- /xfuser/core/long_ctx_attention/hybrid/__init__.py: -------------------------------------------------------------------------------- 1 | from .attn_layer import ( 2 | xFuserLongContextAttention, 3 | xFuserSanaLinearLongContextAttention, 4 | AttnType, 5 | ) 6 | 7 | __all__ = [ 8 | "xFuserLongContextAttention", 9 | "xFuserSanaLinearLongContextAttention", 10 | "AttnType", 11 | ] 12 | -------------------------------------------------------------------------------- /entrypoints/curl.sh: -------------------------------------------------------------------------------- 1 | 2 | curl -X POST "http://localhost:6000/generate" \ 3 | -H "Content-Type: application/json" \ 4 | -d '{ 5 | "prompt": "a cute rabbit", 6 | "num_inference_steps": 50, 7 | "seed": 42, 8 | "cfg": 7.5, 9 | "save_disk_path": "/tmp" 10 | }' 11 | -------------------------------------------------------------------------------- /docs/methods/cfg_parallel_zh.md: -------------------------------------------------------------------------------- 1 | # Classifier-Free Guidance (CFG) Parallel 2 | 3 | Classifier-Free Guidance通过提供更广泛的条件控制、减少训练负担、增强生成内容的质量和细节,以及提高模型的实用性和适应性,成为了扩散模型领域的一个重要进展技术。 4 | 5 | 对于一个输入prompt,使用CFG需要同时进行unconditional guide和text guide的生成 ,相当于输入DiT blocks的input latents batch_size = 2。CFG Parallel分离两个latents分别进行计算,在每个Diffusion Step forward完成后、Scheduler执行前Allgather一次latent space结果。它通信量远小于Pipefusion和Sequence Parallel。因此,使用CFG一定要使用CFG Parallel。 -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | # Using this mirror lets us use mypyc-compiled black, which is about 2x faster 3 | - repo: https://github.com/psf/black-pre-commit-mirror 4 | rev: 24.2.0 5 | hooks: 6 | - id: black 7 | # It is recommended to specify the latest version of Python 8 | # supported by your project here, or alternatively use 9 | # pre-commit's default_language_version, see 10 | # 
https://pre-commit.com/#top_level-default_language_version 11 | language_version: python3.10 -------------------------------------------------------------------------------- /tests/context_parallel/debug_tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import shlex 5 | from pathlib import Path 6 | 7 | wd: str = Path(__file__).parent.absolute() 8 | #os.environ["PYTHONPATH"] = f"{WD}:{os.getenv('PYTHONPATH', '')}" 9 | test_script: str = wd / "test_diffusers_adapters.py" 10 | model_test: str = "FluxPipelineTest" 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" 12 | 13 | cmd: str = ( 14 | f"{sys.executable} -m pytest {test_script.as_posix()}::{model_test}" 15 | ) 16 | cmd = shlex.split(cmd) 17 | print(cmd) 18 | subprocess.run(cmd, check=True) -------------------------------------------------------------------------------- /docs/performance/latte_zh.md: -------------------------------------------------------------------------------- 1 | ## Latte性能 2 | 3 | Latte是文生视频模型,xDiT目前实现了USP方式对它进行并行推理加速。PipeFusion还在开发中。 4 | 5 | 在8xL20 (PCIe)的机器上,生成512x512x16视频的延迟表现如下图所示。 6 | 7 |
*[figure: latency-latte-l20-512]*
生成1024x1024x16视频的延迟表现如下图所示，使用混合序列并行（`ulysses_degree=2`, `ring_degree=4`）可以获得最佳性能。
*[figure: latency-latte-l20-1024]*
-------------------------------------------------------------------------------- /xfuser/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .args import FlexibleArgumentParser, xFuserArgs 2 | from .config import ( 3 | EngineConfig, 4 | ParallelConfig, 5 | TensorParallelConfig, 6 | PipeFusionParallelConfig, 7 | SequenceParallelConfig, 8 | DataParallelConfig, 9 | ModelConfig, 10 | InputConfig, 11 | RuntimeConfig 12 | ) 13 | 14 | __all__ = [ 15 | "FlexibleArgumentParser", 16 | "xFuserArgs", 17 | "EngineConfig", 18 | "ParallelConfig", 19 | "TensorParallelConfig", 20 | "PipeFusionParallelConfig", 21 | "SequenceParallelConfig", 22 | "DataParallelConfig", 23 | "ModelConfig", 24 | "InputConfig", 25 | "RuntimeConfig" 26 | ] -------------------------------------------------------------------------------- /xfuser/ray/pipeline/base_executor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The xDiT team. 2 | # Adapted from 3 | # https://github.com/vllm-project/vllm/blob/main/vllm/executor/executor_base.py 4 | # Copyright (c) 2023, vLLM team. All rights reserved. 5 | from abc import ABC, abstractmethod 6 | 7 | from xfuser.config.config import EngineConfig 8 | 9 | 10 | class BaseExecutor(ABC): 11 | def __init__( 12 | self, 13 | engine_config: EngineConfig, 14 | ): 15 | self.engine_config = engine_config 16 | self.parallel_config = engine_config.parallel_config 17 | self._init_executor() 18 | 19 | @abstractmethod 20 | def _init_executor(self): 21 | pass 22 | -------------------------------------------------------------------------------- /xfuser/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .register import xFuserLayerWrappersRegister 2 | from .base_layer import xFuserLayerBaseWrapper 3 | from .attention_processor import xFuserAttentionWrapper 4 | from .attention_processor import xFuserAttentionBaseWrapper 5 | from .conv import xFuserConv2dWrapper 6 | from .embeddings import xFuserPatchEmbedWrapper 7 | from .feedforward import xFuserFeedForwardWrapper 8 | 9 | __all__ = [ 10 | "xFuserLayerWrappersRegister", 11 | "xFuserLayerBaseWrapper", 12 | "xFuserAttentionBaseWrapper", 13 | "xFuserAttentionWrapper", 14 | "xFuserConv2dWrapper", 15 | "xFuserPatchEmbedWrapper", 16 | "xFuserFeedForwardWrapper", 17 | ] 18 | -------------------------------------------------------------------------------- /benchmark/run.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | # MODEL="/mnt/models/SD/PixArt-XL-2-1024-MS" 4 | # SCRIPT="./examples/pixartalpha_example.py" 5 | 6 | # MODEL="/mnt/models/SD/stable-diffusion-3-medium-diffusers" 7 | # SCRIPT="./examples/sd3_example.py" 8 | 9 | # MODEL="/mnt/models/SD/HunyuanDiT-v1.2-Diffusers" 10 | # SCRIPT="./examples/hunyuandit_example.py" 11 | 12 | MODEL="/cfs/dit/FLUX.1-dev/" 13 | SCRIPT="./examples/flux_example.py" 14 | 15 | export PYTHONPATH=$PWD:$PYTHONPATH 16 | 17 | python benchmark/single_node_latency_test.py \ 18 | --model_id $MODEL \ 19 | --script $SCRIPT \ 20 | --sizes 1024 \ 21 | --no_use_resolution_binning \ 22 | --num_inference_steps 28 \ 23 | --no_use_cfg_parallel \ 24 | --n_gpus 4 -------------------------------------------------------------------------------- /docs/performance/sana_zh.md: -------------------------------------------------------------------------------- 1 | ## SANA 性能 2 | 
我们是用开源版本的`Sana_1600M_4Kpx_BF16_diffusers`来进行性能评测 3 | 4 | 目前xDiT已经支持了对SANA的Pipefusion、Ulysses、Ring、CFG以及任意组合加速。由于SANA网络Head通道的限制,Ulysses最大并行度支持为2。我们在8xA100(NVLink)机器上,用20 steps生成4096x4096的图像来测试延迟。实测的延迟如下表所示。可以看到CFG的加速效果最优,其余三种加速策略性能接近,在8卡情况下,可以实现最高4.4x的生成加速。 5 | 6 | 7 | | GPU数 | cfg | ulysses | ring | pp | 延迟(秒) | 8 | |---|---|---|---|---|---| 9 | | 1 | 1 | 1 | 1 | 1 | 17.551 | 10 | | 2 | 1 | 1 | 1 | 2 | 11.276 | 11 | | 2 | 1 | 1 | 2 | 1 | 11.447 | 12 | | 2 | 1 | 2 | 1 | 1 | 10.175 | 13 | | 2 | 2 | 1 | 1 | 1 | 8.365 | 14 | | 4 | 2 | 1 | 1 | 2 | 5.599 | 15 | | 4 | 2 | 1 | 2 | 1 | 5.702 | 16 | | 4 | 2 | 2 | 1 | 1 | 5.803 | 17 | | 8 | 2 | 1 | 1 | 4 | 4.050 | 18 | | 8 | 2 | 1 | 2 | 2 | 4.091 | 19 | | 8 | 2 | 1 | 4 | 1 | 4.003 | 20 | | 8 | 2 | 2 | 1 | 2 | 4.201 | 21 | | 8 | 2 | 2 | 2 | 1 | 3.991 | -------------------------------------------------------------------------------- /xfuser/core/utils/timer.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | from torch.cuda import synchronize 5 | 6 | try: 7 | import torch_musa 8 | from torch_musa.core.device import synchronize 9 | except ModuleNotFoundError: 10 | pass 11 | 12 | import xfuser.envs as envs 13 | if envs._is_npu(): 14 | from torch.npu import synchronize 15 | 16 | def gpu_timer_decorator(func): 17 | def wrapper(*args, **kwargs): 18 | synchronize() 19 | start_time = time.time() 20 | result = func(*args, **kwargs) 21 | synchronize() 22 | end_time = time.time() 23 | 24 | if torch.distributed.get_rank() == 0: 25 | print( 26 | f"{func.__name__} took {end_time - start_time} seconds to run on GPU." 27 | ) 28 | return result 29 | 30 | return wrapper 31 | -------------------------------------------------------------------------------- /docs/performance/sd3_zh.md: -------------------------------------------------------------------------------- 1 | ## Stable Diffusion 3性能 2 | 3 | 我们是用开源版本的stable-diffusion-3-medium-diffusers 2B模型进行性能评测。 4 | 5 | 在8xA100(NVLink)机器上,在使用不同GPU数目时,最佳的并行方案都是不同的。这说明了多种并行和混合并行的重要性。 6 | 最佳的并行策略在不同GPU规模时分别是:在2个GPU上,使用`cfg_parallel=2`;在4个GPU上,使用`cfg_parallel=2, pipefusion_parallel=2`;在8个GPU上,使用`cfg_parallel=2, pipefusion_parallel=4`。 7 | 8 | torch.compile在除了8 GPU的场景下都来来了加速效果。 9 | 10 | 11 |
*[figure: latency-sd3_a100]*
在8xL40 (PCIe)上的延迟情况如下图所示。同样，不同GPU规模，最佳并行策略都是不同的。torch.compile都带来了加速效果。
*[figure: latency-hunyuandit_l40]*
23 | -------------------------------------------------------------------------------- /docs/methods/cfg_parallel.md: -------------------------------------------------------------------------------- 1 | ## Classifier-Free Guidance (CFG) Parallel 2 | [Chinese Version](./cfg_parallel_zh.md) 3 | 4 | The Classifier-Free Guidance (CFG) has become an important trick diffusion models by providing broader conditional control, reducing training burden, enhancing the quality and details of generated content, and improving the practicality and adaptability of the model. 5 | 6 | For an input prompt, using CFG requires generating both unconditional guide and text guide simultaneously, which is equivalent to inputting input latents batch_size = 2 of DiT blocks. CFG Parallel separates the two latents for computation, and after each Diffusion Step forward is completed and before the Scheduler executes, it performs an Allgather operation on the latent space results. Its communication overhead is much smaller than Pipefusion and Sequence Parallel. Therefore, when using CFG, CFG Parallel must be used. -------------------------------------------------------------------------------- /xfuser/config/diffusers.py: -------------------------------------------------------------------------------- 1 | import diffusers 2 | from packaging.version import Version 3 | 4 | DEFAULT_MINIMUM_DIFFUSERS_VERSION = "0.33.0" 5 | MINIMUM_DIFFUSERS_VERSIONS = { 6 | "hunyuanvideo_15": "0.36.0", 7 | "zimage": "0.36.0", 8 | "flux2": "0.36.0", 9 | "flux": "0.35.2", 10 | "flux_kontext": "0.35.2", 11 | "hunyuanvideo": "0.35.2", 12 | "wan": "0.35.2", 13 | } 14 | 15 | def has_valid_diffusers_version(model_name: str|None = None) -> bool: 16 | diffusers_version = diffusers.__version__ 17 | minimum_diffusers_version = MINIMUM_DIFFUSERS_VERSIONS.get(model_name, DEFAULT_MINIMUM_DIFFUSERS_VERSION) 18 | return Version(diffusers_version).release >= Version(minimum_diffusers_version).release 19 | 20 | 21 | def get_minimum_diffusers_version(model_name: str|None = None) -> str: 22 | return MINIMUM_DIFFUSERS_VERSIONS.get(model_name, DEFAULT_MINIMUM_DIFFUSERS_VERSION) -------------------------------------------------------------------------------- /xfuser/model_executor/cache/diffusers_adapters/registry.py: -------------------------------------------------------------------------------- 1 | """ 2 | adapted from https://github.com/ali-vilab/TeaCache.git 3 | adapted from https://github.com/chengzeyi/ParaAttention.git 4 | """ 5 | from xfuser.config.diffusers import has_valid_diffusers_version 6 | from typing import Type, Dict 7 | 8 | TRANSFORMER_ADAPTER_REGISTRY: Dict[Type, str] = {} 9 | 10 | def register_transformer_adapter(transformer_class: Type, adapter_name: str): 11 | TRANSFORMER_ADAPTER_REGISTRY[transformer_class] = adapter_name 12 | 13 | if has_valid_diffusers_version("flux"): 14 | from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel 15 | from xfuser.model_executor.models.transformers.transformer_flux import xFuserFluxTransformer2DWrapper 16 | register_transformer_adapter(FluxTransformer2DModel, "flux") 17 | register_transformer_adapter(xFuserFluxTransformer2DWrapper, "flux") 18 | 19 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the NVIDIA PyTorch base image 2 | FROM nvcr.io/nvidia/pytorch:24.07-py3 3 | 4 | # Install git 5 | RUN apt-get update && apt-get install -y git && rm 
-rf /var/lib/apt/lists/* 6 | 7 | 8 | 9 | # Update pip to the latest version 10 | RUN pip install --no-cache-dir --upgrade pip 11 | 12 | # Uninstall apex first 13 | RUN pip uninstall -y apex 14 | 15 | # # Install flash_attn separately with --use-pep517 flag 16 | # RUN pip install --no-cache-dir --use-pep517 flash-attn==2.6.3 flask 17 | 18 | RUN pip install xfuser 19 | 20 | RUN pip install flask 21 | 22 | # Copy the entire comfyui-xdit directory into the container 23 | COPY ./http-service /app/http-service 24 | 25 | # Change to the xDiT directory 26 | WORKDIR /app 27 | 28 | # Set ENTRYPOINT with CMD as default arguments 29 | # ENTRYPOINT ["python", "/app/comfyui-xdit/launch_host.py"] 30 | # CMD ["--config", "./comfyui-xdit/config.json"] 31 | -------------------------------------------------------------------------------- /docs/performance/latte.md: -------------------------------------------------------------------------------- 1 | ## Latte Performance 2 | [Chinese Version](./latte_zh.md) 3 | 4 | Latte is a text-to-video model, and xDiT currently implements parallel inference acceleration for it using the USP method. PipeFusion is under development. 5 | 6 | On an 8xL20 (PCIe) machine, the latency performance for generating 512x512x16 videos is shown in the graph below. 7 | 8 |
*[figure: latency-latte-l20-512]*
The latency performance for generating 1024x1024x16 videos is depicted in the following graph. Using mixed sequence parallelization (`ulysses_degree=2`, `ring_degree=4`) yields the best performance.
*[figure: latency-latte-l20-1024]*
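A minimal launch sketch for that hybrid configuration, following the argument conventions of the other example scripts in this repository (`$MODEL_ID` and the prompt are placeholders):

```bash
# Hypothetical 8-GPU run: ulysses_degree (2) x ring_degree (4) must equal the GPU count.
torchrun --nproc_per_node=8 ./examples/latte_example.py \
    --model $MODEL_ID \
    --ulysses_degree 2 \
    --ring_degree 4 \
    --num_inference_steps 50 \
    --prompt "a small cat playing in the snow"
```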
-------------------------------------------------------------------------------- /xfuser/model_executor/cache/diffusers_adapters/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | adapted from https://github.com/ali-vilab/TeaCache.git 3 | adapted from https://github.com/chengzeyi/ParaAttention.git 4 | """ 5 | import importlib 6 | from typing import Type, Dict, TypeVar 7 | from xfuser.model_executor.cache.diffusers_adapters.registry import TRANSFORMER_ADAPTER_REGISTRY 8 | from xfuser.logger import init_logger 9 | 10 | logger = init_logger(__name__) 11 | 12 | 13 | def apply_cache_on_transformer(transformer, *args, **kwargs): 14 | adapter_name = TRANSFORMER_ADAPTER_REGISTRY.get(type(transformer)) 15 | if not adapter_name: 16 | logger.error(f"Unknown transformer class: {transformer.__class__.__name__}") 17 | return transformer 18 | 19 | adapter_module = importlib.import_module(f".{adapter_name}", __package__) 20 | apply_cache_on_transformer_fn = getattr(adapter_module, "apply_cache_on_transformer") 21 | return apply_cache_on_transformer_fn(transformer, *args, **kwargs) 22 | -------------------------------------------------------------------------------- /xfuser/__init__.py: -------------------------------------------------------------------------------- 1 | from xfuser.model_executor.pipelines import ( 2 | xFuserPixArtAlphaPipeline, 3 | xFuserPixArtSigmaPipeline, 4 | xFuserStableDiffusion3Pipeline, 5 | xFuserFluxPipeline, 6 | xFuserLattePipeline, 7 | xFuserHunyuanDiTPipeline, 8 | xFuserCogVideoXPipeline, 9 | xFuserConsisIDPipeline, 10 | xFuserStableDiffusionXLPipeline, 11 | xFuserSanaPipeline, 12 | xFuserSanaSprintPipeline, 13 | ) 14 | from xfuser.config import xFuserArgs, EngineConfig 15 | from xfuser.parallel import xDiTParallel 16 | 17 | __all__ = [ 18 | "xFuserPixArtAlphaPipeline", 19 | "xFuserPixArtSigmaPipeline", 20 | "xFuserStableDiffusion3Pipeline", 21 | "xFuserFluxPipeline", 22 | "xFuserLattePipeline", 23 | "xFuserHunyuanDiTPipeline", 24 | "xFuserCogVideoXPipeline", 25 | "xFuserConsisIDPipeline", 26 | "xFuserStableDiffusionXLPipeline", 27 | "xFuserSanaPipeline", 28 | "xFuserSanaSprintPipeline", 29 | "xFuserArgs", 30 | "EngineConfig", 31 | "xDiTParallel", 32 | ] 33 | -------------------------------------------------------------------------------- /xfuser/ray/worker/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The xDiT team. 2 | # Adapted from 3 | # https://github.com/vllm-project/vllm/blob/main/vllm/utils.py 4 | # Copyright (c) 2023, vLLM team. All rights reserved. 5 | import os 6 | from typing import Dict, Any 7 | import importlib.util 8 | from xfuser.logger import init_logger 9 | 10 | logger = init_logger(__name__) 11 | 12 | 13 | def resolve_obj_by_qualname(qualname: str) -> Any: 14 | """ 15 | Resolve an object by its fully qualified name. 
16 | """ 17 | module_name, obj_name = qualname.rsplit(".", 1) 18 | module = importlib.import_module(module_name) 19 | return getattr(module, obj_name) 20 | 21 | 22 | def update_environment_variables(envs: Dict[str, str]): 23 | for k, v in envs.items(): 24 | if k in os.environ and os.environ[k] != v: 25 | logger.warning( 26 | "Overwriting environment variable %s " "from '%s' to '%s'", 27 | k, 28 | os.environ[k], 29 | v, 30 | ) 31 | os.environ[k] = v -------------------------------------------------------------------------------- /xfuser/core/fast_attention/__init__.py: -------------------------------------------------------------------------------- 1 | from .fast_attn_state import ( 2 | get_fast_attn_state, 3 | get_fast_attn_enable, 4 | get_fast_attn_step, 5 | get_fast_attn_calib, 6 | get_fast_attn_threshold, 7 | get_fast_attn_window_size, 8 | get_fast_attn_coco_path, 9 | get_fast_attn_use_cache, 10 | get_fast_attn_config_file, 11 | get_fast_attn_layer_name, 12 | initialize_fast_attn_state, 13 | ) 14 | 15 | from .attn_layer import ( 16 | FastAttnMethod, 17 | xFuserFastAttention, 18 | ) 19 | 20 | from .utils import fast_attention_compression 21 | 22 | __all__ = [ 23 | "get_fast_attn_state", 24 | "get_fast_attn_enable", 25 | "get_fast_attn_step", 26 | "get_fast_attn_calib", 27 | "get_fast_attn_threshold", 28 | "get_fast_attn_window_size", 29 | "get_fast_attn_coco_path", 30 | "get_fast_attn_use_cache", 31 | "get_fast_attn_config_file", 32 | "get_fast_attn_layer_name", 33 | "initialize_fast_attn_state", 34 | "xFuserFastAttention", 35 | "FastAttnMethod", 36 | "fast_attention_compression", 37 | ] 38 | -------------------------------------------------------------------------------- /xfuser/model_executor/schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | from .register import xFuserSchedulerWrappersRegister 2 | from .base_scheduler import xFuserSchedulerBaseWrapper 3 | from .scheduling_dpmsolver_multistep import ( 4 | xFuserDPMSolverMultistepSchedulerWrapper 5 | ) 6 | from .scheduling_flow_match_euler_discrete import ( 7 | xFuserFlowMatchEulerDiscreteSchedulerWrapper, 8 | ) 9 | from .scheduling_ddim import xFuserDDIMSchedulerWrapper 10 | from .scheduling_ddpm import xFuserDDPMSchedulerWrapper 11 | from .scheduling_ddim_cogvideox import xFuserCogVideoXDDIMSchedulerWrapper 12 | from .scheduling_dpm_cogvideox import xFuserCogVideoXDPMSchedulerWrapper 13 | from .scheduling_scm import xFuserSCMSchedulerWrapper 14 | 15 | __all__ = [ 16 | "xFuserSchedulerWrappersRegister", 17 | "xFuserSchedulerBaseWrapper", 18 | "xFuserDPMSolverMultistepSchedulerWrapper", 19 | "xFuserFlowMatchEulerDiscreteSchedulerWrapper", 20 | "xFuserDDIMSchedulerWrapper", 21 | "xFuserCogVideoXDDIMSchedulerWrapper", 22 | "xFuserCogVideoXDPMSchedulerWrapper", 23 | "xFuserDDPMSchedulerWrapper", 24 | "xFuserSCMSchedulerWrapper", 25 | ] 26 | -------------------------------------------------------------------------------- /examples/run_cogvideo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | export PYTHONPATH=$PWD:$PYTHONPATH 5 | 6 | # CogVideoX configuration 7 | SCRIPT="cogvideox_example.py" 8 | MODEL_ID="/cfs/dit/CogVideoX1.5-5B" 9 | INFERENCE_STEP=50 10 | 11 | mkdir -p ./results 12 | 13 | # CogVideoX specific task args 14 | TASK_ARGS="--height 768 --width 1360 --num_frames 17 --guidance_scale 6.0" 15 | 16 | # CogVideoX parallel configuration 17 | N_GPUS=8 18 | PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 2" 
19 | CFG_ARGS="--use_cfg_parallel" 20 | 21 | # Uncomment and modify these as needed 22 | # PIPEFUSION_ARGS="--num_pipeline_patch 8" 23 | # OUTPUT_ARGS="--output_type latent" 24 | # PARALLLEL_VAE="--use_parallel_vae" 25 | ENABLE_TILING="--enable_tiling" 26 | # COMPILE_FLAG="--use_torch_compile" 27 | 28 | torchrun --nproc_per_node=$N_GPUS ./examples/$SCRIPT \ 29 | --model $MODEL_ID \ 30 | $PARALLEL_ARGS \ 31 | $TASK_ARGS \ 32 | $PIPEFUSION_ARGS \ 33 | $OUTPUT_ARGS \ 34 | --num_inference_steps $INFERENCE_STEP \ 35 | --warmup_steps 0 \ 36 | --prompt "A little girl is riding a bicycle at high speed. Focused, detailed, realistic." \ 37 | $CFG_ARGS \ 38 | $PARALLLEL_VAE \ 39 | $ENABLE_TILING \ 40 | $COMPILE_FLAG 41 | -------------------------------------------------------------------------------- /docs/performance/pixart_alpha_legacy.md: -------------------------------------------------------------------------------- 1 | # Pixart-Alpha Legacy Version Performance 2 | 3 | Here are the benchmark results for Pixart-Alpha using the 20-step DPM solver as the scheduler across various image resolutions. 4 | To replicate these findings, please refer to the script at [legacy/scripts/benchmark.sh](../../legacy/scripts/benchmark.sh). 5 | 6 | 1. The Latency on 4xA100-80GB (PCIe) 7 | 8 |
*[figure: A100 PCIe latency]*
2. The Latency on 8xL20-48GB (PCIe)
*[figure: L20 latency]*
3. The Latency on 8xA100-80GB (NVLink)
*[figure: latency-A100-NVLink]*
4. The Latency on 4xT4-16GB (PCIe)
*[figure: latency-T4]*
30 | -------------------------------------------------------------------------------- /xfuser/model_executor/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_pipeline import xFuserPipelineBaseWrapper 2 | from .pipeline_pixart_alpha import xFuserPixArtAlphaPipeline 3 | from .pipeline_pixart_sigma import xFuserPixArtSigmaPipeline 4 | from .pipeline_stable_diffusion_3 import xFuserStableDiffusion3Pipeline 5 | from .pipeline_flux import xFuserFluxPipeline 6 | from .pipeline_latte import xFuserLattePipeline 7 | from .pipeline_cogvideox import xFuserCogVideoXPipeline 8 | from .pipeline_consisid import xFuserConsisIDPipeline 9 | from .pipeline_hunyuandit import xFuserHunyuanDiTPipeline 10 | from .pipeline_stable_diffusion_xl import xFuserStableDiffusionXLPipeline 11 | from .pipeline_sana import xFuserSanaPipeline 12 | from .pipeline_sana_sprint import xFuserSanaSprintPipeline 13 | 14 | __all__ = [ 15 | "xFuserPipelineBaseWrapper", 16 | "xFuserPixArtAlphaPipeline", 17 | "xFuserPixArtSigmaPipeline", 18 | "xFuserStableDiffusion3Pipeline", 19 | "xFuserFluxPipeline", 20 | "xFuserLattePipeline", 21 | "xFuserHunyuanDiTPipeline", 22 | "xFuserCogVideoXPipeline", 23 | "xFuserConsisIDPipeline", 24 | "xFuserStableDiffusionXLPipeline", 25 | "xFuserSanaPipeline", 26 | "xFuserSanaSprintPipeline", 27 | ] -------------------------------------------------------------------------------- /docs/developer/Http_Service.md: -------------------------------------------------------------------------------- 1 | ## Launch a Text-to-Image Http Service 2 | 3 | Launch an HTTP-based text-to-image service that generates images from textual descriptions (prompts) using the DiT model. 4 | The generated images can either be returned directly to users or saved to a specified disk location. 5 | For example, the following command launches a HTTP service with 4 GPUs, 2 Ulysses parallel degree, 2 PipeFusion parallel degree, and the model path is `./models/FLUX.1-schnell`. 6 | 7 | ```bash 8 | python ./entrypoints/launch.py --world_size 4 --ulysses_parallel_degree 2 --pipefusion_parallel_degree 2 --model_path /your_model_path/FLUX.1-schnell 9 | ``` 10 | 11 | 12 | To an example HTTP request is shown below. The `save_disk_path` parameter is optional - if not set, the image will be returned directly; if set, the generated image will be saved to the specified directory on disk. 13 | 14 | ```bash 15 | curl -X POST "http://localhost:6000/generate" \ 16 | -H "Content-Type: application/json" \ 17 | -d '{ 18 | "prompt": "a cute rabbit", 19 | "num_inference_steps": 50, 20 | "seed": 42, 21 | "cfg": 7.5, 22 | "save_disk_path": "/tmp" 23 | }' 24 | ``` 25 | -------------------------------------------------------------------------------- /docs/methods/ditfastattn_zh.md: -------------------------------------------------------------------------------- 1 | ### DiTFastAttn: Attention Compression for Diffusion Transformer Models 2 | 3 | [DiTFastAttn](https://github.com/thu-nics/DiTFastAttn)是一种针对单卡DiTs推理的加速方案,利用Input Temperal Reduction通过如下三种方式来减少计算量: 4 | 5 | 1. Window Attention with Residual Caching to reduce spatial redundancy. 6 | 2. Temporal Similarity Reduction to exploit the similarity between steps. 7 | 3. 
Conditional Redundancy Elimination to skip redundant computations during conditional generation 8 | 9 | 目前使用DiTFastAttn只能数据并行,或者单GPU运行。不支持其他方式并行,比如USP和PipeFusion等。我们未来计划实现并行版本的DiTFastAttn。 10 | 11 | ## 下载COCO数据集 12 | ``` 13 | wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip 14 | unzip annotations_trainval2014.zip 15 | ``` 16 | 17 | ## 运行 18 | 19 | 在脚本中修改数据集路径,然后运行 20 | 21 | ``` 22 | bash examples/run_fastditattn.sh 23 | ``` 24 | 25 | ## 引用 26 | 27 | ``` 28 | @misc{yuan2024ditfastattn, 29 | title={DiTFastAttn: Attention Compression for Diffusion Transformer Models}, 30 | author={Zhihang Yuan and Pu Lu and Hanling Zhang and Xuefei Ning and Linfeng Zhang and Tianchen Zhao and Shengen Yan and Guohao Dai and Yu Wang}, 31 | year={2024}, 32 | eprint={2406.08552}, 33 | archivePrefix={arXiv}, 34 | } 35 | ``` -------------------------------------------------------------------------------- /docs/performance/stepvideo_zh.md: -------------------------------------------------------------------------------- 1 | ## Step-Video-T2V 30B 性能 2 | 3 | ### 评测说明 4 | 5 | 我们是用开源版本的Step-Video-T2V 30B模型对SP(序列)并行与TP(张量)并行进行性能评测。我们使用ulysses_degree作为sp_degree。 6 | 7 | 测试启动脚本参考:https://github.com/stepfun-ai/Step-Video-T2V/tree/main#multi-gpu-parallel-deployment 8 | 9 | ### Nvidia H20*8 (NVLINK) 10 | 11 | #### 并行策略对比 12 | | 总卡数 | 并行类型 | 配置参数 | 时延 | 加速比 | 显存占用 | 13 | |--------|----------|--------|---------|---------|--------------------| 14 | | 1 | Baseline | `TP1 SP1` | 213.60s | 1.00x | 92,170M | 15 | | 2 | TP | `TP2` | 108.97s | 0.98x | 57,458M ▼37.7% | 16 | | 2 | SP | `SP2` | 108.13s | 0.99x | 86,258M ▼6.4% | 17 | | 4 | TP | `TP4` | 57.61s | 0.93x | 36,566M ▼60.3% | 18 | | 4 | SP | `SP4` | 57.01s | 0.94x | 78,226M ▼15.1% | 19 | | 8 | TP | `TP8` | 30.40s | 0.88x | 30,028M ▼67.4% | 20 | | 8 | SP | `SP8` | 30.10s | 0.89x | 79,684M ▼13.5% | 21 | 22 | 23 | #### 关键发现 24 | - **硬件适配性**: 25 | - 消费级卡组(5090/5090D):完整支持32GB*8配置的训练任务 26 | - 专业推理卡(L20/L40):48GB*4配置实现全参数推理 27 | 28 | - **效率表现**: 29 | - TP8策略显存节省达67.4%(对比SP8高53.9%) 30 | - 混合并行时延降低趋势与理论值偏差<12% 31 | 32 | - **扩展特性**: 33 | - 多维参数切片实现近乎线性的扩展效率 34 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /examples/run_hunyuan_video_usp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | export PYTHONPATH=$PWD:$PYTHONPATH 5 | 6 | # CogVideoX configuration 7 | SCRIPT="hunyuan_video_usp_example.py" 8 | MODEL_ID="/cfs/dit/HunyuanVideo" 9 | # MODEL_ID="tencent/HunyuanVideo" 10 | INFERENCE_STEP=50 11 | 12 | mkdir -p ./results 13 | 14 | # CogVideoX specific task args 15 | TASK_ARGS="--height 720 --width 1280 --num_frames 129 --guidance_scale 5.0" 16 | 17 | # CogVideoX parallel configuration 18 | N_GPUS=8 19 | PARALLEL_ARGS="--ulysses_degree 4 --ring_degree 2" 20 | # CFG_ARGS="--use_cfg_parallel" 21 | 22 | # Uncomment and modify these as needed 23 | # PIPEFUSION_ARGS="--num_pipeline_patch 8" 24 | # OUTPUT_ARGS="--output_type latent" 25 | # PARALLLEL_VAE="--use_parallel_vae" 26 | ENABLE_TILING="--enable_tiling" 27 | ENABLE_MODEL_CPU_OFFLOAD="--enable_model_cpu_offload" 28 | # COMPILE_FLAG="--use_torch_compile" 29 | 30 | torchrun --nproc_per_node=$N_GPUS ./examples/$SCRIPT \ 31 | --model $MODEL_ID \ 32 | $PARALLEL_ARGS \ 33 | $TASK_ARGS \ 34 | $PIPEFUSION_ARGS \ 35 | $OUTPUT_ARGS \ 36 | --num_inference_steps $INFERENCE_STEP \ 37 | --warmup_steps 0 \ 38 | --prompt "A cat walks on the grass, realistic" \ 39 | $CFG_ARGS \ 40 | $PARALLLEL_VAE \ 41 | $ENABLE_TILING \ 42 | $ENABLE_MODEL_CPU_OFFLOAD \ 43 | $COMPILE_FLAG 44 | -------------------------------------------------------------------------------- /docs/performance/sana.md: -------------------------------------------------------------------------------- 1 | ## Performance of SANA 2 | [Chinese Version](./sana_zh.md) 3 | 4 | We use the open-source version of `Sana_1600M_4Kpx_BF16_diffusers` for performance evaluation. 5 | 6 | Currently, xDiT supports acceleration for SANA with Pipefusion, Ulysses, Ring, CFG, and any combination thereof. Due to the limitation of the Head channel in the SANA network, the maximum parallelism supported by Ulysses is 2. We tested latency on an 8xA100 (NVLink) machine by generating 4096x4096 images with 20 steps. The measured latencies are shown in the table below. It can be seen that CFG achieves the best acceleration effect, while the other three acceleration strategies have similar performance. In the case of 8 GPUs, up to 4.4x generation acceleration can be achieved. 
7 | 8 | | #GPUs | cfg | ulysses | ring | pp | Latency (seconds) | 9 | |---|---|---|---|---|---| 10 | | 1 | 1 | 1 | 1 | 1 | 17.551 | 11 | | 2 | 1 | 1 | 1 | 2 | 11.276 | 12 | | 2 | 1 | 1 | 2 | 1 | 11.447 | 13 | | 2 | 1 | 2 | 1 | 1 | 10.175 | 14 | | 2 | 2 | 1 | 1 | 1 | 8.365 | 15 | | 4 | 2 | 1 | 1 | 2 | 5.599 | 16 | | 4 | 2 | 1 | 2 | 1 | 5.702 | 17 | | 4 | 2 | 2 | 1 | 1 | 5.803 | 18 | | 8 | 2 | 1 | 1 | 4 | 4.050 | 19 | | 8 | 2 | 1 | 2 | 2 | 4.091 | 20 | | 8 | 2 | 1 | 4 | 1 | 4.003 | 21 | | 8 | 2 | 2 | 1 | 2 | 4.201 | 22 | | 8 | 2 | 2 | 2 | 1 | 3.991 | -------------------------------------------------------------------------------- /docs/performance/sd3.md: -------------------------------------------------------------------------------- 1 | ## Performance of Stable Diffusion 3 2 | [Chinese Version](./sd3_zh.md) 3 | 4 | We conducted performance evaluations using the open-source version of the stable-diffusion-3-medium-diffusers 2B model. 5 | 6 | On an 8xA100 (NVLink) machine, the optimal parallelization strategy varied depending on the number of GPUs used, highlighting the importance of diverse and hybrid parallel approaches. The best parallel strategies for different GPU scales were as follows: with 2 GPUs, `cfg_parallel=2` was used; with 4 GPUs, `cfg_parallel=2, pipefusion_parallel=2` was employed; and with 8 GPUs, `cfg_parallel=2, pipefusion_parallel=4` was utilized. 7 | 8 | torch.compile provided acceleration in all scenarios except for the 8 GPU configuration. 9 | 10 |
*[figure: latency-sd3_a100]*
The latency situation on 8xL40 (PCIe) is depicted in the graph below. Similarly, the optimal parallel strategies varied with different GPU scales. torch.compile delivered acceleration in all cases.
*[figure: latency-hunyuandit_l40]*
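As a rough illustration, the best 8-GPU configuration above (`cfg_parallel=2, pipefusion_parallel=4`) could be launched as follows, following the argument conventions of the other example scripts in this repository (`$MODEL_ID` and the prompt are placeholders):

```bash
# Hypothetical 8-GPU run: cfg parallel (2) x pipefusion parallel (4) covers all 8 GPUs.
torchrun --nproc_per_node=8 ./examples/sd3_example.py \
    --model $MODEL_ID \
    --pipefusion_parallel_degree 4 \
    --use_cfg_parallel \
    --height 1024 --width 1024 \
    --num_inference_steps 28 \
    --prompt "a photo of a cat"
```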
-------------------------------------------------------------------------------- /examples/ray/README.md: -------------------------------------------------------------------------------- 1 | ## Running DiT Backbone and VAE Module Separately 2 | 3 | The DiT model typically consists of DiT backbone (encoder + transformers) and VAE module. 4 | The DiT backbone module has high computational requirements but stable memory usage. 5 | For high-resolution images, the VAE module has high memory consumption due to temporary memory spikes from convolution operators, despite its low computational requirements. This often leads to OOM (Out of Memory) issues caused by the VAE module. 6 | 7 | Therefore, separating the encoder + DiT backbone from the VAE module can effectively alleviate OOM issues. 8 | We use Ray to implement the separation of backbone and VAE functionality, and allocate different GPU parallelism for VAE and DiT backbone. 9 | 10 | In `ray_run.sh`, we define different model configurations. 11 | For example, if we use 3 GPUs and want to allocate 1 GPU for VAE and 2 GPUs for DiT backbone, the settings in `ray_run.sh` would be: 12 | 13 | ``` 14 | N_GPUS=3 # world size 15 | PARALLEL_ARGS="--pipefusion_parallel_degree 2 --ulysses_degree 1 --ring_degree 1" 16 | VAE_PARALLEL_SIZE=1 17 | DIT_PARALLEL_SIZE=2 18 | ``` 19 | 20 | Here, `VAE_PARALLEL_SIZE` specifies the parallelism for VAE, DIT_PARALLEL_SIZE defines DiT parallelism, and PARALLEL_ARGS contains the parallel configuration for DiT backbone, which in this case uses PipeFusion to run on 2 GPUs. 21 | 22 | 23 | -------------------------------------------------------------------------------- /docs/developer/adding_models/adding_model_cfg_usp.md: -------------------------------------------------------------------------------- 1 | # Parallelize new models with CFG parallelism and USP provided by xDiT 2 | 3 | The following two tutorials provide detailed instructions on how to implement CFG parallelism and USP (Unified Sequence Parallelism) supported by xDiT for a new DiT model: 4 | 5 | [Parallelize new models with CFG parallelism provided by xDiT](adding_model_cfg.md) 6 | 7 | [Parallelize new models with USP provided by xDiT](adding_model_usp.md) 8 | 9 | [Parallelize new models with USP provided by xDiT (text replica)](adding_model_usp_text_replica.md) 10 | 11 | Both parallelization techniques can be concurrently employed. To achieve this, specify the level of parallelization for both CFG parallelism and USP as demonstrated below. The number of GPUs should be twice the product of the degrees of ulysses attention and ring attention: 12 | 13 | ```python 14 | from xfuser.core.distributed import initialize_model_parallel 15 | initialize_model_parallel( 16 | ring_degree=, 17 | ulysses_degree=, 18 | classifier_free_guidance_degree=2, 19 | ) 20 | # restriction: dist.get_world_size() == 2 x x 21 | ``` 22 | 23 | Following this, both CFG parallelism and USP can be simultaneously implemented. For a comprehensive example script showcasing this approach, refer to [adding_model_cfg_usp.py](adding_model_cfg_usp.py). 
24 | -------------------------------------------------------------------------------- /tests/parallel_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from diffusers import StableDiffusion3Pipeline, FluxPipeline 3 | 4 | from xfuser import xFuserArgs 5 | from xfuser.parallel import xDiTParallel 6 | from xfuser.config import FlexibleArgumentParser 7 | from xfuser.core.distributed import get_world_group 8 | 9 | 10 | def main(): 11 | parser = FlexibleArgumentParser(description="xFuser Arguments") 12 | args = xFuserArgs.add_cli_args(parser).parse_args() 13 | engine_args = xFuserArgs.from_cli_args(args) 14 | engine_config, input_config = engine_args.create_config() 15 | 16 | local_rank = get_world_group().local_rank 17 | pipe = StableDiffusion3Pipeline.from_pretrained( 18 | pretrained_model_name_or_path=engine_config.model_config.model, 19 | torch_dtype=torch.float16, 20 | ).to(f"cuda:{local_rank}") 21 | 22 | paralleler = xDiTParallel(pipe, engine_config, input_config) 23 | 24 | paralleler( 25 | height=input_config.height, 26 | width=input_config.height, 27 | prompt=input_config.prompt, 28 | num_inference_steps=input_config.num_inference_steps, 29 | output_type=input_config.output_type, 30 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 31 | ) 32 | if input_config.output_type == "pil": 33 | paralleler.save("results", "stable_diffusion_3") 34 | 35 | 36 | if __name__ == "__main__": 37 | main() 38 | -------------------------------------------------------------------------------- /benchmark/fid/generate.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | export PYTHONPATH=$PWD:$PYTHONPATH 4 | export CAPTION_FILE="dataset_coco.json" 5 | export SAMPLE_IMAGES_FOLODER="sample_images" 6 | 7 | # Select the model type 8 | export MODEL_TYPE="Pixart-alpha" 9 | # Configuration for different model types 10 | # script, model_id, inference_step 11 | declare -A MODEL_CONFIGS=( 12 | ["Pixart-alpha"]="pixartalpha_generate.py /cfs/dit/PixArt-XL-2-256-MS 20" 13 | ["Flux"]="flux_generate.py /cfs/dit/FLUX.1-dev 28" 14 | ) 15 | 16 | if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then 17 | IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}" 18 | export SCRIPT MODEL_ID INFERENCE_STEP 19 | else 20 | echo "Invalid MODEL_TYPE: $MODEL_TYPE" 21 | exit 1 22 | fi 23 | 24 | # task args 25 | TASK_ARGS="--height 256 --width 256 --no_use_resolution_binning" 26 | 27 | N_GPUS=8 28 | PARALLEL_ARGS="--pipefusion_parallel_degree 8 --ulysses_degree 1 --ring_degree 1" 29 | 30 | torchrun --nproc_per_node=$N_GPUS ./benchmark/fid/$SCRIPT \ 31 | --model $MODEL_ID \ 32 | $PARALLEL_ARGS \ 33 | $TASK_ARGS \ 34 | $PIPEFUSION_ARGS \ 35 | $OUTPUT_ARGS \ 36 | --num_inference_steps $INFERENCE_STEP \ 37 | --warmup_steps 1 \ 38 | --prompt "brown dog laying on the ground with a metal bowl in front of him." 
\ 39 | $CFG_ARGS \ 40 | $PARALLLEL_VAE \ 41 | $COMPILE_FLAG \ 42 | --caption_file $CAPTION_FILE \ 43 | --sample_images_folder $SAMPLE_IMAGES_FOLODER \ 44 | -------------------------------------------------------------------------------- /tests/context_parallel/debug_flux_usp_example.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import shlex 5 | from pathlib import Path 6 | 7 | os.environ["HF_HUB_CACHE"] = "/mnt/co-research/shared-models/hub" 8 | 9 | root_dir = Path(__file__).parents[2].absolute() 10 | #os.environ["PYTHONPATH"] = f"{WD}:{os.getenv('PYTHONPATH', '')}" 11 | examples_dir = root_dir / "examples" 12 | flux_script = examples_dir / "flux_usp_example.py" 13 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" 14 | n_gpus = 2 15 | 16 | model_id = "black-forest-labs/FLUX.1-dev" 17 | inference_steps = 28 18 | warmup_steps = 3 19 | max_sequence_length = 512 20 | height = 1024 21 | width = 1024 22 | task_args = f"--max-sequence-length {max_sequence_length} --height {height} --width {width}" 23 | pipefusion_parallel_degree = 1 24 | ulysses_degree = 2 25 | ring_degree = 1 26 | parallel_args = ( 27 | f"--pipefusion_parallel_degree {pipefusion_parallel_degree} " 28 | f"--ulysses_degree {ulysses_degree} " 29 | f"--ring_degree {ring_degree} " 30 | ) 31 | compile_flag = "--use_torch_compile" 32 | 33 | cmd: str = ( 34 | f"{sys.executable} -m torch.distributed.run --nproc_per_node={n_gpus} {flux_script.as_posix()} " 35 | f"--model {model_id} " 36 | f"{parallel_args} " 37 | f"{task_args} " 38 | f"--num_inference_steps {inference_steps} " 39 | f"--warmup_steps {warmup_steps} " 40 | f"--prompt \"A dark tree.\" " 41 | ) 42 | cmd = shlex.split(cmd) 43 | print(cmd) 44 | subprocess.run(cmd, check=True) -------------------------------------------------------------------------------- /docs/performance/hunyuanvideo.md: -------------------------------------------------------------------------------- 1 | ## HunyuanVideo Performance Report 2 | 3 | xDiT is [HunyuanVideo](https://github.com/Tencent/HunyuanVideo?tab=readme-ov-file#-parallel-inference-on-multiple-gpus-by-xdit)'s official parallel inference engine. On H100 and H20 GPUs, xDiT reduces the generation time of 1028x720 videos from 31 minutes to 5 minutes, and 960x960 videos from 28 minutes to 6 minutes. 4 | 5 | The H100 and H20 performance benchmarks are done with the official HunyuanVideo repository. The L20 performance benchmarks are done with the `diffusers` implementation. 6 | The L20 performance benchmarks are measured using this [script](examples/hunyuan_video_usp_example.py), along with `flash-attn==2.7.2.post1` and CUDA 12.4. 7 | 8 | ### 1280x720 Resolution (129 frames, 50 steps) - Ulysses Latency (seconds) 9 | 10 |
| GPU Type | 1 GPU | 2 GPUs | 4 GPUs | 8 GPUs |
|----------|--------|---------|---------|---------|
| H100 | 1,904.08 | 925.04 | 514.08 | 337.58 |
| H20 | 6,639.17 | 3,400.55 | 1,762.86 | 940.97 |
| L20 | 6,043.88 | 3,271.44 | 2,080.05 | |
### 960x960 Resolution (129 frames, 50 steps) - Ulysses Latency (seconds)
| GPU Type | 1 GPU | 2 GPUs | 3 GPUs | 6 GPUs |
|----------|--------|---------|---------|---------|
| H100 | 1,735.01 | 934.09 | 645.45 | 367.02 |
| H20 | 6,621.46 | 3,400.55 | 2,310.48 | 1,214.67 |
| L20 | 6,039.08 | 3,260.62 | 2,284.74 | |
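Taken together, these figures correspond to roughly a 5.6x end-to-end speedup on 8xH100 for 1280x720 (1,904.08 s / 337.58 s) and about a 4.7x speedup on 6xH100 for 960x960 (1,735.01 s / 367.02 s), relative to a single GPU of the same type.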
31 | -------------------------------------------------------------------------------- /xfuser/model_executor/layers/base_layer.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod, ABCMeta 2 | from typing import List 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from xfuser.config.config import InputConfig, ParallelConfig, RuntimeConfig 8 | from xfuser.model_executor.base_wrapper import xFuserBaseWrapper 9 | 10 | 11 | class xFuserLayerBaseWrapper(nn.Module, xFuserBaseWrapper, metaclass=ABCMeta): 12 | 13 | def __init__(self, module: nn.Module): 14 | super().__init__() 15 | super(nn.Module, self).__init__(module=module) 16 | self.activation_cache = None 17 | 18 | def __getattr__(self, name: str): 19 | if "_parameters" in self.__dict__: 20 | _parameters = self.__dict__["_parameters"] 21 | if name in _parameters: 22 | return _parameters[name] 23 | if "_buffers" in self.__dict__: 24 | _buffers = self.__dict__["_buffers"] 25 | if name in _buffers: 26 | return _buffers[name] 27 | if "_modules" in self.__dict__: 28 | modules = self.__dict__["_modules"] 29 | if name in modules: 30 | return modules[name] 31 | try: 32 | return getattr(self.module, name) 33 | except RecursionError: 34 | raise AttributeError( 35 | f"module {type(self.module).__name__} has no " f"attribute {name}" 36 | ) 37 | 38 | @abstractmethod 39 | def forward(self, *args, **kwargs): 40 | pass 41 | -------------------------------------------------------------------------------- /xfuser/model_executor/models/customized/step_video_t2v/linear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from xfuser.core.distributed.parallel_state import ( 4 | get_tensor_model_parallel_rank, 5 | get_tp_group, 6 | get_tensor_model_parallel_world_size 7 | ) 8 | 9 | 10 | class ColumnParallelLinear(nn.Linear): 11 | def __init__(self, in_features, out_features, bias=True, gather_output=True, tp_group=None): 12 | self.tp_size = get_tensor_model_parallel_world_size() 13 | self.tp_rank = get_tensor_model_parallel_rank() 14 | self.tp_group = tp_group or get_tp_group() 15 | 16 | super().__init__(in_features, out_features, bias=bias) 17 | 18 | def forward(self, x): 19 | x = super().forward(x) 20 | return x 21 | 22 | 23 | class RowParallelLinear(nn.Linear): 24 | def __init__(self, in_features, out_features, bias=True, input_is_parallel=True, tp_group=None): 25 | self.tp_size = get_tensor_model_parallel_world_size() 26 | self.tp_rank = get_tensor_model_parallel_rank() 27 | self.tp_group = tp_group or get_tp_group() 28 | self.input_is_parallel = input_is_parallel 29 | 30 | super().__init__(in_features, out_features, bias=bias) 31 | 32 | def forward(self, x): 33 | if not self.input_is_parallel: 34 | x = torch.chunk(x, self.tp_size, dim=-1)[self.tp_rank] 35 | x = super().forward(x) 36 | # 执行All-Reduce聚合结果 37 | x = self.tp_group.all_reduce(x) 38 | return x 39 | -------------------------------------------------------------------------------- /docs/methods/ditfastattn.md: -------------------------------------------------------------------------------- 1 | ### DiTFastAttn: Attention Compression for Diffusion Transformer Models 2 | 3 | [DiTFastAttn](https://github.com/thu-nics/DiTFastAttn) is an acceleration solution for single-GPU DiTs inference, utilizing Input Temporal Reduction to reduce computational complexity through the following three methods: 4 | 5 | 1. 
Window Attention with Residual Caching to reduce spatial redundancy. 6 | 2. Temporal Similarity Reduction to exploit the similarity between steps. 7 | 3. Conditional Redundancy Elimination to skip redundant computations during conditional generation 8 | 9 | Currently, DiTFastAttn can only be used with data parallelism or on a single GPU. It does not support other parallel methods such as USP and PipeFusion. We plan to implement a parallel version of DiTFastAttn in the future. 10 | 11 | ## Download COCO Dataset 12 | ``` 13 | wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip 14 | unzip annotations_trainval2014.zip 15 | ``` 16 | 17 | ## Running 18 | 19 | Modify the dataset path in the script, then run 20 | 21 | ``` 22 | bash examples/run_fastditattn.sh 23 | ``` 24 | 25 | ## Reference 26 | 27 | ``` 28 | @misc{yuan2024ditfastattn, 29 | title={DiTFastAttn: Attention Compression for Diffusion Transformer Models}, 30 | author={Zhihang Yuan and Pu Lu and Hanling Zhang and Xuefei Ning and Linfeng Zhang and Tianchen Zhao and Shengen Yan and Guohao Dai and Yu Wang}, 31 | year={2024}, 32 | eprint={2406.08552}, 33 | archivePrefix={arXiv}, 34 | } 35 | ``` -------------------------------------------------------------------------------- /tests/core/test_envs.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch 3 | import torch 4 | from xfuser import envs 5 | 6 | class TestEnvs(unittest.TestCase): 7 | 8 | @patch('torch.cuda.is_available', return_value=True) 9 | def test_get_device_cuda(self, mock_is_available): 10 | device = envs.get_device(0) 11 | self.assertEqual(device.type, 'cuda') 12 | self.assertEqual(device.index, 0) 13 | device_name = envs.get_device_name() 14 | self.assertEqual(device_name, 'cuda') 15 | 16 | @patch('torch.cuda.is_available', return_value=False) 17 | @patch('xfuser.envs._is_mps', return_value=True) 18 | def test_get_device_mps(self, mock_is_mps, mock_is_available): 19 | device = envs.get_device(0) 20 | self.assertEqual(device.type, 'mps') 21 | device_name = envs.get_device_name() 22 | self.assertEqual(device_name, 'mps') 23 | # test that getting CUDA_VERSION does not raise an error 24 | cuda_version = envs.CUDA_VERSION 25 | self.assertIsNotNone(cuda_version) 26 | 27 | @patch('torch.cuda.is_available', return_value=False) 28 | @patch('xfuser.envs._is_mps', return_value=False) 29 | @patch('xfuser.envs._is_musa', return_value=False) 30 | def test_get_device_cpu(self, mock_is_musa, mock_is_mps, mock_is_available): 31 | device = envs.get_device(0) 32 | self.assertEqual(device.type, 'cpu') 33 | device_name = envs.get_device_name() 34 | self.assertEqual(device_name, 'cpu') 35 | 36 | if __name__ == '__main__': 37 | unittest.main() 38 | -------------------------------------------------------------------------------- /xfuser/model_executor/models/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from xfuser.config.diffusers import has_valid_diffusers_version 2 | from .register import xFuserTransformerWrappersRegister 3 | from .base_transformer import xFuserTransformerBaseWrapper 4 | from .pixart_transformer_2d import xFuserPixArtTransformer2DWrapper 5 | from .transformer_sd3 import xFuserSD3Transformer2DWrapper 6 | from .latte_transformer_3d import xFuserLatteTransformer3DWrapper 7 | from .hunyuan_transformer_2d import xFuserHunyuanDiT2DWrapper 8 | from .cogvideox_transformer_3d import xFuserCogVideoXTransformer3DWrapper 
9 | from .consisid_transformer_3d import xFuserConsisIDTransformer3DWrapper 10 | from .sana_transformer_2d import xFuserSanaTransformer2DWrapper 11 | 12 | __all__ = [ 13 | "xFuserTransformerWrappersRegister", 14 | "xFuserTransformerBaseWrapper", 15 | "xFuserPixArtTransformer2DWrapper", 16 | "xFuserSD3Transformer2DWrapper", 17 | "xFuserLatteTransformer3DWrapper", 18 | "xFuserCogVideoXTransformer3DWrapper", 19 | "xFuserHunyuanDiT2DWrapper", 20 | "xFuserConsisIDTransformer3DWrapper", 21 | "xFuserSanaTransformer2DWrapper" 22 | ] 23 | 24 | # Gating some imports based on diffusers version, as they import part of diffusers 25 | if has_valid_diffusers_version("flux"): 26 | from .transformer_flux import xFuserFluxTransformer2DWrapper 27 | __all__.append("xFuserFluxTransformer2DWrapper") 28 | 29 | 30 | if has_valid_diffusers_version("zimage"): 31 | from .transformer_z_image import xFuserZImageTransformer2DWrapper 32 | __all__.append("xFuserZImageTransformer2DWrapper") -------------------------------------------------------------------------------- /examples/run_multinodes.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | # nccl settings 4 | #export NCCL_DEBUG=INFO 5 | 6 | export CUDA_DEVICE_MAX_CONNECTIONS=1 7 | export NCCL_SOCKET_IFNAME=eth0 8 | export GLOO_SOCKET_IFNAME=eth0 9 | export NCCL_P2P_DISABLE=1 10 | 11 | #export NCCL_IB_GID_INDEX=3 12 | #export NCCL_IB_DISABLE=0 13 | #export NCCL_NET_GDR_LEVEL=2 14 | #export NCCL_IB_QPS_PER_CONNECTION=4 15 | #export NCCL_IB_TC=160 16 | #export NCCL_IB_TIMEOUT=22 17 | # export NCCL_P2P=0 18 | 19 | export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" 20 | 21 | export PYTHONPATH=$PWD:$PYTHONPATH 22 | 23 | 24 | NRANK=0 25 | MASTERIP=127.0.0.1 26 | MASTERPORT=6000 27 | DISTARGAS="--nnodes=2 --node_rank=${NRANK} --master_addr=${MASTERIP} --master_port=${MASTERPORT}" 28 | 29 | SCRIPT=pixartalpha_example.py 30 | MODEL_ID="/cfs/dit/PixArt-XL-2-1024-MS/" 31 | INFERENCE_STEP=20 32 | 33 | SIZE=1024 34 | GUIDANCE_SCALE=4.5 35 | PARALLEL_ARGS="--ulysses_degree=1 --ring_degree=1 --pipefusion_parallel_degree=8" 36 | TASK_ARGS="--height=${SIZE} --width=${SIZE} --no_use_resolution_binning --guidance_scale=${GUIDANCE_SCALE}" 37 | OUTPUT_ARGS="--output_type=latent" 38 | CFG_ARGS="--use_cfg_parallel" 39 | 40 | # PARALLLEL_VAE="--use_parallel_vae" 41 | # COMPILE_FLAG="--use_torch_compile" 42 | 43 | torchrun --nproc_per_node=8 $DISTARGAS \ 44 | ./examples/$SCRIPT \ 45 | --model=$MODEL_ID \ 46 | $PARALLEL_ARGS \ 47 | $TASK_ARGS \ 48 | $OUTPUT_ARGS \ 49 | --num_inference_steps $INFERENCE_STEP \ 50 | --warmup_steps=1 \ 51 | --prompt="brown dog laying on the ground with a metal bowl in front of him." 
\ 52 |     $CFG_ARGS \ 53 |     $PARALLLEL_VAE \ 54 |     $COMPILE_FLAG 55 | -------------------------------------------------------------------------------- /tests/layers/feedforward_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import unittest 3 | from diffusers.models.attention import FeedForward 4 | from xfuser.model_executor.layers.feedforward import xFuserFeedForwardWrapper 5 | from xfuser.core.distributed import ( 6 |     init_distributed_environment, 7 |     initialize_model_parallel, 8 | ) 9 | from torch import distributed as dist 10 | 11 | 12 | class TestFeedForward(unittest.TestCase): 13 |     def setUp(self): 14 |         init_distributed_environment() 15 | 16 |         self.world_size = dist.get_world_size() 17 |         self.local_rank = dist.get_rank() 18 | 19 |         initialize_model_parallel(tensor_parallel_degree=self.world_size) 20 | 21 |     def test_feedforward(self): 22 |         torch.manual_seed(0) 23 |         self.input_data = torch.ones(1, 20).cuda(self.local_rank) 24 |         dist.broadcast(self.input_data, src=0) 25 | 26 |         torch.manual_seed(0) 27 |         self.model1 = FeedForward(20, 5, bias=True, activation_fn="geglu").cuda( 28 |             self.local_rank 29 |         ) 30 | 31 |         # Broadcast the parameters 32 |         for param in self.model1.parameters(): 33 |             dist.broadcast(param.data, src=0) 34 | 35 |         output1 = self.model1(self.input_data) 36 | 37 |         self.model2 = xFuserFeedForwardWrapper(self.model1) 38 |         output2 = self.model2(self.input_data) 39 | 40 |         print(output1 - output2) 41 |         self.assertTrue(torch.allclose(output1, output2, atol=1e-2)) 42 | 43 | 44 | # torchrun --nproc_per_node=2 ./tests/layers/feedforward_test.py 45 | if __name__ == "__main__": 46 |     unittest.main() 47 | -------------------------------------------------------------------------------- /benchmark/fid/README.md: -------------------------------------------------------------------------------- 1 | ### Procedure 2 | #### Prerequisite 3 | First, install the following additional dependencies before testing: 4 | ``` 5 | pip3 install clean-fid 6 | ``` 7 | 8 | #### Reference Batch Preparation 9 | Download the COCO dataset from [here](https://huggingface.co/datasets/HuggingFaceM4/COCO); only the validation set and the caption dataset are needed. Unzip the [val2014.zip](http://images.cocodataset.org/zips/val2014.zip) and [caption_datasets.zip](https://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip) and you'll get the files in the following format: 10 | ``` 11 | val2014/ 12 |     COCO_val2014_000000xxxxxx.jpg 13 |     ... 14 | dataset_coco.json 15 | dataset_flickr30k.json 16 | dataset_flickr8k.json 17 | ``` 18 | Then run the following command to process the reference images: 19 | ``` 20 | python3 process_ref_data.py --coco_json dataset_coco.json --num_samples 30000 --input_dir $PATH_TO_VAL2014 --output_dir $REF_IMAGES_FOLODER 21 | ``` 22 | 23 | #### Sample Batch Generation 24 | Run the following command to generate the sample images: 25 | ``` 26 | bash ./benchmark/fid/generate.sh 27 | ``` 28 | You can edit `generate.sh` to change the model type, caption file, sample images folder, etc. 29 | 30 | #### Evaluate the results using clean-fid 31 | After completing the above procedure, you'll get the reference images and generated images in `$REF_IMAGES_FOLODER` and `$SAMPLE_IMAGES_FOLODER` (replace them with the corresponding folders).
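As an aside, the `clean-fid` package installed above also exposes a direct Python entry point for comparing two image folders, which is presumably what `compute_fid.py` drives under the hood. A minimal sketch of that call, assuming the reference and sample folders prepared above (the paths are placeholders):

```
from cleanfid import fid

# Placeholders for the folders prepared in the previous steps.
ref_dir = "/path/to/ref_images"        # $REF_IMAGES_FOLODER
sample_dir = "/path/to/sample_images"  # $SAMPLE_IMAGES_FOLODER

score = fid.compute_fid(ref_dir, sample_dir)
print(f"FID: {score:.3f}")
```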
You can evalute the results with `compute_fid.py` by running: 32 | 33 | ``` 34 | python compute_fid.py --ref_path $REF_IMAGES_FOLODER --sample_path $SAMPLE_IMAGES_FOLODER 35 | ``` 36 | -------------------------------------------------------------------------------- /xfuser/model_executor/schedulers/base_scheduler.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod, ABCMeta 2 | from functools import wraps 3 | from typing import List 4 | 5 | from diffusers.schedulers import SchedulerMixin 6 | from xfuser.core.distributed import ( 7 | get_pipeline_parallel_world_size, 8 | get_sequence_parallel_world_size, 9 | ) 10 | from xfuser.model_executor.base_wrapper import xFuserBaseWrapper 11 | 12 | 13 | class xFuserSchedulerBaseWrapper(xFuserBaseWrapper, metaclass=ABCMeta): 14 | def __init__( 15 | self, 16 | module: SchedulerMixin, 17 | ): 18 | super().__init__( 19 | module=module, 20 | ) 21 | 22 | def __setattr__(self, name, value): 23 | if name == "module": 24 | super().__setattr__(name, value) 25 | elif ( 26 | hasattr(self, "module") 27 | and self.module is not None 28 | and hasattr(self.module, name) 29 | ): 30 | setattr(self.module, name, value) 31 | else: 32 | super().__setattr__(name, value) 33 | 34 | @abstractmethod 35 | def step(self, *args, **kwargs): 36 | pass 37 | 38 | @staticmethod 39 | def check_to_use_naive_step(func): 40 | @wraps(func) 41 | def check_naive_step_fn(self, *args, **kwargs): 42 | if ( 43 | get_pipeline_parallel_world_size() == 1 44 | and get_sequence_parallel_world_size() == 1 45 | ): 46 | return self.module.step(*args, **kwargs) 47 | else: 48 | return func(self, *args, **kwargs) 49 | 50 | return check_naive_step_fn 51 | -------------------------------------------------------------------------------- /docs/methods/parallel_vae.md: -------------------------------------------------------------------------------- 1 | ## Patch Parallel VAE 2 | 3 | The [stabilityai/sd-vae-ft-mse](https://huggingface.co/stabilityai/sd-vae-ft-mse) adopted by diffusers bring OOM to high-resolution images (8192px on A100). A critical issue is the CUDA memory spike, as documented in [diffusers/issues/5924](https://github.com/huggingface/diffusers/issues/5924). 4 | 5 | To address this limitation, we developed [DistVAE](https://github.com/xdit-project/DistVAE), an solution that enables efficient processing of high-resolution images in parallel. Our approach incorporates two key strategies: 6 | 7 | * Patch Parallel: We divide the feature maps in the latent space into multiple patches and perform sequence parallel VAE decoding across different devices. This technique reduces the peak memory required for intermediate activations to 1/$N$, where N is the number of devices utilized. 8 | For the convolutional operator in VAE, we require the communication of the halo region data of the image as shown in the following figures. 9 | 10 |
11 | hybrid process group config 12 |
13 | 14 | * Chunked Input Processing: Similar to [MIT-patch-conv](https://hanlab.mit.edu/blog/patch-conv), we split the input feature map into chunks and feed them into the convolution operator sequentially. This approach minimizes temporary memory consumption. 15 | 16 | By synergizing these two methods, we have dramatically expanded the capabilities of VAE decoding. Our implementation successfully handles image resolutions up to 10240px - an impressive 11-fold increase compared to the default VAE implementation. -------------------------------------------------------------------------------- /docs/methods/usp.md: -------------------------------------------------------------------------------- 1 | ## USP: A Unified Sequence Parallelism Approach for Long Context Generative AI 2 | [Chinese Blog 1](https://zhuanlan.zhihu.com/p/698031151); [Chinese Blog 2](https://zhuanlan.zhihu.com/p/689067888) 3 | 4 | DeepSpeed-Ulysses and Ring-Attention are not mutually exclusive options. 5 | Both should be used in a mixed manner to jointly split the sequence dimension. 6 | When their parallelism degrees are adjusted so that ulysses-degree multiplied by ring-degree equals sp-degree, we refer to the combination as Unified-SP. 7 | The advantage of Unified-SP is that it encompasses the capabilities of both original methods without any loss, while offering additional benefits. 8 | First, it eliminates the restriction that Ulysses' sp-degree must be less than the number of attention heads. 9 | Moreover, the communication pattern of mixed parallelism is friendlier to heterogeneous networks, providing acceleration over PCIe and in multi-machine multi-GPU environments compared to using Ulysses or Ring alone. Therefore, we recommend using the Unified-SP implementation as the default sequence parallelism solution. 10 | 11 | In xDiT, we utilize the USP implementation from [feifeibear/long-context-attention](https://github.com/feifeibear/long-context-attention). Since DiT does not use Causal Attention, there is no need for load balancing operations on Ring-Attention. For more details, please refer to the following [paper](https://arxiv.org/abs/2405.07719). 12 | 13 | ``` 14 | @article{fang2024unified, 15 |   title={USP: A Unified Sequence Parallelism Approach for Long Context Generative AI}, 16 |   author={Fang, Jiarui and Zhao, Shangchun}, 17 |   journal={arXiv preprint arXiv:2405.07719}, 18 |   year={2024} 19 | } 20 | ``` -------------------------------------------------------------------------------- /xfuser/ray/worker/worker_wrappers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The xDiT team. 2 | # Adapted from 3 | # https://github.com/vllm-project/vllm/blob/main/vllm/worker/worker_base.py 4 | # Copyright (c) 2023, vLLM team. All rights reserved.
5 | import os 6 | from abc import ABC 7 | from typing import Any, Dict 8 | 9 | from xfuser.ray.worker.utils import update_environment_variables, resolve_obj_by_qualname 10 | from xfuser.config.config import ParallelConfig 11 | 12 | class BaseWorkerWrapper(ABC): 13 |     def __init__(self, worker_cls: str): 14 |         self.worker_cls = worker_cls 15 |         self.worker = None 16 | 17 |     # lazy import 18 |     def init_worker(self, *args, **kwargs): 19 |         worker_class = resolve_obj_by_qualname( 20 |             self.worker_cls) 21 |         self.worker = worker_class(*args, **kwargs) 22 |         assert self.worker is not None 23 | 24 |     def execute_method(self, method: str, *args, **kwargs) -> Any: 25 |         target = getattr(self, method, None) or getattr( 26 |             self.worker, method, None) 27 |         if not target: 28 |             raise AttributeError( 29 |                 f"Method {method} not found in Worker class") 30 |         return target(*args, **kwargs) 31 | 32 |     def update_environs(self, environs: Dict[str, str]): 33 |         if "CUDA_VISIBLE_DEVICES" in environs and "CUDA_VISIBLE_DEVICES" in os.environ: 34 |             del os.environ["CUDA_VISIBLE_DEVICES"] 35 |         update_environment_variables(environs) 36 | 37 | 38 | class RayWorkerWrapper(BaseWorkerWrapper): 39 |     def __init__(self, parallel_config: ParallelConfig, worker_cls: str, rank: int) -> None: 40 |         super().__init__(worker_cls) 41 |         self.init_worker(parallel_config, rank) -------------------------------------------------------------------------------- /docs/fid/FID.md: -------------------------------------------------------------------------------- 1 | 2 | ### Procedure 3 | #### Prerequisite 4 | First, install the following additional dependencies before testing: 5 | ``` 6 | pip3 install datasets tensorflow scipy 7 | ``` 8 | 9 | #### Sample Batch Generation 10 | Then you can use `scripts/generate.py` to generate images with COCO captions. An example command is as follows: 11 | ``` 12 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 --rdzv-endpoint=localhost:8070 scripts/generate.py --pipeline pixart --scheduler dpm-solver --warmup_steps 4 --parallelism pipeline --no_cuda_graph --dataset coco --no_split_batch --guidance_scale 2.0 --pp_num_patch 8.0 13 | ``` 14 | 15 | After that, you can use `scripts/npz.py` to pack the generated images into a `.npz` file, where `$GENERATED_IMAGES_FOLODER` is the path where you saved the generated images and `$IMAGES_NUM` is the total image count: 16 | ``` 17 | python3 scripts/npz.py --sample_dir $GENERATED_IMAGES_FOLODER --num $IMAGES_NUM 18 | ``` 19 | 20 | #### Reference Batch Generation 21 | To get the COCO reference images, you can run the following command: 22 | ``` 23 | python3 scripts/dump_coco.py 24 | ``` 25 | Then you can use `scripts/npz.py` to pack the reference images into a `.npz` file as well, where `$REF_IMAGES_FOLODER` is the path where you saved the reference images and `$IMAGES_NUM` is the total image count: 26 | ``` 27 | python3 scripts/npz.py --sample_dir $REF_IMAGES_FOLODER --num $IMAGES_NUM 28 | ``` 29 | 30 | #### Evaluate the results 31 | After completing the above procedure, you'll get two .npz files `$SAMPLE_NPZ` and `$REF_NPZ` (replace them with the corresponding files).
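For reference, the score reported by the evaluator is the Fréchet Inception Distance: each `.npz` batch is reduced to the mean and covariance of its Inception activations, and the Fréchet distance between the two resulting Gaussians is computed. A minimal sketch of that final step, assuming `mu1, sigma1, mu2, sigma2` have already been extracted (the helper name is illustrative, not part of the repository scripts):

```
import numpy as np
from scipy import linalg

def frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
    # FID = ||mu1 - mu2||^2 + Tr(sigma1 + sigma2 - 2 * sqrt(sigma1 @ sigma2))
    diff = mu1 - mu2
    covmean, _ = linalg.sqrtm(sigma1 @ sigma2, disp=False)
    if not np.isfinite(covmean).all():
        # Regularize a near-singular covariance product with a small diagonal offset.
        offset = np.eye(sigma1.shape[0]) * eps
        covmean, _ = linalg.sqrtm((sigma1 + offset) @ (sigma2 + offset), disp=False)
    covmean = covmean.real
    return diff @ diff + np.trace(sigma1) + np.trace(sigma2) - 2 * np.trace(covmean)
```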
You can evalute the results with `scripts/evaluator` by running: 32 | ``` 33 | python3 scripts/evaluator.py --ref_batch $REF_NPZ --sample_batch $SAMPLE_NPZ 34 | ``` -------------------------------------------------------------------------------- /docs/methods/hybrid_zh.md: -------------------------------------------------------------------------------- 1 | 2 | ## 混合并行设计 3 | 4 | xDiT设计目标是扩展推理过程到超大规模,比如异构网络互联条件下多机多卡,比如以太网和PCIe。单一并行方式,比如PipeFusion或者SP,很难同时做到这两点,不同并行方式混合在一起变得尤为重要。 5 | 6 | xDiT支持四种并行方式:PipeFusion、Sequence、Data和CFG Parallel。其中,Data和CFG Parallel在图像间并行相对简单,而PipeFusion和Sequence在图像内部的不同Patch间并行则较为复杂。能让着两种并行方式的混合使用,正式xDiT核心创新点之一。 7 | 8 | PipeFusion利用Input Tempor Redundancy特点,使用过时的KV(Stale KV)进行Attention计算,这使得PipeFusion无法像大型语言模型(LLM)那样轻松地实现并行策略的混合。具体来说,使用标准的序列并行接口,如RingAttention、Ulysses或USP,无法满足SP与PipeFusion混合并行的需求。 9 | 10 | 我们对这个问题具体说明,下图展示了pipe_degree=4,sp_degree=2的混合并行方法。设置`num_pipeline_patch`=4,图片切分为M=`num_pipeline_patch*sp_degree`=8个Patch,分别是P0~P7。 11 | 12 |
13 | hybrid process group config 14 |
15 | 16 | Standard SP Attention实现,输入Q,K,V和输出O都是沿着序列维度切分,且切分方式一致。如果不同rank的输入patch没有重叠,每个micro step计算出fresh KV更新的位置在不同rank间也没有重叠。如下图所示,standard SP的KV Buffer中黄色部分是SP0 rank=0拥有的fresh KV,绿色部分是SP1 rank=1拥有的fresh KV,二者并不相同。在这个diffusion step内,device=0无法拿到P1,3,5,7的fresh KV进行计算,但是PipeFusion则需要在下一个diffusion step中,拥有上一个diffusion step全部的KV。standard SP只拥有1/sp_degree的fresh kv buffer,因此无法获得混合并行推理正确的结果。 17 | 18 |
19 | hybrid parallel workflow 20 |
21 | 22 | 23 | 24 | xDiT专门定制了序列并行的实现方式,以适应这种混合并行的需求。xDiT使用`xFuserLongContextAttention`把SP的中间结果存在KV Buffer内。这样效果如下图,每个micro-step SP执行完毕后,SP Group内不同rank设备的fresh KV是replicate的。这样一个diffusion step后,SP Group所有设备的KV Buffer都更新成最新,供下一个Diffusion Step使用。 25 | 26 | 27 |
28 | kvbuffer in hybrid parallel 29 |
-------------------------------------------------------------------------------- /docs/performance/stepvideo.md: -------------------------------------------------------------------------------- 1 | ## Step-Video-T2V 30B Performance 2 | 3 | ### Evaluation Protocol 4 | The benchmark was conducted using the open-source Step-Video-T2V 30B model to evaluate SP (Sequence Parallelism) and TP (Tensor Parallelism) performance. We applied ulysses_degree as sp_degree. 5 | 6 | Implementation reference: 7 | `https://github.com/stepfun-ai/Step-Video-T2V/tree/main#multi-gpu-parallel-deployment` 8 | 9 | ### Nvidia H20 Cluster (8×NVLink) 10 | 11 | #### Parallel Strategy Comparison 12 | | GPUs | Parallel Type | Configuration | Latency | Speedup Ratio | Memory Usage | 13 | |-------|--------------|---------------|-----------|---------------|--------------------| 14 | | 1 | Baseline | `TP1 SP1` | 213.60s | 1.00x | 92,170M | 15 | | 2 | TP | `TP2` | 108.97s | 0.98x | 57,458M ▼37.7% | 16 | | 2 | SP | `SP2` | 108.13s | 0.99x | 86,258M ▼6.4% | 17 | | 4 | TP | `TP4` | 57.61s | 0.93x | 36,566M ▼60.3% | 18 | | 4 | SP | `SP4` | 57.01s | 0.94x | 78,226M ▼15.1% | 19 | | 8 | TP | `TP8` | 30.40s | 0.88x | 30,028M ▼67.4% | 20 | | 8 | SP | `SP8` | 30.10s | 0.89x | 79,684M ▼13.5% | 21 | 22 | #### Key Findings 23 | - **Hardware Compatibility**: 24 | - Consumer GPUs (5090/5090D): Full training support on 32GB×8 configuration 25 | - Inference Accelerators (L20/L40): Full parameter inference on 48GB×4 configuration 26 | 27 | - **Efficiency Metrics**: 28 | - TP8 achieves 67.4% memory optimization (53.9% higher than SP8) 29 | - Mixed-parallel latency trend remains within <12% deviation from theoretical expectation 30 | 31 | - **Scalability**: 32 | - Multi-dimensional parameter slicing enables near-linear scaling efficiency 33 | - Layered communication optimization reduces cross-node synchronization overhead by 75% 34 | -------------------------------------------------------------------------------- /examples/run_consisid.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | export PYTHONPATH=$PWD:$PYTHONPATH 5 | 6 | # ConsisID configuration 7 | SCRIPT="consisid_example.py" 8 | MODEL_ID="/cfs/dit/ConsisID-preview" 9 | INFERENCE_STEP=50 10 | 11 | mkdir -p ./results 12 | 13 | # ConsisID specific task args 14 | TASK_ARGS="--height 480 --width 720 --num_frames 49 --guidance_scale 6.0" 15 | 16 | # ConsisID parallel configuration 17 | N_GPUS=6 18 | PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 3" 19 | # CFG_ARGS="--use_cfg_parallel" 20 | 21 | # Uncomment and modify these as needed 22 | # PIPEFUSION_ARGS="--num_pipeline_patch 8" 23 | # OUTPUT_ARGS="--output_type latent" 24 | # PARALLLEL_VAE="--use_parallel_vae" 25 | # ENABLE_TILING="--enable_tiling" 26 | # COMPILE_FLAG="--use_torch_compile" 27 | 28 | torchrun --master_port=1234 --nproc_per_node=$N_GPUS ./examples/$SCRIPT \ 29 | --model $MODEL_ID \ 30 | $PARALLEL_ARGS \ 31 | $TASK_ARGS \ 32 | $PIPEFUSION_ARGS \ 33 | $OUTPUT_ARGS \ 34 | --num_inference_steps $INFERENCE_STEP \ 35 | --warmup_steps 0 \ 36 | --prompt "The video captures a boy walking along a city street, filmed in black and white on a classic 35mm camera. His expression is thoughtful, his brow slightly furrowed as if he's lost in contemplation. The film grain adds a textured, timeless quality to the image, evoking a sense of nostalgia. 
Around him, the cityscape is filled with vintage buildings, cobblestone sidewalks, and softly blurred figures passing by, their outlines faint and indistinct. Streetlights cast a gentle glow, while shadows play across the boy's path, adding depth to the scene. The lighting highlights the boy's subtle smile, hinting at a fleeting moment of curiosity. The overall cinematic atmosphere, complete with classic film still aesthetics and dramatic contrasts, gives the scene an evocative and introspective feel." \ 37 | --img_file_path "https://github.com/PKU-YuanGroup/ConsisID/blob/main/asserts/example_images/2.png?raw=true" \ 38 | $CFG_ARGS \ 39 | $PARALLLEL_VAE \ 40 | $ENABLE_TILING \ 41 | $COMPILE_FLAG -------------------------------------------------------------------------------- /examples/run_consisid_usp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | export PYTHONPATH=$PWD:$PYTHONPATH 5 | 6 | # ConsisID configuration 7 | SCRIPT="consisid_usp_example.py" 8 | MODEL_ID="/cfs/dit/ConsisID-preview" 9 | INFERENCE_STEP=50 10 | 11 | mkdir -p ./results 12 | 13 | # ConsisID specific task args 14 | TASK_ARGS="--height 480 --width 720 --num_frames 49 --guidance_scale 6.0" 15 | 16 | # ConsisID parallel configuration 17 | N_GPUS=4 18 | PARALLEL_ARGS="--ulysses_degree 1 --ring_degree 2" 19 | CFG_ARGS="--use_cfg_parallel" 20 | 21 | # Uncomment and modify these as needed 22 | # PIPEFUSION_ARGS="--num_pipeline_patch 8" 23 | # OUTPUT_ARGS="--output_type latent" 24 | # PARALLLEL_VAE="--use_parallel_vae" 25 | # ENABLE_TILING="--enable_tiling" 26 | # COMPILE_FLAG="--use_torch_compile" 27 | 28 | torchrun --master_port=1234 --nproc_per_node=$N_GPUS ./examples/$SCRIPT \ 29 | --model $MODEL_ID \ 30 | $PARALLEL_ARGS \ 31 | $TASK_ARGS \ 32 | $PIPEFUSION_ARGS \ 33 | $OUTPUT_ARGS \ 34 | --num_inference_steps $INFERENCE_STEP \ 35 | --warmup_steps 0 \ 36 | --prompt "The video captures a boy walking along a city street, filmed in black and white on a classic 35mm camera. His expression is thoughtful, his brow slightly furrowed as if he's lost in contemplation. The film grain adds a textured, timeless quality to the image, evoking a sense of nostalgia. Around him, the cityscape is filled with vintage buildings, cobblestone sidewalks, and softly blurred figures passing by, their outlines faint and indistinct. Streetlights cast a gentle glow, while shadows play across the boy's path, adding depth to the scene. The lighting highlights the boy's subtle smile, hinting at a fleeting moment of curiosity. The overall cinematic atmosphere, complete with classic film still aesthetics and dramatic contrasts, gives the scene an evocative and introspective feel." 
\ 37 | --img_file_path "https://github.com/PKU-YuanGroup/ConsisID/blob/main/asserts/example_images/2.png?raw=true" \ 38 | $CFG_ARGS \ 39 | $PARALLLEL_VAE \ 40 | $ENABLE_TILING \ 41 | $COMPILE_FLAG -------------------------------------------------------------------------------- /xfuser/model_executor/base_wrapper.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod, ABCMeta 2 | from functools import wraps 3 | from typing import Any, List, Optional 4 | 5 | from xfuser.core.distributed.parallel_state import ( 6 | get_classifier_free_guidance_world_size, 7 | get_pipeline_parallel_world_size, 8 | get_sequence_parallel_world_size, 9 | get_tensor_model_parallel_world_size, 10 | ) 11 | from xfuser.core.distributed.runtime_state import get_runtime_state 12 | from xfuser.core.fast_attention import get_fast_attn_enable 13 | 14 | 15 | class xFuserBaseWrapper(metaclass=ABCMeta): 16 | 17 | def __init__( 18 | self, 19 | module: Any, 20 | ): 21 | self.module = module 22 | self.module_type = type(module) 23 | 24 | def __getattr__(self, name: str): 25 | try: 26 | return getattr(self.module, name) 27 | except RecursionError: 28 | raise AttributeError( 29 | f"module {type(self.module).__name__} has no " f"attribute {name}" 30 | ) 31 | 32 | def __str__(self): 33 | return str(self.module) 34 | 35 | @staticmethod 36 | def forward_check_condition(func): 37 | @wraps(func) 38 | def check_condition_fn(self, *args, **kwargs): 39 | if ( 40 | get_pipeline_parallel_world_size() == 1 41 | and get_classifier_free_guidance_world_size() == 1 42 | and get_sequence_parallel_world_size() == 1 43 | and get_tensor_model_parallel_world_size() == 1 44 | and get_fast_attn_enable() == False 45 | ): 46 | return func(self, *args, **kwargs) 47 | if not get_runtime_state().is_ready(): 48 | raise ValueError( 49 | "Runtime state is not ready, please call RuntimeState.set_input_parameters " 50 | "before calling forward" 51 | ) 52 | return func(self, *args, **kwargs) 53 | 54 | return check_condition_fn 55 | -------------------------------------------------------------------------------- /xfuser/model_executor/layers/register.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Type 2 | import torch 3 | import torch.nn as nn 4 | 5 | from xfuser.logger import init_logger 6 | from xfuser.model_executor.layers.base_layer import xFuserLayerBaseWrapper 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | class xFuserLayerWrappersRegister: 12 | _XFUSER_LAYER_MAPPING: Dict[ 13 | Type[nn.Module], Type[xFuserLayerBaseWrapper] 14 | ] = {} 15 | 16 | @classmethod 17 | def register(cls, origin_layer_class: Type[nn.Module]): 18 | def decorator(xfuser_layer_wrapper: Type[xFuserLayerBaseWrapper]): 19 | if not issubclass(xfuser_layer_wrapper, xFuserLayerBaseWrapper): 20 | raise ValueError( 21 | f"{xfuser_layer_wrapper.__class__.__name__} is not a " 22 | f"subclass of xFuserLayerBaseWrapper" 23 | ) 24 | cls._XFUSER_LAYER_MAPPING[origin_layer_class] = xfuser_layer_wrapper 25 | return xfuser_layer_wrapper 26 | 27 | return decorator 28 | 29 | @classmethod 30 | def get_wrapper(cls, layer: nn.Module) -> xFuserLayerBaseWrapper: 31 | candidate = None 32 | candidate_origin = None 33 | for ( 34 | origin_layer_class, 35 | xfuser_layer_wrapper, 36 | ) in cls._XFUSER_LAYER_MAPPING.items(): 37 | if isinstance(layer, origin_layer_class): 38 | if ( 39 | (candidate is None and candidate_origin is None) 40 | or origin_layer_class == layer.__class__ 
41 | or issubclass(origin_layer_class, candidate_origin) 42 | ): 43 | candidate_origin = origin_layer_class 44 | candidate = xfuser_layer_wrapper 45 | 46 | if candidate is None: 47 | raise ValueError( 48 | f"Layer class {layer.__class__.__name__} " 49 | f"is not supported by xFuser" 50 | ) 51 | else: 52 | return candidate 53 | -------------------------------------------------------------------------------- /xfuser/model_executor/pipelines/pipeline_stable_diffusion_xl.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union 2 | import torch 3 | import os 4 | from xfuser.model_executor.patch.unet_patch import apply_unet_cfg_parallel_monkey_patch 5 | 6 | from diffusers import StableDiffusionXLPipeline 7 | from xfuser.model_executor.pipelines.base_pipeline import xFuserPipelineBaseWrapper 8 | from xfuser.core.distributed import ( 9 | get_classifier_free_guidance_world_size, 10 | ) 11 | from xfuser.config import EngineConfig, InputConfig 12 | from xfuser.model_executor.pipelines.register import xFuserPipelineWrapperRegister 13 | 14 | @xFuserPipelineWrapperRegister.register(StableDiffusionXLPipeline) 15 | class xFuserStableDiffusionXLPipeline(xFuserPipelineBaseWrapper): 16 | def __init__(self, pipeline: StableDiffusionXLPipeline, engine_config: EngineConfig): 17 | super().__init__(pipeline=pipeline, engine_config=engine_config) 18 | if get_classifier_free_guidance_world_size() == 2: 19 | self.module = apply_unet_cfg_parallel_monkey_patch(self.module) 20 | 21 | @classmethod 22 | def from_pretrained( 23 | cls, 24 | pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], 25 | engine_config: EngineConfig, 26 | return_org_pipeline: bool = False, 27 | **kwargs, 28 | ): 29 | pipeline = StableDiffusionXLPipeline.from_pretrained( 30 | pretrained_model_name_or_path, **kwargs 31 | ) 32 | if return_org_pipeline: 33 | return pipeline 34 | return cls(pipeline, engine_config) 35 | 36 | @xFuserPipelineBaseWrapper.check_model_parallel_state( 37 | sequence_parallel_available=False, 38 | pipefusion_parallel_available=False, 39 | ) 40 | @xFuserPipelineBaseWrapper.check_to_use_naive_forward 41 | @xFuserPipelineBaseWrapper.enable_data_parallel 42 | def __call__( 43 | self, 44 | *args, 45 | **kwargs, 46 | ): 47 | return self.module(*args, **kwargs) 48 | -------------------------------------------------------------------------------- /docs/performance/consisid_zh.md: -------------------------------------------------------------------------------- 1 | ## ConsisID Performance Report 2 | 3 | [ConsisID](https://github.com/PKU-YuanGroup/ConsisID) 是一种身份保持的文本到视频生成模型,其通过频率分解在生成的视频中保持面部一致性。xDiT 目前整合了 USP 技术(包括 Ulysses 注意力和 Ring 注意力)和 CFG 并行来提高推理速度,同时 PipeFusion 的工作正在进行中。我们对基于 diffusers 库的单 GPU ConsisID 推理与我们提出的并行化版本在生成 49帧(6秒)720x480 分辨率视频时的性能差异进行了深入分析。由于我们可以任意组合不同的并行方式以获得不同的性能。在本文中,我们对xDiT在1-6张H100(Nvidia)GPU上的加速性能进行了系统测试。 4 | 5 | 如表所示,对于模型ConsisID,无论是采用 Ulysses Attention、Ring Attention 还是 Classifier-Free Guidance(CFG)并行,均观察到推理延迟的显著降低。值得注意的是,由于其较低的通信开销,CFG 并行方法在性能上优于其他两种技术。通过结合序列并行和 CFG 并行,我们成功提升了推理效率。随着并行度的增加,推理延迟持续下降。在最优配置下,xDiT 相对于单GPU推理实现了 3.21 倍的加速,使得每次迭代仅需 0.72 秒。鉴于 ConsisID 默认的 50 次迭代,总计 35 秒即可完成 49帧 视频的端到端生成,并且运行过程中占用GPU显存40G。 6 | 7 | ### 720x480 Resolution (49 frames, 50 steps) 8 | 9 | 10 | | N-GPUs | ulysses_degree | ring_degree | cfg-parallel | times | 11 | |:------:|:--------------:|:-----------:|:------------:|:---------:| 12 | | 6 | 2 | 3 | 1 | 44.89s | 13 | | 6 | 3 | 2 | 1 | 44.24s | 14 | | 6 | 1 | 3 | 
2 | 35.78s | 15 | | 6 | 3 | 1 | 2 | 38.35s | 16 | | 4 | 2 | 1 | 2 | 41.37s | 17 | | 4 | 1 | 2 | 2 | 40.68s | 18 | | 3 | 3 | 1 | 1 | 53.57s | 19 | | 3 | 1 | 3 | 1 | 55.51s | 20 | | 2 | 1 | 2 | 1 | 70.19s | 21 | | 2 | 2 | 1 | 1 | 76.56s | 22 | | 2 | 1 | 1 | 2 | 59.72s | 23 | | 1 | 1 | 1 | 1 | 114.87s | 24 | 25 | ## Resources 26 | 27 | 通过以下资源了解有关 ConsisID 的更多信息: 28 | 29 | - 一段 [视频](https://www.youtube.com/watch?v=PhlgC-bI5SQ) 演示了 ConsisID 的主要功能; 30 | - 有关更多详细信息,请参阅研究论文 [Identity-Preserving Text-to-Video Generation by Frequency Decomposition](https://hf.co/papers/2411.17440)。 31 | -------------------------------------------------------------------------------- /xfuser/model_executor/models/customized/step_video_t2v/attentions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from einops import rearrange 4 | 5 | try: 6 | from xfuser.core.long_ctx_attention import xFuserLongContextAttention 7 | except ImportError: 8 | xFuserLongContextAttention = None 9 | 10 | 11 | class Attention(nn.Module): 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def attn_processor(self, attn_type): 16 | if attn_type == 'torch': 17 | return self.torch_attn_func 18 | elif attn_type == 'parallel': 19 | return self.parallel_attn_func 20 | else: 21 | raise Exception('Not supported attention type...') 22 | 23 | def torch_attn_func( 24 | self, 25 | q, 26 | k, 27 | v, 28 | attn_mask=None, 29 | causal=False, 30 | drop_rate=0.0, 31 | **kwargs 32 | ): 33 | 34 | if attn_mask is not None and attn_mask.dtype != torch.bool: 35 | attn_mask = attn_mask.to(q.dtype) 36 | 37 | if attn_mask is not None and attn_mask.ndim == 3: 38 | n_heads = q.shape[2] 39 | attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) 40 | 41 | q, k, v = map(lambda x: rearrange(x, 'b s h d -> b h s d'), (q, k, v)) 42 | x = torch.nn.functional.scaled_dot_product_attention( 43 | q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal 44 | ) 45 | x = rearrange(x, 'b h s d -> b s h d') 46 | return x 47 | 48 | def parallel_attn_func( 49 | self, 50 | q, 51 | k, 52 | v, 53 | causal=False, 54 | **kwargs 55 | ): 56 | assert xFuserLongContextAttention is not None; 57 | 'to use sequence parallel attention, xFuserLongContextAttention should be imported...' 
58 | hybrid_seq_parallel_attn = xFuserLongContextAttention() 59 | x = hybrid_seq_parallel_attn( 60 | None, q, k, v, causal=causal 61 | ) 62 | return x 63 | -------------------------------------------------------------------------------- /xfuser/model_executor/schedulers/register.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Type 2 | import torch 3 | import torch.nn as nn 4 | 5 | from xfuser.logger import init_logger 6 | from xfuser.model_executor.schedulers.base_scheduler import xFuserSchedulerBaseWrapper 7 | 8 | logger = init_logger(__name__) 9 | 10 | class xFuserSchedulerWrappersRegister: 11 | _XFUSER_SCHEDULER_MAPPING: Dict[ 12 | Type[nn.Module], 13 | Type[xFuserSchedulerBaseWrapper] 14 | ] = {} 15 | 16 | @classmethod 17 | def register(cls, origin_scheduler_class: Type[nn.Module]): 18 | def decorator(xfuser_scheduler_class: Type[nn.Module]): 19 | if not issubclass(xfuser_scheduler_class, 20 | xFuserSchedulerBaseWrapper): 21 | raise ValueError( 22 | f"{xfuser_scheduler_class.__class__.__name__} is not " 23 | f"a subclass of xFuserSchedulerBaseWrapper" 24 | ) 25 | cls._XFUSER_SCHEDULER_MAPPING[origin_scheduler_class] = \ 26 | xfuser_scheduler_class 27 | return xfuser_scheduler_class 28 | return decorator 29 | 30 | @classmethod 31 | def get_wrapper( 32 | cls, 33 | scheduler: nn.Module 34 | ) -> xFuserSchedulerBaseWrapper: 35 | candidate = None 36 | candidate_origin = None 37 | for (origin_scheduler_class, 38 | wrapper_class) in cls._XFUSER_SCHEDULER_MAPPING.items(): 39 | if isinstance(scheduler, origin_scheduler_class): 40 | if ((candidate is None and candidate_origin is None) or 41 | origin_scheduler_class == scheduler.__class__ or 42 | issubclass(origin_scheduler_class, candidate_origin)): 43 | candidate_origin = origin_scheduler_class 44 | candidate = wrapper_class 45 | 46 | if candidate is None: 47 | logger.info(f"Scheduler class {scheduler.__class__.__name__} " 48 | f"is not supported by xFuser") 49 | return None 50 | else: 51 | return candidate -------------------------------------------------------------------------------- /xfuser/model_executor/schedulers/scheduling_ddpm.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.distributed 5 | 6 | from diffusers.utils.torch_utils import randn_tensor 7 | from diffusers.schedulers.scheduling_ddpm import ( 8 | DDPMScheduler, 9 | DDPMSchedulerOutput, 10 | ) 11 | 12 | from xfuser.core.distributed import ( 13 | get_pipeline_parallel_world_size, 14 | get_sequence_parallel_world_size, 15 | get_runtime_state, 16 | ) 17 | from .register import xFuserSchedulerWrappersRegister 18 | from .base_scheduler import xFuserSchedulerBaseWrapper 19 | 20 | 21 | @xFuserSchedulerWrappersRegister.register(DDPMScheduler) 22 | class xFuserDDPMSchedulerWrapper(xFuserSchedulerBaseWrapper): 23 | 24 | @xFuserSchedulerBaseWrapper.check_to_use_naive_step 25 | def step( 26 | self, 27 | *args, 28 | generator=None, 29 | **kwargs, 30 | ) -> Union[DDPMSchedulerOutput, Tuple]: 31 | """ 32 | Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion 33 | process from the learned model outputs (most often the predicted noise). 34 | 35 | Args: 36 | model_output (`torch.Tensor`): 37 | The direct output from learned diffusion model. 38 | timestep (`float`): 39 | The current discrete timestep in the diffusion chain. 
40 | sample (`torch.Tensor`): 41 | A current instance of a sample created by the diffusion process. 42 | generator (`torch.Generator`, *optional*): 43 | A random number generator. 44 | return_dict (`bool`, *optional*, defaults to `True`): 45 | Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`. 46 | 47 | Returns: 48 | [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`: 49 | If return_dict is `True`, [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] is returned, otherwise a 50 | tuple is returned where the first element is the sample tensor. 51 | 52 | """ 53 | return self.module.step(*args, generator, **kwargs) 54 | -------------------------------------------------------------------------------- /examples/run_service.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | # export NCCL_PXN_DISABLE=1 4 | # # export NCCL_DEBUG=INFO 5 | # export NCCL_SOCKET_IFNAME=eth0 6 | # export NCCL_IB_GID_INDEX=3 7 | # export NCCL_IB_DISABLE=0 8 | # export NCCL_NET_GDR_LEVEL=2 9 | # export NCCL_IB_QPS_PER_CONNECTION=4 10 | # export NCCL_IB_TC=160 11 | # export NCCL_IB_TIMEOUT=22 12 | # export NCCL_P2P=0 13 | # export CUDA_DEVICE_MAX_CONNECTIONS=1 14 | 15 | export PYTHONPATH=$PWD:$PYTHONPATH 16 | 17 | # Select the model type 18 | # The model is downloaded to a specified location on disk, 19 | # or you can simply use the model's ID on Hugging Face, 20 | # which will then be downloaded to the default cache path on Hugging Face. 21 | 22 | export MODEL_TYPE="Flux" 23 | # Configuration for different model types 24 | # script, model_id, inference_step 25 | declare -A MODEL_CONFIGS=( 26 | ["Flux"]="flux_service.py /cfs/dit/FLUX.1-schnell 4" 27 | ) 28 | 29 | if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then 30 | IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}" 31 | export SCRIPT MODEL_ID INFERENCE_STEP 32 | else 33 | echo "Invalid MODEL_TYPE: $MODEL_TYPE" 34 | exit 1 35 | fi 36 | 37 | mkdir -p ./results 38 | 39 | for HEIGHT in 1024 40 | do 41 | for N_GPUS in 1; 42 | do 43 | 44 | TASK_ARGS="--height $HEIGHT --width $HEIGHT --no_use_resolution_binning --guidance_scale 3.5" 45 | 46 | PARALLEL_ARGS="--ulysses_degree 1 --ring_degree 1" 47 | 48 | 49 | 50 | # By default, num_pipeline_patch = pipefusion_degree, and you can tune this parameter to achieve optimal performance. 51 | # PIPEFUSION_ARGS="--num_pipeline_patch 8 " 52 | 53 | # For high-resolution images, we use the latent output type to avoid runing the vae module. Used for measuring speed. 54 | # OUTPUT_ARGS="--output_type latent" 55 | 56 | # PARALLLEL_VAE="--use_parallel_vae" 57 | 58 | # Another compile option is `--use_onediff` which will use onediff's compiler. 
59 | # COMPILE_FLAG="--use_torch_compile" 60 | 61 | python ./examples/$SCRIPT \ 62 | --model $MODEL_ID \ 63 | $PARALLEL_ARGS \ 64 | $TASK_ARGS \ 65 | $PIPEFUSION_ARGS \ 66 | $OUTPUT_ARGS \ 67 | --num_inference_steps $INFERENCE_STEP \ 68 | --warmup_steps 0 \ 69 | --prompt "A small dog" \ 70 | $CFG_ARGS \ 71 | $PARALLLEL_VAE \ 72 | $COMPILE_FLAG 73 | 74 | done 75 | done 76 | 77 | 78 | -------------------------------------------------------------------------------- /xfuser/parallel.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | from xfuser.config.config import InputConfig 5 | from xfuser.core.distributed import ( 6 | init_distributed_environment, 7 | initialize_model_parallel, 8 | ) 9 | from xfuser.config import EngineConfig 10 | from xfuser.core.distributed.parallel_state import ( 11 | get_data_parallel_rank, 12 | get_data_parallel_world_size, 13 | is_dp_last_group, 14 | ) 15 | from xfuser.core.distributed.runtime_state import get_runtime_state 16 | from xfuser.logger import init_logger 17 | from xfuser.model_executor.pipelines.base_pipeline import xFuserPipelineBaseWrapper 18 | from xfuser.model_executor.pipelines.register import xFuserPipelineWrapperRegister 19 | 20 | logger = init_logger(__name__) 21 | 22 | 23 | class xDiTParallel: 24 | def __init__(self, pipe, engine_config: EngineConfig, input_config: InputConfig): 25 | xfuser_pipe_wrapper = xFuserPipelineWrapperRegister.get_class(pipe) 26 | self.pipe = xfuser_pipe_wrapper(pipeline=pipe, engine_config=engine_config) 27 | self.config = engine_config 28 | self.pipe.prepare_run(input_config) 29 | 30 | def __call__( 31 | self, 32 | *args, 33 | **kwargs, 34 | ): 35 | self.result = self.pipe(*args, **kwargs) 36 | return self.result 37 | 38 | def save(self, directory: str, prefix: str): 39 | dp_rank = get_data_parallel_rank() 40 | parallel_info = ( 41 | f"dp{self.config.parallel_config.dp_degree}_cfg{self.config.parallel_config.cfg_degree}_" 42 | f"ulysses{self.config.parallel_config.ulysses_degree}_ring{self.config.parallel_config.ring_degree}_" 43 | f"pp{self.config.parallel_config.pp_degree}_patch{self.config.parallel_config.pp_config.num_pipeline_patch}" 44 | ) 45 | if is_dp_last_group(): 46 | path = Path(f"{directory}") 47 | path.mkdir(mode=755, parents=True, exist_ok=True) 48 | path = path / f"{prefix}_result_{parallel_info}_dprank{dp_rank}" 49 | for i, image in enumerate(self.result.images): 50 | image.save(f"{str(path)}_image{i}.png") 51 | print(f"{str(path)}_image{i}.png") 52 | 53 | def __del__(self): 54 | get_runtime_state().destroy_distributed_env() 55 | -------------------------------------------------------------------------------- /xfuser/model_executor/models/transformers/register.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Type 2 | import torch 3 | import torch.nn as nn 4 | 5 | from xfuser.logger import init_logger 6 | from xfuser.model_executor.models.transformers.base_transformer import ( 7 | xFuserTransformerBaseWrapper, 8 | ) 9 | 10 | logger = init_logger(__name__) 11 | 12 | 13 | class xFuserTransformerWrappersRegister: 14 | _XFUSER_TRANSFORMER_MAPPING: Dict[ 15 | Type[nn.Module], Type[xFuserTransformerBaseWrapper] 16 | ] = {} 17 | 18 | @classmethod 19 | def register(cls, origin_transformer_class: Type[nn.Module]): 20 | def decorator(xfuser_transformer_class: Type[nn.Module]): 21 | if not issubclass( 22 | xfuser_transformer_class, xFuserTransformerBaseWrapper 23 | ): 
24 | raise ValueError( 25 | f"{xfuser_transformer_class.__class__.__name__} is not " 26 | f"a subclass of xFuserTransformerBaseWrapper" 27 | ) 28 | cls._XFUSER_TRANSFORMER_MAPPING[origin_transformer_class] = ( 29 | xfuser_transformer_class 30 | ) 31 | return xfuser_transformer_class 32 | 33 | return decorator 34 | 35 | @classmethod 36 | def get_wrapper(cls, transformer: nn.Module) -> xFuserTransformerBaseWrapper: 37 | candidate = None 38 | candidate_origin = None 39 | for ( 40 | origin_transformer_class, 41 | wrapper_class, 42 | ) in cls._XFUSER_TRANSFORMER_MAPPING.items(): 43 | if origin_transformer_class is None: 44 | continue 45 | if isinstance(transformer, origin_transformer_class): 46 | if ( 47 | candidate is None 48 | or origin_transformer_class == transformer.__class__ 49 | or issubclass(origin_transformer_class, candidate_origin) 50 | ): 51 | candidate_origin = origin_transformer_class 52 | candidate = wrapper_class 53 | 54 | if candidate is None: 55 | raise ValueError( 56 | f"Transformer class {transformer.__class__.__name__} " 57 | f"is not supported by xFuser" 58 | ) 59 | else: 60 | return candidate 61 | -------------------------------------------------------------------------------- /docs/developer/adding_models/readme.md: -------------------------------------------------------------------------------- 1 | # Apply xDiT to new models 2 | 3 | xDiT was initially developed to accelerate the inference process of Diffusion Transformers (DiTs) within Huggingface `diffusers`. However, with the rapid emergence of various DiT models, you may find yourself needing to support new models that xDiT hasn't yet accommodated or models that are not officially supported by `diffusers` at all. 4 | 5 | xDiT offers interfaces for multiple parallelization methods, including CFG parallelism, sequence parallelism, and PipeFusion, shown as below. 6 | 7 |
8 | api.jpg 10 |
11 | 12 | CFG parallelism is the simplest method to implement, requiring only additional split and merge operations over the batch_size dimension during each iteration. By leveraging CFG parallelism, a nearly 2x speedup can be achieved when conducting inference on two GPUs. Sequence parallelism, on the other hand, involves splitting the sequence during each iteration and necessitates additional communication to handle attention computation in a distributed environment. xDiT introduces USP (Unified Sequence Parallelism), combining two existing sequence parallelism methods, Ulysses Attention and Ring Attention. 13 | 14 | PipeFusion is employed in situations where GPU memory is insufficient or the communication bandwidth between GPUs is low. The method distributes the model parameters among multiple GPUs. Supporting models with PipeFusion is more complex compared to CFG parallelism and USP, but it is useful on machines with limited GPU memory capacity or limited bandwidth. 15 | 16 | The parallelization methods mentioned above can be performed simultaneously to achieve further speed enhancements. For a detailed guide on leveraging CFG parallelism, USP, and PipeFusion using xDiT, refer to the following comprehensive tutorials. 17 | 18 | [Parallelize new models with CFG parallelism provided by xDiT](adding_model_cfg.md) 19 | 20 | [Parallelize new models with USP provided by xDiT](adding_model_usp.md) 21 | 22 | [Parallelize new models with USP provided by xDiT (text replica)](adding_model_usp_text_replica.md) 23 | 24 | [Parallelize new models with a hybrid of CFG parallelism and USP provided by xDiT](adding_model_cfg_usp.md) 25 | 26 | [Parallelize new models with PipeFusion, USP, and CFG parallelism provided by xDiT](adding_model_pipefusion.md) -------------------------------------------------------------------------------- /xfuser/core/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | from .parallel_state import ( 2 |     get_world_group, 3 |     get_dp_group, 4 |     get_cfg_group, 5 |     get_sp_group, 6 |     get_pp_group, 7 |     get_pipeline_parallel_world_size, 8 |     get_pipeline_parallel_rank, 9 |     is_pipeline_first_stage, 10 |     is_pipeline_last_stage, 11 |     get_data_parallel_world_size, 12 |     get_data_parallel_rank, 13 |     is_dp_last_group, 14 |     get_classifier_free_guidance_world_size, 15 |     get_classifier_free_guidance_rank, 16 |     get_sequence_parallel_world_size, 17 |     get_sequence_parallel_rank, 18 |     get_ulysses_parallel_world_size, 19 |     get_ulysses_parallel_rank, 20 |     get_ring_parallel_world_size, 21 |     get_ring_parallel_rank, 22 |     init_distributed_environment, 23 |     initialize_model_parallel, 24 |     model_parallel_is_initialized, 25 |     get_tensor_model_parallel_world_size, 26 |     get_vae_parallel_group, 27 |     get_vae_parallel_rank, 28 |     get_vae_parallel_world_size, 29 |     get_dit_world_size, 30 |     init_vae_group, 31 |     init_dit_group, 32 |     get_dit_group, 33 | ) 34 | from .runtime_state import ( 35 |     get_runtime_state, 36 |     runtime_state_is_initialized, 37 |     initialize_runtime_state, 38 | ) 39 | 40 | __all__ = [ 41 |     "get_world_group", 42 |     "get_dp_group", 43 |     "get_cfg_group", 44 |     "get_sp_group", 45 |     "get_pp_group", 46 |     "get_pipeline_parallel_world_size", 47 |     "get_pipeline_parallel_rank", 48 |     "is_pipeline_first_stage", 49 |     "is_pipeline_last_stage", 50 |     "get_data_parallel_world_size", 51 |     "get_data_parallel_rank", 52 |     "is_dp_last_group", 53 |     "get_classifier_free_guidance_world_size", 54 |     "get_classifier_free_guidance_rank", 55 |
"get_sequence_parallel_world_size", 56 | "get_sequence_parallel_rank", 57 | "get_ulysses_parallel_world_size", 58 | "get_ulysses_parallel_rank", 59 | "get_ring_parallel_world_size", 60 | "get_ring_parallel_rank", 61 | "init_distributed_environment", 62 | "init_model_parallel_group", 63 | "initialize_model_parallel", 64 | "model_parallel_is_initialized", 65 | "get_runtime_state", 66 | "runtime_state_is_initialized", 67 | "initialize_runtime_state", 68 | "get_dit_world_size", 69 | "get_vae_parallel_group", 70 | "get_vae_parallel_rank", 71 | "get_vae_parallel_world_size", 72 | "init_vae_group", 73 | "init_dit_group", 74 | "get_dit_group", 75 | ] 76 | -------------------------------------------------------------------------------- /examples/run_fastditattn.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | # export NCCL_PXN_DISABLE=1 4 | # # export NCCL_DEBUG=INFO 5 | # export NCCL_SOCKET_IFNAME=eth0 6 | # export NCCL_IB_GID_INDEX=3 7 | # export NCCL_IB_DISABLE=0 8 | # export NCCL_NET_GDR_LEVEL=2 9 | # export NCCL_IB_QPS_PER_CONNECTION=4 10 | # export NCCL_IB_TC=160 11 | # export NCCL_IB_TIMEOUT=22 12 | # export NCCL_P2P=0 13 | # export CUDA_DEVICE_MAX_CONNECTIONS=1 14 | 15 | export PYTHONPATH=$PWD:$PYTHONPATH 16 | 17 | # Select the model type 18 | # The model is downloaded to a specified location on disk, 19 | # or you can simply use the model's ID on Hugging Face, 20 | # which will then be downloaded to the default cache path on Hugging Face. 21 | 22 | export COCO_PATH="/cfs/fjr2/xDiT/coco/annotations/captions_val2014.json" 23 | export MODEL_TYPE="Pixart-alpha" 24 | # Configuration for different model types 25 | # script, model_id, inference_step 26 | declare -A MODEL_CONFIGS=( 27 | ["Pixart-alpha"]="pixartalpha_example.py /cfs/dit/PixArt-XL-2-1024-MS 20" 28 | ["Pixart-sigma"]="pixartsigma_example.py /cfs/dit/PixArt-Sigma-XL-2-2K-MS 20" 29 | ) 30 | 31 | if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then 32 | IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}" 33 | export SCRIPT MODEL_ID INFERENCE_STEP 34 | else 35 | echo "Invalid MODEL_TYPE: $MODEL_TYPE" 36 | exit 1 37 | fi 38 | 39 | mkdir -p ./results 40 | 41 | TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 4.5" 42 | FAST_ATTN_ARGS="--use_fast_attn --window_size 512 --n_calib 4 --threshold 0.15 --use_cache --coco_path $COCO_PATH" 43 | 44 | 45 | # By default, num_pipeline_patch = pipefusion_degree, and you can tune this parameter to achieve optimal performance. 46 | # PIPEFUSION_ARGS="--num_pipeline_patch 8 " 47 | 48 | # For high-resolution images, we use the latent output type to avoid runing the vae module. Used for measuring speed. 49 | # OUTPUT_ARGS="--output_type latent" 50 | 51 | # PARALLLEL_VAE="--use_parallel_vae" 52 | 53 | # Another compile option is `--use_onediff` which will use onediff's compiler. 
54 | # COMPILE_FLAG="--use_torch_compile" 55 | 56 | torchrun --nproc_per_node=1 ./examples/$SCRIPT \ 57 | --model $MODEL_ID \ 58 | $PARALLEL_ARGS \ 59 | $TASK_ARGS \ 60 | $PIPEFUSION_ARGS \ 61 | $OUTPUT_ARGS \ 62 | --num_inference_steps $INFERENCE_STEP \ 63 | --warmup_steps 0 \ 64 | --prompt "A small dog" \ 65 | $CFG_ARGS \ 66 | $FAST_ATTN_ARGS \ 67 | $PARALLLEL_VAE \ 68 | $COMPILE_FLAG 69 | -------------------------------------------------------------------------------- /xfuser/model_executor/cache/diffusers_adapters/flux.py: -------------------------------------------------------------------------------- 1 | """ 2 | adapted from https://github.com/ali-vilab/TeaCache.git 3 | adapted from https://github.com/chengzeyi/ParaAttention.git 4 | """ 5 | import functools 6 | import unittest 7 | 8 | import torch 9 | from torch import nn 10 | from diffusers import DiffusionPipeline, FluxTransformer2DModel 11 | from xfuser.model_executor.cache.diffusers_adapters.registry import TRANSFORMER_ADAPTER_REGISTRY 12 | 13 | from xfuser.model_executor.cache import utils 14 | 15 | def create_cached_transformer_blocks(use_cache, transformer, rel_l1_thresh, return_hidden_states_first, num_steps): 16 | cached_transformer_class = { 17 | "Fb": utils.FBCachedTransformerBlocks, 18 | "Tea": utils.TeaCachedTransformerBlocks, 19 | }.get(use_cache) 20 | 21 | if not cached_transformer_class: 22 | raise ValueError(f"Unsupported use_cache value: {use_cache}") 23 | 24 | return cached_transformer_class( 25 | transformer.transformer_blocks, 26 | transformer.single_transformer_blocks, 27 | transformer=transformer, 28 | rel_l1_thresh=rel_l1_thresh, 29 | return_hidden_states_first=return_hidden_states_first, 30 | num_steps=num_steps, 31 | name=TRANSFORMER_ADAPTER_REGISTRY.get(type(transformer)), 32 | ) 33 | 34 | 35 | def apply_cache_on_transformer( 36 | transformer: FluxTransformer2DModel, 37 | *, 38 | rel_l1_thresh=0.12, 39 | return_hidden_states_first=False, 40 | num_steps=8, 41 | use_cache="Fb", 42 | ): 43 | cached_transformer_blocks = nn.ModuleList([ 44 | create_cached_transformer_blocks(use_cache, transformer, rel_l1_thresh, return_hidden_states_first, num_steps) 45 | ]) 46 | 47 | dummy_single_transformer_blocks = torch.nn.ModuleList() 48 | 49 | original_forward = transformer.forward 50 | 51 | @functools.wraps(original_forward) 52 | def new_forward( 53 | self, 54 | *args, 55 | **kwargs, 56 | ): 57 | with unittest.mock.patch.object( 58 | self, 59 | "transformer_blocks", 60 | cached_transformer_blocks, 61 | ), unittest.mock.patch.object( 62 | self, 63 | "single_transformer_blocks", 64 | dummy_single_transformer_blocks, 65 | ): 66 | return original_forward( 67 | *args, 68 | **kwargs, 69 | ) 70 | 71 | transformer.forward = new_forward.__get__(transformer) 72 | 73 | return transformer 74 | 75 | -------------------------------------------------------------------------------- /benchmark/usp_latency_test.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import subprocess 4 | import argparse 5 | 6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7" 7 | 8 | 9 | def run_command(cmd): 10 | process = subprocess.Popen( 11 | cmd, 12 | shell=True, 13 | stdout=subprocess.PIPE, 14 | stderr=subprocess.STDOUT, 15 | universal_newlines=True, 16 | ) 17 | output = "" 18 | for line in process.stdout: 19 | if "epoch time:" in line or "Running test for size" in line: 20 | print(line.strip()) 21 | output += line 22 | process.wait() 23 | if process.returncode != 0: 24 | 
print(f"Command failed: {cmd}") 25 | print(output + "\n") 26 | # subprocess.run(cmd, shell=True, check=True) 27 | 28 | 29 | def main(): 30 | parser = argparse.ArgumentParser(description="Run benchmark tests") 31 | parser.add_argument("--model_id", type=str, required=True, help="Path to the model") 32 | parser.add_argument( 33 | "--sizes", type=int, nargs="+", required=True, help="List of sizes to test" 34 | ) 35 | parser.add_argument( 36 | "--script", 37 | type=str, 38 | required=True, 39 | help="Script to run (e.g., tests/test_pixartalpha.py)", 40 | ) 41 | parser.add_argument( 42 | "--n_gpus", type=int, nargs="+", required=True, help="Number of GPUs to use" 43 | ) 44 | parser.add_argument("--steps", type=int, default=20, help="Number of steps") 45 | args = parser.parse_args() 46 | MODEL_ID = args.model_id 47 | SIZES = args.sizes 48 | SCRIPT = args.script 49 | N_GPUS = args.n_gpus 50 | STEPS = args.steps 51 | 52 | for size in SIZES: 53 | for num_gpus in N_GPUS: 54 | for i in range(int(math.log2(num_gpus)) + 1): 55 | ulysses_degree = int(math.pow(2, i)) 56 | ring_degree = num_gpus // ulysses_degree 57 | 58 | print( 59 | f"Running test for size {size}, ulysses_degree {ulysses_degree}, ring_degree {ring_degree}", 60 | flush=True, 61 | ) 62 | cmd = ( 63 | f"torchrun --nproc_per_node={num_gpus} {SCRIPT} --prompt 'A small cat' --output_type 'latent' --model {MODEL_ID} " 64 | f"--height {size} --width {size} --ulysses_degree {ulysses_degree} --ring_degree {ring_degree} --num_inference_steps {STEPS}" 65 | ) 66 | 67 | run_command(cmd) 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /examples/run.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | export PYTHONPATH=$PWD:$PYTHONPATH 4 | 5 | # Select the model type 6 | export MODEL_TYPE="Flux" 7 | # Configuration for different model types 8 | # script, model_id, inference_step 9 | declare -A MODEL_CONFIGS=( 10 | ["Pixart-alpha"]="pixartalpha_example.py /cfs/dit/PixArt-XL-2-1024-MS 20" 11 | ["Pixart-sigma"]="pixartsigma_example.py /cfs/dit/PixArt-Sigma-XL-2-2K-MS 20" 12 | ["Sd3"]="sd3_example.py /cfs/dit/stable-diffusion-3-medium-diffusers 20" 13 | ["Flux"]="flux_example.py /cfs/dit/FLUX.1-dev/ 28" 14 | ["FluxControl"]="flux_control_example.py /cfs/dit/FLUX.1-Depth-dev/ 28" 15 | ["HunyuanDiT"]="hunyuandit_example.py /cfs/dit/HunyuanDiT-v1.2-Diffusers 50" 16 | ["SDXL"]="sdxl_example.py /cfs/dit/stable-diffusion-xl-base-1.0 30" 17 | ) 18 | 19 | if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then 20 | IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}" 21 | export SCRIPT MODEL_ID INFERENCE_STEP 22 | else 23 | echo "Invalid MODEL_TYPE: $MODEL_TYPE" 24 | exit 1 25 | fi 26 | 27 | mkdir -p ./results 28 | 29 | # task args 30 | TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5" 31 | 32 | # cache args 33 | # CACHE_ARGS="--use_teacache" 34 | # CACHE_ARGS="--use_fbcache" 35 | 36 | # On 8 gpus, pp=2, ulysses=2, ring=1, cfg_parallel=2 (split batch) 37 | N_GPUS=8 38 | PARALLEL_ARGS="--pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 2" 39 | 40 | # CFG_ARGS="--use_cfg_parallel" 41 | 42 | # By default, num_pipeline_patch = pipefusion_degree, and you can tune this parameter to achieve optimal performance. 43 | # PIPEFUSION_ARGS="--num_pipeline_patch 8 " 44 | 45 | # For high-resolution images, we use the latent output type to avoid runing the vae module. 
Used for measuring speed. 46 | # OUTPUT_ARGS="--output_type latent" 47 | 48 | # PARALLLEL_VAE="--use_parallel_vae" 49 | 50 | # Another compile option is `--use_onediff` which will use onediff's compiler. 51 | # COMPILE_FLAG="--use_torch_compile" 52 | 53 | 54 | # Use this flag to quantize the T5 text encoder, which could reduce the memory usage and have no effect on the result quality. 55 | # QUANTIZE_FLAG="--use_fp8_t5_encoder" 56 | 57 | # export CUDA_VISIBLE_DEVICES=4,5,6,7 58 | 59 | torchrun --nproc_per_node=$N_GPUS ./examples/$SCRIPT \ 60 | --model $MODEL_ID \ 61 | $PARALLEL_ARGS \ 62 | $TASK_ARGS \ 63 | $PIPEFUSION_ARGS \ 64 | $OUTPUT_ARGS \ 65 | --num_inference_steps $INFERENCE_STEP \ 66 | --warmup_steps 1 \ 67 | --prompt "brown dog laying on the ground with a metal bowl in front of him." \ 68 | $CFG_ARGS \ 69 | $PARALLLEL_VAE \ 70 | $COMPILE_FLAG \ 71 | $QUANTIZE_FLAG \ 72 | $CACHE_ARGS \ 73 | -------------------------------------------------------------------------------- /examples/ray/ray_pixartsigma_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import torch.distributed 5 | from transformers import T5EncoderModel 6 | from xfuser import xFuserPixArtSigmaPipeline, xFuserArgs 7 | from xfuser.config import FlexibleArgumentParser 8 | from xfuser.ray.pipeline.pipeline_utils import RayDiffusionPipeline 9 | 10 | def main(): 11 | os.environ["MASTER_ADDR"] = "localhost" 12 | os.environ["MASTER_PORT"] = "12355" 13 | parser = FlexibleArgumentParser(description="xFuser Arguments") 14 | args = xFuserArgs.add_cli_args(parser).parse_args() 15 | engine_args = xFuserArgs.from_cli_args(args) 16 | engine_config, input_config = engine_args.create_config() 17 | model_name = engine_config.model_config.model.split("/")[-1] 18 | encoder_kwargs = { 19 | 'text_encoder': { 20 | 'model_class': T5EncoderModel, 21 | 'pretrained_model_name_or_path': engine_config.model_config.model, 22 | 'subfolder': 'text_encoder', 23 | 'torch_dtype': torch.float16 24 | }, 25 | } 26 | # if args.use_fp8_t5_encoder: 27 | # from optimum.quanto import freeze, qfloat8, quantize 28 | # print(f"rank {local_rank} quantizing text encoder") 29 | # quantize(text_encoder, weights=qfloat8) 30 | # freeze(text_encoder) 31 | 32 | pipe = RayDiffusionPipeline.from_pretrained( 33 | PipelineClass=xFuserPixArtSigmaPipeline, 34 | pretrained_model_name_or_path=engine_config.model_config.model, 35 | engine_config=engine_config, 36 | torch_dtype=torch.float16, 37 | **encoder_kwargs 38 | ) 39 | pipe.prepare_run(input_config) 40 | 41 | torch.cuda.reset_peak_memory_stats() 42 | start_time = time.time() 43 | output = pipe( 44 | height=input_config.height, 45 | width=input_config.width, 46 | prompt=input_config.prompt, 47 | num_inference_steps=input_config.num_inference_steps, 48 | output_type=input_config.output_type, 49 | use_resolution_binning=input_config.use_resolution_binning, 50 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 51 | clean_caption=False, 52 | ) 53 | end_time = time.time() 54 | elapsed_time = end_time - start_time 55 | print(f"elapsed time:{elapsed_time}") 56 | if not os.path.exists("results"): 57 | os.mkdir("results") 58 | 59 | for _, images in enumerate(output): 60 | if images is not None: 61 | image = images[0] 62 | path = f"./results/{model_name}_ray_result.png" 63 | image.save(path) 64 | print( 65 | f"image saved to {path}" 66 | ) 67 | break 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | 
-------------------------------------------------------------------------------- /examples/ray/ray_pixartalpha_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import torch.distributed 5 | from transformers import T5EncoderModel 6 | from xfuser import xFuserArgs 7 | from xfuser.config import FlexibleArgumentParser 8 | from xfuser.ray.pipeline.pipeline_utils import RayDiffusionPipeline 9 | from xfuser.model_executor.pipelines import xFuserPixArtAlphaPipeline 10 | 11 | def main(): 12 | os.environ["MASTER_ADDR"] = "localhost" 13 | os.environ["MASTER_PORT"] = "12355" 14 | parser = FlexibleArgumentParser(description="xFuser Arguments") 15 | args = xFuserArgs.add_cli_args(parser).parse_args() 16 | engine_args = xFuserArgs.from_cli_args(args) 17 | engine_config, input_config = engine_args.create_config() 18 | model_name = engine_config.model_config.model.split("/")[-1] 19 | encoder_kwargs = { 20 | 'text_encoder': { 21 | 'model_class': T5EncoderModel, 22 | 'pretrained_model_name_or_path': engine_config.model_config.model, 23 | 'subfolder': 'text_encoder', 24 | 'torch_dtype': torch.float16 25 | }, 26 | } 27 | # if args.use_fp8_t5_encoder: 28 | # from optimum.quanto import freeze, qfloat8, quantize 29 | # print(f"rank {local_rank} quantizing text encoder") 30 | # quantize(text_encoder, weights=qfloat8) 31 | # freeze(text_encoder) 32 | 33 | pipe = RayDiffusionPipeline.from_pretrained( 34 | PipelineClass=xFuserPixArtAlphaPipeline, 35 | pretrained_model_name_or_path=engine_config.model_config.model, 36 | engine_config=engine_config, 37 | torch_dtype=torch.float16, 38 | **encoder_kwargs 39 | ) 40 | pipe.prepare_run(input_config) 41 | 42 | torch.cuda.reset_peak_memory_stats() 43 | start_time = time.time() 44 | output = pipe( 45 | height=input_config.height, 46 | width=input_config.width, 47 | prompt=input_config.prompt, 48 | num_inference_steps=input_config.num_inference_steps, 49 | output_type=input_config.output_type, 50 | use_resolution_binning=input_config.use_resolution_binning, 51 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 52 | ) 53 | end_time = time.time() 54 | elapsed_time = end_time - start_time 55 | print(f"elapsed time:{elapsed_time}") 56 | if not os.path.exists("results"): 57 | os.mkdir("results") 58 | 59 | for _, images in enumerate(output): 60 | if images is not None: 61 | image = images[0] 62 | path = f"./results/{model_name}_ray_result.png" 63 | image.save(path) 64 | print( 65 | f"image saved to {path}" 66 | ) 67 | break 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /examples/ray/ray_hunyuandit_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import torch.distributed 5 | from transformers import T5EncoderModel 6 | from xfuser import xFuserArgs 7 | from xfuser.config import FlexibleArgumentParser 8 | from xfuser.ray.pipeline.pipeline_utils import RayDiffusionPipeline 9 | from xfuser.model_executor.pipelines import xFuserHunyuanDiTPipeline 10 | 11 | def main(): 12 | os.environ["MASTER_ADDR"] = "localhost" 13 | os.environ["MASTER_PORT"] = "12355" 14 | parser = FlexibleArgumentParser(description="xFuser Arguments") 15 | args = xFuserArgs.add_cli_args(parser).parse_args() 16 | engine_args = xFuserArgs.from_cli_args(args) 17 | engine_config, input_config = engine_args.create_config() 
18 | model_name = engine_config.model_config.model.split("/")[-1] 19 | encoder_kwargs = { 20 | 'text_encoder_2': { 21 | 'model_class': T5EncoderModel, 22 | 'pretrained_model_name_or_path': engine_config.model_config.model, 23 | 'subfolder': 'text_encoder_2', 24 | 'torch_dtype': torch.bfloat16 25 | }, 26 | } 27 | # if args.use_fp8_t5_encoder: 28 | # from optimum.quanto import freeze, qfloat8, quantize 29 | # print(f"rank {local_rank} quantizing text encoder 2") 30 | # quantize(text_encoder_2, weights=qfloat8) 31 | # freeze(text_encoder_2) 32 | 33 | pipe = RayDiffusionPipeline.from_pretrained( 34 | PipelineClass=xFuserHunyuanDiTPipeline, 35 | pretrained_model_name_or_path=engine_config.model_config.model, 36 | engine_config=engine_config, 37 | torch_dtype=torch.float16, 38 | **encoder_kwargs 39 | ) 40 | 41 | pipe.prepare_run(input_config) 42 | 43 | torch.cuda.reset_peak_memory_stats() 44 | start_time = time.time() 45 | output = pipe( 46 | height=input_config.height, 47 | width=input_config.width, 48 | prompt=input_config.prompt, 49 | num_inference_steps=input_config.num_inference_steps, 50 | output_type=input_config.output_type, 51 | use_resolution_binning=input_config.use_resolution_binning, 52 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 53 | ) 54 | end_time = time.time() 55 | elapsed_time = end_time - start_time 56 | print(f"elapsed time:{elapsed_time}") 57 | if not os.path.exists("results"): 58 | os.mkdir("results") 59 | 60 | for _, images in enumerate(output): 61 | if images is not None: 62 | image = images[0] 63 | path = f"./results/{model_name}_ray_result.png" 64 | image.save(path) 65 | print( 66 | f"image saved to {path}" 67 | ) 68 | break 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | -------------------------------------------------------------------------------- /docs/performance/hunyuandit_zh.md: -------------------------------------------------------------------------------- 1 | ## HunyuanDiT性能 2 | 3 | ## 8xA100 (NVLink) 4 | 在8xA100(NVLink)机器上,在使用不同GPU数目时,最佳的并行方案都是不同的。这说明了多种并行和混合并行的重要性。 5 | 最佳的并行策略在不同GPU规模时分别是:在2个GPU上,使用`ulysses_degree=2`;在4个GPU上,使用`cfg_parallel=2, ulysses_degree=2`;在8个GPU上,使用`cfg_parallel=2, pipefusion_parallel=4`。 6 | 7 | torch.compile带来的加速效果也很可观,同样并行方案有1.26x到1.76x加速效果,对于8 GPU的场景是最明显的,有1.76x加速。 8 | 9 |
10 | [figure: latency-hunyuandit_a100] 12 |
13 | 14 | 15 | 下图展示了HunyuanDiT在8xA100 GPU上的可扩展性。我们额外测试了2048px图像生成任务,尽管HunyuanDiT并不具备生成2048px图像的能力。 16 | HunyuanDiT采用DiT块通过Skip Connection相互连接的结构,每个DiT块既与相邻块相连,也与非相邻块相连。 17 | 18 | 对于1024px图像生成任务,最佳的混合并行配置如下:2个GPU时使用pipefusion=2;4个GPU时使用cfg=2, pipefusion=2;8个GPU时使用cfg=2, pipefusion=4。PipeFusion的预热步数设为1。 19 | 在4个和8个GPU上,混合并行分别比单一并行方法获得了1.04x和1.23x的加速。在8个GPU上,PipeFusion的延迟低于SP-Ulysses,但在4个GPU上两者延迟相近。 20 | 在所有并行方法中,SP-Ring展现出最差的可扩展性。 21 | 22 | 对于2048px图像生成任务,8个GPU时的最佳混合并行配置变为cfg=2, pipefusion=2, ring=2。 23 | 同样,在4个和8个GPU上,混合并行相比单一并行方法获得了小幅提升。然而,在使用4个或8个GPU时,由于GPU之间Skip Connection需要额外的点对点通信,PipeFusion表现出比SP-Ulysses和SP-Ring更高的延迟。 24 | 当PipeFusion的并行度为2时,这个问题得到缓解,这突显了在混合配置中使用合适并行度的重要性。 25 | 随着图像尺寸从1024px增加到2048px,SP-Ring和SP-Ulysses之间的性能差距减小,这是因为模型的计算通信比降低,使得SP-Ring能够隐藏更多的通信开销。 26 | 27 |
28 | [figure: latency-hunyuandit_a100] 30 |
31 | 32 | 33 | ## 8xL40 (PCIe) 34 | 35 | 在8xL40 (PCIe)上的延迟情况如下图所示。同样,不同GPU规模,最佳并行策略都是不同的。 36 | 和A100上不同,在L40上,8 GPU和4 GPU的延迟没有明显变化。我们认为是PCIe导致跨socket之间通信带宽过低导致的。 37 | 38 | torch.compile带来1.2x到1.43x加速。 39 | 40 |
41 | [figure: latency-hunyuandit_l40] 43 |
44 | 45 | ## 8xL20 (PCIe) 46 | 47 | 在8xL20 (PCIe)上的延迟情况如下图所示。L20的FP16 FLOPS是119.5 TFLOPS,相比L40是181.05 TFLOPS。但是在8 GPU上,L20的延迟反而相比L40更低。 48 | 49 |
50 | [figure: latency-hunyuandit_l40] 52 |
53 | 54 | 55 | ## 8xV100 (NVLink) 56 | 57 | 在8xV100上的加速下如下图所示。torch.compile带来1.10x到1.30x加速。 58 | 59 |
60 | [figure: latency-hunyuandit_v100] 62 |
63 | 64 | ## 4xT4 (PCIe) 65 | 66 | 在4xT4上的加速下如下图所示。 67 | 68 |
69 | [figure: latency-hunyuandit_t4] 71 |
72 | -------------------------------------------------------------------------------- /examples/ray/ray_run.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | # If using a Ray cluster across multiple machines, you need to manually start a Ray cluster like this: 3 | # ray start --head --port=6379 for master node 4 | # ray start --address='192.168.1.1:6379' for worker node 5 | # otherwise, it is not necessary. (for single node) 6 | 7 | export PYTHONPATH=$PWD:$PYTHONPATH 8 | 9 | # Select the model type 10 | export MODEL_TYPE="Flux" 11 | # Configuration for different model types 12 | # script, model_id, inference_step 13 | declare -A MODEL_CONFIGS=( 14 | ["Sd3"]="ray_sd3_example.py /cfs/dit/stable-diffusion-3-medium-diffusers 20" 15 | ["Flux"]="ray_flux_example.py /cfs/dit/FLUX.1-dev 28" 16 | ["Pixart-alpha"]="ray_pixartalpha_example.py /cfs/dit/PixArt-XL-2-1024-MS 20" 17 | ["Pixart-sigma"]="ray_pixartsigma_example.py /cfs/dit/PixArt-Sigma-XL-2-1024-MS 20" 18 | ["HunyuanDiT"]="ray_hunyuandit_example.py /cfs/dit/HunyuanDiT-v1.2-Diffusers 50" 19 | ) 20 | 21 | if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then 22 | IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}" 23 | export SCRIPT MODEL_ID INFERENCE_STEP 24 | else 25 | echo "Invalid MODEL_TYPE: $MODEL_TYPE" 26 | exit 1 27 | fi 28 | 29 | mkdir -p ./results 30 | 31 | # task args 32 | TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning" 33 | 34 | 35 | N_GPUS=2 # world size 36 | PARALLEL_ARGS="--pipefusion_parallel_degree 1 --ulysses_degree 1 --ring_degree 1" 37 | VAE_PARALLEL_SIZE=1 38 | DIT_PARALLEL_SIZE=1 39 | # CFG_ARGS="--use_cfg_parallel" 40 | 41 | # By default, num_pipeline_patch = pipefusion_degree, and you can tune this parameter to achieve optimal performance. 42 | # PIPEFUSION_ARGS="--num_pipeline_patch 8 " 43 | 44 | # For high-resolution images, we use the latent output type to avoid runing the vae module. Used for measuring speed. 45 | # OUTPUT_ARGS="--output_type latent" 46 | 47 | # PARALLLEL_VAE="--use_parallel_vae" 48 | 49 | # Another compile option is `--use_onediff` which will use onediff's compiler. 50 | # COMPILE_FLAG="--use_torch_compile" 51 | 52 | 53 | # Use this flag to quantize the T5 text encoder, which could reduce the memory usage and have no effect on the result quality. 54 | # QUANTIZE_FLAG="--use_fp8_t5_encoder" 55 | 56 | # It is necessary to set CUDA_VISIBLE_DEVICES for the ray driver and workers. 57 | export CUDA_VISIBLE_DEVICES=4,5,6,7 58 | 59 | python ./examples/ray/$SCRIPT \ 60 | --model $MODEL_ID \ 61 | $PARALLEL_ARGS \ 62 | $TASK_ARGS \ 63 | $PIPEFUSION_ARGS \ 64 | $OUTPUT_ARGS \ 65 | --num_inference_steps $INFERENCE_STEP \ 66 | --warmup_steps 1 \ 67 | --prompt "brown dog laying on the ground with a metal bowl in front of him." 
\ 68 | --use_ray \ 69 | --ray_world_size $N_GPUS \ 70 | $CFG_ARGS \ 71 | $PARALLLEL_VAE \ 72 | $COMPILE_FLAG \ 73 | $QUANTIZE_FLAG \ 74 | --use_parallel_vae \ 75 | --dit_parallel_size $DIT_PARALLEL_SIZE \ 76 | --vae_parallel_size $VAE_PARALLEL_SIZE 77 | -------------------------------------------------------------------------------- /xfuser/model_executor/schedulers/scheduling_dpm_cogvideox.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.distributed 5 | 6 | from diffusers.utils.torch_utils import randn_tensor 7 | from diffusers.schedulers.scheduling_dpm_cogvideox import ( 8 | CogVideoXDPMScheduler, 9 | DDIMSchedulerOutput, 10 | ) 11 | 12 | from .register import xFuserSchedulerWrappersRegister 13 | from .base_scheduler import xFuserSchedulerBaseWrapper 14 | 15 | 16 | @xFuserSchedulerWrappersRegister.register(CogVideoXDPMScheduler) 17 | class xFuserCogVideoXDPMSchedulerWrapper(xFuserSchedulerBaseWrapper): 18 | 19 | @xFuserSchedulerBaseWrapper.check_to_use_naive_step 20 | def step( 21 | self, 22 | *args, 23 | **kwargs, 24 | ) -> Union[DDIMSchedulerOutput, Tuple]: 25 | """ 26 | Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion 27 | process from the learned model outputs (most often the predicted noise). 28 | 29 | Args: 30 | model_output (`torch.Tensor`): 31 | The direct output from learned diffusion model. 32 | timestep (`float`): 33 | The current discrete timestep in the diffusion chain. 34 | sample (`torch.Tensor`): 35 | A current instance of a sample created by the diffusion process. 36 | eta (`float`): 37 | The weight of noise for added noise in diffusion step. 38 | use_clipped_model_output (`bool`, defaults to `False`): 39 | If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary 40 | because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no 41 | clipping has happened, "corrected" `model_output` would coincide with the one provided as input and 42 | `use_clipped_model_output` has no effect. 43 | generator (`torch.Generator`, *optional*): 44 | A random number generator. 45 | variance_noise (`torch.Tensor`): 46 | Alternative to generating noise with `generator` by directly providing the noise for the variance 47 | itself. Useful for methods such as [`CycleDiffusion`]. 48 | return_dict (`bool`, *optional*, defaults to `True`): 49 | Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`. 50 | 51 | Returns: 52 | [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`: 53 | If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a 54 | tuple is returned where the first element is the sample tensor. 
55 | 56 | """ 57 | return self.module.step(*args, **kwargs) 58 | -------------------------------------------------------------------------------- /xfuser/model_executor/schedulers/scheduling_ddim_cogvideox.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.distributed 5 | 6 | from diffusers.utils.torch_utils import randn_tensor 7 | from diffusers.schedulers.scheduling_ddim_cogvideox import ( 8 | CogVideoXDDIMScheduler, 9 | DDIMSchedulerOutput, 10 | ) 11 | 12 | from .register import xFuserSchedulerWrappersRegister 13 | from .base_scheduler import xFuserSchedulerBaseWrapper 14 | 15 | 16 | @xFuserSchedulerWrappersRegister.register(CogVideoXDDIMScheduler) 17 | class xFuserCogVideoXDDIMSchedulerWrapper(xFuserSchedulerBaseWrapper): 18 | 19 | @xFuserSchedulerBaseWrapper.check_to_use_naive_step 20 | def step( 21 | self, 22 | *args, 23 | **kwargs, 24 | ) -> Union[DDIMSchedulerOutput, Tuple]: 25 | """ 26 | Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion 27 | process from the learned model outputs (most often the predicted noise). 28 | 29 | Args: 30 | model_output (`torch.Tensor`): 31 | The direct output from learned diffusion model. 32 | timestep (`float`): 33 | The current discrete timestep in the diffusion chain. 34 | sample (`torch.Tensor`): 35 | A current instance of a sample created by the diffusion process. 36 | eta (`float`): 37 | The weight of noise for added noise in diffusion step. 38 | use_clipped_model_output (`bool`, defaults to `False`): 39 | If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary 40 | because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no 41 | clipping has happened, "corrected" `model_output` would coincide with the one provided as input and 42 | `use_clipped_model_output` has no effect. 43 | generator (`torch.Generator`, *optional*): 44 | A random number generator. 45 | variance_noise (`torch.Tensor`): 46 | Alternative to generating noise with `generator` by directly providing the noise for the variance 47 | itself. Useful for methods such as [`CycleDiffusion`]. 48 | return_dict (`bool`, *optional*, defaults to `True`): 49 | Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`. 50 | 51 | Returns: 52 | [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`: 53 | If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a 54 | tuple is returned where the first element is the sample tensor. 
55 | 56 | """ 57 | return self.module.step(*args, **kwargs) 58 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | import subprocess 3 | 4 | 5 | def get_cuda_version(): 6 | try: 7 | nvcc_version = subprocess.check_output(["nvcc", "--version"]).decode("utf-8") 8 | version_line = [line for line in nvcc_version.split("\n") if "release" in line][ 9 | 0 10 | ] 11 | cuda_version = version_line.split(" ")[-2].replace(",", "") 12 | return "cu" + cuda_version.replace(".", "") 13 | except (subprocess.CalledProcessError, FileNotFoundError): 14 | return "no_cuda" 15 | 16 | 17 | if __name__ == "__main__": 18 | with open("README.md", "r") as f: 19 | long_description = f.read() 20 | fp = open("xfuser/__version__.py", "r").read() 21 | version = eval(fp.strip().split()[-1]) 22 | 23 | setup( 24 | name="xfuser", 25 | author="xDiT Team", 26 | author_email="fangjiarui123@gmail.com", 27 | packages=find_packages(), 28 | install_requires=[ 29 | "torch>=2.4.1", 30 | "accelerate>=0.33.0", 31 | "transformers>=4.39.1", 32 | "sentencepiece>=0.1.99", 33 | "beautifulsoup4>=4.12.3", 34 | "distvae", 35 | "yunchang>=0.6.0", 36 | "einops", 37 | "diffusers>=0.33.0", 38 | ], 39 | extras_require={ 40 | "flash-attn": [ 41 | "flash-attn>=2.6.0", # NOTE: flash-attn is necessary if ring_degree > 1 42 | ], 43 | "optimum-quanto": [ 44 | "optimum-quanto", # NOTE: optimum-quanto is necessary if use_fp8_t5_encoder is enabled 45 | ], 46 | "torchao": [ 47 | "torchao", # NOTE: torchao is necessary if use_fp8_gemms is enabled 48 | ], 49 | "flask": [ 50 | "flask", # NOTE: flask is necessary to run xDiT as an http service 51 | ], 52 | "ray": [ 53 | "ray", # NOTE: ray is necessary if RayDiffusionPipeline is used 54 | ], 55 | "opencv-python": [ 56 | "opencv-python-headless", # NOTE: opencv-python is necessary if ConsisIDPipeline is used 57 | ], 58 | "test": [ 59 | "pytest", 60 | "imageio", 61 | "imageio-ffmpeg" 62 | ] 63 | }, 64 | url="https://github.com/xdit-project/xDiT.", 65 | description="A Scalable Inference Engine for Diffusion Transformers (DiTs) on Multiple Computing Devices", 66 | long_description=long_description, 67 | long_description_content_type="text/markdown", 68 | version=version, 69 | classifiers=[ 70 | "Programming Language :: Python :: 3", 71 | "Operating System :: OS Independent", 72 | ], 73 | include_package_data=True, 74 | python_requires=">=3.10", 75 | ) 76 | -------------------------------------------------------------------------------- /docs/performance/cogvideo_zh.md: -------------------------------------------------------------------------------- 1 | ## CogVideoX 性能表现 2 | 3 | CogVideoX/CogVideoX1.5 是有文本/图像生成视频的模型。xDiT 目前整合了 USP 技术(包括 Ulysses 注意力和 Ring 注意力)和 CFG 并行来提高推理速度,同时 PipeFusion 的工作正在进行中。我们对基于 `diffusers` 库的单 GPU CogVideoX 推理与我们提出的并行化版本在生成 49帧(6秒)720x480 分辨率视频时的性能差异进行了深入分析。由于我们可以任意组合不同的并行方式以获得不同的性能。在本文中,我们对xDiT在1-12张L40(PCIe)GPU上的加速性能进行了系统测试。 4 | 5 | ### CogVideoX-2b/5b 6 | 7 | 如图所示,对于基础模型 CogVideoX-2b,无论是采用 Ulysses Attention、Ring Attention 还是 Classifier-Free Guidance(CFG)并行,均观察到推理延迟的显著降低。值得注意的是,由于其较低的通信开销,CFG 并行方法在性能上优于其他两种技术。通过结合序列并行和 CFG 并行,我们成功提升了推理效率。随着并行度的增加,推理延迟持续下降。在最优配置下,xDiT 相对于单GPU推理实现了 4.29 倍的加速,使得每次迭代仅需 0.49 秒。鉴于 CogVideoX 默认的 50 次迭代,总计 30 秒即可完成 24.5 秒视频的端到端生成。 8 | 9 |
10 | [figure: latency-cogvideo-l40-2b] 12 |
13 | 14 | 针对更复杂的CogVideoX-5b模型,虽然参数增加以提升视频质量和视觉效果,导致计算成本显著上升,但在该模型上,所有方法仍然保持与CogVideoX-2b相似的性能趋势,且并行版本的加速效果进一步提升。相较于单GPU版本,xDiT实现了高达7.75倍的推理速度提升,将端到端视频生成时间缩短至约40秒。 15 | 16 |
17 | [figure: latency-cogvideo-l40-5b] 19 |
20 | 21 | 在搭载A100 GPU的系统中,xDiT 在 CogVideoX-2b 和 CogVideoX-5b 上展现出类似的加速效果,具体表现可见下方两图。 22 | 23 |
24 | [figure: latency-cogvideo-a100-2b] 26 |
27 | 28 | 29 |
30 | [figure: latency-cogvideo-a100-5b] 32 |
33 | 34 | ### CogVideoX1.5-5B 35 | 36 | 同样,我们在配备了L40(PCIe)GPU的系统上用CogVideoX1.5-5B生成161帧1360x768分辨率的视频,我们对比了diffusers库中单卡的推理实现与xDiT的并行版本在推理延迟上的差异。 37 | 如图所示,无论Ulysses Attention、Ring Attention还是CFG并行,均可以降低xDiT的推理延迟。其中,给定2张GPU卡时,CFG并行由于通信量较小,表现出比Ulysses Attention、Ring Attention更高的性能。通过结合序列并行和CFG并行,我们进一步提高了推理效率。随着并行度的增加,推理延迟持续降低。在8卡环境下,混合Ulysses-2,Ring-2,CFG-2时xDiT可以获得最佳性能,相比于单卡推理方法可以实现6.12倍的加速,生成一个视频只需不到10分钟。 38 | 39 |
40 | [figure: latency-cogvideo1.5-5b-l40] 42 |
43 | 44 | 我们对xDiT在H20和L20上生成81帧1360x768分辨率视频的加速效果进行了进一步比较。从下图可以观察到,在这两台设备上,CogVideoX1.5-5B的推理延迟非常相似,然而考虑到H20的价格高于L20,L20展现出了更高的性价比。 45 | 46 | 47 |
48 | [figure: latency-cogvideo1.5-5b-l40] 50 |
51 | 52 | 53 |
54 | [figure: latency-cogvideo1.5-5b-l40] 56 |
57 | -------------------------------------------------------------------------------- /xfuser/model_executor/pipelines/register.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Type, Union 2 | from diffusers.pipelines.pipeline_utils import DiffusionPipeline 3 | 4 | from xfuser.logger import init_logger 5 | from .base_pipeline import xFuserPipelineBaseWrapper 6 | 7 | logger = init_logger(__name__) 8 | 9 | class xFuserPipelineWrapperRegister: 10 | _XFUSER_PIPE_MAPPING: Dict[ 11 | Type[DiffusionPipeline], 12 | Type[xFuserPipelineBaseWrapper] 13 | ] = {} 14 | 15 | @classmethod 16 | def register(cls, origin_pipe_class: Type[DiffusionPipeline]): 17 | def decorator(xfuser_pipe_class: Type[xFuserPipelineBaseWrapper]): 18 | if not issubclass(xfuser_pipe_class, xFuserPipelineBaseWrapper): 19 | raise ValueError(f"{xfuser_pipe_class} is not a subclass of" 20 | f" xFuserPipelineBaseWrapper") 21 | cls._XFUSER_PIPE_MAPPING[origin_pipe_class] = \ 22 | xfuser_pipe_class 23 | return xfuser_pipe_class 24 | return decorator 25 | 26 | @classmethod 27 | def get_class( 28 | cls, 29 | pipe: Union[DiffusionPipeline, Type[DiffusionPipeline]] 30 | ) -> Type[xFuserPipelineBaseWrapper]: 31 | if isinstance(pipe, type): 32 | candidate = None 33 | candidate_origin = None 34 | for (origin_model_class, 35 | xfuser_model_class) in cls._XFUSER_PIPE_MAPPING.items(): 36 | if issubclass(pipe, origin_model_class): 37 | if ((candidate is None and candidate_origin is None) or 38 | issubclass(origin_model_class, candidate_origin)): 39 | candidate_origin = origin_model_class 40 | candidate = xfuser_model_class 41 | if candidate is None: 42 | raise ValueError(f"Diffusion Pipeline class {pipe} " 43 | f"is not supported by xFuser") 44 | else: 45 | return candidate 46 | elif isinstance(pipe, DiffusionPipeline): 47 | candidate = None 48 | candidate_origin = None 49 | for (origin_model_class, 50 | xfuser_model_class) in cls._XFUSER_PIPE_MAPPING.items(): 51 | if isinstance(pipe, origin_model_class): 52 | if ((candidate is None and candidate_origin is None) or 53 | issubclass(origin_model_class, candidate_origin)): 54 | candidate_origin = origin_model_class 55 | candidate = xfuser_model_class 56 | 57 | if candidate is None: 58 | raise ValueError(f"Diffusion Pipeline class {pipe.__class__} " 59 | f"is not supported by xFuser") 60 | else: 61 | return candidate 62 | else: 63 | raise ValueError(f"Unsupported type {type(pipe)} for pipe") -------------------------------------------------------------------------------- /docs/performance/consisid.md: -------------------------------------------------------------------------------- 1 | ## ConsisID Performance Report 2 | 3 | [ConsisID](https://github.com/PKU-YuanGroup/ConsisID) is an identity-preserving text-to-video generation model that keeps the face consistent in the generated video by frequency decomposition.xDiT currently integrates USP techniques, including Ulysses Attention, Ring Attention, and CFG parallelization, to enhance inference speed, while work on PipeFusion is ongoing. We conducted an in-depth analysis comparing single-GPU ConsisID inference, based on the diffusers library, with our proposed parallelized version for generating 49 frames (6 seconds) of 720x480 resolution video. By flexibly combining different parallelization methods, we achieved varying performance outcomes. In this study, we systematically evaluate xDiT's acceleration performance across 1 to 6 Nvidia H100 GPUs. 
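A quick way to read the table below: each configuration uses `Ulysses Degree x Ring Degree x Cfg Parallel` GPUs in total. The following sketch is purely illustrative (the helper is not part of xDiT; the degree names only mirror CLI flags such as `--ulysses_degree` and `--ring_degree`) and shows the consistency check one might run before launching:

```python
# Illustrative helper, not part of xDiT: verify that a parallel configuration
# matches the number of GPUs before launching a run.
def check_parallel_config(n_gpus: int, ulysses: int, ring: int, cfg: int) -> None:
    required = ulysses * ring * cfg
    if required != n_gpus:
        raise ValueError(
            f"ulysses({ulysses}) x ring({ring}) x cfg({cfg}) = {required}, "
            f"but {n_gpus} GPUs were requested"
        )

# Two rows from the table below as examples:
check_parallel_config(6, ulysses=1, ring=3, cfg=2)   # the 35.78s configuration
check_parallel_config(4, ulysses=2, ring=1, cfg=2)   # the 41.37s configuration
```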
4 | 5 | As shown in the table, the ConsisID model achieves a significant reduction in inference latency with Ulysses Attention, Ring Attention, or Classifier-Free Guidance (CFG) parallelization. Notably, CFG parallelization outperforms the other two techniques due to its lower communication overhead. By combining sequence parallelization and CFG parallelization, inference efficiency was further improved. With increased parallelism, inference latency continued to decrease. Under the optimal configuration, xDiT achieved a 3.21× speedup over single-GPU inference, reducing iteration time to just 0.72 seconds. For the default 50 iterations of ConsisID, this enables end-to-end generation of 49 frames in 35 seconds, with a GPU memory usage of 40 GB. 6 | 7 | ### 720x480 Resolution (49 frames, 50 steps) 8 | 9 | 10 | | N-GPUs | Ulysses Degree | Ring Degree | Cfg Parallel | Times | 11 | | :----: | :------------: | :---------: | :----------: | :-----: | 12 | | 6 | 2 | 3 | 1 | 44.89s | 13 | | 6 | 3 | 2 | 1 | 44.24s | 14 | | 6 | 1 | 3 | 2 | 35.78s | 15 | | 6 | 3 | 1 | 2 | 38.35s | 16 | | 4 | 2 | 1 | 2 | 41.37s | 17 | | 4 | 1 | 2 | 2 | 40.68s | 18 | | 3 | 3 | 1 | 1 | 53.57s | 19 | | 3 | 1 | 3 | 1 | 55.51s | 20 | | 2 | 1 | 2 | 1 | 70.19s | 21 | | 2 | 2 | 1 | 1 | 76.56s | 22 | | 2 | 1 | 1 | 2 | 59.72s | 23 | | 1 | 1 | 1 | 1 | 114.87s | 24 | 25 | ## Resources 26 | 27 | Learn more about ConsisID with the following resources. 28 | - A [video](https://www.youtube.com/watch?v=PhlgC-bI5SQ) demonstrating ConsisID's main features. 29 | - The research paper, [Identity-Preserving Text-to-Video Generation by Frequency Decomposition](https://hf.co/papers/2411.17440) for more details. 30 | -------------------------------------------------------------------------------- /examples/ray/ray_flux_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import torch.distributed 5 | from transformers import T5EncoderModel 6 | from xfuser import xFuserArgs 7 | from xfuser.ray.pipeline.pipeline_utils import RayDiffusionPipeline 8 | from xfuser.config import FlexibleArgumentParser 9 | from xfuser.model_executor.pipelines import xFuserFluxPipeline 10 | 11 | def main(): 12 | os.environ["MASTER_ADDR"] = "localhost" 13 | os.environ["MASTER_PORT"] = "12355" 14 | parser = FlexibleArgumentParser(description="xFuser Arguments") 15 | args = xFuserArgs.add_cli_args(parser).parse_args() 16 | engine_args = xFuserArgs.from_cli_args(args) 17 | engine_config, input_config = engine_args.create_config() 18 | engine_config.runtime_config.dtype = torch.bfloat16 19 | model_name = engine_config.model_config.model.split("/")[-1] 20 | PipelineClass = xFuserFluxPipeline 21 | # equal to 22 | # text_encoder_2 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_2", torch_dtype=torch.bfloat16) 23 | # but load encoder in worker 24 | encoder_kwargs = { 25 | 'text_encoder_2': { 26 | 'model_class': T5EncoderModel, 27 | 'pretrained_model_name_or_path': engine_config.model_config.model, 28 | 'subfolder': 'text_encoder_2', 29 | 'torch_dtype': torch.bfloat16 30 | }, 31 | } 32 | # if args.use_fp8_t5_encoder: 33 | # from optimum.quanto import freeze, qfloat8, quantize 34 | # quantize(text_encoder_2, weights=qfloat8) 35 | # freeze(text_encoder_2) 36 | 37 | pipe = RayDiffusionPipeline.from_pretrained( 38 | PipelineClass=PipelineClass, 39 | pretrained_model_name_or_path=engine_config.model_config.model, 40 | engine_config=engine_config, 41 | 
torch_dtype=torch.bfloat16, 42 | **encoder_kwargs 43 | ) 44 | pipe.prepare_run(input_config) 45 | 46 | start_time = time.time() 47 | output = pipe( 48 | height=input_config.height, 49 | width=input_config.width, 50 | prompt=input_config.prompt, 51 | num_inference_steps=input_config.num_inference_steps, 52 | output_type=input_config.output_type, 53 | max_sequence_length=256, 54 | guidance_scale=0.0, 55 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 56 | ) 57 | end_time = time.time() 58 | elapsed_time = end_time - start_time 59 | print(f"elapsed time:{elapsed_time}") 60 | if not os.path.exists("results"): 61 | os.mkdir("results") 62 | 63 | for _, images in enumerate(output): 64 | if images is not None: 65 | image = images[0] 66 | path = f"./results/{model_name}_ray_result.png" 67 | image.save(path) 68 | print( 69 | f"image saved to {path}" 70 | ) 71 | break 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /xfuser/model_executor/patch/unet_patch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from typing import Union, Optional, Dict 4 | from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput 5 | 6 | def unet_cfg_parallel_monkey_patch_forward( 7 | self, 8 | sample: torch.Tensor, 9 | timestep: Union[torch.Tensor, float, int], 10 | encoder_hidden_states: torch.Tensor, 11 | added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, 12 | return_dict: bool = True, 13 | *args, 14 | **kwargs 15 | ): 16 | assert dist.is_initialized(), "Distributed training is not initialized" 17 | 18 | # Initialize output_buffer and buffer_list as instance attributes if they don't exist 19 | if not hasattr(self, 'output_buffer'): 20 | self.output_buffer = None 21 | if not hasattr(self, 'buffer_list'): 22 | self.buffer_list = None 23 | 24 | b, c, h, w = sample.shape 25 | original_forward = type(self).forward 26 | 27 | rank = dist.get_rank() 28 | sample = sample[rank:rank+1] 29 | timestep = timestep[rank:rank+1] if torch.is_tensor(timestep) and timestep.ndim > 0 else timestep 30 | encoder_hidden_states = encoder_hidden_states[rank:rank+1] 31 | if added_cond_kwargs is not None: 32 | new_added_cond_kwargs = {} 33 | for k in added_cond_kwargs: 34 | new_added_cond_kwargs[k] = added_cond_kwargs[k][rank : rank + 1] 35 | added_cond_kwargs = new_added_cond_kwargs 36 | 37 | output = original_forward( 38 | self, 39 | sample=sample, 40 | timestep=timestep, 41 | encoder_hidden_states=encoder_hidden_states, 42 | added_cond_kwargs=added_cond_kwargs, 43 | return_dict=False, 44 | *args, 45 | **kwargs 46 | )[0] 47 | 48 | world_size = dist.get_world_size() 49 | assert world_size == 2, f"world_size is {world_size}, expected 2 in unet_cfg_parallel_monkey_patch_forward" 50 | 51 | if self.output_buffer is None: 52 | self.output_buffer = torch.empty((b, c, h, w), device=output.device, dtype=output.dtype) 53 | if self.buffer_list is None: 54 | self.buffer_list = [torch.empty_like(output) for _ in range(world_size)] 55 | 56 | dist.all_gather(self.buffer_list, output.contiguous(), async_op=False) 57 | torch.cat(self.buffer_list[: 1], dim=2, out=self.output_buffer[0:1]) 58 | torch.cat(self.buffer_list[1 :], dim=2, out=self.output_buffer[1:2]) 59 | output = self.output_buffer 60 | 61 | if return_dict: 62 | output = UNet2DConditionOutput(sample=output) 63 | else: 64 | output = (output,) 65 | return output 66 | 67 | 
def apply_unet_cfg_parallel_monkey_patch(pipe): 68 | """Apply the monkey patch to the pipeline's UNet if world size is 2.""" 69 | import types 70 | world_size = dist.get_world_size() 71 | if world_size == 2: 72 | pipe.unet.forward = types.MethodType(unet_cfg_parallel_monkey_patch_forward, pipe.unet) 73 | return pipe -------------------------------------------------------------------------------- /examples/ray/ray_sd3_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import torch.distributed 5 | from transformers import T5EncoderModel 6 | from xfuser import xFuserArgs 7 | from xfuser.ray.pipeline.pipeline_utils import RayDiffusionPipeline 8 | from xfuser.config import FlexibleArgumentParser 9 | from xfuser.model_executor.pipelines import xFuserStableDiffusion3Pipeline 10 | 11 | def main(): 12 | os.environ["MASTER_ADDR"] = "localhost" 13 | os.environ["MASTER_PORT"] = "12355" 14 | parser = FlexibleArgumentParser(description="xFuser Arguments") 15 | args = xFuserArgs.add_cli_args(parser).parse_args() 16 | engine_args = xFuserArgs.from_cli_args(args) 17 | engine_config, input_config = engine_args.create_config() 18 | model_name = engine_config.model_config.model.split("/")[-1] 19 | PipelineClass = xFuserStableDiffusion3Pipeline 20 | 21 | # equal to 22 | # text_encoder_3 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_3", torch_dtype=torch.float16) 23 | # but load encoder in worker 24 | encoder_kwargs = { 25 | 'text_encoder_3': { 26 | 'model_class': T5EncoderModel, 27 | 'pretrained_model_name_or_path': engine_config.model_config.model, 28 | 'subfolder': 'text_encoder_3', 29 | 'torch_dtype': torch.float16 30 | }, 31 | } 32 | 33 | # if args.use_fp8_t5_encoder: 34 | # from optimum.quanto import freeze, qfloat8, quantize 35 | # print(f"rank {local_rank} quantizing text encoder 2") 36 | # quantize(text_encoder_3, weights=qfloat8) 37 | # freeze(text_encoder_3) 38 | 39 | pipe = RayDiffusionPipeline.from_pretrained( 40 | PipelineClass=PipelineClass, 41 | pretrained_model_name_or_path=engine_config.model_config.model, 42 | engine_config=engine_config, 43 | torch_dtype=torch.float16, 44 | **encoder_kwargs 45 | ) 46 | pipe.prepare_run(input_config) 47 | 48 | torch.cuda.reset_peak_memory_stats() 49 | start_time = time.time() 50 | output = pipe( 51 | height=input_config.height, 52 | width=input_config.width, 53 | prompt=input_config.prompt, 54 | num_inference_steps=input_config.num_inference_steps, 55 | output_type=input_config.output_type, 56 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 57 | ) 58 | end_time = time.time() 59 | elapsed_time = end_time - start_time 60 | print(f"elapsed time:{elapsed_time}") 61 | if not os.path.exists("results"): 62 | os.mkdir("results") 63 | 64 | for _, images in enumerate(output): 65 | if images is not None: 66 | image = images[0] 67 | path = f"./results/{model_name}_ray_result.png" 68 | image.save(path) 69 | print( 70 | f"image saved to {path}" 71 | ) 72 | break 73 | 74 | 75 | if __name__ == "__main__": 76 | main() -------------------------------------------------------------------------------- /xfuser/model_executor/schedulers/scheduling_ddim.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.distributed 5 | 6 | from diffusers.utils.torch_utils import randn_tensor 7 | from 
diffusers.schedulers.scheduling_ddim import ( 8 | DDIMScheduler, 9 | DDIMSchedulerOutput, 10 | ) 11 | 12 | from xfuser.core.distributed import ( 13 | get_pipeline_parallel_world_size, 14 | get_sequence_parallel_world_size, 15 | get_runtime_state, 16 | ) 17 | from .register import xFuserSchedulerWrappersRegister 18 | from .base_scheduler import xFuserSchedulerBaseWrapper 19 | 20 | 21 | @xFuserSchedulerWrappersRegister.register(DDIMScheduler) 22 | class xFuserDDIMSchedulerWrapper(xFuserSchedulerBaseWrapper): 23 | 24 | @xFuserSchedulerBaseWrapper.check_to_use_naive_step 25 | def step( 26 | self, 27 | *args, 28 | **kwargs, 29 | ) -> Union[DDIMSchedulerOutput, Tuple]: 30 | """ 31 | Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion 32 | process from the learned model outputs (most often the predicted noise). 33 | 34 | Args: 35 | model_output (`torch.Tensor`): 36 | The direct output from learned diffusion model. 37 | timestep (`float`): 38 | The current discrete timestep in the diffusion chain. 39 | sample (`torch.Tensor`): 40 | A current instance of a sample created by the diffusion process. 41 | eta (`float`): 42 | The weight of noise for added noise in diffusion step. 43 | use_clipped_model_output (`bool`, defaults to `False`): 44 | If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary 45 | because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no 46 | clipping has happened, "corrected" `model_output` would coincide with the one provided as input and 47 | `use_clipped_model_output` has no effect. 48 | generator (`torch.Generator`, *optional*): 49 | A random number generator. 50 | variance_noise (`torch.Tensor`): 51 | Alternative to generating noise with `generator` by directly providing the noise for the variance 52 | itself. Useful for methods such as [`CycleDiffusion`]. 53 | return_dict (`bool`, *optional*, defaults to `True`): 54 | Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`. 55 | 56 | Returns: 57 | [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`: 58 | If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a 59 | tuple is returned where the first element is the sample tensor. 
60 | 61 | """ 62 | return self.module.step(*args, **kwargs) 63 | -------------------------------------------------------------------------------- /benchmark/fid/pixartalpha_generate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | import torch.distributed 4 | import json, os 5 | from xfuser import xFuserPixArtAlphaPipeline, xFuserArgs 6 | from xfuser.config import FlexibleArgumentParser 7 | from xfuser.core.distributed import ( 8 | get_world_group, 9 | get_runtime_state 10 | ) 11 | import gc 12 | 13 | 14 | _NUM_FID_CANDIDATE = 30000 15 | CFG = 2.0 16 | 17 | def flush(): 18 | gc.collect() 19 | torch.cuda.empty_cache() 20 | 21 | def main(): 22 | parser = FlexibleArgumentParser(description='xFuser Arguments') 23 | parser.add_argument('--caption_file', type=str, default='captions_coco.json') 24 | parser.add_argument('--sample_images_folder', type=str, default='sample_images') 25 | args = xFuserArgs.add_cli_args(parser).parse_args() 26 | engine_args = xFuserArgs.from_cli_args(args) 27 | engine_config, input_config = engine_args.create_config() 28 | local_rank = get_world_group().local_rank 29 | 30 | pipe = xFuserPixArtAlphaPipeline.from_pretrained( 31 | pretrained_model_name_or_path=engine_config.model_config.model, 32 | engine_config=engine_config, 33 | torch_dtype=torch.float16, 34 | ).to(f"cuda:{local_rank}") 35 | 36 | if args.enable_sequential_cpu_offload: 37 | pipe.enable_sequential_cpu_offload(gpu_id=local_rank) 38 | logging.info(f'rank {local_rank} sequential CPU offload enabled') 39 | else: 40 | pipe = pipe.to(f'cuda:{local_rank}') 41 | 42 | pipe.prepare_run(input_config, steps=1) 43 | 44 | with open(args.caption_file) as f: 45 | raw_captions = json.load(f) 46 | 47 | raw_captions = raw_captions['images'][:_NUM_FID_CANDIDATE] 48 | captions = list(map(lambda x: x['sentences'][0]['raw'], raw_captions)) 49 | filenames = list(map(lambda x: x['filename'], raw_captions)) 50 | 51 | folder_path = args.sample_images_folder 52 | if not os.path.exists(folder_path): 53 | os.makedirs(folder_path) 54 | 55 | # run multiple prompts at a time to save time 56 | num_prompt_one_step = 120 57 | for j in range(0, _NUM_FID_CANDIDATE, num_prompt_one_step): 58 | output = pipe( 59 | height=256, 60 | width=256, 61 | prompt=captions[j:j+num_prompt_one_step], 62 | num_inference_steps=input_config.num_inference_steps, 63 | output_type=input_config.output_type, 64 | max_sequence_length=256, 65 | guidance_scale=CFG, 66 | generator=torch.Generator(device='cuda').manual_seed(input_config.seed), 67 | ) 68 | if input_config.output_type == 'pil': 69 | if pipe.is_dp_last_group(): 70 | for k, local_filename in enumerate(filenames[j:j+num_prompt_one_step]): 71 | output.images[k].save(f'{folder_path}/{local_filename}') 72 | print(f'{j}-{j+num_prompt_one_step-1} generation finished!') 73 | flush() 74 | 75 | get_runtime_state().destroy_distributed_env() 76 | 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /benchmark/fid/flux_generate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | import torch.distributed 4 | import json, os 5 | from xfuser import xFuserFluxPipeline, xFuserArgs 6 | from xfuser.config import FlexibleArgumentParser 7 | from xfuser.core.distributed import ( 8 | get_world_group, 9 | get_runtime_state 10 | ) 11 | import gc 12 | 13 | 14 | _NUM_FID_CANDIDATE = 30000 15 | CFG = 1.5 16 
| 17 | def flush(): 18 | gc.collect() 19 | torch.cuda.empty_cache() 20 | 21 | def main(): 22 | parser = FlexibleArgumentParser(description='xFuser Arguments') 23 | parser.add_argument('--caption_file', type=str, default='captions_coco.json') 24 | parser.add_argument('--sample_images_folder', type=str, default='sample_images') 25 | args = xFuserArgs.add_cli_args(parser).parse_args() 26 | engine_args = xFuserArgs.from_cli_args(args) 27 | engine_config, input_config = engine_args.create_config() 28 | engine_config.runtime_config.dtype = torch.bfloat16 29 | local_rank = get_world_group().local_rank 30 | 31 | pipe = xFuserFluxPipeline.from_pretrained( 32 | pretrained_model_name_or_path=engine_config.model_config.model, 33 | engine_config=engine_config, 34 | torch_dtype=torch.bfloat16, 35 | ) 36 | 37 | if args.enable_sequential_cpu_offload: 38 | pipe.enable_sequential_cpu_offload(gpu_id=local_rank) 39 | logging.info(f'rank {local_rank} sequential CPU offload enabled') 40 | else: 41 | pipe = pipe.to(f'cuda:{local_rank}') 42 | 43 | pipe.prepare_run(input_config, steps=1) 44 | 45 | with open(args.caption_file) as f: 46 | raw_captions = json.load(f) 47 | 48 | raw_captions = raw_captions['images'][:_NUM_FID_CANDIDATE] 49 | captions = list(map(lambda x: x['sentences'][0]['raw'], raw_captions)) 50 | filenames = list(map(lambda x: x['filename'], raw_captions)) 51 | 52 | folder_path = args.sample_images_folder 53 | if not os.path.exists(folder_path): 54 | os.makedirs(folder_path) 55 | 56 | # run multiple prompts at a time to save time 57 | num_prompt_one_step = 120 58 | for j in range(0, _NUM_FID_CANDIDATE, num_prompt_one_step): 59 | output = pipe( 60 | height=256, 61 | width=256, 62 | prompt=captions[j:j+num_prompt_one_step], 63 | num_inference_steps=input_config.num_inference_steps, 64 | output_type=input_config.output_type, 65 | max_sequence_length=256, 66 | guidance_scale=CFG, 67 | generator=torch.Generator(device='cuda').manual_seed(input_config.seed), 68 | ) 69 | if input_config.output_type == 'pil': 70 | if pipe.is_dp_last_group(): 71 | for k, local_filename in enumerate(filenames[j:j+num_prompt_one_step]): 72 | output.images[k].save(f'{folder_path}/{local_filename}') 73 | print(f'{j}-{j+num_prompt_one_step-1} generation finished!') 74 | flush() 75 | 76 | get_runtime_state().destroy_distributed_env() 77 | 78 | 79 | if __name__ == '__main__': 80 | main() 81 | -------------------------------------------------------------------------------- /benchmark/fid/compute_fid.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import time 4 | from cleanfid import fid 5 | from pathlib import Path 6 | 7 | def setup_logging(): 8 | """Setup logging configuration""" 9 | logging.basicConfig( 10 | level=logging.INFO, 11 | format='%(asctime)s - %(levelname)s - %(message)s', 12 | handlers=[ 13 | logging.StreamHandler(), 14 | logging.FileHandler('fid_computation.log') 15 | ] 16 | ) 17 | 18 | def compute_fid_score(ref_path: str, sample_path: str, device: str = "cuda") -> float: 19 | """ 20 | Compute FID score 21 | 22 | Args: 23 | ref_path: Path to ref images directory 24 | sample_path: Path to sample images directory 25 | device: Computing device ('cuda' or 'cpu') 26 | 27 | Returns: 28 | float: FID score 29 | 30 | Raises: 31 | ValueError: If directory does not exist 32 | """ 33 | # Check if paths exist 34 | ref_dir = Path(ref_path) 35 | gen_dir = Path(sample_path) 36 | 37 | if not ref_dir.exists(): 38 | raise ValueError(f"ref images directory 
does not exist: {ref_path}") 39 | if not gen_dir.exists(): 40 | raise ValueError(f"sample images directory does not exist: {sample_path}") 41 | 42 | logging.info(f"Starting FID score computation") 43 | logging.info(f"ref images directory: {ref_path}") 44 | logging.info(f"sample images directory: {sample_path}") 45 | logging.info(f"Using device: {device}") 46 | 47 | start_time = time.time() 48 | 49 | try: 50 | score = fid.compute_fid( 51 | ref_path, 52 | sample_path, 53 | device=device, 54 | num_workers=8 # Can be adjusted as needed 55 | ) 56 | 57 | elapsed_time = time.time() - start_time 58 | logging.info(f"FID computation completed, time elapsed: {elapsed_time:.2f} seconds") 59 | return score 60 | 61 | except Exception as e: 62 | logging.error(f"Error occurred during FID computation: {str(e)}") 63 | raise 64 | 65 | def main(): 66 | # Setup command line arguments 67 | parser = argparse.ArgumentParser(description='Compute FID score') 68 | parser.add_argument('--ref', type=str, required=True, 69 | help='Path to ref images directory') 70 | parser.add_argument('--sample', type=str, required=True, 71 | help='Path to sample images directory') 72 | parser.add_argument('--device', type=str, default="cuda", 73 | choices=['cuda', 'cpu'], help='Computing device') 74 | 75 | args = parser.parse_args() 76 | 77 | # Setup logging 78 | setup_logging() 79 | 80 | try: 81 | # Compute FID 82 | score = compute_fid_score(args.ref, args.sample, args.device) 83 | 84 | # Output result 85 | logging.info(f"FID score: {score:.4f}") 86 | 87 | except Exception as e: 88 | logging.error(f"Program execution failed: {str(e)}") 89 | return 1 90 | 91 | return 0 92 | 93 | if __name__ == "__main__": 94 | exit(main()) -------------------------------------------------------------------------------- /docs/methods/pipefusion.md: -------------------------------------------------------------------------------- 1 | ## PipeFusion: Displaced Patch Pipeline Parallelism for Diffusion Models 2 | [Chinese Blog 1](https://zhuanlan.zhihu.com/p/699612077); [Chinese Blog 2](https://zhuanlan.zhihu.com/p/706475158) 3 | 4 | PipeFusion is the innovative method first proposed by us. 5 | It is a sequence-level pipeline parallel method, similar to [TeraPipe](https://proceedings.mlr.press/v139/li21y.html), demonstrates significant advantages in weakly interconnected network hardware such as PCIe/Ethernet. 6 | 7 | PipeFusion innovatively harnesses input temporal redundancy—the similarity between inputs and activations across diffusion steps, a diffusion-specific characteristics also employed in DistriFusion. PipeFusion not only reduces communication volume but also streamlines pipeline parallelism with TeraPipe, avoiding the load balancing issues inherent in LLM models with Causal Attention. 8 | It significantly surpasses other methods in communication efficiency, particularly in multi-node setups connected via Ethernet and multi-GPU configurations linked with PCIe. 9 | 10 |
11 | [figure: PipeFusion Image] 12 |
13 | 14 | The above picture compares DistriFusion and PipeFusion. 15 | (a) DistriFusion replicates DiT parameters on two devices. 16 | It splits an image into 2 patches and employs asynchronous allgather for activations of every layer. 17 | (b) PipeFusion shards DiT parameters on two devices. 18 | It splits an image into 4 patches and employs asynchronous P2P for activations across two devices. 19 | 20 | We briefly explain the workflow of PipeFusion. It partitions an input image into $M$ non-overlapping patches. 21 | The DiT network is partitioned into $N$ stages ($N$ < $L$), which are sequentially assigned to $N$ computational devices. 22 | Note that $M$ and $N$ can be unequal, which is different from the image-splitting approaches used in sequence parallelism and DistriFusion. 23 | Each device processes the computation task for one patch of its assigned stage in a pipelined manner. 24 | 25 | The PipeFusion pipeline workflow when $M$ = $N$ =4 is shown in the following picture. 26 | 27 |
28 | [figure: Pipeline Image] 29 |
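To make the schedule in the picture concrete, here is a minimal, framework-free sketch of the displaced patch pipeline for $M$ = $N$ = 4. All names and data structures are illustrative assumptions for exposition; this is not the xDiT implementation, and the P2P communication is only indicated in comments.

```python
# Minimal sketch of the PipeFusion schedule (illustrative only, not xDiT code).
M, N = 4, 4                      # M image patches, N pipeline stages (devices)
num_diffusion_steps = 2

# kv_buffer[d][p]: the KV that stage d holds for patch p (for the layers it owns).
kv_buffer = [[None] * M for _ in range(N)]

for step in range(num_diffusion_steps):
    for micro in range(M + N - 1):           # pipelined micro-steps
        for device in range(N):
            patch = micro - device           # patch p reaches stage d at micro-step p + d
            if 0 <= patch < M:
                # Attention for this patch sees the buffer contents for the other
                # patches: either KV refreshed earlier in this step, or stale KV
                # carried over from the previous step (input temporal redundancy).
                _context = [kv_buffer[device][p] for p in range(M) if p != patch]
                kv_buffer[device][patch] = f"KV(step={step}, stage={device}, patch={patch})"
                # The patch activation would then be sent to the next stage via
                # asynchronous P2P; communication is omitted from this sketch.
```

Because each device only exchanges one patch of activations with its neighboring stage per micro-step, the communication volume stays low even over PCIe or Ethernet, which is the property emphasized above.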
30 | 31 | 32 | We have evaluated the accuracy of PipeFusion, DistriFusion and the baseline as shown below. To conduct the FID experiment, follow the detailed instructions provided in the [documentation](../../docs/fid/FID.md). 33 | 34 |
35 | [figure: image_quality] 36 |
37 | 38 | 39 | For more details, please refer to the following paper. 40 | 41 | ``` 42 | @article{wang2024pipefusion, 43 | title={PipeFusion: Displaced Patch Pipeline Parallelism for Inference of Diffusion Transformer Models}, 44 | author={Jiannan Wang and Jiarui Fang and Jinzhe Pan and Aoyu Li and PengCheng Yang}, 45 | year={2024}, 46 | eprint={2405.07719}, 47 | archivePrefix={arXiv}, 48 | primaryClass={cs.CV} 49 | } 50 | ``` 51 | 52 | -------------------------------------------------------------------------------- /examples/latte_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import torch.distributed 4 | from diffusers import AutoencoderKLTemporalDecoder 5 | from xfuser import xFuserLattePipeline, xFuserArgs 6 | from xfuser.config import FlexibleArgumentParser 7 | from xfuser.core.distributed import ( 8 | get_world_group, 9 | get_data_parallel_rank, 10 | get_data_parallel_world_size, 11 | get_runtime_state, 12 | is_dp_last_group, 13 | ) 14 | import imageio 15 | 16 | 17 | def main(): 18 | parser = FlexibleArgumentParser(description="xFuser Arguments") 19 | args = xFuserArgs.add_cli_args(parser).parse_args() 20 | engine_args = xFuserArgs.from_cli_args(args) 21 | engine_config, input_config = engine_args.create_config() 22 | local_rank = get_world_group().local_rank 23 | pipe = xFuserLattePipeline.from_pretrained( 24 | pretrained_model_name_or_path=engine_config.model_config.model, 25 | engine_config=engine_config, 26 | torch_dtype=torch.float16, 27 | ).to(f"cuda:{local_rank}") 28 | # pipe.latte_prepare_run(input_config) 29 | 30 | vae = AutoencoderKLTemporalDecoder.from_pretrained( 31 | engine_config.model_config.model, 32 | subfolder="vae_temporal_decoder", 33 | torch_dtype=torch.float16, 34 | ).to(f"cuda:{local_rank}") 35 | pipe.vae = vae 36 | 37 | torch.cuda.reset_peak_memory_stats() 38 | start_time = time.time() 39 | output = pipe( 40 | height=input_config.height, 41 | width=input_config.width, 42 | video_length=16, 43 | prompt=input_config.prompt, 44 | num_inference_steps=input_config.num_inference_steps, 45 | output_type="pt", 46 | guidance_scale=input_config.guidance_scale, 47 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 48 | ) 49 | end_time = time.time() 50 | elapsed_time = end_time - start_time 51 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 52 | 53 | parallel_info = ( 54 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" 55 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" 56 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}" 57 | ) 58 | if is_dp_last_group(): 59 | videos = output.frames.cpu() 60 | global_rank = get_world_group().rank 61 | dp_group_world_size = get_data_parallel_world_size() 62 | dp_group_index = global_rank // dp_group_world_size 63 | num_dp_groups = engine_config.parallel_config.dp_degree 64 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups 65 | if input_config.num_frames > 1: 66 | videos = (videos.clamp(0, 1) * 255).to( 67 | dtype=torch.uint8 68 | ) # convert to uint8 69 | imageio.mimwrite( 70 | "./latte_output.mp4", videos[0].permute(0, 2, 3, 1), fps=8, quality=5 71 | ) # highest quality is 10, lowest is 0 72 | 73 | if get_world_group().rank == get_world_group().world_size - 1: 74 | print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9} GB") 75 | 
get_runtime_state().destroy_distributed_env() 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /xfuser/model_executor/layers/feedforward.py: -------------------------------------------------------------------------------- 1 | # https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py 2 | 3 | import torch 4 | from torch import nn 5 | from torch.cuda import empty_cache 6 | from diffusers.models.attention import FeedForward, GELU, GEGLU 7 | 8 | try: 9 | import torch_musa 10 | from torch_musa.core.memory import empty_cache 11 | except ModuleNotFoundError: 12 | pass 13 | 14 | import xfuser.envs as envs 15 | if envs._is_npu(): 16 | from torch.npu import empty_cache 17 | 18 | from xfuser.core.distributed.parallel_state import ( 19 | get_tensor_model_parallel_world_size, 20 | get_tensor_model_parallel_rank, 21 | get_tp_group, 22 | ) 23 | from xfuser.model_executor.layers.base_layer import xFuserLayerBaseWrapper 24 | from xfuser.model_executor.layers.register import xFuserLayerWrappersRegister 25 | 26 | @xFuserLayerWrappersRegister.register(FeedForward) 27 | class xFuserFeedForwardWrapper(xFuserLayerBaseWrapper): 28 | def __init__(self, feedforward: FeedForward): 29 | super(xFuserFeedForwardWrapper, self).__init__(module=feedforward) 30 | 31 | tp_degree = get_tensor_model_parallel_world_size() 32 | tp_rank = get_tensor_model_parallel_rank() 33 | 34 | if isinstance(self.module.net[0], GELU): 35 | self.module.net[0].proj.weight.data = self.module.net[ 36 | 0 37 | ].proj.weight.data.chunk(tp_degree, dim=0)[tp_rank] 38 | if self.module.net[0].proj.bias is not None: 39 | self.module.net[0].proj.bias.data = self.module.net[ 40 | 0 41 | ].proj.bias.data.chunk(tp_degree, dim=0)[tp_rank] 42 | elif isinstance(self.module.net[0], GEGLU): 43 | weight_buff = self.module.net[0].proj.weight.data.chunk(2, dim=0) 44 | a = weight_buff[0].chunk(tp_degree, dim=0)[tp_rank] 45 | b = weight_buff[1].chunk(tp_degree, dim=0)[tp_rank] 46 | c = torch.cat([a, b], dim=0) 47 | 48 | self.module.net[0].proj.weight.data = c 49 | 50 | bias_buff = self.module.net[0].proj.bias.data.chunk(2, dim=0) 51 | a = bias_buff[0].chunk(tp_degree, dim=0)[tp_rank] 52 | b = bias_buff[1].chunk(tp_degree, dim=0)[tp_rank] 53 | c = torch.cat([a, b], dim=0) 54 | self.module.net[0].proj.bias.data = c 55 | 56 | else: 57 | raise TypeError( 58 | f"activation_fn {type(isinstance(self.module.net[0]))} not supported" 59 | ) 60 | 61 | self.module.net[2].weight.data = self.module.net[2].weight.chunk( 62 | tp_degree, dim=1 63 | )[tp_rank] 64 | 65 | self.has_output_bias = False 66 | if self.module.net[2].bias is not None: 67 | self.register_parameter( 68 | "output_bias", nn.Parameter(self.module.net[2].bias.data.clone()) 69 | ) 70 | self.module.net[2].bias = None 71 | self.has_output_bias = True 72 | 73 | empty_cache() 74 | 75 | def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor: 76 | hidden_states = self.module(hidden_states, *args, **kwargs) 77 | get_tp_group().all_reduce(hidden_states) 78 | if self.has_output_bias: 79 | hidden_states += self.output_bias 80 | return hidden_states 81 | -------------------------------------------------------------------------------- /docs/methods/hybrid.md: -------------------------------------------------------------------------------- 1 | 2 | ## Hybrid Parallelism 3 | [Chinese Version](./hybrid_zh.md) 4 | 5 | The design goal of xDiT is to scale the DiT inference process to 
ultra-large scales, such as multiple machines and multiple GPUs interconnected by heterogeneous networks, e.g. Ethernet and PCIe. No individual parallel method, such as PipeFusion or Sequence Parallelism (SP), can achieve this on its own, which makes combining different parallel methods necessary. 6 | 7 | xDiT supports four parallel methods: PipeFusion, Sequence, Data, and CFG Parallel. Among these, Data and CFG Parallel are relatively simple and parallelize across images, while PipeFusion and SP are more complex and parallelize across the patches of a single image. The ability to combine the latter two methods is one of the innovations of xDiT. 8 | 9 | PipeFusion leverages the characteristic of Input Temporal Redundancy and uses stale KV for attention computation, which makes it difficult for PipeFusion to be combined with other parallel strategies as easily as in large language models (LLMs). Specifically, standard sequence parallel interfaces, such as RingAttention, Ulysses, or USP, cannot meet the requirements for mixing SP with PipeFusion. 10 | 11 | We elaborate on this issue with the following illustration, which shows a hybrid parallel configuration with pipe_degree=4 and sp_degree=2. With `num_pipeline_patch`=4, the image is divided into M=`num_pipeline_patch*sp_degree`=8 patches, labeled P0~P7. 12 | 13 |
14 | hybrid process group config 15 |
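To make the patch partitioning above concrete, here is a minimal sketch of one plausible assignment of the M=8 patches to pipeline micro steps and SP ranks under this configuration. The interleaved assignment below is an assumption for illustration only and is not taken from the xDiT code base.

```python
# Illustrative sketch only: one plausible mapping of patches to
# (pipeline micro step, SP rank) for pipe_degree=4, sp_degree=2.
sp_degree = 2            # sequence parallel degree
num_pipeline_patch = 4   # patches PipeFusion pipelines per diffusion step

M = num_pipeline_patch * sp_degree  # total patches = 8, labeled P0..P7

for patch_id in range(M):
    micro_step = patch_id // sp_degree  # which pipeline micro step handles it
    sp_rank = patch_id % sp_degree      # which SP rank computes its fresh KV
    print(f"P{patch_id}: micro step {micro_step}, SP rank {sp_rank}")
```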
16 | 17 | In the implementation of standard SP attention, the inputs Q, K, V and the output O are all split along the sequence dimension with a consistent splitting pattern. 18 | In an SP process group, the input patches from different ranks do not overlap, so the positions of the fresh KV updates computed in each micro step also do not overlap across ranks. 19 | As shown in the following figure, in the KV buffer of standard SP, the yellow part represents the fresh KV owned by SP rank 0, and the green part represents the fresh KV owned by SP rank 1; the two are not the same. 20 | Within this diffusion step, device 0 cannot obtain the fresh KV of P1, P3, P5, and P7 for computation, yet PipeFusion requires every device to hold all of the KV from the previous diffusion step when computing the next one. 21 | Standard SP keeps only 1/sp_degree of the fresh KV buffer, so it cannot produce correct results for hybrid parallel inference. 22 | 23 |
24 | hybrid parallel workflow 25 |
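The limitation described above can be summarized with a toy sketch. It assumes, purely for illustration, that even-numbered patches belong to SP rank 0 and odd-numbered patches to SP rank 1; the point is only that each rank refreshes 1/sp_degree of the KV buffer, while PipeFusion needs all of it.

```python
# Toy illustration (not xDiT code) of the standard-SP limitation:
# each SP rank only refreshes the KV of its own patches, so after a
# diffusion step every device still misses the other ranks' fresh KV.
sp_degree = 2
num_patches = 8  # P0..P7

# Assumed assignment: even patches -> rank 0, odd patches -> rank 1.
owned_by = {rank: [p for p in range(num_patches) if p % sp_degree == rank]
            for rank in range(sp_degree)}

for rank in range(sp_degree):
    fresh = owned_by[rank]
    print(f"SP rank {rank}: fresh KV for {fresh}, "
          f"coverage = {len(fresh) / num_patches:.0%}")  # 50% = 1/sp_degree
# PipeFusion, however, needs 100% of the previous step's KV on every device.
```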
26 | 27 | xDiT customizes its sequence parallel implementation to meet this hybrid parallel requirement. xDiT uses `xFuserLongContextAttention` to store the intermediate results of SP in the KV buffer. The effect is illustrated in the figure: after each micro-step of SP execution, the fresh KV produced by each rank is replicated across all devices in the SP group. This way, after one diffusion step, the KV buffer on every device in the SP group is fully up to date and ready for use in the next diffusion step. 28 | 29 |
30 | kvbuffer in hybrid parallel 31 |
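As a rough mental model of this replication, the sketch below simulates the KV buffer of each SP rank over the micro steps of one diffusion step. The per-step replication stands in for the communication that `xFuserLongContextAttention` performs internally, and the patch ordering is again an assumption made only for illustration.

```python
# Conceptual sketch, not the xFuserLongContextAttention implementation:
# after each SP micro step the fresh KV from every rank is replicated
# to all ranks, so the whole buffer is fresh after one diffusion step.
sp_degree = 2
num_pipeline_patch = 4

# kv_buffer[rank] maps patch id -> True once that patch's KV is fresh.
kv_buffer = {rank: {} for rank in range(sp_degree)}

for micro_step in range(num_pipeline_patch):
    # Each rank computes fresh KV for one patch of this micro step...
    produced = [micro_step * sp_degree + r for r in range(sp_degree)]
    # ...and the results are replicated (all-gathered) across the SP group.
    for rank in range(sp_degree):
        for patch in produced:
            kv_buffer[rank][patch] = True

# Every rank now holds fresh KV for all 8 patches.
print({rank: sorted(kv_buffer[rank]) for rank in range(sp_degree)})
```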
-------------------------------------------------------------------------------- /examples/sana_sprint_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import torch.distributed 5 | from xfuser import xFuserSanaSprintPipeline, xFuserArgs 6 | from xfuser.config import FlexibleArgumentParser 7 | from xfuser.core.distributed import ( 8 | get_world_group, 9 | is_dp_last_group, 10 | get_data_parallel_rank, 11 | get_runtime_state, 12 | ) 13 | from xfuser.core.distributed.parallel_state import get_data_parallel_world_size 14 | 15 | 16 | def main(): 17 | parser = FlexibleArgumentParser(description="xFuser Arguments") 18 | args = xFuserArgs.add_cli_args(parser).parse_args() 19 | engine_args = xFuserArgs.from_cli_args(args) 20 | engine_config, input_config = engine_args.create_config() 21 | local_rank = get_world_group().local_rank 22 | 23 | pipe = xFuserSanaSprintPipeline.from_pretrained( 24 | pretrained_model_name_or_path=engine_config.model_config.model, 25 | engine_config=engine_config, 26 | torch_dtype=torch.bfloat16, 27 | ).to(f"cuda:{local_rank}") 28 | pipe.vae.to(torch.bfloat16) 29 | pipe.text_encoder.to(torch.bfloat16) 30 | 31 | parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 32 | 33 | pipe.prepare_run(input_config) 34 | 35 | torch.cuda.reset_peak_memory_stats() 36 | start_time = time.time() 37 | output = pipe( 38 | height=input_config.height, 39 | width=input_config.width, 40 | prompt=input_config.prompt, 41 | num_inference_steps=input_config.num_inference_steps, 42 | output_type=input_config.output_type, 43 | use_resolution_binning=input_config.use_resolution_binning, 44 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 45 | guidance_scale=4.5 46 | ) 47 | end_time = time.time() 48 | elapsed_time = end_time - start_time 49 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 50 | 51 | parallel_info = ( 52 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" 53 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" 54 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}" 55 | ) 56 | if input_config.output_type == "pil": 57 | dp_group_index = get_data_parallel_rank() 58 | num_dp_groups = get_data_parallel_world_size() 59 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups 60 | if pipe.is_dp_last_group(): 61 | if not os.path.exists("results"): 62 | os.mkdir("results") 63 | for i, image in enumerate(output.images): 64 | image_rank = dp_group_index * dp_batch_size + i 65 | image.save( 66 | f"./results/sana_sprint_1.6B_result_{parallel_info}_{image_rank}.png" 67 | ) 68 | print( 69 | f"image {i} saved to ./results/sana_sprint_1.6B_result_{parallel_info}_{image_rank}.png" 70 | ) 71 | 72 | if get_world_group().rank == get_world_group().world_size - 1: 73 | print( 74 | f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, peak memory: {peak_memory/1e9:.2f} GB" 75 | ) 76 | 77 | get_runtime_state().destroy_distributed_env() 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /examples/pixartsigma_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import torch.distributed 5 | from transformers 
import T5EncoderModel 6 | from xfuser import xFuserPixArtSigmaPipeline, xFuserArgs 7 | from xfuser.config import FlexibleArgumentParser 8 | from xfuser.core.distributed import ( 9 | get_world_group, 10 | is_dp_last_group, 11 | get_data_parallel_world_size, 12 | get_runtime_state, 13 | get_data_parallel_rank, 14 | ) 15 | 16 | 17 | def main(): 18 | parser = FlexibleArgumentParser(description="xFuser Arguments") 19 | args = xFuserArgs.add_cli_args(parser).parse_args() 20 | engine_args = xFuserArgs.from_cli_args(args) 21 | engine_config, input_config = engine_args.create_config() 22 | local_rank = get_world_group().local_rank 23 | text_encoder = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder", torch_dtype=torch.float16) 24 | if args.use_fp8_t5_encoder: 25 | from optimum.quanto import freeze, qfloat8, quantize 26 | print(f"rank {local_rank} quantizing text encoder") 27 | quantize(text_encoder, weights=qfloat8) 28 | freeze(text_encoder) 29 | 30 | pipe = xFuserPixArtSigmaPipeline.from_pretrained( 31 | pretrained_model_name_or_path=engine_config.model_config.model, 32 | engine_config=engine_config, 33 | torch_dtype=torch.float16, 34 | text_encoder=text_encoder, 35 | ).to(f"cuda:{local_rank}") 36 | pipe.prepare_run(input_config) 37 | 38 | torch.cuda.reset_peak_memory_stats() 39 | start_time = time.time() 40 | output = pipe( 41 | height=input_config.height, 42 | width=input_config.width, 43 | prompt=input_config.prompt, 44 | num_inference_steps=input_config.num_inference_steps, 45 | output_type=input_config.output_type, 46 | use_resolution_binning=input_config.use_resolution_binning, 47 | guidance_scale=input_config.guidance_scale, 48 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 49 | clean_caption=False, 50 | ) 51 | end_time = time.time() 52 | elapsed_time = end_time - start_time 53 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 54 | 55 | parallel_info = ( 56 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" 57 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" 58 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}" 59 | ) 60 | if input_config.output_type == "pil": 61 | dp_group_index = get_data_parallel_rank() 62 | num_dp_groups = get_data_parallel_world_size() 63 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups 64 | if pipe.is_dp_last_group(): 65 | if not os.path.exists("results"): 66 | os.mkdir("results") 67 | for i, image in enumerate(output.images): 68 | image_rank = dp_group_index * dp_batch_size + i 69 | image.save( 70 | f"./results/pixart_sigma_result_{parallel_info}_{image_rank}.png" 71 | ) 72 | print( 73 | f"image {i} saved to ./results/pixart_sigma_result_{parallel_info}_{image_rank}.png" 74 | ) 75 | 76 | if get_world_group().rank == get_world_group().world_size - 1: 77 | print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9} GB") 78 | get_runtime_state().destroy_distributed_env() 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /examples/pixartalpha_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import torch.distributed 5 | from transformers import T5EncoderModel 6 | from xfuser import xFuserPixArtAlphaPipeline, xFuserArgs 7 | from xfuser.config import 
FlexibleArgumentParser 8 | from xfuser.core.distributed import ( 9 | get_world_group, 10 | is_dp_last_group, 11 | get_data_parallel_world_size, 12 | get_runtime_state, 13 | get_data_parallel_rank, 14 | ) 15 | 16 | 17 | def main(): 18 | parser = FlexibleArgumentParser(description="xFuser Arguments") 19 | args = xFuserArgs.add_cli_args(parser).parse_args() 20 | engine_args = xFuserArgs.from_cli_args(args) 21 | engine_config, input_config = engine_args.create_config() 22 | local_rank = get_world_group().local_rank 23 | text_encoder = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder", torch_dtype=torch.float16) 24 | if args.use_fp8_t5_encoder: 25 | from optimum.quanto import freeze, qfloat8, quantize 26 | print(f"rank {local_rank} quantizing text encoder") 27 | quantize(text_encoder, weights=qfloat8) 28 | freeze(text_encoder) 29 | 30 | pipe = xFuserPixArtAlphaPipeline.from_pretrained( 31 | pretrained_model_name_or_path=engine_config.model_config.model, 32 | engine_config=engine_config, 33 | torch_dtype=torch.float16, 34 | text_encoder=text_encoder, 35 | ).to(f"cuda:{local_rank}") 36 | model_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 37 | pipe.prepare_run(input_config) 38 | 39 | torch.cuda.reset_peak_memory_stats() 40 | start_time = time.time() 41 | output = pipe( 42 | height=input_config.height, 43 | width=input_config.width, 44 | prompt=input_config.prompt, 45 | num_inference_steps=input_config.num_inference_steps, 46 | output_type=input_config.output_type, 47 | use_resolution_binning=input_config.use_resolution_binning, 48 | guidance_scale=input_config.guidance_scale, 49 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 50 | ) 51 | end_time = time.time() 52 | elapsed_time = end_time - start_time 53 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 54 | 55 | parallel_info = ( 56 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" 57 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" 58 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}_tc_{engine_args.use_torch_compile}" 59 | ) 60 | if input_config.output_type == "pil": 61 | dp_group_index = get_data_parallel_rank() 62 | num_dp_groups = get_data_parallel_world_size() 63 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups 64 | if pipe.is_dp_last_group(): 65 | if not os.path.exists("results"): 66 | os.mkdir("results") 67 | for i, image in enumerate(output.images): 68 | image_rank = dp_group_index * dp_batch_size + i 69 | img_file = ( 70 | f"./results/pixart_alpha_result_{parallel_info}_{image_rank}.png" 71 | ) 72 | image.save(img_file) 73 | print(img_file) 74 | 75 | if get_world_group().rank == get_world_group().world_size - 1: 76 | print( 77 | f"epoch time: {elapsed_time:.2f} sec, model memory: {model_memory/1e9:.2f} GB, overall memory: {peak_memory/1e9:.2f} GB" 78 | ) 79 | get_runtime_state().destroy_distributed_env() 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /examples/sdxl_example.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import torch 4 | import torch.distributed 5 | from xfuser import xFuserStableDiffusionXLPipeline, xFuserArgs 6 | from xfuser.config import FlexibleArgumentParser 7 | from xfuser.core.distributed import ( 8 | 
get_world_group, 9 | get_data_parallel_rank, 10 | get_data_parallel_world_size, 11 | get_runtime_state, 12 | ) 13 | from diffusers import StableDiffusionXLPipeline 14 | 15 | def main(): 16 | # Initialize argument parser 17 | parser = FlexibleArgumentParser(description="xFuser SDXL Arguments") 18 | args = xFuserArgs.add_cli_args(parser).parse_args() 19 | engine_args = xFuserArgs.from_cli_args(args) 20 | engine_config, input_config = engine_args.create_config() 21 | 22 | 23 | # Set runtime configuration 24 | engine_config.runtime_config.dtype = torch.bfloat16 25 | local_rank = get_world_group().local_rank 26 | 27 | # Initialize pipeline 28 | pipe = xFuserStableDiffusionXLPipeline.from_pretrained( 29 | pretrained_model_name_or_path=engine_config.model_config.model, 30 | engine_config=engine_config, 31 | torch_dtype=torch.float16, 32 | ) 33 | 34 | # Handle device placement 35 | if args.enable_sequential_cpu_offload: 36 | pipe.enable_sequential_cpu_offload(gpu_id=local_rank) 37 | logging.info(f"rank {local_rank} sequential CPU offload enabled") 38 | else: 39 | pipe = pipe.to(f"cuda:{local_rank}") 40 | 41 | # Record initial memory usage 42 | parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 43 | 44 | # Prepare for inference 45 | pipe.prepare_run(input_config, steps=input_config.num_inference_steps) 46 | 47 | # Run inference 48 | torch.cuda.reset_peak_memory_stats() 49 | start_time = time.time() 50 | output = pipe( 51 | height=input_config.height, 52 | width=input_config.width, 53 | prompt=input_config.prompt, 54 | num_inference_steps=input_config.num_inference_steps, 55 | output_type=input_config.output_type, 56 | guidance_scale=input_config.guidance_scale, 57 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 58 | ) 59 | end_time = time.time() 60 | elapsed_time = end_time - start_time 61 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 62 | 63 | # Generate parallel configuration info string 64 | parallel_info = ( 65 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" 66 | f"tp{engine_args.tensor_parallel_degree}_" 67 | f"pp{engine_args.pipefusion_parallel_degree}" 68 | ) 69 | 70 | # Save generated images 71 | if input_config.output_type == "pil": 72 | dp_group_index = get_data_parallel_rank() 73 | num_dp_groups = get_data_parallel_world_size() 74 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups 75 | if pipe.is_dp_last_group(): 76 | for i, image in enumerate(output.images): 77 | image_rank = dp_group_index * dp_batch_size + i 78 | image_name = f"sdxl_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png" 79 | image.save(f"./results/{image_name}") 80 | print(f"image {i} saved to ./results/{image_name}") 81 | 82 | # Print performance metrics 83 | if get_world_group().rank == get_world_group().world_size - 1: 84 | print( 85 | f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9:.2f} GB" 86 | ) 87 | 88 | # Cleanup 89 | get_runtime_state().destroy_distributed_env() 90 | 91 | if __name__ == "__main__": 92 | main() -------------------------------------------------------------------------------- /examples/sd3_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import torch.distributed 5 | from transformers import T5EncoderModel 6 | from xfuser import 
xFuserStableDiffusion3Pipeline, xFuserArgs 7 | from xfuser.config import FlexibleArgumentParser 8 | from xfuser.core.distributed import ( 9 | get_world_group, 10 | is_dp_last_group, 11 | get_data_parallel_rank, 12 | get_runtime_state, 13 | ) 14 | from xfuser.core.distributed.parallel_state import get_data_parallel_world_size 15 | 16 | 17 | def main(): 18 | parser = FlexibleArgumentParser(description="xFuser Arguments") 19 | args = xFuserArgs.add_cli_args(parser).parse_args() 20 | engine_args = xFuserArgs.from_cli_args(args) 21 | engine_config, input_config = engine_args.create_config() 22 | local_rank = get_world_group().local_rank 23 | text_encoder_3 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_3", torch_dtype=torch.float16) 24 | if args.use_fp8_t5_encoder: 25 | from optimum.quanto import freeze, qfloat8, quantize 26 | print(f"rank {local_rank} quantizing text encoder 2") 27 | quantize(text_encoder_3, weights=qfloat8) 28 | freeze(text_encoder_3) 29 | 30 | pipe = xFuserStableDiffusion3Pipeline.from_pretrained( 31 | pretrained_model_name_or_path=engine_config.model_config.model, 32 | engine_config=engine_config, 33 | torch_dtype=torch.float16, 34 | text_encoder_3=text_encoder_3, 35 | ).to(f"cuda:{local_rank}") 36 | 37 | parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 38 | 39 | pipe.prepare_run(input_config) 40 | 41 | torch.cuda.reset_peak_memory_stats() 42 | start_time = time.time() 43 | output = pipe( 44 | height=input_config.height, 45 | width=input_config.width, 46 | prompt=input_config.prompt, 47 | num_inference_steps=input_config.num_inference_steps, 48 | output_type=input_config.output_type, 49 | guidance_scale=input_config.guidance_scale, 50 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 51 | ) 52 | end_time = time.time() 53 | elapsed_time = end_time - start_time 54 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 55 | 56 | parallel_info = ( 57 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" 58 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" 59 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}" 60 | ) 61 | if input_config.output_type == "pil": 62 | dp_group_index = get_data_parallel_rank() 63 | num_dp_groups = get_data_parallel_world_size() 64 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups 65 | if pipe.is_dp_last_group(): 66 | if not os.path.exists("results"): 67 | os.mkdir("results") 68 | for i, image in enumerate(output.images): 69 | image_rank = dp_group_index * dp_batch_size + i 70 | image.save( 71 | f"./results/stable_diffusion_3_result_{parallel_info}_{image_rank}.png" 72 | ) 73 | print( 74 | f"image {i} saved to ./results/stable_diffusion_3_result_{parallel_info}_{image_rank}.png" 75 | ) 76 | 77 | if get_world_group().rank == get_world_group().world_size - 1: 78 | print( 79 | f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, peak memory: {peak_memory/1e9:.2f} GB" 80 | ) 81 | 82 | get_runtime_state().destroy_distributed_env() 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /examples/cogvideox_example.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import torch 4 | import torch.distributed 5 | from 
diffusers import AutoencoderKLTemporalDecoder 6 | from xfuser import xFuserCogVideoXPipeline, xFuserArgs 7 | from xfuser.config import FlexibleArgumentParser 8 | from xfuser.core.distributed import ( 9 | get_world_group, 10 | get_data_parallel_rank, 11 | get_data_parallel_world_size, 12 | get_runtime_state, 13 | is_dp_last_group, 14 | ) 15 | from diffusers.utils import export_to_video 16 | 17 | 18 | def main(): 19 | parser = FlexibleArgumentParser(description="xFuser Arguments") 20 | args = xFuserArgs.add_cli_args(parser).parse_args() 21 | engine_args = xFuserArgs.from_cli_args(args) 22 | 23 | engine_config, input_config = engine_args.create_config() 24 | local_rank = get_world_group().local_rank 25 | 26 | assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion." 27 | assert engine_args.use_parallel_vae is False, "parallel VAE not implemented for CogVideo" 28 | 29 | pipe = xFuserCogVideoXPipeline.from_pretrained( 30 | pretrained_model_name_or_path=engine_config.model_config.model, 31 | engine_config=engine_config, 32 | torch_dtype=torch.bfloat16, 33 | ) 34 | if args.enable_sequential_cpu_offload: 35 | pipe.enable_sequential_cpu_offload(gpu_id=local_rank) 36 | logging.info(f"rank {local_rank} sequential CPU offload enabled") 37 | elif args.enable_model_cpu_offload: 38 | pipe.enable_model_cpu_offload(gpu_id=local_rank) 39 | logging.info(f"rank {local_rank} model CPU offload enabled") 40 | else: 41 | device = torch.device(f"cuda:{local_rank}") 42 | pipe = pipe.to(device) 43 | 44 | if args.enable_tiling: 45 | pipe.vae.enable_tiling() 46 | 47 | if args.enable_slicing: 48 | pipe.vae.enable_slicing() 49 | 50 | # warmup 51 | output = pipe( 52 | height=input_config.height, 53 | width=input_config.width, 54 | num_frames=input_config.num_frames, 55 | prompt=input_config.prompt, 56 | num_inference_steps=1, 57 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 58 | ).frames[0] 59 | 60 | torch.cuda.reset_peak_memory_stats() 61 | start_time = time.time() 62 | 63 | output = pipe( 64 | height=input_config.height, 65 | width=input_config.width, 66 | num_frames=input_config.num_frames, 67 | prompt=input_config.prompt, 68 | num_inference_steps=input_config.num_inference_steps, 69 | guidance_scale=input_config.guidance_scale, 70 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 71 | ).frames[0] 72 | 73 | end_time = time.time() 74 | elapsed_time = end_time - start_time 75 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 76 | 77 | parallel_info = ( 78 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" 79 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" 80 | f"tp{engine_args.tensor_parallel_degree}_" 81 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}" 82 | ) 83 | if is_dp_last_group(): 84 | resolution = f"{input_config.width}x{input_config.height}" 85 | output_filename = f"results/cogvideox_{parallel_info}_{resolution}.mp4" 86 | export_to_video(output, output_filename, fps=8) 87 | print(f"output saved to {output_filename}") 88 | 89 | if get_world_group().rank == get_world_group().world_size - 1: 90 | print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9} GB") 91 | get_runtime_state().destroy_distributed_env() 92 | 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /examples/hunyuandit_example.py: 
-------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import torch.distributed 5 | from transformers import T5EncoderModel 6 | from xfuser import xFuserHunyuanDiTPipeline, xFuserArgs 7 | from xfuser.config import FlexibleArgumentParser 8 | from xfuser.core.distributed import ( 9 | get_world_group, 10 | is_dp_last_group, 11 | get_data_parallel_world_size, 12 | get_runtime_state, 13 | get_data_parallel_rank, 14 | ) 15 | 16 | 17 | def main(): 18 | parser = FlexibleArgumentParser(description="xFuser Arguments") 19 | args = xFuserArgs.add_cli_args(parser).parse_args() 20 | engine_args = xFuserArgs.from_cli_args(args) 21 | engine_config, input_config = engine_args.create_config() 22 | local_rank = get_world_group().local_rank 23 | text_encoder_2 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_2", torch_dtype=torch.bfloat16) 24 | if args.use_fp8_t5_encoder: 25 | from optimum.quanto import freeze, qfloat8, quantize 26 | print(f"rank {local_rank} quantizing text encoder 2") 27 | quantize(text_encoder_2, weights=qfloat8) 28 | freeze(text_encoder_2) 29 | 30 | pipe = xFuserHunyuanDiTPipeline.from_pretrained( 31 | pretrained_model_name_or_path=engine_config.model_config.model, 32 | engine_config=engine_config, 33 | torch_dtype=torch.float16, 34 | text_encoder_2=text_encoder_2, 35 | ).to(f"cuda:{local_rank}") 36 | 37 | parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 38 | 39 | pipe.prepare_run(input_config) 40 | 41 | torch.cuda.reset_peak_memory_stats() 42 | start_time = time.time() 43 | output = pipe( 44 | height=input_config.height, 45 | width=input_config.width, 46 | prompt=input_config.prompt, 47 | num_inference_steps=input_config.num_inference_steps, 48 | output_type=input_config.output_type, 49 | use_resolution_binning=input_config.use_resolution_binning, 50 | guidance_scale=input_config.guidance_scale, 51 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 52 | ) 53 | end_time = time.time() 54 | elapsed_time = end_time - start_time 55 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 56 | 57 | parallel_info = ( 58 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" 59 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" 60 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}" 61 | ) 62 | if input_config.output_type == "pil": 63 | dp_group_index = get_data_parallel_rank() 64 | num_dp_groups = get_data_parallel_world_size() 65 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups 66 | if pipe.is_dp_last_group(): 67 | if not os.path.exists("results"): 68 | os.mkdir("results") 69 | for i, image in enumerate(output.images): 70 | image_rank = dp_group_index * dp_batch_size + i 71 | image.save( 72 | f"./results/hunyuandit_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png" 73 | ) 74 | print( 75 | f"image {i} saved to ./results/hunyuandit_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png" 76 | ) 77 | 78 | if get_world_group().rank == get_world_group().world_size - 1: 79 | print( 80 | f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9:.2f} GB" 81 | ) 82 | get_runtime_state().destroy_distributed_env() 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | 
-------------------------------------------------------------------------------- /docs/developer/adding_models/adding_model_cfg.py: -------------------------------------------------------------------------------- 1 | # Example for parallelize new models with USP 2 | # run with 3 | # torchrun --nproc_per_node=2 \ 4 | # adding_cogvideox.py 5 | import sys 6 | import functools 7 | from typing import List, Optional, Tuple, Union 8 | 9 | import time 10 | import torch 11 | 12 | from diffusers import DiffusionPipeline, CogVideoXPipeline 13 | 14 | import torch.distributed as dist 15 | from xfuser.core.distributed import ( 16 | init_distributed_environment, 17 | initialize_model_parallel, 18 | get_world_group, 19 | get_classifier_free_guidance_world_size, 20 | get_classifier_free_guidance_rank, 21 | get_cfg_group, 22 | ) 23 | 24 | from diffusers.utils import export_to_video 25 | 26 | def parallelize_transformer(pipe: DiffusionPipeline): 27 | transformer = pipe.transformer 28 | original_forward = transformer.forward 29 | 30 | @functools.wraps(transformer.__class__.forward) 31 | def new_forward( 32 | self, 33 | hidden_states: torch.Tensor, 34 | encoder_hidden_states: Optional[torch.Tensor] = None, 35 | timestep: torch.LongTensor = None, 36 | timestep_cond: Optional[torch.Tensor] = None, 37 | ofs: Optional[Union[int, float, torch.LongTensor]] = None, 38 | image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, 39 | **kwargs, 40 | ): 41 | timestep = torch.chunk(timestep, get_classifier_free_guidance_world_size(),dim=0)[get_classifier_free_guidance_rank()] 42 | hidden_states = torch.chunk(hidden_states, get_classifier_free_guidance_world_size(),dim=0)[get_classifier_free_guidance_rank()] 43 | encoder_hidden_states = torch.chunk(encoder_hidden_states, get_classifier_free_guidance_world_size(),dim=0)[get_classifier_free_guidance_rank()] 44 | 45 | output = original_forward( 46 | hidden_states, 47 | encoder_hidden_states, 48 | timestep=timestep, 49 | timestep_cond=timestep_cond, 50 | ofs=ofs, 51 | image_rotary_emb=image_rotary_emb, 52 | **kwargs, 53 | ) 54 | 55 | return_dict = not isinstance(output, tuple) 56 | sample = output[0] 57 | sample = get_cfg_group().all_gather(sample, dim=0) 58 | if return_dict: 59 | return output.__class__(sample, *output[1:]) 60 | return (sample, *output[1:]) 61 | 62 | new_forward = new_forward.__get__(transformer) 63 | transformer.forward = new_forward 64 | 65 | if __name__ == "__main__": 66 | dist.init_process_group("nccl") 67 | init_distributed_environment( 68 | rank=dist.get_rank(), 69 | world_size=dist.get_world_size() 70 | ) 71 | initialize_model_parallel( 72 | classifier_free_guidance_degree=2, 73 | ) 74 | pipe = CogVideoXPipeline.from_pretrained( 75 | pretrained_model_name_or_path=sys.argv[1], 76 | torch_dtype=torch.bfloat16, 77 | ) 78 | local_rank = get_world_group().local_rank 79 | device = torch.device(f"cuda:{local_rank}") 80 | pipe = pipe.to(device) 81 | 82 | pipe.vae.enable_tiling() 83 | 84 | parallelize_transformer(pipe) 85 | 86 | torch.cuda.reset_peak_memory_stats() 87 | start_time = time.time() 88 | 89 | output = pipe( 90 | num_frames=9, 91 | prompt="A little girl is riding a bicycle at high speed. 
Focused, detailed, realistic.", 92 | num_inference_steps=20, 93 | generator=torch.Generator(device="cuda").manual_seed(42), 94 | ).frames[0] 95 | 96 | end_time = time.time() 97 | elapsed_time = end_time - start_time 98 | 99 | if local_rank == 0: 100 | export_to_video(output, "output.mp4", fps=8) 101 | print(f"epoch time: {elapsed_time:.2f} sec") 102 | 103 | dist.destroy_process_group() 104 | -------------------------------------------------------------------------------- /xfuser/model_executor/models/customized/step_video_t2v/rope.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from xfuser.core.distributed.parallel_state import get_sequence_parallel_world_size, get_sequence_parallel_rank 3 | 4 | class RoPE1D: 5 | def __init__(self, freq=1e4, F0=1.0, scaling_factor=1.0): 6 | self.base = freq 7 | self.F0 = F0 8 | self.scaling_factor = scaling_factor 9 | self.cache = {} 10 | 11 | def get_cos_sin(self, D, seq_len, device, dtype): 12 | if (D, seq_len, device, dtype) not in self.cache: 13 | inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D)) 14 | t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) 15 | freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype) 16 | freqs = torch.cat((freqs, freqs), dim=-1) 17 | cos = freqs.cos() # (Seq, Dim) 18 | sin = freqs.sin() 19 | self.cache[D, seq_len, device, dtype] = (cos, sin) 20 | return self.cache[D, seq_len, device, dtype] 21 | 22 | @staticmethod 23 | def rotate_half(x): 24 | x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2:] 25 | return torch.cat((-x2, x1), dim=-1) 26 | 27 | def apply_rope1d(self, tokens, pos1d, cos, sin): 28 | assert pos1d.ndim == 2 29 | cos = torch.nn.functional.embedding(pos1d, cos)[:, :, None, :] 30 | sin = torch.nn.functional.embedding(pos1d, sin)[:, :, None, :] 31 | return (tokens * cos) + (self.rotate_half(tokens) * sin) 32 | 33 | def __call__(self, tokens, positions): 34 | """ 35 | input: 36 | * tokens: batch_size x ntokens x nheads x dim 37 | * positions: batch_size x ntokens (t position of each token) 38 | output: 39 | * tokens after appplying RoPE2D (batch_size x ntokens x nheads x dim) 40 | """ 41 | D = tokens.size(3) 42 | assert positions.ndim == 2 # Batch, Seq 43 | cos, sin = self.get_cos_sin(D, int(positions.max()) + 1, tokens.device, tokens.dtype) 44 | tokens = self.apply_rope1d(tokens, positions, cos, sin) 45 | return tokens 46 | 47 | 48 | 49 | class RoPE3D(RoPE1D): 50 | def __init__(self, freq=1e4, F0=1.0, scaling_factor=1.0): 51 | super(RoPE3D, self).__init__(freq, F0, scaling_factor) 52 | self.position_cache = {} 53 | 54 | def get_mesh_3d(self, rope_positions, bsz): 55 | f, h, w = rope_positions 56 | 57 | if f"{f}-{h}-{w}" not in self.position_cache: 58 | x = torch.arange(f, device='cpu') 59 | y = torch.arange(h, device='cpu') 60 | z = torch.arange(w, device='cpu') 61 | self.position_cache[f"{f}-{h}-{w}"] = torch.cartesian_prod(x, y, z).view(1, f*h*w, 3).expand(bsz, -1, 3) 62 | return self.position_cache[f"{f}-{h}-{w}"] 63 | 64 | def __call__(self, tokens, rope_positions, ch_split, parallel=False): 65 | """ 66 | input: 67 | * tokens: batch_size x ntokens x nheads x dim 68 | * rope_positions: list of (f, h, w) 69 | output: 70 | * tokens after appplying RoPE2D (batch_size x ntokens x nheads x dim) 71 | """ 72 | assert sum(ch_split) == tokens.size(-1); 73 | 74 | mesh_grid = self.get_mesh_3d(rope_positions, bsz=tokens.shape[0]) 75 | out = [] 76 | for i, (D, x) in enumerate(zip(ch_split, torch.split(tokens, 
ch_split, dim=-1))): 77 | cos, sin = self.get_cos_sin(D, int(mesh_grid.max()) + 1, tokens.device, tokens.dtype) 78 | 79 | if parallel: 80 | mesh = torch.chunk(mesh_grid[:, :, i], get_sequence_parallel_world_size(),dim=1)[get_sequence_parallel_rank()].clone() 81 | else: 82 | mesh = mesh_grid[:, :, i].clone() 83 | x = self.apply_rope1d(x, mesh.to(tokens.device), cos, sin) 84 | out.append(x) 85 | 86 | tokens = torch.cat(out, dim=-1) 87 | return tokens 88 | 89 | 90 | -------------------------------------------------------------------------------- /xfuser/logger.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py 3 | """Logging configuration.""" 4 | import logging 5 | import sys 6 | import os 7 | from typing import Optional 8 | 9 | _FORMAT = "%(levelname)s %(asctime)s [%(filename)s:%(lineno)d] %(message)s" 10 | _DATE_FORMAT = "%m-%d %H:%M:%S" 11 | 12 | _LOG_LEVEL = os.environ.get("LOG_LEVEL", "debug") 13 | _LOG_LEVEL = getattr(logging, _LOG_LEVEL.upper(), 0) 14 | _LOG_DIR = os.environ.get("LOG_DIR", None) 15 | 16 | 17 | class NewLineFormatter(logging.Formatter): 18 | """Adds logging prefix to newlines to align multi-line messages.""" 19 | 20 | def __init__(self, fmt, datefmt=None): 21 | logging.Formatter.__init__(self, fmt, datefmt) 22 | 23 | def format(self, record): 24 | msg = logging.Formatter.format(self, record) 25 | if record.message != "": 26 | parts = msg.split(record.message) 27 | msg = msg.replace("\n", "\r\n" + parts[0]) 28 | return msg 29 | 30 | 31 | _root_logger = logging.getLogger("xfuser") 32 | _default_handler = None 33 | _default_file_handler = None 34 | _inference_log_file_handler = {} 35 | 36 | 37 | def _setup_logger(): 38 | _root_logger.setLevel(_LOG_LEVEL) 39 | global _default_handler 40 | global _default_file_handler 41 | fmt = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT) 42 | 43 | if _default_handler is None: 44 | _default_handler = logging.StreamHandler(sys.stdout) 45 | _default_handler.flush = sys.stdout.flush # type: ignore 46 | _default_handler.setLevel(_LOG_LEVEL) 47 | _root_logger.addHandler(_default_handler) 48 | 49 | if _default_file_handler is None and _LOG_DIR is not None: 50 | if not os.path.exists(_LOG_DIR): 51 | try: 52 | os.makedirs(_LOG_DIR) 53 | except OSError as e: 54 | _root_logger.warn(f"Error creating directory {_LOG_DIR} : {e}") 55 | _default_file_handler = logging.FileHandler(_LOG_DIR + "/default.log") 56 | _default_file_handler.setLevel(_LOG_LEVEL) 57 | _default_file_handler.setFormatter(fmt) 58 | _root_logger.addHandler(_default_file_handler) 59 | 60 | _default_handler.setFormatter(fmt) 61 | # Setting this will avoid the message 62 | # being propagated to the parent logger. 63 | _root_logger.propagate = False 64 | 65 | 66 | # The logger is initialized when the module is imported. 67 | # This is thread-safe as the module is only imported once, 68 | # guaranteed by the Python GIL. 
69 | _setup_logger() 70 | 71 | 72 | def init_logger(name: str): 73 | pid = os.getpid() 74 | # Use the same settings as above for root logger 75 | logger = logging.getLogger(name) 76 | logger.setLevel(_LOG_LEVEL) 77 | logger.addHandler(_default_handler) 78 | if _LOG_DIR is not None and pid is None: 79 | logger.addHandler(_default_file_handler) 80 | elif _LOG_DIR is not None: 81 | if _inference_log_file_handler.get(pid, None) is not None: 82 | logger.addHandler(_inference_log_file_handler[pid]) 83 | else: 84 | if not os.path.exists(_LOG_DIR): 85 | try: 86 | os.makedirs(_LOG_DIR) 87 | except OSError as e: 88 | _root_logger.warn(f"Error creating directory {_LOG_DIR} : {e}") 89 | _inference_log_file_handler[pid] = logging.FileHandler( 90 | _LOG_DIR + f"/process.{pid}.log" 91 | ) 92 | _inference_log_file_handler[pid].setLevel(_LOG_LEVEL) 93 | _inference_log_file_handler[pid].setFormatter( 94 | NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT) 95 | ) 96 | _root_logger.addHandler(_inference_log_file_handler[pid]) 97 | logger.addHandler(_inference_log_file_handler[pid]) 98 | logger.propagate = False 99 | return logger 100 | -------------------------------------------------------------------------------- /examples/zimage_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | from diffusers import ZImagePipeline 4 | from xfuser.config.diffusers import has_valid_diffusers_version, get_minimum_diffusers_version 5 | 6 | if not has_valid_diffusers_version("zimage"): 7 | minimum_diffusers_version = get_minimum_diffusers_version("zimage") 8 | raise ImportError(f"Please install diffusers>={minimum_diffusers_version} to use Z-Image models.") 9 | 10 | from xfuser.model_executor.models.transformers.transformer_z_image import xFuserZImageTransformer2DWrapper 11 | from diffusers import DiffusionPipeline 12 | 13 | from xfuser import xFuserArgs 14 | from xfuser.config import FlexibleArgumentParser 15 | from xfuser.core.distributed import ( 16 | get_world_group, 17 | get_runtime_state, 18 | initialize_runtime_state, 19 | ) 20 | 21 | def run_pipe(pipe: DiffusionPipeline, input_config): 22 | # Pipe implementation currently encodes the prompt in-place, 23 | # causing any subsequent calls to use the already encoded prompt as prompt, 24 | # causing cascading encodings unless we provide a new list each time. 25 | prompt = str(input_config.prompt) 26 | 27 | return pipe( 28 | height=input_config.height, 29 | width=input_config.width, 30 | prompt=prompt, 31 | num_inference_steps=9, # Recommended value 32 | guidance_scale=0.0, # Recommended value 33 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 34 | ).images[0] 35 | 36 | def main(): 37 | parser = FlexibleArgumentParser(description="xFuser Arguments") 38 | args = xFuserArgs.add_cli_args(parser).parse_args() 39 | engine_args = xFuserArgs.from_cli_args(args) 40 | engine_config, input_config = engine_args.create_config() 41 | engine_config.runtime_config.dtype = torch.bfloat16 42 | local_rank = get_world_group().local_rank 43 | is_last_process = get_world_group().rank == get_world_group().world_size - 1 44 | 45 | assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion." 
46 | 47 | transformer = xFuserZImageTransformer2DWrapper.from_pretrained( 48 | engine_config.model_config.model, 49 | torch_dtype=torch.bfloat16, 50 | subfolder="transformer", 51 | ) 52 | pipe = ZImagePipeline.from_pretrained( 53 | pretrained_model_name_or_path=engine_config.model_config.model, 54 | engine_config=engine_config, 55 | transformer=transformer, 56 | torch_dtype=torch.bfloat16, 57 | ) 58 | pipe = pipe.to(f"cuda:{local_rank}") 59 | parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 60 | 61 | initialize_runtime_state(pipe, engine_config) 62 | 63 | if engine_config.runtime_config.use_torch_compile: 64 | torch._inductor.config.reorder_for_compute_comm_overlap = True 65 | pipe.transformer = torch.compile(pipe.transformer, mode="default") 66 | 67 | # one full pass to warmup the torch compiler 68 | output = run_pipe(pipe, input_config) 69 | 70 | torch.cuda.reset_peak_memory_stats() 71 | start_time = time.time() 72 | 73 | output = run_pipe(pipe, input_config) 74 | 75 | end_time = time.time() 76 | elapsed_time = end_time - start_time 77 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 78 | 79 | parallel_info = ( 80 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}" 81 | ) 82 | if input_config.output_type == "pil": 83 | if is_last_process: 84 | image_name = f"zimage_result_{parallel_info}_tc_{engine_args.use_torch_compile}.png" 85 | output.save(f"./results/{image_name}") 86 | print(f"image saved to ./results/{image_name}") 87 | 88 | if is_last_process: 89 | print( 90 | f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9:.2f} GB" 91 | ) 92 | get_runtime_state().destroy_distributed_env() 93 | 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /examples/sana_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import warnings 5 | import torch.distributed 6 | from xfuser import xFuserSanaPipeline, xFuserArgs 7 | from xfuser.config import FlexibleArgumentParser 8 | from xfuser.core.distributed import ( 9 | get_world_group, 10 | is_dp_last_group, 11 | get_data_parallel_rank, 12 | get_runtime_state, 13 | ) 14 | from xfuser.core.distributed.parallel_state import get_data_parallel_world_size 15 | 16 | data_type_dict = { 17 | "Sana_1600M_1024px_diffusers": torch.float16, 18 | "Sana_1600M_4Kpx_BF16_diffusers": torch.bfloat16, 19 | "SANA1.5_4.8B_1024px_diffusers": torch.bfloat16, 20 | "SANA1.5_1.6B_1024px_diffusers": torch.bfloat16, 21 | } 22 | 23 | def get_data_type(model_path): 24 | for model_name, data_type in data_type_dict.items(): 25 | if model_name in model_path: 26 | return data_type 27 | warnings.warn(f"Unknown model path: {model_path}, using default data type: torch.float16") 28 | return torch.float16 29 | 30 | 31 | def main(): 32 | parser = FlexibleArgumentParser(description="xFuser Arguments") 33 | args = xFuserArgs.add_cli_args(parser).parse_args() 34 | engine_args = xFuserArgs.from_cli_args(args) 35 | engine_config, input_config = engine_args.create_config() 36 | local_rank = get_world_group().local_rank 37 | 38 | data_type = get_data_type(engine_config.model_config.model) 39 | engine_config.runtime_config.dtype = data_type 40 | pipe = xFuserSanaPipeline.from_pretrained( 41 | pretrained_model_name_or_path=engine_config.model_config.model, 42 | 
engine_config=engine_config, 43 | torch_dtype=data_type, 44 | ).to(f"cuda:{local_rank}") 45 | pipe.vae.to(torch.bfloat16) 46 | pipe.text_encoder.to(torch.bfloat16) 47 | pipe.vae.enable_tiling(tile_sample_min_width=1024, tile_sample_min_height=1024) 48 | 49 | parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 50 | 51 | pipe.prepare_run(input_config) 52 | 53 | torch.cuda.reset_peak_memory_stats() 54 | start_time = time.time() 55 | output = pipe( 56 | height=input_config.height, 57 | width=input_config.width, 58 | prompt=input_config.prompt, 59 | num_inference_steps=input_config.num_inference_steps, 60 | output_type=input_config.output_type, 61 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed), 62 | guidance_scale=4.5 63 | ) 64 | end_time = time.time() 65 | elapsed_time = end_time - start_time 66 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") 67 | 68 | parallel_info = ( 69 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" 70 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" 71 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}" 72 | ) 73 | if input_config.output_type == "pil": 74 | dp_group_index = get_data_parallel_rank() 75 | num_dp_groups = get_data_parallel_world_size() 76 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups 77 | if pipe.is_dp_last_group(): 78 | if not os.path.exists("results"): 79 | os.mkdir("results") 80 | for i, image in enumerate(output.images): 81 | image_rank = dp_group_index * dp_batch_size + i 82 | image.save( 83 | f"./results/sana_result_{parallel_info}_{image_rank}.png" 84 | ) 85 | print( 86 | f"image {i} saved to ./results/sana_result_{parallel_info}_{image_rank}.png" 87 | ) 88 | 89 | if get_world_group().rank == get_world_group().world_size - 1: 90 | print( 91 | f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, peak memory: {peak_memory/1e9:.2f} GB" 92 | ) 93 | 94 | get_runtime_state().destroy_distributed_env() 95 | 96 | 97 | if __name__ == "__main__": 98 | main() 99 | --------------------------------------------------------------------------------