├── .gitmodules
├── entrypoints
├── __init__.py
└── curl.sh
├── xfuser
├── ray
│ ├── __init__.py
│ ├── pipeline
│ │ ├── __init__.py
│ │ └── base_executor.py
│ └── worker
│ │ ├── __init__.py
│ │ ├── utils.py
│ │ └── worker_wrappers.py
├── model_executor
│ ├── __init__.py
│ ├── patch
│ │ ├── __init__.py
│ │ └── unet_patch.py
│ ├── models
│ │ ├── customized
│ │ │ ├── __init__.py
│ │ │ └── step_video_t2v
│ │ │ │ ├── __init__.py
│ │ │ │ ├── linear.py
│ │ │ │ ├── attentions.py
│ │ │ │ └── rope.py
│ │ ├── __init__.py
│ │ └── transformers
│ │ │ ├── __init__.py
│ │ │ └── register.py
│ ├── cache
│ │ ├── __init__.py
│ │ └── diffusers_adapters
│ │ │ ├── registry.py
│ │ │ ├── __init__.py
│ │ │ └── flux.py
│ ├── layers
│ │ ├── __init__.py
│ │ ├── base_layer.py
│ │ ├── register.py
│ │ └── feedforward.py
│ ├── schedulers
│ │ ├── __init__.py
│ │ ├── base_scheduler.py
│ │ ├── register.py
│ │ ├── scheduling_ddpm.py
│ │ ├── scheduling_dpm_cogvideox.py
│ │ ├── scheduling_ddim_cogvideox.py
│ │ └── scheduling_ddim.py
│ ├── pipelines
│ │ ├── __init__.py
│ │ ├── pipeline_stable_diffusion_xl.py
│ │ └── register.py
│ └── base_wrapper.py
├── __version__.py
├── core
│ ├── utils
│ │ ├── __init__.py
│ │ └── timer.py
│ ├── cache_manager
│ │ └── __init__.py
│ ├── long_ctx_attention
│ │ ├── ring
│ │ │ └── __init__.py
│ │ ├── __init__.py
│ │ └── hybrid
│ │ │ └── __init__.py
│ ├── __init__.py
│ ├── fast_attention
│ │ └── __init__.py
│ └── distributed
│ │ └── __init__.py
├── config
│ ├── __init__.py
│ └── diffusers.py
├── __init__.py
├── parallel.py
└── logger.py
├── .gitignore
├── pytest.ini
├── docs
├── methods
│ ├── cfg_parallel_zh.md
│ ├── cfg_parallel.md
│ ├── ditfastattn_zh.md
│ ├── ditfastattn.md
│ ├── parallel_vae.md
│ ├── usp.md
│ ├── hybrid_zh.md
│ ├── pipefusion.md
│ └── hybrid.md
├── performance
│ ├── latte_zh.md
│ ├── sana_zh.md
│ ├── sd3_zh.md
│ ├── latte.md
│ ├── pixart_alpha_legacy.md
│ ├── stepvideo_zh.md
│ ├── sana.md
│ ├── sd3.md
│ ├── hunyuanvideo.md
│ ├── stepvideo.md
│ ├── consisid_zh.md
│ ├── hunyuandit_zh.md
│ ├── cogvideo_zh.md
│ └── consisid.md
├── developer
│ ├── Http_Service.md
│ └── adding_models
│ │ ├── adding_model_cfg_usp.md
│ │ ├── readme.md
│ │ └── adding_model_cfg.py
└── fid
│ └── FID.md
├── .pre-commit-config.yaml
├── tests
├── context_parallel
│ ├── debug_tests.py
│ └── debug_flux_usp_example.py
├── parallel_test.py
├── core
│ └── test_envs.py
└── layers
│ └── feedforward_test.py
├── benchmark
├── run.sh
├── fid
│ ├── generate.sh
│ ├── README.md
│ ├── pixartalpha_generate.py
│ ├── flux_generate.py
│ └── compute_fid.py
└── usp_latency_test.py
├── docker
└── Dockerfile
├── examples
├── run_cogvideo.sh
├── run_hunyuan_video_usp.sh
├── ray
│ ├── README.md
│ ├── ray_pixartsigma_example.py
│ ├── ray_pixartalpha_example.py
│ ├── ray_hunyuandit_example.py
│ ├── ray_run.sh
│ ├── ray_flux_example.py
│ └── ray_sd3_example.py
├── run_multinodes.sh
├── run_consisid.sh
├── run_consisid_usp.sh
├── run_service.sh
├── run_fastditattn.sh
├── run.sh
├── latte_example.py
├── sana_sprint_example.py
├── pixartsigma_example.py
├── pixartalpha_example.py
├── sdxl_example.py
├── sd3_example.py
├── cogvideox_example.py
├── hunyuandit_example.py
├── zimage_example.py
└── sana_example.py
├── .github
└── workflows
│ └── python-publish.yml
└── setup.py
/.gitmodules:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/entrypoints/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/xfuser/ray/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/xfuser/model_executor/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/xfuser/ray/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/xfuser/ray/worker/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/xfuser/model_executor/patch/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/xfuser/__version__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.4.5"
2 |
--------------------------------------------------------------------------------
/xfuser/model_executor/models/customized/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/xfuser/model_executor/models/customized/step_video_t2v/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/xfuser/core/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .timer import gpu_timer_decorator
2 |
--------------------------------------------------------------------------------
/xfuser/core/cache_manager/__init__.py:
--------------------------------------------------------------------------------
1 | from .cache_manager import CacheManager
2 |
3 | __all__ = [
4 | "CacheManager",
5 | ]
6 |
--------------------------------------------------------------------------------
/xfuser/model_executor/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_model import xFuserModelBaseWrapper
2 |
3 | __all__ = [
4 | "xFuserModelBaseWrapper"
5 | ]
--------------------------------------------------------------------------------
/xfuser/model_executor/cache/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | adapted from https://github.com/ali-vilab/TeaCache.git
3 | adapted from https://github.com/chengzeyi/ParaAttention.git
4 | """
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | .DS_Store
3 | build
4 | __pycache__
5 | *.log
6 | *.txt
7 | results/
8 | profile/
9 | .vscode/
10 | xfuser.egg-info/
11 | dist/*
12 | *.mp4
13 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | log_format = %(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s
3 | log_date_format = %Y-%m-%d %H:%M:%S
4 | log_cli = true
5 | log_level = INFO
6 | addopts = --capture=tee-sys --verbose --color=auto --durations=0
--------------------------------------------------------------------------------
/xfuser/core/long_ctx_attention/ring/__init__.py:
--------------------------------------------------------------------------------
1 | from .ring_flash_attn import (
2 | xdit_ring_flash_attn_func,
3 | xdit_sana_ring_flash_attn_func,
4 | )
5 |
6 | __all__ = [
7 | "xdit_ring_flash_attn_func",
8 | "xdit_sana_ring_flash_attn_func",
9 | ]
10 |
--------------------------------------------------------------------------------
/xfuser/core/__init__.py:
--------------------------------------------------------------------------------
1 | from .cache_manager import CacheManager
2 | from .long_ctx_attention import xFuserLongContextAttention
3 | from .utils import gpu_timer_decorator
4 |
5 | __all__ = [
6 | "CacheManager",
7 | "xFuserLongContextAttention",
8 | "gpu_timer_decorator",
9 | ]
10 |
--------------------------------------------------------------------------------
/xfuser/core/long_ctx_attention/__init__.py:
--------------------------------------------------------------------------------
1 | from .hybrid import (
2 | xFuserLongContextAttention,
3 | xFuserSanaLinearLongContextAttention,
4 | AttnType,)
5 |
6 | __all__ = [
7 | "xFuserLongContextAttention",
8 | "xFuserSanaLinearLongContextAttention",
9 | "AttnType",
10 | ]
11 |
--------------------------------------------------------------------------------
/xfuser/core/long_ctx_attention/hybrid/__init__.py:
--------------------------------------------------------------------------------
1 | from .attn_layer import (
2 | xFuserLongContextAttention,
3 | xFuserSanaLinearLongContextAttention,
4 | AttnType,
5 | )
6 |
7 | __all__ = [
8 | "xFuserLongContextAttention",
9 | "xFuserSanaLinearLongContextAttention",
10 | "AttnType",
11 | ]
12 |
--------------------------------------------------------------------------------
/entrypoints/curl.sh:
--------------------------------------------------------------------------------
1 |
2 | curl -X POST "http://localhost:6000/generate" \
3 | -H "Content-Type: application/json" \
4 | -d '{
5 | "prompt": "a cute rabbit",
6 | "num_inference_steps": 50,
7 | "seed": 42,
8 | "cfg": 7.5,
9 | "save_disk_path": "/tmp"
10 | }'
11 |
--------------------------------------------------------------------------------
/docs/methods/cfg_parallel_zh.md:
--------------------------------------------------------------------------------
1 | # Classifier-Free Guidance (CFG) Parallel
2 |
3 | Classifier-Free Guidance has become an important advance for diffusion models: it provides broader conditional control, reduces the training burden, enhances the quality and detail of generated content, and improves the practicality and adaptability of the model.
4 | 
5 | For an input prompt, CFG requires generating the unconditional branch and the text-guided branch simultaneously, which is equivalent to feeding the DiT blocks input latents with batch_size = 2. CFG Parallel separates the two latents for computation; after each diffusion step's forward pass completes and before the scheduler executes, it all-gathers the latent-space results once. Its communication volume is far smaller than that of PipeFusion and Sequence Parallel, so CFG Parallel should always be used whenever CFG is enabled.
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | # Using this mirror lets us use mypyc-compiled black, which is about 2x faster
3 | - repo: https://github.com/psf/black-pre-commit-mirror
4 | rev: 24.2.0
5 | hooks:
6 | - id: black
7 | # It is recommended to specify the latest version of Python
8 | # supported by your project here, or alternatively use
9 | # pre-commit's default_language_version, see
10 | # https://pre-commit.com/#top_level-default_language_version
11 | language_version: python3.10
--------------------------------------------------------------------------------
/tests/context_parallel/debug_tests.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import subprocess
4 | import shlex
5 | from pathlib import Path
6 |
7 | wd: Path = Path(__file__).parent.absolute()
8 | #os.environ["PYTHONPATH"] = f"{WD}:{os.getenv('PYTHONPATH', '')}"
9 | test_script: Path = wd / "test_diffusers_adapters.py"
10 | model_test: str = "FluxPipelineTest"
11 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
12 |
13 | cmd_str: str = (
14 |     f"{sys.executable} -m pytest {test_script.as_posix()}::{model_test}"
15 | )
16 | cmd: list[str] = shlex.split(cmd_str)
17 | print(cmd)
18 | subprocess.run(cmd, check=True)
--------------------------------------------------------------------------------
/docs/performance/latte_zh.md:
--------------------------------------------------------------------------------
1 | ## Latte Performance
2 | 
3 | Latte is a text-to-video model; xDiT currently accelerates its parallel inference with the USP method. PipeFusion support is still under development.
4 | 
5 | On an 8xL20 (PCIe) machine, the latency for generating 512x512x16 videos is shown in the figure below.
6 |
7 |
8 |

10 |
11 |
12 | The latency for generating 1024x1024x16 videos is shown in the figure below; hybrid sequence parallelism (`ulysses_degree=2`, `ring_degree=4`) achieves the best performance.
13 |
14 |
15 |

17 |
--------------------------------------------------------------------------------
/xfuser/config/__init__.py:
--------------------------------------------------------------------------------
1 | from .args import FlexibleArgumentParser, xFuserArgs
2 | from .config import (
3 | EngineConfig,
4 | ParallelConfig,
5 | TensorParallelConfig,
6 | PipeFusionParallelConfig,
7 | SequenceParallelConfig,
8 | DataParallelConfig,
9 | ModelConfig,
10 | InputConfig,
11 | RuntimeConfig
12 | )
13 |
14 | __all__ = [
15 | "FlexibleArgumentParser",
16 | "xFuserArgs",
17 | "EngineConfig",
18 | "ParallelConfig",
19 | "TensorParallelConfig",
20 | "PipeFusionParallelConfig",
21 | "SequenceParallelConfig",
22 | "DataParallelConfig",
23 | "ModelConfig",
24 | "InputConfig",
25 | "RuntimeConfig"
26 | ]
--------------------------------------------------------------------------------
/xfuser/ray/pipeline/base_executor.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The xDiT team.
2 | # Adapted from
3 | # https://github.com/vllm-project/vllm/blob/main/vllm/executor/executor_base.py
4 | # Copyright (c) 2023, vLLM team. All rights reserved.
5 | from abc import ABC, abstractmethod
6 |
7 | from xfuser.config.config import EngineConfig
8 |
9 |
10 | class BaseExecutor(ABC):
11 | def __init__(
12 | self,
13 | engine_config: EngineConfig,
14 | ):
15 | self.engine_config = engine_config
16 | self.parallel_config = engine_config.parallel_config
17 | self._init_executor()
18 |
19 | @abstractmethod
20 | def _init_executor(self):
21 | pass
22 |
--------------------------------------------------------------------------------
/xfuser/model_executor/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from .register import xFuserLayerWrappersRegister
2 | from .base_layer import xFuserLayerBaseWrapper
3 | from .attention_processor import xFuserAttentionWrapper
4 | from .attention_processor import xFuserAttentionBaseWrapper
5 | from .conv import xFuserConv2dWrapper
6 | from .embeddings import xFuserPatchEmbedWrapper
7 | from .feedforward import xFuserFeedForwardWrapper
8 |
9 | __all__ = [
10 | "xFuserLayerWrappersRegister",
11 | "xFuserLayerBaseWrapper",
12 | "xFuserAttentionBaseWrapper",
13 | "xFuserAttentionWrapper",
14 | "xFuserConv2dWrapper",
15 | "xFuserPatchEmbedWrapper",
16 | "xFuserFeedForwardWrapper",
17 | ]
18 |
--------------------------------------------------------------------------------
/benchmark/run.sh:
--------------------------------------------------------------------------------
1 | set -x
2 |
3 | # MODEL="/mnt/models/SD/PixArt-XL-2-1024-MS"
4 | # SCRIPT="./examples/pixartalpha_example.py"
5 |
6 | # MODEL="/mnt/models/SD/stable-diffusion-3-medium-diffusers"
7 | # SCRIPT="./examples/sd3_example.py"
8 |
9 | # MODEL="/mnt/models/SD/HunyuanDiT-v1.2-Diffusers"
10 | # SCRIPT="./examples/hunyuandit_example.py"
11 |
12 | MODEL="/cfs/dit/FLUX.1-dev/"
13 | SCRIPT="./examples/flux_example.py"
14 |
15 | export PYTHONPATH=$PWD:$PYTHONPATH
16 |
17 | python benchmark/single_node_latency_test.py \
18 | --model_id $MODEL \
19 | --script $SCRIPT \
20 | --sizes 1024 \
21 | --no_use_resolution_binning \
22 | --num_inference_steps 28 \
23 | --no_use_cfg_parallel \
24 | --n_gpus 4
--------------------------------------------------------------------------------
/docs/performance/sana_zh.md:
--------------------------------------------------------------------------------
1 | ## SANA Performance
2 | We use the open-source `Sana_1600M_4Kpx_BF16_diffusers` model for performance evaluation.
3 | 
4 | xDiT currently supports accelerating SANA with PipeFusion, Ulysses, Ring, CFG, and any combination of them. Due to the head-channel limitation of the SANA network, the maximum Ulysses parallel degree is 2. We measured latency on an 8xA100 (NVLink) machine by generating 4096x4096 images with 20 steps; the results are listed in the table below. CFG gives the best speedup, while the other three strategies perform similarly; with 8 GPUs, up to 4.4x generation speedup can be achieved.
5 | 
6 | 
7 | | #GPUs | cfg | ulysses | ring | pp | Latency (seconds) |
8 | |---|---|---|---|---|---|
9 | | 1 | 1 | 1 | 1 | 1 | 17.551 |
10 | | 2 | 1 | 1 | 1 | 2 | 11.276 |
11 | | 2 | 1 | 1 | 2 | 1 | 11.447 |
12 | | 2 | 1 | 2 | 1 | 1 | 10.175 |
13 | | 2 | 2 | 1 | 1 | 1 | 8.365 |
14 | | 4 | 2 | 1 | 1 | 2 | 5.599 |
15 | | 4 | 2 | 1 | 2 | 1 | 5.702 |
16 | | 4 | 2 | 2 | 1 | 1 | 5.803 |
17 | | 8 | 2 | 1 | 1 | 4 | 4.050 |
18 | | 8 | 2 | 1 | 2 | 2 | 4.091 |
19 | | 8 | 2 | 1 | 4 | 1 | 4.003 |
20 | | 8 | 2 | 2 | 1 | 2 | 4.201 |
21 | | 8 | 2 | 2 | 2 | 1 | 3.991 |
--------------------------------------------------------------------------------
/xfuser/core/utils/timer.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import torch
4 | from torch.cuda import synchronize
5 |
6 | try:
7 | import torch_musa
8 | from torch_musa.core.device import synchronize
9 | except ModuleNotFoundError:
10 | pass
11 |
12 | import xfuser.envs as envs
13 | if envs._is_npu():
14 | from torch.npu import synchronize
15 |
16 | def gpu_timer_decorator(func):
17 | def wrapper(*args, **kwargs):
18 | synchronize()
19 | start_time = time.time()
20 | result = func(*args, **kwargs)
21 | synchronize()
22 | end_time = time.time()
23 |
24 | if torch.distributed.get_rank() == 0:
25 | print(
26 | f"{func.__name__} took {end_time - start_time} seconds to run on GPU."
27 | )
28 | return result
29 |
30 | return wrapper
31 |
--------------------------------------------------------------------------------
/docs/performance/sd3_zh.md:
--------------------------------------------------------------------------------
1 | ## Stable Diffusion 3 Performance
2 | 
3 | We evaluated performance with the open-source stable-diffusion-3-medium-diffusers 2B model.
4 | 
5 | On an 8xA100 (NVLink) machine, the optimal parallel scheme differs with the number of GPUs, which demonstrates the importance of supporting multiple parallel methods and hybrid parallelism.
6 | The best parallel strategies at different GPU scales are: with 2 GPUs, `cfg_parallel=2`; with 4 GPUs, `cfg_parallel=2, pipefusion_parallel=2`; and with 8 GPUs, `cfg_parallel=2, pipefusion_parallel=4`.
7 | 
8 | torch.compile brings a speedup in every scenario except the 8-GPU case.
9 |
10 |
11 |
12 |

14 |
15 |
16 | The latency on an 8xL40 (PCIe) machine is shown in the figure below. Again, the optimal parallel strategy differs at each GPU scale.
17 | torch.compile brings a speedup in every case.
18 |
19 |
20 |

22 |
23 |
--------------------------------------------------------------------------------
/docs/methods/cfg_parallel.md:
--------------------------------------------------------------------------------
1 | ## Classifier-Free Guidance (CFG) Parallel
2 | [Chinese Version](./cfg_parallel_zh.md)
3 |
4 | Classifier-Free Guidance (CFG) has become an important technique in diffusion models, providing broader conditional control, reducing the training burden, enhancing the quality and details of generated content, and improving the practicality and adaptability of the model.
5 | 
6 | For an input prompt, using CFG requires generating both the unconditional branch and the text-guided branch simultaneously, which is equivalent to feeding the DiT blocks input latents with batch_size = 2. CFG Parallel separates the two latents for computation; after each diffusion step's forward pass completes and before the scheduler executes, it performs an all-gather on the latent-space results. Its communication overhead is much smaller than that of PipeFusion and Sequence Parallel. Therefore, CFG Parallel should always be used when CFG is enabled.
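
As a rough illustration of the data flow (a schematic sketch, not xDiT's actual implementation; the process-group setup, call signatures, and guidance scale are assumed for the example), each of the two ranks runs the transformer on one guidance branch, and the results are all-gathered once per step before the scheduler runs:

```python
import torch
import torch.distributed as dist


def cfg_parallel_step(transformer, scheduler, latents, t, cond_embeds, uncond_embeds, cfg_group):
    # Each of the two ranks in the CFG group computes one guidance branch.
    branch_rank = dist.get_rank(group=cfg_group)
    prompt_embeds = cond_embeds if branch_rank == 1 else uncond_embeds
    noise_pred_local = transformer(latents, t, prompt_embeds)

    # All-gather the two branch predictions once per step, after the forward
    # pass and before the scheduler executes.
    gathered = [torch.empty_like(noise_pred_local) for _ in range(2)]
    dist.all_gather(gathered, noise_pred_local, group=cfg_group)
    noise_uncond, noise_text = gathered

    guidance_scale = 7.5  # assumed value for the sketch
    noise_pred = noise_uncond + guidance_scale * (noise_text - noise_uncond)
    return scheduler.step(noise_pred, t, latents).prev_sample
```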
--------------------------------------------------------------------------------
/xfuser/config/diffusers.py:
--------------------------------------------------------------------------------
1 | import diffusers
2 | from packaging.version import Version
3 |
4 | DEFAULT_MINIMUM_DIFFUSERS_VERSION = "0.33.0"
5 | MINIMUM_DIFFUSERS_VERSIONS = {
6 | "hunyuanvideo_15": "0.36.0",
7 | "zimage": "0.36.0",
8 | "flux2": "0.36.0",
9 | "flux": "0.35.2",
10 | "flux_kontext": "0.35.2",
11 | "hunyuanvideo": "0.35.2",
12 | "wan": "0.35.2",
13 | }
14 |
15 | def has_valid_diffusers_version(model_name: str|None = None) -> bool:
16 | diffusers_version = diffusers.__version__
17 | minimum_diffusers_version = MINIMUM_DIFFUSERS_VERSIONS.get(model_name, DEFAULT_MINIMUM_DIFFUSERS_VERSION)
18 | return Version(diffusers_version).release >= Version(minimum_diffusers_version).release
19 |
20 |
21 | def get_minimum_diffusers_version(model_name: str|None = None) -> str:
22 | return MINIMUM_DIFFUSERS_VERSIONS.get(model_name, DEFAULT_MINIMUM_DIFFUSERS_VERSION)
--------------------------------------------------------------------------------
/xfuser/model_executor/cache/diffusers_adapters/registry.py:
--------------------------------------------------------------------------------
1 | """
2 | adapted from https://github.com/ali-vilab/TeaCache.git
3 | adapted from https://github.com/chengzeyi/ParaAttention.git
4 | """
5 | from xfuser.config.diffusers import has_valid_diffusers_version
6 | from typing import Type, Dict
7 |
8 | TRANSFORMER_ADAPTER_REGISTRY: Dict[Type, str] = {}
9 |
10 | def register_transformer_adapter(transformer_class: Type, adapter_name: str):
11 | TRANSFORMER_ADAPTER_REGISTRY[transformer_class] = adapter_name
12 |
13 | if has_valid_diffusers_version("flux"):
14 | from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel
15 | from xfuser.model_executor.models.transformers.transformer_flux import xFuserFluxTransformer2DWrapper
16 | register_transformer_adapter(FluxTransformer2DModel, "flux")
17 | register_transformer_adapter(xFuserFluxTransformer2DWrapper, "flux")
18 |
19 |
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | # Use the NVIDIA PyTorch base image
2 | FROM nvcr.io/nvidia/pytorch:24.07-py3
3 |
4 | # Install git
5 | RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
6 |
7 |
8 |
9 | # Update pip to the latest version
10 | RUN pip install --no-cache-dir --upgrade pip
11 |
12 | # Uninstall apex first
13 | RUN pip uninstall -y apex
14 |
15 | # # Install flash_attn separately with --use-pep517 flag
16 | # RUN pip install --no-cache-dir --use-pep517 flash-attn==2.6.3 flask
17 |
18 | RUN pip install xfuser
19 |
20 | RUN pip install flask
21 |
22 | # Copy the http-service directory into the container
23 | COPY ./http-service /app/http-service
24 |
25 | # Set the working directory to /app
26 | WORKDIR /app
27 |
28 | # Set ENTRYPOINT with CMD as default arguments
29 | # ENTRYPOINT ["python", "/app/comfyui-xdit/launch_host.py"]
30 | # CMD ["--config", "./comfyui-xdit/config.json"]
31 |
--------------------------------------------------------------------------------
/docs/performance/latte.md:
--------------------------------------------------------------------------------
1 | ## Latte Performance
2 | [Chinese Version](./latte_zh.md)
3 |
4 | Latte is a text-to-video model, and xDiT currently implements parallel inference acceleration for it using the USP method. PipeFusion is under development.
5 |
6 | On an 8xL20 (PCIe) machine, the latency performance for generating 512x512x16 videos is shown in the graph below.
7 |
8 |
9 |

11 |
12 |
13 | The latency performance for generating 1024x1024x16 videos is depicted in the following graph. Using mixed sequence parallelization (`ulysses_degree=2`, `ring_degree=4`) yields the best performance.
14 |
15 |
16 |

18 |
--------------------------------------------------------------------------------
/xfuser/model_executor/cache/diffusers_adapters/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | adapted from https://github.com/ali-vilab/TeaCache.git
3 | adapted from https://github.com/chengzeyi/ParaAttention.git
4 | """
5 | import importlib
6 | from typing import Type, Dict, TypeVar
7 | from xfuser.model_executor.cache.diffusers_adapters.registry import TRANSFORMER_ADAPTER_REGISTRY
8 | from xfuser.logger import init_logger
9 |
10 | logger = init_logger(__name__)
11 |
12 |
13 | def apply_cache_on_transformer(transformer, *args, **kwargs):
14 | adapter_name = TRANSFORMER_ADAPTER_REGISTRY.get(type(transformer))
15 | if not adapter_name:
16 | logger.error(f"Unknown transformer class: {transformer.__class__.__name__}")
17 | return transformer
18 |
19 | adapter_module = importlib.import_module(f".{adapter_name}", __package__)
20 | apply_cache_on_transformer_fn = getattr(adapter_module, "apply_cache_on_transformer")
21 | return apply_cache_on_transformer_fn(transformer, *args, **kwargs)
22 |
--------------------------------------------------------------------------------
/xfuser/__init__.py:
--------------------------------------------------------------------------------
1 | from xfuser.model_executor.pipelines import (
2 | xFuserPixArtAlphaPipeline,
3 | xFuserPixArtSigmaPipeline,
4 | xFuserStableDiffusion3Pipeline,
5 | xFuserFluxPipeline,
6 | xFuserLattePipeline,
7 | xFuserHunyuanDiTPipeline,
8 | xFuserCogVideoXPipeline,
9 | xFuserConsisIDPipeline,
10 | xFuserStableDiffusionXLPipeline,
11 | xFuserSanaPipeline,
12 | xFuserSanaSprintPipeline,
13 | )
14 | from xfuser.config import xFuserArgs, EngineConfig
15 | from xfuser.parallel import xDiTParallel
16 |
17 | __all__ = [
18 | "xFuserPixArtAlphaPipeline",
19 | "xFuserPixArtSigmaPipeline",
20 | "xFuserStableDiffusion3Pipeline",
21 | "xFuserFluxPipeline",
22 | "xFuserLattePipeline",
23 | "xFuserHunyuanDiTPipeline",
24 | "xFuserCogVideoXPipeline",
25 | "xFuserConsisIDPipeline",
26 | "xFuserStableDiffusionXLPipeline",
27 | "xFuserSanaPipeline",
28 | "xFuserSanaSprintPipeline",
29 | "xFuserArgs",
30 | "EngineConfig",
31 | "xDiTParallel",
32 | ]
33 |
--------------------------------------------------------------------------------
/xfuser/ray/worker/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The xDiT team.
2 | # Adapted from
3 | # https://github.com/vllm-project/vllm/blob/main/vllm/utils.py
4 | # Copyright (c) 2023, vLLM team. All rights reserved.
5 | import os
6 | from typing import Dict, Any
7 | import importlib.util
8 | from xfuser.logger import init_logger
9 |
10 | logger = init_logger(__name__)
11 |
12 |
13 | def resolve_obj_by_qualname(qualname: str) -> Any:
14 | """
15 | Resolve an object by its fully qualified name.
16 | """
17 | module_name, obj_name = qualname.rsplit(".", 1)
18 | module = importlib.import_module(module_name)
19 | return getattr(module, obj_name)
20 |
21 |
22 | def update_environment_variables(envs: Dict[str, str]):
23 | for k, v in envs.items():
24 | if k in os.environ and os.environ[k] != v:
25 | logger.warning(
26 | "Overwriting environment variable %s " "from '%s' to '%s'",
27 | k,
28 | os.environ[k],
29 | v,
30 | )
31 | os.environ[k] = v
--------------------------------------------------------------------------------
/xfuser/core/fast_attention/__init__.py:
--------------------------------------------------------------------------------
1 | from .fast_attn_state import (
2 | get_fast_attn_state,
3 | get_fast_attn_enable,
4 | get_fast_attn_step,
5 | get_fast_attn_calib,
6 | get_fast_attn_threshold,
7 | get_fast_attn_window_size,
8 | get_fast_attn_coco_path,
9 | get_fast_attn_use_cache,
10 | get_fast_attn_config_file,
11 | get_fast_attn_layer_name,
12 | initialize_fast_attn_state,
13 | )
14 |
15 | from .attn_layer import (
16 | FastAttnMethod,
17 | xFuserFastAttention,
18 | )
19 |
20 | from .utils import fast_attention_compression
21 |
22 | __all__ = [
23 | "get_fast_attn_state",
24 | "get_fast_attn_enable",
25 | "get_fast_attn_step",
26 | "get_fast_attn_calib",
27 | "get_fast_attn_threshold",
28 | "get_fast_attn_window_size",
29 | "get_fast_attn_coco_path",
30 | "get_fast_attn_use_cache",
31 | "get_fast_attn_config_file",
32 | "get_fast_attn_layer_name",
33 | "initialize_fast_attn_state",
34 | "xFuserFastAttention",
35 | "FastAttnMethod",
36 | "fast_attention_compression",
37 | ]
38 |
--------------------------------------------------------------------------------
/xfuser/model_executor/schedulers/__init__.py:
--------------------------------------------------------------------------------
1 | from .register import xFuserSchedulerWrappersRegister
2 | from .base_scheduler import xFuserSchedulerBaseWrapper
3 | from .scheduling_dpmsolver_multistep import (
4 | xFuserDPMSolverMultistepSchedulerWrapper
5 | )
6 | from .scheduling_flow_match_euler_discrete import (
7 | xFuserFlowMatchEulerDiscreteSchedulerWrapper,
8 | )
9 | from .scheduling_ddim import xFuserDDIMSchedulerWrapper
10 | from .scheduling_ddpm import xFuserDDPMSchedulerWrapper
11 | from .scheduling_ddim_cogvideox import xFuserCogVideoXDDIMSchedulerWrapper
12 | from .scheduling_dpm_cogvideox import xFuserCogVideoXDPMSchedulerWrapper
13 | from .scheduling_scm import xFuserSCMSchedulerWrapper
14 |
15 | __all__ = [
16 | "xFuserSchedulerWrappersRegister",
17 | "xFuserSchedulerBaseWrapper",
18 | "xFuserDPMSolverMultistepSchedulerWrapper",
19 | "xFuserFlowMatchEulerDiscreteSchedulerWrapper",
20 | "xFuserDDIMSchedulerWrapper",
21 | "xFuserCogVideoXDDIMSchedulerWrapper",
22 | "xFuserCogVideoXDPMSchedulerWrapper",
23 | "xFuserDDPMSchedulerWrapper",
24 | "xFuserSCMSchedulerWrapper",
25 | ]
26 |
--------------------------------------------------------------------------------
/examples/run_cogvideo.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 |
4 | export PYTHONPATH=$PWD:$PYTHONPATH
5 |
6 | # CogVideoX configuration
7 | SCRIPT="cogvideox_example.py"
8 | MODEL_ID="/cfs/dit/CogVideoX1.5-5B"
9 | INFERENCE_STEP=50
10 |
11 | mkdir -p ./results
12 |
13 | # CogVideoX specific task args
14 | TASK_ARGS="--height 768 --width 1360 --num_frames 17 --guidance_scale 6.0"
15 |
16 | # CogVideoX parallel configuration
17 | N_GPUS=8
18 | PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 2"
19 | CFG_ARGS="--use_cfg_parallel"
20 |
21 | # Uncomment and modify these as needed
22 | # PIPEFUSION_ARGS="--num_pipeline_patch 8"
23 | # OUTPUT_ARGS="--output_type latent"
24 | # PARALLEL_VAE="--use_parallel_vae"
25 | ENABLE_TILING="--enable_tiling"
26 | # COMPILE_FLAG="--use_torch_compile"
27 |
28 | torchrun --nproc_per_node=$N_GPUS ./examples/$SCRIPT \
29 | --model $MODEL_ID \
30 | $PARALLEL_ARGS \
31 | $TASK_ARGS \
32 | $PIPEFUSION_ARGS \
33 | $OUTPUT_ARGS \
34 | --num_inference_steps $INFERENCE_STEP \
35 | --warmup_steps 0 \
36 | --prompt "A little girl is riding a bicycle at high speed. Focused, detailed, realistic." \
37 | $CFG_ARGS \
38 |   $PARALLEL_VAE \
39 | $ENABLE_TILING \
40 | $COMPILE_FLAG
41 |
--------------------------------------------------------------------------------
/docs/performance/pixart_alpha_legacy.md:
--------------------------------------------------------------------------------
1 | # Pixart-Alpha Legacy Version Performance
2 |
3 | Here are the benchmark results for Pixart-Alpha using the 20-step DPM solver as the scheduler across various image resolutions.
4 | To replicate these findings, please refer to the script at [legacy/scripts/benchmark.sh](../../legacy/scripts/benchmark.sh).
5 |
6 | 1. The Latency on 4xA100-80GB (PCIe)
7 |
8 |
9 |

10 |
11 |
12 | 2. The Latency on 8xL20-48GB (PCIe)
13 |
14 |
15 |

16 |
17 |
18 | 3. The Latency on 8xA100-80GB (NVLink)
19 |
20 |
21 |

22 |
23 |
24 | 4. The Latency on 4xT4-16GB (PCIe)
25 |
26 |
27 |

29 |
30 |
--------------------------------------------------------------------------------
/xfuser/model_executor/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_pipeline import xFuserPipelineBaseWrapper
2 | from .pipeline_pixart_alpha import xFuserPixArtAlphaPipeline
3 | from .pipeline_pixart_sigma import xFuserPixArtSigmaPipeline
4 | from .pipeline_stable_diffusion_3 import xFuserStableDiffusion3Pipeline
5 | from .pipeline_flux import xFuserFluxPipeline
6 | from .pipeline_latte import xFuserLattePipeline
7 | from .pipeline_cogvideox import xFuserCogVideoXPipeline
8 | from .pipeline_consisid import xFuserConsisIDPipeline
9 | from .pipeline_hunyuandit import xFuserHunyuanDiTPipeline
10 | from .pipeline_stable_diffusion_xl import xFuserStableDiffusionXLPipeline
11 | from .pipeline_sana import xFuserSanaPipeline
12 | from .pipeline_sana_sprint import xFuserSanaSprintPipeline
13 |
14 | __all__ = [
15 | "xFuserPipelineBaseWrapper",
16 | "xFuserPixArtAlphaPipeline",
17 | "xFuserPixArtSigmaPipeline",
18 | "xFuserStableDiffusion3Pipeline",
19 | "xFuserFluxPipeline",
20 | "xFuserLattePipeline",
21 | "xFuserHunyuanDiTPipeline",
22 | "xFuserCogVideoXPipeline",
23 | "xFuserConsisIDPipeline",
24 | "xFuserStableDiffusionXLPipeline",
25 | "xFuserSanaPipeline",
26 | "xFuserSanaSprintPipeline",
27 | ]
--------------------------------------------------------------------------------
/docs/developer/Http_Service.md:
--------------------------------------------------------------------------------
1 | ## Launch a Text-to-Image Http Service
2 |
3 | Launch an HTTP-based text-to-image service that generates images from textual descriptions (prompts) using the DiT model.
4 | The generated images can either be returned directly to users or saved to a specified disk location.
5 | For example, the following command launches an HTTP service with 4 GPUs, a Ulysses parallel degree of 2, a PipeFusion parallel degree of 2, and the model path `./models/FLUX.1-schnell`.
6 |
7 | ```bash
8 | python ./entrypoints/launch.py --world_size 4 --ulysses_parallel_degree 2 --pipefusion_parallel_degree 2 --model_path /your_model_path/FLUX.1-schnell
9 | ```
10 |
11 |
12 | An example HTTP request is shown below. The `save_disk_path` parameter is optional: if not set, the image is returned directly; if set, the generated image is saved to the specified directory on disk.
13 |
14 | ```bash
15 | curl -X POST "http://localhost:6000/generate" \
16 | -H "Content-Type: application/json" \
17 | -d '{
18 | "prompt": "a cute rabbit",
19 | "num_inference_steps": 50,
20 | "seed": 42,
21 | "cfg": 7.5,
22 | "save_disk_path": "/tmp"
23 | }'
24 | ```
25 |
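For programmatic use, a minimal Python client might look like the sketch below (the `requests` dependency and the way the response is inspected are assumptions; the service either returns the image directly or saves it under `save_disk_path` as described above):

```python
import requests

payload = {
    "prompt": "a cute rabbit",
    "num_inference_steps": 50,
    "seed": 42,
    "cfg": 7.5,
    "save_disk_path": "/tmp",  # optional: omit it to have the image returned directly
}

# Send the request and print what came back; adjust the handling to however your
# deployment returns images (raw bytes vs. a JSON body with a saved file path).
resp = requests.post("http://localhost:6000/generate", json=payload, timeout=600)
resp.raise_for_status()
print(resp.headers.get("Content-Type"))
print(resp.text[:500])
```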
--------------------------------------------------------------------------------
/docs/methods/ditfastattn_zh.md:
--------------------------------------------------------------------------------
1 | ### DiTFastAttn: Attention Compression for Diffusion Transformer Models
2 |
3 | [DiTFastAttn](https://github.com/thu-nics/DiTFastAttn) is an acceleration solution for single-GPU DiT inference that uses Input Temporal Reduction to cut computation in the following three ways:
4 |
5 | 1. Window Attention with Residual Caching to reduce spatial redundancy.
6 | 2. Temporal Similarity Reduction to exploit the similarity between steps.
7 | 3. Conditional Redundancy Elimination to skip redundant computations during conditional generation
8 |
9 | Currently, DiTFastAttn can only be used with data parallelism or on a single GPU; it does not support other parallel methods such as USP and PipeFusion. We plan to implement a parallel version of DiTFastAttn in the future.
10 | 
11 | ## Download the COCO Dataset
12 | ```
13 | wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
14 | unzip annotations_trainval2014.zip
15 | ```
16 |
17 | ## Running
18 | 
19 | Modify the dataset path in the script, then run
20 |
21 | ```
22 | bash examples/run_fastditattn.sh
23 | ```
24 |
25 | ## Citation
26 |
27 | ```
28 | @misc{yuan2024ditfastattn,
29 | title={DiTFastAttn: Attention Compression for Diffusion Transformer Models},
30 | author={Zhihang Yuan and Pu Lu and Hanling Zhang and Xuefei Ning and Linfeng Zhang and Tianchen Zhao and Shengen Yan and Guohao Dai and Yu Wang},
31 | year={2024},
32 | eprint={2406.08552},
33 | archivePrefix={arXiv},
34 | }
35 | ```
--------------------------------------------------------------------------------
/docs/performance/stepvideo_zh.md:
--------------------------------------------------------------------------------
1 | ## Step-Video-T2V 30B Performance
2 | 
3 | ### Evaluation Setup
4 | 
5 | We benchmark SP (sequence) parallelism and TP (tensor) parallelism using the open-source Step-Video-T2V 30B model, with ulysses_degree used as the sp_degree.
6 | 
7 | Launch scripts for the tests: https://github.com/stepfun-ai/Step-Video-T2V/tree/main#multi-gpu-parallel-deployment
8 | 
9 | ### Nvidia H20*8 (NVLINK)
10 | 
11 | #### Parallel Strategy Comparison
12 | | #GPUs | Parallel Type | Configuration | Latency | Scaling Efficiency | Memory Usage |
13 | |--------|----------|--------|---------|---------|--------------------|
14 | | 1 | Baseline | `TP1 SP1` | 213.60s | 1.00x | 92,170M |
15 | | 2 | TP | `TP2` | 108.97s | 0.98x | 57,458M ▼37.7% |
16 | | 2 | SP | `SP2` | 108.13s | 0.99x | 86,258M ▼6.4% |
17 | | 4 | TP | `TP4` | 57.61s | 0.93x | 36,566M ▼60.3% |
18 | | 4 | SP | `SP4` | 57.01s | 0.94x | 78,226M ▼15.1% |
19 | | 8 | TP | `TP8` | 30.40s | 0.88x | 30,028M ▼67.4% |
20 | | 8 | SP | `SP8` | 30.10s | 0.89x | 79,684M ▼13.5% |
21 |
22 |
23 | #### Key Findings
24 | - **Hardware compatibility**:
25 |   - Consumer-grade GPUs (5090/5090D): fully support training workloads with a 32GB*8 configuration
26 |   - Professional inference GPUs (L20/L40): full-parameter inference with a 48GB*4 configuration
27 | 
28 | - **Efficiency**:
29 |   - The TP8 strategy saves up to 67.4% of memory (53.9% more than SP8)
30 |   - The latency reduction of hybrid parallelism deviates from the theoretical value by <12%
31 | 
32 | - **Scalability**:
33 |   - Multi-dimensional parameter slicing achieves near-linear scaling efficiency
34 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 | permissions:
16 | contents: read
17 |
18 | jobs:
19 | deploy:
20 |
21 | runs-on: ubuntu-latest
22 |
23 | steps:
24 | - uses: actions/checkout@v4
25 | - name: Set up Python
26 | uses: actions/setup-python@v3
27 | with:
28 | python-version: '3.x'
29 | - name: Install dependencies
30 | run: |
31 | python -m pip install --upgrade pip
32 | pip install build
33 | - name: Build package
34 | run: python -m build
35 | - name: Publish package
36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37 | with:
38 | user: __token__
39 | password: ${{ secrets.PYPI_API_TOKEN }}
40 |
--------------------------------------------------------------------------------
/examples/run_hunyuan_video_usp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 |
4 | export PYTHONPATH=$PWD:$PYTHONPATH
5 |
6 | # HunyuanVideo configuration
7 | SCRIPT="hunyuan_video_usp_example.py"
8 | MODEL_ID="/cfs/dit/HunyuanVideo"
9 | # MODEL_ID="tencent/HunyuanVideo"
10 | INFERENCE_STEP=50
11 |
12 | mkdir -p ./results
13 |
14 | # HunyuanVideo specific task args
15 | TASK_ARGS="--height 720 --width 1280 --num_frames 129 --guidance_scale 5.0"
16 |
17 | # HunyuanVideo parallel configuration
18 | N_GPUS=8
19 | PARALLEL_ARGS="--ulysses_degree 4 --ring_degree 2"
20 | # CFG_ARGS="--use_cfg_parallel"
21 |
22 | # Uncomment and modify these as needed
23 | # PIPEFUSION_ARGS="--num_pipeline_patch 8"
24 | # OUTPUT_ARGS="--output_type latent"
25 | # PARALLEL_VAE="--use_parallel_vae"
26 | ENABLE_TILING="--enable_tiling"
27 | ENABLE_MODEL_CPU_OFFLOAD="--enable_model_cpu_offload"
28 | # COMPILE_FLAG="--use_torch_compile"
29 |
30 | torchrun --nproc_per_node=$N_GPUS ./examples/$SCRIPT \
31 | --model $MODEL_ID \
32 | $PARALLEL_ARGS \
33 | $TASK_ARGS \
34 | $PIPEFUSION_ARGS \
35 | $OUTPUT_ARGS \
36 | --num_inference_steps $INFERENCE_STEP \
37 | --warmup_steps 0 \
38 | --prompt "A cat walks on the grass, realistic" \
39 | $CFG_ARGS \
40 |   $PARALLEL_VAE \
41 | $ENABLE_TILING \
42 | $ENABLE_MODEL_CPU_OFFLOAD \
43 | $COMPILE_FLAG
44 |
--------------------------------------------------------------------------------
/docs/performance/sana.md:
--------------------------------------------------------------------------------
1 | ## Performance of SANA
2 | [Chinese Version](./sana_zh.md)
3 |
4 | We use the open-source version of `Sana_1600M_4Kpx_BF16_diffusers` for performance evaluation.
5 |
6 | Currently, xDiT supports acceleration for SANA with Pipefusion, Ulysses, Ring, CFG, and any combination thereof. Due to the limitation of the Head channel in the SANA network, the maximum parallelism supported by Ulysses is 2. We tested latency on an 8xA100 (NVLink) machine by generating 4096x4096 images with 20 steps. The measured latencies are shown in the table below. It can be seen that CFG achieves the best acceleration effect, while the other three acceleration strategies have similar performance. In the case of 8 GPUs, up to 4.4x generation acceleration can be achieved.
7 |
8 | | #GPUs | cfg | ulysses | ring | pp | Latency (seconds) |
9 | |---|---|---|---|---|---|
10 | | 1 | 1 | 1 | 1 | 1 | 17.551 |
11 | | 2 | 1 | 1 | 1 | 2 | 11.276 |
12 | | 2 | 1 | 1 | 2 | 1 | 11.447 |
13 | | 2 | 1 | 2 | 1 | 1 | 10.175 |
14 | | 2 | 2 | 1 | 1 | 1 | 8.365 |
15 | | 4 | 2 | 1 | 1 | 2 | 5.599 |
16 | | 4 | 2 | 1 | 2 | 1 | 5.702 |
17 | | 4 | 2 | 2 | 1 | 1 | 5.803 |
18 | | 8 | 2 | 1 | 1 | 4 | 4.050 |
19 | | 8 | 2 | 1 | 2 | 2 | 4.091 |
20 | | 8 | 2 | 1 | 4 | 1 | 4.003 |
21 | | 8 | 2 | 2 | 1 | 2 | 4.201 |
22 | | 8 | 2 | 2 | 2 | 1 | 3.991 |
--------------------------------------------------------------------------------
/docs/performance/sd3.md:
--------------------------------------------------------------------------------
1 | ## Performance of Stable Diffusion 3
2 | [Chinese Version](./sd3_zh.md)
3 |
4 | We conducted performance evaluations using the open-source version of the stable-diffusion-3-medium-diffusers 2B model.
5 |
6 | On an 8xA100 (NVLink) machine, the optimal parallelization strategy varied depending on the number of GPUs used, highlighting the importance of diverse and hybrid parallel approaches. The best parallel strategies for different GPU scales were as follows: with 2 GPUs, `cfg_parallel=2` was used; with 4 GPUs, `cfg_parallel=2, pipefusion_parallel=2` was employed; and with 8 GPUs, `cfg_parallel=2, pipefusion_parallel=4` was utilized.
7 |
8 | torch.compile provided acceleration in all scenarios except for the 8 GPU configuration.
9 |
10 |
11 |

13 |
14 |
15 | The latency situation on 8xL40 (PCIe) is depicted in the graph below. Similarly, the optimal parallel strategies varied with different GPU scales. torch.compile delivered acceleration in all cases.
16 |
17 |
18 |

20 |
--------------------------------------------------------------------------------
/examples/ray/README.md:
--------------------------------------------------------------------------------
1 | ## Running DiT Backbone and VAE Module Separately
2 |
3 | The DiT model typically consists of a DiT backbone (encoder + transformers) and a VAE module.
4 | The DiT backbone module has high computational requirements but stable memory usage.
5 | For high-resolution images, the VAE module has high memory consumption due to temporary memory spikes from convolution operators, despite its low computational requirements. This often leads to OOM (Out of Memory) issues caused by the VAE module.
6 |
7 | Therefore, separating the encoder + DiT backbone from the VAE module can effectively alleviate OOM issues.
8 | We use Ray to implement the separation of backbone and VAE functionality, and allocate different GPU parallelism for VAE and DiT backbone.
9 |
10 | In `ray_run.sh`, we define different model configurations.
11 | For example, if we use 3 GPUs and want to allocate 1 GPU for VAE and 2 GPUs for DiT backbone, the settings in `ray_run.sh` would be:
12 |
13 | ```
14 | N_GPUS=3 # world size
15 | PARALLEL_ARGS="--pipefusion_parallel_degree 2 --ulysses_degree 1 --ring_degree 1"
16 | VAE_PARALLEL_SIZE=1
17 | DIT_PARALLEL_SIZE=2
18 | ```
19 |
20 | Here, `VAE_PARALLEL_SIZE` specifies the parallelism for the VAE, `DIT_PARALLEL_SIZE` defines the DiT parallelism, and `PARALLEL_ARGS` contains the parallel configuration for the DiT backbone, which in this case uses PipeFusion to run on 2 GPUs.
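
As a quick sanity check of how these numbers relate (a hypothetical helper, not part of xDiT), the world size must cover the VAE group plus the DiT group, and the DiT parallel degrees must multiply to the DiT group size:

```python
# Hypothetical helper that mirrors the constraints described above.
def check_allocation(n_gpus, vae_parallel_size, dit_parallel_size,
                     pipefusion_degree, ulysses_degree, ring_degree):
    # World size = GPUs for the VAE group + GPUs for the DiT backbone group.
    assert n_gpus == vae_parallel_size + dit_parallel_size
    # The DiT parallel degrees must multiply to the DiT group size.
    assert pipefusion_degree * ulysses_degree * ring_degree == dit_parallel_size


# The ray_run.sh example above: 3 GPUs = 1 for the VAE + 2 for the DiT backbone,
# with the DiT backbone parallelized by PipeFusion across its 2 GPUs.
check_allocation(3, 1, 2, pipefusion_degree=2, ulysses_degree=1, ring_degree=1)
```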
21 |
22 |
23 |
--------------------------------------------------------------------------------
/docs/developer/adding_models/adding_model_cfg_usp.md:
--------------------------------------------------------------------------------
1 | # Parallelize new models with CFG parallelism and USP provided by xDiT
2 |
3 | The following tutorials provide detailed instructions on how to implement CFG parallelism and USP (Unified Sequence Parallelism), both supported by xDiT, for a new DiT model:
4 |
5 | [Parallelize new models with CFG parallelism provided by xDiT](adding_model_cfg.md)
6 |
7 | [Parallelize new models with USP provided by xDiT](adding_model_usp.md)
8 |
9 | [Parallelize new models with USP provided by xDiT (text replica)](adding_model_usp_text_replica.md)
10 |
11 | Both parallelization techniques can be concurrently employed. To achieve this, specify the level of parallelization for both CFG parallelism and USP as demonstrated below. The number of GPUs should be twice the product of the degrees of ulysses attention and ring attention:
12 |
13 | ```python
14 | from xfuser.core.distributed import initialize_model_parallel
15 | initialize_model_parallel(
16 |     ring_degree=<ring_degree>,
17 |     ulysses_degree=<ulysses_degree>,
18 |     classifier_free_guidance_degree=2,
19 | )
20 | # restriction: dist.get_world_size() == 2 x <ring_degree> x <ulysses_degree>
21 | ```
22 |
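For instance, on 8 GPUs one valid assignment (an illustrative choice, not the only one) uses `ring_degree=2` and `ulysses_degree=2`:

```python
import torch.distributed as dist
from xfuser.core.distributed import initialize_model_parallel

# 8 GPUs split as 2 (CFG) x 2 (ring) x 2 (ulysses).
initialize_model_parallel(
    ring_degree=2,
    ulysses_degree=2,
    classifier_free_guidance_degree=2,
)
assert dist.get_world_size() == 2 * 2 * 2
```
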
23 | Following this, both CFG parallelism and USP can be simultaneously implemented. For a comprehensive example script showcasing this approach, refer to [adding_model_cfg_usp.py](adding_model_cfg_usp.py).
24 |
--------------------------------------------------------------------------------
/tests/parallel_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from diffusers import StableDiffusion3Pipeline, FluxPipeline
3 |
4 | from xfuser import xFuserArgs
5 | from xfuser.parallel import xDiTParallel
6 | from xfuser.config import FlexibleArgumentParser
7 | from xfuser.core.distributed import get_world_group
8 |
9 |
10 | def main():
11 | parser = FlexibleArgumentParser(description="xFuser Arguments")
12 | args = xFuserArgs.add_cli_args(parser).parse_args()
13 | engine_args = xFuserArgs.from_cli_args(args)
14 | engine_config, input_config = engine_args.create_config()
15 |
16 | local_rank = get_world_group().local_rank
17 | pipe = StableDiffusion3Pipeline.from_pretrained(
18 | pretrained_model_name_or_path=engine_config.model_config.model,
19 | torch_dtype=torch.float16,
20 | ).to(f"cuda:{local_rank}")
21 |
22 | paralleler = xDiTParallel(pipe, engine_config, input_config)
23 |
24 | paralleler(
25 | height=input_config.height,
26 |         width=input_config.width,
27 | prompt=input_config.prompt,
28 | num_inference_steps=input_config.num_inference_steps,
29 | output_type=input_config.output_type,
30 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
31 | )
32 | if input_config.output_type == "pil":
33 | paralleler.save("results", "stable_diffusion_3")
34 |
35 |
36 | if __name__ == "__main__":
37 | main()
38 |
--------------------------------------------------------------------------------
/benchmark/fid/generate.sh:
--------------------------------------------------------------------------------
1 | set -x
2 |
3 | export PYTHONPATH=$PWD:$PYTHONPATH
4 | export CAPTION_FILE="dataset_coco.json"
5 | export SAMPLE_IMAGES_FOLDER="sample_images"
6 |
7 | # Select the model type
8 | export MODEL_TYPE="Pixart-alpha"
9 | # Configuration for different model types
10 | # script, model_id, inference_step
11 | declare -A MODEL_CONFIGS=(
12 | ["Pixart-alpha"]="pixartalpha_generate.py /cfs/dit/PixArt-XL-2-256-MS 20"
13 | ["Flux"]="flux_generate.py /cfs/dit/FLUX.1-dev 28"
14 | )
15 |
16 | if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then
17 | IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}"
18 | export SCRIPT MODEL_ID INFERENCE_STEP
19 | else
20 | echo "Invalid MODEL_TYPE: $MODEL_TYPE"
21 | exit 1
22 | fi
23 |
24 | # task args
25 | TASK_ARGS="--height 256 --width 256 --no_use_resolution_binning"
26 |
27 | N_GPUS=8
28 | PARALLEL_ARGS="--pipefusion_parallel_degree 8 --ulysses_degree 1 --ring_degree 1"
29 |
30 | torchrun --nproc_per_node=$N_GPUS ./benchmark/fid/$SCRIPT \
31 | --model $MODEL_ID \
32 | $PARALLEL_ARGS \
33 | $TASK_ARGS \
34 | $PIPEFUSION_ARGS \
35 | $OUTPUT_ARGS \
36 | --num_inference_steps $INFERENCE_STEP \
37 | --warmup_steps 1 \
38 | --prompt "brown dog laying on the ground with a metal bowl in front of him." \
39 | $CFG_ARGS \
40 | $PARALLLEL_VAE \
41 | $COMPILE_FLAG \
42 | --caption_file $CAPTION_FILE \
43 |     --sample_images_folder $SAMPLE_IMAGES_FOLDER \
44 |
--------------------------------------------------------------------------------
/tests/context_parallel/debug_flux_usp_example.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import subprocess
4 | import shlex
5 | from pathlib import Path
6 |
7 | os.environ["HF_HUB_CACHE"] = "/mnt/co-research/shared-models/hub"
8 |
9 | root_dir = Path(__file__).parents[2].absolute()
10 | #os.environ["PYTHONPATH"] = f"{WD}:{os.getenv('PYTHONPATH', '')}"
11 | examples_dir = root_dir / "examples"
12 | flux_script = examples_dir / "flux_usp_example.py"
13 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
14 | n_gpus = 2
15 |
16 | model_id = "black-forest-labs/FLUX.1-dev"
17 | inference_steps = 28
18 | warmup_steps = 3
19 | max_sequence_length = 512
20 | height = 1024
21 | width = 1024
22 | task_args = f"--max-sequence-length {max_sequence_length} --height {height} --width {width}"
23 | pipefusion_parallel_degree = 1
24 | ulysses_degree = 2
25 | ring_degree = 1
26 | parallel_args = (
27 | f"--pipefusion_parallel_degree {pipefusion_parallel_degree} "
28 | f"--ulysses_degree {ulysses_degree} "
29 | f"--ring_degree {ring_degree} "
30 | )
31 | compile_flag = "--use_torch_compile"
32 |
33 | cmd_str: str = (
34 | f"{sys.executable} -m torch.distributed.run --nproc_per_node={n_gpus} {flux_script.as_posix()} "
35 | f"--model {model_id} "
36 | f"{parallel_args} "
37 | f"{task_args} "
38 | f"--num_inference_steps {inference_steps} "
39 | f"--warmup_steps {warmup_steps} "
40 | f"--prompt \"A dark tree.\" "
41 | )
42 | cmd: list[str] = shlex.split(cmd_str)
43 | print(cmd)
44 | subprocess.run(cmd, check=True)
--------------------------------------------------------------------------------
/docs/performance/hunyuanvideo.md:
--------------------------------------------------------------------------------
1 | ## HunyuanVideo Performance Report
2 |
3 | xDiT is [HunyuanVideo](https://github.com/Tencent/HunyuanVideo?tab=readme-ov-file#-parallel-inference-on-multiple-gpus-by-xdit)'s official parallel inference engine. On H100 and H20 GPUs, xDiT reduces the generation time of 1280x720 videos from 31 minutes to 5 minutes, and of 960x960 videos from 28 minutes to 6 minutes.
4 |
5 | The H100 and H20 benchmarks were run with the official HunyuanVideo repository, while the L20 benchmarks use the `diffusers` implementation.
6 | The L20 numbers were measured with this [script](examples/hunyuan_video_usp_example.py), along with `flash-attn==2.7.2.post1` and CUDA 12.4.
7 |
8 | ### 1280x720 Resolution (129 frames, 50 steps) - Ulysses Latency (seconds)
9 |
10 |
11 |
12 | | GPU Type | 1 GPU | 2 GPUs | 4 GPUs | 8 GPUs |
13 | |----------|--------|---------|---------|---------|
14 | | H100 | 1,904.08 | 925.04 | 514.08 | 337.58 |
15 | | H20 | 6,639.17 | 3,400.55 | 1,762.86 | 940.97 |
16 | | L20 | 6,043.88 | 3,271.44 | 2,080.05 | |
17 |
18 |
19 |
20 | ### 960x960 Resolution (129 frames, 50 steps) - Ulysses Latency (seconds)
21 |
22 |
23 |
24 | | GPU Type | 1 GPU | 2 GPUs | 3 GPUs | 6 GPUs |
25 | |----------|--------|---------|---------|---------|
26 | | H100 | 1,735.01 | 934.09 | 645.45 | 367.02 |
27 | | H20 | 6,621.46 | 3,400.55 | 2,310.48 | 1,214.67 |
28 | | L20 | 6,039.08 | 3,260.62 | 2,284.74 | |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/xfuser/model_executor/layers/base_layer.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod, ABCMeta
2 | from typing import List
3 |
4 | import torch
5 | import torch.nn as nn
6 |
7 | from xfuser.config.config import InputConfig, ParallelConfig, RuntimeConfig
8 | from xfuser.model_executor.base_wrapper import xFuserBaseWrapper
9 |
10 |
11 | class xFuserLayerBaseWrapper(nn.Module, xFuserBaseWrapper, metaclass=ABCMeta):
12 |
13 | def __init__(self, module: nn.Module):
14 | super().__init__()
15 | super(nn.Module, self).__init__(module=module)
16 | self.activation_cache = None
17 |
18 | def __getattr__(self, name: str):
19 | if "_parameters" in self.__dict__:
20 | _parameters = self.__dict__["_parameters"]
21 | if name in _parameters:
22 | return _parameters[name]
23 | if "_buffers" in self.__dict__:
24 | _buffers = self.__dict__["_buffers"]
25 | if name in _buffers:
26 | return _buffers[name]
27 | if "_modules" in self.__dict__:
28 | modules = self.__dict__["_modules"]
29 | if name in modules:
30 | return modules[name]
31 | try:
32 | return getattr(self.module, name)
33 | except RecursionError:
34 | raise AttributeError(
35 | f"module {type(self.module).__name__} has no " f"attribute {name}"
36 | )
37 |
38 | @abstractmethod
39 | def forward(self, *args, **kwargs):
40 | pass
41 |
--------------------------------------------------------------------------------
/xfuser/model_executor/models/customized/step_video_t2v/linear.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from xfuser.core.distributed.parallel_state import (
4 | get_tensor_model_parallel_rank,
5 | get_tp_group,
6 | get_tensor_model_parallel_world_size
7 | )
8 |
9 |
10 | class ColumnParallelLinear(nn.Linear):
11 | def __init__(self, in_features, out_features, bias=True, gather_output=True, tp_group=None):
12 | self.tp_size = get_tensor_model_parallel_world_size()
13 | self.tp_rank = get_tensor_model_parallel_rank()
14 | self.tp_group = tp_group or get_tp_group()
15 |
16 | super().__init__(in_features, out_features, bias=bias)
17 |
18 | def forward(self, x):
19 | x = super().forward(x)
20 | return x
21 |
22 |
23 | class RowParallelLinear(nn.Linear):
24 | def __init__(self, in_features, out_features, bias=True, input_is_parallel=True, tp_group=None):
25 | self.tp_size = get_tensor_model_parallel_world_size()
26 | self.tp_rank = get_tensor_model_parallel_rank()
27 | self.tp_group = tp_group or get_tp_group()
28 | self.input_is_parallel = input_is_parallel
29 |
30 | super().__init__(in_features, out_features, bias=bias)
31 |
32 | def forward(self, x):
33 | if not self.input_is_parallel:
34 | x = torch.chunk(x, self.tp_size, dim=-1)[self.tp_rank]
35 | x = super().forward(x)
36 |         # All-reduce to aggregate the partial results across tensor-parallel ranks
37 | x = self.tp_group.all_reduce(x)
38 | return x
39 |
--------------------------------------------------------------------------------
/docs/methods/ditfastattn.md:
--------------------------------------------------------------------------------
1 | ### DiTFastAttn: Attention Compression for Diffusion Transformer Models
2 |
3 | [DiTFastAttn](https://github.com/thu-nics/DiTFastAttn) is an acceleration solution for single-GPU DiTs inference, utilizing Input Temporal Reduction to reduce computational complexity through the following three methods:
4 |
5 | 1. Window Attention with Residual Caching to reduce spatial redundancy.
6 | 2. Temporal Similarity Reduction to exploit the similarity between steps.
7 | 3. Conditional Redundancy Elimination to skip redundant computations during conditional generation
8 |
9 | Currently, DiTFastAttn can only be used with data parallelism or on a single GPU. It does not support other parallel methods such as USP and PipeFusion. We plan to implement a parallel version of DiTFastAttn in the future.
10 |
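To build intuition for the second technique above, here is a toy sketch (not DiTFastAttn's actual mechanism; the cosine-similarity test and the threshold are invented for illustration) of caching an attention output and reusing it when the input changes little between steps:

```python
import torch
import torch.nn.functional as F


class StepSimilarityCache:
    """Toy cache that reuses an attention output across diffusion steps."""

    def __init__(self, threshold: float = 0.98):
        self.threshold = threshold
        self.prev_input = None
        self.prev_output = None

    def __call__(self, hidden_states: torch.Tensor, attn_fn):
        # Reuse the cached output when the input barely changed since the last step.
        if self.prev_input is not None and self.prev_input.shape == hidden_states.shape:
            sim = F.cosine_similarity(
                hidden_states.flatten(), self.prev_input.flatten(), dim=0
            )
            if sim > self.threshold:
                return self.prev_output
        out = attn_fn(hidden_states)
        self.prev_input, self.prev_output = hidden_states.detach(), out.detach()
        return out
```
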
11 | ## Download COCO Dataset
12 | ```
13 | wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
14 | unzip annotations_trainval2014.zip
15 | ```
16 |
17 | ## Running
18 |
19 | Modify the dataset path in the script, then run
20 |
21 | ```
22 | bash examples/run_fastditattn.sh
23 | ```
24 |
25 | ## Reference
26 |
27 | ```
28 | @misc{yuan2024ditfastattn,
29 | title={DiTFastAttn: Attention Compression for Diffusion Transformer Models},
30 | author={Zhihang Yuan and Pu Lu and Hanling Zhang and Xuefei Ning and Linfeng Zhang and Tianchen Zhao and Shengen Yan and Guohao Dai and Yu Wang},
31 | year={2024},
32 | eprint={2406.08552},
33 | archivePrefix={arXiv},
34 | }
35 | ```
--------------------------------------------------------------------------------
/tests/core/test_envs.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from unittest.mock import patch
3 | import torch
4 | from xfuser import envs
5 |
6 | class TestEnvs(unittest.TestCase):
7 |
8 | @patch('torch.cuda.is_available', return_value=True)
9 | def test_get_device_cuda(self, mock_is_available):
10 | device = envs.get_device(0)
11 | self.assertEqual(device.type, 'cuda')
12 | self.assertEqual(device.index, 0)
13 | device_name = envs.get_device_name()
14 | self.assertEqual(device_name, 'cuda')
15 |
16 | @patch('torch.cuda.is_available', return_value=False)
17 | @patch('xfuser.envs._is_mps', return_value=True)
18 | def test_get_device_mps(self, mock_is_mps, mock_is_available):
19 | device = envs.get_device(0)
20 | self.assertEqual(device.type, 'mps')
21 | device_name = envs.get_device_name()
22 | self.assertEqual(device_name, 'mps')
23 | # test that getting CUDA_VERSION does not raise an error
24 | cuda_version = envs.CUDA_VERSION
25 | self.assertIsNotNone(cuda_version)
26 |
27 | @patch('torch.cuda.is_available', return_value=False)
28 | @patch('xfuser.envs._is_mps', return_value=False)
29 | @patch('xfuser.envs._is_musa', return_value=False)
30 | def test_get_device_cpu(self, mock_is_musa, mock_is_mps, mock_is_available):
31 | device = envs.get_device(0)
32 | self.assertEqual(device.type, 'cpu')
33 | device_name = envs.get_device_name()
34 | self.assertEqual(device_name, 'cpu')
35 |
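# Run with: python tests/core/test_envs.py (or collect it via pytest)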
36 | if __name__ == '__main__':
37 | unittest.main()
38 |
--------------------------------------------------------------------------------
/xfuser/model_executor/models/transformers/__init__.py:
--------------------------------------------------------------------------------
1 | from xfuser.config.diffusers import has_valid_diffusers_version
2 | from .register import xFuserTransformerWrappersRegister
3 | from .base_transformer import xFuserTransformerBaseWrapper
4 | from .pixart_transformer_2d import xFuserPixArtTransformer2DWrapper
5 | from .transformer_sd3 import xFuserSD3Transformer2DWrapper
6 | from .latte_transformer_3d import xFuserLatteTransformer3DWrapper
7 | from .hunyuan_transformer_2d import xFuserHunyuanDiT2DWrapper
8 | from .cogvideox_transformer_3d import xFuserCogVideoXTransformer3DWrapper
9 | from .consisid_transformer_3d import xFuserConsisIDTransformer3DWrapper
10 | from .sana_transformer_2d import xFuserSanaTransformer2DWrapper
11 |
12 | __all__ = [
13 | "xFuserTransformerWrappersRegister",
14 | "xFuserTransformerBaseWrapper",
15 | "xFuserPixArtTransformer2DWrapper",
16 | "xFuserSD3Transformer2DWrapper",
17 | "xFuserLatteTransformer3DWrapper",
18 | "xFuserCogVideoXTransformer3DWrapper",
19 | "xFuserHunyuanDiT2DWrapper",
20 | "xFuserConsisIDTransformer3DWrapper",
21 | "xFuserSanaTransformer2DWrapper"
22 | ]
23 |
24 | # Gating some imports based on diffusers version, as they import part of diffusers
25 | if has_valid_diffusers_version("flux"):
26 | from .transformer_flux import xFuserFluxTransformer2DWrapper
27 | __all__.append("xFuserFluxTransformer2DWrapper")
28 |
29 |
30 | if has_valid_diffusers_version("zimage"):
31 | from .transformer_z_image import xFuserZImageTransformer2DWrapper
32 | __all__.append("xFuserZImageTransformer2DWrapper")
--------------------------------------------------------------------------------
/examples/run_multinodes.sh:
--------------------------------------------------------------------------------
1 | set -x
2 |
3 | # nccl settings
4 | #export NCCL_DEBUG=INFO
5 |
6 | export CUDA_DEVICE_MAX_CONNECTIONS=1
7 | export NCCL_SOCKET_IFNAME=eth0
8 | export GLOO_SOCKET_IFNAME=eth0
9 | export NCCL_P2P_DISABLE=1
10 |
11 | #export NCCL_IB_GID_INDEX=3
12 | #export NCCL_IB_DISABLE=0
13 | #export NCCL_NET_GDR_LEVEL=2
14 | #export NCCL_IB_QPS_PER_CONNECTION=4
15 | #export NCCL_IB_TC=160
16 | #export NCCL_IB_TIMEOUT=22
17 | # export NCCL_P2P=0
18 |
19 | export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
20 |
21 | export PYTHONPATH=$PWD:$PYTHONPATH
22 |
23 |
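# Launch this script once per node: set NRANK to the node's index (0..nnodes-1)
# and point MASTERIP/MASTERPORT at the rank-0 node.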
24 | NRANK=0
25 | MASTERIP=127.0.0.1
26 | MASTERPORT=6000
27 | DISTARGS="--nnodes=2 --node_rank=${NRANK} --master_addr=${MASTERIP} --master_port=${MASTERPORT}"
28 |
29 | SCRIPT=pixartalpha_example.py
30 | MODEL_ID="/cfs/dit/PixArt-XL-2-1024-MS/"
31 | INFERENCE_STEP=20
32 |
33 | SIZE=1024
34 | GUIDANCE_SCALE=4.5
35 | PARALLEL_ARGS="--ulysses_degree=1 --ring_degree=1 --pipefusion_parallel_degree=8"
36 | TASK_ARGS="--height=${SIZE} --width=${SIZE} --no_use_resolution_binning --guidance_scale=${GUIDANCE_SCALE}"
37 | OUTPUT_ARGS="--output_type=latent"
38 | CFG_ARGS="--use_cfg_parallel"
39 |
40 | # PARALLEL_VAE="--use_parallel_vae"
41 | # COMPILE_FLAG="--use_torch_compile"
42 |
43 | torchrun --nproc_per_node=8 $DISTARGS \
44 | ./examples/$SCRIPT \
45 | --model=$MODEL_ID \
46 | $PARALLEL_ARGS \
47 | $TASK_ARGS \
48 | $OUTPUT_ARGS \
49 | --num_inference_steps $INFERENCE_STEP \
50 | --warmup_steps=1 \
51 | --prompt="brown dog laying on the ground with a metal bowl in front of him." \
52 | $CFG_ARGS \
53 |     $PARALLEL_VAE \
54 | $COMPILE_FLAG
55 |
--------------------------------------------------------------------------------
/tests/layers/feedforward_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import unittest
3 | from diffusers.models.attention import FeedForward
4 | from xfuser.model_executor.layers.feedforward import xFuserFeedForwardWrapper
5 | from xfuser.core.distributed import (
6 | init_distributed_environment,
7 | initialize_model_parallel,
8 | )
9 | from torch import distributed as dist
10 |
11 |
12 | class TestFeedForward(unittest.TestCase):
13 | def setUp(self):
14 | init_distributed_environment()
15 |
16 | self.world_size = dist.get_world_size()
17 | self.local_rank = dist.get_rank()
18 |
19 | initialize_model_parallel(tensor_parallel_degree=self.world_size)
20 |
21 | def test_feedforward(self):
22 | torch.manual_seed(0)
23 | self.input_data = torch.ones(1, 20).cuda(self.local_rank)
24 | dist.broadcast(self.input_data, src=0)
25 |
26 | torch.manual_seed(0)
27 | self.model1 = FeedForward(20, 5, bias=True, activation_fn="geglu").cuda(
28 | self.local_rank
29 | )
30 |
31 | # Broadcast the parameters
32 | for param in self.model1.parameters():
33 | dist.broadcast(param.data, src=0)
34 |
35 | output1 = self.model1(self.input_data)
36 |
37 | self.model2 = xFuserFeedForwardWrapper(self.model1)
38 | output2 = self.model2(self.input_data)
39 |
40 | print(output1 - output2)
41 | self.assertTrue(torch.allclose(output1, output2, atol=1e-2))
42 |
43 |
44 | # torchrun --nproc_per_node=2 ./tests/layers/feedforward_test.py
45 | if __name__ == "__main__":
46 | unittest.main()
47 |
--------------------------------------------------------------------------------
/benchmark/fid/README.md:
--------------------------------------------------------------------------------
1 | ### Procedure
2 | #### Prerequisite
3 | First, install the following additional dependencies before testing:
4 | ```
5 | pip3 install clean-fid
6 | ```
7 |
8 | #### Reference Batch Preparation
9 | Download the COCO dataset from [here](https://huggingface.co/datasets/HuggingFaceM4/COCO); only the validation set and the caption annotations are needed. Unzip [val2014.zip](http://images.cocodataset.org/zips/val2014.zip) and [caption_datasets.zip](https://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip) and you'll get files laid out as follows:
10 | ```
11 | val2014/
12 | COCO_val2014_000000xxxxxx.jpg
13 | ...
14 | dataset_coco.json
15 | dataset_flickr30k.json
16 | dataset_flickr8k.json
17 | ```
18 | Then run the following command to process the reference images:
19 | ```
20 | python3 process_ref_data.py --coco_json dataset_coco.json --num_samples 30000 --input_dir $PATH_TO_VAL2014 --output_dir $REF_IMAGES_FOLDER
21 | ```
22 |
23 | #### Sample Batch Generation
24 | Run the following command to generate the sample images:
25 | ```
26 | bash ./benchmark/fid/generate.sh
27 | ```
28 | You can edit `generate.sh` to change the model type, caption file, sample images folder, etc.
29 |
30 | #### Evaluate the results using clean-fid
31 | After completing the above procedure, you'll have the reference images and the generated images in `$REF_IMAGES_FOLDER` and `$SAMPLE_IMAGES_FOLDER` (replace them with the corresponding folders). You can evaluate the results with `compute_fid.py` by running:
32 |
33 | ```
34 | python compute_fid.py --ref_path $REF_IMAGES_FOLDER --sample_path $SAMPLE_IMAGES_FOLDER
35 | ```
36 |
--------------------------------------------------------------------------------
/xfuser/model_executor/schedulers/base_scheduler.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod, ABCMeta
2 | from functools import wraps
3 | from typing import List
4 |
5 | from diffusers.schedulers import SchedulerMixin
6 | from xfuser.core.distributed import (
7 | get_pipeline_parallel_world_size,
8 | get_sequence_parallel_world_size,
9 | )
10 | from xfuser.model_executor.base_wrapper import xFuserBaseWrapper
11 |
12 |
13 | class xFuserSchedulerBaseWrapper(xFuserBaseWrapper, metaclass=ABCMeta):
14 | def __init__(
15 | self,
16 | module: SchedulerMixin,
17 | ):
18 | super().__init__(
19 | module=module,
20 | )
21 |
22 | def __setattr__(self, name, value):
23 | if name == "module":
24 | super().__setattr__(name, value)
25 | elif (
26 | hasattr(self, "module")
27 | and self.module is not None
28 | and hasattr(self.module, name)
29 | ):
30 | setattr(self.module, name, value)
31 | else:
32 | super().__setattr__(name, value)
33 |
34 | @abstractmethod
35 | def step(self, *args, **kwargs):
36 | pass
37 |
38 | @staticmethod
39 | def check_to_use_naive_step(func):
40 | @wraps(func)
41 | def check_naive_step_fn(self, *args, **kwargs):
42 | if (
43 | get_pipeline_parallel_world_size() == 1
44 | and get_sequence_parallel_world_size() == 1
45 | ):
46 | return self.module.step(*args, **kwargs)
47 | else:
48 | return func(self, *args, **kwargs)
49 |
50 | return check_naive_step_fn
51 |
--------------------------------------------------------------------------------
/docs/methods/parallel_vae.md:
--------------------------------------------------------------------------------
1 | ## Patch Parallel VAE
2 |
3 | The [stabilityai/sd-vae-ft-mse](https://huggingface.co/stabilityai/sd-vae-ft-mse) VAE adopted by diffusers runs out of memory (OOM) on high-resolution images (8192px on an A100). A critical issue is the CUDA memory spike, as documented in [diffusers/issues/5924](https://github.com/huggingface/diffusers/issues/5924).
4 |
5 | To address this limitation, we developed [DistVAE](https://github.com/xdit-project/DistVAE), a solution that enables efficient parallel processing of high-resolution images. Our approach incorporates two key strategies:
6 |
7 | * Patch Parallel: We divide the feature maps in the latent space into multiple patches and perform sequence-parallel VAE decoding across different devices. This technique reduces the peak memory required for intermediate activations to $1/N$, where $N$ is the number of devices used.
8 | For the convolutional operators in the VAE, this requires communicating the halo regions of the feature maps across devices, as shown in the figure below.
9 |
10 |
11 |

12 |
13 |
14 | * Chunked Input Processing: Similar to [MIT-patch-conv](https://hanlab.mit.edu/blog/patch-conv), we split the input feature map into chunks and feed them to the convolution operator sequentially, which minimizes temporary memory consumption (a minimal sketch of this idea is shown below).
15 |
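The following is a minimal, hypothetical sketch of the chunked-input idea for a stride-1 convolution (it is not the DistVAE code itself; a production kernel would also handle stride, dilation, and grouped convolutions):

```
import torch
import torch.nn.functional as F

def chunked_conv2d_h(x, weight, bias=None, chunk=128):
    """Stride-1 'same' conv2d computed chunk-by-chunk along H.

    Assumes a square, odd-sized kernel; pads once, then convolves slices.
    """
    k = weight.shape[-2]
    p = k // 2
    x = F.pad(x, (p, p, p, p))        # pad W and H once up front
    h_out = x.shape[-2] - k + 1       # equals the original input height
    outs = []
    for start in range(0, h_out, chunk):
        stop = min(start + chunk, h_out)
        # a chunk of output rows only needs (stop - start + k - 1) input rows
        outs.append(F.conv2d(x[..., start:stop + k - 1, :], weight, bias))
    return torch.cat(outs, dim=-2)
```
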
16 | By synergizing these two methods, we have dramatically expanded the capabilities of VAE decoding. Our implementation successfully handles image resolutions up to 10240px, an impressive 11-fold increase compared to the default VAE implementation.
--------------------------------------------------------------------------------
/docs/methods/usp.md:
--------------------------------------------------------------------------------
1 | ## USP: A Unified Sequence Parallelism Approach for Long Context Generative AI
2 | [Chinese Blog 1](https://zhuanlan.zhihu.com/p/698031151); [Chinese Blog 2](https://zhuanlan.zhihu.com/p/689067888)
3 |
4 | DeepSpeed-Ulysses and Ring-Attention are not mutually exclusive options.
5 | Both should be used in a mixed manner to jointly split the sequence dimension.
6 | By adjusting their parallelism degrees to ensure that ulysses-degree multiplied by ring-degree equals sp-degree, we refer to this as Unified-SP.
7 | The advantage of Unified-SP is that it retains the full capabilities of both original methods without any loss, while offering additional benefits.
8 | Firstly, it eliminates the restriction that Ulysses' sp-degree must be less than the number of attention heads.
9 | Moreover, the communication pattern of mixed parallelism is more friendly to heterogeneous networks, providing acceleration over PCIe and in multi-machine multi-GPU environments compared to using Ulysses or Ring alone. Therefore, we recommend using the Unified-SP implementation as the default sequence parallelism solution.
10 |
11 | In xDiT, we utilize the USP implementation from [feifeibear/long-context-attention](https://github.com/feifeibear/long-context-attention). Since DiT does not use Causal Attention, there is no need for load balancing operations on Ring-Attention. For more details, please refer to the following [paper](https://arxiv.org/abs/2405.07719).
12 |
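In xDiT's example scripts, the two degrees are exposed as CLI flags whose product is the sequence-parallel degree; for instance, 4-way sequence parallelism can be obtained with 2-way Ulysses times 2-way Ring (the model path is a placeholder):

```
torchrun --nproc_per_node=4 ./examples/pixartalpha_example.py \
    --model <MODEL_PATH> --ulysses_degree 2 --ring_degree 2 \
    --height 1024 --width 1024 --num_inference_steps 20 --prompt "A small dog"
```
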
13 | ```
14 | @article{fang2024unified,
15 | title={USP: A Unified Sequence Parallelism Approach for Long Context Generative AI},
16 | author={Fang, Jiarui and Zhao, Shangchun},
17 | journal={arXiv preprint arXiv:2405.07719},
18 | year={2024}
19 | }
20 | ```
--------------------------------------------------------------------------------
/xfuser/ray/worker/worker_wrappers.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The xDiT team.
2 | # Adapted from
3 | # https://github.com/vllm-project/vllm/blob/main/vllm/worker/worker_base.py
4 | # Copyright (c) 2023, vLLM team. All rights reserved.
5 | import os
6 | from abc import ABC
7 | from typing import Any, Dict
8 |
9 | from xfuser.ray.worker.utils import update_environment_variables, resolve_obj_by_qualname
10 | from xfuser.config.config import ParallelConfig
11 |
12 | class BaseWorkerWrapper(ABC):
13 | def __init__(self, worker_cls: str):
14 | self.worker_cls = worker_cls
15 | self.worker = None
16 |
17 | # lazy import
18 | def init_worker(self, *args, **kwargs):
19 | worker_class = resolve_obj_by_qualname(
20 | self.worker_cls)
21 | self.worker = worker_class(*args, **kwargs)
22 | assert self.worker is not None
23 |
24 |     def execute_method(self, method: str, *args, **kwargs) -> Any:
25 |         executor = getattr(self, method, None) or getattr(
26 |             self.worker, method, None)
27 |         if not executor:
28 |             raise AttributeError(
29 |                 f"Method {method} not found in Worker class")
30 |         return executor(*args, **kwargs)
31 |
32 | def update_environs(environs: Dict[str, str]):
33 | if "CUDA_VISIBLE_DEVICES" in environs and "CUDA_VISIBLE_DEVICES" in os.environ:
34 | del os.environ["CUDA_VISIBLE_DEVICES"]
35 | update_environment_variables(environs)
36 |
37 |
38 | class RayWorkerWrapper(BaseWorkerWrapper):
39 | def __init__(self, parallel_config: ParallelConfig, worker_cls: str, rank: int) -> None:
40 | super().__init__(worker_cls)
41 | self.init_worker(parallel_config, rank)
--------------------------------------------------------------------------------
/docs/fid/FID.md:
--------------------------------------------------------------------------------
1 |
2 | ### Procedure
3 | #### Prerequisite
4 | First, install the following additional dependencies before testing:
5 | ```
6 | pip3 install datasets tensorflow scipy
7 | ```
8 |
9 | #### Sample Batch Generation
10 | Then you can use `scripts/generate.py` to generate images with COCO captions. An example command is as follows:
11 | ```
12 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 --rdzv-endpoint=localhost:8070 scripts/generate.py --pipeline pixart --scheduler dpm-solver --warmup_steps 4 --parallelism pipeline --no_cuda_graph --dataset coco --no_split_batch --guidance_scale 2.0 --pp_num_patch 8
13 | ```
14 |
15 | After that, you can use `scripts/npz.py` to pack the generated images into a `.npz` file, where `$GENERATED_IMAGES_FOLDER` is the path where you saved the generated images and `$IMAGES_NUM` is the total image count:
16 | ```
17 | python3 scripts/npz.py --sample_dir $GENERATED_IMAGES_FOLDER --num $IMAGES_NUM
18 | ```
19 |
20 | #### Reference Batch Generation
21 | To get the COCO ref images, you can run the following commands:
22 | ```
23 | python3 scripts/dump_coco.py
24 | ```
25 | Then you can use `scripts/npz.py` to pack the reference images into a `.npz` file as well, where `$REF_IMAGES_FOLDER` is the path where you saved the reference images and `$IMAGES_NUM` is the total image count:
26 | ```
27 | python3 scripts/npz.py --sample_dir $REF_IMAGES_FOLDER --num $IMAGES_NUM
28 | ```
29 |
30 | #### Evaluate the results
31 | After completing the above procedure, you'll have two `.npz` files, `$SAMPLE_NPZ` and `$REF_NPZ` (replace them with the corresponding files). You can evaluate the results with `scripts/evaluator.py` by running:
32 | ```
33 | python3 scripts/evaluator.py --ref_batch $REF_NPZ --sample_batch $SAMPLE_NPZ
34 | ```
--------------------------------------------------------------------------------
/docs/methods/hybrid_zh.md:
--------------------------------------------------------------------------------
1 |
2 | ## Hybrid Parallelism Design
3 |
4 | xDiT is designed to scale inference to very large deployments, for example multi-node multi-GPU setups connected through heterogeneous networks such as Ethernet and PCIe. A single parallelism method, such as PipeFusion or SP, can hardly meet both goals at once, so combining different parallelism methods becomes especially important.
5 |
6 | xDiT supports four parallelism methods: PipeFusion, Sequence, Data, and CFG Parallel. Data and CFG Parallel parallelize across images and are relatively simple, whereas PipeFusion and Sequence parallelize across different patches within an image and are more involved. Allowing these two methods to be combined is one of xDiT's core innovations.
7 |
8 | PipeFusion exploits Input Temporal Redundancy and computes attention with stale KV, which prevents PipeFusion from mixing parallel strategies as easily as large language models (LLMs) do. Specifically, the standard sequence parallelism interfaces, such as RingAttention, Ulysses, or USP, cannot satisfy the requirements of hybrid SP + PipeFusion parallelism.
9 |
10 | To illustrate the problem, the figure below shows a hybrid parallel configuration with pipe_degree=4 and sp_degree=2. With `num_pipeline_patch`=4, the image is split into M=`num_pipeline_patch*sp_degree`=8 patches, P0 through P7.
11 |
12 |
13 |

14 |
15 |
16 | In a standard SP attention implementation, the inputs Q, K, V and the output O are all partitioned along the sequence dimension, and in the same way. If the input patches on different ranks do not overlap, the positions of the fresh KV computed in each micro step do not overlap across ranks either. As shown in the figure below, in the KV buffer of standard SP the yellow part is the fresh KV held by SP0 (rank=0) and the green part is the fresh KV held by SP1 (rank=1); the two are different. Within this diffusion step, device 0 cannot obtain the fresh KV of P1, 3, 5, 7 for its computation, yet PipeFusion requires the complete KV of the previous diffusion step in the next one. Because standard SP holds only 1/sp_degree of the fresh KV buffer, it cannot produce correct results under hybrid parallel inference.
17 |
18 |
19 |

20 |
21 |
22 |
23 |
24 | xDiT therefore customizes its sequence parallelism implementation to fit this hybrid parallel requirement. xDiT uses `xFuserLongContextAttention` to store the intermediate results of SP in the KV buffer. The effect is shown in the figure below: after each SP micro-step completes, the fresh KV on the different ranks within the SP group is replicated. As a result, after one diffusion step the KV buffers on all devices of the SP group are fully up to date, ready for the next diffusion step.
25 |
26 |
27 |
28 |

29 |
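For reference, the customized attention is invoked with the call pattern used in `xfuser/model_executor/models/customized/step_video_t2v/attentions.py`; a minimal sketch (assuming q, k, v are laid out as `[batch, seq, heads, head_dim]` and already hold this rank's sequence shard):

```
from xfuser.core.long_ctx_attention import xFuserLongContextAttention

hybrid_seq_parallel_attn = xFuserLongContextAttention()
# The first argument is the (optional) attention module; fresh KV ends up
# replicated inside the SP group's KV buffer after every micro-step.
out = hybrid_seq_parallel_attn(None, q, k, v, causal=False)
```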
--------------------------------------------------------------------------------
/docs/performance/stepvideo.md:
--------------------------------------------------------------------------------
1 | ## Step-Video-T2V 30B Performance
2 |
3 | ### Evaluation Protocol
4 | The benchmark was conducted using the open-source Step-Video-T2V 30B model to evaluate SP (Sequence Parallelism) and TP (Tensor Parallelism) performance. For the SP runs, the ulysses_degree was used as the sp_degree.
5 |
6 | Implementation reference:
7 | `https://github.com/stepfun-ai/Step-Video-T2V/tree/main#multi-gpu-parallel-deployment`
8 |
9 | ### Nvidia H20 Cluster (8×NVLink)
10 |
11 | #### Parallel Strategy Comparison
12 | | GPUs | Parallel Type | Configuration | Latency | Scaling Efficiency | Memory Usage |
13 | |-------|--------------|---------------|-----------|---------------|--------------------|
14 | | 1 | Baseline | `TP1 SP1` | 213.60s | 1.00x | 92,170M |
15 | | 2 | TP | `TP2` | 108.97s | 0.98x | 57,458M ▼37.7% |
16 | | 2 | SP | `SP2` | 108.13s | 0.99x | 86,258M ▼6.4% |
17 | | 4 | TP | `TP4` | 57.61s | 0.93x | 36,566M ▼60.3% |
18 | | 4 | SP | `SP4` | 57.01s | 0.94x | 78,226M ▼15.1% |
19 | | 8 | TP | `TP8` | 30.40s | 0.88x | 30,028M ▼67.4% |
20 | | 8 | SP | `SP8` | 30.10s | 0.89x | 79,684M ▼13.5% |
21 |
22 | #### Key Findings
23 | - **Hardware Compatibility**:
24 | - Consumer GPUs (5090/5090D): Full training support on 32GB×8 configuration
25 | - Inference Accelerators (L20/L40): Full parameter inference on 48GB×4 configuration
26 |
27 | - **Efficiency Metrics**:
28 | - TP8 achieves 67.4% memory optimization (53.9% higher than SP8)
29 |   - Measured latency stays within 12% of ideal linear scaling up to 8 GPUs
30 |
31 | - **Scalability**:
32 | - Multi-dimensional parameter slicing enables near-linear scaling efficiency
33 | - Layered communication optimization reduces cross-node synchronization overhead by 75%
34 |
--------------------------------------------------------------------------------
/examples/run_consisid.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 |
4 | export PYTHONPATH=$PWD:$PYTHONPATH
5 |
6 | # ConsisID configuration
7 | SCRIPT="consisid_example.py"
8 | MODEL_ID="/cfs/dit/ConsisID-preview"
9 | INFERENCE_STEP=50
10 |
11 | mkdir -p ./results
12 |
13 | # ConsisID specific task args
14 | TASK_ARGS="--height 480 --width 720 --num_frames 49 --guidance_scale 6.0"
15 |
16 | # ConsisID parallel configuration
17 | N_GPUS=6
18 | PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 3"
19 | # CFG_ARGS="--use_cfg_parallel"
20 |
21 | # Uncomment and modify these as needed
22 | # PIPEFUSION_ARGS="--num_pipeline_patch 8"
23 | # OUTPUT_ARGS="--output_type latent"
24 | # PARALLEL_VAE="--use_parallel_vae"
25 | # ENABLE_TILING="--enable_tiling"
26 | # COMPILE_FLAG="--use_torch_compile"
27 |
28 | torchrun --master_port=1234 --nproc_per_node=$N_GPUS ./examples/$SCRIPT \
29 | --model $MODEL_ID \
30 | $PARALLEL_ARGS \
31 | $TASK_ARGS \
32 | $PIPEFUSION_ARGS \
33 | $OUTPUT_ARGS \
34 | --num_inference_steps $INFERENCE_STEP \
35 | --warmup_steps 0 \
36 | --prompt "The video captures a boy walking along a city street, filmed in black and white on a classic 35mm camera. His expression is thoughtful, his brow slightly furrowed as if he's lost in contemplation. The film grain adds a textured, timeless quality to the image, evoking a sense of nostalgia. Around him, the cityscape is filled with vintage buildings, cobblestone sidewalks, and softly blurred figures passing by, their outlines faint and indistinct. Streetlights cast a gentle glow, while shadows play across the boy's path, adding depth to the scene. The lighting highlights the boy's subtle smile, hinting at a fleeting moment of curiosity. The overall cinematic atmosphere, complete with classic film still aesthetics and dramatic contrasts, gives the scene an evocative and introspective feel." \
37 | --img_file_path "https://github.com/PKU-YuanGroup/ConsisID/blob/main/asserts/example_images/2.png?raw=true" \
38 | $CFG_ARGS \
39 |     $PARALLEL_VAE \
40 | $ENABLE_TILING \
41 | $COMPILE_FLAG
--------------------------------------------------------------------------------
/examples/run_consisid_usp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 |
4 | export PYTHONPATH=$PWD:$PYTHONPATH
5 |
6 | # ConsisID configuration
7 | SCRIPT="consisid_usp_example.py"
8 | MODEL_ID="/cfs/dit/ConsisID-preview"
9 | INFERENCE_STEP=50
10 |
11 | mkdir -p ./results
12 |
13 | # ConsisID specific task args
14 | TASK_ARGS="--height 480 --width 720 --num_frames 49 --guidance_scale 6.0"
15 |
16 | # ConsisID parallel configuration
17 | N_GPUS=4
18 | PARALLEL_ARGS="--ulysses_degree 1 --ring_degree 2"
19 | CFG_ARGS="--use_cfg_parallel"
20 |
21 | # Uncomment and modify these as needed
22 | # PIPEFUSION_ARGS="--num_pipeline_patch 8"
23 | # OUTPUT_ARGS="--output_type latent"
24 | # PARALLEL_VAE="--use_parallel_vae"
25 | # ENABLE_TILING="--enable_tiling"
26 | # COMPILE_FLAG="--use_torch_compile"
27 |
28 | torchrun --master_port=1234 --nproc_per_node=$N_GPUS ./examples/$SCRIPT \
29 | --model $MODEL_ID \
30 | $PARALLEL_ARGS \
31 | $TASK_ARGS \
32 | $PIPEFUSION_ARGS \
33 | $OUTPUT_ARGS \
34 | --num_inference_steps $INFERENCE_STEP \
35 | --warmup_steps 0 \
36 | --prompt "The video captures a boy walking along a city street, filmed in black and white on a classic 35mm camera. His expression is thoughtful, his brow slightly furrowed as if he's lost in contemplation. The film grain adds a textured, timeless quality to the image, evoking a sense of nostalgia. Around him, the cityscape is filled with vintage buildings, cobblestone sidewalks, and softly blurred figures passing by, their outlines faint and indistinct. Streetlights cast a gentle glow, while shadows play across the boy's path, adding depth to the scene. The lighting highlights the boy's subtle smile, hinting at a fleeting moment of curiosity. The overall cinematic atmosphere, complete with classic film still aesthetics and dramatic contrasts, gives the scene an evocative and introspective feel." \
37 | --img_file_path "https://github.com/PKU-YuanGroup/ConsisID/blob/main/asserts/example_images/2.png?raw=true" \
38 | $CFG_ARGS \
39 |     $PARALLEL_VAE \
40 | $ENABLE_TILING \
41 | $COMPILE_FLAG
--------------------------------------------------------------------------------
/xfuser/model_executor/base_wrapper.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod, ABCMeta
2 | from functools import wraps
3 | from typing import Any, List, Optional
4 |
5 | from xfuser.core.distributed.parallel_state import (
6 | get_classifier_free_guidance_world_size,
7 | get_pipeline_parallel_world_size,
8 | get_sequence_parallel_world_size,
9 | get_tensor_model_parallel_world_size,
10 | )
11 | from xfuser.core.distributed.runtime_state import get_runtime_state
12 | from xfuser.core.fast_attention import get_fast_attn_enable
13 |
14 |
15 | class xFuserBaseWrapper(metaclass=ABCMeta):
16 |
17 | def __init__(
18 | self,
19 | module: Any,
20 | ):
21 | self.module = module
22 | self.module_type = type(module)
23 |
24 | def __getattr__(self, name: str):
25 | try:
26 | return getattr(self.module, name)
27 | except RecursionError:
28 | raise AttributeError(
29 | f"module {type(self.module).__name__} has no " f"attribute {name}"
30 | )
31 |
32 | def __str__(self):
33 | return str(self.module)
34 |
35 | @staticmethod
36 | def forward_check_condition(func):
37 | @wraps(func)
38 | def check_condition_fn(self, *args, **kwargs):
39 | if (
40 | get_pipeline_parallel_world_size() == 1
41 | and get_classifier_free_guidance_world_size() == 1
42 | and get_sequence_parallel_world_size() == 1
43 | and get_tensor_model_parallel_world_size() == 1
44 | and get_fast_attn_enable() == False
45 | ):
46 | return func(self, *args, **kwargs)
47 | if not get_runtime_state().is_ready():
48 | raise ValueError(
49 | "Runtime state is not ready, please call RuntimeState.set_input_parameters "
50 | "before calling forward"
51 | )
52 | return func(self, *args, **kwargs)
53 |
54 | return check_condition_fn
55 |
--------------------------------------------------------------------------------
/xfuser/model_executor/layers/register.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Type
2 | import torch
3 | import torch.nn as nn
4 |
5 | from xfuser.logger import init_logger
6 | from xfuser.model_executor.layers.base_layer import xFuserLayerBaseWrapper
7 |
8 | logger = init_logger(__name__)
9 |
10 |
11 | class xFuserLayerWrappersRegister:
12 | _XFUSER_LAYER_MAPPING: Dict[
13 | Type[nn.Module], Type[xFuserLayerBaseWrapper]
14 | ] = {}
15 |
16 | @classmethod
17 | def register(cls, origin_layer_class: Type[nn.Module]):
18 | def decorator(xfuser_layer_wrapper: Type[xFuserLayerBaseWrapper]):
19 | if not issubclass(xfuser_layer_wrapper, xFuserLayerBaseWrapper):
20 | raise ValueError(
21 |                     f"{xfuser_layer_wrapper.__name__} is not a "
22 | f"subclass of xFuserLayerBaseWrapper"
23 | )
24 | cls._XFUSER_LAYER_MAPPING[origin_layer_class] = xfuser_layer_wrapper
25 | return xfuser_layer_wrapper
26 |
27 | return decorator
28 |
29 | @classmethod
30 | def get_wrapper(cls, layer: nn.Module) -> xFuserLayerBaseWrapper:
31 | candidate = None
32 | candidate_origin = None
33 | for (
34 | origin_layer_class,
35 | xfuser_layer_wrapper,
36 | ) in cls._XFUSER_LAYER_MAPPING.items():
37 | if isinstance(layer, origin_layer_class):
38 | if (
39 | (candidate is None and candidate_origin is None)
40 | or origin_layer_class == layer.__class__
41 | or issubclass(origin_layer_class, candidate_origin)
42 | ):
43 | candidate_origin = origin_layer_class
44 | candidate = xfuser_layer_wrapper
45 |
46 | if candidate is None:
47 | raise ValueError(
48 | f"Layer class {layer.__class__.__name__} "
49 | f"is not supported by xFuser"
50 | )
51 | else:
52 | return candidate
53 |
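# A registration is typically declared right next to the wrapper class. The
# snippet below is a minimal hypothetical sketch (the real wrappers, e.g.
# xFuserFeedForwardWrapper in feedforward.py, live in this package):
#
#   from diffusers.models.attention import FeedForward
#
#   @xFuserLayerWrappersRegister.register(FeedForward)
#   class MyFeedForwardWrapper(xFuserLayerBaseWrapper):
#       def forward(self, *args, **kwargs):
#           return self.module(*args, **kwargs)
#
#   # get_wrapper() then resolves the most specific registered wrapper:
#   # wrapper_cls = xFuserLayerWrappersRegister.get_wrapper(some_feedforward_layer)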
--------------------------------------------------------------------------------
/xfuser/model_executor/pipelines/pipeline_stable_diffusion_xl.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union
2 | import torch
3 | import os
4 | from xfuser.model_executor.patch.unet_patch import apply_unet_cfg_parallel_monkey_patch
5 |
6 | from diffusers import StableDiffusionXLPipeline
7 | from xfuser.model_executor.pipelines.base_pipeline import xFuserPipelineBaseWrapper
8 | from xfuser.core.distributed import (
9 | get_classifier_free_guidance_world_size,
10 | )
11 | from xfuser.config import EngineConfig, InputConfig
12 | from xfuser.model_executor.pipelines.register import xFuserPipelineWrapperRegister
13 |
14 | @xFuserPipelineWrapperRegister.register(StableDiffusionXLPipeline)
15 | class xFuserStableDiffusionXLPipeline(xFuserPipelineBaseWrapper):
16 | def __init__(self, pipeline: StableDiffusionXLPipeline, engine_config: EngineConfig):
17 | super().__init__(pipeline=pipeline, engine_config=engine_config)
18 | if get_classifier_free_guidance_world_size() == 2:
19 | self.module = apply_unet_cfg_parallel_monkey_patch(self.module)
20 |
21 | @classmethod
22 | def from_pretrained(
23 | cls,
24 | pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
25 | engine_config: EngineConfig,
26 | return_org_pipeline: bool = False,
27 | **kwargs,
28 | ):
29 | pipeline = StableDiffusionXLPipeline.from_pretrained(
30 | pretrained_model_name_or_path, **kwargs
31 | )
32 | if return_org_pipeline:
33 | return pipeline
34 | return cls(pipeline, engine_config)
35 |
36 | @xFuserPipelineBaseWrapper.check_model_parallel_state(
37 | sequence_parallel_available=False,
38 | pipefusion_parallel_available=False,
39 | )
40 | @xFuserPipelineBaseWrapper.check_to_use_naive_forward
41 | @xFuserPipelineBaseWrapper.enable_data_parallel
42 | def __call__(
43 | self,
44 | *args,
45 | **kwargs,
46 | ):
47 | return self.module(*args, **kwargs)
48 |
--------------------------------------------------------------------------------
/docs/performance/consisid_zh.md:
--------------------------------------------------------------------------------
1 | ## ConsisID Performance Report
2 |
3 | [ConsisID](https://github.com/PKU-YuanGroup/ConsisID) is an identity-preserving text-to-video generation model that keeps facial consistency in the generated video through frequency decomposition. xDiT currently integrates USP (including Ulysses attention and Ring attention) and CFG parallelism to improve inference speed, while work on PipeFusion is in progress. We analyzed the performance gap between single-GPU ConsisID inference based on the diffusers library and our parallelized version when generating a 49-frame (6-second) 720x480 video. Since different parallel methods can be combined arbitrarily to obtain different performance, we systematically tested xDiT's speedup on 1 to 6 H100 (NVIDIA) GPUs.
4 |
5 | As the table shows, for ConsisID a significant reduction in inference latency is observed whether Ulysses Attention, Ring Attention, or Classifier-Free Guidance (CFG) parallelism is used. Notably, thanks to its lower communication overhead, CFG parallelism outperforms the other two techniques. By combining sequence parallelism with CFG parallelism we further improve inference efficiency, and latency keeps dropping as the degree of parallelism increases. Under the optimal configuration, xDiT achieves a 3.21x speedup over single-GPU inference, bringing each iteration down to only 0.72 seconds. With ConsisID's default 50 iterations, the end-to-end generation of a 49-frame video takes 35 seconds in total, using about 40 GB of GPU memory during the run.
6 |
7 | ### 720x480 Resolution (49 frames, 50 steps)
8 |
9 |
10 | | N-GPUs | ulysses_degree | ring_degree | cfg-parallel | latency |
11 | |:------:|:--------------:|:-----------:|:------------:|:---------:|
12 | | 6 | 2 | 3 | 1 | 44.89s |
13 | | 6 | 3 | 2 | 1 | 44.24s |
14 | | 6 | 1 | 3 | 2 | 35.78s |
15 | | 6 | 3 | 1 | 2 | 38.35s |
16 | | 4 | 2 | 1 | 2 | 41.37s |
17 | | 4 | 1 | 2 | 2 | 40.68s |
18 | | 3 | 3 | 1 | 1 | 53.57s |
19 | | 3 | 1 | 3 | 1 | 55.51s |
20 | | 2 | 1 | 2 | 1 | 70.19s |
21 | | 2 | 2 | 1 | 1 | 76.56s |
22 | | 2 | 1 | 1 | 2 | 59.72s |
23 | | 1 | 1 | 1 | 1 | 114.87s |
24 |
25 | ## Resources
26 |
27 | Learn more about ConsisID through the following resources:
28 |
29 | - A [video](https://www.youtube.com/watch?v=PhlgC-bI5SQ) demonstrating ConsisID's main features;
30 | - For more details, see the research paper [Identity-Preserving Text-to-Video Generation by Frequency Decomposition](https://hf.co/papers/2411.17440).
31 |
--------------------------------------------------------------------------------
/xfuser/model_executor/models/customized/step_video_t2v/attentions.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from einops import rearrange
4 |
5 | try:
6 | from xfuser.core.long_ctx_attention import xFuserLongContextAttention
7 | except ImportError:
8 | xFuserLongContextAttention = None
9 |
10 |
11 | class Attention(nn.Module):
12 | def __init__(self):
13 | super().__init__()
14 |
15 | def attn_processor(self, attn_type):
16 | if attn_type == 'torch':
17 | return self.torch_attn_func
18 | elif attn_type == 'parallel':
19 | return self.parallel_attn_func
20 | else:
21 | raise Exception('Not supported attention type...')
22 |
23 | def torch_attn_func(
24 | self,
25 | q,
26 | k,
27 | v,
28 | attn_mask=None,
29 | causal=False,
30 | drop_rate=0.0,
31 | **kwargs
32 | ):
33 |
34 | if attn_mask is not None and attn_mask.dtype != torch.bool:
35 | attn_mask = attn_mask.to(q.dtype)
36 |
37 | if attn_mask is not None and attn_mask.ndim == 3:
38 | n_heads = q.shape[2]
39 | attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)
40 |
41 | q, k, v = map(lambda x: rearrange(x, 'b s h d -> b h s d'), (q, k, v))
42 | x = torch.nn.functional.scaled_dot_product_attention(
43 | q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal
44 | )
45 | x = rearrange(x, 'b h s d -> b s h d')
46 | return x
47 |
48 | def parallel_attn_func(
49 | self,
50 | q,
51 | k,
52 | v,
53 | causal=False,
54 | **kwargs
55 | ):
56 |         assert xFuserLongContextAttention is not None, \
57 |             'to use sequence parallel attention, xFuserLongContextAttention should be imported...'
58 | hybrid_seq_parallel_attn = xFuserLongContextAttention()
59 | x = hybrid_seq_parallel_attn(
60 | None, q, k, v, causal=causal
61 | )
62 | return x
63 |
--------------------------------------------------------------------------------
/xfuser/model_executor/schedulers/register.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Type
2 | import torch
3 | import torch.nn as nn
4 |
5 | from xfuser.logger import init_logger
6 | from xfuser.model_executor.schedulers.base_scheduler import xFuserSchedulerBaseWrapper
7 |
8 | logger = init_logger(__name__)
9 |
10 | class xFuserSchedulerWrappersRegister:
11 | _XFUSER_SCHEDULER_MAPPING: Dict[
12 | Type[nn.Module],
13 | Type[xFuserSchedulerBaseWrapper]
14 | ] = {}
15 |
16 | @classmethod
17 | def register(cls, origin_scheduler_class: Type[nn.Module]):
18 | def decorator(xfuser_scheduler_class: Type[nn.Module]):
19 | if not issubclass(xfuser_scheduler_class,
20 | xFuserSchedulerBaseWrapper):
21 | raise ValueError(
22 |                     f"{xfuser_scheduler_class.__name__} is not "
23 | f"a subclass of xFuserSchedulerBaseWrapper"
24 | )
25 | cls._XFUSER_SCHEDULER_MAPPING[origin_scheduler_class] = \
26 | xfuser_scheduler_class
27 | return xfuser_scheduler_class
28 | return decorator
29 |
30 | @classmethod
31 | def get_wrapper(
32 | cls,
33 | scheduler: nn.Module
34 | ) -> xFuserSchedulerBaseWrapper:
35 | candidate = None
36 | candidate_origin = None
37 | for (origin_scheduler_class,
38 | wrapper_class) in cls._XFUSER_SCHEDULER_MAPPING.items():
39 | if isinstance(scheduler, origin_scheduler_class):
40 | if ((candidate is None and candidate_origin is None) or
41 | origin_scheduler_class == scheduler.__class__ or
42 | issubclass(origin_scheduler_class, candidate_origin)):
43 | candidate_origin = origin_scheduler_class
44 | candidate = wrapper_class
45 |
46 | if candidate is None:
47 | logger.info(f"Scheduler class {scheduler.__class__.__name__} "
48 | f"is not supported by xFuser")
49 | return None
50 | else:
51 | return candidate
--------------------------------------------------------------------------------
/xfuser/model_executor/schedulers/scheduling_ddpm.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Tuple, Union
2 |
3 | import torch
4 | import torch.distributed
5 |
6 | from diffusers.utils.torch_utils import randn_tensor
7 | from diffusers.schedulers.scheduling_ddpm import (
8 | DDPMScheduler,
9 | DDPMSchedulerOutput,
10 | )
11 |
12 | from xfuser.core.distributed import (
13 | get_pipeline_parallel_world_size,
14 | get_sequence_parallel_world_size,
15 | get_runtime_state,
16 | )
17 | from .register import xFuserSchedulerWrappersRegister
18 | from .base_scheduler import xFuserSchedulerBaseWrapper
19 |
20 |
21 | @xFuserSchedulerWrappersRegister.register(DDPMScheduler)
22 | class xFuserDDPMSchedulerWrapper(xFuserSchedulerBaseWrapper):
23 |
24 | @xFuserSchedulerBaseWrapper.check_to_use_naive_step
25 | def step(
26 | self,
27 | *args,
28 | generator=None,
29 | **kwargs,
30 | ) -> Union[DDPMSchedulerOutput, Tuple]:
31 | """
32 | Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
33 | process from the learned model outputs (most often the predicted noise).
34 |
35 | Args:
36 | model_output (`torch.Tensor`):
37 | The direct output from learned diffusion model.
38 | timestep (`float`):
39 | The current discrete timestep in the diffusion chain.
40 | sample (`torch.Tensor`):
41 | A current instance of a sample created by the diffusion process.
42 | generator (`torch.Generator`, *optional*):
43 | A random number generator.
44 | return_dict (`bool`, *optional*, defaults to `True`):
45 | Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`.
46 |
47 | Returns:
48 | [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`:
49 | If return_dict is `True`, [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] is returned, otherwise a
50 | tuple is returned where the first element is the sample tensor.
51 |
52 | """
53 | return self.module.step(*args, generator, **kwargs)
54 |
--------------------------------------------------------------------------------
/examples/run_service.sh:
--------------------------------------------------------------------------------
1 | set -x
2 |
3 | # export NCCL_PXN_DISABLE=1
4 | # # export NCCL_DEBUG=INFO
5 | # export NCCL_SOCKET_IFNAME=eth0
6 | # export NCCL_IB_GID_INDEX=3
7 | # export NCCL_IB_DISABLE=0
8 | # export NCCL_NET_GDR_LEVEL=2
9 | # export NCCL_IB_QPS_PER_CONNECTION=4
10 | # export NCCL_IB_TC=160
11 | # export NCCL_IB_TIMEOUT=22
12 | # export NCCL_P2P=0
13 | # export CUDA_DEVICE_MAX_CONNECTIONS=1
14 |
15 | export PYTHONPATH=$PWD:$PYTHONPATH
16 |
17 | # Select the model type
18 | # The model is downloaded to a specified location on disk,
19 | # or you can simply use the model's ID on Hugging Face,
20 | # which will then be downloaded to the default cache path on Hugging Face.
21 |
22 | export MODEL_TYPE="Flux"
23 | # Configuration for different model types
24 | # script, model_id, inference_step
25 | declare -A MODEL_CONFIGS=(
26 | ["Flux"]="flux_service.py /cfs/dit/FLUX.1-schnell 4"
27 | )
28 |
29 | if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then
30 | IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}"
31 | export SCRIPT MODEL_ID INFERENCE_STEP
32 | else
33 | echo "Invalid MODEL_TYPE: $MODEL_TYPE"
34 | exit 1
35 | fi
36 |
37 | mkdir -p ./results
38 |
39 | for HEIGHT in 1024
40 | do
41 | for N_GPUS in 1;
42 | do
43 |
44 | TASK_ARGS="--height $HEIGHT --width $HEIGHT --no_use_resolution_binning --guidance_scale 3.5"
45 |
46 | PARALLEL_ARGS="--ulysses_degree 1 --ring_degree 1"
47 |
48 |
49 |
50 | # By default, num_pipeline_patch = pipefusion_degree, and you can tune this parameter to achieve optimal performance.
51 | # PIPEFUSION_ARGS="--num_pipeline_patch 8 "
52 |
53 | # For high-resolution images, we use the latent output type to avoid running the VAE module. Used for measuring speed.
54 | # OUTPUT_ARGS="--output_type latent"
55 |
56 | # PARALLEL_VAE="--use_parallel_vae"
57 |
58 | # Another compile option is `--use_onediff` which will use onediff's compiler.
59 | # COMPILE_FLAG="--use_torch_compile"
60 |
61 | python ./examples/$SCRIPT \
62 | --model $MODEL_ID \
63 | $PARALLEL_ARGS \
64 | $TASK_ARGS \
65 | $PIPEFUSION_ARGS \
66 | $OUTPUT_ARGS \
67 | --num_inference_steps $INFERENCE_STEP \
68 | --warmup_steps 0 \
69 | --prompt "A small dog" \
70 | $CFG_ARGS \
71 |     $PARALLEL_VAE \
72 | $COMPILE_FLAG
73 |
74 | done
75 | done
76 |
77 |
78 |
--------------------------------------------------------------------------------
/xfuser/parallel.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | from xfuser.config.config import InputConfig
5 | from xfuser.core.distributed import (
6 | init_distributed_environment,
7 | initialize_model_parallel,
8 | )
9 | from xfuser.config import EngineConfig
10 | from xfuser.core.distributed.parallel_state import (
11 | get_data_parallel_rank,
12 | get_data_parallel_world_size,
13 | is_dp_last_group,
14 | )
15 | from xfuser.core.distributed.runtime_state import get_runtime_state
16 | from xfuser.logger import init_logger
17 | from xfuser.model_executor.pipelines.base_pipeline import xFuserPipelineBaseWrapper
18 | from xfuser.model_executor.pipelines.register import xFuserPipelineWrapperRegister
19 |
20 | logger = init_logger(__name__)
21 |
22 |
23 | class xDiTParallel:
24 | def __init__(self, pipe, engine_config: EngineConfig, input_config: InputConfig):
25 | xfuser_pipe_wrapper = xFuserPipelineWrapperRegister.get_class(pipe)
26 | self.pipe = xfuser_pipe_wrapper(pipeline=pipe, engine_config=engine_config)
27 | self.config = engine_config
28 | self.pipe.prepare_run(input_config)
29 |
30 | def __call__(
31 | self,
32 | *args,
33 | **kwargs,
34 | ):
35 | self.result = self.pipe(*args, **kwargs)
36 | return self.result
37 |
38 | def save(self, directory: str, prefix: str):
39 | dp_rank = get_data_parallel_rank()
40 | parallel_info = (
41 | f"dp{self.config.parallel_config.dp_degree}_cfg{self.config.parallel_config.cfg_degree}_"
42 | f"ulysses{self.config.parallel_config.ulysses_degree}_ring{self.config.parallel_config.ring_degree}_"
43 | f"pp{self.config.parallel_config.pp_degree}_patch{self.config.parallel_config.pp_config.num_pipeline_patch}"
44 | )
45 | if is_dp_last_group():
46 | path = Path(f"{directory}")
47 |             path.mkdir(mode=0o755, parents=True, exist_ok=True)
48 | path = path / f"{prefix}_result_{parallel_info}_dprank{dp_rank}"
49 | for i, image in enumerate(self.result.images):
50 | image.save(f"{str(path)}_image{i}.png")
51 | print(f"{str(path)}_image{i}.png")
52 |
53 | def __del__(self):
54 | get_runtime_state().destroy_distributed_env()
55 |
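# Minimal usage sketch (assumption: the xFuserArgs CLI helpers and config
# fields used by the scripts under examples/):
#
#   from diffusers import StableDiffusion3Pipeline
#   from xfuser import xFuserArgs
#   from xfuser.config import FlexibleArgumentParser
#
#   parser = FlexibleArgumentParser(description="xFuser Arguments")
#   args = xFuserArgs.add_cli_args(parser).parse_args()
#   engine_config, input_config = xFuserArgs.from_cli_args(args).create_config()
#
#   pipe = StableDiffusion3Pipeline.from_pretrained(engine_config.model_config.model)
#   paralleler = xDiTParallel(pipe, engine_config, input_config)
#   paralleler(height=input_config.height, width=input_config.width,
#              prompt=input_config.prompt,
#              num_inference_steps=input_config.num_inference_steps)
#   paralleler.save("results", "sd3")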
--------------------------------------------------------------------------------
/xfuser/model_executor/models/transformers/register.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Type
2 | import torch
3 | import torch.nn as nn
4 |
5 | from xfuser.logger import init_logger
6 | from xfuser.model_executor.models.transformers.base_transformer import (
7 | xFuserTransformerBaseWrapper,
8 | )
9 |
10 | logger = init_logger(__name__)
11 |
12 |
13 | class xFuserTransformerWrappersRegister:
14 | _XFUSER_TRANSFORMER_MAPPING: Dict[
15 | Type[nn.Module], Type[xFuserTransformerBaseWrapper]
16 | ] = {}
17 |
18 | @classmethod
19 | def register(cls, origin_transformer_class: Type[nn.Module]):
20 | def decorator(xfuser_transformer_class: Type[nn.Module]):
21 | if not issubclass(
22 | xfuser_transformer_class, xFuserTransformerBaseWrapper
23 | ):
24 | raise ValueError(
25 |                     f"{xfuser_transformer_class.__name__} is not "
26 | f"a subclass of xFuserTransformerBaseWrapper"
27 | )
28 | cls._XFUSER_TRANSFORMER_MAPPING[origin_transformer_class] = (
29 | xfuser_transformer_class
30 | )
31 | return xfuser_transformer_class
32 |
33 | return decorator
34 |
35 | @classmethod
36 | def get_wrapper(cls, transformer: nn.Module) -> xFuserTransformerBaseWrapper:
37 | candidate = None
38 | candidate_origin = None
39 | for (
40 | origin_transformer_class,
41 | wrapper_class,
42 | ) in cls._XFUSER_TRANSFORMER_MAPPING.items():
43 | if origin_transformer_class is None:
44 | continue
45 | if isinstance(transformer, origin_transformer_class):
46 | if (
47 | candidate is None
48 | or origin_transformer_class == transformer.__class__
49 | or issubclass(origin_transformer_class, candidate_origin)
50 | ):
51 | candidate_origin = origin_transformer_class
52 | candidate = wrapper_class
53 |
54 | if candidate is None:
55 | raise ValueError(
56 | f"Transformer class {transformer.__class__.__name__} "
57 | f"is not supported by xFuser"
58 | )
59 | else:
60 | return candidate
61 |
--------------------------------------------------------------------------------
/docs/developer/adding_models/readme.md:
--------------------------------------------------------------------------------
1 | # Apply xDiT to new models
2 |
3 | xDiT was initially developed to accelerate the inference process of Diffusion Transformers (DiTs) within Huggingface `diffusers`. However, with the rapid emergence of various DiT models, you may find yourself needing to support new models that xDiT hasn't yet accommodated or models that are not officially supported by `diffusers` at all.
4 |
5 | xDiT offers interfaces for multiple parallelization methods, including CFG parallelism, sequence parallelism, and PipeFusion, as shown below.
6 |
7 |
8 |

10 |
11 |
12 | CFG parallelism is the simplest method to implement, requiring only additional split and merge operations over the batch_size dimension during each iteration (a conceptual sketch follows below). By leveraging CFG parallelism, a nearly 2x speedup can be achieved when conducting inference on two GPUs. Sequence parallelism, on the other hand, involves splitting the sequence during each iteration and necessitates additional communication to handle attention computation in a distributed environment. xDiT introduces USP (Unified Sequence Parallelism), which combines the two existing sequence parallelism methods Ulysses Attention and Ring Attention.
13 |
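The sketch below is purely conceptual and hypothetical (it is not xDiT's API); it only illustrates the batch split before the transformer forward and the merge before the guidance step when cfg_degree=2:

```
import torch
import torch.distributed as dist

def cfg_parallel_noise_pred(transformer, latents, uncond_emb, cond_emb, cfg_group):
    # rank 0 of the CFG group handles the unconditional half, rank 1 the conditional half
    rank = dist.get_rank(group=cfg_group)
    text_emb = uncond_emb if rank == 0 else cond_emb
    noise_pred = transformer(latents, encoder_hidden_states=text_emb)
    # merge: gather both halves so every rank can apply classifier-free guidance
    gathered = [torch.empty_like(noise_pred) for _ in range(2)]
    dist.all_gather(gathered, noise_pred, group=cfg_group)
    noise_uncond, noise_cond = gathered
    return noise_uncond, noise_cond
```
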
14 | PipeFusion is employed when GPU memory is insufficient or the communication bandwidth between GPUs is low. The method distributes the model parameters among multiple GPUs. Supporting models with PipeFusion is more complex than CFG parallelism and USP, but it is useful on machines with limited GPU memory or interconnect bandwidth.
15 |
16 | The parallelization methods mentioned above can be performed simultaneously to achieve further speed enhancements. For a detailed guide on leveraging CFG parallelism, USP, and PipeFusion using xDiT, refer to the following comprehensive tutorial.
17 |
18 | [Parallelize new models with CFG parallelism provided by xDiT](adding_model_cfg.md)
19 |
20 | [Parallelize new models with USP provided by xDiT](adding_model_usp.md)
21 |
22 | [Parallelize new models with USP provided by xDiT (text replica)](adding_model_usp_text_replica.md)
23 |
24 | [Parallelize new models with a hybrid of CFG parallelism and USP provided by xDiT](adding_model_cfg_usp.md)
25 |
26 | [Parallelize new models with PipeFusion, USP, and CFG parallelism provided by xDiT](adding_model_pipefusion.md)
--------------------------------------------------------------------------------
/xfuser/core/distributed/__init__.py:
--------------------------------------------------------------------------------
1 | from .parallel_state import (
2 | get_world_group,
3 | get_dp_group,
4 | get_cfg_group,
5 | get_sp_group,
6 | get_pp_group,
7 | get_pipeline_parallel_world_size,
8 | get_pipeline_parallel_rank,
9 | is_pipeline_first_stage,
10 | is_pipeline_last_stage,
11 | get_data_parallel_world_size,
12 | get_data_parallel_rank,
13 | is_dp_last_group,
14 | get_classifier_free_guidance_world_size,
15 | get_classifier_free_guidance_rank,
16 | get_sequence_parallel_world_size,
17 | get_sequence_parallel_rank,
18 | get_ulysses_parallel_world_size,
19 | get_ulysses_parallel_rank,
20 | get_ring_parallel_world_size,
21 | get_ring_parallel_rank,
22 | init_distributed_environment,
23 | initialize_model_parallel,
24 | model_parallel_is_initialized,
25 | get_tensor_model_parallel_world_size,
26 | get_vae_parallel_group,
27 | get_vae_parallel_rank,
28 | get_vae_parallel_world_size,
29 | get_dit_world_size,
30 | init_vae_group,
31 | init_dit_group,
32 | get_dit_group,
33 | )
34 | from .runtime_state import (
35 | get_runtime_state,
36 | runtime_state_is_initialized,
37 | initialize_runtime_state,
38 | )
39 |
40 | __all__ = [
41 | "get_world_group",
42 | "get_dp_group",
43 | "get_cfg_group",
44 | "get_sp_group",
45 | "get_pp_group",
46 | "get_pipeline_parallel_world_size",
47 | "get_pipeline_parallel_rank",
48 | "is_pipeline_first_stage",
49 | "is_pipeline_last_stage",
50 | "get_data_parallel_world_size",
51 | "get_data_parallel_rank",
52 | "is_dp_last_group",
53 | "get_classifier_free_guidance_world_size",
54 | "get_classifier_free_guidance_rank",
55 | "get_sequence_parallel_world_size",
56 | "get_sequence_parallel_rank",
57 | "get_ulysses_parallel_world_size",
58 | "get_ulysses_parallel_rank",
59 | "get_ring_parallel_world_size",
60 | "get_ring_parallel_rank",
61 | "init_distributed_environment",
62 | "init_model_parallel_group",
63 | "initialize_model_parallel",
64 | "model_parallel_is_initialized",
65 | "get_runtime_state",
66 | "runtime_state_is_initialized",
67 | "initialize_runtime_state",
68 | "get_dit_world_size",
69 | "get_vae_parallel_group",
70 | "get_vae_parallel_rank",
71 | "get_vae_parallel_world_size",
72 | "init_vae_group",
73 | "init_dit_group",
74 | "get_dit_group",
75 | ]
76 |
--------------------------------------------------------------------------------
/examples/run_fastditattn.sh:
--------------------------------------------------------------------------------
1 | set -x
2 |
3 | # export NCCL_PXN_DISABLE=1
4 | # # export NCCL_DEBUG=INFO
5 | # export NCCL_SOCKET_IFNAME=eth0
6 | # export NCCL_IB_GID_INDEX=3
7 | # export NCCL_IB_DISABLE=0
8 | # export NCCL_NET_GDR_LEVEL=2
9 | # export NCCL_IB_QPS_PER_CONNECTION=4
10 | # export NCCL_IB_TC=160
11 | # export NCCL_IB_TIMEOUT=22
12 | # export NCCL_P2P=0
13 | # export CUDA_DEVICE_MAX_CONNECTIONS=1
14 |
15 | export PYTHONPATH=$PWD:$PYTHONPATH
16 |
17 | # Select the model type
18 | # The model is downloaded to a specified location on disk,
19 | # or you can simply use the model's ID on Hugging Face,
20 | # which will then be downloaded to the default cache path on Hugging Face.
21 |
22 | export COCO_PATH="/cfs/fjr2/xDiT/coco/annotations/captions_val2014.json"
23 | export MODEL_TYPE="Pixart-alpha"
24 | # Configuration for different model types
25 | # script, model_id, inference_step
26 | declare -A MODEL_CONFIGS=(
27 | ["Pixart-alpha"]="pixartalpha_example.py /cfs/dit/PixArt-XL-2-1024-MS 20"
28 | ["Pixart-sigma"]="pixartsigma_example.py /cfs/dit/PixArt-Sigma-XL-2-2K-MS 20"
29 | )
30 |
31 | if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then
32 | IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}"
33 | export SCRIPT MODEL_ID INFERENCE_STEP
34 | else
35 | echo "Invalid MODEL_TYPE: $MODEL_TYPE"
36 | exit 1
37 | fi
38 |
39 | mkdir -p ./results
40 |
41 | TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 4.5"
42 | FAST_ATTN_ARGS="--use_fast_attn --window_size 512 --n_calib 4 --threshold 0.15 --use_cache --coco_path $COCO_PATH"
43 |
44 |
45 | # By default, num_pipeline_patch = pipefusion_degree, and you can tune this parameter to achieve optimal performance.
46 | # PIPEFUSION_ARGS="--num_pipeline_patch 8 "
47 |
48 | # For high-resolution images, we use the latent output type to avoid running the VAE module. Used for measuring speed.
49 | # OUTPUT_ARGS="--output_type latent"
50 |
51 | # PARALLEL_VAE="--use_parallel_vae"
52 |
53 | # Another compile option is `--use_onediff` which will use onediff's compiler.
54 | # COMPILE_FLAG="--use_torch_compile"
55 |
56 | torchrun --nproc_per_node=1 ./examples/$SCRIPT \
57 | --model $MODEL_ID \
58 | $PARALLEL_ARGS \
59 | $TASK_ARGS \
60 | $PIPEFUSION_ARGS \
61 | $OUTPUT_ARGS \
62 | --num_inference_steps $INFERENCE_STEP \
63 | --warmup_steps 0 \
64 | --prompt "A small dog" \
65 | $CFG_ARGS \
66 | $FAST_ATTN_ARGS \
67 |   $PARALLEL_VAE \
68 | $COMPILE_FLAG
69 |
--------------------------------------------------------------------------------
/xfuser/model_executor/cache/diffusers_adapters/flux.py:
--------------------------------------------------------------------------------
1 | """
2 | adapted from https://github.com/ali-vilab/TeaCache.git
3 | adapted from https://github.com/chengzeyi/ParaAttention.git
4 | """
5 | import functools
6 | import unittest.mock  # explicitly import the submodule used below
7 |
8 | import torch
9 | from torch import nn
10 | from diffusers import DiffusionPipeline, FluxTransformer2DModel
11 | from xfuser.model_executor.cache.diffusers_adapters.registry import TRANSFORMER_ADAPTER_REGISTRY
12 |
13 | from xfuser.model_executor.cache import utils
14 |
15 | def create_cached_transformer_blocks(use_cache, transformer, rel_l1_thresh, return_hidden_states_first, num_steps):
16 | cached_transformer_class = {
17 | "Fb": utils.FBCachedTransformerBlocks,
18 | "Tea": utils.TeaCachedTransformerBlocks,
19 | }.get(use_cache)
20 |
21 | if not cached_transformer_class:
22 | raise ValueError(f"Unsupported use_cache value: {use_cache}")
23 |
24 | return cached_transformer_class(
25 | transformer.transformer_blocks,
26 | transformer.single_transformer_blocks,
27 | transformer=transformer,
28 | rel_l1_thresh=rel_l1_thresh,
29 | return_hidden_states_first=return_hidden_states_first,
30 | num_steps=num_steps,
31 | name=TRANSFORMER_ADAPTER_REGISTRY.get(type(transformer)),
32 | )
33 |
34 |
35 | def apply_cache_on_transformer(
36 | transformer: FluxTransformer2DModel,
37 | *,
38 | rel_l1_thresh=0.12,
39 | return_hidden_states_first=False,
40 | num_steps=8,
41 | use_cache="Fb",
42 | ):
43 | cached_transformer_blocks = nn.ModuleList([
44 | create_cached_transformer_blocks(use_cache, transformer, rel_l1_thresh, return_hidden_states_first, num_steps)
45 | ])
46 |
47 | dummy_single_transformer_blocks = torch.nn.ModuleList()
48 |
49 | original_forward = transformer.forward
50 |
51 | @functools.wraps(original_forward)
52 | def new_forward(
53 | self,
54 | *args,
55 | **kwargs,
56 | ):
57 | with unittest.mock.patch.object(
58 | self,
59 | "transformer_blocks",
60 | cached_transformer_blocks,
61 | ), unittest.mock.patch.object(
62 | self,
63 | "single_transformer_blocks",
64 | dummy_single_transformer_blocks,
65 | ):
66 | return original_forward(
67 | *args,
68 | **kwargs,
69 | )
70 |
71 | transformer.forward = new_forward.__get__(transformer)
72 |
73 | return transformer
74 |
75 |
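# Minimal usage sketch (assumption: a Flux pipeline loaded via diffusers; the
# example scripts expose caching through flags such as --use_fbcache / --use_teacache):
#
#   import torch
#   from diffusers import FluxPipeline
#
#   pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev",
#                                       torch_dtype=torch.bfloat16)
#   # "Fb" wraps the blocks with First-Block caching; "Tea" selects TeaCache.
#   apply_cache_on_transformer(pipe.transformer, use_cache="Fb",
#                              rel_l1_thresh=0.12, num_steps=28)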
--------------------------------------------------------------------------------
/benchmark/usp_latency_test.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import subprocess
4 | import argparse
5 |
6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"
7 |
8 |
9 | def run_command(cmd):
10 | process = subprocess.Popen(
11 | cmd,
12 | shell=True,
13 | stdout=subprocess.PIPE,
14 | stderr=subprocess.STDOUT,
15 | universal_newlines=True,
16 | )
17 | output = ""
18 | for line in process.stdout:
19 | if "epoch time:" in line or "Running test for size" in line:
20 | print(line.strip())
21 | output += line
22 | process.wait()
23 | if process.returncode != 0:
24 | print(f"Command failed: {cmd}")
25 | print(output + "\n")
26 | # subprocess.run(cmd, shell=True, check=True)
27 |
28 |
29 | def main():
30 | parser = argparse.ArgumentParser(description="Run benchmark tests")
31 | parser.add_argument("--model_id", type=str, required=True, help="Path to the model")
32 | parser.add_argument(
33 | "--sizes", type=int, nargs="+", required=True, help="List of sizes to test"
34 | )
35 | parser.add_argument(
36 | "--script",
37 | type=str,
38 | required=True,
39 | help="Script to run (e.g., tests/test_pixartalpha.py)",
40 | )
41 | parser.add_argument(
42 | "--n_gpus", type=int, nargs="+", required=True, help="Number of GPUs to use"
43 | )
44 | parser.add_argument("--steps", type=int, default=20, help="Number of steps")
45 | args = parser.parse_args()
46 | MODEL_ID = args.model_id
47 | SIZES = args.sizes
48 | SCRIPT = args.script
49 | N_GPUS = args.n_gpus
50 | STEPS = args.steps
51 |
52 | for size in SIZES:
53 | for num_gpus in N_GPUS:
54 | for i in range(int(math.log2(num_gpus)) + 1):
55 | ulysses_degree = int(math.pow(2, i))
56 | ring_degree = num_gpus // ulysses_degree
57 |
58 | print(
59 | f"Running test for size {size}, ulysses_degree {ulysses_degree}, ring_degree {ring_degree}",
60 | flush=True,
61 | )
62 | cmd = (
63 | f"torchrun --nproc_per_node={num_gpus} {SCRIPT} --prompt 'A small cat' --output_type 'latent' --model {MODEL_ID} "
64 | f"--height {size} --width {size} --ulysses_degree {ulysses_degree} --ring_degree {ring_degree} --num_inference_steps {STEPS}"
65 | )
66 |
67 | run_command(cmd)
68 |
69 |
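# Example invocation (model path is a placeholder; flux_example.py is one of the example scripts):
#   python benchmark/usp_latency_test.py --model_id <MODEL_PATH> \
#       --sizes 1024 2048 --script examples/flux_example.py --n_gpus 4 8 --steps 20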
70 | if __name__ == "__main__":
71 | main()
72 |
--------------------------------------------------------------------------------
/examples/run.sh:
--------------------------------------------------------------------------------
1 | set -x
2 |
3 | export PYTHONPATH=$PWD:$PYTHONPATH
4 |
5 | # Select the model type
6 | export MODEL_TYPE="Flux"
7 | # Configuration for different model types
8 | # script, model_id, inference_step
9 | declare -A MODEL_CONFIGS=(
10 | ["Pixart-alpha"]="pixartalpha_example.py /cfs/dit/PixArt-XL-2-1024-MS 20"
11 | ["Pixart-sigma"]="pixartsigma_example.py /cfs/dit/PixArt-Sigma-XL-2-2K-MS 20"
12 | ["Sd3"]="sd3_example.py /cfs/dit/stable-diffusion-3-medium-diffusers 20"
13 | ["Flux"]="flux_example.py /cfs/dit/FLUX.1-dev/ 28"
14 | ["FluxControl"]="flux_control_example.py /cfs/dit/FLUX.1-Depth-dev/ 28"
15 | ["HunyuanDiT"]="hunyuandit_example.py /cfs/dit/HunyuanDiT-v1.2-Diffusers 50"
16 | ["SDXL"]="sdxl_example.py /cfs/dit/stable-diffusion-xl-base-1.0 30"
17 | )
18 |
19 | if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then
20 | IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}"
21 | export SCRIPT MODEL_ID INFERENCE_STEP
22 | else
23 | echo "Invalid MODEL_TYPE: $MODEL_TYPE"
24 | exit 1
25 | fi
26 |
27 | mkdir -p ./results
28 |
29 | # task args
30 | TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5"
31 |
32 | # cache args
33 | # CACHE_ARGS="--use_teacache"
34 | # CACHE_ARGS="--use_fbcache"
35 |
36 | # On 8 GPUs: pipefusion=2, ulysses=2, ring=2 (the product of all parallel degrees, including cfg if enabled, must equal N_GPUS)
37 | N_GPUS=8
38 | PARALLEL_ARGS="--pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 2"
39 |
40 | # CFG_ARGS="--use_cfg_parallel"
41 |
42 | # By default, num_pipeline_patch = pipefusion_degree, and you can tune this parameter to achieve optimal performance.
43 | # PIPEFUSION_ARGS="--num_pipeline_patch 8 "
44 |
45 | # For high-resolution images, we use the latent output type to avoid running the VAE module; this is used for measuring speed.
46 | # OUTPUT_ARGS="--output_type latent"
47 |
48 | # PARALLEL_VAE="--use_parallel_vae"
49 |
50 | # Another compile option is `--use_onediff` which will use onediff's compiler.
51 | # COMPILE_FLAG="--use_torch_compile"
52 |
53 |
54 | # Use this flag to quantize the T5 text encoder, which can reduce memory usage without affecting result quality.
55 | # QUANTIZE_FLAG="--use_fp8_t5_encoder"
56 |
57 | # export CUDA_VISIBLE_DEVICES=4,5,6,7
58 |
59 | torchrun --nproc_per_node=$N_GPUS ./examples/$SCRIPT \
60 | --model $MODEL_ID \
61 | $PARALLEL_ARGS \
62 | $TASK_ARGS \
63 | $PIPEFUSION_ARGS \
64 | $OUTPUT_ARGS \
65 | --num_inference_steps $INFERENCE_STEP \
66 | --warmup_steps 1 \
67 | --prompt "brown dog laying on the ground with a metal bowl in front of him." \
68 | $CFG_ARGS \
69 | $PARALLEL_VAE \
70 | $COMPILE_FLAG \
71 | $QUANTIZE_FLAG \
72 | $CACHE_ARGS \
73 |
--------------------------------------------------------------------------------
/examples/ray/ray_pixartsigma_example.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import torch
4 | import torch.distributed
5 | from transformers import T5EncoderModel
6 | from xfuser import xFuserPixArtSigmaPipeline, xFuserArgs
7 | from xfuser.config import FlexibleArgumentParser
8 | from xfuser.ray.pipeline.pipeline_utils import RayDiffusionPipeline
9 |
10 | def main():
11 | os.environ["MASTER_ADDR"] = "localhost"
12 | os.environ["MASTER_PORT"] = "12355"
13 | parser = FlexibleArgumentParser(description="xFuser Arguments")
14 | args = xFuserArgs.add_cli_args(parser).parse_args()
15 | engine_args = xFuserArgs.from_cli_args(args)
16 | engine_config, input_config = engine_args.create_config()
17 | model_name = engine_config.model_config.model.split("/")[-1]
18 | encoder_kwargs = {
19 | 'text_encoder': {
20 | 'model_class': T5EncoderModel,
21 | 'pretrained_model_name_or_path': engine_config.model_config.model,
22 | 'subfolder': 'text_encoder',
23 | 'torch_dtype': torch.float16
24 | },
25 | }
26 | # if args.use_fp8_t5_encoder:
27 | # from optimum.quanto import freeze, qfloat8, quantize
28 | # print(f"rank {local_rank} quantizing text encoder")
29 | # quantize(text_encoder, weights=qfloat8)
30 | # freeze(text_encoder)
31 |
32 | pipe = RayDiffusionPipeline.from_pretrained(
33 | PipelineClass=xFuserPixArtSigmaPipeline,
34 | pretrained_model_name_or_path=engine_config.model_config.model,
35 | engine_config=engine_config,
36 | torch_dtype=torch.float16,
37 | **encoder_kwargs
38 | )
39 | pipe.prepare_run(input_config)
40 |
41 | torch.cuda.reset_peak_memory_stats()
42 | start_time = time.time()
43 | output = pipe(
44 | height=input_config.height,
45 | width=input_config.width,
46 | prompt=input_config.prompt,
47 | num_inference_steps=input_config.num_inference_steps,
48 | output_type=input_config.output_type,
49 | use_resolution_binning=input_config.use_resolution_binning,
50 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
51 | clean_caption=False,
52 | )
53 | end_time = time.time()
54 | elapsed_time = end_time - start_time
55 | print(f"elapsed time:{elapsed_time}")
56 | if not os.path.exists("results"):
57 | os.mkdir("results")
58 |
59 | for _, images in enumerate(output):
60 | if images is not None:
61 | image = images[0]
62 | path = f"./results/{model_name}_ray_result.png"
63 | image.save(path)
64 | print(
65 | f"image saved to {path}"
66 | )
67 | break
68 |
69 |
70 | if __name__ == "__main__":
71 | main()
72 |
--------------------------------------------------------------------------------
/examples/ray/ray_pixartalpha_example.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import torch
4 | import torch.distributed
5 | from transformers import T5EncoderModel
6 | from xfuser import xFuserArgs
7 | from xfuser.config import FlexibleArgumentParser
8 | from xfuser.ray.pipeline.pipeline_utils import RayDiffusionPipeline
9 | from xfuser.model_executor.pipelines import xFuserPixArtAlphaPipeline
10 |
11 | def main():
12 | os.environ["MASTER_ADDR"] = "localhost"
13 | os.environ["MASTER_PORT"] = "12355"
14 | parser = FlexibleArgumentParser(description="xFuser Arguments")
15 | args = xFuserArgs.add_cli_args(parser).parse_args()
16 | engine_args = xFuserArgs.from_cli_args(args)
17 | engine_config, input_config = engine_args.create_config()
18 | model_name = engine_config.model_config.model.split("/")[-1]
19 | encoder_kwargs = {
20 | 'text_encoder': {
21 | 'model_class': T5EncoderModel,
22 | 'pretrained_model_name_or_path': engine_config.model_config.model,
23 | 'subfolder': 'text_encoder',
24 | 'torch_dtype': torch.float16
25 | },
26 | }
27 | # if args.use_fp8_t5_encoder:
28 | # from optimum.quanto import freeze, qfloat8, quantize
29 | # print(f"rank {local_rank} quantizing text encoder")
30 | # quantize(text_encoder, weights=qfloat8)
31 | # freeze(text_encoder)
32 |
33 | pipe = RayDiffusionPipeline.from_pretrained(
34 | PipelineClass=xFuserPixArtAlphaPipeline,
35 | pretrained_model_name_or_path=engine_config.model_config.model,
36 | engine_config=engine_config,
37 | torch_dtype=torch.float16,
38 | **encoder_kwargs
39 | )
40 | pipe.prepare_run(input_config)
41 |
42 | torch.cuda.reset_peak_memory_stats()
43 | start_time = time.time()
44 | output = pipe(
45 | height=input_config.height,
46 | width=input_config.width,
47 | prompt=input_config.prompt,
48 | num_inference_steps=input_config.num_inference_steps,
49 | output_type=input_config.output_type,
50 | use_resolution_binning=input_config.use_resolution_binning,
51 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
52 | )
53 | end_time = time.time()
54 | elapsed_time = end_time - start_time
55 | print(f"elapsed time:{elapsed_time}")
56 | if not os.path.exists("results"):
57 | os.mkdir("results")
58 |
59 | for _, images in enumerate(output):
60 | if images is not None:
61 | image = images[0]
62 | path = f"./results/{model_name}_ray_result.png"
63 | image.save(path)
64 | print(
65 | f"image saved to {path}"
66 | )
67 | break
68 |
69 |
70 | if __name__ == "__main__":
71 | main()
72 |
--------------------------------------------------------------------------------
/examples/ray/ray_hunyuandit_example.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import torch
4 | import torch.distributed
5 | from transformers import T5EncoderModel
6 | from xfuser import xFuserArgs
7 | from xfuser.config import FlexibleArgumentParser
8 | from xfuser.ray.pipeline.pipeline_utils import RayDiffusionPipeline
9 | from xfuser.model_executor.pipelines import xFuserHunyuanDiTPipeline
10 |
11 | def main():
12 | os.environ["MASTER_ADDR"] = "localhost"
13 | os.environ["MASTER_PORT"] = "12355"
14 | parser = FlexibleArgumentParser(description="xFuser Arguments")
15 | args = xFuserArgs.add_cli_args(parser).parse_args()
16 | engine_args = xFuserArgs.from_cli_args(args)
17 | engine_config, input_config = engine_args.create_config()
18 | model_name = engine_config.model_config.model.split("/")[-1]
19 | encoder_kwargs = {
20 | 'text_encoder_2': {
21 | 'model_class': T5EncoderModel,
22 | 'pretrained_model_name_or_path': engine_config.model_config.model,
23 | 'subfolder': 'text_encoder_2',
24 | 'torch_dtype': torch.bfloat16
25 | },
26 | }
27 | # if args.use_fp8_t5_encoder:
28 | # from optimum.quanto import freeze, qfloat8, quantize
29 | # print(f"rank {local_rank} quantizing text encoder 2")
30 | # quantize(text_encoder_2, weights=qfloat8)
31 | # freeze(text_encoder_2)
32 |
33 | pipe = RayDiffusionPipeline.from_pretrained(
34 | PipelineClass=xFuserHunyuanDiTPipeline,
35 | pretrained_model_name_or_path=engine_config.model_config.model,
36 | engine_config=engine_config,
37 | torch_dtype=torch.float16,
38 | **encoder_kwargs
39 | )
40 |
41 | pipe.prepare_run(input_config)
42 |
43 | torch.cuda.reset_peak_memory_stats()
44 | start_time = time.time()
45 | output = pipe(
46 | height=input_config.height,
47 | width=input_config.width,
48 | prompt=input_config.prompt,
49 | num_inference_steps=input_config.num_inference_steps,
50 | output_type=input_config.output_type,
51 | use_resolution_binning=input_config.use_resolution_binning,
52 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
53 | )
54 | end_time = time.time()
55 | elapsed_time = end_time - start_time
56 | print(f"elapsed time:{elapsed_time}")
57 | if not os.path.exists("results"):
58 | os.mkdir("results")
59 |
60 | for _, images in enumerate(output):
61 | if images is not None:
62 | image = images[0]
63 | path = f"./results/{model_name}_ray_result.png"
64 | image.save(path)
65 | print(
66 | f"image saved to {path}"
67 | )
68 | break
69 |
70 |
71 | if __name__ == "__main__":
72 | main()
73 |
--------------------------------------------------------------------------------
/docs/performance/hunyuandit_zh.md:
--------------------------------------------------------------------------------
1 | ## HunyuanDiT Performance
2 |
3 | ## 8xA100 (NVLink)
4 | On an 8xA100 (NVLink) machine, the optimal parallel configuration differs depending on the number of GPUs used, which demonstrates the importance of supporting multiple parallel methods and hybrid parallelism.
5 | The best parallel strategies at different GPU scales are: on 2 GPUs, use `ulysses_degree=2`; on 4 GPUs, use `cfg_parallel=2, ulysses_degree=2`; on 8 GPUs, use `cfg_parallel=2, pipefusion_parallel=4`.
6 |
7 | The speedup from torch.compile is also considerable: with the same parallel configuration it brings a 1.26x to 1.76x improvement, most pronounced in the 8-GPU case at 1.76x.
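
As a rough illustration, the best 8-GPU configuration above (cfg_parallel=2, pipefusion_parallel=4) maps onto the CLI flags used by `examples/run.sh` roughly as follows; this is a minimal launch sketch, and the model path is a placeholder:

```bash
# Hypothetical 8-GPU HunyuanDiT launch: cfg(2) x pipefusion(4) x ulysses(1) x ring(1) = 8 ranks
torchrun --nproc_per_node=8 ./examples/hunyuandit_example.py \
    --model /path/to/HunyuanDiT-v1.2-Diffusers \
    --pipefusion_parallel_degree 4 \
    --ulysses_degree 1 \
    --ring_degree 1 \
    --use_cfg_parallel \
    --height 1024 --width 1024 \
    --num_inference_steps 50 \
    --prompt "a small dog"
```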
8 |
9 |
10 |

12 |
13 |
14 |
15 | The figure below shows the scalability of HunyuanDiT on 8xA100 GPUs. We additionally tested the 2048px image generation task, even though HunyuanDiT is not designed to generate 2048px images.
16 | HunyuanDiT uses a structure in which DiT blocks are interconnected through skip connections: each DiT block is connected both to adjacent blocks and to non-adjacent ones.
17 |
18 | For the 1024px image generation task, the best hybrid parallel configurations are as follows: pipefusion=2 on 2 GPUs; cfg=2, pipefusion=2 on 4 GPUs; cfg=2, pipefusion=4 on 8 GPUs. The number of PipeFusion warmup steps is set to 1.
19 | On 4 and 8 GPUs, hybrid parallelism achieves 1.04x and 1.23x speedups over the best single parallel method, respectively. On 8 GPUs, PipeFusion has lower latency than SP-Ulysses, while on 4 GPUs the two are comparable.
20 | Among all parallel methods, SP-Ring shows the worst scalability.
21 |
22 | For the 2048px image generation task, the best hybrid parallel configuration on 8 GPUs becomes cfg=2, pipefusion=2, ring=2.
23 | Likewise, hybrid parallelism obtains a modest improvement over single parallel methods on 4 and 8 GPUs. However, with 4 or 8 GPUs, PipeFusion exhibits higher latency than SP-Ulysses and SP-Ring, because the skip connections require extra point-to-point communication between GPUs.
24 | This issue is mitigated when the PipeFusion parallel degree is 2, which highlights the importance of choosing an appropriate parallel degree in hybrid configurations.
25 | As the image size increases from 1024px to 2048px, the performance gap between SP-Ring and SP-Ulysses narrows, because the model's communication-to-computation ratio decreases, allowing SP-Ring to hide more of its communication overhead.
26 |
27 |
28 |

30 |
31 |
32 |
33 | ## 8xL40 (PCIe)
34 |
35 | The latency on 8xL40 (PCIe) is shown in the figure below. Again, the best parallel strategy differs at each GPU scale.
36 | Unlike on A100, on L40 the latency with 8 GPUs shows no clear improvement over 4 GPUs. We attribute this to the low cross-socket communication bandwidth over PCIe.
37 |
38 | torch.compile brings a 1.2x to 1.43x speedup.
39 |
40 |
41 |

43 |
44 |
45 | ## 8xL20 (PCIe)
46 |
47 | The latency on 8xL20 (PCIe) is shown in the figure below. The FP16 peak throughput of the L20 is 119.5 TFLOPS, compared with 181.05 TFLOPS for the L40. Nevertheless, on 8 GPUs the L20 achieves lower latency than the L40.
48 |
49 |
50 |

52 |
53 |
54 |
55 | ## 8xV100 (NVLink)
56 |
57 | The speedup on 8xV100 is shown in the figure below. torch.compile brings a 1.10x to 1.30x speedup.
58 |
59 |
60 |

62 |
63 |
64 | ## 4xT4 (PCIe)
65 |
66 | The speedup on 4xT4 is shown in the figure below.
67 |
68 |
69 |

71 |
72 |
--------------------------------------------------------------------------------
/examples/ray/ray_run.sh:
--------------------------------------------------------------------------------
1 | set -x
2 | # If using a Ray cluster across multiple machines, you need to manually start a Ray cluster like this:
3 | # ray start --head --port=6379 for master node
4 | # ray start --address='192.168.1.1:6379' for worker node
5 | # otherwise, it is not necessary. (for single node)
6 |
7 | export PYTHONPATH=$PWD:$PYTHONPATH
8 |
9 | # Select the model type
10 | export MODEL_TYPE="Flux"
11 | # Configuration for different model types
12 | # script, model_id, inference_step
13 | declare -A MODEL_CONFIGS=(
14 | ["Sd3"]="ray_sd3_example.py /cfs/dit/stable-diffusion-3-medium-diffusers 20"
15 | ["Flux"]="ray_flux_example.py /cfs/dit/FLUX.1-dev 28"
16 | ["Pixart-alpha"]="ray_pixartalpha_example.py /cfs/dit/PixArt-XL-2-1024-MS 20"
17 | ["Pixart-sigma"]="ray_pixartsigma_example.py /cfs/dit/PixArt-Sigma-XL-2-1024-MS 20"
18 | ["HunyuanDiT"]="ray_hunyuandit_example.py /cfs/dit/HunyuanDiT-v1.2-Diffusers 50"
19 | )
20 |
21 | if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then
22 | IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}"
23 | export SCRIPT MODEL_ID INFERENCE_STEP
24 | else
25 | echo "Invalid MODEL_TYPE: $MODEL_TYPE"
26 | exit 1
27 | fi
28 |
29 | mkdir -p ./results
30 |
31 | # task args
32 | TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning"
33 |
34 |
35 | N_GPUS=2 # world size
36 | PARALLEL_ARGS="--pipefusion_parallel_degree 1 --ulysses_degree 1 --ring_degree 1"
37 | VAE_PARALLEL_SIZE=1
38 | DIT_PARALLEL_SIZE=1
39 | # CFG_ARGS="--use_cfg_parallel"
40 |
41 | # By default, num_pipeline_patch = pipefusion_degree, and you can tune this parameter to achieve optimal performance.
42 | # PIPEFUSION_ARGS="--num_pipeline_patch 8 "
43 |
44 | # For high-resolution images, we use the latent output type to avoid running the VAE module; this is used for measuring speed.
45 | # OUTPUT_ARGS="--output_type latent"
46 |
47 | # PARALLEL_VAE="--use_parallel_vae"
48 |
49 | # Another compile option is `--use_onediff` which will use onediff's compiler.
50 | # COMPILE_FLAG="--use_torch_compile"
51 |
52 |
53 | # Use this flag to quantize the T5 text encoder, which can reduce memory usage without affecting result quality.
54 | # QUANTIZE_FLAG="--use_fp8_t5_encoder"
55 |
56 | # It is necessary to set CUDA_VISIBLE_DEVICES for the ray driver and workers.
57 | export CUDA_VISIBLE_DEVICES=4,5,6,7
58 |
59 | python ./examples/ray/$SCRIPT \
60 | --model $MODEL_ID \
61 | $PARALLEL_ARGS \
62 | $TASK_ARGS \
63 | $PIPEFUSION_ARGS \
64 | $OUTPUT_ARGS \
65 | --num_inference_steps $INFERENCE_STEP \
66 | --warmup_steps 1 \
67 | --prompt "brown dog laying on the ground with a metal bowl in front of him." \
68 | --use_ray \
69 | --ray_world_size $N_GPUS \
70 | $CFG_ARGS \
71 | $PARALLEL_VAE \
72 | $COMPILE_FLAG \
73 | $QUANTIZE_FLAG \
74 | --use_parallel_vae \
75 | --dit_parallel_size $DIT_PARALLEL_SIZE \
76 | --vae_parallel_size $VAE_PARALLEL_SIZE
77 |
--------------------------------------------------------------------------------
/xfuser/model_executor/schedulers/scheduling_dpm_cogvideox.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Tuple, Union
2 |
3 | import torch
4 | import torch.distributed
5 |
6 | from diffusers.utils.torch_utils import randn_tensor
7 | from diffusers.schedulers.scheduling_dpm_cogvideox import (
8 | CogVideoXDPMScheduler,
9 | DDIMSchedulerOutput,
10 | )
11 |
12 | from .register import xFuserSchedulerWrappersRegister
13 | from .base_scheduler import xFuserSchedulerBaseWrapper
14 |
15 |
16 | @xFuserSchedulerWrappersRegister.register(CogVideoXDPMScheduler)
17 | class xFuserCogVideoXDPMSchedulerWrapper(xFuserSchedulerBaseWrapper):
18 |
19 | @xFuserSchedulerBaseWrapper.check_to_use_naive_step
20 | def step(
21 | self,
22 | *args,
23 | **kwargs,
24 | ) -> Union[DDIMSchedulerOutput, Tuple]:
25 | """
26 | Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
27 | process from the learned model outputs (most often the predicted noise).
28 |
29 | Args:
30 | model_output (`torch.Tensor`):
31 | The direct output from learned diffusion model.
32 | timestep (`float`):
33 | The current discrete timestep in the diffusion chain.
34 | sample (`torch.Tensor`):
35 | A current instance of a sample created by the diffusion process.
36 | eta (`float`):
37 | The weight of noise for added noise in diffusion step.
38 | use_clipped_model_output (`bool`, defaults to `False`):
39 | If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
40 | because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
41 | clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
42 | `use_clipped_model_output` has no effect.
43 | generator (`torch.Generator`, *optional*):
44 | A random number generator.
45 | variance_noise (`torch.Tensor`):
46 | Alternative to generating noise with `generator` by directly providing the noise for the variance
47 | itself. Useful for methods such as [`CycleDiffusion`].
48 | return_dict (`bool`, *optional*, defaults to `True`):
49 | Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`.
50 |
51 | Returns:
52 | [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`:
53 | If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a
54 | tuple is returned where the first element is the sample tensor.
55 |
56 | """
57 | return self.module.step(*args, **kwargs)
58 |
--------------------------------------------------------------------------------
/xfuser/model_executor/schedulers/scheduling_ddim_cogvideox.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Tuple, Union
2 |
3 | import torch
4 | import torch.distributed
5 |
6 | from diffusers.utils.torch_utils import randn_tensor
7 | from diffusers.schedulers.scheduling_ddim_cogvideox import (
8 | CogVideoXDDIMScheduler,
9 | DDIMSchedulerOutput,
10 | )
11 |
12 | from .register import xFuserSchedulerWrappersRegister
13 | from .base_scheduler import xFuserSchedulerBaseWrapper
14 |
15 |
16 | @xFuserSchedulerWrappersRegister.register(CogVideoXDDIMScheduler)
17 | class xFuserCogVideoXDDIMSchedulerWrapper(xFuserSchedulerBaseWrapper):
18 |
19 | @xFuserSchedulerBaseWrapper.check_to_use_naive_step
20 | def step(
21 | self,
22 | *args,
23 | **kwargs,
24 | ) -> Union[DDIMSchedulerOutput, Tuple]:
25 | """
26 | Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
27 | process from the learned model outputs (most often the predicted noise).
28 |
29 | Args:
30 | model_output (`torch.Tensor`):
31 | The direct output from learned diffusion model.
32 | timestep (`float`):
33 | The current discrete timestep in the diffusion chain.
34 | sample (`torch.Tensor`):
35 | A current instance of a sample created by the diffusion process.
36 | eta (`float`):
37 | The weight of noise for added noise in diffusion step.
38 | use_clipped_model_output (`bool`, defaults to `False`):
39 | If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
40 | because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
41 | clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
42 | `use_clipped_model_output` has no effect.
43 | generator (`torch.Generator`, *optional*):
44 | A random number generator.
45 | variance_noise (`torch.Tensor`):
46 | Alternative to generating noise with `generator` by directly providing the noise for the variance
47 | itself. Useful for methods such as [`CycleDiffusion`].
48 | return_dict (`bool`, *optional*, defaults to `True`):
49 | Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`.
50 |
51 | Returns:
52 | [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`:
53 | If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a
54 | tuple is returned where the first element is the sample tensor.
55 |
56 | """
57 | return self.module.step(*args, **kwargs)
58 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 | import subprocess
3 |
4 |
5 | def get_cuda_version():
6 | try:
7 | nvcc_version = subprocess.check_output(["nvcc", "--version"]).decode("utf-8")
8 | version_line = [line for line in nvcc_version.split("\n") if "release" in line][
9 | 0
10 | ]
11 | cuda_version = version_line.split(" ")[-2].replace(",", "")
12 | return "cu" + cuda_version.replace(".", "")
13 | except (subprocess.CalledProcessError, FileNotFoundError):
14 | return "no_cuda"
15 |
16 |
17 | if __name__ == "__main__":
18 | with open("README.md", "r") as f:
19 | long_description = f.read()
20 | fp = open("xfuser/__version__.py", "r").read()
21 | version = eval(fp.strip().split()[-1])
22 |
23 | setup(
24 | name="xfuser",
25 | author="xDiT Team",
26 | author_email="fangjiarui123@gmail.com",
27 | packages=find_packages(),
28 | install_requires=[
29 | "torch>=2.4.1",
30 | "accelerate>=0.33.0",
31 | "transformers>=4.39.1",
32 | "sentencepiece>=0.1.99",
33 | "beautifulsoup4>=4.12.3",
34 | "distvae",
35 | "yunchang>=0.6.0",
36 | "einops",
37 | "diffusers>=0.33.0",
38 | ],
39 | extras_require={
40 | "flash-attn": [
41 | "flash-attn>=2.6.0", # NOTE: flash-attn is necessary if ring_degree > 1
42 | ],
43 | "optimum-quanto": [
44 | "optimum-quanto", # NOTE: optimum-quanto is necessary if use_fp8_t5_encoder is enabled
45 | ],
46 | "torchao": [
47 | "torchao", # NOTE: torchao is necessary if use_fp8_gemms is enabled
48 | ],
49 | "flask": [
50 | "flask", # NOTE: flask is necessary to run xDiT as an http service
51 | ],
52 | "ray": [
53 | "ray", # NOTE: ray is necessary if RayDiffusionPipeline is used
54 | ],
55 | "opencv-python": [
56 | "opencv-python-headless", # NOTE: opencv-python is necessary if ConsisIDPipeline is used
57 | ],
58 | "test": [
59 | "pytest",
60 | "imageio",
61 | "imageio-ffmpeg"
62 | ]
63 | },
64 |         url="https://github.com/xdit-project/xDiT",
65 | description="A Scalable Inference Engine for Diffusion Transformers (DiTs) on Multiple Computing Devices",
66 | long_description=long_description,
67 | long_description_content_type="text/markdown",
68 | version=version,
69 | classifiers=[
70 | "Programming Language :: Python :: 3",
71 | "Operating System :: OS Independent",
72 | ],
73 | include_package_data=True,
74 | python_requires=">=3.10",
75 | )
76 |
--------------------------------------------------------------------------------
/docs/performance/cogvideo_zh.md:
--------------------------------------------------------------------------------
1 | ## CogVideoX Performance
2 |
3 | CogVideoX/CogVideoX1.5 are models that generate video from text or images. xDiT currently integrates USP techniques (including Ulysses attention and Ring attention) and CFG parallelism to improve inference speed, while work on PipeFusion is in progress. We conducted an in-depth analysis of the performance difference between single-GPU CogVideoX inference based on the `diffusers` library and our parallelized version when generating a 49-frame (6-second) 720x480 video. Since different parallel methods can be freely combined to obtain different performance, we systematically tested xDiT's acceleration on 1 to 12 L40 (PCIe) GPUs.
4 |
5 | ### CogVideoX-2b/5b
6 |
7 | As shown in the figure, for the base model CogVideoX-2b, Ulysses Attention, Ring Attention, and Classifier-Free Guidance (CFG) parallelism all yield a significant reduction in inference latency. Notably, CFG parallelism outperforms the other two techniques thanks to its lower communication overhead. By combining sequence parallelism with CFG parallelism, we further improve inference efficiency, and latency keeps decreasing as the degree of parallelism grows. Under the optimal configuration, xDiT achieves a 4.29x speedup over single-GPU inference, bringing each iteration down to 0.49 seconds. Given CogVideoX's default 50 iterations, the denoising loop takes 24.5 seconds in total, so end-to-end generation of the 49-frame video finishes in about 30 seconds.
8 |
9 |
10 |

12 |
13 |
14 | For the more complex CogVideoX-5b model, the larger parameter count improves video quality and visual fidelity at a significantly higher computational cost. Nevertheless, all methods show performance trends on this model similar to CogVideoX-2b, and the speedups of the parallel versions are even larger. Compared with the single-GPU version, xDiT achieves up to a 7.75x inference speedup, reducing end-to-end video generation time to about 40 seconds.
15 |
16 |
17 |

19 |
20 |
21 | On systems equipped with A100 GPUs, xDiT shows similar speedups for CogVideoX-2b and CogVideoX-5b, as illustrated in the two figures below.
22 |
23 |
24 |

26 |
27 |
28 |
29 |
30 |

32 |
33 |
34 | ### CogVideoX1.5-5B
35 |
36 | Similarly, on a system equipped with L40 (PCIe) GPUs, we used CogVideoX1.5-5B to generate 161-frame videos at 1360x768 resolution and compared the inference latency of the single-GPU implementation in the diffusers library with xDiT's parallel version.
37 | As shown in the figure, Ulysses Attention, Ring Attention, and CFG parallelism all reduce xDiT's inference latency. With 2 GPUs, CFG parallelism performs better than Ulysses Attention and Ring Attention because of its smaller communication volume. Combining sequence parallelism with CFG parallelism further improves inference efficiency, and latency keeps decreasing as the degree of parallelism grows. On 8 GPUs, the hybrid Ulysses-2, Ring-2, CFG-2 configuration gives xDiT its best performance (see the launch sketch below), achieving a 6.12x speedup over single-GPU inference and generating a video in under 10 minutes.
38 |
39 |
40 |

42 |
43 |
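
The launch sketch referenced above, written with the flag conventions of `examples/run.sh`; the model path is a placeholder and the video-length flag is an assumption (check `xFuserArgs` for the exact name):

```bash
# Hypothetical 8-GPU CogVideoX1.5-5B launch: ulysses(2) x ring(2) x cfg(2) = 8 ranks
torchrun --nproc_per_node=8 ./examples/cogvideox_example.py \
    --model /path/to/CogVideoX1.5-5B \
    --ulysses_degree 2 \
    --ring_degree 2 \
    --use_cfg_parallel \
    --height 768 --width 1360 \
    --num_frames 161 \
    --num_inference_steps 50 \
    --prompt "a small cat playing with a ball"
```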
44 | We further compared xDiT's speedups for generating 81-frame 1360x768 videos on H20 and L20 GPUs. As the figures below show, the inference latency of CogVideoX1.5-5B is very similar on the two devices; however, since the H20 is more expensive than the L20, the L20 offers better cost-effectiveness.
45 |
46 |
47 |
48 |

50 |
51 |
52 |
53 |
54 |

56 |
57 |
--------------------------------------------------------------------------------
/xfuser/model_executor/pipelines/register.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Type, Union
2 | from diffusers.pipelines.pipeline_utils import DiffusionPipeline
3 |
4 | from xfuser.logger import init_logger
5 | from .base_pipeline import xFuserPipelineBaseWrapper
6 |
7 | logger = init_logger(__name__)
8 |
9 | class xFuserPipelineWrapperRegister:
10 | _XFUSER_PIPE_MAPPING: Dict[
11 | Type[DiffusionPipeline],
12 | Type[xFuserPipelineBaseWrapper]
13 | ] = {}
14 |
15 | @classmethod
16 | def register(cls, origin_pipe_class: Type[DiffusionPipeline]):
17 | def decorator(xfuser_pipe_class: Type[xFuserPipelineBaseWrapper]):
18 | if not issubclass(xfuser_pipe_class, xFuserPipelineBaseWrapper):
19 | raise ValueError(f"{xfuser_pipe_class} is not a subclass of"
20 | f" xFuserPipelineBaseWrapper")
21 | cls._XFUSER_PIPE_MAPPING[origin_pipe_class] = \
22 | xfuser_pipe_class
23 | return xfuser_pipe_class
24 | return decorator
25 |
26 | @classmethod
27 | def get_class(
28 | cls,
29 | pipe: Union[DiffusionPipeline, Type[DiffusionPipeline]]
30 | ) -> Type[xFuserPipelineBaseWrapper]:
31 | if isinstance(pipe, type):
32 | candidate = None
33 | candidate_origin = None
34 | for (origin_model_class,
35 | xfuser_model_class) in cls._XFUSER_PIPE_MAPPING.items():
36 | if issubclass(pipe, origin_model_class):
37 | if ((candidate is None and candidate_origin is None) or
38 | issubclass(origin_model_class, candidate_origin)):
39 | candidate_origin = origin_model_class
40 | candidate = xfuser_model_class
41 | if candidate is None:
42 | raise ValueError(f"Diffusion Pipeline class {pipe} "
43 | f"is not supported by xFuser")
44 | else:
45 | return candidate
46 | elif isinstance(pipe, DiffusionPipeline):
47 | candidate = None
48 | candidate_origin = None
49 | for (origin_model_class,
50 | xfuser_model_class) in cls._XFUSER_PIPE_MAPPING.items():
51 | if isinstance(pipe, origin_model_class):
52 | if ((candidate is None and candidate_origin is None) or
53 | issubclass(origin_model_class, candidate_origin)):
54 | candidate_origin = origin_model_class
55 | candidate = xfuser_model_class
56 |
57 | if candidate is None:
58 | raise ValueError(f"Diffusion Pipeline class {pipe.__class__} "
59 | f"is not supported by xFuser")
60 | else:
61 | return candidate
62 | else:
63 | raise ValueError(f"Unsupported type {type(pipe)} for pipe")
--------------------------------------------------------------------------------
/docs/performance/consisid.md:
--------------------------------------------------------------------------------
1 | ## ConsisID Performance Report
2 |
3 | [ConsisID](https://github.com/PKU-YuanGroup/ConsisID) is an identity-preserving text-to-video generation model that keeps the face consistent in the generated video through frequency decomposition. xDiT currently integrates USP techniques, including Ulysses Attention, Ring Attention, and CFG parallelization, to enhance inference speed, while work on PipeFusion is ongoing. We conducted an in-depth analysis comparing single-GPU ConsisID inference, based on the diffusers library, with our proposed parallelized version for generating 49 frames (6 seconds) of 720x480 resolution video. By flexibly combining different parallelization methods, we achieved varying performance outcomes. In this study, we systematically evaluate xDiT's acceleration performance across 1 to 6 NVIDIA H100 GPUs.
4 |
5 | As shown in the table, the ConsisID model achieves a significant reduction in inference latency with Ulysses Attention, Ring Attention, or Classifier-Free Guidance (CFG) parallelization. Notably, CFG parallelization outperforms the other two techniques due to its lower communication overhead. By combining sequence parallelization and CFG parallelization, inference efficiency was further improved. With increased parallelism, inference latency continued to decrease. Under the optimal configuration, xDiT achieved a 3.21× speedup over single-GPU inference, reducing iteration time to just 0.72 seconds. For the default 50 iterations of ConsisID, this enables end-to-end generation of 49 frames in 35 seconds, with a GPU memory usage of 40 GB.
6 |
7 | ### 720x480 Resolution (49 frames, 50 steps)
8 |
9 |
10 | | N-GPUs | Ulysses Degree | Ring Degree | Cfg Parallel | Times |
11 | | :----: | :------------: | :---------: | :----------: | :-----: |
12 | | 6 | 2 | 3 | 1 | 44.89s |
13 | | 6 | 3 | 2 | 1 | 44.24s |
14 | | 6 | 1 | 3 | 2 | 35.78s |
15 | | 6 | 3 | 1 | 2 | 38.35s |
16 | | 4 | 2 | 1 | 2 | 41.37s |
17 | | 4 | 1 | 2 | 2 | 40.68s |
18 | | 3 | 3 | 1 | 1 | 53.57s |
19 | | 3 | 1 | 3 | 1 | 55.51s |
20 | | 2 | 1 | 2 | 1 | 70.19s |
21 | | 2 | 2 | 1 | 1 | 76.56s |
22 | | 2 | 1 | 1 | 2 | 59.72s |
23 | | 1 | 1 | 1 | 1 | 114.87s |
24 |
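As a sketch, the fastest 6-GPU row above (Ulysses 1, Ring 3, CFG parallel 2) corresponds roughly to the launch below, following the flag conventions of `examples/run.sh`; the example script name and model path are placeholders (see `examples/run_consisid_usp.sh` for the actual entry point):

```bash
# Hypothetical 6-GPU ConsisID launch: ulysses(1) x ring(3) x cfg(2) = 6 ranks
torchrun --nproc_per_node=6 ./examples/<consisid_example>.py \
    --model /path/to/ConsisID-preview \
    --ulysses_degree 1 \
    --ring_degree 3 \
    --use_cfg_parallel \
    --num_inference_steps 50 \
    --prompt "a man is talking to the camera"
```
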
25 | ## Resources
26 |
27 | Learn more about ConsisID with the following resources.
28 | - A [video](https://www.youtube.com/watch?v=PhlgC-bI5SQ) demonstrating ConsisID's main features.
29 | - The research paper, [Identity-Preserving Text-to-Video Generation by Frequency Decomposition](https://hf.co/papers/2411.17440) for more details.
30 |
--------------------------------------------------------------------------------
/examples/ray/ray_flux_example.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import torch
4 | import torch.distributed
5 | from transformers import T5EncoderModel
6 | from xfuser import xFuserArgs
7 | from xfuser.ray.pipeline.pipeline_utils import RayDiffusionPipeline
8 | from xfuser.config import FlexibleArgumentParser
9 | from xfuser.model_executor.pipelines import xFuserFluxPipeline
10 |
11 | def main():
12 | os.environ["MASTER_ADDR"] = "localhost"
13 | os.environ["MASTER_PORT"] = "12355"
14 | parser = FlexibleArgumentParser(description="xFuser Arguments")
15 | args = xFuserArgs.add_cli_args(parser).parse_args()
16 | engine_args = xFuserArgs.from_cli_args(args)
17 | engine_config, input_config = engine_args.create_config()
18 | engine_config.runtime_config.dtype = torch.bfloat16
19 | model_name = engine_config.model_config.model.split("/")[-1]
20 | PipelineClass = xFuserFluxPipeline
21 | # equal to
22 | # text_encoder_2 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_2", torch_dtype=torch.bfloat16)
23 | # but load encoder in worker
24 | encoder_kwargs = {
25 | 'text_encoder_2': {
26 | 'model_class': T5EncoderModel,
27 | 'pretrained_model_name_or_path': engine_config.model_config.model,
28 | 'subfolder': 'text_encoder_2',
29 | 'torch_dtype': torch.bfloat16
30 | },
31 | }
32 | # if args.use_fp8_t5_encoder:
33 | # from optimum.quanto import freeze, qfloat8, quantize
34 | # quantize(text_encoder_2, weights=qfloat8)
35 | # freeze(text_encoder_2)
36 |
37 | pipe = RayDiffusionPipeline.from_pretrained(
38 | PipelineClass=PipelineClass,
39 | pretrained_model_name_or_path=engine_config.model_config.model,
40 | engine_config=engine_config,
41 | torch_dtype=torch.bfloat16,
42 | **encoder_kwargs
43 | )
44 | pipe.prepare_run(input_config)
45 |
46 | start_time = time.time()
47 | output = pipe(
48 | height=input_config.height,
49 | width=input_config.width,
50 | prompt=input_config.prompt,
51 | num_inference_steps=input_config.num_inference_steps,
52 | output_type=input_config.output_type,
53 | max_sequence_length=256,
54 | guidance_scale=0.0,
55 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
56 | )
57 | end_time = time.time()
58 | elapsed_time = end_time - start_time
59 | print(f"elapsed time:{elapsed_time}")
60 | if not os.path.exists("results"):
61 | os.mkdir("results")
62 |
63 | for _, images in enumerate(output):
64 | if images is not None:
65 | image = images[0]
66 | path = f"./results/{model_name}_ray_result.png"
67 | image.save(path)
68 | print(
69 | f"image saved to {path}"
70 | )
71 | break
72 |
73 |
74 | if __name__ == "__main__":
75 | main()
76 |
--------------------------------------------------------------------------------
/xfuser/model_executor/patch/unet_patch.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.distributed as dist
3 | from typing import Union, Optional, Dict
4 | from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput
5 |
6 | def unet_cfg_parallel_monkey_patch_forward(
7 | self,
8 | sample: torch.Tensor,
9 | timestep: Union[torch.Tensor, float, int],
10 | encoder_hidden_states: torch.Tensor,
11 | added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
12 | return_dict: bool = True,
13 | *args,
14 | **kwargs
15 | ):
16 | assert dist.is_initialized(), "Distributed training is not initialized"
17 |
18 | # Initialize output_buffer and buffer_list as instance attributes if they don't exist
19 | if not hasattr(self, 'output_buffer'):
20 | self.output_buffer = None
21 | if not hasattr(self, 'buffer_list'):
22 | self.buffer_list = None
23 |
24 | b, c, h, w = sample.shape
25 | original_forward = type(self).forward
26 |
27 | rank = dist.get_rank()
28 | sample = sample[rank:rank+1]
29 | timestep = timestep[rank:rank+1] if torch.is_tensor(timestep) and timestep.ndim > 0 else timestep
30 | encoder_hidden_states = encoder_hidden_states[rank:rank+1]
31 | if added_cond_kwargs is not None:
32 | new_added_cond_kwargs = {}
33 | for k in added_cond_kwargs:
34 | new_added_cond_kwargs[k] = added_cond_kwargs[k][rank : rank + 1]
35 | added_cond_kwargs = new_added_cond_kwargs
36 |
37 | output = original_forward(
38 | self,
39 | sample=sample,
40 | timestep=timestep,
41 | encoder_hidden_states=encoder_hidden_states,
42 | added_cond_kwargs=added_cond_kwargs,
43 | return_dict=False,
44 | *args,
45 | **kwargs
46 | )[0]
47 |
48 | world_size = dist.get_world_size()
49 | assert world_size == 2, f"world_size is {world_size}, expected 2 in unet_cfg_parallel_monkey_patch_forward"
50 |
51 | if self.output_buffer is None:
52 | self.output_buffer = torch.empty((b, c, h, w), device=output.device, dtype=output.dtype)
53 | if self.buffer_list is None:
54 | self.buffer_list = [torch.empty_like(output) for _ in range(world_size)]
55 |
56 | dist.all_gather(self.buffer_list, output.contiguous(), async_op=False)
57 | torch.cat(self.buffer_list[: 1], dim=2, out=self.output_buffer[0:1])
58 | torch.cat(self.buffer_list[1 :], dim=2, out=self.output_buffer[1:2])
59 | output = self.output_buffer
60 |
61 | if return_dict:
62 | output = UNet2DConditionOutput(sample=output)
63 | else:
64 | output = (output,)
65 | return output
66 |
67 | def apply_unet_cfg_parallel_monkey_patch(pipe):
68 | """Apply the monkey patch to the pipeline's UNet if world size is 2."""
69 | import types
70 | world_size = dist.get_world_size()
71 | if world_size == 2:
72 | pipe.unet.forward = types.MethodType(unet_cfg_parallel_monkey_patch_forward, pipe.unet)
73 | return pipe
--------------------------------------------------------------------------------
/examples/ray/ray_sd3_example.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import torch
4 | import torch.distributed
5 | from transformers import T5EncoderModel
6 | from xfuser import xFuserArgs
7 | from xfuser.ray.pipeline.pipeline_utils import RayDiffusionPipeline
8 | from xfuser.config import FlexibleArgumentParser
9 | from xfuser.model_executor.pipelines import xFuserStableDiffusion3Pipeline
10 |
11 | def main():
12 | os.environ["MASTER_ADDR"] = "localhost"
13 | os.environ["MASTER_PORT"] = "12355"
14 | parser = FlexibleArgumentParser(description="xFuser Arguments")
15 | args = xFuserArgs.add_cli_args(parser).parse_args()
16 | engine_args = xFuserArgs.from_cli_args(args)
17 | engine_config, input_config = engine_args.create_config()
18 | model_name = engine_config.model_config.model.split("/")[-1]
19 | PipelineClass = xFuserStableDiffusion3Pipeline
20 |
21 | # equal to
22 | # text_encoder_3 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_3", torch_dtype=torch.float16)
23 | # but load encoder in worker
24 | encoder_kwargs = {
25 | 'text_encoder_3': {
26 | 'model_class': T5EncoderModel,
27 | 'pretrained_model_name_or_path': engine_config.model_config.model,
28 | 'subfolder': 'text_encoder_3',
29 | 'torch_dtype': torch.float16
30 | },
31 | }
32 |
33 | # if args.use_fp8_t5_encoder:
34 | # from optimum.quanto import freeze, qfloat8, quantize
35 | # print(f"rank {local_rank} quantizing text encoder 2")
36 | # quantize(text_encoder_3, weights=qfloat8)
37 | # freeze(text_encoder_3)
38 |
39 | pipe = RayDiffusionPipeline.from_pretrained(
40 | PipelineClass=PipelineClass,
41 | pretrained_model_name_or_path=engine_config.model_config.model,
42 | engine_config=engine_config,
43 | torch_dtype=torch.float16,
44 | **encoder_kwargs
45 | )
46 | pipe.prepare_run(input_config)
47 |
48 | torch.cuda.reset_peak_memory_stats()
49 | start_time = time.time()
50 | output = pipe(
51 | height=input_config.height,
52 | width=input_config.width,
53 | prompt=input_config.prompt,
54 | num_inference_steps=input_config.num_inference_steps,
55 | output_type=input_config.output_type,
56 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
57 | )
58 | end_time = time.time()
59 | elapsed_time = end_time - start_time
60 | print(f"elapsed time:{elapsed_time}")
61 | if not os.path.exists("results"):
62 | os.mkdir("results")
63 |
64 | for _, images in enumerate(output):
65 | if images is not None:
66 | image = images[0]
67 | path = f"./results/{model_name}_ray_result.png"
68 | image.save(path)
69 | print(
70 | f"image saved to {path}"
71 | )
72 | break
73 |
74 |
75 | if __name__ == "__main__":
76 | main()
--------------------------------------------------------------------------------
/xfuser/model_executor/schedulers/scheduling_ddim.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Tuple, Union
2 |
3 | import torch
4 | import torch.distributed
5 |
6 | from diffusers.utils.torch_utils import randn_tensor
7 | from diffusers.schedulers.scheduling_ddim import (
8 | DDIMScheduler,
9 | DDIMSchedulerOutput,
10 | )
11 |
12 | from xfuser.core.distributed import (
13 | get_pipeline_parallel_world_size,
14 | get_sequence_parallel_world_size,
15 | get_runtime_state,
16 | )
17 | from .register import xFuserSchedulerWrappersRegister
18 | from .base_scheduler import xFuserSchedulerBaseWrapper
19 |
20 |
21 | @xFuserSchedulerWrappersRegister.register(DDIMScheduler)
22 | class xFuserDDIMSchedulerWrapper(xFuserSchedulerBaseWrapper):
23 |
24 | @xFuserSchedulerBaseWrapper.check_to_use_naive_step
25 | def step(
26 | self,
27 | *args,
28 | **kwargs,
29 | ) -> Union[DDIMSchedulerOutput, Tuple]:
30 | """
31 | Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
32 | process from the learned model outputs (most often the predicted noise).
33 |
34 | Args:
35 | model_output (`torch.Tensor`):
36 | The direct output from learned diffusion model.
37 | timestep (`float`):
38 | The current discrete timestep in the diffusion chain.
39 | sample (`torch.Tensor`):
40 | A current instance of a sample created by the diffusion process.
41 | eta (`float`):
42 | The weight of noise for added noise in diffusion step.
43 | use_clipped_model_output (`bool`, defaults to `False`):
44 | If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
45 | because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
46 | clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
47 | `use_clipped_model_output` has no effect.
48 | generator (`torch.Generator`, *optional*):
49 | A random number generator.
50 | variance_noise (`torch.Tensor`):
51 | Alternative to generating noise with `generator` by directly providing the noise for the variance
52 | itself. Useful for methods such as [`CycleDiffusion`].
53 | return_dict (`bool`, *optional*, defaults to `True`):
54 | Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`.
55 |
56 | Returns:
57 | [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`:
58 | If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a
59 | tuple is returned where the first element is the sample tensor.
60 |
61 | """
62 | return self.module.step(*args, **kwargs)
63 |
--------------------------------------------------------------------------------
/benchmark/fid/pixartalpha_generate.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import torch
3 | import torch.distributed
4 | import json, os
5 | from xfuser import xFuserPixArtAlphaPipeline, xFuserArgs
6 | from xfuser.config import FlexibleArgumentParser
7 | from xfuser.core.distributed import (
8 | get_world_group,
9 | get_runtime_state
10 | )
11 | import gc
12 |
13 |
14 | _NUM_FID_CANDIDATE = 30000
15 | CFG = 2.0
16 |
17 | def flush():
18 | gc.collect()
19 | torch.cuda.empty_cache()
20 |
21 | def main():
22 | parser = FlexibleArgumentParser(description='xFuser Arguments')
23 | parser.add_argument('--caption_file', type=str, default='captions_coco.json')
24 | parser.add_argument('--sample_images_folder', type=str, default='sample_images')
25 | args = xFuserArgs.add_cli_args(parser).parse_args()
26 | engine_args = xFuserArgs.from_cli_args(args)
27 | engine_config, input_config = engine_args.create_config()
28 | local_rank = get_world_group().local_rank
29 |
30 | pipe = xFuserPixArtAlphaPipeline.from_pretrained(
31 | pretrained_model_name_or_path=engine_config.model_config.model,
32 | engine_config=engine_config,
33 | torch_dtype=torch.float16,
34 | ).to(f"cuda:{local_rank}")
35 |
36 | if args.enable_sequential_cpu_offload:
37 | pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
38 | logging.info(f'rank {local_rank} sequential CPU offload enabled')
39 | else:
40 | pipe = pipe.to(f'cuda:{local_rank}')
41 |
42 | pipe.prepare_run(input_config, steps=1)
43 |
44 | with open(args.caption_file) as f:
45 | raw_captions = json.load(f)
46 |
47 | raw_captions = raw_captions['images'][:_NUM_FID_CANDIDATE]
48 | captions = list(map(lambda x: x['sentences'][0]['raw'], raw_captions))
49 | filenames = list(map(lambda x: x['filename'], raw_captions))
50 |
51 | folder_path = args.sample_images_folder
52 | if not os.path.exists(folder_path):
53 | os.makedirs(folder_path)
54 |
55 | # run multiple prompts at a time to save time
56 | num_prompt_one_step = 120
57 | for j in range(0, _NUM_FID_CANDIDATE, num_prompt_one_step):
58 | output = pipe(
59 | height=256,
60 | width=256,
61 | prompt=captions[j:j+num_prompt_one_step],
62 | num_inference_steps=input_config.num_inference_steps,
63 | output_type=input_config.output_type,
64 | max_sequence_length=256,
65 | guidance_scale=CFG,
66 | generator=torch.Generator(device='cuda').manual_seed(input_config.seed),
67 | )
68 | if input_config.output_type == 'pil':
69 | if pipe.is_dp_last_group():
70 | for k, local_filename in enumerate(filenames[j:j+num_prompt_one_step]):
71 | output.images[k].save(f'{folder_path}/{local_filename}')
72 | print(f'{j}-{j+num_prompt_one_step-1} generation finished!')
73 | flush()
74 |
75 | get_runtime_state().destroy_distributed_env()
76 |
77 |
78 | if __name__ == '__main__':
79 | main()
80 |
--------------------------------------------------------------------------------
/benchmark/fid/flux_generate.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import torch
3 | import torch.distributed
4 | import json, os
5 | from xfuser import xFuserFluxPipeline, xFuserArgs
6 | from xfuser.config import FlexibleArgumentParser
7 | from xfuser.core.distributed import (
8 | get_world_group,
9 | get_runtime_state
10 | )
11 | import gc
12 |
13 |
14 | _NUM_FID_CANDIDATE = 30000
15 | CFG = 1.5
16 |
17 | def flush():
18 | gc.collect()
19 | torch.cuda.empty_cache()
20 |
21 | def main():
22 | parser = FlexibleArgumentParser(description='xFuser Arguments')
23 | parser.add_argument('--caption_file', type=str, default='captions_coco.json')
24 | parser.add_argument('--sample_images_folder', type=str, default='sample_images')
25 | args = xFuserArgs.add_cli_args(parser).parse_args()
26 | engine_args = xFuserArgs.from_cli_args(args)
27 | engine_config, input_config = engine_args.create_config()
28 | engine_config.runtime_config.dtype = torch.bfloat16
29 | local_rank = get_world_group().local_rank
30 |
31 | pipe = xFuserFluxPipeline.from_pretrained(
32 | pretrained_model_name_or_path=engine_config.model_config.model,
33 | engine_config=engine_config,
34 | torch_dtype=torch.bfloat16,
35 | )
36 |
37 | if args.enable_sequential_cpu_offload:
38 | pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
39 | logging.info(f'rank {local_rank} sequential CPU offload enabled')
40 | else:
41 | pipe = pipe.to(f'cuda:{local_rank}')
42 |
43 | pipe.prepare_run(input_config, steps=1)
44 |
45 | with open(args.caption_file) as f:
46 | raw_captions = json.load(f)
47 |
48 | raw_captions = raw_captions['images'][:_NUM_FID_CANDIDATE]
49 | captions = list(map(lambda x: x['sentences'][0]['raw'], raw_captions))
50 | filenames = list(map(lambda x: x['filename'], raw_captions))
51 |
52 | folder_path = args.sample_images_folder
53 | if not os.path.exists(folder_path):
54 | os.makedirs(folder_path)
55 |
56 | # run multiple prompts at a time to save time
57 | num_prompt_one_step = 120
58 | for j in range(0, _NUM_FID_CANDIDATE, num_prompt_one_step):
59 | output = pipe(
60 | height=256,
61 | width=256,
62 | prompt=captions[j:j+num_prompt_one_step],
63 | num_inference_steps=input_config.num_inference_steps,
64 | output_type=input_config.output_type,
65 | max_sequence_length=256,
66 | guidance_scale=CFG,
67 | generator=torch.Generator(device='cuda').manual_seed(input_config.seed),
68 | )
69 | if input_config.output_type == 'pil':
70 | if pipe.is_dp_last_group():
71 | for k, local_filename in enumerate(filenames[j:j+num_prompt_one_step]):
72 | output.images[k].save(f'{folder_path}/{local_filename}')
73 | print(f'{j}-{j+num_prompt_one_step-1} generation finished!')
74 | flush()
75 |
76 | get_runtime_state().destroy_distributed_env()
77 |
78 |
79 | if __name__ == '__main__':
80 | main()
81 |
--------------------------------------------------------------------------------
/benchmark/fid/compute_fid.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import time
4 | from cleanfid import fid
5 | from pathlib import Path
6 |
7 | def setup_logging():
8 | """Setup logging configuration"""
9 | logging.basicConfig(
10 | level=logging.INFO,
11 | format='%(asctime)s - %(levelname)s - %(message)s',
12 | handlers=[
13 | logging.StreamHandler(),
14 | logging.FileHandler('fid_computation.log')
15 | ]
16 | )
17 |
18 | def compute_fid_score(ref_path: str, sample_path: str, device: str = "cuda") -> float:
19 | """
20 | Compute FID score
21 |
22 | Args:
23 | ref_path: Path to ref images directory
24 | sample_path: Path to sample images directory
25 | device: Computing device ('cuda' or 'cpu')
26 |
27 | Returns:
28 | float: FID score
29 |
30 | Raises:
31 | ValueError: If directory does not exist
32 | """
33 | # Check if paths exist
34 | ref_dir = Path(ref_path)
35 | gen_dir = Path(sample_path)
36 |
37 | if not ref_dir.exists():
38 | raise ValueError(f"ref images directory does not exist: {ref_path}")
39 | if not gen_dir.exists():
40 | raise ValueError(f"sample images directory does not exist: {sample_path}")
41 |
42 | logging.info(f"Starting FID score computation")
43 | logging.info(f"ref images directory: {ref_path}")
44 | logging.info(f"sample images directory: {sample_path}")
45 | logging.info(f"Using device: {device}")
46 |
47 | start_time = time.time()
48 |
49 | try:
50 | score = fid.compute_fid(
51 | ref_path,
52 | sample_path,
53 | device=device,
54 | num_workers=8 # Can be adjusted as needed
55 | )
56 |
57 | elapsed_time = time.time() - start_time
58 | logging.info(f"FID computation completed, time elapsed: {elapsed_time:.2f} seconds")
59 | return score
60 |
61 | except Exception as e:
62 | logging.error(f"Error occurred during FID computation: {str(e)}")
63 | raise
64 |
65 | def main():
66 | # Setup command line arguments
67 | parser = argparse.ArgumentParser(description='Compute FID score')
68 | parser.add_argument('--ref', type=str, required=True,
69 | help='Path to ref images directory')
70 | parser.add_argument('--sample', type=str, required=True,
71 | help='Path to sample images directory')
72 | parser.add_argument('--device', type=str, default="cuda",
73 | choices=['cuda', 'cpu'], help='Computing device')
74 |
75 | args = parser.parse_args()
76 |
77 | # Setup logging
78 | setup_logging()
79 |
80 | try:
81 | # Compute FID
82 | score = compute_fid_score(args.ref, args.sample, args.device)
83 |
84 | # Output result
85 | logging.info(f"FID score: {score:.4f}")
86 |
87 | except Exception as e:
88 | logging.error(f"Program execution failed: {str(e)}")
89 | return 1
90 |
91 | return 0
92 |
93 | if __name__ == "__main__":
94 | exit(main())
--------------------------------------------------------------------------------
/docs/methods/pipefusion.md:
--------------------------------------------------------------------------------
1 | ## PipeFusion: Displaced Patch Pipeline Parallelism for Diffusion Models
2 | [Chinese Blog 1](https://zhuanlan.zhihu.com/p/699612077); [Chinese Blog 2](https://zhuanlan.zhihu.com/p/706475158)
3 |
4 | PipeFusion is an innovative method first proposed by us.
5 | It is a sequence-level pipeline parallel method similar to [TeraPipe](https://proceedings.mlr.press/v139/li21y.html), and it demonstrates significant advantages on weakly interconnected network hardware such as PCIe and Ethernet.
6 |
7 | PipeFusion innovatively harnesses input temporal redundancy, i.e., the similarity between inputs and activations across diffusion steps, a diffusion-specific characteristic also exploited by DistriFusion. PipeFusion not only reduces communication volume but also streamlines TeraPipe-style pipeline parallelism, avoiding the load balancing issues inherent in LLMs with Causal Attention.
8 | It significantly surpasses other methods in communication efficiency, particularly in multi-node setups connected via Ethernet and multi-GPU configurations linked with PCIe.
9 |
10 |
11 |

12 |
13 |
14 | The above picture compares DistriFusion and PipeFusion.
15 | (a) DistriFusion replicates DiT parameters on two devices.
16 | It splits an image into 2 patches and employs asynchronous allgather for activations of every layer.
17 | (b) PipeFusion shards DiT parameters on two devices.
18 | It splits an image into 4 patches and employs asynchronous P2P for activations across two devices.
19 |
20 | We briefly explain the workflow of PipeFusion. It partitions an input image into $M$ non-overlapping patches.
21 | The DiT network, consisting of $L$ layers, is partitioned into $N$ stages ($N < L$), which are sequentially assigned to $N$ computational devices.
22 | Note that $M$ and $N$ can be unequal, which is different from the image-splitting approaches used in sequence parallelism and DistriFusion.
23 | Each device processes the computation task for one patch of its assigned stage in a pipelined manner.
24 |
25 | The PipeFusion pipeline workflow when $M=N=4$ is shown in the following picture.
26 |
27 |
28 |

29 |
30 |
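As a hedged sketch, the $M=N=4$ setting above corresponds roughly to the following launch, using the flags from `examples/run.sh` (`--pipefusion_parallel_degree` for the number of stages $N$ and `--num_pipeline_patch` for the number of patches $M$); the model path is a placeholder:

```bash
# Hypothetical 4-GPU PipeFusion launch: N=4 pipeline stages, M=4 image patches
torchrun --nproc_per_node=4 ./examples/pixartalpha_example.py \
    --model /path/to/PixArt-XL-2-1024-MS \
    --pipefusion_parallel_degree 4 \
    --num_pipeline_patch 4 \
    --warmup_steps 1 \
    --height 1024 --width 1024 \
    --num_inference_steps 20 \
    --prompt "a small dog"
```
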
31 |
32 | We have evaluated the accuracy of PipeFusion, DistriFusion and the baseline, as shown below. To conduct the FID experiment, follow the detailed instructions provided in the [documentation](../../docs/fid/FID.md).
33 |
34 |
35 |

36 |
37 |
38 |
39 | For more details, please refer to the following paper.
40 |
41 | ```
42 | @article{wang2024pipefusion,
43 | title={PipeFusion: Displaced Patch Pipeline Parallelism for Inference of Diffusion Transformer Models},
44 | author={Jiannan Wang and Jiarui Fang and Jinzhe Pan and Aoyu Li and PengCheng Yang},
45 | year={2024},
46 | eprint={2405.07719},
47 | archivePrefix={arXiv},
48 | primaryClass={cs.CV}
49 | }
50 | ```
51 |
52 |
--------------------------------------------------------------------------------
/examples/latte_example.py:
--------------------------------------------------------------------------------
1 | import time
2 | import torch
3 | import torch.distributed
4 | from diffusers import AutoencoderKLTemporalDecoder
5 | from xfuser import xFuserLattePipeline, xFuserArgs
6 | from xfuser.config import FlexibleArgumentParser
7 | from xfuser.core.distributed import (
8 | get_world_group,
9 | get_data_parallel_rank,
10 | get_data_parallel_world_size,
11 | get_runtime_state,
12 | is_dp_last_group,
13 | )
14 | import imageio
15 |
16 |
17 | def main():
18 | parser = FlexibleArgumentParser(description="xFuser Arguments")
19 | args = xFuserArgs.add_cli_args(parser).parse_args()
20 | engine_args = xFuserArgs.from_cli_args(args)
21 | engine_config, input_config = engine_args.create_config()
22 | local_rank = get_world_group().local_rank
23 | pipe = xFuserLattePipeline.from_pretrained(
24 | pretrained_model_name_or_path=engine_config.model_config.model,
25 | engine_config=engine_config,
26 | torch_dtype=torch.float16,
27 | ).to(f"cuda:{local_rank}")
28 | # pipe.latte_prepare_run(input_config)
29 |
30 | vae = AutoencoderKLTemporalDecoder.from_pretrained(
31 | engine_config.model_config.model,
32 | subfolder="vae_temporal_decoder",
33 | torch_dtype=torch.float16,
34 | ).to(f"cuda:{local_rank}")
35 | pipe.vae = vae
36 |
37 | torch.cuda.reset_peak_memory_stats()
38 | start_time = time.time()
39 | output = pipe(
40 | height=input_config.height,
41 | width=input_config.width,
42 | video_length=16,
43 | prompt=input_config.prompt,
44 | num_inference_steps=input_config.num_inference_steps,
45 | output_type="pt",
46 | guidance_scale=input_config.guidance_scale,
47 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
48 | )
49 | end_time = time.time()
50 | elapsed_time = end_time - start_time
51 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
52 |
53 | parallel_info = (
54 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
55 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
56 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
57 | )
58 | if is_dp_last_group():
59 | videos = output.frames.cpu()
60 | global_rank = get_world_group().rank
61 | dp_group_world_size = get_data_parallel_world_size()
62 | dp_group_index = global_rank // dp_group_world_size
63 | num_dp_groups = engine_config.parallel_config.dp_degree
64 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
65 | if input_config.num_frames > 1:
66 | videos = (videos.clamp(0, 1) * 255).to(
67 | dtype=torch.uint8
68 | ) # convert to uint8
69 | imageio.mimwrite(
70 | "./latte_output.mp4", videos[0].permute(0, 2, 3, 1), fps=8, quality=5
71 | ) # highest quality is 10, lowest is 0
72 |
73 | if get_world_group().rank == get_world_group().world_size - 1:
74 | print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9} GB")
75 | get_runtime_state().destroy_distributed_env()
76 |
77 |
78 | if __name__ == "__main__":
79 | main()
80 |
--------------------------------------------------------------------------------
/xfuser/model_executor/layers/feedforward.py:
--------------------------------------------------------------------------------
1 | # https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py
2 |
3 | import torch
4 | from torch import nn
5 | from torch.cuda import empty_cache
6 | from diffusers.models.attention import FeedForward, GELU, GEGLU
7 |
8 | try:
9 | import torch_musa
10 | from torch_musa.core.memory import empty_cache
11 | except ModuleNotFoundError:
12 | pass
13 |
14 | import xfuser.envs as envs
15 | if envs._is_npu():
16 | from torch.npu import empty_cache
17 |
18 | from xfuser.core.distributed.parallel_state import (
19 | get_tensor_model_parallel_world_size,
20 | get_tensor_model_parallel_rank,
21 | get_tp_group,
22 | )
23 | from xfuser.model_executor.layers.base_layer import xFuserLayerBaseWrapper
24 | from xfuser.model_executor.layers.register import xFuserLayerWrappersRegister
25 |
26 | @xFuserLayerWrappersRegister.register(FeedForward)
27 | class xFuserFeedForwardWrapper(xFuserLayerBaseWrapper):
28 | def __init__(self, feedforward: FeedForward):
29 | super(xFuserFeedForwardWrapper, self).__init__(module=feedforward)
30 |
31 | tp_degree = get_tensor_model_parallel_world_size()
32 | tp_rank = get_tensor_model_parallel_rank()
33 |
34 | if isinstance(self.module.net[0], GELU):
35 | self.module.net[0].proj.weight.data = self.module.net[
36 | 0
37 | ].proj.weight.data.chunk(tp_degree, dim=0)[tp_rank]
38 | if self.module.net[0].proj.bias is not None:
39 | self.module.net[0].proj.bias.data = self.module.net[
40 | 0
41 | ].proj.bias.data.chunk(tp_degree, dim=0)[tp_rank]
42 | elif isinstance(self.module.net[0], GEGLU):
43 | weight_buff = self.module.net[0].proj.weight.data.chunk(2, dim=0)
44 | a = weight_buff[0].chunk(tp_degree, dim=0)[tp_rank]
45 | b = weight_buff[1].chunk(tp_degree, dim=0)[tp_rank]
46 | c = torch.cat([a, b], dim=0)
47 |
48 | self.module.net[0].proj.weight.data = c
49 |
50 | bias_buff = self.module.net[0].proj.bias.data.chunk(2, dim=0)
51 | a = bias_buff[0].chunk(tp_degree, dim=0)[tp_rank]
52 | b = bias_buff[1].chunk(tp_degree, dim=0)[tp_rank]
53 | c = torch.cat([a, b], dim=0)
54 | self.module.net[0].proj.bias.data = c
55 |
56 | else:
57 | raise TypeError(
58 |                 f"activation_fn {type(self.module.net[0])} not supported"
59 | )
60 |
61 | self.module.net[2].weight.data = self.module.net[2].weight.chunk(
62 | tp_degree, dim=1
63 | )[tp_rank]
64 |
65 | self.has_output_bias = False
66 | if self.module.net[2].bias is not None:
67 | self.register_parameter(
68 | "output_bias", nn.Parameter(self.module.net[2].bias.data.clone())
69 | )
70 | self.module.net[2].bias = None
71 | self.has_output_bias = True
72 |
73 | empty_cache()
74 |
75 | def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
76 | hidden_states = self.module(hidden_states, *args, **kwargs)
77 | get_tp_group().all_reduce(hidden_states)
78 | if self.has_output_bias:
79 | hidden_states += self.output_bias
80 | return hidden_states
81 |
--------------------------------------------------------------------------------
/docs/methods/hybrid.md:
--------------------------------------------------------------------------------
1 |
2 | ## Hybrid Parallelism
3 | [Chinese Version](./hybrid_zh.md)
4 |
5 | The design goal of xDiT is to scale the DiT inference process to ultra-large scales, such as multiple machines and multiple GPUs interconnected by heterogeneous networks, e.g., Ethernet and PCIe. Individual parallel methods, such as PipeFusion or Sequence Parallelism (SP), struggle to scale to this setting on their own, making the combination of different parallel methods necessary.
6 |
7 | xDiT supports four parallel methods: PipeFusion, Sequence, Data, and CFG Parallel. Among these, Data and CFG Parallel are relatively simple forms of inter-image parallelism, while PipeFusion and SP are more complex because they parallelize across different patches within a single image. The ability to combine these latter two methods is one of xDiT's innovations.
8 |
9 | PipeFusion leverages the characteristic of Input Temporal Redundancy, using Stale KV for Attention computation, which makes it difficult to combine PipeFusion with other parallel strategies as easily as in large language models (LLMs). Specifically, standard sequence parallel interfaces, such as RingAttention, Ulysses, or USP, cannot meet the requirements for mixing SP with PipeFusion.
10 |
11 | We elaborate on this issue with the following illustration, which shows a mixed parallel method with pipe_degree=4 and sp_degree=2. Setting `num_pipeline_patch`=4, the image is divided into M=`num_pipeline_patch*sp_degree`=8 patches, labeled P0~P7.
12 |
13 |
14 |
15 |
16 |
17 | In the implementation of standard SP Attention, the inputs Q, K, V and the output O are all split along the sequence dimension, with a consistent splitting pattern.
18 | Within an SP process group, the input patches held by different ranks do not overlap, so the positions of the fresh KV updates computed in each micro step also do not overlap across ranks.
19 | As shown in the following figure, in the KV Buffer of standard SP, the yellow part represents the fresh KV owned by SP rank=0, and the green part represents the fresh KV owned by SP rank=1; the two sets are disjoint.
20 | Within this diffusion step, device=0 cannot obtain the fresh KV of P1,3,5,7 for computation, yet PipeFusion requires all KV from the previous diffusion step to be available in the next diffusion step.
21 | Standard SP therefore keeps only 1/sp_degree of the KV buffer fresh on each device, so it cannot produce correct results for hybrid parallel inference; the sketch after the figure below makes this concrete.
22 |
23 |
24 |
25 |
26 |
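
A minimal, illustrative sketch (not repository code) of the freshness bookkeeping under standard SP. It assumes sp_degree=2 and num_pipeline_patch=4 with the interleaved patch assignment shown in the illustration above, and tracks only a per-patch freshness flag instead of real KV tensors:

```python
# Freshness bookkeeping under standard SP: each rank refreshes only its own shard.
num_pipeline_patch, sp_degree = 4, 2              # assumed values, matching the illustration
M = num_pipeline_patch * sp_degree                # 8 patches, P0..P7

fresh = [[False] * M for _ in range(sp_degree)]   # per-rank KV buffer freshness flags

for micro_step in range(num_pipeline_patch):      # one diffusion step = 4 micro steps
    for rank in range(sp_degree):
        patch = micro_step * sp_degree + rank     # the shard this rank computes
        fresh[rank][patch] = True                 # only its own positions become fresh

for rank in range(sp_degree):
    print(f"rank {rank} fresh patches:", [i for i, f in enumerate(fresh[rank]) if f])
# rank 0 fresh patches: [0, 2, 4, 6]   -> only 1/sp_degree of the KV buffer is fresh
# rank 1 fresh patches: [1, 3, 5, 7]
```
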
27 | xDiT customizes its sequence parallel implementation to meet this hybrid parallel requirement. It uses `xFuserLongContextAttention` to store the intermediate results of SP in the KV Buffer. The effect is illustrated in the figure below: after each micro-step of SP execution, the fresh KV produced by the different ranks within the SP group is replicated across the group. This way, after one diffusion step, the KV Buffer on every device in the SP group is fully up to date and ready for use in the next diffusion step; the sketch after the figure shows the resulting update pattern.
28 |
29 |
30 |
31 |
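
The same freshness bookkeeping, again a conceptual sketch rather than repository code (assuming sp_degree=2 and num_pipeline_patch=4, with a simple loop standing in for the all-gather of fresh KV that `xFuserLongContextAttention` performs on real KV tensors), shows why the customized SP leaves every device with a fully fresh KV Buffer after one diffusion step:

```python
# Freshness bookkeeping with xDiT-style SP: fresh KV is replicated inside the SP group
# after every micro step (emulated by marking every rank's new patch on all ranks).
num_pipeline_patch, sp_degree = 4, 2              # assumed values, matching the illustration
M = num_pipeline_patch * sp_degree                # 8 patches, P0..P7

fresh = [[False] * M for _ in range(sp_degree)]   # per-rank KV buffer freshness flags

for micro_step in range(num_pipeline_patch):
    produced = [micro_step * sp_degree + rank for rank in range(sp_degree)]
    for rank in range(sp_degree):                 # emulated all-gather within the SP group
        for patch in produced:
            fresh[rank][patch] = True

assert all(all(buf) for buf in fresh)             # every rank now holds fresh KV for P0..P7
print("all ranks hold fresh KV for P0..P7 after one diffusion step")
```
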
--------------------------------------------------------------------------------
/examples/sana_sprint_example.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import torch
4 | import torch.distributed
5 | from xfuser import xFuserSanaSprintPipeline, xFuserArgs
6 | from xfuser.config import FlexibleArgumentParser
7 | from xfuser.core.distributed import (
8 | get_world_group,
9 | is_dp_last_group,
10 | get_data_parallel_rank,
11 | get_runtime_state,
12 | )
13 | from xfuser.core.distributed.parallel_state import get_data_parallel_world_size
14 |
15 |
16 | def main():
17 | parser = FlexibleArgumentParser(description="xFuser Arguments")
18 | args = xFuserArgs.add_cli_args(parser).parse_args()
19 | engine_args = xFuserArgs.from_cli_args(args)
20 | engine_config, input_config = engine_args.create_config()
21 | local_rank = get_world_group().local_rank
22 |
23 | pipe = xFuserSanaSprintPipeline.from_pretrained(
24 | pretrained_model_name_or_path=engine_config.model_config.model,
25 | engine_config=engine_config,
26 | torch_dtype=torch.bfloat16,
27 | ).to(f"cuda:{local_rank}")
28 | pipe.vae.to(torch.bfloat16)
29 | pipe.text_encoder.to(torch.bfloat16)
30 |
31 | parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
32 |
33 | pipe.prepare_run(input_config)
34 |
35 | torch.cuda.reset_peak_memory_stats()
36 | start_time = time.time()
37 | output = pipe(
38 | height=input_config.height,
39 | width=input_config.width,
40 | prompt=input_config.prompt,
41 | num_inference_steps=input_config.num_inference_steps,
42 | output_type=input_config.output_type,
43 | use_resolution_binning=input_config.use_resolution_binning,
44 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
45 | guidance_scale=4.5
46 | )
47 | end_time = time.time()
48 | elapsed_time = end_time - start_time
49 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
50 |
51 | parallel_info = (
52 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
53 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
54 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
55 | )
56 | if input_config.output_type == "pil":
57 | dp_group_index = get_data_parallel_rank()
58 | num_dp_groups = get_data_parallel_world_size()
59 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
60 | if pipe.is_dp_last_group():
61 | if not os.path.exists("results"):
62 | os.mkdir("results")
63 | for i, image in enumerate(output.images):
64 | image_rank = dp_group_index * dp_batch_size + i
65 | image.save(
66 | f"./results/sana_sprint_1.6B_result_{parallel_info}_{image_rank}.png"
67 | )
68 | print(
69 | f"image {i} saved to ./results/sana_sprint_1.6B_result_{parallel_info}_{image_rank}.png"
70 | )
71 |
72 | if get_world_group().rank == get_world_group().world_size - 1:
73 | print(
74 | f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, peak memory: {peak_memory/1e9:.2f} GB"
75 | )
76 |
77 | get_runtime_state().destroy_distributed_env()
78 |
79 |
80 | if __name__ == "__main__":
81 | main()
82 |
--------------------------------------------------------------------------------
/examples/pixartsigma_example.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import torch
4 | import torch.distributed
5 | from transformers import T5EncoderModel
6 | from xfuser import xFuserPixArtSigmaPipeline, xFuserArgs
7 | from xfuser.config import FlexibleArgumentParser
8 | from xfuser.core.distributed import (
9 | get_world_group,
10 | is_dp_last_group,
11 | get_data_parallel_world_size,
12 | get_runtime_state,
13 | get_data_parallel_rank,
14 | )
15 |
16 |
17 | def main():
18 | parser = FlexibleArgumentParser(description="xFuser Arguments")
19 | args = xFuserArgs.add_cli_args(parser).parse_args()
20 | engine_args = xFuserArgs.from_cli_args(args)
21 | engine_config, input_config = engine_args.create_config()
22 | local_rank = get_world_group().local_rank
23 | text_encoder = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder", torch_dtype=torch.float16)
24 | if args.use_fp8_t5_encoder:
25 | from optimum.quanto import freeze, qfloat8, quantize
26 | print(f"rank {local_rank} quantizing text encoder")
27 | quantize(text_encoder, weights=qfloat8)
28 | freeze(text_encoder)
29 |
30 | pipe = xFuserPixArtSigmaPipeline.from_pretrained(
31 | pretrained_model_name_or_path=engine_config.model_config.model,
32 | engine_config=engine_config,
33 | torch_dtype=torch.float16,
34 | text_encoder=text_encoder,
35 | ).to(f"cuda:{local_rank}")
36 | pipe.prepare_run(input_config)
37 |
38 | torch.cuda.reset_peak_memory_stats()
39 | start_time = time.time()
40 | output = pipe(
41 | height=input_config.height,
42 | width=input_config.width,
43 | prompt=input_config.prompt,
44 | num_inference_steps=input_config.num_inference_steps,
45 | output_type=input_config.output_type,
46 | use_resolution_binning=input_config.use_resolution_binning,
47 | guidance_scale=input_config.guidance_scale,
48 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
49 | clean_caption=False,
50 | )
51 | end_time = time.time()
52 | elapsed_time = end_time - start_time
53 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
54 |
55 | parallel_info = (
56 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
57 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
58 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
59 | )
60 | if input_config.output_type == "pil":
61 | dp_group_index = get_data_parallel_rank()
62 | num_dp_groups = get_data_parallel_world_size()
63 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
64 | if pipe.is_dp_last_group():
65 | if not os.path.exists("results"):
66 | os.mkdir("results")
67 | for i, image in enumerate(output.images):
68 | image_rank = dp_group_index * dp_batch_size + i
69 | image.save(
70 | f"./results/pixart_sigma_result_{parallel_info}_{image_rank}.png"
71 | )
72 | print(
73 | f"image {i} saved to ./results/pixart_sigma_result_{parallel_info}_{image_rank}.png"
74 | )
75 |
76 | if get_world_group().rank == get_world_group().world_size - 1:
77 | print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9} GB")
78 | get_runtime_state().destroy_distributed_env()
79 |
80 |
81 | if __name__ == "__main__":
82 | main()
83 |
--------------------------------------------------------------------------------
/examples/pixartalpha_example.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import torch
4 | import torch.distributed
5 | from transformers import T5EncoderModel
6 | from xfuser import xFuserPixArtAlphaPipeline, xFuserArgs
7 | from xfuser.config import FlexibleArgumentParser
8 | from xfuser.core.distributed import (
9 | get_world_group,
10 | is_dp_last_group,
11 | get_data_parallel_world_size,
12 | get_runtime_state,
13 | get_data_parallel_rank,
14 | )
15 |
16 |
17 | def main():
18 | parser = FlexibleArgumentParser(description="xFuser Arguments")
19 | args = xFuserArgs.add_cli_args(parser).parse_args()
20 | engine_args = xFuserArgs.from_cli_args(args)
21 | engine_config, input_config = engine_args.create_config()
22 | local_rank = get_world_group().local_rank
23 | text_encoder = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder", torch_dtype=torch.float16)
24 | if args.use_fp8_t5_encoder:
25 | from optimum.quanto import freeze, qfloat8, quantize
26 | print(f"rank {local_rank} quantizing text encoder")
27 | quantize(text_encoder, weights=qfloat8)
28 | freeze(text_encoder)
29 |
30 | pipe = xFuserPixArtAlphaPipeline.from_pretrained(
31 | pretrained_model_name_or_path=engine_config.model_config.model,
32 | engine_config=engine_config,
33 | torch_dtype=torch.float16,
34 | text_encoder=text_encoder,
35 | ).to(f"cuda:{local_rank}")
36 | model_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
37 | pipe.prepare_run(input_config)
38 |
39 | torch.cuda.reset_peak_memory_stats()
40 | start_time = time.time()
41 | output = pipe(
42 | height=input_config.height,
43 | width=input_config.width,
44 | prompt=input_config.prompt,
45 | num_inference_steps=input_config.num_inference_steps,
46 | output_type=input_config.output_type,
47 | use_resolution_binning=input_config.use_resolution_binning,
48 | guidance_scale=input_config.guidance_scale,
49 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
50 | )
51 | end_time = time.time()
52 | elapsed_time = end_time - start_time
53 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
54 |
55 | parallel_info = (
56 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
57 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
58 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}_tc_{engine_args.use_torch_compile}"
59 | )
60 | if input_config.output_type == "pil":
61 | dp_group_index = get_data_parallel_rank()
62 | num_dp_groups = get_data_parallel_world_size()
63 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
64 | if pipe.is_dp_last_group():
65 | if not os.path.exists("results"):
66 | os.mkdir("results")
67 | for i, image in enumerate(output.images):
68 | image_rank = dp_group_index * dp_batch_size + i
69 | img_file = (
70 | f"./results/pixart_alpha_result_{parallel_info}_{image_rank}.png"
71 | )
72 | image.save(img_file)
73 | print(img_file)
74 |
75 | if get_world_group().rank == get_world_group().world_size - 1:
76 | print(
77 | f"epoch time: {elapsed_time:.2f} sec, model memory: {model_memory/1e9:.2f} GB, overall memory: {peak_memory/1e9:.2f} GB"
78 | )
79 | get_runtime_state().destroy_distributed_env()
80 |
81 |
82 | if __name__ == "__main__":
83 | main()
84 |
--------------------------------------------------------------------------------
/examples/sdxl_example.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 | import torch
4 | import torch.distributed
5 | from xfuser import xFuserStableDiffusionXLPipeline, xFuserArgs
6 | from xfuser.config import FlexibleArgumentParser
7 | from xfuser.core.distributed import (
8 | get_world_group,
9 | get_data_parallel_rank,
10 | get_data_parallel_world_size,
11 | get_runtime_state,
12 | )
13 | from diffusers import StableDiffusionXLPipeline
14 |
15 | def main():
16 | # Initialize argument parser
17 | parser = FlexibleArgumentParser(description="xFuser SDXL Arguments")
18 | args = xFuserArgs.add_cli_args(parser).parse_args()
19 | engine_args = xFuserArgs.from_cli_args(args)
20 | engine_config, input_config = engine_args.create_config()
21 |
22 |
23 | # Set runtime configuration
24 | engine_config.runtime_config.dtype = torch.bfloat16
25 | local_rank = get_world_group().local_rank
26 |
27 | # Initialize pipeline
28 | pipe = xFuserStableDiffusionXLPipeline.from_pretrained(
29 | pretrained_model_name_or_path=engine_config.model_config.model,
30 | engine_config=engine_config,
31 | torch_dtype=torch.float16,
32 | )
33 |
34 | # Handle device placement
35 | if args.enable_sequential_cpu_offload:
36 | pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
37 | logging.info(f"rank {local_rank} sequential CPU offload enabled")
38 | else:
39 | pipe = pipe.to(f"cuda:{local_rank}")
40 |
41 | # Record initial memory usage
42 | parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
43 |
44 | # Prepare for inference
45 | pipe.prepare_run(input_config, steps=input_config.num_inference_steps)
46 |
47 | # Run inference
48 | torch.cuda.reset_peak_memory_stats()
49 | start_time = time.time()
50 | output = pipe(
51 | height=input_config.height,
52 | width=input_config.width,
53 | prompt=input_config.prompt,
54 | num_inference_steps=input_config.num_inference_steps,
55 | output_type=input_config.output_type,
56 | guidance_scale=input_config.guidance_scale,
57 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
58 | )
59 | end_time = time.time()
60 | elapsed_time = end_time - start_time
61 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
62 |
63 | # Generate parallel configuration info string
64 | parallel_info = (
65 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
66 | f"tp{engine_args.tensor_parallel_degree}_"
67 | f"pp{engine_args.pipefusion_parallel_degree}"
68 | )
69 |
70 | # Save generated images
71 | if input_config.output_type == "pil":
72 | dp_group_index = get_data_parallel_rank()
73 | num_dp_groups = get_data_parallel_world_size()
74 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
75 | if pipe.is_dp_last_group():
76 | for i, image in enumerate(output.images):
77 | image_rank = dp_group_index * dp_batch_size + i
78 | image_name = f"sdxl_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png"
79 | image.save(f"./results/{image_name}")
80 | print(f"image {i} saved to ./results/{image_name}")
81 |
82 | # Print performance metrics
83 | if get_world_group().rank == get_world_group().world_size - 1:
84 | print(
85 | f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9:.2f} GB"
86 | )
87 |
88 | # Cleanup
89 | get_runtime_state().destroy_distributed_env()
90 |
91 | if __name__ == "__main__":
92 | main()
--------------------------------------------------------------------------------
/examples/sd3_example.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import torch
4 | import torch.distributed
5 | from transformers import T5EncoderModel
6 | from xfuser import xFuserStableDiffusion3Pipeline, xFuserArgs
7 | from xfuser.config import FlexibleArgumentParser
8 | from xfuser.core.distributed import (
9 | get_world_group,
10 | is_dp_last_group,
11 | get_data_parallel_rank,
12 | get_runtime_state,
13 | )
14 | from xfuser.core.distributed.parallel_state import get_data_parallel_world_size
15 |
16 |
17 | def main():
18 | parser = FlexibleArgumentParser(description="xFuser Arguments")
19 | args = xFuserArgs.add_cli_args(parser).parse_args()
20 | engine_args = xFuserArgs.from_cli_args(args)
21 | engine_config, input_config = engine_args.create_config()
22 | local_rank = get_world_group().local_rank
23 | text_encoder_3 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_3", torch_dtype=torch.float16)
24 | if args.use_fp8_t5_encoder:
25 | from optimum.quanto import freeze, qfloat8, quantize
26 |         print(f"rank {local_rank} quantizing text encoder 3")
27 | quantize(text_encoder_3, weights=qfloat8)
28 | freeze(text_encoder_3)
29 |
30 | pipe = xFuserStableDiffusion3Pipeline.from_pretrained(
31 | pretrained_model_name_or_path=engine_config.model_config.model,
32 | engine_config=engine_config,
33 | torch_dtype=torch.float16,
34 | text_encoder_3=text_encoder_3,
35 | ).to(f"cuda:{local_rank}")
36 |
37 | parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
38 |
39 | pipe.prepare_run(input_config)
40 |
41 | torch.cuda.reset_peak_memory_stats()
42 | start_time = time.time()
43 | output = pipe(
44 | height=input_config.height,
45 | width=input_config.width,
46 | prompt=input_config.prompt,
47 | num_inference_steps=input_config.num_inference_steps,
48 | output_type=input_config.output_type,
49 | guidance_scale=input_config.guidance_scale,
50 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
51 | )
52 | end_time = time.time()
53 | elapsed_time = end_time - start_time
54 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
55 |
56 | parallel_info = (
57 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
58 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
59 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
60 | )
61 | if input_config.output_type == "pil":
62 | dp_group_index = get_data_parallel_rank()
63 | num_dp_groups = get_data_parallel_world_size()
64 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
65 | if pipe.is_dp_last_group():
66 | if not os.path.exists("results"):
67 | os.mkdir("results")
68 | for i, image in enumerate(output.images):
69 | image_rank = dp_group_index * dp_batch_size + i
70 | image.save(
71 | f"./results/stable_diffusion_3_result_{parallel_info}_{image_rank}.png"
72 | )
73 | print(
74 | f"image {i} saved to ./results/stable_diffusion_3_result_{parallel_info}_{image_rank}.png"
75 | )
76 |
77 | if get_world_group().rank == get_world_group().world_size - 1:
78 | print(
79 | f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, peak memory: {peak_memory/1e9:.2f} GB"
80 | )
81 |
82 | get_runtime_state().destroy_distributed_env()
83 |
84 |
85 | if __name__ == "__main__":
86 | main()
87 |
--------------------------------------------------------------------------------
/examples/cogvideox_example.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 | import torch
4 | import torch.distributed
5 | from diffusers import AutoencoderKLTemporalDecoder
6 | from xfuser import xFuserCogVideoXPipeline, xFuserArgs
7 | from xfuser.config import FlexibleArgumentParser
8 | from xfuser.core.distributed import (
9 | get_world_group,
10 | get_data_parallel_rank,
11 | get_data_parallel_world_size,
12 | get_runtime_state,
13 | is_dp_last_group,
14 | )
15 | from diffusers.utils import export_to_video
16 |
17 |
18 | def main():
19 | parser = FlexibleArgumentParser(description="xFuser Arguments")
20 | args = xFuserArgs.add_cli_args(parser).parse_args()
21 | engine_args = xFuserArgs.from_cli_args(args)
22 |
23 | engine_config, input_config = engine_args.create_config()
24 | local_rank = get_world_group().local_rank
25 |
26 | assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion."
27 | assert engine_args.use_parallel_vae is False, "parallel VAE not implemented for CogVideo"
28 |
29 | pipe = xFuserCogVideoXPipeline.from_pretrained(
30 | pretrained_model_name_or_path=engine_config.model_config.model,
31 | engine_config=engine_config,
32 | torch_dtype=torch.bfloat16,
33 | )
34 | if args.enable_sequential_cpu_offload:
35 | pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
36 | logging.info(f"rank {local_rank} sequential CPU offload enabled")
37 | elif args.enable_model_cpu_offload:
38 | pipe.enable_model_cpu_offload(gpu_id=local_rank)
39 | logging.info(f"rank {local_rank} model CPU offload enabled")
40 | else:
41 | device = torch.device(f"cuda:{local_rank}")
42 | pipe = pipe.to(device)
43 |
44 | if args.enable_tiling:
45 | pipe.vae.enable_tiling()
46 |
47 | if args.enable_slicing:
48 | pipe.vae.enable_slicing()
49 |
50 | # warmup
51 | output = pipe(
52 | height=input_config.height,
53 | width=input_config.width,
54 | num_frames=input_config.num_frames,
55 | prompt=input_config.prompt,
56 | num_inference_steps=1,
57 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
58 | ).frames[0]
59 |
60 | torch.cuda.reset_peak_memory_stats()
61 | start_time = time.time()
62 |
63 | output = pipe(
64 | height=input_config.height,
65 | width=input_config.width,
66 | num_frames=input_config.num_frames,
67 | prompt=input_config.prompt,
68 | num_inference_steps=input_config.num_inference_steps,
69 | guidance_scale=input_config.guidance_scale,
70 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
71 | ).frames[0]
72 |
73 | end_time = time.time()
74 | elapsed_time = end_time - start_time
75 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
76 |
77 | parallel_info = (
78 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
79 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
80 | f"tp{engine_args.tensor_parallel_degree}_"
81 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
82 | )
83 | if is_dp_last_group():
84 | resolution = f"{input_config.width}x{input_config.height}"
85 | output_filename = f"results/cogvideox_{parallel_info}_{resolution}.mp4"
86 | export_to_video(output, output_filename, fps=8)
87 | print(f"output saved to {output_filename}")
88 |
89 | if get_world_group().rank == get_world_group().world_size - 1:
90 | print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9} GB")
91 | get_runtime_state().destroy_distributed_env()
92 |
93 |
94 | if __name__ == "__main__":
95 | main()
96 |
--------------------------------------------------------------------------------
/examples/hunyuandit_example.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import torch
4 | import torch.distributed
5 | from transformers import T5EncoderModel
6 | from xfuser import xFuserHunyuanDiTPipeline, xFuserArgs
7 | from xfuser.config import FlexibleArgumentParser
8 | from xfuser.core.distributed import (
9 | get_world_group,
10 | is_dp_last_group,
11 | get_data_parallel_world_size,
12 | get_runtime_state,
13 | get_data_parallel_rank,
14 | )
15 |
16 |
17 | def main():
18 | parser = FlexibleArgumentParser(description="xFuser Arguments")
19 | args = xFuserArgs.add_cli_args(parser).parse_args()
20 | engine_args = xFuserArgs.from_cli_args(args)
21 | engine_config, input_config = engine_args.create_config()
22 | local_rank = get_world_group().local_rank
23 | text_encoder_2 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_2", torch_dtype=torch.bfloat16)
24 | if args.use_fp8_t5_encoder:
25 | from optimum.quanto import freeze, qfloat8, quantize
26 | print(f"rank {local_rank} quantizing text encoder 2")
27 | quantize(text_encoder_2, weights=qfloat8)
28 | freeze(text_encoder_2)
29 |
30 | pipe = xFuserHunyuanDiTPipeline.from_pretrained(
31 | pretrained_model_name_or_path=engine_config.model_config.model,
32 | engine_config=engine_config,
33 | torch_dtype=torch.float16,
34 | text_encoder_2=text_encoder_2,
35 | ).to(f"cuda:{local_rank}")
36 |
37 | parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
38 |
39 | pipe.prepare_run(input_config)
40 |
41 | torch.cuda.reset_peak_memory_stats()
42 | start_time = time.time()
43 | output = pipe(
44 | height=input_config.height,
45 | width=input_config.width,
46 | prompt=input_config.prompt,
47 | num_inference_steps=input_config.num_inference_steps,
48 | output_type=input_config.output_type,
49 | use_resolution_binning=input_config.use_resolution_binning,
50 | guidance_scale=input_config.guidance_scale,
51 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
52 | )
53 | end_time = time.time()
54 | elapsed_time = end_time - start_time
55 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
56 |
57 | parallel_info = (
58 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
59 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
60 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
61 | )
62 | if input_config.output_type == "pil":
63 | dp_group_index = get_data_parallel_rank()
64 | num_dp_groups = get_data_parallel_world_size()
65 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
66 | if pipe.is_dp_last_group():
67 | if not os.path.exists("results"):
68 | os.mkdir("results")
69 | for i, image in enumerate(output.images):
70 | image_rank = dp_group_index * dp_batch_size + i
71 | image.save(
72 | f"./results/hunyuandit_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png"
73 | )
74 | print(
75 | f"image {i} saved to ./results/hunyuandit_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png"
76 | )
77 |
78 | if get_world_group().rank == get_world_group().world_size - 1:
79 | print(
80 | f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9:.2f} GB"
81 | )
82 | get_runtime_state().destroy_distributed_env()
83 |
84 |
85 | if __name__ == "__main__":
86 | main()
87 |
--------------------------------------------------------------------------------
/docs/developer/adding_models/adding_model_cfg.py:
--------------------------------------------------------------------------------
1 | # Example for parallelize new models with USP
2 | # run with
3 | # torchrun --nproc_per_node=2 \
4 | #     adding_model_cfg.py
5 | import sys
6 | import functools
7 | from typing import List, Optional, Tuple, Union
8 |
9 | import time
10 | import torch
11 |
12 | from diffusers import DiffusionPipeline, CogVideoXPipeline
13 |
14 | import torch.distributed as dist
15 | from xfuser.core.distributed import (
16 | init_distributed_environment,
17 | initialize_model_parallel,
18 | get_world_group,
19 | get_classifier_free_guidance_world_size,
20 | get_classifier_free_guidance_rank,
21 | get_cfg_group,
22 | )
23 |
24 | from diffusers.utils import export_to_video
25 |
26 | def parallelize_transformer(pipe: DiffusionPipeline):
27 | transformer = pipe.transformer
28 | original_forward = transformer.forward
29 |
30 | @functools.wraps(transformer.__class__.forward)
31 | def new_forward(
32 | self,
33 | hidden_states: torch.Tensor,
34 | encoder_hidden_states: Optional[torch.Tensor] = None,
35 | timestep: torch.LongTensor = None,
36 | timestep_cond: Optional[torch.Tensor] = None,
37 | ofs: Optional[Union[int, float, torch.LongTensor]] = None,
38 | image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
39 | **kwargs,
40 | ):
41 | timestep = torch.chunk(timestep, get_classifier_free_guidance_world_size(),dim=0)[get_classifier_free_guidance_rank()]
42 | hidden_states = torch.chunk(hidden_states, get_classifier_free_guidance_world_size(),dim=0)[get_classifier_free_guidance_rank()]
43 | encoder_hidden_states = torch.chunk(encoder_hidden_states, get_classifier_free_guidance_world_size(),dim=0)[get_classifier_free_guidance_rank()]
44 |
45 | output = original_forward(
46 | hidden_states,
47 | encoder_hidden_states,
48 | timestep=timestep,
49 | timestep_cond=timestep_cond,
50 | ofs=ofs,
51 | image_rotary_emb=image_rotary_emb,
52 | **kwargs,
53 | )
54 |
55 | return_dict = not isinstance(output, tuple)
56 | sample = output[0]
57 | sample = get_cfg_group().all_gather(sample, dim=0)
58 | if return_dict:
59 | return output.__class__(sample, *output[1:])
60 | return (sample, *output[1:])
61 |
62 | new_forward = new_forward.__get__(transformer)
63 | transformer.forward = new_forward
64 |
65 | if __name__ == "__main__":
66 | dist.init_process_group("nccl")
67 | init_distributed_environment(
68 | rank=dist.get_rank(),
69 | world_size=dist.get_world_size()
70 | )
71 | initialize_model_parallel(
72 | classifier_free_guidance_degree=2,
73 | )
74 | pipe = CogVideoXPipeline.from_pretrained(
75 | pretrained_model_name_or_path=sys.argv[1],
76 | torch_dtype=torch.bfloat16,
77 | )
78 | local_rank = get_world_group().local_rank
79 | device = torch.device(f"cuda:{local_rank}")
80 | pipe = pipe.to(device)
81 |
82 | pipe.vae.enable_tiling()
83 |
84 | parallelize_transformer(pipe)
85 |
86 | torch.cuda.reset_peak_memory_stats()
87 | start_time = time.time()
88 |
89 | output = pipe(
90 | num_frames=9,
91 | prompt="A little girl is riding a bicycle at high speed. Focused, detailed, realistic.",
92 | num_inference_steps=20,
93 | generator=torch.Generator(device="cuda").manual_seed(42),
94 | ).frames[0]
95 |
96 | end_time = time.time()
97 | elapsed_time = end_time - start_time
98 |
99 | if local_rank == 0:
100 | export_to_video(output, "output.mp4", fps=8)
101 | print(f"epoch time: {elapsed_time:.2f} sec")
102 |
103 | dist.destroy_process_group()
104 |
--------------------------------------------------------------------------------
/xfuser/model_executor/models/customized/step_video_t2v/rope.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from xfuser.core.distributed.parallel_state import get_sequence_parallel_world_size, get_sequence_parallel_rank
3 |
4 | class RoPE1D:
5 | def __init__(self, freq=1e4, F0=1.0, scaling_factor=1.0):
6 | self.base = freq
7 | self.F0 = F0
8 | self.scaling_factor = scaling_factor
9 | self.cache = {}
10 |
11 | def get_cos_sin(self, D, seq_len, device, dtype):
12 | if (D, seq_len, device, dtype) not in self.cache:
13 | inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
14 | t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
15 | freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
16 | freqs = torch.cat((freqs, freqs), dim=-1)
17 | cos = freqs.cos() # (Seq, Dim)
18 | sin = freqs.sin()
19 | self.cache[D, seq_len, device, dtype] = (cos, sin)
20 | return self.cache[D, seq_len, device, dtype]
21 |
22 | @staticmethod
23 | def rotate_half(x):
24 | x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
25 | return torch.cat((-x2, x1), dim=-1)
26 |
27 | def apply_rope1d(self, tokens, pos1d, cos, sin):
28 | assert pos1d.ndim == 2
29 | cos = torch.nn.functional.embedding(pos1d, cos)[:, :, None, :]
30 | sin = torch.nn.functional.embedding(pos1d, sin)[:, :, None, :]
31 | return (tokens * cos) + (self.rotate_half(tokens) * sin)
32 |
33 | def __call__(self, tokens, positions):
34 | """
35 | input:
36 | * tokens: batch_size x ntokens x nheads x dim
37 | * positions: batch_size x ntokens (t position of each token)
38 | output:
39 |             * tokens after applying RoPE1D (batch_size x ntokens x nheads x dim)
40 | """
41 | D = tokens.size(3)
42 | assert positions.ndim == 2 # Batch, Seq
43 | cos, sin = self.get_cos_sin(D, int(positions.max()) + 1, tokens.device, tokens.dtype)
44 | tokens = self.apply_rope1d(tokens, positions, cos, sin)
45 | return tokens
46 |
47 |
48 |
49 | class RoPE3D(RoPE1D):
50 | def __init__(self, freq=1e4, F0=1.0, scaling_factor=1.0):
51 | super(RoPE3D, self).__init__(freq, F0, scaling_factor)
52 | self.position_cache = {}
53 |
54 | def get_mesh_3d(self, rope_positions, bsz):
55 | f, h, w = rope_positions
56 |
57 | if f"{f}-{h}-{w}" not in self.position_cache:
58 | x = torch.arange(f, device='cpu')
59 | y = torch.arange(h, device='cpu')
60 | z = torch.arange(w, device='cpu')
61 | self.position_cache[f"{f}-{h}-{w}"] = torch.cartesian_prod(x, y, z).view(1, f*h*w, 3).expand(bsz, -1, 3)
62 | return self.position_cache[f"{f}-{h}-{w}"]
63 |
64 | def __call__(self, tokens, rope_positions, ch_split, parallel=False):
65 | """
66 | input:
67 | * tokens: batch_size x ntokens x nheads x dim
68 | * rope_positions: list of (f, h, w)
69 | output:
70 |             * tokens after applying RoPE3D (batch_size x ntokens x nheads x dim)
71 |         """
72 |         assert sum(ch_split) == tokens.size(-1)
73 |
74 | mesh_grid = self.get_mesh_3d(rope_positions, bsz=tokens.shape[0])
75 | out = []
76 | for i, (D, x) in enumerate(zip(ch_split, torch.split(tokens, ch_split, dim=-1))):
77 | cos, sin = self.get_cos_sin(D, int(mesh_grid.max()) + 1, tokens.device, tokens.dtype)
78 |
79 | if parallel:
80 | mesh = torch.chunk(mesh_grid[:, :, i], get_sequence_parallel_world_size(),dim=1)[get_sequence_parallel_rank()].clone()
81 | else:
82 | mesh = mesh_grid[:, :, i].clone()
83 | x = self.apply_rope1d(x, mesh.to(tokens.device), cos, sin)
84 | out.append(x)
85 |
86 | tokens = torch.cat(out, dim=-1)
87 | return tokens
88 |
89 |
90 |
--------------------------------------------------------------------------------
/xfuser/logger.py:
--------------------------------------------------------------------------------
1 | # Adapted from
2 | # https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
3 | """Logging configuration."""
4 | import logging
5 | import sys
6 | import os
7 | from typing import Optional
8 |
9 | _FORMAT = "%(levelname)s %(asctime)s [%(filename)s:%(lineno)d] %(message)s"
10 | _DATE_FORMAT = "%m-%d %H:%M:%S"
11 |
12 | _LOG_LEVEL = os.environ.get("LOG_LEVEL", "debug")
13 | _LOG_LEVEL = getattr(logging, _LOG_LEVEL.upper(), 0)
14 | _LOG_DIR = os.environ.get("LOG_DIR", None)
15 |
16 |
17 | class NewLineFormatter(logging.Formatter):
18 | """Adds logging prefix to newlines to align multi-line messages."""
19 |
20 | def __init__(self, fmt, datefmt=None):
21 | logging.Formatter.__init__(self, fmt, datefmt)
22 |
23 | def format(self, record):
24 | msg = logging.Formatter.format(self, record)
25 | if record.message != "":
26 | parts = msg.split(record.message)
27 | msg = msg.replace("\n", "\r\n" + parts[0])
28 | return msg
29 |
30 |
31 | _root_logger = logging.getLogger("xfuser")
32 | _default_handler = None
33 | _default_file_handler = None
34 | _inference_log_file_handler = {}
35 |
36 |
37 | def _setup_logger():
38 | _root_logger.setLevel(_LOG_LEVEL)
39 | global _default_handler
40 | global _default_file_handler
41 | fmt = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT)
42 |
43 | if _default_handler is None:
44 | _default_handler = logging.StreamHandler(sys.stdout)
45 | _default_handler.flush = sys.stdout.flush # type: ignore
46 | _default_handler.setLevel(_LOG_LEVEL)
47 | _root_logger.addHandler(_default_handler)
48 |
49 | if _default_file_handler is None and _LOG_DIR is not None:
50 | if not os.path.exists(_LOG_DIR):
51 | try:
52 | os.makedirs(_LOG_DIR)
53 | except OSError as e:
54 |                     _root_logger.warning(f"Error creating directory {_LOG_DIR} : {e}")
55 | _default_file_handler = logging.FileHandler(_LOG_DIR + "/default.log")
56 | _default_file_handler.setLevel(_LOG_LEVEL)
57 | _default_file_handler.setFormatter(fmt)
58 | _root_logger.addHandler(_default_file_handler)
59 |
60 | _default_handler.setFormatter(fmt)
61 | # Setting this will avoid the message
62 | # being propagated to the parent logger.
63 | _root_logger.propagate = False
64 |
65 |
66 | # The logger is initialized when the module is imported.
67 | # This is thread-safe as the module is only imported once,
68 | # guaranteed by the Python GIL.
69 | _setup_logger()
70 |
71 |
72 | def init_logger(name: str):
73 | pid = os.getpid()
74 | # Use the same settings as above for root logger
75 | logger = logging.getLogger(name)
76 | logger.setLevel(_LOG_LEVEL)
77 | logger.addHandler(_default_handler)
78 | if _LOG_DIR is not None and pid is None:
79 | logger.addHandler(_default_file_handler)
80 | elif _LOG_DIR is not None:
81 | if _inference_log_file_handler.get(pid, None) is not None:
82 | logger.addHandler(_inference_log_file_handler[pid])
83 | else:
84 | if not os.path.exists(_LOG_DIR):
85 | try:
86 | os.makedirs(_LOG_DIR)
87 | except OSError as e:
88 |                     _root_logger.warning(f"Error creating directory {_LOG_DIR} : {e}")
89 | _inference_log_file_handler[pid] = logging.FileHandler(
90 | _LOG_DIR + f"/process.{pid}.log"
91 | )
92 | _inference_log_file_handler[pid].setLevel(_LOG_LEVEL)
93 | _inference_log_file_handler[pid].setFormatter(
94 | NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT)
95 | )
96 | _root_logger.addHandler(_inference_log_file_handler[pid])
97 | logger.addHandler(_inference_log_file_handler[pid])
98 | logger.propagate = False
99 | return logger
100 |
--------------------------------------------------------------------------------
/examples/zimage_example.py:
--------------------------------------------------------------------------------
1 | import time
2 | import torch
3 | from diffusers import ZImagePipeline
4 | from xfuser.config.diffusers import has_valid_diffusers_version, get_minimum_diffusers_version
5 |
6 | if not has_valid_diffusers_version("zimage"):
7 | minimum_diffusers_version = get_minimum_diffusers_version("zimage")
8 | raise ImportError(f"Please install diffusers>={minimum_diffusers_version} to use Z-Image models.")
9 |
10 | from xfuser.model_executor.models.transformers.transformer_z_image import xFuserZImageTransformer2DWrapper
11 | from diffusers import DiffusionPipeline
12 |
13 | from xfuser import xFuserArgs
14 | from xfuser.config import FlexibleArgumentParser
15 | from xfuser.core.distributed import (
16 | get_world_group,
17 | get_runtime_state,
18 | initialize_runtime_state,
19 | )
20 |
21 | def run_pipe(pipe: DiffusionPipeline, input_config):
22 | # Pipe implementation currently encodes the prompt in-place,
23 | # causing any subsequent calls to use the already encoded prompt as prompt,
24 | # causing cascading encodings unless we provide a new list each time.
25 | prompt = str(input_config.prompt)
26 |
27 | return pipe(
28 | height=input_config.height,
29 | width=input_config.width,
30 | prompt=prompt,
31 | num_inference_steps=9, # Recommended value
32 | guidance_scale=0.0, # Recommended value
33 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
34 | ).images[0]
35 |
36 | def main():
37 | parser = FlexibleArgumentParser(description="xFuser Arguments")
38 | args = xFuserArgs.add_cli_args(parser).parse_args()
39 | engine_args = xFuserArgs.from_cli_args(args)
40 | engine_config, input_config = engine_args.create_config()
41 | engine_config.runtime_config.dtype = torch.bfloat16
42 | local_rank = get_world_group().local_rank
43 | is_last_process = get_world_group().rank == get_world_group().world_size - 1
44 |
45 | assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion."
46 |
47 | transformer = xFuserZImageTransformer2DWrapper.from_pretrained(
48 | engine_config.model_config.model,
49 | torch_dtype=torch.bfloat16,
50 | subfolder="transformer",
51 | )
52 | pipe = ZImagePipeline.from_pretrained(
53 | pretrained_model_name_or_path=engine_config.model_config.model,
54 | engine_config=engine_config,
55 | transformer=transformer,
56 | torch_dtype=torch.bfloat16,
57 | )
58 | pipe = pipe.to(f"cuda:{local_rank}")
59 | parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
60 |
61 | initialize_runtime_state(pipe, engine_config)
62 |
63 | if engine_config.runtime_config.use_torch_compile:
64 | torch._inductor.config.reorder_for_compute_comm_overlap = True
65 | pipe.transformer = torch.compile(pipe.transformer, mode="default")
66 |
67 | # one full pass to warmup the torch compiler
68 | output = run_pipe(pipe, input_config)
69 |
70 | torch.cuda.reset_peak_memory_stats()
71 | start_time = time.time()
72 |
73 | output = run_pipe(pipe, input_config)
74 |
75 | end_time = time.time()
76 | elapsed_time = end_time - start_time
77 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
78 |
79 | parallel_info = (
80 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}"
81 | )
82 | if input_config.output_type == "pil":
83 | if is_last_process:
84 | image_name = f"zimage_result_{parallel_info}_tc_{engine_args.use_torch_compile}.png"
85 | output.save(f"./results/{image_name}")
86 | print(f"image saved to ./results/{image_name}")
87 |
88 | if is_last_process:
89 | print(
90 | f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9:.2f} GB"
91 | )
92 | get_runtime_state().destroy_distributed_env()
93 |
94 |
95 | if __name__ == "__main__":
96 | main()
97 |
--------------------------------------------------------------------------------
/examples/sana_example.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import torch
4 | import warnings
5 | import torch.distributed
6 | from xfuser import xFuserSanaPipeline, xFuserArgs
7 | from xfuser.config import FlexibleArgumentParser
8 | from xfuser.core.distributed import (
9 | get_world_group,
10 | is_dp_last_group,
11 | get_data_parallel_rank,
12 | get_runtime_state,
13 | )
14 | from xfuser.core.distributed.parallel_state import get_data_parallel_world_size
15 |
16 | data_type_dict = {
17 | "Sana_1600M_1024px_diffusers": torch.float16,
18 | "Sana_1600M_4Kpx_BF16_diffusers": torch.bfloat16,
19 | "SANA1.5_4.8B_1024px_diffusers": torch.bfloat16,
20 | "SANA1.5_1.6B_1024px_diffusers": torch.bfloat16,
21 | }
22 |
23 | def get_data_type(model_path):
24 | for model_name, data_type in data_type_dict.items():
25 | if model_name in model_path:
26 | return data_type
27 | warnings.warn(f"Unknown model path: {model_path}, using default data type: torch.float16")
28 | return torch.float16
29 |
30 |
31 | def main():
32 | parser = FlexibleArgumentParser(description="xFuser Arguments")
33 | args = xFuserArgs.add_cli_args(parser).parse_args()
34 | engine_args = xFuserArgs.from_cli_args(args)
35 | engine_config, input_config = engine_args.create_config()
36 | local_rank = get_world_group().local_rank
37 |
38 | data_type = get_data_type(engine_config.model_config.model)
39 | engine_config.runtime_config.dtype = data_type
40 | pipe = xFuserSanaPipeline.from_pretrained(
41 | pretrained_model_name_or_path=engine_config.model_config.model,
42 | engine_config=engine_config,
43 | torch_dtype=data_type,
44 | ).to(f"cuda:{local_rank}")
45 | pipe.vae.to(torch.bfloat16)
46 | pipe.text_encoder.to(torch.bfloat16)
47 | pipe.vae.enable_tiling(tile_sample_min_width=1024, tile_sample_min_height=1024)
48 |
49 | parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
50 |
51 | pipe.prepare_run(input_config)
52 |
53 | torch.cuda.reset_peak_memory_stats()
54 | start_time = time.time()
55 | output = pipe(
56 | height=input_config.height,
57 | width=input_config.width,
58 | prompt=input_config.prompt,
59 | num_inference_steps=input_config.num_inference_steps,
60 | output_type=input_config.output_type,
61 | generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
62 | guidance_scale=4.5
63 | )
64 | end_time = time.time()
65 | elapsed_time = end_time - start_time
66 | peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
67 |
68 | parallel_info = (
69 | f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
70 | f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
71 | f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
72 | )
73 | if input_config.output_type == "pil":
74 | dp_group_index = get_data_parallel_rank()
75 | num_dp_groups = get_data_parallel_world_size()
76 | dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
77 | if pipe.is_dp_last_group():
78 | if not os.path.exists("results"):
79 | os.mkdir("results")
80 | for i, image in enumerate(output.images):
81 | image_rank = dp_group_index * dp_batch_size + i
82 | image.save(
83 | f"./results/sana_result_{parallel_info}_{image_rank}.png"
84 | )
85 | print(
86 | f"image {i} saved to ./results/sana_result_{parallel_info}_{image_rank}.png"
87 | )
88 |
89 | if get_world_group().rank == get_world_group().world_size - 1:
90 | print(
91 | f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, peak memory: {peak_memory/1e9:.2f} GB"
92 | )
93 |
94 | get_runtime_state().destroy_distributed_env()
95 |
96 |
97 | if __name__ == "__main__":
98 | main()
99 |
--------------------------------------------------------------------------------