├── .gitignore ├── LICENSE ├── README.md ├── cpu_requirements.txt ├── gpu-trt-infer-demo.py ├── gpu_requirements.txt ├── libnvinfer_plugin.so.8.5.1 ├── pipeline_openvino_stable_diffusion.py └── utilities.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2023] [The OFA-Sys Team] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Diffusion deployment
3 | 
4 | This repository contains scripts for deploying diffusion models (based on [diffusers](https://github.com/huggingface/diffusers)) on both GPUs (NVIDIA) and CPUs (Intel). The aim is to significantly speed up the inference of diffusion models: it provides **a ~12x speedup on CPUs and a ~4x speedup on GPUs.**
5 | Integrated with [small-stable-diffusion-v0](https://huggingface.co/OFA-Sys/small-stable-diffusion-v0/), it can generate an image in just **5s** on a CPU.
6 | 
7 | ## CPU speedup
8 | The CPU deployment is built on [Intel OpenVINO](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html). The pipeline `OpenVINOStableDiffusionPipeline` is adapted from `OnnxStableDiffusionPipeline`; the code lives in `pipeline_openvino_stable_diffusion.py`.
9 | #### Results
10 | Here are some experimental results on Stable Diffusion v1.4, compared with the default PyTorch CPU and ONNX pipelines.
11 | 
12 | | Pipeline | PyTorch CPU | ONNX | OpenVINO |
13 | | --------- | ----------- | ------------ | --------------- |
14 | | Time Cost | 397 s | 77 s ± 2.56 s | 33.9 s ± 247 ms |
15 | | Speedup | 1 | 5.2 | 11.7 |
16 | 
17 | *Test setting: Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90 GHz, PNDM scheduler, 50 steps*
18 | 
19 | #### Prerequisites
20 | OpenVINO currently has several platform limitations, so the CPU speedup is only supported on the following platforms:
21 | + Ubuntu 18.04, 20.04, RHEL (CPU only), or Windows 10 64-bit
22 | + Python 3.7, 3.8, or 3.9 on Linux; Python 3.9 only on Windows
23 | 
24 | **Requirements**
25 | + diffusers
26 | + transformers
27 | + openvino runtime
28 | 
29 | To install the `openvino runtime`, simply run `pip install onnxruntime-openvino==1.13.0`.
30 | #### Usage
31 | To use this deployment, follow the code below:
32 | ```py
33 | # First, load an ONNX pipeline.
34 | from diffusers import OnnxStableDiffusionPipeline
35 | onnx_pipe = OnnxStableDiffusionPipeline.from_pretrained(
36 |     "OFA-Sys/small-stable-diffusion-v0",
37 |     revision="onnx",
38 |     provider="CPUExecutionProvider",
39 | )
40 | # Convert it to an OpenVINO pipeline.
41 | import pipeline_openvino_stable_diffusion
42 | openvino_pipe = pipeline_openvino_stable_diffusion.OpenVINOStableDiffusionPipeline.from_onnx_pipeline(onnx_pipe)
43 | 
44 | # Generate images.
45 | images = openvino_pipe("an apple, 4k")
46 | ```
47 | 
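The call returns a standard `StableDiffusionPipelineOutput`, so the generated PIL images are available under `.images`. Below is a minimal optional follow-up sketch (the prompt, seed, and output file name are placeholder values) showing how you could pass the usual generation parameters and save the first image:

```py
import numpy as np

result = openvino_pipe(
    "an apple, 4k",                       # prompt (placeholder)
    num_inference_steps=50,               # more steps: higher quality, slower
    guidance_scale=7.5,                   # classifier-free guidance strength
    negative_prompt="blurry, low quality",
    generator=np.random.RandomState(42),  # NumPy RandomState for reproducible latents
)
result.images[0].save("apple.png")        # output path is just an example
```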
48 | ## GPU speedup
49 | 
50 | The GPU deployment is built on TensorRT and its plugins.
51 | #### Comparison
52 | Here are some experimental results (inference time) for stable-diffusion-v1-4 and [small-stable-diffusion-v0](https://huggingface.co/OFA-Sys/small-stable-diffusion-v0/).
53 | | Model\Pipeline | PyTorch GPU | TensorRT | TensorRT Plugin |
54 | | ---------------------- | ----------- | -------- | --------------- |
55 | | Stable diffusion | 3.94s | 1.44s | 1.07s |
56 | | Small stable diffusion | 2.7s | 1.01s | 0.65s |
57 | 
58 | 
59 | #### Prerequisites
60 | See `gpu_requirements.txt` for the requirements. The plugins require `tensorrt>=8.5`. With `tensorrt==8.4` you can still run the demo by deleting the `trt.init_libnvinfer_plugins(TRT_LOGGER, '')` call in `gpu-trt-infer-demo.py` and not adding `PLUGIN_LIBS` to `LD_PRELOAD`.
61 | 
62 | #### Usage
63 | ```sh
64 | export PLUGIN_LIBS="/path/to/libnvinfer_plugin.so.8.5.1"
65 | export HF_TOKEN="Your_HF_TOKEN"
66 | 
67 | mkdir -p onnx engine output
68 | LD_PRELOAD=${PLUGIN_LIBS} python3 gpu-trt-infer-demo.py "a beautiful photograph of Mt. Fuji during cherry blossom" --enable-preview-features --hf-token=$HF_TOKEN -v
69 | 
70 | ```
71 | 
72 | 
73 | 
74 | ## Contributions
75 | Contributions to this repository are welcome. If you would like to contribute, please open a pull request and make sure to follow the existing code style.
76 | 
--------------------------------------------------------------------------------
/cpu_requirements.txt:
--------------------------------------------------------------------------------
1 | diffusers
2 | transformers
3 | onnxruntime-openvino==1.13.0
--------------------------------------------------------------------------------
/gpu-trt-infer-demo.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The OFA-Sys Team.
2 | # All rights reserved.
3 | # This source code is licensed under the Apache 2.0 license
4 | # found in the LICENSE file in the root directory.
5 | # SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
6 | # SPDX-License-Identifier: Apache-2.0
7 | #
8 | # Licensed under the Apache License, Version 2.0 (the "License");
9 | # you may not use this file except in compliance with the License.
10 | # You may obtain a copy of the License at
11 | #
12 | # http://www.apache.org/licenses/LICENSE-2.0
13 | #
14 | # Unless required by applicable law or agreed to in writing, software
15 | # distributed under the License is distributed on an "AS IS" BASIS,
16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | # See the License for the specific language governing permissions and
18 | # limitations under the License.
19 | #
20 | 
21 | import argparse
22 | from cuda import cudart
23 | from models import CLIP, UNet, VAE
24 | import numpy as np
25 | import nvtx
26 | import os
27 | import onnx
28 | from polygraphy import cuda
29 | import time
30 | import torch
31 | from transformers import CLIPTokenizer
32 | import tensorrt as trt
33 | from utilities import Engine, DPMScheduler, LMSDiscreteScheduler, save_image, TRT_LOGGER
34 | 
35 | 
36 | def parseArgs():
37 |     parser = argparse.ArgumentParser(
38 |         description="Options for Stable Diffusion Demo")
39 |     # Stable Diffusion configuration
40 |     parser.add_argument('prompt',
41 |                         nargs='*',
42 |                         help="Text prompt(s) to guide image generation")
43 |     parser.add_argument(
44 |         '--height',
45 |         type=int,
46 |         default=512,
47 |         help="Height of image to generate (must be multiple of 8)")
48 |     parser.add_argument(
49 |         '--width',
50 |         type=int,
51 |         default=512,
52 |         help="Width of image to generate (must be multiple of 8)")
53 |     parser.add_argument('--num-images',
54 |                         type=int,
55 |                         default=1,
56 |                         help="Number of images to generate per prompt")
57 |     parser.add_argument('--steps',
58 |                         type=int,
59 |                         default=50,
60 |                         help="Number of inference steps")
61 |     parser.add_argument('--denoise-prec',
62 |                         type=str,
63 |                         default='fp16',
64 |                         choices=['fp32', 'fp16'],
65 |                         help="UNet model precision")
66 |     parser.add_argument(
67 |         '--negative-prompt',
68 |         nargs='*',
69 |         default=[''],
70 |         help="The negative prompt(s) to guide the image generation.")
71 |     parser.add_argument(
72 |         '--repeat-prompt',
73 |         type=int,
74 |         default=1,
75 |         choices=[1, 2, 4, 8, 16],
76 |         help="Number of times to repeat the prompt (batch size multiplier)")
77 |     parser.add_argument('--scheduler',
78 |                         type=str,
79 |                         default="LMSD",
80 |                         choices=["LMSD", "DPM"],
81 |                         help="Scheduler for diffusion process")
82 | 
83 |     # ONNX export
84 |     parser.add_argument(
85 |         '--onnx-opset',
86 |         type=int,
87 |         default=16,
88 |         choices=range(7, 18),
89 |         help="Select ONNX opset version to target for exported models")
90 |     parser.add_argument('--onnx-dir',
91 |                         default='onnx',
92 |                         help="Output directory for ONNX export")
93 |     parser.add_argument('--force-onnx-export',
94 |                         action='store_true',
95 |                         help="Force ONNX export of CLIP, UNET, and VAE models")
96 |     parser.add_argument(
97 |         '--force-onnx-optimize',
98 |         action='store_true',
99 |         help="Force ONNX optimizations for CLIP, UNET, and VAE models")
100 |     parser.add_argument('--force-engine-build',
101 |                         action='store_true',
102 |                         help="Force rebuilding the TensorRT engine")
103 |     parser.add_argument(
104 |         '--force-static-batch',
105 |         action='store_true',
106 |         help="Force building TensorRT engines with fixed batch size.")
107 |     parser.add_argument(
108 |         '--minimal-optimization',
109 |         action='store_true',
110 |         help="Limited optimizations to only const folding and shape inference."
111 | ) 112 | parser.add_argument('--enable-preview-features', 113 | action='store_true', 114 | help="Enable TensorRT preview features.") 115 | 116 | # TensorRT inference 117 | parser.add_argument('--engine-dir', 118 | default='engine', 119 | help="Output directory for TensorRT engines") 120 | parser.add_argument( 121 | '--num-warmup-runs', 122 | type=int, 123 | default=5, 124 | help="Number of warmup runs before benchmarking performance") 125 | parser.add_argument('--profile', 126 | action='store_true', 127 | help="Enable performance profiling") 128 | parser.add_argument( 129 | '--seed', 130 | type=int, 131 | default=None, 132 | help="Seed for random generator to get consistent results") 133 | 134 | parser.add_argument('--output-dir', 135 | default='output', 136 | help="Output directory for logs and image artifacts") 137 | parser.add_argument( 138 | '--hf-token', 139 | type=str, 140 | help="HuggingFace API token to use for downloading checkpoints") 141 | parser.add_argument('-v', 142 | '--verbose', 143 | action='store_true', 144 | help="Show verbose output") 145 | return parser.parse_args() 146 | 147 | 148 | class DemoDiffusion: 149 | """ 150 | Application showcasing the acceleration of [Stable Diffusion v1.4](https://huggingface.co/CompVis/stable-diffusion-v1-4) pipeline using NVidia TensorRT w/ Plugins. 151 | """ 152 | def __init__( 153 | self, 154 | image_height, 155 | image_width, 156 | denoising_steps, 157 | scheduler="LMSD", 158 | denoising_fp16=True, 159 | guidance_scale=7.5, 160 | device='cuda', 161 | output_dir='.', 162 | hf_token=None, 163 | verbose=False, 164 | profile=False, 165 | ): 166 | """ 167 | Initializes the Diffusion pipeline. 168 | 169 | Args: 170 | image_height (int): 171 | Height (in pixels) of the image to be generated. Should be a multiple of 8. 172 | image_width (int): 173 | Width (in pixels) of the image to be generated. Should be a multiple of 8. 174 | denoising_steps (int): 175 | The number of denoising steps. 176 | More denoising steps usually lead to a higher quality image at the expense of slower inference. 177 | denoising_fp16 (bool): 178 | Run the denoising loop (UNet) in fp16 precision. 179 | When enabled image quality will be lower but generally results in higher throughput. 180 | guidance_scale (float): 181 | Guidance scale is enabled by setting as > 1. 182 | Higher guidance scale encourages to generate images that are closely linked to the text prompt, usually at the expense of lower image quality. 183 | device (str): 184 | PyTorch device to run inference. Default: 'cuda' 185 | output_dir (str): 186 | Output directory for log files and image artifacts 187 | hf_token (str): 188 | HuggingFace User Access Token to use for downloading Stable Diffusion model checkpoints. 189 | verbose (bool): 190 | Enable verbose logging. 191 | profile (bool): 192 | Insert NVTX profiling markers. 193 | """ 194 | 195 | if image_height % 8 != 0 or image_width % 8 != 0: 196 | raise ValueError( 197 | f"Image height and width have to be divisible by 8 but specified as: {image_height} and {image_width}." 198 | ) 199 | # Spatial dimensions of latent tensor 200 | self.latent_height = image_height // 8 201 | self.latent_width = image_width // 8 202 | 203 | # Only supports single image per prompt. 
204 | self.num_images = 1 205 | 206 | self.denoising_steps = denoising_steps 207 | self.denoising_fp16 = denoising_fp16 208 | assert guidance_scale > 1.0 209 | self.guidance_scale = guidance_scale 210 | 211 | self.output_dir = output_dir 212 | self.hf_token = hf_token 213 | self.device = device 214 | self.verbose = verbose 215 | self.profile = profile 216 | 217 | # A scheduler to be used in combination with unet to denoise the encoded image latens. 218 | # This demo uses an adaptation of LMSDiscreteScheduler or DPMScheduler: 219 | sched_opts = { 220 | 'num_train_timesteps': 1000, 221 | 'beta_start': 0.00085, 222 | 'beta_end': 0.012 223 | } 224 | if scheduler == "DPM": 225 | self.scheduler = DPMScheduler(device=self.device, **sched_opts) 226 | elif scheduler == "LMSD": 227 | self.scheduler = LMSDiscreteScheduler(device=self.device, 228 | **sched_opts) 229 | else: 230 | raise ValueError(f"Scheduler should be either DPM or LMSD") 231 | 232 | self.tokenizer = None 233 | 234 | self.unet_model_key = 'unet_fp16' if denoising_fp16 else 'unet' 235 | self.models = { 236 | 'clip': 237 | CLIP(hf_token=hf_token, 238 | image_width=image_width, 239 | image_height=image_height, 240 | device=device, 241 | verbose=verbose), 242 | self.unet_model_key: 243 | UNet(hf_token=hf_token, 244 | image_width=image_width, 245 | image_height=image_height, 246 | fp16=denoising_fp16, 247 | device=device, 248 | verbose=verbose), 249 | 'vae': 250 | VAE(hf_token=hf_token, 251 | image_width=image_width, 252 | image_height=image_height, 253 | device=device, 254 | verbose=verbose) 255 | } 256 | 257 | self.engine = {} 258 | self.stream = cuda.Stream() 259 | 260 | def teardown(self): 261 | for engine in self.engine.values(): 262 | del engine 263 | self.stream.free() 264 | del self.stream 265 | 266 | def getModelPath(self, name, onnx_dir, opt=True): 267 | return os.path.join(onnx_dir, name + ('.opt' if opt else '') + '.onnx') 268 | 269 | def loadEngines( 270 | self, 271 | engine_dir, 272 | onnx_dir, 273 | onnx_opset, 274 | opt_batch_size, 275 | force_export=False, 276 | force_optimize=False, 277 | force_build=False, 278 | minimal_optimization=False, 279 | static_batch=False, 280 | enable_preview=False, 281 | ): 282 | """ 283 | Build and load engines for TensorRT accelerated inference. 284 | Export ONNX models first, if applicable. 285 | 286 | Args: 287 | engine_dir (str): 288 | Directory to write the TensorRT engines. 289 | onnx_dir (str): 290 | Directory to write the ONNX models. 291 | onnx_opset (int): 292 | ONNX opset version to export the models. 293 | opt_batch_size (int): 294 | Batch size to optimize for during engine building. 295 | force_export (bool): 296 | Force re-exporting the ONNX models. 297 | force_optimize (bool): 298 | Force re-optimizing the ONNX models. 299 | force_build (bool): 300 | Force re-building the TensorRT engine. 301 | minimal_optimization (bool): 302 | Apply minimal optimizations during build (no plugins). 303 | static_batch (bool): 304 | Build engine only for specified opt_batch_size. 305 | enable_preview (bool): 306 | Enable TensorRT preview features. 
307 | """ 308 | def exportOnnx(model_name, obj): 309 | onnx_opt_path = self.getModelPath(model_name, onnx_dir) 310 | if force_optimize or not os.path.exists(onnx_opt_path): 311 | onnx_path = self.getModelPath(model_name, onnx_dir, opt=False) 312 | if force_export or not os.path.exists(onnx_path): 313 | print(f"Exporting model: {onnx_path}") 314 | model = obj.get_model() 315 | with torch.inference_mode(), torch.autocast("cuda"): 316 | inputs = obj.get_sample_input( 317 | batch_size=opt_batch_size) 318 | torch.onnx.export( 319 | model, 320 | inputs, 321 | onnx_path, 322 | export_params=True, 323 | opset_version=onnx_opset, 324 | do_constant_folding=True, 325 | input_names=obj.get_input_names(), 326 | output_names=obj.get_output_names(), 327 | dynamic_axes=obj.get_dynamic_axes(), 328 | ) 329 | else: 330 | print(f"Found cached model: {onnx_path}") 331 | 332 | print(f"Generating optimizing model: {onnx_opt_path}") 333 | onnx_opt_graph = obj.optimize( 334 | onnx.load(onnx_path), 335 | minimal_optimization=minimal_optimization) 336 | onnx.save(onnx_opt_graph, onnx_opt_path) 337 | else: 338 | print(f"Found cached optimized model: {onnx_opt_path} ") 339 | 340 | # Build engines 341 | for model_name, obj in self.models.items(): 342 | engine = Engine(model_name, engine_dir) 343 | if force_build or not os.path.exists(engine.engine_path): 344 | onnx_path = self.getModelPath(model_name, onnx_dir) 345 | if not os.path.exists(onnx_path): 346 | exportOnnx(model_name, obj) 347 | engine.build(onnx_path, fp16=True, \ 348 | input_profile=obj.get_input_profile(batch_size=opt_batch_size, static_batch=static_batch), \ 349 | enable_preview=enable_preview) 350 | 351 | # Load engines 352 | for model_name, obj in self.models.items(): 353 | engine = Engine(model_name, engine_dir) 354 | engine.activate() 355 | self.engine[model_name] = engine 356 | 357 | def loadModules(self, ): 358 | self.tokenizer = CLIPTokenizer.from_pretrained( 359 | "openai/clip-vit-large-patch14") 360 | self.scheduler.set_timesteps(self.denoising_steps) 361 | # Pre-compute latent input scales and linear multistep coefficients 362 | self.scheduler.configure() 363 | 364 | def runEngine(self, model_name, feed_dict): 365 | engine = self.engine[model_name] 366 | return engine.infer(feed_dict, self.stream) 367 | 368 | def infer( 369 | self, 370 | prompt, 371 | negative_prompt, 372 | warmup=False, 373 | verbose=False, 374 | ): 375 | """ 376 | Run the diffusion pipeline. 377 | 378 | Args: 379 | prompt (str): 380 | The text prompt to guide image generation. 381 | negative_prompt (str): 382 | The prompt not to guide the image generation. 383 | warmup (bool): 384 | Indicate if this is a warmup run. 385 | verbose (bool): 386 | Enable verbose logging. 
387 | """ 388 | # Process inputs 389 | batch_size = len(prompt) 390 | assert len(prompt) == len(negative_prompt) 391 | 392 | # Create profiling events 393 | events = {} 394 | for stage in ['clip', 'denoise', 'vae']: 395 | for marker in ['start', 'stop']: 396 | events[stage + '-' + marker] = cudart.cudaEventCreate()[1] 397 | 398 | # Allocate buffers for TensorRT engine bindings 399 | for model_name, obj in self.models.items(): 400 | self.engine[model_name].allocate_buffers( 401 | shape_dict=obj.get_shape_dict(batch_size=batch_size), 402 | device=self.device) 403 | 404 | generator = None 405 | if args.seed is not None: 406 | generator = torch.Generator(device="cuda").manual_seed(args.seed) 407 | 408 | # Run Stable Diffusion pipeline 409 | with torch.inference_mode(), torch.autocast("cuda"), trt.Runtime( 410 | TRT_LOGGER) as runtime: 411 | # latents need to be generated on the target device 412 | unet_channels = 4 # unet.in_channels 413 | latents_shape = (batch_size * self.num_images, unet_channels, 414 | self.latent_height, self.latent_width) 415 | latents_dtype = torch.float32 # text_embeddings.dtype 416 | latents = torch.randn(latents_shape, 417 | device=self.device, 418 | dtype=latents_dtype, 419 | generator=generator) 420 | 421 | # Scale the initial noise by the standard deviation required by the scheduler 422 | latents = latents * self.scheduler.init_noise_sigma 423 | 424 | torch.cuda.synchronize() 425 | e2e_tic = time.perf_counter() 426 | 427 | if self.profile: 428 | nvtx_clip = nvtx.start_range(message='clip', color='green') 429 | cudart.cudaEventRecord(events['clip-start'], 0) 430 | # Tokenize input 431 | text_input_ids = self.tokenizer( 432 | prompt, 433 | padding="max_length", 434 | max_length=self.tokenizer.model_max_length, 435 | return_tensors="pt", 436 | ).input_ids.type(torch.int32).to(self.device) 437 | 438 | # CLIP text encoder 439 | text_input_ids_inp = cuda.DeviceView(ptr=text_input_ids.data_ptr(), 440 | shape=text_input_ids.shape, 441 | dtype=np.int32) 442 | text_embeddings = self.runEngine( 443 | 'clip', {"input_ids": text_input_ids_inp})['text_embeddings'] 444 | 445 | # Duplicate text embeddings for each generation per prompt 446 | bs_embed, seq_len, _ = text_embeddings.shape 447 | text_embeddings = text_embeddings.repeat(1, self.num_images, 1) 448 | text_embeddings = text_embeddings.view(bs_embed * self.num_images, 449 | seq_len, -1) 450 | 451 | max_length = text_input_ids.shape[-1] 452 | uncond_input_ids = self.tokenizer( 453 | negative_prompt, 454 | padding="max_length", 455 | max_length=max_length, 456 | truncation=True, 457 | return_tensors="pt", 458 | ).input_ids.type(torch.int32).to(self.device) 459 | uncond_input_ids_inp = cuda.DeviceView( 460 | ptr=uncond_input_ids.data_ptr(), 461 | shape=uncond_input_ids.shape, 462 | dtype=np.int32) 463 | uncond_embeddings = self.runEngine( 464 | 'clip', {"input_ids": uncond_input_ids_inp})['text_embeddings'] 465 | 466 | # Duplicate unconditional embeddings for each generation per prompt 467 | seq_len = uncond_embeddings.shape[1] 468 | uncond_embeddings = uncond_embeddings.repeat(1, self.num_images, 1) 469 | uncond_embeddings = uncond_embeddings.view( 470 | batch_size * self.num_images, seq_len, -1) 471 | 472 | # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance 473 | text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) 474 | 475 | if self.denoising_fp16: 476 | text_embeddings = text_embeddings.to(dtype=torch.float16) 477 | 478 | 
cudart.cudaEventRecord(events['clip-stop'], 0) 479 | if self.profile: 480 | nvtx.end_range(nvtx_clip) 481 | 482 | cudart.cudaEventRecord(events['denoise-start'], 0) 483 | for step_index, timestep in enumerate(self.scheduler.timesteps): 484 | if self.profile: 485 | nvtx_latent_scale = nvtx.start_range( 486 | message='latent_scale', color='pink') 487 | # expand the latents if we are doing classifier free guidance 488 | latent_model_input = torch.cat([latents] * 2) 489 | # LMSDiscreteScheduler.scale_model_input() 490 | latent_model_input = self.scheduler.scale_model_input( 491 | latent_model_input, step_index) 492 | if self.profile: 493 | nvtx.end_range(nvtx_latent_scale) 494 | 495 | # predict the noise residual 496 | if self.profile: 497 | nvtx_unet = nvtx.start_range(message='unet', color='blue') 498 | dtype = np.float16 if self.denoising_fp16 else np.float32 499 | if timestep.dtype != torch.float32: 500 | timestep_float = timestep.float() 501 | else: 502 | timestep_float = timestep 503 | sample_inp = cuda.DeviceView(ptr=latent_model_input.data_ptr(), 504 | shape=latent_model_input.shape, 505 | dtype=np.float32) 506 | timestep_inp = cuda.DeviceView(ptr=timestep_float.data_ptr(), 507 | shape=timestep_float.shape, 508 | dtype=np.float32) 509 | embeddings_inp = cuda.DeviceView( 510 | ptr=text_embeddings.data_ptr(), 511 | shape=text_embeddings.shape, 512 | dtype=dtype) 513 | noise_pred = self.runEngine( 514 | self.unet_model_key, { 515 | "sample": sample_inp, 516 | "timestep": timestep_inp, 517 | "encoder_hidden_states": embeddings_inp 518 | })['latent'] 519 | if self.profile: 520 | nvtx.end_range(nvtx_unet) 521 | 522 | if self.profile: 523 | nvtx_latent_step = nvtx.start_range(message='latent_step', 524 | color='pink') 525 | # Perform guidance 526 | noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) 527 | noise_pred = noise_pred_uncond + self.guidance_scale * ( 528 | noise_pred_text - noise_pred_uncond) 529 | 530 | latents = self.scheduler.step(noise_pred, latents, step_index, 531 | timestep) 532 | 533 | if self.profile: 534 | nvtx.end_range(nvtx_latent_step) 535 | 536 | latents = 1. 
/ 0.18215 * latents 537 | cudart.cudaEventRecord(events['denoise-stop'], 0) 538 | 539 | if self.profile: 540 | nvtx_vae = nvtx.start_range(message='vae', color='red') 541 | cudart.cudaEventRecord(events['vae-start'], 0) 542 | sample_inp = cuda.DeviceView(ptr=latents.data_ptr(), 543 | shape=latents.shape, 544 | dtype=np.float32) 545 | images = self.runEngine('vae', {"latent": sample_inp})['images'] 546 | cudart.cudaEventRecord(events['vae-stop'], 0) 547 | if self.profile: 548 | nvtx.end_range(nvtx_vae) 549 | 550 | torch.cuda.synchronize() 551 | e2e_toc = time.perf_counter() 552 | if not warmup: 553 | print('|------------|--------------|') 554 | print('| {:^10} | {:^12} |'.format('Module', 'Latency')) 555 | print('|------------|--------------|') 556 | print('| {:^10} | {:>9.2f} ms |'.format( 557 | 'CLIP', 558 | cudart.cudaEventElapsedTime(events['clip-start'], 559 | events['clip-stop'])[1])) 560 | print('| {:^10} | {:>9.2f} ms |'.format( 561 | 'UNet x ' + str(self.denoising_steps), 562 | cudart.cudaEventElapsedTime(events['denoise-start'], 563 | events['denoise-stop'])[1])) 564 | print('| {:^10} | {:>9.2f} ms |'.format( 565 | 'VAE', 566 | cudart.cudaEventElapsedTime(events['vae-start'], 567 | events['vae-stop'])[1])) 568 | print('|------------|--------------|') 569 | print('| {:^10} | {:>9.2f} ms |'.format( 570 | 'Pipeline', (e2e_toc - e2e_tic) * 1000.)) 571 | print('|------------|--------------|') 572 | 573 | # Save image 574 | image_name_prefix = 'sd-' + ( 575 | 'fp16' if self.denoising_fp16 else 'fp32') + ''.join( 576 | set([ 577 | '-' + prompt[i].replace(' ', '_')[:10] 578 | for i in range(batch_size) 579 | ])) + '-' 580 | save_image(images, self.output_dir, image_name_prefix) 581 | 582 | 583 | if __name__ == "__main__": 584 | 585 | print("[I] Initializing StableDiffusion demo with TensorRT Plugins") 586 | args = parseArgs() 587 | 588 | # Process prompt 589 | if not isinstance(args.prompt, list): 590 | raise ValueError( 591 | f"`prompt` must be of type `str` or `str` list, but is {type(prompt)}" 592 | ) 593 | prompt = args.prompt * args.repeat_prompt 594 | 595 | if not isinstance(args.negative_prompt, list): 596 | raise ValueError( 597 | f"`--negative-prompt` must be of type `str` or `str` list, but is {type(args.negative_prompt)}" 598 | ) 599 | if len(args.negative_prompt) == 1: 600 | negative_prompt = args.negative_prompt * len(prompt) 601 | else: 602 | negative_prompt = args.negative_prompt 603 | 604 | # Register TensorRT plugins 605 | trt.init_libnvinfer_plugins(TRT_LOGGER, '') 606 | 607 | # Initialize demo 608 | demo = DemoDiffusion(image_height=args.height, 609 | image_width=args.width, 610 | denoising_steps=args.steps, 611 | denoising_fp16=(args.denoise_prec == 'fp16'), 612 | output_dir=args.output_dir, 613 | scheduler=args.scheduler, 614 | hf_token=args.hf_token, 615 | verbose=args.verbose, 616 | profile=args.profile) 617 | 618 | # Build/load TensorRT engines and torch models 619 | demo.loadEngines(args.engine_dir, args.onnx_dir, args.onnx_opset, opt_batch_size=len(prompt), \ 620 | force_export=args.force_onnx_export, force_optimize=args.force_onnx_optimize, \ 621 | force_build=args.force_engine_build, minimal_optimization=args.minimal_optimization, \ 622 | static_batch=args.force_static_batch, enable_preview=args.enable_preview_features) 623 | demo.loadModules() 624 | 625 | print("[I] Warming up ..") 626 | for _ in range(args.num_warmup_runs): 627 | images = demo.infer(prompt, 628 | negative_prompt, 629 | warmup=True, 630 | verbose=False) 631 | 632 | print("[I] Running 
StableDiffusion pipeline") 633 | if args.profile: 634 | cudart.cudaProfilerStart() 635 | images = demo.infer(prompt, negative_prompt, verbose=args.verbose) 636 | if args.profile: 637 | cudart.cudaProfilerStop() 638 | 639 | demo.teardown() 640 | -------------------------------------------------------------------------------- /gpu_requirements.txt: -------------------------------------------------------------------------------- 1 | colored 2 | cuda-python 3 | diffusers==0.8.0 4 | numpy==1.23.5 5 | ftfy 6 | matplotlib 7 | nvtx 8 | onnx==1.12.0 9 | --extra-index-url https://pypi.ngc.nvidia.com 10 | onnx-graphsurgeon==0.3.25 11 | onnxruntime==1.13.1 12 | polygraphy==0.43.1 13 | scipy 14 | transformers==4.25.1 15 | tensorrt==8.5.1.7 -------------------------------------------------------------------------------- /libnvinfer_plugin.so.8.5.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OFA-Sys/diffusion-deploy/6a0a083a7ceca5faad7e76c0517f1ceca28e1d50/libnvinfer_plugin.so.8.5.1 -------------------------------------------------------------------------------- /pipeline_openvino_stable_diffusion.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The OFA-Sys Team. 2 | # This source code is licensed under the Apache 2.0 license 3 | # found in the LICENSE file in the root directory. 4 | # Copyright 2022 The HuggingFace Inc. team. 5 | # All rights reserved. 6 | # This source code is licensed under the Apache 2.0 license 7 | # found in the LICENSE file in the root directory. 8 | 9 | import inspect 10 | from typing import Callable, List, Optional, Union 11 | 12 | import numpy as np 13 | import torch 14 | import os 15 | 16 | from transformers import CLIPFeatureExtractor, CLIPTokenizer 17 | 18 | from diffusers.configuration_utils import FrozenDict 19 | from diffusers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler 20 | from diffusers.utils import deprecate, logging 21 | from diffusers.onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel 22 | 23 | from diffusers import OnnxStableDiffusionPipeline, DiffusionPipeline 24 | from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput 25 | from openvino.runtime import Core 26 | 27 | logger = logging.get_logger(__name__) 28 | 29 | 30 | class OpenVINOStableDiffusionPipeline(DiffusionPipeline): 31 | vae_encoder: OnnxRuntimeModel 32 | vae_decoder: OnnxRuntimeModel 33 | text_encoder: OnnxRuntimeModel 34 | tokenizer: CLIPTokenizer 35 | unet: OnnxRuntimeModel 36 | scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler] 37 | safety_checker: OnnxRuntimeModel 38 | feature_extractor: CLIPFeatureExtractor 39 | 40 | _optional_components = ["safety_checker", "feature_extractor"] 41 | 42 | def __init__( 43 | self, 44 | vae_encoder: OnnxRuntimeModel, 45 | vae_decoder: OnnxRuntimeModel, 46 | text_encoder: OnnxRuntimeModel, 47 | tokenizer: CLIPTokenizer, 48 | unet: OnnxRuntimeModel, 49 | scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], 50 | safety_checker: OnnxRuntimeModel, 51 | feature_extractor: CLIPFeatureExtractor, 52 | requires_safety_checker: bool = True, 53 | ): 54 | super().__init__() 55 | 56 | if hasattr(scheduler.config, 57 | "steps_offset") and scheduler.config.steps_offset != 1: 58 | deprecation_message = ( 59 | f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" 60 | f" should be set to 1 instead of {scheduler.config.steps_offset}. 
Please make sure " 61 | "to update the config accordingly as leaving `steps_offset` might led to incorrect results" 62 | " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," 63 | " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" 64 | " file") 65 | deprecate("steps_offset!=1", 66 | "1.0.0", 67 | deprecation_message, 68 | standard_warn=False) 69 | new_config = dict(scheduler.config) 70 | new_config["steps_offset"] = 1 71 | scheduler._internal_dict = FrozenDict(new_config) 72 | 73 | if hasattr(scheduler.config, 74 | "clip_sample") and scheduler.config.clip_sample is True: 75 | deprecation_message = ( 76 | f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." 77 | " `clip_sample` should be set to False in the configuration file. Please make sure to update the" 78 | " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" 79 | " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" 80 | " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" 81 | ) 82 | deprecate("clip_sample not set", 83 | "1.0.0", 84 | deprecation_message, 85 | standard_warn=False) 86 | new_config = dict(scheduler.config) 87 | new_config["clip_sample"] = False 88 | scheduler._internal_dict = FrozenDict(new_config) 89 | 90 | if safety_checker is None and requires_safety_checker: 91 | logger.warning( 92 | f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" 93 | " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" 94 | " results in services or applications open to the public. Both the diffusers team and Hugging Face" 95 | " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" 96 | " it only for use-cases that involve analyzing network behavior or auditing its results. For more" 97 | " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." 98 | ) 99 | 100 | if safety_checker is not None and feature_extractor is None: 101 | raise ValueError( 102 | "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" 103 | " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 104 | ) 105 | 106 | self.register_modules( 107 | vae_encoder=vae_encoder, 108 | vae_decoder=vae_decoder, 109 | text_encoder=text_encoder, 110 | tokenizer=tokenizer, 111 | unet=unet, 112 | scheduler=scheduler, 113 | safety_checker=safety_checker, 114 | feature_extractor=feature_extractor, 115 | ) 116 | self.convert_to_openvino() 117 | self.register_to_config( 118 | requires_safety_checker=requires_safety_checker) 119 | 120 | @classmethod 121 | def from_onnx_pipeline(cls, onnx_pipe: OnnxStableDiffusionPipeline): 122 | r""" 123 | Create OpenVINOStableDiffusionPipeline from a onnx stable pipeline. 
124 | Parameters: 125 | onnx_pipe (OnnxStableDiffusionPipeline) 126 | """ 127 | return cls(onnx_pipe.vae_encoder, onnx_pipe.vae_decoder, 128 | onnx_pipe.text_encoder, onnx_pipe.tokenizer, onnx_pipe.unet, 129 | onnx_pipe.scheduler, onnx_pipe.safety_checker, 130 | onnx_pipe.feature_extractor, True) 131 | 132 | def convert_to_openvino(self): 133 | ie = Core() 134 | 135 | # VAE decoder 136 | vae_decoder_onnx = ie.read_model( 137 | model=os.path.join(self.vae_decoder.model_save_dir, "model.onnx")) 138 | vae_decoder = ie.compile_model(model=vae_decoder_onnx, 139 | device_name="CPU") 140 | 141 | # Text encoder 142 | text_encoder_onnx = ie.read_model( 143 | model=os.path.join(self.text_encoder.model_save_dir, "model.onnx")) 144 | text_encoder = ie.compile_model(model=text_encoder_onnx, 145 | device_name="CPU") 146 | 147 | # Unet 148 | unet_onnx = ie.read_model( 149 | model=os.path.join(self.unet.model_save_dir, "model.onnx")) 150 | unet = ie.compile_model(model=unet_onnx, device_name="CPU") 151 | 152 | self.register_modules(vae_decoder=vae_decoder, 153 | text_encoder=text_encoder, 154 | unet=unet) 155 | 156 | def _encode_prompt(self, prompt, num_images_per_prompt, 157 | do_classifier_free_guidance, negative_prompt): 158 | r""" 159 | Encodes the prompt into text encoder hidden states. 160 | Args: 161 | prompt (`str` or `List[str]`): 162 | prompt to be encoded 163 | num_images_per_prompt (`int`): 164 | number of images that should be generated per prompt 165 | do_classifier_free_guidance (`bool`): 166 | whether to use classifier free guidance or not 167 | negative_prompt (`str` or `List[str]`): 168 | The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored 169 | if `guidance_scale` is less than `1`). 170 | """ 171 | batch_size = len(prompt) if isinstance(prompt, list) else 1 172 | 173 | # get prompt text embeddings 174 | text_inputs = self.tokenizer( 175 | prompt, 176 | padding="max_length", 177 | max_length=self.tokenizer.model_max_length, 178 | truncation=True, 179 | return_tensors="np", 180 | ) 181 | text_input_ids = text_inputs.input_ids 182 | untruncated_ids = self.tokenizer(prompt, 183 | padding="max_length", 184 | return_tensors="np").input_ids 185 | 186 | if not np.array_equal(text_input_ids, untruncated_ids): 187 | removed_text = self.tokenizer.batch_decode( 188 | untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) 189 | logger.warning( 190 | "The following part of your input was truncated because CLIP can only handle sequences up to" 191 | f" {self.tokenizer.model_max_length} tokens: {removed_text}") 192 | 193 | prompt_embeds = self.text_encoder( 194 | {"input_ids": 195 | text_input_ids.astype(np.int32)})[self.text_encoder.outputs[0]] 196 | prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) 197 | 198 | # get unconditional embeddings for classifier free guidance 199 | if do_classifier_free_guidance: 200 | uncond_tokens: List[str] 201 | if negative_prompt is None: 202 | uncond_tokens = [""] * batch_size 203 | elif type(prompt) is not type(negative_prompt): 204 | raise TypeError( 205 | f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" 206 | f" {type(prompt)}.") 207 | elif isinstance(negative_prompt, str): 208 | uncond_tokens = [negative_prompt] * batch_size 209 | elif batch_size != len(negative_prompt): 210 | raise ValueError( 211 | f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" 212 | f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" 213 | " the batch size of `prompt`.") 214 | else: 215 | uncond_tokens = negative_prompt 216 | 217 | max_length = text_input_ids.shape[-1] 218 | uncond_input = self.tokenizer( 219 | uncond_tokens, 220 | padding="max_length", 221 | max_length=max_length, 222 | truncation=True, 223 | return_tensors="np", 224 | ) 225 | negative_prompt_embeds = self.text_encoder({ 226 | "input_ids": 227 | uncond_input.input_ids.astype(np.int32) 228 | })[self.text_encoder.outputs[0]] 229 | negative_prompt_embeds = np.repeat(negative_prompt_embeds, 230 | num_images_per_prompt, 231 | axis=0) 232 | 233 | # For classifier free guidance, we need to do two forward passes. 234 | # Here we concatenate the unconditional and text embeddings into a single batch 235 | # to avoid doing two forward passes 236 | prompt_embeds = np.concatenate( 237 | [negative_prompt_embeds, prompt_embeds]) 238 | 239 | return prompt_embeds 240 | 241 | def __call__( 242 | self, 243 | prompt: Union[str, List[str]], 244 | height: Optional[int] = 512, 245 | width: Optional[int] = 512, 246 | num_inference_steps: Optional[int] = 50, 247 | guidance_scale: Optional[float] = 7.5, 248 | negative_prompt: Optional[Union[str, List[str]]] = None, 249 | num_images_per_prompt: Optional[int] = 1, 250 | eta: Optional[float] = 0.0, 251 | generator: Optional[np.random.RandomState] = None, 252 | latents: Optional[np.ndarray] = None, 253 | output_type: Optional[str] = "pil", 254 | return_dict: bool = True, 255 | callback: Optional[Callable[[int, int, np.ndarray], None]] = None, 256 | callback_steps: Optional[int] = 1, 257 | ): 258 | if isinstance(prompt, str): 259 | batch_size = 1 260 | elif isinstance(prompt, list): 261 | batch_size = len(prompt) 262 | else: 263 | raise ValueError( 264 | f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" 265 | ) 266 | 267 | if height % 8 != 0 or width % 8 != 0: 268 | raise ValueError( 269 | f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 270 | ) 271 | 272 | if (callback_steps is None) or (callback_steps is not None and 273 | (not isinstance(callback_steps, int) 274 | or callback_steps <= 0)): 275 | raise ValueError( 276 | f"`callback_steps` has to be a positive integer but is {callback_steps} of type" 277 | f" {type(callback_steps)}.") 278 | 279 | if generator is None: 280 | generator = np.random 281 | 282 | # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) 283 | # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` 284 | # corresponds to doing no classifier free guidance. 
285 | do_classifier_free_guidance = guidance_scale > 1.0 286 | 287 | prompt_embeds = self._encode_prompt(prompt, num_images_per_prompt, 288 | do_classifier_free_guidance, 289 | negative_prompt) 290 | 291 | # get the initial random noise unless the user supplied it 292 | latents_dtype = prompt_embeds.dtype 293 | latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, 294 | width // 8) 295 | if latents is None: 296 | latents = generator.randn(*latents_shape).astype(latents_dtype) 297 | elif latents.shape != latents_shape: 298 | raise ValueError( 299 | f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" 300 | ) 301 | 302 | # set timesteps 303 | self.scheduler.set_timesteps(num_inference_steps) 304 | 305 | latents = latents * np.float64(self.scheduler.init_noise_sigma) 306 | 307 | # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature 308 | # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 309 | # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 310 | # and should be between [0, 1] 311 | accepts_eta = "eta" in set( 312 | inspect.signature(self.scheduler.step).parameters.keys()) 313 | extra_step_kwargs = {} 314 | if accepts_eta: 315 | extra_step_kwargs["eta"] = eta 316 | 317 | # timestep_dtype = next( 318 | # (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)" 319 | # ) 320 | timestep_dtype = 'tensor(int64)' 321 | timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype] 322 | 323 | for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)): 324 | # expand the latents if we are doing classifier free guidance 325 | latent_model_input = np.concatenate( 326 | [latents] * 2) if do_classifier_free_guidance else latents 327 | latent_model_input = self.scheduler.scale_model_input( 328 | torch.from_numpy(latent_model_input), t) 329 | latent_model_input = latent_model_input.cpu().numpy() 330 | 331 | # predict the noise residual 332 | timestep = np.array([t], dtype=timestep_dtype) 333 | unet_input = { 334 | "sample": latent_model_input, 335 | "timestep": timestep, 336 | "encoder_hidden_states": prompt_embeds 337 | } 338 | noise_pred = self.unet(unet_input)[self.unet.outputs[0]] 339 | # noise_pred = noise_pred[0] 340 | 341 | # perform guidance 342 | if do_classifier_free_guidance: 343 | noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) 344 | noise_pred = noise_pred_uncond + guidance_scale * ( 345 | noise_pred_text - noise_pred_uncond) 346 | 347 | # compute the previous noisy sample x_t -> x_t-1 348 | scheduler_output = self.scheduler.step( 349 | torch.from_numpy(noise_pred), t, torch.from_numpy(latents), 350 | **extra_step_kwargs) 351 | latents = scheduler_output.prev_sample.numpy() 352 | 353 | # call the callback, if provided 354 | if callback is not None and i % callback_steps == 0: 355 | callback(i, t, latents) 356 | 357 | latents = 1 / 0.18215 * latents 358 | image2 = self.vae_decoder({"latent_sample": 359 | latents})[self.vae_decoder.outputs[0]] 360 | # image = self.vae_decoder(latent_sample=latents)[0] 361 | # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 362 | image = np.concatenate([ 363 | self.vae_decoder({"latent_sample": 364 | latents[i:i + 1]})[self.vae_decoder.outputs[0]] 365 | for i in range(latents.shape[0]) 366 | ]) 367 | 368 | image = np.clip(image / 2 + 0.5, 0, 1) 369 | image = image.transpose((0, 2, 3, 1)) 370 | 371 | if self.safety_checker is not 
None: 372 | safety_checker_input = self.feature_extractor( 373 | self.numpy_to_pil(image), 374 | return_tensors="np").pixel_values.astype(image.dtype) 375 | 376 | # image, has_nsfw_concepts = self.safety_checker( 377 | #     clip_input=safety_checker_input, images=image)  # redundant full-batch call, superseded by the per-image loop below 378 | 379 | # safety_checker raises an error when called with a batch size > 1, so run it on each image individually 380 | images, has_nsfw_concept = [], [] 381 | for i in range(image.shape[0]): 382 | image_i, has_nsfw_concept_i = self.safety_checker( 383 | clip_input=safety_checker_input[i:i + 1], 384 | images=image[i:i + 1]) 385 | images.append(image_i) 386 | has_nsfw_concept.append(has_nsfw_concept_i[0]) 387 | image = np.concatenate(images) 388 | else: 389 | has_nsfw_concept = None 390 | 391 | if output_type == "pil": 392 | image = self.numpy_to_pil(image) 393 | 394 | if not return_dict: 395 | return (image, has_nsfw_concept) 396 | 397 | return StableDiffusionPipelineOutput( 398 | images=image, nsfw_content_detected=has_nsfw_concept) 399 | -------------------------------------------------------------------------------- /utilities.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The OFA-Sys Team. 2 | # All rights reserved. 3 | # This source code is licensed under the Apache 2.0 license 4 | # found in the LICENSE file in the root directory. 5 | # 6 | # Copyright 2022 The HuggingFace Inc. team. 7 | # SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 8 | # SPDX-License-Identifier: Apache-2.0 9 | # 10 | # Licensed under the Apache License, Version 2.0 (the "License"); 11 | # you may not use this file except in compliance with the License. 12 | # You may obtain a copy of the License at 13 | # 14 | # http://www.apache.org/licenses/LICENSE-2.0 15 | # 16 | # Unless required by applicable law or agreed to in writing, software 17 | # distributed under the License is distributed on an "AS IS" BASIS, 18 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | # See the License for the specific language governing permissions and 20 | # limitations under the License.
21 | # 22 | 23 | from collections import OrderedDict 24 | from copy import copy 25 | import numpy as np 26 | import os 27 | import math 28 | from PIL import Image 29 | from polygraphy.backend.common import bytes_from_path 30 | from polygraphy.backend.trt import CreateConfig, Profile 31 | from polygraphy.backend.trt import engine_from_bytes, engine_from_network, network_from_onnx_path, save_engine 32 | from polygraphy.backend.trt import util as trt_util 33 | from polygraphy import cuda 34 | import random 35 | from scipy import integrate 36 | import tensorrt as trt 37 | import torch 38 | 39 | TRT_LOGGER = trt.Logger(trt.Logger.WARNING) 40 | 41 | 42 | class Engine(): 43 | def __init__( 44 | self, 45 | model_name, 46 | engine_dir, 47 | ): 48 | self.engine_path = os.path.join(engine_dir, model_name + '.plan') 49 | self.engine = None 50 | self.context = None 51 | self.buffers = OrderedDict() 52 | self.tensors = OrderedDict() 53 | 54 | def __del__(self): 55 | [ 56 | buf.free() for buf in self.buffers.values() 57 | if isinstance(buf, cuda.DeviceArray) 58 | ] 59 | del self.engine 60 | del self.context 61 | del self.buffers 62 | del self.tensors 63 | 64 | def build(self, onnx_path, fp16, input_profile=None, enable_preview=False): 65 | print(f"Building TensorRT engine for {onnx_path}: {self.engine_path}") 66 | p = Profile() 67 | if input_profile: 68 | for name, dims in input_profile.items(): 69 | assert len(dims) == 3 70 | p.add(name, min=dims[0], opt=dims[1], max=dims[2]) 71 | 72 | preview_features = [] 73 | if enable_preview: 74 | trt_version = [int(i) for i in trt.__version__.split(".")] 75 | # FASTER_DYNAMIC_SHAPES_0805 should only be used for TRT 8.5.1 or above. 76 | if trt_version[0] > 8 or \ 77 | (trt_version[0] == 8 and (trt_version[1] > 5 or (trt_version[1] == 5 and trt_version[2] >= 1))): 78 | preview_features = [ 79 | trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805 80 | ] 81 | 82 | engine = engine_from_network(network_from_onnx_path(onnx_path), 83 | config=CreateConfig( 84 | fp16=fp16, 85 | profiles=[p], 86 | preview_features=preview_features)) 87 | save_engine(engine, path=self.engine_path) 88 | 89 | def activate(self): 90 | print(f"Loading TensorRT engine: {self.engine_path}") 91 | self.engine = engine_from_bytes(bytes_from_path(self.engine_path)) 92 | self.context = self.engine.create_execution_context() 93 | 94 | def allocate_buffers(self, shape_dict=None, device='cuda'): 95 | for idx in range(trt_util.get_bindings_per_profile(self.engine)): 96 | binding = self.engine[idx] 97 | if shape_dict and binding in shape_dict: 98 | shape = shape_dict[binding] 99 | else: 100 | shape = self.engine.get_binding_shape(binding) 101 | dtype = trt_util.np_dtype_from_trt( 102 | self.engine.get_binding_dtype(binding)) 103 | if self.engine.binding_is_input(binding): 104 | self.context.set_binding_shape(idx, shape) 105 | # Workaround to convert np dtype to torch 106 | np_type_tensor = np.empty(shape=[], dtype=dtype) 107 | torch_type_tensor = torch.from_numpy(np_type_tensor) 108 | tensor = torch.empty( 109 | tuple(shape), dtype=torch_type_tensor.dtype).to(device=device) 110 | self.tensors[binding] = tensor 111 | self.buffers[binding] = cuda.DeviceView(ptr=tensor.data_ptr(), 112 | shape=shape, 113 | dtype=dtype) 114 | 115 | def infer(self, feed_dict, stream): 116 | start_binding, end_binding = trt_util.get_active_profile_bindings( 117 | self.context) 118 | # shallow copy of ordered dict 119 | device_buffers = copy(self.buffers) 120 | for name, buf in feed_dict.items(): 121 | assert isinstance(buf, 
cuda.DeviceView) 122 | device_buffers[name] = buf 123 | bindings = [0] * start_binding + [ 124 | buf.ptr for buf in device_buffers.values() 125 | ] 126 | noerror = self.context.execute_async_v2(bindings=bindings, 127 | stream_handle=stream.ptr) 128 | if not noerror: 129 | raise ValueError(f"ERROR: inference failed.") 130 | 131 | return self.tensors 132 | 133 | 134 | class LMSDiscreteScheduler(): 135 | def __init__( 136 | self, 137 | device='cuda', 138 | beta_start=0.00085, 139 | beta_end=0.012, 140 | num_train_timesteps=1000, 141 | ): 142 | self.num_train_timesteps = num_train_timesteps 143 | self.order = 4 144 | 145 | self.beta_start = beta_start 146 | self.beta_end = beta_end 147 | betas = (torch.linspace(beta_start**0.5, 148 | beta_end**0.5, 149 | self.num_train_timesteps, 150 | dtype=torch.float32)**2) 151 | alphas = 1.0 - betas 152 | self.alphas_cumprod = torch.cumprod(alphas, dim=0) 153 | 154 | sigmas = np.array( 155 | ((1 - self.alphas_cumprod) / self.alphas_cumprod)**0.5) 156 | sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) 157 | self.sigmas = torch.from_numpy(sigmas) 158 | 159 | # standard deviation of the initial noise distribution 160 | self.init_noise_sigma = self.sigmas.max() 161 | 162 | self.device = device 163 | 164 | def set_timesteps(self, steps): 165 | self.num_inference_steps = steps 166 | 167 | timesteps = np.linspace(0, 168 | self.num_train_timesteps - 1, 169 | steps, 170 | dtype=float)[::-1].copy() 171 | sigmas = np.array( 172 | ((1 - self.alphas_cumprod) / self.alphas_cumprod)**0.5) 173 | sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) 174 | sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) 175 | self.sigmas = torch.from_numpy(sigmas).to(device=self.device) 176 | 177 | # Move all timesteps to correct device beforehand 178 | self.timesteps = torch.from_numpy(timesteps).to( 179 | device=self.device).float() 180 | self.derivatives = [] 181 | 182 | def scale_model_input(self, sample: torch.FloatTensor, idx, *args, 183 | **kwargs) -> torch.FloatTensor: 184 | return sample * self.latent_scales[idx] 185 | 186 | def configure(self): 187 | order = self.order 188 | self.lms_coeffs = [] 189 | self.latent_scales = [ 190 | 1. / ((sigma**2 + 1)**0.5) for sigma in self.sigmas 191 | ] 192 | 193 | def get_lms_coefficient(order, t, current_order): 194 | """ 195 | Compute a linear multistep coefficient. 196 | """ 197 | def lms_derivative(tau): 198 | prod = 1.0 199 | for k in range(order): 200 | if current_order == k: 201 | continue 202 | prod *= (tau - self.sigmas[t - k]) / ( 203 | self.sigmas[t - current_order] - self.sigmas[t - k]) 204 | return prod 205 | 206 | integrated_coeff = integrate.quad(lms_derivative, 207 | self.sigmas[t], 208 | self.sigmas[t + 1], 209 | epsrel=1e-4)[0] 210 | return integrated_coeff 211 | 212 | for step_index in range(self.num_inference_steps): 213 | order = min(step_index + 1, order) 214 | self.lms_coeffs.append([ 215 | get_lms_coefficient(order, step_index, curr_order) 216 | for curr_order in range(order) 217 | ]) 218 | 219 | def step(self, output, latents, idx, timestep): 220 | # compute the previous noisy sample x_t -> x_t-1 221 | # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise 222 | sigma = self.sigmas[idx] 223 | pred_original_sample = latents - sigma * output 224 | # 2. 
Convert to an ODE derivative 225 | derivative = (latents - pred_original_sample) / sigma 226 | self.derivatives.append(derivative) 227 | if len(self.derivatives) > self.order: 228 | self.derivatives.pop(0) 229 | # 3. Compute previous sample based on the derivatives path 230 | prev_sample = latents + sum( 231 | coeff * derivative for coeff, derivative in zip( 232 | self.lms_coeffs[idx], reversed(self.derivatives))) 233 | 234 | return prev_sample 235 | 236 | 237 | class DPMScheduler(): 238 | def __init__( 239 | self, 240 | beta_start=0.00085, 241 | beta_end=0.012, 242 | num_train_timesteps=1000, 243 | solver_order=2, 244 | predict_epsilon=True, 245 | thresholding=False, 246 | dynamic_thresholding_ratio=0.995, 247 | sample_max_value=1.0, 248 | algorithm_type="dpmsolver++", 249 | solver_type="midpoint", 250 | lower_order_final=True, 251 | device='cuda', 252 | ): 253 | # this schedule is very specific to the latent diffusion model. 254 | self.betas = (torch.linspace(beta_start**0.5, 255 | beta_end**0.5, 256 | num_train_timesteps, 257 | dtype=torch.float32)**2) 258 | 259 | self.device = device 260 | self.alphas = 1.0 - self.betas 261 | self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) 262 | # Currently we only support VP-type noise schedule 263 | self.alpha_t = torch.sqrt(self.alphas_cumprod) 264 | self.sigma_t = torch.sqrt(1 - self.alphas_cumprod) 265 | self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t) 266 | 267 | # standard deviation of the initial noise distribution 268 | self.init_noise_sigma = 1.0 269 | 270 | self.algorithm_type = algorithm_type 271 | self.predict_epsilon = predict_epsilon 272 | self.thresholding = thresholding 273 | self.dynamic_thresholding_ratio = dynamic_thresholding_ratio 274 | self.sample_max_value = sample_max_value 275 | self.lower_order_final = lower_order_final 276 | 277 | # settings for DPM-Solver 278 | if algorithm_type not in ["dpmsolver", "dpmsolver++"]: 279 | raise NotImplementedError( 280 | f"{algorithm_type} is not implemented for {self.__class__}" 281 | ) 282 | if solver_type not in ["midpoint", "heun"]: 283 | raise NotImplementedError( 284 | f"{solver_type} is not implemented for {self.__class__}") 285 | 286 | # settable values 287 | self.num_inference_steps = None 288 | self.solver_order = solver_order 289 | self.num_train_timesteps = num_train_timesteps 290 | self.solver_type = solver_type 291 | 292 | self.first_order_first_coef = [] 293 | self.first_order_second_coef = [] 294 | 295 | self.second_order_first_coef = [] 296 | self.second_order_second_coef = [] 297 | self.second_order_third_coef = [] 298 | 299 | self.third_order_first_coef = [] 300 | self.third_order_second_coef = [] 301 | self.third_order_third_coef = [] 302 | self.third_order_fourth_coef = [] 303 | 304 | def scale_model_input(self, sample: torch.FloatTensor, *args, 305 | **kwargs) -> torch.FloatTensor: 306 | return sample 307 | 308 | def configure(self): 309 | lower_order_nums = 0 310 | for step_index in range(self.num_inference_steps): 311 | step_idx = step_index 312 | timestep = self.timesteps[step_idx] 313 | 314 | prev_timestep = 0 if step_idx == len( 315 | self.timesteps) - 1 else self.timesteps[step_idx + 1] 316 | 317 | self.dpm_solver_first_order_coefs_precompute( 318 | timestep, prev_timestep) 319 | 320 | timestep_list = [self.timesteps[step_index - 1], timestep] 321 | self.multistep_dpm_solver_second_order_coefs_precompute( 322 | timestep_list, prev_timestep) 323 | 324 | timestep_list = [ 325 | self.timesteps[step_index - 2],
self.timesteps[step_index - 1], 326 | timestep 327 | ] 328 | self.multistep_dpm_solver_third_order_coefs_precompute( 329 | timestep_list, prev_timestep) 330 | 331 | if lower_order_nums < self.solver_order: 332 | lower_order_nums += 1 333 | 334 | def dpm_solver_first_order_coefs_precompute(self, timestep, prev_timestep): 335 | lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[ 336 | timestep] 337 | alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep] 338 | sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep] 339 | h = lambda_t - lambda_s 340 | if self.algorithm_type == "dpmsolver++": 341 | self.first_order_first_coef.append(sigma_t / sigma_s) 342 | self.first_order_second_coef.append(alpha_t * 343 | (torch.exp(-h) - 1.0)) 344 | elif self.algorithm_type == "dpmsolver": 345 | self.first_order_first_coef.append(alpha_t / alpha_s) 346 | self.first_order_second_coef.append(sigma_t * (torch.exp(h) - 1.0)) 347 | 348 | def multistep_dpm_solver_second_order_coefs_precompute( 349 | self, timestep_list, prev_timestep): 350 | t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2] 351 | lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[ 352 | s0], self.lambda_t[s1] 353 | alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0] 354 | sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0] 355 | h = lambda_t - lambda_s0 356 | if self.algorithm_type == "dpmsolver++": 357 | # See https://arxiv.org/abs/2211.01095 for detailed derivations 358 | if self.solver_type == "midpoint": 359 | self.second_order_first_coef.append(sigma_t / sigma_s0) 360 | self.second_order_second_coef.append( 361 | (alpha_t * (torch.exp(-h) - 1.0))) 362 | self.second_order_third_coef.append( 363 | 0.5 * (alpha_t * (torch.exp(-h) - 1.0))) 364 | elif self.solver_type == "heun": 365 | self.second_order_first_coef.append(sigma_t / sigma_s0) 366 | self.second_order_second_coef.append( 367 | (alpha_t * (torch.exp(-h) - 1.0))) 368 | self.second_order_third_coef.append( 369 | alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) 370 | elif self.algorithm_type == "dpmsolver": 371 | # See https://arxiv.org/abs/2206.00927 for detailed derivations 372 | if self.solver_type == "midpoint": 373 | self.second_order_first_coef.append(alpha_t / alpha_s0) 374 | self.second_order_second_coef.append( 375 | (sigma_t * (torch.exp(h) - 1.0))) 376 | self.second_order_third_coef.append( 377 | 0.5 * (sigma_t * (torch.exp(h) - 1.0))) 378 | elif self.solver_type == "heun": 379 | self.second_order_first_coef.append(alpha_t / alpha_s0) 380 | self.second_order_second_coef.append( 381 | (sigma_t * (torch.exp(h) - 1.0))) 382 | self.second_order_third_coef.append( 383 | (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0))) 384 | 385 | def multistep_dpm_solver_third_order_coefs_precompute( 386 | self, timestep_list, prev_timestep): 387 | t, s0 = prev_timestep, timestep_list[-1] 388 | lambda_t, lambda_s0 = (self.lambda_t[t], self.lambda_t[s0]) 389 | alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0] 390 | sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0] 391 | h = lambda_t - lambda_s0 392 | if self.algorithm_type == "dpmsolver++": 393 | self.third_order_first_coef.append(sigma_t / sigma_s0) 394 | self.third_order_second_coef.append(alpha_t * 395 | (torch.exp(-h) - 1.0)) 396 | self.third_order_third_coef.append( 397 | alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) 398 | self.third_order_fourth_coef.append( 399 | alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) 400 | elif self.algorithm_type == "dpmsolver": 401 | 
self.third_order_first_coef.append(alpha_t / alpha_s0) 402 | self.third_order_second_coef.append(sigma_t * (torch.exp(h) - 1.0)) 403 | self.third_order_third_coef.append( 404 | sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) 405 | self.third_order_fourth_coef.append( 406 | sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) 407 | 408 | def set_timesteps(self, num_inference_steps): 409 | self.num_inference_steps = num_inference_steps 410 | timesteps = (np.linspace(0, self.num_train_timesteps - 1, 411 | num_inference_steps + 412 | 1).round()[::-1][:-1].copy().astype(np.int32)) 413 | self.timesteps = torch.from_numpy(timesteps).to(self.device) 414 | self.model_outputs = [ 415 | None, 416 | ] * self.solver_order 417 | self.lower_order_nums = 0 418 | 419 | def convert_model_output(self, model_output, timestep, sample): 420 | # DPM-Solver++ needs to solve an integral of the data prediction model. 421 | if self.algorithm_type == "dpmsolver++": 422 | if self.predict_epsilon: 423 | alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ 424 | timestep] 425 | x0_pred = (sample - sigma_t * model_output) / alpha_t 426 | else: 427 | x0_pred = model_output 428 | if self.thresholding: 429 | # Dynamic thresholding in https://arxiv.org/abs/2205.11487 430 | dynamic_max_val = torch.quantile( 431 | torch.abs(x0_pred).reshape((x0_pred.shape[0], -1)), 432 | self.dynamic_thresholding_ratio, 433 | dim=1) 434 | dynamic_max_val = torch.maximum( 435 | dynamic_max_val, 436 | self.sample_max_value * 437 | torch.ones_like(dynamic_max_val).to( 438 | dynamic_max_val.device), 439 | )[(..., ) + (None, ) * (x0_pred.ndim - 1)] 440 | x0_pred = torch.clamp(x0_pred, -dynamic_max_val, 441 | dynamic_max_val) / dynamic_max_val 442 | return x0_pred 443 | # DPM-Solver needs to solve an integral of the noise prediction model. 
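# Illustrative note (not part of the original file): in the "dpmsolver" branch below the model
# output is used directly when predict_epsilon is True; otherwise epsilon is recovered from the
# x0 prediction as epsilon = (sample - alpha_t * model_output) / sigma_t, mirroring the x0
# recovery performed by the "dpmsolver++" branch above.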
444 | elif self.algorithm_type == "dpmsolver": 445 | if self.predict_epsilon: 446 | return model_output 447 | else: 448 | alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ 449 | timestep] 450 | epsilon = (sample - alpha_t * model_output) / sigma_t 451 | return epsilon 452 | 453 | def dpm_solver_first_order_update(self, idx, model_output, sample): 454 | first_coef = self.first_order_first_coef[idx] 455 | second_coef = self.first_order_second_coef[idx] 456 | 457 | if self.algorithm_type == "dpmsolver++": 458 | x_t = first_coef * sample - second_coef * model_output 459 | elif self.algorithm_type == "dpmsolver": 460 | x_t = first_coef * sample - second_coef * model_output 461 | return x_t 462 | 463 | def multistep_dpm_solver_second_order_update(self, idx, model_output_list, 464 | timestep_list, prev_timestep, 465 | sample): 466 | t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2] 467 | m0, m1 = model_output_list[-1], model_output_list[-2] 468 | lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[ 469 | s0], self.lambda_t[s1] 470 | h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1 471 | r0 = h_0 / h 472 | D0, D1 = m0, (1.0 / r0) * (m0 - m1) 473 | 474 | first_coef = self.second_order_first_coef[idx] 475 | second_coef = self.second_order_second_coef[idx] 476 | third_coef = self.second_order_third_coef[idx] 477 | 478 | if self.algorithm_type == "dpmsolver++": 479 | # See https://arxiv.org/abs/2211.01095 for detailed derivations 480 | if self.solver_type == "midpoint": 481 | x_t = (first_coef * sample - second_coef * D0 - 482 | third_coef * D1) 483 | elif self.solver_type == "heun": 484 | x_t = (first_coef * sample - second_coef * D0 + 485 | third_coef * D1) 486 | elif self.algorithm_type == "dpmsolver": 487 | # See https://arxiv.org/abs/2206.00927 for detailed derivations 488 | if self.solver_type == "midpoint": 489 | x_t = (first_coef * sample - second_coef * D0 - 490 | third_coef * D1) 491 | elif self.solver_type == "heun": 492 | x_t = (first_coef * sample - second_coef * D0 - 493 | third_coef * D1) 494 | return x_t 495 | 496 | def multistep_dpm_solver_third_order_update(self, idx, model_output_list, 497 | timestep_list, prev_timestep, 498 | sample): 499 | t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[ 500 | -2], timestep_list[-3] 501 | m0, m1, m2 = model_output_list[-1], model_output_list[ 502 | -2], model_output_list[-3] 503 | lambda_t, lambda_s0, lambda_s1, lambda_s2 = ( 504 | self.lambda_t[t], 505 | self.lambda_t[s0], 506 | self.lambda_t[s1], 507 | self.lambda_t[s2], 508 | ) 509 | h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2 510 | r0, r1 = h_0 / h, h_1 / h 511 | D0 = m0 512 | D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2) 513 | D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1) 514 | D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1) 515 | 516 | first_coef = self.third_order_first_coef[idx] 517 | second_coef = self.third_order_second_coef[idx] 518 | third_coef = self.third_order_third_coef[idx] 519 | fourth_coef = self.third_order_fourth_coef[idx] 520 | 521 | if self.algorithm_type == "dpmsolver++": 522 | # See https://arxiv.org/abs/2206.00927 for detailed derivations 523 | x_t = (first_coef * sample - second_coef * D0 + third_coef * D1 - 524 | fourth_coef * D2) 525 | elif self.algorithm_type == "dpmsolver": 526 | # See https://arxiv.org/abs/2206.00927 for detailed derivations 527 | x_t = (first_coef * sample - second_coef * D0 - third_coef * D1 - 528 | fourth_coef * D2) 529 | return x_t 530 | 531 | def 
step(self, output, latents, step_index, timestep): 532 | if self.num_inference_steps is None: 533 | raise ValueError( 534 | "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" 535 | ) 536 | 537 | prev_timestep = 0 if step_index == len( 538 | self.timesteps) - 1 else self.timesteps[step_index + 1] 539 | lower_order_final = ((step_index == len(self.timesteps) - 1) 540 | and self.lower_order_final 541 | and len(self.timesteps) < 15) 542 | lower_order_second = ((step_index == len(self.timesteps) - 2) 543 | and self.lower_order_final 544 | and len(self.timesteps) < 15) 545 | 546 | output = self.convert_model_output(output, timestep, latents) 547 | for i in range(self.solver_order - 1): 548 | self.model_outputs[i] = self.model_outputs[i + 1] 549 | self.model_outputs[-1] = output 550 | 551 | if self.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final: 552 | prev_sample = self.dpm_solver_first_order_update( 553 | step_index, output, latents) 554 | elif self.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second: 555 | timestep_list = [self.timesteps[step_index - 1], timestep] 556 | prev_sample = self.multistep_dpm_solver_second_order_update( 557 | step_index, self.model_outputs, timestep_list, prev_timestep, 558 | latents) 559 | else: 560 | timestep_list = [ 561 | self.timesteps[step_index - 2], self.timesteps[step_index - 1], 562 | timestep 563 | ] 564 | prev_sample = self.multistep_dpm_solver_third_order_update( 565 | step_index, self.model_outputs, timestep_list, prev_timestep, 566 | latents) 567 | 568 | if self.lower_order_nums < self.solver_order: 569 | self.lower_order_nums += 1 570 | 571 | return prev_sample 572 | 573 | 574 | def save_image(images, image_path_dir, image_name_prefix): 575 | """ 576 | Save the generated images to png files. 577 | """ 578 | images = ((images + 1) * 255 / 2).clamp(0, 255).detach().permute( 579 | 0, 2, 3, 1).round().type(torch.uint8).cpu().numpy() 580 | for i in range(images.shape[0]): 581 | image_path = os.path.join( 582 | image_path_dir, image_name_prefix + str(i + 1) + '-' + 583 | str(random.randint(1000, 9999)) + '.png') 584 | print(f"Saving image {i+1} / {images.shape[0]} to: {image_path}") 585 | Image.fromarray(images[i]).save(image_path) 586 | --------------------------------------------------------------------------------
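The following is a minimal usage sketch, not part of the repository, showing how the Engine helper in utilities.py could be driven end to end: build a TensorRT plan from an ONNX file, load it, allocate bindings for fixed shapes, and run one inference. The model path, binding names, and shapes ("model.onnx", "input", "output", (1, 3, 224, 224), (1, 1000)) are hypothetical placeholders, not values taken from this repository.

# Minimal sketch under the assumptions stated above.
import os
import numpy as np
import torch
from polygraphy import cuda
from utilities import Engine

os.makedirs("./engines", exist_ok=True)

# Build (serialize) a .plan file for the hypothetical ONNX model, then load it.
engine = Engine(model_name="model", engine_dir="./engines")
engine.build("model.onnx", fp16=True)
engine.activate()

# Bind fixed shapes; the keys must match the engine's binding names.
engine.allocate_buffers(
    shape_dict={"input": (1, 3, 224, 224), "output": (1, 1000)},
    device="cuda")

# Wrap a GPU-resident input tensor in a DeviceView and run one inference.
stream = cuda.Stream()
x = torch.randn(1, 3, 224, 224, device="cuda")
feed = {"input": cuda.DeviceView(ptr=x.data_ptr(), shape=tuple(x.shape), dtype=np.float32)}
outputs = engine.infer(feed, stream)  # dict of output torch tensors keyed by binding name
print(outputs["output"].shape)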