├── .github └── workflows │ ├── build-and-push-docker-image.yml │ └── build-image-on-prs.yml ├── .gitignore ├── Dockerfile ├── LICENSE.md ├── README.md ├── assets ├── 0.png ├── 1.png ├── 2.png ├── 3.png ├── 4.png ├── 5.png ├── 6.png ├── 7.png ├── bluelove.png ├── greenlove.png ├── redlove.png ├── screen.png ├── village_0.png ├── village_10_2.png ├── village_15_2.png ├── village_5.png └── village_5_2.png ├── clip_config.pickle ├── main.py ├── parallel.py ├── requirements.txt ├── sb.py ├── schedulers.py ├── server.py ├── tests └── test_image_generation.py └── utils.py /.github/workflows/build-and-push-docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build and Push CI 2 | 3 | on: 4 | push: 5 | branches: ['master'] 6 | paths: 7 | - '**.py' 8 | - 'Dockerfile' 9 | jobs: 10 | docker: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - 14 | name: Set up Docker Buildx 15 | uses: docker/setup-buildx-action@v2 16 | - 17 | name: Login to DockerHub 18 | uses: docker/login-action@v2 19 | with: 20 | username: nicklucche 21 | password: ${{ secrets.DOCKERHUB_PASS }} 22 | - 23 | name: Build and push 24 | uses: docker/build-push-action@v3 25 | with: 26 | push: true 27 | tags: nicklucche/stable-diffusion:latest -------------------------------------------------------------------------------- /.github/workflows/build-image-on-prs.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build 2 | 3 | on: 4 | pull_request: 5 | branches: ['master'] 6 | paths: 7 | - '**.py' 8 | - 'Dockerfile' 9 | jobs: 10 | docker-build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Remove unnecessary files 14 | run: | 15 | sudo rm -rf /usr/share/dotnet 16 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 17 | - name: Checkout 18 | uses: actions/checkout@v2 19 | - name: Build Docker image 20 | run: docker build -t stable-diffusion:pr . 
21 | # cpu-only tests on github runner, override entrypoint while mounting whole repo 22 | # TODO: consider artifacts or git lfs to avoid downloading models for testing 23 | - name: Run Tests 24 | run: docker run --entrypoint bash --rm -v .:/app2 stable-diffusion:pr -c 'cd /app2 && python3 -m pip install pytest && python3 -m pytest -v tests/' 25 | 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime 2 | WORKDIR /app 3 | COPY *.py requirements.txt *.pickle /app 4 | RUN apt update && apt install -y git 5 | RUN pip install -r requirements.txt 6 | ENV GRADIO_SERVER_PORT=7860 7 | ENV GRADIO_SERVER_NAME=0.0.0.0 8 | EXPOSE 7860 9 | ENTRYPOINT ["python3", "server.py"] -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Nicolò Lucchesi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | A friend of mine working in art/design wanted to try out [Stable Diffusion](https://stability.ai/blog/stable-diffusion-public-release) on his own GPU-equipped PC, but he doesn't know much about coding, so I thought that baking a quick docker build was an easy way to help him out. This repo holds the files that go into that build. 2 | 3 | I also took the liberty of throwing in a simple web UI (made with gradio) to wrap the model. Perhaps we can evolve it a bit to offer a few more functionalities (see TODO). 4 | 5 | **UPDATE:** we now support inference on multiple GPUs with a "Data Parallel" approach. 6 | 7 | ~~**UPDATE 2:** we now support inference on multiple GPUs with a "Model Parallel" approach (see `Multi-GPU` section).~~ 8 | 9 | **UPDATE 3 but really it's a v2:** [Stable Diffusion 2.0](https://stability.ai/blog/stable-diffusion-v2-release) is out generating images more beautiful than ever! 
This is now the default model being loaded and it supports all previous features and more. I've also added support for *img2img* and *image inpainting* and refreshed the UI, give it a try! 10 | 11 | # Requirements 12 | - OS: Ubuntu (tested on 20.04) or Windows (tested on Windows 10 21H2) 13 | - Nvidia GPU with at least 6GB vRAM (gtx 700 onward, please refer [here](https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html)). Mind that the bigger the image size (or the number of images) you want to dream, the more memory you're gonna need. For reference, dreaming a 256x256 image should take up ~5gb, while a 512x512 around 7gb. 14 | - Free Disk space > 2.8gb 15 | - Docker and Nvidia-docker. 16 | - HuggingFace account as well as ~~registration to this repository https://huggingface.co/CompVis/stable-diffusion-v1-4 (simply click on `Access Repository`)~~. No longer needed if you use default v2 model (see "About model versions" below). 17 | 18 | # Installation 19 | 20 | First of all, make sure to have docker and nvidia-docker installed in your machine. 21 | 22 | **Windows users**: [install WSL/Ubuntu](https://stackoverflow.com/a/56783810) from store->install [docker](https://docs.docker.com/desktop/windows/wsl/) and start it->update Windows 10 to version 21H2 (Windows 11 should be ok as is)->test out [GPU-support](https://docs.nvidia.com/cuda/wsl-user-guide/index.html#cuda-support-for-wsl2) (a simple `nvidia-smi` in WSL should do). If `nvidia-smi` does not work from WSL, make sure you have updated your nvidia drivers from the official app. 23 | 24 | The easiest way to try out the model is to simply use the pre-built image at `nicklucche/stable-diffusion`. 25 | 26 | My advice is that you start the container with: 27 | 28 | `docker run --name stable-diffusion --pull=always --gpus all -it -p 7860:7860 nicklucche/stable-diffusion` 29 | 30 | the *first time* you run it, as it will download the model weights (can take a few minutes to do so) and store them on disk (as long as you don't delete the container). 31 | Then you can simply do `docker stop stable-diffusion` to stop the container and `docker start stable-diffusion` to bring it back up whenever you need. 32 | `--pull=always` is to make sure you get the latest image from dockerhub, you can skip it if you already have it locally. 33 | 34 | Once the init phase is finished a message will pop-up in your terminal (`docker logs stable-diffusion`) and you should be able to head to http://localhost:7860/ in your favorite browser and see something like this: 35 | 36 | ![](assets/screen.png) 37 | 38 | By default, the half-precision/fp16 model is loaded. This is the recommended approach if you're planning to run the model on a GPU with < 10GB of memory (takes half the space, ~half the time and yields similar output). To disable FP16 and run inference using single-precision (FP32), set the environment variable FP16=0 as a docker run option, like so: 39 | 40 | `docker run .. -e FP16=0 ...` 41 | 42 | ## Multi-GPU 43 | 44 | The model can be run in both a "DataParallel" or a combined "Model+Data Parallel" fashion to speed up inference time and leverage your multi-gpu setup to its fullest. 45 | 46 | ### Data Parallel 47 | 48 | This means that the model is replicated over multiple GPUs, each handled by a separate sub-process. By default, the model runs on device 0 (no parallelism). 
You can change that by specifying the desired device(s) by adding one of the following options: 49 | 50 | - `-e DEVICES=1 ...` runs model on GPU 1 (starts from 0) 51 | - `-e DEVICES=0,1 ...` runs model on GPU 0 and 1 52 | - `-e DEVICES=all ...` runs model on all available GPUs 53 | 54 | Each device/model generates a full image, so make sure you increase the `Number of Images` slider to generate multiple images in parallel! 55 | (Single image generation speed won't be affected). 56 | 57 | I should also mention that adding the nsfw filter (by checking corresponding box) includes moving an additional model to GPU, so it can cause out of memory issues. 58 | 59 | ### ~~Model Parallel~~ -Currently disabled! Use "Data Parallel" for true parallelism!- 60 | 61 | It works by splitting the model into a fixed number of parts, assigning each part to a device and then handling data transfer from one device to the other (more technical details [here](https://github.com/NickLucche/stable-diffusion-nvidia-docker/issues/8) or from source). 62 | This was originally intended to support setups that had GPUs with small amounts of VRAM that could only run the model by combining their resources, but now it also supports splitting multiple models to accomodate for bigger GPUs, effectively combining Model and Data Parallel. 63 | 64 | Single image inference will be slower in this modality (since we may need to move data from one device to the other), but it allows to fill your memory more efficiently if you have big GPUs by creating multiple models. 65 | You can try out this option with: 66 | 67 | `-e MODEL_PARALLEL=1` 68 | 69 | Note that if your system has highly imbalanced GPU memory distribution (e.g. gpu0->6Gb, gpu1->24Gb.. ) the smallest device might bottleneck the inference process; the easiest way to fix that, is to ignore the smallest device by *not* specifying it in the `DEVICES` list (e.g. `-e DEVICES=1,2..`). 70 | 71 | ## About models 72 | 73 | By default, the model loaded is [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base). Many other checkpoints have been created that are compatible with [diffusers](https://github.com/huggingface/diffusers) (awesome library, ckeck it out) and you can provide them as an additional environment variable like so: 74 | 75 | `-e MODEL_ID=runwayml/stable-diffusion-v1-5` 76 | 77 | Model weights are downloaded to and loaded from `/root/.cache/huggingface/diffusers`, so if you want to share your model across multiple containers runs, you can provide this path as a [docker volume](https://docs.docker.com/storage/volumes/): 78 | 79 | `-v /path/to/your/hugginface/cache:/root/.cache/huggingface/diffusers` 80 | 81 | Mind that the host path (first path up to ":") might very well be the same as the second if you're using the same diffusers library on the host and you didn't modify `HF_HOME`. 82 | 83 | Some models may require a huggingface token to be downloaded, you can get yours at https://huggingface.co/settings/tokens after registering for free on their website. You can then add the token to your env with `-e TOKEN=`. 84 | 85 | **P.S:** Feel free to open an issue for any problem you may face during installation. 86 | 87 | # Samples 88 | 89 | The internet is full of these, but I felt I couldn't let this repo go without sharing a few of "my own".. 90 | 91 |

92 | *(sample image grids: see the `assets/` folder)*
93 |
96 | Fixed seed, slightly change text input (thanks to @mronchetti for the cool prompt):
97 |
103 | Fixed seed, same input, increase `guidance_scale` (more "adherent" to text) with a step of 5:
104 |
114 | 'Picture' vs 'Drawing' text input:
115 |

119 | 120 | 121 | ## TODO 122 | - [x] allow other input modalities (images) 123 | - [ ] support extra v2 features (depth-based generation, upscaling) 124 | - [x] move model to specifiec GPU number (env variable) 125 | - [x] multi-gpu support (data parallel) 126 | - [x] multi-gpu support (PipelineParallel/model parallel) 127 | - [ ] Data+Model parallel: optimize memory assignment for 512x512 inference 128 | - [ ] dump and clear prompt history 129 | - [ ] test on older cudnn 130 | -------------------------------------------------------------------------------- /assets/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/0.png -------------------------------------------------------------------------------- /assets/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/1.png -------------------------------------------------------------------------------- /assets/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/2.png -------------------------------------------------------------------------------- /assets/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/3.png -------------------------------------------------------------------------------- /assets/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/4.png -------------------------------------------------------------------------------- /assets/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/5.png -------------------------------------------------------------------------------- /assets/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/6.png -------------------------------------------------------------------------------- /assets/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/7.png -------------------------------------------------------------------------------- /assets/bluelove.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/bluelove.png -------------------------------------------------------------------------------- /assets/greenlove.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/greenlove.png -------------------------------------------------------------------------------- /assets/redlove.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/redlove.png -------------------------------------------------------------------------------- /assets/screen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/screen.png -------------------------------------------------------------------------------- /assets/village_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/village_0.png -------------------------------------------------------------------------------- /assets/village_10_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/village_10_2.png -------------------------------------------------------------------------------- /assets/village_15_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/village_15_2.png -------------------------------------------------------------------------------- /assets/village_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/village_5.png -------------------------------------------------------------------------------- /assets/village_5_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/village_5_2.png -------------------------------------------------------------------------------- /clip_config.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/clip_config.pickle -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | import torch 3 | from PIL import Image 4 | import os 5 | from utils import ModelParts2GPUsAssigner, get_gpu_setting 6 | from parallel import StableDiffusionModelParallel, StableDiffusionMultiProcessing 7 | import numpy as np 8 | from sb import DiffusionModel 9 | 10 | # read env variables 11 | TOKEN = os.environ.get("TOKEN", None) 12 | MODEL_ID = os.environ.get("MODEL_ID", "stabilityai/stable-diffusion-2-base") 13 | 14 | # If you are limited by GPU memory (e.g <10GB VRAM), please make sure to load in fp16 precision 15 | fp16 = bool(int(os.environ.get("FP16", 1))) 16 | # MP = 
bool(int(os.environ.get("MODEL_PARALLEL", 0))) 17 | MP = False # disabled 18 | MIN_INPAINT_MASK_PERCENT = 0.1 19 | 20 | # FIXME devices=0,1 causes cuda error on memory access..? 21 | IS_MULTI, DEVICES = get_gpu_setting(os.environ.get("DEVICES", "0")) 22 | 23 | # TODO docs 24 | def init_pipeline(model_or_path=MODEL_ID, devices: List[int]=DEVICES)->Union[DiffusionModel, StableDiffusionMultiProcessing]: 25 | kwargs = dict( 26 | pretrained_model_name_or_path=model_or_path, 27 | revision="fp16" if fp16 else None, 28 | torch_dtype=torch.float16 if fp16 else None, 29 | use_auth_token=TOKEN, 30 | requires_safety_checker=False, 31 | ) 32 | model_ass = None 33 | # single-gpu multiple models currently disabled 34 | if MP and len(devices) > 1: 35 | # setup for model parallel: find model parts->gpus assignment 36 | print( 37 | f"Looking for a valid assignment in which to split model parts to device(s): {devices}" 38 | ) 39 | ass_finder = ModelParts2GPUsAssigner(devices) 40 | model_ass = ass_finder() 41 | if not len(model_ass): 42 | raise Exception( 43 | "Unable to find a valid assignment of model parts to GPUs! This could be bad luck in sampling!" 44 | ) 45 | print("Assignments:", model_ass) 46 | 47 | # TODO move logic 48 | # if multi and pipe is not None: 49 | # avoid re-creating processes in multi-gpu mode, have them reload a different model 50 | # pipe.reload_model(model_or_path) 51 | if IS_MULTI: 52 | # DataParallel: one process *per GPU* (each has a copy of the model) 53 | # ModelParallel: one process *per model*, each model (possibly) on multiple GPUs 54 | n_procs = len(devices) if not MP else len(model_ass) 55 | pipe = StableDiffusionMultiProcessing.from_pretrained( 56 | n_procs, devices, model_parallel_assignment=model_ass, **kwargs 57 | ) 58 | else: 59 | pipe = DiffusionModel.from_pretrained(**kwargs) 60 | if len(devices): 61 | pipe.to(f"cuda:{devices[0]}") 62 | 63 | return pipe 64 | 65 | 66 | def inference( 67 | pipe: DiffusionModel, 68 | prompt, 69 | num_images=1, 70 | num_inference_steps=50, 71 | height=512, 72 | width=512, 73 | guidance_scale=7, 74 | seed=None, 75 | nsfw_filter=False, 76 | low_vram=False, 77 | noise_scheduler=None, 78 | inv_strenght=0.0, 79 | input_image=None, 80 | input_sketch=None, 81 | masked_image=None, 82 | ): 83 | prompt = [prompt] * num_images 84 | input_kwargs = dict( 85 | inference_type = "text", 86 | prompt=prompt, 87 | # number of denoising steps run during inference (the higher the better) 88 | num_inference_steps=num_inference_steps, 89 | height=height, 90 | width=width, 91 | guidance_scale=guidance_scale, 92 | # NOTE seed with multiples gpus will be different for each one but fixed! 93 | generator=seed, 94 | ) 95 | # input sketch has priority over input image 96 | if input_sketch is not None: 97 | input_image = input_sketch 98 | 99 | # TODO batch images by providing a torch tensor 100 | if input_image is not None: 101 | # image guided generation 102 | input_image = input_image.resize((width, height)) 103 | # TODO negative prompt? 
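# img2img: `strength` below is the inverse of the UI "input fidelity" slider; a higher
# strength lets the scheduler add more noise to the init image, so the output departs
# further from it (a strength close to 0 returns the input almost unchanged)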
104 | input_kwargs["init_image"] = input_image 105 | input_kwargs["strength"] = 1.0 - inv_strenght 106 | input_kwargs["inference_type"] = "img2img" 107 | elif masked_image is not None: 108 | # resize to specified shape 109 | masked_image = { 110 | k: v.convert("RGB").resize((width, height)) for k, v in masked_image.items() 111 | } 112 | 113 | # to do image inpainting, we must provide a big enough mask 114 | if np.count_nonzero(masked_image["mask"].convert("1")) < ( 115 | width * height * MIN_INPAINT_MASK_PERCENT 116 | ): 117 | raise ValueError("Mask is too small. Please paint-over a larger area") 118 | input_kwargs["image"] = masked_image["image"] 119 | input_kwargs["mask_image"] = masked_image["mask"] 120 | input_kwargs["inference_type"] = "inpaint" 121 | 122 | pipe.set_nsfw(nsfw_filter) 123 | 124 | # needed on 16GB RAM 768x768 fp32 125 | pipe.enable_attention_slicing("auto" if low_vram else None) 126 | 127 | # set noise scheduler for inference 128 | if noise_scheduler is not None: 129 | pipe.scheduler = noise_scheduler 130 | 131 | with torch.autocast("cuda"): 132 | images: List[Image.Image] = pipe(**input_kwargs)["images"] 133 | return images 134 | 135 | 136 | if __name__ == "__main__": 137 | from utils import image_grid 138 | 139 | images = inference(input("Input prompt:")) 140 | grid = image_grid(images, rows=1, cols=1) 141 | grid.show() 142 | -------------------------------------------------------------------------------- /parallel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | import torch 4 | import torch.multiprocessing as mp 5 | import torch.nn as nn 6 | from transformers import CLIPConfig 7 | from schedulers import schedulers 8 | import pickle 9 | from diffusers.pipelines.stable_diffusion.safety_checker import ( 10 | StableDiffusionSafetyChecker, 11 | ) 12 | from diffusers.pipelines.stable_diffusion import ( 13 | StableDiffusionPipeline, 14 | StableDiffusionImg2ImgPipeline, 15 | StableDiffusionInpaintPipeline, 16 | ) 17 | from utils import ToGPUWrapper, dummy_checker, dummy_extractor, remove_nsfw 18 | from typing import Any, Dict, List, Optional, Union 19 | import random 20 | from sb import DiffusionModel 21 | 22 | ## Data Parallel: each process handles a copy of the model, executed on a different device ## 23 | ## +Model Parallel: model components are (potentially) scattered across different devices, each model handled by a process ## 24 | def cuda_inference_process( 25 | worker_id: int, 26 | devices: List[torch.device], 27 | in_q: mp.Queue, 28 | out_q: mp.Queue, 29 | model_kwargs: Dict[Any, Any], 30 | ): 31 | """Code executed by the torch.multiprocessing process, handling inference on device `device_id`. 32 | It's a simple loop in which the worker pulls data from a shared input queue, and puts result 33 | into an output queue. 34 | """ 35 | # wont work in pytorch 1.12 https://github.com/pytorch/pytorch/issues/80876 36 | # os.environ["CUDA_VISIBLE_DEVICES"]=str(device_id) 37 | mp_ass: Dict[int, int] = model_kwargs.pop("model_parallel_assignment", None) 38 | # each worker gets a different starting seed so they can be fixed and yet produce different results 39 | worker_seed = random.randint(0, int(2**32 - 1)) 40 | # TODO replace with custom `StableDiffusion` model, single process == multi-process 41 | try: 42 | if mp_ass is None: 43 | # TODO should we make sure we're downloading the model only once? 
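# Data Parallel path: each worker builds a full pipeline copy and pins it to its own GPU;
# weights are read from the shared HuggingFace cache on disk, so concurrent first-time
# downloads are what the TODO above is about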
44 | device_id = devices[worker_id] 45 | print( 46 | f"Creating and moving model to cuda:{device_id} ({torch.cuda.get_device_name(device_id)}).." 47 | ) 48 | model: DiffusionModel = DiffusionModel.from_pretrained(**model_kwargs).to( 49 | f"cuda:{device_id}" 50 | ) 51 | else: 52 | mp_ass = mp_ass[worker_id] 53 | print("Model parallel worker component assignment:", mp_ass) 54 | print(f"Creating and moving model parts to respective devices..") 55 | model = StableDiffusionModelParallel.from_pretrained(**model_kwargs).to( 56 | mp_ass 57 | ) 58 | # TODO add for model parallel, but likely to refactor too 59 | # safety_checker, safety_extr = remove_nsfw(model) 60 | # create nsfw clip filter so we can re-set it if needed 61 | # safety_checker = StableDiffusionSafetyChecker( 62 | # CLIPConfig(**model_kwargs.pop("clip_config")) 63 | # ) 64 | out_q.put(True) 65 | except Exception as e: 66 | print(e) 67 | out_q.put(False) 68 | return 69 | # inference loop 70 | while True: 71 | # get prompt 72 | prompts, kwargs = in_q.get() 73 | if type(prompts) is not list: 74 | # special commands 75 | if prompts == "quit": 76 | break 77 | elif prompts == "safety_checker" and mp_ass is not None: 78 | # TODO 79 | raise NotImplementedError() 80 | elif prompts == "safety_checker": 81 | # safety checker is also be moved to GPU (it can cause crashes) when 'clip' is passed 82 | model.set_nsfw(kwargs == "clip") 83 | elif prompts == "scheduler": 84 | model.scheduler = kwargs 85 | elif prompts == "low_vram": 86 | model.enable_attention_slicing(kwargs) 87 | elif prompts == "reload_model": 88 | print(f"Worker {device_id}- Reloading model from disk..") 89 | model_path_or_id = kwargs 90 | model = model.reload_model(model_path_or_id) # maintains device 91 | # model loading needs ack 92 | out_q.put(True) 93 | continue 94 | if not len(prompts): 95 | images = [] 96 | else: 97 | # actual inference 98 | # print("Inference", prompts, kwargs, model.device) 99 | if kwargs.get("generator", None) is not None and kwargs["generator"] > 0: 100 | # NOTE different seed for each worker, but fixed! 
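# offsetting the user seed by this worker's random (but fixed) seed keeps results
# reproducible across runs while making each replica generate a different image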
101 | kwargs["generator"] = kwargs["generator"] + worker_seed 102 | # for repeatable results: tensor generated on cpu for model parallel 103 | # TODO unify model parallel interface, still using StableDiffusionPipeline 104 | if mp_ass is not None: 105 | kwargs["generator"] = torch.Generator("cpu").manual_seed( 106 | kwargs["generator"] 107 | ) 108 | else: 109 | kwargs.pop("generator", None) 110 | try: 111 | with torch.autocast("cuda"): 112 | images: List[Image.Image] = model(prompt=prompts, **kwargs).images 113 | except Exception as e: 114 | print(f"[Model {device_id}] Error during inference:", e) 115 | # TODO proper error propagation to master process 116 | images = [ 117 | Image.fromarray( 118 | np.zeros((kwargs["height"], kwargs["width"], 3), dtype=np.uint8) 119 | ) 120 | ] 121 | out_q.put(images) 122 | 123 | 124 | # class that handles multi-gpu models, mimicking original interface 125 | class StableDiffusionMultiProcessing(object): 126 | def __init__( 127 | self, n_procs: int, devices: List[int], model_id_or_path: str = "" 128 | ) -> None: 129 | self.devices = devices 130 | self.n = n_procs 131 | self._safety_checker = "dummy" 132 | self._scheduler = "PNDM" 133 | self._pipeline_type = "text" 134 | self._pipe_name = model_id_or_path 135 | 136 | def _send_cmd(self, k1, k2, wait_ack=True): 137 | # send a cmd to all processes (put item in queue) 138 | for i in range(self.n): 139 | self.q.put((k1[i], k2[i])) 140 | # and wait for its completion 141 | res = [] 142 | if wait_ack: 143 | for _ in range(self.n): 144 | res.append(self.outq.get()) 145 | return res 146 | 147 | def _send_cmd_to_all(self, k1, k2, wait_ack=True): 148 | return self._send_cmd([k1] * self.n, [k2] * self.n, wait_ack=wait_ack) 149 | 150 | def __call__(self, prompt, **kwargs): 151 | # run inference on different processes, each handles a model on a different GPU (split load evenly) 152 | # FIXME when n_prompts < n, unused processes get an empty list as input, so we can always wait all processes 153 | prompt = [list(p) for p in np.array_split(prompt, self.n)] 154 | # request inference and block for result 155 | res = self._send_cmd(prompt, [kwargs] * self.n) 156 | # mimic interface 157 | return {"images": [img for images in res for img in images]} 158 | 159 | @classmethod 160 | def from_pretrained( 161 | cls, n_processes: int, devices: List[int], **kwargs 162 | ) -> "StableDiffusionMultiProcessing": 163 | # create communication i/o "channels" 164 | cls.q = mp.Queue() 165 | cls.outq = mp.Queue() 166 | # load nsfw filter CLIP configuration 167 | # TODO still needed? 
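# the CLIPConfig was pickled at build time so workers could rebuild the NSFW safety
# checker without an extra download; with the checker detached by default (see
# `remove_nsfw`) forwarding it may indeed be obsolete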
168 | with open("./clip_config.pickle", "rb") as f: 169 | d = pickle.load(f) 170 | kwargs["clip_config"] = d 171 | 172 | # create models in their own process and move them to correspoding device 173 | cls._procs: List[mp.Process] = [] 174 | for i in range(n_processes): 175 | p = mp.Process( 176 | target=cuda_inference_process, 177 | args=(i, devices, cls.q, cls.outq, kwargs), 178 | daemon=False, 179 | ) 180 | p.start() 181 | cls._procs.append(p) 182 | 183 | # wait until you move all models to their respective gpu (consistent with single mode) 184 | for _ in range(n_processes): 185 | d = cls.outq.get() 186 | assert d 187 | # cls.pipes: List[StableDiffusionPipeline] = models 188 | return cls(n_processes, devices, kwargs["pretrained_model_name_or_path"]) 189 | 190 | def __del__(self): 191 | # exit and join condition 192 | for _ in range(self.n): 193 | self.q.put(("quit", "")) 194 | for p in self._procs: 195 | p.join() 196 | 197 | def __len__(self): 198 | return self.n 199 | 200 | # mimic interface 201 | @property 202 | def safety_checker(self): 203 | return self._safety_checker 204 | 205 | @safety_checker.setter 206 | def safety_checker(self, value): 207 | # value=None->set filter, o/w set nsfw filter off 208 | nsfw_on = value is None 209 | # switch nsfw on, otherwise don't bother re-setting on processes 210 | if self.safety_checker == "dummy" and nsfw_on: 211 | self._safety_checker == "clip" 212 | self._send_cmd_to_all("safety_checker", "clip", wait_ack=False) 213 | elif self.safety_checker == "clip" and not nsfw_on: 214 | self._safety_checker == "dummy" 215 | self._send_cmd_to_all("safety_checker", "dummy", wait_ack=False) 216 | 217 | @property 218 | def scheduler(self): 219 | return self._scheduler 220 | 221 | @scheduler.setter 222 | def scheduler(self, value): 223 | # avoid re-setting if already set 224 | if self.scheduler == value or value not in schedulers: 225 | return 226 | self._scheduler = value 227 | self._send_cmd_to_all("scheduler", value, wait_ack=False) 228 | 229 | def enable_attention_slicing(self, value): 230 | self._send_cmd_to_all("low_vram", value, wait_ack=False) 231 | 232 | # def disable_attention_slicing(self): 233 | # self._send_cmd_to_all("low_vram", None, wait_ack=False) 234 | 235 | def change_pipeline_type(self, new_type: str): 236 | assert new_type in ["text", "img2img", "inpaint"] 237 | if new_type == self._pipeline_type: 238 | return 239 | self._pipeline_type = new_type 240 | self._send_cmd_to_all("pipeline_type", new_type, wait_ack=False) 241 | 242 | def reload_model(self, model_or_path: str): 243 | # reset all other options to default so they can be restored on next call 244 | self._send_cmd_to_all("reload_model", model_or_path, wait_ack=True) 245 | self._safety_checker = "dummy" 246 | self._scheduler = "PNDM" 247 | self._pipeline_type = "text" 248 | 249 | def set_nsfw(self, nsfw: bool): 250 | # this will avoid unnecessary inter-process calls and only set if needed 251 | if nsfw: 252 | self.safety_checker = None 253 | else: 254 | self.safety_checker = dummy_checker 255 | 256 | 257 | from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer 258 | 259 | from diffusers.models import AutoencoderKL, UNet2DConditionModel 260 | from diffusers.pipeline_utils import DiffusionPipeline 261 | from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler 262 | 263 | 264 | class StableDiffusionModelParallel(StableDiffusionPipeline): 265 | def __init__( 266 | self, 267 | vae: AutoencoderKL, 268 | text_encoder: CLIPTextModel, 269 | 
tokenizer: CLIPTokenizer, 270 | unet: UNet2DConditionModel, 271 | scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], 272 | safety_checker: StableDiffusionSafetyChecker, 273 | feature_extractor: CLIPFeatureExtractor, 274 | ): 275 | """ 276 | Model can be split into 4 main components: 277 | - unet_encoder (downblocks to middle block) 278 | - unet_decoder (up_blocks+) 279 | - text_encoder 280 | - vae 281 | This class handles the components of a model that are split among multiple GPUs, 282 | taking care of moving tensors and Modules to the right devices: e.g. 283 | unet_encoder GPU_0 -> unet_decoder GPU_1 -> text_encoder GPU_1 -> vae GPU_0. 284 | Result is eventually moved back to CPU at the end of each foward call. 285 | """ 286 | super().__init__( 287 | vae, 288 | text_encoder, 289 | tokenizer, 290 | unet, 291 | scheduler, 292 | safety_checker, 293 | feature_extractor, 294 | ) 295 | self._scheduler = self.scheduler 296 | # self._safety_checker = self.safety_checker 297 | 298 | def to(self, part_to_device: Dict[int, torch.device]): 299 | # move each component onto the specified device 300 | self.vae = ToGPUWrapper(self.vae, part_to_device[3]) 301 | self.text_encoder = ToGPUWrapper(self.text_encoder, part_to_device[2]) 302 | 303 | # move unet, requires a bit more work as it is chunked further into multiple parts 304 | # move encoder 305 | for layer in [ 306 | "time_proj", 307 | "time_embedding", 308 | "conv_in", 309 | "down_blocks", 310 | "mid_block", 311 | ]: 312 | module = getattr(self.unet, layer) 313 | if type(module) is nn.ModuleList: 314 | mlist = nn.ModuleList( 315 | [ToGPUWrapper(mod, part_to_device[0]) for mod in module] 316 | ) 317 | setattr(self.unet, layer, mlist) 318 | else: 319 | setattr(self.unet, layer, ToGPUWrapper(module, part_to_device[0])) 320 | 321 | # move decoder 322 | for layer in ["up_blocks", "conv_norm_out", "conv_act", "conv_out"]: 323 | module = getattr(self.unet, layer) 324 | if type(module) is nn.ModuleList: 325 | mlist = nn.ModuleList( 326 | [ToGPUWrapper(mod, part_to_device[1]) for mod in module] 327 | ) 328 | setattr(self.unet, layer, mlist) 329 | else: 330 | setattr(self.unet, layer, ToGPUWrapper(module, part_to_device[1])) 331 | 332 | # need to wrap scheduler.step to move sampled noise to unet gpu 333 | self._wrap_scheduler_step() 334 | return self 335 | 336 | @property 337 | def device(self) -> torch.device: 338 | # NOTE this overrides super so we can handle all tensors devices manually, all `to(self.device)` 339 | # done in the forward pass become a no-op 340 | return None 341 | 342 | def _wrap_scheduler_step(self): 343 | prev_foo = self._scheduler.step 344 | 345 | def wrapper(x, i, sample: torch.Tensor, *args, **kwargs): 346 | sample = sample.to(self.unet.up_blocks.device) 347 | return prev_foo(x, i, sample, *args, **kwargs) 348 | 349 | self._scheduler.step = wrapper 350 | 351 | # override this interface for setting 352 | # @property 353 | # def safety_checker(self): 354 | # return self._safety_checker 355 | 356 | # @safety_checker.setter 357 | # def safety_checker(self, value): 358 | # # switch nsfw on, otherwise don't bother re-setting on processes 359 | # if self.safety_checker is None and value is not None: 360 | # self._safety_checker == value 361 | # elif self.safety_checker is not None and value is None: 362 | # self._safety_checker == None 363 | 364 | @property 365 | def scheduler(self): 366 | return self._scheduler 367 | 368 | @scheduler.setter 369 | def scheduler( 370 | self, value: Union[DDIMScheduler, PNDMScheduler, 
LMSDiscreteScheduler] 371 | ): 372 | # if self.scheduler.__class__.__name__ == value.__class__.__name__: 373 | # return 374 | if not hasattr(self, "_scheduler"): 375 | # used during init phase 376 | self._scheduler = value 377 | else: 378 | self._scheduler = value 379 | self._wrap_scheduler_step() 380 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gradio==3.11.0 2 | diffusers==0.9.0 3 | transformers==4.25.1 4 | xformers==0.0.16 5 | scipy==1.7.0 6 | ftfy==6.1.1 7 | invisible-watermark 8 | accelerate==0.14.0 -------------------------------------------------------------------------------- /sb.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional, Union 2 | from diffusers.pipelines.stable_diffusion import ( 3 | StableDiffusionPipeline, 4 | StableDiffusionPipelineOutput, 5 | StableDiffusionImg2ImgPipeline, 6 | StableDiffusionInpaintPipeline, 7 | ) 8 | import torch 9 | from diffusers.pipelines.stable_diffusion.safety_checker import ( 10 | StableDiffusionSafetyChecker, 11 | ) 12 | from utils import remove_nsfw 13 | from schedulers import schedulers 14 | from transformers import CLIPFeatureExtractor 15 | 16 | 17 | class DiffusionModel: 18 | def __init__(self, pipe: StableDiffusionPipeline = None) -> None: 19 | self.pipe: StableDiffusionPipeline = pipe 20 | self._safety: StableDiffusionSafetyChecker = None 21 | self._safety_extractor: CLIPFeatureExtractor = None 22 | self._pipe_name = "" 23 | self._device = torch.cpu 24 | 25 | @classmethod 26 | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): 27 | return cls()._load_pipeline(pretrained_model_name_or_path, **kwargs) 28 | 29 | def _load_pipeline(self, model_or_path, **kwargs): 30 | if self.pipe is not None and self._pipe_name == model_or_path: 31 | # avoid re-loading same model 32 | return 33 | 34 | print(f"Loading {model_or_path} from disk..") 35 | self.pipe = StableDiffusionPipeline.from_pretrained( 36 | pretrained_model_name_or_path=model_or_path, **kwargs 37 | ) 38 | # remove safety checker so it doesn't use up GPU memory (by default) 39 | self._safety, self._safety_extractor = remove_nsfw(self.pipe) 40 | 41 | self._pipe_name = model_or_path 42 | print("Model Loaded!") 43 | return self 44 | 45 | def __call__( 46 | self, inference_type: str, *args: Any, **kwargs: Any 47 | ) -> StableDiffusionPipelineOutput: 48 | # NOTE: to avoid re-loading the model, we ""cast"" the pipeline 49 | if inference_type == "text": 50 | self.pipe.__class__ = StableDiffusionPipeline 51 | elif inference_type == "img2img": 52 | self.pipe.__class__ = StableDiffusionImg2ImgPipeline 53 | elif inference_type == "inpaint": 54 | self.pipe.__class__ = StableDiffusionInpaintPipeline 55 | # generator cant be pickled for multiprocessing, provide a coherent interface 56 | if kwargs.get("generator", None) is not None and kwargs["generator"] > 0: 57 | kwargs["generator"] = torch.Generator(self._device).manual_seed( 58 | kwargs["generator"] 59 | ) 60 | else: 61 | kwargs.pop("generator", None) # ignore seed < 0 62 | 63 | return self.pipe(*args, **kwargs) 64 | 65 | def reload_model(self, model_or_path: str, **kwargs): 66 | # this is separated from __call__ hoping that we can get a single model that can do inpainting and img2img without reloading 67 | return self._load_pipeline(model_or_path, **kwargs).to( 68 | self._device 69 | ) # maintain device! 
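# `_device` is tracked in `to()` below precisely so that a reloaded pipeline can be
# moved straight back onto the GPU the previous one was using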
70 | 71 | def to(self, device: Union[torch.device, str]): 72 | self.pipe.to(device) 73 | self._device = device 74 | return self 75 | 76 | def set_nsfw(self, nsfw: bool): 77 | if nsfw: 78 | # re- instatiate safety checkers 79 | self.pipe.safety_checker = self._safety_checker.to(self._device) 80 | self.pipe.feature_extractor = self._safety_extractor 81 | else: 82 | # ignore return value, we already have the safety network 83 | remove_nsfw(self.pipe) 84 | 85 | # mimic interface 86 | @property 87 | def scheduler(self): 88 | return self.pipe.scheduler 89 | 90 | @scheduler.setter 91 | def scheduler(self, scheduler: str): 92 | assert isinstance(scheduler, str) 93 | if self.scheduler.__class__.__name__ == schedulers[scheduler].__name__: 94 | # avoid re-setting same scheduler 95 | pass 96 | elif scheduler is not None and scheduler in schedulers: 97 | print(f"Setting noise scheduler to {scheduler}") 98 | # TODO use a default config instead of self.pipe.scheduler.config? 99 | s = getattr(schedulers[scheduler], "from_config")( 100 | self.pipe.scheduler.config 101 | ) 102 | self.pipe.scheduler = s 103 | else: 104 | raise ValueError(f"Invalid Scheduler {scheduler}!") 105 | 106 | def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): 107 | # TODO this can be further pushed 108 | # when slice_size is None, this is disabled 109 | return self.pipe.enable_attention_slicing(slice_size) 110 | -------------------------------------------------------------------------------- /schedulers.py: -------------------------------------------------------------------------------- 1 | # just a util file to gather the supported noise schedulers 2 | from diffusers.schedulers import * 3 | # setup noise schedulers 4 | schedulers_names = [ 5 | "EulerDiscrete", 6 | "DDIM", 7 | "PNDM", 8 | "K-LMS linear", 9 | "K-LMS scaled", 10 | ] 11 | schedulers_cls = [ 12 | EulerDiscreteScheduler, 13 | DDIMScheduler, 14 | PNDMScheduler, 15 | LMSDiscreteScheduler, 16 | LMSDiscreteScheduler, 17 | ] 18 | # NOTE scheduler params are now loaded from pre-trained model 19 | # schedulers_args = [ 20 | # dict(), 21 | # { 22 | # "beta_end": 0.012, 23 | # "beta_schedule": "scaled_linear", 24 | # "beta_start": 0.00085, 25 | # "num_train_timesteps": 1000, 26 | # "skip_prk_steps": True, 27 | # }, 28 | # dict(), 29 | # dict(beta_schedule="scaled_linear"), 30 | # ] 31 | # scheduler_name -> scheduler_class 32 | schedulers = dict(zip(schedulers_names, schedulers_cls)) -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import numpy as np 3 | import torch.multiprocessing as mp 4 | from schedulers import schedulers_names 5 | import json 6 | 7 | def pop_up_exceptions(func): 8 | def wrapper(*args, **kwargs): 9 | try: 10 | result = func(*args, **kwargs) 11 | except Exception as e: 12 | raise gr.Error(str(e)) 13 | return result 14 | return wrapper 15 | 16 | if __name__ == "__main__": 17 | mp.set_start_method("spawn", force=True) 18 | 19 | from main import inference, MP as model_parallel, init_pipeline, MODEL_ID, DEVICES 20 | 21 | # create model(s) 22 | pipeline = init_pipeline() 23 | 24 | @pop_up_exceptions 25 | def change_model(choice: str): 26 | if choice == "Base Model": 27 | pipeline.reload_model(MODEL_ID) 28 | elif choice == "Inpainting": 29 | pipeline.reload_model("stabilityai/stable-diffusion-2-inpainting") 30 | return gr.Image.update(interactive=choice =="Inpainting") 31 | 32 | 
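# in-memory prompt history for the session, shown in the "Prompt History" dataframe;
# it is not persisted (see "dump and clear prompt history" in the README TODO list)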
history = [] 33 | @pop_up_exceptions 34 | def dream( 35 | prompt: str, 36 | *args 37 | ): 38 | # return [(np.random.randn(512, 512, 3)).astype(np.uint8)], [["test"]] 39 | if not len(prompt.strip()): 40 | return [], history 41 | images = inference(pipeline, prompt, *args) 42 | if not len(history) or [prompt] != history[-1]: 43 | history.append([prompt]) 44 | 45 | return images, history 46 | 47 | # v2 model was trained on sfw data only, hence no safety checker 48 | enable_nsfw_toggle=not model_parallel and MODEL_ID!="stabilityai/stable-diffusion-2-base" 49 | 50 | with gr.Blocks() as demo: 51 | with gr.Row(): 52 | with gr.Column(): 53 | with gr.Tabs(): 54 | inputs = [] 55 | with gr.TabItem("Text2Img"): 56 | with gr.Column(): 57 | # FIXME crashes with weird error if no input 58 | inputs.append(gr.Textbox(placeholder="Place your input prompt here and start dreaming!", label="Input Prompt")), 59 | inputs.append(gr.Slider(1, max(24, len(DEVICES)*2), 1, step=1, label="Number of Images")), 60 | inputs.append(gr.Slider(1, 200, 50, step=1, label="Steps")), 61 | inputs.append(gr.Slider(256, 1024, 512, step=64, label="Height")), 62 | inputs.append(gr.Slider(256, 1024, 512, step=64, label="Width")), 63 | inputs.append(gr.Slider(0, 20, 7.5, step=0.5, label="Guidance Scale")), 64 | inputs.append(gr.Number(label="Seed", precision=0)), 65 | # inputs.append(# gr.Checkbox(True, label="FP16")), 66 | inputs.append(gr.Checkbox(False, label="NSFW Filter", interactive=enable_nsfw_toggle)), 67 | inputs.append(gr.Checkbox(False, label="Low VRAM mode")), 68 | inputs.append(gr.Dropdown(schedulers_names, value="PNDM", label="Noise Scheduler")), 69 | with gr.TabItem("Img2Img"): 70 | with gr.Column(): 71 | gr.Markdown("Image and prompt guided generation. Use one of the box below to provide an input image: if two are provided, `Sketch2Img` has priority. Remember to clear the input by pressing `clear`.") 72 | inputs.append(gr.Slider(0, 1, 0.25, step=0.05, label="Img2Img input fidelity")), 73 | inputs.append(gr.Image(type="pil", tool=None, label="Image Conditioning")), 74 | # FIXME state is not resetting when clicking x! 
resets when clear is pressed 75 | inputs.append(gr.Image(type="pil", source='canvas', tool='color-sketch', label="Sketch2Img")) 76 | with gr.TabItem("Image Inpainting"): 77 | with gr.Column(): 78 | gr.Markdown("NOTE: Using image inpainting requires loading a different model from disk!") 79 | inpainting = gr.Image(type="pil", tool='sketch', label="Image Inpaint", interactive=False) 80 | inputs.append(inpainting), 81 | with gr.Row(): 82 | clear_btn = gr.Button("Clear", variant="secondary") 83 | button = gr.Button("Generate Image!", variant="primary") 84 | with gr.Column(variant="box"): 85 | outputs=[gr.Gallery(show_label=False).style(grid=2, container=True)] 86 | load_radio = gr.Radio(["Base Model", "Inpainting"], value="Base Model",label="Model to load:") 87 | outputs.append(gr.Dataframe(col_count=(1, "fixed"),headers=["Prompt History"], interactive=True)) 88 | # sample prompt from https://strikingloo.github.io/DALL-E-2-prompt-guide 89 | # NOTE prompt MUST be first input, since UI order is forwarded as is to `inference` 90 | gr.Examples(["A digital illustration of a medieval town, 4k, detailed, trending in artstation, fantasy"], inputs=inputs[:1]) 91 | button.click(dream, inputs=inputs, outputs=outputs) 92 | # clear inputs and outputs 93 | clear_btn.click( 94 | None, 95 | [], 96 | ( 97 | inputs 98 | + outputs 99 | ), 100 | _js=f"""() => {json.dumps( 101 | [component.cleared_value if hasattr(component, "cleared_value") else None 102 | for component in inputs+outputs] 103 | ) 104 | } 105 | """, 106 | ) 107 | load_radio.change(change_model, inputs=load_radio,outputs=inpainting) 108 | demo.launch(share=False) -------------------------------------------------------------------------------- /tests/test_image_generation.py: -------------------------------------------------------------------------------- 1 | from main import init_pipeline, inference 2 | from diffusers import StableDiffusionPipeline 3 | from diffusers.pipelines.stable_diffusion import ( 4 | StableDiffusionImg2ImgPipeline, 5 | StableDiffusionInpaintPipeline, 6 | ) 7 | import pytest 8 | from typing import List 9 | from PIL import Image 10 | import torch 11 | import numpy as np 12 | import psutil 13 | 14 | PROMPT = "A starry night" 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def txt2img() -> StableDiffusionPipeline: 19 | pipe = StableDiffusionPipeline.from_pretrained( 20 | "stabilityai/stable-diffusion-2-base", 21 | # revision="fp16", cpu needs fp32 22 | # torch_dtype=torch.float16, 23 | ) 24 | if torch.cuda.is_available(): 25 | pipe.to(torch.device("cuda")) 26 | return pipe 27 | 28 | 29 | def requires_cuda(func): 30 | def wrapper(*args, **kwargs): 31 | if not torch.cuda.is_available(): 32 | pytest.skip("This test needs a GPU") 33 | return func(*args, **kwargs) 34 | 35 | return wrapper 36 | 37 | def enable_multiprocessing(func): 38 | def wrapper(*args, **kwargs): 39 | from main import IS_MULTI 40 | IS_MULTI = True 41 | try: 42 | res = func(*args, **kwargs) 43 | except Exception as e: 44 | raise e 45 | finally: 46 | IS_MULTI = False 47 | return res 48 | 49 | return wrapper 50 | 51 | 52 | def check_n_free_GBs(n: int = 0): 53 | # guard against executing tests where not enough space is present 54 | # mostly a workaround for github runners (TODO check `HF_HOME` root not /home) 55 | ps = psutil.disk_usage("/home") 56 | if ps.free / (1024.0**3) < n: 57 | pytest.skip( 58 | f"This test needs {n} gigabytes to run! Space left is only {ps.free / (1024.0 ** 3)}G." 59 | ) 60 | 61 | 62 | # these tests have to run on cpu.. 
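# ..because the GitHub runner has no GPU: the fixture above deliberately skips the
# fp16 revision (CPU inference needs fp32) and every test uses only 3 inference
# steps to keep runtime manageable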
63 | def test_txt2img(txt2img: StableDiffusionPipeline): 64 | check_n_free_GBs(n=17) 65 | input_kwargs = dict( 66 | prompt=PROMPT, 67 | num_inference_steps=3, 68 | height=512, 69 | width=512, 70 | generator=None, 71 | ) 72 | images: List[Image.Image] = txt2img(**input_kwargs)["images"] 73 | assert images[0].size == (512, 512) 74 | 75 | 76 | @requires_cuda 77 | def test_txt2img_pipeline(): 78 | check_n_free_GBs(n=2.5) 79 | pipe = init_pipeline("stabilityai/stable-diffusion-2-base", [0]) 80 | images = inference( 81 | pipe, PROMPT, num_images=1, num_inference_steps=3, height=512, width=512 82 | ) 83 | assert len(images) == 1 and images[0].size == (512, 512) 84 | 85 | @enable_multiprocessing 86 | def test_txt2img_pipeline_multiprocessing(): 87 | test_txt2img_pipeline() 88 | 89 | @requires_cuda 90 | def test_img2img_pipeline(): 91 | check_n_free_GBs(n=2.5) 92 | pipe = init_pipeline("stabilityai/stable-diffusion-2-base", [0]) 93 | image = Image.open("./assets/0.png") 94 | images = inference( 95 | pipe, 96 | PROMPT, 97 | num_images=1, 98 | num_inference_steps=3, 99 | height=512, 100 | width=512, 101 | input_image=image, 102 | inv_strenght=0.5, 103 | ) 104 | assert len(images) == 1 and images[0].size == (512, 512) 105 | # test multi-processing 106 | 107 | @enable_multiprocessing 108 | def test_img2img_pipeline_multiprocessing(): 109 | test_img2img_pipeline() 110 | 111 | @requires_cuda 112 | def test_imginpainting_pipeline(): 113 | check_n_free_GBs(n=2.5) 114 | pipe = init_pipeline("stabilityai/stable-diffusion-2-inpainting", [0]) 115 | image = Image.open("./assets/0.png") 116 | # mask image 117 | mask = np.array(image) 118 | mask[:, : image.size[0] // 2] = 0 119 | mask = Image.fromarray(mask) 120 | images = inference( 121 | pipe, 122 | PROMPT, 123 | num_images=1, 124 | num_inference_steps=3, 125 | height=512, 126 | width=512, 127 | masked_image={"image": image, "mask": mask}, 128 | ) 129 | assert len(images) == 1 and images[0].size == (512, 512) 130 | # masked part more diverse than the "fixed" one 131 | res, source = np.array(images[0]), np.array(image) 132 | assert (source[:, : image.size[0] // 2] - res[:, : image.size[0] // 2]).sum() < ( 133 | source[:, image.size[0] // 2 :] - res[:, image.size[0] // 2 :] 134 | ).sum() 135 | 136 | @enable_multiprocessing 137 | def test_imginpainting_pipeline_multiprocessing(): 138 | test_imginpainting_pipeline() -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Tuple, List 3 | import torch 4 | import torch.nn as nn 5 | from PIL import Image 6 | import numpy as np 7 | import multiprocessing 8 | from diffusers import StableDiffusionPipeline 9 | from diffusers.pipelines.stable_diffusion.safety_checker import ( 10 | StableDiffusionSafetyChecker, 11 | ) 12 | from transformers import CLIPFeatureExtractor 13 | from transformers.feature_extraction_utils import BatchFeature 14 | 15 | 16 | def image_grid(imgs, rows, cols): 17 | assert len(imgs) == rows * cols 18 | 19 | w, h = imgs[0].size 20 | grid = Image.new("RGB", size=(cols * w, rows * h)) 21 | grid_w, grid_h = grid.size 22 | 23 | for i, img in enumerate(imgs): 24 | grid.paste(img, box=(i % cols * w, i // cols * h)) 25 | return grid 26 | 27 | 28 | def dummy_checker(images, *args, **kwargs): 29 | # removes nsfw filter 30 | return images, False 31 | 32 | 33 | def dummy_extractor(images, return_tensors="pt"): 34 | # print(type(images), type(images[0])) 35 | if 
type(images) is list: 36 | images = [np.array(img) for img in images] 37 | data = {"pixel_values": images} 38 | return BatchFeature(data=data, tensor_type=return_tensors) 39 | 40 | 41 | def remove_nsfw( 42 | model: StableDiffusionPipeline, 43 | ) -> Tuple[StableDiffusionSafetyChecker, CLIPFeatureExtractor]: 44 | nsfw_model: StableDiffusionSafetyChecker = model.safety_checker 45 | # don't panic is safety_checker is already a dummy 46 | if isinstance(nsfw_model, StableDiffusionSafetyChecker): 47 | nsfw_model = nsfw_model.cpu() 48 | model.safety_checker = dummy_checker 49 | extr = model.feature_extractor 50 | model.feature_extractor = dummy_extractor 51 | return nsfw_model, extr 52 | 53 | 54 | def get_gpu_setting(env_var: str) -> Tuple[bool, List[int]]: 55 | if not torch.cuda.is_available(): 56 | print("GPU not detected! Make sure you have a GPU to reduce inference time!") 57 | return False, [] 58 | # reads user input, returns multi_gpu flag and gpu id(s) 59 | n = torch.cuda.device_count() 60 | if env_var == "all": 61 | gpus = list(range(n)) 62 | elif "," in env_var: 63 | gpus = [int(gnum) for gnum in env_var.split(",") if int(gnum) < n] 64 | else: 65 | gpus = [int(env_var)] 66 | assert len( 67 | gpus 68 | ), f"Make sure to provide valid device ids! You have {n} GPU(s), you can specify the following values: {list(range(n))}" 69 | return len(gpus) > 1, gpus 70 | 71 | 72 | def get_free_memory_Mb(device: int): 73 | # returns (free, total) device memory, in bytes 74 | return torch.cuda.mem_get_info(device)[0] / 2**20 75 | 76 | 77 | def model_size_Mb(model): 78 | # from the legend @ptrblck himself https://discuss.pytorch.org/t/finding-model-size/130275/2 79 | param_size = 0 80 | for param in model.parameters(): 81 | param_size += param.nelement() * param.element_size() 82 | buffer_size = 0 83 | for buffer in model.buffers(): 84 | buffer_size += buffer.nelement() * buffer.element_size() 85 | return (param_size + buffer_size) / 1024**2 86 | 87 | 88 | class ToGPUWrapper(nn.Module, object): 89 | def __init__(self, layer: nn.Module, device: torch.device) -> None: 90 | # composition design, we wrap a nn.Module, change forward 91 | super().__init__() 92 | self.device = device 93 | # move wrapped model to correct device 94 | self.layer = layer.to(device) 95 | 96 | def forward(self, x: torch.Tensor = None, *args, **kwargs): 97 | # move input and output to given device 98 | # print(self.layer.__class__.__name__) 99 | args = [a.to(self.device) if type(a) is torch.Tensor else a for a in args] 100 | for k in kwargs: 101 | if type(kwargs[k]) is torch.Tensor: 102 | kwargs[k] = kwargs[k].to(self.device) 103 | if x is None: 104 | y = self.layer(*args, **kwargs) 105 | else: 106 | y = self.layer(x.to(self.device), *args, **kwargs) 107 | # text model wraps output.. 
this could be made more generic 108 | if self.layer.__class__.__name__ == "CLIPTextModel": 109 | # getting does something like this self.to_tuple()[k] 110 | y.last_hidden_state = y.last_hidden_state.to(self.device) 111 | return y 112 | return y.to(self.device) 113 | 114 | # FIXME this is giving recursion problems 115 | # def __getattr__(self, name: str): 116 | # return getattr(self.layer, name) 117 | 118 | def __iter__(self): 119 | return iter(self.layer) 120 | 121 | def __next__(self): 122 | return next(self.layer) 123 | 124 | def decode(self, z): 125 | # for vae output 126 | return self.layer.decode(z.to(self.device)) 127 | 128 | 129 | class ModelParts2GPUsAssigner: 130 | def __init__( 131 | self, 132 | devices: List[int], 133 | ) -> None: 134 | """ 135 | Finds a valid assignment of model parts (unet, vae..) to available GPUs 136 | using a stochastic brute-force approach. The problem is formulated 137 | as a Integer Linear Programming one: 138 | maximize w^t X with w=[a, b, c, d] 139 | subject to x_1 a + y_1 b + z_1 c + k_1 d \leq v_1 140 | \dots 141 | x_n a + y_n b + z_n c + k_n d \leq v_n 142 | with \sum x_i=\sum y_i=\sum z_i=\sum k_i 143 | x, y, z, k \geq 0 144 | x, y, z, k \in Z^n 145 | 146 | `self.W` represents the memory requirements of each component in which the model is split 147 | into. 148 | `self.G` is a vector of size N, containing the available memory of each device. Available 149 | memory is conservatively taken as 60% of the free memory. 150 | The assignment state I is a Nx4 matrix where I[i,j] represents the number of components j 151 | assigned to GPU i (initially 0). 152 | """ 153 | self.N = len(devices) 154 | # memory "budget" for each device: we consider 90% of the available GPU memory 155 | G = [int(get_free_memory_Mb(d) * 0.9) for d in devices] 156 | print("Free GPU memory (per device): ", G) 157 | # FIXME G is kind of a function of n_models itself, as the more models you have 158 | # the more memory you will be using for storing intermediate results... 159 | self.G = np.array(G, dtype=np.uint16) 160 | # model components memory usage, fixed order: unet_e, unet_d, text_encoder, vae 161 | # TODO make dynamic using `model_size_Mb(model.text_encoder)`, 162 | fp16 = bool(int(os.environ.get("FP16", 1))) 163 | if fp16: 164 | self.W = np.array([666, 975, 235, 160]) 165 | # (peak) memory usage with batch 1 and 512x512 166 | self.W += [2115, 2115, 35, 2115] 167 | else: 168 | # fp32 weights 169 | self.W = np.array([1331, 1949, 470, 320]) 170 | self.W += [2185, 2185, 116, 4280] 171 | 172 | single_model = bool(os.environ.get("SINGLE_MODEL_PARALLEL", False)) 173 | MAX_MODELS = int(os.environ.get("MAX_MODEL_PARALLEL", 12)) 174 | # easy way to ensure single model multiple gpus, useful for debugging 175 | if single_model: 176 | self._max_models = 1 177 | else: 178 | # max number of models you can have considering pooled VRam as it if was a single GPU, 179 | # "upper bounded" by max number of processes 180 | self._max_models = min( 181 | multiprocessing.cpu_count(), 182 | np.floor(self.G.sum() / self.W.sum()), 183 | MAX_MODELS, 184 | ) 185 | if np.floor(self.G.sum() / self.W.sum()) == 0: 186 | raise Exception( 187 | "You don't have enough combined VRam to host a single model! Try to run the container using the FP16 mode." 


class ModelParts2GPUsAssigner:
    def __init__(
        self,
        devices: List[int],
    ) -> None:
        """
        Finds a valid assignment of model parts (unet, vae, ..) to the available GPUs
        using a stochastic brute-force approach. The problem is formulated as an
        Integer Linear Programming one:
            maximize w^T X, with w = [a, b, c, d]
            subject to
                x_1 a + y_1 b + z_1 c + k_1 d <= v_1
                ...
                x_n a + y_n b + z_n c + k_n d <= v_n
            with sum(x_i) = sum(y_i) = sum(z_i) = sum(k_i),
                 x, y, z, k >= 0 and x, y, z, k in Z^n
        i.e. maximizing the total memory assigned amounts to maximizing the number of
        model replicas, since every replica needs one of each component.

        `self.W` holds the memory requirement of each component the model is split into.
        `self.G` is a vector of size N containing the available memory of each device.
        Available memory is conservatively taken as 90% of the free memory.
        The assignment state I is an N x 4 matrix where I[i, j] is the number of
        components j assigned to GPU i (initially all zeros).
        """
        self.N = len(devices)
        # memory "budget" for each device: we consider 90% of the available GPU memory
        G = [int(get_free_memory_Mb(d) * 0.9) for d in devices]
        print("Free GPU memory (per device): ", G)
        # FIXME G is kind of a function of n_models itself, as the more models you have
        # the more memory you will be using for storing intermediate results...
        self.G = np.array(G, dtype=np.uint16)
        # model components memory usage, fixed order: unet_e, unet_d, text_encoder, vae
        # TODO make dynamic using `model_size_Mb(model.text_encoder)`
        fp16 = bool(int(os.environ.get("FP16", 1)))
        if fp16:
            self.W = np.array([666, 975, 235, 160])
            # (peak) memory usage with batch size 1 and 512x512 images
            self.W += [2115, 2115, 35, 2115]
        else:
            # fp32 weights
            self.W = np.array([1331, 1949, 470, 320])
            self.W += [2185, 2185, 116, 4280]

        single_model = bool(os.environ.get("SINGLE_MODEL_PARALLEL", False))
        MAX_MODELS = int(os.environ.get("MAX_MODEL_PARALLEL", 12))
        # easy way to force a single model over multiple gpus, useful for debugging
        if single_model:
            self._max_models = 1
        else:
            # max number of models you can fit considering the pooled VRAM as if it were
            # a single GPU, "upper bounded" by the max number of processes
            self._max_models = min(
                multiprocessing.cpu_count(),
                np.floor(self.G.sum() / self.W.sum()),
                MAX_MODELS,
            )
        if np.floor(self.G.sum() / self.W.sum()) == 0:
            raise Exception(
                "You don't have enough combined VRAM to host a single model! Try running the container in FP16 mode."
            )

    def state_evaluation(self, state: np.ndarray):
        """
        Two conditions must hold for a state to be valid:
        - each model component must appear the same number of times (implicitly
          guaranteed by the way states are generated)
        - the memory allocated on each GPU must not exceed its capacity
        """
        return (state @ self.W <= self.G).all()

    def add_model(self, state: np.ndarray, rnd=True, sample_size=2) -> List[np.ndarray]:
        """
        Takes an assignment state and tries to add a "model" to it: adding a model
        means assigning *each of the 4 components* to a device. It does so by
        brute-force searching for valid assignments that support the addition of
        another model. If no such assignment exists, an empty list is returned.
        The number of valid assignments to collect before stopping the search can be
        changed through `sample_size`.
        Args:
            state (np.ndarray): The initial state from which the search starts.
            rnd (bool, optional): Whether to generate new assignments in a random fashion,
                rather than proceeding "linearly". Defaults to True.
            sample_size (int, optional): The number of valid assignments needed to
                interrupt the search before the whole space is visited. Defaults to 2.
        """

        def get_device_permutation():
            if rnd:
                return np.random.permutation(self.N)
            return np.arange(self.N)

        # beware, this will modify `state` in-place
        valid = []
        # N^4 possible combinations: +1 on cells (a, 0), (b, 1), (c, 2), (d, 3)
        for a in get_device_permutation():
            state[a, 0] += 1
            for b in get_device_permutation():
                state[b, 1] += 1
                for c in get_device_permutation():
                    state[c, 2] += 1
                    for d in get_device_permutation():
                        state[d, 3] += 1
                        # evaluate state: return the first valid one, keep a list of valid ones,
                        # or the one with max "score"? Greedily returning one can't guarantee
                        # finding (one of) the optimum(s)
                        if self.state_evaluation(state):
                            # could be compressed by only storing a, b, c, d..
                            valid.append(state.copy())
                            # note: state isn't backtracked here!
                            if sample_size > 0 and len(valid) >= sample_size:
                                return valid
                        # backtrack!
                        state[d, 3] -= 1
                    state[c, 2] -= 1
                state[b, 1] -= 1
            state[a, 0] -= 1
        return valid
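
    # Worked example with hypothetical numbers: with 2 GPUs, W = [700, 1000, 250, 200]
    # and G = [2200, 2200], the state
    #     [[1, 0, 1, 1],
    #      [0, 1, 0, 0]]
    # places unet_e, the text encoder and the vae on GPU 0 (700 + 250 + 200 <= 2200)
    # and unet_d on GPU 1 (1000 <= 2200); every column sums to 1, so the state encodes
    # exactly one model split across the two devices and state_evaluation returns True.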

    def find_best_assignment(
        self, state: np.ndarray, curr_n_models: int, **kwargs
    ) -> Tuple[int, List[np.ndarray]]:
        """
        Starting from the initial empty assignment, recursively tries to add a model
        to the multi-GPU setup, stopping as soon as that becomes impossible.
        Returns the number of models assigned and the corresponding valid states.
        """
        if curr_n_models >= self._max_models:
            return -1, []
        prev = state.copy()
        valid = self.add_model(state, **kwargs)
        # can't generate valid assignments with an extra model, return the current one
        if not len(valid):
            return curr_n_models, [prev]
        # visit children
        children = []
        for next_state in valid:
            # recurse only on valid states
            depth, ss = self.find_best_assignment(
                next_state, curr_n_models + 1, **kwargs
            )
            if depth > 0 and len(ss):
                children.append((depth, ss))

        # none of the children could add more models
        if not len(children):
            return curr_n_models + 1, valid
        # return the best child, i.e. the one that assigns the most models (the count is returned too)
        return max(children, key=lambda t: t[0])

    def __call__(self) -> List[dict]:
        # initial empty assignment, #GPUs x #model_parts
        I = np.zeros((self.N, 4), dtype=np.uint16)
        # find a valid assignment of the split components to devices
        n_models, ass = self.find_best_assignment(I, 0)
        ass = ass[0]
        print(
            f"Search has found that {n_models} model(s) can be split over {self.N} device(s)!"
        )
        print("Assignment:", ass)
        # format output into a list of {model_component -> device} dicts, one per model to create
        model_ass = [{i: -1 for i in range(4)} for _ in range(n_models)]
        for dev in range(self.N):
            for comp in range(4):
                # this assignment entry might say "component_0 to device_1, 3 times"
                for _ in range(ass[dev, comp]):
                    for m in model_ass:
                        # assign to the first model that doesn't have this component allocated yet
                        if m[comp] == -1:
                            m[comp] = dev
                            break
        return model_ass

--------------------------------------------------------------------------------
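
For reference, a minimal, hypothetical sketch of how these utilities could be wired together at startup. The `DEVICES` environment variable name and the startup location are assumptions made here for illustration; the component order (unet_e, unet_d, text_encoder, vae) follows the comment in `ModelParts2GPUsAssigner.__init__`:

```python
import os

from utils import ModelParts2GPUsAssigner, get_gpu_setting

# which GPUs to use, e.g. "all", "0" or "0,1" (variable name assumed for this sketch)
multi_gpu, devices = get_gpu_setting(os.environ.get("DEVICES", "all"))

if multi_gpu:
    # one dict per model replica, mapping component index -> device id,
    # in the fixed order: unet_e, unet_d, text_encoder, vae
    assignments = ModelParts2GPUsAssigner(devices)()
    for i, ass in enumerate(assignments):
        print(
            f"model {i}: unet_e -> cuda:{ass[0]}, unet_d -> cuda:{ass[1]}, "
            f"text_encoder -> cuda:{ass[2]}, vae -> cuda:{ass[3]}"
        )
```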