├── .github └── workflows │ ├── build-and-push-docker-image.yml │ └── build-image-on-prs.yml ├── .gitignore ├── Dockerfile ├── LICENSE.md ├── README.md ├── assets ├── 0.png ├── 1.png ├── 2.png ├── 3.png ├── 4.png ├── 5.png ├── 6.png ├── 7.png ├── bluelove.png ├── greenlove.png ├── redlove.png ├── screen.png ├── village_0.png ├── village_10_2.png ├── village_15_2.png ├── village_5.png └── village_5_2.png ├── clip_config.pickle ├── main.py ├── parallel.py ├── requirements.txt ├── sb.py ├── schedulers.py ├── server.py ├── tests └── test_image_generation.py └── utils.py /.github/workflows/build-and-push-docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build and Push CI 2 | 3 | on: 4 | push: 5 | branches: ['master'] 6 | paths: 7 | - '**.py' 8 | - 'Dockerfile' 9 | jobs: 10 | docker: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - 14 | name: Set up Docker Buildx 15 | uses: docker/setup-buildx-action@v2 16 | - 17 | name: Login to DockerHub 18 | uses: docker/login-action@v2 19 | with: 20 | username: nicklucche 21 | password: ${{ secrets.DOCKERHUB_PASS }} 22 | - 23 | name: Build and push 24 | uses: docker/build-push-action@v3 25 | with: 26 | push: true 27 | tags: nicklucche/stable-diffusion:latest -------------------------------------------------------------------------------- /.github/workflows/build-image-on-prs.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build 2 | 3 | on: 4 | pull_request: 5 | branches: ['master'] 6 | paths: 7 | - '**.py' 8 | - 'Dockerfile' 9 | jobs: 10 | docker-build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Remove unnecessary files 14 | run: | 15 | sudo rm -rf /usr/share/dotnet 16 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 17 | - name: Checkout 18 | uses: actions/checkout@v2 19 | - name: Build Docker image 20 | run: docker build -t stable-diffusion:pr . 
21 | # cpu-only tests on github runner, override entrypoint while mounting whole repo 22 | # TODO: consider artifacts or git lfs to avoid downloading models for testing 23 | - name: Run Tests 24 | run: docker run --entrypoint bash --rm -v .:/app2 stable-diffusion:pr -c 'cd /app2 && python3 -m pip install pytest && python3 -m pytest -v tests/' 25 | 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime 2 | WORKDIR /app 3 | COPY *.py requirements.txt *.pickle /app 4 | RUN apt update && apt install -y git 5 | RUN pip install -r requirements.txt 6 | ENV GRADIO_SERVER_PORT=7860 7 | ENV GRADIO_SERVER_NAME=0.0.0.0 8 | EXPOSE 7860 9 | ENTRYPOINT ["python3", "server.py"] -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Nicolò Lucchesi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | A friend of mine working in art/design wanted to try out [Stable Diffusion](https://stability.ai/blog/stable-diffusion-public-release) on his own GPU-equipped PC, but he doesn't know much about coding, so I thought that baking a quick docker build was an easy way to help him out. This repo holds the files that go into that build. 2 | 3 | I also took the liberty of throwing in a simple web UI (made with gradio) to wrap the model. Perhaps we can evolve it a bit to offer a few more functionalities (see TODO). 4 | 5 | **UPDATE:** we now support inference on multiple GPUs with a "Data Parallel" approach. 6 | 7 | ~~**UPDATE 2:** we now support inference on multiple GPUs with a "Model Parallel" approach (see `Multi-GPU` section).~~ 8 | 9 | **UPDATE 3 but really it's a v2:** [Stable Diffusion 2.0](https://stability.ai/blog/stable-diffusion-v2-release) is out generating images more beautiful than ever! 
This is now the default model being loaded and it supports all previous features and more. I've also added support for *img2img* and *image inpainting* and refreshed the UI, give it a try! 10 | 11 | # Requirements 12 | - OS: Ubuntu (tested on 20.04) or Windows (tested on Windows 10 21H2) 13 | - Nvidia GPU with at least 6GB vRAM (gtx 700 onward, please refer [here](https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html)). Mind that the bigger the image size (or the number of images) you want to dream, the more memory you're gonna need. For reference, dreaming a 256x256 image should take up ~5gb, while a 512x512 around 7gb. 14 | - Free Disk space > 2.8gb 15 | - Docker and Nvidia-docker. 16 | - HuggingFace account as well as ~~registration to this repository https://huggingface.co/CompVis/stable-diffusion-v1-4 (simply click on `Access Repository`)~~. No longer needed if you use default v2 model (see "About model versions" below). 17 | 18 | # Installation 19 | 20 | First of all, make sure to have docker and nvidia-docker installed in your machine. 21 | 22 | **Windows users**: [install WSL/Ubuntu](https://stackoverflow.com/a/56783810) from store->install [docker](https://docs.docker.com/desktop/windows/wsl/) and start it->update Windows 10 to version 21H2 (Windows 11 should be ok as is)->test out [GPU-support](https://docs.nvidia.com/cuda/wsl-user-guide/index.html#cuda-support-for-wsl2) (a simple `nvidia-smi` in WSL should do). If `nvidia-smi` does not work from WSL, make sure you have updated your nvidia drivers from the official app. 23 | 24 | The easiest way to try out the model is to simply use the pre-built image at `nicklucche/stable-diffusion`. 25 | 26 | My advice is that you start the container with: 27 | 28 | `docker run --name stable-diffusion --pull=always --gpus all -it -p 7860:7860 nicklucche/stable-diffusion` 29 | 30 | the *first time* you run it, as it will download the model weights (can take a few minutes to do so) and store them on disk (as long as you don't delete the container). 31 | Then you can simply do `docker stop stable-diffusion` to stop the container and `docker start stable-diffusion` to bring it back up whenever you need. 32 | `--pull=always` is to make sure you get the latest image from dockerhub, you can skip it if you already have it locally. 33 | 34 | Once the init phase is finished a message will pop-up in your terminal (`docker logs stable-diffusion`) and you should be able to head to http://localhost:7860/ in your favorite browser and see something like this: 35 | 36 | ![](assets/screen.png) 37 | 38 | By default, the half-precision/fp16 model is loaded. This is the recommended approach if you're planning to run the model on a GPU with < 10GB of memory (takes half the space, ~half the time and yields similar output). To disable FP16 and run inference using single-precision (FP32), set the environment variable FP16=0 as a docker run option, like so: 39 | 40 | `docker run .. -e FP16=0 ...` 41 | 42 | ## Multi-GPU 43 | 44 | The model can be run in both a "DataParallel" or a combined "Model+Data Parallel" fashion to speed up inference time and leverage your multi-gpu setup to its fullest. 45 | 46 | ### Data Parallel 47 | 48 | This means that the model is replicated over multiple GPUs, each handled by a separate sub-process. By default, the model runs on device 0 (no parallelism). 
You can change that by specifying the desired device(s) by adding one of the following options: 49 | 50 | - `-e DEVICES=1 ...` runs model on GPU 1 (starts from 0) 51 | - `-e DEVICES=0,1 ...` runs model on GPU 0 and 1 52 | - `-e DEVICES=all ...` runs model on all available GPUs 53 | 54 | Each device/model generates a full image, so make sure you increase the `Number of Images` slider to generate multiple images in parallel! 55 | (Single image generation speed won't be affected). 56 | 57 | I should also mention that adding the nsfw filter (by checking corresponding box) includes moving an additional model to GPU, so it can cause out of memory issues. 58 | 59 | ### ~~Model Parallel~~ -Currently disabled! Use "Data Parallel" for true parallelism!- 60 | 61 | It works by splitting the model into a fixed number of parts, assigning each part to a device and then handling data transfer from one device to the other (more technical details [here](https://github.com/NickLucche/stable-diffusion-nvidia-docker/issues/8) or from source). 62 | This was originally intended to support setups that had GPUs with small amounts of VRAM that could only run the model by combining their resources, but now it also supports splitting multiple models to accomodate for bigger GPUs, effectively combining Model and Data Parallel. 63 | 64 | Single image inference will be slower in this modality (since we may need to move data from one device to the other), but it allows to fill your memory more efficiently if you have big GPUs by creating multiple models. 65 | You can try out this option with: 66 | 67 | `-e MODEL_PARALLEL=1` 68 | 69 | Note that if your system has highly imbalanced GPU memory distribution (e.g. gpu0->6Gb, gpu1->24Gb.. ) the smallest device might bottleneck the inference process; the easiest way to fix that, is to ignore the smallest device by *not* specifying it in the `DEVICES` list (e.g. `-e DEVICES=1,2..`). 70 | 71 | ## About models 72 | 73 | By default, the model loaded is [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base). Many other checkpoints have been created that are compatible with [diffusers](https://github.com/huggingface/diffusers) (awesome library, ckeck it out) and you can provide them as an additional environment variable like so: 74 | 75 | `-e MODEL_ID=runwayml/stable-diffusion-v1-5` 76 | 77 | Model weights are downloaded to and loaded from `/root/.cache/huggingface/diffusers`, so if you want to share your model across multiple containers runs, you can provide this path as a [docker volume](https://docs.docker.com/storage/volumes/): 78 | 79 | `-v /path/to/your/hugginface/cache:/root/.cache/huggingface/diffusers` 80 | 81 | Mind that the host path (first path up to ":") might very well be the same as the second if you're using the same diffusers library on the host and you didn't modify `HF_HOME`. 82 | 83 | Some models may require a huggingface token to be downloaded, you can get yours at https://huggingface.co/settings/tokens after registering for free on their website. You can then add the token to your env with `-e TOKEN=`. 84 | 85 | **P.S:** Feel free to open an issue for any problem you may face during installation. 86 | 87 | # Samples 88 | 89 | The internet is full of these, but I felt I couldn't let this repo go without sharing a few of "my own".. 90 | 91 |

92 | *(sample image grids: see the `assets/` folder)*
93 |
96 | Fixed seed, slightly change text input (thanks to @mronchetti for the cool prompt):
97 |
103 | Fixed seed, same input, increase `guidance_scale` (more "adherent" to text) with a step of 5:
104 |
114 | 'Picture' vs 'Drawing' text input:
115 |

119 | 120 | 121 | ## TODO 122 | - [x] allow other input modalities (images) 123 | - [ ] support extra v2 features (depth-based generation, upscaling) 124 | - [x] move model to specifiec GPU number (env variable) 125 | - [x] multi-gpu support (data parallel) 126 | - [x] multi-gpu support (PipelineParallel/model parallel) 127 | - [ ] Data+Model parallel: optimize memory assignment for 512x512 inference 128 | - [ ] dump and clear prompt history 129 | - [ ] test on older cudnn 130 | -------------------------------------------------------------------------------- /assets/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/0.png -------------------------------------------------------------------------------- /assets/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/1.png -------------------------------------------------------------------------------- /assets/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/2.png -------------------------------------------------------------------------------- /assets/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/3.png -------------------------------------------------------------------------------- /assets/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/4.png -------------------------------------------------------------------------------- /assets/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/5.png -------------------------------------------------------------------------------- /assets/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/6.png -------------------------------------------------------------------------------- /assets/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/7.png -------------------------------------------------------------------------------- /assets/bluelove.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/bluelove.png -------------------------------------------------------------------------------- /assets/greenlove.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/greenlove.png -------------------------------------------------------------------------------- /assets/redlove.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/redlove.png -------------------------------------------------------------------------------- /assets/screen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/screen.png -------------------------------------------------------------------------------- /assets/village_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/village_0.png -------------------------------------------------------------------------------- /assets/village_10_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/village_10_2.png -------------------------------------------------------------------------------- /assets/village_15_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/village_15_2.png -------------------------------------------------------------------------------- /assets/village_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/village_5.png -------------------------------------------------------------------------------- /assets/village_5_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/assets/village_5_2.png -------------------------------------------------------------------------------- /clip_config.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickLucche/stable-diffusion-nvidia-docker/b4e0bc0bd41b2f8fc8874c34c7fe94b195554619/clip_config.pickle -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | import torch 3 | from PIL import Image 4 | import os 5 | from utils import ModelParts2GPUsAssigner, get_gpu_setting 6 | from parallel import StableDiffusionModelParallel, StableDiffusionMultiProcessing 7 | import numpy as np 8 | from sb import DiffusionModel 9 | 10 | # read env variables 11 | TOKEN = os.environ.get("TOKEN", None) 12 | MODEL_ID = os.environ.get("MODEL_ID", "stabilityai/stable-diffusion-2-base") 13 | 14 | # If you are limited by GPU memory (e.g <10GB VRAM), please make sure to load in fp16 precision 15 | fp16 = bool(int(os.environ.get("FP16", 1))) 16 | # MP = 
bool(int(os.environ.get("MODEL_PARALLEL", 0))) 17 | MP = False # disabled 18 | MIN_INPAINT_MASK_PERCENT = 0.1 19 | 20 | # FIXME devices=0,1 causes cuda error on memory access..? 21 | IS_MULTI, DEVICES = get_gpu_setting(os.environ.get("DEVICES", "0")) 22 | 23 | # TODO docs 24 | def init_pipeline(model_or_path=MODEL_ID, devices: List[int]=DEVICES)->Union[DiffusionModel, StableDiffusionMultiProcessing]: 25 | kwargs = dict( 26 | pretrained_model_name_or_path=model_or_path, 27 | revision="fp16" if fp16 else None, 28 | torch_dtype=torch.float16 if fp16 else None, 29 | use_auth_token=TOKEN, 30 | requires_safety_checker=False, 31 | ) 32 | model_ass = None 33 | # single-gpu multiple models currently disabled 34 | if MP and len(devices) > 1: 35 | # setup for model parallel: find model parts->gpus assignment 36 | print( 37 | f"Looking for a valid assignment in which to split model parts to device(s): {devices}" 38 | ) 39 | ass_finder = ModelParts2GPUsAssigner(devices) 40 | model_ass = ass_finder() 41 | if not len(model_ass): 42 | raise Exception( 43 | "Unable to find a valid assignment of model parts to GPUs! This could be bad luck in sampling!" 44 | ) 45 | print("Assignments:", model_ass) 46 | 47 | # TODO move logic 48 | # if multi and pipe is not None: 49 | # avoid re-creating processes in multi-gpu mode, have them reload a different model 50 | # pipe.reload_model(model_or_path) 51 | if IS_MULTI: 52 | # DataParallel: one process *per GPU* (each has a copy of the model) 53 | # ModelParallel: one process *per model*, each model (possibly) on multiple GPUs 54 | n_procs = len(devices) if not MP else len(model_ass) 55 | pipe = StableDiffusionMultiProcessing.from_pretrained( 56 | n_procs, devices, model_parallel_assignment=model_ass, **kwargs 57 | ) 58 | else: 59 | pipe = DiffusionModel.from_pretrained(**kwargs) 60 | if len(devices): 61 | pipe.to(f"cuda:{devices[0]}") 62 | 63 | return pipe 64 | 65 | 66 | def inference( 67 | pipe: DiffusionModel, 68 | prompt, 69 | num_images=1, 70 | num_inference_steps=50, 71 | height=512, 72 | width=512, 73 | guidance_scale=7, 74 | seed=None, 75 | nsfw_filter=False, 76 | low_vram=False, 77 | noise_scheduler=None, 78 | inv_strenght=0.0, 79 | input_image=None, 80 | input_sketch=None, 81 | masked_image=None, 82 | ): 83 | prompt = [prompt] * num_images 84 | input_kwargs = dict( 85 | inference_type = "text", 86 | prompt=prompt, 87 | # number of denoising steps run during inference (the higher the better) 88 | num_inference_steps=num_inference_steps, 89 | height=height, 90 | width=width, 91 | guidance_scale=guidance_scale, 92 | # NOTE seed with multiples gpus will be different for each one but fixed! 93 | generator=seed, 94 | ) 95 | # input sketch has priority over input image 96 | if input_sketch is not None: 97 | input_image = input_sketch 98 | 99 | # TODO batch images by providing a torch tensor 100 | if input_image is not None: 101 | # image guided generation 102 | input_image = input_image.resize((width, height)) 103 | # TODO negative prompt? 
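# img2img: `strength` below is the inverse of the UI "input fidelity" slider; a higher
# strength lets the scheduler add more noise to the init image, so the output departs
# further from it (a strength close to 0 returns the input almost unchanged)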
104 | input_kwargs["init_image"] = input_image 105 | input_kwargs["strength"] = 1.0 - inv_strenght 106 | input_kwargs["inference_type"] = "img2img" 107 | elif masked_image is not None: 108 | # resize to specified shape 109 | masked_image = { 110 | k: v.convert("RGB").resize((width, height)) for k, v in masked_image.items() 111 | } 112 | 113 | # to do image inpainting, we must provide a big enough mask 114 | if np.count_nonzero(masked_image["mask"].convert("1")) < ( 115 | width * height * MIN_INPAINT_MASK_PERCENT 116 | ): 117 | raise ValueError("Mask is too small. Please paint-over a larger area") 118 | input_kwargs["image"] = masked_image["image"] 119 | input_kwargs["mask_image"] = masked_image["mask"] 120 | input_kwargs["inference_type"] = "inpaint" 121 | 122 | pipe.set_nsfw(nsfw_filter) 123 | 124 | # needed on 16GB RAM 768x768 fp32 125 | pipe.enable_attention_slicing("auto" if low_vram else None) 126 | 127 | # set noise scheduler for inference 128 | if noise_scheduler is not None: 129 | pipe.scheduler = noise_scheduler 130 | 131 | with torch.autocast("cuda"): 132 | images: List[Image.Image] = pipe(**input_kwargs)["images"] 133 | return images 134 | 135 | 136 | if __name__ == "__main__": 137 | from utils import image_grid 138 | 139 | images = inference(input("Input prompt:")) 140 | grid = image_grid(images, rows=1, cols=1) 141 | grid.show() 142 | -------------------------------------------------------------------------------- /parallel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | import torch 4 | import torch.multiprocessing as mp 5 | import torch.nn as nn 6 | from transformers import CLIPConfig 7 | from schedulers import schedulers 8 | import pickle 9 | from diffusers.pipelines.stable_diffusion.safety_checker import ( 10 | StableDiffusionSafetyChecker, 11 | ) 12 | from diffusers.pipelines.stable_diffusion import ( 13 | StableDiffusionPipeline, 14 | StableDiffusionImg2ImgPipeline, 15 | StableDiffusionInpaintPipeline, 16 | ) 17 | from utils import ToGPUWrapper, dummy_checker, dummy_extractor, remove_nsfw 18 | from typing import Any, Dict, List, Optional, Union 19 | import random 20 | from sb import DiffusionModel 21 | 22 | ## Data Parallel: each process handles a copy of the model, executed on a different device ## 23 | ## +Model Parallel: model components are (potentially) scattered across different devices, each model handled by a process ## 24 | def cuda_inference_process( 25 | worker_id: int, 26 | devices: List[torch.device], 27 | in_q: mp.Queue, 28 | out_q: mp.Queue, 29 | model_kwargs: Dict[Any, Any], 30 | ): 31 | """Code executed by the torch.multiprocessing process, handling inference on device `device_id`. 32 | It's a simple loop in which the worker pulls data from a shared input queue, and puts result 33 | into an output queue. 34 | """ 35 | # wont work in pytorch 1.12 https://github.com/pytorch/pytorch/issues/80876 36 | # os.environ["CUDA_VISIBLE_DEVICES"]=str(device_id) 37 | mp_ass: Dict[int, int] = model_kwargs.pop("model_parallel_assignment", None) 38 | # each worker gets a different starting seed so they can be fixed and yet produce different results 39 | worker_seed = random.randint(0, int(2**32 - 1)) 40 | # TODO replace with custom `StableDiffusion` model, single process == multi-process 41 | try: 42 | if mp_ass is None: 43 | # TODO should we make sure we're downloading the model only once? 
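# Data Parallel path: each worker builds a full pipeline copy and pins it to its own GPU;
# weights are read from the shared HuggingFace cache on disk, so concurrent first-time
# downloads are what the TODO above is about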
44 | device_id = devices[worker_id] 45 | print( 46 | f"Creating and moving model to cuda:{device_id} ({torch.cuda.get_device_name(device_id)}).." 47 | ) 48 | model: DiffusionModel = DiffusionModel.from_pretrained(**model_kwargs).to( 49 | f"cuda:{device_id}" 50 | ) 51 | else: 52 | mp_ass = mp_ass[worker_id] 53 | print("Model parallel worker component assignment:", mp_ass) 54 | print(f"Creating and moving model parts to respective devices..") 55 | model = StableDiffusionModelParallel.from_pretrained(**model_kwargs).to( 56 | mp_ass 57 | ) 58 | # TODO add for model parallel, but likely to refactor too 59 | # safety_checker, safety_extr = remove_nsfw(model) 60 | # create nsfw clip filter so we can re-set it if needed 61 | # safety_checker = StableDiffusionSafetyChecker( 62 | # CLIPConfig(**model_kwargs.pop("clip_config")) 63 | # ) 64 | out_q.put(True) 65 | except Exception as e: 66 | print(e) 67 | out_q.put(False) 68 | return 69 | # inference loop 70 | while True: 71 | # get prompt 72 | prompts, kwargs = in_q.get() 73 | if type(prompts) is not list: 74 | # special commands 75 | if prompts == "quit": 76 | break 77 | elif prompts == "safety_checker" and mp_ass is not None: 78 | # TODO 79 | raise NotImplementedError() 80 | elif prompts == "safety_checker": 81 | # safety checker is also be moved to GPU (it can cause crashes) when 'clip' is passed 82 | model.set_nsfw(kwargs == "clip") 83 | elif prompts == "scheduler": 84 | model.scheduler = kwargs 85 | elif prompts == "low_vram": 86 | model.enable_attention_slicing(kwargs) 87 | elif prompts == "reload_model": 88 | print(f"Worker {device_id}- Reloading model from disk..") 89 | model_path_or_id = kwargs 90 | model = model.reload_model(model_path_or_id) # maintains device 91 | # model loading needs ack 92 | out_q.put(True) 93 | continue 94 | if not len(prompts): 95 | images = [] 96 | else: 97 | # actual inference 98 | # print("Inference", prompts, kwargs, model.device) 99 | if kwargs.get("generator", None) is not None and kwargs["generator"] > 0: 100 | # NOTE different seed for each worker, but fixed! 
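# offsetting the user seed by this worker's random (but fixed) seed keeps results
# reproducible across runs while making each replica generate a different image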
101 | kwargs["generator"] = kwargs["generator"] + worker_seed 102 | # for repeatable results: tensor generated on cpu for model parallel 103 | # TODO unify model parallel interface, still using StableDiffusionPipeline 104 | if mp_ass is not None: 105 | kwargs["generator"] = torch.Generator("cpu").manual_seed( 106 | kwargs["generator"] 107 | ) 108 | else: 109 | kwargs.pop("generator", None) 110 | try: 111 | with torch.autocast("cuda"): 112 | images: List[Image.Image] = model(prompt=prompts, **kwargs).images 113 | except Exception as e: 114 | print(f"[Model {device_id}] Error during inference:", e) 115 | # TODO proper error propagation to master process 116 | images = [ 117 | Image.fromarray( 118 | np.zeros((kwargs["height"], kwargs["width"], 3), dtype=np.uint8) 119 | ) 120 | ] 121 | out_q.put(images) 122 | 123 | 124 | # class that handles multi-gpu models, mimicking original interface 125 | class StableDiffusionMultiProcessing(object): 126 | def __init__( 127 | self, n_procs: int, devices: List[int], model_id_or_path: str = "" 128 | ) -> None: 129 | self.devices = devices 130 | self.n = n_procs 131 | self._safety_checker = "dummy" 132 | self._scheduler = "PNDM" 133 | self._pipeline_type = "text" 134 | self._pipe_name = model_id_or_path 135 | 136 | def _send_cmd(self, k1, k2, wait_ack=True): 137 | # send a cmd to all processes (put item in queue) 138 | for i in range(self.n): 139 | self.q.put((k1[i], k2[i])) 140 | # and wait for its completion 141 | res = [] 142 | if wait_ack: 143 | for _ in range(self.n): 144 | res.append(self.outq.get()) 145 | return res 146 | 147 | def _send_cmd_to_all(self, k1, k2, wait_ack=True): 148 | return self._send_cmd([k1] * self.n, [k2] * self.n, wait_ack=wait_ack) 149 | 150 | def __call__(self, prompt, **kwargs): 151 | # run inference on different processes, each handles a model on a different GPU (split load evenly) 152 | # FIXME when n_prompts < n, unused processes get an empty list as input, so we can always wait all processes 153 | prompt = [list(p) for p in np.array_split(prompt, self.n)] 154 | # request inference and block for result 155 | res = self._send_cmd(prompt, [kwargs] * self.n) 156 | # mimic interface 157 | return {"images": [img for images in res for img in images]} 158 | 159 | @classmethod 160 | def from_pretrained( 161 | cls, n_processes: int, devices: List[int], **kwargs 162 | ) -> "StableDiffusionMultiProcessing": 163 | # create communication i/o "channels" 164 | cls.q = mp.Queue() 165 | cls.outq = mp.Queue() 166 | # load nsfw filter CLIP configuration 167 | # TODO still needed? 
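# the CLIPConfig was pickled at build time so workers could rebuild the NSFW safety
# checker without an extra download; with the checker detached by default (see
# `remove_nsfw`) forwarding it may indeed be obsolete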
168 | with open("./clip_config.pickle", "rb") as f: 169 | d = pickle.load(f) 170 | kwargs["clip_config"] = d 171 | 172 | # create models in their own process and move them to correspoding device 173 | cls._procs: List[mp.Process] = [] 174 | for i in range(n_processes): 175 | p = mp.Process( 176 | target=cuda_inference_process, 177 | args=(i, devices, cls.q, cls.outq, kwargs), 178 | daemon=False, 179 | ) 180 | p.start() 181 | cls._procs.append(p) 182 | 183 | # wait until you move all models to their respective gpu (consistent with single mode) 184 | for _ in range(n_processes): 185 | d = cls.outq.get() 186 | assert d 187 | # cls.pipes: List[StableDiffusionPipeline] = models 188 | return cls(n_processes, devices, kwargs["pretrained_model_name_or_path"]) 189 | 190 | def __del__(self): 191 | # exit and join condition 192 | for _ in range(self.n): 193 | self.q.put(("quit", "")) 194 | for p in self._procs: 195 | p.join() 196 | 197 | def __len__(self): 198 | return self.n 199 | 200 | # mimic interface 201 | @property 202 | def safety_checker(self): 203 | return self._safety_checker 204 | 205 | @safety_checker.setter 206 | def safety_checker(self, value): 207 | # value=None->set filter, o/w set nsfw filter off 208 | nsfw_on = value is None 209 | # switch nsfw on, otherwise don't bother re-setting on processes 210 | if self.safety_checker == "dummy" and nsfw_on: 211 | self._safety_checker == "clip" 212 | self._send_cmd_to_all("safety_checker", "clip", wait_ack=False) 213 | elif self.safety_checker == "clip" and not nsfw_on: 214 | self._safety_checker == "dummy" 215 | self._send_cmd_to_all("safety_checker", "dummy", wait_ack=False) 216 | 217 | @property 218 | def scheduler(self): 219 | return self._scheduler 220 | 221 | @scheduler.setter 222 | def scheduler(self, value): 223 | # avoid re-setting if already set 224 | if self.scheduler == value or value not in schedulers: 225 | return 226 | self._scheduler = value 227 | self._send_cmd_to_all("scheduler", value, wait_ack=False) 228 | 229 | def enable_attention_slicing(self, value): 230 | self._send_cmd_to_all("low_vram", value, wait_ack=False) 231 | 232 | # def disable_attention_slicing(self): 233 | # self._send_cmd_to_all("low_vram", None, wait_ack=False) 234 | 235 | def change_pipeline_type(self, new_type: str): 236 | assert new_type in ["text", "img2img", "inpaint"] 237 | if new_type == self._pipeline_type: 238 | return 239 | self._pipeline_type = new_type 240 | self._send_cmd_to_all("pipeline_type", new_type, wait_ack=False) 241 | 242 | def reload_model(self, model_or_path: str): 243 | # reset all other options to default so they can be restored on next call 244 | self._send_cmd_to_all("reload_model", model_or_path, wait_ack=True) 245 | self._safety_checker = "dummy" 246 | self._scheduler = "PNDM" 247 | self._pipeline_type = "text" 248 | 249 | def set_nsfw(self, nsfw: bool): 250 | # this will avoid unnecessary inter-process calls and only set if needed 251 | if nsfw: 252 | self.safety_checker = None 253 | else: 254 | self.safety_checker = dummy_checker 255 | 256 | 257 | from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer 258 | 259 | from diffusers.models import AutoencoderKL, UNet2DConditionModel 260 | from diffusers.pipeline_utils import DiffusionPipeline 261 | from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler 262 | 263 | 264 | class StableDiffusionModelParallel(StableDiffusionPipeline): 265 | def __init__( 266 | self, 267 | vae: AutoencoderKL, 268 | text_encoder: CLIPTextModel, 269 | 
tokenizer: CLIPTokenizer, 270 | unet: UNet2DConditionModel, 271 | scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], 272 | safety_checker: StableDiffusionSafetyChecker, 273 | feature_extractor: CLIPFeatureExtractor, 274 | ): 275 | """ 276 | Model can be split into 4 main components: 277 | - unet_encoder (downblocks to middle block) 278 | - unet_decoder (up_blocks+) 279 | - text_encoder 280 | - vae 281 | This class handles the components of a model that are split among multiple GPUs, 282 | taking care of moving tensors and Modules to the right devices: e.g. 283 | unet_encoder GPU_0 -> unet_decoder GPU_1 -> text_encoder GPU_1 -> vae GPU_0. 284 | Result is eventually moved back to CPU at the end of each foward call. 285 | """ 286 | super().__init__( 287 | vae, 288 | text_encoder, 289 | tokenizer, 290 | unet, 291 | scheduler, 292 | safety_checker, 293 | feature_extractor, 294 | ) 295 | self._scheduler = self.scheduler 296 | # self._safety_checker = self.safety_checker 297 | 298 | def to(self, part_to_device: Dict[int, torch.device]): 299 | # move each component onto the specified device 300 | self.vae = ToGPUWrapper(self.vae, part_to_device[3]) 301 | self.text_encoder = ToGPUWrapper(self.text_encoder, part_to_device[2]) 302 | 303 | # move unet, requires a bit more work as it is chunked further into multiple parts 304 | # move encoder 305 | for layer in [ 306 | "time_proj", 307 | "time_embedding", 308 | "conv_in", 309 | "down_blocks", 310 | "mid_block", 311 | ]: 312 | module = getattr(self.unet, layer) 313 | if type(module) is nn.ModuleList: 314 | mlist = nn.ModuleList( 315 | [ToGPUWrapper(mod, part_to_device[0]) for mod in module] 316 | ) 317 | setattr(self.unet, layer, mlist) 318 | else: 319 | setattr(self.unet, layer, ToGPUWrapper(module, part_to_device[0])) 320 | 321 | # move decoder 322 | for layer in ["up_blocks", "conv_norm_out", "conv_act", "conv_out"]: 323 | module = getattr(self.unet, layer) 324 | if type(module) is nn.ModuleList: 325 | mlist = nn.ModuleList( 326 | [ToGPUWrapper(mod, part_to_device[1]) for mod in module] 327 | ) 328 | setattr(self.unet, layer, mlist) 329 | else: 330 | setattr(self.unet, layer, ToGPUWrapper(module, part_to_device[1])) 331 | 332 | # need to wrap scheduler.step to move sampled noise to unet gpu 333 | self._wrap_scheduler_step() 334 | return self 335 | 336 | @property 337 | def device(self) -> torch.device: 338 | # NOTE this overrides super so we can handle all tensors devices manually, all `to(self.device)` 339 | # done in the forward pass become a no-op 340 | return None 341 | 342 | def _wrap_scheduler_step(self): 343 | prev_foo = self._scheduler.step 344 | 345 | def wrapper(x, i, sample: torch.Tensor, *args, **kwargs): 346 | sample = sample.to(self.unet.up_blocks.device) 347 | return prev_foo(x, i, sample, *args, **kwargs) 348 | 349 | self._scheduler.step = wrapper 350 | 351 | # override this interface for setting 352 | # @property 353 | # def safety_checker(self): 354 | # return self._safety_checker 355 | 356 | # @safety_checker.setter 357 | # def safety_checker(self, value): 358 | # # switch nsfw on, otherwise don't bother re-setting on processes 359 | # if self.safety_checker is None and value is not None: 360 | # self._safety_checker == value 361 | # elif self.safety_checker is not None and value is None: 362 | # self._safety_checker == None 363 | 364 | @property 365 | def scheduler(self): 366 | return self._scheduler 367 | 368 | @scheduler.setter 369 | def scheduler( 370 | self, value: Union[DDIMScheduler, PNDMScheduler, 
LMSDiscreteScheduler] 371 | ): 372 | # if self.scheduler.__class__.__name__ == value.__class__.__name__: 373 | # return 374 | if not hasattr(self, "_scheduler"): 375 | # used during init phase 376 | self._scheduler = value 377 | else: 378 | self._scheduler = value 379 | self._wrap_scheduler_step() 380 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gradio==3.11.0 2 | diffusers==0.9.0 3 | transformers==4.25.1 4 | xformers==0.0.16 5 | scipy==1.7.0 6 | ftfy==6.1.1 7 | invisible-watermark 8 | accelerate==0.14.0 -------------------------------------------------------------------------------- /sb.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional, Union 2 | from diffusers.pipelines.stable_diffusion import ( 3 | StableDiffusionPipeline, 4 | StableDiffusionPipelineOutput, 5 | StableDiffusionImg2ImgPipeline, 6 | StableDiffusionInpaintPipeline, 7 | ) 8 | import torch 9 | from diffusers.pipelines.stable_diffusion.safety_checker import ( 10 | StableDiffusionSafetyChecker, 11 | ) 12 | from utils import remove_nsfw 13 | from schedulers import schedulers 14 | from transformers import CLIPFeatureExtractor 15 | 16 | 17 | class DiffusionModel: 18 | def __init__(self, pipe: StableDiffusionPipeline = None) -> None: 19 | self.pipe: StableDiffusionPipeline = pipe 20 | self._safety: StableDiffusionSafetyChecker = None 21 | self._safety_extractor: CLIPFeatureExtractor = None 22 | self._pipe_name = "" 23 | self._device = torch.cpu 24 | 25 | @classmethod 26 | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): 27 | return cls()._load_pipeline(pretrained_model_name_or_path, **kwargs) 28 | 29 | def _load_pipeline(self, model_or_path, **kwargs): 30 | if self.pipe is not None and self._pipe_name == model_or_path: 31 | # avoid re-loading same model 32 | return 33 | 34 | print(f"Loading {model_or_path} from disk..") 35 | self.pipe = StableDiffusionPipeline.from_pretrained( 36 | pretrained_model_name_or_path=model_or_path, **kwargs 37 | ) 38 | # remove safety checker so it doesn't use up GPU memory (by default) 39 | self._safety, self._safety_extractor = remove_nsfw(self.pipe) 40 | 41 | self._pipe_name = model_or_path 42 | print("Model Loaded!") 43 | return self 44 | 45 | def __call__( 46 | self, inference_type: str, *args: Any, **kwargs: Any 47 | ) -> StableDiffusionPipelineOutput: 48 | # NOTE: to avoid re-loading the model, we ""cast"" the pipeline 49 | if inference_type == "text": 50 | self.pipe.__class__ = StableDiffusionPipeline 51 | elif inference_type == "img2img": 52 | self.pipe.__class__ = StableDiffusionImg2ImgPipeline 53 | elif inference_type == "inpaint": 54 | self.pipe.__class__ = StableDiffusionInpaintPipeline 55 | # generator cant be pickled for multiprocessing, provide a coherent interface 56 | if kwargs.get("generator", None) is not None and kwargs["generator"] > 0: 57 | kwargs["generator"] = torch.Generator(self._device).manual_seed( 58 | kwargs["generator"] 59 | ) 60 | else: 61 | kwargs.pop("generator", None) # ignore seed < 0 62 | 63 | return self.pipe(*args, **kwargs) 64 | 65 | def reload_model(self, model_or_path: str, **kwargs): 66 | # this is separated from __call__ hoping that we can get a single model that can do inpainting and img2img without reloading 67 | return self._load_pipeline(model_or_path, **kwargs).to( 68 | self._device 69 | ) # maintain device! 
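# `_device` is tracked in `to()` below precisely so that a reloaded pipeline can be
# moved straight back onto the GPU the previous one was using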
70 | 71 | def to(self, device: Union[torch.device, str]): 72 | self.pipe.to(device) 73 | self._device = device 74 | return self 75 | 76 | def set_nsfw(self, nsfw: bool): 77 | if nsfw: 78 | # re- instatiate safety checkers 79 | self.pipe.safety_checker = self._safety_checker.to(self._device) 80 | self.pipe.feature_extractor = self._safety_extractor 81 | else: 82 | # ignore return value, we already have the safety network 83 | remove_nsfw(self.pipe) 84 | 85 | # mimic interface 86 | @property 87 | def scheduler(self): 88 | return self.pipe.scheduler 89 | 90 | @scheduler.setter 91 | def scheduler(self, scheduler: str): 92 | assert isinstance(scheduler, str) 93 | if self.scheduler.__class__.__name__ == schedulers[scheduler].__name__: 94 | # avoid re-setting same scheduler 95 | pass 96 | elif scheduler is not None and scheduler in schedulers: 97 | print(f"Setting noise scheduler to {scheduler}") 98 | # TODO use a default config instead of self.pipe.scheduler.config? 99 | s = getattr(schedulers[scheduler], "from_config")( 100 | self.pipe.scheduler.config 101 | ) 102 | self.pipe.scheduler = s 103 | else: 104 | raise ValueError(f"Invalid Scheduler {scheduler}!") 105 | 106 | def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): 107 | # TODO this can be further pushed 108 | # when slice_size is None, this is disabled 109 | return self.pipe.enable_attention_slicing(slice_size) 110 | -------------------------------------------------------------------------------- /schedulers.py: -------------------------------------------------------------------------------- 1 | # just a util file to gather the supported noise schedulers 2 | from diffusers.schedulers import * 3 | # setup noise schedulers 4 | schedulers_names = [ 5 | "EulerDiscrete", 6 | "DDIM", 7 | "PNDM", 8 | "K-LMS linear", 9 | "K-LMS scaled", 10 | ] 11 | schedulers_cls = [ 12 | EulerDiscreteScheduler, 13 | DDIMScheduler, 14 | PNDMScheduler, 15 | LMSDiscreteScheduler, 16 | LMSDiscreteScheduler, 17 | ] 18 | # NOTE scheduler params are now loaded from pre-trained model 19 | # schedulers_args = [ 20 | # dict(), 21 | # { 22 | # "beta_end": 0.012, 23 | # "beta_schedule": "scaled_linear", 24 | # "beta_start": 0.00085, 25 | # "num_train_timesteps": 1000, 26 | # "skip_prk_steps": True, 27 | # }, 28 | # dict(), 29 | # dict(beta_schedule="scaled_linear"), 30 | # ] 31 | # scheduler_name -> scheduler_class 32 | schedulers = dict(zip(schedulers_names, schedulers_cls)) -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import numpy as np 3 | import torch.multiprocessing as mp 4 | from schedulers import schedulers_names 5 | import json 6 | 7 | def pop_up_exceptions(func): 8 | def wrapper(*args, **kwargs): 9 | try: 10 | result = func(*args, **kwargs) 11 | except Exception as e: 12 | raise gr.Error(str(e)) 13 | return result 14 | return wrapper 15 | 16 | if __name__ == "__main__": 17 | mp.set_start_method("spawn", force=True) 18 | 19 | from main import inference, MP as model_parallel, init_pipeline, MODEL_ID, DEVICES 20 | 21 | # create model(s) 22 | pipeline = init_pipeline() 23 | 24 | @pop_up_exceptions 25 | def change_model(choice: str): 26 | if choice == "Base Model": 27 | pipeline.reload_model(MODEL_ID) 28 | elif choice == "Inpainting": 29 | pipeline.reload_model("stabilityai/stable-diffusion-2-inpainting") 30 | return gr.Image.update(interactive=choice =="Inpainting") 31 | 32 | 
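# in-memory prompt history for the session, shown in the "Prompt History" dataframe;
# it is not persisted (see "dump and clear prompt history" in the README TODO list)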
history = [] 33 | @pop_up_exceptions 34 | def dream( 35 | prompt: str, 36 | *args 37 | ): 38 | # return [(np.random.randn(512, 512, 3)).astype(np.uint8)], [["test"]] 39 | if not len(prompt.strip()): 40 | return [], history 41 | images = inference(pipeline, prompt, *args) 42 | if not len(history) or [prompt] != history[-1]: 43 | history.append([prompt]) 44 | 45 | return images, history 46 | 47 | # v2 model was trained on sfw data only, hence no safety checker 48 | enable_nsfw_toggle=not model_parallel and MODEL_ID!="stabilityai/stable-diffusion-2-base" 49 | 50 | with gr.Blocks() as demo: 51 | with gr.Row(): 52 | with gr.Column(): 53 | with gr.Tabs(): 54 | inputs = [] 55 | with gr.TabItem("Text2Img"): 56 | with gr.Column(): 57 | # FIXME crashes with weird error if no input 58 | inputs.append(gr.Textbox(placeholder="Place your input prompt here and start dreaming!", label="Input Prompt")), 59 | inputs.append(gr.Slider(1, max(24, len(DEVICES)*2), 1, step=1, label="Number of Images")), 60 | inputs.append(gr.Slider(1, 200, 50, step=1, label="Steps")), 61 | inputs.append(gr.Slider(256, 1024, 512, step=64, label="Height")), 62 | inputs.append(gr.Slider(256, 1024, 512, step=64, label="Width")), 63 | inputs.append(gr.Slider(0, 20, 7.5, step=0.5, label="Guidance Scale")), 64 | inputs.append(gr.Number(label="Seed", precision=0)), 65 | # inputs.append(# gr.Checkbox(True, label="FP16")), 66 | inputs.append(gr.Checkbox(False, label="NSFW Filter", interactive=enable_nsfw_toggle)), 67 | inputs.append(gr.Checkbox(False, label="Low VRAM mode")), 68 | inputs.append(gr.Dropdown(schedulers_names, value="PNDM", label="Noise Scheduler")), 69 | with gr.TabItem("Img2Img"): 70 | with gr.Column(): 71 | gr.Markdown("Image and prompt guided generation. Use one of the box below to provide an input image: if two are provided, `Sketch2Img` has priority. Remember to clear the input by pressing `clear`.") 72 | inputs.append(gr.Slider(0, 1, 0.25, step=0.05, label="Img2Img input fidelity")), 73 | inputs.append(gr.Image(type="pil", tool=None, label="Image Conditioning")), 74 | # FIXME state is not resetting when clicking x! 
resets when clear is pressed 75 | inputs.append(gr.Image(type="pil", source='canvas', tool='color-sketch', label="Sketch2Img")) 76 | with gr.TabItem("Image Inpainting"): 77 | with gr.Column(): 78 | gr.Markdown("NOTE: Using image inpainting requires loading a different model from disk!") 79 | inpainting = gr.Image(type="pil", tool='sketch', label="Image Inpaint", interactive=False) 80 | inputs.append(inpainting), 81 | with gr.Row(): 82 | clear_btn = gr.Button("Clear", variant="secondary") 83 | button = gr.Button("Generate Image!", variant="primary") 84 | with gr.Column(variant="box"): 85 | outputs=[gr.Gallery(show_label=False).style(grid=2, container=True)] 86 | load_radio = gr.Radio(["Base Model", "Inpainting"], value="Base Model",label="Model to load:") 87 | outputs.append(gr.Dataframe(col_count=(1, "fixed"),headers=["Prompt History"], interactive=True)) 88 | # sample prompt from https://strikingloo.github.io/DALL-E-2-prompt-guide 89 | # NOTE prompt MUST be first input, since UI order is forwarded as is to `inference` 90 | gr.Examples(["A digital illustration of a medieval town, 4k, detailed, trending in artstation, fantasy"], inputs=inputs[:1]) 91 | button.click(dream, inputs=inputs, outputs=outputs) 92 | # clear inputs and outputs 93 | clear_btn.click( 94 | None, 95 | [], 96 | ( 97 | inputs 98 | + outputs 99 | ), 100 | _js=f"""() => {json.dumps( 101 | [component.cleared_value if hasattr(component, "cleared_value") else None 102 | for component in inputs+outputs] 103 | ) 104 | } 105 | """, 106 | ) 107 | load_radio.change(change_model, inputs=load_radio,outputs=inpainting) 108 | demo.launch(share=False) -------------------------------------------------------------------------------- /tests/test_image_generation.py: -------------------------------------------------------------------------------- 1 | from main import init_pipeline, inference 2 | from diffusers import StableDiffusionPipeline 3 | from diffusers.pipelines.stable_diffusion import ( 4 | StableDiffusionImg2ImgPipeline, 5 | StableDiffusionInpaintPipeline, 6 | ) 7 | import pytest 8 | from typing import List 9 | from PIL import Image 10 | import torch 11 | import numpy as np 12 | import psutil 13 | 14 | PROMPT = "A starry night" 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def txt2img() -> StableDiffusionPipeline: 19 | pipe = StableDiffusionPipeline.from_pretrained( 20 | "stabilityai/stable-diffusion-2-base", 21 | # revision="fp16", cpu needs fp32 22 | # torch_dtype=torch.float16, 23 | ) 24 | if torch.cuda.is_available(): 25 | pipe.to(torch.device("cuda")) 26 | return pipe 27 | 28 | 29 | def requires_cuda(func): 30 | def wrapper(*args, **kwargs): 31 | if not torch.cuda.is_available(): 32 | pytest.skip("This test needs a GPU") 33 | return func(*args, **kwargs) 34 | 35 | return wrapper 36 | 37 | def enable_multiprocessing(func): 38 | def wrapper(*args, **kwargs): 39 | from main import IS_MULTI 40 | IS_MULTI = True 41 | try: 42 | res = func(*args, **kwargs) 43 | except Exception as e: 44 | raise e 45 | finally: 46 | IS_MULTI = False 47 | return res 48 | 49 | return wrapper 50 | 51 | 52 | def check_n_free_GBs(n: int = 0): 53 | # guard against executing tests where not enough space is present 54 | # mostly a workaround for github runners (TODO check `HF_HOME` root not /home) 55 | ps = psutil.disk_usage("/home") 56 | if ps.free / (1024.0**3) < n: 57 | pytest.skip( 58 | f"This test needs {n} gigabytes to run! Space left is only {ps.free / (1024.0 ** 3)}G." 59 | ) 60 | 61 | 62 | # these tests have to run on cpu.. 
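# ..because the GitHub runner has no GPU: the fixture above deliberately skips the
# fp16 revision (CPU inference needs fp32) and every test uses only 3 inference
# steps to keep runtime manageable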
63 | def test_txt2img(txt2img: StableDiffusionPipeline): 64 | check_n_free_GBs(n=17) 65 | input_kwargs = dict( 66 | prompt=PROMPT, 67 | num_inference_steps=3, 68 | height=512, 69 | width=512, 70 | generator=None, 71 | ) 72 | images: List[Image.Image] = txt2img(**input_kwargs)["images"] 73 | assert images[0].size == (512, 512) 74 | 75 | 76 | @requires_cuda 77 | def test_txt2img_pipeline(): 78 | check_n_free_GBs(n=2.5) 79 | pipe = init_pipeline("stabilityai/stable-diffusion-2-base", [0]) 80 | images = inference( 81 | pipe, PROMPT, num_images=1, num_inference_steps=3, height=512, width=512 82 | ) 83 | assert len(images) == 1 and images[0].size == (512, 512) 84 | 85 | @enable_multiprocessing 86 | def test_txt2img_pipeline_multiprocessing(): 87 | test_txt2img_pipeline() 88 | 89 | @requires_cuda 90 | def test_img2img_pipeline(): 91 | check_n_free_GBs(n=2.5) 92 | pipe = init_pipeline("stabilityai/stable-diffusion-2-base", [0]) 93 | image = Image.open("./assets/0.png") 94 | images = inference( 95 | pipe, 96 | PROMPT, 97 | num_images=1, 98 | num_inference_steps=3, 99 | height=512, 100 | width=512, 101 | input_image=image, 102 | inv_strenght=0.5, 103 | ) 104 | assert len(images) == 1 and images[0].size == (512, 512) 105 | # test multi-processing 106 | 107 | @enable_multiprocessing 108 | def test_img2img_pipeline_multiprocessing(): 109 | test_img2img_pipeline() 110 | 111 | @requires_cuda 112 | def test_imginpainting_pipeline(): 113 | check_n_free_GBs(n=2.5) 114 | pipe = init_pipeline("stabilityai/stable-diffusion-2-inpainting", [0]) 115 | image = Image.open("./assets/0.png") 116 | # mask image 117 | mask = np.array(image) 118 | mask[:, : image.size[0] // 2] = 0 119 | mask = Image.fromarray(mask) 120 | images = inference( 121 | pipe, 122 | PROMPT, 123 | num_images=1, 124 | num_inference_steps=3, 125 | height=512, 126 | width=512, 127 | masked_image={"image": image, "mask": mask}, 128 | ) 129 | assert len(images) == 1 and images[0].size == (512, 512) 130 | # masked part more diverse than the "fixed" one 131 | res, source = np.array(images[0]), np.array(image) 132 | assert (source[:, : image.size[0] // 2] - res[:, : image.size[0] // 2]).sum() < ( 133 | source[:, image.size[0] // 2 :] - res[:, image.size[0] // 2 :] 134 | ).sum() 135 | 136 | @enable_multiprocessing 137 | def test_imginpainting_pipeline_multiprocessing(): 138 | test_imginpainting_pipeline() -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Tuple, List 3 | import torch 4 | import torch.nn as nn 5 | from PIL import Image 6 | import numpy as np 7 | import multiprocessing 8 | from diffusers import StableDiffusionPipeline 9 | from diffusers.pipelines.stable_diffusion.safety_checker import ( 10 | StableDiffusionSafetyChecker, 11 | ) 12 | from transformers import CLIPFeatureExtractor 13 | from transformers.feature_extraction_utils import BatchFeature 14 | 15 | 16 | def image_grid(imgs, rows, cols): 17 | assert len(imgs) == rows * cols 18 | 19 | w, h = imgs[0].size 20 | grid = Image.new("RGB", size=(cols * w, rows * h)) 21 | grid_w, grid_h = grid.size 22 | 23 | for i, img in enumerate(imgs): 24 | grid.paste(img, box=(i % cols * w, i // cols * h)) 25 | return grid 26 | 27 | 28 | def dummy_checker(images, *args, **kwargs): 29 | # removes nsfw filter 30 | return images, False 31 | 32 | 33 | def dummy_extractor(images, return_tensors="pt"): 34 | # print(type(images), type(images[0])) 35 | if 
type(images) is list: 36 | images = [np.array(img) for img in images] 37 | data = {"pixel_values": images} 38 | return BatchFeature(data=data, tensor_type=return_tensors) 39 | 40 | 41 | def remove_nsfw( 42 | model: StableDiffusionPipeline, 43 | ) -> Tuple[StableDiffusionSafetyChecker, CLIPFeatureExtractor]: 44 | nsfw_model: StableDiffusionSafetyChecker = model.safety_checker 45 | # don't panic is safety_checker is already a dummy 46 | if isinstance(nsfw_model, StableDiffusionSafetyChecker): 47 | nsfw_model = nsfw_model.cpu() 48 | model.safety_checker = dummy_checker 49 | extr = model.feature_extractor 50 | model.feature_extractor = dummy_extractor 51 | return nsfw_model, extr 52 | 53 | 54 | def get_gpu_setting(env_var: str) -> Tuple[bool, List[int]]: 55 | if not torch.cuda.is_available(): 56 | print("GPU not detected! Make sure you have a GPU to reduce inference time!") 57 | return False, [] 58 | # reads user input, returns multi_gpu flag and gpu id(s) 59 | n = torch.cuda.device_count() 60 | if env_var == "all": 61 | gpus = list(range(n)) 62 | elif "," in env_var: 63 | gpus = [int(gnum) for gnum in env_var.split(",") if int(gnum) < n] 64 | else: 65 | gpus = [int(env_var)] 66 | assert len( 67 | gpus 68 | ), f"Make sure to provide valid device ids! You have {n} GPU(s), you can specify the following values: {list(range(n))}" 69 | return len(gpus) > 1, gpus 70 | 71 | 72 | def get_free_memory_Mb(device: int): 73 | # returns (free, total) device memory, in bytes 74 | return torch.cuda.mem_get_info(device)[0] / 2**20 75 | 76 | 77 | def model_size_Mb(model): 78 | # from the legend @ptrblck himself https://discuss.pytorch.org/t/finding-model-size/130275/2 79 | param_size = 0 80 | for param in model.parameters(): 81 | param_size += param.nelement() * param.element_size() 82 | buffer_size = 0 83 | for buffer in model.buffers(): 84 | buffer_size += buffer.nelement() * buffer.element_size() 85 | return (param_size + buffer_size) / 1024**2 86 | 87 | 88 | class ToGPUWrapper(nn.Module, object): 89 | def __init__(self, layer: nn.Module, device: torch.device) -> None: 90 | # composition design, we wrap a nn.Module, change forward 91 | super().__init__() 92 | self.device = device 93 | # move wrapped model to correct device 94 | self.layer = layer.to(device) 95 | 96 | def forward(self, x: torch.Tensor = None, *args, **kwargs): 97 | # move input and output to given device 98 | # print(self.layer.__class__.__name__) 99 | args = [a.to(self.device) if type(a) is torch.Tensor else a for a in args] 100 | for k in kwargs: 101 | if type(kwargs[k]) is torch.Tensor: 102 | kwargs[k] = kwargs[k].to(self.device) 103 | if x is None: 104 | y = self.layer(*args, **kwargs) 105 | else: 106 | y = self.layer(x.to(self.device), *args, **kwargs) 107 | # text model wraps output.. 
this could be made more generic 108 | if self.layer.__class__.__name__ == "CLIPTextModel": 109 | # getting does something like this self.to_tuple()[k] 110 | y.last_hidden_state = y.last_hidden_state.to(self.device) 111 | return y 112 | return y.to(self.device) 113 | 114 | # FIXME this is giving recursion problems 115 | # def __getattr__(self, name: str): 116 | # return getattr(self.layer, name) 117 | 118 | def __iter__(self): 119 | return iter(self.layer) 120 | 121 | def __next__(self): 122 | return next(self.layer) 123 | 124 | def decode(self, z): 125 | # for vae output 126 | return self.layer.decode(z.to(self.device)) 127 | 128 | 129 | class ModelParts2GPUsAssigner: 130 | def __init__( 131 | self, 132 | devices: List[int], 133 | ) -> None: 134 | """ 135 | Finds a valid assignment of model parts (unet, vae..) to available GPUs 136 | using a stochastic brute-force approach. The problem is formulated 137 | as a Integer Linear Programming one: 138 | maximize w^t X with w=[a, b, c, d] 139 | subject to x_1 a + y_1 b + z_1 c + k_1 d \leq v_1 140 | \dots 141 | x_n a + y_n b + z_n c + k_n d \leq v_n 142 | with \sum x_i=\sum y_i=\sum z_i=\sum k_i 143 | x, y, z, k \geq 0 144 | x, y, z, k \in Z^n 145 | 146 | `self.W` represents the memory requirements of each component in which the model is split 147 | into. 148 | `self.G` is a vector of size N, containing the available memory of each device. Available 149 | memory is conservatively taken as 60% of the free memory. 150 | The assignment state I is a Nx4 matrix where I[i,j] represents the number of components j 151 | assigned to GPU i (initially 0). 152 | """ 153 | self.N = len(devices) 154 | # memory "budget" for each device: we consider 90% of the available GPU memory 155 | G = [int(get_free_memory_Mb(d) * 0.9) for d in devices] 156 | print("Free GPU memory (per device): ", G) 157 | # FIXME G is kind of a function of n_models itself, as the more models you have 158 | # the more memory you will be using for storing intermediate results... 159 | self.G = np.array(G, dtype=np.uint16) 160 | # model components memory usage, fixed order: unet_e, unet_d, text_encoder, vae 161 | # TODO make dynamic using `model_size_Mb(model.text_encoder)`, 162 | fp16 = bool(int(os.environ.get("FP16", 1))) 163 | if fp16: 164 | self.W = np.array([666, 975, 235, 160]) 165 | # (peak) memory usage with batch 1 and 512x512 166 | self.W += [2115, 2115, 35, 2115] 167 | else: 168 | # fp32 weights 169 | self.W = np.array([1331, 1949, 470, 320]) 170 | self.W += [2185, 2185, 116, 4280] 171 | 172 | single_model = bool(os.environ.get("SINGLE_MODEL_PARALLEL", False)) 173 | MAX_MODELS = int(os.environ.get("MAX_MODEL_PARALLEL", 12)) 174 | # easy way to ensure single model multiple gpus, useful for debugging 175 | if single_model: 176 | self._max_models = 1 177 | else: 178 | # max number of models you can have considering pooled VRam as it if was a single GPU, 179 | # "upper bounded" by max number of processes 180 | self._max_models = min( 181 | multiprocessing.cpu_count(), 182 | np.floor(self.G.sum() / self.W.sum()), 183 | MAX_MODELS, 184 | ) 185 | if np.floor(self.G.sum() / self.W.sum()) == 0: 186 | raise Exception( 187 | "You don't have enough combined VRam to host a single model! Try to run the container using the FP16 mode." 


class ModelParts2GPUsAssigner:
    def __init__(
        self,
        devices: List[int],
    ) -> None:
        """
        Finds a valid assignment of model parts (unet, vae, ..) to the available GPUs
        using a stochastic brute-force approach. The problem is formulated as an
        Integer Linear Programming one:
            maximize w^T X, with w = [a, b, c, d]
            subject to
                x_1 a + y_1 b + z_1 c + k_1 d <= v_1
                ...
                x_n a + y_n b + z_n c + k_n d <= v_n
            with sum(x_i) = sum(y_i) = sum(z_i) = sum(k_i),
                 x, y, z, k >= 0 and x, y, z, k in Z^n
        i.e. maximizing the total memory assigned amounts to maximizing the number of
        model replicas, since every replica needs one of each component.

        `self.W` holds the memory requirement of each component the model is split into.
        `self.G` is a vector of size N containing the available memory of each device.
        Available memory is conservatively taken as 90% of the free memory.
        The assignment state I is an N x 4 matrix where I[i, j] is the number of
        components j assigned to GPU i (initially all zeros).
        """
        self.N = len(devices)
        # memory "budget" for each device: we consider 90% of the available GPU memory
        G = [int(get_free_memory_Mb(d) * 0.9) for d in devices]
        print("Free GPU memory (per device): ", G)
        # FIXME G is kind of a function of n_models itself, as the more models you have
        # the more memory you will be using for storing intermediate results...
        self.G = np.array(G, dtype=np.uint16)
        # model components memory usage, fixed order: unet_e, unet_d, text_encoder, vae
        # TODO make dynamic using `model_size_Mb(model.text_encoder)`
        fp16 = bool(int(os.environ.get("FP16", 1)))
        if fp16:
            self.W = np.array([666, 975, 235, 160])
            # (peak) memory usage with batch size 1 and 512x512 images
            self.W += [2115, 2115, 35, 2115]
        else:
            # fp32 weights
            self.W = np.array([1331, 1949, 470, 320])
            self.W += [2185, 2185, 116, 4280]

        single_model = bool(os.environ.get("SINGLE_MODEL_PARALLEL", False))
        MAX_MODELS = int(os.environ.get("MAX_MODEL_PARALLEL", 12))
        # easy way to force a single model over multiple gpus, useful for debugging
        if single_model:
            self._max_models = 1
        else:
            # max number of models you can fit considering the pooled VRAM as if it were
            # a single GPU, "upper bounded" by the max number of processes
            self._max_models = min(
                multiprocessing.cpu_count(),
                np.floor(self.G.sum() / self.W.sum()),
                MAX_MODELS,
            )
        if np.floor(self.G.sum() / self.W.sum()) == 0:
            raise Exception(
                "You don't have enough combined VRAM to host a single model! Try running the container in FP16 mode."
            )

    def state_evaluation(self, state: np.ndarray):
        """
        Two conditions must hold for a state to be valid:
        - each model component must appear the same number of times (implicitly
          guaranteed by the way states are generated)
        - the memory allocated on each GPU must not exceed its capacity
        """
        return (state @ self.W <= self.G).all()

    def add_model(self, state: np.ndarray, rnd=True, sample_size=2) -> List[np.ndarray]:
        """
        Takes an assignment state and tries to add a "model" to it: adding a model
        means assigning *each of the 4 components* to a device. It does so by
        brute-force searching for valid assignments that support the addition of
        another model. If no such assignment exists, an empty list is returned.
        The number of valid assignments to collect before stopping the search can be
        changed through `sample_size`.
        Args:
            state (np.ndarray): The initial state from which the search starts.
            rnd (bool, optional): Whether to generate new assignments in a random fashion,
                rather than proceeding "linearly". Defaults to True.
            sample_size (int, optional): The number of valid assignments needed to
                interrupt the search before the whole space is visited. Defaults to 2.
        """

        def get_device_permutation():
            if rnd:
                return np.random.permutation(self.N)
            return np.arange(self.N)

        # beware, this will modify `state` in-place
        valid = []
        # N^4 possible combinations: +1 on cells (a, 0), (b, 1), (c, 2), (d, 3)
        for a in get_device_permutation():
            state[a, 0] += 1
            for b in get_device_permutation():
                state[b, 1] += 1
                for c in get_device_permutation():
                    state[c, 2] += 1
                    for d in get_device_permutation():
                        state[d, 3] += 1
                        # evaluate state: return the first valid one, keep a list of valid ones,
                        # or the one with max "score"? Greedily returning one can't guarantee
                        # finding (one of) the optimum(s)
                        if self.state_evaluation(state):
                            # could be compressed by only storing a, b, c, d..
                            valid.append(state.copy())
                            # note: state isn't backtracked here!
                            if sample_size > 0 and len(valid) >= sample_size:
                                return valid
                        # backtrack!
                        state[d, 3] -= 1
                    state[c, 2] -= 1
                state[b, 1] -= 1
            state[a, 0] -= 1
        return valid
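
    # Worked example with hypothetical numbers: with 2 GPUs, W = [700, 1000, 250, 200]
    # and G = [2200, 2200], the state
    #     [[1, 0, 1, 1],
    #      [0, 1, 0, 0]]
    # places unet_e, the text encoder and the vae on GPU 0 (700 + 250 + 200 <= 2200)
    # and unet_d on GPU 1 (1000 <= 2200); every column sums to 1, so the state encodes
    # exactly one model split across the two devices and state_evaluation returns True.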

    def find_best_assignment(
        self, state: np.ndarray, curr_n_models: int, **kwargs
    ) -> Tuple[int, List[np.ndarray]]:
        """
        Starting from the initial empty assignment, recursively tries to add a model
        to the multi-GPU setup, stopping as soon as that becomes impossible.
        Returns the number of models assigned and the corresponding valid states.
        """
        if curr_n_models >= self._max_models:
            return -1, []
        prev = state.copy()
        valid = self.add_model(state, **kwargs)
        # can't generate valid assignments with an extra model, return the current one
        if not len(valid):
            return curr_n_models, [prev]
        # visit children
        children = []
        for next_state in valid:
            # recurse only on valid states
            depth, ss = self.find_best_assignment(
                next_state, curr_n_models + 1, **kwargs
            )
            if depth > 0 and len(ss):
                children.append((depth, ss))

        # none of the children could add more models
        if not len(children):
            return curr_n_models + 1, valid
        # return the best child, i.e. the one that assigns the most models (the count is returned too)
        return max(children, key=lambda t: t[0])

    def __call__(self) -> List[dict]:
        # initial empty assignment, #GPUs x #model_parts
        I = np.zeros((self.N, 4), dtype=np.uint16)
        # find a valid assignment of the split components to devices
        n_models, ass = self.find_best_assignment(I, 0)
        ass = ass[0]
        print(
            f"Search has found that {n_models} model(s) can be split over {self.N} device(s)!"
        )
        print("Assignment:", ass)
        # format output into a list of {model_component -> device} dicts, one per model to create
        model_ass = [{i: -1 for i in range(4)} for _ in range(n_models)]
        for dev in range(self.N):
            for comp in range(4):
                # this assignment entry might say "component_0 to device_1, 3 times"
                for _ in range(ass[dev, comp]):
                    for m in model_ass:
                        # assign to the first model that doesn't have this component allocated yet
                        if m[comp] == -1:
                            m[comp] = dev
                            break
        return model_ass

--------------------------------------------------------------------------------
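
For reference, a minimal, hypothetical sketch of how these utilities could be wired together at startup. The `DEVICES` environment variable name and the startup location are assumptions made here for illustration; the component order (unet_e, unet_d, text_encoder, vae) follows the comment in `ModelParts2GPUsAssigner.__init__`:

```python
import os

from utils import ModelParts2GPUsAssigner, get_gpu_setting

# which GPUs to use, e.g. "all", "0" or "0,1" (variable name assumed for this sketch)
multi_gpu, devices = get_gpu_setting(os.environ.get("DEVICES", "all"))

if multi_gpu:
    # one dict per model replica, mapping component index -> device id,
    # in the fixed order: unet_e, unet_d, text_encoder, vae
    assignments = ModelParts2GPUsAssigner(devices)()
    for i, ass in enumerate(assignments):
        print(
            f"model {i}: unet_e -> cuda:{ass[0]}, unet_d -> cuda:{ass[1]}, "
            f"text_encoder -> cuda:{ass[2]}, vae -> cuda:{ass[3]}"
        )
```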