├── .gitignore
├── LICENSE
├── README.md
├── app.py
├── assets
│   └── looseControl_teaser.png
├── cross_frame_attention.py
├── loose_controlnet_example
│   ├── comfyui_workflow.json
│   ├── comfyui_workflow_lcm.json
│   └── depth.jpeg
├── loosecontrol.py
└── weight_fusion.py

/.gitignore:
--------------------------------------------------------------------------------
1 | env/
2 | 
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 | 
8 | # C extensions
9 | *.so
10 | 
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 | 
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 | 
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 | 
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 | cover/
55 | 
56 | # Translations
57 | *.mo
58 | *.pot
59 | 
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 | 
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 | 
70 | # Scrapy stuff:
71 | .scrapy
72 | 
73 | # Sphinx documentation
74 | docs/_build/
75 | 
76 | # PyBuilder
77 | .pybuilder/
78 | target/
79 | 
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 | 
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 | 
87 | # pyenv
88 | # For a library or package, you might want to ignore these files since the code is
89 | # intended to run in multiple environments; otherwise, check them in:
90 | # .python-version
91 | 
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 | 
99 | # poetry
100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | # This is especially recommended for binary packages to ensure reproducibility, and is more
102 | # commonly ignored for libraries.
103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | #poetry.lock
105 | 
106 | # pdm
107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108 | #pdm.lock
109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110 | # in version control.
111 | # https://pdm.fming.dev/#use-with-ide
112 | .pdm.toml
113 | 
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 | 
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 | 
121 | # SageMath parsed files
122 | *.sage.py
123 | 
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 | 
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 | 
137 | # Rope project settings
138 | .ropeproject
139 | 
140 | # mkdocs documentation
141 | /site
142 | 
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 | 
148 | # Pyre type checker
149 | .pyre/
150 | 
151 | # pytype static type analyzer
152 | .pytype/
153 | 
154 | # Cython debug symbols
155 | cython_debug/
156 | 
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Shariq F. Bhat
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LooseControlNet: Fused ControlNet Weights from LooseControl
2 | 
3 | [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
4 | 
5 | ## How it works
6 | 
7 | In LooseControl, the authors trained a LoRA of `ControlNet-depth`, but few libraries or frameworks currently support *LoRA of
8 | ControlNet*, so they patched `ControlNetModel` of `diffusers` with `UNet2DConditionLoadersMixin`.
9 | 
10 | However, that code cannot run in frameworks like A1111's WebUI or ComfyUI, so we fused the weights
11 | of `ControlNet-depth` and `LooseControl` to make it work in any framework. For details, please refer to [the script](./weight_fusion.py).
12 | 
13 | > *Important Note:*
14 | > The authors of LooseControl did more than just train a LoRA. Let's not forget that. Please refer to the original paper and code for more
15 | > details.
16 | 
17 | ## Usage
18 | 
19 | Download the fused ControlNet weights from [huggingface](https://huggingface.co/AIRDGempoll/LooseControlNet) and use them
20 | anywhere you can use `ControlNet-depth` (e.g. A1111's WebUI or ComfyUI) to loosely control image generation with depth
21 | images.
22 | 
23 | The [example folder](./loose_controlnet_example) contains a simple ComfyUI workflow for using LooseControlNet.
24 | 
25 | ## Contributing
26 | 
27 | If you like this project, you can contribute by:
28 | 
29 | * Upvoting this [issue](https://github.com/huggingface/diffusers/issues/6354) in the `diffusers` repo, or making a PR to
30 |   resolve it.
31 | * Bringing the consistency mechanisms devised in LooseControl to frameworks like A1111's WebUI or ComfyUI.
32 | * Bringing box editors to frameworks like A1111's WebUI or ComfyUI.
33 | * Training a better LooseControlNet.
34 | 
35 | ## Licenses
36 | 
37 | The extra code we add is released under the MIT License, following the original MIT License of LooseControl. The fused
38 | weights are released under the Apache 2.0 License, following the Apache 2.0 License of ControlNet.
39 | 
40 | ## References
41 | 
42 | ### LooseControl
43 | 
44 | LooseControl is presented in:
45 | > #### [LooseControl: Lifting ControlNet for Generalized Depth Conditioning](#)
46 | > ##### [Shariq Farooq Bhat](https://shariqfarooq123.github.io), [Niloy J. Mitra](http://www0.cs.ucl.ac.uk/staff/n.mitra/), [Peter Wonka](http://peterwonka.net/)
47 | >
48 | 
49 | [[Project Page]](https://shariqfarooq123.github.io/loose-control/) [[Paper]](https://arxiv.org/abs/2312.03079) [[Demo 🤗]](https://huggingface.co/spaces/shariqfarooq/LooseControl) [[Weights (3D Box Control)]](https://huggingface.co/shariqfarooq/loose-control-3dbox)
50 | 
51 | ![teaser](assets/looseControl_teaser.png)
52 | 
53 | #### Citation
54 | 
55 | ```bibtex
56 | @misc{bhat2023loosecontrol,
57 |     title={LooseControl: Lifting ControlNet for Generalized Depth Conditioning},
58 |     author={Shariq Farooq Bhat and Niloy J. Mitra and Peter Wonka},
59 |     year={2023},
60 |     eprint={2312.03079},
61 |     archivePrefix={arXiv},
62 |     primaryClass={cs.CV}
63 | }
64 | ```
65 | 
66 | ### ControlNet
67 | 
68 | Please refer to its official [repository](https://github.com/lllyasviel/ControlNet) for more details.
69 | 
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | from dataclasses import dataclass
3 | import PIL
4 | import PIL.Image
5 | 
6 | import torch
7 | import numpy as np
8 | from gradio_editor3d import Editor3D as g3deditor
9 | import copy
10 | from loosecontrol import LooseControlNet
11 | 
12 | 
13 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
14 | cn = LooseControlNet()
15 | cn.pipe = cn.pipe.to(torch_device=device, torch_dtype=torch.float16)
16 | 
17 | # Need to figure out a better way to do this per user, making 'cf attention' act like a state per user.
18 | # For now, we just copy the model.
19 | cn_with_cf = copy.deepcopy(cn)
20 | cn_with_cf.set_cf_attention()
21 | 
22 | 
23 | @dataclass
24 | class FixedInputs:
25 |     prompt: str
26 |     seed: int
27 |     depth: PIL.Image.Image
28 | 
29 | 
30 | negative_prompt = "blurry, text, caption, lowquality, lowresolution, low res, grainy, ugly"
31 | def depth2image(prompt, seed, depth):
32 |     seed = int(seed)
33 |     gen = cn(prompt, control_image=depth, controlnet_conditioning_scale=1.0, generator=torch.Generator().manual_seed(seed), num_inference_steps=20, negative_prompt=negative_prompt)
34 |     return gen
35 | 
36 | def edit_previous(prompt, seed, depth, fixed_inputs):
37 |     seed = int(seed)
38 |     control_image = [fixed_inputs.depth, depth]
39 |     prompt = [fixed_inputs.prompt, prompt]
40 |     neg_prompt = [negative_prompt, negative_prompt]
41 |     generator = [torch.Generator().manual_seed(fixed_inputs.seed), torch.Generator().manual_seed(seed)]
42 |     gen = cn_with_cf(prompt, control_image=control_image, controlnet_conditioning_scale=1.0, generator=generator, num_inference_steps=20, negative_prompt=neg_prompt)[-1]
43 |     return gen
44 | 
45 | def run(prompt, seed, depth, should_edit, fixed_inputs):
46 |     depth = depth.convert("RGB")
47 |     # all values below [3,3,3] in depth should actually be set to [255,255,255]
48 |     # This is due to the nature of the training data and is experimental right now.
49 |     # Not in use for now.
50 |     # depth = np.array(depth)
51 |     # depth[depth < 3] = 255
52 |     # depth = PIL.Image.fromarray(depth)
53 | 
54 |     fixed_inputs = fixed_inputs[0]
55 |     if should_edit and fixed_inputs is not None:
56 |         return edit_previous(prompt, seed, depth, fixed_inputs)
57 |     else:
58 |         return depth2image(prompt, seed, depth)
59 | 
60 | def handle_edit_change(edit, prompt, seed, image_input, fixed_inputs):
61 |     if edit:
62 |         fixed_inputs[0] = FixedInputs(prompt, int(seed), image_input)
63 |     else:
64 |         fixed_inputs[0] = None
65 |     return fixed_inputs
66 | 
67 | 
68 | css = """
69 | 
70 | #image_output {
71 |     width: 512px;
72 |     height: 512px;
73 | }"""
74 | 
75 | 
76 | main_description = """
77 | # LooseControl
78 | 
79 | This is the official demo for the paper [LooseControl: Lifting ControlNet for Generalized Depth Conditioning](https://shariqfarooq123.github.io/loose-control/).
80 | Our 3D Box Editing allows users to interactively edit the 3D boxes representing objects in the scene. Users can change the position, size, and orientation of 3D boxes, allowing them to quickly create and edit scenes to their liking in a 3D-aware manner.
81 | Best viewed on desktop.
82 | """
83 | 
84 | instructions_editor3d = """
85 | ## Instructions for Editor3D UI
86 | - Use 'WASD' keys to move the camera.
87 | - Click on an object to select it.
88 | - Use the sliders to change the position, size, and orientation of the selected object. Sliders support click and drag for faster editing.
89 | - Use the 'Add Box', 'Delete', and 'Duplicate' buttons to add, delete, and duplicate objects.
90 | - Delete and Duplicate buttons work on the selected object. Duplicate creates a copy and selects it.
91 | - Use the 'Toggle Mode' button to switch between "normal" and "depth" mode. The final image sent to the model should be in "depth" mode.
92 | - Use the 'Render' button to render the scene and send it to the model for generation.
93 | 
94 | ### Lock style checkbox - Fixes the style of the latest generated image.
95 | This allows users to edit the 3D boxes without changing the style of the generated image.
This is useful when the user is satisfied with the style/content of the generated image and wants to edit the 3D boxes without changing the overall essence of the scene. 96 | It can be used to create stop motion videos like those shown [here](https://shariqfarooq123.github.io/loose-control/). 97 | 98 | """ 99 | 100 | 101 | 102 | with gr.Blocks(css=css) as demo: 103 | gr.Markdown(main_description) 104 | 105 | fixed_inputs = gr.State([None]) 106 | with gr.Row(): 107 | prompt = gr.Textbox(label="Prompt", placeholder="Write your prompt", elem_id="input") 108 | seed = gr.Textbox(value=42, label="Seed", elem_id="seed") 109 | should_edit = gr.Checkbox(label="Lock style", elem_id="edit") 110 | 111 | with gr.Row(): 112 | image_input = g3deditor(elem_id="image_input") 113 | 114 | with gr.Row(): 115 | image_output = gr.Image(elem_id="image_output", type='pil') 116 | 117 | should_edit.change(fn=handle_edit_change, inputs=[should_edit, prompt, seed, image_input, fixed_inputs], outputs=[fixed_inputs]) 118 | image_input.change(fn=run, inputs=[prompt, seed, image_input, should_edit, fixed_inputs], outputs=[image_output]) 119 | with gr.Accordion("Instructions"): 120 | gr.Markdown(instructions_editor3d) 121 | 122 | demo.queue().launch() 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /assets/looseControl_teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GempollAI/LooseControlNet/d9e7989e454f546cbc43df806cf04b0b6b696d05/assets/looseControl_teaser.png -------------------------------------------------------------------------------- /cross_frame_attention.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/Picsart-AI-Research/Text2Video-Zero 2 | import torch 3 | from einops import rearrange 4 | 5 | class CrossFrameAttnProcessor: 6 | def __init__(self, unet_chunk_size=2): 7 | self.unet_chunk_size = unet_chunk_size 8 | 9 | def __call__( 10 | self, 11 | attn, 12 | hidden_states, 13 | encoder_hidden_states=None, 14 | attention_mask=None, **kwargs): 15 | batch_size, sequence_length, _ = hidden_states.shape 16 | attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 17 | query = attn.to_q(hidden_states) 18 | 19 | is_cross_attention = encoder_hidden_states is not None 20 | if encoder_hidden_states is None: 21 | encoder_hidden_states = hidden_states 22 | elif attn.norm_cross: 23 | encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) 24 | key = attn.to_k(encoder_hidden_states) 25 | value = attn.to_v(encoder_hidden_states) 26 | # Sparse Attention 27 | if not is_cross_attention: 28 | video_length = key.size()[0] // self.unet_chunk_size 29 | # print("Video length is", video_length) 30 | # former_frame_index = torch.arange(video_length) - 1 31 | # former_frame_index[0] = 0 32 | former_frame_index = [0] * video_length 33 | key = rearrange(key, "(b f) d c -> b f d c", f=video_length) 34 | key = key[:, former_frame_index] 35 | key = rearrange(key, "b f d c -> (b f) d c") 36 | value = rearrange(value, "(b f) d c -> b f d c", f=video_length) 37 | value = value[:, former_frame_index] 38 | value = rearrange(value, "b f d c -> (b f) d c") 39 | 40 | query = attn.head_to_batch_dim(query) 41 | key = attn.head_to_batch_dim(key) 42 | value = attn.head_to_batch_dim(value) 43 | 44 | attention_probs = attn.get_attention_scores(query, key, attention_mask) 45 | hidden_states = 
torch.bmm(attention_probs, value) 46 | hidden_states = attn.batch_to_head_dim(hidden_states) 47 | 48 | # linear proj 49 | hidden_states = attn.to_out[0](hidden_states) 50 | # dropout 51 | hidden_states = attn.to_out[1](hidden_states) 52 | 53 | return hidden_states 54 | 55 | 56 | 57 | class AttnProcessorX: 58 | r""" 59 | Default processor for performing attention-related computations. 60 | """ 61 | 62 | def __call__( 63 | self, 64 | attn, 65 | hidden_states, 66 | encoder_hidden_states=None, 67 | attention_mask=None, 68 | temb=None, 69 | scale=1.0, 70 | ): 71 | residual = hidden_states 72 | 73 | if attn.spatial_norm is not None: 74 | hidden_states = attn.spatial_norm(hidden_states, temb) 75 | 76 | input_ndim = hidden_states.ndim 77 | 78 | if input_ndim == 4: 79 | batch_size, channel, height, width = hidden_states.shape 80 | hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) 81 | 82 | batch_size, sequence_length, _ = ( 83 | hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape 84 | ) 85 | attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 86 | 87 | if attn.group_norm is not None: 88 | hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 89 | 90 | query = attn.to_q(hidden_states, scale=scale) 91 | 92 | if encoder_hidden_states is None: 93 | encoder_hidden_states = hidden_states 94 | elif attn.norm_cross: 95 | encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) 96 | 97 | key = attn.to_k(encoder_hidden_states, scale=scale) 98 | value = attn.to_v(encoder_hidden_states, scale=scale) 99 | 100 | query = attn.head_to_batch_dim(query) 101 | key = attn.head_to_batch_dim(key) 102 | value = attn.head_to_batch_dim(value) 103 | 104 | attention_probs = attn.get_attention_scores(query, key, attention_mask) 105 | hidden_states = torch.bmm(attention_probs, value) 106 | hidden_states = attn.batch_to_head_dim(hidden_states) 107 | 108 | # linear proj 109 | hidden_states = attn.to_out[0](hidden_states, scale=scale) 110 | # dropout 111 | hidden_states = attn.to_out[1](hidden_states) 112 | 113 | if input_ndim == 4: 114 | hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) 115 | 116 | if attn.residual_connection: 117 | hidden_states = hidden_states + residual 118 | 119 | hidden_states = hidden_states / attn.rescale_output_factor 120 | 121 | return hidden_states 122 | -------------------------------------------------------------------------------- /loose_controlnet_example/comfyui_workflow.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 17, 3 | "last_link_id": 16, 4 | "nodes": [ 5 | { 6 | "id": 4, 7 | "type": "CheckpointLoaderSimple", 8 | "pos": [ 9 | -67, 10 | 31 11 | ], 12 | "size": { 13 | "0": 315, 14 | "1": 98 15 | }, 16 | "flags": {}, 17 | "order": 0, 18 | "mode": 0, 19 | "outputs": [ 20 | { 21 | "name": "MODEL", 22 | "type": "MODEL", 23 | "links": [ 24 | 1 25 | ], 26 | "slot_index": 0, 27 | "label": "MODEL" 28 | }, 29 | { 30 | "name": "CLIP", 31 | "type": "CLIP", 32 | "links": [ 33 | 3, 34 | 5 35 | ], 36 | "slot_index": 1, 37 | "label": "CLIP" 38 | }, 39 | { 40 | "name": "VAE", 41 | "type": "VAE", 42 | "links": [ 43 | 8 44 | ], 45 | "slot_index": 2, 46 | "label": "VAE" 47 | } 48 | ], 49 | "properties": { 50 | "Node name for S&R": "CheckpointLoaderSimple" 51 | }, 52 | "widgets_values": [ 53 | "revAnimated_v11.safetensors" 54 | ] 55 | }, 56 | { 57 | 
"id": 16, 58 | "type": "LoadImage", 59 | "pos": [ 60 | -91, 61 | 688 62 | ], 63 | "size": [ 64 | 315, 65 | 314 66 | ], 67 | "flags": {}, 68 | "order": 1, 69 | "mode": 0, 70 | "outputs": [ 71 | { 72 | "name": "IMAGE", 73 | "type": "IMAGE", 74 | "links": [ 75 | 14 76 | ], 77 | "shape": 3, 78 | "label": "IMAGE", 79 | "slot_index": 0 80 | }, 81 | { 82 | "name": "MASK", 83 | "type": "MASK", 84 | "links": null, 85 | "shape": 3, 86 | "label": "MASK" 87 | } 88 | ], 89 | "properties": { 90 | "Node name for S&R": "LoadImage" 91 | }, 92 | "widgets_values": [ 93 | "depth.jpeg", 94 | "image" 95 | ] 96 | }, 97 | { 98 | "id": 6, 99 | "type": "CLIPTextEncode", 100 | "pos": [ 101 | 384, 102 | -28 103 | ], 104 | "size": { 105 | "0": 422.84503173828125, 106 | "1": 164.31304931640625 107 | }, 108 | "flags": {}, 109 | "order": 4, 110 | "mode": 0, 111 | "inputs": [ 112 | { 113 | "name": "clip", 114 | "type": "CLIP", 115 | "link": 3, 116 | "label": "clip" 117 | } 118 | ], 119 | "outputs": [ 120 | { 121 | "name": "CONDITIONING", 122 | "type": "CONDITIONING", 123 | "links": [ 124 | 10 125 | ], 126 | "slot_index": 0, 127 | "label": "CONDITIONING" 128 | } 129 | ], 130 | "properties": { 131 | "Node name for S&R": "CLIPTextEncode" 132 | }, 133 | "widgets_values": [ 134 | "Sofa in a living room, masterpiece, photorealistic, 8k" 135 | ] 136 | }, 137 | { 138 | "id": 5, 139 | "type": "EmptyLatentImage", 140 | "pos": [ 141 | 424, 142 | 484 143 | ], 144 | "size": { 145 | "0": 315, 146 | "1": 106 147 | }, 148 | "flags": {}, 149 | "order": 2, 150 | "mode": 0, 151 | "outputs": [ 152 | { 153 | "name": "LATENT", 154 | "type": "LATENT", 155 | "links": [ 156 | 2 157 | ], 158 | "slot_index": 0, 159 | "label": "LATENT" 160 | } 161 | ], 162 | "properties": { 163 | "Node name for S&R": "EmptyLatentImage" 164 | }, 165 | "widgets_values": [ 166 | 960, 167 | 544, 168 | 1 169 | ] 170 | }, 171 | { 172 | "id": 3, 173 | "type": "KSampler", 174 | "pos": [ 175 | 1337, 176 | 284 177 | ], 178 | "size": { 179 | "0": 315, 180 | "1": 262 181 | }, 182 | "flags": {}, 183 | "order": 7, 184 | "mode": 0, 185 | "inputs": [ 186 | { 187 | "name": "model", 188 | "type": "MODEL", 189 | "link": 1, 190 | "label": "model" 191 | }, 192 | { 193 | "name": "positive", 194 | "type": "CONDITIONING", 195 | "link": 15, 196 | "label": "positive" 197 | }, 198 | { 199 | "name": "negative", 200 | "type": "CONDITIONING", 201 | "link": 6, 202 | "label": "negative" 203 | }, 204 | { 205 | "name": "latent_image", 206 | "type": "LATENT", 207 | "link": 2, 208 | "label": "latent_image" 209 | } 210 | ], 211 | "outputs": [ 212 | { 213 | "name": "LATENT", 214 | "type": "LATENT", 215 | "links": [ 216 | 7 217 | ], 218 | "slot_index": 0, 219 | "label": "LATENT" 220 | } 221 | ], 222 | "properties": { 223 | "Node name for S&R": "KSampler" 224 | }, 225 | "widgets_values": [ 226 | 485278465394722, 227 | "randomize", 228 | 30, 229 | 7, 230 | "euler", 231 | "normal", 232 | 1 233 | ] 234 | }, 235 | { 236 | "id": 8, 237 | "type": "VAEDecode", 238 | "pos": [ 239 | 1687, 240 | 42 241 | ], 242 | "size": { 243 | "0": 210, 244 | "1": 46 245 | }, 246 | "flags": {}, 247 | "order": 8, 248 | "mode": 0, 249 | "inputs": [ 250 | { 251 | "name": "samples", 252 | "type": "LATENT", 253 | "link": 7, 254 | "label": "samples" 255 | }, 256 | { 257 | "name": "vae", 258 | "type": "VAE", 259 | "link": 8, 260 | "label": "vae" 261 | } 262 | ], 263 | "outputs": [ 264 | { 265 | "name": "IMAGE", 266 | "type": "IMAGE", 267 | "links": [ 268 | 16 269 | ], 270 | "slot_index": 0, 271 | "label": "IMAGE" 272 | } 273 | ], 274 | 
"properties": { 275 | "Node name for S&R": "VAEDecode" 276 | } 277 | }, 278 | { 279 | "id": 12, 280 | "type": "ControlNetApply", 281 | "pos": [ 282 | 943, 283 | 677 284 | ], 285 | "size": { 286 | "0": 317.4000244140625, 287 | "1": 98 288 | }, 289 | "flags": {}, 290 | "order": 6, 291 | "mode": 0, 292 | "inputs": [ 293 | { 294 | "name": "conditioning", 295 | "type": "CONDITIONING", 296 | "link": 10, 297 | "label": "conditioning" 298 | }, 299 | { 300 | "name": "control_net", 301 | "type": "CONTROL_NET", 302 | "link": 13, 303 | "label": "control_net" 304 | }, 305 | { 306 | "name": "image", 307 | "type": "IMAGE", 308 | "link": 14, 309 | "label": "image", 310 | "slot_index": 2 311 | } 312 | ], 313 | "outputs": [ 314 | { 315 | "name": "CONDITIONING", 316 | "type": "CONDITIONING", 317 | "links": [ 318 | 15 319 | ], 320 | "shape": 3, 321 | "label": "CONDITIONING", 322 | "slot_index": 0 323 | } 324 | ], 325 | "properties": { 326 | "Node name for S&R": "ControlNetApply" 327 | }, 328 | "widgets_values": [ 329 | 0.8 330 | ] 331 | }, 332 | { 333 | "id": 17, 334 | "type": "PreviewImage", 335 | "pos": [ 336 | 2095, 337 | 42 338 | ], 339 | "size": [ 340 | 210, 341 | 246 342 | ], 343 | "flags": {}, 344 | "order": 9, 345 | "mode": 0, 346 | "inputs": [ 347 | { 348 | "name": "images", 349 | "type": "IMAGE", 350 | "link": 16, 351 | "label": "images" 352 | } 353 | ], 354 | "properties": { 355 | "Node name for S&R": "PreviewImage" 356 | } 357 | }, 358 | { 359 | "id": 15, 360 | "type": "ControlNetLoader", 361 | "pos": [ 362 | -94, 363 | 574 364 | ], 365 | "size": { 366 | "0": 315, 367 | "1": 58 368 | }, 369 | "flags": {}, 370 | "order": 3, 371 | "mode": 0, 372 | "outputs": [ 373 | { 374 | "name": "CONTROL_NET", 375 | "type": "CONTROL_NET", 376 | "links": [ 377 | 13 378 | ], 379 | "shape": 3, 380 | "label": "CONTROL_NET", 381 | "slot_index": 0 382 | } 383 | ], 384 | "properties": { 385 | "Node name for S&R": "ControlNetLoader" 386 | }, 387 | "widgets_values": [ 388 | "loose_controlnet.safetensors" 389 | ] 390 | }, 391 | { 392 | "id": 7, 393 | "type": "CLIPTextEncode", 394 | "pos": [ 395 | 395, 396 | 224 397 | ], 398 | "size": { 399 | "0": 425.27801513671875, 400 | "1": 180.6060791015625 401 | }, 402 | "flags": {}, 403 | "order": 5, 404 | "mode": 0, 405 | "inputs": [ 406 | { 407 | "name": "clip", 408 | "type": "CLIP", 409 | "link": 5, 410 | "label": "clip" 411 | } 412 | ], 413 | "outputs": [ 414 | { 415 | "name": "CONDITIONING", 416 | "type": "CONDITIONING", 417 | "links": [ 418 | 6 419 | ], 420 | "slot_index": 0, 421 | "label": "CONDITIONING" 422 | } 423 | ], 424 | "properties": { 425 | "Node name for S&R": "CLIPTextEncode" 426 | }, 427 | "widgets_values": [ 428 | "text, watermark, blur" 429 | ] 430 | } 431 | ], 432 | "links": [ 433 | [ 434 | 1, 435 | 4, 436 | 0, 437 | 3, 438 | 0, 439 | "MODEL" 440 | ], 441 | [ 442 | 2, 443 | 5, 444 | 0, 445 | 3, 446 | 3, 447 | "LATENT" 448 | ], 449 | [ 450 | 3, 451 | 4, 452 | 1, 453 | 6, 454 | 0, 455 | "CLIP" 456 | ], 457 | [ 458 | 5, 459 | 4, 460 | 1, 461 | 7, 462 | 0, 463 | "CLIP" 464 | ], 465 | [ 466 | 6, 467 | 7, 468 | 0, 469 | 3, 470 | 2, 471 | "CONDITIONING" 472 | ], 473 | [ 474 | 7, 475 | 3, 476 | 0, 477 | 8, 478 | 0, 479 | "LATENT" 480 | ], 481 | [ 482 | 8, 483 | 4, 484 | 2, 485 | 8, 486 | 1, 487 | "VAE" 488 | ], 489 | [ 490 | 10, 491 | 6, 492 | 0, 493 | 12, 494 | 0, 495 | "CONDITIONING" 496 | ], 497 | [ 498 | 13, 499 | 15, 500 | 0, 501 | 12, 502 | 1, 503 | "CONTROL_NET" 504 | ], 505 | [ 506 | 14, 507 | 16, 508 | 0, 509 | 12, 510 | 2, 511 | "IMAGE" 512 | ], 513 | [ 514 | 
15, 515 | 12, 516 | 0, 517 | 3, 518 | 1, 519 | "CONDITIONING" 520 | ], 521 | [ 522 | 16, 523 | 8, 524 | 0, 525 | 17, 526 | 0, 527 | "IMAGE" 528 | ] 529 | ], 530 | "groups": [], 531 | "config": {}, 532 | "extra": {}, 533 | "version": 0.4 534 | } -------------------------------------------------------------------------------- /loose_controlnet_example/comfyui_workflow_lcm.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 18, 3 | "last_link_id": 21, 4 | "nodes": [ 5 | { 6 | "id": 16, 7 | "type": "LoadImage", 8 | "pos": [ 9 | -91, 10 | 688 11 | ], 12 | "size": [ 13 | 315, 14 | 314 15 | ], 16 | "flags": {}, 17 | "order": 0, 18 | "mode": 0, 19 | "outputs": [ 20 | { 21 | "name": "IMAGE", 22 | "type": "IMAGE", 23 | "links": [ 24 | 14 25 | ], 26 | "shape": 3, 27 | "label": "IMAGE", 28 | "slot_index": 0 29 | }, 30 | { 31 | "name": "MASK", 32 | "type": "MASK", 33 | "links": null, 34 | "shape": 3, 35 | "label": "MASK" 36 | } 37 | ], 38 | "properties": { 39 | "Node name for S&R": "LoadImage" 40 | }, 41 | "widgets_values": [ 42 | "depth.jpeg", 43 | "image" 44 | ] 45 | }, 46 | { 47 | "id": 6, 48 | "type": "CLIPTextEncode", 49 | "pos": [ 50 | 384, 51 | -28 52 | ], 53 | "size": { 54 | "0": 422.84503173828125, 55 | "1": 164.31304931640625 56 | }, 57 | "flags": {}, 58 | "order": 5, 59 | "mode": 0, 60 | "inputs": [ 61 | { 62 | "name": "clip", 63 | "type": "CLIP", 64 | "link": 19, 65 | "label": "clip" 66 | } 67 | ], 68 | "outputs": [ 69 | { 70 | "name": "CONDITIONING", 71 | "type": "CONDITIONING", 72 | "links": [ 73 | 10 74 | ], 75 | "slot_index": 0, 76 | "label": "CONDITIONING" 77 | } 78 | ], 79 | "properties": { 80 | "Node name for S&R": "CLIPTextEncode" 81 | }, 82 | "widgets_values": [ 83 | "Sofa in a living room, masterpiece, photorealistic, 8k" 84 | ] 85 | }, 86 | { 87 | "id": 5, 88 | "type": "EmptyLatentImage", 89 | "pos": [ 90 | 424, 91 | 484 92 | ], 93 | "size": { 94 | "0": 315, 95 | "1": 106 96 | }, 97 | "flags": {}, 98 | "order": 1, 99 | "mode": 0, 100 | "outputs": [ 101 | { 102 | "name": "LATENT", 103 | "type": "LATENT", 104 | "links": [ 105 | 2 106 | ], 107 | "slot_index": 0, 108 | "label": "LATENT" 109 | } 110 | ], 111 | "properties": { 112 | "Node name for S&R": "EmptyLatentImage" 113 | }, 114 | "widgets_values": [ 115 | 960, 116 | 544, 117 | 1 118 | ] 119 | }, 120 | { 121 | "id": 15, 122 | "type": "ControlNetLoader", 123 | "pos": [ 124 | -94, 125 | 574 126 | ], 127 | "size": { 128 | "0": 315, 129 | "1": 58 130 | }, 131 | "flags": {}, 132 | "order": 2, 133 | "mode": 0, 134 | "outputs": [ 135 | { 136 | "name": "CONTROL_NET", 137 | "type": "CONTROL_NET", 138 | "links": [ 139 | 13 140 | ], 141 | "shape": 3, 142 | "label": "CONTROL_NET", 143 | "slot_index": 0 144 | } 145 | ], 146 | "properties": { 147 | "Node name for S&R": "ControlNetLoader" 148 | }, 149 | "widgets_values": [ 150 | "loose_controlnet.safetensors" 151 | ] 152 | }, 153 | { 154 | "id": 7, 155 | "type": "CLIPTextEncode", 156 | "pos": [ 157 | 395, 158 | 224 159 | ], 160 | "size": { 161 | "0": 425.27801513671875, 162 | "1": 180.6060791015625 163 | }, 164 | "flags": {}, 165 | "order": 6, 166 | "mode": 0, 167 | "inputs": [ 168 | { 169 | "name": "clip", 170 | "type": "CLIP", 171 | "link": 20, 172 | "label": "clip" 173 | } 174 | ], 175 | "outputs": [ 176 | { 177 | "name": "CONDITIONING", 178 | "type": "CONDITIONING", 179 | "links": [ 180 | 6 181 | ], 182 | "slot_index": 0, 183 | "label": "CONDITIONING" 184 | } 185 | ], 186 | "properties": { 187 | "Node name for S&R": 
"CLIPTextEncode" 188 | }, 189 | "widgets_values": [ 190 | "text, watermark, blur" 191 | ] 192 | }, 193 | { 194 | "id": 8, 195 | "type": "VAEDecode", 196 | "pos": [ 197 | 1724, 198 | 81 199 | ], 200 | "size": { 201 | "0": 210, 202 | "1": 46 203 | }, 204 | "flags": {}, 205 | "order": 9, 206 | "mode": 0, 207 | "inputs": [ 208 | { 209 | "name": "samples", 210 | "type": "LATENT", 211 | "link": 7, 212 | "label": "samples" 213 | }, 214 | { 215 | "name": "vae", 216 | "type": "VAE", 217 | "link": 8, 218 | "label": "vae" 219 | } 220 | ], 221 | "outputs": [ 222 | { 223 | "name": "IMAGE", 224 | "type": "IMAGE", 225 | "links": [ 226 | 16 227 | ], 228 | "slot_index": 0, 229 | "label": "IMAGE" 230 | } 231 | ], 232 | "properties": { 233 | "Node name for S&R": "VAEDecode" 234 | } 235 | }, 236 | { 237 | "id": 3, 238 | "type": "KSampler", 239 | "pos": [ 240 | 1387, 241 | 286 242 | ], 243 | "size": { 244 | "0": 315, 245 | "1": 262 246 | }, 247 | "flags": {}, 248 | "order": 8, 249 | "mode": 0, 250 | "inputs": [ 251 | { 252 | "name": "model", 253 | "type": "MODEL", 254 | "link": 21, 255 | "label": "model" 256 | }, 257 | { 258 | "name": "positive", 259 | "type": "CONDITIONING", 260 | "link": 15, 261 | "label": "positive" 262 | }, 263 | { 264 | "name": "negative", 265 | "type": "CONDITIONING", 266 | "link": 6, 267 | "label": "negative" 268 | }, 269 | { 270 | "name": "latent_image", 271 | "type": "LATENT", 272 | "link": 2, 273 | "label": "latent_image" 274 | } 275 | ], 276 | "outputs": [ 277 | { 278 | "name": "LATENT", 279 | "type": "LATENT", 280 | "links": [ 281 | 7 282 | ], 283 | "slot_index": 0, 284 | "label": "LATENT" 285 | } 286 | ], 287 | "properties": { 288 | "Node name for S&R": "KSampler" 289 | }, 290 | "widgets_values": [ 291 | 63675398705672, 292 | "randomize", 293 | 5, 294 | 1.5, 295 | "lcm", 296 | "normal", 297 | 1 298 | ] 299 | }, 300 | { 301 | "id": 17, 302 | "type": "PreviewImage", 303 | "pos": [ 304 | 2154, 305 | 89 306 | ], 307 | "size": [ 308 | 210, 309 | 246 310 | ], 311 | "flags": {}, 312 | "order": 10, 313 | "mode": 0, 314 | "inputs": [ 315 | { 316 | "name": "images", 317 | "type": "IMAGE", 318 | "link": 16, 319 | "label": "images" 320 | } 321 | ], 322 | "properties": { 323 | "Node name for S&R": "PreviewImage" 324 | } 325 | }, 326 | { 327 | "id": 18, 328 | "type": "LoraLoader", 329 | "pos": [ 330 | -297, 331 | -3 332 | ], 333 | "size": { 334 | "0": 315, 335 | "1": 126 336 | }, 337 | "flags": {}, 338 | "order": 4, 339 | "mode": 0, 340 | "inputs": [ 341 | { 342 | "name": "model", 343 | "type": "MODEL", 344 | "link": 17, 345 | "label": "model" 346 | }, 347 | { 348 | "name": "clip", 349 | "type": "CLIP", 350 | "link": 18, 351 | "label": "clip" 352 | } 353 | ], 354 | "outputs": [ 355 | { 356 | "name": "MODEL", 357 | "type": "MODEL", 358 | "links": [ 359 | 21 360 | ], 361 | "shape": 3, 362 | "label": "MODEL", 363 | "slot_index": 0 364 | }, 365 | { 366 | "name": "CLIP", 367 | "type": "CLIP", 368 | "links": [ 369 | 19, 370 | 20 371 | ], 372 | "shape": 3, 373 | "label": "CLIP", 374 | "slot_index": 1 375 | } 376 | ], 377 | "properties": { 378 | "Node name for S&R": "LoraLoader" 379 | }, 380 | "widgets_values": [ 381 | "LCM_LoRA_Weights_SD15.safetensors", 382 | 1, 383 | 1 384 | ] 385 | }, 386 | { 387 | "id": 4, 388 | "type": "CheckpointLoaderSimple", 389 | "pos": [ 390 | -763, 391 | -6 392 | ], 393 | "size": { 394 | "0": 315, 395 | "1": 98 396 | }, 397 | "flags": {}, 398 | "order": 3, 399 | "mode": 0, 400 | "outputs": [ 401 | { 402 | "name": "MODEL", 403 | "type": "MODEL", 404 | "links": [ 405 | 17 406 
| ], 407 | "slot_index": 0, 408 | "label": "MODEL" 409 | }, 410 | { 411 | "name": "CLIP", 412 | "type": "CLIP", 413 | "links": [ 414 | 18 415 | ], 416 | "slot_index": 1, 417 | "label": "CLIP" 418 | }, 419 | { 420 | "name": "VAE", 421 | "type": "VAE", 422 | "links": [ 423 | 8 424 | ], 425 | "slot_index": 2, 426 | "label": "VAE" 427 | } 428 | ], 429 | "properties": { 430 | "Node name for S&R": "CheckpointLoaderSimple" 431 | }, 432 | "widgets_values": [ 433 | "revAnimated_v11.safetensors" 434 | ] 435 | }, 436 | { 437 | "id": 12, 438 | "type": "ControlNetApply", 439 | "pos": [ 440 | 943, 441 | 677 442 | ], 443 | "size": { 444 | "0": 317.4000244140625, 445 | "1": 98 446 | }, 447 | "flags": {}, 448 | "order": 7, 449 | "mode": 0, 450 | "inputs": [ 451 | { 452 | "name": "conditioning", 453 | "type": "CONDITIONING", 454 | "link": 10, 455 | "label": "conditioning" 456 | }, 457 | { 458 | "name": "control_net", 459 | "type": "CONTROL_NET", 460 | "link": 13, 461 | "label": "control_net" 462 | }, 463 | { 464 | "name": "image", 465 | "type": "IMAGE", 466 | "link": 14, 467 | "label": "image", 468 | "slot_index": 2 469 | } 470 | ], 471 | "outputs": [ 472 | { 473 | "name": "CONDITIONING", 474 | "type": "CONDITIONING", 475 | "links": [ 476 | 15 477 | ], 478 | "shape": 3, 479 | "label": "CONDITIONING", 480 | "slot_index": 0 481 | } 482 | ], 483 | "properties": { 484 | "Node name for S&R": "ControlNetApply" 485 | }, 486 | "widgets_values": [ 487 | 0.7000000000000001 488 | ] 489 | } 490 | ], 491 | "links": [ 492 | [ 493 | 2, 494 | 5, 495 | 0, 496 | 3, 497 | 3, 498 | "LATENT" 499 | ], 500 | [ 501 | 6, 502 | 7, 503 | 0, 504 | 3, 505 | 2, 506 | "CONDITIONING" 507 | ], 508 | [ 509 | 7, 510 | 3, 511 | 0, 512 | 8, 513 | 0, 514 | "LATENT" 515 | ], 516 | [ 517 | 8, 518 | 4, 519 | 2, 520 | 8, 521 | 1, 522 | "VAE" 523 | ], 524 | [ 525 | 10, 526 | 6, 527 | 0, 528 | 12, 529 | 0, 530 | "CONDITIONING" 531 | ], 532 | [ 533 | 13, 534 | 15, 535 | 0, 536 | 12, 537 | 1, 538 | "CONTROL_NET" 539 | ], 540 | [ 541 | 14, 542 | 16, 543 | 0, 544 | 12, 545 | 2, 546 | "IMAGE" 547 | ], 548 | [ 549 | 15, 550 | 12, 551 | 0, 552 | 3, 553 | 1, 554 | "CONDITIONING" 555 | ], 556 | [ 557 | 16, 558 | 8, 559 | 0, 560 | 17, 561 | 0, 562 | "IMAGE" 563 | ], 564 | [ 565 | 17, 566 | 4, 567 | 0, 568 | 18, 569 | 0, 570 | "MODEL" 571 | ], 572 | [ 573 | 18, 574 | 4, 575 | 1, 576 | 18, 577 | 1, 578 | "CLIP" 579 | ], 580 | [ 581 | 19, 582 | 18, 583 | 1, 584 | 6, 585 | 0, 586 | "CLIP" 587 | ], 588 | [ 589 | 20, 590 | 18, 591 | 1, 592 | 7, 593 | 0, 594 | "CLIP" 595 | ], 596 | [ 597 | 21, 598 | 18, 599 | 0, 600 | 3, 601 | 0, 602 | "MODEL" 603 | ] 604 | ], 605 | "groups": [], 606 | "config": {}, 607 | "extra": {}, 608 | "version": 0.4 609 | } -------------------------------------------------------------------------------- /loose_controlnet_example/depth.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GempollAI/LooseControlNet/d9e7989e454f546cbc43df806cf04b0b6b696d05/loose_controlnet_example/depth.jpeg -------------------------------------------------------------------------------- /loosecontrol.py: -------------------------------------------------------------------------------- 1 | from diffusers import ( 2 | ControlNetModel, 3 | StableDiffusionControlNetPipeline, 4 | UniPCMultistepScheduler, 5 | ) 6 | import torch 7 | import PIL 8 | import PIL.Image 9 | from diffusers.loaders import UNet2DConditionLoadersMixin 10 | from typing import Dict 11 | from diffusers.models.attention_processor 
import AttentionProcessor, AttnProcessor 12 | import functools 13 | from cross_frame_attention import CrossFrameAttnProcessor 14 | 15 | TEXT_ENCODER_NAME = "text_encoder" 16 | UNET_NAME = "unet" 17 | NEGATIVE_PROMPT = "blurry, text, caption, lowquality, lowresolution, low res, grainy, ugly" 18 | 19 | def attach_loaders_mixin(model): 20 | # hacky way to make ControlNet work with LoRA. This may not be required in future versions of diffusers. 21 | model.text_encoder_name = TEXT_ENCODER_NAME 22 | model.unet_name = UNET_NAME 23 | r""" 24 | Attach the [`UNet2DConditionLoadersMixin`] to a model. This will add the 25 | all the methods from the mixin 'UNet2DConditionLoadersMixin' to the model. 26 | """ 27 | # mixin_instance = UNet2DConditionLoadersMixin() 28 | for attr_name, attr_value in vars(UNet2DConditionLoadersMixin).items(): 29 | # print(attr_name) 30 | if callable(attr_value): 31 | # setattr(model, attr_name, functools.partialmethod(attr_value, model).__get__(model, model.__class__)) 32 | setattr(model, attr_name, functools.partial(attr_value, model)) 33 | return model 34 | 35 | def set_attn_processor(module, processor, _remove_lora=False): 36 | def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): 37 | if hasattr(module, "set_processor"): 38 | if not isinstance(processor, dict): 39 | module.set_processor(processor, _remove_lora=_remove_lora) 40 | else: 41 | module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora) 42 | 43 | for sub_name, child in module.named_children(): 44 | fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) 45 | 46 | for name, module in module.named_children(): 47 | fn_recursive_attn_processor(name, module, processor) 48 | 49 | 50 | 51 | class ControlNetX(ControlNetModel, UNet2DConditionLoadersMixin): 52 | # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors 53 | # This may not be required in future versions of diffusers. 54 | @property 55 | def attn_processors(self) -> Dict[str, AttentionProcessor]: 56 | r""" 57 | Returns: 58 | `dict` of attention processors: A dictionary containing all attention processors used in the model with 59 | indexed by its weight name. 
60 | """ 61 | # set recursively 62 | processors = {} 63 | 64 | def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): 65 | if hasattr(module, "get_processor"): 66 | processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) 67 | 68 | for sub_name, child in module.named_children(): 69 | fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) 70 | 71 | return processors 72 | 73 | for name, module in self.named_children(): 74 | fn_recursive_add_processors(name, module, processors) 75 | 76 | return processors 77 | 78 | class ControlNetPipeline: 79 | def __init__(self, checkpoint="lllyasviel/control_v11f1p_sd15_depth", sd_checkpoint="runwayml/stable-diffusion-v1-5") -> None: 80 | controlnet = ControlNetX.from_pretrained(checkpoint) 81 | self.pipe = StableDiffusionControlNetPipeline.from_pretrained( 82 | sd_checkpoint, controlnet=controlnet, requires_safety_checker=False, safety_checker=None, 83 | torch_dtype=torch.float16) 84 | self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) 85 | 86 | @torch.no_grad() 87 | def __call__(self, 88 | prompt: str="", 89 | height=512, 90 | width=512, 91 | control_image=None, 92 | controlnet_conditioning_scale=1.0, 93 | num_inference_steps: int=20, 94 | **kwargs) -> PIL.Image.Image: 95 | 96 | out = self.pipe(prompt, control_image, 97 | height=height, width=width, 98 | num_inference_steps=num_inference_steps, 99 | controlnet_conditioning_scale=controlnet_conditioning_scale, 100 | **kwargs).images 101 | 102 | return out[0] if len(out) == 1 else out 103 | 104 | def to(self, *args, **kwargs): 105 | self.pipe.to(*args, **kwargs) 106 | return self 107 | 108 | 109 | class LooseControlNet(ControlNetPipeline): 110 | def __init__(self, loose_control_weights="shariqfarooq/loose-control-3dbox", cn_checkpoint="lllyasviel/control_v11f1p_sd15_depth", sd_checkpoint="runwayml/stable-diffusion-v1-5") -> None: 111 | super().__init__(cn_checkpoint, sd_checkpoint) 112 | self.pipe.controlnet = attach_loaders_mixin(self.pipe.controlnet) 113 | self.pipe.controlnet.load_attn_procs(loose_control_weights) 114 | 115 | def set_normal_attention(self): 116 | self.pipe.unet.set_attn_processor(AttnProcessor()) 117 | 118 | def set_cf_attention(self, _remove_lora=False): 119 | for upblocks in self.pipe.unet.up_blocks[-2:]: 120 | set_attn_processor(upblocks, CrossFrameAttnProcessor(), _remove_lora=_remove_lora) 121 | 122 | def edit(self, depth, depth_edit, prompt, prompt_edit=None, seed=42, seed_edit=None, negative_prompt=NEGATIVE_PROMPT, controlnet_conditioning_scale=1.0, num_inference_steps=20, **kwargs): 123 | if prompt_edit is None: 124 | prompt_edit = prompt 125 | 126 | if seed_edit is None: 127 | seed_edit = seed 128 | 129 | seed = int(seed) 130 | seed_edit = int(seed_edit) 131 | control_image = [depth, depth_edit] 132 | prompt = [prompt, prompt_edit] 133 | generator = [torch.Generator().manual_seed(seed), torch.Generator().manual_seed(seed_edit)] 134 | gen = self.pipe(prompt, control_image=control_image, controlnet_conditioning_scale=controlnet_conditioning_scale, generator=generator, num_inference_steps=num_inference_steps, negative_prompt=negative_prompt, **kwargs)[-1] 135 | return gen -------------------------------------------------------------------------------- /weight_fusion.py: -------------------------------------------------------------------------------- 1 | from loosecontrol import LooseControlNet 2 | 3 | FUSION_SCALE = 1.0 4 | USE_CUDA = True 5 | 
USE_HUGGINGFACE_WEIGHTS = True 6 | 7 | if __name__ == "__main__": 8 | print(f""" 9 | Fusing weights with configs: 10 | FUSION_SCALE: {FUSION_SCALE} 11 | USE_CUDA: {USE_CUDA} 12 | USE_HUGGINGFACE_WEIGHTS: {USE_HUGGINGFACE_WEIGHTS} 13 | 14 | You can modify these in this script. 15 | """) 16 | 17 | if USE_HUGGINGFACE_WEIGHTS: 18 | lcn = LooseControlNet("shariqfarooq/loose-control-3dbox") 19 | else: 20 | # Modify below to use your pre-downloaded weights 21 | lcn = LooseControlNet(loose_control_weights="..", cn_checkpoint="..", sd_checkpoint="..") 22 | 23 | if USE_CUDA: 24 | lcn = lcn.to("cuda") 25 | 26 | lcn.pipe.controlnet.fuse_lora(lora_scale=FUSION_SCALE) 27 | lcn.pipe.controlnet.save_pretrained("./fused_weights") 28 | print("Done! Saved to ./fused_weights") 29 | --------------------------------------------------------------------------------
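
The fusion performed above relies on `fuse_lora`, which folds each LoRA update back into the base ControlNet weights so that no LoRA machinery is needed at inference time. The sketch below is only a conceptual illustration of that idea for a single linear layer, not the actual `diffusers` implementation (which handles per-module bookkeeping, scaling, and dtype details); the tensor shapes and names here are illustrative.

```python
# Conceptual sketch of LoRA fusion for one linear layer: W_fused = W + scale * (up @ down).
# Names and shapes are illustrative; the real fuse_lora walks every LoRA-equipped module.
import torch

def fuse_lora_layer(base_weight: torch.Tensor,
                    lora_down: torch.Tensor,  # (rank, in_features)
                    lora_up: torch.Tensor,    # (out_features, rank)
                    scale: float = 1.0) -> torch.Tensor:
    """Return a weight matrix with the low-rank update folded in."""
    return base_weight + scale * (lora_up @ lora_down)

# Tiny self-check with random tensors standing in for ControlNet attention weights.
W = torch.randn(320, 320)
down, up = torch.randn(4, 320), torch.randn(320, 4)
W_fused = fuse_lora_layer(W, down, up, scale=1.0)
x = torch.randn(1, 320)
assert torch.allclose(x @ W_fused.T, x @ W.T + (x @ down.T) @ up.T, atol=1e-4)
```

Once the update is folded in, the fused checkpoint behaves like a plain depth ControlNet, which is why it can be loaded by frameworks that only support standard ControlNet weights.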
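
For readers not using ComfyUI, here is a minimal sketch of driving the fused weights with plain `diffusers`, mirroring the README's Usage section and the example workflow above. It assumes the Hugging Face repo linked in the README (`AIRDGempoll/LooseControlNet`) can be loaded directly with `ControlNetModel.from_pretrained`; the prompt is taken from the example workflow, while the output filename, seed, and sampler settings are arbitrary.

```python
# Minimal sketch: use the fused LooseControlNet like any depth ControlNet in diffusers.
# Assumes the fused weights load via ControlNetModel.from_pretrained from the repo id in the README.
import torch
from PIL import Image
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline, UniPCMultistepScheduler

controlnet = ControlNetModel.from_pretrained("AIRDGempoll/LooseControlNet", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    safety_checker=None,
    torch_dtype=torch.float16,
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")

# A boxy depth map (e.g. the bundled example) serves as the "loose" condition.
depth = Image.open("loose_controlnet_example/depth.jpeg").convert("RGB")
image = pipe(
    "Sofa in a living room, masterpiece, photorealistic, 8k",
    image=depth,
    controlnet_conditioning_scale=1.0,
    num_inference_steps=20,
    generator=torch.Generator().manual_seed(42),
).images[0]
image.save("loose_control_result.png")
```

As in the ComfyUI workflow, the depth image only needs to sketch rough boxes; the model treats it as loose guidance rather than an exact depth map.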