├── .github └── workflows │ └── publish.yml ├── LICENSE ├── README.md ├── __init__.py ├── models ├── jointblock.py └── resampler.py ├── nodes.py ├── pyproject.toml ├── requirements.txt └── workflows └── SD3.5L IP-Adapter.json /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Comfy registry 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "pyproject.toml" 9 | 10 | permissions: 11 | issues: write 12 | 13 | jobs: 14 | publish-node: 15 | name: Publish Custom Node to registry 16 | runs-on: ubuntu-latest 17 | if: ${{ github.repository_owner == 'Slickytail' }} 18 | steps: 19 | - name: Check out code 20 | uses: actions/checkout@v4 21 | - name: Publish Custom Node 22 | uses: Comfy-Org/publish-node-action@v1 23 | with: 24 | ## Add your own personal access token to your Github Repository secrets and reference it here. 25 | personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI-IPAdapter-SD3 2 | 3 | ComfyUI implementation of the [InstantX IP-Adapter for SD3.5 Large](https://huggingface.co/InstantX/SD3.5-Large-IP-Adapter). 
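For reference, the two downloads described under Installation below can also be scripted. The snippet that follows is only a sketch and not part of this repo: it assumes `huggingface_hub` is installed, that your ComfyUI checkout sits at `ComfyUI/`, and that the rehosted vision model still uses the filename referenced in the example workflow.

```python
# Hypothetical helper: fetch the adapter and vision-encoder weights with huggingface_hub.
# Adjust the local_dir paths to match your own ComfyUI installation.
from huggingface_hub import hf_hub_download

# InstantX IP-Adapter for SD3.5 Large -> models/ipadapter
hf_hub_download(
    repo_id="InstantX/SD3.5-Large-IP-Adapter",
    filename="ip-adapter.bin",
    local_dir="ComfyUI/models/ipadapter",
)

# ComfyUI rehost of the SigLIP vision encoder -> models/clip_vision
hf_hub_download(
    repo_id="Comfy-Org/sigclip_vision_384",
    filename="sigclip_vision_patch14_384.safetensors",
    local_dir="ComfyUI/models/clip_vision",
)
```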
4 | 5 | ## Installation 6 | 7 | Download [`ip-adapter.bin` from the original repository](https://huggingface.co/InstantX/SD3.5-Large-IP-Adapter/blob/main/ip-adapter.bin), and place it in the `models/ipadapter` folder of your ComfyUI installation. (I suggest renaming it to something easier to remember.) 8 | 9 | Download [`sigclip_vision_patch14_384.safetensors` from ComfyUI's rehost](https://huggingface.co/Comfy-Org/sigclip_vision_384) and place it in the `models/clip_vision` folder. 10 | The original model was trained on [google/siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384). To be honest, I'm not sure where the Comfy rehost model comes from, but it gives very similar results, so I suspect that it's a slightly modified version of the original Google model. 11 | 12 | ## Usage 13 | The IP-Adapter can be used with **Stable Diffusion 3.5 Large** and **Stable Diffusion 3.5 Large Turbo**. 14 | Please note that the model was originally trained on SD3.5 Large, so the adapter is less accurate when used with the Turbo model. 15 | An example workflow can be found in the `workflows` directory. 16 | 17 | I recommend using an image weight of 0.5. 18 | 19 | ## TODOs 20 | - Allow multiple adapters to be added together rather than overwriting each other. 21 | - Replace hardcoded parameters (such as hidden size/num layers) with values determined from the model. This would allow the same code to be used for future adapters, e.g. for SD3.5 Medium. 22 | - Convert the adapter to safetensors. 23 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS 2 | 3 | __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"] 4 | -------------------------------------------------------------------------------- /models/jointblock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | from einops import rearrange 5 | 6 | from comfy.ldm.modules.attention import optimized_attention 7 | from comfy.ldm.modules.diffusionmodules.mmdit import ( 8 | RMSNorm, 9 | JointBlock, 10 | ) 11 | 12 | 13 | class AdaLayerNorm(nn.Module): 14 | """ 15 | Adaptive layer norm (adaLN). In "normal" mode it applies a learned shift/scale conditioned on `emb`; in "zero" mode (adaLN-Zero) it additionally returns gate and MLP modulation values. 16 | 17 | Parameters: 18 | embedding_dim (`int`): The size of each embedding vector. 19 | time_embedding_dim (`int`, optional): The size of the conditioning (timestep) embedding; defaults to `embedding_dim`. 
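Example (illustrative only; 2432 and 1280 are the SD3.5-Large sizes hardcoded in nodes.py): norm = AdaLayerNorm(2432, time_embedding_dim=1280, mode="normal"); x = torch.randn(1, 64, 2432); t_emb = torch.randn(1, 1280); out = norm(x, emb=t_emb)  # out has shape (1, 64, 2432)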
20 | """ 21 | 22 | def __init__(self, embedding_dim: int, time_embedding_dim=None, mode="normal"): 23 | super().__init__() 24 | 25 | self.silu = nn.SiLU() 26 | num_params_dict = dict( 27 | zero=6, 28 | normal=2, 29 | ) 30 | num_params = num_params_dict[mode] 31 | self.linear = nn.Linear( 32 | time_embedding_dim or embedding_dim, num_params * embedding_dim, bias=True 33 | ) 34 | self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6) 35 | self.mode = mode 36 | 37 | def forward( 38 | self, 39 | x, 40 | hidden_dtype=None, 41 | emb=None, 42 | ): 43 | emb = self.linear(self.silu(emb)) 44 | if self.mode == "normal": 45 | shift_msa, scale_msa = emb.chunk(2, dim=1) 46 | x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None] 47 | return x 48 | 49 | elif self.mode == "zero": 50 | shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk( 51 | 6, dim=1 52 | ) 53 | x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None] 54 | return x, gate_msa, shift_mlp, scale_mlp, gate_mlp 55 | 56 | 57 | class IPAttnProcessor(nn.Module): 58 | 59 | def __init__( 60 | self, 61 | hidden_size=None, 62 | cross_attention_dim=None, 63 | ip_hidden_states_dim=None, 64 | ip_encoder_hidden_states_dim=None, 65 | head_dim=None, 66 | timesteps_emb_dim=1280, 67 | ): 68 | super().__init__() 69 | 70 | self.norm_ip = AdaLayerNorm( 71 | ip_hidden_states_dim, time_embedding_dim=timesteps_emb_dim 72 | ) 73 | self.to_k_ip = nn.Linear(ip_hidden_states_dim, hidden_size, bias=False) 74 | self.to_v_ip = nn.Linear(ip_hidden_states_dim, hidden_size, bias=False) 75 | self.norm_q = RMSNorm(head_dim, 1e-6) 76 | self.norm_k = RMSNorm(head_dim, 1e-6) 77 | self.norm_ip_k = RMSNorm(head_dim, 1e-6) 78 | 79 | def forward( 80 | self, 81 | ip_hidden_states, 82 | img_query, 83 | img_key=None, 84 | img_value=None, 85 | t_emb=None, 86 | n_heads=1, 87 | ): 88 | if ip_hidden_states is None: 89 | return None 90 | 91 | if not hasattr(self, "to_k_ip") or not hasattr(self, "to_v_ip"): 92 | return None 93 | 94 | # norm ip input 95 | norm_ip_hidden_states = self.norm_ip(ip_hidden_states, emb=t_emb) 96 | 97 | # to k and v 98 | ip_key = self.to_k_ip(norm_ip_hidden_states) 99 | ip_value = self.to_v_ip(norm_ip_hidden_states) 100 | 101 | # reshape 102 | img_query = rearrange(img_query, "b l (h d) -> b h l d", h=n_heads) 103 | img_key = rearrange(img_key, "b l (h d) -> b h l d", h=n_heads) 104 | # note that the image is in a different shape: b l h d 105 | # so we transpose to b h l d 106 | # or do we have to transpose here? 
107 | img_value = torch.transpose(img_value, 1, 2) 108 | ip_key = rearrange(ip_key, "b l (h d) -> b h l d", h=n_heads) 109 | ip_value = rearrange(ip_value, "b l (h d) -> b h l d", h=n_heads) 110 | 111 | # norm 112 | img_query = self.norm_q(img_query) 113 | img_key = self.norm_k(img_key) 114 | ip_key = self.norm_ip_k(ip_key) 115 | 116 | # cat img 117 | key = torch.cat([img_key, ip_key], dim=2) 118 | value = torch.cat([img_value, ip_value], dim=2) 119 | 120 | # 121 | ip_hidden_states = F.scaled_dot_product_attention( 122 | img_query, key, value, dropout_p=0.0, is_causal=False 123 | ) 124 | ip_hidden_states = rearrange(ip_hidden_states, "b h l d -> b l (h d)") 125 | ip_hidden_states = ip_hidden_states.to(img_query.dtype) 126 | return ip_hidden_states 127 | 128 | 129 | class JointBlockIPWrapper: 130 | """To be used as a patch_replace with Comfy""" 131 | 132 | def __init__( 133 | self, 134 | original_block: JointBlock, 135 | adapter: IPAttnProcessor, 136 | ip_options=None, 137 | ): 138 | self.original_block = original_block 139 | self.adapter = adapter 140 | if ip_options is None: 141 | ip_options = {} 142 | self.ip_options = ip_options 143 | 144 | def block_mixing(self, context, x, context_block, x_block, c): 145 | """ 146 | Comes from mmdit.py. Modified to add ipadapter attention. 147 | """ 148 | context_qkv, context_intermediates = context_block.pre_attention(context, c) 149 | 150 | if x_block.x_block_self_attn: 151 | x_qkv, x_qkv2, x_intermediates = x_block.pre_attention_x(x, c) 152 | else: 153 | x_qkv, x_intermediates = x_block.pre_attention(x, c) 154 | 155 | qkv = tuple(torch.cat((context_qkv[j], x_qkv[j]), dim=1) for j in range(3)) 156 | 157 | attn = optimized_attention( 158 | qkv[0], 159 | qkv[1], 160 | qkv[2], 161 | heads=x_block.attn.num_heads, 162 | ) 163 | context_attn, x_attn = ( 164 | attn[:, : context_qkv[0].shape[1]], 165 | attn[:, context_qkv[0].shape[1] :], 166 | ) 167 | # if the current timestep is not in the ipadapter enabling range, then the resampler wasn't run 168 | # and the hidden states will be None 169 | if ( 170 | self.ip_options["hidden_states"] is not None 171 | and self.ip_options["t_emb"] is not None 172 | ): 173 | # IP-Adapter 174 | ip_attn = self.adapter( 175 | self.ip_options["hidden_states"], 176 | *x_qkv, 177 | self.ip_options["t_emb"], 178 | x_block.attn.num_heads, 179 | ) 180 | x_attn = x_attn + ip_attn * self.ip_options["weight"] 181 | 182 | # Everything else is unchanged 183 | if not context_block.pre_only: 184 | context = context_block.post_attention(context_attn, *context_intermediates) 185 | 186 | else: 187 | context = None 188 | if x_block.x_block_self_attn: 189 | attn2 = optimized_attention( 190 | x_qkv2[0], 191 | x_qkv2[1], 192 | x_qkv2[2], 193 | heads=x_block.attn2.num_heads, 194 | ) 195 | x = x_block.post_attention_x(x_attn, attn2, *x_intermediates) 196 | else: 197 | x = x_block.post_attention(x_attn, *x_intermediates) 198 | return context, x 199 | 200 | def __call__(self, args, _): 201 | # Code from mmdit.py: 202 | # in this case, we're blocks_replace[("double_block", i)] 203 | # note that although we're passed the original block, 204 | # we can't actually get it from inside its wrapper 205 | # (which would simplify the whole code...) 
206 | # ``` 207 | # def block_wrap(args): 208 | # out = {} 209 | # out["txt"], out["img"] = self.joint_blocks[i](args["txt"], args["img"], c=args["vec"]) 210 | # return out 211 | # out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": c_mod}, {"original_block": block_wrap}) 212 | # context = out["txt"] 213 | # x = out["img"] 214 | # ``` 215 | c, x = self.block_mixing( 216 | args["txt"], 217 | args["img"], 218 | self.original_block.context_block, 219 | self.original_block.x_block, 220 | c=args["vec"], 221 | ) 222 | return {"txt": c, "img": x} 223 | -------------------------------------------------------------------------------- /models/resampler.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py 2 | import math 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from diffusers.models.embeddings import Timesteps, TimestepEmbedding 8 | 9 | 10 | # FFN 11 | def FeedForward(dim, mult=4): 12 | inner_dim = int(dim * mult) 13 | return nn.Sequential( 14 | nn.LayerNorm(dim), 15 | nn.Linear(dim, inner_dim, bias=False), 16 | nn.GELU(), 17 | nn.Linear(inner_dim, dim, bias=False), 18 | ) 19 | 20 | 21 | def reshape_tensor(x, heads): 22 | bs, length, width = x.shape 23 | # (bs, length, width) --> (bs, length, n_heads, dim_per_head) 24 | x = x.view(bs, length, heads, -1) 25 | # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) 26 | x = x.transpose(1, 2) 27 | # make contiguous, keeping the shape (bs, n_heads, length, dim_per_head) 28 | x = x.reshape(bs, heads, length, -1) 29 | return x 30 | 31 | 32 | class PerceiverAttention(nn.Module): 33 | def __init__(self, *, dim, dim_head=64, heads=8): 34 | super().__init__() 35 | self.scale = dim_head**-0.5 36 | self.dim_head = dim_head 37 | self.heads = heads 38 | inner_dim = dim_head * heads 39 | 40 | self.norm1 = nn.LayerNorm(dim) 41 | self.norm2 = nn.LayerNorm(dim) 42 | 43 | self.to_q = nn.Linear(dim, inner_dim, bias=False) 44 | self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) 45 | self.to_out = nn.Linear(inner_dim, dim, bias=False) 46 | 47 | def forward(self, x, latents, shift=None, scale=None): 48 | """ 49 | Args: 50 | x (torch.Tensor): image features 51 | shape (b, n1, D) 52 | latents (torch.Tensor): latent features 53 | shape (b, n2, D) 54 | """ 55 | x = self.norm1(x) 56 | latents = self.norm2(latents) 57 | 58 | if shift is not None and scale is not None: 59 | latents = latents * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) 60 | 61 | b, l, _ = latents.shape 62 | 63 | q = self.to_q(latents) 64 | kv_input = torch.cat((x, latents), dim=-2) 65 | k, v = self.to_kv(kv_input).chunk(2, dim=-1) 66 | 67 | q = reshape_tensor(q, self.heads) 68 | k = reshape_tensor(k, self.heads) 69 | v = reshape_tensor(v, self.heads) 70 | 71 | # attention 72 | scale = 1 / math.sqrt(math.sqrt(self.dim_head)) 73 | weight = (q * scale) @ (k * scale).transpose( 74 | -2, -1 75 | ) # More stable with f16 than dividing afterwards 76 | weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) 77 | out = weight @ v 78 | 79 | out = out.permute(0, 2, 1, 3).reshape(b, l, -1) 80 | 81 | return self.to_out(out) 82 | 83 | 84 | class Resampler(nn.Module): 85 | def __init__( 86 | self, 87 | dim=1024, 88 | depth=8, 89 | dim_head=64, 90 | heads=16, 91 | num_queries=8, 92 | embedding_dim=768, 93 | output_dim=1024, 94 | ff_mult=4, 95 | *args, 96 | **kwargs, 97 | ): 98 | super().__init__() 99 | 100 | 
self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5) 101 | 102 | self.proj_in = nn.Linear(embedding_dim, dim) 103 | 104 | self.proj_out = nn.Linear(dim, output_dim) 105 | self.norm_out = nn.LayerNorm(output_dim) 106 | 107 | self.layers = nn.ModuleList([]) 108 | for _ in range(depth): 109 | self.layers.append( 110 | nn.ModuleList( 111 | [ 112 | PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), 113 | FeedForward(dim=dim, mult=ff_mult), 114 | ] 115 | ) 116 | ) 117 | 118 | def forward(self, x): 119 | 120 | latents = self.latents.repeat(x.size(0), 1, 1) 121 | 122 | x = self.proj_in(x) 123 | 124 | for attn, ff in self.layers: 125 | latents = attn(x, latents) + latents 126 | latents = ff(latents) + latents 127 | 128 | latents = self.proj_out(latents) 129 | return self.norm_out(latents) 130 | 131 | 132 | class TimeResampler(nn.Module): 133 | def __init__( 134 | self, 135 | dim=1024, 136 | depth=8, 137 | dim_head=64, 138 | heads=16, 139 | num_queries=8, 140 | embedding_dim=768, 141 | output_dim=1024, 142 | ff_mult=4, 143 | timestep_in_dim=320, 144 | timestep_flip_sin_to_cos=True, 145 | timestep_freq_shift=0, 146 | ): 147 | super().__init__() 148 | 149 | self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5) 150 | 151 | self.proj_in = nn.Linear(embedding_dim, dim) 152 | 153 | self.proj_out = nn.Linear(dim, output_dim) 154 | self.norm_out = nn.LayerNorm(output_dim) 155 | 156 | self.layers = nn.ModuleList([]) 157 | for _ in range(depth): 158 | self.layers.append( 159 | nn.ModuleList( 160 | [ 161 | # msa 162 | PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), 163 | # ff 164 | FeedForward(dim=dim, mult=ff_mult), 165 | # adaLN 166 | nn.Sequential(nn.SiLU(), nn.Linear(dim, 4 * dim, bias=True)), 167 | ] 168 | ) 169 | ) 170 | 171 | # time 172 | self.time_proj = Timesteps( 173 | timestep_in_dim, timestep_flip_sin_to_cos, timestep_freq_shift 174 | ) 175 | self.time_embedding = TimestepEmbedding(timestep_in_dim, dim, act_fn="silu") 176 | 177 | # adaLN 178 | # self.adaLN_modulation = nn.Sequential( 179 | # nn.SiLU(), 180 | # nn.Linear(timestep_out_dim, 6 * timestep_out_dim, bias=True) 181 | # ) 182 | 183 | def forward(self, x, timestep, need_temb=False): 184 | timestep_emb = self.embedding_time(x, timestep) # bs, dim 185 | 186 | latents = self.latents.repeat(x.size(0), 1, 1) 187 | 188 | x = self.proj_in(x) 189 | x = x + timestep_emb[:, None] 190 | 191 | for attn, ff, adaLN_modulation in self.layers: 192 | shift_msa, scale_msa, shift_mlp, scale_mlp = adaLN_modulation( 193 | timestep_emb 194 | ).chunk(4, dim=1) 195 | latents = attn(x, latents, shift_msa, scale_msa) + latents 196 | 197 | res = latents 198 | for idx_ff in range(len(ff)): 199 | layer_ff = ff[idx_ff] 200 | latents = layer_ff(latents) 201 | if idx_ff == 0 and isinstance(layer_ff, nn.LayerNorm): # adaLN 202 | latents = latents * ( 203 | 1 + scale_mlp.unsqueeze(1) 204 | ) + shift_mlp.unsqueeze(1) 205 | latents = latents + res 206 | 207 | # latents = ff(latents) + latents 208 | 209 | latents = self.proj_out(latents) 210 | latents = self.norm_out(latents) 211 | 212 | if need_temb: 213 | return latents, timestep_emb 214 | else: 215 | return latents 216 | 217 | def embedding_time(self, sample, timestep): 218 | 219 | # 1. time 220 | timesteps = timestep 221 | if not torch.is_tensor(timesteps): 222 | # TODO: this requires sync between CPU and GPU. 
So try to pass timesteps as tensors if you can 223 | # This would be a good case for the `match` statement (Python 3.10+) 224 | is_mps = sample.device.type == "mps" 225 | if isinstance(timestep, float): 226 | dtype = torch.float32 if is_mps else torch.float64 227 | else: 228 | dtype = torch.int32 if is_mps else torch.int64 229 | timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) 230 | elif len(timesteps.shape) == 0: 231 | timesteps = timesteps[None].to(sample.device) 232 | 233 | # broadcast to batch dimension in a way that's compatible with ONNX/Core ML 234 | timesteps = timesteps.expand(sample.shape[0]) 235 | 236 | t_emb = self.time_proj(timesteps) 237 | 238 | # timesteps does not contain any weights and will always return f32 tensors 239 | # but time_embedding might actually be running in fp16. so we need to cast here. 240 | # there might be better ways to encapsulate this. 241 | t_emb = t_emb.to(dtype=sample.dtype) 242 | 243 | emb = self.time_embedding(t_emb, None) 244 | return emb 245 | -------------------------------------------------------------------------------- /nodes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | import torch 5 | import folder_paths 6 | 7 | from .models.resampler import TimeResampler 8 | from .models.jointblock import JointBlockIPWrapper, IPAttnProcessor 9 | 10 | MODELS_DIR = os.path.join(folder_paths.models_dir, "ipadapter") 11 | if "ipadapter" not in folder_paths.folder_names_and_paths: 12 | current_paths = [MODELS_DIR] 13 | else: 14 | current_paths, _ = folder_paths.folder_names_and_paths["ipadapter"] 15 | folder_paths.folder_names_and_paths["ipadapter"] = ( 16 | current_paths, 17 | folder_paths.supported_pt_extensions, 18 | ) 19 | 20 | 21 | def patch( 22 | patcher, 23 | ip_procs, 24 | resampler: TimeResampler, 25 | clip_embeds, 26 | weight=1.0, 27 | start=0.0, 28 | end=1.0, 29 | ): 30 | """ 31 | Patches a model_sampler to add the ipadapter 32 | """ 33 | mmdit = patcher.model.diffusion_model 34 | timestep_schedule_max = patcher.model.model_config.sampling_settings.get( 35 | "timesteps", 1000 36 | ) 37 | # hook the model's forward function 38 | # so that when it gets called, we can grab the timestep and send it to the resampler 39 | ip_options = { 40 | "hidden_states": None, 41 | "t_emb": None, 42 | "weight": weight, 43 | } 44 | 45 | def ddit_wrapper(forward, args): 46 | # this is between 0 and 1, so the adapters can calculate start_point and end_point 47 | # actually, do we need to get the sigma value instead? 
48 | t_percent = 1 - args["timestep"].flatten()[0].cpu().item() 49 | if start <= t_percent <= end: 50 | batch_size = args["input"].shape[0] // len(args["cond_or_uncond"]) 51 | # if we're only doing cond or only doing uncond, only pass one of them through the resampler 52 | embeds = clip_embeds[args["cond_or_uncond"]] 53 | # slight efficiency optimization todo: pass the embeds through and then afterwards 54 | # repeat to the batch size 55 | embeds = torch.repeat_interleave(embeds, batch_size, dim=0) 56 | # the resampler wants between 0 and MAX_STEPS 57 | timestep = args["timestep"] * timestep_schedule_max 58 | image_emb, t_emb = resampler(embeds, timestep, need_temb=True) 59 | # these will need to be accessible to the IPAdapters 60 | ip_options["hidden_states"] = image_emb 61 | ip_options["t_emb"] = t_emb 62 | else: 63 | ip_options["hidden_states"] = None 64 | ip_options["t_emb"] = None 65 | 66 | return forward(args["input"], args["timestep"], **args["c"]) 67 | 68 | patcher.set_model_unet_function_wrapper(ddit_wrapper) 69 | # patch each dit block 70 | for i, block in enumerate(mmdit.joint_blocks): 71 | wrapper = JointBlockIPWrapper(block, ip_procs[i], ip_options) 72 | patcher.set_model_patch_replace(wrapper, "dit", "double_block", i) 73 | 74 | 75 | class SD3IPAdapter: 76 | def __init__(self, checkpoint: str, device): 77 | self.device = device 78 | # load the checkpoint right away 79 | self.state_dict = torch.load( 80 | os.path.join(MODELS_DIR, checkpoint), 81 | map_location=self.device, 82 | weights_only=True, 83 | ) 84 | # todo: infer some of the params from the checkpoint instead of hardcoded 85 | self.resampler = TimeResampler( 86 | dim=1280, 87 | depth=4, 88 | dim_head=64, 89 | heads=20, 90 | num_queries=64, 91 | embedding_dim=1152, 92 | output_dim=2432, 93 | ff_mult=4, 94 | timestep_in_dim=320, 95 | timestep_flip_sin_to_cos=True, 96 | timestep_freq_shift=0, 97 | ) 98 | self.resampler.eval() 99 | self.resampler.to(self.device, dtype=torch.float16) 100 | self.resampler.load_state_dict(self.state_dict["image_proj"]) 101 | 102 | # now we'll create the attention processors 103 | # ip_adapter.keys looks like [0.proj, 0.to_k, ..., 1.proj, 1.to_k, ...] 
104 | n_procs = len( 105 | set(x.split(".")[0] for x in self.state_dict["ip_adapter"].keys()) 106 | ) 107 | self.procs = torch.nn.ModuleList( 108 | [ 109 | # this is hardcoded for SD3.5L 110 | IPAttnProcessor( 111 | hidden_size=2432, 112 | cross_attention_dim=2432, 113 | ip_hidden_states_dim=2432, 114 | ip_encoder_hidden_states_dim=2432, 115 | head_dim=64, 116 | timesteps_emb_dim=1280, 117 | ).to(self.device, dtype=torch.float16) 118 | for _ in range(n_procs) 119 | ] 120 | ) 121 | self.procs.load_state_dict(self.state_dict["ip_adapter"]) 122 | 123 | 124 | class IPAdapterSD3Loader: 125 | @classmethod 126 | def INPUT_TYPES(s): 127 | return { 128 | "required": { 129 | "ipadapter": (folder_paths.get_filename_list("ipadapter"),), 130 | "provider": (["cuda", "cpu", "mps"],), 131 | } 132 | } 133 | 134 | RETURN_TYPES = ("IP_ADAPTER_SD3_INSTANTX",) 135 | RETURN_NAMES = ("ipadapter",) 136 | FUNCTION = "load_model" 137 | CATEGORY = "InstantXNodes" 138 | 139 | def load_model(self, ipadapter, provider): 140 | logging.info("Loading InstantX IPAdapter SD3 model.") 141 | model = SD3IPAdapter(ipadapter, provider) 142 | return (model,) 143 | 144 | 145 | class ApplyIPAdapterSD3: 146 | @classmethod 147 | def INPUT_TYPES(s): 148 | return { 149 | "required": { 150 | "model": ("MODEL",), 151 | "ipadapter": ("IP_ADAPTER_SD3_INSTANTX",), 152 | "image_embed": ("CLIP_VISION_OUTPUT",), 153 | "weight": ( 154 | "FLOAT", 155 | {"default": 1.0, "min": -1.0, "max": 5.0, "step": 0.05}, 156 | ), 157 | "start_percent": ( 158 | "FLOAT", 159 | {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}, 160 | ), 161 | "end_percent": ( 162 | "FLOAT", 163 | {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}, 164 | ), 165 | }, 166 | } 167 | 168 | RETURN_TYPES = ("MODEL",) 169 | FUNCTION = "apply_ipadapter" 170 | CATEGORY = "InstantXNodes" 171 | 172 | def apply_ipadapter( 173 | self, model, ipadapter, image_embed, weight, start_percent, end_percent 174 | ): 175 | # set model 176 | new_model = model.clone() 177 | # add uncond embedding 178 | image_embed = image_embed.penultimate_hidden_states 179 | embeds = torch.cat([image_embed, torch.zeros_like(image_embed)], dim=0).to( 180 | ipadapter.device, dtype=torch.float16 181 | ) 182 | patch( 183 | new_model, 184 | ipadapter.procs, 185 | ipadapter.resampler, 186 | embeds, 187 | weight=weight, 188 | start=start_percent, 189 | end=end_percent, 190 | ) 191 | return (new_model,) 192 | 193 | 194 | NODE_CLASS_MAPPINGS = { 195 | "IPAdapterSD3Loader": IPAdapterSD3Loader, 196 | "ApplyIPAdapterSD3": ApplyIPAdapterSD3, 197 | } 198 | 199 | NODE_DISPLAY_NAME_MAPPINGS = { 200 | "IPAdapterSD3Loader": "Load IPAdapter SD3 Model", 201 | "ApplyIPAdapterSD3": "Apply IPAdapter SD3 Model", 202 | } 203 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "comfyui-instantx-ipadapter-sd3" 3 | description = "ComfyUI implementation of the [a/InstantX IP-Adapter for SD3.5 Large](https://huggingface.co/InstantX/SD3.5-Large-IP-Adapter)." 
4 | version = "1.0.0" 5 | license = {file = "LICENSE"} 6 | dependencies = ["torch", "einops", "diffusers"] 7 | 8 | [project.urls] 9 | Repository = "https://github.com/Slickytail/ComfyUI-InstantX-IPAdapter-SD3" 10 | # Used by Comfy Registry https://comfyregistry.org 11 | 12 | [tool.comfy] 13 | PublisherId = "slickytail" 14 | DisplayName = "ComfyUI-InstantX-IPAdapter-SD3" 15 | Icon = "" 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | einops 3 | diffusers 4 | -------------------------------------------------------------------------------- /workflows/SD3.5L IP-Adapter.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 50, 3 | "last_link_id": 81, 4 | "nodes": [ 5 | { 6 | "id": 8, 7 | "type": "VAEDecode", 8 | "pos": [ 9 | 1151, 10 | 195 11 | ], 12 | "size": [ 13 | 210, 14 | 46 15 | ], 16 | "flags": {}, 17 | "order": 11, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "samples", 22 | "type": "LATENT", 23 | "link": 52 24 | }, 25 | { 26 | "name": "vae", 27 | "type": "VAE", 28 | "link": 72 29 | } 30 | ], 31 | "outputs": [ 32 | { 33 | "name": "IMAGE", 34 | "type": "IMAGE", 35 | "links": [ 36 | 9 37 | ], 38 | "slot_index": 0 39 | } 40 | ], 41 | "properties": { 42 | "Node name for S&R": "VAEDecode" 43 | }, 44 | "widgets_values": [] 45 | }, 46 | { 47 | "id": 33, 48 | "type": "CLIPTextEncode", 49 | "pos": [ 50 | 312.6510009765625, 51 | 394.7559814453125 52 | ], 53 | "size": [ 54 | 422.84503173828125, 55 | 164.31304931640625 56 | ], 57 | "flags": { 58 | "collapsed": true 59 | }, 60 | "order": 7, 61 | "mode": 0, 62 | "inputs": [ 63 | { 64 | "name": "clip", 65 | "type": "CLIP", 66 | "link": 65, 67 | "slot_index": 0 68 | } 69 | ], 70 | "outputs": [ 71 | { 72 | "name": "CONDITIONING", 73 | "type": "CONDITIONING", 74 | "links": [ 75 | 55 76 | ], 77 | "slot_index": 0 78 | } 79 | ], 80 | "title": "CLIP Text Encode (Negative Prompt)", 81 | "properties": { 82 | "Node name for S&R": "CLIPTextEncode" 83 | }, 84 | "widgets_values": [ 85 | "" 86 | ], 87 | "color": "#322", 88 | "bgcolor": "#533" 89 | }, 90 | { 91 | "id": 27, 92 | "type": "EmptySD3LatentImage", 93 | "pos": [ 94 | 372.675048828125, 95 | 451.0669860839844 96 | ], 97 | "size": [ 98 | 315, 99 | 106 100 | ], 101 | "flags": {}, 102 | "order": 0, 103 | "mode": 0, 104 | "inputs": [], 105 | "outputs": [ 106 | { 107 | "name": "LATENT", 108 | "type": "LATENT", 109 | "links": [ 110 | 51 111 | ], 112 | "slot_index": 0, 113 | "shape": 3 114 | } 115 | ], 116 | "properties": { 117 | "Node name for S&R": "EmptySD3LatentImage" 118 | }, 119 | "widgets_values": [ 120 | 1024, 121 | 1024, 122 | 1 123 | ], 124 | "color": "#323", 125 | "bgcolor": "#535" 126 | }, 127 | { 128 | "id": 42, 129 | "type": "TripleCLIPLoader", 130 | "pos": [ 131 | -210.8244171142578, 132 | 375.64593505859375 133 | ], 134 | "size": [ 135 | 315, 136 | 106 137 | ], 138 | "flags": {}, 139 | "order": 1, 140 | "mode": 0, 141 | "inputs": [], 142 | "outputs": [ 143 | { 144 | "name": "CLIP", 145 | "type": "CLIP", 146 | "links": [ 147 | 64, 148 | 65 149 | ], 150 | "slot_index": 0 151 | } 152 | ], 153 | "properties": { 154 | "Node name for S&R": "TripleCLIPLoader" 155 | }, 156 | "widgets_values": [ 157 | "clip_g.safetensors", 158 | "clip_l.safetensors", 159 | "t5xxl_fp16.safetensors" 160 | ] 161 | }, 162 | { 163 | "id": 6, 164 | "type": "CLIPTextEncode", 165 | "pos": [ 166 | 258.14404296875, 167 | 
159.2249755859375 168 | ], 169 | "size": [ 170 | 422.84503173828125, 171 | 164.31304931640625 172 | ], 173 | "flags": {}, 174 | "order": 6, 175 | "mode": 0, 176 | "inputs": [ 177 | { 178 | "name": "clip", 179 | "type": "CLIP", 180 | "link": 64 181 | } 182 | ], 183 | "outputs": [ 184 | { 185 | "name": "CONDITIONING", 186 | "type": "CONDITIONING", 187 | "links": [ 188 | 58 189 | ], 190 | "slot_index": 0 191 | } 192 | ], 193 | "title": "CLIP Text Encode (Positive Prompt)", 194 | "properties": { 195 | "Node name for S&R": "CLIPTextEncode" 196 | }, 197 | "widgets_values": [ 198 | "a man with red hair" 199 | ], 200 | "color": "#232", 201 | "bgcolor": "#353" 202 | }, 203 | { 204 | "id": 45, 205 | "type": "CheckpointLoaderSimple", 206 | "pos": [ 207 | -205.12525939941406, 208 | 203.60665893554688 209 | ], 210 | "size": [ 211 | 315, 212 | 98 213 | ], 214 | "flags": {}, 215 | "order": 2, 216 | "mode": 0, 217 | "inputs": [], 218 | "outputs": [ 219 | { 220 | "name": "MODEL", 221 | "type": "MODEL", 222 | "links": [ 223 | 80 224 | ], 225 | "slot_index": 0 226 | }, 227 | { 228 | "name": "CLIP", 229 | "type": "CLIP", 230 | "links": null 231 | }, 232 | { 233 | "name": "VAE", 234 | "type": "VAE", 235 | "links": [ 236 | 72 237 | ], 238 | "slot_index": 2 239 | } 240 | ], 241 | "properties": { 242 | "Node name for S&R": "CheckpointLoaderSimple" 243 | }, 244 | "widgets_values": [ 245 | "SD3.5-L.safetensors" 246 | ] 247 | }, 248 | { 249 | "id": 47, 250 | "type": "ApplyIPAdapterSD3", 251 | "pos": [ 252 | 401.8241882324219, 253 | 623.6758422851562 254 | ], 255 | "size": [ 256 | 315, 257 | 146 258 | ], 259 | "flags": {}, 260 | "order": 9, 261 | "mode": 0, 262 | "inputs": [ 263 | { 264 | "name": "model", 265 | "type": "MODEL", 266 | "link": 80 267 | }, 268 | { 269 | "name": "ipadapter", 270 | "type": "IP_ADAPTER_SD3_INSTANTX", 271 | "link": 75 272 | }, 273 | { 274 | "name": "image_embed", 275 | "type": "CLIP_VISION_OUTPUT", 276 | "link": 79 277 | } 278 | ], 279 | "outputs": [ 280 | { 281 | "name": "MODEL", 282 | "type": "MODEL", 283 | "links": [ 284 | 81 285 | ], 286 | "slot_index": 0 287 | } 288 | ], 289 | "properties": { 290 | "Node name for S&R": "ApplyIPAdapterSD3" 291 | }, 292 | "widgets_values": [ 293 | 0.5, 294 | 0, 295 | 1 296 | ] 297 | }, 298 | { 299 | "id": 38, 300 | "type": "LoadImage", 301 | "pos": [ 302 | -193.93606567382812, 303 | 539.2610473632812 304 | ], 305 | "size": [ 306 | 315, 307 | 314 308 | ], 309 | "flags": {}, 310 | "order": 3, 311 | "mode": 0, 312 | "inputs": [], 313 | "outputs": [ 314 | { 315 | "name": "IMAGE", 316 | "type": "IMAGE", 317 | "links": [ 318 | 77 319 | ], 320 | "slot_index": 0 321 | }, 322 | { 323 | "name": "MASK", 324 | "type": "MASK", 325 | "links": null 326 | } 327 | ], 328 | "properties": { 329 | "Node name for S&R": "LoadImage" 330 | }, 331 | "widgets_values": [ 332 | "oppenheimer.png", 333 | "image" 334 | ] 335 | }, 336 | { 337 | "id": 50, 338 | "type": "CLIPVisionEncode", 339 | "pos": [ 340 | -195.75955200195312, 341 | 904.2363891601562 342 | ], 343 | "size": [ 344 | 380.4000244140625, 345 | 78 346 | ], 347 | "flags": {}, 348 | "order": 8, 349 | "mode": 0, 350 | "inputs": [ 351 | { 352 | "name": "clip_vision", 353 | "type": "CLIP_VISION", 354 | "link": 78 355 | }, 356 | { 357 | "name": "image", 358 | "type": "IMAGE", 359 | "link": 77 360 | } 361 | ], 362 | "outputs": [ 363 | { 364 | "name": "CLIP_VISION_OUTPUT", 365 | "type": "CLIP_VISION_OUTPUT", 366 | "links": [ 367 | 79 368 | ], 369 | "slot_index": 0 370 | } 371 | ], 372 | "properties": { 373 | "Node name for S&R": 
"CLIPVisionEncode" 374 | }, 375 | "widgets_values": [ 376 | "center" 377 | ] 378 | }, 379 | { 380 | "id": 48, 381 | "type": "CLIPVisionLoader", 382 | "pos": [ 383 | -174.8817596435547, 384 | 1020.2452392578125 385 | ], 386 | "size": [ 387 | 315, 388 | 58 389 | ], 390 | "flags": {}, 391 | "order": 4, 392 | "mode": 0, 393 | "inputs": [], 394 | "outputs": [ 395 | { 396 | "name": "CLIP_VISION", 397 | "type": "CLIP_VISION", 398 | "links": [ 399 | 78 400 | ] 401 | } 402 | ], 403 | "properties": { 404 | "Node name for S&R": "CLIPVisionLoader" 405 | }, 406 | "widgets_values": [ 407 | "sigclip_vision_patch14_384.safetensors" 408 | ] 409 | }, 410 | { 411 | "id": 46, 412 | "type": "IPAdapterSD3Loader", 413 | "pos": [ 414 | 393.33660888671875, 415 | 859.3168334960938 416 | ], 417 | "size": [ 418 | 315, 419 | 82 420 | ], 421 | "flags": {}, 422 | "order": 5, 423 | "mode": 0, 424 | "inputs": [], 425 | "outputs": [ 426 | { 427 | "name": "ipadapter", 428 | "type": "IP_ADAPTER_SD3_INSTANTX", 429 | "links": [ 430 | 75 431 | ], 432 | "slot_index": 0 433 | } 434 | ], 435 | "properties": { 436 | "Node name for S&R": "IPAdapterSD3Loader" 437 | }, 438 | "widgets_values": [ 439 | "ip_sd35l_instantx.bin", 440 | "cuda" 441 | ] 442 | }, 443 | { 444 | "id": 9, 445 | "type": "SaveImage", 446 | "pos": [ 447 | 836.613037109375, 448 | 534.8397827148438 449 | ], 450 | "size": [ 451 | 560.4580688476562, 452 | 558.8997192382812 453 | ], 454 | "flags": {}, 455 | "order": 12, 456 | "mode": 0, 457 | "inputs": [ 458 | { 459 | "name": "images", 460 | "type": "IMAGE", 461 | "link": 9 462 | } 463 | ], 464 | "outputs": [], 465 | "properties": { 466 | "Node name for S&R": "SaveImage" 467 | }, 468 | "widgets_values": [ 469 | "ComfyUI" 470 | ] 471 | }, 472 | { 473 | "id": 31, 474 | "type": "KSampler", 475 | "pos": [ 476 | 816, 477 | 192 478 | ], 479 | "size": [ 480 | 315, 481 | 262 482 | ], 483 | "flags": {}, 484 | "order": 10, 485 | "mode": 0, 486 | "inputs": [ 487 | { 488 | "name": "model", 489 | "type": "MODEL", 490 | "link": 81 491 | }, 492 | { 493 | "name": "positive", 494 | "type": "CONDITIONING", 495 | "link": 58 496 | }, 497 | { 498 | "name": "negative", 499 | "type": "CONDITIONING", 500 | "link": 55 501 | }, 502 | { 503 | "name": "latent_image", 504 | "type": "LATENT", 505 | "link": 51 506 | } 507 | ], 508 | "outputs": [ 509 | { 510 | "name": "LATENT", 511 | "type": "LATENT", 512 | "links": [ 513 | 52 514 | ], 515 | "slot_index": 0, 516 | "shape": 3 517 | } 518 | ], 519 | "properties": { 520 | "Node name for S&R": "KSampler" 521 | }, 522 | "widgets_values": [ 523 | 0, 524 | "fixed", 525 | 25, 526 | 4.5, 527 | "euler", 528 | "normal", 529 | 1 530 | ] 531 | } 532 | ], 533 | "links": [ 534 | [ 535 | 9, 536 | 8, 537 | 0, 538 | 9, 539 | 0, 540 | "IMAGE" 541 | ], 542 | [ 543 | 51, 544 | 27, 545 | 0, 546 | 31, 547 | 3, 548 | "LATENT" 549 | ], 550 | [ 551 | 52, 552 | 31, 553 | 0, 554 | 8, 555 | 0, 556 | "LATENT" 557 | ], 558 | [ 559 | 55, 560 | 33, 561 | 0, 562 | 31, 563 | 2, 564 | "CONDITIONING" 565 | ], 566 | [ 567 | 58, 568 | 6, 569 | 0, 570 | 31, 571 | 1, 572 | "CONDITIONING" 573 | ], 574 | [ 575 | 64, 576 | 42, 577 | 0, 578 | 6, 579 | 0, 580 | "CLIP" 581 | ], 582 | [ 583 | 65, 584 | 42, 585 | 0, 586 | 33, 587 | 0, 588 | "CLIP" 589 | ], 590 | [ 591 | 72, 592 | 45, 593 | 2, 594 | 8, 595 | 1, 596 | "VAE" 597 | ], 598 | [ 599 | 75, 600 | 46, 601 | 0, 602 | 47, 603 | 1, 604 | "IP_ADAPTER_SD3_INSTANTX" 605 | ], 606 | [ 607 | 77, 608 | 38, 609 | 0, 610 | 50, 611 | 1, 612 | "IMAGE" 613 | ], 614 | [ 615 | 78, 616 | 48, 617 | 0, 618 | 50, 
619 | 0, 620 | "CLIP_VISION" 621 | ], 622 | [ 623 | 79, 624 | 50, 625 | 0, 626 | 47, 627 | 2, 628 | "CLIP_VISION_OUTPUT" 629 | ], 630 | [ 631 | 80, 632 | 45, 633 | 0, 634 | 47, 635 | 0, 636 | "MODEL" 637 | ], 638 | [ 639 | 81, 640 | 47, 641 | 0, 642 | 31, 643 | 0, 644 | "MODEL" 645 | ] 646 | ], 647 | "groups": [], 648 | "config": {}, 649 | "extra": { 650 | "ds": { 651 | "scale": 0.6727499949325677, 652 | "offset": [ 653 | 739.7288455474428, 654 | 151.74268927695144 655 | ] 656 | }, 657 | "ue_links": [] 658 | }, 659 | "version": 0.4 660 | } --------------------------------------------------------------------------------