├── .github └── workflows │ └── publish.yml ├── LICENSE ├── README.md ├── __init__.py ├── models ├── jointblock.py └── resampler.py ├── nodes.py ├── pyproject.toml ├── requirements.txt └── workflows └── SD3.5L IP-Adapter.json /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Comfy registry 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "pyproject.toml" 9 | 10 | permissions: 11 | issues: write 12 | 13 | jobs: 14 | publish-node: 15 | name: Publish Custom Node to registry 16 | runs-on: ubuntu-latest 17 | if: ${{ github.repository_owner == 'Slickytail' }} 18 | steps: 19 | - name: Check out code 20 | uses: actions/checkout@v4 21 | - name: Publish Custom Node 22 | uses: Comfy-Org/publish-node-action@v1 23 | with: 24 | ## Add your own personal access token to your Github Repository secrets and reference it here. 25 | personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI-IPAdapter-SD3 2 | 3 | ComfyUI implementation of the [InstantX IP-Adapter for SD3.5 Large](https://huggingface.co/InstantX/SD3.5-Large-IP-Adapter). 
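For reference, the two downloads described under Installation below can also be scripted. The snippet that follows is only a sketch and not part of this repo: it assumes `huggingface_hub` is installed, that your ComfyUI checkout sits at `ComfyUI/`, and that the rehosted vision model still uses the filename referenced in the example workflow.

```python
# Hypothetical helper: fetch the adapter and vision-encoder weights with huggingface_hub.
# Adjust the local_dir paths to match your own ComfyUI installation.
from huggingface_hub import hf_hub_download

# InstantX IP-Adapter for SD3.5 Large -> models/ipadapter
hf_hub_download(
    repo_id="InstantX/SD3.5-Large-IP-Adapter",
    filename="ip-adapter.bin",
    local_dir="ComfyUI/models/ipadapter",
)

# ComfyUI rehost of the SigLIP vision encoder -> models/clip_vision
hf_hub_download(
    repo_id="Comfy-Org/sigclip_vision_384",
    filename="sigclip_vision_patch14_384.safetensors",
    local_dir="ComfyUI/models/clip_vision",
)
```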
4 | 5 | ## Installation 6 | 7 | Download [`ip-adapter.bin` from the original repository](https://huggingface.co/InstantX/SD3.5-Large-IP-Adapter/blob/main/ip-adapter.bin), and place it in the `models/ipadapter` folder of your ComfyUI installation. (I suggest renaming it to something easier to remember.) 8 | 9 | Download [`sigclip_vision_patch14_384.safetensors` from ComfyUI's rehost](https://huggingface.co/Comfy-Org/sigclip_vision_384) and place it in the `models/clip_vision` folder. 10 | The original model was trained on [google/siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384). To be honest, I'm not sure where the Comfy rehost model comes from, but it gives very similar results, so I suspect that it's a slightly modified version of the original Google model. 11 | 12 | ## Usage 13 | The IP-Adapter can be used with **Stable Diffusion 3.5 Large** and **Stable Diffusion 3.5 Large Turbo**. 14 | Please note that the model was originally trained on SD3.5 Large, so the adapter is less accurate when used with the Turbo model. 15 | An example workflow can be found in the `workflows` directory. 16 | 17 | I recommend using an image weight of 0.5. 18 | 19 | ## TODOs 20 | - Allow multiple adapters to be added together rather than overwriting each other. 21 | - Replace hardcoded parameters (such as hidden size/num layers) with values determined from the model. This would allow the same code to be used for future adapters, e.g. for SD3.5 Medium. 22 | - Convert the adapter to safetensors. 23 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS 2 | 3 | __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"] 4 | -------------------------------------------------------------------------------- /models/jointblock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | from einops import rearrange 5 | 6 | from comfy.ldm.modules.attention import optimized_attention 7 | from comfy.ldm.modules.diffusionmodules.mmdit import ( 8 | RMSNorm, 9 | JointBlock, 10 | ) 11 | 12 | 13 | class AdaLayerNorm(nn.Module): 14 | """ 15 | Adaptive layer norm (adaLN). In "normal" mode it applies a learned shift/scale conditioned on `emb`; in "zero" mode (adaLN-Zero) it additionally returns gate and MLP modulation values. 16 | 17 | Parameters: 18 | embedding_dim (`int`): The size of each embedding vector. 19 | time_embedding_dim (`int`, optional): The size of the conditioning (timestep) embedding; defaults to `embedding_dim`. 
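Example (illustrative only; 2432 and 1280 are the SD3.5-Large sizes hardcoded in nodes.py): norm = AdaLayerNorm(2432, time_embedding_dim=1280, mode="normal"); x = torch.randn(1, 64, 2432); t_emb = torch.randn(1, 1280); out = norm(x, emb=t_emb)  # out has shape (1, 64, 2432)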
20 | """ 21 | 22 | def __init__(self, embedding_dim: int, time_embedding_dim=None, mode="normal"): 23 | super().__init__() 24 | 25 | self.silu = nn.SiLU() 26 | num_params_dict = dict( 27 | zero=6, 28 | normal=2, 29 | ) 30 | num_params = num_params_dict[mode] 31 | self.linear = nn.Linear( 32 | time_embedding_dim or embedding_dim, num_params * embedding_dim, bias=True 33 | ) 34 | self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6) 35 | self.mode = mode 36 | 37 | def forward( 38 | self, 39 | x, 40 | hidden_dtype=None, 41 | emb=None, 42 | ): 43 | emb = self.linear(self.silu(emb)) 44 | if self.mode == "normal": 45 | shift_msa, scale_msa = emb.chunk(2, dim=1) 46 | x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None] 47 | return x 48 | 49 | elif self.mode == "zero": 50 | shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk( 51 | 6, dim=1 52 | ) 53 | x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None] 54 | return x, gate_msa, shift_mlp, scale_mlp, gate_mlp 55 | 56 | 57 | class IPAttnProcessor(nn.Module): 58 | 59 | def __init__( 60 | self, 61 | hidden_size=None, 62 | cross_attention_dim=None, 63 | ip_hidden_states_dim=None, 64 | ip_encoder_hidden_states_dim=None, 65 | head_dim=None, 66 | timesteps_emb_dim=1280, 67 | ): 68 | super().__init__() 69 | 70 | self.norm_ip = AdaLayerNorm( 71 | ip_hidden_states_dim, time_embedding_dim=timesteps_emb_dim 72 | ) 73 | self.to_k_ip = nn.Linear(ip_hidden_states_dim, hidden_size, bias=False) 74 | self.to_v_ip = nn.Linear(ip_hidden_states_dim, hidden_size, bias=False) 75 | self.norm_q = RMSNorm(head_dim, 1e-6) 76 | self.norm_k = RMSNorm(head_dim, 1e-6) 77 | self.norm_ip_k = RMSNorm(head_dim, 1e-6) 78 | 79 | def forward( 80 | self, 81 | ip_hidden_states, 82 | img_query, 83 | img_key=None, 84 | img_value=None, 85 | t_emb=None, 86 | n_heads=1, 87 | ): 88 | if ip_hidden_states is None: 89 | return None 90 | 91 | if not hasattr(self, "to_k_ip") or not hasattr(self, "to_v_ip"): 92 | return None 93 | 94 | # norm ip input 95 | norm_ip_hidden_states = self.norm_ip(ip_hidden_states, emb=t_emb) 96 | 97 | # to k and v 98 | ip_key = self.to_k_ip(norm_ip_hidden_states) 99 | ip_value = self.to_v_ip(norm_ip_hidden_states) 100 | 101 | # reshape 102 | img_query = rearrange(img_query, "b l (h d) -> b h l d", h=n_heads) 103 | img_key = rearrange(img_key, "b l (h d) -> b h l d", h=n_heads) 104 | # note that the image is in a different shape: b l h d 105 | # so we transpose to b h l d 106 | # or do we have to transpose here? 
107 | img_value = torch.transpose(img_value, 1, 2) 108 | ip_key = rearrange(ip_key, "b l (h d) -> b h l d", h=n_heads) 109 | ip_value = rearrange(ip_value, "b l (h d) -> b h l d", h=n_heads) 110 | 111 | # norm 112 | img_query = self.norm_q(img_query) 113 | img_key = self.norm_k(img_key) 114 | ip_key = self.norm_ip_k(ip_key) 115 | 116 | # cat img 117 | key = torch.cat([img_key, ip_key], dim=2) 118 | value = torch.cat([img_value, ip_value], dim=2) 119 | 120 | # 121 | ip_hidden_states = F.scaled_dot_product_attention( 122 | img_query, key, value, dropout_p=0.0, is_causal=False 123 | ) 124 | ip_hidden_states = rearrange(ip_hidden_states, "b h l d -> b l (h d)") 125 | ip_hidden_states = ip_hidden_states.to(img_query.dtype) 126 | return ip_hidden_states 127 | 128 | 129 | class JointBlockIPWrapper: 130 | """To be used as a patch_replace with Comfy""" 131 | 132 | def __init__( 133 | self, 134 | original_block: JointBlock, 135 | adapter: IPAttnProcessor, 136 | ip_options=None, 137 | ): 138 | self.original_block = original_block 139 | self.adapter = adapter 140 | if ip_options is None: 141 | ip_options = {} 142 | self.ip_options = ip_options 143 | 144 | def block_mixing(self, context, x, context_block, x_block, c): 145 | """ 146 | Comes from mmdit.py. Modified to add ipadapter attention. 147 | """ 148 | context_qkv, context_intermediates = context_block.pre_attention(context, c) 149 | 150 | if x_block.x_block_self_attn: 151 | x_qkv, x_qkv2, x_intermediates = x_block.pre_attention_x(x, c) 152 | else: 153 | x_qkv, x_intermediates = x_block.pre_attention(x, c) 154 | 155 | qkv = tuple(torch.cat((context_qkv[j], x_qkv[j]), dim=1) for j in range(3)) 156 | 157 | attn = optimized_attention( 158 | qkv[0], 159 | qkv[1], 160 | qkv[2], 161 | heads=x_block.attn.num_heads, 162 | ) 163 | context_attn, x_attn = ( 164 | attn[:, : context_qkv[0].shape[1]], 165 | attn[:, context_qkv[0].shape[1] :], 166 | ) 167 | # if the current timestep is not in the ipadapter enabling range, then the resampler wasn't run 168 | # and the hidden states will be None 169 | if ( 170 | self.ip_options["hidden_states"] is not None 171 | and self.ip_options["t_emb"] is not None 172 | ): 173 | # IP-Adapter 174 | ip_attn = self.adapter( 175 | self.ip_options["hidden_states"], 176 | *x_qkv, 177 | self.ip_options["t_emb"], 178 | x_block.attn.num_heads, 179 | ) 180 | x_attn = x_attn + ip_attn * self.ip_options["weight"] 181 | 182 | # Everything else is unchanged 183 | if not context_block.pre_only: 184 | context = context_block.post_attention(context_attn, *context_intermediates) 185 | 186 | else: 187 | context = None 188 | if x_block.x_block_self_attn: 189 | attn2 = optimized_attention( 190 | x_qkv2[0], 191 | x_qkv2[1], 192 | x_qkv2[2], 193 | heads=x_block.attn2.num_heads, 194 | ) 195 | x = x_block.post_attention_x(x_attn, attn2, *x_intermediates) 196 | else: 197 | x = x_block.post_attention(x_attn, *x_intermediates) 198 | return context, x 199 | 200 | def __call__(self, args, _): 201 | # Code from mmdit.py: 202 | # in this case, we're blocks_replace[("double_block", i)] 203 | # note that although we're passed the original block, 204 | # we can't actually get it from inside its wrapper 205 | # (which would simplify the whole code...) 
206 | # ``` 207 | # def block_wrap(args): 208 | # out = {} 209 | # out["txt"], out["img"] = self.joint_blocks[i](args["txt"], args["img"], c=args["vec"]) 210 | # return out 211 | # out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": c_mod}, {"original_block": block_wrap}) 212 | # context = out["txt"] 213 | # x = out["img"] 214 | # ``` 215 | c, x = self.block_mixing( 216 | args["txt"], 217 | args["img"], 218 | self.original_block.context_block, 219 | self.original_block.x_block, 220 | c=args["vec"], 221 | ) 222 | return {"txt": c, "img": x} 223 | -------------------------------------------------------------------------------- /models/resampler.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py 2 | import math 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from diffusers.models.embeddings import Timesteps, TimestepEmbedding 8 | 9 | 10 | # FFN 11 | def FeedForward(dim, mult=4): 12 | inner_dim = int(dim * mult) 13 | return nn.Sequential( 14 | nn.LayerNorm(dim), 15 | nn.Linear(dim, inner_dim, bias=False), 16 | nn.GELU(), 17 | nn.Linear(inner_dim, dim, bias=False), 18 | ) 19 | 20 | 21 | def reshape_tensor(x, heads): 22 | bs, length, width = x.shape 23 | # (bs, length, width) --> (bs, length, n_heads, dim_per_head) 24 | x = x.view(bs, length, heads, -1) 25 | # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) 26 | x = x.transpose(1, 2) 27 | # make contiguous, keeping the shape (bs, n_heads, length, dim_per_head) 28 | x = x.reshape(bs, heads, length, -1) 29 | return x 30 | 31 | 32 | class PerceiverAttention(nn.Module): 33 | def __init__(self, *, dim, dim_head=64, heads=8): 34 | super().__init__() 35 | self.scale = dim_head**-0.5 36 | self.dim_head = dim_head 37 | self.heads = heads 38 | inner_dim = dim_head * heads 39 | 40 | self.norm1 = nn.LayerNorm(dim) 41 | self.norm2 = nn.LayerNorm(dim) 42 | 43 | self.to_q = nn.Linear(dim, inner_dim, bias=False) 44 | self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) 45 | self.to_out = nn.Linear(inner_dim, dim, bias=False) 46 | 47 | def forward(self, x, latents, shift=None, scale=None): 48 | """ 49 | Args: 50 | x (torch.Tensor): image features 51 | shape (b, n1, D) 52 | latents (torch.Tensor): latent features 53 | shape (b, n2, D) 54 | """ 55 | x = self.norm1(x) 56 | latents = self.norm2(latents) 57 | 58 | if shift is not None and scale is not None: 59 | latents = latents * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) 60 | 61 | b, l, _ = latents.shape 62 | 63 | q = self.to_q(latents) 64 | kv_input = torch.cat((x, latents), dim=-2) 65 | k, v = self.to_kv(kv_input).chunk(2, dim=-1) 66 | 67 | q = reshape_tensor(q, self.heads) 68 | k = reshape_tensor(k, self.heads) 69 | v = reshape_tensor(v, self.heads) 70 | 71 | # attention 72 | scale = 1 / math.sqrt(math.sqrt(self.dim_head)) 73 | weight = (q * scale) @ (k * scale).transpose( 74 | -2, -1 75 | ) # More stable with f16 than dividing afterwards 76 | weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) 77 | out = weight @ v 78 | 79 | out = out.permute(0, 2, 1, 3).reshape(b, l, -1) 80 | 81 | return self.to_out(out) 82 | 83 | 84 | class Resampler(nn.Module): 85 | def __init__( 86 | self, 87 | dim=1024, 88 | depth=8, 89 | dim_head=64, 90 | heads=16, 91 | num_queries=8, 92 | embedding_dim=768, 93 | output_dim=1024, 94 | ff_mult=4, 95 | *args, 96 | **kwargs, 97 | ): 98 | super().__init__() 99 | 100 | 
self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5) 101 | 102 | self.proj_in = nn.Linear(embedding_dim, dim) 103 | 104 | self.proj_out = nn.Linear(dim, output_dim) 105 | self.norm_out = nn.LayerNorm(output_dim) 106 | 107 | self.layers = nn.ModuleList([]) 108 | for _ in range(depth): 109 | self.layers.append( 110 | nn.ModuleList( 111 | [ 112 | PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), 113 | FeedForward(dim=dim, mult=ff_mult), 114 | ] 115 | ) 116 | ) 117 | 118 | def forward(self, x): 119 | 120 | latents = self.latents.repeat(x.size(0), 1, 1) 121 | 122 | x = self.proj_in(x) 123 | 124 | for attn, ff in self.layers: 125 | latents = attn(x, latents) + latents 126 | latents = ff(latents) + latents 127 | 128 | latents = self.proj_out(latents) 129 | return self.norm_out(latents) 130 | 131 | 132 | class TimeResampler(nn.Module): 133 | def __init__( 134 | self, 135 | dim=1024, 136 | depth=8, 137 | dim_head=64, 138 | heads=16, 139 | num_queries=8, 140 | embedding_dim=768, 141 | output_dim=1024, 142 | ff_mult=4, 143 | timestep_in_dim=320, 144 | timestep_flip_sin_to_cos=True, 145 | timestep_freq_shift=0, 146 | ): 147 | super().__init__() 148 | 149 | self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5) 150 | 151 | self.proj_in = nn.Linear(embedding_dim, dim) 152 | 153 | self.proj_out = nn.Linear(dim, output_dim) 154 | self.norm_out = nn.LayerNorm(output_dim) 155 | 156 | self.layers = nn.ModuleList([]) 157 | for _ in range(depth): 158 | self.layers.append( 159 | nn.ModuleList( 160 | [ 161 | # msa 162 | PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), 163 | # ff 164 | FeedForward(dim=dim, mult=ff_mult), 165 | # adaLN 166 | nn.Sequential(nn.SiLU(), nn.Linear(dim, 4 * dim, bias=True)), 167 | ] 168 | ) 169 | ) 170 | 171 | # time 172 | self.time_proj = Timesteps( 173 | timestep_in_dim, timestep_flip_sin_to_cos, timestep_freq_shift 174 | ) 175 | self.time_embedding = TimestepEmbedding(timestep_in_dim, dim, act_fn="silu") 176 | 177 | # adaLN 178 | # self.adaLN_modulation = nn.Sequential( 179 | # nn.SiLU(), 180 | # nn.Linear(timestep_out_dim, 6 * timestep_out_dim, bias=True) 181 | # ) 182 | 183 | def forward(self, x, timestep, need_temb=False): 184 | timestep_emb = self.embedding_time(x, timestep) # bs, dim 185 | 186 | latents = self.latents.repeat(x.size(0), 1, 1) 187 | 188 | x = self.proj_in(x) 189 | x = x + timestep_emb[:, None] 190 | 191 | for attn, ff, adaLN_modulation in self.layers: 192 | shift_msa, scale_msa, shift_mlp, scale_mlp = adaLN_modulation( 193 | timestep_emb 194 | ).chunk(4, dim=1) 195 | latents = attn(x, latents, shift_msa, scale_msa) + latents 196 | 197 | res = latents 198 | for idx_ff in range(len(ff)): 199 | layer_ff = ff[idx_ff] 200 | latents = layer_ff(latents) 201 | if idx_ff == 0 and isinstance(layer_ff, nn.LayerNorm): # adaLN 202 | latents = latents * ( 203 | 1 + scale_mlp.unsqueeze(1) 204 | ) + shift_mlp.unsqueeze(1) 205 | latents = latents + res 206 | 207 | # latents = ff(latents) + latents 208 | 209 | latents = self.proj_out(latents) 210 | latents = self.norm_out(latents) 211 | 212 | if need_temb: 213 | return latents, timestep_emb 214 | else: 215 | return latents 216 | 217 | def embedding_time(self, sample, timestep): 218 | 219 | # 1. time 220 | timesteps = timestep 221 | if not torch.is_tensor(timesteps): 222 | # TODO: this requires sync between CPU and GPU. 
So try to pass timesteps as tensors if you can 223 | # This would be a good case for the `match` statement (Python 3.10+) 224 | is_mps = sample.device.type == "mps" 225 | if isinstance(timestep, float): 226 | dtype = torch.float32 if is_mps else torch.float64 227 | else: 228 | dtype = torch.int32 if is_mps else torch.int64 229 | timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) 230 | elif len(timesteps.shape) == 0: 231 | timesteps = timesteps[None].to(sample.device) 232 | 233 | # broadcast to batch dimension in a way that's compatible with ONNX/Core ML 234 | timesteps = timesteps.expand(sample.shape[0]) 235 | 236 | t_emb = self.time_proj(timesteps) 237 | 238 | # timesteps does not contain any weights and will always return f32 tensors 239 | # but time_embedding might actually be running in fp16. so we need to cast here. 240 | # there might be better ways to encapsulate this. 241 | t_emb = t_emb.to(dtype=sample.dtype) 242 | 243 | emb = self.time_embedding(t_emb, None) 244 | return emb 245 | -------------------------------------------------------------------------------- /nodes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | import torch 5 | import folder_paths 6 | 7 | from .models.resampler import TimeResampler 8 | from .models.jointblock import JointBlockIPWrapper, IPAttnProcessor 9 | 10 | MODELS_DIR = os.path.join(folder_paths.models_dir, "ipadapter") 11 | if "ipadapter" not in folder_paths.folder_names_and_paths: 12 | current_paths = [MODELS_DIR] 13 | else: 14 | current_paths, _ = folder_paths.folder_names_and_paths["ipadapter"] 15 | folder_paths.folder_names_and_paths["ipadapter"] = ( 16 | current_paths, 17 | folder_paths.supported_pt_extensions, 18 | ) 19 | 20 | 21 | def patch( 22 | patcher, 23 | ip_procs, 24 | resampler: TimeResampler, 25 | clip_embeds, 26 | weight=1.0, 27 | start=0.0, 28 | end=1.0, 29 | ): 30 | """ 31 | Patches a model_sampler to add the ipadapter 32 | """ 33 | mmdit = patcher.model.diffusion_model 34 | timestep_schedule_max = patcher.model.model_config.sampling_settings.get( 35 | "timesteps", 1000 36 | ) 37 | # hook the model's forward function 38 | # so that when it gets called, we can grab the timestep and send it to the resampler 39 | ip_options = { 40 | "hidden_states": None, 41 | "t_emb": None, 42 | "weight": weight, 43 | } 44 | 45 | def ddit_wrapper(forward, args): 46 | # this is between 0 and 1, so the adapters can calculate start_point and end_point 47 | # actually, do we need to get the sigma value instead? 
48 | t_percent = 1 - args["timestep"].flatten()[0].cpu().item() 49 | if start <= t_percent <= end: 50 | batch_size = args["input"].shape[0] // len(args["cond_or_uncond"]) 51 | # if we're only doing cond or only doing uncond, only pass one of them through the resampler 52 | embeds = clip_embeds[args["cond_or_uncond"]] 53 | # slight efficiency optimization todo: pass the embeds through and then afterwards 54 | # repeat to the batch size 55 | embeds = torch.repeat_interleave(embeds, batch_size, dim=0) 56 | # the resampler wants between 0 and MAX_STEPS 57 | timestep = args["timestep"] * timestep_schedule_max 58 | image_emb, t_emb = resampler(embeds, timestep, need_temb=True) 59 | # these will need to be accessible to the IPAdapters 60 | ip_options["hidden_states"] = image_emb 61 | ip_options["t_emb"] = t_emb 62 | else: 63 | ip_options["hidden_states"] = None 64 | ip_options["t_emb"] = None 65 | 66 | return forward(args["input"], args["timestep"], **args["c"]) 67 | 68 | patcher.set_model_unet_function_wrapper(ddit_wrapper) 69 | # patch each dit block 70 | for i, block in enumerate(mmdit.joint_blocks): 71 | wrapper = JointBlockIPWrapper(block, ip_procs[i], ip_options) 72 | patcher.set_model_patch_replace(wrapper, "dit", "double_block", i) 73 | 74 | 75 | class SD3IPAdapter: 76 | def __init__(self, checkpoint: str, device): 77 | self.device = device 78 | # load the checkpoint right away 79 | self.state_dict = torch.load( 80 | os.path.join(MODELS_DIR, checkpoint), 81 | map_location=self.device, 82 | weights_only=True, 83 | ) 84 | # todo: infer some of the params from the checkpoint instead of hardcoded 85 | self.resampler = TimeResampler( 86 | dim=1280, 87 | depth=4, 88 | dim_head=64, 89 | heads=20, 90 | num_queries=64, 91 | embedding_dim=1152, 92 | output_dim=2432, 93 | ff_mult=4, 94 | timestep_in_dim=320, 95 | timestep_flip_sin_to_cos=True, 96 | timestep_freq_shift=0, 97 | ) 98 | self.resampler.eval() 99 | self.resampler.to(self.device, dtype=torch.float16) 100 | self.resampler.load_state_dict(self.state_dict["image_proj"]) 101 | 102 | # now we'll create the attention processors 103 | # ip_adapter.keys looks like [0.proj, 0.to_k, ..., 1.proj, 1.to_k, ...] 
104 | n_procs = len( 105 | set(x.split(".")[0] for x in self.state_dict["ip_adapter"].keys()) 106 | ) 107 | self.procs = torch.nn.ModuleList( 108 | [ 109 | # this is hardcoded for SD3.5L 110 | IPAttnProcessor( 111 | hidden_size=2432, 112 | cross_attention_dim=2432, 113 | ip_hidden_states_dim=2432, 114 | ip_encoder_hidden_states_dim=2432, 115 | head_dim=64, 116 | timesteps_emb_dim=1280, 117 | ).to(self.device, dtype=torch.float16) 118 | for _ in range(n_procs) 119 | ] 120 | ) 121 | self.procs.load_state_dict(self.state_dict["ip_adapter"]) 122 | 123 | 124 | class IPAdapterSD3Loader: 125 | @classmethod 126 | def INPUT_TYPES(s): 127 | return { 128 | "required": { 129 | "ipadapter": (folder_paths.get_filename_list("ipadapter"),), 130 | "provider": (["cuda", "cpu", "mps"],), 131 | } 132 | } 133 | 134 | RETURN_TYPES = ("IP_ADAPTER_SD3_INSTANTX",) 135 | RETURN_NAMES = ("ipadapter",) 136 | FUNCTION = "load_model" 137 | CATEGORY = "InstantXNodes" 138 | 139 | def load_model(self, ipadapter, provider): 140 | logging.info("Loading InstantX IPAdapter SD3 model.") 141 | model = SD3IPAdapter(ipadapter, provider) 142 | return (model,) 143 | 144 | 145 | class ApplyIPAdapterSD3: 146 | @classmethod 147 | def INPUT_TYPES(s): 148 | return { 149 | "required": { 150 | "model": ("MODEL",), 151 | "ipadapter": ("IP_ADAPTER_SD3_INSTANTX",), 152 | "image_embed": ("CLIP_VISION_OUTPUT",), 153 | "weight": ( 154 | "FLOAT", 155 | {"default": 1.0, "min": -1.0, "max": 5.0, "step": 0.05}, 156 | ), 157 | "start_percent": ( 158 | "FLOAT", 159 | {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}, 160 | ), 161 | "end_percent": ( 162 | "FLOAT", 163 | {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}, 164 | ), 165 | }, 166 | } 167 | 168 | RETURN_TYPES = ("MODEL",) 169 | FUNCTION = "apply_ipadapter" 170 | CATEGORY = "InstantXNodes" 171 | 172 | def apply_ipadapter( 173 | self, model, ipadapter, image_embed, weight, start_percent, end_percent 174 | ): 175 | # set model 176 | new_model = model.clone() 177 | # add uncond embedding 178 | image_embed = image_embed.penultimate_hidden_states 179 | embeds = torch.cat([image_embed, torch.zeros_like(image_embed)], dim=0).to( 180 | ipadapter.device, dtype=torch.float16 181 | ) 182 | patch( 183 | new_model, 184 | ipadapter.procs, 185 | ipadapter.resampler, 186 | embeds, 187 | weight=weight, 188 | start=start_percent, 189 | end=end_percent, 190 | ) 191 | return (new_model,) 192 | 193 | 194 | NODE_CLASS_MAPPINGS = { 195 | "IPAdapterSD3Loader": IPAdapterSD3Loader, 196 | "ApplyIPAdapterSD3": ApplyIPAdapterSD3, 197 | } 198 | 199 | NODE_DISPLAY_NAME_MAPPINGS = { 200 | "IPAdapterSD3Loader": "Load IPAdapter SD3 Model", 201 | "ApplyIPAdapterSD3": "Apply IPAdapter SD3 Model", 202 | } 203 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "comfyui-instantx-ipadapter-sd3" 3 | description = "ComfyUI implementation of the [a/InstantX IP-Adapter for SD3.5 Large](https://huggingface.co/InstantX/SD3.5-Large-IP-Adapter)." 
4 | version = "1.0.0" 5 | license = {file = "LICENSE"} 6 | dependencies = ["torch", "einops", "diffusers"] 7 | 8 | [project.urls] 9 | Repository = "https://github.com/Slickytail/ComfyUI-InstantX-IPAdapter-SD3" 10 | # Used by Comfy Registry https://comfyregistry.org 11 | 12 | [tool.comfy] 13 | PublisherId = "slickytail" 14 | DisplayName = "ComfyUI-InstantX-IPAdapter-SD3" 15 | Icon = "" 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | einops 3 | diffusers 4 | -------------------------------------------------------------------------------- /workflows/SD3.5L IP-Adapter.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 50, 3 | "last_link_id": 81, 4 | "nodes": [ 5 | { 6 | "id": 8, 7 | "type": "VAEDecode", 8 | "pos": [ 9 | 1151, 10 | 195 11 | ], 12 | "size": [ 13 | 210, 14 | 46 15 | ], 16 | "flags": {}, 17 | "order": 11, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "samples", 22 | "type": "LATENT", 23 | "link": 52 24 | }, 25 | { 26 | "name": "vae", 27 | "type": "VAE", 28 | "link": 72 29 | } 30 | ], 31 | "outputs": [ 32 | { 33 | "name": "IMAGE", 34 | "type": "IMAGE", 35 | "links": [ 36 | 9 37 | ], 38 | "slot_index": 0 39 | } 40 | ], 41 | "properties": { 42 | "Node name for S&R": "VAEDecode" 43 | }, 44 | "widgets_values": [] 45 | }, 46 | { 47 | "id": 33, 48 | "type": "CLIPTextEncode", 49 | "pos": [ 50 | 312.6510009765625, 51 | 394.7559814453125 52 | ], 53 | "size": [ 54 | 422.84503173828125, 55 | 164.31304931640625 56 | ], 57 | "flags": { 58 | "collapsed": true 59 | }, 60 | "order": 7, 61 | "mode": 0, 62 | "inputs": [ 63 | { 64 | "name": "clip", 65 | "type": "CLIP", 66 | "link": 65, 67 | "slot_index": 0 68 | } 69 | ], 70 | "outputs": [ 71 | { 72 | "name": "CONDITIONING", 73 | "type": "CONDITIONING", 74 | "links": [ 75 | 55 76 | ], 77 | "slot_index": 0 78 | } 79 | ], 80 | "title": "CLIP Text Encode (Negative Prompt)", 81 | "properties": { 82 | "Node name for S&R": "CLIPTextEncode" 83 | }, 84 | "widgets_values": [ 85 | "" 86 | ], 87 | "color": "#322", 88 | "bgcolor": "#533" 89 | }, 90 | { 91 | "id": 27, 92 | "type": "EmptySD3LatentImage", 93 | "pos": [ 94 | 372.675048828125, 95 | 451.0669860839844 96 | ], 97 | "size": [ 98 | 315, 99 | 106 100 | ], 101 | "flags": {}, 102 | "order": 0, 103 | "mode": 0, 104 | "inputs": [], 105 | "outputs": [ 106 | { 107 | "name": "LATENT", 108 | "type": "LATENT", 109 | "links": [ 110 | 51 111 | ], 112 | "slot_index": 0, 113 | "shape": 3 114 | } 115 | ], 116 | "properties": { 117 | "Node name for S&R": "EmptySD3LatentImage" 118 | }, 119 | "widgets_values": [ 120 | 1024, 121 | 1024, 122 | 1 123 | ], 124 | "color": "#323", 125 | "bgcolor": "#535" 126 | }, 127 | { 128 | "id": 42, 129 | "type": "TripleCLIPLoader", 130 | "pos": [ 131 | -210.8244171142578, 132 | 375.64593505859375 133 | ], 134 | "size": [ 135 | 315, 136 | 106 137 | ], 138 | "flags": {}, 139 | "order": 1, 140 | "mode": 0, 141 | "inputs": [], 142 | "outputs": [ 143 | { 144 | "name": "CLIP", 145 | "type": "CLIP", 146 | "links": [ 147 | 64, 148 | 65 149 | ], 150 | "slot_index": 0 151 | } 152 | ], 153 | "properties": { 154 | "Node name for S&R": "TripleCLIPLoader" 155 | }, 156 | "widgets_values": [ 157 | "clip_g.safetensors", 158 | "clip_l.safetensors", 159 | "t5xxl_fp16.safetensors" 160 | ] 161 | }, 162 | { 163 | "id": 6, 164 | "type": "CLIPTextEncode", 165 | "pos": [ 166 | 258.14404296875, 167 | 
159.2249755859375 168 | ], 169 | "size": [ 170 | 422.84503173828125, 171 | 164.31304931640625 172 | ], 173 | "flags": {}, 174 | "order": 6, 175 | "mode": 0, 176 | "inputs": [ 177 | { 178 | "name": "clip", 179 | "type": "CLIP", 180 | "link": 64 181 | } 182 | ], 183 | "outputs": [ 184 | { 185 | "name": "CONDITIONING", 186 | "type": "CONDITIONING", 187 | "links": [ 188 | 58 189 | ], 190 | "slot_index": 0 191 | } 192 | ], 193 | "title": "CLIP Text Encode (Positive Prompt)", 194 | "properties": { 195 | "Node name for S&R": "CLIPTextEncode" 196 | }, 197 | "widgets_values": [ 198 | "a man with red hair" 199 | ], 200 | "color": "#232", 201 | "bgcolor": "#353" 202 | }, 203 | { 204 | "id": 45, 205 | "type": "CheckpointLoaderSimple", 206 | "pos": [ 207 | -205.12525939941406, 208 | 203.60665893554688 209 | ], 210 | "size": [ 211 | 315, 212 | 98 213 | ], 214 | "flags": {}, 215 | "order": 2, 216 | "mode": 0, 217 | "inputs": [], 218 | "outputs": [ 219 | { 220 | "name": "MODEL", 221 | "type": "MODEL", 222 | "links": [ 223 | 80 224 | ], 225 | "slot_index": 0 226 | }, 227 | { 228 | "name": "CLIP", 229 | "type": "CLIP", 230 | "links": null 231 | }, 232 | { 233 | "name": "VAE", 234 | "type": "VAE", 235 | "links": [ 236 | 72 237 | ], 238 | "slot_index": 2 239 | } 240 | ], 241 | "properties": { 242 | "Node name for S&R": "CheckpointLoaderSimple" 243 | }, 244 | "widgets_values": [ 245 | "SD3.5-L.safetensors" 246 | ] 247 | }, 248 | { 249 | "id": 47, 250 | "type": "ApplyIPAdapterSD3", 251 | "pos": [ 252 | 401.8241882324219, 253 | 623.6758422851562 254 | ], 255 | "size": [ 256 | 315, 257 | 146 258 | ], 259 | "flags": {}, 260 | "order": 9, 261 | "mode": 0, 262 | "inputs": [ 263 | { 264 | "name": "model", 265 | "type": "MODEL", 266 | "link": 80 267 | }, 268 | { 269 | "name": "ipadapter", 270 | "type": "IP_ADAPTER_SD3_INSTANTX", 271 | "link": 75 272 | }, 273 | { 274 | "name": "image_embed", 275 | "type": "CLIP_VISION_OUTPUT", 276 | "link": 79 277 | } 278 | ], 279 | "outputs": [ 280 | { 281 | "name": "MODEL", 282 | "type": "MODEL", 283 | "links": [ 284 | 81 285 | ], 286 | "slot_index": 0 287 | } 288 | ], 289 | "properties": { 290 | "Node name for S&R": "ApplyIPAdapterSD3" 291 | }, 292 | "widgets_values": [ 293 | 0.5, 294 | 0, 295 | 1 296 | ] 297 | }, 298 | { 299 | "id": 38, 300 | "type": "LoadImage", 301 | "pos": [ 302 | -193.93606567382812, 303 | 539.2610473632812 304 | ], 305 | "size": [ 306 | 315, 307 | 314 308 | ], 309 | "flags": {}, 310 | "order": 3, 311 | "mode": 0, 312 | "inputs": [], 313 | "outputs": [ 314 | { 315 | "name": "IMAGE", 316 | "type": "IMAGE", 317 | "links": [ 318 | 77 319 | ], 320 | "slot_index": 0 321 | }, 322 | { 323 | "name": "MASK", 324 | "type": "MASK", 325 | "links": null 326 | } 327 | ], 328 | "properties": { 329 | "Node name for S&R": "LoadImage" 330 | }, 331 | "widgets_values": [ 332 | "oppenheimer.png", 333 | "image" 334 | ] 335 | }, 336 | { 337 | "id": 50, 338 | "type": "CLIPVisionEncode", 339 | "pos": [ 340 | -195.75955200195312, 341 | 904.2363891601562 342 | ], 343 | "size": [ 344 | 380.4000244140625, 345 | 78 346 | ], 347 | "flags": {}, 348 | "order": 8, 349 | "mode": 0, 350 | "inputs": [ 351 | { 352 | "name": "clip_vision", 353 | "type": "CLIP_VISION", 354 | "link": 78 355 | }, 356 | { 357 | "name": "image", 358 | "type": "IMAGE", 359 | "link": 77 360 | } 361 | ], 362 | "outputs": [ 363 | { 364 | "name": "CLIP_VISION_OUTPUT", 365 | "type": "CLIP_VISION_OUTPUT", 366 | "links": [ 367 | 79 368 | ], 369 | "slot_index": 0 370 | } 371 | ], 372 | "properties": { 373 | "Node name for S&R": 
"CLIPVisionEncode" 374 | }, 375 | "widgets_values": [ 376 | "center" 377 | ] 378 | }, 379 | { 380 | "id": 48, 381 | "type": "CLIPVisionLoader", 382 | "pos": [ 383 | -174.8817596435547, 384 | 1020.2452392578125 385 | ], 386 | "size": [ 387 | 315, 388 | 58 389 | ], 390 | "flags": {}, 391 | "order": 4, 392 | "mode": 0, 393 | "inputs": [], 394 | "outputs": [ 395 | { 396 | "name": "CLIP_VISION", 397 | "type": "CLIP_VISION", 398 | "links": [ 399 | 78 400 | ] 401 | } 402 | ], 403 | "properties": { 404 | "Node name for S&R": "CLIPVisionLoader" 405 | }, 406 | "widgets_values": [ 407 | "sigclip_vision_patch14_384.safetensors" 408 | ] 409 | }, 410 | { 411 | "id": 46, 412 | "type": "IPAdapterSD3Loader", 413 | "pos": [ 414 | 393.33660888671875, 415 | 859.3168334960938 416 | ], 417 | "size": [ 418 | 315, 419 | 82 420 | ], 421 | "flags": {}, 422 | "order": 5, 423 | "mode": 0, 424 | "inputs": [], 425 | "outputs": [ 426 | { 427 | "name": "ipadapter", 428 | "type": "IP_ADAPTER_SD3_INSTANTX", 429 | "links": [ 430 | 75 431 | ], 432 | "slot_index": 0 433 | } 434 | ], 435 | "properties": { 436 | "Node name for S&R": "IPAdapterSD3Loader" 437 | }, 438 | "widgets_values": [ 439 | "ip_sd35l_instantx.bin", 440 | "cuda" 441 | ] 442 | }, 443 | { 444 | "id": 9, 445 | "type": "SaveImage", 446 | "pos": [ 447 | 836.613037109375, 448 | 534.8397827148438 449 | ], 450 | "size": [ 451 | 560.4580688476562, 452 | 558.8997192382812 453 | ], 454 | "flags": {}, 455 | "order": 12, 456 | "mode": 0, 457 | "inputs": [ 458 | { 459 | "name": "images", 460 | "type": "IMAGE", 461 | "link": 9 462 | } 463 | ], 464 | "outputs": [], 465 | "properties": { 466 | "Node name for S&R": "SaveImage" 467 | }, 468 | "widgets_values": [ 469 | "ComfyUI" 470 | ] 471 | }, 472 | { 473 | "id": 31, 474 | "type": "KSampler", 475 | "pos": [ 476 | 816, 477 | 192 478 | ], 479 | "size": [ 480 | 315, 481 | 262 482 | ], 483 | "flags": {}, 484 | "order": 10, 485 | "mode": 0, 486 | "inputs": [ 487 | { 488 | "name": "model", 489 | "type": "MODEL", 490 | "link": 81 491 | }, 492 | { 493 | "name": "positive", 494 | "type": "CONDITIONING", 495 | "link": 58 496 | }, 497 | { 498 | "name": "negative", 499 | "type": "CONDITIONING", 500 | "link": 55 501 | }, 502 | { 503 | "name": "latent_image", 504 | "type": "LATENT", 505 | "link": 51 506 | } 507 | ], 508 | "outputs": [ 509 | { 510 | "name": "LATENT", 511 | "type": "LATENT", 512 | "links": [ 513 | 52 514 | ], 515 | "slot_index": 0, 516 | "shape": 3 517 | } 518 | ], 519 | "properties": { 520 | "Node name for S&R": "KSampler" 521 | }, 522 | "widgets_values": [ 523 | 0, 524 | "fixed", 525 | 25, 526 | 4.5, 527 | "euler", 528 | "normal", 529 | 1 530 | ] 531 | } 532 | ], 533 | "links": [ 534 | [ 535 | 9, 536 | 8, 537 | 0, 538 | 9, 539 | 0, 540 | "IMAGE" 541 | ], 542 | [ 543 | 51, 544 | 27, 545 | 0, 546 | 31, 547 | 3, 548 | "LATENT" 549 | ], 550 | [ 551 | 52, 552 | 31, 553 | 0, 554 | 8, 555 | 0, 556 | "LATENT" 557 | ], 558 | [ 559 | 55, 560 | 33, 561 | 0, 562 | 31, 563 | 2, 564 | "CONDITIONING" 565 | ], 566 | [ 567 | 58, 568 | 6, 569 | 0, 570 | 31, 571 | 1, 572 | "CONDITIONING" 573 | ], 574 | [ 575 | 64, 576 | 42, 577 | 0, 578 | 6, 579 | 0, 580 | "CLIP" 581 | ], 582 | [ 583 | 65, 584 | 42, 585 | 0, 586 | 33, 587 | 0, 588 | "CLIP" 589 | ], 590 | [ 591 | 72, 592 | 45, 593 | 2, 594 | 8, 595 | 1, 596 | "VAE" 597 | ], 598 | [ 599 | 75, 600 | 46, 601 | 0, 602 | 47, 603 | 1, 604 | "IP_ADAPTER_SD3_INSTANTX" 605 | ], 606 | [ 607 | 77, 608 | 38, 609 | 0, 610 | 50, 611 | 1, 612 | "IMAGE" 613 | ], 614 | [ 615 | 78, 616 | 48, 617 | 0, 618 | 50, 
619 | 0, 620 | "CLIP_VISION" 621 | ], 622 | [ 623 | 79, 624 | 50, 625 | 0, 626 | 47, 627 | 2, 628 | "CLIP_VISION_OUTPUT" 629 | ], 630 | [ 631 | 80, 632 | 45, 633 | 0, 634 | 47, 635 | 0, 636 | "MODEL" 637 | ], 638 | [ 639 | 81, 640 | 47, 641 | 0, 642 | 31, 643 | 0, 644 | "MODEL" 645 | ] 646 | ], 647 | "groups": [], 648 | "config": {}, 649 | "extra": { 650 | "ds": { 651 | "scale": 0.6727499949325677, 652 | "offset": [ 653 | 739.7288455474428, 654 | 151.74268927695144 655 | ] 656 | }, 657 | "ue_links": [] 658 | }, 659 | "version": 0.4 660 | } --------------------------------------------------------------------------------