├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── conditioning ├── embeddings.py ├── fusers_patch.py └── instance_conditioning.py ├── constants.py ├── example_workflows ├── Spline_Editor_InstanceDiffusion_kijai_01.json └── fourpeople_workflow.json ├── model_helpers ├── prepare_fusers.py ├── prepare_positionnet.py └── prepare_scaleu.py ├── modules ├── convnext.py ├── scaleu.py ├── text_grounding_net.py ├── text_grounding_tokenizer_input.py └── util.py ├── nodes ├── apply_scaleu_model_node.py ├── download_and_load_models.py ├── instance_diffusion_tracking_prompt_node.py ├── load_instance_fusers_node.py ├── load_instance_positionnet_node.py └── load_instance_scaleu_node.py ├── pyproject.toml ├── requirements.txt └── utils ├── decode_item.py ├── model_utils.py └── prompt_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .vscode 3 | **/__pycache__ 4 | **/*.ckpt 5 | .DS_Store 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI-InstanceDiffusion 2 | ComfyUI nodes to use InstanceDiffusion. 
3 | 4 | Original research repo: https://github.com/frank-xwang/InstanceDiffusion 5 | 6 | ## Table of Contents 7 | - [Installation](#installation) 8 | - [How to Install](#how-to-install) 9 | - [How to Configure Models](#how-to-configure-models) 10 | - [Accompanying Node Repos](#accompanying-node-repos) 11 | - [Examples](#examples) 12 | - [Acknowledgements](#acknowledgements) 13 | 14 | ## Installation 15 | 16 | ### How to Install 17 | Clone or download this repo into your `ComfyUI/custom_nodes/` directory. 18 | There are no Python package requirements outside of the standard ComfyUI requirements at this time. 19 | 20 | ### How to Configure Models 21 | These models were trained by [frank-xwang](https://github.com/frank-xwang) and were originally baked into Stable Diffusion 1.5. They have been spliced out into individual models so they can be used with other SD1.5 checkpoints. 22 | Download each of the checkpoints below and place it in its Installation Directory under `ComfyUI/models/instance_models/`. 23 | 24 | | Model Name | URL | Installation Directory | 25 | |------------|-----|------------------------| 26 | | fusers.ckpt | [huggingface](https://huggingface.co/spaces/logtd/instancediffusion/blob/main/fusers.ckpt) | `instance_models/fuser_models/` | 27 | | positionnet.ckpt | [huggingface](https://huggingface.co/spaces/logtd/instancediffusion/blob/main/position_net.ckpt) | `instance_models/positionnet_models/` | 28 | | scaleu.ckpt | [huggingface](https://huggingface.co/spaces/logtd/instancediffusion/blob/main/scaleu.ckpt) | `instance_models/scaleu_models/` | 29 |
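If you prefer to script the downloads instead of fetching the files by hand, here is a minimal, untested sketch using `huggingface_hub` (not one of this repo's requirements, so install it separately); the `DownloadInstanceDiffusionModels` node registered in `__init__.py` offers a similar convenience from inside ComfyUI. The `models_root` path and the helper itself are illustrative assumptions; only the repo id, filenames, and target subfolders come from the table above.

```python
# Optional helper sketch: fetch the three InstanceDiffusion checkpoints.
# Assumes `pip install huggingface_hub` and a default ComfyUI folder layout.
from pathlib import Path
from huggingface_hub import hf_hub_download

models_root = Path("ComfyUI/models/instance_models")  # adjust to your install

checkpoints = {
    "fusers.ckpt": "fuser_models",
    "position_net.ckpt": "positionnet_models",
    "scaleu.ckpt": "scaleu_models",
}

for filename, subdir in checkpoints.items():
    # The checkpoints are hosted in the logtd/instancediffusion Hugging Face Space.
    hf_hub_download(
        repo_id="logtd/instancediffusion",
        repo_type="space",
        filename=filename,
        local_dir=models_root / subdir,
    )
```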
30 | 31 | ## Accompanying Node Repos 32 | * [KJNodes for BBoxes](https://github.com/kijai/ComfyUI-KJNodes) 33 | * [Tracking Nodes for videos](https://github.com/logtd/ComfyUI-TrackingNodes) 34 | * [AnimateDiff-Evolved](https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved) 35 | * [Video Helper Suite](https://github.com/Kosinkadink/ComfyUI-VideoHelperSuite) 36 | 37 | ## Examples 38 | 39 | ### Text2Vid example using [Kijai](https://github.com/kijai)'s Spline Editor 40 | ![spline_editor_instances](https://github.com/logtd/ComfyUI-InstanceDiffusion/assets/160989552/8830e2e7-b0c3-4f4f-95b7-12ee21997fb1) 41 | 42 | 43 | ### Vid2Vid examples 44 | Example workflows can be found in the `example_workflows/` directory. 45 | 46 | https://github.com/logtd/ComfyUI-InstanceDiffusion/assets/160989552/ee42891a-cc38-421c-98bf-03a1be11d315 47 | 48 | https://github.com/logtd/ComfyUI-InstanceDiffusion/assets/160989552/40038526-5850-4cb6-9658-c38c7e4b20f9 49 | 50 | https://github.com/logtd/ComfyUI-InstanceDiffusion/assets/160989552/eae3520c-9a3d-4cde-b32f-1af9231ad2d4 51 | 52 | https://github.com/logtd/ComfyUI-InstanceDiffusion/assets/160989552/85b7d9df-7f7e-43c7-b2fa-b14fd5ec5e6d 53 | 54 | ## Unsupported Features 55 | InstanceDiffusion supports a wide range of inputs. The following input types do not yet have nodes that can convert them into InstanceDiffusion conditioning: 56 | * Scribbles 57 | * Points 58 | * Segments 59 | * Masks 60 | 61 | Points, segments, and masks are planned once proper tracking for these input types is implemented in ComfyUI. 62 | 63 | ## Acknowledgements 64 | * [frank-xwang](https://github.com/frank-xwang) for creating the original repo, training models, etc. 65 | * [Kosinkadink](https://github.com/Kosinkadink) for creating AnimateDiff-Evolved and providing support on integration 66 | * [Kijai](https://github.com/kijai) for improving the speed and adding tracking nodes 67 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .nodes.apply_scaleu_model_node import ApplyScaleUModelNode 2 | from .nodes.load_instance_scaleu_node import LoadInstanceScaleUNode 3 | from .nodes.load_instance_fusers_node import LoadInstanceFusersNode 4 | from .nodes.load_instance_positionnet_node import LoadInstancePositionNetNode 5 | from .nodes.instance_diffusion_tracking_prompt_node import InstanceDiffusionTrackingPromptNode 6 | from .nodes.download_and_load_models import DownloadInstanceDiffusionModels 7 | 8 | 9 | NODE_CLASS_MAPPINGS = { 10 | "ApplyScaleUModelNode": ApplyScaleUModelNode, 11 | "LoadInstanceScaleUNode": LoadInstanceScaleUNode, 12 | "LoadInstancePositionNetModel": LoadInstancePositionNetNode, 13 | "LoadInstanceFusersNode": LoadInstanceFusersNode, 14 | "InstanceDiffusionTrackingPrompt": InstanceDiffusionTrackingPromptNode, 15 | "DownloadInstanceDiffusionModels": DownloadInstanceDiffusionModels 16 | } 17 | 18 | NODE_DISPLAY_NAME_MAPPINGS = { 19 | "ApplyScaleUModelNode": "Apply Instance Diffusion ScaleU", 20 | "LoadInstancePositionNetModel": "Load Instance PositionNet Model", 21 | "LoadInstanceScaleUNode": "Load Instance ScaleU Model", 22 | "LoadInstanceFusersNode": "Load Instance Fusers Model", 23 | "InstanceDiffusionTrackingPrompt": "Instance Diffusion Tracking Prompt", 24 | "DownloadInstanceDiffusionModels": "(Down)Load Instance Diffusion Models" 25 | } 26 | -------------------------------------------------------------------------------- /conditioning/embeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from skimage.transform import resize 4 | from ..utils.decode_item import binary_mask_to_polygon, sample_uniform_sparse_points 5 | 6 | 7 | N_SCRIBBLE_POINTS = 20 8 | N_POLYGON_POINTS = 256 9 | N_MAX_OBJECTS = 30 10 | 11 | 12 | def get_point_from_box(bbox): 13 | x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3] 14 | return [(x0 + x1) / 2.0, (y0 + y1) / 2.0] 15 | 16 | 17 | def get_empty_binary_mask(img_width, img_height): 18 | return np.zeros((img_width, img_height, 1)) 19 | 20 | 21 | def sample_random_points_from_mask(mask, k=N_SCRIBBLE_POINTS): 22 | mask = mask[:, :, 0] 23 | # Find the coordinates of non-zero pixels in the binary mask 24 | nonzero_coords = np.transpose(np.nonzero(mask)) 25 | 26 | # Randomly sample 'k' points 27 | # return all zeros if there is no non-zero pixel 28 | if len(nonzero_coords) == 0: 29 | xy_points = [0 for _ in range(k * 2)] 30 | return xy_points 31 | 32 | # randomly sample with replacement if there are not enough non-zero pixels 33 | if len(nonzero_coords) < k and len(nonzero_coords) > 0: 34 | random_indices = np.random.choice(len(nonzero_coords), k, replace=True) 35 | # randomly sample without replacement if there are enough non-zero pixels 36 | else: 37 | random_indices = np.random.choice( 38 | len(nonzero_coords), k, replace=False) 39 | sampled_points = nonzero_coords[random_indices] 40 | 41 | # order the points by their distance to (0, 0) 42 | # center = np.array([mask.shape[0] // 2, mask.shape[1] // 2]) 43 | center = np.array([0, 0]) 44 | sampled_points = sorted(sampled_points, key=lambda x:
np.linalg.norm( 45 | np.array(x) - center)) # np.linalg.norm 46 | 47 | # concatenate x and y coordinates and return them as a list 48 | # [x1,y1,x2,y2,...,x_k,y_k] 49 | xy_points = [] 50 | for x in sampled_points: 51 | xy_points.append(float(x[1])) 52 | xy_points.append(float(x[0])) 53 | return xy_points 54 | 55 | 56 | def convert_points(points, img_width, img_height): 57 | # convert polygons'/scribbles' coordinates to relative values in (0, 1) 58 | for i in range(len(points)): 59 | if i % 2 == 0: 60 | points[i] = min(points[i] / img_width, 1.0) 61 | else: 62 | points[i] = min(points[i] / img_height, 1.0) 63 | return points 64 | 65 | 66 | def sample_sparse_points_from_mask(mask, k=256): 67 | n_points = k 68 | n_polygons = n_points // 2 # half points should be sampled from the polygons 69 | mask = mask[:, :, 0] 70 | # sample sparse points from the polygons (boundary) 71 | polygons = binary_mask_to_polygon(mask, tolerance=0.0) 72 | # concatenate polygons to a single list 73 | polygons_single = [] 74 | for polygon in polygons: 75 | polygons_single += polygon 76 | if len(polygons_single) != 0: 77 | # uniformly sample points from the polygon 78 | polygons_single = np.array(polygons_single).reshape(-1, 2) 79 | indexes = np.linspace(0, polygons_single.shape[0] - 1, n_polygons) 80 | indexes = list([int(i) for i in indexes]) 81 | 82 | polygons_single = polygons_single[indexes] 83 | sampled_polygons = [(x[0], x[1]) for x in polygons_single] 84 | else: 85 | return [0 for _ in range(n_points * 2)] 86 | 87 | # sample sparse points from the mask 88 | n_inside_points = n_points - len(sampled_polygons) 89 | inside_points = sample_uniform_sparse_points(mask, n_inside_points) 90 | 91 | # combine inside_points and sampled_polygons 92 | xy_points = inside_points + sampled_polygons 93 | 94 | # order the points by their distance to (0, 0) 95 | center = np.array([0, 0]) 96 | xy_points = sorted(xy_points, key=lambda x: np.linalg.norm( 97 | np.array(x) - center)) # np.linalg.norm 98 | 99 | # return the sampled points 100 | sampled_points = [] 101 | for x in xy_points: 102 | sampled_points.append(x[0]) 103 | sampled_points.append(x[1]) 104 | return sampled_points 105 | 106 | 107 | # [x0, y0, x1, y1] 108 | def get_grounding_input_from_coords(coords, img_width, img_height): 109 | x0, y0, x1, y1, coord_width, coord_height = coords 110 | location = [x0 / coord_width, y0 / coord_height, 111 | x1 / coord_width, y1 / coord_height] 112 | 113 | point = get_point_from_box(location) 114 | binary_mask = get_empty_binary_mask(img_width, img_height) 115 | 116 | scribble = sample_random_points_from_mask(binary_mask, k=N_SCRIBBLE_POINTS) 117 | scribble = convert_points(scribble, img_width, img_height) 118 | 119 | polygon = sample_sparse_points_from_mask(binary_mask, k=N_POLYGON_POINTS) 120 | polygon = convert_points(polygon, img_width, img_height) 121 | 122 | segment = resize(binary_mask.astype(np.float32), 123 | (img_width, img_height)).squeeze() 124 | # segment = np.stack(segment).astype(np.float32).squeeze() if len(segment) > 0 else segment 125 | 126 | return dict( 127 | polygon=polygon, 128 | scribble=scribble, 129 | segment=segment, 130 | box=location, 131 | point=point, 132 | ) 133 | 134 | 135 | def create_zero_input_tensors(n_frames, img_width, img_height): 136 | masks = torch.zeros(n_frames, N_MAX_OBJECTS) 137 | text_masks = torch.zeros(n_frames, N_MAX_OBJECTS) 138 | text_embeddings = torch.zeros(n_frames, N_MAX_OBJECTS, 768) 139 | box_embeddings = torch.zeros(n_frames, N_MAX_OBJECTS, 4) 140 | polygon_embeddings = torch.zeros(
141 | n_frames, N_MAX_OBJECTS, N_POLYGON_POINTS * 2) 142 | scribble_embeddings = torch.zeros( 143 | n_frames, N_MAX_OBJECTS, N_SCRIBBLE_POINTS * 2) 144 | segment_embeddings = torch.zeros( 145 | n_frames, N_MAX_OBJECTS, img_width, img_height) # TODO: width height order 146 | point_embeddings = torch.zeros(n_frames, N_MAX_OBJECTS, 2) 147 | 148 | return dict( 149 | masks=masks, 150 | text_masks=text_masks, 151 | prompts=text_embeddings, 152 | boxes=box_embeddings, 153 | polygons=polygon_embeddings, 154 | scribbles=scribble_embeddings, 155 | segments=segment_embeddings, 156 | points=point_embeddings 157 | ) 158 | 159 | 160 | def get_attn_mask(img_size=64): 161 | return torch.zeros(N_MAX_OBJECTS, img_size, img_size) 162 | 163 | 164 | def prepare_embeddings(conds, latent_shape, idxs, use_masked_att=False): 165 | batch_size, _, latent_height, latent_width = latent_shape 166 | if idxs is None: 167 | idxs = list(range(batch_size)) 168 | else: 169 | batch_size = len(idxs) 170 | embeddings = create_zero_input_tensors( 171 | batch_size, latent_width, latent_height) 172 | if use_masked_att: 173 | embeddings['att_masks'] = torch.zeros( 174 | batch_size, N_MAX_OBJECTS, latent_width, latent_height) 175 | 176 | for grounding_idx, frame_idx in enumerate(idxs): 177 | for cond_idx, cond in enumerate(conds): 178 | if cond['positions'][frame_idx] is None: 179 | continue 180 | 181 | grounding = get_grounding_input_from_coords( 182 | cond['positions'][frame_idx], latent_width, latent_height) 183 | embeddings['masks'][grounding_idx][cond_idx] = 1 184 | embeddings['text_masks'][grounding_idx][cond_idx] = 1 185 | embeddings['prompts'][grounding_idx][cond_idx] = cond['cond_pooled'] 186 | embeddings['boxes'][grounding_idx][cond_idx] = torch.tensor( 187 | grounding['box']) 188 | embeddings['polygons'][grounding_idx][cond_idx] = torch.tensor( 189 | grounding['polygon']) 190 | embeddings['scribbles'][grounding_idx][cond_idx] = torch.tensor( 191 | grounding['scribble']) 192 | embeddings['segments'][grounding_idx][cond_idx] = torch.tensor( 193 | grounding['segment']) 194 | embeddings['points'][grounding_idx][cond_idx] = torch.tensor( 195 | grounding['point']) 196 | 197 | if use_masked_att: 198 | box = grounding['box'] 199 | x1, y1, x2, y2 = int(np.round(box[0] * latent_width)), int(np.round(box[1] * latent_height)), int( 200 | np.round(box[2] * latent_width)), int(np.round(box[3] * latent_height)) 201 | embeddings['att_masks'][grounding_idx][cond_idx][x1:x2, y1:y2] = 1 202 | 203 | return embeddings 204 | -------------------------------------------------------------------------------- /conditioning/fusers_patch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .embeddings import prepare_embeddings 4 | 5 | 6 | block_map = { 7 | 'input': { 8 | 1: 0, 9 | 2: 1, 10 | 4: 2, 11 | 5: 3, 12 | 7: 4, 13 | 8: 5 14 | }, 15 | 'middle': { 16 | 0: 6, 17 | }, 18 | 'output': { 19 | 3: 7, 20 | 4: 8, 21 | 5: 9, 22 | 6: 10, 23 | 7: 11, 24 | 8: 12, 25 | 9: 13, 26 | 10: 14, 27 | 11: 15 28 | } 29 | } 30 | 31 | 32 | class FusersPatch(torch.nn.Module): 33 | def __init__(self, conds, fusers_list, positionnet, latent_shape, idxs, device): 34 | super(FusersPatch, self).__init__() 35 | self.conds = conds 36 | self.fusers_list = fusers_list 37 | self.positionnet = positionnet 38 | self.latent_shape = latent_shape 39 | self.idxs = idxs 40 | self.device = device 41 | 42 | def _get_position_objs(self, idxs): 43 | embeddings = prepare_embeddings( 44 | self.conds, self.latent_shape, idxs, True) 
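# Note: prepare_embeddings returns a dict of per-frame grounding tensors (masks,
# text_masks, prompts, boxes, polygons, scribbles, segments, points, plus att_masks
# since use_masked_att=True above). They are moved to the sampling device below
# before PositionNet converts them into the grounding tokens consumed by the fusers.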
45 | for key in embeddings: 46 | embeddings[key] = embeddings[key].to(self.device) 47 | objs, drop_box_mask = self.positionnet(embeddings) 48 | return {'objs': objs, 'drop_box_mask': drop_box_mask} 49 | 50 | def _get_idxs(self, x, extra_options): 51 | if extra_options is not None: 52 | if 'ad_params' in extra_options: 53 | return extra_options['ad_params']['sub_idxs'] 54 | elif 'sub_idxs' in extra_options: 55 | return extra_options['sub_idxs'] 56 | 57 | return list(range(x.shape[0])) 58 | 59 | @torch.no_grad() 60 | def forward(self, x, extra_options): 61 | block, idx = extra_options['block'] 62 | fuser_idx = block_map[block][idx] 63 | fuser = self.fusers_list[fuser_idx] 64 | attn_total = [] 65 | idxs = self._get_idxs(x, extra_options) 66 | 67 | attn_total = fuser(x, self._get_position_objs(idxs)) 68 | return attn_total.to(torch.float16) 69 | -------------------------------------------------------------------------------- /conditioning/instance_conditioning.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import comfy.model_management 4 | 5 | from .fusers_patch import FusersPatch 6 | 7 | 8 | class InstanceConditioning: 9 | """ 10 | This class masquerades as Gligen in order to trigger setup 11 | """ 12 | 13 | def __init__(self, fusers, positionnet): 14 | self.fusers_list = fusers['model_list'] 15 | self.positionnet = positionnet['model'] 16 | self.conds = [] 17 | self.current_device = comfy.model_management.intermediate_device() 18 | 19 | # Gligen hacks 20 | self.model = self 21 | self.load_device = comfy.model_management.get_torch_device() 22 | self.offload_device = comfy.model_management.intermediate_device() 23 | 24 | def loaded_size(self): 25 | return 0 26 | 27 | def current_loaded_device(self): 28 | return comfy.model_management.intermediate_device() 29 | 30 | def get_fusers_patch(self, latent_shape, idxs, device): 31 | return FusersPatch(self.conds, self.fusers_list, self.positionnet, latent_shape, idxs, device) 32 | 33 | def set_position(self, latent_shape, _, device): 34 | # Called in samplers by gligen cond to return middle attention patch 35 | batch_size = latent_shape[0] 36 | idxs = list(range(batch_size)) 37 | fusers_patch = self.get_fusers_patch(latent_shape, idxs, device) 38 | return fusers_patch 39 | 40 | def add_conds(self, conds): 41 | self.conds.extend(conds) 42 | 43 | def get_models(self, *args, **kwargs) -> list[torch.nn.Module]: 44 | # Used to get models for loading/offloading 45 | return [(None, model) for model in [*self.fusers_list, self.positionnet]] 46 | 47 | def inference_memory_requirements(self, dtype, *args, **kwargs) -> int: 48 | # Used to calculate memory requirements by ControlNet 49 | return 0 50 | 51 | def is_clone(self, other, *args, **kwargs): 52 | return other == self 53 | 54 | def clone(self): 55 | return self 56 | 57 | def model_size(self, *args, **kwargs): 58 | return 0 59 | 60 | def memory_required(self, *args, **kwargs): 61 | return 0 62 | 63 | def model_patches_to(self, device_or_dtype, *args, **kwargs): 64 | if device_or_dtype == torch.float16 or device_or_dtype == torch.float32: 65 | return 66 | if device_or_dtype is None: 67 | return 68 | self.positionnet = self.positionnet.to(device_or_dtype) 69 | for i, fuser in enumerate(self.fusers_list): 70 | self.fusers_list[i] = fuser.to(device_or_dtype) 71 | 72 | def model_dtype(self, *args, **kwargs): 73 | return torch.float32 74 | 75 | def patch_model(self, *args, **kwargs): 76 | return 77 | 78 | def unpatch_weights(self, *args, **kwargs): 79 
| return 80 | 81 | def unpatch_model(self, *args, **kwargs): 82 | return 83 | 84 | def set_model_patch(self, *args, **kwargs): 85 | return 86 | 87 | def set_model_patch_replace(self, *args, **kwargs): 88 | return 89 | -------------------------------------------------------------------------------- /constants.py: -------------------------------------------------------------------------------- 1 | 2 | INSTANCE_MODELS_DIR = "instance_models" 3 | INSTANCE_FUSERS_DIR = "fuser_models" 4 | INSTANCE_POSITIONNET_DIR = "positionnet_models" 5 | INSTANCE_SCALEU_DIR = "scaleu_models" 6 | -------------------------------------------------------------------------------- /example_workflows/fourpeople_workflow.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 91, 3 | "last_link_id": 279, 4 | "nodes": [ 5 | { 6 | "id": 64, 7 | "type": "CLIPTextEncode", 8 | "pos": [ 9 | 877, 10 | 294 11 | ], 12 | "size": { 13 | "0": 210, 14 | "1": 85.9561767578125 15 | }, 16 | "flags": {}, 17 | "order": 15, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "clip", 22 | "type": "CLIP", 23 | "link": 149 24 | } 25 | ], 26 | "outputs": [ 27 | { 28 | "name": "CONDITIONING", 29 | "type": "CONDITIONING", 30 | "links": [], 31 | "slot_index": 0 32 | } 33 | ], 34 | "properties": { 35 | "Node name for S&R": "CLIPTextEncode" 36 | }, 37 | "widgets_values": [ 38 | "nsfw, naked" 39 | ] 40 | }, 41 | { 42 | "id": 45, 43 | "type": "ControlNetLoader", 44 | "pos": [ 45 | 2006, 46 | -326 47 | ], 48 | "size": { 49 | "0": 315, 50 | "1": 58 51 | }, 52 | "flags": {}, 53 | "order": 0, 54 | "mode": 0, 55 | "outputs": [ 56 | { 57 | "name": "CONTROL_NET", 58 | "type": "CONTROL_NET", 59 | "links": [ 60 | 225 61 | ], 62 | "shape": 3, 63 | "slot_index": 0 64 | } 65 | ], 66 | "properties": { 67 | "Node name for S&R": "ControlNetLoader" 68 | }, 69 | "widgets_values": [ 70 | "control_v11p_sd15_softedge.pth" 71 | ] 72 | }, 73 | { 74 | "id": 26, 75 | "type": "ControlNetLoader", 76 | "pos": [ 77 | 1272, 78 | -317 79 | ], 80 | "size": { 81 | "0": 315, 82 | "1": 58 83 | }, 84 | "flags": {}, 85 | "order": 1, 86 | "mode": 0, 87 | "outputs": [ 88 | { 89 | "name": "CONTROL_NET", 90 | "type": "CONTROL_NET", 91 | "links": [ 92 | 233 93 | ], 94 | "shape": 3, 95 | "slot_index": 0 96 | } 97 | ], 98 | "properties": { 99 | "Node name for S&R": "ControlNetLoader" 100 | }, 101 | "widgets_values": [ 102 | "control_v11f1p_sd15_depth.pth" 103 | ] 104 | }, 105 | { 106 | "id": 55, 107 | "type": "ControlNetLoader", 108 | "pos": [ 109 | 1635, 110 | -323 111 | ], 112 | "size": { 113 | "0": 315, 114 | "1": 58 115 | }, 116 | "flags": {}, 117 | "order": 2, 118 | "mode": 0, 119 | "outputs": [ 120 | { 121 | "name": "CONTROL_NET", 122 | "type": "CONTROL_NET", 123 | "links": [], 124 | "shape": 3, 125 | "slot_index": 0 126 | } 127 | ], 128 | "properties": { 129 | "Node name for S&R": "ControlNetLoader" 130 | }, 131 | "widgets_values": [ 132 | "control_v11p_sd15_openpose_fp16.safetensors" 133 | ] 134 | }, 135 | { 136 | "id": 27, 137 | "type": "MiDaS-DepthMapPreprocessor", 138 | "pos": [ 139 | 1292, 140 | -504 141 | ], 142 | "size": { 143 | "0": 315, 144 | "1": 106 145 | }, 146 | "flags": {}, 147 | "order": 21, 148 | "mode": 0, 149 | "inputs": [ 150 | { 151 | "name": "image", 152 | "type": "IMAGE", 153 | "link": 48 154 | } 155 | ], 156 | "outputs": [ 157 | { 158 | "name": "IMAGE", 159 | "type": "IMAGE", 160 | "links": [ 161 | 234 162 | ], 163 | "shape": 3, 164 | "slot_index": 0 165 | } 166 | ], 167 | "properties": { 168 | "Node 
name for S&R": "MiDaS-DepthMapPreprocessor" 169 | }, 170 | "widgets_values": [ 171 | 6.283185307179586, 172 | 0.1, 173 | 512 174 | ] 175 | }, 176 | { 177 | "id": 46, 178 | "type": "HEDPreprocessor", 179 | "pos": [ 180 | 2080, 181 | -475 182 | ], 183 | "size": { 184 | "0": 210, 185 | "1": 82 186 | }, 187 | "flags": {}, 188 | "order": 22, 189 | "mode": 0, 190 | "inputs": [ 191 | { 192 | "name": "image", 193 | "type": "IMAGE", 194 | "link": 107 195 | } 196 | ], 197 | "outputs": [ 198 | { 199 | "name": "IMAGE", 200 | "type": "IMAGE", 201 | "links": [ 202 | 226 203 | ], 204 | "shape": 3, 205 | "slot_index": 0 206 | } 207 | ], 208 | "properties": { 209 | "Node name for S&R": "HEDPreprocessor" 210 | }, 211 | "widgets_values": [ 212 | "enable", 213 | 512 214 | ] 215 | }, 216 | { 217 | "id": 48, 218 | "type": "ModelSamplingDiscrete", 219 | "pos": [ 220 | 1505, 221 | 582 222 | ], 223 | "size": { 224 | "0": 315, 225 | "1": 82 226 | }, 227 | "flags": {}, 228 | "order": 19, 229 | "mode": 0, 230 | "inputs": [ 231 | { 232 | "name": "model", 233 | "type": "MODEL", 234 | "link": 112 235 | } 236 | ], 237 | "outputs": [ 238 | { 239 | "name": "MODEL", 240 | "type": "MODEL", 241 | "links": [ 242 | 138 243 | ], 244 | "shape": 3, 245 | "slot_index": 0 246 | } 247 | ], 248 | "properties": { 249 | "Node name for S&R": "ModelSamplingDiscrete" 250 | }, 251 | "widgets_values": [ 252 | "eps", 253 | false 254 | ] 255 | }, 256 | { 257 | "id": 60, 258 | "type": "ADE_ApplyAnimateDiffModel", 259 | "pos": [ 260 | 1483, 261 | 724 262 | ], 263 | "size": { 264 | "0": 319.20001220703125, 265 | "1": 182 266 | }, 267 | "flags": {}, 268 | "order": 12, 269 | "mode": 0, 270 | "inputs": [ 271 | { 272 | "name": "motion_model", 273 | "type": "MOTION_MODEL_ADE", 274 | "link": 136, 275 | "slot_index": 0 276 | }, 277 | { 278 | "name": "motion_lora", 279 | "type": "MOTION_LORA", 280 | "link": null 281 | }, 282 | { 283 | "name": "scale_multival", 284 | "type": "MULTIVAL", 285 | "link": null, 286 | "slot_index": 2 287 | }, 288 | { 289 | "name": "effect_multival", 290 | "type": "MULTIVAL", 291 | "link": 137, 292 | "slot_index": 3 293 | }, 294 | { 295 | "name": "ad_keyframes", 296 | "type": "AD_KEYFRAMES", 297 | "link": null 298 | }, 299 | { 300 | "name": "prev_m_models", 301 | "type": "M_MODELS", 302 | "link": null 303 | } 304 | ], 305 | "outputs": [ 306 | { 307 | "name": "M_MODELS", 308 | "type": "M_MODELS", 309 | "links": [ 310 | 133 311 | ], 312 | "shape": 3, 313 | "slot_index": 0 314 | } 315 | ], 316 | "properties": { 317 | "Node name for S&R": "ADE_ApplyAnimateDiffModel" 318 | }, 319 | "widgets_values": [ 320 | 0, 321 | 1 322 | ] 323 | }, 324 | { 325 | "id": 59, 326 | "type": "ADE_AnimateDiffSamplingSettings", 327 | "pos": [ 328 | 1480, 329 | 1264 330 | ], 331 | "size": { 332 | "0": 315, 333 | "1": 234 334 | }, 335 | "flags": {}, 336 | "order": 3, 337 | "mode": 0, 338 | "inputs": [ 339 | { 340 | "name": "noise_layers", 341 | "type": "NOISE_LAYERS", 342 | "link": null, 343 | "slot_index": 0 344 | }, 345 | { 346 | "name": "iteration_opts", 347 | "type": "ITERATION_OPTS", 348 | "link": null, 349 | "slot_index": 1 350 | }, 351 | { 352 | "name": "seed_override", 353 | "type": "INT", 354 | "link": null, 355 | "widget": { 356 | "name": "seed_override" 357 | } 358 | }, 359 | { 360 | "name": "sigma_schedule", 361 | "type": "SIGMA_SCHEDULE", 362 | "link": null 363 | }, 364 | { 365 | "name": "seed_override", 366 | "type": "INT", 367 | "link": null, 368 | "widget": { 369 | "name": "seed_override" 370 | } 371 | } 372 | ], 373 | "outputs": [ 374 | { 
375 | "name": "settings", 376 | "type": "SAMPLE_SETTINGS", 377 | "links": [ 378 | 135 379 | ], 380 | "shape": 3 381 | } 382 | ], 383 | "properties": { 384 | "Node name for S&R": "ADE_AnimateDiffSamplingSettings" 385 | }, 386 | "widgets_values": [ 387 | 0, 388 | "FreeNoise", 389 | "comfy", 390 | 0, 391 | 0, 392 | false 393 | ] 394 | }, 395 | { 396 | "id": 61, 397 | "type": "ADE_MultivalDynamic", 398 | "pos": [ 399 | 1117, 400 | 854 401 | ], 402 | "size": { 403 | "0": 315, 404 | "1": 58 405 | }, 406 | "flags": {}, 407 | "order": 4, 408 | "mode": 0, 409 | "inputs": [ 410 | { 411 | "name": "mask_optional", 412 | "type": "MASK", 413 | "link": null 414 | } 415 | ], 416 | "outputs": [ 417 | { 418 | "name": "MULTIVAL", 419 | "type": "MULTIVAL", 420 | "links": [ 421 | 137 422 | ], 423 | "shape": 3 424 | } 425 | ], 426 | "properties": { 427 | "Node name for S&R": "ADE_MultivalDynamic" 428 | }, 429 | "widgets_values": [ 430 | 0.9 431 | ] 432 | }, 433 | { 434 | "id": 58, 435 | "type": "ADE_LoadAnimateDiffModel", 436 | "pos": [ 437 | 1137, 438 | 722 439 | ], 440 | "size": { 441 | "0": 315, 442 | "1": 58 443 | }, 444 | "flags": {}, 445 | "order": 5, 446 | "mode": 0, 447 | "inputs": [ 448 | { 449 | "name": "ad_settings", 450 | "type": "MOTION_MODEL_SETTINGS", 451 | "link": null 452 | } 453 | ], 454 | "outputs": [ 455 | { 456 | "name": "MOTION_MODEL", 457 | "type": "MOTION_MODEL_ADE", 458 | "links": [ 459 | 136 460 | ], 461 | "shape": 3, 462 | "slot_index": 0 463 | } 464 | ], 465 | "properties": { 466 | "Node name for S&R": "ADE_LoadAnimateDiffModel" 467 | }, 468 | "widgets_values": [ 469 | "v3_sd15_mm.ckpt" 470 | ] 471 | }, 472 | { 473 | "id": 7, 474 | "type": "CLIPTextEncode", 475 | "pos": [ 476 | 924, 477 | -28 478 | ], 479 | "size": { 480 | "0": 210, 481 | "1": 85.9561767578125 482 | }, 483 | "flags": {}, 484 | "order": 16, 485 | "mode": 0, 486 | "inputs": [ 487 | { 488 | "name": "clip", 489 | "type": "CLIP", 490 | "link": 243 491 | } 492 | ], 493 | "outputs": [ 494 | { 495 | "name": "CONDITIONING", 496 | "type": "CONDITIONING", 497 | "links": [ 498 | 232 499 | ], 500 | "slot_index": 0 501 | } 502 | ], 503 | "properties": { 504 | "Node name for S&R": "CLIPTextEncode" 505 | }, 506 | "widgets_values": [ 507 | "nsfw, naked" 508 | ] 509 | }, 510 | { 511 | "id": 57, 512 | "type": "ADE_StandardUniformContextOptions", 513 | "pos": [ 514 | 1475, 515 | 953 516 | ], 517 | "size": { 518 | "0": 327.6000061035156, 519 | "1": 222 520 | }, 521 | "flags": {}, 522 | "order": 6, 523 | "mode": 0, 524 | "inputs": [ 525 | { 526 | "name": "prev_context", 527 | "type": "CONTEXT_OPTIONS", 528 | "link": null 529 | }, 530 | { 531 | "name": "view_opts", 532 | "type": "VIEW_OPTS", 533 | "link": null 534 | } 535 | ], 536 | "outputs": [ 537 | { 538 | "name": "CONTEXT_OPTS", 539 | "type": "CONTEXT_OPTIONS", 540 | "links": [ 541 | 134 542 | ], 543 | "shape": 3, 544 | "slot_index": 0 545 | } 546 | ], 547 | "properties": { 548 | "Node name for S&R": "ADE_StandardUniformContextOptions" 549 | }, 550 | "widgets_values": [ 551 | 16, 552 | 1, 553 | 4, 554 | "pyramid", 555 | false, 556 | 0, 557 | 1 558 | ] 559 | }, 560 | { 561 | "id": 49, 562 | "type": "CheckpointLoaderSimpleWithNoiseSelect", 563 | "pos": [ 564 | 212, 565 | 253 566 | ], 567 | "size": { 568 | "0": 319.20001220703125, 569 | "1": 170 570 | }, 571 | "flags": {}, 572 | "order": 7, 573 | "mode": 0, 574 | "outputs": [ 575 | { 576 | "name": "MODEL", 577 | "type": "MODEL", 578 | "links": [ 579 | 111 580 | ], 581 | "shape": 3, 582 | "slot_index": 0 583 | }, 584 | { 585 | "name": 
"CLIP", 586 | "type": "CLIP", 587 | "links": [ 588 | 148, 589 | 149, 590 | 243, 591 | 244, 592 | 265 593 | ], 594 | "shape": 3, 595 | "slot_index": 1 596 | }, 597 | { 598 | "name": "VAE", 599 | "type": "VAE", 600 | "links": [ 601 | 245, 602 | 247 603 | ], 604 | "shape": 3, 605 | "slot_index": 2 606 | } 607 | ], 608 | "properties": { 609 | "Node name for S&R": "CheckpointLoaderSimpleWithNoiseSelect" 610 | }, 611 | "widgets_values": [ 612 | "juggernaut_reborn.safetensors", 613 | "sqrt_linear (AnimateDiff)", 614 | false, 615 | 0.18215 616 | ] 617 | }, 618 | { 619 | "id": 12, 620 | "type": "ImageScale", 621 | "pos": [ 622 | 660, 623 | -374 624 | ], 625 | "size": { 626 | "0": 210, 627 | "1": 130 628 | }, 629 | "flags": {}, 630 | "order": 18, 631 | "mode": 0, 632 | "inputs": [ 633 | { 634 | "name": "image", 635 | "type": "IMAGE", 636 | "link": 10 637 | } 638 | ], 639 | "outputs": [ 640 | { 641 | "name": "IMAGE", 642 | "type": "IMAGE", 643 | "links": [ 644 | 19, 645 | 48, 646 | 107, 647 | 274 648 | ], 649 | "shape": 3, 650 | "slot_index": 0 651 | } 652 | ], 653 | "properties": { 654 | "Node name for S&R": "ImageScale" 655 | }, 656 | "widgets_values": [ 657 | "nearest-exact", 658 | 512, 659 | 512, 660 | "disabled" 661 | ] 662 | }, 663 | { 664 | "id": 16, 665 | "type": "VAEEncode", 666 | "pos": [ 667 | 3205, 668 | -567 669 | ], 670 | "size": { 671 | "0": 210, 672 | "1": 46 673 | }, 674 | "flags": {}, 675 | "order": 20, 676 | "mode": 0, 677 | "inputs": [ 678 | { 679 | "name": "pixels", 680 | "type": "IMAGE", 681 | "link": 19 682 | }, 683 | { 684 | "name": "vae", 685 | "type": "VAE", 686 | "link": 245 687 | } 688 | ], 689 | "outputs": [ 690 | { 691 | "name": "LATENT", 692 | "type": "LATENT", 693 | "links": [ 694 | 214 695 | ], 696 | "shape": 3, 697 | "slot_index": 0 698 | } 699 | ], 700 | "properties": { 701 | "Node name for S&R": "VAEEncode" 702 | } 703 | }, 704 | { 705 | "id": 85, 706 | "type": "ACN_AdvancedControlNetApply", 707 | "pos": [ 708 | 1303, 709 | -210 710 | ], 711 | "size": { 712 | "0": 285.6000061035156, 713 | "1": 266 714 | }, 715 | "flags": {}, 716 | "order": 25, 717 | "mode": 0, 718 | "inputs": [ 719 | { 720 | "name": "positive", 721 | "type": "CONDITIONING", 722 | "link": 231 723 | }, 724 | { 725 | "name": "negative", 726 | "type": "CONDITIONING", 727 | "link": 232 728 | }, 729 | { 730 | "name": "control_net", 731 | "type": "CONTROL_NET", 732 | "link": 233 733 | }, 734 | { 735 | "name": "image", 736 | "type": "IMAGE", 737 | "link": 234 738 | }, 739 | { 740 | "name": "mask_optional", 741 | "type": "MASK", 742 | "link": null 743 | }, 744 | { 745 | "name": "timestep_kf", 746 | "type": "TIMESTEP_KEYFRAME", 747 | "link": null 748 | }, 749 | { 750 | "name": "latent_kf_override", 751 | "type": "LATENT_KEYFRAME", 752 | "link": null 753 | }, 754 | { 755 | "name": "weights_override", 756 | "type": "CONTROL_NET_WEIGHTS", 757 | "link": null 758 | }, 759 | { 760 | "name": "model_optional", 761 | "type": "MODEL", 762 | "link": null 763 | } 764 | ], 765 | "outputs": [ 766 | { 767 | "name": "positive", 768 | "type": "CONDITIONING", 769 | "links": [ 770 | 239 771 | ], 772 | "shape": 3, 773 | "slot_index": 0 774 | }, 775 | { 776 | "name": "negative", 777 | "type": "CONDITIONING", 778 | "links": [ 779 | 240 780 | ], 781 | "shape": 3, 782 | "slot_index": 1 783 | }, 784 | { 785 | "name": "model_opt", 786 | "type": "MODEL", 787 | "links": null, 788 | "shape": 3 789 | } 790 | ], 791 | "properties": { 792 | "Node name for S&R": "ACN_AdvancedControlNetApply" 793 | }, 794 | "widgets_values": [ 795 | 
0.35000000000000003, 796 | 0, 797 | 0.65 798 | ] 799 | }, 800 | { 801 | "id": 6, 802 | "type": "CLIPTextEncode", 803 | "pos": [ 804 | 879, 805 | -213 806 | ], 807 | "size": { 808 | "0": 265.12786865234375, 809 | "1": 95.60565948486328 810 | }, 811 | "flags": {}, 812 | "order": 17, 813 | "mode": 0, 814 | "inputs": [ 815 | { 816 | "name": "clip", 817 | "type": "CLIP", 818 | "link": 244 819 | } 820 | ], 821 | "outputs": [ 822 | { 823 | "name": "CONDITIONING", 824 | "type": "CONDITIONING", 825 | "links": [ 826 | 231 827 | ], 828 | "slot_index": 0 829 | } 830 | ], 831 | "properties": { 832 | "Node name for S&R": "CLIPTextEncode" 833 | }, 834 | "widgets_values": [ 835 | "portrait photo, uhd 4k, afternoon, brightly lit, (castle:0.6)" 836 | ] 837 | }, 838 | { 839 | "id": 63, 840 | "type": "CLIPTextEncode", 841 | "pos": [ 842 | 853, 843 | 130 844 | ], 845 | "size": { 846 | "0": 265.12786865234375, 847 | "1": 95.60565948486328 848 | }, 849 | "flags": {}, 850 | "order": 14, 851 | "mode": 0, 852 | "inputs": [ 853 | { 854 | "name": "clip", 855 | "type": "CLIP", 856 | "link": 148 857 | } 858 | ], 859 | "outputs": [ 860 | { 861 | "name": "CONDITIONING", 862 | "type": "CONDITIONING", 863 | "links": [], 864 | "slot_index": 0 865 | } 866 | ], 867 | "properties": { 868 | "Node name for S&R": "CLIPTextEncode" 869 | }, 870 | "widgets_values": [ 871 | "portrait photo, uhd 4k, afternoon, brightly lit, (castle:0.6)" 872 | ] 873 | }, 874 | { 875 | "id": 56, 876 | "type": "ADE_UseEvolvedSampling", 877 | "pos": [ 878 | 1907, 879 | 644 880 | ], 881 | "size": { 882 | "0": 301.4368896484375, 883 | "1": 118 884 | }, 885 | "flags": {}, 886 | "order": 24, 887 | "mode": 0, 888 | "inputs": [ 889 | { 890 | "name": "model", 891 | "type": "MODEL", 892 | "link": 138, 893 | "slot_index": 0 894 | }, 895 | { 896 | "name": "m_models", 897 | "type": "M_MODELS", 898 | "link": 133, 899 | "slot_index": 1 900 | }, 901 | { 902 | "name": "context_options", 903 | "type": "CONTEXT_OPTIONS", 904 | "link": 134, 905 | "slot_index": 2 906 | }, 907 | { 908 | "name": "sample_settings", 909 | "type": "SAMPLE_SETTINGS", 910 | "link": 135, 911 | "slot_index": 3 912 | } 913 | ], 914 | "outputs": [ 915 | { 916 | "name": "MODEL", 917 | "type": "MODEL", 918 | "links": [ 919 | 272 920 | ], 921 | "shape": 3, 922 | "slot_index": 0 923 | } 924 | ], 925 | "properties": { 926 | "Node name for S&R": "ADE_UseEvolvedSampling" 927 | }, 928 | "widgets_values": [ 929 | "sqrt_linear (AnimateDiff)" 930 | ] 931 | }, 932 | { 933 | "id": 90, 934 | "type": "ApplyScaleUModelNode", 935 | "pos": [ 936 | 2326, 937 | 628 938 | ], 939 | "size": { 940 | "0": 260.3999938964844, 941 | "1": 46 942 | }, 943 | "flags": {}, 944 | "order": 27, 945 | "mode": 0, 946 | "inputs": [ 947 | { 948 | "name": "model", 949 | "type": "MODEL", 950 | "link": 272 951 | }, 952 | { 953 | "name": "scaleu", 954 | "type": "SCALEU", 955 | "link": 271 956 | } 957 | ], 958 | "outputs": [ 959 | { 960 | "name": "MODEL", 961 | "type": "MODEL", 962 | "links": [ 963 | 273 964 | ], 965 | "shape": 3, 966 | "slot_index": 0 967 | } 968 | ], 969 | "properties": { 970 | "Node name for S&R": "ApplyScaleUModelNode" 971 | } 972 | }, 973 | { 974 | "id": 10, 975 | "type": "VHS_LoadVideo", 976 | "pos": [ 977 | 397, 978 | 618 979 | ], 980 | "size": [ 981 | 235.1999969482422, 982 | 471.1999969482422 983 | ], 984 | "flags": {}, 985 | "order": 8, 986 | "mode": 0, 987 | "inputs": [ 988 | { 989 | "name": "batch_manager", 990 | "type": "VHS_BatchManager", 991 | "link": null 992 | } 993 | ], 994 | "outputs": [ 995 | { 996 | "name": 
"IMAGE", 997 | "type": "IMAGE", 998 | "links": [ 999 | 10 1000 | ], 1001 | "shape": 3, 1002 | "slot_index": 0 1003 | }, 1004 | { 1005 | "name": "frame_count", 1006 | "type": "INT", 1007 | "links": [], 1008 | "shape": 3, 1009 | "slot_index": 1 1010 | }, 1011 | { 1012 | "name": "audio", 1013 | "type": "VHS_AUDIO", 1014 | "links": null, 1015 | "shape": 3 1016 | } 1017 | ], 1018 | "properties": { 1019 | "Node name for S&R": "VHS_LoadVideo" 1020 | }, 1021 | "widgets_values": { 1022 | "video": "fourpeople.mp4", 1023 | "force_rate": 0, 1024 | "force_size": "Disabled", 1025 | "custom_width": 512, 1026 | "custom_height": 512, 1027 | "frame_load_cap": 20, 1028 | "skip_first_frames": 0, 1029 | "select_every_nth": 4, 1030 | "choose video to upload": "image", 1031 | "videopreview": { 1032 | "hidden": false, 1033 | "paused": false, 1034 | "params": { 1035 | "frame_load_cap": 20, 1036 | "skip_first_frames": 0, 1037 | "force_rate": 0, 1038 | "select_every_nth": 4, 1039 | "filename": "fourpeople.mp4", 1040 | "type": "input", 1041 | "format": "video/mp4" 1042 | } 1043 | } 1044 | } 1045 | }, 1046 | { 1047 | "id": 78, 1048 | "type": "VHS_VideoCombine", 1049 | "pos": [ 1050 | 2614, 1051 | 297 1052 | ], 1053 | "size": [ 1054 | 315, 1055 | 599 1056 | ], 1057 | "flags": {}, 1058 | "order": 29, 1059 | "mode": 0, 1060 | "inputs": [ 1061 | { 1062 | "name": "images", 1063 | "type": "IMAGE", 1064 | "link": 260 1065 | }, 1066 | { 1067 | "name": "audio", 1068 | "type": "VHS_AUDIO", 1069 | "link": null 1070 | }, 1071 | { 1072 | "name": "batch_manager", 1073 | "type": "VHS_BatchManager", 1074 | "link": null 1075 | } 1076 | ], 1077 | "outputs": [ 1078 | { 1079 | "name": "Filenames", 1080 | "type": "VHS_FILENAMES", 1081 | "links": null, 1082 | "shape": 3 1083 | } 1084 | ], 1085 | "properties": { 1086 | "Node name for S&R": "VHS_VideoCombine" 1087 | }, 1088 | "widgets_values": { 1089 | "frame_rate": 8, 1090 | "loop_count": 0, 1091 | "filename_prefix": "AnimateDiff", 1092 | "format": "video/h264-mp4", 1093 | "pix_fmt": "yuv420p", 1094 | "crf": 19, 1095 | "save_metadata": true, 1096 | "pingpong": false, 1097 | "save_output": true, 1098 | "videopreview": { 1099 | "hidden": false, 1100 | "paused": false, 1101 | "params": { 1102 | "filename": "AnimateDiff_00802.mp4", 1103 | "subfolder": "", 1104 | "type": "output", 1105 | "format": "video/h264-mp4" 1106 | } 1107 | } 1108 | } 1109 | }, 1110 | { 1111 | "id": 8, 1112 | "type": "VAEDecode", 1113 | "pos": [ 1114 | 3455, 1115 | -179 1116 | ], 1117 | "size": { 1118 | "0": 140, 1119 | "1": 46 1120 | }, 1121 | "flags": {}, 1122 | "order": 32, 1123 | "mode": 0, 1124 | "inputs": [ 1125 | { 1126 | "name": "samples", 1127 | "type": "LATENT", 1128 | "link": 215 1129 | }, 1130 | { 1131 | "name": "vae", 1132 | "type": "VAE", 1133 | "link": 247 1134 | } 1135 | ], 1136 | "outputs": [ 1137 | { 1138 | "name": "IMAGE", 1139 | "type": "IMAGE", 1140 | "links": [ 1141 | 22 1142 | ], 1143 | "slot_index": 0 1144 | } 1145 | ], 1146 | "properties": { 1147 | "Node name for S&R": "VAEDecode" 1148 | } 1149 | }, 1150 | { 1151 | "id": 18, 1152 | "type": "VHS_VideoCombine", 1153 | "pos": [ 1154 | 3667, 1155 | -350 1156 | ], 1157 | "size": [ 1158 | 315, 1159 | 599 1160 | ], 1161 | "flags": {}, 1162 | "order": 33, 1163 | "mode": 0, 1164 | "inputs": [ 1165 | { 1166 | "name": "images", 1167 | "type": "IMAGE", 1168 | "link": 22 1169 | }, 1170 | { 1171 | "name": "audio", 1172 | "type": "VHS_AUDIO", 1173 | "link": null 1174 | }, 1175 | { 1176 | "name": "batch_manager", 1177 | "type": "VHS_BatchManager", 1178 | "link": 
null 1179 | } 1180 | ], 1181 | "outputs": [ 1182 | { 1183 | "name": "Filenames", 1184 | "type": "VHS_FILENAMES", 1185 | "links": null, 1186 | "shape": 3 1187 | } 1188 | ], 1189 | "properties": { 1190 | "Node name for S&R": "VHS_VideoCombine" 1191 | }, 1192 | "widgets_values": { 1193 | "frame_rate": 8, 1194 | "loop_count": 0, 1195 | "filename_prefix": "AnimateDiff", 1196 | "format": "video/h264-mp4", 1197 | "pix_fmt": "yuv420p", 1198 | "crf": 19, 1199 | "save_metadata": true, 1200 | "pingpong": false, 1201 | "save_output": true, 1202 | "videopreview": { 1203 | "hidden": false, 1204 | "paused": false, 1205 | "params": { 1206 | "filename": "AnimateDiff_00804.mp4", 1207 | "subfolder": "", 1208 | "type": "output", 1209 | "format": "video/h264-mp4" 1210 | } 1211 | } 1212 | } 1213 | }, 1214 | { 1215 | "id": 91, 1216 | "type": "YOLOTrackerNode", 1217 | "pos": [ 1218 | 2505, 1219 | -626 1220 | ], 1221 | "size": { 1222 | "0": 315, 1223 | "1": 78 1224 | }, 1225 | "flags": {}, 1226 | "order": 23, 1227 | "mode": 0, 1228 | "inputs": [ 1229 | { 1230 | "name": "images", 1231 | "type": "IMAGE", 1232 | "link": 274 1233 | } 1234 | ], 1235 | "outputs": [ 1236 | { 1237 | "name": "IMAGE", 1238 | "type": "IMAGE", 1239 | "links": [ 1240 | 276 1241 | ], 1242 | "shape": 3, 1243 | "slot_index": 0 1244 | }, 1245 | { 1246 | "name": "TRACKING", 1247 | "type": "TRACKING", 1248 | "links": [ 1249 | 275 1250 | ], 1251 | "shape": 3, 1252 | "slot_index": 1 1253 | } 1254 | ], 1255 | "properties": { 1256 | "Node name for S&R": "YOLOTrackerNode" 1257 | }, 1258 | "widgets_values": [ 1259 | "yolov8m.pt" 1260 | ] 1261 | }, 1262 | { 1263 | "id": 79, 1264 | "type": "ImageScale", 1265 | "pos": [ 1266 | 2293, 1267 | 784 1268 | ], 1269 | "size": { 1270 | "0": 210, 1271 | "1": 130 1272 | }, 1273 | "flags": {}, 1274 | "order": 26, 1275 | "mode": 0, 1276 | "inputs": [ 1277 | { 1278 | "name": "image", 1279 | "type": "IMAGE", 1280 | "link": 276 1281 | } 1282 | ], 1283 | "outputs": [ 1284 | { 1285 | "name": "IMAGE", 1286 | "type": "IMAGE", 1287 | "links": [ 1288 | 260 1289 | ], 1290 | "shape": 3, 1291 | "slot_index": 0 1292 | } 1293 | ], 1294 | "properties": { 1295 | "Node name for S&R": "ImageScale" 1296 | }, 1297 | "widgets_values": [ 1298 | "nearest-exact", 1299 | 1024, 1300 | 1024, 1301 | "disabled" 1302 | ] 1303 | }, 1304 | { 1305 | "id": 83, 1306 | "type": "ACN_AdvancedControlNetApply", 1307 | "pos": [ 1308 | 1675, 1309 | -210 1310 | ], 1311 | "size": { 1312 | "0": 285.6000061035156, 1313 | "1": 266 1314 | }, 1315 | "flags": {}, 1316 | "order": 28, 1317 | "mode": 0, 1318 | "inputs": [ 1319 | { 1320 | "name": "positive", 1321 | "type": "CONDITIONING", 1322 | "link": 239 1323 | }, 1324 | { 1325 | "name": "negative", 1326 | "type": "CONDITIONING", 1327 | "link": 240 1328 | }, 1329 | { 1330 | "name": "control_net", 1331 | "type": "CONTROL_NET", 1332 | "link": 225 1333 | }, 1334 | { 1335 | "name": "image", 1336 | "type": "IMAGE", 1337 | "link": 226 1338 | }, 1339 | { 1340 | "name": "mask_optional", 1341 | "type": "MASK", 1342 | "link": null 1343 | }, 1344 | { 1345 | "name": "timestep_kf", 1346 | "type": "TIMESTEP_KEYFRAME", 1347 | "link": null 1348 | }, 1349 | { 1350 | "name": "latent_kf_override", 1351 | "type": "LATENT_KEYFRAME", 1352 | "link": null 1353 | }, 1354 | { 1355 | "name": "weights_override", 1356 | "type": "CONTROL_NET_WEIGHTS", 1357 | "link": null 1358 | }, 1359 | { 1360 | "name": "model_optional", 1361 | "type": "MODEL", 1362 | "link": null 1363 | } 1364 | ], 1365 | "outputs": [ 1366 | { 1367 | "name": "positive", 1368 | 
"type": "CONDITIONING", 1369 | "links": [ 1370 | 278 1371 | ], 1372 | "shape": 3, 1373 | "slot_index": 0 1374 | }, 1375 | { 1376 | "name": "negative", 1377 | "type": "CONDITIONING", 1378 | "links": [ 1379 | 279 1380 | ], 1381 | "shape": 3, 1382 | "slot_index": 1 1383 | }, 1384 | { 1385 | "name": "model_opt", 1386 | "type": "MODEL", 1387 | "links": null, 1388 | "shape": 3 1389 | } 1390 | ], 1391 | "properties": { 1392 | "Node name for S&R": "ACN_AdvancedControlNetApply" 1393 | }, 1394 | "widgets_values": [ 1395 | 0.15, 1396 | 0, 1397 | 0.25 1398 | ] 1399 | }, 1400 | { 1401 | "id": 86, 1402 | "type": "InstanceDiffusionTrackingPrompt", 1403 | "pos": [ 1404 | 2451, 1405 | -213 1406 | ], 1407 | "size": [ 1408 | 514.9908203124996, 1409 | 347.78492431640575 1410 | ], 1411 | "flags": {}, 1412 | "order": 30, 1413 | "mode": 0, 1414 | "inputs": [ 1415 | { 1416 | "name": "positive", 1417 | "type": "CONDITIONING", 1418 | "link": 278 1419 | }, 1420 | { 1421 | "name": "negative", 1422 | "type": "CONDITIONING", 1423 | "link": 279 1424 | }, 1425 | { 1426 | "name": "clip", 1427 | "type": "CLIP", 1428 | "link": 265 1429 | }, 1430 | { 1431 | "name": "tracking", 1432 | "type": "TRACKING", 1433 | "link": 275 1434 | }, 1435 | { 1436 | "name": "positionnet", 1437 | "type": "POSITIONNET", 1438 | "link": 267 1439 | }, 1440 | { 1441 | "name": "fusers", 1442 | "type": "FUSERS", 1443 | "link": 268 1444 | } 1445 | ], 1446 | "outputs": [ 1447 | { 1448 | "name": "positive", 1449 | "type": "CONDITIONING", 1450 | "links": [ 1451 | 269 1452 | ], 1453 | "shape": 3, 1454 | "slot_index": 0 1455 | }, 1456 | { 1457 | "name": "negative", 1458 | "type": "CONDITIONING", 1459 | "links": [ 1460 | 270 1461 | ], 1462 | "shape": 3, 1463 | "slot_index": 1 1464 | } 1465 | ], 1466 | "properties": { 1467 | "Node name for S&R": "InstanceDiffusionTrackingPrompt" 1468 | }, 1469 | "widgets_values": [ 1470 | 5, 1471 | "\"1.person\": \"(((The Mad Hatter, purple suit, purple top hot, red hair)))\",\n\"2.person\": \"(((a white rabbit wearing a suit, white bunny ears)))\",\n\"3.person\": \"((((Alice in Wonderland, blue dress, white apron, blonde))))\",\n\"4.person\": \"((((the Queen of Hearts, red and black dress, crown))))\",", 1472 | "" 1473 | ] 1474 | }, 1475 | { 1476 | "id": 47, 1477 | "type": "LoraLoaderModelOnly", 1478 | "pos": [ 1479 | 1092, 1480 | 493 1481 | ], 1482 | "size": { 1483 | "0": 315, 1484 | "1": 82 1485 | }, 1486 | "flags": {}, 1487 | "order": 13, 1488 | "mode": 0, 1489 | "inputs": [ 1490 | { 1491 | "name": "model", 1492 | "type": "MODEL", 1493 | "link": 111 1494 | } 1495 | ], 1496 | "outputs": [ 1497 | { 1498 | "name": "MODEL", 1499 | "type": "MODEL", 1500 | "links": [ 1501 | 112 1502 | ], 1503 | "shape": 3, 1504 | "slot_index": 0 1505 | } 1506 | ], 1507 | "properties": { 1508 | "Node name for S&R": "LoraLoaderModelOnly" 1509 | }, 1510 | "widgets_values": [ 1511 | "lcm/SD1.5/pytorch_lora_weights.safetensors", 1512 | 1 1513 | ] 1514 | }, 1515 | { 1516 | "id": 81, 1517 | "type": "KSampler", 1518 | "pos": [ 1519 | 3128, 1520 | -243 1521 | ], 1522 | "size": [ 1523 | 247.09541992187496, 1524 | 262 1525 | ], 1526 | "flags": {}, 1527 | "order": 31, 1528 | "mode": 0, 1529 | "inputs": [ 1530 | { 1531 | "name": "model", 1532 | "type": "MODEL", 1533 | "link": 273 1534 | }, 1535 | { 1536 | "name": "positive", 1537 | "type": "CONDITIONING", 1538 | "link": 269 1539 | }, 1540 | { 1541 | "name": "negative", 1542 | "type": "CONDITIONING", 1543 | "link": 270 1544 | }, 1545 | { 1546 | "name": "latent_image", 1547 | "type": "LATENT", 1548 | "link": 
214 1549 | } 1550 | ], 1551 | "outputs": [ 1552 | { 1553 | "name": "LATENT", 1554 | "type": "LATENT", 1555 | "links": [ 1556 | 215 1557 | ], 1558 | "shape": 3, 1559 | "slot_index": 0 1560 | } 1561 | ], 1562 | "properties": { 1563 | "Node name for S&R": "KSampler" 1564 | }, 1565 | "widgets_values": [ 1566 | 677130511272592, 1567 | "fixed", 1568 | 7, 1569 | 2, 1570 | "lcm", 1571 | "karras", 1572 | 1 1573 | ] 1574 | }, 1575 | { 1576 | "id": 89, 1577 | "type": "LoadInstanceScaleUNode", 1578 | "pos": [ 1579 | 2019, 1580 | 458 1581 | ], 1582 | "size": { 1583 | "0": 315, 1584 | "1": 58 1585 | }, 1586 | "flags": {}, 1587 | "order": 9, 1588 | "mode": 0, 1589 | "outputs": [ 1590 | { 1591 | "name": "SCALEU", 1592 | "type": "SCALEU", 1593 | "links": [ 1594 | 271 1595 | ], 1596 | "shape": 3, 1597 | "slot_index": 0 1598 | } 1599 | ], 1600 | "properties": { 1601 | "Node name for S&R": "LoadInstanceScaleUNode" 1602 | }, 1603 | "widgets_values": [ 1604 | "scaleu.ckpt" 1605 | ] 1606 | }, 1607 | { 1608 | "id": 88, 1609 | "type": "LoadInstanceFusersNode", 1610 | "pos": [ 1611 | 2034, 1612 | 339 1613 | ], 1614 | "size": { 1615 | "0": 315, 1616 | "1": 58 1617 | }, 1618 | "flags": {}, 1619 | "order": 10, 1620 | "mode": 0, 1621 | "outputs": [ 1622 | { 1623 | "name": "FUSERS", 1624 | "type": "FUSERS", 1625 | "links": [ 1626 | 268 1627 | ], 1628 | "shape": 3, 1629 | "slot_index": 0 1630 | } 1631 | ], 1632 | "properties": { 1633 | "Node name for S&R": "LoadInstanceFusersNode" 1634 | }, 1635 | "widgets_values": [ 1636 | "fusers.ckpt" 1637 | ] 1638 | }, 1639 | { 1640 | "id": 87, 1641 | "type": "LoadInstancePositionNetModel", 1642 | "pos": [ 1643 | 2024, 1644 | 221 1645 | ], 1646 | "size": { 1647 | "0": 315, 1648 | "1": 58 1649 | }, 1650 | "flags": {}, 1651 | "order": 11, 1652 | "mode": 0, 1653 | "outputs": [ 1654 | { 1655 | "name": "POSITIONNET", 1656 | "type": "POSITIONNET", 1657 | "links": [ 1658 | 267 1659 | ], 1660 | "shape": 3, 1661 | "slot_index": 0 1662 | } 1663 | ], 1664 | "properties": { 1665 | "Node name for S&R": "LoadInstancePositionNetModel" 1666 | }, 1667 | "widgets_values": [ 1668 | "position_net.ckpt" 1669 | ] 1670 | } 1671 | ], 1672 | "links": [ 1673 | [ 1674 | 10, 1675 | 10, 1676 | 0, 1677 | 12, 1678 | 0, 1679 | "IMAGE" 1680 | ], 1681 | [ 1682 | 19, 1683 | 12, 1684 | 0, 1685 | 16, 1686 | 0, 1687 | "IMAGE" 1688 | ], 1689 | [ 1690 | 22, 1691 | 8, 1692 | 0, 1693 | 18, 1694 | 0, 1695 | "IMAGE" 1696 | ], 1697 | [ 1698 | 48, 1699 | 12, 1700 | 0, 1701 | 27, 1702 | 0, 1703 | "IMAGE" 1704 | ], 1705 | [ 1706 | 107, 1707 | 12, 1708 | 0, 1709 | 46, 1710 | 0, 1711 | "IMAGE" 1712 | ], 1713 | [ 1714 | 111, 1715 | 49, 1716 | 0, 1717 | 47, 1718 | 0, 1719 | "MODEL" 1720 | ], 1721 | [ 1722 | 112, 1723 | 47, 1724 | 0, 1725 | 48, 1726 | 0, 1727 | "MODEL" 1728 | ], 1729 | [ 1730 | 133, 1731 | 60, 1732 | 0, 1733 | 56, 1734 | 1, 1735 | "M_MODELS" 1736 | ], 1737 | [ 1738 | 134, 1739 | 57, 1740 | 0, 1741 | 56, 1742 | 2, 1743 | "CONTEXT_OPTIONS" 1744 | ], 1745 | [ 1746 | 135, 1747 | 59, 1748 | 0, 1749 | 56, 1750 | 3, 1751 | "SAMPLE_SETTINGS" 1752 | ], 1753 | [ 1754 | 136, 1755 | 58, 1756 | 0, 1757 | 60, 1758 | 0, 1759 | "MOTION_MODEL_ADE" 1760 | ], 1761 | [ 1762 | 137, 1763 | 61, 1764 | 0, 1765 | 60, 1766 | 3, 1767 | "MULTIVAL" 1768 | ], 1769 | [ 1770 | 138, 1771 | 48, 1772 | 0, 1773 | 56, 1774 | 0, 1775 | "MODEL" 1776 | ], 1777 | [ 1778 | 148, 1779 | 49, 1780 | 1, 1781 | 63, 1782 | 0, 1783 | "CLIP" 1784 | ], 1785 | [ 1786 | 149, 1787 | 49, 1788 | 1, 1789 | 64, 1790 | 0, 1791 | "CLIP" 1792 | ], 1793 | [ 1794 | 214, 1795 | 16, 
1796 | 0, 1797 | 81, 1798 | 3, 1799 | "LATENT" 1800 | ], 1801 | [ 1802 | 215, 1803 | 81, 1804 | 0, 1805 | 8, 1806 | 0, 1807 | "LATENT" 1808 | ], 1809 | [ 1810 | 225, 1811 | 45, 1812 | 0, 1813 | 83, 1814 | 2, 1815 | "CONTROL_NET" 1816 | ], 1817 | [ 1818 | 226, 1819 | 46, 1820 | 0, 1821 | 83, 1822 | 3, 1823 | "IMAGE" 1824 | ], 1825 | [ 1826 | 231, 1827 | 6, 1828 | 0, 1829 | 85, 1830 | 0, 1831 | "CONDITIONING" 1832 | ], 1833 | [ 1834 | 232, 1835 | 7, 1836 | 0, 1837 | 85, 1838 | 1, 1839 | "CONDITIONING" 1840 | ], 1841 | [ 1842 | 233, 1843 | 26, 1844 | 0, 1845 | 85, 1846 | 2, 1847 | "CONTROL_NET" 1848 | ], 1849 | [ 1850 | 234, 1851 | 27, 1852 | 0, 1853 | 85, 1854 | 3, 1855 | "IMAGE" 1856 | ], 1857 | [ 1858 | 239, 1859 | 85, 1860 | 0, 1861 | 83, 1862 | 0, 1863 | "CONDITIONING" 1864 | ], 1865 | [ 1866 | 240, 1867 | 85, 1868 | 1, 1869 | 83, 1870 | 1, 1871 | "CONDITIONING" 1872 | ], 1873 | [ 1874 | 243, 1875 | 49, 1876 | 1, 1877 | 7, 1878 | 0, 1879 | "CLIP" 1880 | ], 1881 | [ 1882 | 244, 1883 | 49, 1884 | 1, 1885 | 6, 1886 | 0, 1887 | "CLIP" 1888 | ], 1889 | [ 1890 | 245, 1891 | 49, 1892 | 2, 1893 | 16, 1894 | 1, 1895 | "VAE" 1896 | ], 1897 | [ 1898 | 247, 1899 | 49, 1900 | 2, 1901 | 8, 1902 | 1, 1903 | "VAE" 1904 | ], 1905 | [ 1906 | 260, 1907 | 79, 1908 | 0, 1909 | 78, 1910 | 0, 1911 | "IMAGE" 1912 | ], 1913 | [ 1914 | 265, 1915 | 49, 1916 | 1, 1917 | 86, 1918 | 2, 1919 | "CLIP" 1920 | ], 1921 | [ 1922 | 267, 1923 | 87, 1924 | 0, 1925 | 86, 1926 | 4, 1927 | "POSITIONNET" 1928 | ], 1929 | [ 1930 | 268, 1931 | 88, 1932 | 0, 1933 | 86, 1934 | 5, 1935 | "FUSERS" 1936 | ], 1937 | [ 1938 | 269, 1939 | 86, 1940 | 0, 1941 | 81, 1942 | 1, 1943 | "CONDITIONING" 1944 | ], 1945 | [ 1946 | 270, 1947 | 86, 1948 | 1, 1949 | 81, 1950 | 2, 1951 | "CONDITIONING" 1952 | ], 1953 | [ 1954 | 271, 1955 | 89, 1956 | 0, 1957 | 90, 1958 | 1, 1959 | "SCALEU" 1960 | ], 1961 | [ 1962 | 272, 1963 | 56, 1964 | 0, 1965 | 90, 1966 | 0, 1967 | "MODEL" 1968 | ], 1969 | [ 1970 | 273, 1971 | 90, 1972 | 0, 1973 | 81, 1974 | 0, 1975 | "MODEL" 1976 | ], 1977 | [ 1978 | 274, 1979 | 12, 1980 | 0, 1981 | 91, 1982 | 0, 1983 | "IMAGE" 1984 | ], 1985 | [ 1986 | 275, 1987 | 91, 1988 | 1, 1989 | 86, 1990 | 3, 1991 | "TRACKING" 1992 | ], 1993 | [ 1994 | 276, 1995 | 91, 1996 | 0, 1997 | 79, 1998 | 0, 1999 | "IMAGE" 2000 | ], 2001 | [ 2002 | 278, 2003 | 83, 2004 | 0, 2005 | 86, 2006 | 0, 2007 | "CONDITIONING" 2008 | ], 2009 | [ 2010 | 279, 2011 | 83, 2012 | 1, 2013 | 86, 2014 | 1, 2015 | "CONDITIONING" 2016 | ] 2017 | ], 2018 | "groups": [], 2019 | "config": {}, 2020 | "extra": {}, 2021 | "version": 0.4 2022 | } -------------------------------------------------------------------------------- /model_helpers/prepare_fusers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | #from ..modules.attention import GatedSelfAttentionDense 4 | from comfy.gligen import GatedSelfAttentionDense as GSAD 5 | 6 | class GatedSelfAttentionDense(GSAD): 7 | 8 | def forward(self, x, instance_options={}): 9 | objs = instance_options['objs'] 10 | x = super().forward(x, objs) 11 | return x.to(torch.float16) 12 | 13 | def prepare_fusers(fusers_ckpt, fusers_scale) -> list[torch.nn.Module]: 14 | fusers_list = [] 15 | for key in fusers_ckpt['input_blocks']: 16 | fusers_ckpt['input_blocks'][key]['params']['query_dim'] = fusers_ckpt['input_blocks'][key]['params']['n_heads'] * \ 17 | fusers_ckpt['input_blocks'][key]['params']['d_head'] 18 | fuser = GatedSelfAttentionDense( 19 | 
**fusers_ckpt['input_blocks'][key]['params']) 20 | fuser.load_state_dict(fusers_ckpt['input_blocks'][key]['state']) 21 | fuser.scale = fusers_scale 22 | fusers_list.append(fuser) 23 | 24 | fusers_ckpt['middle_block']['1']['params']['query_dim'] = fusers_ckpt['middle_block']['1']['params']['n_heads'] * \ 25 | fusers_ckpt['middle_block']['1']['params']['d_head'] 26 | fuser = GatedSelfAttentionDense( 27 | **fusers_ckpt['middle_block']['1']['params']) 28 | fuser.load_state_dict(fusers_ckpt['middle_block']['1']['state']) 29 | fuser.scale = fusers_scale 30 | fusers_list.append(fuser) 31 | 32 | for key in fusers_ckpt['output_blocks']: 33 | fusers_ckpt['output_blocks'][key]['params']['query_dim'] = fusers_ckpt['output_blocks'][key]['params']['n_heads'] * \ 34 | fusers_ckpt['output_blocks'][key]['params']['d_head'] 35 | fuser = GatedSelfAttentionDense( 36 | **fusers_ckpt['output_blocks'][key]['params']) 37 | fuser.load_state_dict(fusers_ckpt['output_blocks'][key]['state']) 38 | fuser.scale = fusers_scale 39 | fusers_list.append(fuser) 40 | 41 | return fusers_list 42 | -------------------------------------------------------------------------------- /model_helpers/prepare_positionnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from ..modules.text_grounding_net import UniFusion 4 | 5 | 6 | def get_positionnet_default_params(): 7 | return { 8 | "in_dim": 768, 9 | "mid_dim": 3072, 10 | "out_dim": 768, 11 | "test_drop_boxes": False, 12 | "test_drop_masks": True, 13 | "test_drop_points": False, 14 | "test_drop_scribbles": True, 15 | "train_add_boxes": True, 16 | "train_add_masks": True, 17 | "train_add_points": True, 18 | "train_add_scribbles": True, 19 | "use_seperate_tokenizer": True, 20 | } 21 | 22 | 23 | def prepare_positionnet(checkpoint, params) -> torch.nn.Module: 24 | model = UniFusion(**params) 25 | model.load_state_dict(checkpoint, strict=False) 26 | return model 27 | -------------------------------------------------------------------------------- /model_helpers/prepare_scaleu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from ..modules.scaleu import ScaleU 4 | 5 | 6 | def get_scaleu_patch(scaleu_nets): 7 | def scaleu_patch(h, hsp, transformer_options): 8 | _, idx = transformer_options['block'] 9 | sk = scaleu_nets[idx](h, hsp) 10 | return sk 11 | 12 | return scaleu_patch 13 | 14 | 15 | def prepare_scaleu_nets(scaleu_ckpt) -> torch.nn.Module: 16 | scaleu_nets = [] 17 | for i in range(12): 18 | ckpt = scaleu_ckpt[f'{i}'] 19 | scaleu = ScaleU(True, len(ckpt['scaleu_b']), len(ckpt['scaleu_s'])) 20 | scaleu.load_state_dict(ckpt) 21 | scaleu_nets.append(scaleu) 22 | return scaleu_nets 23 | -------------------------------------------------------------------------------- /modules/convnext.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | 3 | # All rights reserved. 4 | 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the original repo. 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from timm.models.layers import trunc_normal_, DropPath 13 | from timm.models.registry import register_model 14 | 15 | import comfy.ops 16 | ops = comfy.ops.manual_cast 17 | 18 | class Block(nn.Module): 19 | r""" ConvNeXt Block. 
There are two equivalent implementations: 20 | (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) 21 | (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back 22 | We use (2) as we find it slightly faster in PyTorch 23 | 24 | Args: 25 | dim (int): Number of input channels. 26 | drop_path (float): Stochastic depth rate. Default: 0.0 27 | layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 28 | """ 29 | 30 | def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): 31 | super().__init__() 32 | self.dwconv = ops.Conv2d(dim, dim, kernel_size=7, 33 | padding=3, groups=dim) # depthwise conv 34 | self.norm = LayerNorm(dim, eps=1e-6) 35 | # pointwise/1x1 convs, implemented with linear layers 36 | self.pwconv1 = ops.Linear(dim, 4 * dim) 37 | self.act = nn.GELU() 38 | self.pwconv2 = ops.Linear(4 * dim, dim) 39 | self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), 40 | requires_grad=True) if layer_scale_init_value > 0 else None 41 | self.drop_path = DropPath( 42 | drop_path) if drop_path > 0. else nn.Identity() 43 | 44 | def forward(self, x): 45 | input = x 46 | x = self.dwconv(x) 47 | x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) 48 | x = self.norm(x) 49 | x = self.pwconv1(x) 50 | x = self.act(x) 51 | x = self.pwconv2(x) 52 | if self.gamma is not None: 53 | x = self.gamma * x 54 | x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) 55 | 56 | x = input + self.drop_path(x) 57 | return x 58 | 59 | 60 | class ConvNeXt(nn.Module): 61 | r""" ConvNeXt 62 | A PyTorch impl of : `A ConvNet for the 2020s` - 63 | https://arxiv.org/pdf/2201.03545.pdf 64 | 65 | Args: 66 | in_chans (int): Number of input image channels. Default: 3 67 | num_classes (int): Number of classes for classification head. Default: 1000 68 | depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] 69 | dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] 70 | drop_path_rate (float): Stochastic depth rate. Default: 0. 71 | layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 72 | head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. 
73 | """ 74 | 75 | def __init__(self, in_chans=3, num_classes=1000, 76 | depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], drop_path_rate=0., 77 | layer_scale_init_value=1e-6, head_init_scale=1., 78 | ): 79 | super().__init__() 80 | 81 | # stem and 3 intermediate downsampling conv layers 82 | self.downsample_layers = nn.ModuleList() 83 | stem = nn.Sequential( 84 | ops.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), 85 | LayerNorm(dims[0], eps=1e-6, data_format="channels_first") 86 | ) 87 | self.downsample_layers.append(stem) 88 | for i in range(3): 89 | downsample_layer = nn.Sequential( 90 | LayerNorm(dims[i], eps=1e-6, data_format="channels_first"), 91 | ops.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2), 92 | ) 93 | self.downsample_layers.append(downsample_layer) 94 | 95 | # 4 feature resolution stages, each consisting of multiple residual blocks 96 | self.stages = nn.ModuleList() 97 | dp_rates = [x.item() 98 | for x in torch.linspace(0, drop_path_rate, sum(depths))] 99 | cur = 0 100 | for i in range(4): 101 | stage = nn.Sequential( 102 | *[Block(dim=dims[i], drop_path=dp_rates[cur + j], 103 | layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])] 104 | ) 105 | self.stages.append(stage) 106 | cur += depths[i] 107 | 108 | def _init_weights(self, m): 109 | if isinstance(m, (ops.Conv2d, ops.Linear)): 110 | trunc_normal_(m.weight, std=.02) 111 | nn.init.constant_(m.bias, 0) 112 | 113 | def forward_features(self, x): 114 | for i in range(4): 115 | x = self.downsample_layers[i](x) 116 | x = self.stages[i](x) 117 | return x 118 | 119 | def forward(self, x): 120 | x = self.forward_features(x) 121 | # x = self.head(x) 122 | return x 123 | 124 | 125 | class LayerNorm(nn.Module): 126 | r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. 127 | The ordering of the dimensions in the inputs. channels_last corresponds to inputs with 128 | shape (batch_size, height, width, channels) while channels_first corresponds to inputs 129 | with shape (batch_size, channels, height, width). 
130 | """ 131 | 132 | def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): 133 | super().__init__() 134 | self.weight = nn.Parameter(torch.ones(normalized_shape)) 135 | self.bias = nn.Parameter(torch.zeros(normalized_shape)) 136 | self.eps = eps 137 | self.data_format = data_format 138 | if self.data_format not in ["channels_last", "channels_first"]: 139 | raise NotImplementedError 140 | self.normalized_shape = (normalized_shape, ) 141 | 142 | def forward(self, x): 143 | if self.data_format == "channels_last": 144 | return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) 145 | elif self.data_format == "channels_first": 146 | u = x.mean(1, keepdim=True) 147 | s = (x - u).pow(2).mean(1, keepdim=True) 148 | x = (x - u) / torch.sqrt(s + self.eps) 149 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 150 | return x 151 | 152 | 153 | model_urls = { 154 | "convnext_tiny_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth", 155 | "convnext_small_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth", 156 | "convnext_base_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth", 157 | "convnext_large_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth", 158 | "convnext_tiny_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth", 159 | "convnext_small_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth", 160 | "convnext_base_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth", 161 | "convnext_large_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth", 162 | "convnext_xlarge_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth", 163 | } 164 | 165 | 166 | @register_model 167 | def convnext_tiny(pretrained=False, in_22k=False, **kwargs): 168 | model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs) 169 | if pretrained: 170 | url = model_urls['convnext_tiny_22k'] if in_22k else model_urls['convnext_tiny_1k'] 171 | checkpoint = torch.hub.load_state_dict_from_url( 172 | url=url, map_location="cpu", check_hash=True) 173 | # we remove classifer head 174 | model.load_state_dict(checkpoint["model"], strict=False) 175 | return model 176 | 177 | 178 | # @register_model 179 | # def convnext_small(pretrained=False, in_22k=False, **kwargs): 180 | # model = ConvNeXt(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs) 181 | # if pretrained: 182 | # url = model_urls['convnext_small_22k'] if in_22k else model_urls['convnext_small_1k'] 183 | # checkpoint = torch.hub.load_state_dict_from_url( 184 | # url=url, map_location="cpu") 185 | # model.load_state_dict(checkpoint["model"]) 186 | # return model 187 | 188 | 189 | # @register_model 190 | # def convnext_base(pretrained=False, in_22k=False, **kwargs): 191 | # model = ConvNeXt(depths=[3, 3, 27, 3], dims=[ 192 | # 128, 256, 512, 1024], **kwargs) 193 | # if pretrained: 194 | # url = model_urls['convnext_base_22k'] if in_22k else model_urls['convnext_base_1k'] 195 | # checkpoint = torch.hub.load_state_dict_from_url( 196 | # url=url, map_location="cpu") 197 | # model.load_state_dict(checkpoint["model"]) 198 | # return model 199 | 200 | 201 | # @register_model 202 | # def convnext_large(pretrained=False, in_22k=False, **kwargs): 203 | # model = ConvNeXt(depths=[3, 3, 27, 3], dims=[ 204 | # 192, 384, 768, 1536], **kwargs) 205 | # if pretrained: 206 | # url = model_urls['convnext_large_22k'] if in_22k 
else model_urls['convnext_large_1k'] 207 | # checkpoint = torch.hub.load_state_dict_from_url( 208 | # url=url, map_location="cpu") 209 | # model.load_state_dict(checkpoint["model"]) 210 | # return model 211 | 212 | 213 | # @register_model 214 | # def convnext_xlarge(pretrained=False, in_22k=False, **kwargs): 215 | # model = ConvNeXt(depths=[3, 3, 27, 3], dims=[ 216 | # 256, 512, 1024, 2048], **kwargs) 217 | # if pretrained: 218 | # assert in_22k, "only ImageNet-22K pre-trained ConvNeXt-XL is available; please set in_22k=True" 219 | # url = model_urls['convnext_xlarge_22k'] 220 | # checkpoint = torch.hub.load_state_dict_from_url( 221 | # url=url, map_location="cpu") 222 | # model.load_state_dict(checkpoint["model"]) 223 | # return model 224 | -------------------------------------------------------------------------------- /modules/scaleu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.fft as fft 4 | 5 | 6 | def Fourier_filter(x_in, threshold, scale): 7 | x = x_in 8 | B, C, H, W = x.shape 9 | 10 | # Non-power of 2 images must be float32 11 | if (W & (W - 1)) != 0 or (H & (H - 1)) != 0: 12 | x = x.to(dtype=torch.float32) 13 | 14 | # FFT 15 | x_freq = fft.fftn(x, dim=(-2, -1)) 16 | x_freq = fft.fftshift(x_freq, dim=(-2, -1)) 17 | 18 | B, C, H, W = x_freq.shape 19 | mask = torch.ones((B, C, H, W), device=x.device) 20 | 21 | crow, ccol = H // 2, W // 2 22 | mask[..., crow - threshold: crow + threshold, 23 | ccol - threshold: ccol + threshold] = scale 24 | x_freq = x_freq * mask 25 | 26 | # IFFT 27 | x_freq = fft.ifftshift(x_freq, dim=(-2, -1)) 28 | x_filtered = fft.ifftn(x_freq, dim=(-2, -1)).real 29 | 30 | return x_filtered.to(dtype=x_in.dtype) 31 | 32 | 33 | class ScaleU(nn.Module): 34 | def __init__(self, enable_se_scaleu=True, b_size=1280, s_size=1): 35 | super(ScaleU, self).__init__() 36 | self.scaleu_b = nn.Parameter(torch.zeros(b_size)) 37 | self.scaleu_s = nn.Parameter(torch.zeros(s_size)) 38 | self.enable_se_scaleu = enable_se_scaleu 39 | 40 | def forward(self, h, hs_, transformer_options={}): 41 | h = h.to(torch.float32) 42 | hs_ = hs_.to(torch.float32) 43 | b = torch.tanh(self.scaleu_b) + 1 44 | s = torch.tanh(self.scaleu_s) + 1 45 | if self.enable_se_scaleu: 46 | hidden_mean = h.mean(1).unsqueeze(1) # B,1,H,W 47 | B = hidden_mean.shape[0] 48 | hidden_max, _ = torch.max(hidden_mean.view( 49 | B, -1), dim=-1, keepdim=True) # B,1 50 | hidden_min, _ = torch.min(hidden_mean.view( 51 | B, -1), dim=-1, keepdim=True) # B,1 52 | # duplicate the hidden_mean dimension 1 to C 53 | hidden_mean = (hidden_mean - hidden_min.unsqueeze(2).unsqueeze(3)) / \ 54 | (hidden_max - hidden_min).unsqueeze(2).unsqueeze(3) # B,1,H,W 55 | b = torch.einsum('c,bchw->bchw', b-1, hidden_mean) + 1.0 # B,C,H,W 56 | h = torch.einsum('bchw,bchw->bchw', h, b) 57 | else: 58 | h = torch.einsum('bchw,c->bchw', h, b) 59 | 60 | hs_ = Fourier_filter(hs_, threshold=1, scale=s) 61 | return h.to(torch.float16), hs_.to(torch.float16) 62 | -------------------------------------------------------------------------------- /modules/text_grounding_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .util import FourierEmbedder 4 | from .convnext import convnext_tiny 5 | 6 | import comfy.ops 7 | ops = comfy.ops.manual_cast 8 | 9 | class UniFusion(nn.Module): 10 | def __init__(self, in_dim, out_dim, mid_dim=3072, fourier_freqs=8, 11 | train_add_boxes=True, 
train_add_points=True, train_add_scribbles=True, train_add_masks=True, 12 | test_drop_boxes=False, test_drop_points=False, test_drop_scribbles=True, test_drop_masks=False, 13 | use_seperate_tokenizer=True, use_segs=True): 14 | super().__init__() 15 | self.in_dim = in_dim 16 | self.out_dim = out_dim 17 | self.mid_dim = mid_dim 18 | 19 | # InstanceDiffusion hyper-parameters 20 | self.n_scribble_points = 20 21 | self.n_polygon_points = 256 22 | fourier_freqs = 16 23 | fourier_freqs_polygons = 16 24 | self.add_boxes = train_add_boxes 25 | self.add_points = train_add_points 26 | self.add_scribbles = train_add_scribbles 27 | self.add_masks = train_add_masks 28 | self.use_seperate_tokenizer = use_seperate_tokenizer 29 | 30 | # Use instance masks as additional model inputs for mask conditioned image generation 31 | #self.use_segs = True if self.add_masks else False 32 | self.use_segs = use_segs 33 | 34 | if self.use_segs: 35 | in_dim = 30 36 | self.resize_input = 512 37 | self.down_factor = 64 # determined by the convnext backbone 38 | # from num_sem to 3 channels 39 | self.in_conv = ops.Conv2d(in_dim, 3, 3, 1, 1) 40 | self.convnext_tiny_backbone = convnext_tiny(pretrained=True) 41 | self.num_tokens = (self.resize_input // self.down_factor) ** 2 42 | self.convnext_feature_dim = 3072 43 | self.pos_embedding = nn.Parameter(torch.empty( 44 | 1, self.num_tokens, self.convnext_feature_dim).normal_(std=0.02)) # from BERT 45 | 46 | self.test_drop_boxes = test_drop_boxes 47 | self.test_drop_points = test_drop_points 48 | self.test_drop_scribbles = test_drop_scribbles 49 | self.test_drop_masks = test_drop_masks 50 | self.test_drop_segs = test_drop_masks 51 | 52 | self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs) 53 | self.fourier_embedder_polygons = FourierEmbedder( 54 | num_freqs=fourier_freqs_polygons) 55 | input_dim = self.in_dim 56 | input_dim_list = [] 57 | if self.add_boxes: 58 | # 2: sin and cos; 4: (x1,y1) and (x2,y2) 59 | self.position_dim = fourier_freqs * 2 * 4 60 | input_dim += self.position_dim 61 | input_dim_list.append(self.in_dim + self.position_dim) 62 | if self.add_points: 63 | self.point_dim = fourier_freqs * 2 * 2 # 2: sin and cos; 2: (x,y) 64 | input_dim += self.point_dim 65 | input_dim_list.append(self.in_dim + self.point_dim) 66 | if self.add_scribbles: 67 | self.scribble_dim = fourier_freqs_polygons * 2 * \ 68 | self.n_scribble_points * 2 # 2: sin and cos; 2: (x,y) 69 | input_dim += self.scribble_dim 70 | input_dim_list.append(self.in_dim + self.scribble_dim) 71 | if self.add_masks: 72 | self.polygon_dim = fourier_freqs_polygons * 2 * \ 73 | self.n_polygon_points * 2 # 2: sin and cos; 2: (x,y) 74 | input_dim += self.polygon_dim 75 | input_dim_list.append(self.in_dim + self.polygon_dim) 76 | if self.use_segs: 77 | input_dim += self.convnext_feature_dim 78 | input_dim_list.append(self.convnext_feature_dim) 79 | 80 | if self.use_seperate_tokenizer: 81 | self.linears_list = nn.ModuleList([]) 82 | for idx, input_dim_ in enumerate(input_dim_list): 83 | mid_dim = self.mid_dim 84 | self.linears_list.append(nn.Sequential( 85 | ops.Linear(input_dim_, mid_dim), 86 | nn.SiLU(), 87 | ops.Linear(mid_dim, mid_dim), 88 | nn.SiLU(), 89 | ops.Linear(mid_dim, out_dim), 90 | )) 91 | else: 92 | self.linears = nn.Sequential( 93 | ops.Linear(input_dim, self.mid_dim), 94 | nn.SiLU(), 95 | ops.Linear(self.mid_dim, self.mid_dim), 96 | nn.SiLU(), 97 | ops.Linear(self.mid_dim, out_dim), 98 | ) 99 | 100 | self.null_positive_feature = torch.nn.Parameter( 101 | torch.zeros([self.in_dim])) # text 
102 | if self.add_boxes: 103 | self.null_position_feature = torch.nn.Parameter( 104 | torch.zeros([self.position_dim])) 105 | if self.add_points: 106 | self.null_point_feature = torch.nn.Parameter( 107 | torch.zeros([self.point_dim])) 108 | if self.add_scribbles: 109 | self.null_scribble_feature = torch.nn.Parameter( 110 | torch.zeros([self.scribble_dim])) 111 | if self.add_masks: 112 | self.null_polygon_feature = torch.nn.Parameter( 113 | torch.zeros([self.polygon_dim])) 114 | if self.use_segs: 115 | self.null_seg_feature = torch.nn.Parameter( 116 | torch.zeros([self.convnext_feature_dim])) 117 | 118 | def reset_dropout_test(self): 119 | # drop_box = True 120 | # drop_point = False 121 | # drop_scribble = True 122 | # drop_polygons = True 123 | # drop_segs = True 124 | drop_box = self.test_drop_boxes 125 | drop_point = self.test_drop_points 126 | drop_scribble = self.test_drop_scribbles 127 | drop_polygons = self.test_drop_masks 128 | drop_segs = self.test_drop_masks 129 | 130 | return drop_point, drop_box, drop_scribble, drop_polygons, drop_segs 131 | 132 | def reset_dropout(self): 133 | drop_box = False 134 | drop_point = False 135 | drop_scribble = False 136 | drop_polygons = False 137 | drop_segs = False 138 | return drop_point, drop_box, drop_scribble, drop_polygons, drop_segs 139 | 140 | def reset_dropout_train(self, drop_point, drop_box, drop_scribble, drop_polygons, drop_segs): 141 | if not drop_polygons: 142 | drop_box = False 143 | drop_point = False 144 | if not drop_box or not drop_polygons: 145 | drop_point = False 146 | 147 | # keep point only for 10% of the time 148 | keep_point_only_ratio = 0.1 149 | keep_point_only = torch.rand(1).item() < keep_point_only_ratio 150 | if keep_point_only: 151 | drop_point = False 152 | drop_box = True 153 | drop_scribble = True 154 | drop_polygons = True 155 | drop_segs = True 156 | 157 | # keep scribble only for 0% of the time 158 | keep_scribble_only_ratio = 0.0 159 | keep_scribble_only = torch.rand( 160 | 1).item() < keep_scribble_only_ratio and not drop_scribble 161 | if keep_scribble_only: 162 | drop_point = True 163 | drop_box = True 164 | drop_scribble = False 165 | drop_polygons = True 166 | drop_segs = True 167 | 168 | # keep mask only for 0% of the time 169 | keep_mask_only_ratio = 0.0 170 | keep_mask_only = torch.rand( 171 | 1).item() < keep_mask_only_ratio and not drop_polygons 172 | if keep_mask_only: 173 | drop_point = True 174 | drop_box = True 175 | drop_scribble = True 176 | drop_polygons = False 177 | drop_segs = False 178 | 179 | # keep seg only for 10% of the time 180 | keep_seg_only_ratio = 0.1 # default 0.1 181 | keep_seg_only = torch.rand( 182 | 1).item() < keep_seg_only_ratio and not drop_segs 183 | if keep_seg_only: 184 | drop_point = False 185 | drop_box = False 186 | drop_scribble = True 187 | drop_polygons = False 188 | drop_segs = False 189 | 190 | # keep box only for 0% of the time 191 | keep_box_only_ratio = 0.0 # default 0.0 192 | keep_box_only = torch.rand( 193 | 1).item() < keep_box_only_ratio and not drop_box 194 | if keep_box_only: 195 | drop_point = True 196 | drop_box = False 197 | drop_scribble = True 198 | drop_polygons = True 199 | drop_segs = True 200 | 201 | return drop_point, drop_box, drop_scribble, drop_polygons, drop_segs 202 | 203 | def forward(self, embeddings): 204 | boxes = embeddings['boxes'] 205 | masks = embeddings['masks'] 206 | positive_embeddings = embeddings['prompts'] 207 | scribbles = embeddings['scribbles'] 208 | polygons = embeddings['polygons'] 209 | segs = 
embeddings['segments'] 210 | points = embeddings['points'] 211 | 212 | B, N, _ = boxes.shape 213 | masks = masks.unsqueeze(-1) 214 | 215 | drop_point, drop_box, drop_scribble, drop_polygons, drop_segs = self.reset_dropout() 216 | # randomly drop boxes or points embeddings. 217 | if self.add_boxes: 218 | drop_box_ratio = 0.1 219 | drop_box = torch.rand(1).item() < drop_box_ratio 220 | if self.add_points: 221 | drop_point_ratio = 0.1 222 | drop_point = torch.rand(1).item() < drop_point_ratio 223 | if self.add_scribbles: 224 | drop_scribble_ratio = 0.1 225 | drop_scribble = torch.rand(1).item() < drop_scribble_ratio 226 | if self.add_masks: 227 | drop_polygon_ratio = 0.1 228 | drop_polygons = torch.rand(1).item() < drop_polygon_ratio 229 | drop_segs = drop_polygons 230 | 231 | # Not training, always keep both boxes and points 232 | if not self.training: 233 | drop_point, drop_box, drop_scribble, drop_polygons, drop_segs = self.reset_dropout_test() 234 | else: 235 | drop_point, drop_box, drop_scribble, drop_polygons, drop_segs = self.reset_dropout_train( 236 | drop_point, drop_box, drop_scribble, drop_polygons, drop_segs) 237 | 238 | # set drop_box to False if all other inputs are dropped 239 | if drop_point and drop_box and drop_scribble and drop_polygons and drop_segs: 240 | drop_box = False 241 | 242 | # embedding position (it may includes padding as placeholder) 243 | if self.add_boxes: 244 | xyxy_embedding = self.fourier_embedder( 245 | boxes) # B*N*4 --> B*N*C (C=8*2*4) 246 | if self.add_points: 247 | if points is None: # we can always get a point using a box 248 | points = (boxes[:, :, :2] + boxes[:, :, 2:]) / 2.0 249 | point_embedding = self.fourier_embedder( 250 | points) # B*N*2 --> B*N*(8*2*2) 251 | if self.add_scribbles: 252 | scribble_embedding = self.fourier_embedder_polygons( 253 | scribbles) # B*N*20 --> B*N*(8*20*2) 254 | if self.add_masks: 255 | polygon_embedding = self.fourier_embedder_polygons( 256 | polygons) # B*N*128 --> B*N*(16*128*2) 257 | if self.use_segs: 258 | segs = torch.nn.functional.interpolate( 259 | segs, self.resize_input, mode="nearest") 260 | segs_feature = self.in_conv(segs) 261 | segs_feature = self.convnext_tiny_backbone(segs_feature) 262 | segs_feature = segs_feature.reshape(B, -1, self.num_tokens) 263 | segs_feature = segs_feature.permute(0, 2, 1) 264 | 265 | # learnable null embedding 266 | positive_null = self.null_positive_feature.view(1, 1, -1) 267 | if self.add_boxes: 268 | xyxy_null = self.null_position_feature.view(1, 1, -1) 269 | if self.add_points: 270 | point_null = self.null_point_feature.view(1, 1, -1) 271 | if self.add_scribbles: 272 | scribble_null = self.null_scribble_feature.view(1, 1, -1) 273 | if self.add_masks: 274 | polygon_null = self.null_polygon_feature.view(1, 1, -1) 275 | if self.use_segs: 276 | seg_null = self.null_seg_feature.view(1, 1, -1) 277 | seg_null = seg_null.repeat(B, self.num_tokens, 1) 278 | 279 | # replace padding with learnable null embedding 280 | positive_embeddings = positive_embeddings * \ 281 | masks + (1 - masks) * positive_null 282 | if self.use_seperate_tokenizer: 283 | embeddings_list = [] 284 | if self.add_boxes: 285 | # replace padding with learnable null embedding for boxes 286 | xyxy_masks = torch.zeros_like(masks).to( 287 | masks.device) if drop_box else masks.detach().clone() 288 | xyxy_embedding = xyxy_embedding * \ 289 | xyxy_masks + (1 - xyxy_masks) * xyxy_null 290 | if self.use_seperate_tokenizer: 291 | embeddings_list.append(xyxy_embedding) 292 | if self.add_points: 293 | # replace padding with 
learnable null embedding for points 294 | point_masks = torch.zeros_like(masks).to( 295 | boxes.device) if drop_point else masks.detach().clone() 296 | point_embedding = point_embedding * \ 297 | point_masks + (1 - point_masks) * point_null 298 | if self.use_seperate_tokenizer: 299 | embeddings_list.append(point_embedding) 300 | if self.add_scribbles: 301 | # sum along the batch dimension and check if all scribbles are 0s 302 | # replace padding with learnable null embedding for scribbles 303 | # scribble_embedding: torch.Size([bs, n_objs, 640]); masks_scribble: torch.Size([bs, n_objs, 1]); scribble_null: torch.Size([1, 1, 640]) 304 | masks_scribble = torch.zeros_like(masks).to(masks.device) if drop_scribble else ( 305 | (torch.sum(scribbles, dim=-1).unsqueeze(-1) + masks.detach().clone()) > 0).float() 306 | scribble_embedding = scribble_embedding * \ 307 | masks_scribble + (1 - masks_scribble) * scribble_null 308 | if self.use_seperate_tokenizer: 309 | embeddings_list.append(scribble_embedding) 310 | if self.add_masks: 311 | masks_polygons = torch.zeros_like(masks).to(masks.device) if drop_polygons else ( 312 | (torch.sum(polygons, dim=-1).unsqueeze(-1) + masks.detach().clone()) > 0).float() 313 | assert torch.sum( 314 | scribbles, dim=-1).unsqueeze(-1).size() == masks.size() 315 | polygon_embedding = polygon_embedding * \ 316 | masks_polygons + (1 - masks_polygons) * polygon_null 317 | if self.use_seperate_tokenizer: 318 | embeddings_list.append(polygon_embedding) 319 | if self.use_segs: 320 | # mask replacing 321 | masks_segs = torch.zeros(masks.shape[0]).to(masks.device) if drop_segs else ( 322 | torch.sum(segs, dim=(1, 2, 3)) > 0).float() 323 | masks_segs = masks_segs.view(-1, 1, 1) 324 | assert masks_segs.size()[0] == masks.shape[0] 325 | seg_embedding = segs_feature * masks_segs 326 | seg_embedding = seg_embedding + (1 - masks_segs) * seg_null 327 | # add pos 328 | seg_embedding = seg_embedding + self.pos_embedding 329 | if self.use_seperate_tokenizer: 330 | embeddings_list.append(seg_embedding) 331 | 332 | inputs = [positive_embeddings] 333 | if self.use_seperate_tokenizer: 334 | objs = [] 335 | # forward all types of embeddings using the corresponding linear layers 336 | for i, (linears, layout_embeddings) in enumerate(zip(self.linears_list, embeddings_list)): 337 | if i == len(embeddings_list) - 1 and self.use_segs: 338 | objs.append(linears(layout_embeddings)) 339 | else: 340 | objs.append( 341 | linears(torch.cat([positive_embeddings, layout_embeddings], dim=-1))) 342 | objs = torch.cat(objs, dim=1) 343 | else: 344 | # NOTE: orders should the same for training and testing 345 | if self.add_boxes: 346 | inputs.append(xyxy_embedding) 347 | if self.add_points: 348 | inputs.append(point_embedding) 349 | if self.add_scribbles: 350 | inputs.append(scribble_embedding) 351 | if self.add_masks: 352 | inputs.append(polygon_embedding) 353 | 354 | objs = self.linears(torch.cat(inputs, dim=-1)) 355 | assert objs.shape == torch.Size([B, N, self.out_dim]) 356 | drop_box_mask = True if drop_box and drop_polygons else False 357 | return objs, drop_box_mask 358 | -------------------------------------------------------------------------------- /modules/text_grounding_tokenizer_input.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class GroundingNetInput: 5 | def __init__(self): 6 | self.set = False 7 | self.return_att_masks = False 8 | self.image_size = 64 9 | self.return_att_masks32 = False 10 | 11 | def prepare(self, batch, 
image_size=64, device=None, dtype=None, return_att_masks=False): 12 | """ 13 | batch should be the output from dataset. 14 | Please define here how to process the batch and prepare the 15 | input only for the ground tokenizer. 16 | """ 17 | output = {} 18 | self.set = True 19 | self.return_att_masks = return_att_masks 20 | 21 | boxes = batch['boxes'] 22 | masks = batch['masks'] 23 | positive_embeddings = batch["prompts"] 24 | 25 | if self.return_att_masks: 26 | assert 'att_masks' in batch 27 | att_masks = batch['att_masks'] 28 | 29 | scribbles = batch['scribbles'] 30 | polygons = batch['polygons'] 31 | self.dim_scribbles = scribbles.shape[-1] 32 | self.dim_polygons = polygons.shape[-1] 33 | # NOTE: New Seg 34 | segs = batch["segments"] 35 | self.dim_segs = segs.shape[-1] 36 | points = batch["points"] 37 | 38 | self.batch, self.max_box, self.in_dim = positive_embeddings.shape 39 | self.device = positive_embeddings.device 40 | self.dtype = positive_embeddings.dtype 41 | 42 | output = { 43 | "boxes": boxes, 44 | "masks": masks, 45 | "prompts": positive_embeddings, 46 | } 47 | output["scribbles"] = scribbles 48 | output["polygons"] = polygons 49 | output["segments"] = segs 50 | output["points"] = points 51 | 52 | if self.return_att_masks: 53 | output['att_masks'] = att_masks 54 | return output 55 | 56 | def get_null_input(self, batch=None, latent_width=64, latent_height=64, device=None, dtype=None): 57 | """ 58 | Guidance for training (drop) or inference, 59 | please define the null input for the grounding tokenizer 60 | """ 61 | 62 | assert self.set, "not set yet, cannot call this funcion" 63 | batch = self.batch if batch is None else batch 64 | device = self.device if device is None else device 65 | dtype = self.dtype if dtype is None else dtype 66 | 67 | boxes = torch.zeros(batch, self.max_box, 4,).type(dtype).to(device) 68 | masks = torch.zeros(batch, self.max_box).type(dtype).to(device) 69 | # NOTE: New Seg 70 | segs = torch.zeros(batch, self.max_box, self.dim_segs, 71 | self.dim_segs).type(dtype).to(device) 72 | 73 | scribbles = torch.zeros(batch, self.max_box, 74 | self.dim_scribbles).type(dtype).to(device) 75 | polygons = torch.zeros(batch, self.max_box, 76 | self.dim_polygons).type(dtype).to(device) 77 | points = torch.zeros(batch, self.max_box, 2).type(dtype).to(device) 78 | 79 | positive_embeddings = torch.zeros( 80 | batch, self.max_box, self.in_dim).type(dtype).to(device) 81 | 82 | output = { 83 | "boxes": boxes, 84 | "masks": masks, 85 | "prompts": positive_embeddings, 86 | } 87 | output["scribbles"] = scribbles 88 | output["polygons"] = polygons 89 | output["segments"] = segs 90 | output["points"] = points 91 | 92 | if self.return_att_masks: 93 | att_masks = torch.zeros(batch, self.max_box, latent_width, latent_height).type( 94 | dtype).to(device) # TODO Order width/height 95 | output['att_masks'] = att_masks 96 | return output 97 | -------------------------------------------------------------------------------- /modules/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class FourierEmbedder(): 5 | def __init__(self, num_freqs=64, temperature=100): 6 | 7 | self.num_freqs = num_freqs 8 | self.temperature = temperature 9 | self.freq_bands = temperature ** (torch.arange(num_freqs) / num_freqs) 10 | 11 | @ torch.no_grad() 12 | def __call__(self, x, cat_dim=-1): 13 | "x: arbitrary shape of tensor. 
dim: cat dim" 14 | out = [] 15 | for freq in self.freq_bands: 16 | out.append(torch.sin(freq*x)) 17 | out.append(torch.cos(freq*x)) 18 | return torch.cat(out, cat_dim) 19 | -------------------------------------------------------------------------------- /nodes/apply_scaleu_model_node.py: -------------------------------------------------------------------------------- 1 | import comfy.model_management 2 | 3 | from .. import constants as constants 4 | from ..model_helpers.prepare_scaleu import get_scaleu_patch 5 | 6 | 7 | class ApplyScaleUModelNode: 8 | @classmethod 9 | def INPUT_TYPES(s): 10 | return {"required": { 11 | "model": ("MODEL",), 12 | "scaleu": ("SCALEU",), 13 | }} 14 | 15 | RETURN_TYPES = ("MODEL",) 16 | FUNCTION = "apply" 17 | 18 | CATEGORY = "instance" 19 | 20 | def apply(self, model, scaleu): 21 | # Validate patches dict is setup correctly 22 | transformer_options = model.model_options['transformer_options'] 23 | if 'patches' not in transformer_options: 24 | transformer_options['patches'] = {} 25 | 26 | if 'output_block_patch' not in transformer_options['patches']: 27 | transformer_options['patches']['output_block_patch'] = [] 28 | 29 | # Add scaleu patch to model patches 30 | scaleu_nets = scaleu['model_list'] 31 | # TODO make this load in KSampler 32 | for i, scaleu in enumerate(scaleu_nets): 33 | scaleu_nets[i] = scaleu.to( 34 | comfy.model_management.get_torch_device()) 35 | transformer_options['patches']['output_block_patch'].append( 36 | get_scaleu_patch(scaleu_nets)) 37 | return (model,) 38 | -------------------------------------------------------------------------------- /nodes/download_and_load_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import folder_paths 3 | import comfy.utils 4 | from .. 
import constants as constants 5 | from ..model_helpers.prepare_positionnet import prepare_positionnet, get_positionnet_default_params 6 | from ..model_helpers.prepare_scaleu import prepare_scaleu_nets 7 | from ..model_helpers.prepare_fusers import prepare_fusers 8 | from huggingface_hub import snapshot_download 9 | 10 | INSTANCE_FUSERS_DIR = "fuser_models" 11 | 12 | INSTANCE_SCALEU_DIR = "scaleu_models" 13 | 14 | class DownloadInstanceDiffusionModels: 15 | @classmethod 16 | def INPUT_TYPES(s): 17 | return {"required": { 18 | "use_segs": ("BOOLEAN", {"default": True}), 19 | "fusers_scale": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), 20 | }} 21 | 22 | RETURN_TYPES = ("POSITIONNET", "FUSERS", "SCALEU", ) 23 | FUNCTION = "load_model" 24 | 25 | CATEGORY = "instance/loaders" 26 | 27 | def load_model(self, use_segs: bool, fusers_scale: float): 28 | repo_id = "logtd/instance_diffusion" 29 | instance_models_folder = os.path.join(folder_paths.models_dir, constants.INSTANCE_MODELS_DIR) 30 | 31 | models_to_download = [ 32 | ("position_net", constants.INSTANCE_POSITIONNET_DIR, "position_net.ckpt"), 33 | ("fusers", constants.INSTANCE_FUSERS_DIR, "fusers.ckpt"), 34 | ("scaleu", constants.INSTANCE_SCALEU_DIR, "scaleu.ckpt") 35 | ] 36 | 37 | for model_name, model_folder, model_file in models_to_download: 38 | model_folder_path = os.path.join(instance_models_folder, model_folder) 39 | model_file_path = os.path.join(model_folder_path, model_file) 40 | 41 | if not os.path.exists(model_file_path): 42 | print(f"Selected model: {model_file_path} not found, downloading...") 43 | allow_patterns = [f"*{model_name}*"] 44 | snapshot_download(repo_id=repo_id, 45 | allow_patterns=allow_patterns, 46 | local_dir=model_folder_path, 47 | local_dir_use_symlinks=False 48 | ) 49 | 50 | positionnet_file = os.path.join(instance_models_folder, constants.INSTANCE_POSITIONNET_DIR, "position_net.ckpt") 51 | fusers_file = os.path.join(instance_models_folder, constants.INSTANCE_FUSERS_DIR, "fusers.ckpt") 52 | scaleu_file = os.path.join(instance_models_folder, constants.INSTANCE_SCALEU_DIR, "scaleu.ckpt") 53 | 54 | pos_checkpoint = comfy.utils.load_torch_file(positionnet_file, safe_load=True) 55 | params = get_positionnet_default_params() 56 | params["use_segs"] = use_segs 57 | model = prepare_positionnet(pos_checkpoint, params) 58 | positionnet = { 59 | 'model': model, 60 | } 61 | 62 | fusers_checkpoint = comfy.utils.load_torch_file(fusers_file, safe_load=True) 63 | fusers_list = prepare_fusers(fusers_checkpoint, fusers_scale) 64 | fusers = { 65 | 'model_list': fusers_list 66 | } 67 | scaleu_checkpoint = comfy.utils.load_torch_file(scaleu_file, safe_load=True) 68 | scaleu_list = prepare_scaleu_nets(scaleu_checkpoint) 69 | scaleu = { 70 | 'model_list': scaleu_list 71 | } 72 | return (positionnet, fusers, scaleu) 73 | 74 | -------------------------------------------------------------------------------- /nodes/instance_diffusion_tracking_prompt_node.py: -------------------------------------------------------------------------------- 1 | from ..utils.prompt_utils import extract_prompts 2 | from ..conditioning.instance_conditioning import InstanceConditioning 3 | 4 | 5 | class InstanceDiffusionTrackingPromptNode: 6 | @classmethod 7 | def INPUT_TYPES(s): 8 | return {"required": {"positive": ("CONDITIONING", ), 9 | "negative": ("CONDITIONING", ), 10 | "clip": ("CLIP", ), 11 | "tracking": ("TRACKING", ), 12 | "positionnet": ("POSITIONNET", ), 13 | "fusers": ("FUSERS", ), 14 | "positive_text": ("STRING", {"multiline": 
True}), 15 | "negative_text": ("STRING", {"multiline": True}), 16 | }} 17 | RETURN_TYPES = ("CONDITIONING", "CONDITIONING") 18 | RETURN_NAMES = ("positive", "negative") 19 | FUNCTION = "append" 20 | 21 | CATEGORY = "instance/conditioning" 22 | 23 | def _get_position_conds(self, clip, tracking, text): 24 | # Get prompts and their class id and tracker id 25 | prompt_pairs = extract_prompts(text) 26 | 27 | # Go through prompt pairs, encode prompts, and join with positions from tracking 28 | position_conds = [] 29 | for tracker_id, class_id, prompt in prompt_pairs: 30 | _, cond_pooled = clip.encode_from_tokens( 31 | clip.tokenize(prompt), return_pooled=True) 32 | # A tracker_id of -1 means that it is prompting all instances of a single class 33 | if tracker_id != -1: 34 | position_cond = {'cond_pooled': cond_pooled, 'positions': 35 | tracking[class_id][tracker_id]} 36 | position_conds.append(position_cond) 37 | else: 38 | for tracker_id in tracking[class_id]: 39 | position_cond = {'cond_pooled': cond_pooled, 40 | 'positions': tracking[class_id][tracker_id]} 41 | position_conds.append(position_cond) 42 | 43 | return position_conds 44 | 45 | def _apply_position_conds(self, position_conds, conditioning, fusers, positionnet): 46 | # Add prompts+embeddings to the input conditionings 47 | cond_out = [] 48 | for t in conditioning: 49 | n = [t[0], t[1].copy()] 50 | cond = n[1] 51 | prev = [] 52 | has_instance = 'instance_diffusion' in cond 53 | instance_conditioning = cond['instance_diffusion'] if has_instance else InstanceConditioning( 54 | fusers, positionnet) 55 | cond['instance_diffusion'] = instance_conditioning 56 | instance_conditioning.add_conds(position_conds) 57 | 58 | cond['gligen'] = ('position', instance_conditioning, None) 59 | 60 | cond_out.append(n) 61 | 62 | return cond_out 63 | 64 | def append(self, positive, negative, clip, tracking, fusers, positionnet, positive_text, negative_text, fusers_batch_size=None): 65 | 66 | positive_positions = self._get_position_conds( 67 | clip, tracking, positive_text) 68 | positive = self._apply_position_conds( 69 | positive_positions, positive, fusers, positionnet) 70 | 71 | negative_positions = self._get_position_conds( 72 | clip, tracking, negative_text) 73 | negative = self._apply_position_conds( 74 | negative_positions, negative, fusers, positionnet) 75 | 76 | return (positive, negative) 77 | -------------------------------------------------------------------------------- /nodes/load_instance_fusers_node.py: -------------------------------------------------------------------------------- 1 | from ..
import constants as constants 2 | from ..utils.model_utils import get_model_list, load_checkpoint 3 | from ..model_helpers.prepare_fusers import prepare_fusers 4 | 5 | 6 | class LoadInstanceFusersNode: 7 | @classmethod 8 | def INPUT_TYPES(s): 9 | return {"required": { 10 | "model_filename": (get_model_list(constants.INSTANCE_FUSERS_DIR),), 11 | "fusers_scale": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), 12 | }} 13 | 14 | RETURN_TYPES = ("FUSERS",) 15 | FUNCTION = "load_model" 16 | 17 | CATEGORY = "instance/loaders" 18 | 19 | def load_model(self, model_filename: str, fusers_scale: float): 20 | checkpoint = load_checkpoint( 21 | constants.INSTANCE_FUSERS_DIR, model_filename) 22 | fusers_list = prepare_fusers(checkpoint, fusers_scale) 23 | fusers = { 24 | 'model_list': fusers_list 25 | } 26 | return (fusers,) 27 | -------------------------------------------------------------------------------- /nodes/load_instance_positionnet_node.py: -------------------------------------------------------------------------------- 1 | from .. import constants as constants 2 | from ..utils.model_utils import get_model_list, load_checkpoint 3 | from ..model_helpers.prepare_positionnet import prepare_positionnet, get_positionnet_default_params 4 | 5 | 6 | class LoadInstancePositionNetNode: 7 | @classmethod 8 | def INPUT_TYPES(s): 9 | return {"required": { 10 | "model_filename": (get_model_list(constants.INSTANCE_POSITIONNET_DIR),), 11 | "use_segs": ("BOOLEAN", {"default": True}), 12 | }} 13 | 14 | RETURN_TYPES = ("POSITIONNET",) 15 | FUNCTION = "load_model" 16 | 17 | CATEGORY = "instance/loaders" 18 | 19 | def load_model(self, model_filename: str, use_segs: bool): 20 | checkpoint = load_checkpoint( 21 | constants.INSTANCE_POSITIONNET_DIR, model_filename) 22 | params = get_positionnet_default_params() 23 | params["use_segs"] = use_segs 24 | model = prepare_positionnet(checkpoint, params) 25 | positionnet = { 26 | 'model': model, 27 | } 28 | return (positionnet,) 29 | -------------------------------------------------------------------------------- /nodes/load_instance_scaleu_node.py: -------------------------------------------------------------------------------- 1 | from ..
import constants as constants 2 | from ..utils.model_utils import get_model_list, load_checkpoint 3 | from ..model_helpers.prepare_scaleu import prepare_scaleu_nets 4 | 5 | 6 | class LoadInstanceScaleUNode: 7 | @classmethod 8 | def INPUT_TYPES(s): 9 | return {"required": { 10 | "model_filename": (get_model_list(constants.INSTANCE_SCALEU_DIR),), 11 | }} 12 | 13 | RETURN_TYPES = ("SCALEU",) 14 | FUNCTION = "load_model" 15 | 16 | CATEGORY = "instance/loaders" 17 | 18 | def load_model(self, model_filename: str): 19 | checkpoint = load_checkpoint( 20 | constants.INSTANCE_SCALEU_DIR, model_filename) 21 | scaleu_list = prepare_scaleu_nets(checkpoint) 22 | scaleu = { 23 | 'model_list': scaleu_list 24 | } 25 | return (scaleu,) 26 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "comfyui-instancediffusion" 3 | description = "A set of nodes to perform multi-object prompting with InstanceDiffusion" 4 | version = "1.0.0" 5 | license = "LICENSE" 6 | dependencies = ["huggingface_hub"] 7 | 8 | [project.urls] 9 | Repository = "https://github.com/logtd/ComfyUI-InstanceDiffusion" 10 | # Used by Comfy Registry https://comfyregistry.org 11 | 12 | [tool.comfy] 13 | PublisherId = "logtd" 14 | DisplayName = "ComfyUI-InstanceDiffusion" 15 | Icon = "" 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | huggingface_hub -------------------------------------------------------------------------------- /utils/decode_item.py: -------------------------------------------------------------------------------- 1 | # Directly taken from InstanceDiffusion repo 2 | import torch 3 | import random 4 | import base64 5 | import numpy as np 6 | from io import BytesIO 7 | from collections import Counter 8 | from PIL import Image, ImageDraw 9 | import base64 10 | from skimage import measure 11 | 12 | 13 | # import nltk 14 | # from nltk.corpus import stopwords 15 | 16 | def decode_base64_to_pillow(image_b64): 17 | return Image.open(BytesIO(base64.b64decode(image_b64))).convert('RGB') 18 | 19 | 20 | def decode_tensor_from_string(arr_str, use_tensor=True): 21 | arr = np.frombuffer(base64.b64decode(arr_str), dtype='float32') 22 | if use_tensor: 23 | arr = torch.from_numpy(arr) 24 | return arr 25 | 26 | 27 | def close_contour(contour): 28 | if not np.array_equal(contour[0], contour[-1]): 29 | contour = np.vstack((contour, contour[0])) 30 | return contour 31 | 32 | # convert binay mask to polygon format 33 | 34 | 35 | def binary_mask_to_polygon(binary_mask, tolerance=0): 36 | """Converts a binary mask to COCO polygon representation 37 | 38 | Args: 39 | binary_mask: a 2D binary numpy array where '1's represent the object 40 | tolerance: Maximum distance from original points of polygon to approximated 41 | polygonal chain. If tolerance is 0, the original coordinate array is returned. 
42 | 43 | """ 44 | polygons = [] 45 | # pad mask to close contours of shapes which start and end at an edge 46 | padded_binary_mask = np.pad( 47 | binary_mask, pad_width=1, mode='constant', constant_values=0) 48 | contours = measure.find_contours(padded_binary_mask, tolerance) 49 | polygons = [] 50 | # print(contours) 51 | for contour in contours: 52 | contour = close_contour(contour) 53 | contour = measure.approximate_polygon(contour, tolerance) 54 | if len(contour) < 3: 55 | continue 56 | contour = np.flip(contour, axis=1) 57 | segmentation = contour.ravel().tolist() 58 | # after padding and subtracting 1 we may get -0.5 points in our segmentation 59 | segmentation = [0 if i < 0 else i for i in segmentation] 60 | polygons.append(segmentation) 61 | 62 | return polygons 63 | 64 | 65 | def sample_random_points_from_mask(mask, k): 66 | mask = mask[:, :, 0] 67 | # Find the coordinates of non-zero pixels in the binary mask 68 | nonzero_coords = np.transpose(np.nonzero(mask)) 69 | 70 | # Randomly sample 'k' points 71 | # return all zeros if there is no non-zero pixel 72 | if len(nonzero_coords) == 0: 73 | xy_points = [0 for _ in range(k * 2)] 74 | return xy_points 75 | 76 | # randomly sample with replacement if there are not enough non-zero pixels 77 | if len(nonzero_coords) < k and len(nonzero_coords) > 0: 78 | random_indices = np.random.choice(len(nonzero_coords), k, replace=True) 79 | # randomly sample withiout replacement if there are enough non-zero pixels 80 | else: 81 | random_indices = np.random.choice( 82 | len(nonzero_coords), k, replace=False) 83 | sampled_points = nonzero_coords[random_indices] 84 | 85 | # order the points by their distance to (0, 0) 86 | # center = np.array([mask.shape[0] // 2, mask.shape[1] // 2]) 87 | center = np.array([0, 0]) 88 | sampled_points = sorted(sampled_points, key=lambda x: np.linalg.norm( 89 | np.array(x) - center)) # np.linalg.norm 90 | 91 | # concatenate x and y coordinates and return them as a list 92 | # [x1,y1,x2,y2,...,x_k,y_k] 93 | xy_points = [] 94 | for x in sampled_points: 95 | xy_points.append(float(x[1])) 96 | xy_points.append(float(x[0])) 97 | return xy_points 98 | 99 | # convert numpy array of bool mask to float mask 100 | 101 | 102 | def binary_mask_to_int(binary_mask): 103 | return binary_mask.astype(np.int32) 104 | 105 | # uniformly sample points from the mask 106 | 107 | 108 | def sample_sparse_points(binary_mask, k, return_2d=False): 109 | # Find the coordinates of non-zero pixels in the binary mask 110 | nonzero_coords = np.array(np.nonzero(binary_mask)) 111 | if len(nonzero_coords) == 0: 112 | xy_points = [0 for _ in range(k * 2)] 113 | return xy_points 114 | 115 | # Calculate the total number of non-zero pixels 116 | num_nonzero_pixels = len(nonzero_coords) 117 | 118 | xy_points = [] 119 | if k >= num_nonzero_pixels: 120 | for x in nonzero_coords: 121 | xy_points.append(float(x[1])) 122 | xy_points.append(float(x[0])) 123 | for _ in range(k - num_nonzero_pixels): 124 | xy_points.append(nonzero_coords[-1][1]) 125 | xy_points.append(nonzero_coords[-1][0]) 126 | return nonzero_coords 127 | 128 | # Calculate the number of points to sample in each dimension 129 | num_points_per_dim = int(np.sqrt(k)) 130 | 131 | # Calculate the step size to ensure equal spacing 132 | step_size = max(1, num_nonzero_pixels // (num_points_per_dim ** 2)) 133 | 134 | # Sample points with equal spacing 135 | sampled_points = nonzero_coords[::step_size][:k] 136 | if return_2d: 137 | sampled_points = [(x[1], x[0]) for x in sampled_points] 138 | else: 139 | for 
x in sampled_points: 140 | xy_points.append(float(x[1])) 141 | xy_points.append(float(x[0])) 142 | return xy_points 143 | 144 | 145 | def sample_uniform_sparse_points(binary_mask, k): 146 | # binary_mask = binary_mask[:,:,0] 147 | # Step 1: Get the indices of '1' values in the binary mask 148 | foreground_indices = np.argwhere(binary_mask == 1) 149 | 150 | if len(foreground_indices) == 0: 151 | return [] 152 | 153 | selected_points = [] 154 | if len(foreground_indices) < k: 155 | # randomly sample with replacement if there are not enough non-zero pixels 156 | for i in range(k): 157 | random_point = random.choice(foreground_indices) 158 | selected_points.append((random_point[1], random_point[0])) 159 | else: 160 | # rank the points by their distance to the mean of the foreground_indices 161 | center = np.mean(foreground_indices, axis=0) 162 | # print(center) 163 | foreground_indices = sorted( 164 | foreground_indices, key=lambda x: np.linalg.norm(x - center)) # np.linalg.norm 165 | # Calculate the number of points to select from each segment 166 | points_per_segment = len(foreground_indices) // k 167 | 168 | # Step 2: Randomly select one point from each segment 169 | # print(k) 170 | for i in range(k): 171 | segment_points = foreground_indices[i * 172 | points_per_segment: (i + 1) * points_per_segment] 173 | # choose the middle point in each segment 174 | random_point = segment_points[len(segment_points) // 2] 175 | # random_point = random.choice(segment_points) 176 | selected_points.append((random_point[1], random_point[0])) 177 | 178 | return selected_points 179 | 180 | 181 | def sample_sparse_points_from_mask(mask, k): 182 | n_points = k 183 | n_polygons = n_points // 2 # half points should be sampled from the polygons 184 | mask = mask[:, :, 0] 185 | # sample sparse points from the polygons (boundary) 186 | polygons = binary_mask_to_polygon(mask, tolerance=0.0) 187 | # concatenate polygons to a single list 188 | polygons_single = [] 189 | for polygon in polygons: 190 | polygons_single += polygon 191 | if len(polygons_single) != 0: 192 | # uniformly sample points from the polygon 193 | polygons_single = np.array(polygons_single).reshape(-1, 2) 194 | indexes = np.linspace(0, polygons_single.shape[0] - 1, n_polygons) 195 | indexes = list([int(i) for i in indexes]) 196 | 197 | polygons_single = polygons_single[indexes] 198 | sampled_polygons = [(x[0], x[1]) for x in polygons_single] 199 | else: 200 | return None 201 | 202 | # sample sparse points from the mask 203 | n_inside_points = n_points - len(sampled_polygons) 204 | inside_points = sample_uniform_sparse_points(mask, n_inside_points) 205 | 206 | # combine inside_points and sampled_polygons 207 | xy_points = inside_points + sampled_polygons 208 | 209 | # order the points by their distance to (0, 0) 210 | center = np.array([0, 0]) 211 | xy_points = sorted(xy_points, key=lambda x: np.linalg.norm( 212 | np.array(x) - center)) # np.linalg.norm 213 | 214 | # return the sampled points 215 | sampled_points = [] 216 | for x in xy_points: 217 | sampled_points.append(x[0]) 218 | sampled_points.append(x[1]) 219 | return sampled_points 220 | 221 | 222 | def get_polygons_from_mask(mask, tolerance=0, n_polygon_points=256): 223 | mask = binary_mask_to_int(mask) 224 | return_polygons = True 225 | if return_polygons: 226 | # convert float mask to polygons 227 | polygons = binary_mask_to_polygon(mask[:, :, 0], tolerance=tolerance) 228 | 229 | # return all zeros if there is no polygon 230 | if len(polygons) == 0: 231 | polygons = [0 for _ in 
233 |             return polygons
234 | 
235 |         # concatenate polygons to a single list
236 |         polygon = []
237 |         for p in polygons:
238 |             polygon += p
239 | 
240 |         # uniformly sample points from the polygon
241 |         polygon = np.array(polygon).reshape(-1, 2)
242 |         indexes = np.linspace(0, polygon.shape[0] - 1, n_polygon_points)
243 |         indexes = [int(i) for i in indexes]
244 |         polygon = polygon[indexes].reshape(-1)
245 | 
246 |         return polygon
247 |     else:
248 |         sampled_points = sample_sparse_points(mask, n_polygon_points)
249 |         return sampled_points
250 | 
251 | 
252 | def decode_item(item):
253 |     # nothing to decode if the image is already a PIL image
254 |     if "image" in item and isinstance(item['image'], Image.Image):
255 |         return item
256 | 
257 |     item['image'] = decode_base64_to_pillow(item['image'])
258 |     segs = []
259 |     for anno in item['annos']:
260 |         anno['image_embedding_before'] = decode_tensor_from_string(
261 |             anno['image_embedding_before'])
262 |         anno['text_embedding_before'] = decode_tensor_from_string(
263 |             anno['text_embedding_before'])
264 |         anno['image_embedding_after'] = decode_tensor_from_string(
265 |             anno['image_embedding_after'])
266 |         anno['text_embedding_after'] = decode_tensor_from_string(
267 |             anno['text_embedding_after'])
268 |         if "blip_clip_embeddings" in anno:
269 |             anno['blip_clip_embeddings'] = decode_tensor_from_string(
270 |                 anno['blip_clip_embeddings'])
271 |         if 'mask' in anno:
272 |             # sample k random points from the mask
273 |             n_scribble_points = 20
274 |             rle = anno['mask']
275 |             binary_mask = decodeToBinaryMask(rle)
276 |             segs.append(binary_mask)
277 |             if "scribbles" in anno:
278 |                 anno['scribbles'] = anno["scribbles"]
279 |             else:
280 |                 anno['scribbles'] = sample_random_points_from_mask(
281 |                     binary_mask, n_scribble_points)
282 |             # convert mask to polygon
283 |             n_polygon_points = 256
284 |             polygons = sample_sparse_points_from_mask(
285 |                 binary_mask, k=n_polygon_points)
286 |             if polygons is not None:
287 |                 anno['polygons'] = polygons
288 |             else:
289 |                 anno['polygons'] = [0 for _ in range(n_polygon_points * 2)]
290 |     if len(segs) > 0:
291 |         item['segs'] = np.stack(segs).astype(np.float32).squeeze()
292 |     return item
293 | 
294 | 
295 | def check_unique(images, fields):
296 |     for field in fields:
297 |         temp_list = []
298 |         for img_info in images:
299 |             temp_list.append(img_info[field])
300 |         assert len(set(temp_list)) == len(temp_list), field
301 | 
302 | 
303 | def clean_data(data):
304 |     for data_info in data:
305 |         data_info.pop("original_img_id", None)
306 |         data_info.pop("original_id", None)
307 |         # sentence id for each image (multiple sentences for one image)
308 |         data_info.pop("sentence_id", None)
309 |         data_info.pop("dataset_name", None)
310 |         data_info.pop("data_source", None)
311 |         data_info["data_id"] = data_info.pop("id")
312 | 
313 | 
314 | def clean_annotations(annotations):
315 |     for anno_info in annotations:
316 |         anno_info.pop("iscrowd", None)
317 |         anno_info.pop("category_id", None)
318 |         anno_info.pop("area", None)
319 |         anno_info["data_id"] = anno_info.pop("image_id")
320 | 
321 | 
322 | def draw_box(img, boxes):
323 |     draw = ImageDraw.Draw(img)
324 |     for box in boxes:
325 |         draw.rectangle([box[0], box[1], box[2], box[3]],
326 |                        outline="red", width=2)  # x0 y0 x1 y1
327 |     return img
328 | 
329 | 
330 | def xyhw2xyxy(box):
331 |     x0, y0, w, h = box
332 |     return [x0, y0, x0 + w, y0 + h]
333 | 
334 | 
335 | def make_a_sentence_count_nums(obj_names):
336 |     # count the number of duplicated strings in the list
337 |     # ["dog", "dog", "cat"]
338 |     obj_names = dict(Counter(obj_names))
339 |     # {'dog': 2, 'cat': 1}
340 |     caption = ""
341 |     for item in obj_names:
342 |         caption += str(obj_names[item]) + " " + item + ", "
343 |     return caption[:-2]
344 | 
345 | 
346 | def make_a_sentence(obj_names, clean=False):
347 | 
348 |     if clean:
349 |         obj_names = [name[:-6] if ("-other" in name)
350 |                      else name for name in obj_names]
351 | 
352 |     caption = ""
353 |     tokens_positive = []
354 |     for obj_name in obj_names:
355 |         start_len = len(caption)
356 |         caption += obj_name
357 |         end_len = len(caption)
358 |         caption += ", "
359 |         tokens_positive.append(
360 |             # in a real caption, positive tokens can be disjoint, hence the list of lists
361 |             [[start_len, end_len]]
362 |         )
363 |     caption = caption[:-2]  # remove the last ", "
364 | 
365 |     return caption  # , tokens_positive
366 | 
367 | 
368 | def mask_for_random_drop_text_or_image_feature(masks, random_drop_embedding):
369 |     """
370 |     The input masks indicate which grounding tokens are valid for this image,
371 |     e.g., 1,1,1,1,0,0,0,0,0,0...
372 | 
373 |     If random_drop_embedding == 'both', we randomly drop either the image or
374 |     the text feature for each token,
375 |     but we always make sure at least one feature is kept.
376 |     In other words, the following masks are not valid
377 |     (because the second object would have no feature at all):
378 |     image: 1,0,1,1,0,0,0,0,0
379 |     text:  1,0,0,0,0,0,0,0,0
380 | 
381 |     If random_drop_embedding == 'image', we randomly drop the image feature
382 |     and always keep the text one.
383 | 
384 |     """
385 |     N = masks.shape[0]
386 |     image_masks, text_masks = masks, masks  # default: keep both features (e.g. 'none')
387 |     if random_drop_embedding == 'both':
388 |         temp_mask = torch.ones(2, N)
389 |         for i in range(N):
390 |             if random.uniform(0, 1) < 0.5:  # else keep both features
391 |                 # randomly choose to drop the image or the text feature
392 |                 idx = random.sample([0, 1], 1)[0]
393 |                 temp_mask[idx, i] = 0
394 |         image_masks = temp_mask[0] * masks
395 |         text_masks = temp_mask[1] * masks
396 | 
397 |     if random_drop_embedding == 'image':
398 |         image_masks = masks * (torch.rand(N) > 0.5) * 1
399 |         text_masks = masks
400 | 
401 |     return image_masks, text_masks
402 | 
403 | 
404 | def project(x, projection_matrix):
405 |     """
406 |     x (Batch x 768) should be the penultimate feature of CLIP (before projection).
407 |     projection_matrix (768 x 768) is the CLIP projection matrix; it should be the weight.data of the Linear layer
408 |     defined in CLIP (out_dim, in_dim), which is why we apply the transpose below.
409 |     This function returns the CLIP feature (without normalization).
410 |     """
411 |     return x @ torch.transpose(projection_matrix, 0, 1)
412 | 
413 | 
414 | def inv_project(y, projection_matrix):
415 |     """
416 |     y (Batch x 768) should be the CLIP feature (after projection).
417 |     projection_matrix (768 x 768) is the CLIP projection matrix; it should be the weight.data of the Linear layer
418 |     defined in CLIP (out_dim, in_dim).
419 |     This function returns the CLIP penultimate feature.
420 | 
421 |     Note: to get the correct penultimate feature, the input y should not be normalized.
422 |     If it is normalized, the result will be scaled by the CLIP feature norm, which is unknown.
423 |     """
424 |     return y @ torch.transpose(torch.linalg.inv(projection_matrix), 0, 1)
425 | 
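426 | 
427 | if __name__ == "__main__":
428 |     # Illustrative sanity check (an added sketch, not part of the original module):
429 |     # it verifies that inv_project undoes project, as the docstrings above state.
430 |     # A random, well-conditioned matrix stands in for the real CLIP projection
431 |     # weights (the weight.data of CLIP's projection Linear layer mentioned above).
432 |     torch.manual_seed(0)
433 |     penultimate = torch.randn(4, 768, dtype=torch.float64)
434 |     W = torch.eye(768, dtype=torch.float64) + 0.01 * torch.randn(768, 768, dtype=torch.float64)
435 |     projected = project(penultimate, W)       # penultimate feature -> CLIP feature
436 |     recovered = inv_project(projected, W)     # CLIP feature -> penultimate feature
437 |     assert torch.allclose(recovered, penultimate, atol=1e-6)
438 |     print("project/inv_project round-trip OK")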
--------------------------------------------------------------------------------
/utils/model_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import folder_paths
3 | 
4 | import comfy.utils
5 | 
6 | from .. import constants
7 | 
8 | 
9 | def get_model_dir(model_dir):
10 |     root_path = folder_paths.models_dir
11 |     path = os.path.join(root_path, constants.INSTANCE_MODELS_DIR, model_dir)
12 |     return path
13 | 
14 | 
15 | def get_model_list(model_dir) -> list[str]:
16 |     path = get_model_dir(model_dir)
17 |     return os.listdir(path)
18 | 
19 | 
20 | def load_checkpoint(model_dir, filename):
21 |     checkpoint_path = os.path.join(get_model_dir(model_dir), filename)
22 |     checkpoint = comfy.utils.load_torch_file(checkpoint_path, safe_load=True)
23 |     return checkpoint
24 | 
--------------------------------------------------------------------------------
/utils/prompt_utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | 
4 | def extract_prompts(input_string):
5 |     # Regex for entries of the form "<number>.<name>": "<text>"; the number may be negative
6 |     pattern = r"\"(-?\d+)\.([^\"]+)\":\s*\"([^\"]+)\""
7 | 
8 |     # Find all matches using the pattern
9 |     matches = re.findall(pattern, input_string)
10 | 
11 |     # Convert matches to a list of tuples (number, name, text)
12 |     result = [(int(number), name.strip(), text)
13 |               for number, name, text in matches]
14 | 
15 |     return result
16 | 
--------------------------------------------------------------------------------