├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── conditioning ├── embeddings.py ├── fusers_patch.py └── instance_conditioning.py ├── constants.py ├── example_workflows ├── Spline_Editor_InstanceDiffusion_kijai_01.json └── fourpeople_workflow.json ├── model_helpers ├── prepare_fusers.py ├── prepare_positionnet.py └── prepare_scaleu.py ├── modules ├── convnext.py ├── scaleu.py ├── text_grounding_net.py ├── text_grounding_tokenizer_input.py └── util.py ├── nodes ├── apply_scaleu_model_node.py ├── download_and_load_models.py ├── instance_diffusion_tracking_prompt_node.py ├── load_instance_fusers_node.py ├── load_instance_positionnet_node.py └── load_instance_scaleu_node.py ├── pyproject.toml ├── requirements.txt └── utils ├── decode_item.py ├── model_utils.py └── prompt_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .vscode 3 | **/__pycache__ 4 | **/*.ckpt 5 | .DS_Store 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI-InstanceDiffusion 2 | ComfyUI nodes to use InstanceDiffusion. 
3 | 4 | Original research repo: https://github.com/frank-xwang/InstanceDiffusion 5 | 6 | ## Table of Contents 7 | - [Installation](#installation) 8 | - [How to Install](#how-to-install) 9 | - [How to Configure Models](#how-to-configure-models) 10 | - [Accompanying Node Repos](#accompanying-node-repos) 11 | - [Examples](#examples) 12 | - [Acknowledgements](#acknowledgements) 13 | 14 | ## Installation 15 | 16 | ### How to Install 17 | Clone or download this repo into your `ComfyUI/custom_nodes/` directory. 18 | There are no Python package requirements outside of the standard ComfyUI requirements at this time. 19 | 20 | ### How to Configure Models 21 | These models were trained by [frank-xwang](https://github.com/frank-xwang) and were originally baked into Stable Diffusion 1.5. They have been spliced out into individual models so they can be used with other SD1.5 checkpoints. 22 | Download each of the checkpoints below and place it in its Installation Directory under `ComfyUI/models/instance_models/`. 23 | 24 | | Model Name | URL | Installation Directory | 25 | |------------|-----|------------------------| 26 | | fusers.ckpt | [huggingface](https://huggingface.co/spaces/logtd/instancediffusion/blob/main/fusers.ckpt) | `instance_models/fuser_models/` | 27 | | positionnet.ckpt | [huggingface](https://huggingface.co/spaces/logtd/instancediffusion/blob/main/position_net.ckpt) | `instance_models/positionnet_models/` | 28 | | scaleu.ckpt | [huggingface](https://huggingface.co/spaces/logtd/instancediffusion/blob/main/scaleu.ckpt) | `instance_models/scaleu_models/` | 29 |
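If you prefer to script the downloads instead of fetching the files by hand, here is a minimal, untested sketch using `huggingface_hub` (not one of this repo's requirements, so install it separately); the `DownloadInstanceDiffusionModels` node registered in `__init__.py` offers a similar convenience from inside ComfyUI. The `models_root` path and the helper itself are illustrative assumptions; only the repo id, filenames, and target subfolders come from the table above.

```python
# Optional helper sketch: fetch the three InstanceDiffusion checkpoints.
# Assumes `pip install huggingface_hub` and a default ComfyUI folder layout.
from pathlib import Path
from huggingface_hub import hf_hub_download

models_root = Path("ComfyUI/models/instance_models")  # adjust to your install

checkpoints = {
    "fusers.ckpt": "fuser_models",
    "position_net.ckpt": "positionnet_models",
    "scaleu.ckpt": "scaleu_models",
}

for filename, subdir in checkpoints.items():
    # The checkpoints are hosted in the logtd/instancediffusion Hugging Face Space.
    hf_hub_download(
        repo_id="logtd/instancediffusion",
        repo_type="space",
        filename=filename,
        local_dir=models_root / subdir,
    )
```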
30 | 31 | ## Accompanying Node Repos 32 | * [KJNodes for BBoxes](https://github.com/kijai/ComfyUI-KJNodes) 33 | * [Tracking Nodes for videos](https://github.com/logtd/ComfyUI-TrackingNodes) 34 | * [AnimateDiff-Evolved](https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved) 35 | * [Video Helper Suite](https://github.com/Kosinkadink/ComfyUI-VideoHelperSuite) 36 | 37 | ## Examples 38 | 39 | ### Text2Vid example using [Kijai](https://github.com/kijai)'s Spline Editor 40 | ![spline_editor_instances](https://github.com/logtd/ComfyUI-InstanceDiffusion/assets/160989552/8830e2e7-b0c3-4f4f-95b7-12ee21997fb1) 41 | 42 | 43 | ### Vid2Vid examples 44 | Example workflows can be found in the `example_workflows/` directory. 45 | 46 | https://github.com/logtd/ComfyUI-InstanceDiffusion/assets/160989552/ee42891a-cc38-421c-98bf-03a1be11d315 47 | 48 | https://github.com/logtd/ComfyUI-InstanceDiffusion/assets/160989552/40038526-5850-4cb6-9658-c38c7e4b20f9 49 | 50 | https://github.com/logtd/ComfyUI-InstanceDiffusion/assets/160989552/eae3520c-9a3d-4cde-b32f-1af9231ad2d4 51 | 52 | https://github.com/logtd/ComfyUI-InstanceDiffusion/assets/160989552/85b7d9df-7f7e-43c7-b2fa-b14fd5ec5e6d 53 | 54 | ## Unsupported Features 55 | InstanceDiffusion supports a wide range of inputs. The following input types do not yet have nodes that can convert them into InstanceDiffusion conditioning: 56 | * Scribbles 57 | * Points 58 | * Segments 59 | * Masks 60 | 61 | Points, segments, and masks are planned once proper tracking for these input types is implemented in ComfyUI. 62 | 63 | ## Acknowledgements 64 | * [frank-xwang](https://github.com/frank-xwang) for creating the original repo, training models, etc. 65 | * [Kosinkadink](https://github.com/Kosinkadink) for creating AnimateDiff-Evolved and providing support on integration 66 | * [Kijai](https://github.com/kijai) for improving the speed and adding tracking nodes 67 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .nodes.apply_scaleu_model_node import ApplyScaleUModelNode 2 | from .nodes.load_instance_scaleu_node import LoadInstanceScaleUNode 3 | from .nodes.load_instance_fusers_node import LoadInstanceFusersNode 4 | from .nodes.load_instance_positionnet_node import LoadInstancePositionNetNode 5 | from .nodes.instance_diffusion_tracking_prompt_node import InstanceDiffusionTrackingPromptNode 6 | from .nodes.download_and_load_models import DownloadInstanceDiffusionModels 7 | 8 | 9 | NODE_CLASS_MAPPINGS = { 10 | "ApplyScaleUModelNode": ApplyScaleUModelNode, 11 | "LoadInstanceScaleUNode": LoadInstanceScaleUNode, 12 | "LoadInstancePositionNetModel": LoadInstancePositionNetNode, 13 | "LoadInstanceFusersNode": LoadInstanceFusersNode, 14 | "InstanceDiffusionTrackingPrompt": InstanceDiffusionTrackingPromptNode, 15 | "DownloadInstanceDiffusionModels": DownloadInstanceDiffusionModels 16 | } 17 | 18 | NODE_DISPLAY_NAME_MAPPINGS = { 19 | "ApplyScaleUModelNode": "Apply Instance Diffusion ScaleU", 20 | "LoadInstancePositionNetModel": "Load Instance PositionNet Model", 21 | "LoadInstanceScaleUNode": "Load Instance ScaleU Model", 22 | "LoadInstanceFusersNode": "Load Instance Fusers Model", 23 | "InstanceDiffusionTrackingPrompt": "Instance Diffusion Tracking Prompt", 24 | "DownloadInstanceDiffusionModels": "(Down)Load Instance Diffusion Models" 25 | } 26 | -------------------------------------------------------------------------------- /conditioning/embeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from skimage.transform import resize 4 | from ..utils.decode_item import binary_mask_to_polygon, sample_uniform_sparse_points 5 | 6 | 7 | N_SCRIBBLE_POINTS = 20 8 | N_POLYGON_POINTS = 256 9 | N_MAX_OBJECTS = 30 10 | 11 | 12 | def get_point_from_box(bbox): 13 | x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3] 14 | return [(x0 + x1) / 2.0, (y0 + y1) / 2.0] 15 | 16 | 17 | def get_empty_binary_mask(img_width, img_height): 18 | return np.zeros((img_width, img_height, 1)) 19 | 20 | 21 | def sample_random_points_from_mask(mask, k=N_SCRIBBLE_POINTS): 22 | mask = mask[:, :, 0] 23 | # Find the coordinates of non-zero pixels in the binary mask 24 | nonzero_coords = np.transpose(np.nonzero(mask)) 25 | 26 | # Randomly sample 'k' points 27 | # return all zeros if there is no non-zero pixel 28 | if len(nonzero_coords) == 0: 29 | xy_points = [0 for _ in range(k * 2)] 30 | return xy_points 31 | 32 | # randomly sample with replacement if there are not enough non-zero pixels 33 | if len(nonzero_coords) < k and len(nonzero_coords) > 0: 34 | random_indices = np.random.choice(len(nonzero_coords), k, replace=True) 35 | # randomly sample without replacement if there are enough non-zero pixels 36 | else: 37 | random_indices = np.random.choice( 38 | len(nonzero_coords), k, replace=False) 39 | sampled_points = nonzero_coords[random_indices] 40 | 41 | # order the points by their distance to (0, 0) 42 | # center = np.array([mask.shape[0] // 2, mask.shape[1] // 2]) 43 | center = np.array([0, 0]) 44 | sampled_points = sorted(sampled_points, key=lambda x:
np.linalg.norm( 45 | np.array(x) - center)) # np.linalg.norm 46 | 47 | # concatenate x and y coordinates and return them as a list 48 | # [x1,y1,x2,y2,...,x_k,y_k] 49 | xy_points = [] 50 | for x in sampled_points: 51 | xy_points.append(float(x[1])) 52 | xy_points.append(float(x[0])) 53 | return xy_points 54 | 55 | 56 | def convert_points(points, img_width, img_height): 57 | # convert polygons'/scribbles' coordinates to relative values in (0, 1) 58 | for i in range(len(points)): 59 | if i % 2 == 0: 60 | points[i] = min(points[i] / img_width, 1.0) 61 | else: 62 | points[i] = min(points[i] / img_height, 1.0) 63 | return points 64 | 65 | 66 | def sample_sparse_points_from_mask(mask, k=256): 67 | n_points = k 68 | n_polygons = n_points // 2 # half points should be sampled from the polygons 69 | mask = mask[:, :, 0] 70 | # sample sparse points from the polygons (boundary) 71 | polygons = binary_mask_to_polygon(mask, tolerance=0.0) 72 | # concatenate polygons to a single list 73 | polygons_single = [] 74 | for polygon in polygons: 75 | polygons_single += polygon 76 | if len(polygons_single) != 0: 77 | # uniformly sample points from the polygon 78 | polygons_single = np.array(polygons_single).reshape(-1, 2) 79 | indexes = np.linspace(0, polygons_single.shape[0] - 1, n_polygons) 80 | indexes = list([int(i) for i in indexes]) 81 | 82 | polygons_single = polygons_single[indexes] 83 | sampled_polygons = [(x[0], x[1]) for x in polygons_single] 84 | else: 85 | return [0 for _ in range(n_points * 2)] 86 | 87 | # sample sparse points from the mask 88 | n_inside_points = n_points - len(sampled_polygons) 89 | inside_points = sample_uniform_sparse_points(mask, n_inside_points) 90 | 91 | # combine inside_points and sampled_polygons 92 | xy_points = inside_points + sampled_polygons 93 | 94 | # order the points by their distance to (0, 0) 95 | center = np.array([0, 0]) 96 | xy_points = sorted(xy_points, key=lambda x: np.linalg.norm( 97 | np.array(x) - center)) # np.linalg.norm 98 | 99 | # return the sampled points 100 | sampled_points = [] 101 | for x in xy_points: 102 | sampled_points.append(x[0]) 103 | sampled_points.append(x[1]) 104 | return sampled_points 105 | 106 | 107 | # [x0, y0, x1, y1] 108 | def get_grounding_input_from_coords(coords, img_width, img_height): 109 | x0, y0, x1, y1, coord_width, coord_height = coords 110 | location = [x0 / coord_width, y0 / coord_height, 111 | x1 / coord_width, y1 / coord_height] 112 | 113 | point = get_point_from_box(location) 114 | binary_mask = get_empty_binary_mask(img_width, img_height) 115 | 116 | scribble = sample_random_points_from_mask(binary_mask, k=N_SCRIBBLE_POINTS) 117 | scribble = convert_points(scribble, img_width, img_height) 118 | 119 | polygon = sample_sparse_points_from_mask(binary_mask, k=N_POLYGON_POINTS) 120 | polygon = convert_points(polygon, img_width, img_height) 121 | 122 | segment = resize(binary_mask.astype(np.float32), 123 | (img_width, img_height)).squeeze() 124 | # segment = np.stack(segment).astype(np.float32).squeeze() if len(segment) > 0 else segment 125 | 126 | return dict( 127 | polygon=polygon, 128 | scribble=scribble, 129 | segment=segment, 130 | box=location, 131 | point=point, 132 | ) 133 | 134 | 135 | def create_zero_input_tensors(n_frames, img_width, img_height): 136 | masks = torch.zeros(n_frames, N_MAX_OBJECTS) 137 | text_masks = torch.zeros(n_frames, N_MAX_OBJECTS) 138 | text_embeddings = torch.zeros(n_frames, N_MAX_OBJECTS, 768) 139 | box_embeddings = torch.zeros(n_frames, N_MAX_OBJECTS, 4) 140 | polygon_embeddings = torch.zeros(
141 | n_frames, N_MAX_OBJECTS, N_POLYGON_POINTS * 2) 142 | scribble_embeddings = torch.zeros( 143 | n_frames, N_MAX_OBJECTS, N_SCRIBBLE_POINTS * 2) 144 | segment_embeddings = torch.zeros( 145 | n_frames, N_MAX_OBJECTS, img_width, img_height) # TODO: width height order 146 | point_embeddings = torch.zeros(n_frames, N_MAX_OBJECTS, 2) 147 | 148 | return dict( 149 | masks=masks, 150 | text_masks=text_masks, 151 | prompts=text_embeddings, 152 | boxes=box_embeddings, 153 | polygons=polygon_embeddings, 154 | scribbles=scribble_embeddings, 155 | segments=segment_embeddings, 156 | points=point_embeddings 157 | ) 158 | 159 | 160 | def get_attn_mask(img_size=64): 161 | return torch.zeros(N_MAX_OBJECTS, img_size, img_size) 162 | 163 | 164 | def prepare_embeddings(conds, latent_shape, idxs, use_masked_att=False): 165 | batch_size, _, latent_height, latent_width = latent_shape 166 | if idxs is None: 167 | idxs = list(range(batch_size)) 168 | else: 169 | batch_size = len(idxs) 170 | embeddings = create_zero_input_tensors( 171 | batch_size, latent_width, latent_height) 172 | if use_masked_att: 173 | embeddings['att_masks'] = torch.zeros( 174 | batch_size, N_MAX_OBJECTS, latent_width, latent_height) 175 | 176 | for grounding_idx, frame_idx in enumerate(idxs): 177 | for cond_idx, cond in enumerate(conds): 178 | if cond['positions'][frame_idx] is None: 179 | continue 180 | 181 | grounding = get_grounding_input_from_coords( 182 | cond['positions'][frame_idx], latent_width, latent_height) 183 | embeddings['masks'][grounding_idx][cond_idx] = 1 184 | embeddings['text_masks'][grounding_idx][cond_idx] = 1 185 | embeddings['prompts'][grounding_idx][cond_idx] = cond['cond_pooled'] 186 | embeddings['boxes'][grounding_idx][cond_idx] = torch.tensor( 187 | grounding['box']) 188 | embeddings['polygons'][grounding_idx][cond_idx] = torch.tensor( 189 | grounding['polygon']) 190 | embeddings['scribbles'][grounding_idx][cond_idx] = torch.tensor( 191 | grounding['scribble']) 192 | embeddings['segments'][grounding_idx][cond_idx] = torch.tensor( 193 | grounding['segment']) 194 | embeddings['points'][grounding_idx][cond_idx] = torch.tensor( 195 | grounding['point']) 196 | 197 | if use_masked_att: 198 | box = grounding['box'] 199 | x1, y1, x2, y2 = int(np.round(box[0] * latent_width)), int(np.round(box[1] * latent_height)), int( 200 | np.round(box[2] * latent_width)), int(np.round(box[3] * latent_height)) 201 | embeddings['att_masks'][grounding_idx][cond_idx][x1:x2, y1:y2] = 1 202 | 203 | return embeddings 204 | -------------------------------------------------------------------------------- /conditioning/fusers_patch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .embeddings import prepare_embeddings 4 | 5 | 6 | block_map = { 7 | 'input': { 8 | 1: 0, 9 | 2: 1, 10 | 4: 2, 11 | 5: 3, 12 | 7: 4, 13 | 8: 5 14 | }, 15 | 'middle': { 16 | 0: 6, 17 | }, 18 | 'output': { 19 | 3: 7, 20 | 4: 8, 21 | 5: 9, 22 | 6: 10, 23 | 7: 11, 24 | 8: 12, 25 | 9: 13, 26 | 10: 14, 27 | 11: 15 28 | } 29 | } 30 | 31 | 32 | class FusersPatch(torch.nn.Module): 33 | def __init__(self, conds, fusers_list, positionnet, latent_shape, idxs, device): 34 | super(FusersPatch, self).__init__() 35 | self.conds = conds 36 | self.fusers_list = fusers_list 37 | self.positionnet = positionnet 38 | self.latent_shape = latent_shape 39 | self.idxs = idxs 40 | self.device = device 41 | 42 | def _get_position_objs(self, idxs): 43 | embeddings = prepare_embeddings( 44 | self.conds, self.latent_shape, idxs, True) 
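# Note: prepare_embeddings returns a dict of per-frame grounding tensors (masks,
# text_masks, prompts, boxes, polygons, scribbles, segments, points, plus att_masks
# since use_masked_att=True above). They are moved to the sampling device below
# before PositionNet converts them into the grounding tokens consumed by the fusers.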
45 | for key in embeddings: 46 | embeddings[key] = embeddings[key].to(self.device) 47 | objs, drop_box_mask = self.positionnet(embeddings) 48 | return {'objs': objs, 'drop_box_mask': drop_box_mask} 49 | 50 | def _get_idxs(self, x, extra_options): 51 | if extra_options is not None: 52 | if 'ad_params' in extra_options: 53 | return extra_options['ad_params']['sub_idxs'] 54 | elif 'sub_idxs' in extra_options: 55 | return extra_options['sub_idxs'] 56 | 57 | return list(range(x.shape[0])) 58 | 59 | @torch.no_grad() 60 | def forward(self, x, extra_options): 61 | block, idx = extra_options['block'] 62 | fuser_idx = block_map[block][idx] 63 | fuser = self.fusers_list[fuser_idx] 64 | attn_total = [] 65 | idxs = self._get_idxs(x, extra_options) 66 | 67 | attn_total = fuser(x, self._get_position_objs(idxs)) 68 | return attn_total.to(torch.float16) 69 | -------------------------------------------------------------------------------- /conditioning/instance_conditioning.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import comfy.model_management 4 | 5 | from .fusers_patch import FusersPatch 6 | 7 | 8 | class InstanceConditioning: 9 | """ 10 | This class masquerades as Gligen in order to trigger setup 11 | """ 12 | 13 | def __init__(self, fusers, positionnet): 14 | self.fusers_list = fusers['model_list'] 15 | self.positionnet = positionnet['model'] 16 | self.conds = [] 17 | self.current_device = comfy.model_management.intermediate_device() 18 | 19 | # Gligen hacks 20 | self.model = self 21 | self.load_device = comfy.model_management.get_torch_device() 22 | self.offload_device = comfy.model_management.intermediate_device() 23 | 24 | def loaded_size(self): 25 | return 0 26 | 27 | def current_loaded_device(self): 28 | return comfy.model_management.intermediate_device() 29 | 30 | def get_fusers_patch(self, latent_shape, idxs, device): 31 | return FusersPatch(self.conds, self.fusers_list, self.positionnet, latent_shape, idxs, device) 32 | 33 | def set_position(self, latent_shape, _, device): 34 | # Called in samplers by gligen cond to return middle attention patch 35 | batch_size = latent_shape[0] 36 | idxs = list(range(batch_size)) 37 | fusers_patch = self.get_fusers_patch(latent_shape, idxs, device) 38 | return fusers_patch 39 | 40 | def add_conds(self, conds): 41 | self.conds.extend(conds) 42 | 43 | def get_models(self, *args, **kwargs) -> list[torch.nn.Module]: 44 | # Used to get models for loading/offloading 45 | return [(None, model) for model in [*self.fusers_list, self.positionnet]] 46 | 47 | def inference_memory_requirements(self, dtype, *args, **kwargs) -> int: 48 | # Used to calculate memory requirements by ControlNet 49 | return 0 50 | 51 | def is_clone(self, other, *args, **kwargs): 52 | return other == self 53 | 54 | def clone(self): 55 | return self 56 | 57 | def model_size(self, *args, **kwargs): 58 | return 0 59 | 60 | def memory_required(self, *args, **kwargs): 61 | return 0 62 | 63 | def model_patches_to(self, device_or_dtype, *args, **kwargs): 64 | if device_or_dtype == torch.float16 or device_or_dtype == torch.float32: 65 | return 66 | if device_or_dtype is None: 67 | return 68 | self.positionnet = self.positionnet.to(device_or_dtype) 69 | for i, fuser in enumerate(self.fusers_list): 70 | self.fusers_list[i] = fuser.to(device_or_dtype) 71 | 72 | def model_dtype(self, *args, **kwargs): 73 | return torch.float32 74 | 75 | def patch_model(self, *args, **kwargs): 76 | return 77 | 78 | def unpatch_weights(self, *args, **kwargs): 79 
| return 80 | 81 | def unpatch_model(self, *args, **kwargs): 82 | return 83 | 84 | def set_model_patch(self, *args, **kwargs): 85 | return 86 | 87 | def set_model_patch_replace(self, *args, **kwargs): 88 | return 89 | -------------------------------------------------------------------------------- /constants.py: -------------------------------------------------------------------------------- 1 | 2 | INSTANCE_MODELS_DIR = "instance_models" 3 | INSTANCE_FUSERS_DIR = "fuser_models" 4 | INSTANCE_POSITIONNET_DIR = "positionnet_models" 5 | INSTANCE_SCALEU_DIR = "scaleu_models" 6 | -------------------------------------------------------------------------------- /example_workflows/fourpeople_workflow.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 91, 3 | "last_link_id": 279, 4 | "nodes": [ 5 | { 6 | "id": 64, 7 | "type": "CLIPTextEncode", 8 | "pos": [ 9 | 877, 10 | 294 11 | ], 12 | "size": { 13 | "0": 210, 14 | "1": 85.9561767578125 15 | }, 16 | "flags": {}, 17 | "order": 15, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "clip", 22 | "type": "CLIP", 23 | "link": 149 24 | } 25 | ], 26 | "outputs": [ 27 | { 28 | "name": "CONDITIONING", 29 | "type": "CONDITIONING", 30 | "links": [], 31 | "slot_index": 0 32 | } 33 | ], 34 | "properties": { 35 | "Node name for S&R": "CLIPTextEncode" 36 | }, 37 | "widgets_values": [ 38 | "nsfw, naked" 39 | ] 40 | }, 41 | { 42 | "id": 45, 43 | "type": "ControlNetLoader", 44 | "pos": [ 45 | 2006, 46 | -326 47 | ], 48 | "size": { 49 | "0": 315, 50 | "1": 58 51 | }, 52 | "flags": {}, 53 | "order": 0, 54 | "mode": 0, 55 | "outputs": [ 56 | { 57 | "name": "CONTROL_NET", 58 | "type": "CONTROL_NET", 59 | "links": [ 60 | 225 61 | ], 62 | "shape": 3, 63 | "slot_index": 0 64 | } 65 | ], 66 | "properties": { 67 | "Node name for S&R": "ControlNetLoader" 68 | }, 69 | "widgets_values": [ 70 | "control_v11p_sd15_softedge.pth" 71 | ] 72 | }, 73 | { 74 | "id": 26, 75 | "type": "ControlNetLoader", 76 | "pos": [ 77 | 1272, 78 | -317 79 | ], 80 | "size": { 81 | "0": 315, 82 | "1": 58 83 | }, 84 | "flags": {}, 85 | "order": 1, 86 | "mode": 0, 87 | "outputs": [ 88 | { 89 | "name": "CONTROL_NET", 90 | "type": "CONTROL_NET", 91 | "links": [ 92 | 233 93 | ], 94 | "shape": 3, 95 | "slot_index": 0 96 | } 97 | ], 98 | "properties": { 99 | "Node name for S&R": "ControlNetLoader" 100 | }, 101 | "widgets_values": [ 102 | "control_v11f1p_sd15_depth.pth" 103 | ] 104 | }, 105 | { 106 | "id": 55, 107 | "type": "ControlNetLoader", 108 | "pos": [ 109 | 1635, 110 | -323 111 | ], 112 | "size": { 113 | "0": 315, 114 | "1": 58 115 | }, 116 | "flags": {}, 117 | "order": 2, 118 | "mode": 0, 119 | "outputs": [ 120 | { 121 | "name": "CONTROL_NET", 122 | "type": "CONTROL_NET", 123 | "links": [], 124 | "shape": 3, 125 | "slot_index": 0 126 | } 127 | ], 128 | "properties": { 129 | "Node name for S&R": "ControlNetLoader" 130 | }, 131 | "widgets_values": [ 132 | "control_v11p_sd15_openpose_fp16.safetensors" 133 | ] 134 | }, 135 | { 136 | "id": 27, 137 | "type": "MiDaS-DepthMapPreprocessor", 138 | "pos": [ 139 | 1292, 140 | -504 141 | ], 142 | "size": { 143 | "0": 315, 144 | "1": 106 145 | }, 146 | "flags": {}, 147 | "order": 21, 148 | "mode": 0, 149 | "inputs": [ 150 | { 151 | "name": "image", 152 | "type": "IMAGE", 153 | "link": 48 154 | } 155 | ], 156 | "outputs": [ 157 | { 158 | "name": "IMAGE", 159 | "type": "IMAGE", 160 | "links": [ 161 | 234 162 | ], 163 | "shape": 3, 164 | "slot_index": 0 165 | } 166 | ], 167 | "properties": { 168 | "Node 
name for S&R": "MiDaS-DepthMapPreprocessor" 169 | }, 170 | "widgets_values": [ 171 | 6.283185307179586, 172 | 0.1, 173 | 512 174 | ] 175 | }, 176 | { 177 | "id": 46, 178 | "type": "HEDPreprocessor", 179 | "pos": [ 180 | 2080, 181 | -475 182 | ], 183 | "size": { 184 | "0": 210, 185 | "1": 82 186 | }, 187 | "flags": {}, 188 | "order": 22, 189 | "mode": 0, 190 | "inputs": [ 191 | { 192 | "name": "image", 193 | "type": "IMAGE", 194 | "link": 107 195 | } 196 | ], 197 | "outputs": [ 198 | { 199 | "name": "IMAGE", 200 | "type": "IMAGE", 201 | "links": [ 202 | 226 203 | ], 204 | "shape": 3, 205 | "slot_index": 0 206 | } 207 | ], 208 | "properties": { 209 | "Node name for S&R": "HEDPreprocessor" 210 | }, 211 | "widgets_values": [ 212 | "enable", 213 | 512 214 | ] 215 | }, 216 | { 217 | "id": 48, 218 | "type": "ModelSamplingDiscrete", 219 | "pos": [ 220 | 1505, 221 | 582 222 | ], 223 | "size": { 224 | "0": 315, 225 | "1": 82 226 | }, 227 | "flags": {}, 228 | "order": 19, 229 | "mode": 0, 230 | "inputs": [ 231 | { 232 | "name": "model", 233 | "type": "MODEL", 234 | "link": 112 235 | } 236 | ], 237 | "outputs": [ 238 | { 239 | "name": "MODEL", 240 | "type": "MODEL", 241 | "links": [ 242 | 138 243 | ], 244 | "shape": 3, 245 | "slot_index": 0 246 | } 247 | ], 248 | "properties": { 249 | "Node name for S&R": "ModelSamplingDiscrete" 250 | }, 251 | "widgets_values": [ 252 | "eps", 253 | false 254 | ] 255 | }, 256 | { 257 | "id": 60, 258 | "type": "ADE_ApplyAnimateDiffModel", 259 | "pos": [ 260 | 1483, 261 | 724 262 | ], 263 | "size": { 264 | "0": 319.20001220703125, 265 | "1": 182 266 | }, 267 | "flags": {}, 268 | "order": 12, 269 | "mode": 0, 270 | "inputs": [ 271 | { 272 | "name": "motion_model", 273 | "type": "MOTION_MODEL_ADE", 274 | "link": 136, 275 | "slot_index": 0 276 | }, 277 | { 278 | "name": "motion_lora", 279 | "type": "MOTION_LORA", 280 | "link": null 281 | }, 282 | { 283 | "name": "scale_multival", 284 | "type": "MULTIVAL", 285 | "link": null, 286 | "slot_index": 2 287 | }, 288 | { 289 | "name": "effect_multival", 290 | "type": "MULTIVAL", 291 | "link": 137, 292 | "slot_index": 3 293 | }, 294 | { 295 | "name": "ad_keyframes", 296 | "type": "AD_KEYFRAMES", 297 | "link": null 298 | }, 299 | { 300 | "name": "prev_m_models", 301 | "type": "M_MODELS", 302 | "link": null 303 | } 304 | ], 305 | "outputs": [ 306 | { 307 | "name": "M_MODELS", 308 | "type": "M_MODELS", 309 | "links": [ 310 | 133 311 | ], 312 | "shape": 3, 313 | "slot_index": 0 314 | } 315 | ], 316 | "properties": { 317 | "Node name for S&R": "ADE_ApplyAnimateDiffModel" 318 | }, 319 | "widgets_values": [ 320 | 0, 321 | 1 322 | ] 323 | }, 324 | { 325 | "id": 59, 326 | "type": "ADE_AnimateDiffSamplingSettings", 327 | "pos": [ 328 | 1480, 329 | 1264 330 | ], 331 | "size": { 332 | "0": 315, 333 | "1": 234 334 | }, 335 | "flags": {}, 336 | "order": 3, 337 | "mode": 0, 338 | "inputs": [ 339 | { 340 | "name": "noise_layers", 341 | "type": "NOISE_LAYERS", 342 | "link": null, 343 | "slot_index": 0 344 | }, 345 | { 346 | "name": "iteration_opts", 347 | "type": "ITERATION_OPTS", 348 | "link": null, 349 | "slot_index": 1 350 | }, 351 | { 352 | "name": "seed_override", 353 | "type": "INT", 354 | "link": null, 355 | "widget": { 356 | "name": "seed_override" 357 | } 358 | }, 359 | { 360 | "name": "sigma_schedule", 361 | "type": "SIGMA_SCHEDULE", 362 | "link": null 363 | }, 364 | { 365 | "name": "seed_override", 366 | "type": "INT", 367 | "link": null, 368 | "widget": { 369 | "name": "seed_override" 370 | } 371 | } 372 | ], 373 | "outputs": [ 374 | { 
375 | "name": "settings", 376 | "type": "SAMPLE_SETTINGS", 377 | "links": [ 378 | 135 379 | ], 380 | "shape": 3 381 | } 382 | ], 383 | "properties": { 384 | "Node name for S&R": "ADE_AnimateDiffSamplingSettings" 385 | }, 386 | "widgets_values": [ 387 | 0, 388 | "FreeNoise", 389 | "comfy", 390 | 0, 391 | 0, 392 | false 393 | ] 394 | }, 395 | { 396 | "id": 61, 397 | "type": "ADE_MultivalDynamic", 398 | "pos": [ 399 | 1117, 400 | 854 401 | ], 402 | "size": { 403 | "0": 315, 404 | "1": 58 405 | }, 406 | "flags": {}, 407 | "order": 4, 408 | "mode": 0, 409 | "inputs": [ 410 | { 411 | "name": "mask_optional", 412 | "type": "MASK", 413 | "link": null 414 | } 415 | ], 416 | "outputs": [ 417 | { 418 | "name": "MULTIVAL", 419 | "type": "MULTIVAL", 420 | "links": [ 421 | 137 422 | ], 423 | "shape": 3 424 | } 425 | ], 426 | "properties": { 427 | "Node name for S&R": "ADE_MultivalDynamic" 428 | }, 429 | "widgets_values": [ 430 | 0.9 431 | ] 432 | }, 433 | { 434 | "id": 58, 435 | "type": "ADE_LoadAnimateDiffModel", 436 | "pos": [ 437 | 1137, 438 | 722 439 | ], 440 | "size": { 441 | "0": 315, 442 | "1": 58 443 | }, 444 | "flags": {}, 445 | "order": 5, 446 | "mode": 0, 447 | "inputs": [ 448 | { 449 | "name": "ad_settings", 450 | "type": "MOTION_MODEL_SETTINGS", 451 | "link": null 452 | } 453 | ], 454 | "outputs": [ 455 | { 456 | "name": "MOTION_MODEL", 457 | "type": "MOTION_MODEL_ADE", 458 | "links": [ 459 | 136 460 | ], 461 | "shape": 3, 462 | "slot_index": 0 463 | } 464 | ], 465 | "properties": { 466 | "Node name for S&R": "ADE_LoadAnimateDiffModel" 467 | }, 468 | "widgets_values": [ 469 | "v3_sd15_mm.ckpt" 470 | ] 471 | }, 472 | { 473 | "id": 7, 474 | "type": "CLIPTextEncode", 475 | "pos": [ 476 | 924, 477 | -28 478 | ], 479 | "size": { 480 | "0": 210, 481 | "1": 85.9561767578125 482 | }, 483 | "flags": {}, 484 | "order": 16, 485 | "mode": 0, 486 | "inputs": [ 487 | { 488 | "name": "clip", 489 | "type": "CLIP", 490 | "link": 243 491 | } 492 | ], 493 | "outputs": [ 494 | { 495 | "name": "CONDITIONING", 496 | "type": "CONDITIONING", 497 | "links": [ 498 | 232 499 | ], 500 | "slot_index": 0 501 | } 502 | ], 503 | "properties": { 504 | "Node name for S&R": "CLIPTextEncode" 505 | }, 506 | "widgets_values": [ 507 | "nsfw, naked" 508 | ] 509 | }, 510 | { 511 | "id": 57, 512 | "type": "ADE_StandardUniformContextOptions", 513 | "pos": [ 514 | 1475, 515 | 953 516 | ], 517 | "size": { 518 | "0": 327.6000061035156, 519 | "1": 222 520 | }, 521 | "flags": {}, 522 | "order": 6, 523 | "mode": 0, 524 | "inputs": [ 525 | { 526 | "name": "prev_context", 527 | "type": "CONTEXT_OPTIONS", 528 | "link": null 529 | }, 530 | { 531 | "name": "view_opts", 532 | "type": "VIEW_OPTS", 533 | "link": null 534 | } 535 | ], 536 | "outputs": [ 537 | { 538 | "name": "CONTEXT_OPTS", 539 | "type": "CONTEXT_OPTIONS", 540 | "links": [ 541 | 134 542 | ], 543 | "shape": 3, 544 | "slot_index": 0 545 | } 546 | ], 547 | "properties": { 548 | "Node name for S&R": "ADE_StandardUniformContextOptions" 549 | }, 550 | "widgets_values": [ 551 | 16, 552 | 1, 553 | 4, 554 | "pyramid", 555 | false, 556 | 0, 557 | 1 558 | ] 559 | }, 560 | { 561 | "id": 49, 562 | "type": "CheckpointLoaderSimpleWithNoiseSelect", 563 | "pos": [ 564 | 212, 565 | 253 566 | ], 567 | "size": { 568 | "0": 319.20001220703125, 569 | "1": 170 570 | }, 571 | "flags": {}, 572 | "order": 7, 573 | "mode": 0, 574 | "outputs": [ 575 | { 576 | "name": "MODEL", 577 | "type": "MODEL", 578 | "links": [ 579 | 111 580 | ], 581 | "shape": 3, 582 | "slot_index": 0 583 | }, 584 | { 585 | "name": 
"CLIP", 586 | "type": "CLIP", 587 | "links": [ 588 | 148, 589 | 149, 590 | 243, 591 | 244, 592 | 265 593 | ], 594 | "shape": 3, 595 | "slot_index": 1 596 | }, 597 | { 598 | "name": "VAE", 599 | "type": "VAE", 600 | "links": [ 601 | 245, 602 | 247 603 | ], 604 | "shape": 3, 605 | "slot_index": 2 606 | } 607 | ], 608 | "properties": { 609 | "Node name for S&R": "CheckpointLoaderSimpleWithNoiseSelect" 610 | }, 611 | "widgets_values": [ 612 | "juggernaut_reborn.safetensors", 613 | "sqrt_linear (AnimateDiff)", 614 | false, 615 | 0.18215 616 | ] 617 | }, 618 | { 619 | "id": 12, 620 | "type": "ImageScale", 621 | "pos": [ 622 | 660, 623 | -374 624 | ], 625 | "size": { 626 | "0": 210, 627 | "1": 130 628 | }, 629 | "flags": {}, 630 | "order": 18, 631 | "mode": 0, 632 | "inputs": [ 633 | { 634 | "name": "image", 635 | "type": "IMAGE", 636 | "link": 10 637 | } 638 | ], 639 | "outputs": [ 640 | { 641 | "name": "IMAGE", 642 | "type": "IMAGE", 643 | "links": [ 644 | 19, 645 | 48, 646 | 107, 647 | 274 648 | ], 649 | "shape": 3, 650 | "slot_index": 0 651 | } 652 | ], 653 | "properties": { 654 | "Node name for S&R": "ImageScale" 655 | }, 656 | "widgets_values": [ 657 | "nearest-exact", 658 | 512, 659 | 512, 660 | "disabled" 661 | ] 662 | }, 663 | { 664 | "id": 16, 665 | "type": "VAEEncode", 666 | "pos": [ 667 | 3205, 668 | -567 669 | ], 670 | "size": { 671 | "0": 210, 672 | "1": 46 673 | }, 674 | "flags": {}, 675 | "order": 20, 676 | "mode": 0, 677 | "inputs": [ 678 | { 679 | "name": "pixels", 680 | "type": "IMAGE", 681 | "link": 19 682 | }, 683 | { 684 | "name": "vae", 685 | "type": "VAE", 686 | "link": 245 687 | } 688 | ], 689 | "outputs": [ 690 | { 691 | "name": "LATENT", 692 | "type": "LATENT", 693 | "links": [ 694 | 214 695 | ], 696 | "shape": 3, 697 | "slot_index": 0 698 | } 699 | ], 700 | "properties": { 701 | "Node name for S&R": "VAEEncode" 702 | } 703 | }, 704 | { 705 | "id": 85, 706 | "type": "ACN_AdvancedControlNetApply", 707 | "pos": [ 708 | 1303, 709 | -210 710 | ], 711 | "size": { 712 | "0": 285.6000061035156, 713 | "1": 266 714 | }, 715 | "flags": {}, 716 | "order": 25, 717 | "mode": 0, 718 | "inputs": [ 719 | { 720 | "name": "positive", 721 | "type": "CONDITIONING", 722 | "link": 231 723 | }, 724 | { 725 | "name": "negative", 726 | "type": "CONDITIONING", 727 | "link": 232 728 | }, 729 | { 730 | "name": "control_net", 731 | "type": "CONTROL_NET", 732 | "link": 233 733 | }, 734 | { 735 | "name": "image", 736 | "type": "IMAGE", 737 | "link": 234 738 | }, 739 | { 740 | "name": "mask_optional", 741 | "type": "MASK", 742 | "link": null 743 | }, 744 | { 745 | "name": "timestep_kf", 746 | "type": "TIMESTEP_KEYFRAME", 747 | "link": null 748 | }, 749 | { 750 | "name": "latent_kf_override", 751 | "type": "LATENT_KEYFRAME", 752 | "link": null 753 | }, 754 | { 755 | "name": "weights_override", 756 | "type": "CONTROL_NET_WEIGHTS", 757 | "link": null 758 | }, 759 | { 760 | "name": "model_optional", 761 | "type": "MODEL", 762 | "link": null 763 | } 764 | ], 765 | "outputs": [ 766 | { 767 | "name": "positive", 768 | "type": "CONDITIONING", 769 | "links": [ 770 | 239 771 | ], 772 | "shape": 3, 773 | "slot_index": 0 774 | }, 775 | { 776 | "name": "negative", 777 | "type": "CONDITIONING", 778 | "links": [ 779 | 240 780 | ], 781 | "shape": 3, 782 | "slot_index": 1 783 | }, 784 | { 785 | "name": "model_opt", 786 | "type": "MODEL", 787 | "links": null, 788 | "shape": 3 789 | } 790 | ], 791 | "properties": { 792 | "Node name for S&R": "ACN_AdvancedControlNetApply" 793 | }, 794 | "widgets_values": [ 795 | 
0.35000000000000003, 796 | 0, 797 | 0.65 798 | ] 799 | }, 800 | { 801 | "id": 6, 802 | "type": "CLIPTextEncode", 803 | "pos": [ 804 | 879, 805 | -213 806 | ], 807 | "size": { 808 | "0": 265.12786865234375, 809 | "1": 95.60565948486328 810 | }, 811 | "flags": {}, 812 | "order": 17, 813 | "mode": 0, 814 | "inputs": [ 815 | { 816 | "name": "clip", 817 | "type": "CLIP", 818 | "link": 244 819 | } 820 | ], 821 | "outputs": [ 822 | { 823 | "name": "CONDITIONING", 824 | "type": "CONDITIONING", 825 | "links": [ 826 | 231 827 | ], 828 | "slot_index": 0 829 | } 830 | ], 831 | "properties": { 832 | "Node name for S&R": "CLIPTextEncode" 833 | }, 834 | "widgets_values": [ 835 | "portrait photo, uhd 4k, afternoon, brightly lit, (castle:0.6)" 836 | ] 837 | }, 838 | { 839 | "id": 63, 840 | "type": "CLIPTextEncode", 841 | "pos": [ 842 | 853, 843 | 130 844 | ], 845 | "size": { 846 | "0": 265.12786865234375, 847 | "1": 95.60565948486328 848 | }, 849 | "flags": {}, 850 | "order": 14, 851 | "mode": 0, 852 | "inputs": [ 853 | { 854 | "name": "clip", 855 | "type": "CLIP", 856 | "link": 148 857 | } 858 | ], 859 | "outputs": [ 860 | { 861 | "name": "CONDITIONING", 862 | "type": "CONDITIONING", 863 | "links": [], 864 | "slot_index": 0 865 | } 866 | ], 867 | "properties": { 868 | "Node name for S&R": "CLIPTextEncode" 869 | }, 870 | "widgets_values": [ 871 | "portrait photo, uhd 4k, afternoon, brightly lit, (castle:0.6)" 872 | ] 873 | }, 874 | { 875 | "id": 56, 876 | "type": "ADE_UseEvolvedSampling", 877 | "pos": [ 878 | 1907, 879 | 644 880 | ], 881 | "size": { 882 | "0": 301.4368896484375, 883 | "1": 118 884 | }, 885 | "flags": {}, 886 | "order": 24, 887 | "mode": 0, 888 | "inputs": [ 889 | { 890 | "name": "model", 891 | "type": "MODEL", 892 | "link": 138, 893 | "slot_index": 0 894 | }, 895 | { 896 | "name": "m_models", 897 | "type": "M_MODELS", 898 | "link": 133, 899 | "slot_index": 1 900 | }, 901 | { 902 | "name": "context_options", 903 | "type": "CONTEXT_OPTIONS", 904 | "link": 134, 905 | "slot_index": 2 906 | }, 907 | { 908 | "name": "sample_settings", 909 | "type": "SAMPLE_SETTINGS", 910 | "link": 135, 911 | "slot_index": 3 912 | } 913 | ], 914 | "outputs": [ 915 | { 916 | "name": "MODEL", 917 | "type": "MODEL", 918 | "links": [ 919 | 272 920 | ], 921 | "shape": 3, 922 | "slot_index": 0 923 | } 924 | ], 925 | "properties": { 926 | "Node name for S&R": "ADE_UseEvolvedSampling" 927 | }, 928 | "widgets_values": [ 929 | "sqrt_linear (AnimateDiff)" 930 | ] 931 | }, 932 | { 933 | "id": 90, 934 | "type": "ApplyScaleUModelNode", 935 | "pos": [ 936 | 2326, 937 | 628 938 | ], 939 | "size": { 940 | "0": 260.3999938964844, 941 | "1": 46 942 | }, 943 | "flags": {}, 944 | "order": 27, 945 | "mode": 0, 946 | "inputs": [ 947 | { 948 | "name": "model", 949 | "type": "MODEL", 950 | "link": 272 951 | }, 952 | { 953 | "name": "scaleu", 954 | "type": "SCALEU", 955 | "link": 271 956 | } 957 | ], 958 | "outputs": [ 959 | { 960 | "name": "MODEL", 961 | "type": "MODEL", 962 | "links": [ 963 | 273 964 | ], 965 | "shape": 3, 966 | "slot_index": 0 967 | } 968 | ], 969 | "properties": { 970 | "Node name for S&R": "ApplyScaleUModelNode" 971 | } 972 | }, 973 | { 974 | "id": 10, 975 | "type": "VHS_LoadVideo", 976 | "pos": [ 977 | 397, 978 | 618 979 | ], 980 | "size": [ 981 | 235.1999969482422, 982 | 471.1999969482422 983 | ], 984 | "flags": {}, 985 | "order": 8, 986 | "mode": 0, 987 | "inputs": [ 988 | { 989 | "name": "batch_manager", 990 | "type": "VHS_BatchManager", 991 | "link": null 992 | } 993 | ], 994 | "outputs": [ 995 | { 996 | "name": 
"IMAGE", 997 | "type": "IMAGE", 998 | "links": [ 999 | 10 1000 | ], 1001 | "shape": 3, 1002 | "slot_index": 0 1003 | }, 1004 | { 1005 | "name": "frame_count", 1006 | "type": "INT", 1007 | "links": [], 1008 | "shape": 3, 1009 | "slot_index": 1 1010 | }, 1011 | { 1012 | "name": "audio", 1013 | "type": "VHS_AUDIO", 1014 | "links": null, 1015 | "shape": 3 1016 | } 1017 | ], 1018 | "properties": { 1019 | "Node name for S&R": "VHS_LoadVideo" 1020 | }, 1021 | "widgets_values": { 1022 | "video": "fourpeople.mp4", 1023 | "force_rate": 0, 1024 | "force_size": "Disabled", 1025 | "custom_width": 512, 1026 | "custom_height": 512, 1027 | "frame_load_cap": 20, 1028 | "skip_first_frames": 0, 1029 | "select_every_nth": 4, 1030 | "choose video to upload": "image", 1031 | "videopreview": { 1032 | "hidden": false, 1033 | "paused": false, 1034 | "params": { 1035 | "frame_load_cap": 20, 1036 | "skip_first_frames": 0, 1037 | "force_rate": 0, 1038 | "select_every_nth": 4, 1039 | "filename": "fourpeople.mp4", 1040 | "type": "input", 1041 | "format": "video/mp4" 1042 | } 1043 | } 1044 | } 1045 | }, 1046 | { 1047 | "id": 78, 1048 | "type": "VHS_VideoCombine", 1049 | "pos": [ 1050 | 2614, 1051 | 297 1052 | ], 1053 | "size": [ 1054 | 315, 1055 | 599 1056 | ], 1057 | "flags": {}, 1058 | "order": 29, 1059 | "mode": 0, 1060 | "inputs": [ 1061 | { 1062 | "name": "images", 1063 | "type": "IMAGE", 1064 | "link": 260 1065 | }, 1066 | { 1067 | "name": "audio", 1068 | "type": "VHS_AUDIO", 1069 | "link": null 1070 | }, 1071 | { 1072 | "name": "batch_manager", 1073 | "type": "VHS_BatchManager", 1074 | "link": null 1075 | } 1076 | ], 1077 | "outputs": [ 1078 | { 1079 | "name": "Filenames", 1080 | "type": "VHS_FILENAMES", 1081 | "links": null, 1082 | "shape": 3 1083 | } 1084 | ], 1085 | "properties": { 1086 | "Node name for S&R": "VHS_VideoCombine" 1087 | }, 1088 | "widgets_values": { 1089 | "frame_rate": 8, 1090 | "loop_count": 0, 1091 | "filename_prefix": "AnimateDiff", 1092 | "format": "video/h264-mp4", 1093 | "pix_fmt": "yuv420p", 1094 | "crf": 19, 1095 | "save_metadata": true, 1096 | "pingpong": false, 1097 | "save_output": true, 1098 | "videopreview": { 1099 | "hidden": false, 1100 | "paused": false, 1101 | "params": { 1102 | "filename": "AnimateDiff_00802.mp4", 1103 | "subfolder": "", 1104 | "type": "output", 1105 | "format": "video/h264-mp4" 1106 | } 1107 | } 1108 | } 1109 | }, 1110 | { 1111 | "id": 8, 1112 | "type": "VAEDecode", 1113 | "pos": [ 1114 | 3455, 1115 | -179 1116 | ], 1117 | "size": { 1118 | "0": 140, 1119 | "1": 46 1120 | }, 1121 | "flags": {}, 1122 | "order": 32, 1123 | "mode": 0, 1124 | "inputs": [ 1125 | { 1126 | "name": "samples", 1127 | "type": "LATENT", 1128 | "link": 215 1129 | }, 1130 | { 1131 | "name": "vae", 1132 | "type": "VAE", 1133 | "link": 247 1134 | } 1135 | ], 1136 | "outputs": [ 1137 | { 1138 | "name": "IMAGE", 1139 | "type": "IMAGE", 1140 | "links": [ 1141 | 22 1142 | ], 1143 | "slot_index": 0 1144 | } 1145 | ], 1146 | "properties": { 1147 | "Node name for S&R": "VAEDecode" 1148 | } 1149 | }, 1150 | { 1151 | "id": 18, 1152 | "type": "VHS_VideoCombine", 1153 | "pos": [ 1154 | 3667, 1155 | -350 1156 | ], 1157 | "size": [ 1158 | 315, 1159 | 599 1160 | ], 1161 | "flags": {}, 1162 | "order": 33, 1163 | "mode": 0, 1164 | "inputs": [ 1165 | { 1166 | "name": "images", 1167 | "type": "IMAGE", 1168 | "link": 22 1169 | }, 1170 | { 1171 | "name": "audio", 1172 | "type": "VHS_AUDIO", 1173 | "link": null 1174 | }, 1175 | { 1176 | "name": "batch_manager", 1177 | "type": "VHS_BatchManager", 1178 | "link": 
null 1179 | } 1180 | ], 1181 | "outputs": [ 1182 | { 1183 | "name": "Filenames", 1184 | "type": "VHS_FILENAMES", 1185 | "links": null, 1186 | "shape": 3 1187 | } 1188 | ], 1189 | "properties": { 1190 | "Node name for S&R": "VHS_VideoCombine" 1191 | }, 1192 | "widgets_values": { 1193 | "frame_rate": 8, 1194 | "loop_count": 0, 1195 | "filename_prefix": "AnimateDiff", 1196 | "format": "video/h264-mp4", 1197 | "pix_fmt": "yuv420p", 1198 | "crf": 19, 1199 | "save_metadata": true, 1200 | "pingpong": false, 1201 | "save_output": true, 1202 | "videopreview": { 1203 | "hidden": false, 1204 | "paused": false, 1205 | "params": { 1206 | "filename": "AnimateDiff_00804.mp4", 1207 | "subfolder": "", 1208 | "type": "output", 1209 | "format": "video/h264-mp4" 1210 | } 1211 | } 1212 | } 1213 | }, 1214 | { 1215 | "id": 91, 1216 | "type": "YOLOTrackerNode", 1217 | "pos": [ 1218 | 2505, 1219 | -626 1220 | ], 1221 | "size": { 1222 | "0": 315, 1223 | "1": 78 1224 | }, 1225 | "flags": {}, 1226 | "order": 23, 1227 | "mode": 0, 1228 | "inputs": [ 1229 | { 1230 | "name": "images", 1231 | "type": "IMAGE", 1232 | "link": 274 1233 | } 1234 | ], 1235 | "outputs": [ 1236 | { 1237 | "name": "IMAGE", 1238 | "type": "IMAGE", 1239 | "links": [ 1240 | 276 1241 | ], 1242 | "shape": 3, 1243 | "slot_index": 0 1244 | }, 1245 | { 1246 | "name": "TRACKING", 1247 | "type": "TRACKING", 1248 | "links": [ 1249 | 275 1250 | ], 1251 | "shape": 3, 1252 | "slot_index": 1 1253 | } 1254 | ], 1255 | "properties": { 1256 | "Node name for S&R": "YOLOTrackerNode" 1257 | }, 1258 | "widgets_values": [ 1259 | "yolov8m.pt" 1260 | ] 1261 | }, 1262 | { 1263 | "id": 79, 1264 | "type": "ImageScale", 1265 | "pos": [ 1266 | 2293, 1267 | 784 1268 | ], 1269 | "size": { 1270 | "0": 210, 1271 | "1": 130 1272 | }, 1273 | "flags": {}, 1274 | "order": 26, 1275 | "mode": 0, 1276 | "inputs": [ 1277 | { 1278 | "name": "image", 1279 | "type": "IMAGE", 1280 | "link": 276 1281 | } 1282 | ], 1283 | "outputs": [ 1284 | { 1285 | "name": "IMAGE", 1286 | "type": "IMAGE", 1287 | "links": [ 1288 | 260 1289 | ], 1290 | "shape": 3, 1291 | "slot_index": 0 1292 | } 1293 | ], 1294 | "properties": { 1295 | "Node name for S&R": "ImageScale" 1296 | }, 1297 | "widgets_values": [ 1298 | "nearest-exact", 1299 | 1024, 1300 | 1024, 1301 | "disabled" 1302 | ] 1303 | }, 1304 | { 1305 | "id": 83, 1306 | "type": "ACN_AdvancedControlNetApply", 1307 | "pos": [ 1308 | 1675, 1309 | -210 1310 | ], 1311 | "size": { 1312 | "0": 285.6000061035156, 1313 | "1": 266 1314 | }, 1315 | "flags": {}, 1316 | "order": 28, 1317 | "mode": 0, 1318 | "inputs": [ 1319 | { 1320 | "name": "positive", 1321 | "type": "CONDITIONING", 1322 | "link": 239 1323 | }, 1324 | { 1325 | "name": "negative", 1326 | "type": "CONDITIONING", 1327 | "link": 240 1328 | }, 1329 | { 1330 | "name": "control_net", 1331 | "type": "CONTROL_NET", 1332 | "link": 225 1333 | }, 1334 | { 1335 | "name": "image", 1336 | "type": "IMAGE", 1337 | "link": 226 1338 | }, 1339 | { 1340 | "name": "mask_optional", 1341 | "type": "MASK", 1342 | "link": null 1343 | }, 1344 | { 1345 | "name": "timestep_kf", 1346 | "type": "TIMESTEP_KEYFRAME", 1347 | "link": null 1348 | }, 1349 | { 1350 | "name": "latent_kf_override", 1351 | "type": "LATENT_KEYFRAME", 1352 | "link": null 1353 | }, 1354 | { 1355 | "name": "weights_override", 1356 | "type": "CONTROL_NET_WEIGHTS", 1357 | "link": null 1358 | }, 1359 | { 1360 | "name": "model_optional", 1361 | "type": "MODEL", 1362 | "link": null 1363 | } 1364 | ], 1365 | "outputs": [ 1366 | { 1367 | "name": "positive", 1368 | 
"type": "CONDITIONING", 1369 | "links": [ 1370 | 278 1371 | ], 1372 | "shape": 3, 1373 | "slot_index": 0 1374 | }, 1375 | { 1376 | "name": "negative", 1377 | "type": "CONDITIONING", 1378 | "links": [ 1379 | 279 1380 | ], 1381 | "shape": 3, 1382 | "slot_index": 1 1383 | }, 1384 | { 1385 | "name": "model_opt", 1386 | "type": "MODEL", 1387 | "links": null, 1388 | "shape": 3 1389 | } 1390 | ], 1391 | "properties": { 1392 | "Node name for S&R": "ACN_AdvancedControlNetApply" 1393 | }, 1394 | "widgets_values": [ 1395 | 0.15, 1396 | 0, 1397 | 0.25 1398 | ] 1399 | }, 1400 | { 1401 | "id": 86, 1402 | "type": "InstanceDiffusionTrackingPrompt", 1403 | "pos": [ 1404 | 2451, 1405 | -213 1406 | ], 1407 | "size": [ 1408 | 514.9908203124996, 1409 | 347.78492431640575 1410 | ], 1411 | "flags": {}, 1412 | "order": 30, 1413 | "mode": 0, 1414 | "inputs": [ 1415 | { 1416 | "name": "positive", 1417 | "type": "CONDITIONING", 1418 | "link": 278 1419 | }, 1420 | { 1421 | "name": "negative", 1422 | "type": "CONDITIONING", 1423 | "link": 279 1424 | }, 1425 | { 1426 | "name": "clip", 1427 | "type": "CLIP", 1428 | "link": 265 1429 | }, 1430 | { 1431 | "name": "tracking", 1432 | "type": "TRACKING", 1433 | "link": 275 1434 | }, 1435 | { 1436 | "name": "positionnet", 1437 | "type": "POSITIONNET", 1438 | "link": 267 1439 | }, 1440 | { 1441 | "name": "fusers", 1442 | "type": "FUSERS", 1443 | "link": 268 1444 | } 1445 | ], 1446 | "outputs": [ 1447 | { 1448 | "name": "positive", 1449 | "type": "CONDITIONING", 1450 | "links": [ 1451 | 269 1452 | ], 1453 | "shape": 3, 1454 | "slot_index": 0 1455 | }, 1456 | { 1457 | "name": "negative", 1458 | "type": "CONDITIONING", 1459 | "links": [ 1460 | 270 1461 | ], 1462 | "shape": 3, 1463 | "slot_index": 1 1464 | } 1465 | ], 1466 | "properties": { 1467 | "Node name for S&R": "InstanceDiffusionTrackingPrompt" 1468 | }, 1469 | "widgets_values": [ 1470 | 5, 1471 | "\"1.person\": \"(((The Mad Hatter, purple suit, purple top hot, red hair)))\",\n\"2.person\": \"(((a white rabbit wearing a suit, white bunny ears)))\",\n\"3.person\": \"((((Alice in Wonderland, blue dress, white apron, blonde))))\",\n\"4.person\": \"((((the Queen of Hearts, red and black dress, crown))))\",", 1472 | "" 1473 | ] 1474 | }, 1475 | { 1476 | "id": 47, 1477 | "type": "LoraLoaderModelOnly", 1478 | "pos": [ 1479 | 1092, 1480 | 493 1481 | ], 1482 | "size": { 1483 | "0": 315, 1484 | "1": 82 1485 | }, 1486 | "flags": {}, 1487 | "order": 13, 1488 | "mode": 0, 1489 | "inputs": [ 1490 | { 1491 | "name": "model", 1492 | "type": "MODEL", 1493 | "link": 111 1494 | } 1495 | ], 1496 | "outputs": [ 1497 | { 1498 | "name": "MODEL", 1499 | "type": "MODEL", 1500 | "links": [ 1501 | 112 1502 | ], 1503 | "shape": 3, 1504 | "slot_index": 0 1505 | } 1506 | ], 1507 | "properties": { 1508 | "Node name for S&R": "LoraLoaderModelOnly" 1509 | }, 1510 | "widgets_values": [ 1511 | "lcm/SD1.5/pytorch_lora_weights.safetensors", 1512 | 1 1513 | ] 1514 | }, 1515 | { 1516 | "id": 81, 1517 | "type": "KSampler", 1518 | "pos": [ 1519 | 3128, 1520 | -243 1521 | ], 1522 | "size": [ 1523 | 247.09541992187496, 1524 | 262 1525 | ], 1526 | "flags": {}, 1527 | "order": 31, 1528 | "mode": 0, 1529 | "inputs": [ 1530 | { 1531 | "name": "model", 1532 | "type": "MODEL", 1533 | "link": 273 1534 | }, 1535 | { 1536 | "name": "positive", 1537 | "type": "CONDITIONING", 1538 | "link": 269 1539 | }, 1540 | { 1541 | "name": "negative", 1542 | "type": "CONDITIONING", 1543 | "link": 270 1544 | }, 1545 | { 1546 | "name": "latent_image", 1547 | "type": "LATENT", 1548 | "link": 
214 1549 | } 1550 | ], 1551 | "outputs": [ 1552 | { 1553 | "name": "LATENT", 1554 | "type": "LATENT", 1555 | "links": [ 1556 | 215 1557 | ], 1558 | "shape": 3, 1559 | "slot_index": 0 1560 | } 1561 | ], 1562 | "properties": { 1563 | "Node name for S&R": "KSampler" 1564 | }, 1565 | "widgets_values": [ 1566 | 677130511272592, 1567 | "fixed", 1568 | 7, 1569 | 2, 1570 | "lcm", 1571 | "karras", 1572 | 1 1573 | ] 1574 | }, 1575 | { 1576 | "id": 89, 1577 | "type": "LoadInstanceScaleUNode", 1578 | "pos": [ 1579 | 2019, 1580 | 458 1581 | ], 1582 | "size": { 1583 | "0": 315, 1584 | "1": 58 1585 | }, 1586 | "flags": {}, 1587 | "order": 9, 1588 | "mode": 0, 1589 | "outputs": [ 1590 | { 1591 | "name": "SCALEU", 1592 | "type": "SCALEU", 1593 | "links": [ 1594 | 271 1595 | ], 1596 | "shape": 3, 1597 | "slot_index": 0 1598 | } 1599 | ], 1600 | "properties": { 1601 | "Node name for S&R": "LoadInstanceScaleUNode" 1602 | }, 1603 | "widgets_values": [ 1604 | "scaleu.ckpt" 1605 | ] 1606 | }, 1607 | { 1608 | "id": 88, 1609 | "type": "LoadInstanceFusersNode", 1610 | "pos": [ 1611 | 2034, 1612 | 339 1613 | ], 1614 | "size": { 1615 | "0": 315, 1616 | "1": 58 1617 | }, 1618 | "flags": {}, 1619 | "order": 10, 1620 | "mode": 0, 1621 | "outputs": [ 1622 | { 1623 | "name": "FUSERS", 1624 | "type": "FUSERS", 1625 | "links": [ 1626 | 268 1627 | ], 1628 | "shape": 3, 1629 | "slot_index": 0 1630 | } 1631 | ], 1632 | "properties": { 1633 | "Node name for S&R": "LoadInstanceFusersNode" 1634 | }, 1635 | "widgets_values": [ 1636 | "fusers.ckpt" 1637 | ] 1638 | }, 1639 | { 1640 | "id": 87, 1641 | "type": "LoadInstancePositionNetModel", 1642 | "pos": [ 1643 | 2024, 1644 | 221 1645 | ], 1646 | "size": { 1647 | "0": 315, 1648 | "1": 58 1649 | }, 1650 | "flags": {}, 1651 | "order": 11, 1652 | "mode": 0, 1653 | "outputs": [ 1654 | { 1655 | "name": "POSITIONNET", 1656 | "type": "POSITIONNET", 1657 | "links": [ 1658 | 267 1659 | ], 1660 | "shape": 3, 1661 | "slot_index": 0 1662 | } 1663 | ], 1664 | "properties": { 1665 | "Node name for S&R": "LoadInstancePositionNetModel" 1666 | }, 1667 | "widgets_values": [ 1668 | "position_net.ckpt" 1669 | ] 1670 | } 1671 | ], 1672 | "links": [ 1673 | [ 1674 | 10, 1675 | 10, 1676 | 0, 1677 | 12, 1678 | 0, 1679 | "IMAGE" 1680 | ], 1681 | [ 1682 | 19, 1683 | 12, 1684 | 0, 1685 | 16, 1686 | 0, 1687 | "IMAGE" 1688 | ], 1689 | [ 1690 | 22, 1691 | 8, 1692 | 0, 1693 | 18, 1694 | 0, 1695 | "IMAGE" 1696 | ], 1697 | [ 1698 | 48, 1699 | 12, 1700 | 0, 1701 | 27, 1702 | 0, 1703 | "IMAGE" 1704 | ], 1705 | [ 1706 | 107, 1707 | 12, 1708 | 0, 1709 | 46, 1710 | 0, 1711 | "IMAGE" 1712 | ], 1713 | [ 1714 | 111, 1715 | 49, 1716 | 0, 1717 | 47, 1718 | 0, 1719 | "MODEL" 1720 | ], 1721 | [ 1722 | 112, 1723 | 47, 1724 | 0, 1725 | 48, 1726 | 0, 1727 | "MODEL" 1728 | ], 1729 | [ 1730 | 133, 1731 | 60, 1732 | 0, 1733 | 56, 1734 | 1, 1735 | "M_MODELS" 1736 | ], 1737 | [ 1738 | 134, 1739 | 57, 1740 | 0, 1741 | 56, 1742 | 2, 1743 | "CONTEXT_OPTIONS" 1744 | ], 1745 | [ 1746 | 135, 1747 | 59, 1748 | 0, 1749 | 56, 1750 | 3, 1751 | "SAMPLE_SETTINGS" 1752 | ], 1753 | [ 1754 | 136, 1755 | 58, 1756 | 0, 1757 | 60, 1758 | 0, 1759 | "MOTION_MODEL_ADE" 1760 | ], 1761 | [ 1762 | 137, 1763 | 61, 1764 | 0, 1765 | 60, 1766 | 3, 1767 | "MULTIVAL" 1768 | ], 1769 | [ 1770 | 138, 1771 | 48, 1772 | 0, 1773 | 56, 1774 | 0, 1775 | "MODEL" 1776 | ], 1777 | [ 1778 | 148, 1779 | 49, 1780 | 1, 1781 | 63, 1782 | 0, 1783 | "CLIP" 1784 | ], 1785 | [ 1786 | 149, 1787 | 49, 1788 | 1, 1789 | 64, 1790 | 0, 1791 | "CLIP" 1792 | ], 1793 | [ 1794 | 214, 1795 | 16, 
1796 | 0, 1797 | 81, 1798 | 3, 1799 | "LATENT" 1800 | ], 1801 | [ 1802 | 215, 1803 | 81, 1804 | 0, 1805 | 8, 1806 | 0, 1807 | "LATENT" 1808 | ], 1809 | [ 1810 | 225, 1811 | 45, 1812 | 0, 1813 | 83, 1814 | 2, 1815 | "CONTROL_NET" 1816 | ], 1817 | [ 1818 | 226, 1819 | 46, 1820 | 0, 1821 | 83, 1822 | 3, 1823 | "IMAGE" 1824 | ], 1825 | [ 1826 | 231, 1827 | 6, 1828 | 0, 1829 | 85, 1830 | 0, 1831 | "CONDITIONING" 1832 | ], 1833 | [ 1834 | 232, 1835 | 7, 1836 | 0, 1837 | 85, 1838 | 1, 1839 | "CONDITIONING" 1840 | ], 1841 | [ 1842 | 233, 1843 | 26, 1844 | 0, 1845 | 85, 1846 | 2, 1847 | "CONTROL_NET" 1848 | ], 1849 | [ 1850 | 234, 1851 | 27, 1852 | 0, 1853 | 85, 1854 | 3, 1855 | "IMAGE" 1856 | ], 1857 | [ 1858 | 239, 1859 | 85, 1860 | 0, 1861 | 83, 1862 | 0, 1863 | "CONDITIONING" 1864 | ], 1865 | [ 1866 | 240, 1867 | 85, 1868 | 1, 1869 | 83, 1870 | 1, 1871 | "CONDITIONING" 1872 | ], 1873 | [ 1874 | 243, 1875 | 49, 1876 | 1, 1877 | 7, 1878 | 0, 1879 | "CLIP" 1880 | ], 1881 | [ 1882 | 244, 1883 | 49, 1884 | 1, 1885 | 6, 1886 | 0, 1887 | "CLIP" 1888 | ], 1889 | [ 1890 | 245, 1891 | 49, 1892 | 2, 1893 | 16, 1894 | 1, 1895 | "VAE" 1896 | ], 1897 | [ 1898 | 247, 1899 | 49, 1900 | 2, 1901 | 8, 1902 | 1, 1903 | "VAE" 1904 | ], 1905 | [ 1906 | 260, 1907 | 79, 1908 | 0, 1909 | 78, 1910 | 0, 1911 | "IMAGE" 1912 | ], 1913 | [ 1914 | 265, 1915 | 49, 1916 | 1, 1917 | 86, 1918 | 2, 1919 | "CLIP" 1920 | ], 1921 | [ 1922 | 267, 1923 | 87, 1924 | 0, 1925 | 86, 1926 | 4, 1927 | "POSITIONNET" 1928 | ], 1929 | [ 1930 | 268, 1931 | 88, 1932 | 0, 1933 | 86, 1934 | 5, 1935 | "FUSERS" 1936 | ], 1937 | [ 1938 | 269, 1939 | 86, 1940 | 0, 1941 | 81, 1942 | 1, 1943 | "CONDITIONING" 1944 | ], 1945 | [ 1946 | 270, 1947 | 86, 1948 | 1, 1949 | 81, 1950 | 2, 1951 | "CONDITIONING" 1952 | ], 1953 | [ 1954 | 271, 1955 | 89, 1956 | 0, 1957 | 90, 1958 | 1, 1959 | "SCALEU" 1960 | ], 1961 | [ 1962 | 272, 1963 | 56, 1964 | 0, 1965 | 90, 1966 | 0, 1967 | "MODEL" 1968 | ], 1969 | [ 1970 | 273, 1971 | 90, 1972 | 0, 1973 | 81, 1974 | 0, 1975 | "MODEL" 1976 | ], 1977 | [ 1978 | 274, 1979 | 12, 1980 | 0, 1981 | 91, 1982 | 0, 1983 | "IMAGE" 1984 | ], 1985 | [ 1986 | 275, 1987 | 91, 1988 | 1, 1989 | 86, 1990 | 3, 1991 | "TRACKING" 1992 | ], 1993 | [ 1994 | 276, 1995 | 91, 1996 | 0, 1997 | 79, 1998 | 0, 1999 | "IMAGE" 2000 | ], 2001 | [ 2002 | 278, 2003 | 83, 2004 | 0, 2005 | 86, 2006 | 0, 2007 | "CONDITIONING" 2008 | ], 2009 | [ 2010 | 279, 2011 | 83, 2012 | 1, 2013 | 86, 2014 | 1, 2015 | "CONDITIONING" 2016 | ] 2017 | ], 2018 | "groups": [], 2019 | "config": {}, 2020 | "extra": {}, 2021 | "version": 0.4 2022 | } -------------------------------------------------------------------------------- /model_helpers/prepare_fusers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | #from ..modules.attention import GatedSelfAttentionDense 4 | from comfy.gligen import GatedSelfAttentionDense as GSAD 5 | 6 | class GatedSelfAttentionDense(GSAD): 7 | 8 | def forward(self, x, instance_options={}): 9 | objs = instance_options['objs'] 10 | x = super().forward(x, objs) 11 | return x.to(torch.float16) 12 | 13 | def prepare_fusers(fusers_ckpt, fusers_scale) -> list[torch.nn.Module]: 14 | fusers_list = [] 15 | for key in fusers_ckpt['input_blocks']: 16 | fusers_ckpt['input_blocks'][key]['params']['query_dim'] = fusers_ckpt['input_blocks'][key]['params']['n_heads'] * \ 17 | fusers_ckpt['input_blocks'][key]['params']['d_head'] 18 | fuser = GatedSelfAttentionDense( 19 | 
**fusers_ckpt['input_blocks'][key]['params']) 20 | fuser.load_state_dict(fusers_ckpt['input_blocks'][key]['state']) 21 | fuser.scale = fusers_scale 22 | fusers_list.append(fuser) 23 | 24 | fusers_ckpt['middle_block']['1']['params']['query_dim'] = fusers_ckpt['middle_block']['1']['params']['n_heads'] * \ 25 | fusers_ckpt['middle_block']['1']['params']['d_head'] 26 | fuser = GatedSelfAttentionDense( 27 | **fusers_ckpt['middle_block']['1']['params']) 28 | fuser.load_state_dict(fusers_ckpt['middle_block']['1']['state']) 29 | fuser.scale = fusers_scale 30 | fusers_list.append(fuser) 31 | 32 | for key in fusers_ckpt['output_blocks']: 33 | fusers_ckpt['output_blocks'][key]['params']['query_dim'] = fusers_ckpt['output_blocks'][key]['params']['n_heads'] * \ 34 | fusers_ckpt['output_blocks'][key]['params']['d_head'] 35 | fuser = GatedSelfAttentionDense( 36 | **fusers_ckpt['output_blocks'][key]['params']) 37 | fuser.load_state_dict(fusers_ckpt['output_blocks'][key]['state']) 38 | fuser.scale = fusers_scale 39 | fusers_list.append(fuser) 40 | 41 | return fusers_list 42 | -------------------------------------------------------------------------------- /model_helpers/prepare_positionnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from ..modules.text_grounding_net import UniFusion 4 | 5 | 6 | def get_positionnet_default_params(): 7 | return { 8 | "in_dim": 768, 9 | "mid_dim": 3072, 10 | "out_dim": 768, 11 | "test_drop_boxes": False, 12 | "test_drop_masks": True, 13 | "test_drop_points": False, 14 | "test_drop_scribbles": True, 15 | "train_add_boxes": True, 16 | "train_add_masks": True, 17 | "train_add_points": True, 18 | "train_add_scribbles": True, 19 | "use_seperate_tokenizer": True, 20 | } 21 | 22 | 23 | def prepare_positionnet(checkpoint, params) -> torch.nn.Module: 24 | model = UniFusion(**params) 25 | model.load_state_dict(checkpoint, strict=False) 26 | return model 27 | -------------------------------------------------------------------------------- /model_helpers/prepare_scaleu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from ..modules.scaleu import ScaleU 4 | 5 | 6 | def get_scaleu_patch(scaleu_nets): 7 | def scaleu_patch(h, hsp, transformer_options): 8 | _, idx = transformer_options['block'] 9 | sk = scaleu_nets[idx](h, hsp) 10 | return sk 11 | 12 | return scaleu_patch 13 | 14 | 15 | def prepare_scaleu_nets(scaleu_ckpt) -> torch.nn.Module: 16 | scaleu_nets = [] 17 | for i in range(12): 18 | ckpt = scaleu_ckpt[f'{i}'] 19 | scaleu = ScaleU(True, len(ckpt['scaleu_b']), len(ckpt['scaleu_s'])) 20 | scaleu.load_state_dict(ckpt) 21 | scaleu_nets.append(scaleu) 22 | return scaleu_nets 23 | -------------------------------------------------------------------------------- /modules/convnext.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | 3 | # All rights reserved. 4 | 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the original repo. 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from timm.models.layers import trunc_normal_, DropPath 13 | from timm.models.registry import register_model 14 | 15 | import comfy.ops 16 | ops = comfy.ops.manual_cast 17 | 18 | class Block(nn.Module): 19 | r""" ConvNeXt Block. 
There are two equivalent implementations: 20 | (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) 21 | (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back 22 | We use (2) as we find it slightly faster in PyTorch 23 | 24 | Args: 25 | dim (int): Number of input channels. 26 | drop_path (float): Stochastic depth rate. Default: 0.0 27 | layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 28 | """ 29 | 30 | def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): 31 | super().__init__() 32 | self.dwconv = ops.Conv2d(dim, dim, kernel_size=7, 33 | padding=3, groups=dim) # depthwise conv 34 | self.norm = LayerNorm(dim, eps=1e-6) 35 | # pointwise/1x1 convs, implemented with linear layers 36 | self.pwconv1 = ops.Linear(dim, 4 * dim) 37 | self.act = nn.GELU() 38 | self.pwconv2 = ops.Linear(4 * dim, dim) 39 | self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), 40 | requires_grad=True) if layer_scale_init_value > 0 else None 41 | self.drop_path = DropPath( 42 | drop_path) if drop_path > 0. else nn.Identity() 43 | 44 | def forward(self, x): 45 | input = x 46 | x = self.dwconv(x) 47 | x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) 48 | x = self.norm(x) 49 | x = self.pwconv1(x) 50 | x = self.act(x) 51 | x = self.pwconv2(x) 52 | if self.gamma is not None: 53 | x = self.gamma * x 54 | x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) 55 | 56 | x = input + self.drop_path(x) 57 | return x 58 | 59 | 60 | class ConvNeXt(nn.Module): 61 | r""" ConvNeXt 62 | A PyTorch impl of : `A ConvNet for the 2020s` - 63 | https://arxiv.org/pdf/2201.03545.pdf 64 | 65 | Args: 66 | in_chans (int): Number of input image channels. Default: 3 67 | num_classes (int): Number of classes for classification head. Default: 1000 68 | depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] 69 | dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] 70 | drop_path_rate (float): Stochastic depth rate. Default: 0. 71 | layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 72 | head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. 
73 | """ 74 | 75 | def __init__(self, in_chans=3, num_classes=1000, 76 | depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], drop_path_rate=0., 77 | layer_scale_init_value=1e-6, head_init_scale=1., 78 | ): 79 | super().__init__() 80 | 81 | # stem and 3 intermediate downsampling conv layers 82 | self.downsample_layers = nn.ModuleList() 83 | stem = nn.Sequential( 84 | ops.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), 85 | LayerNorm(dims[0], eps=1e-6, data_format="channels_first") 86 | ) 87 | self.downsample_layers.append(stem) 88 | for i in range(3): 89 | downsample_layer = nn.Sequential( 90 | LayerNorm(dims[i], eps=1e-6, data_format="channels_first"), 91 | ops.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2), 92 | ) 93 | self.downsample_layers.append(downsample_layer) 94 | 95 | # 4 feature resolution stages, each consisting of multiple residual blocks 96 | self.stages = nn.ModuleList() 97 | dp_rates = [x.item() 98 | for x in torch.linspace(0, drop_path_rate, sum(depths))] 99 | cur = 0 100 | for i in range(4): 101 | stage = nn.Sequential( 102 | *[Block(dim=dims[i], drop_path=dp_rates[cur + j], 103 | layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])] 104 | ) 105 | self.stages.append(stage) 106 | cur += depths[i] 107 | 108 | def _init_weights(self, m): 109 | if isinstance(m, (ops.Conv2d, ops.Linear)): 110 | trunc_normal_(m.weight, std=.02) 111 | nn.init.constant_(m.bias, 0) 112 | 113 | def forward_features(self, x): 114 | for i in range(4): 115 | x = self.downsample_layers[i](x) 116 | x = self.stages[i](x) 117 | return x 118 | 119 | def forward(self, x): 120 | x = self.forward_features(x) 121 | # x = self.head(x) 122 | return x 123 | 124 | 125 | class LayerNorm(nn.Module): 126 | r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. 127 | The ordering of the dimensions in the inputs. channels_last corresponds to inputs with 128 | shape (batch_size, height, width, channels) while channels_first corresponds to inputs 129 | with shape (batch_size, channels, height, width). 
130 | """ 131 | 132 | def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): 133 | super().__init__() 134 | self.weight = nn.Parameter(torch.ones(normalized_shape)) 135 | self.bias = nn.Parameter(torch.zeros(normalized_shape)) 136 | self.eps = eps 137 | self.data_format = data_format 138 | if self.data_format not in ["channels_last", "channels_first"]: 139 | raise NotImplementedError 140 | self.normalized_shape = (normalized_shape, ) 141 | 142 | def forward(self, x): 143 | if self.data_format == "channels_last": 144 | return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) 145 | elif self.data_format == "channels_first": 146 | u = x.mean(1, keepdim=True) 147 | s = (x - u).pow(2).mean(1, keepdim=True) 148 | x = (x - u) / torch.sqrt(s + self.eps) 149 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 150 | return x 151 | 152 | 153 | model_urls = { 154 | "convnext_tiny_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth", 155 | "convnext_small_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth", 156 | "convnext_base_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth", 157 | "convnext_large_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth", 158 | "convnext_tiny_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth", 159 | "convnext_small_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth", 160 | "convnext_base_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth", 161 | "convnext_large_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth", 162 | "convnext_xlarge_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth", 163 | } 164 | 165 | 166 | @register_model 167 | def convnext_tiny(pretrained=False, in_22k=False, **kwargs): 168 | model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs) 169 | if pretrained: 170 | url = model_urls['convnext_tiny_22k'] if in_22k else model_urls['convnext_tiny_1k'] 171 | checkpoint = torch.hub.load_state_dict_from_url( 172 | url=url, map_location="cpu", check_hash=True) 173 | # we remove classifer head 174 | model.load_state_dict(checkpoint["model"], strict=False) 175 | return model 176 | 177 | 178 | # @register_model 179 | # def convnext_small(pretrained=False, in_22k=False, **kwargs): 180 | # model = ConvNeXt(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs) 181 | # if pretrained: 182 | # url = model_urls['convnext_small_22k'] if in_22k else model_urls['convnext_small_1k'] 183 | # checkpoint = torch.hub.load_state_dict_from_url( 184 | # url=url, map_location="cpu") 185 | # model.load_state_dict(checkpoint["model"]) 186 | # return model 187 | 188 | 189 | # @register_model 190 | # def convnext_base(pretrained=False, in_22k=False, **kwargs): 191 | # model = ConvNeXt(depths=[3, 3, 27, 3], dims=[ 192 | # 128, 256, 512, 1024], **kwargs) 193 | # if pretrained: 194 | # url = model_urls['convnext_base_22k'] if in_22k else model_urls['convnext_base_1k'] 195 | # checkpoint = torch.hub.load_state_dict_from_url( 196 | # url=url, map_location="cpu") 197 | # model.load_state_dict(checkpoint["model"]) 198 | # return model 199 | 200 | 201 | # @register_model 202 | # def convnext_large(pretrained=False, in_22k=False, **kwargs): 203 | # model = ConvNeXt(depths=[3, 3, 27, 3], dims=[ 204 | # 192, 384, 768, 1536], **kwargs) 205 | # if pretrained: 206 | # url = model_urls['convnext_large_22k'] if in_22k 
else model_urls['convnext_large_1k'] 207 | # checkpoint = torch.hub.load_state_dict_from_url( 208 | # url=url, map_location="cpu") 209 | # model.load_state_dict(checkpoint["model"]) 210 | # return model 211 | 212 | 213 | # @register_model 214 | # def convnext_xlarge(pretrained=False, in_22k=False, **kwargs): 215 | # model = ConvNeXt(depths=[3, 3, 27, 3], dims=[ 216 | # 256, 512, 1024, 2048], **kwargs) 217 | # if pretrained: 218 | # assert in_22k, "only ImageNet-22K pre-trained ConvNeXt-XL is available; please set in_22k=True" 219 | # url = model_urls['convnext_xlarge_22k'] 220 | # checkpoint = torch.hub.load_state_dict_from_url( 221 | # url=url, map_location="cpu") 222 | # model.load_state_dict(checkpoint["model"]) 223 | # return model 224 | -------------------------------------------------------------------------------- /modules/scaleu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.fft as fft 4 | 5 | 6 | def Fourier_filter(x_in, threshold, scale): 7 | x = x_in 8 | B, C, H, W = x.shape 9 | 10 | # Non-power of 2 images must be float32 11 | if (W & (W - 1)) != 0 or (H & (H - 1)) != 0: 12 | x = x.to(dtype=torch.float32) 13 | 14 | # FFT 15 | x_freq = fft.fftn(x, dim=(-2, -1)) 16 | x_freq = fft.fftshift(x_freq, dim=(-2, -1)) 17 | 18 | B, C, H, W = x_freq.shape 19 | mask = torch.ones((B, C, H, W), device=x.device) 20 | 21 | crow, ccol = H // 2, W // 2 22 | mask[..., crow - threshold: crow + threshold, 23 | ccol - threshold: ccol + threshold] = scale 24 | x_freq = x_freq * mask 25 | 26 | # IFFT 27 | x_freq = fft.ifftshift(x_freq, dim=(-2, -1)) 28 | x_filtered = fft.ifftn(x_freq, dim=(-2, -1)).real 29 | 30 | return x_filtered.to(dtype=x_in.dtype) 31 | 32 | 33 | class ScaleU(nn.Module): 34 | def __init__(self, enable_se_scaleu=True, b_size=1280, s_size=1): 35 | super(ScaleU, self).__init__() 36 | self.scaleu_b = nn.Parameter(torch.zeros(b_size)) 37 | self.scaleu_s = nn.Parameter(torch.zeros(s_size)) 38 | self.enable_se_scaleu = enable_se_scaleu 39 | 40 | def forward(self, h, hs_, transformer_options={}): 41 | h = h.to(torch.float32) 42 | hs_ = hs_.to(torch.float32) 43 | b = torch.tanh(self.scaleu_b) + 1 44 | s = torch.tanh(self.scaleu_s) + 1 45 | if self.enable_se_scaleu: 46 | hidden_mean = h.mean(1).unsqueeze(1) # B,1,H,W 47 | B = hidden_mean.shape[0] 48 | hidden_max, _ = torch.max(hidden_mean.view( 49 | B, -1), dim=-1, keepdim=True) # B,1 50 | hidden_min, _ = torch.min(hidden_mean.view( 51 | B, -1), dim=-1, keepdim=True) # B,1 52 | # duplicate the hidden_mean dimension 1 to C 53 | hidden_mean = (hidden_mean - hidden_min.unsqueeze(2).unsqueeze(3)) / \ 54 | (hidden_max - hidden_min).unsqueeze(2).unsqueeze(3) # B,1,H,W 55 | b = torch.einsum('c,bchw->bchw', b-1, hidden_mean) + 1.0 # B,C,H,W 56 | h = torch.einsum('bchw,bchw->bchw', h, b) 57 | else: 58 | h = torch.einsum('bchw,c->bchw', h, b) 59 | 60 | hs_ = Fourier_filter(hs_, threshold=1, scale=s) 61 | return h.to(torch.float16), hs_.to(torch.float16) 62 | -------------------------------------------------------------------------------- /modules/text_grounding_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .util import FourierEmbedder 4 | from .convnext import convnext_tiny 5 | 6 | import comfy.ops 7 | ops = comfy.ops.manual_cast 8 | 9 | class UniFusion(nn.Module): 10 | def __init__(self, in_dim, out_dim, mid_dim=3072, fourier_freqs=8, 11 | train_add_boxes=True, 
train_add_points=True, train_add_scribbles=True, train_add_masks=True, 12 | test_drop_boxes=False, test_drop_points=False, test_drop_scribbles=True, test_drop_masks=False, 13 | use_seperate_tokenizer=True, use_segs=True): 14 | super().__init__() 15 | self.in_dim = in_dim 16 | self.out_dim = out_dim 17 | self.mid_dim = mid_dim 18 | 19 | # InstanceDiffusion hyper-parameters 20 | self.n_scribble_points = 20 21 | self.n_polygon_points = 256 22 | fourier_freqs = 16 23 | fourier_freqs_polygons = 16 24 | self.add_boxes = train_add_boxes 25 | self.add_points = train_add_points 26 | self.add_scribbles = train_add_scribbles 27 | self.add_masks = train_add_masks 28 | self.use_seperate_tokenizer = use_seperate_tokenizer 29 | 30 | # Use instance masks as additional model inputs for mask conditioned image generation 31 | #self.use_segs = True if self.add_masks else False 32 | self.use_segs = use_segs 33 | 34 | if self.use_segs: 35 | in_dim = 30 36 | self.resize_input = 512 37 | self.down_factor = 64 # determined by the convnext backbone 38 | # from num_sem to 3 channels 39 | self.in_conv = ops.Conv2d(in_dim, 3, 3, 1, 1) 40 | self.convnext_tiny_backbone = convnext_tiny(pretrained=True) 41 | self.num_tokens = (self.resize_input // self.down_factor) ** 2 42 | self.convnext_feature_dim = 3072 43 | self.pos_embedding = nn.Parameter(torch.empty( 44 | 1, self.num_tokens, self.convnext_feature_dim).normal_(std=0.02)) # from BERT 45 | 46 | self.test_drop_boxes = test_drop_boxes 47 | self.test_drop_points = test_drop_points 48 | self.test_drop_scribbles = test_drop_scribbles 49 | self.test_drop_masks = test_drop_masks 50 | self.test_drop_segs = test_drop_masks 51 | 52 | self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs) 53 | self.fourier_embedder_polygons = FourierEmbedder( 54 | num_freqs=fourier_freqs_polygons) 55 | input_dim = self.in_dim 56 | input_dim_list = [] 57 | if self.add_boxes: 58 | # 2: sin and cos; 4: (x1,y1) and (x2,y2) 59 | self.position_dim = fourier_freqs * 2 * 4 60 | input_dim += self.position_dim 61 | input_dim_list.append(self.in_dim + self.position_dim) 62 | if self.add_points: 63 | self.point_dim = fourier_freqs * 2 * 2 # 2: sin and cos; 2: (x,y) 64 | input_dim += self.point_dim 65 | input_dim_list.append(self.in_dim + self.point_dim) 66 | if self.add_scribbles: 67 | self.scribble_dim = fourier_freqs_polygons * 2 * \ 68 | self.n_scribble_points * 2 # 2: sin and cos; 2: (x,y) 69 | input_dim += self.scribble_dim 70 | input_dim_list.append(self.in_dim + self.scribble_dim) 71 | if self.add_masks: 72 | self.polygon_dim = fourier_freqs_polygons * 2 * \ 73 | self.n_polygon_points * 2 # 2: sin and cos; 2: (x,y) 74 | input_dim += self.polygon_dim 75 | input_dim_list.append(self.in_dim + self.polygon_dim) 76 | if self.use_segs: 77 | input_dim += self.convnext_feature_dim 78 | input_dim_list.append(self.convnext_feature_dim) 79 | 80 | if self.use_seperate_tokenizer: 81 | self.linears_list = nn.ModuleList([]) 82 | for idx, input_dim_ in enumerate(input_dim_list): 83 | mid_dim = self.mid_dim 84 | self.linears_list.append(nn.Sequential( 85 | ops.Linear(input_dim_, mid_dim), 86 | nn.SiLU(), 87 | ops.Linear(mid_dim, mid_dim), 88 | nn.SiLU(), 89 | ops.Linear(mid_dim, out_dim), 90 | )) 91 | else: 92 | self.linears = nn.Sequential( 93 | ops.Linear(input_dim, self.mid_dim), 94 | nn.SiLU(), 95 | ops.Linear(self.mid_dim, self.mid_dim), 96 | nn.SiLU(), 97 | ops.Linear(self.mid_dim, out_dim), 98 | ) 99 | 100 | self.null_positive_feature = torch.nn.Parameter( 101 | torch.zeros([self.in_dim])) # text 
102 | if self.add_boxes: 103 | self.null_position_feature = torch.nn.Parameter( 104 | torch.zeros([self.position_dim])) 105 | if self.add_points: 106 | self.null_point_feature = torch.nn.Parameter( 107 | torch.zeros([self.point_dim])) 108 | if self.add_scribbles: 109 | self.null_scribble_feature = torch.nn.Parameter( 110 | torch.zeros([self.scribble_dim])) 111 | if self.add_masks: 112 | self.null_polygon_feature = torch.nn.Parameter( 113 | torch.zeros([self.polygon_dim])) 114 | if self.use_segs: 115 | self.null_seg_feature = torch.nn.Parameter( 116 | torch.zeros([self.convnext_feature_dim])) 117 | 118 | def reset_dropout_test(self): 119 | # drop_box = True 120 | # drop_point = False 121 | # drop_scribble = True 122 | # drop_polygons = True 123 | # drop_segs = True 124 | drop_box = self.test_drop_boxes 125 | drop_point = self.test_drop_points 126 | drop_scribble = self.test_drop_scribbles 127 | drop_polygons = self.test_drop_masks 128 | drop_segs = self.test_drop_masks 129 | 130 | return drop_point, drop_box, drop_scribble, drop_polygons, drop_segs 131 | 132 | def reset_dropout(self): 133 | drop_box = False 134 | drop_point = False 135 | drop_scribble = False 136 | drop_polygons = False 137 | drop_segs = False 138 | return drop_point, drop_box, drop_scribble, drop_polygons, drop_segs 139 | 140 | def reset_dropout_train(self, drop_point, drop_box, drop_scribble, drop_polygons, drop_segs): 141 | if not drop_polygons: 142 | drop_box = False 143 | drop_point = False 144 | if not drop_box or not drop_polygons: 145 | drop_point = False 146 | 147 | # keep point only for 10% of the time 148 | keep_point_only_ratio = 0.1 149 | keep_point_only = torch.rand(1).item() < keep_point_only_ratio 150 | if keep_point_only: 151 | drop_point = False 152 | drop_box = True 153 | drop_scribble = True 154 | drop_polygons = True 155 | drop_segs = True 156 | 157 | # keep scribble only for 0% of the time 158 | keep_scribble_only_ratio = 0.0 159 | keep_scribble_only = torch.rand( 160 | 1).item() < keep_scribble_only_ratio and not drop_scribble 161 | if keep_scribble_only: 162 | drop_point = True 163 | drop_box = True 164 | drop_scribble = False 165 | drop_polygons = True 166 | drop_segs = True 167 | 168 | # keep mask only for 0% of the time 169 | keep_mask_only_ratio = 0.0 170 | keep_mask_only = torch.rand( 171 | 1).item() < keep_mask_only_ratio and not drop_polygons 172 | if keep_mask_only: 173 | drop_point = True 174 | drop_box = True 175 | drop_scribble = True 176 | drop_polygons = False 177 | drop_segs = False 178 | 179 | # keep seg only for 10% of the time 180 | keep_seg_only_ratio = 0.1 # default 0.1 181 | keep_seg_only = torch.rand( 182 | 1).item() < keep_seg_only_ratio and not drop_segs 183 | if keep_seg_only: 184 | drop_point = False 185 | drop_box = False 186 | drop_scribble = True 187 | drop_polygons = False 188 | drop_segs = False 189 | 190 | # keep box only for 0% of the time 191 | keep_box_only_ratio = 0.0 # default 0.0 192 | keep_box_only = torch.rand( 193 | 1).item() < keep_box_only_ratio and not drop_box 194 | if keep_box_only: 195 | drop_point = True 196 | drop_box = False 197 | drop_scribble = True 198 | drop_polygons = True 199 | drop_segs = True 200 | 201 | return drop_point, drop_box, drop_scribble, drop_polygons, drop_segs 202 | 203 | def forward(self, embeddings): 204 | boxes = embeddings['boxes'] 205 | masks = embeddings['masks'] 206 | positive_embeddings = embeddings['prompts'] 207 | scribbles = embeddings['scribbles'] 208 | polygons = embeddings['polygons'] 209 | segs = 
embeddings['segments'] 210 | points = embeddings['points'] 211 | 212 | B, N, _ = boxes.shape 213 | masks = masks.unsqueeze(-1) 214 | 215 | drop_point, drop_box, drop_scribble, drop_polygons, drop_segs = self.reset_dropout() 216 | # randomly drop boxes or points embeddings. 217 | if self.add_boxes: 218 | drop_box_ratio = 0.1 219 | drop_box = torch.rand(1).item() < drop_box_ratio 220 | if self.add_points: 221 | drop_point_ratio = 0.1 222 | drop_point = torch.rand(1).item() < drop_point_ratio 223 | if self.add_scribbles: 224 | drop_scribble_ratio = 0.1 225 | drop_scribble = torch.rand(1).item() < drop_scribble_ratio 226 | if self.add_masks: 227 | drop_polygon_ratio = 0.1 228 | drop_polygons = torch.rand(1).item() < drop_polygon_ratio 229 | drop_segs = drop_polygons 230 | 231 | # Not training, always keep both boxes and points 232 | if not self.training: 233 | drop_point, drop_box, drop_scribble, drop_polygons, drop_segs = self.reset_dropout_test() 234 | else: 235 | drop_point, drop_box, drop_scribble, drop_polygons, drop_segs = self.reset_dropout_train( 236 | drop_point, drop_box, drop_scribble, drop_polygons, drop_segs) 237 | 238 | # set drop_box to False if all other inputs are dropped 239 | if drop_point and drop_box and drop_scribble and drop_polygons and drop_segs: 240 | drop_box = False 241 | 242 | # embedding position (it may includes padding as placeholder) 243 | if self.add_boxes: 244 | xyxy_embedding = self.fourier_embedder( 245 | boxes) # B*N*4 --> B*N*C (C=8*2*4) 246 | if self.add_points: 247 | if points is None: # we can always get a point using a box 248 | points = (boxes[:, :, :2] + boxes[:, :, 2:]) / 2.0 249 | point_embedding = self.fourier_embedder( 250 | points) # B*N*2 --> B*N*(8*2*2) 251 | if self.add_scribbles: 252 | scribble_embedding = self.fourier_embedder_polygons( 253 | scribbles) # B*N*20 --> B*N*(8*20*2) 254 | if self.add_masks: 255 | polygon_embedding = self.fourier_embedder_polygons( 256 | polygons) # B*N*128 --> B*N*(16*128*2) 257 | if self.use_segs: 258 | segs = torch.nn.functional.interpolate( 259 | segs, self.resize_input, mode="nearest") 260 | segs_feature = self.in_conv(segs) 261 | segs_feature = self.convnext_tiny_backbone(segs_feature) 262 | segs_feature = segs_feature.reshape(B, -1, self.num_tokens) 263 | segs_feature = segs_feature.permute(0, 2, 1) 264 | 265 | # learnable null embedding 266 | positive_null = self.null_positive_feature.view(1, 1, -1) 267 | if self.add_boxes: 268 | xyxy_null = self.null_position_feature.view(1, 1, -1) 269 | if self.add_points: 270 | point_null = self.null_point_feature.view(1, 1, -1) 271 | if self.add_scribbles: 272 | scribble_null = self.null_scribble_feature.view(1, 1, -1) 273 | if self.add_masks: 274 | polygon_null = self.null_polygon_feature.view(1, 1, -1) 275 | if self.use_segs: 276 | seg_null = self.null_seg_feature.view(1, 1, -1) 277 | seg_null = seg_null.repeat(B, self.num_tokens, 1) 278 | 279 | # replace padding with learnable null embedding 280 | positive_embeddings = positive_embeddings * \ 281 | masks + (1 - masks) * positive_null 282 | if self.use_seperate_tokenizer: 283 | embeddings_list = [] 284 | if self.add_boxes: 285 | # replace padding with learnable null embedding for boxes 286 | xyxy_masks = torch.zeros_like(masks).to( 287 | masks.device) if drop_box else masks.detach().clone() 288 | xyxy_embedding = xyxy_embedding * \ 289 | xyxy_masks + (1 - xyxy_masks) * xyxy_null 290 | if self.use_seperate_tokenizer: 291 | embeddings_list.append(xyxy_embedding) 292 | if self.add_points: 293 | # replace padding with 
learnable null embedding for points 294 | point_masks = torch.zeros_like(masks).to( 295 | boxes.device) if drop_point else masks.detach().clone() 296 | point_embedding = point_embedding * \ 297 | point_masks + (1 - point_masks) * point_null 298 | if self.use_seperate_tokenizer: 299 | embeddings_list.append(point_embedding) 300 | if self.add_scribbles: 301 | # sum along the batch dimension and check if all scribbles are 0s 302 | # replace padding with learnable null embedding for scribbles 303 | # scribble_embedding: torch.Size([bs, n_objs, 640]); masks_scribble: torch.Size([bs, n_objs, 1]); scribble_null: torch.Size([1, 1, 640]) 304 | masks_scribble = torch.zeros_like(masks).to(masks.device) if drop_scribble else ( 305 | (torch.sum(scribbles, dim=-1).unsqueeze(-1) + masks.detach().clone()) > 0).float() 306 | scribble_embedding = scribble_embedding * \ 307 | masks_scribble + (1 - masks_scribble) * scribble_null 308 | if self.use_seperate_tokenizer: 309 | embeddings_list.append(scribble_embedding) 310 | if self.add_masks: 311 | masks_polygons = torch.zeros_like(masks).to(masks.device) if drop_polygons else ( 312 | (torch.sum(polygons, dim=-1).unsqueeze(-1) + masks.detach().clone()) > 0).float() 313 | assert torch.sum( 314 | scribbles, dim=-1).unsqueeze(-1).size() == masks.size() 315 | polygon_embedding = polygon_embedding * \ 316 | masks_polygons + (1 - masks_polygons) * polygon_null 317 | if self.use_seperate_tokenizer: 318 | embeddings_list.append(polygon_embedding) 319 | if self.use_segs: 320 | # mask replacing 321 | masks_segs = torch.zeros(masks.shape[0]).to(masks.device) if drop_segs else ( 322 | torch.sum(segs, dim=(1, 2, 3)) > 0).float() 323 | masks_segs = masks_segs.view(-1, 1, 1) 324 | assert masks_segs.size()[0] == masks.shape[0] 325 | seg_embedding = segs_feature * masks_segs 326 | seg_embedding = seg_embedding + (1 - masks_segs) * seg_null 327 | # add pos 328 | seg_embedding = seg_embedding + self.pos_embedding 329 | if self.use_seperate_tokenizer: 330 | embeddings_list.append(seg_embedding) 331 | 332 | inputs = [positive_embeddings] 333 | if self.use_seperate_tokenizer: 334 | objs = [] 335 | # forward all types of embeddings using the corresponding linear layers 336 | for i, (linears, layout_embeddings) in enumerate(zip(self.linears_list, embeddings_list)): 337 | if i == len(embeddings_list) - 1 and self.use_segs: 338 | objs.append(linears(layout_embeddings)) 339 | else: 340 | objs.append( 341 | linears(torch.cat([positive_embeddings, layout_embeddings], dim=-1))) 342 | objs = torch.cat(objs, dim=1) 343 | else: 344 | # NOTE: orders should the same for training and testing 345 | if self.add_boxes: 346 | inputs.append(xyxy_embedding) 347 | if self.add_points: 348 | inputs.append(point_embedding) 349 | if self.add_scribbles: 350 | inputs.append(scribble_embedding) 351 | if self.add_masks: 352 | inputs.append(polygon_embedding) 353 | 354 | objs = self.linears(torch.cat(inputs, dim=-1)) 355 | assert objs.shape == torch.Size([B, N, self.out_dim]) 356 | drop_box_mask = True if drop_box and drop_polygons else False 357 | return objs, drop_box_mask 358 | -------------------------------------------------------------------------------- /modules/text_grounding_tokenizer_input.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class GroundingNetInput: 5 | def __init__(self): 6 | self.set = False 7 | self.return_att_masks = False 8 | self.image_size = 64 9 | self.return_att_masks32 = False 10 | 11 | def prepare(self, batch, 
image_size=64, device=None, dtype=None, return_att_masks=False): 12 | """ 13 | batch should be the output from dataset. 14 | Please define here how to process the batch and prepare the 15 | input only for the ground tokenizer. 16 | """ 17 | output = {} 18 | self.set = True 19 | self.return_att_masks = return_att_masks 20 | 21 | boxes = batch['boxes'] 22 | masks = batch['masks'] 23 | positive_embeddings = batch["prompts"] 24 | 25 | if self.return_att_masks: 26 | assert 'att_masks' in batch 27 | att_masks = batch['att_masks'] 28 | 29 | scribbles = batch['scribbles'] 30 | polygons = batch['polygons'] 31 | self.dim_scribbles = scribbles.shape[-1] 32 | self.dim_polygons = polygons.shape[-1] 33 | # NOTE: New Seg 34 | segs = batch["segments"] 35 | self.dim_segs = segs.shape[-1] 36 | points = batch["points"] 37 | 38 | self.batch, self.max_box, self.in_dim = positive_embeddings.shape 39 | self.device = positive_embeddings.device 40 | self.dtype = positive_embeddings.dtype 41 | 42 | output = { 43 | "boxes": boxes, 44 | "masks": masks, 45 | "prompts": positive_embeddings, 46 | } 47 | output["scribbles"] = scribbles 48 | output["polygons"] = polygons 49 | output["segments"] = segs 50 | output["points"] = points 51 | 52 | if self.return_att_masks: 53 | output['att_masks'] = att_masks 54 | return output 55 | 56 | def get_null_input(self, batch=None, latent_width=64, latent_height=64, device=None, dtype=None): 57 | """ 58 | Guidance for training (drop) or inference, 59 | please define the null input for the grounding tokenizer 60 | """ 61 | 62 | assert self.set, "not set yet, cannot call this funcion" 63 | batch = self.batch if batch is None else batch 64 | device = self.device if device is None else device 65 | dtype = self.dtype if dtype is None else dtype 66 | 67 | boxes = torch.zeros(batch, self.max_box, 4,).type(dtype).to(device) 68 | masks = torch.zeros(batch, self.max_box).type(dtype).to(device) 69 | # NOTE: New Seg 70 | segs = torch.zeros(batch, self.max_box, self.dim_segs, 71 | self.dim_segs).type(dtype).to(device) 72 | 73 | scribbles = torch.zeros(batch, self.max_box, 74 | self.dim_scribbles).type(dtype).to(device) 75 | polygons = torch.zeros(batch, self.max_box, 76 | self.dim_polygons).type(dtype).to(device) 77 | points = torch.zeros(batch, self.max_box, 2).type(dtype).to(device) 78 | 79 | positive_embeddings = torch.zeros( 80 | batch, self.max_box, self.in_dim).type(dtype).to(device) 81 | 82 | output = { 83 | "boxes": boxes, 84 | "masks": masks, 85 | "prompts": positive_embeddings, 86 | } 87 | output["scribbles"] = scribbles 88 | output["polygons"] = polygons 89 | output["segments"] = segs 90 | output["points"] = points 91 | 92 | if self.return_att_masks: 93 | att_masks = torch.zeros(batch, self.max_box, latent_width, latent_height).type( 94 | dtype).to(device) # TODO Order width/height 95 | output['att_masks'] = att_masks 96 | return output 97 | -------------------------------------------------------------------------------- /modules/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class FourierEmbedder(): 5 | def __init__(self, num_freqs=64, temperature=100): 6 | 7 | self.num_freqs = num_freqs 8 | self.temperature = temperature 9 | self.freq_bands = temperature ** (torch.arange(num_freqs) / num_freqs) 10 | 11 | @ torch.no_grad() 12 | def __call__(self, x, cat_dim=-1): 13 | "x: arbitrary shape of tensor. 
dim: cat dim" 14 | out = [] 15 | for freq in self.freq_bands: 16 | out.append(torch.sin(freq*x)) 17 | out.append(torch.cos(freq*x)) 18 | return torch.cat(out, cat_dim) 19 | -------------------------------------------------------------------------------- /nodes/apply_scaleu_model_node.py: -------------------------------------------------------------------------------- 1 | import comfy.model_management 2 | 3 | from .. import constants as constants 4 | from ..model_helpers.prepare_scaleu import get_scaleu_patch 5 | 6 | 7 | class ApplyScaleUModelNode: 8 | @classmethod 9 | def INPUT_TYPES(s): 10 | return {"required": { 11 | "model": ("MODEL",), 12 | "scaleu": ("SCALEU",), 13 | }} 14 | 15 | RETURN_TYPES = ("MODEL",) 16 | FUNCTION = "apply" 17 | 18 | CATEGORY = "instance" 19 | 20 | def apply(self, model, scaleu): 21 | # Validate patches dict is setup correctly 22 | transformer_options = model.model_options['transformer_options'] 23 | if 'patches' not in transformer_options: 24 | transformer_options['patches'] = {} 25 | 26 | if 'output_block_patch' not in transformer_options['patches']: 27 | transformer_options['patches']['output_block_patch'] = [] 28 | 29 | # Add scaleu patch to model patches 30 | scaleu_nets = scaleu['model_list'] 31 | # TODO make this load in KSampler 32 | for i, scaleu in enumerate(scaleu_nets): 33 | scaleu_nets[i] = scaleu.to( 34 | comfy.model_management.get_torch_device()) 35 | transformer_options['patches']['output_block_patch'].append( 36 | get_scaleu_patch(scaleu_nets)) 37 | return (model,) 38 | -------------------------------------------------------------------------------- /nodes/download_and_load_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import folder_paths 3 | import comfy.utils 4 | from .. 
import constants as constants 5 | from ..model_helpers.prepare_positionnet import prepare_positionnet, get_positionnet_default_params 6 | from ..model_helpers.prepare_scaleu import prepare_scaleu_nets 7 | from ..model_helpers.prepare_fusers import prepare_fusers 8 | from huggingface_hub import snapshot_download 9 | 10 | INSTANCE_FUSERS_DIR = "fuser_models" 11 | 12 | INSTANCE_SCALEU_DIR = "scaleu_models" 13 | 14 | class DownloadInstanceDiffusionModels: 15 | @classmethod 16 | def INPUT_TYPES(s): 17 | return {"required": { 18 | "use_segs": ("BOOLEAN", {"default": True}), 19 | "fusers_scale": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), 20 | }} 21 | 22 | RETURN_TYPES = ("POSITIONNET", "FUSERS", "SCALEU", ) 23 | FUNCTION = "load_model" 24 | 25 | CATEGORY = "instance/loaders" 26 | 27 | def load_model(self, use_segs: bool, fusers_scale: float): 28 | repo_id = "logtd/instance_diffusion" 29 | instance_models_folder = os.path.join(folder_paths.models_dir, constants.INSTANCE_MODELS_DIR) 30 | 31 | models_to_download = [ 32 | ("position_net", constants.INSTANCE_POSITIONNET_DIR, "position_net.ckpt"), 33 | ("fusers", constants.INSTANCE_FUSERS_DIR, "fusers.ckpt"), 34 | ("scaleu", constants.INSTANCE_SCALEU_DIR, "scaleu.ckpt") 35 | ] 36 | 37 | for model_name, model_folder, model_file in models_to_download: 38 | model_folder_path = os.path.join(instance_models_folder, model_folder) 39 | model_file_path = os.path.join(model_folder_path, model_file) 40 | 41 | if not os.path.exists(model_file_path): 42 | print(f"Selected model: {model_file_path} not found, downloading...") 43 | allow_patterns = [f"*{model_name}*"] 44 | snapshot_download(repo_id=repo_id, 45 | allow_patterns=allow_patterns, 46 | local_dir=model_folder_path, 47 | local_dir_use_symlinks=False 48 | ) 49 | 50 | positionnet_file = os.path.join(instance_models_folder, constants.INSTANCE_POSITIONNET_DIR, "position_net.ckpt") 51 | fusers_file = os.path.join(instance_models_folder, constants.INSTANCE_FUSERS_DIR, "fusers.ckpt") 52 | scaleu_file = os.path.join(instance_models_folder, constants.INSTANCE_SCALEU_DIR, "scaleu.ckpt") 53 | 54 | pos_checkpoint = comfy.utils.load_torch_file(positionnet_file, safe_load=True) 55 | params = get_positionnet_default_params() 56 | params["use_segs"] = use_segs 57 | model = prepare_positionnet(pos_checkpoint, params) 58 | positionnet = { 59 | 'model': model, 60 | } 61 | 62 | fusers_checkpoint = comfy.utils.load_torch_file(fusers_file, safe_load=True) 63 | fusers_list = prepare_fusers(fusers_checkpoint, fusers_scale) 64 | fusers = { 65 | 'model_list': fusers_list 66 | } 67 | scaleu_checkpoint = comfy.utils.load_torch_file(scaleu_file, safe_load=True) 68 | scaleu_list = prepare_scaleu_nets(scaleu_checkpoint) 69 | scaleu = { 70 | 'model_list': scaleu_list 71 | } 72 | return (positionnet, fusers, scaleu) 73 | 74 | -------------------------------------------------------------------------------- /nodes/instance_diffusion_tracking_prompt_node.py: -------------------------------------------------------------------------------- 1 | from ..utils.prompt_utils import extract_prompts 2 | from ..conditioning.instance_conditioning import InstanceConditioning 3 | 4 | 5 | class InstanceDiffusionTrackingPromptNode: 6 | @classmethod 7 | def INPUT_TYPES(s): 8 | return {"required": {"positive": ("CONDITIONING", ), 9 | "negative": ("CONDITIONING", ), 10 | "clip": ("CLIP", ), 11 | "tracking": ("TRACKING", ), 12 | "positionnet": ("POSITIONNET", ), 13 | "fusers": ("FUSERS", ), 14 | "positive_text": ("STRING", {"multiline": 
True}), 15 | "negative_text": ("STRING", {"multiline": True}), 16 | }} 17 | RETURN_TYPES = ("CONDITIONING", "CONDITIONING") 18 | RETURN_NAMES = ("positive", "negative") 19 | FUNCTION = "append" 20 | 21 | CATEGORY = "instance/conditioning" 22 | 23 | def _get_position_conds(self, clip, tracking, text): 24 | # Get prompts and their class id and tracker id 25 | prompt_pairs = extract_prompts(text) 26 | 27 | # Go through prompt pairs, encode prompts, and join with positions from tracking 28 | position_conds = [] 29 | for tracker_id, class_id, prompt in prompt_pairs: 30 | _, cond_pooled = clip.encode_from_tokens( 31 | clip.tokenize(prompt), return_pooled=True) 32 | # A tracker_id of -1 means that it is prompting all instances of a single class 33 | if tracker_id != -1: 34 | position_cond = {'cond_pooled': cond_pooled, 'positions': 35 | tracking[class_id][tracker_id]} 36 | position_conds.append(position_cond) 37 | else: 38 | for tracker_id in tracking[class_id]: 39 | position_cond = {'cond_pooled': cond_pooled, 40 | 'positions': tracking[class_id][tracker_id]} 41 | position_conds.append(position_cond) 42 | 43 | return position_conds 44 | 45 | def _apply_position_conds(self, position_conds, conditioning, fusers, positionnet): 46 | # Add prompts+embeddings to the input conditionings 47 | cond_out = [] 48 | for t in conditioning: 49 | n = [t[0], t[1].copy()] 50 | cond = n[1] 51 | prev = [] 52 | has_instance = 'instance_diffusion' in cond 53 | instance_conditioning = cond['instance_diffusion'] if has_instance else InstanceConditioning( 54 | fusers, positionnet) 55 | cond['instance_diffusion'] = instance_conditioning 56 | instance_conditioning.add_conds(position_conds) 57 | 58 | cond['gligen'] = ('position', instance_conditioning, None) 59 | 60 | cond_out.append(n) 61 | 62 | return cond_out 63 | 64 | def append(self, positive, negative, clip, tracking, fusers, positionnet, positive_text, negative_text, fusers_batch_size=None): 65 | 66 | positive_positions = self._get_position_conds( 67 | clip, tracking, positive_text) 68 | positive = self._apply_position_conds( 69 | positive_positions, positive, fusers, positionnet) 70 | 71 | negative_positions = self._get_position_conds( 72 | clip, tracking, negative_text) 73 | negative = self._apply_position_conds( 74 | negative_positions, negative, fusers, positionnet) 75 | 76 | return (positive, negative) 77 | -------------------------------------------------------------------------------- /nodes/load_instance_fusers_node.py: -------------------------------------------------------------------------------- 1 | from ..
import constants as constants 2 | from ..utils.model_utils import get_model_list, load_checkpoint 3 | from ..model_helpers.prepare_fusers import prepare_fusers 4 | 5 | 6 | class LoadInstanceFusersNode: 7 | @classmethod 8 | def INPUT_TYPES(s): 9 | return {"required": { 10 | "model_filename": (get_model_list(constants.INSTANCE_FUSERS_DIR),), 11 | "fusers_scale": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), 12 | }} 13 | 14 | RETURN_TYPES = ("FUSERS",) 15 | FUNCTION = "load_model" 16 | 17 | CATEGORY = "instance/loaders" 18 | 19 | def load_model(self, model_filename: str, fusers_scale: float): 20 | checkpoint = load_checkpoint( 21 | constants.INSTANCE_FUSERS_DIR, model_filename) 22 | fusers_list = prepare_fusers(checkpoint, fusers_scale) 23 | fusers = { 24 | 'model_list': fusers_list 25 | } 26 | return (fusers,) 27 | -------------------------------------------------------------------------------- /nodes/load_instance_positionnet_node.py: -------------------------------------------------------------------------------- 1 | from .. import constants as constants 2 | from ..utils.model_utils import get_model_list, load_checkpoint 3 | from ..model_helpers.prepare_positionnet import prepare_positionnet, get_positionnet_default_params 4 | 5 | 6 | class LoadInstancePositionNetNode: 7 | @classmethod 8 | def INPUT_TYPES(s): 9 | return {"required": { 10 | "model_filename": (get_model_list(constants.INSTANCE_POSITIONNET_DIR),), 11 | "use_segs": ("BOOLEAN", {"default": True}), 12 | }} 13 | 14 | RETURN_TYPES = ("POSITIONNET",) 15 | FUNCTION = "load_model" 16 | 17 | CATEGORY = "instance/loaders" 18 | 19 | def load_model(self, model_filename: str, use_segs: bool): 20 | checkpoint = load_checkpoint( 21 | constants.INSTANCE_POSITIONNET_DIR, model_filename) 22 | params = get_positionnet_default_params() 23 | params["use_segs"] = use_segs 24 | model = prepare_positionnet(checkpoint, params) 25 | positionnet = { 26 | 'model': model, 27 | } 28 | return (positionnet,) 29 | -------------------------------------------------------------------------------- /nodes/load_instance_scaleu_node.py: -------------------------------------------------------------------------------- 1 | from ..
import constants as constants 2 | from ..utils.model_utils import get_model_list, load_checkpoint 3 | from ..model_helpers.prepare_scaleu import prepare_scaleu_nets 4 | 5 | 6 | class LoadInstanceScaleUNode: 7 | @classmethod 8 | def INPUT_TYPES(s): 9 | return {"required": { 10 | "model_filename": (get_model_list(constants.INSTANCE_SCALEU_DIR),), 11 | }} 12 | 13 | RETURN_TYPES = ("SCALEU",) 14 | FUNCTION = "load_model" 15 | 16 | CATEGORY = "instance/loaders" 17 | 18 | def load_model(self, model_filename: str): 19 | checkpoint = load_checkpoint( 20 | constants.INSTANCE_SCALEU_DIR, model_filename) 21 | scaleu_list = prepare_scaleu_nets(checkpoint) 22 | scaleu = { 23 | 'model_list': scaleu_list 24 | } 25 | return (scaleu,) 26 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "comfyui-instancediffusion" 3 | description = "A set of nodes to perform multi-object prompting with InstanceDiffusion" 4 | version = "1.0.0" 5 | license = "LICENSE" 6 | dependencies = ["huggingface_hub"] 7 | 8 | [project.urls] 9 | Repository = "https://github.com/logtd/ComfyUI-InstanceDiffusion" 10 | # Used by Comfy Registry https://comfyregistry.org 11 | 12 | [tool.comfy] 13 | PublisherId = "logtd" 14 | DisplayName = "ComfyUI-InstanceDiffusion" 15 | Icon = "" 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | huggingface_hub -------------------------------------------------------------------------------- /utils/decode_item.py: -------------------------------------------------------------------------------- 1 | # Directly taken from InstanceDiffusion repo 2 | import torch 3 | import random 4 | import base64 5 | import numpy as np 6 | from io import BytesIO 7 | from collections import Counter 8 | from PIL import Image, ImageDraw 9 | import base64 10 | from skimage import measure 11 | 12 | 13 | # import nltk 14 | # from nltk.corpus import stopwords 15 | 16 | def decode_base64_to_pillow(image_b64): 17 | return Image.open(BytesIO(base64.b64decode(image_b64))).convert('RGB') 18 | 19 | 20 | def decode_tensor_from_string(arr_str, use_tensor=True): 21 | arr = np.frombuffer(base64.b64decode(arr_str), dtype='float32') 22 | if use_tensor: 23 | arr = torch.from_numpy(arr) 24 | return arr 25 | 26 | 27 | def close_contour(contour): 28 | if not np.array_equal(contour[0], contour[-1]): 29 | contour = np.vstack((contour, contour[0])) 30 | return contour 31 | 32 | # convert binay mask to polygon format 33 | 34 | 35 | def binary_mask_to_polygon(binary_mask, tolerance=0): 36 | """Converts a binary mask to COCO polygon representation 37 | 38 | Args: 39 | binary_mask: a 2D binary numpy array where '1's represent the object 40 | tolerance: Maximum distance from original points of polygon to approximated 41 | polygonal chain. If tolerance is 0, the original coordinate array is returned. 
42 | 43 | """ 44 | polygons = [] 45 | # pad mask to close contours of shapes which start and end at an edge 46 | padded_binary_mask = np.pad( 47 | binary_mask, pad_width=1, mode='constant', constant_values=0) 48 | contours = measure.find_contours(padded_binary_mask, tolerance) 49 | polygons = [] 50 | # print(contours) 51 | for contour in contours: 52 | contour = close_contour(contour) 53 | contour = measure.approximate_polygon(contour, tolerance) 54 | if len(contour) < 3: 55 | continue 56 | contour = np.flip(contour, axis=1) 57 | segmentation = contour.ravel().tolist() 58 | # after padding and subtracting 1 we may get -0.5 points in our segmentation 59 | segmentation = [0 if i < 0 else i for i in segmentation] 60 | polygons.append(segmentation) 61 | 62 | return polygons 63 | 64 | 65 | def sample_random_points_from_mask(mask, k): 66 | mask = mask[:, :, 0] 67 | # Find the coordinates of non-zero pixels in the binary mask 68 | nonzero_coords = np.transpose(np.nonzero(mask)) 69 | 70 | # Randomly sample 'k' points 71 | # return all zeros if there is no non-zero pixel 72 | if len(nonzero_coords) == 0: 73 | xy_points = [0 for _ in range(k * 2)] 74 | return xy_points 75 | 76 | # randomly sample with replacement if there are not enough non-zero pixels 77 | if len(nonzero_coords) < k and len(nonzero_coords) > 0: 78 | random_indices = np.random.choice(len(nonzero_coords), k, replace=True) 79 | # randomly sample withiout replacement if there are enough non-zero pixels 80 | else: 81 | random_indices = np.random.choice( 82 | len(nonzero_coords), k, replace=False) 83 | sampled_points = nonzero_coords[random_indices] 84 | 85 | # order the points by their distance to (0, 0) 86 | # center = np.array([mask.shape[0] // 2, mask.shape[1] // 2]) 87 | center = np.array([0, 0]) 88 | sampled_points = sorted(sampled_points, key=lambda x: np.linalg.norm( 89 | np.array(x) - center)) # np.linalg.norm 90 | 91 | # concatenate x and y coordinates and return them as a list 92 | # [x1,y1,x2,y2,...,x_k,y_k] 93 | xy_points = [] 94 | for x in sampled_points: 95 | xy_points.append(float(x[1])) 96 | xy_points.append(float(x[0])) 97 | return xy_points 98 | 99 | # convert numpy array of bool mask to float mask 100 | 101 | 102 | def binary_mask_to_int(binary_mask): 103 | return binary_mask.astype(np.int32) 104 | 105 | # uniformly sample points from the mask 106 | 107 | 108 | def sample_sparse_points(binary_mask, k, return_2d=False): 109 | # Find the coordinates of non-zero pixels in the binary mask 110 | nonzero_coords = np.array(np.nonzero(binary_mask)) 111 | if len(nonzero_coords) == 0: 112 | xy_points = [0 for _ in range(k * 2)] 113 | return xy_points 114 | 115 | # Calculate the total number of non-zero pixels 116 | num_nonzero_pixels = len(nonzero_coords) 117 | 118 | xy_points = [] 119 | if k >= num_nonzero_pixels: 120 | for x in nonzero_coords: 121 | xy_points.append(float(x[1])) 122 | xy_points.append(float(x[0])) 123 | for _ in range(k - num_nonzero_pixels): 124 | xy_points.append(nonzero_coords[-1][1]) 125 | xy_points.append(nonzero_coords[-1][0]) 126 | return nonzero_coords 127 | 128 | # Calculate the number of points to sample in each dimension 129 | num_points_per_dim = int(np.sqrt(k)) 130 | 131 | # Calculate the step size to ensure equal spacing 132 | step_size = max(1, num_nonzero_pixels // (num_points_per_dim ** 2)) 133 | 134 | # Sample points with equal spacing 135 | sampled_points = nonzero_coords[::step_size][:k] 136 | if return_2d: 137 | sampled_points = [(x[1], x[0]) for x in sampled_points] 138 | else: 139 | for 
x in sampled_points: 140 | xy_points.append(float(x[1])) 141 | xy_points.append(float(x[0])) 142 | return xy_points 143 | 144 | 145 | def sample_uniform_sparse_points(binary_mask, k): 146 | # binary_mask = binary_mask[:,:,0] 147 | # Step 1: Get the indices of '1' values in the binary mask 148 | foreground_indices = np.argwhere(binary_mask == 1) 149 | 150 | if len(foreground_indices) == 0: 151 | return [] 152 | 153 | selected_points = [] 154 | if len(foreground_indices) < k: 155 | # randomly sample with replacement if there are not enough non-zero pixels 156 | for i in range(k): 157 | random_point = random.choice(foreground_indices) 158 | selected_points.append((random_point[1], random_point[0])) 159 | else: 160 | # rank the points by their distance to the mean of the foreground_indices 161 | center = np.mean(foreground_indices, axis=0) 162 | # print(center) 163 | foreground_indices = sorted( 164 | foreground_indices, key=lambda x: np.linalg.norm(x - center)) # np.linalg.norm 165 | # Calculate the number of points to select from each segment 166 | points_per_segment = len(foreground_indices) // k 167 | 168 | # Step 2: Randomly select one point from each segment 169 | # print(k) 170 | for i in range(k): 171 | segment_points = foreground_indices[i * 172 | points_per_segment: (i + 1) * points_per_segment] 173 | # choose the middle point in each segment 174 | random_point = segment_points[len(segment_points) // 2] 175 | # random_point = random.choice(segment_points) 176 | selected_points.append((random_point[1], random_point[0])) 177 | 178 | return selected_points 179 | 180 | 181 | def sample_sparse_points_from_mask(mask, k): 182 | n_points = k 183 | n_polygons = n_points // 2 # half points should be sampled from the polygons 184 | mask = mask[:, :, 0] 185 | # sample sparse points from the polygons (boundary) 186 | polygons = binary_mask_to_polygon(mask, tolerance=0.0) 187 | # concatenate polygons to a single list 188 | polygons_single = [] 189 | for polygon in polygons: 190 | polygons_single += polygon 191 | if len(polygons_single) != 0: 192 | # uniformly sample points from the polygon 193 | polygons_single = np.array(polygons_single).reshape(-1, 2) 194 | indexes = np.linspace(0, polygons_single.shape[0] - 1, n_polygons) 195 | indexes = list([int(i) for i in indexes]) 196 | 197 | polygons_single = polygons_single[indexes] 198 | sampled_polygons = [(x[0], x[1]) for x in polygons_single] 199 | else: 200 | return None 201 | 202 | # sample sparse points from the mask 203 | n_inside_points = n_points - len(sampled_polygons) 204 | inside_points = sample_uniform_sparse_points(mask, n_inside_points) 205 | 206 | # combine inside_points and sampled_polygons 207 | xy_points = inside_points + sampled_polygons 208 | 209 | # order the points by their distance to (0, 0) 210 | center = np.array([0, 0]) 211 | xy_points = sorted(xy_points, key=lambda x: np.linalg.norm( 212 | np.array(x) - center)) # np.linalg.norm 213 | 214 | # return the sampled points 215 | sampled_points = [] 216 | for x in xy_points: 217 | sampled_points.append(x[0]) 218 | sampled_points.append(x[1]) 219 | return sampled_points 220 | 221 | 222 | def get_polygons_from_mask(mask, tolerance=0, n_polygon_points=256): 223 | mask = binary_mask_to_int(mask) 224 | return_polygons = True 225 | if return_polygons: 226 | # convert float mask to polygons 227 | polygons = binary_mask_to_polygon(mask[:, :, 0], tolerance=tolerance) 228 | 229 | # return all zeros if there is no polygon 230 | if len(polygons) == 0: 231 | polygons = [0 for _ in 
233 |             return polygons
234 | 
235 |         # concatenate polygons to a single list
236 |         polygon = []
237 |         for p in polygons:
238 |             polygon += p
239 | 
240 |         # uniformly sample points from the polygon
241 |         polygon = np.array(polygon).reshape(-1, 2)
242 |         indexes = np.linspace(0, polygon.shape[0] - 1, n_polygon_points)
243 |         indexes = [int(i) for i in indexes]
244 |         polygon = polygon[indexes].reshape(-1)
245 | 
246 |         return polygon
247 |     else:
248 |         sampled_points = sample_sparse_points(mask, n_polygon_points)
249 |         return sampled_points
250 | 
251 | 
252 | def decode_item(item):
253 |     # nothing to decode if the image is already a PIL image
254 |     if "image" in item and isinstance(item['image'], Image.Image):
255 |         return item
256 | 
257 |     item['image'] = decode_base64_to_pillow(item['image'])
258 |     segs = []
259 |     for anno in item['annos']:
260 |         anno['image_embedding_before'] = decode_tensor_from_string(
261 |             anno['image_embedding_before'])
262 |         anno['text_embedding_before'] = decode_tensor_from_string(
263 |             anno['text_embedding_before'])
264 |         anno['image_embedding_after'] = decode_tensor_from_string(
265 |             anno['image_embedding_after'])
266 |         anno['text_embedding_after'] = decode_tensor_from_string(
267 |             anno['text_embedding_after'])
268 |         if "blip_clip_embeddings" in anno:
269 |             anno['blip_clip_embeddings'] = decode_tensor_from_string(
270 |                 anno['blip_clip_embeddings'])
271 |         if 'mask' in anno:
272 |             # sample k random points from the mask
273 |             n_scribble_points = 20
274 |             rle = anno['mask']
275 |             binary_mask = decodeToBinaryMask(rle)
276 |             segs.append(binary_mask)
277 |             if "scribbles" in anno:
278 |                 anno['scribbles'] = anno["scribbles"]
279 |             else:
280 |                 anno['scribbles'] = sample_random_points_from_mask(
281 |                     binary_mask, n_scribble_points)
282 |             # convert mask to polygon
283 |             n_polygon_points = 256
284 |             polygons = sample_sparse_points_from_mask(
285 |                 binary_mask, k=n_polygon_points)
286 |             if polygons is not None:
287 |                 anno['polygons'] = polygons
288 |             else:
289 |                 anno['polygons'] = [0 for _ in range(n_polygon_points * 2)]
290 |     if len(segs) > 0:
291 |         item['segs'] = np.stack(segs).astype(np.float32).squeeze()
292 |     return item
293 | 
294 | 
295 | def check_unique(images, fields):
296 |     for field in fields:
297 |         temp_list = []
298 |         for img_info in images:
299 |             temp_list.append(img_info[field])
300 |         assert len(set(temp_list)) == len(temp_list), field
301 | 
302 | 
303 | def clean_data(data):
304 |     for data_info in data:
305 |         data_info.pop("original_img_id", None)
306 |         data_info.pop("original_id", None)
307 |         # sentence id for each image (multiple sentences for one image)
308 |         data_info.pop("sentence_id", None)
309 |         data_info.pop("dataset_name", None)
310 |         data_info.pop("data_source", None)
311 |         data_info["data_id"] = data_info.pop("id")
312 | 
313 | 
314 | def clean_annotations(annotations):
315 |     for anno_info in annotations:
316 |         anno_info.pop("iscrowd", None)
317 |         anno_info.pop("category_id", None)
318 |         anno_info.pop("area", None)
319 |         anno_info["data_id"] = anno_info.pop("image_id")
320 | 
321 | 
322 | def draw_box(img, boxes):
323 |     draw = ImageDraw.Draw(img)
324 |     for box in boxes:
325 |         draw.rectangle([box[0], box[1], box[2], box[3]],
326 |                        outline="red", width=2)  # x0 y0 x1 y1
327 |     return img
328 | 
329 | 
330 | def xyhw2xyxy(box):
331 |     x0, y0, w, h = box
332 |     return [x0, y0, x0 + w, y0 + h]
333 | 
334 | 
335 | def make_a_sentence_count_nums(obj_names):
336 |     # count the number of duplicated strings in the list
337 |     # ["dog", "dog", "cat"]
338 |     obj_names = dict(Counter(obj_names))
339 |     # {'dog': 2, 'cat': 1}
340 |     caption = ""
341 |     for item in obj_names:
342 |         caption += str(obj_names[item]) + " " + item + ", "
343 |     return caption[:-2]
344 | 
345 | 
346 | def make_a_sentence(obj_names, clean=False):
347 | 
348 |     if clean:
349 |         obj_names = [name[:-6] if ("-other" in name)
350 |                      else name for name in obj_names]
351 | 
352 |     caption = ""
353 |     tokens_positive = []
354 |     for obj_name in obj_names:
355 |         start_len = len(caption)
356 |         caption += obj_name
357 |         end_len = len(caption)
358 |         caption += ", "
359 |         tokens_positive.append(
360 |             # in a real caption, positive tokens can be disjoint, hence the list of lists
361 |             [[start_len, end_len]]
362 |         )
363 |     caption = caption[:-2]  # remove the last ", "
364 | 
365 |     return caption  # , tokens_positive
366 | 
367 | 
368 | def mask_for_random_drop_text_or_image_feature(masks, random_drop_embedding):
369 |     """
370 |     The input masks indicate which grounding tokens are valid for this image,
371 |     e.g., 1,1,1,1,0,0,0,0,0,0...
372 | 
373 |     If random_drop_embedding == 'both', we randomly drop either the image or
374 |     the text feature for each token,
375 |     but we always make sure at least one feature is kept.
376 |     In other words, the following masks are not valid
377 |     (because the second object would have no feature at all):
378 |     image: 1,0,1,1,0,0,0,0,0
379 |     text:  1,0,0,0,0,0,0,0,0
380 | 
381 |     If random_drop_embedding == 'image', we randomly drop the image feature
382 |     and always keep the text one.
383 | 
384 |     """
385 |     N = masks.shape[0]
386 |     image_masks, text_masks = masks, masks  # default: keep both features (e.g. 'none')
387 |     if random_drop_embedding == 'both':
388 |         temp_mask = torch.ones(2, N)
389 |         for i in range(N):
390 |             if random.uniform(0, 1) < 0.5:  # else keep both features
391 |                 # randomly choose to drop the image or the text feature
392 |                 idx = random.sample([0, 1], 1)[0]
393 |                 temp_mask[idx, i] = 0
394 |         image_masks = temp_mask[0] * masks
395 |         text_masks = temp_mask[1] * masks
396 | 
397 |     if random_drop_embedding == 'image':
398 |         image_masks = masks * (torch.rand(N) > 0.5) * 1
399 |         text_masks = masks
400 | 
401 |     return image_masks, text_masks
402 | 
403 | 
404 | def project(x, projection_matrix):
405 |     """
406 |     x (Batch x 768) should be the penultimate feature of CLIP (before projection).
407 |     projection_matrix (768 x 768) is the CLIP projection matrix; it should be the weight.data of the Linear layer
408 |     defined in CLIP (out_dim, in_dim), which is why we apply the transpose below.
409 |     This function returns the CLIP feature (without normalization).
410 |     """
411 |     return x @ torch.transpose(projection_matrix, 0, 1)
412 | 
413 | 
414 | def inv_project(y, projection_matrix):
415 |     """
416 |     y (Batch x 768) should be the CLIP feature (after projection).
417 |     projection_matrix (768 x 768) is the CLIP projection matrix; it should be the weight.data of the Linear layer
418 |     defined in CLIP (out_dim, in_dim).
419 |     This function returns the CLIP penultimate feature.
420 | 
421 |     Note: to get the correct penultimate feature, the input y should not be normalized.
422 |     If it is normalized, the result will be scaled by the CLIP feature norm, which is unknown.
423 |     """
424 |     return y @ torch.transpose(torch.linalg.inv(projection_matrix), 0, 1)
425 | 
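426 | 
427 | if __name__ == "__main__":
428 |     # Illustrative sanity check (an added sketch, not part of the original module):
429 |     # it verifies that inv_project undoes project, as the docstrings above state.
430 |     # A random, well-conditioned matrix stands in for the real CLIP projection
431 |     # weights (the weight.data of CLIP's projection Linear layer mentioned above).
432 |     torch.manual_seed(0)
433 |     penultimate = torch.randn(4, 768, dtype=torch.float64)
434 |     W = torch.eye(768, dtype=torch.float64) + 0.01 * torch.randn(768, 768, dtype=torch.float64)
435 |     projected = project(penultimate, W)       # penultimate feature -> CLIP feature
436 |     recovered = inv_project(projected, W)     # CLIP feature -> penultimate feature
437 |     assert torch.allclose(recovered, penultimate, atol=1e-6)
438 |     print("project/inv_project round-trip OK")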
--------------------------------------------------------------------------------
/utils/model_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import folder_paths
3 | 
4 | import comfy.utils
5 | 
6 | from .. import constants
7 | 
8 | 
9 | def get_model_dir(model_dir):
10 |     root_path = folder_paths.models_dir
11 |     path = os.path.join(root_path, constants.INSTANCE_MODELS_DIR, model_dir)
12 |     return path
13 | 
14 | 
15 | def get_model_list(model_dir) -> list[str]:
16 |     path = get_model_dir(model_dir)
17 |     return os.listdir(path)
18 | 
19 | 
20 | def load_checkpoint(model_dir, filename):
21 |     checkpoint_path = os.path.join(get_model_dir(model_dir), filename)
22 |     checkpoint = comfy.utils.load_torch_file(checkpoint_path, safe_load=True)
23 |     return checkpoint
24 | 
--------------------------------------------------------------------------------
/utils/prompt_utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | 
4 | def extract_prompts(input_string):
5 |     # Regex for entries of the form "<number>.<name>": "<text>"; the number may be negative
6 |     pattern = r"\"(-?\d+)\.([^\"]+)\":\s*\"([^\"]+)\""
7 | 
8 |     # Find all matches using the pattern
9 |     matches = re.findall(pattern, input_string)
10 | 
11 |     # Convert matches to a list of tuples (number, name, text)
12 |     result = [(int(number), name.strip(), text)
13 |               for number, name, text in matches]
14 | 
15 |     return result
16 | 
--------------------------------------------------------------------------------