├── .github └── workflows │ └── publish.yml ├── .gitignore ├── README.md ├── __init__.py ├── docs ├── demo-pics │ ├── Selection_001.png │ ├── Selection_002.png │ └── Selection_003.png └── workflow-examples │ ├── img2img.json │ ├── inpaint.json │ └── txt2img.json ├── pyproject.toml ├── requirements.txt ├── src ├── __init__.py ├── blip_img2txt.py ├── img2txt_node.py ├── llava_img2txt.py └── mini_cpm_img2txt.py └── web └── show-output-text.js /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Comfy registry 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - master 7 | paths: 8 | - "pyproject.toml" 9 | 10 | permissions: 11 | issues: write 12 | 13 | jobs: 14 | publish-node: 15 | name: Publish Custom Node to registry 16 | runs-on: ubuntu-latest 17 | if: ${{ github.repository_owner == 'christian-byrne' }} 18 | steps: 19 | - name: Check out code 20 | uses: actions/checkout@v4 21 | - name: Publish Custom Node 22 | uses: Comfy-Org/publish-node-action@v1 23 | with: 24 | ## Add your own personal access token to your GitHub repository secrets and reference it here. 25 | personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .python-version 2 | **/*.pyc 3 | **/__pycache__/ 4 | **/__pycache__ 5 | temp/** 6 | temp.txt 7 | demo-workflows 8 | test-temp 9 | **/venv 10 | todo.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | **Auto-generate caption (BLIP)**: 4 | 5 | ![alt text](docs/demo-pics/Selection_003.png) 6 | 7 | **Using it to automate the img2img process (BLIP and Llava)**: 8 | 9 | ![alt text](docs/demo-pics/Selection_002.png) 10 | 11 | 12 | ## Requirements/Dependencies 13 | 14 | For Llava: 15 | 16 | ``` 17 | bitsandbytes>=0.43.0 18 | accelerate>=0.3.0 19 | ``` 20 | 21 | For MiniCPM: 22 | 23 | ``` 24 | transformers<=4.41.2 25 | timm>=1.0.7 26 | sentencepiece 27 | ``` 28 | 29 | ## Installation 30 | 31 | 32 | - `cd` into the `ComfyUI/custom_nodes` directory 33 | - `git clone` this repo 34 | - `cd img2txt-comfyui-nodes` 35 | - `pip install -r requirements.txt` 36 | - Models are downloaded automatically the first time they are used. If you never toggle a model on in the UI, it is never downloaded. 37 | - To ask a list of specific questions about the image, use the Llava or MiniCPM models. Enter one question per line in the multiline text input box (see the example after the Tips section below). 38 | 39 | ## Support for Chinese 40 | 41 | - The `MiniCPM` model works with Chinese text input without any additional configuration. The output will also be in Chinese. 42 | - "MiniCPM-V 2.0 supports strong bilingual multimodal capabilities in both English and Chinese. This is enabled by generalizing multimodal capabilities across languages, a technique from VisCPM" 43 | - Please support the creators of MiniCPM [here](https://github.com/OpenBMB/MiniCPM-V) 44 | 45 | ## Tips 46 | 47 | - The multi-line input can be used to ask any type of question, including very specific or complex questions about an image. 48 | - For a caption that will be fed back into a txt2img or img2img prompt, it is usually best to ask only one or two questions, requesting a general description of the image and its most salient features and styles.
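As an example of the question input described above: each non-empty line of the multiline box becomes one question for the selected model. Below is a minimal sketch, simplified from the splitting logic in `src/img2txt_node.py`; the questions themselves are only illustrations.

```python
# Sketch of how the multiline question input is interpreted (simplified from
# src/img2txt_node.py): each non-empty line becomes one question for the model.
prompt_questions = (
    "What is the subject of this image?\n"
    "What artistic styles is this reminiscent of?\n"
    "\n"  # blank lines are ignored
    "What is the lighting in this image?"
)

questions = [q.strip() for q in prompt_questions.split("\n") if q.strip()]
print(questions)
# ['What is the subject of this image?',
#  'What artistic styles is this reminiscent of?',
#  'What is the lighting in this image?']
```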
49 | 50 | ## Model Locations/Paths 51 | 52 | - Models are downloaded automatically through the Hugging Face cache system and the transformers `from_pretrained` method, so no manual installation of models is necessary. 53 | - If you really want to download the models manually, please refer to [Hugging Face's documentation concerning the cache system](https://huggingface.co/docs/transformers/main/en/installation#cache-setup). Here is the relevant excerpt, followed below by a minimal pre-download sketch: 54 | - > Pretrained models are downloaded and locally cached at `~/.cache/huggingface/hub`. This is the default directory given by the shell environment variable TRANSFORMERS_CACHE. On Windows, the default directory is given by `C:\Users\username\.cache\huggingface\hub`. You can change the shell environment variables shown below - in order of priority - to specify a different cache directory: 55 | > - Shell environment variable (default): HUGGINGFACE_HUB_CACHE or TRANSFORMERS_CACHE. 56 | > - Shell environment variable: HF_HOME. 57 | > - Shell environment variable: XDG_CACHE_HOME + /huggingface. 58 |
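If you would rather fetch the weights ahead of time, something like the following should populate the same cache that `from_pretrained` reads from. This is a sketch only, not part of the node; the model IDs come from the defaults in `src/` (note that `src/blip_img2txt.py` defaults to the `large` BLIP checkpoint).

```python
# Optional pre-download sketch: populate the Hugging Face cache ahead of time.
# Not required - the node downloads each model automatically on first use.
from huggingface_hub import snapshot_download

MODEL_IDS = [
    "Salesforce/blip-image-captioning-large",  # BLIP default in src/blip_img2txt.py
    "llava-hf/llava-1.5-7b-hf",                # Llava (src/llava_img2txt.py)
    "openbmb/MiniCPM-V-2",                     # MiniCPM (src/mini_cpm_img2txt.py)
]

for repo_id in MODEL_IDS:
    # Defaults to ~/.cache/huggingface/hub; override with cache_dir="..." if needed.
    snapshot_download(repo_id=repo_id)
```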
59 | 60 | ## Models 61 | 62 | - [MiniCPM](https://huggingface.co/openbmb/MiniCPM-V-2/tree/main) (Chinese & English) 63 | - **Title**: MiniCPM-V-2 - Strong multimodal large language model for efficient end-side deployment 64 | - **Datasets**: HuggingFaceM4/VQAv2, RLHF-V-Dataset, LLaVA-Instruct-150K 65 | - **Size**: ~ 6.8GB 66 | - [Salesforce - blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base) 67 | - **Title**: BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation 68 | - **Size**: ~ 2GB 69 | - **Dataset**: COCO (The MS COCO dataset is a large-scale object detection, image segmentation, and captioning dataset published by Microsoft) 70 | - [llava - llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf) 71 | - **Title**: LLaVA: Large Language Models for Vision and Language Tasks 72 | - **Size**: ~ 15GB 73 | - **Dataset**: 558K filtered image-text pairs from LAION/CC/SBU, captioned by BLIP, 158K GPT-generated multimodal instruction-following data, 450K academic-task-oriented VQA data mixture, 40K ShareGPT data. 74 | 75 | 76 | ## Prompts 77 | 78 | 79 | This is a guide to the format of an "ideal" txt2img prompt (when using BLIP). Use it as the basis for the questions you ask the img2txt models. 80 | 81 | - **Subject** - You can specify a region; write the most about the subject. 82 | - **Medium** - Material used to make the artwork. Some examples are illustration, oil painting, 3D rendering, and photography. Medium has a strong effect because one keyword alone can dramatically change the style. 83 | - **Style** - Artistic style of the image. Examples include impressionist, surrealist, pop art, etc. 84 | - **Artists** - Artist names are strong modifiers. They allow you to dial in the exact style using a particular artist as a reference. It is also common to use multiple artist names to blend their styles. For example, you might add Stanley Artgerm Lau, a superhero comic artist, and Alphonse Mucha, a 19th-century portrait painter. 85 | - **Website** - Niche graphic websites such as Artstation and DeviantArt aggregate many images of distinct genres. Using them in a prompt is a sure way to steer the image toward these styles. 86 | - **Resolution** - Resolution represents how sharp and detailed the image is. For example, add the keywords highly detailed and sharp focus. 87 | - **Environment** - The setting or background the subject is placed in. 88 | - **Additional details and objects** - Additional details are sweeteners added to modify an image. For example, add sci-fi, stunningly beautiful, and dystopian to give the image some vibe. 89 | - **Composition** - Camera type, detail, cinematography, blur, depth-of-field. 90 | - **Color/Warmth** - You can control the overall color of the image by adding color keywords. The colors you specify may appear as a tone or in objects. 91 | - **Lighting** - Any photographer would tell you lighting is a key factor in creating successful images. Lighting keywords can have a huge effect on how the image looks. For example, add cinematic lighting and dark to the prompt. 92 | 93 | 94 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .src.img2txt_node import Img2TxtNode 2 | 3 | NODE_CLASS_MAPPINGS = { 4 | "img2txt BLIP/Llava Multimodel Tagger": Img2TxtNode, 5 | } 6 | NODE_DISPLAY_NAME_MAPPINGS = { 7 | "img2txt BLIP/Llava Multimodel Tagger": "Image to Text - Auto Caption" 8 | } 9 | WEB_DIRECTORY = "./web" 10 | -------------------------------------------------------------------------------- /docs/demo-pics/Selection_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/christian-byrne/img2txt-comfyui-nodes/80e638a4edeccf6ddbea5711ca64f1855581e938/docs/demo-pics/Selection_001.png -------------------------------------------------------------------------------- /docs/demo-pics/Selection_002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/christian-byrne/img2txt-comfyui-nodes/80e638a4edeccf6ddbea5711ca64f1855581e938/docs/demo-pics/Selection_002.png -------------------------------------------------------------------------------- /docs/demo-pics/Selection_003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/christian-byrne/img2txt-comfyui-nodes/80e638a4edeccf6ddbea5711ca64f1855581e938/docs/demo-pics/Selection_003.png -------------------------------------------------------------------------------- /docs/workflow-examples/img2img.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 51, 3 | "last_link_id": 60, 4 | "nodes": [ 5 | { 6 | "id": 41, 7 | "type": "CLIPTextEncode", 8 | "pos": [ 9 | 1055, 10 | 571 11 | ], 12 | "size": { 13 | "0": 348.9403381347656, 14 | "1": 56.439388275146484 15 | }, 16 | "flags": {}, 17 | "order": 5, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "clip", 22 | "type": "CLIP", 23 | "link": 50 24 | }, 25 | { 26 | "name": "text", 27 | "type": "STRING", 28 | "link": 60, 29 | "widget": { 30 | "name": "text" 31 | } 32 | } 33 | ], 34 | "outputs": [ 35 | { 36 | "name": "CONDITIONING", 37 | "type": "CONDITIONING", 38 | "links": [ 39 | 44 40 | ], 41 | "shape": 3, 42 | "slot_index": 0 43 | } 44 | ], 45 | "properties": { 46 | "Node name for S&R": "CLIPTextEncode" 47 | }, 48 | "widgets_values": [ 49 | "" 50 | ] 51 | }, 52 | { 53 | "id": 39, 54 | "type": "KSampler", 55 | "pos": [ 56 | 1587, 57 | 982 58 | ], 59 | "size": { 60 | "0": 315, 61 | "1": 262 62 | }, 63 | "flags": {}, 64 | "order": 6, 65 | "mode": 0, 66 | "inputs": [ 67 | { 68 | "name": "model", 69 | "type": "MODEL", 70 | "link": 42 71 | }, 72 | { 73 | "name":
"positive", 74 | "type": "CONDITIONING", 75 | "link": 44 76 | }, 77 | { 78 | "name": "negative", 79 | "type": "CONDITIONING", 80 | "link": 45 81 | }, 82 | { 83 | "name": "latent_image", 84 | "type": "LATENT", 85 | "link": 58 86 | } 87 | ], 88 | "outputs": [ 89 | { 90 | "name": "LATENT", 91 | "type": "LATENT", 92 | "links": [ 93 | 48 94 | ], 95 | "shape": 3, 96 | "slot_index": 0 97 | } 98 | ], 99 | "properties": { 100 | "Node name for S&R": "KSampler" 101 | }, 102 | "widgets_values": [ 103 | 290872458059323, 104 | "randomize", 105 | 20, 106 | 8, 107 | "euler", 108 | "normal", 109 | 1 110 | ] 111 | }, 112 | { 113 | "id": 45, 114 | "type": "VAEDecode", 115 | "pos": [ 116 | 1998, 117 | 1018 118 | ], 119 | "size": { 120 | "0": 210, 121 | "1": 46 122 | }, 123 | "flags": {}, 124 | "order": 7, 125 | "mode": 0, 126 | "inputs": [ 127 | { 128 | "name": "samples", 129 | "type": "LATENT", 130 | "link": 48 131 | }, 132 | { 133 | "name": "vae", 134 | "type": "VAE", 135 | "link": 49 136 | } 137 | ], 138 | "outputs": [ 139 | { 140 | "name": "IMAGE", 141 | "type": "IMAGE", 142 | "links": [ 143 | 55 144 | ], 145 | "shape": 3, 146 | "slot_index": 0 147 | } 148 | ], 149 | "properties": { 150 | "Node name for S&R": "VAEDecode" 151 | } 152 | }, 153 | { 154 | "id": 48, 155 | "type": "PreviewImage", 156 | "pos": [ 157 | 2039, 158 | 1262 159 | ], 160 | "size": { 161 | "0": 210, 162 | "1": 246 163 | }, 164 | "flags": {}, 165 | "order": 8, 166 | "mode": 0, 167 | "inputs": [ 168 | { 169 | "name": "images", 170 | "type": "IMAGE", 171 | "link": 55 172 | } 173 | ], 174 | "properties": { 175 | "Node name for S&R": "PreviewImage" 176 | } 177 | }, 178 | { 179 | "id": 42, 180 | "type": "CLIPTextEncode", 181 | "pos": [ 182 | 1056, 183 | 683 184 | ], 185 | "size": { 186 | "0": 352.9139404296875, 187 | "1": 113.16606140136719 188 | }, 189 | "flags": {}, 190 | "order": 3, 191 | "mode": 0, 192 | "inputs": [ 193 | { 194 | "name": "clip", 195 | "type": "CLIP", 196 | "link": 51 197 | } 198 | ], 199 | "outputs": [ 200 | { 201 | "name": "CONDITIONING", 202 | "type": "CONDITIONING", 203 | "links": [ 204 | 45 205 | ], 206 | "shape": 3, 207 | "slot_index": 0 208 | } 209 | ], 210 | "properties": { 211 | "Node name for S&R": "CLIPTextEncode" 212 | }, 213 | "widgets_values": [ 214 | "text, watermark" 215 | ] 216 | }, 217 | { 218 | "id": 50, 219 | "type": "VAEEncode", 220 | "pos": [ 221 | 1119, 222 | 1329 223 | ], 224 | "size": { 225 | "0": 201.4841766357422, 226 | "1": 55.59581756591797 227 | }, 228 | "flags": {}, 229 | "order": 4, 230 | "mode": 0, 231 | "inputs": [ 232 | { 233 | "name": "pixels", 234 | "type": "IMAGE", 235 | "link": 56 236 | }, 237 | { 238 | "name": "vae", 239 | "type": "VAE", 240 | "link": 57 241 | } 242 | ], 243 | "outputs": [ 244 | { 245 | "name": "LATENT", 246 | "type": "LATENT", 247 | "links": [ 248 | 58 249 | ], 250 | "shape": 3, 251 | "slot_index": 0 252 | } 253 | ], 254 | "properties": { 255 | "Node name for S&R": "VAEEncode" 256 | } 257 | }, 258 | { 259 | "id": 11, 260 | "type": "LoadImage", 261 | "pos": [ 262 | -135, 263 | 907 264 | ], 265 | "size": { 266 | "0": 670, 267 | "1": 460 268 | }, 269 | "flags": {}, 270 | "order": 0, 271 | "mode": 0, 272 | "outputs": [ 273 | { 274 | "name": "IMAGE", 275 | "type": "IMAGE", 276 | "links": [ 277 | 56, 278 | 59 279 | ], 280 | "shape": 3, 281 | "slot_index": 0 282 | }, 283 | { 284 | "name": "MASK", 285 | "type": "MASK", 286 | "links": [], 287 | "shape": 3, 288 | "slot_index": 1 289 | } 290 | ], 291 | "properties": { 292 | "Node name for S&R": "LoadImage" 293 | }, 294 | 
"widgets_values": [ 295 | "example.png", 296 | "image" 297 | ] 298 | }, 299 | { 300 | "id": 40, 301 | "type": "CheckpointLoaderSimple", 302 | "pos": [ 303 | 1124, 304 | 1019 305 | ], 306 | "size": { 307 | "0": 315, 308 | "1": 98 309 | }, 310 | "flags": {}, 311 | "order": 1, 312 | "mode": 0, 313 | "outputs": [ 314 | { 315 | "name": "MODEL", 316 | "type": "MODEL", 317 | "links": [ 318 | 42 319 | ], 320 | "shape": 3, 321 | "slot_index": 0 322 | }, 323 | { 324 | "name": "CLIP", 325 | "type": "CLIP", 326 | "links": [ 327 | 50, 328 | 51 329 | ], 330 | "shape": 3, 331 | "slot_index": 1 332 | }, 333 | { 334 | "name": "VAE", 335 | "type": "VAE", 336 | "links": [ 337 | 49, 338 | 57 339 | ], 340 | "shape": 3, 341 | "slot_index": 2 342 | } 343 | ], 344 | "properties": { 345 | "Node name for S&R": "CheckpointLoaderSimple" 346 | }, 347 | "widgets_values": [ 348 | "dreamshaper_8.safetensors" 349 | ] 350 | }, 351 | { 352 | "id": 51, 353 | "type": "img2txt BLIP/Llava Multimodel Tagger", 354 | "pos": [ 355 | 605, 356 | 881 357 | ], 358 | "size": { 359 | "0": 427.2057800292969, 360 | "1": 476.26934814453125 361 | }, 362 | "flags": {}, 363 | "order": 2, 364 | "mode": 0, 365 | "inputs": [ 366 | { 367 | "name": "input_image", 368 | "type": "IMAGE", 369 | "link": 59 370 | } 371 | ], 372 | "outputs": [ 373 | { 374 | "name": "caption", 375 | "type": "STRING", 376 | "links": [ 377 | 60 378 | ], 379 | "shape": 3, 380 | "slot_index": 0 381 | } 382 | ], 383 | "properties": { 384 | "Node name for S&R": "img2txt BLIP/Llava Multimodel Tagger" 385 | }, 386 | "widgets_values": [ 387 | true, 388 | false, 389 | false, 390 | false, 391 | "a photograph of", 392 | "What is the subject and background of this image?", 393 | 0.7000000000000001, 394 | 1.26, 395 | 36, 396 | 128, 397 | 5, 398 | "watermark, text, writing", 399 | "a photograph of a girl dressed up, in pink dress and bright blue eyes poses in the grass with arms spread out in front of her face, holding an umbrella on a sky, " 400 | ], 401 | "color": "#322", 402 | "bgcolor": "#533" 403 | } 404 | ], 405 | "links": [ 406 | [ 407 | 42, 408 | 40, 409 | 0, 410 | 39, 411 | 0, 412 | "MODEL" 413 | ], 414 | [ 415 | 44, 416 | 41, 417 | 0, 418 | 39, 419 | 1, 420 | "CONDITIONING" 421 | ], 422 | [ 423 | 45, 424 | 42, 425 | 0, 426 | 39, 427 | 2, 428 | "CONDITIONING" 429 | ], 430 | [ 431 | 48, 432 | 39, 433 | 0, 434 | 45, 435 | 0, 436 | "LATENT" 437 | ], 438 | [ 439 | 49, 440 | 40, 441 | 2, 442 | 45, 443 | 1, 444 | "VAE" 445 | ], 446 | [ 447 | 50, 448 | 40, 449 | 1, 450 | 41, 451 | 0, 452 | "CLIP" 453 | ], 454 | [ 455 | 51, 456 | 40, 457 | 1, 458 | 42, 459 | 0, 460 | "CLIP" 461 | ], 462 | [ 463 | 55, 464 | 45, 465 | 0, 466 | 48, 467 | 0, 468 | "IMAGE" 469 | ], 470 | [ 471 | 56, 472 | 11, 473 | 0, 474 | 50, 475 | 0, 476 | "IMAGE" 477 | ], 478 | [ 479 | 57, 480 | 40, 481 | 2, 482 | 50, 483 | 1, 484 | "VAE" 485 | ], 486 | [ 487 | 58, 488 | 50, 489 | 0, 490 | 39, 491 | 3, 492 | "LATENT" 493 | ], 494 | [ 495 | 59, 496 | 11, 497 | 0, 498 | 51, 499 | 0, 500 | "IMAGE" 501 | ], 502 | [ 503 | 60, 504 | 51, 505 | 0, 506 | 41, 507 | 1, 508 | "STRING" 509 | ] 510 | ], 511 | "groups": [], 512 | "config": {}, 513 | "extra": { 514 | "ds": { 515 | "scale": 0.9090909090909091, 516 | "offset": { 517 | "0": 304.575645264068, 518 | "1": -258.56908735931404 519 | } 520 | } 521 | }, 522 | "version": 0.4 523 | } -------------------------------------------------------------------------------- /docs/workflow-examples/inpaint.json: -------------------------------------------------------------------------------- 
1 | { 2 | "last_node_id": 61, 3 | "last_link_id": 80, 4 | "nodes": [ 5 | { 6 | "id": 45, 7 | "type": "VAEDecode", 8 | "pos": [ 9 | 1998, 10 | 1018 11 | ], 12 | "size": { 13 | "0": 210, 14 | "1": 46 15 | }, 16 | "flags": {}, 17 | "order": 10, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "samples", 22 | "type": "LATENT", 23 | "link": 71 24 | }, 25 | { 26 | "name": "vae", 27 | "type": "VAE", 28 | "link": 49 29 | } 30 | ], 31 | "outputs": [ 32 | { 33 | "name": "IMAGE", 34 | "type": "IMAGE", 35 | "links": [ 36 | 55 37 | ], 38 | "shape": 3, 39 | "slot_index": 0 40 | } 41 | ], 42 | "properties": { 43 | "Node name for S&R": "VAEDecode" 44 | } 45 | }, 46 | { 47 | "id": 42, 48 | "type": "CLIPTextEncode", 49 | "pos": [ 50 | 1056, 51 | 683 52 | ], 53 | "size": { 54 | "0": 352.9139404296875, 55 | "1": 113.16606140136719 56 | }, 57 | "flags": {}, 58 | "order": 2, 59 | "mode": 0, 60 | "inputs": [ 61 | { 62 | "name": "clip", 63 | "type": "CLIP", 64 | "link": 51 65 | } 66 | ], 67 | "outputs": [ 68 | { 69 | "name": "CONDITIONING", 70 | "type": "CONDITIONING", 71 | "links": [ 72 | 63 73 | ], 74 | "shape": 3, 75 | "slot_index": 0 76 | } 77 | ], 78 | "properties": { 79 | "Node name for S&R": "CLIPTextEncode" 80 | }, 81 | "widgets_values": [ 82 | "text, watermark" 83 | ] 84 | }, 85 | { 86 | "id": 41, 87 | "type": "CLIPTextEncode", 88 | "pos": [ 89 | 1055, 90 | 571 91 | ], 92 | "size": { 93 | "0": 348.9403381347656, 94 | "1": 56.439388275146484 95 | }, 96 | "flags": {}, 97 | "order": 6, 98 | "mode": 0, 99 | "inputs": [ 100 | { 101 | "name": "clip", 102 | "type": "CLIP", 103 | "link": 50 104 | }, 105 | { 106 | "name": "text", 107 | "type": "STRING", 108 | "link": 80, 109 | "widget": { 110 | "name": "text" 111 | } 112 | } 113 | ], 114 | "outputs": [ 115 | { 116 | "name": "CONDITIONING", 117 | "type": "CONDITIONING", 118 | "links": [ 119 | 64 120 | ], 121 | "shape": 3, 122 | "slot_index": 0 123 | } 124 | ], 125 | "properties": { 126 | "Node name for S&R": "CLIPTextEncode" 127 | }, 128 | "widgets_values": [ 129 | "" 130 | ] 131 | }, 132 | { 133 | "id": 58, 134 | "type": "PreviewImage", 135 | "pos": [ 136 | 616, 137 | 1631 138 | ], 139 | "size": { 140 | "0": 401.17840576171875, 141 | "1": 246 142 | }, 143 | "flags": {}, 144 | "order": 7, 145 | "mode": 0, 146 | "inputs": [ 147 | { 148 | "name": "images", 149 | "type": "IMAGE", 150 | "link": 73 151 | } 152 | ], 153 | "properties": { 154 | "Node name for S&R": "PreviewImage" 155 | } 156 | }, 157 | { 158 | "id": 57, 159 | "type": "MaskToImage", 160 | "pos": [ 161 | 617, 162 | 1543 163 | ], 164 | "size": { 165 | "0": 210, 166 | "1": 26 167 | }, 168 | "flags": {}, 169 | "order": 5, 170 | "mode": 0, 171 | "inputs": [ 172 | { 173 | "name": "mask", 174 | "type": "MASK", 175 | "link": 78 176 | } 177 | ], 178 | "outputs": [ 179 | { 180 | "name": "IMAGE", 181 | "type": "IMAGE", 182 | "links": [ 183 | 73 184 | ], 185 | "shape": 3, 186 | "slot_index": 0 187 | } 188 | ], 189 | "properties": { 190 | "Node name for S&R": "MaskToImage" 191 | } 192 | }, 193 | { 194 | "id": 40, 195 | "type": "CheckpointLoaderSimple", 196 | "pos": [ 197 | 1044, 198 | 1032 199 | ], 200 | "size": { 201 | "0": 315, 202 | "1": 98 203 | }, 204 | "flags": {}, 205 | "order": 0, 206 | "mode": 0, 207 | "outputs": [ 208 | { 209 | "name": "MODEL", 210 | "type": "MODEL", 211 | "links": [ 212 | 68 213 | ], 214 | "shape": 3, 215 | "slot_index": 0 216 | }, 217 | { 218 | "name": "CLIP", 219 | "type": "CLIP", 220 | "links": [ 221 | 50, 222 | 51 223 | ], 224 | "shape": 3, 225 | "slot_index": 1 226 | }, 227 | { 228 
| "name": "VAE", 229 | "type": "VAE", 230 | "links": [ 231 | 49, 232 | 69 233 | ], 234 | "shape": 3, 235 | "slot_index": 2 236 | } 237 | ], 238 | "properties": { 239 | "Node name for S&R": "CheckpointLoaderSimple" 240 | }, 241 | "widgets_values": [ 242 | "experience_70-inpainting.safetensors" 243 | ] 244 | }, 245 | { 246 | "id": 48, 247 | "type": "PreviewImage", 248 | "pos": [ 249 | 2039, 250 | 1262 251 | ], 252 | "size": { 253 | "0": 295.2332458496094, 254 | "1": 293.2945251464844 255 | }, 256 | "flags": {}, 257 | "order": 11, 258 | "mode": 0, 259 | "inputs": [ 260 | { 261 | "name": "images", 262 | "type": "IMAGE", 263 | "link": 55 264 | } 265 | ], 266 | "properties": { 267 | "Node name for S&R": "PreviewImage" 268 | } 269 | }, 270 | { 271 | "id": 56, 272 | "type": "KSampler", 273 | "pos": [ 274 | 1642, 275 | 820 276 | ], 277 | "size": { 278 | "0": 315, 279 | "1": 262 280 | }, 281 | "flags": {}, 282 | "order": 9, 283 | "mode": 0, 284 | "inputs": [ 285 | { 286 | "name": "model", 287 | "type": "MODEL", 288 | "link": 68 289 | }, 290 | { 291 | "name": "positive", 292 | "type": "CONDITIONING", 293 | "link": 66 294 | }, 295 | { 296 | "name": "negative", 297 | "type": "CONDITIONING", 298 | "link": 67 299 | }, 300 | { 301 | "name": "latent_image", 302 | "type": "LATENT", 303 | "link": 65 304 | } 305 | ], 306 | "outputs": [ 307 | { 308 | "name": "LATENT", 309 | "type": "LATENT", 310 | "links": [ 311 | 71 312 | ], 313 | "shape": 3, 314 | "slot_index": 0 315 | } 316 | ], 317 | "properties": { 318 | "Node name for S&R": "KSampler" 319 | }, 320 | "widgets_values": [ 321 | 492464952856155, 322 | "randomize", 323 | 30, 324 | 7, 325 | "dpmpp_2m_sde_gpu", 326 | "normal", 327 | 0.8 328 | ] 329 | }, 330 | { 331 | "id": 55, 332 | "type": "ImageColorToMask", 333 | "pos": [ 334 | 610, 335 | 1425 336 | ], 337 | "size": { 338 | "0": 315, 339 | "1": 58 340 | }, 341 | "flags": {}, 342 | "order": 3, 343 | "mode": 0, 344 | "inputs": [ 345 | { 346 | "name": "image", 347 | "type": "IMAGE", 348 | "link": 61 349 | } 350 | ], 351 | "outputs": [ 352 | { 353 | "name": "MASK", 354 | "type": "MASK", 355 | "links": [ 356 | 77, 357 | 78 358 | ], 359 | "shape": 3, 360 | "slot_index": 0 361 | } 362 | ], 363 | "properties": { 364 | "Node name for S&R": "ImageColorToMask" 365 | }, 366 | "widgets_values": [ 367 | 6198527 368 | ] 369 | }, 370 | { 371 | "id": 54, 372 | "type": "InpaintModelConditioning", 373 | "pos": [ 374 | 1289, 375 | 1377 376 | ], 377 | "size": { 378 | "0": 216.59999084472656, 379 | "1": 106 380 | }, 381 | "flags": {}, 382 | "order": 8, 383 | "mode": 0, 384 | "inputs": [ 385 | { 386 | "name": "positive", 387 | "type": "CONDITIONING", 388 | "link": 64 389 | }, 390 | { 391 | "name": "negative", 392 | "type": "CONDITIONING", 393 | "link": 63 394 | }, 395 | { 396 | "name": "vae", 397 | "type": "VAE", 398 | "link": 69 399 | }, 400 | { 401 | "name": "pixels", 402 | "type": "IMAGE", 403 | "link": 70 404 | }, 405 | { 406 | "name": "mask", 407 | "type": "MASK", 408 | "link": 77 409 | } 410 | ], 411 | "outputs": [ 412 | { 413 | "name": "positive", 414 | "type": "CONDITIONING", 415 | "links": [ 416 | 66 417 | ], 418 | "shape": 3, 419 | "slot_index": 0 420 | }, 421 | { 422 | "name": "negative", 423 | "type": "CONDITIONING", 424 | "links": [ 425 | 67 426 | ], 427 | "shape": 3, 428 | "slot_index": 1 429 | }, 430 | { 431 | "name": "latent", 432 | "type": "LATENT", 433 | "links": [ 434 | 65 435 | ], 436 | "shape": 3, 437 | "slot_index": 2 438 | } 439 | ], 440 | "properties": { 441 | "Node name for S&R": "InpaintModelConditioning" 
442 | } 443 | }, 444 | { 445 | "id": 11, 446 | "type": "LoadImage", 447 | "pos": [ 448 | -135, 449 | 907 450 | ], 451 | "size": { 452 | "0": 670, 453 | "1": 460 454 | }, 455 | "flags": {}, 456 | "order": 1, 457 | "mode": 0, 458 | "outputs": [ 459 | { 460 | "name": "IMAGE", 461 | "type": "IMAGE", 462 | "links": [ 463 | 61, 464 | 70, 465 | 79 466 | ], 467 | "shape": 3, 468 | "slot_index": 0 469 | }, 470 | { 471 | "name": "MASK", 472 | "type": "MASK", 473 | "links": [], 474 | "shape": 3, 475 | "slot_index": 1 476 | } 477 | ], 478 | "properties": { 479 | "Node name for S&R": "LoadImage" 480 | }, 481 | "widgets_values": [ 482 | "example.png", 483 | "image" 484 | ] 485 | }, 486 | { 487 | "id": 61, 488 | "type": "img2txt BLIP/Llava Multimodel Tagger", 489 | "pos": [ 490 | 599, 491 | 886 492 | ], 493 | "size": [ 494 | 414.8329491017887, 495 | 453.3791344354013 496 | ], 497 | "flags": {}, 498 | "order": 4, 499 | "mode": 0, 500 | "inputs": [ 501 | { 502 | "name": "input_image", 503 | "type": "IMAGE", 504 | "link": 79 505 | } 506 | ], 507 | "outputs": [ 508 | { 509 | "name": "caption", 510 | "type": "STRING", 511 | "links": [ 512 | 80 513 | ], 514 | "shape": 3, 515 | "slot_index": 0 516 | } 517 | ], 518 | "properties": { 519 | "Node name for S&R": "img2txt BLIP/Llava Multimodel Tagger" 520 | }, 521 | "widgets_values": [ 522 | true, 523 | false, 524 | false, 525 | false, 526 | "a photograph of", 527 | "What is the subject of this image?\n", 528 | 0.8, 529 | 1.2, 530 | 36, 531 | 128, 532 | 5, 533 | "watermark, text, writing" 534 | ], 535 | "color": "#322", 536 | "bgcolor": "#533" 537 | } 538 | ], 539 | "links": [ 540 | [ 541 | 49, 542 | 40, 543 | 2, 544 | 45, 545 | 1, 546 | "VAE" 547 | ], 548 | [ 549 | 50, 550 | 40, 551 | 1, 552 | 41, 553 | 0, 554 | "CLIP" 555 | ], 556 | [ 557 | 51, 558 | 40, 559 | 1, 560 | 42, 561 | 0, 562 | "CLIP" 563 | ], 564 | [ 565 | 55, 566 | 45, 567 | 0, 568 | 48, 569 | 0, 570 | "IMAGE" 571 | ], 572 | [ 573 | 61, 574 | 11, 575 | 0, 576 | 55, 577 | 0, 578 | "IMAGE" 579 | ], 580 | [ 581 | 63, 582 | 42, 583 | 0, 584 | 54, 585 | 1, 586 | "CONDITIONING" 587 | ], 588 | [ 589 | 64, 590 | 41, 591 | 0, 592 | 54, 593 | 0, 594 | "CONDITIONING" 595 | ], 596 | [ 597 | 65, 598 | 54, 599 | 2, 600 | 56, 601 | 3, 602 | "LATENT" 603 | ], 604 | [ 605 | 66, 606 | 54, 607 | 0, 608 | 56, 609 | 1, 610 | "CONDITIONING" 611 | ], 612 | [ 613 | 67, 614 | 54, 615 | 1, 616 | 56, 617 | 2, 618 | "CONDITIONING" 619 | ], 620 | [ 621 | 68, 622 | 40, 623 | 0, 624 | 56, 625 | 0, 626 | "MODEL" 627 | ], 628 | [ 629 | 69, 630 | 40, 631 | 2, 632 | 54, 633 | 2, 634 | "VAE" 635 | ], 636 | [ 637 | 70, 638 | 11, 639 | 0, 640 | 54, 641 | 3, 642 | "IMAGE" 643 | ], 644 | [ 645 | 71, 646 | 56, 647 | 0, 648 | 45, 649 | 0, 650 | "LATENT" 651 | ], 652 | [ 653 | 73, 654 | 57, 655 | 0, 656 | 58, 657 | 0, 658 | "IMAGE" 659 | ], 660 | [ 661 | 77, 662 | 55, 663 | 0, 664 | 54, 665 | 4, 666 | "MASK" 667 | ], 668 | [ 669 | 78, 670 | 55, 671 | 0, 672 | 57, 673 | 0, 674 | "MASK" 675 | ], 676 | [ 677 | 79, 678 | 11, 679 | 0, 680 | 61, 681 | 0, 682 | "IMAGE" 683 | ], 684 | [ 685 | 80, 686 | 61, 687 | 0, 688 | 41, 689 | 1, 690 | "STRING" 691 | ] 692 | ], 693 | "groups": [], 694 | "config": {}, 695 | "extra": { 696 | "ds": { 697 | "scale": 0.8264462809917354, 698 | "offset": { 699 | "0": 478.9515963527572, 700 | "1": -472.76124333876595 701 | } 702 | } 703 | }, 704 | "version": 0.4 705 | } -------------------------------------------------------------------------------- /docs/workflow-examples/txt2img.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 53, 3 | "last_link_id": 61, 4 | "nodes": [ 5 | { 6 | "id": 41, 7 | "type": "CLIPTextEncode", 8 | "pos": [ 9 | 1055, 10 | 571 11 | ], 12 | "size": { 13 | "0": 348.9403381347656, 14 | "1": 56.439388275146484 15 | }, 16 | "flags": {}, 17 | "order": 5, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "clip", 22 | "type": "CLIP", 23 | "link": 50 24 | }, 25 | { 26 | "name": "text", 27 | "type": "STRING", 28 | "link": 61, 29 | "widget": { 30 | "name": "text" 31 | } 32 | } 33 | ], 34 | "outputs": [ 35 | { 36 | "name": "CONDITIONING", 37 | "type": "CONDITIONING", 38 | "links": [ 39 | 44 40 | ], 41 | "shape": 3, 42 | "slot_index": 0 43 | } 44 | ], 45 | "properties": { 46 | "Node name for S&R": "CLIPTextEncode" 47 | }, 48 | "widgets_values": [ 49 | "" 50 | ] 51 | }, 52 | { 53 | "id": 39, 54 | "type": "KSampler", 55 | "pos": [ 56 | 1587, 57 | 982 58 | ], 59 | "size": { 60 | "0": 315, 61 | "1": 262 62 | }, 63 | "flags": {}, 64 | "order": 6, 65 | "mode": 0, 66 | "inputs": [ 67 | { 68 | "name": "model", 69 | "type": "MODEL", 70 | "link": 42 71 | }, 72 | { 73 | "name": "positive", 74 | "type": "CONDITIONING", 75 | "link": 44 76 | }, 77 | { 78 | "name": "negative", 79 | "type": "CONDITIONING", 80 | "link": 45 81 | }, 82 | { 83 | "name": "latent_image", 84 | "type": "LATENT", 85 | "link": 59 86 | } 87 | ], 88 | "outputs": [ 89 | { 90 | "name": "LATENT", 91 | "type": "LATENT", 92 | "links": [ 93 | 48 94 | ], 95 | "shape": 3, 96 | "slot_index": 0 97 | } 98 | ], 99 | "properties": { 100 | "Node name for S&R": "KSampler" 101 | }, 102 | "widgets_values": [ 103 | 438454791536393, 104 | "randomize", 105 | 20, 106 | 8, 107 | "euler", 108 | "normal", 109 | 1 110 | ] 111 | }, 112 | { 113 | "id": 45, 114 | "type": "VAEDecode", 115 | "pos": [ 116 | 1998, 117 | 1018 118 | ], 119 | "size": { 120 | "0": 210, 121 | "1": 46 122 | }, 123 | "flags": {}, 124 | "order": 7, 125 | "mode": 0, 126 | "inputs": [ 127 | { 128 | "name": "samples", 129 | "type": "LATENT", 130 | "link": 48 131 | }, 132 | { 133 | "name": "vae", 134 | "type": "VAE", 135 | "link": 49 136 | } 137 | ], 138 | "outputs": [ 139 | { 140 | "name": "IMAGE", 141 | "type": "IMAGE", 142 | "links": [ 143 | 55 144 | ], 145 | "shape": 3, 146 | "slot_index": 0 147 | } 148 | ], 149 | "properties": { 150 | "Node name for S&R": "VAEDecode" 151 | } 152 | }, 153 | { 154 | "id": 48, 155 | "type": "PreviewImage", 156 | "pos": [ 157 | 2039, 158 | 1262 159 | ], 160 | "size": { 161 | "0": 210, 162 | "1": 246 163 | }, 164 | "flags": {}, 165 | "order": 8, 166 | "mode": 0, 167 | "inputs": [ 168 | { 169 | "name": "images", 170 | "type": "IMAGE", 171 | "link": 55 172 | } 173 | ], 174 | "properties": { 175 | "Node name for S&R": "PreviewImage" 176 | } 177 | }, 178 | { 179 | "id": 42, 180 | "type": "CLIPTextEncode", 181 | "pos": [ 182 | 1056, 183 | 683 184 | ], 185 | "size": { 186 | "0": 352.9139404296875, 187 | "1": 113.16606140136719 188 | }, 189 | "flags": {}, 190 | "order": 4, 191 | "mode": 0, 192 | "inputs": [ 193 | { 194 | "name": "clip", 195 | "type": "CLIP", 196 | "link": 51 197 | } 198 | ], 199 | "outputs": [ 200 | { 201 | "name": "CONDITIONING", 202 | "type": "CONDITIONING", 203 | "links": [ 204 | 45 205 | ], 206 | "shape": 3, 207 | "slot_index": 0 208 | } 209 | ], 210 | "properties": { 211 | "Node name for S&R": "CLIPTextEncode" 212 | }, 213 | "widgets_values": [ 214 | "text, watermark" 215 | ] 216 | }, 217 | { 218 | "id": 52, 219 | "type": "EmptyLatentImage", 220 | 
"pos": [ 221 | 1126, 222 | 1189 223 | ], 224 | "size": { 225 | "0": 315, 226 | "1": 106 227 | }, 228 | "flags": {}, 229 | "order": 0, 230 | "mode": 0, 231 | "outputs": [ 232 | { 233 | "name": "LATENT", 234 | "type": "LATENT", 235 | "links": [ 236 | 59 237 | ], 238 | "shape": 3, 239 | "slot_index": 0 240 | } 241 | ], 242 | "properties": { 243 | "Node name for S&R": "EmptyLatentImage" 244 | }, 245 | "widgets_values": [ 246 | 512, 247 | 512, 248 | 1 249 | ] 250 | }, 251 | { 252 | "id": 11, 253 | "type": "LoadImage", 254 | "pos": [ 255 | -135, 256 | 907 257 | ], 258 | "size": { 259 | "0": 670, 260 | "1": 460 261 | }, 262 | "flags": {}, 263 | "order": 1, 264 | "mode": 0, 265 | "outputs": [ 266 | { 267 | "name": "IMAGE", 268 | "type": "IMAGE", 269 | "links": [ 270 | 60 271 | ], 272 | "shape": 3, 273 | "slot_index": 0 274 | }, 275 | { 276 | "name": "MASK", 277 | "type": "MASK", 278 | "links": [], 279 | "shape": 3, 280 | "slot_index": 1 281 | } 282 | ], 283 | "properties": { 284 | "Node name for S&R": "LoadImage" 285 | }, 286 | "widgets_values": [ 287 | "example.png", 288 | "image" 289 | ] 290 | }, 291 | { 292 | "id": 40, 293 | "type": "CheckpointLoaderSimple", 294 | "pos": [ 295 | 1124, 296 | 1019 297 | ], 298 | "size": { 299 | "0": 315, 300 | "1": 98 301 | }, 302 | "flags": {}, 303 | "order": 2, 304 | "mode": 0, 305 | "outputs": [ 306 | { 307 | "name": "MODEL", 308 | "type": "MODEL", 309 | "links": [ 310 | 42 311 | ], 312 | "shape": 3, 313 | "slot_index": 0 314 | }, 315 | { 316 | "name": "CLIP", 317 | "type": "CLIP", 318 | "links": [ 319 | 50, 320 | 51 321 | ], 322 | "shape": 3, 323 | "slot_index": 1 324 | }, 325 | { 326 | "name": "VAE", 327 | "type": "VAE", 328 | "links": [ 329 | 49 330 | ], 331 | "shape": 3, 332 | "slot_index": 2 333 | } 334 | ], 335 | "properties": { 336 | "Node name for S&R": "CheckpointLoaderSimple" 337 | }, 338 | "widgets_values": [ 339 | "dreamshaper_8.safetensors" 340 | ] 341 | }, 342 | { 343 | "id": 53, 344 | "type": "img2txt BLIP/Llava Multimodel Tagger", 345 | "pos": [ 346 | 584, 347 | 865 348 | ], 349 | "size": [ 350 | 462.2727684830322, 351 | 532.8236759410865 352 | ], 353 | "flags": {}, 354 | "order": 3, 355 | "mode": 0, 356 | "inputs": [ 357 | { 358 | "name": "input_image", 359 | "type": "IMAGE", 360 | "link": 60 361 | } 362 | ], 363 | "outputs": [ 364 | { 365 | "name": "caption", 366 | "type": "STRING", 367 | "links": [ 368 | 61 369 | ], 370 | "shape": 3, 371 | "slot_index": 0 372 | } 373 | ], 374 | "properties": { 375 | "Node name for S&R": "img2txt BLIP/Llava Multimodel Tagger" 376 | }, 377 | "widgets_values": [ 378 | false, 379 | false, 380 | true, 381 | false, 382 | "a photograph of", 383 | "What is a detailed description of this image?\nWhat is the background of this image?", 384 | 0.8, 385 | 1.2, 386 | 36, 387 | 128, 388 | 5, 389 | "watermark, text, writing", 390 | "The image features a cartoon character standing against an abstract background consisting of green, blue, and white elements. 
The main focus is on the woman with bright yellow wings wearing pink attire while smiling at something off-frame in front of her that seems to be representing \"clouds\" or possibly another object within view but not clearly visible due to its distance from us as viewers., " 391 | ], 392 | "color": "#322", 393 | "bgcolor": "#533" 394 | } 395 | ], 396 | "links": [ 397 | [ 398 | 42, 399 | 40, 400 | 0, 401 | 39, 402 | 0, 403 | "MODEL" 404 | ], 405 | [ 406 | 44, 407 | 41, 408 | 0, 409 | 39, 410 | 1, 411 | "CONDITIONING" 412 | ], 413 | [ 414 | 45, 415 | 42, 416 | 0, 417 | 39, 418 | 2, 419 | "CONDITIONING" 420 | ], 421 | [ 422 | 48, 423 | 39, 424 | 0, 425 | 45, 426 | 0, 427 | "LATENT" 428 | ], 429 | [ 430 | 49, 431 | 40, 432 | 2, 433 | 45, 434 | 1, 435 | "VAE" 436 | ], 437 | [ 438 | 50, 439 | 40, 440 | 1, 441 | 41, 442 | 0, 443 | "CLIP" 444 | ], 445 | [ 446 | 51, 447 | 40, 448 | 1, 449 | 42, 450 | 0, 451 | "CLIP" 452 | ], 453 | [ 454 | 55, 455 | 45, 456 | 0, 457 | 48, 458 | 0, 459 | "IMAGE" 460 | ], 461 | [ 462 | 59, 463 | 52, 464 | 0, 465 | 39, 466 | 3, 467 | "LATENT" 468 | ], 469 | [ 470 | 60, 471 | 11, 472 | 0, 473 | 53, 474 | 0, 475 | "IMAGE" 476 | ], 477 | [ 478 | 61, 479 | 53, 480 | 0, 481 | 41, 482 | 1, 483 | "STRING" 484 | ] 485 | ], 486 | "groups": [], 487 | "config": {}, 488 | "extra": { 489 | "ds": { 490 | "scale": 0.9090909090909091, 491 | "offset": { 492 | "0": 278.52736579431155, 493 | "1": -323.6237095104226 494 | } 495 | } 496 | }, 497 | "version": 0.4 498 | } -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "img2txt-comfyui-nodes" 3 | description = "Get general description or specify questions to ask about images (medium, art style, background, etc.). Supports Chinese 🇨🇳 questions via MiniCPM model." 
4 | version = "1.2.1" 5 | license = "LICENSE" 6 | dependencies = ["transformers<=4.41.2", "bitsandbytes>=0.43.0", "timm>=1.0.7", "sentencepiece", "accelerate>=0.3.0"] 7 | 8 | [project.urls] 9 | Repository = "https://github.com/christian-byrne/img2txt-comfyui-nodes" 10 | # Used by Comfy Registry https://comfyregistry.org 11 | 12 | [tool.comfy] 13 | PublisherId = "christian-byrne" 14 | DisplayName = "img2txt-comfyui-nodes" 15 | Icon = "https://img.icons8.com/?size=100&id=49374&format=png&color=000000" 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers<=4.41.2 2 | bitsandbytes>=0.43.0 3 | timm>=1.0.7 4 | sentencepiece 5 | accelerate>=0.3.0 6 | TensorImgUtils 7 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/christian-byrne/img2txt-comfyui-nodes/80e638a4edeccf6ddbea5711ca64f1855581e938/src/__init__.py -------------------------------------------------------------------------------- /src/blip_img2txt.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from transformers import ( 3 | BlipProcessor, 4 | BlipForConditionalGeneration, 5 | BlipConfig, 6 | BlipTextConfig, 7 | BlipVisionConfig, 8 | ) 9 | 10 | import torch 11 | import model_management 12 | 13 | 14 | class BLIPImg2Txt: 15 | def __init__( 16 | self, 17 | conditional_caption: str, 18 | min_words: int, 19 | max_words: int, 20 | temperature: float, 21 | repetition_penalty: float, 22 | search_beams: int, 23 | model_id: str = "Salesforce/blip-image-captioning-large", 24 | ): 25 | self.conditional_caption = conditional_caption 26 | self.model_id = model_id 27 | 28 | # Determine do_sample and num_beams 29 | if temperature > 1.1 or temperature < 0.90: 30 | do_sample = True 31 | num_beams = 1 # Sampling does not use beam search 32 | else: 33 | do_sample = False 34 | num_beams = ( 35 | search_beams if search_beams > 1 else 1 36 | ) # Use beam search if num_beams > 1 37 | 38 | # Initialize text config kwargs 39 | self.text_config_kwargs = { 40 | "do_sample": do_sample, 41 | "max_length": max_words, 42 | "min_length": min_words, 43 | "repetition_penalty": repetition_penalty, 44 | "padding": "max_length", 45 | } 46 | if not do_sample: 47 | self.text_config_kwargs["temperature"] = temperature 48 | self.text_config_kwargs["num_beams"] = num_beams 49 | 50 | def generate_caption(self, image: Image.Image) -> str: 51 | if image.mode != "RGB": 52 | image = image.convert("RGB") 53 | 54 | processor = BlipProcessor.from_pretrained(self.model_id) 55 | 56 | # Update and apply configurations 57 | config_text = BlipTextConfig.from_pretrained(self.model_id) 58 | config_text.update(self.text_config_kwargs) 59 | config_vision = BlipVisionConfig.from_pretrained(self.model_id) 60 | config = BlipConfig.from_text_vision_configs(config_text, config_vision) 61 | 62 | model = BlipForConditionalGeneration.from_pretrained( 63 | self.model_id, 64 | config=config, 65 | torch_dtype=torch.float16, 66 | ).to(model_management.get_torch_device()) 67 | 68 | inputs = processor( 69 | image, 70 | self.conditional_caption, 71 | return_tensors="pt", 72 | ).to(model_management.get_torch_device(), torch.float16) 73 | 74 | with torch.no_grad(): 75 | out = model.generate(**inputs) 76 | ret = processor.decode(out[0], 
skip_special_tokens=True) 77 | 78 | del model 79 | torch.cuda.empty_cache() 80 | 81 | return ret 82 | -------------------------------------------------------------------------------- /src/img2txt_node.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: christian-byrne 3 | @title: Img2Txt auto captioning 4 | """ 5 | 6 | import torch 7 | from torchvision import transforms 8 | 9 | from tensor_img_utils import TensorImgUtils 10 | from .llava_img2txt import LlavaImg2Txt 11 | from .blip_img2txt import BLIPImg2Txt 12 | from .mini_cpm_img2txt import MiniPCMImg2Txt 13 | 14 | from typing import Tuple 15 | 16 | 17 | class Img2TxtNode: 18 | CATEGORY = "img2txt" 19 | 20 | @classmethod 21 | def INPUT_TYPES(s): 22 | return { 23 | "required": { 24 | "input_image": ("IMAGE",), 25 | }, 26 | "optional": { 27 | "use_blip_model": ( 28 | "BOOLEAN", 29 | { 30 | "default": True, 31 | "label_on": "Use BLIP (Requires 2Gb Disk)", 32 | "label_off": "Don't use BLIP", 33 | }, 34 | ), 35 | "use_llava_model": ( 36 | "BOOLEAN", 37 | { 38 | "default": False, 39 | "label_on": "Use Llava (Requires 15Gb Disk)", 40 | "label_off": "Don't use Llava", 41 | }, 42 | ), 43 | "use_mini_pcm_model": ( 44 | "BOOLEAN", 45 | { 46 | "default": False, 47 | "label_on": "Use MiniCPM (Requires 6Gb Disk)", 48 | "label_off": "Don't use MiniCPM", 49 | }, 50 | ), 51 | "use_all_models": ( 52 | "BOOLEAN", 53 | { 54 | "default": False, 55 | "label_on": "Use all models and combine outputs (Total Size: 20+Gb)", 56 | "label_off": "Use selected models only", 57 | }, 58 | ), 59 | "blip_caption_prefix": ( 60 | "STRING", 61 | { 62 | "default": "a photograph of", 63 | }, 64 | ), 65 | "prompt_questions": ( 66 | "STRING", 67 | { 68 | "default": "What is the subject of this image?\nWhat are the mediums used to make this?\nWhat are the artistic styles this is reminiscent of?\nWhich famous artists is this reminiscent of?\nHow sharp or detailed is this image?\nWhat is the environment and background of this image?\nWhat are the objects in this image?\nWhat is the composition of this image?\nWhat is the color palette in this image?\nWhat is the lighting in this image?", 69 | "multiline": True, 70 | }, 71 | ), 72 | "temperature": ( 73 | "FLOAT", 74 | { 75 | "default": 0.8, 76 | "min": 0.1, 77 | "max": 2.0, 78 | "step": 0.01, 79 | "display": "slider", 80 | }, 81 | ), 82 | "repetition_penalty": ( 83 | "FLOAT", 84 | { 85 | "default": 1.2, 86 | "min": 0.1, 87 | "max": 2.0, 88 | "step": 0.01, 89 | "display": "slider", 90 | }, 91 | ), 92 | "min_words": ("INT", {"default": 36}), 93 | "max_words": ("INT", {"default": 128}), 94 | "search_beams": ("INT", {"default": 5}), 95 | "exclude_terms": ( 96 | "STRING", 97 | { 98 | "default": "watermark, text, writing", 99 | }, 100 | ), 101 | }, 102 | "hidden": { 103 | "unique_id": "UNIQUE_ID", 104 | "extra_pnginfo": "EXTRA_PNGINFO", 105 | "output_text": ( 106 | "STRING", 107 | { 108 | "default": "", 109 | }, 110 | ), 111 | }, 112 | } 113 | 114 | RETURN_TYPES = ("STRING",) 115 | RETURN_NAMES = ("caption",) 116 | FUNCTION = "main" 117 | OUTPUT_NODE = True 118 | 119 | def main( 120 | self, 121 | input_image: torch.Tensor, # [Batch_n, H, W, 3-channel] 122 | use_blip_model: bool, 123 | use_llava_model: bool, 124 | use_all_models: bool, 125 | use_mini_pcm_model: bool, 126 | blip_caption_prefix: str, 127 | prompt_questions: str, 128 | temperature: float, 129 | repetition_penalty: float, 130 | min_words: int, 131 | max_words: int, 132 | search_beams: int, 133 | exclude_terms: str, 134 | 
output_text: str = "", 135 | unique_id=None, 136 | extra_pnginfo=None, 137 | ) -> Tuple[str, ...]: 138 | raw_image = transforms.ToPILImage()( 139 | TensorImgUtils.convert_to_type(input_image, "CHW") 140 | ).convert("RGB") 141 | 142 | if blip_caption_prefix == "": 143 | blip_caption_prefix = "a photograph of" 144 | 145 | captions = [] 146 | if use_all_models or use_blip_model: 147 | blip = BLIPImg2Txt( 148 | conditional_caption=blip_caption_prefix, 149 | min_words=min_words, 150 | max_words=max_words, 151 | temperature=temperature, 152 | repetition_penalty=repetition_penalty, 153 | search_beams=search_beams, 154 | ) 155 | captions.append(blip.generate_caption(raw_image)) 156 | 157 | if use_all_models or use_llava_model: 158 | llava_questions = prompt_questions.split("\n") 159 | llava_questions = [ 160 | q 161 | for q in llava_questions 162 | if q != "" and q != " " and q != "\n" and q != "\n\n" 163 | ] 164 | if len(llava_questions) > 0: 165 | llava = LlavaImg2Txt( 166 | question_list=llava_questions, 167 | model_id="llava-hf/llava-1.5-7b-hf", 168 | use_4bit_quantization=True, 169 | use_low_cpu_mem=True, 170 | use_flash2_attention=False, 171 | max_tokens_per_chunk=300, 172 | ) 173 | captions.append(llava.generate_caption(raw_image)) 174 | 175 | if use_all_models or use_mini_pcm_model: 176 | mini_pcm = MiniPCMImg2Txt( 177 | question_list=prompt_questions.split("\n"), 178 | temperature=temperature, 179 | ) 180 | captions.append(mini_pcm.generate_captions(raw_image)) 181 | 182 | out_string = self.exclude(exclude_terms, self.merge_captions(captions)) 183 | 184 | return {"ui": {"text": out_string}, "result": (out_string,)} 185 | 186 | def merge_captions(self, captions: list) -> str: 187 | """Merge captions from multiple models into one string. 188 | Necessary because we can expect the generated captions will generally 189 | be comma-separated fragments ordered by relevance - so combine 190 | fragments in an alternating order.""" 191 | merged_caption = "" 192 | captions = [c.split(",") for c in captions] 193 | for i in range(max(len(c) for c in captions)): 194 | for j in range(len(captions)): 195 | if i < len(captions[j]) and captions[j][i].strip() != "": 196 | merged_caption += captions[j][i].strip() + ", " 197 | return merged_caption 198 | 199 | def exclude(self, exclude_terms: str, out_string: str) -> str: 200 | # https://huggingface.co/Salesforce/blip-image-captioning-large/discussions/20 201 | exclude_terms = "arafed," + exclude_terms 202 | exclude_terms = [ 203 | term.strip().lower() for term in exclude_terms.split(",") if term != "" 204 | ] 205 | for term in exclude_terms: 206 | out_string = out_string.replace(term, "") 207 | 208 | return out_string 209 | -------------------------------------------------------------------------------- /src/llava_img2txt.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | import model_management 4 | from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig 5 | 6 | 7 | class LlavaImg2Txt: 8 | """ 9 | A class to generate text captions for images using the Llava model. 10 | 11 | Args: 12 | question_list (list[str]): A list of questions to ask the model about the image. 13 | model_id (str): The model's name in the Hugging Face model hub. 14 | use_4bit_quantization (bool): Whether to use 4-bit quantization to reduce memory usage. 4-bit quantization reduces the precision of model parameters, potentially affecting the quality of generated outputs. 
Use if VRAM is limited. Default is True. 15 | use_low_cpu_mem (bool): In low_cpu_mem_usage mode, the model is initialized with optimizations aimed at reducing CPU memory consumption. This can be beneficial when working with large models or limited computational resources. Default is True. 16 | use_flash2_attention (bool): Whether to use Flash-Attention 2. Flash-Attention 2 focuses on optimizing attention mechanisms, which are crucial for the model's performance during generation. Use if computational resources are abundant. Default is False. 17 | max_tokens_per_chunk (int): The maximum number of tokens to generate per prompt chunk. Default is 300. 18 | """ 19 | 20 | def __init__( 21 | self, 22 | question_list, 23 | model_id: str = "llava-hf/llava-1.5-7b-hf", 24 | use_4bit_quantization: bool = True, 25 | use_low_cpu_mem: bool = True, 26 | use_flash2_attention: bool = False, 27 | max_tokens_per_chunk: int = 300, 28 | ): 29 | self.question_list = question_list 30 | self.model_id = model_id 31 | self.use_4bit = use_4bit_quantization 32 | self.use_flash2 = use_flash2_attention 33 | self.use_low_cpu_mem = use_low_cpu_mem 34 | self.max_tokens_per_chunk = max_tokens_per_chunk 35 | 36 | def generate_caption( 37 | self, 38 | raw_image: Image.Image, 39 | ) -> str: 40 | """ 41 | Generate a caption for an image using the Llava model. 42 | 43 | Args: 44 | raw_image (Image): Image to generate caption for 45 | """ 46 | # Convert Image to RGB first 47 | if raw_image.mode != "RGB": 48 | raw_image = raw_image.convert("RGB") 49 | 50 | dtype = torch.float16 51 | quant_config = BitsAndBytesConfig( 52 | load_in_4bit=self.use_4bit, 53 | bnb_4bit_compute_dtype=dtype, 54 | bnb_4bit_quant_type="fp4" 55 | ) 56 | 57 | model = LlavaForConditionalGeneration.from_pretrained( 58 | self.model_id, 59 | torch_dtype=dtype, 60 | low_cpu_mem_usage=self.use_low_cpu_mem, 61 | use_flash_attention_2=self.use_flash2, 62 | quantization_config=quant_config, 63 | ) 64 | 65 | # model.to() is not supported for 4-bit or 8-bit bitsandbytes models. With 4-bit quantization, use the model as it is, since the model will already be set to the correct devices and casted to the correct `dtype`. 
66 | if torch.cuda.is_available() and not self.use_4bit: 67 | model = model.to(model_management.get_torch_device(), torch.float16) 68 | 69 | processor = AutoProcessor.from_pretrained(self.model_id) 70 | prompt_chunks = self.__get_prompt_chunks(chunk_size=4) 71 | 72 | caption = "" 73 | with torch.no_grad(): 74 | for prompt_list in prompt_chunks: 75 | prompt = self.__get_single_answer_prompt(prompt_list) 76 | inputs = processor(prompt, raw_image, return_tensors="pt").to( 77 | model_management.get_torch_device(), torch.float16 78 | ) 79 | output = model.generate( 80 | **inputs, max_new_tokens=self.max_tokens_per_chunk, do_sample=False 81 | ) 82 | decoded = processor.decode(output[0][2:]) 83 | cleaned = self.clean_output(decoded) 84 | caption += cleaned 85 | 86 | del model 87 | torch.cuda.empty_cache() 88 | 89 | return caption 90 | 91 | def clean_output(self, decoded_output, delimiter=","): 92 | output_only = decoded_output.split("ASSISTANT: ")[1] 93 | lines = output_only.split("\n") 94 | cleaned_output = "" 95 | for line in lines: 96 | cleaned_output += self.__replace_delimiter(line, ".", delimiter) 97 | 98 | return cleaned_output 99 | 100 | def __get_single_answer_prompt(self, questions): 101 | """ 102 | For multiple turns conversation: 103 | "USER: \n ASSISTANT: USER: ASSISTANT: USER: ASSISTANT:" 104 | From: https://huggingface.co/docs/transformers/en/model_doc/llava#usage-tips 105 | Not sure how the formatting works for multi-turn but those are the docs. 106 | """ 107 | prompt = "USER: \n" 108 | for index, question in enumerate(questions): 109 | if index != 0: 110 | prompt += "USER: " 111 | prompt += f"{question} " 112 | prompt += "ASSISTANT: " 113 | 114 | return prompt 115 | 116 | def __replace_delimiter(self, text: str, old, new=","): 117 | """Replace only the LAST instance of old with new""" 118 | if old not in text: 119 | return text.strip() + " " 120 | last_old_index = text.rindex(old) 121 | replaced = text[:last_old_index] + new + text[last_old_index + len(old) :] 122 | return replaced.strip() + " " 123 | 124 | def __get_prompt_chunks(self, chunk_size=4): 125 | prompt_chunks = [] 126 | for index, feature in enumerate(self.question_list): 127 | if index % chunk_size == 0: 128 | prompt_chunks.append([feature]) 129 | else: 130 | prompt_chunks[-1].append(feature) 131 | return prompt_chunks 132 | -------------------------------------------------------------------------------- /src/mini_cpm_img2txt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from transformers import AutoModel, AutoTokenizer 4 | 5 | import model_management 6 | 7 | class MiniPCMImg2Txt: 8 | def __init__(self, question_list: list[str], temperature: float = 0.7): 9 | self.model_id = "openbmb/MiniCPM-V-2" 10 | self.question_list = question_list 11 | self.question_list = self.__create_question_list() 12 | self.temperature = temperature 13 | 14 | def __create_question_list(self) -> list: 15 | ret = [] 16 | for q in self.question_list: 17 | ret.append({"role": "user", "content": q}) 18 | return ret 19 | 20 | def generate_captions(self, raw_image: Image.Image) -> str: 21 | device = model_management.get_torch_device() 22 | 23 | # For Nvidia GPUs support BF16 (like A100, H100, RTX3090) 24 | # For Nvidia GPUs do NOT support BF16 (like V100, T4, RTX2080) 25 | torch_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 26 | 27 | model = AutoModel.from_pretrained( 28 | "openbmb/MiniCPM-V-2", trust_remote_code=True, 
torch_dtype=torch_dtype 29 | ) 30 | model = model.to(device=device, dtype=torch_dtype) 31 | 32 | tokenizer = AutoTokenizer.from_pretrained( 33 | self.model_id, trust_remote_code=True 34 | ) 35 | model.eval() 36 | 37 | if raw_image.mode != "RGB": 38 | raw_image = raw_image.convert("RGB") 39 | 40 | with torch.no_grad(): 41 | res, _, _ = model.chat( 42 | image=raw_image, 43 | msgs=self.question_list, 44 | context=None, 45 | tokenizer=tokenizer, 46 | sampling=True, 47 | temperature=self.temperature, 48 | ) 49 | 50 | del model 51 | torch.cuda.empty_cache() 52 | 53 | return res 54 | -------------------------------------------------------------------------------- /web/show-output-text.js: -------------------------------------------------------------------------------- 1 | import { app } from "../../../scripts/app.js"; 2 | import { ComfyWidgets } from "../../../scripts/widgets.js"; 3 | 4 | // Displays output caption text 5 | app.registerExtension({ 6 | name: "Img2TxtNode", 7 | async beforeRegisterNodeDef(nodeType, nodeData, app) { 8 | if (nodeData.name === "img2txt BLIP/Llava Multimodel Tagger") { 9 | function populate(message) { 10 | console.log("message", message); 11 | console.log("message.text", message.text); 12 | 13 | const insertIndex = this.widgets.findIndex((w) => w.name === "output_text"); 14 | if (insertIndex !== -1) { 15 | for (let i = insertIndex; i < this.widgets.length; i++) { 16 | this.widgets[i].onRemove?.(); 17 | } 18 | this.widgets.length = insertIndex; 19 | } 20 | 21 | const outputWidget = ComfyWidgets["STRING"]( 22 | this, 23 | "output_text", 24 | ["STRING", { multiline: true }], 25 | app 26 | ).widget; 27 | outputWidget.inputEl.readOnly = true; 28 | outputWidget.inputEl.style.opacity = 0.6; 29 | outputWidget.value = message.text.join(""); 30 | 31 | requestAnimationFrame(() => { 32 | const size_ = this.computeSize(); 33 | if (size_[0] < this.size[0]) { 34 | size_[0] = this.size[0]; 35 | } 36 | if (size_[1] < this.size[1]) { 37 | size_[1] = this.size[1]; 38 | } 39 | this.onResize?.(size_); 40 | app.graph.setDirtyCanvas(true, false); 41 | }); 42 | } 43 | 44 | const onExecuted = nodeType.prototype.onExecuted; 45 | nodeType.prototype.onExecuted = function (message) { 46 | onExecuted?.apply(this, arguments); 47 | populate.call(this, message); 48 | }; 49 | } 50 | }, 51 | }); 52 | --------------------------------------------------------------------------------