├── .github
│   └── workflows
│       └── publish.yml
├── JC2.py
├── JCBO.py
├── LICENSE
├── README.md
├── __init__.py
├── example
│   ├── JoyCaption Beta_One_example.json
│   └── JoyCaption Beta_One_example.png
├── extra_option.json
├── pyproject.toml
├── requirements.txt
└── 安装liger-kernel.bat
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish to Comfy registry
2 | on:
3 |   workflow_dispatch:
4 |   push:
5 |     branches:
6 |       - main
7 |       - master
8 |     paths:
9 |       - "pyproject.toml"
10 | 
11 | jobs:
12 |   publish-node:
13 |     name: Publish Custom Node to registry
14 |     runs-on: ubuntu-latest
15 |     # If this is a forked repository, skip the workflow.
16 |     if: github.event.repository.fork == false
17 |     steps:
18 |       - name: Check out code
19 |         uses: actions/checkout@v4
20 |       - name: Publish Custom Node
21 |         uses: Comfy-Org/publish-node-action@main
22 |         with:
23 |           ## Add your own personal access token to your GitHub repository secrets and reference it here.
24 |           personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }}
25 | 
--------------------------------------------------------------------------------
/JC2.py:
--------------------------------------------------------------------------------
1 | # Based on https://huggingface.co/John6666/joy-caption-alpha-two-cli-mod and https://github.com/chflame163/ComfyUI_LayerStyle
2 | 
3 | import os
4 | import sys
5 | import torch
6 | from torch import nn
7 | from typing import List, Union
8 | from PIL import Image
9 | import torchvision.transforms.functional as TVF
10 | from torchvision.transforms import ToPILImage
11 | import numpy as np
12 | import folder_paths
13 | import json
14 | import logging
15 | from transformers import AutoProcessor, AutoModelForCausalLM
16 | from huggingface_hub import snapshot_download
17 | import shutil
18 | import gc
19 | import comfy.model_management as mm
20 | import comfy.sd
21 | 
22 | # Define the Joy2_Model class
23 | class Joy2_Model:
24 |     def __init__(self, clip_processor, clip_model, tokenizer, text_model, image_adapter):
25 |         self.clip_processor = clip_processor
26 |         self.clip_model = clip_model
27 |         self.tokenizer = tokenizer
28 |         self.text_model = text_model
29 |         self.image_adapter = image_adapter
30 | 
31 | # Define the ImageAdapter class
32 | class ImageAdapter(nn.Module):
33 |     def __init__(self, input_features: int, output_features: int, ln1: bool, pos_emb: bool, num_image_tokens: int,
34 |                  deep_extract: bool):
35 |         super().__init__()
36 |         self.deep_extract = deep_extract
37 | 
38 |         if self.deep_extract:
39 |             input_features = input_features * 5
40 | 
41 |         self.linear1 = nn.Linear(input_features, output_features)
42 |         self.activation = nn.GELU()
43 |         self.linear2 = nn.Linear(output_features, output_features)
44 |         self.ln1 = nn.Identity() if not ln1 else nn.LayerNorm(input_features)
45 |         self.pos_emb = None if not pos_emb else nn.Parameter(torch.zeros(num_image_tokens, input_features))
46 | 
47 |         # Other tokens (<|image_start|>, <|image_end|>, <|eot_id|>)
48 |         self.other_tokens = nn.Embedding(3, output_features)
49 |         self.other_tokens.weight.data.normal_(mean=0.0, std=0.02)  # Matches HF's implementation of LLaMA
50 | 
51 |     def forward(self, vision_outputs: torch.Tensor):
52 |         if self.deep_extract:
53 |             x = torch.cat((
54 |                 vision_outputs[-2],
55 |                 vision_outputs[3],
56 |                 vision_outputs[7],
57 |                 vision_outputs[13],
58 |                 vision_outputs[20],
59 |             ), dim=-1)
60 |             assert len(x.shape) == 3, f"Expected 3, got {len(x.shape)}"  # batch, tokens, features
61 |             assert x.shape[-1] == 
vision_outputs[-2].shape[-1] * 5, f"Expected {vision_outputs[-2].shape[-1] * 5}, got {x.shape[-1]}" 62 | else: 63 | x = vision_outputs[-2] 64 | 65 | x = self.ln1(x) 66 | 67 | if self.pos_emb is not None: 68 | assert x.shape[-2:] == self.pos_emb.shape, f"Expected {self.pos_emb.shape}, got {x.shape[-2:]}" 69 | x = x + self.pos_emb 70 | 71 | x = self.linear1(x) 72 | x = self.activation(x) 73 | x = self.linear2(x) 74 | 75 | other_tokens = self.other_tokens( 76 | torch.tensor([0, 1], device=self.other_tokens.weight.device).expand(x.shape[0], -1)) 77 | assert other_tokens.shape == ( 78 | x.shape[0], 2, x.shape[2]), f"Expected {(x.shape[0], 2, x.shape[2])}, got {other_tokens.shape}" 79 | x = torch.cat((other_tokens[:, 0:1], x, other_tokens[:, 1:2]), dim=1) 80 | 81 | return x 82 | 83 | def get_eot_embedding(self): 84 | return self.other_tokens(torch.tensor([2], device=self.other_tokens.weight.device)).squeeze(0) 85 | 86 | # 设置全局设备变量 87 | current_device = "cuda:0" 88 | 89 | def get_torch_device_patched(): 90 | global current_device 91 | if ( 92 | not torch.cuda.is_available() 93 | or comfy.model_management.cpu_state == comfy.model_management.CPUState.CPU 94 | ): 95 | return torch.device("cpu") 96 | 97 | return torch.device(current_device) 98 | 99 | # 覆盖ComfyUI的设备获取函数 100 | comfy.model_management.get_torch_device = get_torch_device_patched 101 | 102 | def load_models(model_path, dtype, device="cuda:0", device_map=None): 103 | global current_device 104 | current_device = device # 设置当前设备 105 | from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM 106 | from peft import PeftModel 107 | 108 | JC_lora = "text_model" 109 | use_lora = True if JC_lora != "none" else False 110 | CLIP_PATH = os.path.join(folder_paths.models_dir, "clip_vision", "google--siglip-so400m-patch14-384") 111 | CHECKPOINT_PATH = os.path.join(folder_paths.models_dir, "Joy_caption", "cgrkzexw-599808") 112 | LORA_PATH = os.path.join(CHECKPOINT_PATH, "text_model") 113 | 114 | if os.path.exists(CLIP_PATH): 115 | print("Start to load existing VLM") 116 | else: 117 | print("VLM not found locally. 
Downloading google/siglip-so400m-patch14-384...") 118 | try: 119 | snapshot_download( 120 | repo_id="google/siglip-so400m-patch14-384", 121 | local_dir=os.path.join(folder_paths.models_dir, "clip_vision", "cache--google--siglip-so400m-patch14-384"), 122 | local_dir_use_symlinks=False, 123 | resume_download=True 124 | ) 125 | shutil.move(os.path.join(folder_paths.models_dir, "clip_vision", "cache--google--siglip-so400m-patch14-384"), CLIP_PATH) 126 | print(f"VLM has been downloaded to {CLIP_PATH}") 127 | except Exception as e: 128 | print(f"Error downloading CLIP model: {e}") 129 | raise 130 | 131 | try: 132 | if dtype == "nf4": 133 | from transformers import BitsAndBytesConfig 134 | nf4_config = BitsAndBytesConfig( 135 | load_in_4bit=True, 136 | bnb_4bit_quant_type="nf4", 137 | bnb_4bit_use_double_quant=True, 138 | bnb_4bit_compute_dtype=torch.bfloat16 139 | ) 140 | print("Loading in NF4") 141 | print("Loading CLIP") 142 | clip_processor = AutoProcessor.from_pretrained(CLIP_PATH) 143 | clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model 144 | 145 | print("Loading VLM's custom vision model") 146 | checkpoint = torch.load(os.path.join(CHECKPOINT_PATH, "clip_model.pt"), map_location=current_device, weights_only=False) 147 | checkpoint = {k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()} 148 | clip_model.load_state_dict(checkpoint) 149 | del checkpoint 150 | clip_model.eval().requires_grad_(False).to(current_device) 151 | 152 | print("Loading tokenizer") 153 | tokenizer = AutoTokenizer.from_pretrained(os.path.join(CHECKPOINT_PATH, "text_model"), use_fast=True) 154 | assert isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)), f"Tokenizer is of type {type(tokenizer)}" 155 | 156 | print(f"Loading LLM: {model_path}") 157 | text_model = AutoModelForCausalLM.from_pretrained( 158 | model_path, 159 | quantization_config=nf4_config, 160 | device_map=current_device, # 统一使用指定设备 161 | torch_dtype=torch.bfloat16 162 | ).eval() 163 | 164 | if use_lora and os.path.exists(LORA_PATH): 165 | print("Loading VLM's custom text model") 166 | text_model = PeftModel.from_pretrained( 167 | model=text_model, 168 | model_id=LORA_PATH, 169 | device_map=current_device, # 统一使用指定设备 170 | quantization_config=nf4_config 171 | ) 172 | text_model = text_model.merge_and_unload(safe_merge=True) 173 | else: 174 | print("VLM's custom text model isn't loaded") 175 | 176 | print("Loading image adapter") 177 | image_adapter = ImageAdapter( 178 | clip_model.config.hidden_size, 179 | text_model.config.hidden_size, 180 | False, False, 38, 181 | False 182 | ).eval().to("cpu") 183 | image_adapter.load_state_dict( 184 | torch.load(os.path.join(CHECKPOINT_PATH, "image_adapter.pt"), map_location=current_device, weights_only=False) 185 | ) 186 | image_adapter.eval().to(current_device) 187 | else: # bf16 188 | print("Loading in bfloat16") 189 | print("Loading CLIP") 190 | clip_processor = AutoProcessor.from_pretrained(CLIP_PATH) 191 | clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model 192 | if os.path.exists(os.path.join(CHECKPOINT_PATH, "clip_model.pt")): 193 | print("Loading VLM's custom vision model") 194 | checkpoint = torch.load(os.path.join(CHECKPOINT_PATH, "clip_model.pt"), map_location=current_device, weights_only=False) 195 | checkpoint = {k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()} 196 | clip_model.load_state_dict(checkpoint) 197 | del checkpoint 198 | clip_model.eval().requires_grad_(False).to(current_device) 199 | 200 | print("Loading tokenizer") 
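            # Descriptive note (added): the tokenizer is loaded from the checkpoint's text_model subfolder (the same path used for LORA_PATH), not from the base LLM repository.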
201 | tokenizer = AutoTokenizer.from_pretrained(os.path.join(CHECKPOINT_PATH, "text_model"), use_fast=True) 202 | assert isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)), f"Tokenizer is of type {type(tokenizer)}" 203 | 204 | print(f"Loading LLM: {model_path}") 205 | text_model = AutoModelForCausalLM.from_pretrained( 206 | model_path, 207 | device_map=current_device, # 统一使用指定设备 208 | torch_dtype=torch.bfloat16 209 | ).eval() 210 | 211 | if use_lora and os.path.exists(LORA_PATH): 212 | print("Loading VLM's custom text model") 213 | text_model = PeftModel.from_pretrained( 214 | model=text_model, 215 | model_id=LORA_PATH, 216 | device_map=current_device # 统一使用指定设备 217 | ) 218 | text_model = text_model.merge_and_unload(safe_merge=True) 219 | else: 220 | print("VLM's custom text model isn't loaded") 221 | 222 | print("Loading image adapter") 223 | image_adapter = ImageAdapter( 224 | clip_model.config.hidden_size, 225 | text_model.config.hidden_size, 226 | False, False, 38, 227 | False 228 | ).eval().to(current_device) 229 | image_adapter.load_state_dict( 230 | torch.load(os.path.join(CHECKPOINT_PATH, "image_adapter.pt"), map_location=current_device, weights_only=False) 231 | ) 232 | except Exception as e: 233 | print(f"Error loading models: {e}", ) 234 | finally: 235 | pass # 可以在这里添加内存释放逻辑(如果需要) 236 | 237 | return Joy2_Model(clip_processor, clip_model, tokenizer, text_model, image_adapter) 238 | 239 | # Define the stream_chat function 240 | @torch.inference_mode() 241 | def stream_chat(input_images: List[Image.Image], caption_type: str, caption_length: Union[str, int], 242 | extra_options: list[str], name_input: str, custom_prompt: str, 243 | max_new_tokens: int, top_p: float, temperature: float, batch_size: int, model: Joy2_Model, current_device=str): 244 | 245 | # 确定 chat_device 246 | if 'cuda' in current_device: 247 | chat_device = 'cuda' 248 | elif 'cpu' in current_device: 249 | chat_device = 'cpu' 250 | else: 251 | raise ValueError(f"Unsupported device type: {current_device}") 252 | 253 | 254 | CAPTION_TYPE_MAP = { 255 | "Descriptive": [ 256 | "Write a descriptive caption for this image in a formal tone.", 257 | "Write a descriptive caption for this image in a formal tone within {word_count} words.", 258 | "Write a {length} descriptive caption for this image in a formal tone.", 259 | ], 260 | "Descriptive (Informal)": [ 261 | "Write a descriptive caption for this image in a casual tone.", 262 | "Write a descriptive caption for this image in a casual tone within {word_count} words.", 263 | "Write a {length} descriptive caption for this image in a casual tone.", 264 | ], 265 | "Training Prompt": [ 266 | "Write a stable diffusion prompt for this image.", 267 | "Write a stable diffusion prompt for this image within {word_count} words.", 268 | "Write a {length} stable diffusion prompt for this image.", 269 | ], 270 | "MidJourney": [ 271 | "Write a MidJourney prompt for this image.", 272 | "Write a MidJourney prompt for this image within {word_count} words.", 273 | "Write a {length} MidJourney prompt for this image.", 274 | ], 275 | "Booru tag list": [ 276 | "Write a list of Booru tags for this image.", 277 | "Write a list of Booru tags for this image within {word_count} words.", 278 | "Write a {length} list of Booru tags for this image.", 279 | ], 280 | "Booru-like tag list": [ 281 | "Write a list of Booru-like tags for this image.", 282 | "Write a list of Booru-like tags for this image within {word_count} words.", 283 | "Write a {length} list of Booru-like tags for this 
image.", 284 | ], 285 | "Art Critic": [ 286 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.", 287 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it within {word_count} words.", 288 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it {length}.", 289 | ], 290 | "Product Listing": [ 291 | "Write a caption for this image as though it were a product listing.", 292 | "Write a caption for this image as though it were a product listing. Keep it under {word_count} words.", 293 | "Write a {length} caption for this image as though it were a product listing.", 294 | ], 295 | "Social Media Post": [ 296 | "Write a caption for this image as if it were being used for a social media post.", 297 | "Write a caption for this image as if it were being used for a social media post. Limit the caption to {word_count} words.", 298 | "Write a {length} caption for this image as if it were being used for a social media post.", 299 | ], 300 | } 301 | 302 | all_captions = [] 303 | 304 | # 'any' means no length specified 305 | length = None if caption_length == "any" else caption_length 306 | 307 | if isinstance(length, str): 308 | try: 309 | length = int(length) 310 | except ValueError: 311 | pass 312 | 313 | # Build prompt 314 | if length is None: 315 | map_idx = 0 316 | elif isinstance(length, int): 317 | map_idx = 1 318 | elif isinstance(length, str): 319 | map_idx = 2 320 | else: 321 | raise ValueError(f"Invalid caption length: {length}") 322 | 323 | prompt_str = CAPTION_TYPE_MAP[caption_type][map_idx] 324 | 325 | # Add extra options 326 | if len(extra_options) > 0: 327 | prompt_str += " " + " ".join(extra_options) 328 | 329 | # Add name, length, word_count 330 | prompt_str = prompt_str.format(name=name_input, length=caption_length, word_count=caption_length) 331 | 332 | if custom_prompt.strip() != "": 333 | prompt_str = custom_prompt.strip() 334 | 335 | # For debugging 336 | print(f"Prompt: {prompt_str}") 337 | 338 | for i in range(0, len(input_images), batch_size): 339 | batch = input_images[i:i + batch_size] 340 | 341 | for input_image in batch: 342 | try: 343 | # Preprocess image 344 | image = input_image.resize((384, 384), Image.LANCZOS) 345 | pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0 346 | pixel_values = TVF.normalize(pixel_values, [0.5], [0.5]) 347 | pixel_values = pixel_values.to(chat_device) 348 | except ValueError as e: 349 | print(f"Error processing image: {e}") 350 | print("Skipping this image and continuing...") 351 | continue 352 | 353 | # Embed image 354 | with torch.amp.autocast_mode.autocast(chat_device, enabled=True): 355 | vision_outputs = model.clip_model(pixel_values=pixel_values, output_hidden_states=True) 356 | image_features = vision_outputs.hidden_states 357 | embedded_images = model.image_adapter(image_features).to(chat_device) 358 | 359 | # Build the conversation 360 | convo = [ 361 | { 362 | "role": "system", 363 | "content": "You are a helpful image captioner.", 364 | }, 365 | { 366 | "role": "user", 367 | "content": prompt_str, 368 | }, 369 | ] 370 | 371 | # Format the conversation 372 | if hasattr(model.tokenizer, 'apply_chat_template'): 373 | convo_string = 
model.tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True) 374 | else: 375 | # Fallback if apply_chat_template is not available 376 | convo_string = "<|eot_id|>\n" 377 | for message in convo: 378 | if message['role'] == 'system': 379 | convo_string += f"<|system|>{message['content']}<|endoftext|>\n" 380 | elif message['role'] == 'user': 381 | convo_string += f"<|user|>{message['content']}<|endoftext|>\n" 382 | else: 383 | convo_string += f"{message['content']}<|endoftext|>\n" 384 | convo_string += "<|eot_id|>" 385 | 386 | assert isinstance(convo_string, str) 387 | 388 | # Tokenize the conversation 389 | convo_tokens = model.tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, 390 | truncation=False) 391 | prompt_tokens = model.tokenizer.encode(prompt_str, return_tensors="pt", add_special_tokens=False, 392 | truncation=False) 393 | assert isinstance(convo_tokens, torch.Tensor) and isinstance(prompt_tokens, torch.Tensor) 394 | convo_tokens = convo_tokens.squeeze(0) 395 | prompt_tokens = prompt_tokens.squeeze(0) 396 | 397 | # Calculate where to inject the image 398 | eot_id_indices = (convo_tokens == model.tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[ 399 | 0].tolist() 400 | assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}" 401 | 402 | preamble_len = eot_id_indices[1] - prompt_tokens.shape[0] 403 | 404 | # Embed the tokens 405 | convo_embeds = model.text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to(current_device)) 406 | 407 | # Construct the input 408 | input_embeds = torch.cat([ 409 | convo_embeds[:, :preamble_len], 410 | embedded_images.to(dtype=convo_embeds.dtype), 411 | convo_embeds[:, preamble_len:], 412 | ], dim=1).to(chat_device) 413 | 414 | input_ids = torch.cat([ 415 | convo_tokens[:preamble_len].unsqueeze(0), 416 | torch.zeros((1, embedded_images.shape[1]), dtype=torch.long), 417 | convo_tokens[preamble_len:].unsqueeze(0), 418 | ], dim=1).to(chat_device) 419 | attention_mask = torch.ones_like(input_ids) 420 | 421 | generate_ids = model.text_model.generate(input_ids=input_ids, inputs_embeds=input_embeds, 422 | attention_mask=attention_mask, do_sample=True, 423 | suppress_tokens=None, max_new_tokens=max_new_tokens, top_p=top_p, 424 | temperature=temperature) 425 | 426 | # Trim off the prompt 427 | generate_ids = generate_ids[:, input_ids.shape[1]:] 428 | if generate_ids[0][-1] == model.tokenizer.eos_token_id or generate_ids[0][-1] == model.tokenizer.convert_tokens_to_ids( 429 | "<|eot_id|>"): 430 | generate_ids = generate_ids[:, :-1] 431 | 432 | caption = model.tokenizer.batch_decode(generate_ids, skip_special_tokens=False, 433 | clean_up_tokenization_spaces=False)[0] 434 | all_captions.append(caption.strip()) 435 | 436 | return all_captions 437 | 438 | def free_memory(): 439 | import gc 440 | gc.collect() 441 | if torch.cuda.is_available(): 442 | torch.cuda.empty_cache() 443 | torch.cuda.ipc_collect() 444 | 445 | 446 | def cleanGPU(): 447 | gc.collect() 448 | mm.unload_all_models() 449 | mm.soft_empty_cache() 450 | 451 | 452 | class JoyCaption2: 453 | 454 | CATEGORY = 'TTP_Toolset' 455 | FUNCTION = "joycaption2" 456 | RETURN_TYPES = ("STRING",) 457 | RETURN_NAMES = ("text",) 458 | OUTPUT_IS_LIST = (True,) 459 | 460 | def __init__(self): 461 | self.NODE_NAME = 'JoyCaption2' 462 | self.previous_model = None 463 | 464 | @classmethod 465 | def INPUT_TYPES(cls): 466 | llm_model_list = ["unsloth/Meta-Llama-3.1-8B-Instruct", 
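                          # Descriptive note (added): both entries are Llama-3.1-8B instruct-style checkpoints; the selected one is downloaded into models/LLM on first use.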
"Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"] 467 | dtype_list = ['nf4', 'bf16'] 468 | caption_type_list = [ 469 | "Descriptive", "Descriptive (Informal)", "Training Prompt", "MidJourney", 470 | "Booru tag list", "Booru-like tag list", "Art Critic", "Product Listing", 471 | "Social Media Post" 472 | ] 473 | caption_length_list = ["any", "very short", "short", "medium-length", "long", "very long"] + [str(i) for i in range(20, 261, 5)] 474 | 475 | # 获取extra_option.json路径 476 | base_dir = os.path.dirname(os.path.abspath(__file__)) 477 | extra_option_file = os.path.join(base_dir, "extra_option.json") 478 | 479 | # 加载extra_options_list 480 | extra_options_list = {} 481 | if os.path.isfile(extra_option_file): 482 | try: 483 | with open(extra_option_file, "r", encoding='utf-8') as f: 484 | json_content = json.load(f) 485 | for item in json_content: 486 | option_name = item.get("name") 487 | if option_name: 488 | extra_options_list[option_name] = ("BOOLEAN", {"default": False}) 489 | except Exception as e: 490 | print(f"Error loading extra_option.json: {e}") 491 | else: 492 | print(f"extra_option.json not found at {extra_option_file}. No extra options will be available.") 493 | 494 | # 获取可用的GPU设备列表 495 | gpu_devices = [f"cuda:{i}" for i in range(torch.cuda.device_count())] 496 | if not gpu_devices: 497 | gpu_devices = ["cpu"] # 如果没有GPU可用,则仅提供CPU选项 498 | 499 | # 定义额外的输入字段 500 | return { 501 | "required": { 502 | "image": ("IMAGE",), 503 | "llm_model": (llm_model_list,), 504 | "dtype": (dtype_list,), 505 | "caption_type": (caption_type_list,), 506 | "caption_length": (caption_length_list,), 507 | "user_prompt": ("STRING", {"default": "", "multiline": True}), 508 | "max_new_tokens": ("INT", {"default": 260, "min": 8, "max": 4096, "step": 1}), 509 | "top_p": ("FLOAT", {"default": 0.8, "min": 0, "max": 1, "step": 0.01}), 510 | "temperature": ("FLOAT", {"default": 0.6, "min": 0, "max": 1, "step": 0.01}), 511 | "cache_model": ("BOOLEAN", {"default": False}), 512 | "device": (gpu_devices,), # 新增GPU设备选择 513 | "enable_extra_options": ("BOOLEAN", {"default": True, "label": "启用额外选项"}), # 新增开关 514 | **extra_options_list, 515 | "character_name": ("STRING", {"default": "", "multiline": False}), 516 | }, 517 | } 518 | 519 | def joycaption2( 520 | self, image, llm_model, dtype, caption_type, caption_length, 521 | user_prompt, max_new_tokens, top_p, temperature, cache_model, device, 522 | enable_extra_options, character_name, **extra_options 523 | ): 524 | ret_text = [] 525 | comfy_model_dir = os.path.join(folder_paths.models_dir, "LLM") 526 | print(f"comfy_model_dir: {comfy_model_dir}") 527 | if not os.path.exists(comfy_model_dir): 528 | os.mkdir(comfy_model_dir) 529 | 530 | sanitized_model_name = llm_model.replace('/', '--') 531 | llm_model_path = os.path.join(comfy_model_dir, sanitized_model_name) 532 | llm_model_path_cache = os.path.join(comfy_model_dir, "cache--" + sanitized_model_name) 533 | 534 | # 使用用户选择的设备 535 | selected_device = device if torch.cuda.is_available() else 'cpu' 536 | model_loaded_on = selected_device # 跟踪模型加载在哪个设备上 537 | 538 | try: 539 | if os.path.exists(llm_model_path): 540 | print(f"Start to load existing model on {selected_device}") 541 | else: 542 | print(f"Model not found locally. 
Downloading {llm_model}...") 543 | snapshot_download( 544 | repo_id=llm_model, 545 | local_dir=llm_model_path_cache, 546 | local_dir_use_symlinks=False, 547 | resume_download=True 548 | ) 549 | shutil.move(llm_model_path_cache, llm_model_path) 550 | print(f"Model downloaded to {llm_model_path}...") 551 | 552 | if self.previous_model is None: 553 | try: 554 | # 尝试加载模型 555 | free_vram_bytes = mm.get_free_memory() 556 | free_vram_gb = free_vram_bytes / (1024 ** 3) 557 | print(f"Free VRAM: {free_vram_gb:.2f} GB") 558 | if dtype == 'nf4' and free_vram_gb < 10: 559 | print("Free VRAM is less than 10GB when loading 'nf4' model. Performing VRAM cleanup.") 560 | cleanGPU() 561 | elif dtype == 'bf16' and free_vram_gb < 20: 562 | print("Free VRAM is less than 20GB when loading 'bf16' model. Performing VRAM cleanup.") 563 | cleanGPU() 564 | # 统一使用选择的设备 565 | model = load_models( 566 | model_path=llm_model_path, dtype=dtype, device=selected_device 567 | ) 568 | except RuntimeError: 569 | print("An error occurred while loading the model. Please check your configuration.") 570 | else: 571 | model = self.previous_model 572 | 573 | except Exception as e: 574 | print(f"Error loading model: {e}") 575 | return None 576 | 577 | print(f"Model loaded on {model_loaded_on}") 578 | 579 | extra_prompts = [] 580 | 581 | if enable_extra_options: 582 | base_dir = os.path.dirname(os.path.abspath(__file__)) 583 | extra_option_file = os.path.join(base_dir, "extra_option.json") 584 | if os.path.isfile(extra_option_file): 585 | try: 586 | with open(extra_option_file, "r", encoding='utf-8') as f: 587 | json_content = json.load(f) 588 | for item in json_content: 589 | name = item.get("name") 590 | prompt = item.get("prompt") 591 | if name and prompt: 592 | if extra_options.get(name): 593 | # 如果 prompt 中包含 {name},则替换为 character_name 594 | if "{name}" in prompt: 595 | prompt = prompt.replace("{name}", character_name) 596 | extra_prompts.append(prompt) 597 | except Exception as e: 598 | print(f"Error reading extra_option.json: {e}") 599 | else: 600 | print(f"extra_option.json not found at {extra_option_file} during processing.") 601 | 602 | extra = [] 603 | if enable_extra_options: 604 | extra = extra_prompts 605 | print(f"Extra options enabled: {extra_prompts}") 606 | else: 607 | print("No extra options provided.") 608 | 609 | processed_images = [ 610 | Image.fromarray( 611 | np.clip(255.0 * img.unsqueeze(0).cpu().numpy().squeeze(), 0, 255).astype(np.uint8) 612 | ).convert('RGB') 613 | for img in image 614 | ] 615 | 616 | try: 617 | captions = stream_chat( 618 | processed_images, caption_type, caption_length, 619 | extra, "", user_prompt, 620 | max_new_tokens, top_p, temperature, len(processed_images), 621 | model, device # 确保传递正确的设备 622 | ) 623 | ret_text.extend(captions) 624 | except Exception as e: 625 | print(f"Error during stream_chat: {e}") 626 | return None 627 | 628 | if cache_model: 629 | self.previous_model = model 630 | else: 631 | self.previous_model = None 632 | del model 633 | free_memory() 634 | 635 | return (ret_text,) 636 | 637 | 638 | class ExtraOptionsNode: 639 | CATEGORY = 'TTP_Toolset' 640 | FUNCTION = "extra_options" 641 | RETURN_TYPES = ("STRING",) # 改为返回单一字符串 642 | RETURN_NAMES = ("extra_options_str",) 643 | OUTPUT_IS_LIST = (False,) # 单一字符串输出 644 | 645 | def __init__(self): 646 | self.NODE_NAME = 'ExtraOptionsNode' 647 | 648 | @classmethod 649 | def INPUT_TYPES(cls): 650 | # 获取 extra_option.json 的路径并加载选项 651 | base_dir = os.path.dirname(os.path.abspath(__file__)) 652 | extra_option_file = 
os.path.join(base_dir, "extra_option.json") 653 | extra_options_list = {} 654 | 655 | if os.path.isfile(extra_option_file): 656 | try: 657 | with open(extra_option_file, "r", encoding='utf-8') as f: 658 | json_content = json.load(f) 659 | for item in json_content: 660 | option_name = item.get("name") 661 | if option_name: 662 | # 定义每个额外选项为布尔输入 663 | extra_options_list[option_name] = ("BOOLEAN", {"default": False}) 664 | except Exception as e: 665 | print(f"Error loading extra_option.json: {e}") 666 | else: 667 | print(f"extra_option.json not found at {extra_option_file}. No extra options will be available.") 668 | 669 | # 定义输入字段,包括开关和 character_name 670 | return { 671 | "required": { 672 | "enable_extra_options": ("BOOLEAN", {"default": True, "label": "启用额外选项"}), # 开关 673 | **extra_options_list, # 动态加载的额外选项 674 | "character_name": ("STRING", {"default": "", "multiline": False}), # 移动 character_name 675 | }, 676 | } 677 | 678 | def extra_options(self, enable_extra_options, character_name, **extra_options): 679 | """ 680 | 处理额外选项并返回已启用的提示列表。 681 | 如果启用了替换角色名称选项,并提供了 character_name,则进行替换。 682 | """ 683 | extra_prompts = [] 684 | if enable_extra_options: 685 | base_dir = os.path.dirname(os.path.abspath(__file__)) 686 | extra_option_file = os.path.join(base_dir, "extra_option.json") 687 | if os.path.isfile(extra_option_file): 688 | try: 689 | with open(extra_option_file, "r", encoding='utf-8') as f: 690 | json_content = json.load(f) 691 | for item in json_content: 692 | name = item.get("name") 693 | prompt = item.get("prompt") 694 | if name and prompt: 695 | if extra_options.get(name): 696 | # 如果 prompt 中包含 {name},则替换为 character_name 697 | if "{name}" in prompt: 698 | prompt = prompt.replace("{name}", character_name) 699 | extra_prompts.append(prompt) 700 | except Exception as e: 701 | print(f"Error reading extra_option.json: {e}") 702 | else: 703 | print(f"extra_option.json not found at {extra_option_file} during processing.") 704 | 705 | # 将所有启用的提示拼接成一个字符串 706 | return (" ".join(extra_prompts),) # 返回一个单一的合并字符串 707 | 708 | class JoyCaption2_simple: 709 | 710 | CATEGORY = 'TTP_Toolset' 711 | FUNCTION = "joycaption2_simple" 712 | RETURN_TYPES = ("STRING",) 713 | RETURN_NAMES = ("text",) 714 | OUTPUT_IS_LIST = (True,) 715 | 716 | def __init__(self): 717 | self.NODE_NAME = 'JoyCaption2_simple' 718 | self.previous_model = None 719 | 720 | @classmethod 721 | def INPUT_TYPES(cls): 722 | llm_model_list = [ 723 | "unsloth/Meta-Llama-3.1-8B-Instruct", 724 | "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2" 725 | ] 726 | dtype_list = ['nf4', 'bf16'] 727 | caption_type_list = [ 728 | "Descriptive", "Descriptive (Informal)", "Training Prompt", "MidJourney", 729 | "Booru tag list", "Booru-like tag list", "Art Critic", "Product Listing", 730 | "Social Media Post" 731 | ] 732 | caption_length_list = [ 733 | "any", "very short", "short", "medium-length", "long", "very long" 734 | ] + [str(i) for i in range(20, 261, 5)] 735 | 736 | # 获取可用的GPU设备列表 737 | gpu_devices = [f"cuda:{i}" for i in range(torch.cuda.device_count())] 738 | if not gpu_devices: 739 | gpu_devices = ["cpu"] # 如果没有GPU可用,则仅提供CPU选项 740 | 741 | # 定义额外的输入字段 742 | return { 743 | "required": { 744 | "image": ("IMAGE",), 745 | "llm_model": (llm_model_list,), 746 | "dtype": (dtype_list,), 747 | "caption_type": (caption_type_list,), 748 | "caption_length": (caption_length_list,), 749 | "user_prompt": ("STRING", {"default": "", "multiline": True}), 750 | "max_new_tokens": ("INT", {"default": 260, "min": 8, "max": 4096, "step": 1}), 751 | "top_p": ("FLOAT", 
{"default": 0.8, "min": 0, "max": 1, "step": 0.01}), 752 | "temperature": ("FLOAT", {"default": 0.6, "min": 0, "max": 1, "step": 0.01}), 753 | "cache_model": ("BOOLEAN", {"default": False}), 754 | "device": (gpu_devices,), # 新增GPU设备选择 755 | }, 756 | "optional": { 757 | "extra_options_node": ("STRING",{"forceInput": True}), # 接收来自 ExtraOptionsNode 的单一字符串 758 | }, 759 | } 760 | 761 | def joycaption2_simple( 762 | self, image, llm_model, dtype, caption_type, caption_length, 763 | user_prompt, max_new_tokens, top_p, temperature, cache_model, device, 764 | extra_options_node=None # 设置默认值为 None 765 | ): 766 | ret_text = [] 767 | comfy_model_dir = os.path.join(folder_paths.models_dir, "LLM") 768 | print(f"comfy_model_dir: {comfy_model_dir}") 769 | if not os.path.exists(comfy_model_dir): 770 | os.mkdir(comfy_model_dir) 771 | 772 | sanitized_model_name = llm_model.replace('/', '--') 773 | llm_model_path = os.path.join(comfy_model_dir, sanitized_model_name) 774 | llm_model_path_cache = os.path.join(comfy_model_dir, "cache--" + sanitized_model_name) 775 | 776 | # 使用用户选择的设备 777 | selected_device = device if torch.cuda.is_available() else 'cpu' 778 | model_loaded_on = selected_device # 跟踪模型加载在哪个设备上 779 | 780 | try: 781 | if os.path.exists(llm_model_path): 782 | print(f"Start to load existing model on {selected_device}") 783 | else: 784 | print(f"Model not found locally. Downloading {llm_model}...") 785 | snapshot_download( 786 | repo_id=llm_model, 787 | local_dir=llm_model_path_cache, 788 | local_dir_use_symlinks=False, 789 | resume_download=True 790 | ) 791 | shutil.move(llm_model_path_cache, llm_model_path) 792 | print(f"Model downloaded to {llm_model_path}...") 793 | 794 | if self.previous_model is None: 795 | try: 796 | # 尝试加载模型 797 | free_vram_bytes = mm.get_free_memory() 798 | free_vram_gb = free_vram_bytes / (1024 ** 3) 799 | print(f"Free VRAM: {free_vram_gb:.2f} GB") 800 | if dtype == 'nf4' and free_vram_gb < 10: 801 | print("Free VRAM is less than 10GB when loading 'nf4' model. Performing VRAM cleanup.") 802 | cleanGPU() 803 | elif dtype == 'bf16' and free_vram_gb < 20: 804 | print("Free VRAM is less than 20GB when loading 'bf16' model. Performing VRAM cleanup.") 805 | cleanGPU() 806 | # 统一使用选择的设备 807 | 808 | model = load_models( 809 | model_path=llm_model_path, dtype=dtype, device=selected_device) 810 | except RuntimeError: 811 | print("An error occurred while loading the model. 
Please check your configuration.") 812 | else: 813 | model = self.previous_model 814 | 815 | except Exception as e: 816 | print(f"Error loading model: {e}") 817 | return None 818 | 819 | print(f"Model loaded on {model_loaded_on}") 820 | 821 | # 接收来自 ExtraOptionsNode 的额外提示 822 | extra = [] 823 | if extra_options_node and extra_options_node.strip(): 824 | extra = [extra_options_node] # 将单一字符串包装成列表 825 | print(f"Extra options enabled: {extra_options_node}") 826 | else: 827 | print("No extra options provided.") 828 | 829 | # 处理图像 830 | processed_images = [ 831 | Image.fromarray( 832 | np.clip(255.0 * img.unsqueeze(0).cpu().numpy().squeeze(), 0, 255).astype(np.uint8) 833 | ).convert('RGB') 834 | for img in image 835 | ] 836 | 837 | try: 838 | captions = stream_chat( 839 | processed_images, caption_type, caption_length, 840 | extra, "", user_prompt, 841 | max_new_tokens, top_p, temperature, len(processed_images), 842 | model, device # 确保传递正确的设备 843 | ) 844 | ret_text.extend(captions) 845 | except Exception as e: 846 | print(f"Error during stream_chat: {e}") 847 | return ("Error generating captions.",) 848 | 849 | if cache_model: 850 | self.previous_model = model 851 | else: 852 | self.previous_model = None 853 | del model 854 | free_memory() 855 | 856 | return (ret_text,) 857 | 858 | 859 | # Register the node 860 | NODE_CLASS_MAPPINGS = { 861 | "JoyCaption2": JoyCaption2, 862 | "ExtraOptionsNode": ExtraOptionsNode, 863 | "JoyCaption2_simple": JoyCaption2_simple, 864 | } 865 | 866 | NODE_DISPLAY_NAME_MAPPINGS = { 867 | "JoyCaption2": "TTP_JoyCaption2_Full", 868 | "ExtraOptionsNode": "TTP_ExtraOptionsNode", 869 | "JoyCaption2_simple": "TTP_JoyCaption2_simple", 870 | } 871 | -------------------------------------------------------------------------------- /JCBO.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | from torch import nn 5 | from typing import List, Union, Generator 6 | from PIL import Image 7 | import torchvision.transforms.functional as TVF 8 | import numpy as np 9 | import folder_paths 10 | import json 11 | import logging 12 | from transformers import LlavaForConditionalGeneration, TextIteratorStreamer, AutoProcessor, BitsAndBytesConfig 13 | from huggingface_hub import snapshot_download 14 | import shutil 15 | import gc 16 | import comfy.model_management as mm 17 | import comfy.sd 18 | from threading import Thread 19 | 20 | # LIGER Kernel import attempt 21 | try: 22 | from liger_kernel.transformers import apply_liger_kernel_to_llama 23 | LIGER_KERNEL_AVAILABLE = True 24 | except ImportError: 25 | LIGER_KERNEL_AVAILABLE = False 26 | print("LIGER kernel not found. The option to enable it will be disabled.") 27 | 28 | # Global model cache 29 | CACHED_MODEL = None 30 | CACHED_PROCESSOR = None 31 | CACHED_MODEL_PATH_HF_ID = None # Stores the HuggingFace model ID used for the cache 32 | CACHED_LIGER_ENABLED = None 33 | CACHED_QUANTIZATION_MODE = None 34 | CACHED_MODEL_LOCAL_PATH = None # Stores the local disk path of the cached model 35 | 36 | QUANTIZATION_CONFIGS = { 37 | "nf4": { 38 | "load_in_4bit": True, 39 | "bnb_4bit_quant_type": "nf4", 40 | "bnb_4bit_compute_dtype": torch.bfloat16, 41 | "bnb_4bit_use_double_quant": True, 42 | }, 43 | "int8": { 44 | "load_in_8bit": True, 45 | }, 46 | "bf16": { # bf16 is not a quantization config, but a torch_dtype. Handled separately. 
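        # Descriptive note (added): intentionally left empty — the bf16 path sets torch_dtype=torch.bfloat16 directly in _load_model_shared instead of building a BitsAndBytesConfig.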
47 | }, 48 | } 49 | LLM_SKIP_MODULES = ["vision_tower", "multi_modal_projector"] 50 | MODEL_PATH_HF_DEFAULT = "fancyfeast/llama-joycaption-beta-one-hf-llava" 51 | 52 | # Define the CAPTION_TYPE_MAP for JoyCaptionBetaOne 53 | CAPTION_TYPE_MAP_BETA = { 54 | "Descriptive": [ 55 | "Write a detailed description for this image.", 56 | "Write a detailed description for this image in {word_count} words or less.", 57 | "Write a {length} detailed description for this image.", 58 | ], 59 | "Descriptive (Casual)": [ 60 | "Write a descriptive caption for this image in a casual tone.", 61 | "Write a descriptive caption for this image in a casual tone within {word_count} words.", 62 | "Write a {length} descriptive caption for this image in a casual tone.", 63 | ], 64 | "Straightforward": [ 65 | '''Write a straightforward caption for this image. Begin with the main subject and medium. Mention pivotal elements—people, objects, scenery—using confident, definite language. Focus on concrete details like color, shape, texture, and spatial relationships. Show how elements interact. Omit mood and speculative wording. If text is present, quote it exactly. Note any watermarks, signatures, or compression artifacts. Never mention what\'s absent, resolution, or unobservable details. Vary your sentence structure and keep the description concise, without starting with "This image is…" or similar phrasing.''', 66 | '''Write a straightforward caption for this image within {word_count} words. Begin with the main subject and medium. Mention pivotal elements—people, objects, scenery—using confident, definite language. Focus on concrete details like color, shape, texture, and spatial relationships. Show how elements interact. Omit mood and speculative wording. If text is present, quote it exactly. Note any watermarks, signatures, or compression artifacts. Never mention what\'s absent, resolution, or unobservable details. Vary your sentence structure and keep the description concise, without starting with "This image is…" or similar phrasing.''', 67 | '''Write a {length} straightforward caption for this image. Begin with the main subject and medium. Mention pivotal elements—people, objects, scenery—using confident, definite language. Focus on concrete details like color, shape, texture, and spatial relationships. Show how elements interact. Omit mood and speculative wording. If text is present, quote it exactly. Note any watermarks, signatures, or compression artifacts. Never mention what\'s absent, resolution, or unobservable details. Vary your sentence structure and keep the description concise, without starting with "This image is…" or similar phrasing.''', 68 | ], 69 | "Stable Diffusion Prompt": [ 70 | "Output a stable diffusion prompt that is indistinguishable from a real stable diffusion prompt.", 71 | "Output a stable diffusion prompt that is indistinguishable from a real stable diffusion prompt. {word_count} words or less.", 72 | "Output a {length} stable diffusion prompt that is indistinguishable from a real stable diffusion prompt.", 73 | ], 74 | "MidJourney": [ 75 | "Write a MidJourney prompt for this image.", 76 | "Write a MidJourney prompt for this image within {word_count} words.", 77 | "Write a {length} MidJourney prompt for this image.", 78 | ], 79 | "Danbooru tag list": [ 80 | "Generate only comma-separated Danbooru tags (lowercase_underscores). Strict order: `artist:`, `copyright:`, `character:`, `meta:`, then general tags. 
Include counts (1girl), appearance, clothing, accessories, pose, expression, actions, background. Use precise Danbooru syntax. No extra text.", 81 | "Generate only comma-separated Danbooru tags (lowercase_underscores). Strict order: `artist:`, `copyright:`, `character:`, `meta:`, then general tags. Include counts (1girl), appearance, clothing, accessories, pose, expression, actions, background. Use precise Danbooru syntax. No extra text. {word_count} words or less.", 82 | "Generate only comma-separated Danbooru tags (lowercase_underscores). Strict order: `artist:`, `copyright:`, `character:`, `meta:`, then general tags. Include counts (1girl), appearance, clothing, accessories, pose, expression, actions, background. Use precise Danbooru syntax. No extra text. {length} length.", 83 | ], 84 | "e621 tag list": [ 85 | "Write a comma-separated list of e621 tags in alphabetical order for this image. Start with the artist, copyright, character, species, meta, and lore tags (if any), prefixed by \'artist:\', \'copyright:\', \'character:\', \'species:\', \'meta:\', and \'lore:\'. Then all the general tags.", 86 | "Write a comma-separated list of e621 tags in alphabetical order for this image. Start with the artist, copyright, character, species, meta, and lore tags (if any), prefixed by \'artist:\', \'copyright:\', \'character:\', \'species:\', \'meta:\', and \'lore:\'. Then all the general tags. Keep it under {word_count} words.", 87 | "Write a {length} comma-separated list of e621 tags in alphabetical order for this image. Start with the artist, copyright, character, species, meta, and lore tags (if any), prefixed by \'artist:\', \'copyright:\', \'character:\', \'species:\', \'meta:\', and \'lore:\'. Then all the general tags.", 88 | ], 89 | "Rule34 tag list": [ 90 | "Write a comma-separated list of rule34 tags in alphabetical order for this image. Start with the artist, copyright, character, and meta tags (if any), prefixed by \'artist:\', \'copyright:\', \'character:\', and \'meta:\'. Then all the general tags.", 91 | "Write a comma-separated list of rule34 tags in alphabetical order for this image. Start with the artist, copyright, character, and meta tags (if any), prefixed by \'artist:\', \'copyright:\', \'character:\', and \'meta:\'. Then all the general tags. Keep it under {word_count} words.", 92 | "Write a {length} comma-separated list of rule34 tags in alphabetical order for this image. Start with the artist, copyright, character, and meta tags (if any), prefixed by \'artist:\', \'copyright:\', \'character:\', and \'meta:\'. Then all the general tags.", 93 | ], 94 | "Booru-like tag list": [ 95 | "Write a list of Booru-like tags for this image.", 96 | "Write a list of Booru-like tags for this image within {word_count} words.", 97 | "Write a {length} list of Booru-like tags for this image.", 98 | ], 99 | "Art Critic": [ 100 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.", 101 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it within {word_count} words.", 102 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. 
Keep it {length}.", 103 | ], 104 | "Product Listing": [ 105 | "Write a caption for this image as though it were a product listing.", 106 | "Write a caption for this image as though it were a product listing. Keep it under {word_count} words.", 107 | "Write a {length} caption for this image as though it were a product listing.", 108 | ], 109 | "Social Media Post": [ 110 | "Write a caption for this image as if it were being used for a social media post.", 111 | "Write a caption for this image as if it were being used for a social media post. Limit the caption to {word_count} words.", 112 | "Write a {length} caption for this image as if it were being used for a social media post.", 113 | ], 114 | } 115 | 116 | NAME_OPTION_PROMPT = "If there is a person/character in the image you must refer to them as {name}." 117 | 118 | # Extra options for the node 119 | EXTRA_OPTIONS_LIST = [ 120 | NAME_OPTION_PROMPT, 121 | "Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style).", 122 | "Include information about lighting.", 123 | "Include information about camera angle.", 124 | "Include information about whether there is a watermark or not.", 125 | "Include information about whether there are JPEG artifacts or not.", 126 | "If it is a photo you MUST include information about what camera was likely used and details such as aperture, shutter speed, ISO, etc.", 127 | "Do NOT include anything sexual; keep it PG.", 128 | "Do NOT mention the image\'s resolution.", 129 | "You MUST include information about the subjective aesthetic quality of the image from low to very high.", 130 | "Include information on the image\'s composition style, such as leading lines, rule of thirds, or symmetry.", 131 | "Do NOT mention any text that is in the image.", 132 | "Specify the depth of field and whether the background is in focus or blurred.", 133 | "If applicable, mention the likely use of artificial or natural lighting sources.", 134 | "Do NOT use any ambiguous language.", 135 | "Include whether the image is sfw, suggestive, or nsfw.", 136 | "ONLY describe the most important elements of the image.", 137 | "If it is a work of art, do not include the artist\'s name or the title of the work.", 138 | "Identify the image orientation (portrait, landscape, or square) and aspect ratio if obvious.", 139 | """Use vulgar slang and profanity, such as (but not limited to) "fucking," "slut," "cock," etc.""", 140 | "Do NOT use polite euphemisms—lean into blunt, casual phrasing.", 141 | "Include information about the ages of any people/characters when applicable.", 142 | "Mention whether the image depicts an extreme close-up, close-up, medium close-up, medium shot, cowboy shot, medium wide shot, wide shot, or extreme wide shot.", 143 | "Do not mention the mood/feeling/etc of the image.", 144 | "Explicitly specify the vantage height (eye-level, low-angle worm's-eye, bird's-eye, drone, rooftop, etc.).", 145 | "If there is a watermark, you must mention it.", 146 | """Your response will be used by a text-to-image model, so avoid useless meta phrases like "This image shows…", "You are looking at...", etc.""", 147 | ] 148 | 149 | def _build_prompt_beta_shared(caption_type: str, caption_length: str, list_of_extra_prompts: list[str], character_name_val: str, custom_prompt_override: str) -> str: 150 | if custom_prompt_override and custom_prompt_override.strip(): 151 | if character_name_val and "{name}" in custom_prompt_override: 152 | 
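        # Descriptive note (added): a non-empty custom prompt overrides caption_type and the extra options entirely; only the {name} placeholder is still substituted.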
return custom_prompt_override.replace("{name}", character_name_val) 153 | return custom_prompt_override 154 | 155 | if caption_length == "any": map_idx = 0 156 | elif isinstance(caption_length, str) and caption_length.isdigit(): map_idx = 1 157 | else: map_idx = 2 158 | 159 | base_prompt = CAPTION_TYPE_MAP_BETA[caption_type][map_idx] 160 | final_extra_prompts = [] 161 | 162 | for extra_prompt_template in list_of_extra_prompts: 163 | if extra_prompt_template == NAME_OPTION_PROMPT: 164 | if character_name_val: # Only include and format name if provided 165 | final_extra_prompts.append(extra_prompt_template.format(name=character_name_val)) 166 | # If character_name_val is empty, this prompt is skipped entirely. 167 | else: 168 | final_extra_prompts.append(extra_prompt_template) 169 | 170 | full_prompt_parts = [base_prompt] 171 | if final_extra_prompts: 172 | full_prompt_parts.extend(final_extra_prompts) 173 | 174 | # Format the base prompt part; extra prompts are already formatted or don't need it here 175 | # This assumes {word_count} and {length} are only in the base_prompt template 176 | full_prompt_parts[0] = full_prompt_parts[0].format(length=caption_length, word_count=caption_length) 177 | 178 | return " ".join(full_prompt_parts) 179 | 180 | def _free_model_memory_shared(): 181 | global CACHED_MODEL, CACHED_PROCESSOR, CACHED_MODEL_PATH_HF_ID, CACHED_LIGER_ENABLED, CACHED_QUANTIZATION_MODE, CACHED_MODEL_LOCAL_PATH 182 | CACHED_MODEL = None 183 | CACHED_PROCESSOR = None 184 | CACHED_MODEL_PATH_HF_ID = None 185 | CACHED_LIGER_ENABLED = None 186 | CACHED_QUANTIZATION_MODE = None 187 | CACHED_MODEL_LOCAL_PATH = None 188 | gc.collect() 189 | if torch.cuda.is_available(): 190 | torch.cuda.empty_cache() 191 | torch.cuda.ipc_collect() 192 | print("JoyCaptionBetaOne (Shared): Model and processor released from cache.") 193 | 194 | def _clean_gpu_shared(): 195 | gc.collect() 196 | mm.unload_all_models() 197 | mm.soft_empty_cache() 198 | print("JoyCaptionBetaOne (Shared): ComfyUI models unloaded and cache soft-emptied.") 199 | 200 | def _load_model_shared(model_hf_id: str, quantization_mode: str, target_device: str, enable_liger: bool): 201 | global CACHED_MODEL, CACHED_PROCESSOR, CACHED_MODEL_PATH_HF_ID, CACHED_LIGER_ENABLED, CACHED_QUANTIZATION_MODE, CACHED_MODEL_LOCAL_PATH 202 | 203 | model_dir_base = os.path.join(folder_paths.models_dir, "LLM_llava") 204 | if not os.path.exists(model_dir_base): os.makedirs(model_dir_base, exist_ok=True) 205 | sanitized_model_repo_name = model_hf_id.replace('/', '--') 206 | model_path_local = os.path.join(model_dir_base, sanitized_model_repo_name) 207 | model_path_cache_tmp = os.path.join(model_dir_base, "cache--" + sanitized_model_repo_name) 208 | 209 | effective_device = target_device if torch.cuda.is_available() else "cpu" 210 | print(f"JoyCaptionBetaOne (Shared): Using effective device: {effective_device} for model {model_hf_id}") 211 | 212 | reload_needed = False 213 | if CACHED_MODEL is None or \ 214 | CACHED_PROCESSOR is None or \ 215 | CACHED_MODEL_PATH_HF_ID != model_hf_id or \ 216 | CACHED_MODEL_LOCAL_PATH != model_path_local or \ 217 | CACHED_QUANTIZATION_MODE != quantization_mode or \ 218 | (LIGER_KERNEL_AVAILABLE and CACHED_LIGER_ENABLED != enable_liger): 219 | reload_needed = True 220 | if CACHED_MODEL is not None: 221 | print(f"JoyCaptionBetaOne (Shared): Config changed (Prev: {CACHED_MODEL_PATH_HF_ID}, {CACHED_QUANTIZATION_MODE}, Liger: {CACHED_LIGER_ENABLED}. New: {model_hf_id}, {quantization_mode}, Liger: {enable_liger}). 
Reloading.") 222 | _free_model_memory_shared() 223 | 224 | if reload_needed: 225 | print(f"JoyCaptionBetaOne (Shared): Loading model from {model_path_local} (HF: {model_hf_id})") 226 | if not os.path.exists(model_path_local): 227 | print(f"JoyCaptionBetaOne (Shared): Downloading {model_hf_id}...") 228 | try: 229 | snapshot_download(repo_id=model_hf_id, local_dir=model_path_cache_tmp, local_dir_use_symlinks=False, resume_download=True) 230 | shutil.move(model_path_cache_tmp, model_path_local) 231 | print(f"JoyCaptionBetaOne (Shared): Model {model_hf_id} downloaded to {model_path_local}") 232 | except Exception as e: raise RuntimeError(f"Error downloading model {model_hf_id}: {e}") 233 | try: 234 | print(f"JoyCaptionBetaOne (Shared): Loading processor from {model_path_local}...") 235 | processor = AutoProcessor.from_pretrained(model_path_local) 236 | print(f"JoyCaptionBetaOne (Shared): Loading model {model_hf_id} with quantization '{quantization_mode}'...") 237 | 238 | model_load_kwargs = {} 239 | final_torch_dtype = None 240 | final_device_map = "auto" # Default to auto, will be overridden if needed 241 | 242 | current_quant_mode = quantization_mode 243 | if "cuda" not in effective_device and current_quant_mode in ["nf4", "int8"]: 244 | print(f"JoyCaptionBetaOne (Shared): Quantization '{current_quant_mode}' needs CUDA. Falling back to bf16 for CPU for {model_hf_id}.") 245 | current_quant_mode = "bf16" 246 | 247 | if current_quant_mode == "bf16": 248 | final_torch_dtype = torch.bfloat16 249 | final_device_map = None if "cpu" in effective_device else effective_device 250 | elif current_quant_mode in ["nf4", "int8"]: 251 | # This block is for CUDA devices as per the check above 252 | bnb_config_params = QUANTIZATION_CONFIGS[current_quant_mode].copy() 253 | bnb_config_params["llm_int8_skip_modules"] = LLM_SKIP_MODULES 254 | q_config = BitsAndBytesConfig(**bnb_config_params) 255 | model_load_kwargs["quantization_config"] = q_config 256 | final_torch_dtype = torch.bfloat16 if current_quant_mode == "nf4" else "auto" 257 | final_device_map = effective_device # MODIFICATION: Use the user-selected CUDA device 258 | print(f"JoyCaptionBetaOne (Shared): Preparing {current_quant_mode} for specific device: {effective_device}") 259 | else: # Fallback / fp32 (though not an explicit option) 260 | final_torch_dtype = torch.float32 if "cpu" in effective_device else torch.bfloat16 261 | final_device_map = None if "cpu" in effective_device else effective_device 262 | 263 | model_load_kwargs["torch_dtype"] = final_torch_dtype 264 | model_load_kwargs["device_map"] = final_device_map 265 | 266 | if "cuda" in effective_device: 267 | free_vram_gb = mm.get_free_memory(effective_device) / (1024**3) 268 | # Basic VRAM check - can be more sophisticated 269 | if free_vram_gb < 4 and current_quant_mode != "nf4": # NF4 is very light 270 | print(f"Warning: Low VRAM ({free_vram_gb:.2f}GB on {effective_device}) for {current_quant_mode}") 271 | # _clean_gpu_shared() # Consider if cleanup is aggressive enough or needed 272 | 273 | model = LlavaForConditionalGeneration.from_pretrained(model_path_local, **model_load_kwargs) 274 | assert isinstance(model, LlavaForConditionalGeneration) 275 | model.eval() 276 | 277 | if LIGER_KERNEL_AVAILABLE and enable_liger and "cuda" in str(model.device).lower(): # Check actual model device for LIGER 278 | try: 279 | print(f"JoyCaptionBetaOne (Shared): Applying LIGER kernel to {model_hf_id} on {model.device}...") 280 | apply_liger_kernel_to_llama(model=model.language_model) 281 | 
CACHED_LIGER_ENABLED = True 282 | except Exception as e: print(f"JoyCaptionBetaOne (Shared): LIGER kernel apply failed for {model_hf_id}: {e}"); CACHED_LIGER_ENABLED = False 283 | else: CACHED_LIGER_ENABLED = False 284 | 285 | CACHED_MODEL = model 286 | CACHED_PROCESSOR = processor 287 | CACHED_MODEL_PATH_HF_ID = model_hf_id 288 | CACHED_MODEL_LOCAL_PATH = model_path_local 289 | CACHED_QUANTIZATION_MODE = quantization_mode # Cache the original requested mode 290 | print(f"JoyCaptionBetaOne (Shared): Model {model_hf_id} loaded. Effective quantization: '{current_quant_mode}', LIGER: {CACHED_LIGER_ENABLED}, Device map: '{str(model.hf_device_map)}'.") 291 | except Exception as e: 292 | _free_model_memory_shared() 293 | raise RuntimeError(f"Error loading model {model_hf_id}: {e}") 294 | else: 295 | print(f"JoyCaptionBetaOne (Shared): Using cached model ({CACHED_MODEL_PATH_HF_ID}, Quant: {CACHED_QUANTIZATION_MODE}, LIGER: {CACHED_LIGER_ENABLED}).") 296 | model = CACHED_MODEL 297 | processor = CACHED_PROCESSOR 298 | return model, processor 299 | 300 | class JoyCaptionBetaOne_Full: 301 | CATEGORY = 'TTP_Toolset' 302 | FUNCTION = "caption_image" 303 | RETURN_TYPES = ("STRING",) 304 | RETURN_NAMES = ("caption",) 305 | OUTPUT_IS_LIST = (True,) 306 | 307 | def __init__(self): 308 | self.NODE_NAME = 'JoyCaptionBetaOne_Full' 309 | 310 | @classmethod 311 | def INPUT_TYPES(cls): 312 | caption_type_keys = list(CAPTION_TYPE_MAP_BETA.keys()) 313 | caption_length_list = ["any", "very short", "short", "medium-length", "long", "very long"] + [str(i) for i in range(20, 261, 10)] 314 | quantization_mode_list = ['bf16', 'nf4', 'int8'] 315 | 316 | gpu_devices = [f"cuda:{i}" for i in range(torch.cuda.device_count())] 317 | if not gpu_devices: 318 | gpu_devices = ["cpu"] 319 | 320 | extra_options_inputs = {} 321 | for i, option_text in enumerate(EXTRA_OPTIONS_LIST): 322 | label = option_text.split('.')[0].replace(' ', '_').replace('/', '_').lower() 323 | if len(label) > 30: label = label[:30] 324 | extra_options_inputs[f"extra_option_{i}_{label}"] = ("BOOLEAN", {"default": False, "label": option_text[:100]}) 325 | 326 | inputs = { 327 | "required": { 328 | "image": ("IMAGE",), 329 | "caption_type": (caption_type_keys,), 330 | "caption_length": (caption_length_list,), 331 | "quantization_mode": (quantization_mode_list, {"default": 'bf16'}), 332 | "custom_prompt": ("STRING", {"default": "", "multiline": True, "label": "Custom Prompt (Overrides caption type & extras)"}), 333 | "character_name": ("STRING", {"default": "", "multiline": False, "label": "Person/Character Name (for {name} in extras)"}), 334 | "temperature": ("FLOAT", {"default": 0.6, "min": 0.0, "max": 2.0, "step": 0.05}), 335 | "top_p": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 1.0, "step": 0.01}), 336 | "max_new_tokens": ("INT", {"default": 512, "min": 1, "max": 2048, "step": 1}), 337 | "device": (gpu_devices,), 338 | "cache_model": ("BOOLEAN", {"default": True, "label": "Cache Model in Memory"}), 339 | }, 340 | "optional": {} 341 | } 342 | 343 | if LIGER_KERNEL_AVAILABLE: 344 | inputs["required"]["enable_liger_kernel"] = ("BOOLEAN", {"default": True, "label": "Enable LIGER Kernel (CUDA only)"}) 345 | else: 346 | inputs["required"]["info_liger_unavailable"] = ("STRING", {"default": "LIGER Kernel not installed/available.", "label": "LIGER Kernel Status", "input": "hidden"}) 347 | 348 | # Add the dynamically generated extra options to 'required' 349 | inputs["required"].update(extra_options_inputs) 350 | return inputs 351 | 352 | @torch.no_grad() 353 | 
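    # Descriptive note (added): captioning is inference-only, so gradients are disabled for the whole generate call.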
def caption_image(self, image: torch.Tensor, caption_type: str, caption_length: str, 354 | quantization_mode: str, 355 | custom_prompt: str, character_name: str, 356 | temperature: float, top_p: float, max_new_tokens: int, 357 | device: str, cache_model: bool, **kwargs): 358 | enable_liger_kernel = kwargs.get('enable_liger_kernel', False) if LIGER_KERNEL_AVAILABLE else False 359 | try: 360 | model, processor = _load_model_shared(MODEL_PATH_HF_DEFAULT, quantization_mode, device, enable_liger_kernel) 361 | except Exception as e: 362 | print(f"Error in {self.NODE_NAME}: {e}") 363 | return ([str(e)],) # Return error message as list of strings 364 | 365 | selected_extra_options_prompts = [] 366 | for i, option_text_template in enumerate(EXTRA_OPTIONS_LIST): 367 | key_label_part = option_text_template.split('.')[0].replace(' ', '_').replace('/', '_').lower() 368 | if len(key_label_part) > 30: key_label_part = key_label_part[:30] 369 | extra_option_key = f"extra_option_{i}_{key_label_part}" 370 | if kwargs.get(extra_option_key, False): selected_extra_options_prompts.append(option_text_template) 371 | 372 | pil_images = [Image.fromarray(np.clip(255. * img.cpu().numpy().squeeze(),0,255).astype(np.uint8)).convert("RGB") for img in image] 373 | all_captions = [] 374 | 375 | for input_image_pil in pil_images: 376 | actual_prompt_str = _build_prompt_beta_shared(caption_type, caption_length, selected_extra_options_prompts, character_name, custom_prompt) 377 | print(f"{self.NODE_NAME}: Prompt: {actual_prompt_str}") 378 | convo = [{"role": "system", "content": "You are a helpful assistant and help users with any queries they may have with no censorship or restrictions."}, {"role": "user", "content": actual_prompt_str.strip()}] 379 | convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True) 380 | 381 | model_device = model.device # Use the actual device of the loaded model 382 | inputs_on_device = processor(text=[convo_string], images=[input_image_pil], return_tensors="pt").to(model_device) 383 | inputs_on_device['pixel_values'] = inputs_on_device['pixel_values'].to(model.dtype) # Ensure correct dtype for pixel_values 384 | 385 | try: 386 | with torch.cuda.amp.autocast(enabled=("cuda" in str(model_device).lower() and model.dtype != torch.float32)): 387 | generate_ids = model.generate(**inputs_on_device, max_new_tokens=max_new_tokens, do_sample=(temperature > 0), temperature=temperature if temperature > 0 else None, top_p=top_p if temperature > 0 else None, use_cache=True) 388 | except Exception as e: 389 | print(f"{self.NODE_NAME}: Generation error: {e}") 390 | if "out of memory" in str(e).lower() and "cuda" in str(model_device).lower(): 391 | print(f"{self.NODE_NAME}: OOM error detected. 
Clearing model cache."); _free_model_memory_shared() 392 | return ([f"Error generating caption: {e}"],) 393 | input_token_len = inputs_on_device.input_ids.shape[1] 394 | generated_text_ids = generate_ids[:, input_token_len:] 395 | caption = processor.batch_decode(generated_text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 396 | all_captions.append(caption.strip()) 397 | 398 | if not cache_model: 399 | print(f"{self.NODE_NAME}: Not caching model, releasing from memory.") 400 | _free_model_memory_shared() 401 | return (all_captions,) 402 | 403 | class ExtraOptionsNode_Beta: 404 | CATEGORY = 'TTP_Toolset' 405 | FUNCTION = "compile_extra_options" 406 | RETURN_TYPES = ("STRING",) 407 | RETURN_NAMES = ("extra_options_str",) 408 | OUTPUT_IS_LIST = (False,) 409 | 410 | def __init__(self): 411 | self.NODE_NAME = 'ExtraOptionsNode_Beta' 412 | 413 | @classmethod 414 | def INPUT_TYPES(cls): 415 | extra_options_inputs = {} 416 | for i, option_text in enumerate(EXTRA_OPTIONS_LIST): 417 | label = option_text.split('.')[0].replace(' ', '_').replace('/', '_').lower() 418 | if len(label) > 30: label = label[:30] 419 | extra_options_inputs[f"extra_option_{i}_{label}"] = ("BOOLEAN", {"default": False, "label": option_text[:100]}) 420 | inputs = { 421 | "required": { 422 | "enable_extra_options": ("BOOLEAN", {"default": True, "label": "Enable Extra Options"}), 423 | "character_name": ("STRING", {"default": "", "multiline": False, "label": "Person/Character Name (for {name})"}), 424 | }, 425 | } 426 | inputs["required"].update(extra_options_inputs) 427 | return inputs 428 | 429 | def compile_extra_options(self, enable_extra_options, character_name, **kwargs): 430 | if not enable_extra_options: 431 | return ("",) 432 | 433 | compiled_options = [] 434 | for i, option_text_template in enumerate(EXTRA_OPTIONS_LIST): 435 | key_label_part = option_text_template.split('.')[0].replace(' ', '_').replace('/', '_').lower() 436 | if len(key_label_part) > 30: key_label_part = key_label_part[:30] 437 | extra_option_key = f"extra_option_{i}_{key_label_part}" 438 | if kwargs.get(extra_option_key, False): 439 | if option_text_template == NAME_OPTION_PROMPT: 440 | if character_name: # Only add if name is provided 441 | compiled_options.append(option_text_template.format(name=character_name)) 442 | else: 443 | compiled_options.append(option_text_template) 444 | return (" ".join(compiled_options),) 445 | 446 | class JoyCaptionBetaOne_Simple: 447 | CATEGORY = 'TTP_Toolset' 448 | FUNCTION = "caption_image_simple" 449 | RETURN_TYPES = ("STRING",) 450 | RETURN_NAMES = ("caption",) 451 | OUTPUT_IS_LIST = (True,) 452 | 453 | def __init__(self): 454 | self.NODE_NAME = 'JoyCaptionBetaOne_Simple' 455 | 456 | @classmethod 457 | def INPUT_TYPES(cls): 458 | caption_type_keys = list(CAPTION_TYPE_MAP_BETA.keys()) 459 | caption_length_list = ["any", "very short", "short", "medium-length", "long", "very long"] + [str(i) for i in range(20, 261, 10)] 460 | quantization_mode_list = ['bf16', 'nf4', 'int8'] 461 | 462 | gpu_devices = [f"cuda:{i}" for i in range(torch.cuda.device_count())] 463 | if not gpu_devices: 464 | gpu_devices = ["cpu"] 465 | return { 466 | "required": { 467 | "image": ("IMAGE",), 468 | "caption_type": (caption_type_keys,), 469 | "caption_length": (caption_length_list,), 470 | "quantization_mode": (quantization_mode_list, {"default": 'bf16'}), 471 | "custom_prompt": ("STRING", {"default": "", "multiline": True, "label": "Custom Prompt (Overrides caption type & extras)"}), 472 | "temperature": ("FLOAT", 
{"default": 0.6, "min": 0.0, "max": 2.0, "step": 0.05}), 473 | "top_p": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 1.0, "step": 0.01}), 474 | "max_new_tokens": ("INT", {"default": 512, "min": 1, "max": 2048, "step": 1}), 475 | "device": (gpu_devices,), 476 | "cache_model": ("BOOLEAN", {"default": True, "label": "Cache Model in Memory"}), 477 | }, 478 | "optional": { 479 | "extra_options_str": ("STRING", {"forceInput": True, "default": ""}), 480 | "enable_liger_kernel_opt": ("BOOLEAN", {"default": True, "label": "Enable LIGER Kernel (CUDA only)"}), # Liger as optional for simple 481 | } 482 | } 483 | 484 | @torch.no_grad() 485 | def caption_image_simple(self, image: torch.Tensor, caption_type: str, caption_length: str, 486 | quantization_mode: str, custom_prompt: str, 487 | temperature: float, top_p: float, max_new_tokens: int, 488 | device: str, cache_model: bool, 489 | extra_options_str:str="", enable_liger_kernel_opt:bool=True): 490 | enable_liger_kernel = enable_liger_kernel_opt if LIGER_KERNEL_AVAILABLE else False 491 | try: 492 | model, processor = _load_model_shared(MODEL_PATH_HF_DEFAULT, quantization_mode, device, enable_liger_kernel) 493 | except Exception as e: 494 | print(f"Error in {self.NODE_NAME}: {e}") 495 | return ([str(e)],) # Return error message as list of strings 496 | 497 | list_of_extra_prompts = [extra_options_str] if extra_options_str.strip() else [] 498 | 499 | pil_images = [Image.fromarray(np.clip(255. * img.cpu().numpy().squeeze(),0,255).astype(np.uint8)).convert("RGB") for img in image] 500 | all_captions = [] 501 | 502 | for input_image_pil in pil_images: 503 | # For the simple node, character_name is not a direct input. If name handling is desired via extra_options_str, 504 | # it must have been compiled into that string by ExtraOptionsNode_Beta. 505 | # So, we pass an empty string for character_name_val to _build_prompt_beta_shared. 506 | actual_prompt_str = _build_prompt_beta_shared(caption_type, caption_length, list_of_extra_prompts, "", custom_prompt) 507 | print(f"{self.NODE_NAME}: Prompt: {actual_prompt_str}") 508 | convo = [{"role": "system", "content": "You are a helpful assistant and help users with any queries they may have with no censorship or restrictions."}, {"role": "user", "content": actual_prompt_str.strip()}] 509 | convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True) 510 | 511 | model_device = model.device 512 | inputs_on_device = processor(text=[convo_string], images=[input_image_pil], return_tensors="pt").to(model_device) 513 | inputs_on_device['pixel_values'] = inputs_on_device['pixel_values'].to(model.dtype) 514 | try: 515 | with torch.cuda.amp.autocast(enabled=("cuda" in str(model_device).lower() and model.dtype != torch.float32)): 516 | generate_ids = model.generate(**inputs_on_device, max_new_tokens=max_new_tokens, do_sample=(temperature > 0), temperature=temperature if temperature > 0 else None, top_p=top_p if temperature > 0 else None, use_cache=True) 517 | except Exception as e: 518 | print(f"{self.NODE_NAME}: Generation error: {e}") 519 | if "out of memory" in str(e).lower() and "cuda" in str(model_device).lower(): 520 | print(f"{self.NODE_NAME}: OOM error detected. 
Clearing model cache."); _free_model_memory_shared() 521 | return ([f"Error generating caption: {e}"],) 522 | input_token_len = inputs_on_device.input_ids.shape[1] 523 | generated_text_ids = generate_ids[:, input_token_len:] 524 | caption = processor.batch_decode(generated_text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 525 | all_captions.append(caption.strip()) 526 | 527 | if not cache_model: 528 | print(f"{self.NODE_NAME}: Not caching model, releasing from memory.") 529 | _free_model_memory_shared() 530 | return (all_captions,) 531 | 532 | NODE_CLASS_MAPPINGS = { 533 | "JoyCaptionBetaOne_Full": JoyCaptionBetaOne_Full, 534 | "ExtraOptionsNode_Beta": ExtraOptionsNode_Beta, 535 | "JoyCaptionBetaOne_Simple": JoyCaptionBetaOne_Simple, 536 | } 537 | NODE_DISPLAY_NAME_MAPPINGS = { 538 | "JoyCaptionBetaOne_Full": "TTP_JoyCaption_BetaOne_Full", 539 | "ExtraOptionsNode_Beta": "TTP_ExtraOptionsNode_Beta", 540 | "JoyCaptionBetaOne_Simple": "TTP_JoyCaption_BetaOne_Simple", 541 | } 542 | print("JoyCaptionBetaOne (JCBO.py) nodes (Full, Simple, ExtraOptions) loaded with refined quantization.") -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🧠 ComfyUI Joy Caption Wrapper (Alpha Two & Beta One) 2 | 3 | > 💡 Supports the Alpha Two and the all-new Beta One models 4 | > 🎮 One-click deployment / automatic download (the Beta One model needs no manual placement) 5 | > 📦 GitHub Repo: https://github.com/TTPlanetPig/Comfyui_JC2 6 | > 7 | > Comfyui workflow example: 8 | > https://github.com/TTPlanetPig/Comfyui_JC2/blob/main/example/JoyCaption%20Beta_One_example.png 9 | 10 | --- 11 | 12 | ## 🌟 Introduction (this covers the older Joy Caption Alpha Two and is outdated) 13 | 14 | This is a Joy Caption node wrapped for [ComfyUI](https://github.com/comfyanonymous/ComfyUI): 15 | 16 | - ✅ Supports `joy-caption-alpha-two` as well as the **brand-new** [`joy-caption-beta-one`](https://huggingface.co/spaces/fancyfeast/joy-caption-beta-one) 17 | - 🧊 For cards with limited VRAM, the `nf4` mode is recommended as a balance of speed and quality 18 | - 🔁 Reference implementations: 19 | - [chflame163/ComfyUI_LayerStyle](https://github.com/chflame163/ComfyUI_LayerStyle) 20 | - [John6666/joy-caption-alpha-two-cli-mod](https://huggingface.co/John6666/joy-caption-alpha-two-cli-mod) 21 | 22 | --- 23 | 24 | ## ⚠️ VRAM Requirements 25 | 26 | | Mode | Minimum VRAM | Notes | 27 | |------|-----------|------| 28 | | `bf16` | ≥ 19GB | Recommended for 3090 / 4090 users | 29 | | `nf4` | ≥ 10GB | Recommended when VRAM is below 19GB | 30 | 31 | > Insufficient VRAM will cause ComfyUI to report errors or fail to run. 32 | 33 | --- 34 | 35 | ## 🚀 Installation 36 | 37 | ### ✅ Install the node: 38 | 39 | Option 1: install via the built-in ComfyUI Manager 40 | Option 2: clone manually 41 | 42 | ```bash 43 | cd ./comfyui/custom_nodes 44 | git clone https://github.com/TTPlanetPig/Comfyui_JC2 45 | ``` 46 | 47 | ### ✅ Install dependencies (for `python_embedded`): 48 | 49 | ```bash 50 | cd ./comfyui/custom_nodes/Comfyui_JC2 51 | ../../../python_embeded/python.exe -m pip install -r requirements.txt 52 | ``` 53 | 54 | ### ✅ Install PyTorch (if not already installed) 55 | 56 | ```bash 57 | pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 58 | ``` 59 | 60 | --- 61 | 62 | ## 🚅 Speed-up tip: enable the Liger Kernel 63 | 64 | To **further improve runtime speed**, enabling **liger-kernel** is recommended: 65 | 66 | - 📁 Run `安装liger-kernel.bat` in the node directory 67 | - ✅ Intended for the official ComfyUI one-click package (`python_embeded` build) 68 | 69 | --- 70 | 71 | ## 📥 Model Preparation 72 | 73 | | Model | Download link | Placement path | 74 | |------|-----------|----------| 75 | | `clip_vision` | [siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) | `ComfyUI/models/clip_vision/google--siglip-so400m-patch14-384` | 76 | | `LLM` | [Meta-Llama-3.1-8B-Instruct](https://huggingface.co/unsloth/Meta-Llama-3.1-8B-Instruct) | `ComfyUI/models/LLM/unsloth--Meta-Llama-3.1-8B-Instruct` | 77 | | `Joy Caption LoRA` (alpha two) | [joy-caption-alpha-two](https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-two) | `ComfyUI/models/Joy_caption/cgrkzexw-599808` | 78 | 79 | 📦 Downloading with `huggingface-cli` is recommended to avoid path or naming mistakes. 80 | 81 | Alternatively, download the bundled files from Baidu Netdisk: 82 | 83 | > Link: https://pan.baidu.com/s/1yYRlDKclehSPv-tUVwfVHw Access code: `b84c` 84 | 85 | --- 86 | 87 | ## 🆕 Newly supported: joy-caption-beta-one 🎉 88 | 89 | - ✅ [joy-caption-beta-one](https://huggingface.co/spaces/fancyfeast/joy-caption-beta-one) is now integrated 90 | - ✅ **No manual model download is needed**; the ComfyUI node pulls the HuggingFace assets automatically 91 | - ✅ Keeps the same call logic as Alpha Two, with GPU VRAM detection and mode selection 92 | 93 | --- 94 | 95 | ## 📸 UI & Usage Notes 96 | 97 | ### 🎛 Key parameters: 98 | 99 | 1. Mode selection (`bf16` / `nf4`) 100 | `bf16` is recommended for 3090 / 4090; use `nf4` otherwise 101 | ![bf16 vs nf4](https://github.com/user-attachments/assets/8001e70b-cea3-4971-a8c2-f483a2c4f91c) 102 | 103 | 2. Prompt mode selection (multiple task types) 104 | ![prompt type](https://github.com/user-attachments/assets/110f25f6-ea25-4395-b698-c0ec358940ae) 105 | 106 | 3. 
Caption length selection (longer is not always better) 107 | ![length not always better](https://github.com/user-attachments/assets/05e8cfbe-f983-4c8e-813a-761779d0ba4e) 108 | 109 | 4. Model offload switch (decides whether the model is kept in VRAM) 110 | ![offload setting](https://github.com/user-attachments/assets/804d3326-0f44-4cd2-98c9-56e174e552c1) 111 | 112 | 5. Controls whether the extra options take effect (must be used together with them) 113 | ![extra enable](https://github.com/user-attachments/assets/6cb00a63-a1e6-4502-87ff-b99800d37912) 114 | 115 | 6. Linked options; they only take effect when enabled together 116 | ![combo 1](https://github.com/user-attachments/assets/16d11016-6ff1-4d62-90ca-c3d820af4cd3) 117 | ![combo 2](https://github.com/user-attachments/assets/6fe8dbd4-affe-4753-b10e-aa4120ab5149) 118 | 119 | --- 120 | 121 | ## 🖼 Folder Structure Overview 122 | 123 | Make sure the model files are placed correctly, as shown below: 124 | 125 | ![structure1](https://github.com/user-attachments/assets/4675b67c-38f8-4d6a-9785-607215038337) 126 | ![structure2](https://github.com/user-attachments/assets/9ae0a410-539e-49c5-a1b4-4434da02dc28) 127 | ![structure3](https://github.com/user-attachments/assets/2d17e8d2-42af-4040-9cf9-019eb25464e0) 128 | ![structure4](https://github.com/user-attachments/assets/aeba0145-81c7-4c86-a31c-bbb9c317cad8) 129 | 130 | --- 131 | 132 | ## ⭐ Star History 133 | 134 | 135 | 136 | 137 | 138 | Star History Chart 139 | 140 | 141 | 142 | --- 143 | 144 | 🧪 Testing and issue reports are welcome. Enjoy! 145 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .JC2 import NODE_CLASS_MAPPINGS as JC2_NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS as JC2_NODE_DISPLAY_NAME_MAPPINGS 2 | from .JCBO import NODE_CLASS_MAPPINGS as JCBO_NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS as JCBO_NODE_DISPLAY_NAME_MAPPINGS 3 | 4 | NODE_CLASS_MAPPINGS = {**JC2_NODE_CLASS_MAPPINGS, **JCBO_NODE_CLASS_MAPPINGS} 5 | NODE_DISPLAY_NAME_MAPPINGS = {**JC2_NODE_DISPLAY_NAME_MAPPINGS, **JCBO_NODE_DISPLAY_NAME_MAPPINGS} 6 | 7 | __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"] -------------------------------------------------------------------------------- /example/JoyCaption Beta_One_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "1bbd7edc-7ede-4b03-829f-f8da37fb8ad2", 3 | "revision": 0, 4 | "last_node_id": 8, 5 | "last_link_id": 6, 6 | "nodes": [ 7 | { 8 | "id": 1, 9 | "type": "JoyCaptionBetaOne_Full", 10 | "pos": [ 11 | -2323.431884765625, 12 | 683.6752319335938 13 | ], 14 | "size": [ 15 | 446.90234375, 16 | 976 17 | ], 18 | "flags": {}, 19 | "order": 3, 20 | "mode": 0, 21 | "inputs": [ 22 | { 23 | "name": "image", 24 | "type": "IMAGE", 25 | "link": 1 26 | } 27 | ], 28 | "outputs": [ 29 | { 30 | "name": "caption", 31 | "shape": 6, 32 | "type": "STRING", 33 | "links": [ 34 | 2 35 | ] 36 | } 37 | ], 38 | "properties": { 39 | "cnr_id": "comfyui_jc2", 40 | "ver": "712b89398d0a7b005235c8d36f333e86a0beea1b", 41 | "Node name for S&R": "JoyCaptionBetaOne_Full", 42 | "widget_ue_connectable": {} 43 | }, 44 | "widgets_values": [ 45 | "Descriptive", 46 | "any", 47 | "nf4", 48 | "", 49 | "", 50 | 0.6, 51 | 0.9, 52 | 512, 53 | "cuda:0", 54 | true, 55 | true, 56 | false, 57 | false, 58 | false, 59 | false, 60 | false, 61 | false, 62 | false, 63 | false, 64 | false, 65 | false, 66 | false, 67 | false, 68 | false, 69 | false, 70 | false, 71 | false, 72 | false, 73 | false, 74 | false, 75 | false, 76 | false, 77 | false, 78 | false, 79 | false, 80 | false, 81 | false, 82 | false 83 | ] 84 | }, 85 | { 86 | "id": 2, 87 | "type": "LoadImage", 88 | "pos": [ 89 | -2817.219970703125, 90 | 
755.2782592773438 91 | ], 92 | "size": [ 93 | 274.080078125, 94 | 314 95 | ], 96 | "flags": {}, 97 | "order": 0, 98 | "mode": 0, 99 | "inputs": [], 100 | "outputs": [ 101 | { 102 | "name": "IMAGE", 103 | "type": "IMAGE", 104 | "links": [ 105 | 1 106 | ] 107 | }, 108 | { 109 | "name": "MASK", 110 | "type": "MASK", 111 | "links": null 112 | } 113 | ], 114 | "properties": { 115 | "cnr_id": "comfy-core", 116 | "ver": "0.3.34", 117 | "Node name for S&R": "LoadImage", 118 | "widget_ue_connectable": {} 119 | }, 120 | "widgets_values": [ 121 | "ComfyUI_33602_.png", 122 | "image" 123 | ] 124 | }, 125 | { 126 | "id": 5, 127 | "type": "LoadImage", 128 | "pos": [ 129 | -1350.49267578125, 130 | 652.2406616210938 131 | ], 132 | "size": [ 133 | 274.080078125, 134 | 314 135 | ], 136 | "flags": {}, 137 | "order": 1, 138 | "mode": 0, 139 | "inputs": [], 140 | "outputs": [ 141 | { 142 | "name": "IMAGE", 143 | "type": "IMAGE", 144 | "links": [ 145 | 3 146 | ] 147 | }, 148 | { 149 | "name": "MASK", 150 | "type": "MASK", 151 | "links": null 152 | } 153 | ], 154 | "properties": { 155 | "cnr_id": "comfy-core", 156 | "ver": "0.3.34", 157 | "Node name for S&R": "LoadImage", 158 | "widget_ue_connectable": {} 159 | }, 160 | "widgets_values": [ 161 | "ComfyUI_33602_.png", 162 | "image" 163 | ] 164 | }, 165 | { 166 | "id": 3, 167 | "type": "ShowText|pysssss", 168 | "pos": [ 169 | -1677.072998046875, 170 | 731.69580078125 171 | ], 172 | "size": [ 173 | 221.86968994140625, 174 | 33.35912322998047 175 | ], 176 | "flags": {}, 177 | "order": 5, 178 | "mode": 0, 179 | "inputs": [ 180 | { 181 | "name": "text", 182 | "type": "STRING", 183 | "link": 2 184 | } 185 | ], 186 | "outputs": [ 187 | { 188 | "name": "STRING", 189 | "shape": 6, 190 | "type": "STRING", 191 | "links": null 192 | } 193 | ], 194 | "properties": { 195 | "cnr_id": "comfyui-custom-scripts", 196 | "ver": "aac13aa7ce35b07d43633c3bbe654a38c00d74f5", 197 | "Node name for S&R": "ShowText|pysssss", 198 | "widget_ue_connectable": {} 199 | }, 200 | "widgets_values": [] 201 | }, 202 | { 203 | "id": 8, 204 | "type": "ShowText|pysssss", 205 | "pos": [ 206 | -292.3197937011719, 207 | 702.7871704101562 208 | ], 209 | "size": [ 210 | 221.86968994140625, 211 | 33.35912322998047 212 | ], 213 | "flags": {}, 214 | "order": 6, 215 | "mode": 0, 216 | "inputs": [ 217 | { 218 | "name": "text", 219 | "type": "STRING", 220 | "link": 6 221 | } 222 | ], 223 | "outputs": [ 224 | { 225 | "name": "STRING", 226 | "shape": 6, 227 | "type": "STRING", 228 | "links": null 229 | } 230 | ], 231 | "properties": { 232 | "cnr_id": "comfyui-custom-scripts", 233 | "ver": "aac13aa7ce35b07d43633c3bbe654a38c00d74f5", 234 | "Node name for S&R": "ShowText|pysssss", 235 | "widget_ue_connectable": {} 236 | }, 237 | "widgets_values": [] 238 | }, 239 | { 240 | "id": 7, 241 | "type": "ExtraOptionsNode_Beta", 242 | "pos": [ 243 | -857.732666015625, 244 | 1049.2509765625 245 | ], 246 | "size": [ 247 | 446.90234375, 248 | 730 249 | ], 250 | "flags": {}, 251 | "order": 2, 252 | "mode": 0, 253 | "inputs": [], 254 | "outputs": [ 255 | { 256 | "name": "extra_options_str", 257 | "type": "STRING", 258 | "links": [ 259 | 5 260 | ] 261 | } 262 | ], 263 | "properties": { 264 | "cnr_id": "comfyui_jc2", 265 | "ver": "712b89398d0a7b005235c8d36f333e86a0beea1b", 266 | "Node name for S&R": "ExtraOptionsNode_Beta", 267 | "widget_ue_connectable": {} 268 | }, 269 | "widgets_values": [ 270 | true, 271 | "", 272 | false, 273 | false, 274 | false, 275 | false, 276 | false, 277 | false, 278 | false, 279 | false, 280 | false, 281 | 
false, 282 | false, 283 | false, 284 | false, 285 | false, 286 | false, 287 | false, 288 | false, 289 | false, 290 | false, 291 | false, 292 | false, 293 | false, 294 | false, 295 | false, 296 | false, 297 | false, 298 | false 299 | ] 300 | }, 301 | { 302 | "id": 4, 303 | "type": "JoyCaptionBetaOne_Simple", 304 | "pos": [ 305 | -841.1376342773438, 306 | 660.6044921875 307 | ], 308 | "size": [ 309 | 400, 310 | 324 311 | ], 312 | "flags": {}, 313 | "order": 4, 314 | "mode": 0, 315 | "inputs": [ 316 | { 317 | "name": "image", 318 | "type": "IMAGE", 319 | "link": 3 320 | }, 321 | { 322 | "name": "extra_options_str", 323 | "shape": 7, 324 | "type": "STRING", 325 | "link": 5 326 | } 327 | ], 328 | "outputs": [ 329 | { 330 | "name": "caption", 331 | "shape": 6, 332 | "type": "STRING", 333 | "links": [ 334 | 6 335 | ] 336 | } 337 | ], 338 | "properties": { 339 | "cnr_id": "comfyui_jc2", 340 | "ver": "712b89398d0a7b005235c8d36f333e86a0beea1b", 341 | "Node name for S&R": "JoyCaptionBetaOne_Simple", 342 | "widget_ue_connectable": {} 343 | }, 344 | "widgets_values": [ 345 | "Descriptive", 346 | "any", 347 | "bf16", 348 | "", 349 | 0.6, 350 | 0.9, 351 | 512, 352 | "cuda:0", 353 | true, 354 | true 355 | ] 356 | } 357 | ], 358 | "links": [ 359 | [ 360 | 1, 361 | 2, 362 | 0, 363 | 1, 364 | 0, 365 | "IMAGE" 366 | ], 367 | [ 368 | 2, 369 | 1, 370 | 0, 371 | 3, 372 | 0, 373 | "STRING" 374 | ], 375 | [ 376 | 3, 377 | 5, 378 | 0, 379 | 4, 380 | 0, 381 | "IMAGE" 382 | ], 383 | [ 384 | 5, 385 | 7, 386 | 0, 387 | 4, 388 | 1, 389 | "STRING" 390 | ], 391 | [ 392 | 6, 393 | 4, 394 | 0, 395 | 8, 396 | 0, 397 | "STRING" 398 | ] 399 | ], 400 | "groups": [], 401 | "config": {}, 402 | "extra": { 403 | "ue_links": [], 404 | "frontendVersion": "1.18.9", 405 | "VHS_latentpreview": false, 406 | "VHS_latentpreviewrate": 0, 407 | "VHS_MetadataImage": true, 408 | "VHS_KeepIntermediate": true 409 | }, 410 | "version": 0.4 411 | } -------------------------------------------------------------------------------- /example/JoyCaption Beta_One_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TTPlanetPig/Comfyui_JC2/69a7d6830807d65595da8848f1169a261c5dff5e/example/JoyCaption Beta_One_example.png -------------------------------------------------------------------------------- /extra_option.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "replace_character_names", 4 | "prompt": "If there is a person/character in the image you must refer to them as {name}." 5 | }, 6 | { 7 | "name": "exclude_unchangeable_attributes", 8 | "prompt": "Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style)." 9 | }, 10 | { 11 | "name": "include_lighting_details", 12 | "prompt": "Include information about lighting." 13 | }, 14 | { 15 | "name": "include_camera_angle", 16 | "prompt": "Include information about camera angle." 17 | }, 18 | { 19 | "name": "mention_watermark_presence", 20 | "prompt": "Include information about whether there is a watermark or not." 21 | }, 22 | { 23 | "name": "note_jpeg_artifacts", 24 | "prompt": "Include information about whether there are JPEG artifacts or not." 25 | }, 26 | { 27 | "name": "include_exif_data", 28 | "prompt": "If it is a photo you MUST include information about what camera was likely used and details such as aperture, shutter speed, ISO, etc." 
29 | }, 30 | { 31 | "name": "exclude_sexual_content", 32 | "prompt": "Do NOT include anything sexual; keep it PG." 33 | }, 34 | { 35 | "name": "exclude_image_resolution", 36 | "prompt": "Do NOT mention the image's resolution." 37 | }, 38 | { 39 | "name": "describe_aesthetic_quality", 40 | "prompt": "You MUST include information about the subjective aesthetic quality of the image from low to very high." 41 | }, 42 | { 43 | "name": "include_composition_style", 44 | "prompt": "Include information on the image's composition style, such as leading lines, rule of thirds, or symmetry." 45 | }, 46 | { 47 | "name": "exclude_text_elements", 48 | "prompt": "Do NOT mention any text that is in the image." 49 | }, 50 | { 51 | "name": "specify_depth_of_field", 52 | "prompt": "Specify the depth of field and whether the background is in focus or blurred." 53 | }, 54 | { 55 | "name": "specify_lighting_sources", 56 | "prompt": "If applicable, mention the likely use of artificial or natural lighting sources." 57 | }, 58 | { 59 | "name": "avoid_ambiguous_language", 60 | "prompt": "Do NOT use any ambiguous language." 61 | }, 62 | { 63 | "name": "classify_image_as_sfw_nsfw", 64 | "prompt": "Include whether the image is sfw, suggestive, or nsfw." 65 | }, 66 | { 67 | "name": "describe_key_elements_only", 68 | "prompt": "ONLY describe the most important elements of the image." 69 | } 70 | ] 71 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "comfyui_jc2" 3 | description = "Wrapped Joy Caption Alpha Two node for ComfyUI: easy to use; for GPUs with less than 19GB of VRAM, please use nf4 for a better balance of speed and quality." 4 | version = "1.0.8" 5 | license = {file = "LICENSE"} 6 | dependencies = ["huggingface_hub>=0.23.4,<=0.25", "accelerate", "transformers>=4.43.2,<=4.45.1", "sentencepiece", "peft==0.12.0", "bitsandbytes"] 7 | 8 | [project.urls] 9 | Repository = "https://github.com/TTPlanetPig/Comfyui_JC2" 10 | # Used by Comfy Registry https://comfyregistry.org 11 | 12 | [tool.comfy] 13 | PublisherId = "ttplanet" 14 | DisplayName = "Comfyui_JC2" 15 | Icon = "🪐" 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | peft==0.12.0 3 | bitsandbytes 4 | huggingface_hub==0.30.1 5 | transformers>=4.51.0 6 | sentencepiece 7 | triton-windows<=3.2.0 8 | -------------------------------------------------------------------------------- /安装liger-kernel.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | REM Set the Python path 4 | set PYTHON_PATH=..\..\..\python_embeded\python.exe 5 | 6 | echo Installing liger-kernel without dependencies... 7 | %PYTHON_PATH% -m pip install liger-kernel==0.5.9 --no-deps 8 | 9 | echo Installing remaining dependencies from requirements.txt... 10 | %PYTHON_PATH% -m pip install -r requirements.txt 11 | 12 | echo All done. 13 | pause 14 | --------------------------------------------------------------------------------