├── .github
│   └── workflows
│       └── publish.yml
├── JC2.py
├── JCBO.py
├── LICENSE
├── README.md
├── __init__.py
├── example
│   ├── JoyCaption Beta_One_example.json
│   └── JoyCaption Beta_One_example.png
├── extra_option.json
├── pyproject.toml
├── requirements.txt
└── 安装liger-kernel.bat
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish to Comfy registry
2 | on:
3 |   workflow_dispatch:
4 |   push:
5 |     branches:
6 |       - main
7 |       - master
8 |     paths:
9 |       - "pyproject.toml"
10 | 
11 | jobs:
12 |   publish-node:
13 |     name: Publish Custom Node to registry
14 |     runs-on: ubuntu-latest
15 |     # If this is a forked repository, skip the workflow.
16 |     if: github.event.repository.fork == false
17 |     steps:
18 |       - name: Check out code
19 |         uses: actions/checkout@v4
20 |       - name: Publish Custom Node
21 |         uses: Comfy-Org/publish-node-action@main
22 |         with:
23 |           ## Add your own personal access token to your GitHub repository secrets and reference it here.
24 |           personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }}
25 | 
--------------------------------------------------------------------------------
/JC2.py:
--------------------------------------------------------------------------------
1 | # Based on https://huggingface.co/John6666/joy-caption-alpha-two-cli-mod and https://github.com/chflame163/ComfyUI_LayerStyle
2 | 
3 | import os
4 | import sys
5 | import torch
6 | from torch import nn
7 | from typing import List, Union
8 | from PIL import Image
9 | import torchvision.transforms.functional as TVF
10 | from torchvision.transforms import ToPILImage
11 | import numpy as np
12 | import folder_paths
13 | import json
14 | import logging
15 | from transformers import AutoProcessor, AutoModelForCausalLM
16 | from huggingface_hub import snapshot_download
17 | import shutil
18 | import gc
19 | import comfy.model_management as mm
20 | import comfy.sd
21 | 
22 | # Define the Joy2_Model class
23 | class Joy2_Model:
24 |     def __init__(self, clip_processor, clip_model, tokenizer, text_model, image_adapter):
25 |         self.clip_processor = clip_processor
26 |         self.clip_model = clip_model
27 |         self.tokenizer = tokenizer
28 |         self.text_model = text_model
29 |         self.image_adapter = image_adapter
30 | 
31 | # Define the ImageAdapter class
32 | class ImageAdapter(nn.Module):
33 |     def __init__(self, input_features: int, output_features: int, ln1: bool, pos_emb: bool, num_image_tokens: int,
34 |                  deep_extract: bool):
35 |         super().__init__()
36 |         self.deep_extract = deep_extract
37 | 
38 |         if self.deep_extract:
39 |             input_features = input_features * 5
40 | 
41 |         self.linear1 = nn.Linear(input_features, output_features)
42 |         self.activation = nn.GELU()
43 |         self.linear2 = nn.Linear(output_features, output_features)
44 |         self.ln1 = nn.Identity() if not ln1 else nn.LayerNorm(input_features)
45 |         self.pos_emb = None if not pos_emb else nn.Parameter(torch.zeros(num_image_tokens, input_features))
46 | 
47 |         # Other tokens (<|image_start|>, <|image_end|>, <|eot_id|>)
48 |         self.other_tokens = nn.Embedding(3, output_features)
49 |         self.other_tokens.weight.data.normal_(mean=0.0, std=0.02)  # Matches HF's implementation of LLaMA
50 | 
51 |     def forward(self, vision_outputs: torch.Tensor):
52 |         if self.deep_extract:
53 |             x = torch.cat((
54 |                 vision_outputs[-2],
55 |                 vision_outputs[3],
56 |                 vision_outputs[7],
57 |                 vision_outputs[13],
58 |                 vision_outputs[20],
59 |             ), dim=-1)
60 |             assert len(x.shape) == 3, f"Expected 3, got {len(x.shape)}"  # batch, tokens, features
61 |             assert x.shape[-1] == 
vision_outputs[-2].shape[-1] * 5, f"Expected {vision_outputs[-2].shape[-1] * 5}, got {x.shape[-1]}" 62 | else: 63 | x = vision_outputs[-2] 64 | 65 | x = self.ln1(x) 66 | 67 | if self.pos_emb is not None: 68 | assert x.shape[-2:] == self.pos_emb.shape, f"Expected {self.pos_emb.shape}, got {x.shape[-2:]}" 69 | x = x + self.pos_emb 70 | 71 | x = self.linear1(x) 72 | x = self.activation(x) 73 | x = self.linear2(x) 74 | 75 | other_tokens = self.other_tokens( 76 | torch.tensor([0, 1], device=self.other_tokens.weight.device).expand(x.shape[0], -1)) 77 | assert other_tokens.shape == ( 78 | x.shape[0], 2, x.shape[2]), f"Expected {(x.shape[0], 2, x.shape[2])}, got {other_tokens.shape}" 79 | x = torch.cat((other_tokens[:, 0:1], x, other_tokens[:, 1:2]), dim=1) 80 | 81 | return x 82 | 83 | def get_eot_embedding(self): 84 | return self.other_tokens(torch.tensor([2], device=self.other_tokens.weight.device)).squeeze(0) 85 | 86 | # 设置全局设备变量 87 | current_device = "cuda:0" 88 | 89 | def get_torch_device_patched(): 90 | global current_device 91 | if ( 92 | not torch.cuda.is_available() 93 | or comfy.model_management.cpu_state == comfy.model_management.CPUState.CPU 94 | ): 95 | return torch.device("cpu") 96 | 97 | return torch.device(current_device) 98 | 99 | # 覆盖ComfyUI的设备获取函数 100 | comfy.model_management.get_torch_device = get_torch_device_patched 101 | 102 | def load_models(model_path, dtype, device="cuda:0", device_map=None): 103 | global current_device 104 | current_device = device # 设置当前设备 105 | from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM 106 | from peft import PeftModel 107 | 108 | JC_lora = "text_model" 109 | use_lora = True if JC_lora != "none" else False 110 | CLIP_PATH = os.path.join(folder_paths.models_dir, "clip_vision", "google--siglip-so400m-patch14-384") 111 | CHECKPOINT_PATH = os.path.join(folder_paths.models_dir, "Joy_caption", "cgrkzexw-599808") 112 | LORA_PATH = os.path.join(CHECKPOINT_PATH, "text_model") 113 | 114 | if os.path.exists(CLIP_PATH): 115 | print("Start to load existing VLM") 116 | else: 117 | print("VLM not found locally. 
Downloading google/siglip-so400m-patch14-384...") 118 | try: 119 | snapshot_download( 120 | repo_id="google/siglip-so400m-patch14-384", 121 | local_dir=os.path.join(folder_paths.models_dir, "clip_vision", "cache--google--siglip-so400m-patch14-384"), 122 | local_dir_use_symlinks=False, 123 | resume_download=True 124 | ) 125 | shutil.move(os.path.join(folder_paths.models_dir, "clip_vision", "cache--google--siglip-so400m-patch14-384"), CLIP_PATH) 126 | print(f"VLM has been downloaded to {CLIP_PATH}") 127 | except Exception as e: 128 | print(f"Error downloading CLIP model: {e}") 129 | raise 130 | 131 | try: 132 | if dtype == "nf4": 133 | from transformers import BitsAndBytesConfig 134 | nf4_config = BitsAndBytesConfig( 135 | load_in_4bit=True, 136 | bnb_4bit_quant_type="nf4", 137 | bnb_4bit_use_double_quant=True, 138 | bnb_4bit_compute_dtype=torch.bfloat16 139 | ) 140 | print("Loading in NF4") 141 | print("Loading CLIP") 142 | clip_processor = AutoProcessor.from_pretrained(CLIP_PATH) 143 | clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model 144 | 145 | print("Loading VLM's custom vision model") 146 | checkpoint = torch.load(os.path.join(CHECKPOINT_PATH, "clip_model.pt"), map_location=current_device, weights_only=False) 147 | checkpoint = {k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()} 148 | clip_model.load_state_dict(checkpoint) 149 | del checkpoint 150 | clip_model.eval().requires_grad_(False).to(current_device) 151 | 152 | print("Loading tokenizer") 153 | tokenizer = AutoTokenizer.from_pretrained(os.path.join(CHECKPOINT_PATH, "text_model"), use_fast=True) 154 | assert isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)), f"Tokenizer is of type {type(tokenizer)}" 155 | 156 | print(f"Loading LLM: {model_path}") 157 | text_model = AutoModelForCausalLM.from_pretrained( 158 | model_path, 159 | quantization_config=nf4_config, 160 | device_map=current_device, # 统一使用指定设备 161 | torch_dtype=torch.bfloat16 162 | ).eval() 163 | 164 | if use_lora and os.path.exists(LORA_PATH): 165 | print("Loading VLM's custom text model") 166 | text_model = PeftModel.from_pretrained( 167 | model=text_model, 168 | model_id=LORA_PATH, 169 | device_map=current_device, # 统一使用指定设备 170 | quantization_config=nf4_config 171 | ) 172 | text_model = text_model.merge_and_unload(safe_merge=True) 173 | else: 174 | print("VLM's custom text model isn't loaded") 175 | 176 | print("Loading image adapter") 177 | image_adapter = ImageAdapter( 178 | clip_model.config.hidden_size, 179 | text_model.config.hidden_size, 180 | False, False, 38, 181 | False 182 | ).eval().to("cpu") 183 | image_adapter.load_state_dict( 184 | torch.load(os.path.join(CHECKPOINT_PATH, "image_adapter.pt"), map_location=current_device, weights_only=False) 185 | ) 186 | image_adapter.eval().to(current_device) 187 | else: # bf16 188 | print("Loading in bfloat16") 189 | print("Loading CLIP") 190 | clip_processor = AutoProcessor.from_pretrained(CLIP_PATH) 191 | clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model 192 | if os.path.exists(os.path.join(CHECKPOINT_PATH, "clip_model.pt")): 193 | print("Loading VLM's custom vision model") 194 | checkpoint = torch.load(os.path.join(CHECKPOINT_PATH, "clip_model.pt"), map_location=current_device, weights_only=False) 195 | checkpoint = {k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()} 196 | clip_model.load_state_dict(checkpoint) 197 | del checkpoint 198 | clip_model.eval().requires_grad_(False).to(current_device) 199 | 200 | print("Loading tokenizer") 
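            # Descriptive note (added): the tokenizer is loaded from the checkpoint's text_model subfolder (the same path used for LORA_PATH), not from the base LLM repository.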
201 | tokenizer = AutoTokenizer.from_pretrained(os.path.join(CHECKPOINT_PATH, "text_model"), use_fast=True) 202 | assert isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)), f"Tokenizer is of type {type(tokenizer)}" 203 | 204 | print(f"Loading LLM: {model_path}") 205 | text_model = AutoModelForCausalLM.from_pretrained( 206 | model_path, 207 | device_map=current_device, # 统一使用指定设备 208 | torch_dtype=torch.bfloat16 209 | ).eval() 210 | 211 | if use_lora and os.path.exists(LORA_PATH): 212 | print("Loading VLM's custom text model") 213 | text_model = PeftModel.from_pretrained( 214 | model=text_model, 215 | model_id=LORA_PATH, 216 | device_map=current_device # 统一使用指定设备 217 | ) 218 | text_model = text_model.merge_and_unload(safe_merge=True) 219 | else: 220 | print("VLM's custom text model isn't loaded") 221 | 222 | print("Loading image adapter") 223 | image_adapter = ImageAdapter( 224 | clip_model.config.hidden_size, 225 | text_model.config.hidden_size, 226 | False, False, 38, 227 | False 228 | ).eval().to(current_device) 229 | image_adapter.load_state_dict( 230 | torch.load(os.path.join(CHECKPOINT_PATH, "image_adapter.pt"), map_location=current_device, weights_only=False) 231 | ) 232 | except Exception as e: 233 | print(f"Error loading models: {e}", ) 234 | finally: 235 | pass # 可以在这里添加内存释放逻辑(如果需要) 236 | 237 | return Joy2_Model(clip_processor, clip_model, tokenizer, text_model, image_adapter) 238 | 239 | # Define the stream_chat function 240 | @torch.inference_mode() 241 | def stream_chat(input_images: List[Image.Image], caption_type: str, caption_length: Union[str, int], 242 | extra_options: list[str], name_input: str, custom_prompt: str, 243 | max_new_tokens: int, top_p: float, temperature: float, batch_size: int, model: Joy2_Model, current_device=str): 244 | 245 | # 确定 chat_device 246 | if 'cuda' in current_device: 247 | chat_device = 'cuda' 248 | elif 'cpu' in current_device: 249 | chat_device = 'cpu' 250 | else: 251 | raise ValueError(f"Unsupported device type: {current_device}") 252 | 253 | 254 | CAPTION_TYPE_MAP = { 255 | "Descriptive": [ 256 | "Write a descriptive caption for this image in a formal tone.", 257 | "Write a descriptive caption for this image in a formal tone within {word_count} words.", 258 | "Write a {length} descriptive caption for this image in a formal tone.", 259 | ], 260 | "Descriptive (Informal)": [ 261 | "Write a descriptive caption for this image in a casual tone.", 262 | "Write a descriptive caption for this image in a casual tone within {word_count} words.", 263 | "Write a {length} descriptive caption for this image in a casual tone.", 264 | ], 265 | "Training Prompt": [ 266 | "Write a stable diffusion prompt for this image.", 267 | "Write a stable diffusion prompt for this image within {word_count} words.", 268 | "Write a {length} stable diffusion prompt for this image.", 269 | ], 270 | "MidJourney": [ 271 | "Write a MidJourney prompt for this image.", 272 | "Write a MidJourney prompt for this image within {word_count} words.", 273 | "Write a {length} MidJourney prompt for this image.", 274 | ], 275 | "Booru tag list": [ 276 | "Write a list of Booru tags for this image.", 277 | "Write a list of Booru tags for this image within {word_count} words.", 278 | "Write a {length} list of Booru tags for this image.", 279 | ], 280 | "Booru-like tag list": [ 281 | "Write a list of Booru-like tags for this image.", 282 | "Write a list of Booru-like tags for this image within {word_count} words.", 283 | "Write a {length} list of Booru-like tags for this 
image.", 284 | ], 285 | "Art Critic": [ 286 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.", 287 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it within {word_count} words.", 288 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it {length}.", 289 | ], 290 | "Product Listing": [ 291 | "Write a caption for this image as though it were a product listing.", 292 | "Write a caption for this image as though it were a product listing. Keep it under {word_count} words.", 293 | "Write a {length} caption for this image as though it were a product listing.", 294 | ], 295 | "Social Media Post": [ 296 | "Write a caption for this image as if it were being used for a social media post.", 297 | "Write a caption for this image as if it were being used for a social media post. Limit the caption to {word_count} words.", 298 | "Write a {length} caption for this image as if it were being used for a social media post.", 299 | ], 300 | } 301 | 302 | all_captions = [] 303 | 304 | # 'any' means no length specified 305 | length = None if caption_length == "any" else caption_length 306 | 307 | if isinstance(length, str): 308 | try: 309 | length = int(length) 310 | except ValueError: 311 | pass 312 | 313 | # Build prompt 314 | if length is None: 315 | map_idx = 0 316 | elif isinstance(length, int): 317 | map_idx = 1 318 | elif isinstance(length, str): 319 | map_idx = 2 320 | else: 321 | raise ValueError(f"Invalid caption length: {length}") 322 | 323 | prompt_str = CAPTION_TYPE_MAP[caption_type][map_idx] 324 | 325 | # Add extra options 326 | if len(extra_options) > 0: 327 | prompt_str += " " + " ".join(extra_options) 328 | 329 | # Add name, length, word_count 330 | prompt_str = prompt_str.format(name=name_input, length=caption_length, word_count=caption_length) 331 | 332 | if custom_prompt.strip() != "": 333 | prompt_str = custom_prompt.strip() 334 | 335 | # For debugging 336 | print(f"Prompt: {prompt_str}") 337 | 338 | for i in range(0, len(input_images), batch_size): 339 | batch = input_images[i:i + batch_size] 340 | 341 | for input_image in batch: 342 | try: 343 | # Preprocess image 344 | image = input_image.resize((384, 384), Image.LANCZOS) 345 | pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0 346 | pixel_values = TVF.normalize(pixel_values, [0.5], [0.5]) 347 | pixel_values = pixel_values.to(chat_device) 348 | except ValueError as e: 349 | print(f"Error processing image: {e}") 350 | print("Skipping this image and continuing...") 351 | continue 352 | 353 | # Embed image 354 | with torch.amp.autocast_mode.autocast(chat_device, enabled=True): 355 | vision_outputs = model.clip_model(pixel_values=pixel_values, output_hidden_states=True) 356 | image_features = vision_outputs.hidden_states 357 | embedded_images = model.image_adapter(image_features).to(chat_device) 358 | 359 | # Build the conversation 360 | convo = [ 361 | { 362 | "role": "system", 363 | "content": "You are a helpful image captioner.", 364 | }, 365 | { 366 | "role": "user", 367 | "content": prompt_str, 368 | }, 369 | ] 370 | 371 | # Format the conversation 372 | if hasattr(model.tokenizer, 'apply_chat_template'): 373 | convo_string = 
model.tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True) 374 | else: 375 | # Fallback if apply_chat_template is not available 376 | convo_string = "<|eot_id|>\n" 377 | for message in convo: 378 | if message['role'] == 'system': 379 | convo_string += f"<|system|>{message['content']}<|endoftext|>\n" 380 | elif message['role'] == 'user': 381 | convo_string += f"<|user|>{message['content']}<|endoftext|>\n" 382 | else: 383 | convo_string += f"{message['content']}<|endoftext|>\n" 384 | convo_string += "<|eot_id|>" 385 | 386 | assert isinstance(convo_string, str) 387 | 388 | # Tokenize the conversation 389 | convo_tokens = model.tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, 390 | truncation=False) 391 | prompt_tokens = model.tokenizer.encode(prompt_str, return_tensors="pt", add_special_tokens=False, 392 | truncation=False) 393 | assert isinstance(convo_tokens, torch.Tensor) and isinstance(prompt_tokens, torch.Tensor) 394 | convo_tokens = convo_tokens.squeeze(0) 395 | prompt_tokens = prompt_tokens.squeeze(0) 396 | 397 | # Calculate where to inject the image 398 | eot_id_indices = (convo_tokens == model.tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[ 399 | 0].tolist() 400 | assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}" 401 | 402 | preamble_len = eot_id_indices[1] - prompt_tokens.shape[0] 403 | 404 | # Embed the tokens 405 | convo_embeds = model.text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to(current_device)) 406 | 407 | # Construct the input 408 | input_embeds = torch.cat([ 409 | convo_embeds[:, :preamble_len], 410 | embedded_images.to(dtype=convo_embeds.dtype), 411 | convo_embeds[:, preamble_len:], 412 | ], dim=1).to(chat_device) 413 | 414 | input_ids = torch.cat([ 415 | convo_tokens[:preamble_len].unsqueeze(0), 416 | torch.zeros((1, embedded_images.shape[1]), dtype=torch.long), 417 | convo_tokens[preamble_len:].unsqueeze(0), 418 | ], dim=1).to(chat_device) 419 | attention_mask = torch.ones_like(input_ids) 420 | 421 | generate_ids = model.text_model.generate(input_ids=input_ids, inputs_embeds=input_embeds, 422 | attention_mask=attention_mask, do_sample=True, 423 | suppress_tokens=None, max_new_tokens=max_new_tokens, top_p=top_p, 424 | temperature=temperature) 425 | 426 | # Trim off the prompt 427 | generate_ids = generate_ids[:, input_ids.shape[1]:] 428 | if generate_ids[0][-1] == model.tokenizer.eos_token_id or generate_ids[0][-1] == model.tokenizer.convert_tokens_to_ids( 429 | "<|eot_id|>"): 430 | generate_ids = generate_ids[:, :-1] 431 | 432 | caption = model.tokenizer.batch_decode(generate_ids, skip_special_tokens=False, 433 | clean_up_tokenization_spaces=False)[0] 434 | all_captions.append(caption.strip()) 435 | 436 | return all_captions 437 | 438 | def free_memory(): 439 | import gc 440 | gc.collect() 441 | if torch.cuda.is_available(): 442 | torch.cuda.empty_cache() 443 | torch.cuda.ipc_collect() 444 | 445 | 446 | def cleanGPU(): 447 | gc.collect() 448 | mm.unload_all_models() 449 | mm.soft_empty_cache() 450 | 451 | 452 | class JoyCaption2: 453 | 454 | CATEGORY = 'TTP_Toolset' 455 | FUNCTION = "joycaption2" 456 | RETURN_TYPES = ("STRING",) 457 | RETURN_NAMES = ("text",) 458 | OUTPUT_IS_LIST = (True,) 459 | 460 | def __init__(self): 461 | self.NODE_NAME = 'JoyCaption2' 462 | self.previous_model = None 463 | 464 | @classmethod 465 | def INPUT_TYPES(cls): 466 | llm_model_list = ["unsloth/Meta-Llama-3.1-8B-Instruct", 
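                          # Descriptive note (added): both entries are Llama-3.1-8B instruct-style checkpoints; the selected one is downloaded into models/LLM on first use.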
"Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"] 467 | dtype_list = ['nf4', 'bf16'] 468 | caption_type_list = [ 469 | "Descriptive", "Descriptive (Informal)", "Training Prompt", "MidJourney", 470 | "Booru tag list", "Booru-like tag list", "Art Critic", "Product Listing", 471 | "Social Media Post" 472 | ] 473 | caption_length_list = ["any", "very short", "short", "medium-length", "long", "very long"] + [str(i) for i in range(20, 261, 5)] 474 | 475 | # 获取extra_option.json路径 476 | base_dir = os.path.dirname(os.path.abspath(__file__)) 477 | extra_option_file = os.path.join(base_dir, "extra_option.json") 478 | 479 | # 加载extra_options_list 480 | extra_options_list = {} 481 | if os.path.isfile(extra_option_file): 482 | try: 483 | with open(extra_option_file, "r", encoding='utf-8') as f: 484 | json_content = json.load(f) 485 | for item in json_content: 486 | option_name = item.get("name") 487 | if option_name: 488 | extra_options_list[option_name] = ("BOOLEAN", {"default": False}) 489 | except Exception as e: 490 | print(f"Error loading extra_option.json: {e}") 491 | else: 492 | print(f"extra_option.json not found at {extra_option_file}. No extra options will be available.") 493 | 494 | # 获取可用的GPU设备列表 495 | gpu_devices = [f"cuda:{i}" for i in range(torch.cuda.device_count())] 496 | if not gpu_devices: 497 | gpu_devices = ["cpu"] # 如果没有GPU可用,则仅提供CPU选项 498 | 499 | # 定义额外的输入字段 500 | return { 501 | "required": { 502 | "image": ("IMAGE",), 503 | "llm_model": (llm_model_list,), 504 | "dtype": (dtype_list,), 505 | "caption_type": (caption_type_list,), 506 | "caption_length": (caption_length_list,), 507 | "user_prompt": ("STRING", {"default": "", "multiline": True}), 508 | "max_new_tokens": ("INT", {"default": 260, "min": 8, "max": 4096, "step": 1}), 509 | "top_p": ("FLOAT", {"default": 0.8, "min": 0, "max": 1, "step": 0.01}), 510 | "temperature": ("FLOAT", {"default": 0.6, "min": 0, "max": 1, "step": 0.01}), 511 | "cache_model": ("BOOLEAN", {"default": False}), 512 | "device": (gpu_devices,), # 新增GPU设备选择 513 | "enable_extra_options": ("BOOLEAN", {"default": True, "label": "启用额外选项"}), # 新增开关 514 | **extra_options_list, 515 | "character_name": ("STRING", {"default": "", "multiline": False}), 516 | }, 517 | } 518 | 519 | def joycaption2( 520 | self, image, llm_model, dtype, caption_type, caption_length, 521 | user_prompt, max_new_tokens, top_p, temperature, cache_model, device, 522 | enable_extra_options, character_name, **extra_options 523 | ): 524 | ret_text = [] 525 | comfy_model_dir = os.path.join(folder_paths.models_dir, "LLM") 526 | print(f"comfy_model_dir: {comfy_model_dir}") 527 | if not os.path.exists(comfy_model_dir): 528 | os.mkdir(comfy_model_dir) 529 | 530 | sanitized_model_name = llm_model.replace('/', '--') 531 | llm_model_path = os.path.join(comfy_model_dir, sanitized_model_name) 532 | llm_model_path_cache = os.path.join(comfy_model_dir, "cache--" + sanitized_model_name) 533 | 534 | # 使用用户选择的设备 535 | selected_device = device if torch.cuda.is_available() else 'cpu' 536 | model_loaded_on = selected_device # 跟踪模型加载在哪个设备上 537 | 538 | try: 539 | if os.path.exists(llm_model_path): 540 | print(f"Start to load existing model on {selected_device}") 541 | else: 542 | print(f"Model not found locally. 
Downloading {llm_model}...") 543 | snapshot_download( 544 | repo_id=llm_model, 545 | local_dir=llm_model_path_cache, 546 | local_dir_use_symlinks=False, 547 | resume_download=True 548 | ) 549 | shutil.move(llm_model_path_cache, llm_model_path) 550 | print(f"Model downloaded to {llm_model_path}...") 551 | 552 | if self.previous_model is None: 553 | try: 554 | # 尝试加载模型 555 | free_vram_bytes = mm.get_free_memory() 556 | free_vram_gb = free_vram_bytes / (1024 ** 3) 557 | print(f"Free VRAM: {free_vram_gb:.2f} GB") 558 | if dtype == 'nf4' and free_vram_gb < 10: 559 | print("Free VRAM is less than 10GB when loading 'nf4' model. Performing VRAM cleanup.") 560 | cleanGPU() 561 | elif dtype == 'bf16' and free_vram_gb < 20: 562 | print("Free VRAM is less than 20GB when loading 'bf16' model. Performing VRAM cleanup.") 563 | cleanGPU() 564 | # 统一使用选择的设备 565 | model = load_models( 566 | model_path=llm_model_path, dtype=dtype, device=selected_device 567 | ) 568 | except RuntimeError: 569 | print("An error occurred while loading the model. Please check your configuration.") 570 | else: 571 | model = self.previous_model 572 | 573 | except Exception as e: 574 | print(f"Error loading model: {e}") 575 | return None 576 | 577 | print(f"Model loaded on {model_loaded_on}") 578 | 579 | extra_prompts = [] 580 | 581 | if enable_extra_options: 582 | base_dir = os.path.dirname(os.path.abspath(__file__)) 583 | extra_option_file = os.path.join(base_dir, "extra_option.json") 584 | if os.path.isfile(extra_option_file): 585 | try: 586 | with open(extra_option_file, "r", encoding='utf-8') as f: 587 | json_content = json.load(f) 588 | for item in json_content: 589 | name = item.get("name") 590 | prompt = item.get("prompt") 591 | if name and prompt: 592 | if extra_options.get(name): 593 | # 如果 prompt 中包含 {name},则替换为 character_name 594 | if "{name}" in prompt: 595 | prompt = prompt.replace("{name}", character_name) 596 | extra_prompts.append(prompt) 597 | except Exception as e: 598 | print(f"Error reading extra_option.json: {e}") 599 | else: 600 | print(f"extra_option.json not found at {extra_option_file} during processing.") 601 | 602 | extra = [] 603 | if enable_extra_options: 604 | extra = extra_prompts 605 | print(f"Extra options enabled: {extra_prompts}") 606 | else: 607 | print("No extra options provided.") 608 | 609 | processed_images = [ 610 | Image.fromarray( 611 | np.clip(255.0 * img.unsqueeze(0).cpu().numpy().squeeze(), 0, 255).astype(np.uint8) 612 | ).convert('RGB') 613 | for img in image 614 | ] 615 | 616 | try: 617 | captions = stream_chat( 618 | processed_images, caption_type, caption_length, 619 | extra, "", user_prompt, 620 | max_new_tokens, top_p, temperature, len(processed_images), 621 | model, device # 确保传递正确的设备 622 | ) 623 | ret_text.extend(captions) 624 | except Exception as e: 625 | print(f"Error during stream_chat: {e}") 626 | return None 627 | 628 | if cache_model: 629 | self.previous_model = model 630 | else: 631 | self.previous_model = None 632 | del model 633 | free_memory() 634 | 635 | return (ret_text,) 636 | 637 | 638 | class ExtraOptionsNode: 639 | CATEGORY = 'TTP_Toolset' 640 | FUNCTION = "extra_options" 641 | RETURN_TYPES = ("STRING",) # 改为返回单一字符串 642 | RETURN_NAMES = ("extra_options_str",) 643 | OUTPUT_IS_LIST = (False,) # 单一字符串输出 644 | 645 | def __init__(self): 646 | self.NODE_NAME = 'ExtraOptionsNode' 647 | 648 | @classmethod 649 | def INPUT_TYPES(cls): 650 | # 获取 extra_option.json 的路径并加载选项 651 | base_dir = os.path.dirname(os.path.abspath(__file__)) 652 | extra_option_file = 
os.path.join(base_dir, "extra_option.json") 653 | extra_options_list = {} 654 | 655 | if os.path.isfile(extra_option_file): 656 | try: 657 | with open(extra_option_file, "r", encoding='utf-8') as f: 658 | json_content = json.load(f) 659 | for item in json_content: 660 | option_name = item.get("name") 661 | if option_name: 662 | # 定义每个额外选项为布尔输入 663 | extra_options_list[option_name] = ("BOOLEAN", {"default": False}) 664 | except Exception as e: 665 | print(f"Error loading extra_option.json: {e}") 666 | else: 667 | print(f"extra_option.json not found at {extra_option_file}. No extra options will be available.") 668 | 669 | # 定义输入字段,包括开关和 character_name 670 | return { 671 | "required": { 672 | "enable_extra_options": ("BOOLEAN", {"default": True, "label": "启用额外选项"}), # 开关 673 | **extra_options_list, # 动态加载的额外选项 674 | "character_name": ("STRING", {"default": "", "multiline": False}), # 移动 character_name 675 | }, 676 | } 677 | 678 | def extra_options(self, enable_extra_options, character_name, **extra_options): 679 | """ 680 | 处理额外选项并返回已启用的提示列表。 681 | 如果启用了替换角色名称选项,并提供了 character_name,则进行替换。 682 | """ 683 | extra_prompts = [] 684 | if enable_extra_options: 685 | base_dir = os.path.dirname(os.path.abspath(__file__)) 686 | extra_option_file = os.path.join(base_dir, "extra_option.json") 687 | if os.path.isfile(extra_option_file): 688 | try: 689 | with open(extra_option_file, "r", encoding='utf-8') as f: 690 | json_content = json.load(f) 691 | for item in json_content: 692 | name = item.get("name") 693 | prompt = item.get("prompt") 694 | if name and prompt: 695 | if extra_options.get(name): 696 | # 如果 prompt 中包含 {name},则替换为 character_name 697 | if "{name}" in prompt: 698 | prompt = prompt.replace("{name}", character_name) 699 | extra_prompts.append(prompt) 700 | except Exception as e: 701 | print(f"Error reading extra_option.json: {e}") 702 | else: 703 | print(f"extra_option.json not found at {extra_option_file} during processing.") 704 | 705 | # 将所有启用的提示拼接成一个字符串 706 | return (" ".join(extra_prompts),) # 返回一个单一的合并字符串 707 | 708 | class JoyCaption2_simple: 709 | 710 | CATEGORY = 'TTP_Toolset' 711 | FUNCTION = "joycaption2_simple" 712 | RETURN_TYPES = ("STRING",) 713 | RETURN_NAMES = ("text",) 714 | OUTPUT_IS_LIST = (True,) 715 | 716 | def __init__(self): 717 | self.NODE_NAME = 'JoyCaption2_simple' 718 | self.previous_model = None 719 | 720 | @classmethod 721 | def INPUT_TYPES(cls): 722 | llm_model_list = [ 723 | "unsloth/Meta-Llama-3.1-8B-Instruct", 724 | "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2" 725 | ] 726 | dtype_list = ['nf4', 'bf16'] 727 | caption_type_list = [ 728 | "Descriptive", "Descriptive (Informal)", "Training Prompt", "MidJourney", 729 | "Booru tag list", "Booru-like tag list", "Art Critic", "Product Listing", 730 | "Social Media Post" 731 | ] 732 | caption_length_list = [ 733 | "any", "very short", "short", "medium-length", "long", "very long" 734 | ] + [str(i) for i in range(20, 261, 5)] 735 | 736 | # 获取可用的GPU设备列表 737 | gpu_devices = [f"cuda:{i}" for i in range(torch.cuda.device_count())] 738 | if not gpu_devices: 739 | gpu_devices = ["cpu"] # 如果没有GPU可用,则仅提供CPU选项 740 | 741 | # 定义额外的输入字段 742 | return { 743 | "required": { 744 | "image": ("IMAGE",), 745 | "llm_model": (llm_model_list,), 746 | "dtype": (dtype_list,), 747 | "caption_type": (caption_type_list,), 748 | "caption_length": (caption_length_list,), 749 | "user_prompt": ("STRING", {"default": "", "multiline": True}), 750 | "max_new_tokens": ("INT", {"default": 260, "min": 8, "max": 4096, "step": 1}), 751 | "top_p": ("FLOAT", 
{"default": 0.8, "min": 0, "max": 1, "step": 0.01}), 752 | "temperature": ("FLOAT", {"default": 0.6, "min": 0, "max": 1, "step": 0.01}), 753 | "cache_model": ("BOOLEAN", {"default": False}), 754 | "device": (gpu_devices,), # 新增GPU设备选择 755 | }, 756 | "optional": { 757 | "extra_options_node": ("STRING",{"forceInput": True}), # 接收来自 ExtraOptionsNode 的单一字符串 758 | }, 759 | } 760 | 761 | def joycaption2_simple( 762 | self, image, llm_model, dtype, caption_type, caption_length, 763 | user_prompt, max_new_tokens, top_p, temperature, cache_model, device, 764 | extra_options_node=None # 设置默认值为 None 765 | ): 766 | ret_text = [] 767 | comfy_model_dir = os.path.join(folder_paths.models_dir, "LLM") 768 | print(f"comfy_model_dir: {comfy_model_dir}") 769 | if not os.path.exists(comfy_model_dir): 770 | os.mkdir(comfy_model_dir) 771 | 772 | sanitized_model_name = llm_model.replace('/', '--') 773 | llm_model_path = os.path.join(comfy_model_dir, sanitized_model_name) 774 | llm_model_path_cache = os.path.join(comfy_model_dir, "cache--" + sanitized_model_name) 775 | 776 | # 使用用户选择的设备 777 | selected_device = device if torch.cuda.is_available() else 'cpu' 778 | model_loaded_on = selected_device # 跟踪模型加载在哪个设备上 779 | 780 | try: 781 | if os.path.exists(llm_model_path): 782 | print(f"Start to load existing model on {selected_device}") 783 | else: 784 | print(f"Model not found locally. Downloading {llm_model}...") 785 | snapshot_download( 786 | repo_id=llm_model, 787 | local_dir=llm_model_path_cache, 788 | local_dir_use_symlinks=False, 789 | resume_download=True 790 | ) 791 | shutil.move(llm_model_path_cache, llm_model_path) 792 | print(f"Model downloaded to {llm_model_path}...") 793 | 794 | if self.previous_model is None: 795 | try: 796 | # 尝试加载模型 797 | free_vram_bytes = mm.get_free_memory() 798 | free_vram_gb = free_vram_bytes / (1024 ** 3) 799 | print(f"Free VRAM: {free_vram_gb:.2f} GB") 800 | if dtype == 'nf4' and free_vram_gb < 10: 801 | print("Free VRAM is less than 10GB when loading 'nf4' model. Performing VRAM cleanup.") 802 | cleanGPU() 803 | elif dtype == 'bf16' and free_vram_gb < 20: 804 | print("Free VRAM is less than 20GB when loading 'bf16' model. Performing VRAM cleanup.") 805 | cleanGPU() 806 | # 统一使用选择的设备 807 | 808 | model = load_models( 809 | model_path=llm_model_path, dtype=dtype, device=selected_device) 810 | except RuntimeError: 811 | print("An error occurred while loading the model. 
Please check your configuration.") 812 | else: 813 | model = self.previous_model 814 | 815 | except Exception as e: 816 | print(f"Error loading model: {e}") 817 | return None 818 | 819 | print(f"Model loaded on {model_loaded_on}") 820 | 821 | # 接收来自 ExtraOptionsNode 的额外提示 822 | extra = [] 823 | if extra_options_node and extra_options_node.strip(): 824 | extra = [extra_options_node] # 将单一字符串包装成列表 825 | print(f"Extra options enabled: {extra_options_node}") 826 | else: 827 | print("No extra options provided.") 828 | 829 | # 处理图像 830 | processed_images = [ 831 | Image.fromarray( 832 | np.clip(255.0 * img.unsqueeze(0).cpu().numpy().squeeze(), 0, 255).astype(np.uint8) 833 | ).convert('RGB') 834 | for img in image 835 | ] 836 | 837 | try: 838 | captions = stream_chat( 839 | processed_images, caption_type, caption_length, 840 | extra, "", user_prompt, 841 | max_new_tokens, top_p, temperature, len(processed_images), 842 | model, device # 确保传递正确的设备 843 | ) 844 | ret_text.extend(captions) 845 | except Exception as e: 846 | print(f"Error during stream_chat: {e}") 847 | return ("Error generating captions.",) 848 | 849 | if cache_model: 850 | self.previous_model = model 851 | else: 852 | self.previous_model = None 853 | del model 854 | free_memory() 855 | 856 | return (ret_text,) 857 | 858 | 859 | # Register the node 860 | NODE_CLASS_MAPPINGS = { 861 | "JoyCaption2": JoyCaption2, 862 | "ExtraOptionsNode": ExtraOptionsNode, 863 | "JoyCaption2_simple": JoyCaption2_simple, 864 | } 865 | 866 | NODE_DISPLAY_NAME_MAPPINGS = { 867 | "JoyCaption2": "TTP_JoyCaption2_Full", 868 | "ExtraOptionsNode": "TTP_ExtraOptionsNode", 869 | "JoyCaption2_simple": "TTP_JoyCaption2_simple", 870 | } 871 | -------------------------------------------------------------------------------- /JCBO.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | from torch import nn 5 | from typing import List, Union, Generator 6 | from PIL import Image 7 | import torchvision.transforms.functional as TVF 8 | import numpy as np 9 | import folder_paths 10 | import json 11 | import logging 12 | from transformers import LlavaForConditionalGeneration, TextIteratorStreamer, AutoProcessor, BitsAndBytesConfig 13 | from huggingface_hub import snapshot_download 14 | import shutil 15 | import gc 16 | import comfy.model_management as mm 17 | import comfy.sd 18 | from threading import Thread 19 | 20 | # LIGER Kernel import attempt 21 | try: 22 | from liger_kernel.transformers import apply_liger_kernel_to_llama 23 | LIGER_KERNEL_AVAILABLE = True 24 | except ImportError: 25 | LIGER_KERNEL_AVAILABLE = False 26 | print("LIGER kernel not found. The option to enable it will be disabled.") 27 | 28 | # Global model cache 29 | CACHED_MODEL = None 30 | CACHED_PROCESSOR = None 31 | CACHED_MODEL_PATH_HF_ID = None # Stores the HuggingFace model ID used for the cache 32 | CACHED_LIGER_ENABLED = None 33 | CACHED_QUANTIZATION_MODE = None 34 | CACHED_MODEL_LOCAL_PATH = None # Stores the local disk path of the cached model 35 | 36 | QUANTIZATION_CONFIGS = { 37 | "nf4": { 38 | "load_in_4bit": True, 39 | "bnb_4bit_quant_type": "nf4", 40 | "bnb_4bit_compute_dtype": torch.bfloat16, 41 | "bnb_4bit_use_double_quant": True, 42 | }, 43 | "int8": { 44 | "load_in_8bit": True, 45 | }, 46 | "bf16": { # bf16 is not a quantization config, but a torch_dtype. Handled separately. 
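        # Descriptive note (added): intentionally left empty — the bf16 path sets torch_dtype=torch.bfloat16 directly in _load_model_shared instead of building a BitsAndBytesConfig.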
47 | }, 48 | } 49 | LLM_SKIP_MODULES = ["vision_tower", "multi_modal_projector"] 50 | MODEL_PATH_HF_DEFAULT = "fancyfeast/llama-joycaption-beta-one-hf-llava" 51 | 52 | # Define the CAPTION_TYPE_MAP for JoyCaptionBetaOne 53 | CAPTION_TYPE_MAP_BETA = { 54 | "Descriptive": [ 55 | "Write a detailed description for this image.", 56 | "Write a detailed description for this image in {word_count} words or less.", 57 | "Write a {length} detailed description for this image.", 58 | ], 59 | "Descriptive (Casual)": [ 60 | "Write a descriptive caption for this image in a casual tone.", 61 | "Write a descriptive caption for this image in a casual tone within {word_count} words.", 62 | "Write a {length} descriptive caption for this image in a casual tone.", 63 | ], 64 | "Straightforward": [ 65 | '''Write a straightforward caption for this image. Begin with the main subject and medium. Mention pivotal elements—people, objects, scenery—using confident, definite language. Focus on concrete details like color, shape, texture, and spatial relationships. Show how elements interact. Omit mood and speculative wording. If text is present, quote it exactly. Note any watermarks, signatures, or compression artifacts. Never mention what\'s absent, resolution, or unobservable details. Vary your sentence structure and keep the description concise, without starting with "This image is…" or similar phrasing.''', 66 | '''Write a straightforward caption for this image within {word_count} words. Begin with the main subject and medium. Mention pivotal elements—people, objects, scenery—using confident, definite language. Focus on concrete details like color, shape, texture, and spatial relationships. Show how elements interact. Omit mood and speculative wording. If text is present, quote it exactly. Note any watermarks, signatures, or compression artifacts. Never mention what\'s absent, resolution, or unobservable details. Vary your sentence structure and keep the description concise, without starting with "This image is…" or similar phrasing.''', 67 | '''Write a {length} straightforward caption for this image. Begin with the main subject and medium. Mention pivotal elements—people, objects, scenery—using confident, definite language. Focus on concrete details like color, shape, texture, and spatial relationships. Show how elements interact. Omit mood and speculative wording. If text is present, quote it exactly. Note any watermarks, signatures, or compression artifacts. Never mention what\'s absent, resolution, or unobservable details. Vary your sentence structure and keep the description concise, without starting with "This image is…" or similar phrasing.''', 68 | ], 69 | "Stable Diffusion Prompt": [ 70 | "Output a stable diffusion prompt that is indistinguishable from a real stable diffusion prompt.", 71 | "Output a stable diffusion prompt that is indistinguishable from a real stable diffusion prompt. {word_count} words or less.", 72 | "Output a {length} stable diffusion prompt that is indistinguishable from a real stable diffusion prompt.", 73 | ], 74 | "MidJourney": [ 75 | "Write a MidJourney prompt for this image.", 76 | "Write a MidJourney prompt for this image within {word_count} words.", 77 | "Write a {length} MidJourney prompt for this image.", 78 | ], 79 | "Danbooru tag list": [ 80 | "Generate only comma-separated Danbooru tags (lowercase_underscores). Strict order: `artist:`, `copyright:`, `character:`, `meta:`, then general tags. 
Include counts (1girl), appearance, clothing, accessories, pose, expression, actions, background. Use precise Danbooru syntax. No extra text.", 81 | "Generate only comma-separated Danbooru tags (lowercase_underscores). Strict order: `artist:`, `copyright:`, `character:`, `meta:`, then general tags. Include counts (1girl), appearance, clothing, accessories, pose, expression, actions, background. Use precise Danbooru syntax. No extra text. {word_count} words or less.", 82 | "Generate only comma-separated Danbooru tags (lowercase_underscores). Strict order: `artist:`, `copyright:`, `character:`, `meta:`, then general tags. Include counts (1girl), appearance, clothing, accessories, pose, expression, actions, background. Use precise Danbooru syntax. No extra text. {length} length.", 83 | ], 84 | "e621 tag list": [ 85 | "Write a comma-separated list of e621 tags in alphabetical order for this image. Start with the artist, copyright, character, species, meta, and lore tags (if any), prefixed by \'artist:\', \'copyright:\', \'character:\', \'species:\', \'meta:\', and \'lore:\'. Then all the general tags.", 86 | "Write a comma-separated list of e621 tags in alphabetical order for this image. Start with the artist, copyright, character, species, meta, and lore tags (if any), prefixed by \'artist:\', \'copyright:\', \'character:\', \'species:\', \'meta:\', and \'lore:\'. Then all the general tags. Keep it under {word_count} words.", 87 | "Write a {length} comma-separated list of e621 tags in alphabetical order for this image. Start with the artist, copyright, character, species, meta, and lore tags (if any), prefixed by \'artist:\', \'copyright:\', \'character:\', \'species:\', \'meta:\', and \'lore:\'. Then all the general tags.", 88 | ], 89 | "Rule34 tag list": [ 90 | "Write a comma-separated list of rule34 tags in alphabetical order for this image. Start with the artist, copyright, character, and meta tags (if any), prefixed by \'artist:\', \'copyright:\', \'character:\', and \'meta:\'. Then all the general tags.", 91 | "Write a comma-separated list of rule34 tags in alphabetical order for this image. Start with the artist, copyright, character, and meta tags (if any), prefixed by \'artist:\', \'copyright:\', \'character:\', and \'meta:\'. Then all the general tags. Keep it under {word_count} words.", 92 | "Write a {length} comma-separated list of rule34 tags in alphabetical order for this image. Start with the artist, copyright, character, and meta tags (if any), prefixed by \'artist:\', \'copyright:\', \'character:\', and \'meta:\'. Then all the general tags.", 93 | ], 94 | "Booru-like tag list": [ 95 | "Write a list of Booru-like tags for this image.", 96 | "Write a list of Booru-like tags for this image within {word_count} words.", 97 | "Write a {length} list of Booru-like tags for this image.", 98 | ], 99 | "Art Critic": [ 100 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.", 101 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it within {word_count} words.", 102 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. 
Keep it {length}.", 103 | ], 104 | "Product Listing": [ 105 | "Write a caption for this image as though it were a product listing.", 106 | "Write a caption for this image as though it were a product listing. Keep it under {word_count} words.", 107 | "Write a {length} caption for this image as though it were a product listing.", 108 | ], 109 | "Social Media Post": [ 110 | "Write a caption for this image as if it were being used for a social media post.", 111 | "Write a caption for this image as if it were being used for a social media post. Limit the caption to {word_count} words.", 112 | "Write a {length} caption for this image as if it were being used for a social media post.", 113 | ], 114 | } 115 | 116 | NAME_OPTION_PROMPT = "If there is a person/character in the image you must refer to them as {name}." 117 | 118 | # Extra options for the node 119 | EXTRA_OPTIONS_LIST = [ 120 | NAME_OPTION_PROMPT, 121 | "Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style).", 122 | "Include information about lighting.", 123 | "Include information about camera angle.", 124 | "Include information about whether there is a watermark or not.", 125 | "Include information about whether there are JPEG artifacts or not.", 126 | "If it is a photo you MUST include information about what camera was likely used and details such as aperture, shutter speed, ISO, etc.", 127 | "Do NOT include anything sexual; keep it PG.", 128 | "Do NOT mention the image\'s resolution.", 129 | "You MUST include information about the subjective aesthetic quality of the image from low to very high.", 130 | "Include information on the image\'s composition style, such as leading lines, rule of thirds, or symmetry.", 131 | "Do NOT mention any text that is in the image.", 132 | "Specify the depth of field and whether the background is in focus or blurred.", 133 | "If applicable, mention the likely use of artificial or natural lighting sources.", 134 | "Do NOT use any ambiguous language.", 135 | "Include whether the image is sfw, suggestive, or nsfw.", 136 | "ONLY describe the most important elements of the image.", 137 | "If it is a work of art, do not include the artist\'s name or the title of the work.", 138 | "Identify the image orientation (portrait, landscape, or square) and aspect ratio if obvious.", 139 | """Use vulgar slang and profanity, such as (but not limited to) "fucking," "slut," "cock," etc.""", 140 | "Do NOT use polite euphemisms—lean into blunt, casual phrasing.", 141 | "Include information about the ages of any people/characters when applicable.", 142 | "Mention whether the image depicts an extreme close-up, close-up, medium close-up, medium shot, cowboy shot, medium wide shot, wide shot, or extreme wide shot.", 143 | "Do not mention the mood/feeling/etc of the image.", 144 | "Explicitly specify the vantage height (eye-level, low-angle worm's-eye, bird's-eye, drone, rooftop, etc.).", 145 | "If there is a watermark, you must mention it.", 146 | """Your response will be used by a text-to-image model, so avoid useless meta phrases like "This image shows…", "You are looking at...", etc.""", 147 | ] 148 | 149 | def _build_prompt_beta_shared(caption_type: str, caption_length: str, list_of_extra_prompts: list[str], character_name_val: str, custom_prompt_override: str) -> str: 150 | if custom_prompt_override and custom_prompt_override.strip(): 151 | if character_name_val and "{name}" in custom_prompt_override: 152 | 
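        # Descriptive note (added): a non-empty custom prompt overrides caption_type and the extra options entirely; only the {name} placeholder is still substituted.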
return custom_prompt_override.replace("{name}", character_name_val) 153 | return custom_prompt_override 154 | 155 | if caption_length == "any": map_idx = 0 156 | elif isinstance(caption_length, str) and caption_length.isdigit(): map_idx = 1 157 | else: map_idx = 2 158 | 159 | base_prompt = CAPTION_TYPE_MAP_BETA[caption_type][map_idx] 160 | final_extra_prompts = [] 161 | 162 | for extra_prompt_template in list_of_extra_prompts: 163 | if extra_prompt_template == NAME_OPTION_PROMPT: 164 | if character_name_val: # Only include and format name if provided 165 | final_extra_prompts.append(extra_prompt_template.format(name=character_name_val)) 166 | # If character_name_val is empty, this prompt is skipped entirely. 167 | else: 168 | final_extra_prompts.append(extra_prompt_template) 169 | 170 | full_prompt_parts = [base_prompt] 171 | if final_extra_prompts: 172 | full_prompt_parts.extend(final_extra_prompts) 173 | 174 | # Format the base prompt part; extra prompts are already formatted or don't need it here 175 | # This assumes {word_count} and {length} are only in the base_prompt template 176 | full_prompt_parts[0] = full_prompt_parts[0].format(length=caption_length, word_count=caption_length) 177 | 178 | return " ".join(full_prompt_parts) 179 | 180 | def _free_model_memory_shared(): 181 | global CACHED_MODEL, CACHED_PROCESSOR, CACHED_MODEL_PATH_HF_ID, CACHED_LIGER_ENABLED, CACHED_QUANTIZATION_MODE, CACHED_MODEL_LOCAL_PATH 182 | CACHED_MODEL = None 183 | CACHED_PROCESSOR = None 184 | CACHED_MODEL_PATH_HF_ID = None 185 | CACHED_LIGER_ENABLED = None 186 | CACHED_QUANTIZATION_MODE = None 187 | CACHED_MODEL_LOCAL_PATH = None 188 | gc.collect() 189 | if torch.cuda.is_available(): 190 | torch.cuda.empty_cache() 191 | torch.cuda.ipc_collect() 192 | print("JoyCaptionBetaOne (Shared): Model and processor released from cache.") 193 | 194 | def _clean_gpu_shared(): 195 | gc.collect() 196 | mm.unload_all_models() 197 | mm.soft_empty_cache() 198 | print("JoyCaptionBetaOne (Shared): ComfyUI models unloaded and cache soft-emptied.") 199 | 200 | def _load_model_shared(model_hf_id: str, quantization_mode: str, target_device: str, enable_liger: bool): 201 | global CACHED_MODEL, CACHED_PROCESSOR, CACHED_MODEL_PATH_HF_ID, CACHED_LIGER_ENABLED, CACHED_QUANTIZATION_MODE, CACHED_MODEL_LOCAL_PATH 202 | 203 | model_dir_base = os.path.join(folder_paths.models_dir, "LLM_llava") 204 | if not os.path.exists(model_dir_base): os.makedirs(model_dir_base, exist_ok=True) 205 | sanitized_model_repo_name = model_hf_id.replace('/', '--') 206 | model_path_local = os.path.join(model_dir_base, sanitized_model_repo_name) 207 | model_path_cache_tmp = os.path.join(model_dir_base, "cache--" + sanitized_model_repo_name) 208 | 209 | effective_device = target_device if torch.cuda.is_available() else "cpu" 210 | print(f"JoyCaptionBetaOne (Shared): Using effective device: {effective_device} for model {model_hf_id}") 211 | 212 | reload_needed = False 213 | if CACHED_MODEL is None or \ 214 | CACHED_PROCESSOR is None or \ 215 | CACHED_MODEL_PATH_HF_ID != model_hf_id or \ 216 | CACHED_MODEL_LOCAL_PATH != model_path_local or \ 217 | CACHED_QUANTIZATION_MODE != quantization_mode or \ 218 | (LIGER_KERNEL_AVAILABLE and CACHED_LIGER_ENABLED != enable_liger): 219 | reload_needed = True 220 | if CACHED_MODEL is not None: 221 | print(f"JoyCaptionBetaOne (Shared): Config changed (Prev: {CACHED_MODEL_PATH_HF_ID}, {CACHED_QUANTIZATION_MODE}, Liger: {CACHED_LIGER_ENABLED}. New: {model_hf_id}, {quantization_mode}, Liger: {enable_liger}). 
Reloading.") 222 | _free_model_memory_shared() 223 | 224 | if reload_needed: 225 | print(f"JoyCaptionBetaOne (Shared): Loading model from {model_path_local} (HF: {model_hf_id})") 226 | if not os.path.exists(model_path_local): 227 | print(f"JoyCaptionBetaOne (Shared): Downloading {model_hf_id}...") 228 | try: 229 | snapshot_download(repo_id=model_hf_id, local_dir=model_path_cache_tmp, local_dir_use_symlinks=False, resume_download=True) 230 | shutil.move(model_path_cache_tmp, model_path_local) 231 | print(f"JoyCaptionBetaOne (Shared): Model {model_hf_id} downloaded to {model_path_local}") 232 | except Exception as e: raise RuntimeError(f"Error downloading model {model_hf_id}: {e}") 233 | try: 234 | print(f"JoyCaptionBetaOne (Shared): Loading processor from {model_path_local}...") 235 | processor = AutoProcessor.from_pretrained(model_path_local) 236 | print(f"JoyCaptionBetaOne (Shared): Loading model {model_hf_id} with quantization '{quantization_mode}'...") 237 | 238 | model_load_kwargs = {} 239 | final_torch_dtype = None 240 | final_device_map = "auto" # Default to auto, will be overridden if needed 241 | 242 | current_quant_mode = quantization_mode 243 | if "cuda" not in effective_device and current_quant_mode in ["nf4", "int8"]: 244 | print(f"JoyCaptionBetaOne (Shared): Quantization '{current_quant_mode}' needs CUDA. Falling back to bf16 for CPU for {model_hf_id}.") 245 | current_quant_mode = "bf16" 246 | 247 | if current_quant_mode == "bf16": 248 | final_torch_dtype = torch.bfloat16 249 | final_device_map = None if "cpu" in effective_device else effective_device 250 | elif current_quant_mode in ["nf4", "int8"]: 251 | # This block is for CUDA devices as per the check above 252 | bnb_config_params = QUANTIZATION_CONFIGS[current_quant_mode].copy() 253 | bnb_config_params["llm_int8_skip_modules"] = LLM_SKIP_MODULES 254 | q_config = BitsAndBytesConfig(**bnb_config_params) 255 | model_load_kwargs["quantization_config"] = q_config 256 | final_torch_dtype = torch.bfloat16 if current_quant_mode == "nf4" else "auto" 257 | final_device_map = effective_device # MODIFICATION: Use the user-selected CUDA device 258 | print(f"JoyCaptionBetaOne (Shared): Preparing {current_quant_mode} for specific device: {effective_device}") 259 | else: # Fallback / fp32 (though not an explicit option) 260 | final_torch_dtype = torch.float32 if "cpu" in effective_device else torch.bfloat16 261 | final_device_map = None if "cpu" in effective_device else effective_device 262 | 263 | model_load_kwargs["torch_dtype"] = final_torch_dtype 264 | model_load_kwargs["device_map"] = final_device_map 265 | 266 | if "cuda" in effective_device: 267 | free_vram_gb = mm.get_free_memory(effective_device) / (1024**3) 268 | # Basic VRAM check - can be more sophisticated 269 | if free_vram_gb < 4 and current_quant_mode != "nf4": # NF4 is very light 270 | print(f"Warning: Low VRAM ({free_vram_gb:.2f}GB on {effective_device}) for {current_quant_mode}") 271 | # _clean_gpu_shared() # Consider if cleanup is aggressive enough or needed 272 | 273 | model = LlavaForConditionalGeneration.from_pretrained(model_path_local, **model_load_kwargs) 274 | assert isinstance(model, LlavaForConditionalGeneration) 275 | model.eval() 276 | 277 | if LIGER_KERNEL_AVAILABLE and enable_liger and "cuda" in str(model.device).lower(): # Check actual model device for LIGER 278 | try: 279 | print(f"JoyCaptionBetaOne (Shared): Applying LIGER kernel to {model_hf_id} on {model.device}...") 280 | apply_liger_kernel_to_llama(model=model.language_model) 281 | 
CACHED_LIGER_ENABLED = True 282 | except Exception as e: print(f"JoyCaptionBetaOne (Shared): LIGER kernel apply failed for {model_hf_id}: {e}"); CACHED_LIGER_ENABLED = False 283 | else: CACHED_LIGER_ENABLED = False 284 | 285 | CACHED_MODEL = model 286 | CACHED_PROCESSOR = processor 287 | CACHED_MODEL_PATH_HF_ID = model_hf_id 288 | CACHED_MODEL_LOCAL_PATH = model_path_local 289 | CACHED_QUANTIZATION_MODE = quantization_mode # Cache the original requested mode 290 | print(f"JoyCaptionBetaOne (Shared): Model {model_hf_id} loaded. Effective quantization: '{current_quant_mode}', LIGER: {CACHED_LIGER_ENABLED}, Device map: '{str(model.hf_device_map)}'.") 291 | except Exception as e: 292 | _free_model_memory_shared() 293 | raise RuntimeError(f"Error loading model {model_hf_id}: {e}") 294 | else: 295 | print(f"JoyCaptionBetaOne (Shared): Using cached model ({CACHED_MODEL_PATH_HF_ID}, Quant: {CACHED_QUANTIZATION_MODE}, LIGER: {CACHED_LIGER_ENABLED}).") 296 | model = CACHED_MODEL 297 | processor = CACHED_PROCESSOR 298 | return model, processor 299 | 300 | class JoyCaptionBetaOne_Full: 301 | CATEGORY = 'TTP_Toolset' 302 | FUNCTION = "caption_image" 303 | RETURN_TYPES = ("STRING",) 304 | RETURN_NAMES = ("caption",) 305 | OUTPUT_IS_LIST = (True,) 306 | 307 | def __init__(self): 308 | self.NODE_NAME = 'JoyCaptionBetaOne_Full' 309 | 310 | @classmethod 311 | def INPUT_TYPES(cls): 312 | caption_type_keys = list(CAPTION_TYPE_MAP_BETA.keys()) 313 | caption_length_list = ["any", "very short", "short", "medium-length", "long", "very long"] + [str(i) for i in range(20, 261, 10)] 314 | quantization_mode_list = ['bf16', 'nf4', 'int8'] 315 | 316 | gpu_devices = [f"cuda:{i}" for i in range(torch.cuda.device_count())] 317 | if not gpu_devices: 318 | gpu_devices = ["cpu"] 319 | 320 | extra_options_inputs = {} 321 | for i, option_text in enumerate(EXTRA_OPTIONS_LIST): 322 | label = option_text.split('.')[0].replace(' ', '_').replace('/', '_').lower() 323 | if len(label) > 30: label = label[:30] 324 | extra_options_inputs[f"extra_option_{i}_{label}"] = ("BOOLEAN", {"default": False, "label": option_text[:100]}) 325 | 326 | inputs = { 327 | "required": { 328 | "image": ("IMAGE",), 329 | "caption_type": (caption_type_keys,), 330 | "caption_length": (caption_length_list,), 331 | "quantization_mode": (quantization_mode_list, {"default": 'bf16'}), 332 | "custom_prompt": ("STRING", {"default": "", "multiline": True, "label": "Custom Prompt (Overrides caption type & extras)"}), 333 | "character_name": ("STRING", {"default": "", "multiline": False, "label": "Person/Character Name (for {name} in extras)"}), 334 | "temperature": ("FLOAT", {"default": 0.6, "min": 0.0, "max": 2.0, "step": 0.05}), 335 | "top_p": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 1.0, "step": 0.01}), 336 | "max_new_tokens": ("INT", {"default": 512, "min": 1, "max": 2048, "step": 1}), 337 | "device": (gpu_devices,), 338 | "cache_model": ("BOOLEAN", {"default": True, "label": "Cache Model in Memory"}), 339 | }, 340 | "optional": {} 341 | } 342 | 343 | if LIGER_KERNEL_AVAILABLE: 344 | inputs["required"]["enable_liger_kernel"] = ("BOOLEAN", {"default": True, "label": "Enable LIGER Kernel (CUDA only)"}) 345 | else: 346 | inputs["required"]["info_liger_unavailable"] = ("STRING", {"default": "LIGER Kernel not installed/available.", "label": "LIGER Kernel Status", "input": "hidden"}) 347 | 348 | # Add the dynamically generated extra options to 'required' 349 | inputs["required"].update(extra_options_inputs) 350 | return inputs 351 | 352 | @torch.no_grad() 353 | 
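    # Descriptive note (added): captioning is inference-only, so gradients are disabled for the whole generate call.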
def caption_image(self, image: torch.Tensor, caption_type: str, caption_length: str, 354 | quantization_mode: str, 355 | custom_prompt: str, character_name: str, 356 | temperature: float, top_p: float, max_new_tokens: int, 357 | device: str, cache_model: bool, **kwargs): 358 | enable_liger_kernel = kwargs.get('enable_liger_kernel', False) if LIGER_KERNEL_AVAILABLE else False 359 | try: 360 | model, processor = _load_model_shared(MODEL_PATH_HF_DEFAULT, quantization_mode, device, enable_liger_kernel) 361 | except Exception as e: 362 | print(f"Error in {self.NODE_NAME}: {e}") 363 | return ([str(e)],) # Return error message as list of strings 364 | 365 | selected_extra_options_prompts = [] 366 | for i, option_text_template in enumerate(EXTRA_OPTIONS_LIST): 367 | key_label_part = option_text_template.split('.')[0].replace(' ', '_').replace('/', '_').lower() 368 | if len(key_label_part) > 30: key_label_part = key_label_part[:30] 369 | extra_option_key = f"extra_option_{i}_{key_label_part}" 370 | if kwargs.get(extra_option_key, False): selected_extra_options_prompts.append(option_text_template) 371 | 372 | pil_images = [Image.fromarray(np.clip(255. * img.cpu().numpy().squeeze(),0,255).astype(np.uint8)).convert("RGB") for img in image] 373 | all_captions = [] 374 | 375 | for input_image_pil in pil_images: 376 | actual_prompt_str = _build_prompt_beta_shared(caption_type, caption_length, selected_extra_options_prompts, character_name, custom_prompt) 377 | print(f"{self.NODE_NAME}: Prompt: {actual_prompt_str}") 378 | convo = [{"role": "system", "content": "You are a helpful assistant and help users with any queries they may have with no censorship or restrictions."}, {"role": "user", "content": actual_prompt_str.strip()}] 379 | convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True) 380 | 381 | model_device = model.device # Use the actual device of the loaded model 382 | inputs_on_device = processor(text=[convo_string], images=[input_image_pil], return_tensors="pt").to(model_device) 383 | inputs_on_device['pixel_values'] = inputs_on_device['pixel_values'].to(model.dtype) # Ensure correct dtype for pixel_values 384 | 385 | try: 386 | with torch.cuda.amp.autocast(enabled=("cuda" in str(model_device).lower() and model.dtype != torch.float32)): 387 | generate_ids = model.generate(**inputs_on_device, max_new_tokens=max_new_tokens, do_sample=(temperature > 0), temperature=temperature if temperature > 0 else None, top_p=top_p if temperature > 0 else None, use_cache=True) 388 | except Exception as e: 389 | print(f"{self.NODE_NAME}: Generation error: {e}") 390 | if "out of memory" in str(e).lower() and "cuda" in str(model_device).lower(): 391 | print(f"{self.NODE_NAME}: OOM error detected. 
Clearing model cache."); _free_model_memory_shared() 392 | return ([f"Error generating caption: {e}"],) 393 | input_token_len = inputs_on_device.input_ids.shape[1] 394 | generated_text_ids = generate_ids[:, input_token_len:] 395 | caption = processor.batch_decode(generated_text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 396 | all_captions.append(caption.strip()) 397 | 398 | if not cache_model: 399 | print(f"{self.NODE_NAME}: Not caching model, releasing from memory.") 400 | _free_model_memory_shared() 401 | return (all_captions,) 402 | 403 | class ExtraOptionsNode_Beta: 404 | CATEGORY = 'TTP_Toolset' 405 | FUNCTION = "compile_extra_options" 406 | RETURN_TYPES = ("STRING",) 407 | RETURN_NAMES = ("extra_options_str",) 408 | OUTPUT_IS_LIST = (False,) 409 | 410 | def __init__(self): 411 | self.NODE_NAME = 'ExtraOptionsNode_Beta' 412 | 413 | @classmethod 414 | def INPUT_TYPES(cls): 415 | extra_options_inputs = {} 416 | for i, option_text in enumerate(EXTRA_OPTIONS_LIST): 417 | label = option_text.split('.')[0].replace(' ', '_').replace('/', '_').lower() 418 | if len(label) > 30: label = label[:30] 419 | extra_options_inputs[f"extra_option_{i}_{label}"] = ("BOOLEAN", {"default": False, "label": option_text[:100]}) 420 | inputs = { 421 | "required": { 422 | "enable_extra_options": ("BOOLEAN", {"default": True, "label": "Enable Extra Options"}), 423 | "character_name": ("STRING", {"default": "", "multiline": False, "label": "Person/Character Name (for {name})"}), 424 | }, 425 | } 426 | inputs["required"].update(extra_options_inputs) 427 | return inputs 428 | 429 | def compile_extra_options(self, enable_extra_options, character_name, **kwargs): 430 | if not enable_extra_options: 431 | return ("",) 432 | 433 | compiled_options = [] 434 | for i, option_text_template in enumerate(EXTRA_OPTIONS_LIST): 435 | key_label_part = option_text_template.split('.')[0].replace(' ', '_').replace('/', '_').lower() 436 | if len(key_label_part) > 30: key_label_part = key_label_part[:30] 437 | extra_option_key = f"extra_option_{i}_{key_label_part}" 438 | if kwargs.get(extra_option_key, False): 439 | if option_text_template == NAME_OPTION_PROMPT: 440 | if character_name: # Only add if name is provided 441 | compiled_options.append(option_text_template.format(name=character_name)) 442 | else: 443 | compiled_options.append(option_text_template) 444 | return (" ".join(compiled_options),) 445 | 446 | class JoyCaptionBetaOne_Simple: 447 | CATEGORY = 'TTP_Toolset' 448 | FUNCTION = "caption_image_simple" 449 | RETURN_TYPES = ("STRING",) 450 | RETURN_NAMES = ("caption",) 451 | OUTPUT_IS_LIST = (True,) 452 | 453 | def __init__(self): 454 | self.NODE_NAME = 'JoyCaptionBetaOne_Simple' 455 | 456 | @classmethod 457 | def INPUT_TYPES(cls): 458 | caption_type_keys = list(CAPTION_TYPE_MAP_BETA.keys()) 459 | caption_length_list = ["any", "very short", "short", "medium-length", "long", "very long"] + [str(i) for i in range(20, 261, 10)] 460 | quantization_mode_list = ['bf16', 'nf4', 'int8'] 461 | 462 | gpu_devices = [f"cuda:{i}" for i in range(torch.cuda.device_count())] 463 | if not gpu_devices: 464 | gpu_devices = ["cpu"] 465 | return { 466 | "required": { 467 | "image": ("IMAGE",), 468 | "caption_type": (caption_type_keys,), 469 | "caption_length": (caption_length_list,), 470 | "quantization_mode": (quantization_mode_list, {"default": 'bf16'}), 471 | "custom_prompt": ("STRING", {"default": "", "multiline": True, "label": "Custom Prompt (Overrides caption type & extras)"}), 472 | "temperature": ("FLOAT", 
{"default": 0.6, "min": 0.0, "max": 2.0, "step": 0.05}), 473 | "top_p": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 1.0, "step": 0.01}), 474 | "max_new_tokens": ("INT", {"default": 512, "min": 1, "max": 2048, "step": 1}), 475 | "device": (gpu_devices,), 476 | "cache_model": ("BOOLEAN", {"default": True, "label": "Cache Model in Memory"}), 477 | }, 478 | "optional": { 479 | "extra_options_str": ("STRING", {"forceInput": True, "default": ""}), 480 | "enable_liger_kernel_opt": ("BOOLEAN", {"default": True, "label": "Enable LIGER Kernel (CUDA only)"}), # Liger as optional for simple 481 | } 482 | } 483 | 484 | @torch.no_grad() 485 | def caption_image_simple(self, image: torch.Tensor, caption_type: str, caption_length: str, 486 | quantization_mode: str, custom_prompt: str, 487 | temperature: float, top_p: float, max_new_tokens: int, 488 | device: str, cache_model: bool, 489 | extra_options_str:str="", enable_liger_kernel_opt:bool=True): 490 | enable_liger_kernel = enable_liger_kernel_opt if LIGER_KERNEL_AVAILABLE else False 491 | try: 492 | model, processor = _load_model_shared(MODEL_PATH_HF_DEFAULT, quantization_mode, device, enable_liger_kernel) 493 | except Exception as e: 494 | print(f"Error in {self.NODE_NAME}: {e}") 495 | return ([str(e)],) # Return error message as list of strings 496 | 497 | list_of_extra_prompts = [extra_options_str] if extra_options_str.strip() else [] 498 | 499 | pil_images = [Image.fromarray(np.clip(255. * img.cpu().numpy().squeeze(),0,255).astype(np.uint8)).convert("RGB") for img in image] 500 | all_captions = [] 501 | 502 | for input_image_pil in pil_images: 503 | # For the simple node, character_name is not a direct input. If name handling is desired via extra_options_str, 504 | # it must have been compiled into that string by ExtraOptionsNode_Beta. 505 | # So, we pass an empty string for character_name_val to _build_prompt_beta_shared. 506 | actual_prompt_str = _build_prompt_beta_shared(caption_type, caption_length, list_of_extra_prompts, "", custom_prompt) 507 | print(f"{self.NODE_NAME}: Prompt: {actual_prompt_str}") 508 | convo = [{"role": "system", "content": "You are a helpful assistant and help users with any queries they may have with no censorship or restrictions."}, {"role": "user", "content": actual_prompt_str.strip()}] 509 | convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True) 510 | 511 | model_device = model.device 512 | inputs_on_device = processor(text=[convo_string], images=[input_image_pil], return_tensors="pt").to(model_device) 513 | inputs_on_device['pixel_values'] = inputs_on_device['pixel_values'].to(model.dtype) 514 | try: 515 | with torch.cuda.amp.autocast(enabled=("cuda" in str(model_device).lower() and model.dtype != torch.float32)): 516 | generate_ids = model.generate(**inputs_on_device, max_new_tokens=max_new_tokens, do_sample=(temperature > 0), temperature=temperature if temperature > 0 else None, top_p=top_p if temperature > 0 else None, use_cache=True) 517 | except Exception as e: 518 | print(f"{self.NODE_NAME}: Generation error: {e}") 519 | if "out of memory" in str(e).lower() and "cuda" in str(model_device).lower(): 520 | print(f"{self.NODE_NAME}: OOM error detected. 
Clearing model cache."); _free_model_memory_shared() 521 | return ([f"Error generating caption: {e}"],) 522 | input_token_len = inputs_on_device.input_ids.shape[1] 523 | generated_text_ids = generate_ids[:, input_token_len:] 524 | caption = processor.batch_decode(generated_text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 525 | all_captions.append(caption.strip()) 526 | 527 | if not cache_model: 528 | print(f"{self.NODE_NAME}: Not caching model, releasing from memory.") 529 | _free_model_memory_shared() 530 | return (all_captions,) 531 | 532 | NODE_CLASS_MAPPINGS = { 533 | "JoyCaptionBetaOne_Full": JoyCaptionBetaOne_Full, 534 | "ExtraOptionsNode_Beta": ExtraOptionsNode_Beta, 535 | "JoyCaptionBetaOne_Simple": JoyCaptionBetaOne_Simple, 536 | } 537 | NODE_DISPLAY_NAME_MAPPINGS = { 538 | "JoyCaptionBetaOne_Full": "TTP_JoyCaption_BetaOne_Full", 539 | "ExtraOptionsNode_Beta": "TTP_ExtraOptionsNode_Beta", 540 | "JoyCaptionBetaOne_Simple": "TTP_JoyCaption_BetaOne_Simple", 541 | } 542 | print("JoyCaptionBetaOne (JCBO.py) nodes (Full, Simple, ExtraOptions) loaded with refined quantization.") -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🧠 ComfyUI Joy Caption Wrapper (Alpha Two & Beta One) 2 | 3 | > 💡 Supports the Alpha Two and the all-new Beta One models 4 | > 🎮 One-click deployment / automatic download (the Beta One model needs no manual placement) 5 | > 📦 GitHub Repo: https://github.com/TTPlanetPig/Comfyui_JC2 6 | > 7 | > Comfyui workflow example: 8 | > https://github.com/TTPlanetPig/Comfyui_JC2/blob/main/example/JoyCaption%20Beta_One_example.png 9 | 10 | --- 11 | 12 | ## 🌟 Introduction (this covers the older Joy Caption Alpha Two and is outdated) 13 | 14 | This is a Joy Caption node wrapped for [ComfyUI](https://github.com/comfyanonymous/ComfyUI): 15 | 16 | - ✅ Supports `joy-caption-alpha-two` as well as the **brand-new** [`joy-caption-beta-one`](https://huggingface.co/spaces/fancyfeast/joy-caption-beta-one) 17 | - 🧊 For cards with limited VRAM, the `nf4` mode is recommended as a balance of speed and quality 18 | - 🔁 Reference implementations: 19 | - [chflame163/ComfyUI_LayerStyle](https://github.com/chflame163/ComfyUI_LayerStyle) 20 | - [John6666/joy-caption-alpha-two-cli-mod](https://huggingface.co/John6666/joy-caption-alpha-two-cli-mod) 21 | 22 | --- 23 | 24 | ## ⚠️ VRAM Requirements 25 | 26 | | Mode | Minimum VRAM | Notes | 27 | |------|-----------|------| 28 | | `bf16` | ≥ 19GB | Recommended for 3090 / 4090 users | 29 | | `nf4` | ≥ 10GB | Recommended when VRAM is below 19GB | 30 | 31 | > Insufficient VRAM will cause ComfyUI to report errors or fail to run. 32 | 33 | --- 34 | 35 | ## 🚀 Installation 36 | 37 | ### ✅ Install the node: 38 | 39 | Option 1: install via the built-in ComfyUI Manager 40 | Option 2: clone manually 41 | 42 | ```bash 43 | cd ./comfyui/custom_nodes 44 | git clone https://github.com/TTPlanetPig/Comfyui_JC2 45 | ``` 46 | 47 | ### ✅ Install dependencies (for `python_embedded`): 48 | 49 | ```bash 50 | cd ./comfyui/custom_nodes/Comfyui_JC2 51 | ../../../python_embeded/python.exe -m pip install -r requirements.txt 52 | ``` 53 | 54 | ### ✅ Install PyTorch (if not already installed) 55 | 56 | ```bash 57 | pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 58 | ``` 59 | 60 | --- 61 | 62 | ## 🚅 Speed-up tip: enable the Liger Kernel 63 | 64 | To **further improve runtime speed**, enabling **liger-kernel** is recommended: 65 | 66 | - 📁 Run `安装liger-kernel.bat` in the node directory 67 | - ✅ Intended for the official ComfyUI one-click package (`python_embeded` build) 68 | 69 | --- 70 | 71 | ## 📥 Model Preparation 72 | 73 | | Model | Download link | Placement path | 74 | |------|-----------|----------| 75 | | `clip_vision` | [siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) | `ComfyUI/models/clip_vision/google--siglip-so400m-patch14-384` | 76 | | `LLM` | [Meta-Llama-3.1-8B-Instruct](https://huggingface.co/unsloth/Meta-Llama-3.1-8B-Instruct) | `ComfyUI/models/LLM/unsloth--Meta-Llama-3.1-8B-Instruct` | 77 | | `Joy Caption LoRA` (alpha two) | [joy-caption-alpha-two](https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-two) | `ComfyUI/models/Joy_caption/cgrkzexw-599808` | 78 | 79 | 📦 Downloading with `huggingface-cli` is recommended to avoid path or naming mistakes. 80 | 81 | Alternatively, download the bundled files from Baidu Netdisk: 82 | 83 | > Link: https://pan.baidu.com/s/1yYRlDKclehSPv-tUVwfVHw Access code: `b84c` 84 | 85 | --- 86 | 87 | ## 🆕 Newly supported: joy-caption-beta-one 🎉 88 | 89 | - ✅ [joy-caption-beta-one](https://huggingface.co/spaces/fancyfeast/joy-caption-beta-one) is now integrated 90 | - ✅ **No manual model download is needed**; the ComfyUI node pulls the HuggingFace assets automatically 91 | - ✅ Keeps the same call logic as Alpha Two, with GPU VRAM detection and mode selection 92 | 93 | --- 94 | 95 | ## 📸 UI & Usage Notes 96 | 97 | ### 🎛 Key parameters: 98 | 99 | 1. Mode selection (`bf16` / `nf4`) 100 | `bf16` is recommended for 3090 / 4090; use `nf4` otherwise 101 | ![bf16 vs nf4](https://github.com/user-attachments/assets/8001e70b-cea3-4971-a8c2-f483a2c4f91c) 102 | 103 | 2. Prompt mode selection (multiple task types) 104 | ![prompt type](https://github.com/user-attachments/assets/110f25f6-ea25-4395-b698-c0ec358940ae) 105 | 106 | 3. 
Caption length selection (longer is not always better) 107 | ![length not always better](https://github.com/user-attachments/assets/05e8cfbe-f983-4c8e-813a-761779d0ba4e) 108 | 109 | 4. Model offload switch (decides whether the model is kept in VRAM) 110 | ![offload setting](https://github.com/user-attachments/assets/804d3326-0f44-4cd2-98c9-56e174e552c1) 111 | 112 | 5. Controls whether the extra options take effect (must be used together with them) 113 | ![extra enable](https://github.com/user-attachments/assets/6cb00a63-a1e6-4502-87ff-b99800d37912) 114 | 115 | 6. Linked options; they only take effect when enabled together 116 | ![combo 1](https://github.com/user-attachments/assets/16d11016-6ff1-4d62-90ca-c3d820af4cd3) 117 | ![combo 2](https://github.com/user-attachments/assets/6fe8dbd4-affe-4753-b10e-aa4120ab5149) 118 | 119 | --- 120 | 121 | ## 🖼 Folder Structure Overview 122 | 123 | Make sure the model files are placed correctly, as shown below: 124 | 125 | ![structure1](https://github.com/user-attachments/assets/4675b67c-38f8-4d6a-9785-607215038337) 126 | ![structure2](https://github.com/user-attachments/assets/9ae0a410-539e-49c5-a1b4-4434da02dc28) 127 | ![structure3](https://github.com/user-attachments/assets/2d17e8d2-42af-4040-9cf9-019eb25464e0) 128 | ![structure4](https://github.com/user-attachments/assets/aeba0145-81c7-4c86-a31c-bbb9c317cad8) 129 | 130 | --- 131 | 132 | ## ⭐ Star History 133 | 134 | 135 | 136 | 137 | 138 | Star History Chart 139 | 140 | 141 | 142 | --- 143 | 144 | 🧪 Testing and issue reports are welcome. Enjoy! 145 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .JC2 import NODE_CLASS_MAPPINGS as JC2_NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS as JC2_NODE_DISPLAY_NAME_MAPPINGS 2 | from .JCBO import NODE_CLASS_MAPPINGS as JCBO_NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS as JCBO_NODE_DISPLAY_NAME_MAPPINGS 3 | 4 | NODE_CLASS_MAPPINGS = {**JC2_NODE_CLASS_MAPPINGS, **JCBO_NODE_CLASS_MAPPINGS} 5 | NODE_DISPLAY_NAME_MAPPINGS = {**JC2_NODE_DISPLAY_NAME_MAPPINGS, **JCBO_NODE_DISPLAY_NAME_MAPPINGS} 6 | 7 | __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"] -------------------------------------------------------------------------------- /example/JoyCaption Beta_One_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "1bbd7edc-7ede-4b03-829f-f8da37fb8ad2", 3 | "revision": 0, 4 | "last_node_id": 8, 5 | "last_link_id": 6, 6 | "nodes": [ 7 | { 8 | "id": 1, 9 | "type": "JoyCaptionBetaOne_Full", 10 | "pos": [ 11 | -2323.431884765625, 12 | 683.6752319335938 13 | ], 14 | "size": [ 15 | 446.90234375, 16 | 976 17 | ], 18 | "flags": {}, 19 | "order": 3, 20 | "mode": 0, 21 | "inputs": [ 22 | { 23 | "name": "image", 24 | "type": "IMAGE", 25 | "link": 1 26 | } 27 | ], 28 | "outputs": [ 29 | { 30 | "name": "caption", 31 | "shape": 6, 32 | "type": "STRING", 33 | "links": [ 34 | 2 35 | ] 36 | } 37 | ], 38 | "properties": { 39 | "cnr_id": "comfyui_jc2", 40 | "ver": "712b89398d0a7b005235c8d36f333e86a0beea1b", 41 | "Node name for S&R": "JoyCaptionBetaOne_Full", 42 | "widget_ue_connectable": {} 43 | }, 44 | "widgets_values": [ 45 | "Descriptive", 46 | "any", 47 | "nf4", 48 | "", 49 | "", 50 | 0.6, 51 | 0.9, 52 | 512, 53 | "cuda:0", 54 | true, 55 | true, 56 | false, 57 | false, 58 | false, 59 | false, 60 | false, 61 | false, 62 | false, 63 | false, 64 | false, 65 | false, 66 | false, 67 | false, 68 | false, 69 | false, 70 | false, 71 | false, 72 | false, 73 | false, 74 | false, 75 | false, 76 | false, 77 | false, 78 | false, 79 | false, 80 | false, 81 | false, 82 | false 83 | ] 84 | }, 85 | { 86 | "id": 2, 87 | "type": "LoadImage", 88 | "pos": [ 89 | -2817.219970703125, 90 | 
755.2782592773438 91 | ], 92 | "size": [ 93 | 274.080078125, 94 | 314 95 | ], 96 | "flags": {}, 97 | "order": 0, 98 | "mode": 0, 99 | "inputs": [], 100 | "outputs": [ 101 | { 102 | "name": "IMAGE", 103 | "type": "IMAGE", 104 | "links": [ 105 | 1 106 | ] 107 | }, 108 | { 109 | "name": "MASK", 110 | "type": "MASK", 111 | "links": null 112 | } 113 | ], 114 | "properties": { 115 | "cnr_id": "comfy-core", 116 | "ver": "0.3.34", 117 | "Node name for S&R": "LoadImage", 118 | "widget_ue_connectable": {} 119 | }, 120 | "widgets_values": [ 121 | "ComfyUI_33602_.png", 122 | "image" 123 | ] 124 | }, 125 | { 126 | "id": 5, 127 | "type": "LoadImage", 128 | "pos": [ 129 | -1350.49267578125, 130 | 652.2406616210938 131 | ], 132 | "size": [ 133 | 274.080078125, 134 | 314 135 | ], 136 | "flags": {}, 137 | "order": 1, 138 | "mode": 0, 139 | "inputs": [], 140 | "outputs": [ 141 | { 142 | "name": "IMAGE", 143 | "type": "IMAGE", 144 | "links": [ 145 | 3 146 | ] 147 | }, 148 | { 149 | "name": "MASK", 150 | "type": "MASK", 151 | "links": null 152 | } 153 | ], 154 | "properties": { 155 | "cnr_id": "comfy-core", 156 | "ver": "0.3.34", 157 | "Node name for S&R": "LoadImage", 158 | "widget_ue_connectable": {} 159 | }, 160 | "widgets_values": [ 161 | "ComfyUI_33602_.png", 162 | "image" 163 | ] 164 | }, 165 | { 166 | "id": 3, 167 | "type": "ShowText|pysssss", 168 | "pos": [ 169 | -1677.072998046875, 170 | 731.69580078125 171 | ], 172 | "size": [ 173 | 221.86968994140625, 174 | 33.35912322998047 175 | ], 176 | "flags": {}, 177 | "order": 5, 178 | "mode": 0, 179 | "inputs": [ 180 | { 181 | "name": "text", 182 | "type": "STRING", 183 | "link": 2 184 | } 185 | ], 186 | "outputs": [ 187 | { 188 | "name": "STRING", 189 | "shape": 6, 190 | "type": "STRING", 191 | "links": null 192 | } 193 | ], 194 | "properties": { 195 | "cnr_id": "comfyui-custom-scripts", 196 | "ver": "aac13aa7ce35b07d43633c3bbe654a38c00d74f5", 197 | "Node name for S&R": "ShowText|pysssss", 198 | "widget_ue_connectable": {} 199 | }, 200 | "widgets_values": [] 201 | }, 202 | { 203 | "id": 8, 204 | "type": "ShowText|pysssss", 205 | "pos": [ 206 | -292.3197937011719, 207 | 702.7871704101562 208 | ], 209 | "size": [ 210 | 221.86968994140625, 211 | 33.35912322998047 212 | ], 213 | "flags": {}, 214 | "order": 6, 215 | "mode": 0, 216 | "inputs": [ 217 | { 218 | "name": "text", 219 | "type": "STRING", 220 | "link": 6 221 | } 222 | ], 223 | "outputs": [ 224 | { 225 | "name": "STRING", 226 | "shape": 6, 227 | "type": "STRING", 228 | "links": null 229 | } 230 | ], 231 | "properties": { 232 | "cnr_id": "comfyui-custom-scripts", 233 | "ver": "aac13aa7ce35b07d43633c3bbe654a38c00d74f5", 234 | "Node name for S&R": "ShowText|pysssss", 235 | "widget_ue_connectable": {} 236 | }, 237 | "widgets_values": [] 238 | }, 239 | { 240 | "id": 7, 241 | "type": "ExtraOptionsNode_Beta", 242 | "pos": [ 243 | -857.732666015625, 244 | 1049.2509765625 245 | ], 246 | "size": [ 247 | 446.90234375, 248 | 730 249 | ], 250 | "flags": {}, 251 | "order": 2, 252 | "mode": 0, 253 | "inputs": [], 254 | "outputs": [ 255 | { 256 | "name": "extra_options_str", 257 | "type": "STRING", 258 | "links": [ 259 | 5 260 | ] 261 | } 262 | ], 263 | "properties": { 264 | "cnr_id": "comfyui_jc2", 265 | "ver": "712b89398d0a7b005235c8d36f333e86a0beea1b", 266 | "Node name for S&R": "ExtraOptionsNode_Beta", 267 | "widget_ue_connectable": {} 268 | }, 269 | "widgets_values": [ 270 | true, 271 | "", 272 | false, 273 | false, 274 | false, 275 | false, 276 | false, 277 | false, 278 | false, 279 | false, 280 | false, 281 | 
false, 282 | false, 283 | false, 284 | false, 285 | false, 286 | false, 287 | false, 288 | false, 289 | false, 290 | false, 291 | false, 292 | false, 293 | false, 294 | false, 295 | false, 296 | false, 297 | false, 298 | false 299 | ] 300 | }, 301 | { 302 | "id": 4, 303 | "type": "JoyCaptionBetaOne_Simple", 304 | "pos": [ 305 | -841.1376342773438, 306 | 660.6044921875 307 | ], 308 | "size": [ 309 | 400, 310 | 324 311 | ], 312 | "flags": {}, 313 | "order": 4, 314 | "mode": 0, 315 | "inputs": [ 316 | { 317 | "name": "image", 318 | "type": "IMAGE", 319 | "link": 3 320 | }, 321 | { 322 | "name": "extra_options_str", 323 | "shape": 7, 324 | "type": "STRING", 325 | "link": 5 326 | } 327 | ], 328 | "outputs": [ 329 | { 330 | "name": "caption", 331 | "shape": 6, 332 | "type": "STRING", 333 | "links": [ 334 | 6 335 | ] 336 | } 337 | ], 338 | "properties": { 339 | "cnr_id": "comfyui_jc2", 340 | "ver": "712b89398d0a7b005235c8d36f333e86a0beea1b", 341 | "Node name for S&R": "JoyCaptionBetaOne_Simple", 342 | "widget_ue_connectable": {} 343 | }, 344 | "widgets_values": [ 345 | "Descriptive", 346 | "any", 347 | "bf16", 348 | "", 349 | 0.6, 350 | 0.9, 351 | 512, 352 | "cuda:0", 353 | true, 354 | true 355 | ] 356 | } 357 | ], 358 | "links": [ 359 | [ 360 | 1, 361 | 2, 362 | 0, 363 | 1, 364 | 0, 365 | "IMAGE" 366 | ], 367 | [ 368 | 2, 369 | 1, 370 | 0, 371 | 3, 372 | 0, 373 | "STRING" 374 | ], 375 | [ 376 | 3, 377 | 5, 378 | 0, 379 | 4, 380 | 0, 381 | "IMAGE" 382 | ], 383 | [ 384 | 5, 385 | 7, 386 | 0, 387 | 4, 388 | 1, 389 | "STRING" 390 | ], 391 | [ 392 | 6, 393 | 4, 394 | 0, 395 | 8, 396 | 0, 397 | "STRING" 398 | ] 399 | ], 400 | "groups": [], 401 | "config": {}, 402 | "extra": { 403 | "ue_links": [], 404 | "frontendVersion": "1.18.9", 405 | "VHS_latentpreview": false, 406 | "VHS_latentpreviewrate": 0, 407 | "VHS_MetadataImage": true, 408 | "VHS_KeepIntermediate": true 409 | }, 410 | "version": 0.4 411 | } -------------------------------------------------------------------------------- /example/JoyCaption Beta_One_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TTPlanetPig/Comfyui_JC2/69a7d6830807d65595da8848f1169a261c5dff5e/example/JoyCaption Beta_One_example.png -------------------------------------------------------------------------------- /extra_option.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "replace_character_names", 4 | "prompt": "If there is a person/character in the image you must refer to them as {name}." 5 | }, 6 | { 7 | "name": "exclude_unchangeable_attributes", 8 | "prompt": "Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style)." 9 | }, 10 | { 11 | "name": "include_lighting_details", 12 | "prompt": "Include information about lighting." 13 | }, 14 | { 15 | "name": "include_camera_angle", 16 | "prompt": "Include information about camera angle." 17 | }, 18 | { 19 | "name": "mention_watermark_presence", 20 | "prompt": "Include information about whether there is a watermark or not." 21 | }, 22 | { 23 | "name": "note_jpeg_artifacts", 24 | "prompt": "Include information about whether there are JPEG artifacts or not." 25 | }, 26 | { 27 | "name": "include_exif_data", 28 | "prompt": "If it is a photo you MUST include information about what camera was likely used and details such as aperture, shutter speed, ISO, etc." 
29 | }, 30 | { 31 | "name": "exclude_sexual_content", 32 | "prompt": "Do NOT include anything sexual; keep it PG." 33 | }, 34 | { 35 | "name": "exclude_image_resolution", 36 | "prompt": "Do NOT mention the image's resolution." 37 | }, 38 | { 39 | "name": "describe_aesthetic_quality", 40 | "prompt": "You MUST include information about the subjective aesthetic quality of the image from low to very high." 41 | }, 42 | { 43 | "name": "include_composition_style", 44 | "prompt": "Include information on the image's composition style, such as leading lines, rule of thirds, or symmetry." 45 | }, 46 | { 47 | "name": "exclude_text_elements", 48 | "prompt": "Do NOT mention any text that is in the image." 49 | }, 50 | { 51 | "name": "specify_depth_of_field", 52 | "prompt": "Specify the depth of field and whether the background is in focus or blurred." 53 | }, 54 | { 55 | "name": "specify_lighting_sources", 56 | "prompt": "If applicable, mention the likely use of artificial or natural lighting sources." 57 | }, 58 | { 59 | "name": "avoid_ambiguous_language", 60 | "prompt": "Do NOT use any ambiguous language." 61 | }, 62 | { 63 | "name": "classify_image_as_sfw_nsfw", 64 | "prompt": "Include whether the image is sfw, suggestive, or nsfw." 65 | }, 66 | { 67 | "name": "describe_key_elements_only", 68 | "prompt": "ONLY describe the most important elements of the image." 69 | } 70 | ] 71 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "comfyui_jc2" 3 | description = "Wrapped Joy Caption Alpha Two node for ComfyUI: easy to use; for GPUs with less than 19GB of VRAM, please use nf4 for a better balance of speed and quality." 4 | version = "1.0.8" 5 | license = {file = "LICENSE"} 6 | dependencies = ["huggingface_hub>=0.23.4,<=0.25", "accelerate", "transformers>=4.43.2,<=4.45.1", "sentencepiece", "peft==0.12.0", "bitsandbytes"] 7 | 8 | [project.urls] 9 | Repository = "https://github.com/TTPlanetPig/Comfyui_JC2" 10 | # Used by Comfy Registry https://comfyregistry.org 11 | 12 | [tool.comfy] 13 | PublisherId = "ttplanet" 14 | DisplayName = "Comfyui_JC2" 15 | Icon = "🪐" 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | peft==0.12.0 3 | bitsandbytes 4 | huggingface_hub==0.30.1 5 | transformers>=4.51.0 6 | sentencepiece 7 | triton-windows<=3.2.0 8 | -------------------------------------------------------------------------------- /安装liger-kernel.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | REM Set the Python path 4 | set PYTHON_PATH=..\..\..\python_embeded\python.exe 5 | 6 | echo Installing liger-kernel without dependencies... 7 | %PYTHON_PATH% -m pip install liger-kernel==0.5.9 --no-deps 8 | 9 | echo Installing remaining dependencies from requirements.txt... 10 | %PYTHON_PATH% -m pip install -r requirements.txt 11 | 12 | echo All done. 13 | pause 14 | --------------------------------------------------------------------------------