├── .gitignore ├── CXH_Min2_6_classifiy.py ├── Joy_caption_alpha.py ├── Joy_caption_node.py ├── LICENSE ├── README.md ├── __init__.py ├── __pycache__ ├── Joy_caption_alpha.cpython-311.pyc ├── florence_nodes.cpython-311.pyc ├── miniCPMv2_6_prompt_generator.cpython-311.pyc └── miniCpMV3_4_chat.cpython-311.pyc ├── florence_nodes.py ├── ic_lora_batch.py ├── install_req.bat ├── lib ├── __init__.py ├── xfile.py ├── ximg.py └── xmodel.py ├── miniCPMv2_6_prompt_generator.py ├── miniCpMV3_4_chat.py ├── requirements.txt ├── smolvlm.py └── worflow ├── Min2.6+joy+Florence2.json ├── MinCPM3_4B.json ├── florence_PromptGen.json ├── florence_PromptGen.png ├── flux.png ├── joy.json ├── joy.png ├── joy_4b.png ├── joy批量打标.png ├── workflow_min2.6classifiy_.png ├── 二级文件夹批量打标.png ├── 批量打标(Batch marking).json └── 批量打标(Batch marking).png /.gitignore: -------------------------------------------------------------------------------- 1 | lib/__pycache__/__init__.cpython-310.pyc 2 | lib/__pycache__/__init__.cpython-311.pyc 3 | lib/__pycache__/ximg.cpython-310.pyc 4 | lib/__pycache__/ximg.cpython-311.pyc 5 | lib/__pycache__/xmodel.cpython-311.pyc 6 | __pycache__/__init__.cpython-311.pyc 7 | __pycache__/Joy_caption_node.cpython-311.pyc 8 | -------------------------------------------------------------------------------- /CXH_Min2_6_classifiy.py: -------------------------------------------------------------------------------- 1 | 2 | from huggingface_hub import InferenceClient 3 | from torch import nn 4 | from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM 5 | from pathlib import Path 6 | import torch 7 | import torch.amp.autocast_mode 8 | from PIL import Image 9 | import os 10 | import folder_paths 11 | import time 12 | 13 | from .lib.ximg import * 14 | from .lib.xmodel import * 15 | 16 | classification_rules = """ 17 | You are a fashion image classifier. Analyze clothing images following these priority rules and categories. When an item could fit multiple categories, use the highest priority category. 18 | Priority Order (Highest to Lowest): 19 | 1. MAN 20 | 2. WoMAN 21 | Required Output Format: 22 | [CATEGORY_NAME] 23 | 24 | Classification Rules: 25 | 1. Always check categories in order from highest to lowest priority 26 | 2. Use the highest priority category that applies 27 | 3. Output only the category name in all caps 28 | 4. 
No additional text or explanations in output 29 | """ 30 | 31 | def process_category_name(category_name): 32 | # 如果字符串包含方括号,则删除它们 33 | if category_name.startswith('[') and category_name.endswith(']'): 34 | category_name = category_name[1:-1] 35 | return category_name 36 | 37 | class CXH_Min2_6_classifiy : 38 | 39 | def __init__(self): 40 | pass 41 | 42 | @classmethod 43 | def INPUT_TYPES(s): 44 | return { 45 | "required": { 46 | "pipe": ("CXH_Hg_Pipe",), 47 | "img_dir": ("STRING", {"multiline": False, "default": ""},), 48 | "save_dir": ("STRING", {"multiline": False, "default": ""},), 49 | "prompt": ("STRING", {"multiline": True, "default": classification_rules},), 50 | "format": (["png", "jpg"],), 51 | "max_tokens":("INT", {"default": 1024, "min": 10, "max": 4048, "step": 1}), 52 | "temperature": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}), 53 | "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}), 54 | } 55 | } 56 | 57 | RETURN_TYPES = () #RETURN_TYPES = () RETURN_TYPES = ("DICT",)返回字典 58 | FUNCTION = "gen" 59 | OUTPUT_NODE = True #OUTPUT_NODE = True 没输出 60 | CATEGORY = "CXH/LLM" 61 | 62 | def gen(self,pipe,img_dir,save_dir,prompt,format,max_tokens,temperature,seed): 63 | 64 | dir_files = batch_image(img_dir) 65 | 66 | # prompt = f"Determine whether the following pictures belong to the following types:{str(classifiy_type)},You only need to output the type, you do not need to output anything else to remember!" 67 | 68 | # 创建保存目录 69 | if not os.path.exists(save_dir): 70 | os.makedirs(save_dir) 71 | 72 | index1 = 0 73 | for image_path in dir_files: 74 | if os.path.isdir(image_path) and os.path.ex: 75 | continue 76 | start = time.time() 77 | input_image = open_image(image_path) 78 | input_image = ImageOps.exif_transpose(input_image) 79 | image = input_image.convert("RGB") 80 | 81 | question = prompt 82 | msgs = [{'role': 'user', 'content': [image, question]}] 83 | 84 | res = pipe.text_model.chat( 85 | image=None, 86 | msgs=msgs, 87 | tokenizer=pipe.tokenizer 88 | ) 89 | 90 | ## if you want to use streaming, please make sure sampling=True and stream=True 91 | ## the model.chat will return a generator 92 | res = pipe.text_model.chat( 93 | image=None, 94 | msgs=msgs, 95 | tokenizer=pipe.tokenizer, 96 | sampling=False, 97 | stream=False, 98 | max_tokens=max_tokens, 99 | temperature=temperature, 100 | ) 101 | 102 | generated_text = process_category_name(res) 103 | 104 | if len(generated_text) >= 80: 105 | generated_text = "UNKNOWN" 106 | 107 | 108 | savePath = os.path.join(save_dir,generated_text) 109 | # 创建保存目录 110 | if not os.path.exists(savePath): 111 | os.makedirs(savePath) 112 | 113 | lenName = str(index1) 114 | # img_file_name = f"{lenName}.{format}" 115 | img_file_name = os.path.basename(image_path) 116 | input_image = image 117 | if format != "png": 118 | if input_image.mode == "RGBA": 119 | input_image = input_image.convert("RGB") 120 | 121 | img_save_path = os.path.join(savePath, img_file_name) 122 | input_image.save(img_save_path) 123 | 124 | end = time.time() 125 | execution_time = calculate_seconds_difference(start, end) 126 | temp = f":{execution_time:.3f}s" 127 | index1 = index1 + 1 128 | print(str(index1)+"/"+str(len(dir_files)) +":"+temp) 129 | 130 | return () 131 | 132 | -------------------------------------------------------------------------------- /Joy_caption_alpha.py: -------------------------------------------------------------------------------- 1 | 2 | from huggingface_hub import InferenceClient 3 | from torch import nn 4 | from 
transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM 5 | from pathlib import Path 6 | import torch 7 | import torch.amp.autocast_mode 8 | from PIL import Image 9 | import os 10 | import folder_paths 11 | import torchvision.transforms.functional as TVF 12 | 13 | from .lib.ximg import * 14 | from .lib.xmodel import * 15 | import re 16 | import time 17 | from datetime import datetime, timedelta 18 | 19 | from comfy.model_management import unload_all_models, soft_empty_cache,get_torch_device 20 | 21 | DEVICE = get_torch_device() 22 | 23 | def modify_json_value(file_path, key_to_modify, new_value): 24 | """ 25 | 读取 JSON 文件,修改指定 key 的 value 值,并保存修改后的文件。 26 | 27 | Args: 28 | file_path: JSON 文件路径。 29 | key_to_modify: 需要修改的 key。 30 | new_value: 新的 value 值。 31 | """ 32 | try: 33 | with open(file_path, 'r', encoding='utf-8') as f: 34 | data = json.load(f) 35 | 36 | # 查找并修改 key 的 value 37 | if key_to_modify in data: 38 | data[key_to_modify] = new_value 39 | else: 40 | print(f"Warning: Key '{key_to_modify}' not found in JSON file.") 41 | 42 | # 保存修改后的 JSON 文件 43 | with open(file_path, 'w', encoding='utf-8') as f: 44 | json.dump(data, f, indent=4) # 使用 indent 参数格式化输出 45 | 46 | print(f"Successfully modified '{key_to_modify}' value in '{file_path}'.") 47 | 48 | except FileNotFoundError: 49 | print(f"Error: File '{file_path}' not found.") 50 | except json.JSONDecodeError: 51 | print(f"Error: Invalid JSON format in '{file_path}'.") 52 | 53 | CAPTION_TYPE_MAP = { 54 | "Descriptive": [ 55 | "Write a descriptive caption for this image in a formal tone.", 56 | "Write a descriptive caption for this image in a formal tone within {word_count} words.", 57 | "Write a {length} descriptive caption for this image in a formal tone.", 58 | ], 59 | "Descriptive (Informal)": [ 60 | "Write a descriptive caption for this image in a casual tone.", 61 | "Write a descriptive caption for this image in a casual tone within {word_count} words.", 62 | "Write a {length} descriptive caption for this image in a casual tone.", 63 | ], 64 | "Training Prompt": [ 65 | "Write a stable diffusion prompt for this image.", 66 | "Write a stable diffusion prompt for this image within {word_count} words.", 67 | "Write a {length} stable diffusion prompt for this image.", 68 | ], 69 | "MidJourney": [ 70 | "Write a MidJourney prompt for this image.", 71 | "Write a MidJourney prompt for this image within {word_count} words.", 72 | "Write a {length} MidJourney prompt for this image.", 73 | ], 74 | "Booru tag list": [ 75 | "Write a list of Booru tags for this image.", 76 | "Write a list of Booru tags for this image within {word_count} words.", 77 | "Write a {length} list of Booru tags for this image.", 78 | ], 79 | "Booru-like tag list": [ 80 | "Write a list of Booru-like tags for this image.", 81 | "Write a list of Booru-like tags for this image within {word_count} words.", 82 | "Write a {length} list of Booru-like tags for this image.", 83 | ], 84 | "Art Critic": [ 85 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.", 86 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. 
Keep it within {word_count} words.", 87 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it {length}.", 88 | ], 89 | "Product Listing": [ 90 | "Write a caption for this image as though it were a product listing.", 91 | "Write a caption for this image as though it were a product listing. Keep it under {word_count} words.", 92 | "Write a {length} caption for this image as though it were a product listing.", 93 | ], 94 | "Social Media Post": [ 95 | "Write a caption for this image as if it were being used for a social media post.", 96 | "Write a caption for this image as if it were being used for a social media post. Limit the caption to {word_count} words.", 97 | "Write a {length} caption for this image as if it were being used for a social media post.", 98 | ], 99 | } 100 | 101 | extra_options_parts = [ 102 | "[如果图像中有人物/角色,你必须用name来指代他们。] If there is a person/character in the image you must refer to them as *name* .", 103 | "[不要包含无法改变的人物/角色信息如:种族、性别等,但要包含可以改变的属性如:发型。] Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style).", 104 | "[包含关于光线的信息] Include information about lighting.", 105 | "[包含关于相机角度的信息] Include information about camera angle.", 106 | "[包含关于是否有水印的信息] Include information about whether there is a watermark or not.", 107 | "[包含关于是否有JPEG压缩痕迹的信息] Include information about whether there are JPEG artifacts or not.", 108 | "[如果是照片,你必须包含可能使用的相机类型以及诸如光圈、快门速度、ISO等细节信息] If it is a photo you MUST include information about what camera was likely used and details such as aperture, shutter speed, ISO, etc.", 109 | "[不要包含任何性相关的内容;保持内容适合全年龄] Do NOT include anything sexual; keep it PG.", 110 | "[不要提及图像的分辨率] Do NOT mention the image's resolution.", 111 | "[你必须包含关于图像主观审美质量的评价,从低到非常高] You MUST include information about the subjective aesthetic quality of the image from low to very high.", 112 | "[包含关于图像构图风格的信息,如引导线、三分法或对称性] Include information on the image's composition style, such as leading lines, rule of thirds, or symmetry.", 113 | "[不要提及图像中的任何文字] Do NOT mention any text that is in the image.", 114 | "[指明景深以及背景是否聚焦或模糊] Specify the depth of field and whether the background is in focus or blurred.", 115 | "[如果适用,提及可能使用的人工或自然光源] If applicable, mention the likely use of artificial or natural lighting sources.", 116 | "[不要使用任何模棱两可的语言] Do NOT use any ambiguous language.", 117 | "[包含图像是否适合工作场合(sfw)、暗示性的还是不适合工作场合(nsfw)] Include whether the image is sfw, suggestive, or nsfw.", 118 | "[只描述图像中最重要的元素] ONLY describe the most important elements of the image." 
119 | ] 120 | 121 | class JoyPipeline_alpha: 122 | def __init__(self): 123 | self.clip_model = None 124 | self.clip_processor =None 125 | self.tokenizer = None 126 | self.text_model = None 127 | self.image_adapter = None 128 | self.parent = None 129 | 130 | def clearCache(self): 131 | self.clip_model = None 132 | self.clip_processor =None 133 | self.tokenizer = None 134 | self.text_model = None 135 | self.image_adapter = None 136 | 137 | 138 | 139 | class ImageAdapter_alpha(nn.Module): 140 | def __init__(self, input_features: int, output_features: int, ln1: bool, pos_emb: bool, num_image_tokens: int, deep_extract: bool): 141 | super().__init__() 142 | self.deep_extract = deep_extract 143 | 144 | if self.deep_extract: 145 | input_features = input_features * 5 146 | 147 | self.linear1 = nn.Linear(input_features, output_features) 148 | self.activation = nn.GELU() 149 | self.linear2 = nn.Linear(output_features, output_features) 150 | self.ln1 = nn.Identity() if not ln1 else nn.LayerNorm(input_features) 151 | self.pos_emb = None if not pos_emb else nn.Parameter(torch.zeros(num_image_tokens, input_features)) 152 | 153 | # Other tokens (<|image_start|>, <|image_end|>, <|eot_id|>) 154 | self.other_tokens = nn.Embedding(3, output_features) 155 | self.other_tokens.weight.data.normal_(mean=0.0, std=0.02) # Matches HF's implementation of llama3 156 | 157 | def forward(self, vision_outputs: torch.Tensor): 158 | if self.deep_extract: 159 | x = torch.concat(( 160 | vision_outputs[-2], 161 | vision_outputs[3], 162 | vision_outputs[7], 163 | vision_outputs[13], 164 | vision_outputs[20], 165 | ), dim=-1) 166 | assert len(x.shape) == 3, f"Expected 3, got {len(x.shape)}" # batch, tokens, features 167 | assert x.shape[-1] == vision_outputs[-2].shape[-1] * 5, f"Expected {vision_outputs[-2].shape[-1] * 5}, got {x.shape[-1]}" 168 | else: 169 | x = vision_outputs[-2] 170 | 171 | x = self.ln1(x) 172 | 173 | if self.pos_emb is not None: 174 | assert x.shape[-2:] == self.pos_emb.shape, f"Expected {self.pos_emb.shape}, got {x.shape[-2:]}" 175 | x = x + self.pos_emb 176 | 177 | x = self.linear1(x) 178 | x = self.activation(x) 179 | x = self.linear2(x) 180 | 181 | # <|image_start|>, IMAGE, <|image_end|> 182 | other_tokens = self.other_tokens(torch.tensor([0, 1], device=self.other_tokens.weight.device).expand(x.shape[0], -1)) 183 | assert other_tokens.shape == (x.shape[0], 2, x.shape[2]), f"Expected {(x.shape[0], 2, x.shape[2])}, got {other_tokens.shape}" 184 | x = torch.cat((other_tokens[:, 0:1], x, other_tokens[:, 1:2]), dim=1) 185 | 186 | return x 187 | 188 | def get_eot_embedding(self): 189 | return self.other_tokens(torch.tensor([2], device=self.other_tokens.weight.device)).squeeze(0) 190 | 191 | 192 | 193 | class Joy_caption_alpha_load: 194 | 195 | def __init__(self): 196 | self.model = None 197 | self.pipeline = JoyPipeline_alpha() 198 | self.pipeline.parent = self 199 | pass 200 | 201 | @classmethod 202 | def INPUT_TYPES(s): 203 | return { 204 | "required": { 205 | "model": (["Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2","unsloth/Meta-Llama-3.1-8B-bnb-4bit"],), 206 | } 207 | } 208 | 209 | CATEGORY = "CXH/LLM" 210 | RETURN_TYPES = ("JoyPipeline_alpha",) 211 | FUNCTION = "gen" 212 | 213 | def loadCheckPoint(self): 214 | # 清除一波 215 | if self.pipeline != None: 216 | self.pipeline.clearCache() 217 | 218 | # Image Adapter 219 | adapter_path = os.path.join(folder_paths.models_dir,"Joy_caption_alpha","image_adapter.pt") 220 | 221 | clip_model_path = os.path.join(folder_paths.models_dir,"Joy_caption_alpha","clip_model.pt") 
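        # Note: the Joy-Caption alpha-two weights referenced above (image_adapter.pt, clip_model.pt,
        # plus the text_model adapter folder) are not downloaded automatically; per the README they
        # must be placed under ComfyUI's models/Joy_caption_alpha directory by hand. A minimal
        # fail-fast sketch (assuming only the two paths built above) that gives a clearer error than
        # a failed torch.load could look like this:
        for required_path in (adapter_path, clip_model_path):
            if not os.path.exists(required_path):
                raise FileNotFoundError(
                    f"Joy_caption_alpha weights not found: {required_path}. "
                    "Download them and place them under models/Joy_caption_alpha."
                )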
222 | 223 | CHECKPOINT_PATH = os.path.join(folder_paths.models_dir,"Joy_caption_alpha","text_model") 224 | 225 | # clip 226 | model_id = "google/siglip-so400m-patch14-384" 227 | CLIP_PATH = download_hg_model(model_id,"clip") 228 | 229 | clip_processor = AutoProcessor.from_pretrained(CLIP_PATH) 230 | clip_model = AutoModel.from_pretrained( 231 | CLIP_PATH, 232 | trust_remote_code=True 233 | ) 234 | clip_model = clip_model.vision_model 235 | 236 | print("Loading VLM's custom vision model") 237 | checkpoint = torch.load(clip_model_path, map_location='cpu') 238 | checkpoint = {k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()} 239 | clip_model.load_state_dict(checkpoint) 240 | del checkpoint 241 | 242 | clip_model.eval() 243 | clip_model.requires_grad_(False) 244 | clip_model.to("cuda") 245 | 246 | # Tokenizer 247 | text_model_path = CHECKPOINT_PATH 248 | LLM_PATH = download_hg_model(self.model, "LLM") 249 | modify_json_value(os.path.join(text_model_path, "adapter_config.json"), "base_model_name_or_path", 250 | LLM_PATH) 251 | 252 | print("Loading tokenizer") 253 | tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH, use_fast=False) 254 | assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTrainedTokenizerFast), f"Tokenizer is of type {type(tokenizer)}" 255 | 256 | # LLM 257 | print("Loading LLM") 258 | print("Loading VLM's custom text model") 259 | 260 | # text_model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_PATH , device_map=0, trust_remote_code=True) 261 | text_model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_PATH, device_map="auto", 262 | torch_dtype=torch.bfloat16).eval() 263 | 264 | image_adapter = ImageAdapter_alpha(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False) # ImageAdapter(clip_model.config.hidden_size, 4096) 265 | image_adapter.load_state_dict(torch.load(adapter_path, map_location="cpu")) 266 | adjusted_adapter = image_adapter 267 | adjusted_adapter.eval() 268 | adjusted_adapter.to("cuda") 269 | 270 | self.pipeline.clip_model = clip_model 271 | self.pipeline.clip_processor = clip_processor 272 | self.pipeline.tokenizer = tokenizer 273 | self.pipeline.text_model = text_model 274 | self.pipeline.image_adapter = adjusted_adapter 275 | 276 | def clearCache(self): 277 | if self.pipeline != None: 278 | self.pipeline.clearCache() 279 | 280 | def gen(self,model): 281 | if self.model == None or self.model != model or self.pipeline == None: 282 | self.model = model 283 | self.loadCheckPoint() 284 | return (self.pipeline,) 285 | 286 | def remove_brackets_content(text): 287 | # 使用正则表达式找到所有被 [] 括起来的内容,并将其删除 288 | result = re.sub(r'\[.*?\]', '', text) 289 | return result 290 | 291 | class Joy_caption_alpha_prompt: 292 | 293 | def __init__(self): 294 | pass 295 | 296 | @classmethod 297 | def INPUT_TYPES(s): 298 | options = list(extra_options_parts) 299 | required = { 300 | "caption_type": (["Descriptive", "Descriptive (Informal)", "Training Prompt", "MidJourney", "Booru tag list", "Booru-like tag list", "Art Critic", "Product Listing", "Social Media Post"],), 301 | "caption_length":(["any", "very short", "short", "medium-length", "long", "very long","20","50","80","100","120","250","500"],), 302 | "name":("STRING", {"multiline": False, "default": ""},), 303 | } 304 | for option in options: 305 | required[option] = ("BOOLEAN", {"default": False}) 306 | return { 307 | "required": required 308 | } 309 | 310 | CATEGORY = "CXH/LLM" 311 | RETURN_TYPES = ("STRING",) 312 | FUNCTION = "gen" 313 | # 
def gen(self,caption_type,caption_length,extra_options): 314 | def gen(self,**kwargs): 315 | options_selected = list(kwargs.values()) 316 | 317 | caption_type = kwargs["caption_type"] 318 | caption_length = kwargs["caption_length"] 319 | name = kwargs["name"] 320 | 321 | 322 | # 额外选项从第三个参数开始 323 | extra_options = options_selected[3:] 324 | 325 | length = None if caption_length == "any" else caption_length 326 | if isinstance(length, str): 327 | try: 328 | length = int(length) 329 | except ValueError: 330 | pass 331 | 332 | if length is None: 333 | map_idx = 0 334 | elif isinstance(length, int): 335 | map_idx = 1 336 | elif isinstance(length, str): 337 | map_idx = 2 338 | else: 339 | raise ValueError(f"Invalid caption length: {length}") 340 | 341 | prompt_str = CAPTION_TYPE_MAP[caption_type][map_idx] 342 | 343 | prompt_str = prompt_str.format(length=caption_length, word_count=caption_length) 344 | options = list(extra_options_parts) 345 | for selected, option in zip(extra_options, options): 346 | if selected: 347 | prompt_str = prompt_str + remove_brackets_content(option) 348 | prompt_str = prompt_str.replace("*name*", name) 349 | print(prompt_str) 350 | return (prompt_str,) 351 | 352 | class Joy_caption_alpha_run: 353 | 354 | def __init__(self): 355 | pass 356 | @classmethod 357 | def INPUT_TYPES(s): 358 | return { 359 | "required": { 360 | "JoyPipeline_alpha": ("JoyPipeline_alpha",), 361 | "image": ("IMAGE",), 362 | "prompt": ("STRING", {"multiline": True, "default": "A descriptive caption for this image"},), 363 | "max_new_tokens":("INT", {"default": 1024, "min": 10, "max": 4096, "step": 1}), 364 | "temperature": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}), 365 | "cache": ("BOOLEAN", {"default": False}), 366 | "low_vram": ("BOOLEAN", {"default": False}), 367 | "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}), 368 | } 369 | } 370 | 371 | CATEGORY = "CXH/LLM" 372 | RETURN_TYPES = ("STRING",) 373 | FUNCTION = "gen" 374 | def gen(self,JoyPipeline_alpha,image,prompt,max_new_tokens,temperature,cache,low_vram,seed): 375 | 376 | torch.cuda.empty_cache() 377 | 378 | if low_vram : 379 | unload_all_models() 380 | 381 | joy_pipeline = JoyPipeline_alpha 382 | if joy_pipeline.clip_processor == None : 383 | joy_pipeline.parent.loadCheckPoint() 384 | 385 | clip_processor = joy_pipeline.clip_processor 386 | tokenizer = joy_pipeline.tokenizer 387 | clip_model = joy_pipeline.clip_model 388 | image_adapter = joy_pipeline.image_adapter 389 | text_model = joy_pipeline.text_model 390 | 391 | 392 | 393 | input_image = tensor2pil(image) 394 | 395 | # Preprocess image 396 | # pImge = clip_processor(images=input_image, return_tensors='pt').pixel_values 397 | # pImge = pImge.to(DEVICE) 398 | 399 | image = input_image.resize((384, 384), Image.LANCZOS) 400 | pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0 401 | pixel_values = TVF.normalize(pixel_values, [0.5], [0.5]) 402 | pixel_values = pixel_values.to('cuda') 403 | 404 | # Tokenize the prompt 405 | # prompt = tokenizer.encode(prompt, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False) 406 | # Embed image 407 | 408 | with torch.amp.autocast_mode.autocast('cuda', enabled=True): 409 | vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True) 410 | embedded_images = image_adapter(vision_outputs.hidden_states) 411 | embedded_images = embedded_images.to('cuda') 412 | 413 | 414 | convo = [ 415 | { 416 | "role": "system", 417 | "content": "You are a helpful image 
captioner.", 418 | }, 419 | { 420 | "role": "user", 421 | "content": prompt, 422 | }, 423 | ] 424 | 425 | convo_string = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True) 426 | assert isinstance(convo_string, str) 427 | 428 | convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False) 429 | prompt_tokens = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False, truncation=False) 430 | assert isinstance(convo_tokens, torch.Tensor) and isinstance(prompt_tokens, torch.Tensor) 431 | convo_tokens = convo_tokens.squeeze(0) # Squeeze just to make the following easier 432 | prompt_tokens = prompt_tokens.squeeze(0) 433 | 434 | eot_id_indices = (convo_tokens == tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[ 435 | 0].tolist() 436 | assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}" 437 | 438 | preamble_len = eot_id_indices[1] - prompt_tokens.shape[0] # Number of tokens before the prompt 439 | 440 | 441 | # text_model = joy_two_pipeline.llm.load_llm_model(joy_two_pipeline.model) 442 | # Embed the tokens 443 | convo_embeds = text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to('cuda')) 444 | 445 | input_embeds = torch.cat([ 446 | convo_embeds[:, :preamble_len], # Part before the prompt 447 | embedded_images.to(dtype=convo_embeds.dtype), # Image 448 | convo_embeds[:, preamble_len:], # The prompt and anything after it 449 | ], dim=1).to('cuda') 450 | 451 | input_ids = torch.cat([ 452 | convo_tokens[:preamble_len].unsqueeze(0), 453 | torch.zeros((1, embedded_images.shape[1]), dtype=torch.long), 454 | # Dummy tokens for the image (TODO: Should probably use a special token here so as not to confuse any generation algorithms that might be inspecting the input) 455 | convo_tokens[preamble_len:].unsqueeze(0), 456 | ], dim=1).to('cuda') 457 | attention_mask = torch.ones_like(input_ids) 458 | 459 | generate_ids = text_model.generate(input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, 460 | max_new_tokens=max_new_tokens, do_sample=True, 461 | suppress_tokens=None) # Uses the default which is temp=0.6, top_p=0.9 462 | 463 | 464 | generate_ids = generate_ids[:, input_ids.shape[1]:] 465 | if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids( 466 | "<|eot_id|>"): 467 | generate_ids = generate_ids[:, :-1] 468 | 469 | caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0] 470 | 471 | if cache == False: 472 | joy_pipeline.parent.clearCache() 473 | torch.cuda.empty_cache() 474 | import gc 475 | gc.collect() 476 | if low_vram: 477 | unload_all_models() 478 | soft_empty_cache() 479 | 480 | return (caption.strip(), ) 481 | 482 | 483 | # ===============批量打标============= 484 | class Joy_caption_alpha_batch: 485 | 486 | def __init__(self): 487 | pass 488 | @classmethod 489 | def INPUT_TYPES(s): 490 | return { 491 | "required": { 492 | "JoyPipeline_alpha": ("JoyPipeline_alpha",), 493 | "img_dir": ("STRING", {"multiline": True, "default": ""},), 494 | "save_dir": ("STRING", {"multiline": True, "default": ""},), 495 | "trigger": ("STRING", {"multiline": False, "default": "trigger"},), 496 | "prompt": ("STRING", {"multiline": True, "default": "A descriptive caption for this image"},), 497 | "format": (["png", "jpg"],), 498 | "max_new_tokens":("INT", {"default": 1024, "min": 10, "max": 4096, "step": 1}), 499 | "temperature": ("FLOAT", 
{"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}), 500 | "cache": ("BOOLEAN", {"default": False}), 501 | "low_vram": ("BOOLEAN", {"default": False}), 502 | "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}), 503 | } 504 | } 505 | 506 | CATEGORY = "CXH/LLM" 507 | RETURN_TYPES = ("STRING",) 508 | FUNCTION = "gen" 509 | def gen(self,JoyPipeline_alpha,img_dir,save_dir,trigger,prompt,format,max_new_tokens,temperature,cache,low_vram,seed): 510 | 511 | torch.cuda.empty_cache() 512 | directory = img_dir 513 | if low_vram : 514 | unload_all_models() 515 | 516 | joy_pipeline = JoyPipeline_alpha 517 | if joy_pipeline.clip_processor == None : 518 | joy_pipeline.parent.loadCheckPoint() 519 | 520 | clip_processor = joy_pipeline.clip_processor 521 | tokenizer = joy_pipeline.tokenizer 522 | clip_model = joy_pipeline.clip_model 523 | image_adapter = joy_pipeline.image_adapter 524 | text_model = joy_pipeline.text_model 525 | 526 | # 批量读取 527 | if not os.path.isdir(directory): 528 | raise FileNotFoundError(f"Directory '{directory}' cannot be found.") 529 | dir_files = os.listdir(directory) 530 | if len(dir_files) == 0: 531 | raise FileNotFoundError(f"No files in directory '{directory}'.") 532 | 533 | valid_extensions = ['.jpg', '.jpeg', '.png', '.webp'] 534 | dir_files = [f for f in dir_files if any(f.lower().endswith(ext) for ext in valid_extensions)] 535 | 536 | dir_files = sorted(dir_files) 537 | dir_files = [os.path.join(directory, x) for x in dir_files] 538 | 539 | # 创建保存目录 540 | if not os.path.exists(save_dir): 541 | os.makedirs(save_dir) 542 | 543 | convo = [ 544 | { 545 | "role": "system", 546 | "content": "You are a helpful image captioner.", 547 | }, 548 | { 549 | "role": "user", 550 | "content": prompt, 551 | }, 552 | ] 553 | 554 | convo_string = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True) 555 | assert isinstance(convo_string, str) 556 | 557 | convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False) 558 | prompt_tokens = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False, truncation=False) 559 | assert isinstance(convo_tokens, torch.Tensor) and isinstance(prompt_tokens, torch.Tensor) 560 | convo_tokens = convo_tokens.squeeze(0) # Squeeze just to make the following easier 561 | prompt_tokens = prompt_tokens.squeeze(0) 562 | 563 | eot_id_indices = (convo_tokens == tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[ 564 | 0].tolist() 565 | assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}" 566 | 567 | preamble_len = eot_id_indices[1] - prompt_tokens.shape[0] # Number of tokens before the prompt 568 | 569 | 570 | # text_model = joy_two_pipeline.llm.load_llm_model(joy_two_pipeline.model) 571 | # Embed the tokens 572 | convo_embeds = text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to('cuda')) 573 | 574 | index1 = 0 575 | for image_path in dir_files: 576 | if os.path.isdir(image_path) and os.path.ex: 577 | continue 578 | start = time.time() 579 | 580 | input_image = open_image(image_path) 581 | input_image = ImageOps.exif_transpose(input_image) 582 | input_image = input_image.convert("RGB") 583 | 584 | 585 | image = input_image.resize((384, 384), Image.LANCZOS) 586 | pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0 587 | pixel_values = TVF.normalize(pixel_values, [0.5], [0.5]) 588 | pixel_values = pixel_values.to('cuda') 589 | 590 | 591 | with torch.amp.autocast_mode.autocast('cuda', enabled=True): 
592 | vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True) 593 | embedded_images = image_adapter(vision_outputs.hidden_states) 594 | embedded_images = embedded_images.to('cuda') 595 | 596 | input_embeds = torch.cat([ 597 | convo_embeds[:, :preamble_len], # Part before the prompt 598 | embedded_images.to(dtype=convo_embeds.dtype), # Image 599 | convo_embeds[:, preamble_len:], # The prompt and anything after it 600 | ], dim=1).to('cuda') 601 | 602 | input_ids = torch.cat([ 603 | convo_tokens[:preamble_len].unsqueeze(0), 604 | torch.zeros((1, embedded_images.shape[1]), dtype=torch.long), 605 | # Dummy tokens for the image (TODO: Should probably use a special token here so as not to confuse any generation algorithms that might be inspecting the input) 606 | convo_tokens[preamble_len:].unsqueeze(0), 607 | ], dim=1).to('cuda') 608 | attention_mask = torch.ones_like(input_ids) 609 | 610 | generate_ids = text_model.generate(input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, 611 | max_new_tokens=max_new_tokens, do_sample=True, 612 | suppress_tokens=None) # Uses the default which is temp=0.6, top_p=0.9 613 | 614 | 615 | generate_ids = generate_ids[:, input_ids.shape[1]:] 616 | if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids( 617 | "<|eot_id|>"): 618 | generate_ids = generate_ids[:, :-1] 619 | 620 | caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0] 621 | # 提示词 622 | lenName = str(index1) 623 | txt_content = trigger + "," + caption.strip() 624 | txt_file_name = f"{trigger}_{lenName}.txt" 625 | txt_save_path = os.path.join(save_dir, txt_file_name) 626 | try: 627 | with open(txt_save_path, 'w', encoding='utf-8') as file: 628 | file.write(txt_content) 629 | except IOError as e: 630 | print(f"保存文件时发生错误: {e}") 631 | # 图片 632 | img_file_name = f"{trigger}_{lenName}.{format}" 633 | if format != "png": 634 | if input_image.mode == "RGBA": 635 | input_image = input_image.convert("RGB") 636 | img_save_path = os.path.join(save_dir, img_file_name) 637 | input_image.save(img_save_path) 638 | end = time.time() 639 | execution_time = calculate_seconds_difference(start, end) 640 | temp = f":{execution_time:.3f}s" 641 | index1 = index1 + 1 642 | print(str(index1)+"/"+str(len(dir_files)) +":"+temp) 643 | print("finish结束") 644 | 645 | if cache == False: 646 | joy_pipeline.parent.clearCache() 647 | torch.cuda.empty_cache() 648 | import gc 649 | gc.collect() 650 | if low_vram: 651 | unload_all_models() 652 | soft_empty_cache() 653 | lenName = len(os.listdir(save_dir)) 654 | return (str(lenName/2), ) 655 | 656 | # ===============批量打标============= 657 | def get_subdirectories(directory): 658 | # 检查目录是否存在 659 | if not os.path.isdir(directory): 660 | raise FileNotFoundError(f"Directory '{directory}' cannot be found.") 661 | 662 | # 获取目录中的所有文件夹 663 | subdirectories = [name for name in os.listdir(directory) if os.path.isdir(os.path.join(directory, name))] 664 | return subdirectories 665 | 666 | def get_trigger_from_string(s): 667 | # Split the string by the underscore character 668 | parts = s.split('_') 669 | # Check if the length of the parts is at least 2 670 | if len(parts) >= 2: 671 | # Return the second part which is the trigger 672 | return parts[1] 673 | else: 674 | # Return None if the format is not as expected 675 | return None 676 | 677 | class Joy_caption_alpha_batch_Dirs: 678 | 679 | def __init__(self): 680 | pass 681 | @classmethod 682 | def 
INPUT_TYPES(s): 683 | return { 684 | "required": { 685 | "JoyPipeline_alpha": ("JoyPipeline_alpha",), 686 | "img_dir": ("STRING", {"multiline": True, "default": ""},), 687 | "save_dir": ("STRING", {"multiline": True, "default": ""},), 688 | "prompt": ("STRING", {"multiline": True, "default": "A descriptive caption for this image"},), 689 | "format": (["png", "jpg"],), 690 | "max_new_tokens":("INT", {"default": 1024, "min": 10, "max": 4096, "step": 1}), 691 | "temperature": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}), 692 | "cache": ("BOOLEAN", {"default": False}), 693 | "low_vram": ("BOOLEAN", {"default": False}), 694 | "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}), 695 | } 696 | } 697 | 698 | CATEGORY = "CXH/LLM" 699 | RETURN_TYPES = ("STRING",) 700 | FUNCTION = "gen" 701 | def gen(self,JoyPipeline_alpha,img_dir,save_dir,prompt,format,max_new_tokens,temperature,cache,low_vram,seed): 702 | 703 | torch.cuda.empty_cache() 704 | directory = img_dir 705 | if low_vram : 706 | unload_all_models() 707 | 708 | joy_pipeline = JoyPipeline_alpha 709 | if joy_pipeline.clip_processor == None : 710 | joy_pipeline.parent.loadCheckPoint() 711 | 712 | clip_processor = joy_pipeline.clip_processor 713 | tokenizer = joy_pipeline.tokenizer 714 | clip_model = joy_pipeline.clip_model 715 | image_adapter = joy_pipeline.image_adapter 716 | text_model = joy_pipeline.text_model 717 | 718 | # 批量读取 719 | if not os.path.isdir(directory): 720 | raise FileNotFoundError(f"Directory '{directory}' cannot be found.") 721 | 722 | convo = [ 723 | { 724 | "role": "system", 725 | "content": "You are a helpful image captioner.", 726 | }, 727 | { 728 | "role": "user", 729 | "content": prompt, 730 | }, 731 | ] 732 | convo_string = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True) 733 | assert isinstance(convo_string, str) 734 | 735 | convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False) 736 | prompt_tokens = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False, truncation=False) 737 | assert isinstance(convo_tokens, torch.Tensor) and isinstance(prompt_tokens, torch.Tensor) 738 | convo_tokens = convo_tokens.squeeze(0) # Squeeze just to make the following easier 739 | prompt_tokens = prompt_tokens.squeeze(0) 740 | 741 | eot_id_indices = (convo_tokens == tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[ 742 | 0].tolist() 743 | assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}" 744 | 745 | preamble_len = eot_id_indices[1] - prompt_tokens.shape[0] # Number of tokens before the prompt 746 | 747 | 748 | # text_model = joy_two_pipeline.llm.load_llm_model(joy_two_pipeline.model) 749 | # Embed the tokens 750 | convo_embeds = text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to('cuda')) 751 | subdirs = get_subdirectories(directory) 752 | 753 | for subdir in subdirs: 754 | print("开始文件夹:"+subdir) 755 | subdir_path = os.path.join(directory, subdir) 756 | if not os.path.isdir(subdir_path): 757 | continue 758 | dir_files = os.listdir(subdir_path) 759 | if len(dir_files) == 0: 760 | raise FileNotFoundError(f"No files in directory '{directory}'.") 761 | 762 | valid_extensions = ['.jpg', '.jpeg', '.png', '.webp'] 763 | dir_files = [f for f in dir_files if any(f.lower().endswith(ext) for ext in valid_extensions)] 764 | 765 | dir_files = sorted(dir_files) 766 | dir_files = [os.path.join(subdir_path, x) for x in dir_files] 767 | 768 | # 创建保存目录 
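            # Sub-folder naming rule from the README: "<name>_<trigger>" (名字_trigg). The helper
            # get_trigger_from_string() defined above returns the second "_"-separated token, e.g.:
            #     get_trigger_from_string("dress_mylora")  # -> "mylora"; captions are saved as "mylora,<caption>"
            #     get_trigger_from_string("portraits")     # -> None; captions are saved without a trigger prefix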
769 | if not os.path.exists(save_dir): 770 | os.makedirs(save_dir) 771 | if not os.path.exists(os.path.join(save_dir,subdir)): 772 | os.makedirs(os.path.join(save_dir,subdir)) 773 | 774 | index1 = 0 775 | for image_path in dir_files: 776 | if os.path.isdir(image_path) and os.path.ex: 777 | continue 778 | start = time.time() 779 | # print(image_path) 780 | input_image = open_image(image_path) 781 | input_image = ImageOps.exif_transpose(input_image) 782 | input_image = input_image.convert("RGB") 783 | 784 | 785 | image = input_image.resize((384, 384), Image.LANCZOS) 786 | pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0 787 | pixel_values = TVF.normalize(pixel_values, [0.5], [0.5]) 788 | pixel_values = pixel_values.to('cuda') 789 | 790 | 791 | with torch.amp.autocast_mode.autocast('cuda', enabled=True): 792 | vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True) 793 | embedded_images = image_adapter(vision_outputs.hidden_states) 794 | embedded_images = embedded_images.to('cuda') 795 | 796 | input_embeds = torch.cat([ 797 | convo_embeds[:, :preamble_len], # Part before the prompt 798 | embedded_images.to(dtype=convo_embeds.dtype), # Image 799 | convo_embeds[:, preamble_len:], # The prompt and anything after it 800 | ], dim=1).to('cuda') 801 | 802 | input_ids = torch.cat([ 803 | convo_tokens[:preamble_len].unsqueeze(0), 804 | torch.zeros((1, embedded_images.shape[1]), dtype=torch.long), 805 | # Dummy tokens for the image (TODO: Should probably use a special token here so as not to confuse any generation algorithms that might be inspecting the input) 806 | convo_tokens[preamble_len:].unsqueeze(0), 807 | ], dim=1).to('cuda') 808 | attention_mask = torch.ones_like(input_ids) 809 | 810 | generate_ids = text_model.generate(input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, 811 | max_new_tokens=max_new_tokens, do_sample=True, 812 | suppress_tokens=None) # Uses the default which is temp=0.6, top_p=0.9 813 | 814 | 815 | generate_ids = generate_ids[:, input_ids.shape[1]:] 816 | if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids( 817 | "<|eot_id|>"): 818 | generate_ids = generate_ids[:, :-1] 819 | 820 | caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0] 821 | # 提示词 822 | lenName = str(index1) 823 | trigger = get_trigger_from_string(subdir) 824 | if trigger is not None: 825 | txt_content = trigger + "," + caption.strip() 826 | txt_file_name = f"{trigger}_{lenName}.txt" 827 | txt_save_path = os.path.join(save_dir,subdir, txt_file_name) 828 | img_file_name = f"{trigger}_{lenName}.{format}" 829 | else: 830 | txt_content = caption.strip() 831 | txt_file_name = f"{lenName}.txt" 832 | txt_save_path = os.path.join(save_dir,subdir, txt_file_name) 833 | img_file_name = f"{lenName}.{format}" 834 | try: 835 | with open(txt_save_path, 'w', encoding='utf-8') as file: 836 | file.write(txt_content) 837 | except IOError as e: 838 | print(f"保存文件时发生错误: {e}") 839 | # 图片 840 | 841 | if format != "png": 842 | if input_image.mode == "RGBA": 843 | input_image = input_image.convert("RGB") 844 | img_save_path = os.path.join(save_dir,subdir, img_file_name) 845 | input_image.save(img_save_path) 846 | end = time.time() 847 | execution_time = calculate_seconds_difference(start, end) 848 | temp = f":{execution_time:.3f}s" 849 | index1 = index1 + 1 850 | print(str(index1)+"/"+str(len(dir_files)) +":"+temp) 851 | print("结束"+subdir) 852 | index1 = 0 853 | 854 | 
if cache == False: 855 | joy_pipeline.parent.clearCache() 856 | torch.cuda.empty_cache() 857 | import gc 858 | gc.collect() 859 | if low_vram: 860 | unload_all_models() 861 | soft_empty_cache() 862 | lenName = len(os.listdir(save_dir)) 863 | return (str(lenName/2), ) 864 | 865 | -------------------------------------------------------------------------------- /Joy_caption_node.py: -------------------------------------------------------------------------------- 1 | 2 | from huggingface_hub import InferenceClient 3 | from torch import nn 4 | from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM 5 | from pathlib import Path 6 | import torch 7 | import torch.amp.autocast_mode 8 | from PIL import Image 9 | import os 10 | import folder_paths 11 | 12 | from .lib.ximg import * 13 | from .lib.xmodel import * 14 | 15 | from model_management import get_torch_device 16 | DEVICE = get_torch_device() 17 | # def get_torch_device(): 18 | # """ 19 | # 返回PyTorch模型应该运行的设备(CPU或GPU) 20 | # 如果系统支持CUDA并且至少有一个GPU可用,则返回GPU设备;否则返回CPU设备。 21 | # """ 22 | # if torch.cuda.is_available(): 23 | # # 选择第一个可用的GPU 24 | # device = torch.device("cuda:0") 25 | # print(f"There are {torch.cuda.device_count()} GPU(s) available.") 26 | # print(f"We will use the GPU: {device}") 27 | # else: 28 | # # 如果没有GPU可用,则使用CPU 29 | # device = torch.device("cpu") 30 | # print("No GPU available, using the CPU instead.") 31 | # return device 32 | 33 | class JoyPipeline: 34 | def __init__(self): 35 | self.clip_model = None 36 | self.clip_processor =None 37 | self.tokenizer = None 38 | self.text_model = None 39 | self.image_adapter = None 40 | self.parent = None 41 | 42 | def clearCache(self): 43 | self.clip_model = None 44 | self.clip_processor =None 45 | self.tokenizer = None 46 | self.text_model = None 47 | self.image_adapter = None 48 | 49 | 50 | class ImageAdapter(nn.Module): 51 | def __init__(self, input_features: int, output_features: int): 52 | super().__init__() 53 | self.linear1 = nn.Linear(input_features, output_features) 54 | self.activation = nn.GELU() 55 | self.linear2 = nn.Linear(output_features, output_features) 56 | 57 | def forward(self, vision_outputs: torch.Tensor): 58 | x = self.linear1(vision_outputs) 59 | x = self.activation(x) 60 | x = self.linear2(x) 61 | return x 62 | 63 | class Joy_caption_load: 64 | 65 | def __init__(self): 66 | self.model = None 67 | self.pipeline = JoyPipeline() 68 | self.pipeline.parent = self 69 | pass 70 | 71 | @classmethod 72 | def INPUT_TYPES(s): 73 | return { 74 | "required": { 75 | "model": (["unsloth/Meta-Llama-3.1-8B-bnb-4bit", "meta-llama/Meta-Llama-3.1-8B"],), 76 | 77 | } 78 | } 79 | 80 | CATEGORY = "CXH/LLM" 81 | RETURN_TYPES = ("JoyPipeline",) 82 | FUNCTION = "gen" 83 | 84 | def loadCheckPoint(self): 85 | # 清除一波 86 | if self.pipeline != None: 87 | self.pipeline.clearCache() 88 | 89 | # clip 90 | model_id = "google/siglip-so400m-patch14-384" 91 | CLIP_PATH = download_hg_model(model_id,"clip") 92 | 93 | clip_processor = AutoProcessor.from_pretrained(CLIP_PATH) 94 | clip_model = AutoModel.from_pretrained( 95 | CLIP_PATH, 96 | trust_remote_code=True 97 | ) 98 | 99 | clip_model = clip_model.vision_model 100 | clip_model.eval() 101 | clip_model.requires_grad_(False) 102 | clip_model.to("cuda") 103 | 104 | 105 | # LLM 106 | MODEL_PATH = download_hg_model(self.model,"LLM") 107 | tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH,use_fast=False) 108 | assert isinstance(tokenizer, PreTrainedTokenizer) or 
isinstance(tokenizer, PreTrainedTokenizerFast), f"Tokenizer is of type {type(tokenizer)}" 109 | 110 | text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto",trust_remote_code=True) 111 | text_model.eval() 112 | 113 | # Image Adapter 114 | adapter_path = os.path.join(folder_paths.models_dir,"Joy_caption","image_adapter.pt") 115 | 116 | image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size) # ImageAdapter(clip_model.config.hidden_size, 4096) 117 | image_adapter.load_state_dict(torch.load(adapter_path, map_location="cpu")) 118 | adjusted_adapter = image_adapter #AdjustedImageAdapter(image_adapter, text_model.config.hidden_size) 119 | adjusted_adapter.eval() 120 | adjusted_adapter.to("cuda") 121 | 122 | self.pipeline.clip_model = clip_model 123 | self.pipeline.clip_processor = clip_processor 124 | self.pipeline.tokenizer = tokenizer 125 | self.pipeline.text_model = text_model 126 | self.pipeline.image_adapter = adjusted_adapter 127 | 128 | def clearCache(self): 129 | if self.pipeline != None: 130 | self.pipeline.clearCache() 131 | 132 | def gen(self,model): 133 | if self.model == None or self.model != model or self.pipeline == None: 134 | self.model = model 135 | self.loadCheckPoint() 136 | return (self.pipeline,) 137 | 138 | class Joy_caption: 139 | 140 | def __init__(self): 141 | pass 142 | 143 | @classmethod 144 | def INPUT_TYPES(s): 145 | return { 146 | "required": { 147 | "joy_pipeline": ("JoyPipeline",), 148 | "image": ("IMAGE",), 149 | "prompt": ("STRING", {"multiline": True, "default": "A descriptive caption for this image"},), 150 | "max_new_tokens":("INT", {"default": 1024, "min": 10, "max": 4096, "step": 1}), 151 | "temperature": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}), 152 | "cache": ("BOOLEAN", {"default": False}), 153 | } 154 | } 155 | 156 | CATEGORY = "CXH/LLM" 157 | RETURN_TYPES = ("STRING",) 158 | FUNCTION = "gen" 159 | def gen(self,joy_pipeline,image,prompt,max_new_tokens,temperature,cache): 160 | 161 | if joy_pipeline.clip_processor == None : 162 | joy_pipeline.parent.loadCheckPoint() 163 | 164 | clip_processor = joy_pipeline.clip_processor 165 | tokenizer = joy_pipeline.tokenizer 166 | clip_model = joy_pipeline.clip_model 167 | image_adapter = joy_pipeline.image_adapter 168 | text_model = joy_pipeline.text_model 169 | 170 | 171 | 172 | input_image = tensor2pil(image) 173 | 174 | # Preprocess image 175 | pImge = clip_processor(images=input_image, return_tensors='pt').pixel_values 176 | pImge = pImge.to(DEVICE) 177 | 178 | # Tokenize the prompt 179 | prompt = tokenizer.encode(prompt, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False) 180 | # Embed image 181 | with torch.amp.autocast_mode.autocast(str(DEVICE), enabled=True): 182 | vision_outputs = clip_model(pixel_values=pImge, output_hidden_states=True) 183 | image_features = vision_outputs.hidden_states[-2] 184 | embedded_images = image_adapter(image_features) 185 | embedded_images = embedded_images.to(DEVICE) 186 | 187 | # Embed prompt 188 | prompt_embeds = text_model.model.embed_tokens(prompt.to(DEVICE)) 189 | assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}" 190 | embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64)) 191 | 192 | # Construct prompts 193 | inputs_embeds = torch.cat([ 194 | 
embedded_bos.expand(embedded_images.shape[0], -1, -1), 195 | embedded_images.to(dtype=embedded_bos.dtype), 196 | prompt_embeds.expand(embedded_images.shape[0], -1, -1), 197 | ], dim=1) 198 | 199 | input_ids = torch.cat([ 200 | torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long), 201 | torch.zeros((1, embedded_images.shape[1]), dtype=torch.long), 202 | prompt, 203 | ], dim=1).to(DEVICE) 204 | attention_mask = torch.ones_like(input_ids) 205 | 206 | generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=max_new_tokens, do_sample=True, top_k=10, temperature=temperature, suppress_tokens=None) 207 | 208 | # Trim off the prompt 209 | generate_ids = generate_ids[:, input_ids.shape[1]:] 210 | if generate_ids[0][-1] == tokenizer.eos_token_id: 211 | generate_ids = generate_ids[:, :-1] 212 | 213 | caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0] 214 | r = caption.strip() 215 | 216 | if cache == False: 217 | joy_pipeline.parent.clearCache() 218 | 219 | return (r,) 220 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 添加批量文件夹打标:文件夹命名规则 名字_trigg 2 | ![workflow](https://github.com/user-attachments/assets/d30a2d7f-918a-4837-b85c-be01913d2775) 3 | ![1737366489766](https://github.com/user-attachments/assets/cb885492-a158-49bf-ba2e-956a1ba2d780) 4 | 5 | 6 | .20240-10-30 添加批量图片分类 7 | 8 | ![workflow_min2 6classifiy_](https://github.com/user-attachments/assets/1687cc01-89c4-4628-8f8c-abc641c62a43) 9 | 10 | 11 | .2024-10-16 添加批量打标:4090大概4~5秒一张图 12 | 13 | ![批量打标](https://github.com/user-attachments/assets/15e4075b-ed78-4e88-b586-09f65483c991) 14 | 15 | ![1729064090078](https://github.com/user-attachments/assets/bb61ac24-5bec-4018-98cf-8007533d4dbc) 16 | 17 | .2024-10-12 添加joy alpha2 18 | 19 | 模型下载:https://pan.baidu.com/s/1dOjbUEacUOhzFitAQ3uIeQ?pwd=4ypv#list/path=%2F 20 | 21 | Joy_caption_alpha 放到 models\Joy_caption_alpha 下载:https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-two/tree/main/cgrkzexw-599808 22 | 23 | ![1728728834716](https://github.com/user-attachments/assets/3adc7c92-1247-436e-8589-f5c64d33378e) 24 | 25 | 26 | ![joy_alpha](https://github.com/user-attachments/assets/4ab7de6a-405e-405b-b03e-0850522e3951) 27 | 28 | 29 | .2024-9-9 florence2 Add Florence-2-large-PromptGen-v1.5 and MiniCPM3-4B(CXH_MinCP3_4B_Load CXH_MinCP3_4B_Chat) 30 | MiniCPM3-4B聊天 翻译,改写都很强 31 | 32 | .2024-9-6 florence2 Add Florence-2-base-PromptGen-v1.5 33 | 34 | .2024-9-2 更新批量打标案例(Update batch marking cases) 速度:florence2 list[str]: 23 | if not str(filename).endswith("modeling_florence2.py"): 24 | return get_imports(filename) 25 | imports = get_imports(filename) 26 | # imports.remove("flash_attn") 27 | return imports 28 | 29 | 30 | import comfy.model_management as mm 31 | from comfy.utils import ProgressBar 32 | import folder_paths 33 | 34 | script_directory = os.path.dirname(os.path.abspath(__file__)) 35 | 36 | from transformers import AutoModelForCausalLM, AutoProcessor 37 | 38 | class CXH_DownloadAndLoadFlorence2Model: 39 | @classmethod 40 | def INPUT_TYPES(s): 41 | return {"required": { 42 | "model": ( 43 | [ 44 | 'microsoft/Florence-2-base', 45 | 'microsoft/Florence-2-base-ft', 46 | 'microsoft/Florence-2-large', 47 | 'microsoft/Florence-2-large-ft', 48 | 'HuggingFaceM4/Florence-2-DocVQA', 49 | 'thwri/CogFlorence-2-Large-Freeze', 50 | 'thwri/CogFlorence-2.2-Large', 51 | 'MiaoshouAI/Florence-2-base-PromptGen-v1.5', 52 | 'MiaoshouAI/Florence-2-large-PromptGen-v1.5' 53 | ], 54 | { 55 | "default": 'MiaoshouAI/Florence-2-large-PromptGen-v1.5' 56 | }), 57 | "precision": ([ 'fp16','bf16','fp32'], 58 | { 59 | "default": 'fp16' 60 | }), 61 | "attention": ( 62 | [ 'flash_attention_2', 'sdpa', 'eager'], 63 | { 64 | "default": 'sdpa' 65 | }), 66 | 67 | }, 68 | } 69 | 70 | RETURN_TYPES = ("FL2MODEL",) 71 | RETURN_NAMES = ("florence2_model",) 72 | FUNCTION = "loadmodel" 73 | CATEGORY = "CXH/LLM" 74 | 75 | def loadmodel(self, model, precision, attention): 76 | device = mm.get_torch_device() 77 | offload_device = mm.unet_offload_device() 78 | dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] 79 | 80 | model_name = model.rsplit('/', 1)[-1] 81 | model_path = os.path.join(folder_paths.models_dir, "LLM", model_name) 82 | 83 | if not os.path.exists(model_path): 84 | print(f"Downloading Lumina model to: {model_path}") 85 | from huggingface_hub import snapshot_download 86 | snapshot_download(repo_id=model, 87 | 
local_dir=model_path, 88 | local_dir_use_symlinks=False) 89 | 90 | print(f"using {attention} for attention") 91 | with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports): #workaround for unnecessary flash_attn requirement 92 | model = AutoModelForCausalLM.from_pretrained(model_path, attn_implementation=attention, device_map=device, torch_dtype=dtype,trust_remote_code=True) 93 | processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) 94 | 95 | florence2_model = { 96 | 'model': model, 97 | 'processor': processor, 98 | 'dtype': dtype 99 | } 100 | 101 | return (florence2_model,) 102 | 103 | def calculate_bounding_box(width, height, flat_points) -> List[float]: 104 | """ 105 | Calculate the bounding box for a polygon. 106 | 107 | Args: 108 | flat_points (list of int): Flat list of x, y coordinates defining the polygon points. 109 | 110 | Returns: 111 | tuple: (min_x, min_y, max_x, max_y) defining the bounding box. 112 | """ 113 | if not flat_points or len(flat_points) % 2 != 0: 114 | raise ValueError("The list of points must be non-empty and have an even number of elements") 115 | 116 | x_coords = flat_points[0::2] 117 | y_coords = flat_points[1::2] 118 | 119 | min_x = min(x_coords) 120 | max_x = max(x_coords) 121 | min_y = min(y_coords) 122 | max_y = max(y_coords) 123 | 124 | return [min_x / width, min_y / height, max_x / width, max_y / height] 125 | 126 | class CXH_Florence2Run: 127 | @classmethod 128 | def INPUT_TYPES(s): 129 | return { 130 | "required": { 131 | "image": ("IMAGE", ), 132 | "florence2_model": ("FL2MODEL", ), 133 | "text_input": ("STRING", {"default": "", "multiline": True}), 134 | "task": ( 135 | [ 136 | 'region_caption', 137 | 'dense_region_caption', 138 | 'region_proposal', 139 | 'caption', 140 | 'detailed_caption', 141 | 'more_detailed_caption', 142 | 'caption_to_phrase_grounding', 143 | 'referring_expression_segmentation', 144 | 'ocr', 145 | 'ocr_with_region', 146 | 'docvqa', 147 | 'mixed_caption(PromptGen 1.5)', 148 | 'generate_tags(PromptGen 1.5)' 149 | ], 150 | { 151 | "default": 'more_detailed_caption' 152 | } 153 | ), 154 | "fill_mask": ("BOOLEAN", {"default": True}), 155 | "keep_model_loaded": ("BOOLEAN", {"default": False}), 156 | "max_new_tokens": ("INT", {"default": 1024, "min": 1, "max": 4096}), 157 | "num_beams": ("INT", {"default": 3, "min": 1, "max": 64}), 158 | "do_sample": ("BOOLEAN", {"default": True}), 159 | "output_mask_select": ("STRING", {"default": ""}), 160 | "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}), 161 | } 162 | 163 | } 164 | 165 | RETURN_TYPES = ("IMAGE", "MASK", "STRING", "JSON") 166 | RETURN_NAMES =("image", "mask", "caption", "data") 167 | FUNCTION = "encode" 168 | CATEGORY = "Florence2" 169 | 170 | def encode(self, image, text_input, florence2_model, task, fill_mask,keep_model_loaded, 171 | num_beams, max_new_tokens, do_sample, output_mask_select,seed): 172 | device = mm.get_torch_device() 173 | _, height, width, _ = image.shape 174 | offload_device = mm.unet_offload_device() 175 | annotated_image_tensor = None 176 | mask_tensor = None 177 | processor = florence2_model['processor'] 178 | model = florence2_model['model'] 179 | dtype = florence2_model['dtype'] 180 | model.to(device) 181 | 182 | colormap = ['blue','orange','green','purple','brown','pink','olive','cyan','red', 183 | 'lime','indigo','violet','aqua','magenta','gold','tan','skyblue'] 184 | 185 | prompts = { 186 | 'region_caption': '', 187 | 'dense_region_caption': '', 188 | 'region_proposal': '', 189 | 'caption': '', 
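            # Note (assumption): Florence-2 pipelines normally map each task name to a
            # special task token such as '<CAPTION>' or '<OD>'; with the empty strings
            # kept here, task_prompt resolves to '' and only the free-form text_input
            # (if any) reaches the model.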
190 | 'detailed_caption': '', 191 | 'more_detailed_caption': '', 192 | 'caption_to_phrase_grounding': '', 193 | 'referring_expression_segmentation': '', 194 | 'ocr': '', 195 | 'ocr_with_region': '', 196 | 'docvqa': '', 197 | 'mixed_caption(PromptGen 1.5)':'', 198 | 'generate_tags(PromptGen 1.5)':'' 199 | } 200 | task_prompt = prompts.get(task, '') 201 | 202 | # if (task not in ['referring_expression_segmentation', 'caption_to_phrase_grounding', 'docvqa']) and text_input: 203 | # raise ValueError("Text input (prompt) is only supported for 'referring_expression_segmentation', 'caption_to_phrase_grounding', and 'docvqa'") 204 | 205 | if text_input != "": 206 | prompt = task_prompt + " " + text_input 207 | else: 208 | prompt = task_prompt 209 | 210 | image = image.permute(0, 3, 1, 2) 211 | 212 | out = [] 213 | out_masks = [] 214 | out_results = [] 215 | out_data = [] 216 | pbar = ProgressBar(len(image)) 217 | for img in image: 218 | image_pil = F.to_pil_image(img) 219 | inputs = processor(text=prompt, images=image_pil, return_tensors="pt", do_rescale=False).to(dtype).to(device) 220 | 221 | generated_ids = model.generate( 222 | input_ids=inputs["input_ids"], 223 | pixel_values=inputs["pixel_values"], 224 | max_new_tokens=max_new_tokens, 225 | do_sample=do_sample, 226 | num_beams=num_beams, 227 | ) 228 | 229 | results = processor.batch_decode(generated_ids, skip_special_tokens=False)[0] 230 | print(results) 231 | # cleanup the special tokens from the final list 232 | if task == 'ocr_with_region': 233 | clean_results = str(results) 234 | cleaned_string = re.sub(r'|<[^>]*>', '\n', clean_results) 235 | clean_results = re.sub(r'\n+', '\n', cleaned_string) 236 | else: 237 | clean_results = str(results) 238 | clean_results = clean_results.replace('', '') 239 | clean_results = clean_results.replace('', '') 240 | 241 | #return single string if only one image for compatibility with nodes that can't handle string lists 242 | if len(image) == 1: 243 | out_results = clean_results 244 | else: 245 | out_results.append(clean_results) 246 | 247 | W, H = image_pil.size 248 | 249 | parsed_answer = processor.post_process_generation(results, task=task_prompt, image_size=(W, H)) 250 | 251 | if task == 'region_caption' or task == 'dense_region_caption' or task == 'caption_to_phrase_grounding' or task == 'region_proposal': 252 | fig, ax = plt.subplots(figsize=(W / 100, H / 100), dpi=100) 253 | fig.subplots_adjust(left=0, right=1, top=1, bottom=0) 254 | ax.imshow(image_pil) 255 | bboxes = parsed_answer[task_prompt]['bboxes'] 256 | labels = parsed_answer[task_prompt]['labels'] 257 | 258 | mask_indexes = [] 259 | # Determine mask indexes outside the loop 260 | if output_mask_select != "": 261 | mask_indexes = [n for n in output_mask_select.split(",")] 262 | print(mask_indexes) 263 | else: 264 | mask_indexes = [str(i) for i in range(len(bboxes))] 265 | 266 | # Initialize mask_layer only if needed 267 | if fill_mask: 268 | mask_layer = Image.new('RGB', image_pil.size, (0, 0, 0)) 269 | mask_draw = ImageDraw.Draw(mask_layer) 270 | 271 | for index, (bbox, label) in enumerate(zip(bboxes, labels)): 272 | # Modify the label to include the index 273 | indexed_label = f"{index}.{label}" 274 | 275 | if fill_mask: 276 | if str(index) in mask_indexes: 277 | print("match index:", str(index), "in mask_indexes:", mask_indexes) 278 | mask_draw.rectangle([bbox[0], bbox[1], bbox[2], bbox[3]], fill=(255, 255, 255)) 279 | if label in mask_indexes: 280 | print("match label") 281 | mask_draw.rectangle([bbox[0], bbox[1], bbox[2], bbox[3]], 
fill=(255, 255, 255)) 282 | 283 | # Create a Rectangle patch 284 | rect = patches.Rectangle( 285 | (bbox[0], bbox[1]), # (x,y) - lower left corner 286 | bbox[2] - bbox[0], # Width 287 | bbox[3] - bbox[1], # Height 288 | linewidth=1, 289 | edgecolor='r', 290 | facecolor='none', 291 | label=indexed_label 292 | ) 293 | # Calculate text width with a rough estimation 294 | text_width = len(label) * 6 # Adjust multiplier based on your font size 295 | text_height = 12 # Adjust based on your font size 296 | 297 | # Initial text position 298 | text_x = bbox[0] 299 | text_y = bbox[1] - text_height # Position text above the top-left of the bbox 300 | 301 | # Adjust text_x if text is going off the left or right edge 302 | if text_x < 0: 303 | text_x = 0 304 | elif text_x + text_width > W: 305 | text_x = W - text_width 306 | 307 | # Adjust text_y if text is going off the top edge 308 | if text_y < 0: 309 | text_y = bbox[3] # Move text below the bottom-left of the bbox if it doesn't overlap with bbox 310 | 311 | # Add the rectangle to the plot 312 | ax.add_patch(rect) 313 | facecolor = random.choice(colormap) if len(image) == 1 else 'red' 314 | # Add the label 315 | plt.text( 316 | text_x, 317 | text_y, 318 | indexed_label, 319 | color='white', 320 | fontsize=12, 321 | bbox=dict(facecolor=facecolor, alpha=0.5) 322 | ) 323 | if fill_mask: 324 | mask_tensor = F.to_tensor(mask_layer) 325 | mask_tensor = mask_tensor.unsqueeze(0).permute(0, 2, 3, 1).cpu().float() 326 | mask_tensor = mask_tensor.mean(dim=0, keepdim=True) 327 | mask_tensor = mask_tensor.repeat(1, 1, 1, 3) 328 | mask_tensor = mask_tensor[:, :, :, 0] 329 | out_masks.append(mask_tensor) 330 | 331 | # Remove axis and padding around the image 332 | ax.axis('off') 333 | ax.margins(0,0) 334 | ax.get_xaxis().set_major_locator(plt.NullLocator()) 335 | ax.get_yaxis().set_major_locator(plt.NullLocator()) 336 | fig.canvas.draw() 337 | buf = io.BytesIO() 338 | plt.savefig(buf, format='png', pad_inches=0) 339 | buf.seek(0) 340 | annotated_image_pil = Image.open(buf) 341 | 342 | annotated_image_tensor = F.to_tensor(annotated_image_pil) 343 | out_tensor = annotated_image_tensor[:3, :, :].unsqueeze(0).permute(0, 2, 3, 1).cpu().float() 344 | out.append(out_tensor) 345 | 346 | 347 | pbar.update(1) 348 | 349 | plt.close(fig) 350 | 351 | elif task == 'referring_expression_segmentation': 352 | # Create a new black image 353 | mask_image = Image.new('RGB', (W, H), 'black') 354 | mask_draw = ImageDraw.Draw(mask_image) 355 | 356 | predictions = parsed_answer[task_prompt] 357 | 358 | # Iterate over polygons and labels 359 | for polygons, label in zip(predictions['polygons'], predictions['labels']): 360 | color = random.choice(colormap) 361 | for _polygon in polygons: 362 | _polygon = np.array(_polygon).reshape(-1, 2) 363 | # Clamp polygon points to image boundaries 364 | _polygon = np.clip(_polygon, [0, 0], [W - 1, H - 1]) 365 | if len(_polygon) < 3: 366 | print('Invalid polygon:', _polygon) 367 | continue 368 | 369 | _polygon = _polygon.reshape(-1).tolist() 370 | 371 | # Draw the polygon 372 | if fill_mask: 373 | overlay = Image.new('RGBA', image_pil.size, (255, 255, 255, 0)) 374 | image_pil = image_pil.convert('RGBA') 375 | draw = ImageDraw.Draw(overlay) 376 | color_with_opacity = ImageColor.getrgb(color) + (180,) 377 | draw.polygon(_polygon, outline=color, fill=color_with_opacity, width=3) 378 | image_pil = Image.alpha_composite(image_pil, overlay) 379 | else: 380 | draw = ImageDraw.Draw(image_pil) 381 | draw.polygon(_polygon, outline=color, width=3) 382 | 383 | 
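                    # The same clamped polygon is also rasterized filled-white onto
                    # mask_image just below, so the returned MASK mirrors whatever was
                    # drawn on the annotated image.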
#draw mask 384 | mask_draw.polygon(_polygon, outline="white", fill="white") 385 | 386 | image_tensor = F.to_tensor(image_pil) 387 | image_tensor = image_tensor[:3, :, :].unsqueeze(0).permute(0, 2, 3, 1).cpu().float() 388 | out.append(image_tensor) 389 | 390 | mask_tensor = F.to_tensor(mask_image) 391 | mask_tensor = mask_tensor.unsqueeze(0).permute(0, 2, 3, 1).cpu().float() 392 | mask_tensor = mask_tensor.mean(dim=0, keepdim=True) 393 | mask_tensor = mask_tensor.repeat(1, 1, 1, 3) 394 | mask_tensor = mask_tensor[:, :, :, 0] 395 | out_masks.append(mask_tensor) 396 | pbar.update(1) 397 | 398 | elif task == 'ocr_with_region': 399 | try: 400 | font = ImageFont.load_default().font_variant(size=24) 401 | except: 402 | font = ImageFont.load_default() 403 | predictions = parsed_answer[task_prompt] 404 | scale = 1 405 | draw = ImageDraw.Draw(image_pil) 406 | bboxes, labels = predictions['quad_boxes'], predictions['labels'] 407 | 408 | for box, label in zip(bboxes, labels): 409 | bbox = calculate_bounding_box(width, height, box) 410 | out_data.append({"label": label, "polygon": box, "box": bbox}) 411 | color = random.choice(colormap) 412 | new_box = (np.array(box) * scale).tolist() 413 | draw.polygon(new_box, width=3, outline=color) 414 | draw.text((new_box[0]+8, new_box[1]+2), 415 | "{}".format(label), 416 | align="right", 417 | font=font, 418 | fill=color) 419 | 420 | image_tensor = F.to_tensor(image_pil) 421 | image_tensor = image_tensor[:3, :, :].unsqueeze(0).permute(0, 2, 3, 1).cpu().float() 422 | out.append(image_tensor) 423 | 424 | elif task == 'docvqa': 425 | if text_input == "": 426 | raise ValueError("Text input (prompt) is required for 'docvqa'") 427 | prompt = " " + text_input 428 | 429 | inputs = processor(text=prompt, images=image_pil, return_tensors="pt", do_rescale=False).to(dtype).to(device) 430 | generated_ids = model.generate( 431 | input_ids=inputs["input_ids"], 432 | pixel_values=inputs["pixel_values"], 433 | max_new_tokens=max_new_tokens, 434 | do_sample=do_sample, 435 | num_beams=num_beams, 436 | ) 437 | 438 | results = processor.batch_decode(generated_ids, skip_special_tokens=False)[0] 439 | clean_results = results.replace('', '').replace('', '') 440 | 441 | if len(image) == 1: 442 | out_results = clean_results 443 | else: 444 | out_results.append(clean_results) 445 | 446 | out.append(F.to_tensor(image_pil).unsqueeze(0).permute(0, 2, 3, 1).cpu().float()) 447 | 448 | pbar.update(1) 449 | 450 | if len(out) > 0: 451 | out_tensor = torch.cat(out, dim=0) 452 | else: 453 | out_tensor = torch.zeros((1, 64,64, 3), dtype=torch.float32, device="cpu") 454 | if len(out_masks) > 0: 455 | out_mask_tensor = torch.cat(out_masks, dim=0) 456 | else: 457 | out_mask_tensor = torch.zeros((1,64,64), dtype=torch.float32, device="cpu") 458 | 459 | if not keep_model_loaded: 460 | print("Offloading model...") 461 | model.to(offload_device) 462 | mm.soft_empty_cache() 463 | 464 | return (out_tensor, out_mask_tensor, out_results, out_data) 465 | 466 | # NODE_CLASS_MAPPINGS = { 467 | # "DownloadAndLoadFlorence2Model": DownloadAndLoadFlorence2Model, 468 | # "Florence2Run": Florence2Run, 469 | # } 470 | # NODE_DISPLAY_NAME_MAPPINGS = { 471 | # "DownloadAndLoadFlorence2Model": "DownloadAndLoadFlorence2Model", 472 | # "Florence2Run": "Florence2Run", 473 | # } -------------------------------------------------------------------------------- /ic_lora_batch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision.transforms.functional as F 3 | 
import io 4 | import os 5 | from typing import List 6 | import matplotlib 7 | matplotlib.use('Agg') 8 | import matplotlib.pyplot as plt 9 | import matplotlib.patches as patches 10 | from PIL import Image, ImageDraw, ImageColor, ImageFont 11 | import random 12 | import numpy as np 13 | import re 14 | import time 15 | from .lib.ximg import * 16 | from .lib.xmodel import * 17 | from comfy.utils import ProgressBar, common_upscale 18 | import torchvision.transforms.functional as TVF 19 | 20 | #workaround for unnecessary flash_attn requirement 21 | from unittest.mock import patch 22 | from transformers.dynamic_module_utils import get_imports 23 | 24 | def fixed_get_imports(filename: str | os.PathLike) -> list[str]: 25 | if not str(filename).endswith("modeling_florence2.py"): 26 | return get_imports(filename) 27 | imports = get_imports(filename) 28 | # imports.remove("flash_attn") 29 | return imports 30 | 31 | 32 | import comfy.model_management as mm 33 | from comfy.utils import ProgressBar 34 | import folder_paths 35 | 36 | script_directory = os.path.dirname(os.path.abspath(__file__)) 37 | 38 | 39 | 40 | class CXH_IC_Lora_Florence2Run: 41 | @classmethod 42 | def INPUT_TYPES(s): 43 | return { 44 | "required": { 45 | "tip_pipe": ("STRING", {"multiline": False, "default": "", "forceInput": True},), 46 | "florence2_model": ("FL2MODEL", ), 47 | "format": (["png", "jpg"],), 48 | "max_new_tokens":("INT", {"default": 512, "min": 10, "max": 4096, "step": 1}), 49 | "dir1": ("STRING", {"default": ""}), 50 | "dir2": ("STRING", {"default": ""}), 51 | "saveDir": ("STRING", {"default": ""}), 52 | "text1": ("STRING", {"default": "", "multiline": True, "label": "Text Box 1"}), 53 | "text2": ("STRING", {"default": "", "multiline": True, "label": "Text Box 2"}), 54 | "template": ("STRING", {"default": "Realistic style, [cloth-on], the image pair highlights a transformation from a clothing sample photo to the effect of actually wearing it. 
[image1] {caption} [image2] a female model is wearing the cloth from [image1] with {caption}", "multiline": True, "label": ""}), 55 | "direction": ( 56 | [ 'right', 57 | 'down', 58 | 'left', 59 | 'up', 60 | ], 61 | { 62 | "default": 'right' 63 | }), 64 | "match_image_size": ("BOOLEAN", {"default": True}), 65 | } 66 | } 67 | 68 | RETURN_TYPES = ( "STRING", ) 69 | RETURN_NAMES =("caption", ) 70 | FUNCTION = "encode" 71 | CATEGORY = "Florence2" 72 | 73 | def encode(self,tip_pipe,florence2_model,format,max_new_tokens,dir1,dir2,saveDir,text1,text2,template,direction, match_image_size,first_image_shape=None): 74 | print("执行完成:"+tip_pipe) 75 | torch.cuda.empty_cache() 76 | 77 | device = mm.get_torch_device() 78 | offload_device = mm.unet_offload_device() 79 | processor = florence2_model['processor'] 80 | model = florence2_model['model'] 81 | dtype = florence2_model['dtype'] 82 | model.to(device) 83 | 84 | 85 | task_prompt = "" 86 | 87 | prompt = task_prompt 88 | 89 | # image = image.permute(0, 3, 1, 2) 90 | 91 | # 批量读取 92 | if not os.path.isdir(dir1): 93 | raise FileNotFoundError(f"Directory '{dir1}' cannot be found.") 94 | dir_files = os.listdir(dir1) 95 | 96 | if len(dir_files) == 0: 97 | raise FileNotFoundError(f"No files in directory '{dir1}'.") 98 | 99 | valid_extensions = ['.jpg', '.jpeg', '.png', '.webp'] 100 | dir_files = [f for f in dir_files if any(f.lower().endswith(ext) for ext in valid_extensions)] 101 | 102 | dir_files = sorted(dir_files) 103 | dir_files = [os.path.join(dir1, x) for x in dir_files] 104 | 105 | 106 | # 创建保存目录 107 | if not os.path.exists(saveDir): 108 | os.makedirs(saveDir) 109 | index1 = 0 110 | for image_path in dir_files: 111 | if os.path.isdir(image_path) and os.path.ex: 112 | continue 113 | start = time.time() 114 | 115 | #查找两张图片 116 | # 获取文件名(不包含路径) 117 | file_name = os.path.basename(image_path) 118 | 119 | # 构造第二张图片的路径 120 | second_image_path = os.path.join(dir2, file_name) 121 | 122 | # 检查第二张图片是否存在 123 | if not os.path.isfile(second_image_path): 124 | print(f"Second image not found for {file_name}") 125 | continue 126 | 127 | #检查是否已经存在 128 | file_name_without_ext, _ = os.path.splitext(file_name) 129 | img_file_name = f"{file_name_without_ext}.{format}" 130 | 131 | # if format != "png": 132 | # if input_image.mode == "RGBA": 133 | # input_image = input_image.convert("RGB") 134 | img_save_path = os.path.join(saveDir, img_file_name) 135 | if os.path.isfile(img_save_path): 136 | print(f"存在跳过: {img_file_name}") 137 | index1 = index1 + 1 138 | continue 139 | 140 | # 打开图片 141 | input_image = open_image(image_path) 142 | input_image = ImageOps.exif_transpose(input_image) 143 | input_image = input_image.convert("RGB") 144 | 145 | second_image = open_image(second_image_path) 146 | second_image = ImageOps.exif_transpose(second_image) 147 | second_image = second_image.convert("RGB") 148 | 149 | image_pil = input_image 150 | inputs = processor(text=prompt, images=image_pil, return_tensors="pt", do_rescale=False).to(dtype).to(device) 151 | 152 | generated_ids = model.generate( 153 | input_ids=inputs["input_ids"], 154 | pixel_values=inputs["pixel_values"], 155 | max_new_tokens=max_new_tokens, 156 | do_sample=True, 157 | num_beams=3, 158 | ) 159 | 160 | results = processor.batch_decode(generated_ids, skip_special_tokens=False)[0] 161 | clean_results = str(results) 162 | clean_results = clean_results.replace('', '') 163 | clean_results = clean_results.replace('', '') 164 | 165 | W, H = image_pil.size 166 | parsed_answer = processor.post_process_generation(results, 
task=task_prompt, image_size=(W, H)) 167 | caption = parsed_answer[task_prompt] 168 | 169 | # 提示词 170 | # file_name_without_ext, _ = os.path.splitext(file_name) 171 | caption = caption.strip() 172 | txt_file_name = f"{file_name_without_ext}.txt" 173 | txt_save_path = os.path.join(saveDir, txt_file_name) 174 | final_text = template.replace("{caption}", caption).replace("{text1}", text1).replace("{text2}", text2) 175 | 176 | try: 177 | with open(txt_save_path, 'w', encoding='utf-8') as file: 178 | file.write(final_text) 179 | except IOError as e: 180 | print(f"保存文件时发生错误: {e}") 181 | 182 | # Check if the batch sizes are different 183 | image1 = pil2tensor(input_image) 184 | image2 = pil2tensor(second_image) 185 | batch_size1 = image1.shape[0] 186 | batch_size2 = image2.shape[0] 187 | 188 | if batch_size1 != batch_size2: 189 | # Calculate the number of repetitions needed 190 | max_batch_size = max(batch_size1, batch_size2) 191 | repeats1 = max_batch_size // batch_size1 192 | repeats2 = max_batch_size // batch_size2 193 | 194 | # Repeat the images to match the largest batch size 195 | image1 = image1.repeat(repeats1, 1, 1, 1) 196 | image2 = image2.repeat(repeats2, 1, 1, 1) 197 | 198 | if match_image_size: 199 | # Use first_image_shape if provided; otherwise, default to image1's shape 200 | target_shape = first_image_shape if first_image_shape is not None else image1.shape 201 | 202 | original_height = image2.shape[1] 203 | original_width = image2.shape[2] 204 | original_aspect_ratio = original_width / original_height 205 | 206 | if direction in ['left', 'right']: 207 | # Match the height and adjust the width to preserve aspect ratio 208 | target_height = target_shape[1] # B, H, W, C format 209 | target_width = int(target_height * original_aspect_ratio) 210 | elif direction in ['up', 'down']: 211 | # Match the width and adjust the height to preserve aspect ratio 212 | target_width = target_shape[2] # B, H, W, C format 213 | target_height = int(target_width / original_aspect_ratio) 214 | 215 | # Adjust image2 to the expected format for common_upscale 216 | image2_for_upscale = image2.movedim(-1, 1) # Move C to the second position (B, C, H, W) 217 | 218 | # Resize image2 to match the target size while preserving aspect ratio 219 | image2_resized = common_upscale(image2_for_upscale, target_width, target_height, "lanczos", "disabled") 220 | 221 | # Adjust image2 back to the original format (B, H, W, C) after resizing 222 | image2_resized = image2_resized.movedim(1, -1) 223 | else: 224 | image2_resized = image2 225 | 226 | # Concatenate based on the specified direction 227 | if direction == 'right': 228 | concatenated_image = torch.cat((image1, image2_resized), dim=2) # Concatenate along width 229 | elif direction == 'down': 230 | concatenated_image = torch.cat((image1, image2_resized), dim=1) # Concatenate along height 231 | elif direction == 'left': 232 | concatenated_image = torch.cat((image2_resized, image1), dim=2) # Concatenate along width 233 | elif direction == 'up': 234 | concatenated_image = torch.cat((image2_resized, image1), dim=1) # Concatenate along height 235 | 236 | input_image = tensor2pil(concatenated_image) 237 | 238 | # 图片 239 | # img_file_name = f"{file_name_without_ext}.{format}" 240 | 241 | # if format != "png": 242 | # if input_image.mode == "RGBA": 243 | # input_image = input_image.convert("RGB") 244 | # img_save_path = os.path.join(saveDir, img_file_name) 245 | input_image.save(img_save_path) 246 | end = time.time() 247 | execution_time = calculate_seconds_difference(start, 
end) 248 | temp = f":{execution_time:.3f}s" 249 | index1 = index1 + 1 250 | print(str(index1)+"/"+str(len(dir_files)) +":"+temp) 251 | 252 | print("finish结束") 253 | model.to(offload_device) 254 | mm.soft_empty_cache() 255 | 256 | return (saveDir,) 257 | 258 | class CXH_Ic_lora_Joy_batch: 259 | 260 | def __init__(self): 261 | pass 262 | 263 | @classmethod 264 | def INPUT_TYPES(s): 265 | return { 266 | "required": { 267 | "JoyPipeline_alpha": ("JoyPipeline_alpha",), 268 | "prompt": ("STRING", {"multiline": True, "default": "A descriptive caption for this image"},), 269 | "format": (["png", "jpg"],), 270 | "max_new_tokens":("INT", {"default": 1024, "min": 10, "max": 4096, "step": 1}), 271 | "dir1": ("STRING", {"default": ""}), 272 | "dir2": ("STRING", {"default": ""}), 273 | "saveDir": ("STRING", {"default": ""}), 274 | "text1": ("STRING", {"default": "", "multiline": True, "label": "Text Box 1"}), 275 | "text2": ("STRING", {"default": "", "multiline": True, "label": "Text Box 2"}), 276 | "template": ("STRING", {"default": "Realistic style, [cloth-on], the image pair highlights a transformation from a clothing sample photo to the effect of actually wearing it. [image1] {caption} [image2] a female model is wearing the cloth from [image1] with {caption}", "multiline": True, "label": ""}), 277 | "direction": ( 278 | [ 'right', 279 | 'down', 280 | 'left', 281 | 'up', 282 | ], 283 | { 284 | "default": 'right' 285 | }), 286 | "match_image_size": ("BOOLEAN", {"default": True}), 287 | } 288 | } 289 | 290 | RETURN_TYPES = () 291 | FUNCTION = "gen" 292 | OUTPUT_NODE = True 293 | CATEGORY = "CXH/Images" 294 | 295 | def gen(self,JoyPipeline_alpha,prompt,format,max_new_tokens,dir1,dir2,saveDir,text1,text2,template,direction, match_image_size, first_image_shape=None): 296 | 297 | torch.cuda.empty_cache() 298 | 299 | joy_pipeline = JoyPipeline_alpha 300 | if joy_pipeline.clip_processor == None : 301 | joy_pipeline.parent.loadCheckPoint() 302 | 303 | clip_processor = joy_pipeline.clip_processor 304 | tokenizer = joy_pipeline.tokenizer 305 | clip_model = joy_pipeline.clip_model 306 | image_adapter = joy_pipeline.image_adapter 307 | text_model = joy_pipeline.text_model 308 | 309 | convo = [ 310 | { 311 | "role": "system", 312 | "content": "You are a helpful image captioner.", 313 | }, 314 | { 315 | "role": "user", 316 | "content": prompt, 317 | }, 318 | ] 319 | 320 | convo_string = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True) 321 | assert isinstance(convo_string, str) 322 | 323 | convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False) 324 | prompt_tokens = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False, truncation=False) 325 | assert isinstance(convo_tokens, torch.Tensor) and isinstance(prompt_tokens, torch.Tensor) 326 | convo_tokens = convo_tokens.squeeze(0) # Squeeze just to make the following easier 327 | prompt_tokens = prompt_tokens.squeeze(0) 328 | 329 | eot_id_indices = (convo_tokens == tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[ 330 | 0].tolist() 331 | assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}" 332 | 333 | preamble_len = eot_id_indices[1] - prompt_tokens.shape[0] # Number of tokens before the prompt 334 | 335 | 336 | # text_model = joy_two_pipeline.llm.load_llm_model(joy_two_pipeline.model) 337 | # Embed the tokens 338 | convo_embeds = text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to('cuda')) 339 | 340 | 
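        # The chat template above is tokenized once and reused for every image in the
        # batch: each image is resized to 384x384, encoded by the CLIP vision tower,
        # projected through the image adapter, and the resulting embeddings are spliced
        # between the preamble and the prompt before text_model.generate(), i.e.
        #   input_embeds = [ preamble tokens | image adapter embeddings | prompt tokens ]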
341 | 342 | # 批量读取 343 | if not os.path.isdir(dir1): 344 | raise FileNotFoundError(f"Directory '{dir1}' cannot be found.") 345 | dir_files = os.listdir(dir1) 346 | 347 | # if not os.path.isdir(dir2): 348 | # raise FileNotFoundError(f"Directory '{dir2}' cannot be found.") 349 | # dir_files_2 = os.listdir(dir2) 350 | 351 | if len(dir_files) == 0: 352 | raise FileNotFoundError(f"No files in directory '{dir1}'.") 353 | 354 | valid_extensions = ['.jpg', '.jpeg', '.png', '.webp'] 355 | dir_files = [f for f in dir_files if any(f.lower().endswith(ext) for ext in valid_extensions)] 356 | 357 | dir_files = sorted(dir_files) 358 | dir_files = [os.path.join(dir1, x) for x in dir_files] 359 | 360 | 361 | # 创建保存目录 362 | if not os.path.exists(saveDir): 363 | os.makedirs(saveDir) 364 | 365 | index1 = 0 366 | for image_path in dir_files: 367 | if os.path.isdir(image_path) and os.path.ex: 368 | continue 369 | start = time.time() 370 | 371 | #查找两张图片 372 | # 获取文件名(不包含路径) 373 | file_name = os.path.basename(image_path) 374 | 375 | # 构造第二张图片的路径 376 | second_image_path = os.path.join(dir2, file_name) 377 | 378 | # 检查第二张图片是否存在 379 | if not os.path.isfile(second_image_path): 380 | print(f"Second image not found for {file_name}") 381 | index1 = index1 + 1 382 | continue 383 | 384 | #检查是否已经存在 385 | file_name_without_ext, _ = os.path.splitext(file_name) 386 | img_file_name = f"{file_name_without_ext}.{format}" 387 | 388 | # if format != "png": 389 | # if input_image.mode == "RGBA": 390 | # input_image = input_image.convert("RGB") 391 | img_save_path = os.path.join(saveDir, img_file_name) 392 | if os.path.isfile(img_save_path): 393 | print(f"存在跳过 {img_file_name}") 394 | index1 = index1 + 1 395 | continue 396 | 397 | # 打开图片 398 | input_image = open_image(image_path) 399 | input_image = ImageOps.exif_transpose(input_image) 400 | input_image = input_image.convert("RGB") 401 | 402 | second_image = open_image(second_image_path) 403 | second_image = ImageOps.exif_transpose(second_image) 404 | second_image = second_image.convert("RGB") 405 | 406 | 407 | image = input_image.resize((384, 384), Image.LANCZOS) 408 | pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0 409 | pixel_values = TVF.normalize(pixel_values, [0.5], [0.5]) 410 | pixel_values = pixel_values.to('cuda') 411 | 412 | 413 | with torch.amp.autocast_mode.autocast('cuda', enabled=True): 414 | vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True) 415 | embedded_images = image_adapter(vision_outputs.hidden_states) 416 | embedded_images = embedded_images.to('cuda') 417 | 418 | input_embeds = torch.cat([ 419 | convo_embeds[:, :preamble_len], # Part before the prompt 420 | embedded_images.to(dtype=convo_embeds.dtype), # Image 421 | convo_embeds[:, preamble_len:], # The prompt and anything after it 422 | ], dim=1).to('cuda') 423 | 424 | input_ids = torch.cat([ 425 | convo_tokens[:preamble_len].unsqueeze(0), 426 | torch.zeros((1, embedded_images.shape[1]), dtype=torch.long), 427 | # Dummy tokens for the image (TODO: Should probably use a special token here so as not to confuse any generation algorithms that might be inspecting the input) 428 | convo_tokens[preamble_len:].unsqueeze(0), 429 | ], dim=1).to('cuda') 430 | attention_mask = torch.ones_like(input_ids) 431 | 432 | generate_ids = text_model.generate(input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, 433 | max_new_tokens=max_new_tokens, do_sample=True, 434 | suppress_tokens=None) # Uses the default which is temp=0.6, top_p=0.9 435 | 436 | 437 | generate_ids = 
generate_ids[:, input_ids.shape[1]:] 438 | if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids( 439 | "<|eot_id|>"): 440 | generate_ids = generate_ids[:, :-1] 441 | 442 | caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0] 443 | 444 | # 提示词 445 | 446 | caption = caption.strip() 447 | txt_file_name = f"{file_name_without_ext}.txt" 448 | txt_save_path = os.path.join(saveDir, txt_file_name) 449 | final_text = template.replace("{caption}", caption).replace("{text1}", text1).replace("{text2}", text2) 450 | 451 | try: 452 | with open(txt_save_path, 'w', encoding='utf-8') as file: 453 | file.write(final_text) 454 | except IOError as e: 455 | print(f"保存文件时发生错误: {e}") 456 | 457 | 458 | # Check if the batch sizes are different 459 | image1 = pil2tensor(input_image) 460 | image2 = pil2tensor(second_image) 461 | batch_size1 = image1.shape[0] 462 | batch_size2 = image2.shape[0] 463 | 464 | if batch_size1 != batch_size2: 465 | # Calculate the number of repetitions needed 466 | max_batch_size = max(batch_size1, batch_size2) 467 | repeats1 = max_batch_size // batch_size1 468 | repeats2 = max_batch_size // batch_size2 469 | 470 | # Repeat the images to match the largest batch size 471 | image1 = image1.repeat(repeats1, 1, 1, 1) 472 | image2 = image2.repeat(repeats2, 1, 1, 1) 473 | 474 | if match_image_size: 475 | # Use first_image_shape if provided; otherwise, default to image1's shape 476 | target_shape = first_image_shape if first_image_shape is not None else image1.shape 477 | 478 | original_height = image2.shape[1] 479 | original_width = image2.shape[2] 480 | original_aspect_ratio = original_width / original_height 481 | 482 | if direction in ['left', 'right']: 483 | # Match the height and adjust the width to preserve aspect ratio 484 | target_height = target_shape[1] # B, H, W, C format 485 | target_width = int(target_height * original_aspect_ratio) 486 | elif direction in ['up', 'down']: 487 | # Match the width and adjust the height to preserve aspect ratio 488 | target_width = target_shape[2] # B, H, W, C format 489 | target_height = int(target_width / original_aspect_ratio) 490 | 491 | # Adjust image2 to the expected format for common_upscale 492 | image2_for_upscale = image2.movedim(-1, 1) # Move C to the second position (B, C, H, W) 493 | 494 | # Resize image2 to match the target size while preserving aspect ratio 495 | image2_resized = common_upscale(image2_for_upscale, target_width, target_height, "lanczos", "disabled") 496 | 497 | # Adjust image2 back to the original format (B, H, W, C) after resizing 498 | image2_resized = image2_resized.movedim(1, -1) 499 | else: 500 | image2_resized = image2 501 | 502 | # Concatenate based on the specified direction 503 | if direction == 'right': 504 | concatenated_image = torch.cat((image1, image2_resized), dim=2) # Concatenate along width 505 | elif direction == 'down': 506 | concatenated_image = torch.cat((image1, image2_resized), dim=1) # Concatenate along height 507 | elif direction == 'left': 508 | concatenated_image = torch.cat((image2_resized, image1), dim=2) # Concatenate along width 509 | elif direction == 'up': 510 | concatenated_image = torch.cat((image2_resized, image1), dim=1) # Concatenate along height 511 | 512 | input_image = tensor2pil(concatenated_image) 513 | 514 | 515 | input_image.save(img_save_path) 516 | end = time.time() 517 | execution_time = calculate_seconds_difference(start, end) 518 | temp = 
f":{execution_time:.3f}s" 519 | index1 = index1 + 1 520 | print(str(index1)+"/"+str(len(dir_files)) +":"+temp) 521 | 522 | print("finish结束") 523 | joy_pipeline.parent.clearCache() 524 | torch.cuda.empty_cache() 525 | import gc 526 | gc.collect() 527 | return (saveDir, ) 528 | 529 | class CXH_IC_lora_reversal: 530 | @classmethod 531 | def INPUT_TYPES(s): 532 | return { 533 | "required": { 534 | "dir1": ("STRING", {"default": ""}), 535 | "dir2": ("STRING", {"default": ""}), 536 | "text_dir": ("STRING", {"default": ""}), 537 | "save_dir": ("STRING", {"default": ""}), 538 | "slic_start": ("STRING", {"default": "[image1]"}), 539 | "slic_end": ("STRING", {"default": "[image2]"}), 540 | "format": (["png", "jpg"],), 541 | "text1": ("STRING", {"default": "", "multiline": True, "label": "Text Box 1"}), 542 | "text2": ("STRING", {"default": "", "multiline": True, "label": "Text Box 2"}), 543 | "template": ("STRING", {"default": "Realistic style, [cloth-on], the image pair highlights a transformation from a female model wearing the cloth to its clothing sample photo. [image1] a female model is wearing a cloth with {caption} [image2] the clothing sample photo of what the model is wearing in [image1] with {caption}", "multiline": True, "label": ""}), 544 | "direction": ( 545 | [ 'right', 546 | 'down', 547 | 'left', 548 | 'up', 549 | ], 550 | { 551 | "default": 'right' 552 | }), 553 | "match_image_size": ("BOOLEAN", {"default": True}), 554 | } 555 | } 556 | 557 | RETURN_TYPES = ( "STRING", ) 558 | RETURN_NAMES =("caption", ) 559 | FUNCTION = "encode" 560 | CATEGORY = "Florence2" 561 | 562 | def encode(self,dir1,dir2,text_dir,save_dir,slic_start,slic_end,format,text1,text2,template,direction, match_image_size,first_image_shape=None): 563 | 564 | # 批量读取 565 | if not os.path.isdir(dir1): 566 | raise FileNotFoundError(f"Directory '{dir1}' cannot be found.") 567 | dir_files = os.listdir(dir1) 568 | 569 | if len(dir_files) == 0: 570 | raise FileNotFoundError(f"No files in directory '{dir1}'.") 571 | 572 | valid_extensions = ['.jpg', '.jpeg', '.png', '.webp'] 573 | dir_files = [f for f in dir_files if any(f.lower().endswith(ext) for ext in valid_extensions)] 574 | 575 | dir_files = sorted(dir_files) 576 | dir_files = [os.path.join(dir1, x) for x in dir_files] 577 | 578 | 579 | # 创建保存目录 580 | if not os.path.exists(save_dir): 581 | os.makedirs(save_dir) 582 | index1 = 0 583 | for image_path in dir_files: 584 | if os.path.isdir(image_path) and os.path.ex: 585 | continue 586 | start = time.time() 587 | 588 | #查找两张图片 589 | # 获取文件名(不包含路径) 590 | file_name = os.path.basename(image_path) 591 | 592 | # 构造第二张图片的路径 593 | second_image_path = os.path.join(dir2, file_name) 594 | 595 | #检查是否已经存在 596 | file_name_without_ext, _ = os.path.splitext(file_name) 597 | # 第三个txt 598 | text_file = os.path.join(text_dir, file_name_without_ext+".txt") 599 | 600 | # 检查第二张图片是否存在 601 | if not os.path.isfile(second_image_path) or not os.path.isfile(text_file) : 602 | print(f"Second image not found for {file_name}") 603 | continue 604 | 605 | img_file_name = f"{file_name_without_ext}.{format}" 606 | 607 | 608 | 609 | # if format != "png": 610 | # if input_image.mode == "RGBA": 611 | # input_image = input_image.convert("RGB") 612 | img_save_path = os.path.join(save_dir, img_file_name) 613 | if os.path.isfile(img_save_path): 614 | print(f"存在跳过: {img_file_name}") 615 | index1 = index1 + 1 616 | continue 617 | 618 | # 打开图片 619 | input_image = open_image(image_path) 620 | input_image = ImageOps.exif_transpose(input_image) 621 | input_image = 
input_image.convert("RGB") 622 | 623 | second_image = open_image(second_image_path) 624 | second_image = ImageOps.exif_transpose(second_image) 625 | second_image = second_image.convert("RGB") 626 | 627 | #打开prompt 628 | # 使用 open 函数打开文件,模式为 'r' 表示读取模式 629 | with open(text_file, 'r', encoding='utf-8') as file: 630 | # 读取文件的所有内容,并存储在变量中 631 | content = file.read() 632 | # 找到字符的索引 633 | start_index = content.index(slic_start) + len(slic_start) 634 | end_index = content.index(slic_end) # 不需要加1,因为我们不包含end_char 635 | 636 | # 使用切片语法截取字符串 637 | sliced_string = content[start_index:end_index] 638 | 639 | final_text = template.replace("{caption}", sliced_string).replace("{text1}", text1).replace("{text2}", text2) 640 | 641 | txt_file_name = f"{file_name_without_ext}.txt" 642 | txt_save_path = os.path.join(save_dir, txt_file_name) 643 | try: 644 | with open(txt_save_path, 'w', encoding='utf-8') as file: 645 | file.write(final_text) 646 | except IOError as e: 647 | print(f"保存文件时发生错误: {e}") 648 | 649 | 650 | # Check if the batch sizes are different 651 | image1 = pil2tensor(input_image) 652 | image2 = pil2tensor(second_image) 653 | batch_size1 = image1.shape[0] 654 | batch_size2 = image2.shape[0] 655 | 656 | if batch_size1 != batch_size2: 657 | # Calculate the number of repetitions needed 658 | max_batch_size = max(batch_size1, batch_size2) 659 | repeats1 = max_batch_size // batch_size1 660 | repeats2 = max_batch_size // batch_size2 661 | 662 | # Repeat the images to match the largest batch size 663 | image1 = image1.repeat(repeats1, 1, 1, 1) 664 | image2 = image2.repeat(repeats2, 1, 1, 1) 665 | 666 | if match_image_size: 667 | # Use first_image_shape if provided; otherwise, default to image1's shape 668 | target_shape = first_image_shape if first_image_shape is not None else image1.shape 669 | 670 | original_height = image2.shape[1] 671 | original_width = image2.shape[2] 672 | original_aspect_ratio = original_width / original_height 673 | 674 | if direction in ['left', 'right']: 675 | # Match the height and adjust the width to preserve aspect ratio 676 | target_height = target_shape[1] # B, H, W, C format 677 | target_width = int(target_height * original_aspect_ratio) 678 | elif direction in ['up', 'down']: 679 | # Match the width and adjust the height to preserve aspect ratio 680 | target_width = target_shape[2] # B, H, W, C format 681 | target_height = int(target_width / original_aspect_ratio) 682 | 683 | # Adjust image2 to the expected format for common_upscale 684 | image2_for_upscale = image2.movedim(-1, 1) # Move C to the second position (B, C, H, W) 685 | 686 | # Resize image2 to match the target size while preserving aspect ratio 687 | image2_resized = common_upscale(image2_for_upscale, target_width, target_height, "lanczos", "disabled") 688 | 689 | # Adjust image2 back to the original format (B, H, W, C) after resizing 690 | image2_resized = image2_resized.movedim(1, -1) 691 | else: 692 | image2_resized = image2 693 | 694 | # Concatenate based on the specified direction 695 | if direction == 'right': 696 | concatenated_image = torch.cat((image1, image2_resized), dim=2) # Concatenate along width 697 | elif direction == 'down': 698 | concatenated_image = torch.cat((image1, image2_resized), dim=1) # Concatenate along height 699 | elif direction == 'left': 700 | concatenated_image = torch.cat((image2_resized, image1), dim=2) # Concatenate along width 701 | elif direction == 'up': 702 | concatenated_image = torch.cat((image2_resized, image1), dim=1) # Concatenate along height 703 | 704 | 
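            # The stitched pair is converted back to PIL and saved next to the rewritten
            # caption file, so every sample ends up as one side-by-side image plus one
            # .txt built from the template and the text sliced out of the source caption.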
input_image = tensor2pil(concatenated_image) 705 | 706 | 707 | input_image.save(img_save_path) 708 | end = time.time() 709 | execution_time = calculate_seconds_difference(start, end) 710 | temp = f":{execution_time:.3f}s" 711 | index1 = index1 + 1 712 | print(str(index1)+"/"+str(len(dir_files)) +":"+temp) 713 | 714 | print("finish结束") 715 | return (save_dir, ) -------------------------------------------------------------------------------- /install_req.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | set SCRIPT_DIR=%~dp0 4 | 5 | cd /d "%SCRIPT_DIR%../../../python_embeded" 6 | 7 | 8 | python.exe -m pip install -r "%SCRIPT_DIR%requirements.txt" 9 | 10 | pause 11 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/lib/__init__.py -------------------------------------------------------------------------------- /lib/xfile.py: -------------------------------------------------------------------------------- 1 | import folder_paths 2 | import os 3 | import base64 4 | import numpy as np 5 | from PIL import Image,ImageOps, ImageFilter 6 | 7 | import io 8 | 9 | comfy_path = os.path.dirname(folder_paths.__file__) 10 | custom_nodes_path = os.path.join(comfy_path, "custom_nodes") 11 | 12 | # D:\comfyui\ComfyUI_windows_portable\ComfyUI\custom_nodes\Comfyui_CXH_ALY 13 | # current_folder = os.path.dirname(os.path.abspath(__file__)) 14 | 15 | # 节点路径 16 | def node_path(node_name): 17 | return os.path.join(custom_nodes_path,node_name) 18 | 19 | # 创建文件夹 20 | def mkdir(path): 21 | folder = os.path.exists(path) 22 | if not folder: #判断是否存在文件夹如果不存在则创建为文件夹 23 | os.makedirs(path) #makedirs 创建文件时如果路径不存在会创建这个路径 24 | 25 | # 获取所有图片文件路径 26 | def get_all_image_paths(directory): 27 | image_paths = [] 28 | for root, dirs, files in os.walk(directory): 29 | for file in files: 30 | if file.lower().endswith(('.png', '.jpg', '.jpeg')): 31 | image_paths.append(os.path.join(root, file)) 32 | return image_paths 33 | 34 | 35 | -------------------------------------------------------------------------------- /lib/ximg.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | ''' 3 | @File :ximg.py 4 | @Description :图片操作类封装 5 | @Time :2024/04/30 09:46:01 6 | @Author :ChenXingHua 7 | @Version :1.0 8 | ''' 9 | 10 | import os 11 | import torch 12 | from PIL import Image, ImageOps, ImageSequence, ImageFile,UnidentifiedImageError 13 | import numpy as np 14 | import cv2 as cv 15 | import io 16 | import base64 17 | import requests 18 | from io import BytesIO 19 | from datetime import datetime, timedelta 20 | 21 | def tensor2pil(t_image: torch.Tensor) -> Image: 22 | return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8)) 23 | 24 | def pil2tensor(image:Image) -> torch.Tensor: 25 | return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0) 26 | 27 | def tensor2cv2(image:torch.Tensor) -> np.array: 28 | if image.dim() == 4: 29 | image = image.squeeze() 30 | npimage = image.numpy() 31 | cv2image = np.uint8(npimage * 255 / npimage.max()) 32 | return cv.cvtColor(cv2image, cv.COLOR_RGB2BGR) 33 | 34 | def cv22pil(cv2_img:np.ndarray) -> Image: 35 | cv2_img = cv.cvtColor(cv2_img, cv.COLOR_BGR2RGB) 36 | return Image.fromarray(cv2_img) 37 | 
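# Illustrative usage sketch: the helpers above round-trip between ComfyUI image
# tensors (B, H, W, C floats in 0..1), PIL images and OpenCV arrays:
#   pil = tensor2pil(image_tensor)    # first image of the batch as a PIL.Image
#   bgr = tensor2cv2(image_tensor)    # uint8 BGR array for cv2 operations
#   back = pil2tensor(pil)            # (1, H, W, C) float tensor in 0..1
#   img = cv22pil(bgr)                # back to PIL from OpenCV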
38 | # pil转io 39 | def pil2iobyte(pil_image,format='PNG'): 40 | byte_arr = io.BytesIO() 41 | pil_image.save(byte_arr, format=format) 42 | byte_arr = byte_arr.getvalue() 43 | return byte_arr 44 | 45 | # pil转64 46 | def pilTobase64(pil_image,format='PNG'): 47 | byte_arr = pil2iobyte(pil_image,format) 48 | image_base64 = base64.b64encode(byte_arr).decode('utf-8') 49 | return image_base64 50 | 51 | def ioBytes2tensor(bytes): 52 | image = Image.open(bytes) 53 | return pil2tensor(image) 54 | 55 | def getImageSize(image): 56 | if image.shape[0] > 0: 57 | image = torch.unsqueeze(image[0], 0) 58 | _image = tensor2pil(image) 59 | 60 | return (_image.width, _image.height) 61 | 62 | # 转成mask 63 | def imageToMask(img): 64 | i = img 65 | if i.mode == 'I': 66 | i = i.point(lambda i: i * (1 / 255)) 67 | image = i.convert("RGB") 68 | image = np.array(image).astype(np.float32) / 255.0 69 | image = torch.from_numpy(image)[None,] 70 | 71 | if 'A' in i.getbands(): 72 | mask = np.array(i.getchannel('A')).astype(np.float32) / 255.0 73 | mask = 1. - torch.from_numpy(mask) 74 | else: 75 | mask = torch.zeros((64,64), dtype=torch.float32, device="cpu") 76 | return tensor2pil(mask) 77 | 78 | # ret_masks.append(image2mask(_mask)) 79 | def image2mask(image:Image) -> torch.Tensor: 80 | _image = image.convert('RGBA') 81 | alpha = _image.split() [0] 82 | bg = Image.new("L", _image.size) 83 | _image = Image.merge('RGBA', (bg, bg, bg, alpha)) 84 | ret_mask = torch.tensor([pil2tensor(_image)[0, :, :, 3].tolist()]) 85 | return ret_mask 86 | 87 | # 图像回帖 88 | def croppImg(original_image,cropped_avatar,left_x,top_y): 89 | # 获取原始图像的大小 90 | original_width, original_height = original_image.size 91 | return croppImageBySize(cropped_avatar,left_x,top_y,original_width,original_height) 92 | 93 | def croppImageBySize(cropped_avatar,left_x,top_y,original_w,original_h): 94 | # 获取原始图像的大小 95 | original_width, original_height = original_w,original_h 96 | # 获取头像的大小 97 | avatar_width, avatar_height = cropped_avatar.size 98 | # 创建一个与原始图像相同大小的透明图像 99 | extended_image = Image.new("RGBA", (original_width, original_height), (0, 0, 0, 0)) 100 | # 将裁剪后的头像粘贴到新图像 101 | extended_image.paste(cropped_avatar, (left_x, top_y), cropped_avatar) 102 | 103 | return extended_image 104 | 105 | 106 | # 将图片转换为Base64编码 107 | def image_to_base64(image_path): 108 | with open(image_path, 'rb') as image_file: 109 | return base64.b64encode(image_file.read()).decode('utf-8') 110 | 111 | # 获取网络图片 112 | def img_from_url(url): 113 | # 发送HTTP请求获取图片 114 | response = requests.get(url) 115 | response.raise_for_status() # 如果请求失败,这会抛出异常 116 | # 将响应内容作为BytesIO对象打开,以便PIL可以读取它 117 | image = Image.open(BytesIO(response.content)) 118 | return image 119 | 120 | def open_image(path): 121 | prev_value = None 122 | 123 | try: 124 | img = Image.open(path) 125 | except (UnidentifiedImageError, ValueError): #PIL issues #4472 and #2445 126 | prev_value = ImageFile.LOAD_TRUNCATED_IMAGES 127 | ImageFile.LOAD_TRUNCATED_IMAGES = True 128 | img = Image.open(path) 129 | finally: 130 | if prev_value is not None: 131 | ImageFile.LOAD_TRUNCATED_IMAGES = prev_value 132 | return img 133 | 134 | # 批量读取 135 | def batch_image(directory): 136 | if not os.path.isdir(directory): 137 | raise FileNotFoundError(f"Directory '{directory}' cannot be found.") 138 | dir_files = os.listdir(directory) 139 | if len(dir_files) == 0: 140 | raise FileNotFoundError(f"No files in directory '{directory}'.") 141 | 142 | valid_extensions = ['.jpg', '.jpeg', '.png', '.webp'] 143 | dir_files = [f for f in dir_files if 
any(f.lower().endswith(ext) for ext in valid_extensions)] 144 | 145 | dir_files = sorted(dir_files) 146 | dir_files = [os.path.join(directory, x) for x in dir_files] 147 | return dir_files 148 | 149 | def calculate_seconds_difference(start_time, end_time): 150 | """ 151 | 计算两个时间点之间的秒数差异 152 | 153 | :param start_time: 开始时间(可以是时间戳或datetime对象) 154 | :param end_time: 结束时间(可以是时间戳或datetime对象) 155 | :return: 秒数差异(浮点数) 156 | """ 157 | # 如果输入是datetime对象,转换为时间戳 158 | if isinstance(start_time, datetime): 159 | start_time = start_time.timestamp() 160 | if isinstance(end_time, datetime): 161 | end_time = end_time.timestamp() 162 | 163 | return end_time - start_time -------------------------------------------------------------------------------- /lib/xmodel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import folder_paths 3 | import json 4 | from transformers import AutoProcessor 5 | import torch 6 | 7 | # def get_torch_device(): 8 | # """ 9 | # 返回PyTorch模型应该运行的设备(CPU或GPU) 10 | # 如果系统支持CUDA并且至少有一个GPU可用,则返回GPU设备;否则返回CPU设备。 11 | # """ 12 | # if torch.cuda.is_available(): 13 | # # 选择第一个可用的GPU 14 | # device = torch.device("cuda:0") 15 | # print(f"There are {torch.cuda.device_count()} GPU(s) available.") 16 | # print(f"We will use the GPU: {device}") 17 | # else: 18 | # # 如果没有GPU可用,则使用CPU 19 | # device = torch.device("cpu") 20 | # print("No GPU available, using the CPU instead.") 21 | # return device 22 | 23 | # 下载hg 模型到本地 24 | def download_hg_model(model_id:str,exDir:str=''): 25 | # 下载本地 26 | model_checkpoint = os.path.join(folder_paths.models_dir, exDir, os.path.basename(model_id)) 27 | print(model_checkpoint) 28 | if not os.path.exists(model_checkpoint): 29 | from huggingface_hub import snapshot_download 30 | snapshot_download(repo_id=model_id, local_dir=model_checkpoint, local_dir_use_symlinks=False) 31 | return model_checkpoint 32 | 33 | # clip_model = AutoModelForCausalLM.from_pretrained( 34 | # CLIP_PATH, 35 | # device_map="cuda", 36 | # trust_remote_code=True, 37 | # torch_dtype="auto" 38 | # ) 39 | 40 | # clip_processor = AutoProcessor.from_pretrained(CLIP_PATH, trust_remote_code=True) -------------------------------------------------------------------------------- /miniCPMv2_6_prompt_generator.py: -------------------------------------------------------------------------------- 1 | 2 | from huggingface_hub import InferenceClient 3 | from torch import nn 4 | from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM 5 | from pathlib import Path 6 | import torch 7 | import torch.amp.autocast_mode 8 | from PIL import Image 9 | import os 10 | import folder_paths 11 | 12 | from .lib.ximg import * 13 | from .lib.xmodel import * 14 | 15 | class CXH_Hg_Pipe: 16 | 17 | def __init__(self): 18 | self.text_model = None 19 | self.tokenizer =None 20 | 21 | 22 | class CXH_HG_Model_Load: 23 | 24 | def __init__(self): 25 | self.pipe = None 26 | 27 | @classmethod 28 | def INPUT_TYPES(s): 29 | return { 30 | "required": { 31 | "model": (["pzc163/MiniCPMv2_6-prompt-generator"],), 32 | } 33 | } 34 | 35 | CATEGORY = "CXH/LLM" 36 | RETURN_TYPES = ("CXH_Hg_Pipe",) 37 | RETURN_NAMES = ("pipe",) 38 | FUNCTION = "gen" 39 | 40 | def gen(self,model): 41 | 42 | self.pipe = CXH_Hg_Pipe() 43 | 44 | MODEL_PATH = download_hg_model(model,"LLM") 45 | tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True) 46 | assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, 
--------------------------------------------------------------------------------
/miniCPMv2_6_prompt_generator.py:
--------------------------------------------------------------------------------
1 | 
2 | from huggingface_hub import InferenceClient
3 | from torch import nn
4 | from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM
5 | from pathlib import Path
6 | import torch
7 | import torch.amp.autocast_mode
8 | from PIL import Image
9 | import os
10 | import folder_paths
11 | 
12 | from .lib.ximg import *
13 | from .lib.xmodel import *
14 | 
15 | class CXH_Hg_Pipe:
16 | 
17 |     def __init__(self):
18 |         self.text_model = None
19 |         self.tokenizer = None
20 | 
21 | 
22 | class CXH_HG_Model_Load:
23 | 
24 |     def __init__(self):
25 |         self.pipe = None
26 | 
27 |     @classmethod
28 |     def INPUT_TYPES(s):
29 |         return {
30 |             "required": {
31 |                 "model": (["pzc163/MiniCPMv2_6-prompt-generator"],),
32 |             }
33 |         }
34 | 
35 |     CATEGORY = "CXH/LLM"
36 |     RETURN_TYPES = ("CXH_Hg_Pipe",)
37 |     RETURN_NAMES = ("pipe",)
38 |     FUNCTION = "gen"
39 | 
40 |     def gen(self, model):
41 | 
42 |         self.pipe = CXH_Hg_Pipe()
43 | 
44 |         MODEL_PATH = download_hg_model(model, "LLM")
45 |         tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
46 |         assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTrainedTokenizerFast), f"Tokenizer is of type {type(tokenizer)}"
47 | 
48 |         text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, trust_remote_code=True)
49 |         text_model.eval()
50 | 
51 |         self.pipe.text_model = text_model
52 |         self.pipe.tokenizer = tokenizer
53 | 
54 |         return (self.pipe,)
55 | 
56 | class CXH_Min2_6_prompt_Run:
57 | 
58 |     def __init__(self):
59 |         pass
60 | 
61 |     @classmethod
62 |     def INPUT_TYPES(s):
63 |         return {
64 |             "required": {
65 |                 "pipe": ("CXH_Hg_Pipe",),
66 |                 "image": ("IMAGE",),
67 |                 "prompt": ("STRING", {"multiline": True, "default": "Provide a detailed description of the details and content contained in the image, and generate a short prompt that can be used for image generation tasks in Stable Diffusion,remind you only need respons prompt itself and no other information."},),
68 |                 "max_tokens": ("INT", {"default": 1024, "min": 10, "max": 4048, "step": 1}),
69 |                 "temperature": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}),
70 |                 "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}),
71 |             }
72 |         }
73 | 
74 |     CATEGORY = "CXH/LLM"
75 |     RETURN_TYPES = ("STRING",)
76 |     FUNCTION = "gen"
77 |     def gen(self, pipe, image, prompt, max_tokens, temperature, seed):
78 | 
79 |         image = tensor2pil(image)
80 |         question = prompt
81 |         msgs = [{'role': 'user', 'content': [image, question]}]
82 | 
83 |         ## if you want to use streaming, please make sure sampling=True and stream=True
84 |         ## the model.chat will return a generator
85 |         res = pipe.text_model.chat(
86 |             image=None,
87 |             msgs=msgs,
88 |             tokenizer=pipe.tokenizer,
89 |             sampling=False,
90 |             stream=False,
91 |             max_tokens=max_tokens,
92 |             temperature=temperature,
93 |         )
94 | 
95 |         # With sampling/stream disabled, res is a plain string; iterating it character by
96 |         # character also covers the streaming case, where chat() yields text chunks.
97 |         generated_text = ""
98 |         for new_text in res:
99 |             generated_text += new_text
100 |             print(new_text, flush=True, end='')
101 | 
102 |         return (generated_text,)
103 | 
--------------------------------------------------------------------------------
/miniCpMV3_4_chat.py:
--------------------------------------------------------------------------------
1 | 
2 | from huggingface_hub import InferenceClient
3 | from torch import nn
4 | from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM
5 | 
6 | # from vllm import LLM, SamplingParams
7 | from pathlib import Path
8 | import torch
9 | import torch.amp.autocast_mode
10 | from PIL import Image
11 | import os
12 | import folder_paths
13 | 
14 | from .lib.ximg import *
15 | from .lib.xmodel import *
16 | 
17 | device = "cuda"
18 | 
19 | class CXH_MinCP3_4B_Pipe:
20 | 
21 |     def __init__(self):
22 |         self.model = None
23 |         self.tokenizer = None
24 | 
25 | 
26 | class CXH_MinCP3_4B_Load:
27 | 
28 |     def __init__(self):
29 |         self.pipe = None
30 | 
31 |     @classmethod
32 |     def INPUT_TYPES(s):
33 |         return {
34 |             "required": {
35 |                 "model": (["openbmb/MiniCPM3-4B", "openbmb/MiniCPM3-4B-GPTQ-Int4"],),
36 |             }
37 |         }
38 | 
39 |     CATEGORY = "CXH/LLM"
40 |     RETURN_TYPES = ("CXH_MinCP3_4B_Pipe",)
41 |     RETURN_NAMES = ("pipe",)
42 |     FUNCTION = "gen"
43 | 
44 |     def gen(self, model):
45 | 
46 |         self.pipe = CXH_MinCP3_4B_Pipe()
47 | 
48 |         MODEL_PATH = download_hg_model(model, "LLM")
49 | 
50 | 
51 |         tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
52 |         model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.bfloat16, device_map=device, trust_remote_code=True)
53 | 
54 |         self.pipe.model = model
55 |         self.pipe.tokenizer = tokenizer
56 | 
57 |         return (self.pipe,)
58 | 
59 | class CXH_MinCP3_4B_Chat:
60 | 
61 |     def __init__(self):
62 |         pass
63 | 
64 |     @classmethod
65 |     def INPUT_TYPES(s):
66 |         return {
67 |             "required": {
68 |                 "pipe": ("CXH_MinCP3_4B_Pipe",),
69 |                 "prompt": ("STRING", {"multiline": True, "default": "Provide a detailed description of the details and content contained in the image, and generate a short prompt that can be used for image generation tasks in Stable Diffusion,remind you only need respons prompt itself and no other information."},),
70 |                 "top_p": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}),
71 |                 "max_tokens": ("INT", {"default": 1024, "min": 10, "max": 4048, "step": 1}),
72 |                 "temperature": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}),
73 |                 "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}),
74 |             }
75 |         }
76 | 
77 |     CATEGORY = "CXH/LLM"
78 |     RETURN_TYPES = ("STRING",)
79 |     FUNCTION = "gen"
80 |     def gen(self, pipe, prompt, top_p, max_tokens, temperature, seed):
81 | 
82 |         messages = [
83 |             {"role": "user", "content": prompt},
84 |         ]
85 | 
86 |         model_inputs = pipe.tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(device)
87 | 
88 |         model_outputs = pipe.model.generate(
89 |             model_inputs,
90 |             max_new_tokens=max_tokens,
91 |             do_sample=True,  # sampling must be enabled for top_p and temperature to take effect
92 |             top_p=top_p,
93 |             temperature=temperature
94 |         )
95 | 
96 |         # Strip the prompt tokens so only the newly generated continuation is decoded
97 |         output_token_ids = [
98 |             model_outputs[i][len(model_inputs[i]):] for i in range(len(model_inputs))
99 |         ]
100 | 
101 |         responses = pipe.tokenizer.batch_decode(output_token_ids, skip_special_tokens=True)[0]
102 |         # print(responses)
103 |         return (responses,)
104 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | huggingface_hub==0.24.3
2 | transformers>=4.44.2
3 | tqdm
4 | numpy
5 | surrealist
6 | boto3==1.34.86
7 | llama-cpp-python==0.2.89
8 | Pillow==10.1.0
9 | sentencepiece==0.1.99
10 | accelerate>=0.30.1
11 | bitsandbytes>=0.43.1
12 | peft>=0.9.0
13 | datamodel-code-generator>=0.26.0
14 | matplotlib
15 | pyvips
--------------------------------------------------------------------------------
/smolvlm.py:
--------------------------------------------------------------------------------
1 | 
2 | from huggingface_hub import InferenceClient
3 | from torch import nn
4 | from transformers import AutoModelForVision2Seq, CLIPImageProcessor, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM
5 | from pathlib import Path
6 | import torch
7 | import torch.amp.autocast_mode
8 | from PIL import Image
9 | import os
10 | import folder_paths
11 | import time
12 | import re
13 | 
14 | from .lib.ximg import *
15 | from .lib.xmodel import *
16 | 
17 | DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
18 | 
19 | class CXH_SmolVlm_Pipe:
20 | 
21 |     def __init__(self):
22 |         self.model = None
23 |         self.processor = None
24 | 
25 | class CXH_SmolVlm_Load:
26 |     @classmethod
27 |     def INPUT_TYPES(s):
28 |         return {
29 |             "required": {
30 |                 "model": (["HuggingFaceTB/SmolVLM-Instruct"],),
31 |             }
32 |         }
33 | 
34 |     CATEGORY = "CXH/LLM"
35 |     RETURN_TYPES = ("CXH_SmolVlm_Pipe",)
36 |     RETURN_NAMES = ("pipe",)
37 |     FUNCTION = "gen"
38 |     def gen(self, model):
39 |         self.pipe = CXH_SmolVlm_Pipe()
40 | 
41 |         MODEL_PATH = download_hg_model(model, "LLM")
42 |         print(MODEL_PATH)
43 | 
44 |         # Initialize processor and model
45 |         processor =
AutoProcessor.from_pretrained(MODEL_PATH,trust_remote_code=True) 46 | model1 = AutoModelForVision2Seq.from_pretrained( 47 | MODEL_PATH, 48 | torch_dtype=torch.bfloat16, 49 | # _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager", 50 | ).to(DEVICE) 51 | 52 | 53 | self.pipe.model = model1 54 | self.pipe.processor = processor 55 | return (self.pipe,) 56 | 57 | class CXH_SmolVlm_Run : 58 | 59 | def __init__(self): 60 | pass 61 | 62 | @classmethod 63 | def INPUT_TYPES(s): 64 | return { 65 | "required": { 66 | "pipe": ("CXH_SmolVlm_Pipe",), 67 | "image": ("IMAGE",), 68 | "prompt": ("STRING", {"multiline": True, "default": "Provide a detailed description of the details and content contained in the image, and generate a short prompt that can be used for image generation tasks in Stable Diffusion,remind you only need respons prompt itself and no other information."},), 69 | "max_tokens":("INT", {"default": 1024, "min": 10, "max": 4048, "step": 1}), 70 | "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}), 71 | } 72 | } 73 | 74 | CATEGORY = "CXH/LLM" 75 | RETURN_TYPES = ("STRING",) 76 | FUNCTION = "gen" 77 | def gen(self,pipe,image,prompt,max_tokens,seed): 78 | 79 | image = tensor2pil(image) 80 | # Create input messages 81 | messages = [ 82 | { 83 | "role": "user", 84 | "content": [ 85 | {"type": "image"}, 86 | {"type": "text", "text": prompt} 87 | ] 88 | }, 89 | ] 90 | # Prepare inputs 91 | prompt = pipe.processor.apply_chat_template(messages, add_generation_prompt=True) 92 | inputs = pipe.processor(text=prompt, images=[image], return_tensors="pt") 93 | inputs = inputs.to(DEVICE) 94 | 95 | # Generate outputs 96 | generated_ids = pipe.model.generate(**inputs, max_new_tokens=max_tokens) 97 | generated_texts = pipe.processor.batch_decode( 98 | generated_ids, 99 | skip_special_tokens=True, 100 | ) 101 | print(generated_texts[0]) 102 | pattern = re.compile(r"Assistant:\s*(.*)") 103 | match = pattern.search(generated_texts[0]) 104 | 105 | if match: 106 | number = match.group(1) 107 | return (number,) 108 | else: 109 | print("No number found.") 110 | return (generated_texts[0],) 111 | 112 | -------------------------------------------------------------------------------- /worflow/Min2.6+joy+Florence2.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 11, 3 | "last_link_id": 11, 4 | "nodes": [ 5 | { 6 | "id": 9, 7 | "type": "CXH_Min2_6_prompt_Run", 8 | "pos": [ 9 | 1177, 10 | 407 11 | ], 12 | "size": { 13 | "0": 400, 14 | "1": 200 15 | }, 16 | "flags": {}, 17 | "order": 5, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "pipe", 22 | "type": "CXH_Hg_Pipe", 23 | "link": 6 24 | }, 25 | { 26 | "name": "image", 27 | "type": "IMAGE", 28 | "link": 7 29 | } 30 | ], 31 | "outputs": [ 32 | { 33 | "name": "STRING", 34 | "type": "STRING", 35 | "links": [ 36 | 10 37 | ], 38 | "shape": 3, 39 | "slot_index": 0 40 | } 41 | ], 42 | "properties": { 43 | "Node name for S&R": "CXH_Min2_6_prompt_Run" 44 | }, 45 | "widgets_values": [ 46 | "A descriptive caption for this image ", 47 | 2048, 48 | 0.7 49 | ], 50 | "color": "#1b4669", 51 | "bgcolor": "#29699c" 52 | }, 53 | { 54 | "id": 4, 55 | "type": "Joy_caption", 56 | "pos": [ 57 | 1195, 58 | 897 59 | ], 60 | "size": { 61 | "0": 400, 62 | "1": 200 63 | }, 64 | "flags": {}, 65 | "order": 4, 66 | "mode": 0, 67 | "inputs": [ 68 | { 69 | "name": "joy_pipeline", 70 | "type": "JoyPipeline", 71 | "link": 1 72 | }, 73 | { 74 | "name": "image", 75 | "type": "IMAGE", 76 | "link": 2 77 | } 78 
| ], 79 | "outputs": [ 80 | { 81 | "name": "STRING", 82 | "type": "STRING", 83 | "links": [ 84 | 3 85 | ], 86 | "slot_index": 0, 87 | "shape": 3 88 | } 89 | ], 90 | "properties": { 91 | "Node name for S&R": "Joy_caption" 92 | }, 93 | "widgets_values": [ 94 | "A descriptive caption for this image ", 95 | 2048, 96 | 0.7000000000000001, 97 | false 98 | ], 99 | "color": "#1b4669", 100 | "bgcolor": "#29699c" 101 | }, 102 | { 103 | "id": 8, 104 | "type": "CXH_Florence2Run", 105 | "pos": [ 106 | 1210, 107 | 1417 108 | ], 109 | "size": { 110 | "0": 400, 111 | "1": 304 112 | }, 113 | "flags": {}, 114 | "order": 6, 115 | "mode": 0, 116 | "inputs": [ 117 | { 118 | "name": "image", 119 | "type": "IMAGE", 120 | "link": 8 121 | }, 122 | { 123 | "name": "florence2_model", 124 | "type": "FL2MODEL", 125 | "link": 9 126 | } 127 | ], 128 | "outputs": [ 129 | { 130 | "name": "image", 131 | "type": "IMAGE", 132 | "links": null, 133 | "shape": 3, 134 | "slot_index": 0 135 | }, 136 | { 137 | "name": "mask", 138 | "type": "MASK", 139 | "links": null, 140 | "shape": 3 141 | }, 142 | { 143 | "name": "caption", 144 | "type": "STRING", 145 | "links": [ 146 | 11 147 | ], 148 | "shape": 3, 149 | "slot_index": 2 150 | }, 151 | { 152 | "name": "data", 153 | "type": "JSON", 154 | "links": null, 155 | "shape": 3 156 | } 157 | ], 158 | "properties": { 159 | "Node name for S&R": "CXH_Florence2Run" 160 | }, 161 | "widgets_values": [ 162 | "", 163 | "more_detailed_caption", 164 | true, 165 | false, 166 | 2048, 167 | 3, 168 | true, 169 | "" 170 | ], 171 | "color": "#1b4669", 172 | "bgcolor": "#29699c" 173 | }, 174 | { 175 | "id": 7, 176 | "type": "CXH_HG_Model_Load", 177 | "pos": [ 178 | 1187, 179 | 286 180 | ], 181 | "size": { 182 | "0": 315, 183 | "1": 58 184 | }, 185 | "flags": {}, 186 | "order": 0, 187 | "mode": 0, 188 | "outputs": [ 189 | { 190 | "name": "pipe", 191 | "type": "CXH_Hg_Pipe", 192 | "links": [ 193 | 6 194 | ], 195 | "shape": 3, 196 | "slot_index": 0 197 | } 198 | ], 199 | "properties": { 200 | "Node name for S&R": "CXH_HG_Model_Load" 201 | }, 202 | "widgets_values": [ 203 | "pzc163/MiniCPMv2_6-prompt-generator" 204 | ], 205 | "color": "#1b4669", 206 | "bgcolor": "#29699c" 207 | }, 208 | { 209 | "id": 10, 210 | "type": "CXH_DownloadAndLoadFlorence2Model", 211 | "pos": [ 212 | 1209, 213 | 1257 214 | ], 215 | "size": { 216 | "0": 315, 217 | "1": 106 218 | }, 219 | "flags": {}, 220 | "order": 1, 221 | "mode": 0, 222 | "outputs": [ 223 | { 224 | "name": "florence2_model", 225 | "type": "FL2MODEL", 226 | "links": [ 227 | 9 228 | ], 229 | "shape": 3, 230 | "slot_index": 0 231 | } 232 | ], 233 | "properties": { 234 | "Node name for S&R": "CXH_DownloadAndLoadFlorence2Model" 235 | }, 236 | "widgets_values": [ 237 | "thwri/CogFlorence-2.2-Large", 238 | "fp16", 239 | "sdpa" 240 | ], 241 | "color": "#1b4669", 242 | "bgcolor": "#29699c" 243 | }, 244 | { 245 | "id": 3, 246 | "type": "Joy_caption_load", 247 | "pos": [ 248 | 1210, 249 | 791 250 | ], 251 | "size": { 252 | "0": 315, 253 | "1": 58 254 | }, 255 | "flags": {}, 256 | "order": 2, 257 | "mode": 0, 258 | "outputs": [ 259 | { 260 | "name": "JoyPipeline", 261 | "type": "JoyPipeline", 262 | "links": [ 263 | 1 264 | ], 265 | "slot_index": 0, 266 | "shape": 3 267 | } 268 | ], 269 | "properties": { 270 | "Node name for S&R": "Joy_caption_load" 271 | }, 272 | "widgets_values": [ 273 | "unsloth/Meta-Llama-3.1-8B-bnb-4bit" 274 | ], 275 | "color": "#1b4669", 276 | "bgcolor": "#29699c" 277 | }, 278 | { 279 | "id": 1, 280 | "type": "LoadImage", 281 | "pos": [ 282 | 500, 283 | 673 
284 | ], 285 | "size": [ 286 | 558.8251844824922, 287 | 765.5085685298109 288 | ], 289 | "flags": {}, 290 | "order": 3, 291 | "mode": 0, 292 | "outputs": [ 293 | { 294 | "name": "IMAGE", 295 | "type": "IMAGE", 296 | "links": [ 297 | 2, 298 | 7, 299 | 8 300 | ], 301 | "slot_index": 0, 302 | "shape": 3 303 | }, 304 | { 305 | "name": "MASK", 306 | "type": "MASK", 307 | "links": null, 308 | "shape": 3 309 | } 310 | ], 311 | "properties": { 312 | "Node name for S&R": "LoadImage" 313 | }, 314 | "widgets_values": [ 315 | "26124763.jpg", 316 | "image" 317 | ] 318 | }, 319 | { 320 | "id": 2, 321 | "type": "easy showAnything", 322 | "pos": [ 323 | 1687, 324 | 401 325 | ], 326 | "size": { 327 | "0": 390.0909423828125, 328 | "1": 252.36358642578125 329 | }, 330 | "flags": {}, 331 | "order": 8, 332 | "mode": 0, 333 | "inputs": [ 334 | { 335 | "name": "anything", 336 | "type": "*", 337 | "link": 10 338 | } 339 | ], 340 | "properties": { 341 | "Node name for S&R": "easy showAnything" 342 | }, 343 | "widgets_values": [ 344 | "The image presents a striking digital illustration of a knight in full armor, standing resolute against a backdrop of a tumultuous sky. The knight, clad in a dark, ornate suit of armor, wields a sword that glows with an otherworldly light, suggesting it may be imbued with magical properties. The armor is intricately designed, featuring gold accents that catch the eye amidst the darker tones. The knight's helmet is adorned with a plume, adding to the regal appearance. The background is a dramatic canvas of dark clouds, hinting at an impending storm, which contrasts with the fiery glow emanating from the sword, creating a sense of tension and anticipation. The overall composition of the image suggests a narrative of conflict and heroism, with the knight poised to face whatever challenges lie ahead." 345 | ] 346 | }, 347 | { 348 | "id": 5, 349 | "type": "easy showAnything", 350 | "pos": [ 351 | 1690, 352 | 854 353 | ], 354 | "size": { 355 | "0": 462.2198791503906, 356 | "1": 255.30990600585938 357 | }, 358 | "flags": {}, 359 | "order": 7, 360 | "mode": 0, 361 | "inputs": [ 362 | { 363 | "name": "anything", 364 | "type": "*", 365 | "link": 3 366 | } 367 | ], 368 | "properties": { 369 | "Node name for S&R": "easy showAnything" 370 | }, 371 | "widgets_values": [ 372 | "1. This is a digital illustration depicting a majestic, armored warrior standing in a dramatic, stormy landscape. The warrior is a tall, imposing figure, clad in dark, metallic armor with intricate designs and sharp spikes. His helmet resembles a fearsome beast's head, with long, sharp horns curving backward. He wears a flowing cape that billows in the wind, adding a sense of movement and strength to his imposing stance. His eyes are hidden behind a visor, giving him an aura of mystery and intensity. \n\nHis left arm is sheathed in a long sword with a glowing, fiery blade, and his right hand grasps a similar sword with flames licking along its edge. The background is a tumultuous sky filled with dark clouds and flashes of lightning, creating a sense of impending danger and chaos. The ground is rugged and rocky, with small explosions of fiery orange gas rising from the surface, enhancing the sense of volatility and power. \n\nThe overall mood of the image is dark and foreboding, with a sense of otherworldly majesty. The artwork utilizes a detailed, realistic style, with a focus on the textures of the armor and the dynamic, swirling motion of the cape and the fiery blades." 
373 | ] 374 | }, 375 | { 376 | "id": 11, 377 | "type": "easy showAnything", 378 | "pos": [ 379 | 1640, 380 | 1455 381 | ], 382 | "size": { 383 | "0": 462.2198791503906, 384 | "1": 255.30990600585938 385 | }, 386 | "flags": {}, 387 | "order": 9, 388 | "mode": 0, 389 | "inputs": [ 390 | { 391 | "name": "anything", 392 | "type": "*", 393 | "link": 11 394 | } 395 | ], 396 | "properties": { 397 | "Node name for S&R": "easy showAnything" 398 | }, 399 | "widgets_values": [ 400 | "A dramatic portrayal of a dark, armored warrior in a dynamic pose, wielding a long, fiery sword. The warrior wears ornate, dark armor with intricate designs and a helmet featuring a crown-like visor. The background is a stormy sky filled with dark clouds, and the ground is covered in fiery orange and yellow hues, indicating either either dawn or dusk. The overall color palette is dominated by dark blues, blacks, and fiery oranges, creating a sense of foreboding and intensity." 401 | ] 402 | } 403 | ], 404 | "links": [ 405 | [ 406 | 1, 407 | 3, 408 | 0, 409 | 4, 410 | 0, 411 | "JoyPipeline" 412 | ], 413 | [ 414 | 2, 415 | 1, 416 | 0, 417 | 4, 418 | 1, 419 | "IMAGE" 420 | ], 421 | [ 422 | 3, 423 | 4, 424 | 0, 425 | 5, 426 | 0, 427 | "*" 428 | ], 429 | [ 430 | 6, 431 | 7, 432 | 0, 433 | 9, 434 | 0, 435 | "CXH_Hg_Pipe" 436 | ], 437 | [ 438 | 7, 439 | 1, 440 | 0, 441 | 9, 442 | 1, 443 | "IMAGE" 444 | ], 445 | [ 446 | 8, 447 | 1, 448 | 0, 449 | 8, 450 | 0, 451 | "IMAGE" 452 | ], 453 | [ 454 | 9, 455 | 10, 456 | 0, 457 | 8, 458 | 1, 459 | "FL2MODEL" 460 | ], 461 | [ 462 | 10, 463 | 9, 464 | 0, 465 | 2, 466 | 0, 467 | "*" 468 | ], 469 | [ 470 | 11, 471 | 8, 472 | 2, 473 | 11, 474 | 0, 475 | "*" 476 | ] 477 | ], 478 | "groups": [ 479 | { 480 | "title": "Min2_6", 481 | "bounding": [ 482 | 1156, 483 | 167, 484 | 928, 485 | 501 486 | ], 487 | "color": "#3f789e", 488 | "font_size": 24, 489 | "locked": false 490 | }, 491 | { 492 | "title": "Joy_caption", 493 | "bounding": [ 494 | 1154, 495 | 699, 496 | 1032, 497 | 449 498 | ], 499 | "color": "#3f789e", 500 | "font_size": 24, 501 | "locked": false 502 | }, 503 | { 504 | "title": "florence2", 505 | "bounding": [ 506 | 1148, 507 | 1164, 508 | 1041, 509 | 586 510 | ], 511 | "color": "#3f789e", 512 | "font_size": 24, 513 | "locked": false 514 | } 515 | ], 516 | "config": {}, 517 | "extra": { 518 | "ds": { 519 | "scale": 0.5131581182307073, 520 | "offset": [ 521 | -133.45930116147088, 522 | -137.71244198828424 523 | ] 524 | } 525 | }, 526 | "version": 0.4 527 | } -------------------------------------------------------------------------------- /worflow/MinCPM3_4B.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 22, 3 | "last_link_id": 23, 4 | "nodes": [ 5 | { 6 | "id": 22, 7 | "type": "easy showAnything", 8 | "pos": { 9 | "0": 1101, 10 | "1": 666, 11 | "2": 0, 12 | "3": 0, 13 | "4": 0, 14 | "5": 0, 15 | "6": 0, 16 | "7": 0, 17 | "8": 0, 18 | "9": 0 19 | }, 20 | "size": { 21 | "0": 419.568115234375, 22 | "1": 274.0469055175781 23 | }, 24 | "flags": {}, 25 | "order": 2, 26 | "mode": 0, 27 | "inputs": [ 28 | { 29 | "name": "anything", 30 | "type": "*", 31 | "link": 22 32 | } 33 | ], 34 | "outputs": [], 35 | "properties": { 36 | "Node name for S&R": "easy showAnything" 37 | }, 38 | "widgets_values": [ 39 | "中国位于亚洲的东部,太平洋的西岸。其领土范围广阔,北至黑龙江省漠河县北端的黑龙江主航道中心线(53°N),南至海南省南沙群岛的曾母暗沙(4°N),东至黑龙江省黑龙江与乌苏里江主航道中心线的汇合处(135°E),西至新疆维吾尔自治区帕米尔高原(73°E)。中国陆地总面积约960万平方千米,东部和南部大陆海岸线1.8万千米,海域总面积约473万平方千米。" 40 | ] 41 | }, 42 | { 43 | "id": 20, 44 | "type": 
"CXH_MinCP3_4B_Load", 45 | "pos": { 46 | "0": 301, 47 | "1": 685, 48 | "2": 0, 49 | "3": 0, 50 | "4": 0, 51 | "5": 0, 52 | "6": 0, 53 | "7": 0, 54 | "8": 0, 55 | "9": 0 56 | }, 57 | "size": { 58 | "0": 315, 59 | "1": 58 60 | }, 61 | "flags": {}, 62 | "order": 0, 63 | "mode": 0, 64 | "inputs": [], 65 | "outputs": [ 66 | { 67 | "name": "pipe", 68 | "type": "CXH_MinCP3_4B_Pipe", 69 | "links": [ 70 | 21 71 | ], 72 | "shape": 3, 73 | "slot_index": 0 74 | } 75 | ], 76 | "properties": { 77 | "Node name for S&R": "CXH_MinCP3_4B_Load" 78 | }, 79 | "widgets_values": [ 80 | "openbmb/MiniCPM3-4B" 81 | ], 82 | "color": "#1b4669", 83 | "bgcolor": "#29699c" 84 | }, 85 | { 86 | "id": 21, 87 | "type": "CXH_MinCP3_4B_Chat", 88 | "pos": { 89 | "0": 673, 90 | "1": 681, 91 | "2": 0, 92 | "3": 0, 93 | "4": 0, 94 | "5": 0, 95 | "6": 0, 96 | "7": 0, 97 | "8": 0, 98 | "9": 0 99 | }, 100 | "size": { 101 | "0": 400, 102 | "1": 200 103 | }, 104 | "flags": {}, 105 | "order": 1, 106 | "mode": 0, 107 | "inputs": [ 108 | { 109 | "name": "pipe", 110 | "type": "CXH_MinCP3_4B_Pipe", 111 | "link": 21 112 | } 113 | ], 114 | "outputs": [ 115 | { 116 | "name": "STRING", 117 | "type": "STRING", 118 | "links": [ 119 | 22 120 | ], 121 | "shape": 3, 122 | "slot_index": 0 123 | } 124 | ], 125 | "properties": { 126 | "Node name for S&R": "CXH_MinCP3_4B_Chat" 127 | }, 128 | "widgets_values": [ 129 | "中国在哪里?", 130 | 0.7, 131 | 1024, 132 | 0.7 133 | ], 134 | "color": "#1b4669", 135 | "bgcolor": "#29699c" 136 | } 137 | ], 138 | "links": [ 139 | [ 140 | 21, 141 | 20, 142 | 0, 143 | 21, 144 | 0, 145 | "CXH_MinCP3_4B_Pipe" 146 | ], 147 | [ 148 | 22, 149 | 21, 150 | 0, 151 | 22, 152 | 0, 153 | "*" 154 | ] 155 | ], 156 | "groups": [], 157 | "config": {}, 158 | "extra": { 159 | "ds": { 160 | "scale": 0.7067058488964866, 161 | "offset": [ 162 | 262.3736311173096, 163 | -229.61227808628627 164 | ] 165 | } 166 | }, 167 | "version": 0.4 168 | } -------------------------------------------------------------------------------- /worflow/florence_PromptGen.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 9, 3 | "last_link_id": 9, 4 | "nodes": [ 5 | { 6 | "id": 5, 7 | "type": "CXH_Florence2Run", 8 | "pos": { 9 | "0": 650, 10 | "1": 831, 11 | "2": 0, 12 | "3": 0, 13 | "4": 0, 14 | "5": 0, 15 | "6": 0, 16 | "7": 0, 17 | "8": 0, 18 | "9": 0 19 | }, 20 | "size": { 21 | "0": 400, 22 | "1": 352 23 | }, 24 | "flags": {}, 25 | "order": 2, 26 | "mode": 0, 27 | "inputs": [ 28 | { 29 | "name": "image", 30 | "type": "IMAGE", 31 | "link": 4 32 | }, 33 | { 34 | "name": "florence2_model", 35 | "type": "FL2MODEL", 36 | "link": 5 37 | } 38 | ], 39 | "outputs": [ 40 | { 41 | "name": "image", 42 | "type": "IMAGE", 43 | "links": null, 44 | "shape": 3 45 | }, 46 | { 47 | "name": "mask", 48 | "type": "MASK", 49 | "links": null, 50 | "shape": 3 51 | }, 52 | { 53 | "name": "caption", 54 | "type": "STRING", 55 | "links": [ 56 | 6 57 | ], 58 | "slot_index": 2, 59 | "shape": 3 60 | }, 61 | { 62 | "name": "data", 63 | "type": "JSON", 64 | "links": null, 65 | "shape": 3 66 | } 67 | ], 68 | "properties": { 69 | "Node name for S&R": "CXH_Florence2Run" 70 | }, 71 | "widgets_values": [ 72 | "", 73 | "mixed_caption(PromptGen 1.5)", 74 | true, 75 | false, 76 | 1024, 77 | 3, 78 | true, 79 | "", 80 | 1942, 81 | "randomize" 82 | ], 83 | "color": "#1b4669" 84 | }, 85 | { 86 | "id": 6, 87 | "type": "CXH_DownloadAndLoadFlorence2Model", 88 | "pos": { 89 | "0": 628, 90 | "1": 671, 91 | "2": 0, 92 | "3": 0, 93 | "4": 0, 94 | "5": 0, 
95 | "6": 0, 96 | "7": 0, 97 | "8": 0, 98 | "9": 0 99 | }, 100 | "size": { 101 | "0": 415.8000183105469, 102 | "1": 106 103 | }, 104 | "flags": {}, 105 | "order": 0, 106 | "mode": 0, 107 | "inputs": [], 108 | "outputs": [ 109 | { 110 | "name": "florence2_model", 111 | "type": "FL2MODEL", 112 | "links": [ 113 | 5, 114 | 8 115 | ], 116 | "shape": 3, 117 | "slot_index": 0 118 | } 119 | ], 120 | "properties": { 121 | "Node name for S&R": "CXH_DownloadAndLoadFlorence2Model" 122 | }, 123 | "widgets_values": [ 124 | "MiaoshouAI/Florence-2-large-PromptGen-v1.5", 125 | "fp16", 126 | "sdpa" 127 | ], 128 | "color": "#1b4669" 129 | }, 130 | { 131 | "id": 9, 132 | "type": "easy showAnything", 133 | "pos": { 134 | "0": 1125, 135 | "1": 1279, 136 | "2": 0, 137 | "3": 0, 138 | "4": 0, 139 | "5": 0, 140 | "6": 0, 141 | "7": 0, 142 | "8": 0, 143 | "9": 0 144 | }, 145 | "size": { 146 | "0": 402.45989990234375, 147 | "1": 164.83221435546875 148 | }, 149 | "flags": {}, 150 | "order": 5, 151 | "mode": 0, 152 | "inputs": [ 153 | { 154 | "name": "anything", 155 | "type": "*", 156 | "link": 9 157 | } 158 | ], 159 | "outputs": [], 160 | "properties": { 161 | "Node name for S&R": "easy showAnything" 162 | }, 163 | "widgets_values": [ 164 | "1girl, solo, long hair, looking at viewer, skirt, red hair, thighhighs, long sleeves, closed mouth, standing, full body, shoes, pleated skirt, socks, indoors, miniskirt, hood, striped, white footwear, hoodie, crossed arms, table, white socks, sneakers, tennis ball, rack" 165 | ] 166 | }, 167 | { 168 | "id": 8, 169 | "type": "CXH_Florence2Run", 170 | "pos": { 171 | "0": 660, 172 | "1": 1237, 173 | "2": 0, 174 | "3": 0, 175 | "4": 0, 176 | "5": 0, 177 | "6": 0, 178 | "7": 0, 179 | "8": 0, 180 | "9": 0 181 | }, 182 | "size": { 183 | "0": 400, 184 | "1": 352 185 | }, 186 | "flags": {}, 187 | "order": 3, 188 | "mode": 0, 189 | "inputs": [ 190 | { 191 | "name": "image", 192 | "type": "IMAGE", 193 | "link": 7 194 | }, 195 | { 196 | "name": "florence2_model", 197 | "type": "FL2MODEL", 198 | "link": 8 199 | } 200 | ], 201 | "outputs": [ 202 | { 203 | "name": "image", 204 | "type": "IMAGE", 205 | "links": null, 206 | "shape": 3 207 | }, 208 | { 209 | "name": "mask", 210 | "type": "MASK", 211 | "links": null, 212 | "shape": 3 213 | }, 214 | { 215 | "name": "caption", 216 | "type": "STRING", 217 | "links": [ 218 | 9 219 | ], 220 | "slot_index": 2, 221 | "shape": 3 222 | }, 223 | { 224 | "name": "data", 225 | "type": "JSON", 226 | "links": null, 227 | "shape": 3 228 | } 229 | ], 230 | "properties": { 231 | "Node name for S&R": "CXH_Florence2Run" 232 | }, 233 | "widgets_values": [ 234 | "", 235 | "generate_tags(PromptGen 1.5)", 236 | true, 237 | false, 238 | 1024, 239 | 3, 240 | true, 241 | "", 242 | 470, 243 | "randomize" 244 | ], 245 | "color": "#1b4669" 246 | }, 247 | { 248 | "id": 3, 249 | "type": "LoadImage", 250 | "pos": { 251 | "0": 180, 252 | "1": 1064, 253 | "2": 0, 254 | "3": 0, 255 | "4": 0, 256 | "5": 0, 257 | "6": 0, 258 | "7": 0, 259 | "8": 0, 260 | "9": 0 261 | }, 262 | "size": { 263 | "0": 315, 264 | "1": 314 265 | }, 266 | "flags": {}, 267 | "order": 1, 268 | "mode": 0, 269 | "inputs": [], 270 | "outputs": [ 271 | { 272 | "name": "IMAGE", 273 | "type": "IMAGE", 274 | "links": [ 275 | 4, 276 | 7 277 | ], 278 | "slot_index": 0, 279 | "shape": 3 280 | }, 281 | { 282 | "name": "MASK", 283 | "type": "MASK", 284 | "links": null, 285 | "slot_index": 1, 286 | "shape": 3 287 | } 288 | ], 289 | "properties": { 290 | "Node name for S&R": "LoadImage" 291 | }, 292 | "widgets_values": [ 293 
| "风格趋势_68550099(1).jpg", 294 | "image" 295 | ] 296 | }, 297 | { 298 | "id": 7, 299 | "type": "easy showAnything", 300 | "pos": { 301 | "0": 1113, 302 | "1": 876, 303 | "2": 0, 304 | "3": 0, 305 | "4": 0, 306 | "5": 0, 307 | "6": 0, 308 | "7": 0, 309 | "8": 0, 310 | "9": 0 311 | }, 312 | "size": [ 313 | 496.79671515656423, 314 | 313.44720309527156 315 | ], 316 | "flags": {}, 317 | "order": 4, 318 | "mode": 0, 319 | "inputs": [ 320 | { 321 | "name": "anything", 322 | "type": "*", 323 | "link": 6 324 | } 325 | ], 326 | "outputs": [], 327 | "properties": { 328 | "Node name for S&R": "easy showAnything" 329 | }, 330 | "widgets_values": [ 331 | "a high-resolution photograph featuring a young woman with fair skin and long, wavy red hair, standing against a dark blue wall, she has a slender physique with a slender build and fair skin, she is wearing a navy blue hoodie with a white, diamond-patterned design, a matching navy blue mini skirt, and white knee-high socks with yellow stripes, her outfit is accessorized with white sneakers and a white headband with a black and white striped pattern, the background consists of a minimalist, modern setting with a carpeted floor in various shades of green and beige, scattered around her are several white tennis balls, to her left, there is a green cabinet with a rattan-like texture, and to her right, a white wire basket filled with tennis balls is placed on a metal stand, to the right, on the floor is a vintage radio, adding a retro touch to the scene, the overall color palette is dominated by dark blue and green tones, creating a visually striking contrast, the lighting is soft and natural, enhancing the textures and details of the objects and the woman's outfit, the photograph is likely taken during the day, as indicated by the high level of detail and the softness of the carpet and the smoothness of her skin\n\n \\(polo\\), 1girl, solo, long hair, looking at viewer, skirt, brown hair" 332 | ] 333 | } 334 | ], 335 | "links": [ 336 | [ 337 | 4, 338 | 3, 339 | 0, 340 | 5, 341 | 0, 342 | "IMAGE" 343 | ], 344 | [ 345 | 5, 346 | 6, 347 | 0, 348 | 5, 349 | 1, 350 | "FL2MODEL" 351 | ], 352 | [ 353 | 6, 354 | 5, 355 | 2, 356 | 7, 357 | 0, 358 | "*" 359 | ], 360 | [ 361 | 7, 362 | 3, 363 | 0, 364 | 8, 365 | 0, 366 | "IMAGE" 367 | ], 368 | [ 369 | 8, 370 | 6, 371 | 0, 372 | 8, 373 | 1, 374 | "FL2MODEL" 375 | ], 376 | [ 377 | 9, 378 | 8, 379 | 2, 380 | 9, 381 | 0, 382 | "*" 383 | ] 384 | ], 385 | "groups": [], 386 | "config": {}, 387 | "extra": { 388 | "ds": { 389 | "scale": 0.9090909090909091, 390 | "offset": [ 391 | -461.8324362747284, 392 | -716.5616485145835 393 | ] 394 | } 395 | }, 396 | "version": 0.4 397 | } -------------------------------------------------------------------------------- /worflow/florence_PromptGen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/florence_PromptGen.png -------------------------------------------------------------------------------- /worflow/flux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/flux.png -------------------------------------------------------------------------------- /worflow/joy.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 5, 3 | 
"last_link_id": 4, 4 | "nodes": [ 5 | { 6 | "id": 2, 7 | "type": "Joy_caption", 8 | "pos": [ 9 | 828, 10 | 498 11 | ], 12 | "size": { 13 | "0": 400, 14 | "1": 200 15 | }, 16 | "flags": {}, 17 | "order": 2, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "joy_pipeline", 22 | "type": "JoyPipeline", 23 | "link": 4 24 | }, 25 | { 26 | "name": "image", 27 | "type": "IMAGE", 28 | "link": 2, 29 | "slot_index": 1 30 | } 31 | ], 32 | "outputs": [ 33 | { 34 | "name": "STRING", 35 | "type": "STRING", 36 | "links": [ 37 | 3 38 | ], 39 | "shape": 3, 40 | "slot_index": 0 41 | } 42 | ], 43 | "properties": { 44 | "Node name for S&R": "Joy_caption" 45 | }, 46 | "widgets_values": [ 47 | "A descriptive caption for this image", 48 | 300, 49 | 0.5 50 | ] 51 | }, 52 | { 53 | "id": 5, 54 | "type": "Joy_caption_load", 55 | "pos": [ 56 | 454, 57 | 446 58 | ], 59 | "size": { 60 | "0": 315, 61 | "1": 58 62 | }, 63 | "flags": {}, 64 | "order": 0, 65 | "mode": 0, 66 | "outputs": [ 67 | { 68 | "name": "JoyPipeline", 69 | "type": "JoyPipeline", 70 | "links": [ 71 | 4 72 | ], 73 | "shape": 3, 74 | "slot_index": 0 75 | } 76 | ], 77 | "properties": { 78 | "Node name for S&R": "Joy_caption_load" 79 | }, 80 | "widgets_values": [ 81 | "meta-llama/Meta-Llama-3.1-8B" 82 | ] 83 | }, 84 | { 85 | "id": 4, 86 | "type": "easy showAnything", 87 | "pos": [ 88 | 1255, 89 | 502 90 | ], 91 | "size": { 92 | "0": 356.4357604980469, 93 | "1": 250.48460388183594 94 | }, 95 | "flags": {}, 96 | "order": 3, 97 | "mode": 0, 98 | "inputs": [ 99 | { 100 | "name": "anything", 101 | "type": "*", 102 | "link": 3 103 | } 104 | ], 105 | "properties": { 106 | "Node name for S&R": "easy showAnything" 107 | }, 108 | "widgets_values": [ 109 | "of a young girl standing on a lush green lawn, surrounded by tall trees with budding leaves, under a cloudy sky. The girl, approximately 3-5 years old, has light blonde hair and a cheerful expression, smiling with her teeth showing. She wears a white, short-sleeved dress adorned with colorful floral appliqués in shades of pink, yellow, and orange, and a matching white hat with a large pink flower on the side. Her dress has a full skirt and is knee-length, with delicate lace trim along the hem. She also wears white tights and white shoes, enhancing the purity of her attire. In her hands, she carries a bouquet of fresh flowers, including yellow, pink, and white varieties, held close to her chest. The background is softly blurred, emphasizing the girl as the focal point, with the trees and sky providing a serene, natural setting. The overall mood is joyful and whimsical, capturing the innocence and beauty of childhood." 
110 | ] 111 | }, 112 | { 113 | "id": 3, 114 | "type": "LoadImage", 115 | "pos": [ 116 | 198, 117 | 577 118 | ], 119 | "size": [ 120 | 570.2863188912281, 121 | 474.07759457475504 122 | ], 123 | "flags": {}, 124 | "order": 1, 125 | "mode": 0, 126 | "outputs": [ 127 | { 128 | "name": "IMAGE", 129 | "type": "IMAGE", 130 | "links": [ 131 | 2 132 | ], 133 | "shape": 3 134 | }, 135 | { 136 | "name": "MASK", 137 | "type": "MASK", 138 | "links": null, 139 | "shape": 3 140 | } 141 | ], 142 | "properties": { 143 | "Node name for S&R": "LoadImage" 144 | }, 145 | "widgets_values": [ 146 | "balabala_schnell.png", 147 | "image" 148 | ] 149 | } 150 | ], 151 | "links": [ 152 | [ 153 | 2, 154 | 3, 155 | 0, 156 | 2, 157 | 1, 158 | "IMAGE" 159 | ], 160 | [ 161 | 3, 162 | 2, 163 | 0, 164 | 4, 165 | 0, 166 | "*" 167 | ], 168 | [ 169 | 4, 170 | 5, 171 | 0, 172 | 2, 173 | 0, 174 | "JoyPipeline" 175 | ] 176 | ], 177 | "groups": [], 178 | "config": {}, 179 | "extra": { 180 | "ds": { 181 | "scale": 0.8769226950000014, 182 | "offset": [ 183 | -5.531487925200333, 184 | -181.09476693793715 185 | ] 186 | } 187 | }, 188 | "version": 0.4 189 | } -------------------------------------------------------------------------------- /worflow/joy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/joy.png -------------------------------------------------------------------------------- /worflow/joy_4b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/joy_4b.png -------------------------------------------------------------------------------- /worflow/joy批量打标.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/joy批量打标.png -------------------------------------------------------------------------------- /worflow/workflow_min2.6classifiy_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/workflow_min2.6classifiy_.png -------------------------------------------------------------------------------- /worflow/二级文件夹批量打标.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/二级文件夹批量打标.png -------------------------------------------------------------------------------- /worflow/批量打标(Batch marking).json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 72, 3 | "last_link_id": 91, 4 | "nodes": [ 5 | { 6 | "id": 68, 7 | "type": "Joy_caption_load", 8 | "pos": [ 9 | 1401, 10 | 373 11 | ], 12 | "size": { 13 | "0": 315, 14 | "1": 58 15 | }, 16 | "flags": {}, 17 | "order": 0, 18 | "mode": 0, 19 | "outputs": [ 20 | { 21 | "name": "JoyPipeline", 22 | "type": "JoyPipeline", 23 | "links": [ 24 | 84 25 | ], 26 | "slot_index": 0, 27 | "shape": 3 28 | } 29 | ], 30 | "properties": { 31 | "Node name for S&R": "Joy_caption_load" 32 | }, 33 | "widgets_values": [ 34 | "unsloth/Meta-Llama-3.1-8B-bnb-4bit" 35 | ], 36 | "color": "#1b4669", 37 | "bgcolor": "#29699c" 38 | }, 
39 | { 40 | "id": 60, 41 | "type": "LayerUtility: String", 42 | "pos": [ 43 | 1836, 44 | 377 45 | ], 46 | "size": { 47 | "0": 315, 48 | "1": 58 49 | }, 50 | "flags": {}, 51 | "order": 1, 52 | "mode": 0, 53 | "outputs": [ 54 | { 55 | "name": "string", 56 | "type": "STRING", 57 | "links": [ 58 | 75 59 | ], 60 | "slot_index": 0, 61 | "shape": 3 62 | } 63 | ], 64 | "properties": { 65 | "Node name for S&R": "LayerUtility: String" 66 | }, 67 | "widgets_values": [ 68 | "TriggerWord" 69 | ] 70 | }, 71 | { 72 | "id": 67, 73 | "type": "Joy_caption", 74 | "pos": [ 75 | 1377, 76 | 488 77 | ], 78 | "size": { 79 | "0": 400, 80 | "1": 200 81 | }, 82 | "flags": {}, 83 | "order": 7, 84 | "mode": 0, 85 | "inputs": [ 86 | { 87 | "name": "joy_pipeline", 88 | "type": "JoyPipeline", 89 | "link": 84 90 | }, 91 | { 92 | "name": "image", 93 | "type": "IMAGE", 94 | "link": 85 95 | } 96 | ], 97 | "outputs": [ 98 | { 99 | "name": "STRING", 100 | "type": "STRING", 101 | "links": [ 102 | 89 103 | ], 104 | "slot_index": 0, 105 | "shape": 3 106 | } 107 | ], 108 | "properties": { 109 | "Node name for S&R": "Joy_caption" 110 | }, 111 | "widgets_values": [ 112 | "A descriptive caption for this image", 113 | 300, 114 | 0.5, 115 | true 116 | ], 117 | "color": "#1b4669", 118 | "bgcolor": "#29699c" 119 | }, 120 | { 121 | "id": 48, 122 | "type": "LoadImageListFromDir //Inspire", 123 | "pos": [ 124 | 596, 125 | 537 126 | ], 127 | "size": { 128 | "0": 315, 129 | "1": 170 130 | }, 131 | "flags": {}, 132 | "order": 3, 133 | "mode": 0, 134 | "inputs": [ 135 | { 136 | "name": "directory", 137 | "type": "STRING", 138 | "link": 70, 139 | "widget": { 140 | "name": "directory" 141 | } 142 | } 143 | ], 144 | "outputs": [ 145 | { 146 | "name": "IMAGE", 147 | "type": "IMAGE", 148 | "links": [ 149 | 90 150 | ], 151 | "slot_index": 0, 152 | "shape": 6 153 | }, 154 | { 155 | "name": "MASK", 156 | "type": "MASK", 157 | "links": null, 158 | "shape": 6 159 | }, 160 | { 161 | "name": "FILE PATH", 162 | "type": "STRING", 163 | "links": null, 164 | "shape": 6 165 | } 166 | ], 167 | "properties": { 168 | "Node name for S&R": "LoadImageListFromDir //Inspire" 169 | }, 170 | "widgets_values": [ 171 | "E:\\tmp\\test", 172 | 0, 173 | 0, 174 | false 175 | ] 176 | }, 177 | { 178 | "id": 57, 179 | "type": "LayerUtility: String", 180 | "pos": [ 181 | 597, 182 | 430 183 | ], 184 | "size": { 185 | "0": 315, 186 | "1": 58 187 | }, 188 | "flags": {}, 189 | "order": 2, 190 | "mode": 0, 191 | "outputs": [ 192 | { 193 | "name": "string", 194 | "type": "STRING", 195 | "links": [ 196 | 70 197 | ], 198 | "slot_index": 0, 199 | "shape": 3 200 | } 201 | ], 202 | "properties": { 203 | "Node name for S&R": "LayerUtility: String" 204 | }, 205 | "widgets_values": [ 206 | "C:\\Users\\chenxinghua\\Desktop\\新建文件夹 (3)\\test" 207 | ] 208 | }, 209 | { 210 | "id": 61, 211 | "type": "LayerUtility: TextJoin", 212 | "pos": [ 213 | 1840, 214 | 490 215 | ], 216 | "size": { 217 | "0": 315, 218 | "1": 130 219 | }, 220 | "flags": {}, 221 | "order": 8, 222 | "mode": 0, 223 | "inputs": [ 224 | { 225 | "name": "text_1", 226 | "type": "STRING", 227 | "link": 75, 228 | "widget": { 229 | "name": "text_1" 230 | } 231 | }, 232 | { 233 | "name": "text_2", 234 | "type": "STRING", 235 | "link": 89, 236 | "widget": { 237 | "name": "text_2" 238 | } 239 | } 240 | ], 241 | "outputs": [ 242 | { 243 | "name": "text", 244 | "type": "STRING", 245 | "links": [ 246 | 77 247 | ], 248 | "slot_index": 0, 249 | "shape": 3 250 | } 251 | ], 252 | "properties": { 253 | "Node name for S&R": "LayerUtility: TextJoin" 254 
| }, 255 | "widgets_values": [ 256 | "", 257 | "", 258 | "", 259 | "" 260 | ] 261 | }, 262 | { 263 | "id": 31, 264 | "type": "LayerUtility: ImageTaggerSave", 265 | "pos": [ 266 | 2180, 267 | 475 268 | ], 269 | "size": { 270 | "0": 397.0539245605469, 271 | "1": 422.8654479980469 272 | }, 273 | "flags": {}, 274 | "order": 9, 275 | "mode": 0, 276 | "inputs": [ 277 | { 278 | "name": "image", 279 | "type": "IMAGE", 280 | "link": 88 281 | }, 282 | { 283 | "name": "tag_text", 284 | "type": "STRING", 285 | "link": 77, 286 | "widget": { 287 | "name": "tag_text" 288 | } 289 | } 290 | ], 291 | "properties": { 292 | "Node name for S&R": "LayerUtility: ImageTaggerSave" 293 | }, 294 | "widgets_values": [ 295 | "", 296 | "C:\\Users\\chenxinghua\\Desktop\\新建文件夹 (3)\\test2", 297 | "my_training_set", 298 | "None", 299 | "png", 300 | 80, 301 | true 302 | ] 303 | }, 304 | { 305 | "id": 65, 306 | "type": "LayerUtility: ImageRemoveAlpha", 307 | "pos": [ 308 | 964, 309 | 400 310 | ], 311 | "size": { 312 | "0": 315, 313 | "1": 102 314 | }, 315 | "flags": {}, 316 | "order": 6, 317 | "mode": 4, 318 | "inputs": [ 319 | { 320 | "name": "RGBA_image", 321 | "type": "IMAGE", 322 | "link": 81 323 | }, 324 | { 325 | "name": "mask", 326 | "type": "MASK", 327 | "link": 82 328 | } 329 | ], 330 | "outputs": [ 331 | { 332 | "name": "RGB_image", 333 | "type": "IMAGE", 334 | "links": [ 335 | 85, 336 | 88 337 | ], 338 | "slot_index": 0, 339 | "shape": 3 340 | } 341 | ], 342 | "properties": { 343 | "Node name for S&R": "LayerUtility: ImageRemoveAlpha" 344 | }, 345 | "widgets_values": [ 346 | true, 347 | "#FFFFFF" 348 | ] 349 | }, 350 | { 351 | "id": 63, 352 | "type": "LayerMask: TransparentBackgroundUltra", 353 | "pos": [ 354 | 959, 355 | 549 356 | ], 357 | "size": { 358 | "0": 327.6000061035156, 359 | "1": 270 360 | }, 361 | "flags": {}, 362 | "order": 4, 363 | "mode": 4, 364 | "inputs": [ 365 | { 366 | "name": "image", 367 | "type": "IMAGE", 368 | "link": 90 369 | } 370 | ], 371 | "outputs": [ 372 | { 373 | "name": "image", 374 | "type": "IMAGE", 375 | "links": [ 376 | 81, 377 | 91 378 | ], 379 | "slot_index": 0, 380 | "shape": 3 381 | }, 382 | { 383 | "name": "mask", 384 | "type": "MASK", 385 | "links": [ 386 | 82 387 | ], 388 | "slot_index": 1, 389 | "shape": 3 390 | } 391 | ], 392 | "properties": { 393 | "Node name for S&R": "LayerMask: TransparentBackgroundUltra" 394 | }, 395 | "widgets_values": [ 396 | "ckpt_base.pth", 397 | "VITMatte", 398 | 6, 399 | 6, 400 | 0.01, 401 | 0.99, 402 | true, 403 | "cuda", 404 | 2 405 | ] 406 | }, 407 | { 408 | "id": 72, 409 | "type": "PreviewImage", 410 | "pos": [ 411 | 964, 412 | 865 413 | ], 414 | "size": [ 415 | 329.67821458121784, 416 | 156.48269739093905 417 | ], 418 | "flags": {}, 419 | "order": 5, 420 | "mode": 0, 421 | "inputs": [ 422 | { 423 | "name": "images", 424 | "type": "IMAGE", 425 | "link": 91 426 | } 427 | ], 428 | "properties": { 429 | "Node name for S&R": "PreviewImage" 430 | } 431 | } 432 | ], 433 | "links": [ 434 | [ 435 | 70, 436 | 57, 437 | 0, 438 | 48, 439 | 0, 440 | "STRING" 441 | ], 442 | [ 443 | 75, 444 | 60, 445 | 0, 446 | 61, 447 | 0, 448 | "STRING" 449 | ], 450 | [ 451 | 77, 452 | 61, 453 | 0, 454 | 31, 455 | 1, 456 | "STRING" 457 | ], 458 | [ 459 | 81, 460 | 63, 461 | 0, 462 | 65, 463 | 0, 464 | "IMAGE" 465 | ], 466 | [ 467 | 82, 468 | 63, 469 | 1, 470 | 65, 471 | 1, 472 | "MASK" 473 | ], 474 | [ 475 | 84, 476 | 68, 477 | 0, 478 | 67, 479 | 0, 480 | "JoyPipeline" 481 | ], 482 | [ 483 | 85, 484 | 65, 485 | 0, 486 | 67, 487 | 1, 488 | "IMAGE" 489 | ], 490 | [ 491 
| 88, 492 | 65, 493 | 0, 494 | 31, 495 | 0, 496 | "IMAGE" 497 | ], 498 | [ 499 | 89, 500 | 67, 501 | 0, 502 | 61, 503 | 1, 504 | "STRING" 505 | ], 506 | [ 507 | 90, 508 | 48, 509 | 0, 510 | 63, 511 | 0, 512 | "IMAGE" 513 | ], 514 | [ 515 | 91, 516 | 63, 517 | 0, 518 | 72, 519 | 0, 520 | "IMAGE" 521 | ] 522 | ], 523 | "groups": [], 524 | "config": {}, 525 | "extra": { 526 | "ds": { 527 | "scale": 0.6115909044841474, 528 | "offset": [ 529 | -315.6032329072262, 530 | -178.67335371150824 531 | ] 532 | } 533 | }, 534 | "version": 0.4 535 | } -------------------------------------------------------------------------------- /worflow/批量打标(Batch marking).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/批量打标(Batch marking).png --------------------------------------------------------------------------------