├── .gitignore ├── CXH_Min2_6_classifiy.py ├── Joy_caption_alpha.py ├── Joy_caption_node.py ├── LICENSE ├── README.md ├── __init__.py ├── __pycache__ ├── Joy_caption_alpha.cpython-311.pyc ├── florence_nodes.cpython-311.pyc ├── miniCPMv2_6_prompt_generator.cpython-311.pyc └── miniCpMV3_4_chat.cpython-311.pyc ├── florence_nodes.py ├── ic_lora_batch.py ├── install_req.bat ├── lib ├── __init__.py ├── xfile.py ├── ximg.py └── xmodel.py ├── miniCPMv2_6_prompt_generator.py ├── miniCpMV3_4_chat.py ├── requirements.txt ├── smolvlm.py └── worflow ├── Min2.6+joy+Florence2.json ├── MinCPM3_4B.json ├── florence_PromptGen.json ├── florence_PromptGen.png ├── flux.png ├── joy.json ├── joy.png ├── joy_4b.png ├── joy批量打标.png ├── workflow_min2.6classifiy_.png ├── 二级文件夹批量打标.png ├── 批量打标(Batch marking).json └── 批量打标(Batch marking).png /.gitignore: -------------------------------------------------------------------------------- 1 | lib/__pycache__/__init__.cpython-310.pyc 2 | lib/__pycache__/__init__.cpython-311.pyc 3 | lib/__pycache__/ximg.cpython-310.pyc 4 | lib/__pycache__/ximg.cpython-311.pyc 5 | lib/__pycache__/xmodel.cpython-311.pyc 6 | __pycache__/__init__.cpython-311.pyc 7 | __pycache__/Joy_caption_node.cpython-311.pyc 8 | -------------------------------------------------------------------------------- /CXH_Min2_6_classifiy.py: -------------------------------------------------------------------------------- 1 | 2 | from huggingface_hub import InferenceClient 3 | from torch import nn 4 | from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM 5 | from pathlib import Path 6 | import torch 7 | import torch.amp.autocast_mode 8 | from PIL import Image 9 | import os 10 | import folder_paths 11 | import time 12 | 13 | from .lib.ximg import * 14 | from .lib.xmodel import * 15 | 16 | classification_rules = """ 17 | You are a fashion image classifier. Analyze clothing images following these priority rules and categories. When an item could fit multiple categories, use the highest priority category. 18 | Priority Order (Highest to Lowest): 19 | 1. MAN 20 | 2. WoMAN 21 | Required Output Format: 22 | [CATEGORY_NAME] 23 | 24 | Classification Rules: 25 | 1. Always check categories in order from highest to lowest priority 26 | 2. Use the highest priority category that applies 27 | 3. Output only the category name in all caps 28 | 4. 
No additional text or explanations in output 29 | """ 30 | 31 | def process_category_name(category_name): 32 | # 如果字符串包含方括号,则删除它们 33 | if category_name.startswith('[') and category_name.endswith(']'): 34 | category_name = category_name[1:-1] 35 | return category_name 36 | 37 | class CXH_Min2_6_classifiy : 38 | 39 | def __init__(self): 40 | pass 41 | 42 | @classmethod 43 | def INPUT_TYPES(s): 44 | return { 45 | "required": { 46 | "pipe": ("CXH_Hg_Pipe",), 47 | "img_dir": ("STRING", {"multiline": False, "default": ""},), 48 | "save_dir": ("STRING", {"multiline": False, "default": ""},), 49 | "prompt": ("STRING", {"multiline": True, "default": classification_rules},), 50 | "format": (["png", "jpg"],), 51 | "max_tokens":("INT", {"default": 1024, "min": 10, "max": 4048, "step": 1}), 52 | "temperature": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}), 53 | "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}), 54 | } 55 | } 56 | 57 | RETURN_TYPES = () #RETURN_TYPES = () RETURN_TYPES = ("DICT",)返回字典 58 | FUNCTION = "gen" 59 | OUTPUT_NODE = True #OUTPUT_NODE = True 没输出 60 | CATEGORY = "CXH/LLM" 61 | 62 | def gen(self,pipe,img_dir,save_dir,prompt,format,max_tokens,temperature,seed): 63 | 64 | dir_files = batch_image(img_dir) 65 | 66 | # prompt = f"Determine whether the following pictures belong to the following types:{str(classifiy_type)},You only need to output the type, you do not need to output anything else to remember!" 67 | 68 | # 创建保存目录 69 | if not os.path.exists(save_dir): 70 | os.makedirs(save_dir) 71 | 72 | index1 = 0 73 | for image_path in dir_files: 74 | if os.path.isdir(image_path) and os.path.ex: 75 | continue 76 | start = time.time() 77 | input_image = open_image(image_path) 78 | input_image = ImageOps.exif_transpose(input_image) 79 | image = input_image.convert("RGB") 80 | 81 | question = prompt 82 | msgs = [{'role': 'user', 'content': [image, question]}] 83 | 84 | res = pipe.text_model.chat( 85 | image=None, 86 | msgs=msgs, 87 | tokenizer=pipe.tokenizer 88 | ) 89 | 90 | ## if you want to use streaming, please make sure sampling=True and stream=True 91 | ## the model.chat will return a generator 92 | res = pipe.text_model.chat( 93 | image=None, 94 | msgs=msgs, 95 | tokenizer=pipe.tokenizer, 96 | sampling=False, 97 | stream=False, 98 | max_tokens=max_tokens, 99 | temperature=temperature, 100 | ) 101 | 102 | generated_text = process_category_name(res) 103 | 104 | if len(generated_text) >= 80: 105 | generated_text = "UNKNOWN" 106 | 107 | 108 | savePath = os.path.join(save_dir,generated_text) 109 | # 创建保存目录 110 | if not os.path.exists(savePath): 111 | os.makedirs(savePath) 112 | 113 | lenName = str(index1) 114 | # img_file_name = f"{lenName}.{format}" 115 | img_file_name = os.path.basename(image_path) 116 | input_image = image 117 | if format != "png": 118 | if input_image.mode == "RGBA": 119 | input_image = input_image.convert("RGB") 120 | 121 | img_save_path = os.path.join(savePath, img_file_name) 122 | input_image.save(img_save_path) 123 | 124 | end = time.time() 125 | execution_time = calculate_seconds_difference(start, end) 126 | temp = f":{execution_time:.3f}s" 127 | index1 = index1 + 1 128 | print(str(index1)+"/"+str(len(dir_files)) +":"+temp) 129 | 130 | return () 131 | 132 | -------------------------------------------------------------------------------- /Joy_caption_alpha.py: -------------------------------------------------------------------------------- 1 | 2 | from huggingface_hub import InferenceClient 3 | from torch import nn 4 | from 
transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM 5 | from pathlib import Path 6 | import torch 7 | import torch.amp.autocast_mode 8 | from PIL import Image 9 | import os 10 | import folder_paths 11 | import torchvision.transforms.functional as TVF 12 | 13 | from .lib.ximg import * 14 | from .lib.xmodel import * 15 | import re 16 | import time 17 | from datetime import datetime, timedelta 18 | 19 | from comfy.model_management import unload_all_models, soft_empty_cache,get_torch_device 20 | 21 | DEVICE = get_torch_device() 22 | 23 | def modify_json_value(file_path, key_to_modify, new_value): 24 | """ 25 | 读取 JSON 文件,修改指定 key 的 value 值,并保存修改后的文件。 26 | 27 | Args: 28 | file_path: JSON 文件路径。 29 | key_to_modify: 需要修改的 key。 30 | new_value: 新的 value 值。 31 | """ 32 | try: 33 | with open(file_path, 'r', encoding='utf-8') as f: 34 | data = json.load(f) 35 | 36 | # 查找并修改 key 的 value 37 | if key_to_modify in data: 38 | data[key_to_modify] = new_value 39 | else: 40 | print(f"Warning: Key '{key_to_modify}' not found in JSON file.") 41 | 42 | # 保存修改后的 JSON 文件 43 | with open(file_path, 'w', encoding='utf-8') as f: 44 | json.dump(data, f, indent=4) # 使用 indent 参数格式化输出 45 | 46 | print(f"Successfully modified '{key_to_modify}' value in '{file_path}'.") 47 | 48 | except FileNotFoundError: 49 | print(f"Error: File '{file_path}' not found.") 50 | except json.JSONDecodeError: 51 | print(f"Error: Invalid JSON format in '{file_path}'.") 52 | 53 | CAPTION_TYPE_MAP = { 54 | "Descriptive": [ 55 | "Write a descriptive caption for this image in a formal tone.", 56 | "Write a descriptive caption for this image in a formal tone within {word_count} words.", 57 | "Write a {length} descriptive caption for this image in a formal tone.", 58 | ], 59 | "Descriptive (Informal)": [ 60 | "Write a descriptive caption for this image in a casual tone.", 61 | "Write a descriptive caption for this image in a casual tone within {word_count} words.", 62 | "Write a {length} descriptive caption for this image in a casual tone.", 63 | ], 64 | "Training Prompt": [ 65 | "Write a stable diffusion prompt for this image.", 66 | "Write a stable diffusion prompt for this image within {word_count} words.", 67 | "Write a {length} stable diffusion prompt for this image.", 68 | ], 69 | "MidJourney": [ 70 | "Write a MidJourney prompt for this image.", 71 | "Write a MidJourney prompt for this image within {word_count} words.", 72 | "Write a {length} MidJourney prompt for this image.", 73 | ], 74 | "Booru tag list": [ 75 | "Write a list of Booru tags for this image.", 76 | "Write a list of Booru tags for this image within {word_count} words.", 77 | "Write a {length} list of Booru tags for this image.", 78 | ], 79 | "Booru-like tag list": [ 80 | "Write a list of Booru-like tags for this image.", 81 | "Write a list of Booru-like tags for this image within {word_count} words.", 82 | "Write a {length} list of Booru-like tags for this image.", 83 | ], 84 | "Art Critic": [ 85 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.", 86 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. 
Keep it within {word_count} words.", 87 | "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it {length}.", 88 | ], 89 | "Product Listing": [ 90 | "Write a caption for this image as though it were a product listing.", 91 | "Write a caption for this image as though it were a product listing. Keep it under {word_count} words.", 92 | "Write a {length} caption for this image as though it were a product listing.", 93 | ], 94 | "Social Media Post": [ 95 | "Write a caption for this image as if it were being used for a social media post.", 96 | "Write a caption for this image as if it were being used for a social media post. Limit the caption to {word_count} words.", 97 | "Write a {length} caption for this image as if it were being used for a social media post.", 98 | ], 99 | } 100 | 101 | extra_options_parts = [ 102 | "[如果图像中有人物/角色,你必须用name来指代他们。] If there is a person/character in the image you must refer to them as *name* .", 103 | "[不要包含无法改变的人物/角色信息如:种族、性别等,但要包含可以改变的属性如:发型。] Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style).", 104 | "[包含关于光线的信息] Include information about lighting.", 105 | "[包含关于相机角度的信息] Include information about camera angle.", 106 | "[包含关于是否有水印的信息] Include information about whether there is a watermark or not.", 107 | "[包含关于是否有JPEG压缩痕迹的信息] Include information about whether there are JPEG artifacts or not.", 108 | "[如果是照片,你必须包含可能使用的相机类型以及诸如光圈、快门速度、ISO等细节信息] If it is a photo you MUST include information about what camera was likely used and details such as aperture, shutter speed, ISO, etc.", 109 | "[不要包含任何性相关的内容;保持内容适合全年龄] Do NOT include anything sexual; keep it PG.", 110 | "[不要提及图像的分辨率] Do NOT mention the image's resolution.", 111 | "[你必须包含关于图像主观审美质量的评价,从低到非常高] You MUST include information about the subjective aesthetic quality of the image from low to very high.", 112 | "[包含关于图像构图风格的信息,如引导线、三分法或对称性] Include information on the image's composition style, such as leading lines, rule of thirds, or symmetry.", 113 | "[不要提及图像中的任何文字] Do NOT mention any text that is in the image.", 114 | "[指明景深以及背景是否聚焦或模糊] Specify the depth of field and whether the background is in focus or blurred.", 115 | "[如果适用,提及可能使用的人工或自然光源] If applicable, mention the likely use of artificial or natural lighting sources.", 116 | "[不要使用任何模棱两可的语言] Do NOT use any ambiguous language.", 117 | "[包含图像是否适合工作场合(sfw)、暗示性的还是不适合工作场合(nsfw)] Include whether the image is sfw, suggestive, or nsfw.", 118 | "[只描述图像中最重要的元素] ONLY describe the most important elements of the image." 
119 | ] 120 | 121 | class JoyPipeline_alpha: 122 | def __init__(self): 123 | self.clip_model = None 124 | self.clip_processor =None 125 | self.tokenizer = None 126 | self.text_model = None 127 | self.image_adapter = None 128 | self.parent = None 129 | 130 | def clearCache(self): 131 | self.clip_model = None 132 | self.clip_processor =None 133 | self.tokenizer = None 134 | self.text_model = None 135 | self.image_adapter = None 136 | 137 | 138 | 139 | class ImageAdapter_alpha(nn.Module): 140 | def __init__(self, input_features: int, output_features: int, ln1: bool, pos_emb: bool, num_image_tokens: int, deep_extract: bool): 141 | super().__init__() 142 | self.deep_extract = deep_extract 143 | 144 | if self.deep_extract: 145 | input_features = input_features * 5 146 | 147 | self.linear1 = nn.Linear(input_features, output_features) 148 | self.activation = nn.GELU() 149 | self.linear2 = nn.Linear(output_features, output_features) 150 | self.ln1 = nn.Identity() if not ln1 else nn.LayerNorm(input_features) 151 | self.pos_emb = None if not pos_emb else nn.Parameter(torch.zeros(num_image_tokens, input_features)) 152 | 153 | # Other tokens (<|image_start|>, <|image_end|>, <|eot_id|>) 154 | self.other_tokens = nn.Embedding(3, output_features) 155 | self.other_tokens.weight.data.normal_(mean=0.0, std=0.02) # Matches HF's implementation of llama3 156 | 157 | def forward(self, vision_outputs: torch.Tensor): 158 | if self.deep_extract: 159 | x = torch.concat(( 160 | vision_outputs[-2], 161 | vision_outputs[3], 162 | vision_outputs[7], 163 | vision_outputs[13], 164 | vision_outputs[20], 165 | ), dim=-1) 166 | assert len(x.shape) == 3, f"Expected 3, got {len(x.shape)}" # batch, tokens, features 167 | assert x.shape[-1] == vision_outputs[-2].shape[-1] * 5, f"Expected {vision_outputs[-2].shape[-1] * 5}, got {x.shape[-1]}" 168 | else: 169 | x = vision_outputs[-2] 170 | 171 | x = self.ln1(x) 172 | 173 | if self.pos_emb is not None: 174 | assert x.shape[-2:] == self.pos_emb.shape, f"Expected {self.pos_emb.shape}, got {x.shape[-2:]}" 175 | x = x + self.pos_emb 176 | 177 | x = self.linear1(x) 178 | x = self.activation(x) 179 | x = self.linear2(x) 180 | 181 | # <|image_start|>, IMAGE, <|image_end|> 182 | other_tokens = self.other_tokens(torch.tensor([0, 1], device=self.other_tokens.weight.device).expand(x.shape[0], -1)) 183 | assert other_tokens.shape == (x.shape[0], 2, x.shape[2]), f"Expected {(x.shape[0], 2, x.shape[2])}, got {other_tokens.shape}" 184 | x = torch.cat((other_tokens[:, 0:1], x, other_tokens[:, 1:2]), dim=1) 185 | 186 | return x 187 | 188 | def get_eot_embedding(self): 189 | return self.other_tokens(torch.tensor([2], device=self.other_tokens.weight.device)).squeeze(0) 190 | 191 | 192 | 193 | class Joy_caption_alpha_load: 194 | 195 | def __init__(self): 196 | self.model = None 197 | self.pipeline = JoyPipeline_alpha() 198 | self.pipeline.parent = self 199 | pass 200 | 201 | @classmethod 202 | def INPUT_TYPES(s): 203 | return { 204 | "required": { 205 | "model": (["Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2","unsloth/Meta-Llama-3.1-8B-bnb-4bit"],), 206 | } 207 | } 208 | 209 | CATEGORY = "CXH/LLM" 210 | RETURN_TYPES = ("JoyPipeline_alpha",) 211 | FUNCTION = "gen" 212 | 213 | def loadCheckPoint(self): 214 | # 清除一波 215 | if self.pipeline != None: 216 | self.pipeline.clearCache() 217 | 218 | # Image Adapter 219 | adapter_path = os.path.join(folder_paths.models_dir,"Joy_caption_alpha","image_adapter.pt") 220 | 221 | clip_model_path = os.path.join(folder_paths.models_dir,"Joy_caption_alpha","clip_model.pt") 
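        # Note: the Joy-Caption alpha-two weights referenced above (image_adapter.pt, clip_model.pt,
        # plus the text_model adapter folder) are not downloaded automatically; per the README they
        # must be placed under ComfyUI's models/Joy_caption_alpha directory by hand. A minimal
        # fail-fast sketch (assuming only the two paths built above) that gives a clearer error than
        # a failed torch.load could look like this:
        for required_path in (adapter_path, clip_model_path):
            if not os.path.exists(required_path):
                raise FileNotFoundError(
                    f"Joy_caption_alpha weights not found: {required_path}. "
                    "Download them and place them under models/Joy_caption_alpha."
                )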
222 | 223 | CHECKPOINT_PATH = os.path.join(folder_paths.models_dir,"Joy_caption_alpha","text_model") 224 | 225 | # clip 226 | model_id = "google/siglip-so400m-patch14-384" 227 | CLIP_PATH = download_hg_model(model_id,"clip") 228 | 229 | clip_processor = AutoProcessor.from_pretrained(CLIP_PATH) 230 | clip_model = AutoModel.from_pretrained( 231 | CLIP_PATH, 232 | trust_remote_code=True 233 | ) 234 | clip_model = clip_model.vision_model 235 | 236 | print("Loading VLM's custom vision model") 237 | checkpoint = torch.load(clip_model_path, map_location='cpu') 238 | checkpoint = {k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()} 239 | clip_model.load_state_dict(checkpoint) 240 | del checkpoint 241 | 242 | clip_model.eval() 243 | clip_model.requires_grad_(False) 244 | clip_model.to("cuda") 245 | 246 | # Tokenizer 247 | text_model_path = CHECKPOINT_PATH 248 | LLM_PATH = download_hg_model(self.model, "LLM") 249 | modify_json_value(os.path.join(text_model_path, "adapter_config.json"), "base_model_name_or_path", 250 | LLM_PATH) 251 | 252 | print("Loading tokenizer") 253 | tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH, use_fast=False) 254 | assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTrainedTokenizerFast), f"Tokenizer is of type {type(tokenizer)}" 255 | 256 | # LLM 257 | print("Loading LLM") 258 | print("Loading VLM's custom text model") 259 | 260 | # text_model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_PATH , device_map=0, trust_remote_code=True) 261 | text_model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_PATH, device_map="auto", 262 | torch_dtype=torch.bfloat16).eval() 263 | 264 | image_adapter = ImageAdapter_alpha(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False) # ImageAdapter(clip_model.config.hidden_size, 4096) 265 | image_adapter.load_state_dict(torch.load(adapter_path, map_location="cpu")) 266 | adjusted_adapter = image_adapter 267 | adjusted_adapter.eval() 268 | adjusted_adapter.to("cuda") 269 | 270 | self.pipeline.clip_model = clip_model 271 | self.pipeline.clip_processor = clip_processor 272 | self.pipeline.tokenizer = tokenizer 273 | self.pipeline.text_model = text_model 274 | self.pipeline.image_adapter = adjusted_adapter 275 | 276 | def clearCache(self): 277 | if self.pipeline != None: 278 | self.pipeline.clearCache() 279 | 280 | def gen(self,model): 281 | if self.model == None or self.model != model or self.pipeline == None: 282 | self.model = model 283 | self.loadCheckPoint() 284 | return (self.pipeline,) 285 | 286 | def remove_brackets_content(text): 287 | # 使用正则表达式找到所有被 [] 括起来的内容,并将其删除 288 | result = re.sub(r'\[.*?\]', '', text) 289 | return result 290 | 291 | class Joy_caption_alpha_prompt: 292 | 293 | def __init__(self): 294 | pass 295 | 296 | @classmethod 297 | def INPUT_TYPES(s): 298 | options = list(extra_options_parts) 299 | required = { 300 | "caption_type": (["Descriptive", "Descriptive (Informal)", "Training Prompt", "MidJourney", "Booru tag list", "Booru-like tag list", "Art Critic", "Product Listing", "Social Media Post"],), 301 | "caption_length":(["any", "very short", "short", "medium-length", "long", "very long","20","50","80","100","120","250","500"],), 302 | "name":("STRING", {"multiline": False, "default": ""},), 303 | } 304 | for option in options: 305 | required[option] = ("BOOLEAN", {"default": False}) 306 | return { 307 | "required": required 308 | } 309 | 310 | CATEGORY = "CXH/LLM" 311 | RETURN_TYPES = ("STRING",) 312 | FUNCTION = "gen" 313 | # 
def gen(self,caption_type,caption_length,extra_options): 314 | def gen(self,**kwargs): 315 | options_selected = list(kwargs.values()) 316 | 317 | caption_type = kwargs["caption_type"] 318 | caption_length = kwargs["caption_length"] 319 | name = kwargs["name"] 320 | 321 | 322 | # 额外选项从第三个参数开始 323 | extra_options = options_selected[3:] 324 | 325 | length = None if caption_length == "any" else caption_length 326 | if isinstance(length, str): 327 | try: 328 | length = int(length) 329 | except ValueError: 330 | pass 331 | 332 | if length is None: 333 | map_idx = 0 334 | elif isinstance(length, int): 335 | map_idx = 1 336 | elif isinstance(length, str): 337 | map_idx = 2 338 | else: 339 | raise ValueError(f"Invalid caption length: {length}") 340 | 341 | prompt_str = CAPTION_TYPE_MAP[caption_type][map_idx] 342 | 343 | prompt_str = prompt_str.format(length=caption_length, word_count=caption_length) 344 | options = list(extra_options_parts) 345 | for selected, option in zip(extra_options, options): 346 | if selected: 347 | prompt_str = prompt_str + remove_brackets_content(option) 348 | prompt_str = prompt_str.replace("*name*", name) 349 | print(prompt_str) 350 | return (prompt_str,) 351 | 352 | class Joy_caption_alpha_run: 353 | 354 | def __init__(self): 355 | pass 356 | @classmethod 357 | def INPUT_TYPES(s): 358 | return { 359 | "required": { 360 | "JoyPipeline_alpha": ("JoyPipeline_alpha",), 361 | "image": ("IMAGE",), 362 | "prompt": ("STRING", {"multiline": True, "default": "A descriptive caption for this image"},), 363 | "max_new_tokens":("INT", {"default": 1024, "min": 10, "max": 4096, "step": 1}), 364 | "temperature": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}), 365 | "cache": ("BOOLEAN", {"default": False}), 366 | "low_vram": ("BOOLEAN", {"default": False}), 367 | "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}), 368 | } 369 | } 370 | 371 | CATEGORY = "CXH/LLM" 372 | RETURN_TYPES = ("STRING",) 373 | FUNCTION = "gen" 374 | def gen(self,JoyPipeline_alpha,image,prompt,max_new_tokens,temperature,cache,low_vram,seed): 375 | 376 | torch.cuda.empty_cache() 377 | 378 | if low_vram : 379 | unload_all_models() 380 | 381 | joy_pipeline = JoyPipeline_alpha 382 | if joy_pipeline.clip_processor == None : 383 | joy_pipeline.parent.loadCheckPoint() 384 | 385 | clip_processor = joy_pipeline.clip_processor 386 | tokenizer = joy_pipeline.tokenizer 387 | clip_model = joy_pipeline.clip_model 388 | image_adapter = joy_pipeline.image_adapter 389 | text_model = joy_pipeline.text_model 390 | 391 | 392 | 393 | input_image = tensor2pil(image) 394 | 395 | # Preprocess image 396 | # pImge = clip_processor(images=input_image, return_tensors='pt').pixel_values 397 | # pImge = pImge.to(DEVICE) 398 | 399 | image = input_image.resize((384, 384), Image.LANCZOS) 400 | pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0 401 | pixel_values = TVF.normalize(pixel_values, [0.5], [0.5]) 402 | pixel_values = pixel_values.to('cuda') 403 | 404 | # Tokenize the prompt 405 | # prompt = tokenizer.encode(prompt, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False) 406 | # Embed image 407 | 408 | with torch.amp.autocast_mode.autocast('cuda', enabled=True): 409 | vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True) 410 | embedded_images = image_adapter(vision_outputs.hidden_states) 411 | embedded_images = embedded_images.to('cuda') 412 | 413 | 414 | convo = [ 415 | { 416 | "role": "system", 417 | "content": "You are a helpful image 
captioner.", 418 | }, 419 | { 420 | "role": "user", 421 | "content": prompt, 422 | }, 423 | ] 424 | 425 | convo_string = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True) 426 | assert isinstance(convo_string, str) 427 | 428 | convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False) 429 | prompt_tokens = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False, truncation=False) 430 | assert isinstance(convo_tokens, torch.Tensor) and isinstance(prompt_tokens, torch.Tensor) 431 | convo_tokens = convo_tokens.squeeze(0) # Squeeze just to make the following easier 432 | prompt_tokens = prompt_tokens.squeeze(0) 433 | 434 | eot_id_indices = (convo_tokens == tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[ 435 | 0].tolist() 436 | assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}" 437 | 438 | preamble_len = eot_id_indices[1] - prompt_tokens.shape[0] # Number of tokens before the prompt 439 | 440 | 441 | # text_model = joy_two_pipeline.llm.load_llm_model(joy_two_pipeline.model) 442 | # Embed the tokens 443 | convo_embeds = text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to('cuda')) 444 | 445 | input_embeds = torch.cat([ 446 | convo_embeds[:, :preamble_len], # Part before the prompt 447 | embedded_images.to(dtype=convo_embeds.dtype), # Image 448 | convo_embeds[:, preamble_len:], # The prompt and anything after it 449 | ], dim=1).to('cuda') 450 | 451 | input_ids = torch.cat([ 452 | convo_tokens[:preamble_len].unsqueeze(0), 453 | torch.zeros((1, embedded_images.shape[1]), dtype=torch.long), 454 | # Dummy tokens for the image (TODO: Should probably use a special token here so as not to confuse any generation algorithms that might be inspecting the input) 455 | convo_tokens[preamble_len:].unsqueeze(0), 456 | ], dim=1).to('cuda') 457 | attention_mask = torch.ones_like(input_ids) 458 | 459 | generate_ids = text_model.generate(input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, 460 | max_new_tokens=max_new_tokens, do_sample=True, 461 | suppress_tokens=None) # Uses the default which is temp=0.6, top_p=0.9 462 | 463 | 464 | generate_ids = generate_ids[:, input_ids.shape[1]:] 465 | if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids( 466 | "<|eot_id|>"): 467 | generate_ids = generate_ids[:, :-1] 468 | 469 | caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0] 470 | 471 | if cache == False: 472 | joy_pipeline.parent.clearCache() 473 | torch.cuda.empty_cache() 474 | import gc 475 | gc.collect() 476 | if low_vram: 477 | unload_all_models() 478 | soft_empty_cache() 479 | 480 | return (caption.strip(), ) 481 | 482 | 483 | # ===============批量打标============= 484 | class Joy_caption_alpha_batch: 485 | 486 | def __init__(self): 487 | pass 488 | @classmethod 489 | def INPUT_TYPES(s): 490 | return { 491 | "required": { 492 | "JoyPipeline_alpha": ("JoyPipeline_alpha",), 493 | "img_dir": ("STRING", {"multiline": True, "default": ""},), 494 | "save_dir": ("STRING", {"multiline": True, "default": ""},), 495 | "trigger": ("STRING", {"multiline": False, "default": "trigger"},), 496 | "prompt": ("STRING", {"multiline": True, "default": "A descriptive caption for this image"},), 497 | "format": (["png", "jpg"],), 498 | "max_new_tokens":("INT", {"default": 1024, "min": 10, "max": 4096, "step": 1}), 499 | "temperature": ("FLOAT", 
{"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}), 500 | "cache": ("BOOLEAN", {"default": False}), 501 | "low_vram": ("BOOLEAN", {"default": False}), 502 | "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}), 503 | } 504 | } 505 | 506 | CATEGORY = "CXH/LLM" 507 | RETURN_TYPES = ("STRING",) 508 | FUNCTION = "gen" 509 | def gen(self,JoyPipeline_alpha,img_dir,save_dir,trigger,prompt,format,max_new_tokens,temperature,cache,low_vram,seed): 510 | 511 | torch.cuda.empty_cache() 512 | directory = img_dir 513 | if low_vram : 514 | unload_all_models() 515 | 516 | joy_pipeline = JoyPipeline_alpha 517 | if joy_pipeline.clip_processor == None : 518 | joy_pipeline.parent.loadCheckPoint() 519 | 520 | clip_processor = joy_pipeline.clip_processor 521 | tokenizer = joy_pipeline.tokenizer 522 | clip_model = joy_pipeline.clip_model 523 | image_adapter = joy_pipeline.image_adapter 524 | text_model = joy_pipeline.text_model 525 | 526 | # 批量读取 527 | if not os.path.isdir(directory): 528 | raise FileNotFoundError(f"Directory '{directory}' cannot be found.") 529 | dir_files = os.listdir(directory) 530 | if len(dir_files) == 0: 531 | raise FileNotFoundError(f"No files in directory '{directory}'.") 532 | 533 | valid_extensions = ['.jpg', '.jpeg', '.png', '.webp'] 534 | dir_files = [f for f in dir_files if any(f.lower().endswith(ext) for ext in valid_extensions)] 535 | 536 | dir_files = sorted(dir_files) 537 | dir_files = [os.path.join(directory, x) for x in dir_files] 538 | 539 | # 创建保存目录 540 | if not os.path.exists(save_dir): 541 | os.makedirs(save_dir) 542 | 543 | convo = [ 544 | { 545 | "role": "system", 546 | "content": "You are a helpful image captioner.", 547 | }, 548 | { 549 | "role": "user", 550 | "content": prompt, 551 | }, 552 | ] 553 | 554 | convo_string = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True) 555 | assert isinstance(convo_string, str) 556 | 557 | convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False) 558 | prompt_tokens = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False, truncation=False) 559 | assert isinstance(convo_tokens, torch.Tensor) and isinstance(prompt_tokens, torch.Tensor) 560 | convo_tokens = convo_tokens.squeeze(0) # Squeeze just to make the following easier 561 | prompt_tokens = prompt_tokens.squeeze(0) 562 | 563 | eot_id_indices = (convo_tokens == tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[ 564 | 0].tolist() 565 | assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}" 566 | 567 | preamble_len = eot_id_indices[1] - prompt_tokens.shape[0] # Number of tokens before the prompt 568 | 569 | 570 | # text_model = joy_two_pipeline.llm.load_llm_model(joy_two_pipeline.model) 571 | # Embed the tokens 572 | convo_embeds = text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to('cuda')) 573 | 574 | index1 = 0 575 | for image_path in dir_files: 576 | if os.path.isdir(image_path) and os.path.ex: 577 | continue 578 | start = time.time() 579 | 580 | input_image = open_image(image_path) 581 | input_image = ImageOps.exif_transpose(input_image) 582 | input_image = input_image.convert("RGB") 583 | 584 | 585 | image = input_image.resize((384, 384), Image.LANCZOS) 586 | pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0 587 | pixel_values = TVF.normalize(pixel_values, [0.5], [0.5]) 588 | pixel_values = pixel_values.to('cuda') 589 | 590 | 591 | with torch.amp.autocast_mode.autocast('cuda', enabled=True): 
592 | vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True) 593 | embedded_images = image_adapter(vision_outputs.hidden_states) 594 | embedded_images = embedded_images.to('cuda') 595 | 596 | input_embeds = torch.cat([ 597 | convo_embeds[:, :preamble_len], # Part before the prompt 598 | embedded_images.to(dtype=convo_embeds.dtype), # Image 599 | convo_embeds[:, preamble_len:], # The prompt and anything after it 600 | ], dim=1).to('cuda') 601 | 602 | input_ids = torch.cat([ 603 | convo_tokens[:preamble_len].unsqueeze(0), 604 | torch.zeros((1, embedded_images.shape[1]), dtype=torch.long), 605 | # Dummy tokens for the image (TODO: Should probably use a special token here so as not to confuse any generation algorithms that might be inspecting the input) 606 | convo_tokens[preamble_len:].unsqueeze(0), 607 | ], dim=1).to('cuda') 608 | attention_mask = torch.ones_like(input_ids) 609 | 610 | generate_ids = text_model.generate(input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, 611 | max_new_tokens=max_new_tokens, do_sample=True, 612 | suppress_tokens=None) # Uses the default which is temp=0.6, top_p=0.9 613 | 614 | 615 | generate_ids = generate_ids[:, input_ids.shape[1]:] 616 | if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids( 617 | "<|eot_id|>"): 618 | generate_ids = generate_ids[:, :-1] 619 | 620 | caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0] 621 | # 提示词 622 | lenName = str(index1) 623 | txt_content = trigger + "," + caption.strip() 624 | txt_file_name = f"{trigger}_{lenName}.txt" 625 | txt_save_path = os.path.join(save_dir, txt_file_name) 626 | try: 627 | with open(txt_save_path, 'w', encoding='utf-8') as file: 628 | file.write(txt_content) 629 | except IOError as e: 630 | print(f"保存文件时发生错误: {e}") 631 | # 图片 632 | img_file_name = f"{trigger}_{lenName}.{format}" 633 | if format != "png": 634 | if input_image.mode == "RGBA": 635 | input_image = input_image.convert("RGB") 636 | img_save_path = os.path.join(save_dir, img_file_name) 637 | input_image.save(img_save_path) 638 | end = time.time() 639 | execution_time = calculate_seconds_difference(start, end) 640 | temp = f":{execution_time:.3f}s" 641 | index1 = index1 + 1 642 | print(str(index1)+"/"+str(len(dir_files)) +":"+temp) 643 | print("finish结束") 644 | 645 | if cache == False: 646 | joy_pipeline.parent.clearCache() 647 | torch.cuda.empty_cache() 648 | import gc 649 | gc.collect() 650 | if low_vram: 651 | unload_all_models() 652 | soft_empty_cache() 653 | lenName = len(os.listdir(save_dir)) 654 | return (str(lenName/2), ) 655 | 656 | # ===============批量打标============= 657 | def get_subdirectories(directory): 658 | # 检查目录是否存在 659 | if not os.path.isdir(directory): 660 | raise FileNotFoundError(f"Directory '{directory}' cannot be found.") 661 | 662 | # 获取目录中的所有文件夹 663 | subdirectories = [name for name in os.listdir(directory) if os.path.isdir(os.path.join(directory, name))] 664 | return subdirectories 665 | 666 | def get_trigger_from_string(s): 667 | # Split the string by the underscore character 668 | parts = s.split('_') 669 | # Check if the length of the parts is at least 2 670 | if len(parts) >= 2: 671 | # Return the second part which is the trigger 672 | return parts[1] 673 | else: 674 | # Return None if the format is not as expected 675 | return None 676 | 677 | class Joy_caption_alpha_batch_Dirs: 678 | 679 | def __init__(self): 680 | pass 681 | @classmethod 682 | def 
INPUT_TYPES(s): 683 | return { 684 | "required": { 685 | "JoyPipeline_alpha": ("JoyPipeline_alpha",), 686 | "img_dir": ("STRING", {"multiline": True, "default": ""},), 687 | "save_dir": ("STRING", {"multiline": True, "default": ""},), 688 | "prompt": ("STRING", {"multiline": True, "default": "A descriptive caption for this image"},), 689 | "format": (["png", "jpg"],), 690 | "max_new_tokens":("INT", {"default": 1024, "min": 10, "max": 4096, "step": 1}), 691 | "temperature": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}), 692 | "cache": ("BOOLEAN", {"default": False}), 693 | "low_vram": ("BOOLEAN", {"default": False}), 694 | "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}), 695 | } 696 | } 697 | 698 | CATEGORY = "CXH/LLM" 699 | RETURN_TYPES = ("STRING",) 700 | FUNCTION = "gen" 701 | def gen(self,JoyPipeline_alpha,img_dir,save_dir,prompt,format,max_new_tokens,temperature,cache,low_vram,seed): 702 | 703 | torch.cuda.empty_cache() 704 | directory = img_dir 705 | if low_vram : 706 | unload_all_models() 707 | 708 | joy_pipeline = JoyPipeline_alpha 709 | if joy_pipeline.clip_processor == None : 710 | joy_pipeline.parent.loadCheckPoint() 711 | 712 | clip_processor = joy_pipeline.clip_processor 713 | tokenizer = joy_pipeline.tokenizer 714 | clip_model = joy_pipeline.clip_model 715 | image_adapter = joy_pipeline.image_adapter 716 | text_model = joy_pipeline.text_model 717 | 718 | # 批量读取 719 | if not os.path.isdir(directory): 720 | raise FileNotFoundError(f"Directory '{directory}' cannot be found.") 721 | 722 | convo = [ 723 | { 724 | "role": "system", 725 | "content": "You are a helpful image captioner.", 726 | }, 727 | { 728 | "role": "user", 729 | "content": prompt, 730 | }, 731 | ] 732 | convo_string = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True) 733 | assert isinstance(convo_string, str) 734 | 735 | convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False) 736 | prompt_tokens = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False, truncation=False) 737 | assert isinstance(convo_tokens, torch.Tensor) and isinstance(prompt_tokens, torch.Tensor) 738 | convo_tokens = convo_tokens.squeeze(0) # Squeeze just to make the following easier 739 | prompt_tokens = prompt_tokens.squeeze(0) 740 | 741 | eot_id_indices = (convo_tokens == tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[ 742 | 0].tolist() 743 | assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}" 744 | 745 | preamble_len = eot_id_indices[1] - prompt_tokens.shape[0] # Number of tokens before the prompt 746 | 747 | 748 | # text_model = joy_two_pipeline.llm.load_llm_model(joy_two_pipeline.model) 749 | # Embed the tokens 750 | convo_embeds = text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to('cuda')) 751 | subdirs = get_subdirectories(directory) 752 | 753 | for subdir in subdirs: 754 | print("开始文件夹:"+subdir) 755 | subdir_path = os.path.join(directory, subdir) 756 | if not os.path.isdir(subdir_path): 757 | continue 758 | dir_files = os.listdir(subdir_path) 759 | if len(dir_files) == 0: 760 | raise FileNotFoundError(f"No files in directory '{directory}'.") 761 | 762 | valid_extensions = ['.jpg', '.jpeg', '.png', '.webp'] 763 | dir_files = [f for f in dir_files if any(f.lower().endswith(ext) for ext in valid_extensions)] 764 | 765 | dir_files = sorted(dir_files) 766 | dir_files = [os.path.join(subdir_path, x) for x in dir_files] 767 | 768 | # 创建保存目录 
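            # Sub-folder naming rule from the README: "<name>_<trigger>" (名字_trigg). The helper
            # get_trigger_from_string() defined above returns the second "_"-separated token, e.g.:
            #     get_trigger_from_string("dress_mylora")  # -> "mylora"; captions are saved as "mylora,<caption>"
            #     get_trigger_from_string("portraits")     # -> None; captions are saved without a trigger prefix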
769 | if not os.path.exists(save_dir): 770 | os.makedirs(save_dir) 771 | if not os.path.exists(os.path.join(save_dir,subdir)): 772 | os.makedirs(os.path.join(save_dir,subdir)) 773 | 774 | index1 = 0 775 | for image_path in dir_files: 776 | if os.path.isdir(image_path) and os.path.ex: 777 | continue 778 | start = time.time() 779 | # print(image_path) 780 | input_image = open_image(image_path) 781 | input_image = ImageOps.exif_transpose(input_image) 782 | input_image = input_image.convert("RGB") 783 | 784 | 785 | image = input_image.resize((384, 384), Image.LANCZOS) 786 | pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0 787 | pixel_values = TVF.normalize(pixel_values, [0.5], [0.5]) 788 | pixel_values = pixel_values.to('cuda') 789 | 790 | 791 | with torch.amp.autocast_mode.autocast('cuda', enabled=True): 792 | vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True) 793 | embedded_images = image_adapter(vision_outputs.hidden_states) 794 | embedded_images = embedded_images.to('cuda') 795 | 796 | input_embeds = torch.cat([ 797 | convo_embeds[:, :preamble_len], # Part before the prompt 798 | embedded_images.to(dtype=convo_embeds.dtype), # Image 799 | convo_embeds[:, preamble_len:], # The prompt and anything after it 800 | ], dim=1).to('cuda') 801 | 802 | input_ids = torch.cat([ 803 | convo_tokens[:preamble_len].unsqueeze(0), 804 | torch.zeros((1, embedded_images.shape[1]), dtype=torch.long), 805 | # Dummy tokens for the image (TODO: Should probably use a special token here so as not to confuse any generation algorithms that might be inspecting the input) 806 | convo_tokens[preamble_len:].unsqueeze(0), 807 | ], dim=1).to('cuda') 808 | attention_mask = torch.ones_like(input_ids) 809 | 810 | generate_ids = text_model.generate(input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, 811 | max_new_tokens=max_new_tokens, do_sample=True, 812 | suppress_tokens=None) # Uses the default which is temp=0.6, top_p=0.9 813 | 814 | 815 | generate_ids = generate_ids[:, input_ids.shape[1]:] 816 | if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids( 817 | "<|eot_id|>"): 818 | generate_ids = generate_ids[:, :-1] 819 | 820 | caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0] 821 | # 提示词 822 | lenName = str(index1) 823 | trigger = get_trigger_from_string(subdir) 824 | if trigger is not None: 825 | txt_content = trigger + "," + caption.strip() 826 | txt_file_name = f"{trigger}_{lenName}.txt" 827 | txt_save_path = os.path.join(save_dir,subdir, txt_file_name) 828 | img_file_name = f"{trigger}_{lenName}.{format}" 829 | else: 830 | txt_content = caption.strip() 831 | txt_file_name = f"{lenName}.txt" 832 | txt_save_path = os.path.join(save_dir,subdir, txt_file_name) 833 | img_file_name = f"{lenName}.{format}" 834 | try: 835 | with open(txt_save_path, 'w', encoding='utf-8') as file: 836 | file.write(txt_content) 837 | except IOError as e: 838 | print(f"保存文件时发生错误: {e}") 839 | # 图片 840 | 841 | if format != "png": 842 | if input_image.mode == "RGBA": 843 | input_image = input_image.convert("RGB") 844 | img_save_path = os.path.join(save_dir,subdir, img_file_name) 845 | input_image.save(img_save_path) 846 | end = time.time() 847 | execution_time = calculate_seconds_difference(start, end) 848 | temp = f":{execution_time:.3f}s" 849 | index1 = index1 + 1 850 | print(str(index1)+"/"+str(len(dir_files)) +":"+temp) 851 | print("结束"+subdir) 852 | index1 = 0 853 | 854 | 
if cache == False: 855 | joy_pipeline.parent.clearCache() 856 | torch.cuda.empty_cache() 857 | import gc 858 | gc.collect() 859 | if low_vram: 860 | unload_all_models() 861 | soft_empty_cache() 862 | lenName = len(os.listdir(save_dir)) 863 | return (str(lenName/2), ) 864 | 865 | -------------------------------------------------------------------------------- /Joy_caption_node.py: -------------------------------------------------------------------------------- 1 | 2 | from huggingface_hub import InferenceClient 3 | from torch import nn 4 | from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM 5 | from pathlib import Path 6 | import torch 7 | import torch.amp.autocast_mode 8 | from PIL import Image 9 | import os 10 | import folder_paths 11 | 12 | from .lib.ximg import * 13 | from .lib.xmodel import * 14 | 15 | from model_management import get_torch_device 16 | DEVICE = get_torch_device() 17 | # def get_torch_device(): 18 | # """ 19 | # 返回PyTorch模型应该运行的设备(CPU或GPU) 20 | # 如果系统支持CUDA并且至少有一个GPU可用,则返回GPU设备;否则返回CPU设备。 21 | # """ 22 | # if torch.cuda.is_available(): 23 | # # 选择第一个可用的GPU 24 | # device = torch.device("cuda:0") 25 | # print(f"There are {torch.cuda.device_count()} GPU(s) available.") 26 | # print(f"We will use the GPU: {device}") 27 | # else: 28 | # # 如果没有GPU可用,则使用CPU 29 | # device = torch.device("cpu") 30 | # print("No GPU available, using the CPU instead.") 31 | # return device 32 | 33 | class JoyPipeline: 34 | def __init__(self): 35 | self.clip_model = None 36 | self.clip_processor =None 37 | self.tokenizer = None 38 | self.text_model = None 39 | self.image_adapter = None 40 | self.parent = None 41 | 42 | def clearCache(self): 43 | self.clip_model = None 44 | self.clip_processor =None 45 | self.tokenizer = None 46 | self.text_model = None 47 | self.image_adapter = None 48 | 49 | 50 | class ImageAdapter(nn.Module): 51 | def __init__(self, input_features: int, output_features: int): 52 | super().__init__() 53 | self.linear1 = nn.Linear(input_features, output_features) 54 | self.activation = nn.GELU() 55 | self.linear2 = nn.Linear(output_features, output_features) 56 | 57 | def forward(self, vision_outputs: torch.Tensor): 58 | x = self.linear1(vision_outputs) 59 | x = self.activation(x) 60 | x = self.linear2(x) 61 | return x 62 | 63 | class Joy_caption_load: 64 | 65 | def __init__(self): 66 | self.model = None 67 | self.pipeline = JoyPipeline() 68 | self.pipeline.parent = self 69 | pass 70 | 71 | @classmethod 72 | def INPUT_TYPES(s): 73 | return { 74 | "required": { 75 | "model": (["unsloth/Meta-Llama-3.1-8B-bnb-4bit", "meta-llama/Meta-Llama-3.1-8B"],), 76 | 77 | } 78 | } 79 | 80 | CATEGORY = "CXH/LLM" 81 | RETURN_TYPES = ("JoyPipeline",) 82 | FUNCTION = "gen" 83 | 84 | def loadCheckPoint(self): 85 | # 清除一波 86 | if self.pipeline != None: 87 | self.pipeline.clearCache() 88 | 89 | # clip 90 | model_id = "google/siglip-so400m-patch14-384" 91 | CLIP_PATH = download_hg_model(model_id,"clip") 92 | 93 | clip_processor = AutoProcessor.from_pretrained(CLIP_PATH) 94 | clip_model = AutoModel.from_pretrained( 95 | CLIP_PATH, 96 | trust_remote_code=True 97 | ) 98 | 99 | clip_model = clip_model.vision_model 100 | clip_model.eval() 101 | clip_model.requires_grad_(False) 102 | clip_model.to("cuda") 103 | 104 | 105 | # LLM 106 | MODEL_PATH = download_hg_model(self.model,"LLM") 107 | tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH,use_fast=False) 108 | assert isinstance(tokenizer, PreTrainedTokenizer) or 
isinstance(tokenizer, PreTrainedTokenizerFast), f"Tokenizer is of type {type(tokenizer)}" 109 | 110 | text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto",trust_remote_code=True) 111 | text_model.eval() 112 | 113 | # Image Adapter 114 | adapter_path = os.path.join(folder_paths.models_dir,"Joy_caption","image_adapter.pt") 115 | 116 | image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size) # ImageAdapter(clip_model.config.hidden_size, 4096) 117 | image_adapter.load_state_dict(torch.load(adapter_path, map_location="cpu")) 118 | adjusted_adapter = image_adapter #AdjustedImageAdapter(image_adapter, text_model.config.hidden_size) 119 | adjusted_adapter.eval() 120 | adjusted_adapter.to("cuda") 121 | 122 | self.pipeline.clip_model = clip_model 123 | self.pipeline.clip_processor = clip_processor 124 | self.pipeline.tokenizer = tokenizer 125 | self.pipeline.text_model = text_model 126 | self.pipeline.image_adapter = adjusted_adapter 127 | 128 | def clearCache(self): 129 | if self.pipeline != None: 130 | self.pipeline.clearCache() 131 | 132 | def gen(self,model): 133 | if self.model == None or self.model != model or self.pipeline == None: 134 | self.model = model 135 | self.loadCheckPoint() 136 | return (self.pipeline,) 137 | 138 | class Joy_caption: 139 | 140 | def __init__(self): 141 | pass 142 | 143 | @classmethod 144 | def INPUT_TYPES(s): 145 | return { 146 | "required": { 147 | "joy_pipeline": ("JoyPipeline",), 148 | "image": ("IMAGE",), 149 | "prompt": ("STRING", {"multiline": True, "default": "A descriptive caption for this image"},), 150 | "max_new_tokens":("INT", {"default": 1024, "min": 10, "max": 4096, "step": 1}), 151 | "temperature": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}), 152 | "cache": ("BOOLEAN", {"default": False}), 153 | } 154 | } 155 | 156 | CATEGORY = "CXH/LLM" 157 | RETURN_TYPES = ("STRING",) 158 | FUNCTION = "gen" 159 | def gen(self,joy_pipeline,image,prompt,max_new_tokens,temperature,cache): 160 | 161 | if joy_pipeline.clip_processor == None : 162 | joy_pipeline.parent.loadCheckPoint() 163 | 164 | clip_processor = joy_pipeline.clip_processor 165 | tokenizer = joy_pipeline.tokenizer 166 | clip_model = joy_pipeline.clip_model 167 | image_adapter = joy_pipeline.image_adapter 168 | text_model = joy_pipeline.text_model 169 | 170 | 171 | 172 | input_image = tensor2pil(image) 173 | 174 | # Preprocess image 175 | pImge = clip_processor(images=input_image, return_tensors='pt').pixel_values 176 | pImge = pImge.to(DEVICE) 177 | 178 | # Tokenize the prompt 179 | prompt = tokenizer.encode(prompt, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False) 180 | # Embed image 181 | with torch.amp.autocast_mode.autocast(str(DEVICE), enabled=True): 182 | vision_outputs = clip_model(pixel_values=pImge, output_hidden_states=True) 183 | image_features = vision_outputs.hidden_states[-2] 184 | embedded_images = image_adapter(image_features) 185 | embedded_images = embedded_images.to(DEVICE) 186 | 187 | # Embed prompt 188 | prompt_embeds = text_model.model.embed_tokens(prompt.to(DEVICE)) 189 | assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}" 190 | embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64)) 191 | 192 | # Construct prompts 193 | inputs_embeds = torch.cat([ 194 | 
embedded_bos.expand(embedded_images.shape[0], -1, -1), 195 | embedded_images.to(dtype=embedded_bos.dtype), 196 | prompt_embeds.expand(embedded_images.shape[0], -1, -1), 197 | ], dim=1) 198 | 199 | input_ids = torch.cat([ 200 | torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long), 201 | torch.zeros((1, embedded_images.shape[1]), dtype=torch.long), 202 | prompt, 203 | ], dim=1).to(DEVICE) 204 | attention_mask = torch.ones_like(input_ids) 205 | 206 | generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=max_new_tokens, do_sample=True, top_k=10, temperature=temperature, suppress_tokens=None) 207 | 208 | # Trim off the prompt 209 | generate_ids = generate_ids[:, input_ids.shape[1]:] 210 | if generate_ids[0][-1] == tokenizer.eos_token_id: 211 | generate_ids = generate_ids[:, :-1] 212 | 213 | caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0] 214 | r = caption.strip() 215 | 216 | if cache == False: 217 | joy_pipeline.parent.clearCache() 218 | 219 | return (r,) 220 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 添加批量文件夹打标:文件夹命名规则 名字_trigg 2 | ![workflow](https://github.com/user-attachments/assets/d30a2d7f-918a-4837-b85c-be01913d2775) 3 | ![1737366489766](https://github.com/user-attachments/assets/cb885492-a158-49bf-ba2e-956a1ba2d780) 4 | 5 | 6 | .20240-10-30 添加批量图片分类 7 | 8 | ![workflow_min2 6classifiy_](https://github.com/user-attachments/assets/1687cc01-89c4-4628-8f8c-abc641c62a43) 9 | 10 | 11 | .2024-10-16 添加批量打标:4090大概4~5秒一张图 12 | 13 | ![批量打标](https://github.com/user-attachments/assets/15e4075b-ed78-4e88-b586-09f65483c991) 14 | 15 | ![1729064090078](https://github.com/user-attachments/assets/bb61ac24-5bec-4018-98cf-8007533d4dbc) 16 | 17 | .2024-10-12 添加joy alpha2 18 | 19 | 模型下载:https://pan.baidu.com/s/1dOjbUEacUOhzFitAQ3uIeQ?pwd=4ypv#list/path=%2F 20 | 21 | Joy_caption_alpha 放到 models\Joy_caption_alpha 下载:https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-two/tree/main/cgrkzexw-599808 22 | 23 | ![1728728834716](https://github.com/user-attachments/assets/3adc7c92-1247-436e-8589-f5c64d33378e) 24 | 25 | 26 | ![joy_alpha](https://github.com/user-attachments/assets/4ab7de6a-405e-405b-b03e-0850522e3951) 27 | 28 | 29 | .2024-9-9 florence2 Add Florence-2-large-PromptGen-v1.5 and MiniCPM3-4B(CXH_MinCP3_4B_Load CXH_MinCP3_4B_Chat) 30 | MiniCPM3-4B聊天 翻译,改写都很强 31 | 32 | .2024-9-6 florence2 Add Florence-2-base-PromptGen-v1.5 33 | 34 | .2024-9-2 更新批量打标案例(Update batch marking cases) 速度:florence2 list[str]: 23 | if not str(filename).endswith("modeling_florence2.py"): 24 | return get_imports(filename) 25 | imports = get_imports(filename) 26 | # imports.remove("flash_attn") 27 | return imports 28 | 29 | 30 | import comfy.model_management as mm 31 | from comfy.utils import ProgressBar 32 | import folder_paths 33 | 34 | script_directory = os.path.dirname(os.path.abspath(__file__)) 35 | 36 | from transformers import AutoModelForCausalLM, AutoProcessor 37 | 38 | class CXH_DownloadAndLoadFlorence2Model: 39 | @classmethod 40 | def INPUT_TYPES(s): 41 | return {"required": { 42 | "model": ( 43 | [ 44 | 'microsoft/Florence-2-base', 45 | 'microsoft/Florence-2-base-ft', 46 | 'microsoft/Florence-2-large', 47 | 'microsoft/Florence-2-large-ft', 48 | 'HuggingFaceM4/Florence-2-DocVQA', 49 | 'thwri/CogFlorence-2-Large-Freeze', 50 | 'thwri/CogFlorence-2.2-Large', 51 | 'MiaoshouAI/Florence-2-base-PromptGen-v1.5', 52 | 'MiaoshouAI/Florence-2-large-PromptGen-v1.5' 53 | ], 54 | { 55 | "default": 'MiaoshouAI/Florence-2-large-PromptGen-v1.5' 56 | }), 57 | "precision": ([ 'fp16','bf16','fp32'], 58 | { 59 | "default": 'fp16' 60 | }), 61 | "attention": ( 62 | [ 'flash_attention_2', 'sdpa', 'eager'], 63 | { 64 | "default": 'sdpa' 65 | }), 66 | 67 | }, 68 | } 69 | 70 | RETURN_TYPES = ("FL2MODEL",) 71 | RETURN_NAMES = ("florence2_model",) 72 | FUNCTION = "loadmodel" 73 | CATEGORY = "CXH/LLM" 74 | 75 | def loadmodel(self, model, precision, attention): 76 | device = mm.get_torch_device() 77 | offload_device = mm.unet_offload_device() 78 | dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] 79 | 80 | model_name = model.rsplit('/', 1)[-1] 81 | model_path = os.path.join(folder_paths.models_dir, "LLM", model_name) 82 | 83 | if not os.path.exists(model_path): 84 | print(f"Downloading Lumina model to: {model_path}") 85 | from huggingface_hub import snapshot_download 86 | snapshot_download(repo_id=model, 87 | 
local_dir=model_path, 88 | local_dir_use_symlinks=False) 89 | 90 | print(f"using {attention} for attention") 91 | with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports): #workaround for unnecessary flash_attn requirement 92 | model = AutoModelForCausalLM.from_pretrained(model_path, attn_implementation=attention, device_map=device, torch_dtype=dtype,trust_remote_code=True) 93 | processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) 94 | 95 | florence2_model = { 96 | 'model': model, 97 | 'processor': processor, 98 | 'dtype': dtype 99 | } 100 | 101 | return (florence2_model,) 102 | 103 | def calculate_bounding_box(width, height, flat_points) -> List[float]: 104 | """ 105 | Calculate the bounding box for a polygon. 106 | 107 | Args: 108 | flat_points (list of int): Flat list of x, y coordinates defining the polygon points. 109 | 110 | Returns: 111 | tuple: (min_x, min_y, max_x, max_y) defining the bounding box. 112 | """ 113 | if not flat_points or len(flat_points) % 2 != 0: 114 | raise ValueError("The list of points must be non-empty and have an even number of elements") 115 | 116 | x_coords = flat_points[0::2] 117 | y_coords = flat_points[1::2] 118 | 119 | min_x = min(x_coords) 120 | max_x = max(x_coords) 121 | min_y = min(y_coords) 122 | max_y = max(y_coords) 123 | 124 | return [min_x / width, min_y / height, max_x / width, max_y / height] 125 | 126 | class CXH_Florence2Run: 127 | @classmethod 128 | def INPUT_TYPES(s): 129 | return { 130 | "required": { 131 | "image": ("IMAGE", ), 132 | "florence2_model": ("FL2MODEL", ), 133 | "text_input": ("STRING", {"default": "", "multiline": True}), 134 | "task": ( 135 | [ 136 | 'region_caption', 137 | 'dense_region_caption', 138 | 'region_proposal', 139 | 'caption', 140 | 'detailed_caption', 141 | 'more_detailed_caption', 142 | 'caption_to_phrase_grounding', 143 | 'referring_expression_segmentation', 144 | 'ocr', 145 | 'ocr_with_region', 146 | 'docvqa', 147 | 'mixed_caption(PromptGen 1.5)', 148 | 'generate_tags(PromptGen 1.5)' 149 | ], 150 | { 151 | "default": 'more_detailed_caption' 152 | } 153 | ), 154 | "fill_mask": ("BOOLEAN", {"default": True}), 155 | "keep_model_loaded": ("BOOLEAN", {"default": False}), 156 | "max_new_tokens": ("INT", {"default": 1024, "min": 1, "max": 4096}), 157 | "num_beams": ("INT", {"default": 3, "min": 1, "max": 64}), 158 | "do_sample": ("BOOLEAN", {"default": True}), 159 | "output_mask_select": ("STRING", {"default": ""}), 160 | "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}), 161 | } 162 | 163 | } 164 | 165 | RETURN_TYPES = ("IMAGE", "MASK", "STRING", "JSON") 166 | RETURN_NAMES =("image", "mask", "caption", "data") 167 | FUNCTION = "encode" 168 | CATEGORY = "Florence2" 169 | 170 | def encode(self, image, text_input, florence2_model, task, fill_mask,keep_model_loaded, 171 | num_beams, max_new_tokens, do_sample, output_mask_select,seed): 172 | device = mm.get_torch_device() 173 | _, height, width, _ = image.shape 174 | offload_device = mm.unet_offload_device() 175 | annotated_image_tensor = None 176 | mask_tensor = None 177 | processor = florence2_model['processor'] 178 | model = florence2_model['model'] 179 | dtype = florence2_model['dtype'] 180 | model.to(device) 181 | 182 | colormap = ['blue','orange','green','purple','brown','pink','olive','cyan','red', 183 | 'lime','indigo','violet','aqua','magenta','gold','tan','skyblue'] 184 | 185 | prompts = { 186 | 'region_caption': '', 187 | 'dense_region_caption': '', 188 | 'region_proposal': '', 189 | 'caption': '', 
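            # Note (assumption): Florence-2 pipelines normally map each task name to a
            # special task token such as '<CAPTION>' or '<OD>'; with the empty strings
            # kept here, task_prompt resolves to '' and only the free-form text_input
            # (if any) reaches the model.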
190 | 'detailed_caption': '', 191 | 'more_detailed_caption': '', 192 | 'caption_to_phrase_grounding': '', 193 | 'referring_expression_segmentation': '', 194 | 'ocr': '', 195 | 'ocr_with_region': '', 196 | 'docvqa': '', 197 | 'mixed_caption(PromptGen 1.5)':'', 198 | 'generate_tags(PromptGen 1.5)':'' 199 | } 200 | task_prompt = prompts.get(task, '') 201 | 202 | # if (task not in ['referring_expression_segmentation', 'caption_to_phrase_grounding', 'docvqa']) and text_input: 203 | # raise ValueError("Text input (prompt) is only supported for 'referring_expression_segmentation', 'caption_to_phrase_grounding', and 'docvqa'") 204 | 205 | if text_input != "": 206 | prompt = task_prompt + " " + text_input 207 | else: 208 | prompt = task_prompt 209 | 210 | image = image.permute(0, 3, 1, 2) 211 | 212 | out = [] 213 | out_masks = [] 214 | out_results = [] 215 | out_data = [] 216 | pbar = ProgressBar(len(image)) 217 | for img in image: 218 | image_pil = F.to_pil_image(img) 219 | inputs = processor(text=prompt, images=image_pil, return_tensors="pt", do_rescale=False).to(dtype).to(device) 220 | 221 | generated_ids = model.generate( 222 | input_ids=inputs["input_ids"], 223 | pixel_values=inputs["pixel_values"], 224 | max_new_tokens=max_new_tokens, 225 | do_sample=do_sample, 226 | num_beams=num_beams, 227 | ) 228 | 229 | results = processor.batch_decode(generated_ids, skip_special_tokens=False)[0] 230 | print(results) 231 | # cleanup the special tokens from the final list 232 | if task == 'ocr_with_region': 233 | clean_results = str(results) 234 | cleaned_string = re.sub(r'|<[^>]*>', '\n', clean_results) 235 | clean_results = re.sub(r'\n+', '\n', cleaned_string) 236 | else: 237 | clean_results = str(results) 238 | clean_results = clean_results.replace('', '') 239 | clean_results = clean_results.replace('', '') 240 | 241 | #return single string if only one image for compatibility with nodes that can't handle string lists 242 | if len(image) == 1: 243 | out_results = clean_results 244 | else: 245 | out_results.append(clean_results) 246 | 247 | W, H = image_pil.size 248 | 249 | parsed_answer = processor.post_process_generation(results, task=task_prompt, image_size=(W, H)) 250 | 251 | if task == 'region_caption' or task == 'dense_region_caption' or task == 'caption_to_phrase_grounding' or task == 'region_proposal': 252 | fig, ax = plt.subplots(figsize=(W / 100, H / 100), dpi=100) 253 | fig.subplots_adjust(left=0, right=1, top=1, bottom=0) 254 | ax.imshow(image_pil) 255 | bboxes = parsed_answer[task_prompt]['bboxes'] 256 | labels = parsed_answer[task_prompt]['labels'] 257 | 258 | mask_indexes = [] 259 | # Determine mask indexes outside the loop 260 | if output_mask_select != "": 261 | mask_indexes = [n for n in output_mask_select.split(",")] 262 | print(mask_indexes) 263 | else: 264 | mask_indexes = [str(i) for i in range(len(bboxes))] 265 | 266 | # Initialize mask_layer only if needed 267 | if fill_mask: 268 | mask_layer = Image.new('RGB', image_pil.size, (0, 0, 0)) 269 | mask_draw = ImageDraw.Draw(mask_layer) 270 | 271 | for index, (bbox, label) in enumerate(zip(bboxes, labels)): 272 | # Modify the label to include the index 273 | indexed_label = f"{index}.{label}" 274 | 275 | if fill_mask: 276 | if str(index) in mask_indexes: 277 | print("match index:", str(index), "in mask_indexes:", mask_indexes) 278 | mask_draw.rectangle([bbox[0], bbox[1], bbox[2], bbox[3]], fill=(255, 255, 255)) 279 | if label in mask_indexes: 280 | print("match label") 281 | mask_draw.rectangle([bbox[0], bbox[1], bbox[2], bbox[3]], 
fill=(255, 255, 255)) 282 | 283 | # Create a Rectangle patch 284 | rect = patches.Rectangle( 285 | (bbox[0], bbox[1]), # (x,y) - lower left corner 286 | bbox[2] - bbox[0], # Width 287 | bbox[3] - bbox[1], # Height 288 | linewidth=1, 289 | edgecolor='r', 290 | facecolor='none', 291 | label=indexed_label 292 | ) 293 | # Calculate text width with a rough estimation 294 | text_width = len(label) * 6 # Adjust multiplier based on your font size 295 | text_height = 12 # Adjust based on your font size 296 | 297 | # Initial text position 298 | text_x = bbox[0] 299 | text_y = bbox[1] - text_height # Position text above the top-left of the bbox 300 | 301 | # Adjust text_x if text is going off the left or right edge 302 | if text_x < 0: 303 | text_x = 0 304 | elif text_x + text_width > W: 305 | text_x = W - text_width 306 | 307 | # Adjust text_y if text is going off the top edge 308 | if text_y < 0: 309 | text_y = bbox[3] # Move text below the bottom-left of the bbox if it doesn't overlap with bbox 310 | 311 | # Add the rectangle to the plot 312 | ax.add_patch(rect) 313 | facecolor = random.choice(colormap) if len(image) == 1 else 'red' 314 | # Add the label 315 | plt.text( 316 | text_x, 317 | text_y, 318 | indexed_label, 319 | color='white', 320 | fontsize=12, 321 | bbox=dict(facecolor=facecolor, alpha=0.5) 322 | ) 323 | if fill_mask: 324 | mask_tensor = F.to_tensor(mask_layer) 325 | mask_tensor = mask_tensor.unsqueeze(0).permute(0, 2, 3, 1).cpu().float() 326 | mask_tensor = mask_tensor.mean(dim=0, keepdim=True) 327 | mask_tensor = mask_tensor.repeat(1, 1, 1, 3) 328 | mask_tensor = mask_tensor[:, :, :, 0] 329 | out_masks.append(mask_tensor) 330 | 331 | # Remove axis and padding around the image 332 | ax.axis('off') 333 | ax.margins(0,0) 334 | ax.get_xaxis().set_major_locator(plt.NullLocator()) 335 | ax.get_yaxis().set_major_locator(plt.NullLocator()) 336 | fig.canvas.draw() 337 | buf = io.BytesIO() 338 | plt.savefig(buf, format='png', pad_inches=0) 339 | buf.seek(0) 340 | annotated_image_pil = Image.open(buf) 341 | 342 | annotated_image_tensor = F.to_tensor(annotated_image_pil) 343 | out_tensor = annotated_image_tensor[:3, :, :].unsqueeze(0).permute(0, 2, 3, 1).cpu().float() 344 | out.append(out_tensor) 345 | 346 | 347 | pbar.update(1) 348 | 349 | plt.close(fig) 350 | 351 | elif task == 'referring_expression_segmentation': 352 | # Create a new black image 353 | mask_image = Image.new('RGB', (W, H), 'black') 354 | mask_draw = ImageDraw.Draw(mask_image) 355 | 356 | predictions = parsed_answer[task_prompt] 357 | 358 | # Iterate over polygons and labels 359 | for polygons, label in zip(predictions['polygons'], predictions['labels']): 360 | color = random.choice(colormap) 361 | for _polygon in polygons: 362 | _polygon = np.array(_polygon).reshape(-1, 2) 363 | # Clamp polygon points to image boundaries 364 | _polygon = np.clip(_polygon, [0, 0], [W - 1, H - 1]) 365 | if len(_polygon) < 3: 366 | print('Invalid polygon:', _polygon) 367 | continue 368 | 369 | _polygon = _polygon.reshape(-1).tolist() 370 | 371 | # Draw the polygon 372 | if fill_mask: 373 | overlay = Image.new('RGBA', image_pil.size, (255, 255, 255, 0)) 374 | image_pil = image_pil.convert('RGBA') 375 | draw = ImageDraw.Draw(overlay) 376 | color_with_opacity = ImageColor.getrgb(color) + (180,) 377 | draw.polygon(_polygon, outline=color, fill=color_with_opacity, width=3) 378 | image_pil = Image.alpha_composite(image_pil, overlay) 379 | else: 380 | draw = ImageDraw.Draw(image_pil) 381 | draw.polygon(_polygon, outline=color, width=3) 382 | 383 | 
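                    # The same clamped polygon is also rasterized filled-white onto
                    # mask_image just below, so the returned MASK mirrors whatever was
                    # drawn on the annotated image.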
#draw mask 384 | mask_draw.polygon(_polygon, outline="white", fill="white") 385 | 386 | image_tensor = F.to_tensor(image_pil) 387 | image_tensor = image_tensor[:3, :, :].unsqueeze(0).permute(0, 2, 3, 1).cpu().float() 388 | out.append(image_tensor) 389 | 390 | mask_tensor = F.to_tensor(mask_image) 391 | mask_tensor = mask_tensor.unsqueeze(0).permute(0, 2, 3, 1).cpu().float() 392 | mask_tensor = mask_tensor.mean(dim=0, keepdim=True) 393 | mask_tensor = mask_tensor.repeat(1, 1, 1, 3) 394 | mask_tensor = mask_tensor[:, :, :, 0] 395 | out_masks.append(mask_tensor) 396 | pbar.update(1) 397 | 398 | elif task == 'ocr_with_region': 399 | try: 400 | font = ImageFont.load_default().font_variant(size=24) 401 | except: 402 | font = ImageFont.load_default() 403 | predictions = parsed_answer[task_prompt] 404 | scale = 1 405 | draw = ImageDraw.Draw(image_pil) 406 | bboxes, labels = predictions['quad_boxes'], predictions['labels'] 407 | 408 | for box, label in zip(bboxes, labels): 409 | bbox = calculate_bounding_box(width, height, box) 410 | out_data.append({"label": label, "polygon": box, "box": bbox}) 411 | color = random.choice(colormap) 412 | new_box = (np.array(box) * scale).tolist() 413 | draw.polygon(new_box, width=3, outline=color) 414 | draw.text((new_box[0]+8, new_box[1]+2), 415 | "{}".format(label), 416 | align="right", 417 | font=font, 418 | fill=color) 419 | 420 | image_tensor = F.to_tensor(image_pil) 421 | image_tensor = image_tensor[:3, :, :].unsqueeze(0).permute(0, 2, 3, 1).cpu().float() 422 | out.append(image_tensor) 423 | 424 | elif task == 'docvqa': 425 | if text_input == "": 426 | raise ValueError("Text input (prompt) is required for 'docvqa'") 427 | prompt = " " + text_input 428 | 429 | inputs = processor(text=prompt, images=image_pil, return_tensors="pt", do_rescale=False).to(dtype).to(device) 430 | generated_ids = model.generate( 431 | input_ids=inputs["input_ids"], 432 | pixel_values=inputs["pixel_values"], 433 | max_new_tokens=max_new_tokens, 434 | do_sample=do_sample, 435 | num_beams=num_beams, 436 | ) 437 | 438 | results = processor.batch_decode(generated_ids, skip_special_tokens=False)[0] 439 | clean_results = results.replace('', '').replace('', '') 440 | 441 | if len(image) == 1: 442 | out_results = clean_results 443 | else: 444 | out_results.append(clean_results) 445 | 446 | out.append(F.to_tensor(image_pil).unsqueeze(0).permute(0, 2, 3, 1).cpu().float()) 447 | 448 | pbar.update(1) 449 | 450 | if len(out) > 0: 451 | out_tensor = torch.cat(out, dim=0) 452 | else: 453 | out_tensor = torch.zeros((1, 64,64, 3), dtype=torch.float32, device="cpu") 454 | if len(out_masks) > 0: 455 | out_mask_tensor = torch.cat(out_masks, dim=0) 456 | else: 457 | out_mask_tensor = torch.zeros((1,64,64), dtype=torch.float32, device="cpu") 458 | 459 | if not keep_model_loaded: 460 | print("Offloading model...") 461 | model.to(offload_device) 462 | mm.soft_empty_cache() 463 | 464 | return (out_tensor, out_mask_tensor, out_results, out_data) 465 | 466 | # NODE_CLASS_MAPPINGS = { 467 | # "DownloadAndLoadFlorence2Model": DownloadAndLoadFlorence2Model, 468 | # "Florence2Run": Florence2Run, 469 | # } 470 | # NODE_DISPLAY_NAME_MAPPINGS = { 471 | # "DownloadAndLoadFlorence2Model": "DownloadAndLoadFlorence2Model", 472 | # "Florence2Run": "Florence2Run", 473 | # } -------------------------------------------------------------------------------- /ic_lora_batch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision.transforms.functional as F 3 | 
import io 4 | import os 5 | from typing import List 6 | import matplotlib 7 | matplotlib.use('Agg') 8 | import matplotlib.pyplot as plt 9 | import matplotlib.patches as patches 10 | from PIL import Image, ImageDraw, ImageColor, ImageFont 11 | import random 12 | import numpy as np 13 | import re 14 | import time 15 | from .lib.ximg import * 16 | from .lib.xmodel import * 17 | from comfy.utils import ProgressBar, common_upscale 18 | import torchvision.transforms.functional as TVF 19 | 20 | #workaround for unnecessary flash_attn requirement 21 | from unittest.mock import patch 22 | from transformers.dynamic_module_utils import get_imports 23 | 24 | def fixed_get_imports(filename: str | os.PathLike) -> list[str]: 25 | if not str(filename).endswith("modeling_florence2.py"): 26 | return get_imports(filename) 27 | imports = get_imports(filename) 28 | # imports.remove("flash_attn") 29 | return imports 30 | 31 | 32 | import comfy.model_management as mm 33 | from comfy.utils import ProgressBar 34 | import folder_paths 35 | 36 | script_directory = os.path.dirname(os.path.abspath(__file__)) 37 | 38 | 39 | 40 | class CXH_IC_Lora_Florence2Run: 41 | @classmethod 42 | def INPUT_TYPES(s): 43 | return { 44 | "required": { 45 | "tip_pipe": ("STRING", {"multiline": False, "default": "", "forceInput": True},), 46 | "florence2_model": ("FL2MODEL", ), 47 | "format": (["png", "jpg"],), 48 | "max_new_tokens":("INT", {"default": 512, "min": 10, "max": 4096, "step": 1}), 49 | "dir1": ("STRING", {"default": ""}), 50 | "dir2": ("STRING", {"default": ""}), 51 | "saveDir": ("STRING", {"default": ""}), 52 | "text1": ("STRING", {"default": "", "multiline": True, "label": "Text Box 1"}), 53 | "text2": ("STRING", {"default": "", "multiline": True, "label": "Text Box 2"}), 54 | "template": ("STRING", {"default": "Realistic style, [cloth-on], the image pair highlights a transformation from a clothing sample photo to the effect of actually wearing it. 
[image1] {caption} [image2] a female model is wearing the cloth from [image1] with {caption}", "multiline": True, "label": ""}), 55 | "direction": ( 56 | [ 'right', 57 | 'down', 58 | 'left', 59 | 'up', 60 | ], 61 | { 62 | "default": 'right' 63 | }), 64 | "match_image_size": ("BOOLEAN", {"default": True}), 65 | } 66 | } 67 | 68 | RETURN_TYPES = ( "STRING", ) 69 | RETURN_NAMES =("caption", ) 70 | FUNCTION = "encode" 71 | CATEGORY = "Florence2" 72 | 73 | def encode(self,tip_pipe,florence2_model,format,max_new_tokens,dir1,dir2,saveDir,text1,text2,template,direction, match_image_size,first_image_shape=None): 74 | print("执行完成:"+tip_pipe) 75 | torch.cuda.empty_cache() 76 | 77 | device = mm.get_torch_device() 78 | offload_device = mm.unet_offload_device() 79 | processor = florence2_model['processor'] 80 | model = florence2_model['model'] 81 | dtype = florence2_model['dtype'] 82 | model.to(device) 83 | 84 | 85 | task_prompt = "" 86 | 87 | prompt = task_prompt 88 | 89 | # image = image.permute(0, 3, 1, 2) 90 | 91 | # 批量读取 92 | if not os.path.isdir(dir1): 93 | raise FileNotFoundError(f"Directory '{dir1}' cannot be found.") 94 | dir_files = os.listdir(dir1) 95 | 96 | if len(dir_files) == 0: 97 | raise FileNotFoundError(f"No files in directory '{dir1}'.") 98 | 99 | valid_extensions = ['.jpg', '.jpeg', '.png', '.webp'] 100 | dir_files = [f for f in dir_files if any(f.lower().endswith(ext) for ext in valid_extensions)] 101 | 102 | dir_files = sorted(dir_files) 103 | dir_files = [os.path.join(dir1, x) for x in dir_files] 104 | 105 | 106 | # 创建保存目录 107 | if not os.path.exists(saveDir): 108 | os.makedirs(saveDir) 109 | index1 = 0 110 | for image_path in dir_files: 111 | if os.path.isdir(image_path) and os.path.ex: 112 | continue 113 | start = time.time() 114 | 115 | #查找两张图片 116 | # 获取文件名(不包含路径) 117 | file_name = os.path.basename(image_path) 118 | 119 | # 构造第二张图片的路径 120 | second_image_path = os.path.join(dir2, file_name) 121 | 122 | # 检查第二张图片是否存在 123 | if not os.path.isfile(second_image_path): 124 | print(f"Second image not found for {file_name}") 125 | continue 126 | 127 | #检查是否已经存在 128 | file_name_without_ext, _ = os.path.splitext(file_name) 129 | img_file_name = f"{file_name_without_ext}.{format}" 130 | 131 | # if format != "png": 132 | # if input_image.mode == "RGBA": 133 | # input_image = input_image.convert("RGB") 134 | img_save_path = os.path.join(saveDir, img_file_name) 135 | if os.path.isfile(img_save_path): 136 | print(f"存在跳过: {img_file_name}") 137 | index1 = index1 + 1 138 | continue 139 | 140 | # 打开图片 141 | input_image = open_image(image_path) 142 | input_image = ImageOps.exif_transpose(input_image) 143 | input_image = input_image.convert("RGB") 144 | 145 | second_image = open_image(second_image_path) 146 | second_image = ImageOps.exif_transpose(second_image) 147 | second_image = second_image.convert("RGB") 148 | 149 | image_pil = input_image 150 | inputs = processor(text=prompt, images=image_pil, return_tensors="pt", do_rescale=False).to(dtype).to(device) 151 | 152 | generated_ids = model.generate( 153 | input_ids=inputs["input_ids"], 154 | pixel_values=inputs["pixel_values"], 155 | max_new_tokens=max_new_tokens, 156 | do_sample=True, 157 | num_beams=3, 158 | ) 159 | 160 | results = processor.batch_decode(generated_ids, skip_special_tokens=False)[0] 161 | clean_results = str(results) 162 | clean_results = clean_results.replace('', '') 163 | clean_results = clean_results.replace('', '') 164 | 165 | W, H = image_pil.size 166 | parsed_answer = processor.post_process_generation(results, 
task=task_prompt, image_size=(W, H)) 167 | caption = parsed_answer[task_prompt] 168 | 169 | # 提示词 170 | # file_name_without_ext, _ = os.path.splitext(file_name) 171 | caption = caption.strip() 172 | txt_file_name = f"{file_name_without_ext}.txt" 173 | txt_save_path = os.path.join(saveDir, txt_file_name) 174 | final_text = template.replace("{caption}", caption).replace("{text1}", text1).replace("{text2}", text2) 175 | 176 | try: 177 | with open(txt_save_path, 'w', encoding='utf-8') as file: 178 | file.write(final_text) 179 | except IOError as e: 180 | print(f"保存文件时发生错误: {e}") 181 | 182 | # Check if the batch sizes are different 183 | image1 = pil2tensor(input_image) 184 | image2 = pil2tensor(second_image) 185 | batch_size1 = image1.shape[0] 186 | batch_size2 = image2.shape[0] 187 | 188 | if batch_size1 != batch_size2: 189 | # Calculate the number of repetitions needed 190 | max_batch_size = max(batch_size1, batch_size2) 191 | repeats1 = max_batch_size // batch_size1 192 | repeats2 = max_batch_size // batch_size2 193 | 194 | # Repeat the images to match the largest batch size 195 | image1 = image1.repeat(repeats1, 1, 1, 1) 196 | image2 = image2.repeat(repeats2, 1, 1, 1) 197 | 198 | if match_image_size: 199 | # Use first_image_shape if provided; otherwise, default to image1's shape 200 | target_shape = first_image_shape if first_image_shape is not None else image1.shape 201 | 202 | original_height = image2.shape[1] 203 | original_width = image2.shape[2] 204 | original_aspect_ratio = original_width / original_height 205 | 206 | if direction in ['left', 'right']: 207 | # Match the height and adjust the width to preserve aspect ratio 208 | target_height = target_shape[1] # B, H, W, C format 209 | target_width = int(target_height * original_aspect_ratio) 210 | elif direction in ['up', 'down']: 211 | # Match the width and adjust the height to preserve aspect ratio 212 | target_width = target_shape[2] # B, H, W, C format 213 | target_height = int(target_width / original_aspect_ratio) 214 | 215 | # Adjust image2 to the expected format for common_upscale 216 | image2_for_upscale = image2.movedim(-1, 1) # Move C to the second position (B, C, H, W) 217 | 218 | # Resize image2 to match the target size while preserving aspect ratio 219 | image2_resized = common_upscale(image2_for_upscale, target_width, target_height, "lanczos", "disabled") 220 | 221 | # Adjust image2 back to the original format (B, H, W, C) after resizing 222 | image2_resized = image2_resized.movedim(1, -1) 223 | else: 224 | image2_resized = image2 225 | 226 | # Concatenate based on the specified direction 227 | if direction == 'right': 228 | concatenated_image = torch.cat((image1, image2_resized), dim=2) # Concatenate along width 229 | elif direction == 'down': 230 | concatenated_image = torch.cat((image1, image2_resized), dim=1) # Concatenate along height 231 | elif direction == 'left': 232 | concatenated_image = torch.cat((image2_resized, image1), dim=2) # Concatenate along width 233 | elif direction == 'up': 234 | concatenated_image = torch.cat((image2_resized, image1), dim=1) # Concatenate along height 235 | 236 | input_image = tensor2pil(concatenated_image) 237 | 238 | # 图片 239 | # img_file_name = f"{file_name_without_ext}.{format}" 240 | 241 | # if format != "png": 242 | # if input_image.mode == "RGBA": 243 | # input_image = input_image.convert("RGB") 244 | # img_save_path = os.path.join(saveDir, img_file_name) 245 | input_image.save(img_save_path) 246 | end = time.time() 247 | execution_time = calculate_seconds_difference(start, 
end) 248 | temp = f":{execution_time:.3f}s" 249 | index1 = index1 + 1 250 | print(str(index1)+"/"+str(len(dir_files)) +":"+temp) 251 | 252 | print("finish结束") 253 | model.to(offload_device) 254 | mm.soft_empty_cache() 255 | 256 | return (saveDir,) 257 | 258 | class CXH_Ic_lora_Joy_batch: 259 | 260 | def __init__(self): 261 | pass 262 | 263 | @classmethod 264 | def INPUT_TYPES(s): 265 | return { 266 | "required": { 267 | "JoyPipeline_alpha": ("JoyPipeline_alpha",), 268 | "prompt": ("STRING", {"multiline": True, "default": "A descriptive caption for this image"},), 269 | "format": (["png", "jpg"],), 270 | "max_new_tokens":("INT", {"default": 1024, "min": 10, "max": 4096, "step": 1}), 271 | "dir1": ("STRING", {"default": ""}), 272 | "dir2": ("STRING", {"default": ""}), 273 | "saveDir": ("STRING", {"default": ""}), 274 | "text1": ("STRING", {"default": "", "multiline": True, "label": "Text Box 1"}), 275 | "text2": ("STRING", {"default": "", "multiline": True, "label": "Text Box 2"}), 276 | "template": ("STRING", {"default": "Realistic style, [cloth-on], the image pair highlights a transformation from a clothing sample photo to the effect of actually wearing it. [image1] {caption} [image2] a female model is wearing the cloth from [image1] with {caption}", "multiline": True, "label": ""}), 277 | "direction": ( 278 | [ 'right', 279 | 'down', 280 | 'left', 281 | 'up', 282 | ], 283 | { 284 | "default": 'right' 285 | }), 286 | "match_image_size": ("BOOLEAN", {"default": True}), 287 | } 288 | } 289 | 290 | RETURN_TYPES = () 291 | FUNCTION = "gen" 292 | OUTPUT_NODE = True 293 | CATEGORY = "CXH/Images" 294 | 295 | def gen(self,JoyPipeline_alpha,prompt,format,max_new_tokens,dir1,dir2,saveDir,text1,text2,template,direction, match_image_size, first_image_shape=None): 296 | 297 | torch.cuda.empty_cache() 298 | 299 | joy_pipeline = JoyPipeline_alpha 300 | if joy_pipeline.clip_processor == None : 301 | joy_pipeline.parent.loadCheckPoint() 302 | 303 | clip_processor = joy_pipeline.clip_processor 304 | tokenizer = joy_pipeline.tokenizer 305 | clip_model = joy_pipeline.clip_model 306 | image_adapter = joy_pipeline.image_adapter 307 | text_model = joy_pipeline.text_model 308 | 309 | convo = [ 310 | { 311 | "role": "system", 312 | "content": "You are a helpful image captioner.", 313 | }, 314 | { 315 | "role": "user", 316 | "content": prompt, 317 | }, 318 | ] 319 | 320 | convo_string = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True) 321 | assert isinstance(convo_string, str) 322 | 323 | convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False) 324 | prompt_tokens = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False, truncation=False) 325 | assert isinstance(convo_tokens, torch.Tensor) and isinstance(prompt_tokens, torch.Tensor) 326 | convo_tokens = convo_tokens.squeeze(0) # Squeeze just to make the following easier 327 | prompt_tokens = prompt_tokens.squeeze(0) 328 | 329 | eot_id_indices = (convo_tokens == tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[ 330 | 0].tolist() 331 | assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}" 332 | 333 | preamble_len = eot_id_indices[1] - prompt_tokens.shape[0] # Number of tokens before the prompt 334 | 335 | 336 | # text_model = joy_two_pipeline.llm.load_llm_model(joy_two_pipeline.model) 337 | # Embed the tokens 338 | convo_embeds = text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to('cuda')) 339 | 340 | 
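        # The chat template above is tokenized once and reused for every image in the
        # batch: each image is resized to 384x384, encoded by the CLIP vision tower,
        # projected through the image adapter, and the resulting embeddings are spliced
        # between the preamble and the prompt before text_model.generate(), i.e.
        #   input_embeds = [ preamble tokens | image adapter embeddings | prompt tokens ]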
341 | 342 | # 批量读取 343 | if not os.path.isdir(dir1): 344 | raise FileNotFoundError(f"Directory '{dir1}' cannot be found.") 345 | dir_files = os.listdir(dir1) 346 | 347 | # if not os.path.isdir(dir2): 348 | # raise FileNotFoundError(f"Directory '{dir2}' cannot be found.") 349 | # dir_files_2 = os.listdir(dir2) 350 | 351 | if len(dir_files) == 0: 352 | raise FileNotFoundError(f"No files in directory '{dir1}'.") 353 | 354 | valid_extensions = ['.jpg', '.jpeg', '.png', '.webp'] 355 | dir_files = [f for f in dir_files if any(f.lower().endswith(ext) for ext in valid_extensions)] 356 | 357 | dir_files = sorted(dir_files) 358 | dir_files = [os.path.join(dir1, x) for x in dir_files] 359 | 360 | 361 | # 创建保存目录 362 | if not os.path.exists(saveDir): 363 | os.makedirs(saveDir) 364 | 365 | index1 = 0 366 | for image_path in dir_files: 367 | if os.path.isdir(image_path) and os.path.ex: 368 | continue 369 | start = time.time() 370 | 371 | #查找两张图片 372 | # 获取文件名(不包含路径) 373 | file_name = os.path.basename(image_path) 374 | 375 | # 构造第二张图片的路径 376 | second_image_path = os.path.join(dir2, file_name) 377 | 378 | # 检查第二张图片是否存在 379 | if not os.path.isfile(second_image_path): 380 | print(f"Second image not found for {file_name}") 381 | index1 = index1 + 1 382 | continue 383 | 384 | #检查是否已经存在 385 | file_name_without_ext, _ = os.path.splitext(file_name) 386 | img_file_name = f"{file_name_without_ext}.{format}" 387 | 388 | # if format != "png": 389 | # if input_image.mode == "RGBA": 390 | # input_image = input_image.convert("RGB") 391 | img_save_path = os.path.join(saveDir, img_file_name) 392 | if os.path.isfile(img_save_path): 393 | print(f"存在跳过 {img_file_name}") 394 | index1 = index1 + 1 395 | continue 396 | 397 | # 打开图片 398 | input_image = open_image(image_path) 399 | input_image = ImageOps.exif_transpose(input_image) 400 | input_image = input_image.convert("RGB") 401 | 402 | second_image = open_image(second_image_path) 403 | second_image = ImageOps.exif_transpose(second_image) 404 | second_image = second_image.convert("RGB") 405 | 406 | 407 | image = input_image.resize((384, 384), Image.LANCZOS) 408 | pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0 409 | pixel_values = TVF.normalize(pixel_values, [0.5], [0.5]) 410 | pixel_values = pixel_values.to('cuda') 411 | 412 | 413 | with torch.amp.autocast_mode.autocast('cuda', enabled=True): 414 | vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True) 415 | embedded_images = image_adapter(vision_outputs.hidden_states) 416 | embedded_images = embedded_images.to('cuda') 417 | 418 | input_embeds = torch.cat([ 419 | convo_embeds[:, :preamble_len], # Part before the prompt 420 | embedded_images.to(dtype=convo_embeds.dtype), # Image 421 | convo_embeds[:, preamble_len:], # The prompt and anything after it 422 | ], dim=1).to('cuda') 423 | 424 | input_ids = torch.cat([ 425 | convo_tokens[:preamble_len].unsqueeze(0), 426 | torch.zeros((1, embedded_images.shape[1]), dtype=torch.long), 427 | # Dummy tokens for the image (TODO: Should probably use a special token here so as not to confuse any generation algorithms that might be inspecting the input) 428 | convo_tokens[preamble_len:].unsqueeze(0), 429 | ], dim=1).to('cuda') 430 | attention_mask = torch.ones_like(input_ids) 431 | 432 | generate_ids = text_model.generate(input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, 433 | max_new_tokens=max_new_tokens, do_sample=True, 434 | suppress_tokens=None) # Uses the default which is temp=0.6, top_p=0.9 435 | 436 | 437 | generate_ids = 
generate_ids[:, input_ids.shape[1]:] 438 | if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids( 439 | "<|eot_id|>"): 440 | generate_ids = generate_ids[:, :-1] 441 | 442 | caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0] 443 | 444 | # 提示词 445 | 446 | caption = caption.strip() 447 | txt_file_name = f"{file_name_without_ext}.txt" 448 | txt_save_path = os.path.join(saveDir, txt_file_name) 449 | final_text = template.replace("{caption}", caption).replace("{text1}", text1).replace("{text2}", text2) 450 | 451 | try: 452 | with open(txt_save_path, 'w', encoding='utf-8') as file: 453 | file.write(final_text) 454 | except IOError as e: 455 | print(f"保存文件时发生错误: {e}") 456 | 457 | 458 | # Check if the batch sizes are different 459 | image1 = pil2tensor(input_image) 460 | image2 = pil2tensor(second_image) 461 | batch_size1 = image1.shape[0] 462 | batch_size2 = image2.shape[0] 463 | 464 | if batch_size1 != batch_size2: 465 | # Calculate the number of repetitions needed 466 | max_batch_size = max(batch_size1, batch_size2) 467 | repeats1 = max_batch_size // batch_size1 468 | repeats2 = max_batch_size // batch_size2 469 | 470 | # Repeat the images to match the largest batch size 471 | image1 = image1.repeat(repeats1, 1, 1, 1) 472 | image2 = image2.repeat(repeats2, 1, 1, 1) 473 | 474 | if match_image_size: 475 | # Use first_image_shape if provided; otherwise, default to image1's shape 476 | target_shape = first_image_shape if first_image_shape is not None else image1.shape 477 | 478 | original_height = image2.shape[1] 479 | original_width = image2.shape[2] 480 | original_aspect_ratio = original_width / original_height 481 | 482 | if direction in ['left', 'right']: 483 | # Match the height and adjust the width to preserve aspect ratio 484 | target_height = target_shape[1] # B, H, W, C format 485 | target_width = int(target_height * original_aspect_ratio) 486 | elif direction in ['up', 'down']: 487 | # Match the width and adjust the height to preserve aspect ratio 488 | target_width = target_shape[2] # B, H, W, C format 489 | target_height = int(target_width / original_aspect_ratio) 490 | 491 | # Adjust image2 to the expected format for common_upscale 492 | image2_for_upscale = image2.movedim(-1, 1) # Move C to the second position (B, C, H, W) 493 | 494 | # Resize image2 to match the target size while preserving aspect ratio 495 | image2_resized = common_upscale(image2_for_upscale, target_width, target_height, "lanczos", "disabled") 496 | 497 | # Adjust image2 back to the original format (B, H, W, C) after resizing 498 | image2_resized = image2_resized.movedim(1, -1) 499 | else: 500 | image2_resized = image2 501 | 502 | # Concatenate based on the specified direction 503 | if direction == 'right': 504 | concatenated_image = torch.cat((image1, image2_resized), dim=2) # Concatenate along width 505 | elif direction == 'down': 506 | concatenated_image = torch.cat((image1, image2_resized), dim=1) # Concatenate along height 507 | elif direction == 'left': 508 | concatenated_image = torch.cat((image2_resized, image1), dim=2) # Concatenate along width 509 | elif direction == 'up': 510 | concatenated_image = torch.cat((image2_resized, image1), dim=1) # Concatenate along height 511 | 512 | input_image = tensor2pil(concatenated_image) 513 | 514 | 515 | input_image.save(img_save_path) 516 | end = time.time() 517 | execution_time = calculate_seconds_difference(start, end) 518 | temp = 
f":{execution_time:.3f}s" 519 | index1 = index1 + 1 520 | print(str(index1)+"/"+str(len(dir_files)) +":"+temp) 521 | 522 | print("finish结束") 523 | joy_pipeline.parent.clearCache() 524 | torch.cuda.empty_cache() 525 | import gc 526 | gc.collect() 527 | return (saveDir, ) 528 | 529 | class CXH_IC_lora_reversal: 530 | @classmethod 531 | def INPUT_TYPES(s): 532 | return { 533 | "required": { 534 | "dir1": ("STRING", {"default": ""}), 535 | "dir2": ("STRING", {"default": ""}), 536 | "text_dir": ("STRING", {"default": ""}), 537 | "save_dir": ("STRING", {"default": ""}), 538 | "slic_start": ("STRING", {"default": "[image1]"}), 539 | "slic_end": ("STRING", {"default": "[image2]"}), 540 | "format": (["png", "jpg"],), 541 | "text1": ("STRING", {"default": "", "multiline": True, "label": "Text Box 1"}), 542 | "text2": ("STRING", {"default": "", "multiline": True, "label": "Text Box 2"}), 543 | "template": ("STRING", {"default": "Realistic style, [cloth-on], the image pair highlights a transformation from a female model wearing the cloth to its clothing sample photo. [image1] a female model is wearing a cloth with {caption} [image2] the clothing sample photo of what the model is wearing in [image1] with {caption}", "multiline": True, "label": ""}), 544 | "direction": ( 545 | [ 'right', 546 | 'down', 547 | 'left', 548 | 'up', 549 | ], 550 | { 551 | "default": 'right' 552 | }), 553 | "match_image_size": ("BOOLEAN", {"default": True}), 554 | } 555 | } 556 | 557 | RETURN_TYPES = ( "STRING", ) 558 | RETURN_NAMES =("caption", ) 559 | FUNCTION = "encode" 560 | CATEGORY = "Florence2" 561 | 562 | def encode(self,dir1,dir2,text_dir,save_dir,slic_start,slic_end,format,text1,text2,template,direction, match_image_size,first_image_shape=None): 563 | 564 | # 批量读取 565 | if not os.path.isdir(dir1): 566 | raise FileNotFoundError(f"Directory '{dir1}' cannot be found.") 567 | dir_files = os.listdir(dir1) 568 | 569 | if len(dir_files) == 0: 570 | raise FileNotFoundError(f"No files in directory '{dir1}'.") 571 | 572 | valid_extensions = ['.jpg', '.jpeg', '.png', '.webp'] 573 | dir_files = [f for f in dir_files if any(f.lower().endswith(ext) for ext in valid_extensions)] 574 | 575 | dir_files = sorted(dir_files) 576 | dir_files = [os.path.join(dir1, x) for x in dir_files] 577 | 578 | 579 | # 创建保存目录 580 | if not os.path.exists(save_dir): 581 | os.makedirs(save_dir) 582 | index1 = 0 583 | for image_path in dir_files: 584 | if os.path.isdir(image_path) and os.path.ex: 585 | continue 586 | start = time.time() 587 | 588 | #查找两张图片 589 | # 获取文件名(不包含路径) 590 | file_name = os.path.basename(image_path) 591 | 592 | # 构造第二张图片的路径 593 | second_image_path = os.path.join(dir2, file_name) 594 | 595 | #检查是否已经存在 596 | file_name_without_ext, _ = os.path.splitext(file_name) 597 | # 第三个txt 598 | text_file = os.path.join(text_dir, file_name_without_ext+".txt") 599 | 600 | # 检查第二张图片是否存在 601 | if not os.path.isfile(second_image_path) or not os.path.isfile(text_file) : 602 | print(f"Second image not found for {file_name}") 603 | continue 604 | 605 | img_file_name = f"{file_name_without_ext}.{format}" 606 | 607 | 608 | 609 | # if format != "png": 610 | # if input_image.mode == "RGBA": 611 | # input_image = input_image.convert("RGB") 612 | img_save_path = os.path.join(save_dir, img_file_name) 613 | if os.path.isfile(img_save_path): 614 | print(f"存在跳过: {img_file_name}") 615 | index1 = index1 + 1 616 | continue 617 | 618 | # 打开图片 619 | input_image = open_image(image_path) 620 | input_image = ImageOps.exif_transpose(input_image) 621 | input_image = 
input_image.convert("RGB") 622 | 623 | second_image = open_image(second_image_path) 624 | second_image = ImageOps.exif_transpose(second_image) 625 | second_image = second_image.convert("RGB") 626 | 627 | #打开prompt 628 | # 使用 open 函数打开文件,模式为 'r' 表示读取模式 629 | with open(text_file, 'r', encoding='utf-8') as file: 630 | # 读取文件的所有内容,并存储在变量中 631 | content = file.read() 632 | # 找到字符的索引 633 | start_index = content.index(slic_start) + len(slic_start) 634 | end_index = content.index(slic_end) # 不需要加1,因为我们不包含end_char 635 | 636 | # 使用切片语法截取字符串 637 | sliced_string = content[start_index:end_index] 638 | 639 | final_text = template.replace("{caption}", sliced_string).replace("{text1}", text1).replace("{text2}", text2) 640 | 641 | txt_file_name = f"{file_name_without_ext}.txt" 642 | txt_save_path = os.path.join(save_dir, txt_file_name) 643 | try: 644 | with open(txt_save_path, 'w', encoding='utf-8') as file: 645 | file.write(final_text) 646 | except IOError as e: 647 | print(f"保存文件时发生错误: {e}") 648 | 649 | 650 | # Check if the batch sizes are different 651 | image1 = pil2tensor(input_image) 652 | image2 = pil2tensor(second_image) 653 | batch_size1 = image1.shape[0] 654 | batch_size2 = image2.shape[0] 655 | 656 | if batch_size1 != batch_size2: 657 | # Calculate the number of repetitions needed 658 | max_batch_size = max(batch_size1, batch_size2) 659 | repeats1 = max_batch_size // batch_size1 660 | repeats2 = max_batch_size // batch_size2 661 | 662 | # Repeat the images to match the largest batch size 663 | image1 = image1.repeat(repeats1, 1, 1, 1) 664 | image2 = image2.repeat(repeats2, 1, 1, 1) 665 | 666 | if match_image_size: 667 | # Use first_image_shape if provided; otherwise, default to image1's shape 668 | target_shape = first_image_shape if first_image_shape is not None else image1.shape 669 | 670 | original_height = image2.shape[1] 671 | original_width = image2.shape[2] 672 | original_aspect_ratio = original_width / original_height 673 | 674 | if direction in ['left', 'right']: 675 | # Match the height and adjust the width to preserve aspect ratio 676 | target_height = target_shape[1] # B, H, W, C format 677 | target_width = int(target_height * original_aspect_ratio) 678 | elif direction in ['up', 'down']: 679 | # Match the width and adjust the height to preserve aspect ratio 680 | target_width = target_shape[2] # B, H, W, C format 681 | target_height = int(target_width / original_aspect_ratio) 682 | 683 | # Adjust image2 to the expected format for common_upscale 684 | image2_for_upscale = image2.movedim(-1, 1) # Move C to the second position (B, C, H, W) 685 | 686 | # Resize image2 to match the target size while preserving aspect ratio 687 | image2_resized = common_upscale(image2_for_upscale, target_width, target_height, "lanczos", "disabled") 688 | 689 | # Adjust image2 back to the original format (B, H, W, C) after resizing 690 | image2_resized = image2_resized.movedim(1, -1) 691 | else: 692 | image2_resized = image2 693 | 694 | # Concatenate based on the specified direction 695 | if direction == 'right': 696 | concatenated_image = torch.cat((image1, image2_resized), dim=2) # Concatenate along width 697 | elif direction == 'down': 698 | concatenated_image = torch.cat((image1, image2_resized), dim=1) # Concatenate along height 699 | elif direction == 'left': 700 | concatenated_image = torch.cat((image2_resized, image1), dim=2) # Concatenate along width 701 | elif direction == 'up': 702 | concatenated_image = torch.cat((image2_resized, image1), dim=1) # Concatenate along height 703 | 704 | 
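            # The stitched pair is converted back to PIL and saved next to the rewritten
            # caption file, so every sample ends up as one side-by-side image plus one
            # .txt built from the template and the text sliced out of the source caption.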
input_image = tensor2pil(concatenated_image) 705 | 706 | 707 | input_image.save(img_save_path) 708 | end = time.time() 709 | execution_time = calculate_seconds_difference(start, end) 710 | temp = f":{execution_time:.3f}s" 711 | index1 = index1 + 1 712 | print(str(index1)+"/"+str(len(dir_files)) +":"+temp) 713 | 714 | print("finish结束") 715 | return (save_dir, ) -------------------------------------------------------------------------------- /install_req.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | set SCRIPT_DIR=%~dp0 4 | 5 | cd /d "%SCRIPT_DIR%../../../python_embeded" 6 | 7 | 8 | python.exe -m pip install -r "%SCRIPT_DIR%requirements.txt" 9 | 10 | pause 11 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/lib/__init__.py -------------------------------------------------------------------------------- /lib/xfile.py: -------------------------------------------------------------------------------- 1 | import folder_paths 2 | import os 3 | import base64 4 | import numpy as np 5 | from PIL import Image,ImageOps, ImageFilter 6 | 7 | import io 8 | 9 | comfy_path = os.path.dirname(folder_paths.__file__) 10 | custom_nodes_path = os.path.join(comfy_path, "custom_nodes") 11 | 12 | # D:\comfyui\ComfyUI_windows_portable\ComfyUI\custom_nodes\Comfyui_CXH_ALY 13 | # current_folder = os.path.dirname(os.path.abspath(__file__)) 14 | 15 | # 节点路径 16 | def node_path(node_name): 17 | return os.path.join(custom_nodes_path,node_name) 18 | 19 | # 创建文件夹 20 | def mkdir(path): 21 | folder = os.path.exists(path) 22 | if not folder: #判断是否存在文件夹如果不存在则创建为文件夹 23 | os.makedirs(path) #makedirs 创建文件时如果路径不存在会创建这个路径 24 | 25 | # 获取所有图片文件路径 26 | def get_all_image_paths(directory): 27 | image_paths = [] 28 | for root, dirs, files in os.walk(directory): 29 | for file in files: 30 | if file.lower().endswith(('.png', '.jpg', '.jpeg')): 31 | image_paths.append(os.path.join(root, file)) 32 | return image_paths 33 | 34 | 35 | -------------------------------------------------------------------------------- /lib/ximg.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | ''' 3 | @File :ximg.py 4 | @Description :图片操作类封装 5 | @Time :2024/04/30 09:46:01 6 | @Author :ChenXingHua 7 | @Version :1.0 8 | ''' 9 | 10 | import os 11 | import torch 12 | from PIL import Image, ImageOps, ImageSequence, ImageFile,UnidentifiedImageError 13 | import numpy as np 14 | import cv2 as cv 15 | import io 16 | import base64 17 | import requests 18 | from io import BytesIO 19 | from datetime import datetime, timedelta 20 | 21 | def tensor2pil(t_image: torch.Tensor) -> Image: 22 | return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8)) 23 | 24 | def pil2tensor(image:Image) -> torch.Tensor: 25 | return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0) 26 | 27 | def tensor2cv2(image:torch.Tensor) -> np.array: 28 | if image.dim() == 4: 29 | image = image.squeeze() 30 | npimage = image.numpy() 31 | cv2image = np.uint8(npimage * 255 / npimage.max()) 32 | return cv.cvtColor(cv2image, cv.COLOR_RGB2BGR) 33 | 34 | def cv22pil(cv2_img:np.ndarray) -> Image: 35 | cv2_img = cv.cvtColor(cv2_img, cv.COLOR_BGR2RGB) 36 | return Image.fromarray(cv2_img) 37 | 
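# Illustrative usage sketch: the helpers above round-trip between ComfyUI image
# tensors (B, H, W, C floats in 0..1), PIL images and OpenCV arrays:
#   pil = tensor2pil(image_tensor)    # first image of the batch as a PIL.Image
#   bgr = tensor2cv2(image_tensor)    # uint8 BGR array for cv2 operations
#   back = pil2tensor(pil)            # (1, H, W, C) float tensor in 0..1
#   img = cv22pil(bgr)                # back to PIL from OpenCV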
38 | # pil转io 39 | def pil2iobyte(pil_image,format='PNG'): 40 | byte_arr = io.BytesIO() 41 | pil_image.save(byte_arr, format=format) 42 | byte_arr = byte_arr.getvalue() 43 | return byte_arr 44 | 45 | # pil转64 46 | def pilTobase64(pil_image,format='PNG'): 47 | byte_arr = pil2iobyte(pil_image,format) 48 | image_base64 = base64.b64encode(byte_arr).decode('utf-8') 49 | return image_base64 50 | 51 | def ioBytes2tensor(bytes): 52 | image = Image.open(bytes) 53 | return pil2tensor(image) 54 | 55 | def getImageSize(image): 56 | if image.shape[0] > 0: 57 | image = torch.unsqueeze(image[0], 0) 58 | _image = tensor2pil(image) 59 | 60 | return (_image.width, _image.height) 61 | 62 | # 转成mask 63 | def imageToMask(img): 64 | i = img 65 | if i.mode == 'I': 66 | i = i.point(lambda i: i * (1 / 255)) 67 | image = i.convert("RGB") 68 | image = np.array(image).astype(np.float32) / 255.0 69 | image = torch.from_numpy(image)[None,] 70 | 71 | if 'A' in i.getbands(): 72 | mask = np.array(i.getchannel('A')).astype(np.float32) / 255.0 73 | mask = 1. - torch.from_numpy(mask) 74 | else: 75 | mask = torch.zeros((64,64), dtype=torch.float32, device="cpu") 76 | return tensor2pil(mask) 77 | 78 | # ret_masks.append(image2mask(_mask)) 79 | def image2mask(image:Image) -> torch.Tensor: 80 | _image = image.convert('RGBA') 81 | alpha = _image.split() [0] 82 | bg = Image.new("L", _image.size) 83 | _image = Image.merge('RGBA', (bg, bg, bg, alpha)) 84 | ret_mask = torch.tensor([pil2tensor(_image)[0, :, :, 3].tolist()]) 85 | return ret_mask 86 | 87 | # 图像回帖 88 | def croppImg(original_image,cropped_avatar,left_x,top_y): 89 | # 获取原始图像的大小 90 | original_width, original_height = original_image.size 91 | return croppImageBySize(cropped_avatar,left_x,top_y,original_width,original_height) 92 | 93 | def croppImageBySize(cropped_avatar,left_x,top_y,original_w,original_h): 94 | # 获取原始图像的大小 95 | original_width, original_height = original_w,original_h 96 | # 获取头像的大小 97 | avatar_width, avatar_height = cropped_avatar.size 98 | # 创建一个与原始图像相同大小的透明图像 99 | extended_image = Image.new("RGBA", (original_width, original_height), (0, 0, 0, 0)) 100 | # 将裁剪后的头像粘贴到新图像 101 | extended_image.paste(cropped_avatar, (left_x, top_y), cropped_avatar) 102 | 103 | return extended_image 104 | 105 | 106 | # 将图片转换为Base64编码 107 | def image_to_base64(image_path): 108 | with open(image_path, 'rb') as image_file: 109 | return base64.b64encode(image_file.read()).decode('utf-8') 110 | 111 | # 获取网络图片 112 | def img_from_url(url): 113 | # 发送HTTP请求获取图片 114 | response = requests.get(url) 115 | response.raise_for_status() # 如果请求失败,这会抛出异常 116 | # 将响应内容作为BytesIO对象打开,以便PIL可以读取它 117 | image = Image.open(BytesIO(response.content)) 118 | return image 119 | 120 | def open_image(path): 121 | prev_value = None 122 | 123 | try: 124 | img = Image.open(path) 125 | except (UnidentifiedImageError, ValueError): #PIL issues #4472 and #2445 126 | prev_value = ImageFile.LOAD_TRUNCATED_IMAGES 127 | ImageFile.LOAD_TRUNCATED_IMAGES = True 128 | img = Image.open(path) 129 | finally: 130 | if prev_value is not None: 131 | ImageFile.LOAD_TRUNCATED_IMAGES = prev_value 132 | return img 133 | 134 | # 批量读取 135 | def batch_image(directory): 136 | if not os.path.isdir(directory): 137 | raise FileNotFoundError(f"Directory '{directory}' cannot be found.") 138 | dir_files = os.listdir(directory) 139 | if len(dir_files) == 0: 140 | raise FileNotFoundError(f"No files in directory '{directory}'.") 141 | 142 | valid_extensions = ['.jpg', '.jpeg', '.png', '.webp'] 143 | dir_files = [f for f in dir_files if 
any(f.lower().endswith(ext) for ext in valid_extensions)] 144 | 145 | dir_files = sorted(dir_files) 146 | dir_files = [os.path.join(directory, x) for x in dir_files] 147 | return dir_files 148 | 149 | def calculate_seconds_difference(start_time, end_time): 150 | """ 151 | 计算两个时间点之间的秒数差异 152 | 153 | :param start_time: 开始时间(可以是时间戳或datetime对象) 154 | :param end_time: 结束时间(可以是时间戳或datetime对象) 155 | :return: 秒数差异(浮点数) 156 | """ 157 | # 如果输入是datetime对象,转换为时间戳 158 | if isinstance(start_time, datetime): 159 | start_time = start_time.timestamp() 160 | if isinstance(end_time, datetime): 161 | end_time = end_time.timestamp() 162 | 163 | return end_time - start_time -------------------------------------------------------------------------------- /lib/xmodel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import folder_paths 3 | import json 4 | from transformers import AutoProcessor 5 | import torch 6 | 7 | # def get_torch_device(): 8 | # """ 9 | # 返回PyTorch模型应该运行的设备(CPU或GPU) 10 | # 如果系统支持CUDA并且至少有一个GPU可用,则返回GPU设备;否则返回CPU设备。 11 | # """ 12 | # if torch.cuda.is_available(): 13 | # # 选择第一个可用的GPU 14 | # device = torch.device("cuda:0") 15 | # print(f"There are {torch.cuda.device_count()} GPU(s) available.") 16 | # print(f"We will use the GPU: {device}") 17 | # else: 18 | # # 如果没有GPU可用,则使用CPU 19 | # device = torch.device("cpu") 20 | # print("No GPU available, using the CPU instead.") 21 | # return device 22 | 23 | # 下载hg 模型到本地 24 | def download_hg_model(model_id:str,exDir:str=''): 25 | # 下载本地 26 | model_checkpoint = os.path.join(folder_paths.models_dir, exDir, os.path.basename(model_id)) 27 | print(model_checkpoint) 28 | if not os.path.exists(model_checkpoint): 29 | from huggingface_hub import snapshot_download 30 | snapshot_download(repo_id=model_id, local_dir=model_checkpoint, local_dir_use_symlinks=False) 31 | return model_checkpoint 32 | 33 | # clip_model = AutoModelForCausalLM.from_pretrained( 34 | # CLIP_PATH, 35 | # device_map="cuda", 36 | # trust_remote_code=True, 37 | # torch_dtype="auto" 38 | # ) 39 | 40 | # clip_processor = AutoProcessor.from_pretrained(CLIP_PATH, trust_remote_code=True) -------------------------------------------------------------------------------- /miniCPMv2_6_prompt_generator.py: -------------------------------------------------------------------------------- 1 | 2 | from huggingface_hub import InferenceClient 3 | from torch import nn 4 | from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM 5 | from pathlib import Path 6 | import torch 7 | import torch.amp.autocast_mode 8 | from PIL import Image 9 | import os 10 | import folder_paths 11 | 12 | from .lib.ximg import * 13 | from .lib.xmodel import * 14 | 15 | class CXH_Hg_Pipe: 16 | 17 | def __init__(self): 18 | self.text_model = None 19 | self.tokenizer =None 20 | 21 | 22 | class CXH_HG_Model_Load: 23 | 24 | def __init__(self): 25 | self.pipe = None 26 | 27 | @classmethod 28 | def INPUT_TYPES(s): 29 | return { 30 | "required": { 31 | "model": (["pzc163/MiniCPMv2_6-prompt-generator"],), 32 | } 33 | } 34 | 35 | CATEGORY = "CXH/LLM" 36 | RETURN_TYPES = ("CXH_Hg_Pipe",) 37 | RETURN_NAMES = ("pipe",) 38 | FUNCTION = "gen" 39 | 40 | def gen(self,model): 41 | 42 | self.pipe = CXH_Hg_Pipe() 43 | 44 | MODEL_PATH = download_hg_model(model,"LLM") 45 | tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True) 46 | assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, 
--------------------------------------------------------------------------------
/miniCPMv2_6_prompt_generator.py:
--------------------------------------------------------------------------------
1 | 
2 | from huggingface_hub import InferenceClient
3 | from torch import nn
4 | from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM
5 | from pathlib import Path
6 | import torch
7 | import torch.amp.autocast_mode
8 | from PIL import Image
9 | import os
10 | import folder_paths
11 | 
12 | from .lib.ximg import *
13 | from .lib.xmodel import *
14 | 
15 | class CXH_Hg_Pipe:
16 | 
17 |     def __init__(self):
18 |         self.text_model = None
19 |         self.tokenizer = None
20 | 
21 | 
22 | class CXH_HG_Model_Load:
23 | 
24 |     def __init__(self):
25 |         self.pipe = None
26 | 
27 |     @classmethod
28 |     def INPUT_TYPES(s):
29 |         return {
30 |             "required": {
31 |                 "model": (["pzc163/MiniCPMv2_6-prompt-generator"],),
32 |             }
33 |         }
34 | 
35 |     CATEGORY = "CXH/LLM"
36 |     RETURN_TYPES = ("CXH_Hg_Pipe",)
37 |     RETURN_NAMES = ("pipe",)
38 |     FUNCTION = "gen"
39 | 
40 |     def gen(self, model):
41 | 
42 |         self.pipe = CXH_Hg_Pipe()
43 | 
44 |         MODEL_PATH = download_hg_model(model, "LLM")
45 |         tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
46 |         assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTrainedTokenizerFast), f"Tokenizer is of type {type(tokenizer)}"
47 | 
48 |         text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, trust_remote_code=True)
49 |         text_model.eval()
50 | 
51 |         self.pipe.text_model = text_model
52 |         self.pipe.tokenizer = tokenizer
53 | 
54 |         return (self.pipe,)
55 | 
56 | class CXH_Min2_6_prompt_Run:
57 | 
58 |     def __init__(self):
59 |         pass
60 | 
61 |     @classmethod
62 |     def INPUT_TYPES(s):
63 |         return {
64 |             "required": {
65 |                 "pipe": ("CXH_Hg_Pipe",),
66 |                 "image": ("IMAGE",),
67 |                 "prompt": ("STRING", {"multiline": True, "default": "Provide a detailed description of the details and content contained in the image, and generate a short prompt that can be used for image generation tasks in Stable Diffusion,remind you only need respons prompt itself and no other information."},),
68 |                 "max_tokens": ("INT", {"default": 1024, "min": 10, "max": 4048, "step": 1}),
69 |                 "temperature": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}),
70 |                 "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}),
71 |             }
72 |         }
73 | 
74 |     CATEGORY = "CXH/LLM"
75 |     RETURN_TYPES = ("STRING",)
76 |     FUNCTION = "gen"
77 |     def gen(self, pipe, image, prompt, max_tokens, temperature, seed):
78 | 
79 |         image = tensor2pil(image)
80 |         question = prompt
81 |         msgs = [{'role': 'user', 'content': [image, question]}]
82 | 
83 |         ## if you want to use streaming, please make sure sampling=True and stream=True
84 |         ## the model.chat will return a generator
85 |         res = pipe.text_model.chat(
86 |             image=None,
87 |             msgs=msgs,
88 |             tokenizer=pipe.tokenizer,
89 |             sampling=False,
90 |             stream=False,
91 |             max_tokens=max_tokens,
92 |             temperature=temperature,
93 |         )
94 | 
95 |         # With sampling/stream disabled, res is a plain string; iterating it character by
96 |         # character also covers the streaming case, where chat() yields text chunks.
97 |         generated_text = ""
98 |         for new_text in res:
99 |             generated_text += new_text
100 |             print(new_text, flush=True, end='')
101 | 
102 |         return (generated_text,)
103 | 
--------------------------------------------------------------------------------
/miniCpMV3_4_chat.py:
--------------------------------------------------------------------------------
1 | 
2 | from huggingface_hub import InferenceClient
3 | from torch import nn
4 | from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM
5 | 
6 | # from vllm import LLM, SamplingParams
7 | from pathlib import Path
8 | import torch
9 | import torch.amp.autocast_mode
10 | from PIL import Image
11 | import os
12 | import folder_paths
13 | 
14 | from .lib.ximg import *
15 | from .lib.xmodel import *
16 | 
17 | device = "cuda"
18 | 
19 | class CXH_MinCP3_4B_Pipe:
20 | 
21 |     def __init__(self):
22 |         self.model = None
23 |         self.tokenizer = None
24 | 
25 | 
26 | class CXH_MinCP3_4B_Load:
27 | 
28 |     def __init__(self):
29 |         self.pipe = None
30 | 
31 |     @classmethod
32 |     def INPUT_TYPES(s):
33 |         return {
34 |             "required": {
35 |                 "model": (["openbmb/MiniCPM3-4B", "openbmb/MiniCPM3-4B-GPTQ-Int4"],),
36 |             }
37 |         }
38 | 
39 |     CATEGORY = "CXH/LLM"
40 |     RETURN_TYPES = ("CXH_MinCP3_4B_Pipe",)
41 |     RETURN_NAMES = ("pipe",)
42 |     FUNCTION = "gen"
43 | 
44 |     def gen(self, model):
45 | 
46 |         self.pipe = CXH_MinCP3_4B_Pipe()
47 | 
48 |         MODEL_PATH = download_hg_model(model, "LLM")
49 | 
50 | 
51 |         tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
52 |         model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.bfloat16, device_map=device, trust_remote_code=True)
53 | 
54 |         self.pipe.model = model
55 |         self.pipe.tokenizer = tokenizer
56 | 
57 |         return (self.pipe,)
58 | 
59 | class CXH_MinCP3_4B_Chat:
60 | 
61 |     def __init__(self):
62 |         pass
63 | 
64 |     @classmethod
65 |     def INPUT_TYPES(s):
66 |         return {
67 |             "required": {
68 |                 "pipe": ("CXH_MinCP3_4B_Pipe",),
69 |                 "prompt": ("STRING", {"multiline": True, "default": "Provide a detailed description of the details and content contained in the image, and generate a short prompt that can be used for image generation tasks in Stable Diffusion,remind you only need respons prompt itself and no other information."},),
70 |                 "top_p": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}),
71 |                 "max_tokens": ("INT", {"default": 1024, "min": 10, "max": 4048, "step": 1}),
72 |                 "temperature": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.01}),
73 |                 "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}),
74 |             }
75 |         }
76 | 
77 |     CATEGORY = "CXH/LLM"
78 |     RETURN_TYPES = ("STRING",)
79 |     FUNCTION = "gen"
80 |     def gen(self, pipe, prompt, top_p, max_tokens, temperature, seed):
81 | 
82 |         messages = [
83 |             {"role": "user", "content": prompt},
84 |         ]
85 | 
86 |         model_inputs = pipe.tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(device)
87 | 
88 |         model_outputs = pipe.model.generate(
89 |             model_inputs,
90 |             max_new_tokens=max_tokens,
91 |             do_sample=True,  # sampling must be enabled for top_p and temperature to take effect
92 |             top_p=top_p,
93 |             temperature=temperature
94 |         )
95 | 
96 |         # Strip the prompt tokens so only the newly generated continuation is decoded
97 |         output_token_ids = [
98 |             model_outputs[i][len(model_inputs[i]):] for i in range(len(model_inputs))
99 |         ]
100 | 
101 |         responses = pipe.tokenizer.batch_decode(output_token_ids, skip_special_tokens=True)[0]
102 |         # print(responses)
103 |         return (responses,)
104 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | huggingface_hub==0.24.3
2 | transformers>=4.44.2
3 | tqdm
4 | numpy
5 | surrealist
6 | boto3==1.34.86
7 | llama-cpp-python==0.2.89
8 | Pillow==10.1.0
9 | sentencepiece==0.1.99
10 | accelerate>=0.30.1
11 | bitsandbytes>=0.43.1
12 | peft>=0.9.0
13 | datamodel-code-generator>=0.26.0
14 | matplotlib
15 | pyvips
--------------------------------------------------------------------------------
/smolvlm.py:
--------------------------------------------------------------------------------
1 | 
2 | from huggingface_hub import InferenceClient
3 | from torch import nn
4 | from transformers import AutoModelForVision2Seq, CLIPImageProcessor, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM
5 | from pathlib import Path
6 | import torch
7 | import torch.amp.autocast_mode
8 | from PIL import Image
9 | import os
10 | import folder_paths
11 | import time
12 | import re
13 | 
14 | from .lib.ximg import *
15 | from .lib.xmodel import *
16 | 
17 | DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
18 | 
19 | class CXH_SmolVlm_Pipe:
20 | 
21 |     def __init__(self):
22 |         self.model = None
23 |         self.processor = None
24 | 
25 | class CXH_SmolVlm_Load:
26 |     @classmethod
27 |     def INPUT_TYPES(s):
28 |         return {
29 |             "required": {
30 |                 "model": (["HuggingFaceTB/SmolVLM-Instruct"],),
31 |             }
32 |         }
33 | 
34 |     CATEGORY = "CXH/LLM"
35 |     RETURN_TYPES = ("CXH_SmolVlm_Pipe",)
36 |     RETURN_NAMES = ("pipe",)
37 |     FUNCTION = "gen"
38 |     def gen(self, model):
39 |         self.pipe = CXH_SmolVlm_Pipe()
40 | 
41 |         MODEL_PATH = download_hg_model(model, "LLM")
42 |         print(MODEL_PATH)
43 | 
44 |         # Initialize processor and model
45 |         processor =
AutoProcessor.from_pretrained(MODEL_PATH,trust_remote_code=True) 46 | model1 = AutoModelForVision2Seq.from_pretrained( 47 | MODEL_PATH, 48 | torch_dtype=torch.bfloat16, 49 | # _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager", 50 | ).to(DEVICE) 51 | 52 | 53 | self.pipe.model = model1 54 | self.pipe.processor = processor 55 | return (self.pipe,) 56 | 57 | class CXH_SmolVlm_Run : 58 | 59 | def __init__(self): 60 | pass 61 | 62 | @classmethod 63 | def INPUT_TYPES(s): 64 | return { 65 | "required": { 66 | "pipe": ("CXH_SmolVlm_Pipe",), 67 | "image": ("IMAGE",), 68 | "prompt": ("STRING", {"multiline": True, "default": "Provide a detailed description of the details and content contained in the image, and generate a short prompt that can be used for image generation tasks in Stable Diffusion,remind you only need respons prompt itself and no other information."},), 69 | "max_tokens":("INT", {"default": 1024, "min": 10, "max": 4048, "step": 1}), 70 | "seed": ("INT", {"default": 656545, "min": 0, "max": 1000000}), 71 | } 72 | } 73 | 74 | CATEGORY = "CXH/LLM" 75 | RETURN_TYPES = ("STRING",) 76 | FUNCTION = "gen" 77 | def gen(self,pipe,image,prompt,max_tokens,seed): 78 | 79 | image = tensor2pil(image) 80 | # Create input messages 81 | messages = [ 82 | { 83 | "role": "user", 84 | "content": [ 85 | {"type": "image"}, 86 | {"type": "text", "text": prompt} 87 | ] 88 | }, 89 | ] 90 | # Prepare inputs 91 | prompt = pipe.processor.apply_chat_template(messages, add_generation_prompt=True) 92 | inputs = pipe.processor(text=prompt, images=[image], return_tensors="pt") 93 | inputs = inputs.to(DEVICE) 94 | 95 | # Generate outputs 96 | generated_ids = pipe.model.generate(**inputs, max_new_tokens=max_tokens) 97 | generated_texts = pipe.processor.batch_decode( 98 | generated_ids, 99 | skip_special_tokens=True, 100 | ) 101 | print(generated_texts[0]) 102 | pattern = re.compile(r"Assistant:\s*(.*)") 103 | match = pattern.search(generated_texts[0]) 104 | 105 | if match: 106 | number = match.group(1) 107 | return (number,) 108 | else: 109 | print("No number found.") 110 | return (generated_texts[0],) 111 | 112 | -------------------------------------------------------------------------------- /worflow/Min2.6+joy+Florence2.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 11, 3 | "last_link_id": 11, 4 | "nodes": [ 5 | { 6 | "id": 9, 7 | "type": "CXH_Min2_6_prompt_Run", 8 | "pos": [ 9 | 1177, 10 | 407 11 | ], 12 | "size": { 13 | "0": 400, 14 | "1": 200 15 | }, 16 | "flags": {}, 17 | "order": 5, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "pipe", 22 | "type": "CXH_Hg_Pipe", 23 | "link": 6 24 | }, 25 | { 26 | "name": "image", 27 | "type": "IMAGE", 28 | "link": 7 29 | } 30 | ], 31 | "outputs": [ 32 | { 33 | "name": "STRING", 34 | "type": "STRING", 35 | "links": [ 36 | 10 37 | ], 38 | "shape": 3, 39 | "slot_index": 0 40 | } 41 | ], 42 | "properties": { 43 | "Node name for S&R": "CXH_Min2_6_prompt_Run" 44 | }, 45 | "widgets_values": [ 46 | "A descriptive caption for this image ", 47 | 2048, 48 | 0.7 49 | ], 50 | "color": "#1b4669", 51 | "bgcolor": "#29699c" 52 | }, 53 | { 54 | "id": 4, 55 | "type": "Joy_caption", 56 | "pos": [ 57 | 1195, 58 | 897 59 | ], 60 | "size": { 61 | "0": 400, 62 | "1": 200 63 | }, 64 | "flags": {}, 65 | "order": 4, 66 | "mode": 0, 67 | "inputs": [ 68 | { 69 | "name": "joy_pipeline", 70 | "type": "JoyPipeline", 71 | "link": 1 72 | }, 73 | { 74 | "name": "image", 75 | "type": "IMAGE", 76 | "link": 2 77 | } 78 
| ], 79 | "outputs": [ 80 | { 81 | "name": "STRING", 82 | "type": "STRING", 83 | "links": [ 84 | 3 85 | ], 86 | "slot_index": 0, 87 | "shape": 3 88 | } 89 | ], 90 | "properties": { 91 | "Node name for S&R": "Joy_caption" 92 | }, 93 | "widgets_values": [ 94 | "A descriptive caption for this image ", 95 | 2048, 96 | 0.7000000000000001, 97 | false 98 | ], 99 | "color": "#1b4669", 100 | "bgcolor": "#29699c" 101 | }, 102 | { 103 | "id": 8, 104 | "type": "CXH_Florence2Run", 105 | "pos": [ 106 | 1210, 107 | 1417 108 | ], 109 | "size": { 110 | "0": 400, 111 | "1": 304 112 | }, 113 | "flags": {}, 114 | "order": 6, 115 | "mode": 0, 116 | "inputs": [ 117 | { 118 | "name": "image", 119 | "type": "IMAGE", 120 | "link": 8 121 | }, 122 | { 123 | "name": "florence2_model", 124 | "type": "FL2MODEL", 125 | "link": 9 126 | } 127 | ], 128 | "outputs": [ 129 | { 130 | "name": "image", 131 | "type": "IMAGE", 132 | "links": null, 133 | "shape": 3, 134 | "slot_index": 0 135 | }, 136 | { 137 | "name": "mask", 138 | "type": "MASK", 139 | "links": null, 140 | "shape": 3 141 | }, 142 | { 143 | "name": "caption", 144 | "type": "STRING", 145 | "links": [ 146 | 11 147 | ], 148 | "shape": 3, 149 | "slot_index": 2 150 | }, 151 | { 152 | "name": "data", 153 | "type": "JSON", 154 | "links": null, 155 | "shape": 3 156 | } 157 | ], 158 | "properties": { 159 | "Node name for S&R": "CXH_Florence2Run" 160 | }, 161 | "widgets_values": [ 162 | "", 163 | "more_detailed_caption", 164 | true, 165 | false, 166 | 2048, 167 | 3, 168 | true, 169 | "" 170 | ], 171 | "color": "#1b4669", 172 | "bgcolor": "#29699c" 173 | }, 174 | { 175 | "id": 7, 176 | "type": "CXH_HG_Model_Load", 177 | "pos": [ 178 | 1187, 179 | 286 180 | ], 181 | "size": { 182 | "0": 315, 183 | "1": 58 184 | }, 185 | "flags": {}, 186 | "order": 0, 187 | "mode": 0, 188 | "outputs": [ 189 | { 190 | "name": "pipe", 191 | "type": "CXH_Hg_Pipe", 192 | "links": [ 193 | 6 194 | ], 195 | "shape": 3, 196 | "slot_index": 0 197 | } 198 | ], 199 | "properties": { 200 | "Node name for S&R": "CXH_HG_Model_Load" 201 | }, 202 | "widgets_values": [ 203 | "pzc163/MiniCPMv2_6-prompt-generator" 204 | ], 205 | "color": "#1b4669", 206 | "bgcolor": "#29699c" 207 | }, 208 | { 209 | "id": 10, 210 | "type": "CXH_DownloadAndLoadFlorence2Model", 211 | "pos": [ 212 | 1209, 213 | 1257 214 | ], 215 | "size": { 216 | "0": 315, 217 | "1": 106 218 | }, 219 | "flags": {}, 220 | "order": 1, 221 | "mode": 0, 222 | "outputs": [ 223 | { 224 | "name": "florence2_model", 225 | "type": "FL2MODEL", 226 | "links": [ 227 | 9 228 | ], 229 | "shape": 3, 230 | "slot_index": 0 231 | } 232 | ], 233 | "properties": { 234 | "Node name for S&R": "CXH_DownloadAndLoadFlorence2Model" 235 | }, 236 | "widgets_values": [ 237 | "thwri/CogFlorence-2.2-Large", 238 | "fp16", 239 | "sdpa" 240 | ], 241 | "color": "#1b4669", 242 | "bgcolor": "#29699c" 243 | }, 244 | { 245 | "id": 3, 246 | "type": "Joy_caption_load", 247 | "pos": [ 248 | 1210, 249 | 791 250 | ], 251 | "size": { 252 | "0": 315, 253 | "1": 58 254 | }, 255 | "flags": {}, 256 | "order": 2, 257 | "mode": 0, 258 | "outputs": [ 259 | { 260 | "name": "JoyPipeline", 261 | "type": "JoyPipeline", 262 | "links": [ 263 | 1 264 | ], 265 | "slot_index": 0, 266 | "shape": 3 267 | } 268 | ], 269 | "properties": { 270 | "Node name for S&R": "Joy_caption_load" 271 | }, 272 | "widgets_values": [ 273 | "unsloth/Meta-Llama-3.1-8B-bnb-4bit" 274 | ], 275 | "color": "#1b4669", 276 | "bgcolor": "#29699c" 277 | }, 278 | { 279 | "id": 1, 280 | "type": "LoadImage", 281 | "pos": [ 282 | 500, 283 | 673 
284 | ], 285 | "size": [ 286 | 558.8251844824922, 287 | 765.5085685298109 288 | ], 289 | "flags": {}, 290 | "order": 3, 291 | "mode": 0, 292 | "outputs": [ 293 | { 294 | "name": "IMAGE", 295 | "type": "IMAGE", 296 | "links": [ 297 | 2, 298 | 7, 299 | 8 300 | ], 301 | "slot_index": 0, 302 | "shape": 3 303 | }, 304 | { 305 | "name": "MASK", 306 | "type": "MASK", 307 | "links": null, 308 | "shape": 3 309 | } 310 | ], 311 | "properties": { 312 | "Node name for S&R": "LoadImage" 313 | }, 314 | "widgets_values": [ 315 | "26124763.jpg", 316 | "image" 317 | ] 318 | }, 319 | { 320 | "id": 2, 321 | "type": "easy showAnything", 322 | "pos": [ 323 | 1687, 324 | 401 325 | ], 326 | "size": { 327 | "0": 390.0909423828125, 328 | "1": 252.36358642578125 329 | }, 330 | "flags": {}, 331 | "order": 8, 332 | "mode": 0, 333 | "inputs": [ 334 | { 335 | "name": "anything", 336 | "type": "*", 337 | "link": 10 338 | } 339 | ], 340 | "properties": { 341 | "Node name for S&R": "easy showAnything" 342 | }, 343 | "widgets_values": [ 344 | "The image presents a striking digital illustration of a knight in full armor, standing resolute against a backdrop of a tumultuous sky. The knight, clad in a dark, ornate suit of armor, wields a sword that glows with an otherworldly light, suggesting it may be imbued with magical properties. The armor is intricately designed, featuring gold accents that catch the eye amidst the darker tones. The knight's helmet is adorned with a plume, adding to the regal appearance. The background is a dramatic canvas of dark clouds, hinting at an impending storm, which contrasts with the fiery glow emanating from the sword, creating a sense of tension and anticipation. The overall composition of the image suggests a narrative of conflict and heroism, with the knight poised to face whatever challenges lie ahead." 345 | ] 346 | }, 347 | { 348 | "id": 5, 349 | "type": "easy showAnything", 350 | "pos": [ 351 | 1690, 352 | 854 353 | ], 354 | "size": { 355 | "0": 462.2198791503906, 356 | "1": 255.30990600585938 357 | }, 358 | "flags": {}, 359 | "order": 7, 360 | "mode": 0, 361 | "inputs": [ 362 | { 363 | "name": "anything", 364 | "type": "*", 365 | "link": 3 366 | } 367 | ], 368 | "properties": { 369 | "Node name for S&R": "easy showAnything" 370 | }, 371 | "widgets_values": [ 372 | "1. This is a digital illustration depicting a majestic, armored warrior standing in a dramatic, stormy landscape. The warrior is a tall, imposing figure, clad in dark, metallic armor with intricate designs and sharp spikes. His helmet resembles a fearsome beast's head, with long, sharp horns curving backward. He wears a flowing cape that billows in the wind, adding a sense of movement and strength to his imposing stance. His eyes are hidden behind a visor, giving him an aura of mystery and intensity. \n\nHis left arm is sheathed in a long sword with a glowing, fiery blade, and his right hand grasps a similar sword with flames licking along its edge. The background is a tumultuous sky filled with dark clouds and flashes of lightning, creating a sense of impending danger and chaos. The ground is rugged and rocky, with small explosions of fiery orange gas rising from the surface, enhancing the sense of volatility and power. \n\nThe overall mood of the image is dark and foreboding, with a sense of otherworldly majesty. The artwork utilizes a detailed, realistic style, with a focus on the textures of the armor and the dynamic, swirling motion of the cape and the fiery blades." 
373 | ] 374 | }, 375 | { 376 | "id": 11, 377 | "type": "easy showAnything", 378 | "pos": [ 379 | 1640, 380 | 1455 381 | ], 382 | "size": { 383 | "0": 462.2198791503906, 384 | "1": 255.30990600585938 385 | }, 386 | "flags": {}, 387 | "order": 9, 388 | "mode": 0, 389 | "inputs": [ 390 | { 391 | "name": "anything", 392 | "type": "*", 393 | "link": 11 394 | } 395 | ], 396 | "properties": { 397 | "Node name for S&R": "easy showAnything" 398 | }, 399 | "widgets_values": [ 400 | "A dramatic portrayal of a dark, armored warrior in a dynamic pose, wielding a long, fiery sword. The warrior wears ornate, dark armor with intricate designs and a helmet featuring a crown-like visor. The background is a stormy sky filled with dark clouds, and the ground is covered in fiery orange and yellow hues, indicating either either dawn or dusk. The overall color palette is dominated by dark blues, blacks, and fiery oranges, creating a sense of foreboding and intensity." 401 | ] 402 | } 403 | ], 404 | "links": [ 405 | [ 406 | 1, 407 | 3, 408 | 0, 409 | 4, 410 | 0, 411 | "JoyPipeline" 412 | ], 413 | [ 414 | 2, 415 | 1, 416 | 0, 417 | 4, 418 | 1, 419 | "IMAGE" 420 | ], 421 | [ 422 | 3, 423 | 4, 424 | 0, 425 | 5, 426 | 0, 427 | "*" 428 | ], 429 | [ 430 | 6, 431 | 7, 432 | 0, 433 | 9, 434 | 0, 435 | "CXH_Hg_Pipe" 436 | ], 437 | [ 438 | 7, 439 | 1, 440 | 0, 441 | 9, 442 | 1, 443 | "IMAGE" 444 | ], 445 | [ 446 | 8, 447 | 1, 448 | 0, 449 | 8, 450 | 0, 451 | "IMAGE" 452 | ], 453 | [ 454 | 9, 455 | 10, 456 | 0, 457 | 8, 458 | 1, 459 | "FL2MODEL" 460 | ], 461 | [ 462 | 10, 463 | 9, 464 | 0, 465 | 2, 466 | 0, 467 | "*" 468 | ], 469 | [ 470 | 11, 471 | 8, 472 | 2, 473 | 11, 474 | 0, 475 | "*" 476 | ] 477 | ], 478 | "groups": [ 479 | { 480 | "title": "Min2_6", 481 | "bounding": [ 482 | 1156, 483 | 167, 484 | 928, 485 | 501 486 | ], 487 | "color": "#3f789e", 488 | "font_size": 24, 489 | "locked": false 490 | }, 491 | { 492 | "title": "Joy_caption", 493 | "bounding": [ 494 | 1154, 495 | 699, 496 | 1032, 497 | 449 498 | ], 499 | "color": "#3f789e", 500 | "font_size": 24, 501 | "locked": false 502 | }, 503 | { 504 | "title": "florence2", 505 | "bounding": [ 506 | 1148, 507 | 1164, 508 | 1041, 509 | 586 510 | ], 511 | "color": "#3f789e", 512 | "font_size": 24, 513 | "locked": false 514 | } 515 | ], 516 | "config": {}, 517 | "extra": { 518 | "ds": { 519 | "scale": 0.5131581182307073, 520 | "offset": [ 521 | -133.45930116147088, 522 | -137.71244198828424 523 | ] 524 | } 525 | }, 526 | "version": 0.4 527 | } -------------------------------------------------------------------------------- /worflow/MinCPM3_4B.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 22, 3 | "last_link_id": 23, 4 | "nodes": [ 5 | { 6 | "id": 22, 7 | "type": "easy showAnything", 8 | "pos": { 9 | "0": 1101, 10 | "1": 666, 11 | "2": 0, 12 | "3": 0, 13 | "4": 0, 14 | "5": 0, 15 | "6": 0, 16 | "7": 0, 17 | "8": 0, 18 | "9": 0 19 | }, 20 | "size": { 21 | "0": 419.568115234375, 22 | "1": 274.0469055175781 23 | }, 24 | "flags": {}, 25 | "order": 2, 26 | "mode": 0, 27 | "inputs": [ 28 | { 29 | "name": "anything", 30 | "type": "*", 31 | "link": 22 32 | } 33 | ], 34 | "outputs": [], 35 | "properties": { 36 | "Node name for S&R": "easy showAnything" 37 | }, 38 | "widgets_values": [ 39 | "中国位于亚洲的东部,太平洋的西岸。其领土范围广阔,北至黑龙江省漠河县北端的黑龙江主航道中心线(53°N),南至海南省南沙群岛的曾母暗沙(4°N),东至黑龙江省黑龙江与乌苏里江主航道中心线的汇合处(135°E),西至新疆维吾尔自治区帕米尔高原(73°E)。中国陆地总面积约960万平方千米,东部和南部大陆海岸线1.8万千米,海域总面积约473万平方千米。" 40 | ] 41 | }, 42 | { 43 | "id": 20, 44 | "type": 
"CXH_MinCP3_4B_Load", 45 | "pos": { 46 | "0": 301, 47 | "1": 685, 48 | "2": 0, 49 | "3": 0, 50 | "4": 0, 51 | "5": 0, 52 | "6": 0, 53 | "7": 0, 54 | "8": 0, 55 | "9": 0 56 | }, 57 | "size": { 58 | "0": 315, 59 | "1": 58 60 | }, 61 | "flags": {}, 62 | "order": 0, 63 | "mode": 0, 64 | "inputs": [], 65 | "outputs": [ 66 | { 67 | "name": "pipe", 68 | "type": "CXH_MinCP3_4B_Pipe", 69 | "links": [ 70 | 21 71 | ], 72 | "shape": 3, 73 | "slot_index": 0 74 | } 75 | ], 76 | "properties": { 77 | "Node name for S&R": "CXH_MinCP3_4B_Load" 78 | }, 79 | "widgets_values": [ 80 | "openbmb/MiniCPM3-4B" 81 | ], 82 | "color": "#1b4669", 83 | "bgcolor": "#29699c" 84 | }, 85 | { 86 | "id": 21, 87 | "type": "CXH_MinCP3_4B_Chat", 88 | "pos": { 89 | "0": 673, 90 | "1": 681, 91 | "2": 0, 92 | "3": 0, 93 | "4": 0, 94 | "5": 0, 95 | "6": 0, 96 | "7": 0, 97 | "8": 0, 98 | "9": 0 99 | }, 100 | "size": { 101 | "0": 400, 102 | "1": 200 103 | }, 104 | "flags": {}, 105 | "order": 1, 106 | "mode": 0, 107 | "inputs": [ 108 | { 109 | "name": "pipe", 110 | "type": "CXH_MinCP3_4B_Pipe", 111 | "link": 21 112 | } 113 | ], 114 | "outputs": [ 115 | { 116 | "name": "STRING", 117 | "type": "STRING", 118 | "links": [ 119 | 22 120 | ], 121 | "shape": 3, 122 | "slot_index": 0 123 | } 124 | ], 125 | "properties": { 126 | "Node name for S&R": "CXH_MinCP3_4B_Chat" 127 | }, 128 | "widgets_values": [ 129 | "中国在哪里?", 130 | 0.7, 131 | 1024, 132 | 0.7 133 | ], 134 | "color": "#1b4669", 135 | "bgcolor": "#29699c" 136 | } 137 | ], 138 | "links": [ 139 | [ 140 | 21, 141 | 20, 142 | 0, 143 | 21, 144 | 0, 145 | "CXH_MinCP3_4B_Pipe" 146 | ], 147 | [ 148 | 22, 149 | 21, 150 | 0, 151 | 22, 152 | 0, 153 | "*" 154 | ] 155 | ], 156 | "groups": [], 157 | "config": {}, 158 | "extra": { 159 | "ds": { 160 | "scale": 0.7067058488964866, 161 | "offset": [ 162 | 262.3736311173096, 163 | -229.61227808628627 164 | ] 165 | } 166 | }, 167 | "version": 0.4 168 | } -------------------------------------------------------------------------------- /worflow/florence_PromptGen.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 9, 3 | "last_link_id": 9, 4 | "nodes": [ 5 | { 6 | "id": 5, 7 | "type": "CXH_Florence2Run", 8 | "pos": { 9 | "0": 650, 10 | "1": 831, 11 | "2": 0, 12 | "3": 0, 13 | "4": 0, 14 | "5": 0, 15 | "6": 0, 16 | "7": 0, 17 | "8": 0, 18 | "9": 0 19 | }, 20 | "size": { 21 | "0": 400, 22 | "1": 352 23 | }, 24 | "flags": {}, 25 | "order": 2, 26 | "mode": 0, 27 | "inputs": [ 28 | { 29 | "name": "image", 30 | "type": "IMAGE", 31 | "link": 4 32 | }, 33 | { 34 | "name": "florence2_model", 35 | "type": "FL2MODEL", 36 | "link": 5 37 | } 38 | ], 39 | "outputs": [ 40 | { 41 | "name": "image", 42 | "type": "IMAGE", 43 | "links": null, 44 | "shape": 3 45 | }, 46 | { 47 | "name": "mask", 48 | "type": "MASK", 49 | "links": null, 50 | "shape": 3 51 | }, 52 | { 53 | "name": "caption", 54 | "type": "STRING", 55 | "links": [ 56 | 6 57 | ], 58 | "slot_index": 2, 59 | "shape": 3 60 | }, 61 | { 62 | "name": "data", 63 | "type": "JSON", 64 | "links": null, 65 | "shape": 3 66 | } 67 | ], 68 | "properties": { 69 | "Node name for S&R": "CXH_Florence2Run" 70 | }, 71 | "widgets_values": [ 72 | "", 73 | "mixed_caption(PromptGen 1.5)", 74 | true, 75 | false, 76 | 1024, 77 | 3, 78 | true, 79 | "", 80 | 1942, 81 | "randomize" 82 | ], 83 | "color": "#1b4669" 84 | }, 85 | { 86 | "id": 6, 87 | "type": "CXH_DownloadAndLoadFlorence2Model", 88 | "pos": { 89 | "0": 628, 90 | "1": 671, 91 | "2": 0, 92 | "3": 0, 93 | "4": 0, 94 | "5": 0, 
95 | "6": 0, 96 | "7": 0, 97 | "8": 0, 98 | "9": 0 99 | }, 100 | "size": { 101 | "0": 415.8000183105469, 102 | "1": 106 103 | }, 104 | "flags": {}, 105 | "order": 0, 106 | "mode": 0, 107 | "inputs": [], 108 | "outputs": [ 109 | { 110 | "name": "florence2_model", 111 | "type": "FL2MODEL", 112 | "links": [ 113 | 5, 114 | 8 115 | ], 116 | "shape": 3, 117 | "slot_index": 0 118 | } 119 | ], 120 | "properties": { 121 | "Node name for S&R": "CXH_DownloadAndLoadFlorence2Model" 122 | }, 123 | "widgets_values": [ 124 | "MiaoshouAI/Florence-2-large-PromptGen-v1.5", 125 | "fp16", 126 | "sdpa" 127 | ], 128 | "color": "#1b4669" 129 | }, 130 | { 131 | "id": 9, 132 | "type": "easy showAnything", 133 | "pos": { 134 | "0": 1125, 135 | "1": 1279, 136 | "2": 0, 137 | "3": 0, 138 | "4": 0, 139 | "5": 0, 140 | "6": 0, 141 | "7": 0, 142 | "8": 0, 143 | "9": 0 144 | }, 145 | "size": { 146 | "0": 402.45989990234375, 147 | "1": 164.83221435546875 148 | }, 149 | "flags": {}, 150 | "order": 5, 151 | "mode": 0, 152 | "inputs": [ 153 | { 154 | "name": "anything", 155 | "type": "*", 156 | "link": 9 157 | } 158 | ], 159 | "outputs": [], 160 | "properties": { 161 | "Node name for S&R": "easy showAnything" 162 | }, 163 | "widgets_values": [ 164 | "1girl, solo, long hair, looking at viewer, skirt, red hair, thighhighs, long sleeves, closed mouth, standing, full body, shoes, pleated skirt, socks, indoors, miniskirt, hood, striped, white footwear, hoodie, crossed arms, table, white socks, sneakers, tennis ball, rack" 165 | ] 166 | }, 167 | { 168 | "id": 8, 169 | "type": "CXH_Florence2Run", 170 | "pos": { 171 | "0": 660, 172 | "1": 1237, 173 | "2": 0, 174 | "3": 0, 175 | "4": 0, 176 | "5": 0, 177 | "6": 0, 178 | "7": 0, 179 | "8": 0, 180 | "9": 0 181 | }, 182 | "size": { 183 | "0": 400, 184 | "1": 352 185 | }, 186 | "flags": {}, 187 | "order": 3, 188 | "mode": 0, 189 | "inputs": [ 190 | { 191 | "name": "image", 192 | "type": "IMAGE", 193 | "link": 7 194 | }, 195 | { 196 | "name": "florence2_model", 197 | "type": "FL2MODEL", 198 | "link": 8 199 | } 200 | ], 201 | "outputs": [ 202 | { 203 | "name": "image", 204 | "type": "IMAGE", 205 | "links": null, 206 | "shape": 3 207 | }, 208 | { 209 | "name": "mask", 210 | "type": "MASK", 211 | "links": null, 212 | "shape": 3 213 | }, 214 | { 215 | "name": "caption", 216 | "type": "STRING", 217 | "links": [ 218 | 9 219 | ], 220 | "slot_index": 2, 221 | "shape": 3 222 | }, 223 | { 224 | "name": "data", 225 | "type": "JSON", 226 | "links": null, 227 | "shape": 3 228 | } 229 | ], 230 | "properties": { 231 | "Node name for S&R": "CXH_Florence2Run" 232 | }, 233 | "widgets_values": [ 234 | "", 235 | "generate_tags(PromptGen 1.5)", 236 | true, 237 | false, 238 | 1024, 239 | 3, 240 | true, 241 | "", 242 | 470, 243 | "randomize" 244 | ], 245 | "color": "#1b4669" 246 | }, 247 | { 248 | "id": 3, 249 | "type": "LoadImage", 250 | "pos": { 251 | "0": 180, 252 | "1": 1064, 253 | "2": 0, 254 | "3": 0, 255 | "4": 0, 256 | "5": 0, 257 | "6": 0, 258 | "7": 0, 259 | "8": 0, 260 | "9": 0 261 | }, 262 | "size": { 263 | "0": 315, 264 | "1": 314 265 | }, 266 | "flags": {}, 267 | "order": 1, 268 | "mode": 0, 269 | "inputs": [], 270 | "outputs": [ 271 | { 272 | "name": "IMAGE", 273 | "type": "IMAGE", 274 | "links": [ 275 | 4, 276 | 7 277 | ], 278 | "slot_index": 0, 279 | "shape": 3 280 | }, 281 | { 282 | "name": "MASK", 283 | "type": "MASK", 284 | "links": null, 285 | "slot_index": 1, 286 | "shape": 3 287 | } 288 | ], 289 | "properties": { 290 | "Node name for S&R": "LoadImage" 291 | }, 292 | "widgets_values": [ 293 
| "风格趋势_68550099(1).jpg", 294 | "image" 295 | ] 296 | }, 297 | { 298 | "id": 7, 299 | "type": "easy showAnything", 300 | "pos": { 301 | "0": 1113, 302 | "1": 876, 303 | "2": 0, 304 | "3": 0, 305 | "4": 0, 306 | "5": 0, 307 | "6": 0, 308 | "7": 0, 309 | "8": 0, 310 | "9": 0 311 | }, 312 | "size": [ 313 | 496.79671515656423, 314 | 313.44720309527156 315 | ], 316 | "flags": {}, 317 | "order": 4, 318 | "mode": 0, 319 | "inputs": [ 320 | { 321 | "name": "anything", 322 | "type": "*", 323 | "link": 6 324 | } 325 | ], 326 | "outputs": [], 327 | "properties": { 328 | "Node name for S&R": "easy showAnything" 329 | }, 330 | "widgets_values": [ 331 | "a high-resolution photograph featuring a young woman with fair skin and long, wavy red hair, standing against a dark blue wall, she has a slender physique with a slender build and fair skin, she is wearing a navy blue hoodie with a white, diamond-patterned design, a matching navy blue mini skirt, and white knee-high socks with yellow stripes, her outfit is accessorized with white sneakers and a white headband with a black and white striped pattern, the background consists of a minimalist, modern setting with a carpeted floor in various shades of green and beige, scattered around her are several white tennis balls, to her left, there is a green cabinet with a rattan-like texture, and to her right, a white wire basket filled with tennis balls is placed on a metal stand, to the right, on the floor is a vintage radio, adding a retro touch to the scene, the overall color palette is dominated by dark blue and green tones, creating a visually striking contrast, the lighting is soft and natural, enhancing the textures and details of the objects and the woman's outfit, the photograph is likely taken during the day, as indicated by the high level of detail and the softness of the carpet and the smoothness of her skin\n\n \\(polo\\), 1girl, solo, long hair, looking at viewer, skirt, brown hair" 332 | ] 333 | } 334 | ], 335 | "links": [ 336 | [ 337 | 4, 338 | 3, 339 | 0, 340 | 5, 341 | 0, 342 | "IMAGE" 343 | ], 344 | [ 345 | 5, 346 | 6, 347 | 0, 348 | 5, 349 | 1, 350 | "FL2MODEL" 351 | ], 352 | [ 353 | 6, 354 | 5, 355 | 2, 356 | 7, 357 | 0, 358 | "*" 359 | ], 360 | [ 361 | 7, 362 | 3, 363 | 0, 364 | 8, 365 | 0, 366 | "IMAGE" 367 | ], 368 | [ 369 | 8, 370 | 6, 371 | 0, 372 | 8, 373 | 1, 374 | "FL2MODEL" 375 | ], 376 | [ 377 | 9, 378 | 8, 379 | 2, 380 | 9, 381 | 0, 382 | "*" 383 | ] 384 | ], 385 | "groups": [], 386 | "config": {}, 387 | "extra": { 388 | "ds": { 389 | "scale": 0.9090909090909091, 390 | "offset": [ 391 | -461.8324362747284, 392 | -716.5616485145835 393 | ] 394 | } 395 | }, 396 | "version": 0.4 397 | } -------------------------------------------------------------------------------- /worflow/florence_PromptGen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/florence_PromptGen.png -------------------------------------------------------------------------------- /worflow/flux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/flux.png -------------------------------------------------------------------------------- /worflow/joy.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 5, 3 | 
"last_link_id": 4, 4 | "nodes": [ 5 | { 6 | "id": 2, 7 | "type": "Joy_caption", 8 | "pos": [ 9 | 828, 10 | 498 11 | ], 12 | "size": { 13 | "0": 400, 14 | "1": 200 15 | }, 16 | "flags": {}, 17 | "order": 2, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "joy_pipeline", 22 | "type": "JoyPipeline", 23 | "link": 4 24 | }, 25 | { 26 | "name": "image", 27 | "type": "IMAGE", 28 | "link": 2, 29 | "slot_index": 1 30 | } 31 | ], 32 | "outputs": [ 33 | { 34 | "name": "STRING", 35 | "type": "STRING", 36 | "links": [ 37 | 3 38 | ], 39 | "shape": 3, 40 | "slot_index": 0 41 | } 42 | ], 43 | "properties": { 44 | "Node name for S&R": "Joy_caption" 45 | }, 46 | "widgets_values": [ 47 | "A descriptive caption for this image", 48 | 300, 49 | 0.5 50 | ] 51 | }, 52 | { 53 | "id": 5, 54 | "type": "Joy_caption_load", 55 | "pos": [ 56 | 454, 57 | 446 58 | ], 59 | "size": { 60 | "0": 315, 61 | "1": 58 62 | }, 63 | "flags": {}, 64 | "order": 0, 65 | "mode": 0, 66 | "outputs": [ 67 | { 68 | "name": "JoyPipeline", 69 | "type": "JoyPipeline", 70 | "links": [ 71 | 4 72 | ], 73 | "shape": 3, 74 | "slot_index": 0 75 | } 76 | ], 77 | "properties": { 78 | "Node name for S&R": "Joy_caption_load" 79 | }, 80 | "widgets_values": [ 81 | "meta-llama/Meta-Llama-3.1-8B" 82 | ] 83 | }, 84 | { 85 | "id": 4, 86 | "type": "easy showAnything", 87 | "pos": [ 88 | 1255, 89 | 502 90 | ], 91 | "size": { 92 | "0": 356.4357604980469, 93 | "1": 250.48460388183594 94 | }, 95 | "flags": {}, 96 | "order": 3, 97 | "mode": 0, 98 | "inputs": [ 99 | { 100 | "name": "anything", 101 | "type": "*", 102 | "link": 3 103 | } 104 | ], 105 | "properties": { 106 | "Node name for S&R": "easy showAnything" 107 | }, 108 | "widgets_values": [ 109 | "of a young girl standing on a lush green lawn, surrounded by tall trees with budding leaves, under a cloudy sky. The girl, approximately 3-5 years old, has light blonde hair and a cheerful expression, smiling with her teeth showing. She wears a white, short-sleeved dress adorned with colorful floral appliqués in shades of pink, yellow, and orange, and a matching white hat with a large pink flower on the side. Her dress has a full skirt and is knee-length, with delicate lace trim along the hem. She also wears white tights and white shoes, enhancing the purity of her attire. In her hands, she carries a bouquet of fresh flowers, including yellow, pink, and white varieties, held close to her chest. The background is softly blurred, emphasizing the girl as the focal point, with the trees and sky providing a serene, natural setting. The overall mood is joyful and whimsical, capturing the innocence and beauty of childhood." 
110 | ] 111 | }, 112 | { 113 | "id": 3, 114 | "type": "LoadImage", 115 | "pos": [ 116 | 198, 117 | 577 118 | ], 119 | "size": [ 120 | 570.2863188912281, 121 | 474.07759457475504 122 | ], 123 | "flags": {}, 124 | "order": 1, 125 | "mode": 0, 126 | "outputs": [ 127 | { 128 | "name": "IMAGE", 129 | "type": "IMAGE", 130 | "links": [ 131 | 2 132 | ], 133 | "shape": 3 134 | }, 135 | { 136 | "name": "MASK", 137 | "type": "MASK", 138 | "links": null, 139 | "shape": 3 140 | } 141 | ], 142 | "properties": { 143 | "Node name for S&R": "LoadImage" 144 | }, 145 | "widgets_values": [ 146 | "balabala_schnell.png", 147 | "image" 148 | ] 149 | } 150 | ], 151 | "links": [ 152 | [ 153 | 2, 154 | 3, 155 | 0, 156 | 2, 157 | 1, 158 | "IMAGE" 159 | ], 160 | [ 161 | 3, 162 | 2, 163 | 0, 164 | 4, 165 | 0, 166 | "*" 167 | ], 168 | [ 169 | 4, 170 | 5, 171 | 0, 172 | 2, 173 | 0, 174 | "JoyPipeline" 175 | ] 176 | ], 177 | "groups": [], 178 | "config": {}, 179 | "extra": { 180 | "ds": { 181 | "scale": 0.8769226950000014, 182 | "offset": [ 183 | -5.531487925200333, 184 | -181.09476693793715 185 | ] 186 | } 187 | }, 188 | "version": 0.4 189 | } -------------------------------------------------------------------------------- /worflow/joy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/joy.png -------------------------------------------------------------------------------- /worflow/joy_4b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/joy_4b.png -------------------------------------------------------------------------------- /worflow/joy批量打标.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/joy批量打标.png -------------------------------------------------------------------------------- /worflow/workflow_min2.6classifiy_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/workflow_min2.6classifiy_.png -------------------------------------------------------------------------------- /worflow/二级文件夹批量打标.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/二级文件夹批量打标.png -------------------------------------------------------------------------------- /worflow/批量打标(Batch marking).json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 72, 3 | "last_link_id": 91, 4 | "nodes": [ 5 | { 6 | "id": 68, 7 | "type": "Joy_caption_load", 8 | "pos": [ 9 | 1401, 10 | 373 11 | ], 12 | "size": { 13 | "0": 315, 14 | "1": 58 15 | }, 16 | "flags": {}, 17 | "order": 0, 18 | "mode": 0, 19 | "outputs": [ 20 | { 21 | "name": "JoyPipeline", 22 | "type": "JoyPipeline", 23 | "links": [ 24 | 84 25 | ], 26 | "slot_index": 0, 27 | "shape": 3 28 | } 29 | ], 30 | "properties": { 31 | "Node name for S&R": "Joy_caption_load" 32 | }, 33 | "widgets_values": [ 34 | "unsloth/Meta-Llama-3.1-8B-bnb-4bit" 35 | ], 36 | "color": "#1b4669", 37 | "bgcolor": "#29699c" 38 | }, 
39 | { 40 | "id": 60, 41 | "type": "LayerUtility: String", 42 | "pos": [ 43 | 1836, 44 | 377 45 | ], 46 | "size": { 47 | "0": 315, 48 | "1": 58 49 | }, 50 | "flags": {}, 51 | "order": 1, 52 | "mode": 0, 53 | "outputs": [ 54 | { 55 | "name": "string", 56 | "type": "STRING", 57 | "links": [ 58 | 75 59 | ], 60 | "slot_index": 0, 61 | "shape": 3 62 | } 63 | ], 64 | "properties": { 65 | "Node name for S&R": "LayerUtility: String" 66 | }, 67 | "widgets_values": [ 68 | "TriggerWord" 69 | ] 70 | }, 71 | { 72 | "id": 67, 73 | "type": "Joy_caption", 74 | "pos": [ 75 | 1377, 76 | 488 77 | ], 78 | "size": { 79 | "0": 400, 80 | "1": 200 81 | }, 82 | "flags": {}, 83 | "order": 7, 84 | "mode": 0, 85 | "inputs": [ 86 | { 87 | "name": "joy_pipeline", 88 | "type": "JoyPipeline", 89 | "link": 84 90 | }, 91 | { 92 | "name": "image", 93 | "type": "IMAGE", 94 | "link": 85 95 | } 96 | ], 97 | "outputs": [ 98 | { 99 | "name": "STRING", 100 | "type": "STRING", 101 | "links": [ 102 | 89 103 | ], 104 | "slot_index": 0, 105 | "shape": 3 106 | } 107 | ], 108 | "properties": { 109 | "Node name for S&R": "Joy_caption" 110 | }, 111 | "widgets_values": [ 112 | "A descriptive caption for this image", 113 | 300, 114 | 0.5, 115 | true 116 | ], 117 | "color": "#1b4669", 118 | "bgcolor": "#29699c" 119 | }, 120 | { 121 | "id": 48, 122 | "type": "LoadImageListFromDir //Inspire", 123 | "pos": [ 124 | 596, 125 | 537 126 | ], 127 | "size": { 128 | "0": 315, 129 | "1": 170 130 | }, 131 | "flags": {}, 132 | "order": 3, 133 | "mode": 0, 134 | "inputs": [ 135 | { 136 | "name": "directory", 137 | "type": "STRING", 138 | "link": 70, 139 | "widget": { 140 | "name": "directory" 141 | } 142 | } 143 | ], 144 | "outputs": [ 145 | { 146 | "name": "IMAGE", 147 | "type": "IMAGE", 148 | "links": [ 149 | 90 150 | ], 151 | "slot_index": 0, 152 | "shape": 6 153 | }, 154 | { 155 | "name": "MASK", 156 | "type": "MASK", 157 | "links": null, 158 | "shape": 6 159 | }, 160 | { 161 | "name": "FILE PATH", 162 | "type": "STRING", 163 | "links": null, 164 | "shape": 6 165 | } 166 | ], 167 | "properties": { 168 | "Node name for S&R": "LoadImageListFromDir //Inspire" 169 | }, 170 | "widgets_values": [ 171 | "E:\\tmp\\test", 172 | 0, 173 | 0, 174 | false 175 | ] 176 | }, 177 | { 178 | "id": 57, 179 | "type": "LayerUtility: String", 180 | "pos": [ 181 | 597, 182 | 430 183 | ], 184 | "size": { 185 | "0": 315, 186 | "1": 58 187 | }, 188 | "flags": {}, 189 | "order": 2, 190 | "mode": 0, 191 | "outputs": [ 192 | { 193 | "name": "string", 194 | "type": "STRING", 195 | "links": [ 196 | 70 197 | ], 198 | "slot_index": 0, 199 | "shape": 3 200 | } 201 | ], 202 | "properties": { 203 | "Node name for S&R": "LayerUtility: String" 204 | }, 205 | "widgets_values": [ 206 | "C:\\Users\\chenxinghua\\Desktop\\新建文件夹 (3)\\test" 207 | ] 208 | }, 209 | { 210 | "id": 61, 211 | "type": "LayerUtility: TextJoin", 212 | "pos": [ 213 | 1840, 214 | 490 215 | ], 216 | "size": { 217 | "0": 315, 218 | "1": 130 219 | }, 220 | "flags": {}, 221 | "order": 8, 222 | "mode": 0, 223 | "inputs": [ 224 | { 225 | "name": "text_1", 226 | "type": "STRING", 227 | "link": 75, 228 | "widget": { 229 | "name": "text_1" 230 | } 231 | }, 232 | { 233 | "name": "text_2", 234 | "type": "STRING", 235 | "link": 89, 236 | "widget": { 237 | "name": "text_2" 238 | } 239 | } 240 | ], 241 | "outputs": [ 242 | { 243 | "name": "text", 244 | "type": "STRING", 245 | "links": [ 246 | 77 247 | ], 248 | "slot_index": 0, 249 | "shape": 3 250 | } 251 | ], 252 | "properties": { 253 | "Node name for S&R": "LayerUtility: TextJoin" 254 
| }, 255 | "widgets_values": [ 256 | "", 257 | "", 258 | "", 259 | "" 260 | ] 261 | }, 262 | { 263 | "id": 31, 264 | "type": "LayerUtility: ImageTaggerSave", 265 | "pos": [ 266 | 2180, 267 | 475 268 | ], 269 | "size": { 270 | "0": 397.0539245605469, 271 | "1": 422.8654479980469 272 | }, 273 | "flags": {}, 274 | "order": 9, 275 | "mode": 0, 276 | "inputs": [ 277 | { 278 | "name": "image", 279 | "type": "IMAGE", 280 | "link": 88 281 | }, 282 | { 283 | "name": "tag_text", 284 | "type": "STRING", 285 | "link": 77, 286 | "widget": { 287 | "name": "tag_text" 288 | } 289 | } 290 | ], 291 | "properties": { 292 | "Node name for S&R": "LayerUtility: ImageTaggerSave" 293 | }, 294 | "widgets_values": [ 295 | "", 296 | "C:\\Users\\chenxinghua\\Desktop\\新建文件夹 (3)\\test2", 297 | "my_training_set", 298 | "None", 299 | "png", 300 | 80, 301 | true 302 | ] 303 | }, 304 | { 305 | "id": 65, 306 | "type": "LayerUtility: ImageRemoveAlpha", 307 | "pos": [ 308 | 964, 309 | 400 310 | ], 311 | "size": { 312 | "0": 315, 313 | "1": 102 314 | }, 315 | "flags": {}, 316 | "order": 6, 317 | "mode": 4, 318 | "inputs": [ 319 | { 320 | "name": "RGBA_image", 321 | "type": "IMAGE", 322 | "link": 81 323 | }, 324 | { 325 | "name": "mask", 326 | "type": "MASK", 327 | "link": 82 328 | } 329 | ], 330 | "outputs": [ 331 | { 332 | "name": "RGB_image", 333 | "type": "IMAGE", 334 | "links": [ 335 | 85, 336 | 88 337 | ], 338 | "slot_index": 0, 339 | "shape": 3 340 | } 341 | ], 342 | "properties": { 343 | "Node name for S&R": "LayerUtility: ImageRemoveAlpha" 344 | }, 345 | "widgets_values": [ 346 | true, 347 | "#FFFFFF" 348 | ] 349 | }, 350 | { 351 | "id": 63, 352 | "type": "LayerMask: TransparentBackgroundUltra", 353 | "pos": [ 354 | 959, 355 | 549 356 | ], 357 | "size": { 358 | "0": 327.6000061035156, 359 | "1": 270 360 | }, 361 | "flags": {}, 362 | "order": 4, 363 | "mode": 4, 364 | "inputs": [ 365 | { 366 | "name": "image", 367 | "type": "IMAGE", 368 | "link": 90 369 | } 370 | ], 371 | "outputs": [ 372 | { 373 | "name": "image", 374 | "type": "IMAGE", 375 | "links": [ 376 | 81, 377 | 91 378 | ], 379 | "slot_index": 0, 380 | "shape": 3 381 | }, 382 | { 383 | "name": "mask", 384 | "type": "MASK", 385 | "links": [ 386 | 82 387 | ], 388 | "slot_index": 1, 389 | "shape": 3 390 | } 391 | ], 392 | "properties": { 393 | "Node name for S&R": "LayerMask: TransparentBackgroundUltra" 394 | }, 395 | "widgets_values": [ 396 | "ckpt_base.pth", 397 | "VITMatte", 398 | 6, 399 | 6, 400 | 0.01, 401 | 0.99, 402 | true, 403 | "cuda", 404 | 2 405 | ] 406 | }, 407 | { 408 | "id": 72, 409 | "type": "PreviewImage", 410 | "pos": [ 411 | 964, 412 | 865 413 | ], 414 | "size": [ 415 | 329.67821458121784, 416 | 156.48269739093905 417 | ], 418 | "flags": {}, 419 | "order": 5, 420 | "mode": 0, 421 | "inputs": [ 422 | { 423 | "name": "images", 424 | "type": "IMAGE", 425 | "link": 91 426 | } 427 | ], 428 | "properties": { 429 | "Node name for S&R": "PreviewImage" 430 | } 431 | } 432 | ], 433 | "links": [ 434 | [ 435 | 70, 436 | 57, 437 | 0, 438 | 48, 439 | 0, 440 | "STRING" 441 | ], 442 | [ 443 | 75, 444 | 60, 445 | 0, 446 | 61, 447 | 0, 448 | "STRING" 449 | ], 450 | [ 451 | 77, 452 | 61, 453 | 0, 454 | 31, 455 | 1, 456 | "STRING" 457 | ], 458 | [ 459 | 81, 460 | 63, 461 | 0, 462 | 65, 463 | 0, 464 | "IMAGE" 465 | ], 466 | [ 467 | 82, 468 | 63, 469 | 1, 470 | 65, 471 | 1, 472 | "MASK" 473 | ], 474 | [ 475 | 84, 476 | 68, 477 | 0, 478 | 67, 479 | 0, 480 | "JoyPipeline" 481 | ], 482 | [ 483 | 85, 484 | 65, 485 | 0, 486 | 67, 487 | 1, 488 | "IMAGE" 489 | ], 490 | [ 491 
| 88, 492 | 65, 493 | 0, 494 | 31, 495 | 0, 496 | "IMAGE" 497 | ], 498 | [ 499 | 89, 500 | 67, 501 | 0, 502 | 61, 503 | 1, 504 | "STRING" 505 | ], 506 | [ 507 | 90, 508 | 48, 509 | 0, 510 | 63, 511 | 0, 512 | "IMAGE" 513 | ], 514 | [ 515 | 91, 516 | 63, 517 | 0, 518 | 72, 519 | 0, 520 | "IMAGE" 521 | ] 522 | ], 523 | "groups": [], 524 | "config": {}, 525 | "extra": { 526 | "ds": { 527 | "scale": 0.6115909044841474, 528 | "offset": [ 529 | -315.6032329072262, 530 | -178.67335371150824 531 | ] 532 | } 533 | }, 534 | "version": 0.4 535 | } -------------------------------------------------------------------------------- /worflow/批量打标(Batch marking).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StartHua/Comfyui_CXH_joy_caption/a6a6c910443f1a2d004f331d1e6e2538679c24ff/worflow/批量打标(Batch marking).png --------------------------------------------------------------------------------