├── .gitignore
├── Florence.py
├── LICENSE
├── README.md
├── __init__.py
├── workflow.png
├── workflow_bbox.png
└── workflow_seg_crop.png


/.gitignore:
--------------------------------------------------------------------------------
  1 | models/
  2 | 
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | share/python-wheels/
 26 | *.egg-info/
 27 | .installed.cfg
 28 | *.egg
 29 | MANIFEST
 30 | 
 31 | # PyInstaller
 32 | #  Usually these files are written by a python script from a template
 33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 34 | *.manifest
 35 | *.spec
 36 | 
 37 | # Installer logs
 38 | pip-log.txt
 39 | pip-delete-this-directory.txt
 40 | 
 41 | # Unit test / coverage reports
 42 | htmlcov/
 43 | .tox/
 44 | .nox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *.cover
 51 | *.py,cover
 52 | .hypothesis/
 53 | .pytest_cache/
 54 | cover/
 55 | 
 56 | # Translations
 57 | *.mo
 58 | *.pot
 59 | 
 60 | # Django stuff:
 61 | *.log
 62 | local_settings.py
 63 | db.sqlite3
 64 | db.sqlite3-journal
 65 | 
 66 | # Flask stuff:
 67 | instance/
 68 | .webassets-cache
 69 | 
 70 | # Scrapy stuff:
 71 | .scrapy
 72 | 
 73 | # Sphinx documentation
 74 | docs/_build/
 75 | 
 76 | # PyBuilder
 77 | .pybuilder/
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # IPython
 84 | profile_default/
 85 | ipython_config.py
 86 | 
 87 | # pyenv
 88 | #   For a library or package, you might want to ignore these files since the code is
 89 | #   intended to run in multiple environments; otherwise, check them in:
 90 | # .python-version
 91 | 
 92 | # pipenv
 93 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 94 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 95 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 96 | #   install all needed dependencies.
 97 | #Pipfile.lock
 98 | 
 99 | # poetry
100 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
102 | #   commonly ignored for libraries.
103 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | #poetry.lock
105 | 
106 | # pdm
107 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108 | #pdm.lock
109 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110 | #   in version control.
111 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
112 | .pdm.toml
113 | .pdm-python
114 | .pdm-build/
115 | 
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 | 
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 | 
123 | # SageMath parsed files
124 | *.sage.py
125 | 
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 | 
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 | 
139 | # Rope project settings
140 | .ropeproject
141 | 
142 | # mkdocs documentation
143 | /site
144 | 
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 | 
150 | # Pyre type checker
151 | .pyre/
152 | 
153 | # pytype static type analyzer
154 | .pytype/
155 | 
156 | # Cython debug symbols
157 | cython_debug/
158 | 
159 | # PyCharm
160 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
163 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 | 


--------------------------------------------------------------------------------
/Florence.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import io
  3 | import copy
  4 | import gc
  5 | from unittest.mock import patch
  6 | import random
  7 | 
  8 | from PIL import Image, ImageDraw, ImageFont 
  9 | import matplotlib.pyplot as plt
 10 | import matplotlib.patches as patches
 11 | 
 12 | import numpy as np
 13 | import torch
 14 | from transformers import AutoProcessor, AutoModelForCausalLM
 15 | from transformers.dynamic_module_utils import get_imports
 16 | 
 17 | # Comfy Utils
 18 | import folder_paths
 19 | import comfy.model_management
 20 | 
# Named colors that draw_polygons and draw_ocr_bboxes choose from at random
# when outlining and labelling each region.
colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
            'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']
 23 | 
 24 | def fixed_get_imports(filename: str | os.PathLike) -> list[str]:
 25 |     """Workaround for FlashAttention"""
 26 |     if os.path.basename(filename) != "modeling_florence2.py":
 27 |         return get_imports(filename)
 28 |     imports = get_imports(filename)
 29 |     imports.remove("flash_attn")
 30 |     return imports
 31 | 
 32 | def load_model(version, device):
 33 |     comfy_model_dir = os.path.join(folder_paths.models_dir, "LLM")
 34 |     if not os.path.exists(comfy_model_dir):
 35 |         os.mkdir(comfy_model_dir)
 36 |     
 37 |     identifier = "microsoft/Florence-2-" + version
 38 |     
 39 |     with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
 40 |         model = AutoModelForCausalLM.from_pretrained(identifier, cache_dir=comfy_model_dir, trust_remote_code=True)
 41 |         processor = AutoProcessor.from_pretrained(identifier, cache_dir=comfy_model_dir, trust_remote_code=True)
 42 |     
 43 |     model = model.to(device)
 44 |     return (model, processor)
 45 | 
 46 | def fig_to_pil(fig):
 47 |     buf = io.BytesIO()
 48 |     fig.savefig(buf, format='png', dpi=100, bbox_inches='tight', pad_inches=0)
 49 |     buf.seek(0)
 50 |     pil = Image.open(buf)
 51 |     plt.close()
 52 |     return pil
 53 | 
 54 | def plot_bbox(image, data):
 55 |     fig, ax = plt.subplots()
 56 |     fig.set_size_inches(image.width / 100, image.height / 100)
 57 |     ax.imshow(image)
 58 |     for i, (bbox, label) in enumerate(zip(data['bboxes'], data['labels'])):
 59 |         x1, y1, x2, y2 = bbox
 60 |         rect = patches.Rectangle((x1, y1), x2-x1, y2-y1, linewidth=1, edgecolor='r', facecolor='none')
 61 |         ax.add_patch(rect)
 62 |         enum_label = f"{i}: {label}"
 63 |         plt.text(x1 + 7, y1 + 17, enum_label, color='white', fontsize=8, bbox=dict(facecolor='red', alpha=0.5))
 64 |     ax.axis('off')
 65 |     return fig
 66 | 
 67 | def draw_polygons(image, prediction, fill_mask=False):
 68 |     output_image = copy.deepcopy(image)
 69 |     draw = ImageDraw.Draw(output_image)
 70 |     scale = 1
 71 |     for polygons, label in zip(prediction['polygons'], prediction['labels']):
 72 |         color = random.choice(colormap)
 73 |         fill_color = color if fill_mask else None
 74 |         for _polygon in polygons:
 75 |             _polygon = np.array(_polygon).reshape(-1, 2)
 76 |             if len(_polygon) < 3:
 77 |                 print('Invalid polygon:', _polygon)
 78 |                 continue
 79 |             _polygon = (_polygon * scale).reshape(-1).tolist()
 80 |             if fill_mask:
 81 |                 draw.polygon(_polygon, outline=color, fill=fill_color)
 82 |             else:
 83 |                 draw.polygon(_polygon, outline=color)
 84 |             draw.text((_polygon[0] + 8, _polygon[1] + 2), label, fill=color)
 85 |     return output_image
 86 | 
 87 | def convert_to_od_format(data):
 88 |     od_results = {
 89 |         'bboxes': data.get('bboxes', []),
 90 |         'labels': data.get('bboxes_labels', [])
 91 |     }
 92 |     return od_results
 93 | 
 94 | def draw_ocr_bboxes(image, prediction):
 95 |     scale = 1
 96 |     output_image = copy.deepcopy(image)
 97 |     draw = ImageDraw.Draw(output_image)
 98 |     bboxes, labels = prediction['quad_boxes'], prediction['labels']
 99 |     for box, label in zip(bboxes, labels):
100 |         color = random.choice(colormap)
101 |         new_box = (np.array(box) * scale).tolist()
102 |         draw.polygon(new_box, width=3, outline=color)
103 |         draw.text((new_box[0]+8, new_box[1]+2),
104 |                   "{}".format(label),
105 |                   align="right",
106 |                   fill=color)
107 |     return output_image
108 | 
# Task names offered by the Florence2 node's "task" input; the first entry is
# the default. Florence2.process_image matches on these exact strings.
TASK_OPTIONS = [
    "caption",
    "detailed caption",
    "more detailed caption",
    "object detection",
    "dense region caption",
    "region proposal",
    "caption to phrase grounding",
    "referring expression segmentation",
    "region to segmentation",
    "open vocabulary detection",
    "region to category",
    "region to description",
    "OCR",
    "OCR with region"
    ]
125 | 
class LoadFlorence2Model:
    """Node that loads a Florence-2 model and reuses it while the version is unchanged."""

    RETURN_TYPES = ("FLORENCE2", )
    FUNCTION = "load"
    CATEGORY = "Florence2"

    def __init__(self):
        # Nothing is loaded until load() is first called
        self.model = self.processor = self.version = None
        self.device = comfy.model_management.get_torch_device()

    @classmethod
    def INPUT_TYPES(s):
        """Declare the single required input: which Florence-2 variant to load."""
        versions = ["base", "base-ft", "large", "large-ft"]
        return {"required": {"version": (versions,)}}

    def load(self, version):
        """Return a one-tuple holding the model bundle for ``version``.

        The model and processor are (re)loaded only when ``version`` differs
        from the one currently held by this node instance.
        """
        already_loaded = self.version == version
        if not already_loaded:
            self.model, self.processor = load_model(version, self.device)
            self.version = version

        bundle = {
            'model': self.model,
            'processor': self.processor,
            'version': self.version,
            'device': self.device,
        }
        return (bundle, )
152 | 
class Florence2:
    """Node that runs one Florence-2 task on the first image of the input batch.

    Outputs a preview image (annotated when the task yields regions, otherwise
    a copy of the input image), the stringified result, and the raw result.
    """

    # UI task name -> (Florence-2 task token, whether text_input is appended to
    # the prompt, how the parsed result is rendered into a preview).
    _TASKS = {
        "caption": ("<CAPTION>", False, "text"),
        "detailed caption": ("<DETAILED_CAPTION>", False, "text"),
        "more detailed caption": ("<MORE_DETAILED_CAPTION>", False, "text"),
        "object detection": ("<OD>", False, "bbox"),
        "dense region caption": ("<DENSE_REGION_CAPTION>", False, "bbox"),
        "region proposal": ("<REGION_PROPOSAL>", False, "bbox"),
        "caption to phrase grounding": ("<CAPTION_TO_PHRASE_GROUNDING>", True, "bbox"),
        "referring expression segmentation": ("<REFERRING_EXPRESSION_SEGMENTATION>", True, "polygon"),
        "region to segmentation": ("<REGION_TO_SEGMENTATION>", True, "polygon"),
        "open vocabulary detection": ("<OPEN_VOCABULARY_DETECTION>", True, "ovd"),
        "region to category": ("<REGION_TO_CATEGORY>", True, "text"),
        "region to description": ("<REGION_TO_DESCRIPTION>", True, "text"),
        "OCR": ("<OCR>", False, "text"),
        "OCR with region": ("<OCR_WITH_REGION>", False, "ocr"),
    }

    def __init__(self):
        self.model = None
        self.processor = None
        self.version = None
        self.device = None

    @classmethod
    def INPUT_TYPES(s):
        """Declare the node's required inputs and their defaults."""
        return {
            "required": {
                "FLORENCE2": ("FLORENCE2",),
                "image": ("IMAGE",),
                "task": (TASK_OPTIONS, {"default": TASK_OPTIONS[0]}),
                "text_input": ("STRING", {}),
                "max_new_tokens": ("INT", {"default": 1024, "step": 1 }),
                "num_beams": ("INT", {"default": 3,  "min": 1, "step": 1 }),
                "do_sample": ('BOOLEAN', {"default":False}),
                "fill_mask": ('BOOLEAN', {"default":False}),
            },
        }

    RETURN_TYPES = ("IMAGE", "STRING", "F_BBOXES",)
    RETURN_NAMES = ("preview", "string", "F_BBOXES",)
    FUNCTION = "apply"
    CATEGORY = "Florence2"

    def apply(self, FLORENCE2, image, task, text_input, max_new_tokens, num_beams, do_sample, fill_mask):
        """Run ``task`` on ``image[0]`` and return ``(preview, str(results), results)``.

        Args:
            FLORENCE2: Mapping with ``'model'``, ``'processor'``, ``'version'``
                and ``'device'`` entries.
            image: Image batch tensor; only the first element is processed.
            task: One of the task names handled by ``process_image``.
            text_input: Extra prompt text, used only by tasks that take it.
            max_new_tokens, num_beams, do_sample: Forwarded to generation.
            fill_mask: Whether segmentation polygons are filled in the preview.
        """
        # Scale to 0-255 and clamp before converting to an 8-bit PIL image
        # (assumes the tensor holds floats in [0, 1] - TODO confirm).
        img = 255. * image[0].cpu().numpy()
        img = Image.fromarray(np.clip(img, 0, 255).astype(np.uint8))

        self.model = FLORENCE2['model']
        self.processor = FLORENCE2['processor']
        self.version = FLORENCE2['version']
        self.device = FLORENCE2['device']

        results, output_image = self.process_image(img, task, max_new_tokens, num_beams, do_sample, fill_mask, text_input)
        # Region results (bboxes/labels or polygons/labels) also carry the
        # image size so downstream nodes can build masks from them.
        if isinstance(results, dict):
            results["width"] = img.width
            results["height"] = img.height

        # Identity check: output_image is either None or a PIL image.
        if output_image is None:
            # Text-only task: pass the input image through as the preview
            output_image = image[0].detach().clone().unsqueeze(0)
        else:
            output_image = np.asarray(output_image).astype(np.float32) / 255
            output_image = torch.from_numpy(output_image).unsqueeze(0)

        return (output_image, str(results), results)

    def run_example(self, task_prompt, image, max_new_tokens, num_beams, do_sample, text_input=None):
        """Generate for one prompt and return the processor's parsed answer.

        The prompt is ``task_prompt`` followed by ``text_input`` when given.
        The generated text is decoded with special tokens kept and handed to
        ``post_process_generation`` together with the image size.
        """
        if text_input is None:
            prompt = task_prompt
        else:
            prompt = task_prompt + text_input
        inputs = self.processor(text=prompt, images=image, return_tensors="pt").to(self.device)
        generated_ids = self.model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=max_new_tokens,
            early_stopping=False,
            do_sample=do_sample,
            num_beams=num_beams,
        )
        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
        parsed_answer = self.processor.post_process_generation(
            generated_text,
            task=task_prompt,
            image_size=(image.width, image.height)
        )
        return parsed_answer

    def process_image(self, image, task_prompt, max_new_tokens, num_beams, do_sample, fill_mask, text_input=None):
        """Run the named task and return ``(result, preview_image_or_None)``.

        Unknown task names yield ``("", None)``. ``text_input`` is only
        forwarded for tasks whose table entry asks for it.
        """
        spec = self._TASKS.get(task_prompt)
        if spec is None:
            return "", None  # Return empty string and None for unknown task prompts

        token, uses_text, render = spec
        extra = (text_input,) if uses_text else ()
        parsed = self.run_example(token, image, max_new_tokens, num_beams, do_sample, *extra)[token]

        if render == "bbox":
            return parsed, fig_to_pil(plot_bbox(image, parsed))
        if render == "polygon":
            return parsed, draw_polygons(image, parsed, fill_mask)
        if render == "ovd":
            # Open-vocabulary results use 'bboxes_labels'; re-key before plotting
            bbox_results = convert_to_od_format(parsed)
            return bbox_results, fig_to_pil(plot_bbox(image, bbox_results))
        if render == "ocr":
            output_image = draw_ocr_bboxes(image, parsed)
            # Expose the quads under 'bboxes' so the result matches the other region tasks
            output_results = {'bboxes': parsed.get('quad_boxes', []),
                              'labels': parsed.get('labels', [])}
            return output_results, output_image
        return parsed, None
295 | 
class Florence2Postprocess:
    """Node that turns one region of a Florence-2 result into a mask and box info."""

    RETURN_TYPES = ("MASK", "STRING", "STRING", "INT", "INT", "INT", "INT")
    RETURN_NAMES = ("mask", "label", "loc_string", "width", "height", "x", "y")
    FUNCTION = "apply"
    CATEGORY = "Florence2"

    @classmethod
    def INPUT_TYPES(s):
        """Declare the required inputs: the region result and the region index."""
        required = {
            "F_BBOXES": ("F_BBOXES",),
            "index": ("INT", {"default": 0, "min": 0}),
        }
        return {"required": required}

    def apply(self, F_BBOXES, index):
        """Return ``(mask, label, loc_string, width, height, x, y)`` for region ``index``.

        A string input is passed through as the label alongside a blank
        512x512 mask. Otherwise the region comes from ``"bboxes"`` when that
        key is present, else from the first polygon group. An out-of-range
        index leaves the mask empty and the box at the origin.
        """
        if isinstance(F_BBOXES, str):
            blank = torch.zeros(1, 512, 512, dtype=torch.float32)
            return (blank, F_BBOXES, "", 0, 0, 0, 0)

        width, height = F_BBOXES["width"], F_BBOXES["height"]
        mask = np.zeros((height, width), dtype=np.uint8)

        left = top = right = bottom = 0
        label = ""

        if "bboxes" not in F_BBOXES:
            candidates = F_BBOXES["polygons"][0]
            if index < len(candidates):
                polygon = candidates[index]
                label = F_BBOXES["labels"][0]

                canvas = Image.new('RGB', (width, height), color='black')
                pen = ImageDraw.Draw(canvas)
                points = np.array(polygon).reshape(-1, 2)
                if len(points) >= 3:
                    pen.polygon(points.reshape(-1).tolist(), outline='white', fill='white')
                else:
                    print('Invalid polygon:', points)

                # Bounding box of the flat x0, y0, x1, y1, ... coordinate list
                xs, ys = polygon[0::2], polygon[1::2]
                left, right = int(min(xs)), int(max(xs))
                top, bottom = int(min(ys)), int(max(ys))

                mask = np.asarray(canvas)[..., 0].astype(np.float32) / 255
        elif index < len(F_BBOXES["labels"]):
            bbox = F_BBOXES["bboxes"][index]
            label = F_BBOXES["labels"][index].removeprefix("</s>")

            corner_values = len(bbox)
            if corner_values == 4:
                left, top, right, bottom = (int(v) for v in bbox)
            elif corner_values == 8:
                # Quad given as four (x, y) corners: reduce to its axis-aligned extent
                xs, ys = bbox[0::2], bbox[1::2]
                left, right = int(min(xs)), int(max(xs))
                top, bottom = int(min(ys)), int(max(ys))

            mask[top:bottom, left:right] = 1

        mask = torch.from_numpy(mask.astype(np.float32)).unsqueeze(0)
        loc_string = f"<loc_{left * 999 // width}><loc_{top * 999 // height}><loc_{right * 999 // width}><loc_{bottom * 999 // height}>"
        return (mask, label, loc_string, right - left + 1, bottom - top + 1, left, top)
360 | 
class Florence2PostprocessAll:
    """Node that merges every region of a Florence-2 result into one mask and union box."""

    @classmethod
    def INPUT_TYPES(s):
        """Declare the single required input: the region result."""
        return {
            "required": {
                "F_BBOXES": ("F_BBOXES",),
            },
        }

    RETURN_TYPES = ("MASK", "STRING", "STRING", "INT", "INT", "INT", "INT")
    RETURN_NAMES = ("mask", "label", "loc_string", "width", "height", "x", "y")
    FUNCTION = "apply"
    CATEGORY = "Florence2"

    def apply(self, F_BBOXES):
        """Return ``(mask, label, loc_string, width, height, x, y)`` covering all regions.

        A string input is passed through as the label alongside a blank
        512x512 mask. With a ``"bboxes"`` key, every 4-value box or 8-value
        quad is added to the mask and the distinct non-empty labels are joined
        with ``", "``. Otherwise every valid polygon of the first polygon group
        is filled into the mask and the label stays empty. When no valid
        region exists, the mask is empty and the box sits at the origin.
        """
        if isinstance(F_BBOXES, str):
            return (torch.zeros(1, 512, 512, dtype=torch.float32), F_BBOXES, "", 0, 0, 0, 0)

        width = F_BBOXES["width"]
        height = F_BBOXES["height"]
        mask = np.zeros((height, width), dtype=np.uint8)

        # Union extents start inverted so the first valid region sets them.
        x1_c, y1_c = width, height
        x2_c = y2_c = 0
        found_region = False
        unique_labels = []

        if "bboxes" in F_BBOXES:
            all_labels = F_BBOXES["labels"]
            for idx, bbox in enumerate(F_BBOXES["bboxes"]):
                new_label = all_labels[idx].removeprefix("</s>")
                # Compare whole labels: a substring test on the joined string
                # would drop e.g. "car" once "carpet" had been added.
                if new_label and new_label not in unique_labels:
                    unique_labels.append(new_label)

                if len(bbox) == 4:
                    x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
                elif len(bbox) == 8:
                    # Quad given as four (x, y) corners: reduce to its axis-aligned extent
                    xs, ys = bbox[0::2], bbox[1::2]
                    x1, x2 = int(min(xs)), int(max(xs))
                    y1, y2 = int(min(ys)), int(max(ys))
                else:
                    continue

                x1_c = min(x1_c, x1)
                y1_c = min(y1_c, y1)
                x2_c = max(x2_c, x2)
                y2_c = max(y2_c, y2)
                found_region = True

                mask[y1:y2, x1:x2] = 1

        else:
            image = Image.new('RGB', (width, height), color='black')
            draw = ImageDraw.Draw(image)

            for polygon in F_BBOXES["polygons"][0]:
                _polygon = np.array(polygon).reshape(-1, 2)
                if len(_polygon) < 3:
                    print('Invalid polygon:', _polygon)
                    continue

                draw.polygon(_polygon.flatten().tolist(), outline='white', fill='white')

                xs, ys = polygon[0::2], polygon[1::2]
                x1_c = min(x1_c, int(min(xs)))
                x2_c = max(x2_c, int(max(xs)))
                y1_c = min(y1_c, int(min(ys)))
                y2_c = max(y2_c, int(max(ys)))
                found_region = True

            mask = np.asarray(image)[..., 0].astype(np.float32) / 255

        if not found_region:
            # Nothing contributed to the union: report an empty box at the
            # origin (as Florence2Postprocess does for an out-of-range index)
            # rather than leaking the inverted initial extents as negative sizes.
            x1_c = y1_c = x2_c = y2_c = 0

        label = ", ".join(unique_labels)
        mask = torch.from_numpy(mask.astype(np.float32)).unsqueeze(0)
        loc_string = f"<loc_{x1_c * 999 // width}><loc_{y1_c * 999 // height}><loc_{x2_c * 999 // width}><loc_{y2_c * 999 // height}>"
        return (mask, label, loc_string, x2_c - x1_c + 1, y2_c - y1_c + 1, x1_c, y1_c)
439 | 
# Node identifier -> implementing class; re-exported by the package's __init__.py.
NODE_CLASS_MAPPINGS = {
    "LoadFlorence2Model": LoadFlorence2Model,
    "Florence2": Florence2,
    "Florence2Postprocess": Florence2Postprocess,
    "Florence2PostprocessAll": Florence2PostprocessAll,
    }

# Node identifier -> display name; keys match NODE_CLASS_MAPPINGS.
NODE_DISPLAY_NAME_MAPPINGS = {
    "LoadFlorence2Model": "Load Florence2 Model",
    "Florence2": "Florence2",
    "Florence2Postprocess": "Florence2 Postprocess Single",
    "Florence2PostprocessAll": "Florence2 Postprocess All",
    }


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 spacepxl
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ComfyUI-Florence-2
 2 | 
 3 | https://huggingface.co/microsoft/Florence-2-large
 4 | 
 5 | All four models, initial support for all output types
 6 | 
 7 | <details>
 8 | 
 9 | <summary>Examples</summary>
10 | 
11 | ![](workflow.png)
12 | 
13 | ![](workflow_bbox.png)
14 | 
15 | ![](workflow_seg_crop.png)
16 | 
17 | </details>
18 | 
19 | TODO:
20 |  - all-in-one CLIP masked conditioning node
21 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | from .Florence import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
2 | 
3 | __all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']


--------------------------------------------------------------------------------
/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spacepxl/ComfyUI-Florence-2/d52e1fe16343cad9e2b0770d42362d646fc1585f/workflow.png


--------------------------------------------------------------------------------
/workflow_bbox.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spacepxl/ComfyUI-Florence-2/d52e1fe16343cad9e2b0770d42362d646fc1585f/workflow_bbox.png


--------------------------------------------------------------------------------
/workflow_seg_crop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spacepxl/ComfyUI-Florence-2/d52e1fe16343cad9e2b0770d42362d646fc1585f/workflow_seg_crop.png


--------------------------------------------------------------------------------