├── example.png
├── __init__.py
├── pyproject.toml
├── LICENSE
├── README.md
└── Pic2Story_Node.py

/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_Pic2Story/HEAD/example.png
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
import sys

python = sys.executable

from .Pic2Story_Node import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS


__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[project]
name = "comfyui_pic2story"
description = "Use Pic2Story (BLIP-based image captioning) in ComfyUI."
version = "1.1.0"
license = { file = "LICENSE" }

[project.urls]
Repository = "https://github.com/smthemex/ComfyUI_Pic2Story"
# Used by Comfy Registry https://comfyregistry.org

[tool.comfy]
PublisherId = "smthemex"
DisplayName = "ComfyUI_Pic2Story"
Icon = ""
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2024, smthemex

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ComfyUI_Pic2Story

A simple ComfyUI node based on the BLIP method that turns an image into a text caption ("image to text").

Original model: [link](https://huggingface.co/Salesforce/blip-image-captioning-large)
Using model: [link](https://huggingface.co/abhijit2111/Pic2Story)

1.Installation
-----
1.1 In the `.\ComfyUI\custom_nodes` directory, run the following:

```
git clone https://github.com/smthemex/ComfyUI_Pic2Story.git
```

1.2 Load a model by `repo_id`, or download it for offline use (see the sketch below):

repo_id: abhijit2111/Pic2Story [link](https://huggingface.co/abhijit2111/Pic2Story/tree/main)
repo_id: google/paligemma2-3b-pt-896 [link](https://huggingface.co/google/paligemma2-3b-pt-896/tree/main)
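
For offline use, the checkpoint can be fetched once with `huggingface_hub` (a minimal sketch; the target directory is an arbitrary choice, and you may need `pip install huggingface_hub` first). The resulting local path can then be passed to the loader node as `repo_id`; note that the PaliGemma branch loads with `local_files_only=True`, so its files must already be on disk:

```python
from huggingface_hub import snapshot_download

# Download all files of the repo to a local folder (the path is a free choice).
local_dir = snapshot_download(
    repo_id="abhijit2111/Pic2Story",
    local_dir="models/Pic2Story",  # hypothetical target directory
)
print(local_dir)  # pass this path to the Pic2Story_Loader node as repo_id
```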

2.Example
---
A prompt is not required; it can be left empty.
![](https://github.com/smthemex/ComfyUI_Pic2Story/blob/main/example.png)


3.Citation
------

```
@misc{https://doi.org/10.48550/arxiv.2201.12086,
  doi = {10.48550/ARXIV.2201.12086},
  url = {https://arxiv.org/abs/2201.12086},
  author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
  publisher = {arXiv},
  year = {2022},
  copyright = {Creative Commons Attribution 4.0 International}
}
```
--------------------------------------------------------------------------------
/Pic2Story_Node.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration


def tensor_to_image(tensor):
    # ComfyUI images are float tensors in [0, 1] with shape [batch, H, W, C];
    # convert the (single) image to an 8-bit PIL RGB image.
    image_np = tensor.squeeze().mul(255).clamp(0, 255).byte().cpu().numpy()
    image = Image.fromarray(image_np, mode='RGB')
    return image


class Pic2Story_Loader:
    def __init__(self):
        pass

    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "repo_id": ("STRING", {"default": "abhijit2111/Pic2Story"}),
                "inference_mode": (["gpu_float16", "gpu", "cpu"],),
            }
        }

    RETURN_TYPES = ("PICMODEL",)
    RETURN_NAMES = ("model",)
    FUNCTION = "load_main"
    CATEGORY = "Pic2Story"

    def load_main(self, repo_id, inference_mode):
        if not repo_id:
            raise ValueError("need a repo_id or a local model path")
        # Repos whose name contains "Pic2Story" are treated as BLIP captioning
        # models; anything else is loaded as PaliGemma.
        if "Pic2Story" in repo_id:
            mode = "story"
        else:
            mode = "paligemma"
        if mode == "story":
            if inference_mode == "gpu_float16":
                model = BlipForConditionalGeneration.from_pretrained(
                    repo_id, torch_dtype=torch.float16).to("cuda")
            elif inference_mode == "gpu":
                model = BlipForConditionalGeneration.from_pretrained(repo_id).to("cuda")
            else:
                model = BlipForConditionalGeneration.from_pretrained(repo_id)
            processor = BlipProcessor.from_pretrained(repo_id)
        else:
            from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor

            device = "cpu" if "cpu" in inference_mode else "cuda"
            # local_files_only=True: the PaliGemma weights must already be on disk.
            model = PaliGemmaForConditionalGeneration.from_pretrained(
                repo_id,
                torch_dtype=torch.bfloat16,
                local_files_only=True
            ).to(device)
            processor = PaliGemmaProcessor.from_pretrained(repo_id, local_files_only=True)

        model = {"model": model, "processor": processor,
                 "inference_mode": inference_mode, "mode": mode}
        return (model,)
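

# The loader hands downstream nodes a plain dict instead of a bare model, so
# the sampler can recover the matching processor and the chosen mode without
# reloading anything. Sketch of the contract (keys as produced by load_main):
#
#   pic_model = {
#       "model": ...,            # BLIP or PaliGemma model instance
#       "processor": ...,        # matching processor
#       "inference_mode": ...,   # "gpu_float16" | "gpu" | "cpu"
#       "mode": ...,             # "story" (BLIP) or "paligemma"
#   }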


class Pic2Story_Sampler:
    def __init__(self):
        pass

    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "image": ("IMAGE",),
                "model": ("PICMODEL",),
                "prompt": ("STRING", {"default": "a photography of"}),
            }
        }

    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("prompt",)
    FUNCTION = "pic_to_story"
    CATEGORY = "Pic2Story"

    def pic_to_story(self, image, model, prompt):
        processor = model.get("processor")
        mode = model.get("mode")
        inference_mode = model.get("inference_mode")
        model = model.get("model")
        pil_image = tensor_to_image(image)
        if mode == "story":
            # BLIP: with a prompt the caption is conditioned on it; without
            # one, the image is captioned unconditionally.
            if not prompt:
                inputs = processor(pil_image, return_tensors="pt")
                print("processing image without prompt")
            else:
                inputs = processor(pil_image, prompt, return_tensors="pt")
            if inference_mode == "gpu_float16":
                inputs = inputs.to("cuda", torch.float16)
            elif inference_mode == "gpu":
                inputs = inputs.to("cuda")
            out = model.generate(**inputs)
            story_out = processor.decode(out[0], skip_special_tokens=True)
        else:
            device = "cpu" if "cpu" in inference_mode else "cuda"
            if not prompt:
                prompt = "describe en\n"
            inputs = processor(text=prompt, images=pil_image,
                               padding="longest", do_convert_rgb=True,
                               return_tensors="pt").to(device)
            inputs = inputs.to(dtype=model.dtype)

            with torch.no_grad():
                output = model.generate(**inputs, max_new_tokens=128)

            story_out = processor.decode(output[0], skip_special_tokens=True)
            # PaliGemma echoes the prompt; keep only the generated last line.
            story_out = story_out.splitlines()[-1]
        return (story_out,)


NODE_CLASS_MAPPINGS = {
    "Pic2Story_Loader": Pic2Story_Loader,
    "Pic2Story_Sampler": Pic2Story_Sampler
}

NODE_DISPLAY_NAME_MAPPINGS = {
    "Pic2Story_Loader": "Pic2Story_Loader",
    "Pic2Story_Sampler": "Pic2Story_Sampler"
}
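

# End-to-end sketch for a quick test outside ComfyUI (assumptions: a local
# image file "test.jpg" exists -- hypothetical -- and a ComfyUI-style IMAGE
# input is emulated as a [1, H, W, 3] float tensor in [0, 1]).
if __name__ == "__main__":
    import numpy as np

    pil = Image.open("test.jpg").convert("RGB")  # hypothetical test image
    arr = np.asarray(pil).astype("float32") / 255.0
    image_tensor = torch.from_numpy(arr).unsqueeze(0)  # [1, H, W, 3]

    loader = Pic2Story_Loader()
    (pic_model,) = loader.load_main("abhijit2111/Pic2Story", "cpu")

    sampler = Pic2Story_Sampler()
    (caption,) = sampler.pic_to_story(image_tensor, pic_model, "a photography of")
    print(caption)
--------------------------------------------------------------------------------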