├── example.png
├── __init__.py
├── pyproject.toml
├── LICENSE
├── README.md
└── Pic2Story_Node.py

/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_Pic2Story/HEAD/example.png
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
import sys

python = sys.executable

from .Pic2Story_Node import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS


__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[project]
name = "comfyui_pic2story"
description = "Use Pic2Story (BLIP-based image captioning) in ComfyUI."
version = "1.1.0"
license = { file = "LICENSE" }

[project.urls]
Repository = "https://github.com/smthemex/ComfyUI_Pic2Story"
# Used by Comfy Registry https://comfyregistry.org

[tool.comfy]
PublisherId = "smthemex"
DisplayName = "ComfyUI_Pic2Story"
Icon = ""
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2024, smthemex

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ComfyUI_Pic2Story

A simple ComfyUI node based on the BLIP method that turns an image into a text caption ("image to text").

Original model: [link](https://huggingface.co/Salesforce/blip-image-captioning-large)
Using model: [link](https://huggingface.co/abhijit2111/Pic2Story)

1.Installation
-----
1.1 In the `.\ComfyUI\custom_nodes` directory, run the following:

```
git clone https://github.com/smthemex/ComfyUI_Pic2Story.git
```

1.2 Load a model by `repo_id`, or download it for offline use (see the sketch below):

repo_id: abhijit2111/Pic2Story [link](https://huggingface.co/abhijit2111/Pic2Story/tree/main)
repo_id: google/paligemma2-3b-pt-896 [link](https://huggingface.co/google/paligemma2-3b-pt-896/tree/main)
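
For offline use, the checkpoint can be fetched once with `huggingface_hub` (a minimal sketch; the target directory is an arbitrary choice, and you may need `pip install huggingface_hub` first). The resulting local path can then be passed to the loader node as `repo_id`; note that the PaliGemma branch loads with `local_files_only=True`, so its files must already be on disk:

```python
from huggingface_hub import snapshot_download

# Download all files of the repo to a local folder (the path is a free choice).
local_dir = snapshot_download(
    repo_id="abhijit2111/Pic2Story",
    local_dir="models/Pic2Story",  # hypothetical target directory
)
print(local_dir)  # pass this path to the Pic2Story_Loader node as repo_id
```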

2.Example
---
A prompt is not required; it can be left empty.
![](https://github.com/smthemex/ComfyUI_Pic2Story/blob/main/example.png)


3.Citation
------

```
@misc{https://doi.org/10.48550/arxiv.2201.12086,
  doi = {10.48550/ARXIV.2201.12086},
  url = {https://arxiv.org/abs/2201.12086},
  author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
  publisher = {arXiv},
  year = {2022},
  copyright = {Creative Commons Attribution 4.0 International}
}
```
--------------------------------------------------------------------------------
/Pic2Story_Node.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration


def tensor_to_image(tensor):
    # ComfyUI images are float tensors in [0, 1] with shape [batch, H, W, C];
    # convert the (single) image to an 8-bit PIL RGB image.
    image_np = tensor.squeeze().mul(255).clamp(0, 255).byte().cpu().numpy()
    image = Image.fromarray(image_np, mode='RGB')
    return image


class Pic2Story_Loader:
    def __init__(self):
        pass

    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "repo_id": ("STRING", {"default": "abhijit2111/Pic2Story"}),
                "inference_mode": (["gpu_float16", "gpu", "cpu"],),
            }
        }

    RETURN_TYPES = ("PICMODEL",)
    RETURN_NAMES = ("model",)
    FUNCTION = "load_main"
    CATEGORY = "Pic2Story"

    def load_main(self, repo_id, inference_mode):
        if not repo_id:
            raise ValueError("need a repo_id or a local model path")
        # Repos whose name contains "Pic2Story" are treated as BLIP captioning
        # models; anything else is loaded as PaliGemma.
        if "Pic2Story" in repo_id:
            mode = "story"
        else:
            mode = "paligemma"
        if mode == "story":
            if inference_mode == "gpu_float16":
                model = BlipForConditionalGeneration.from_pretrained(
                    repo_id, torch_dtype=torch.float16).to("cuda")
            elif inference_mode == "gpu":
                model = BlipForConditionalGeneration.from_pretrained(repo_id).to("cuda")
            else:
                model = BlipForConditionalGeneration.from_pretrained(repo_id)
            processor = BlipProcessor.from_pretrained(repo_id)
        else:
            from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor

            device = "cpu" if "cpu" in inference_mode else "cuda"
            # local_files_only=True: the PaliGemma weights must already be on disk.
            model = PaliGemmaForConditionalGeneration.from_pretrained(
                repo_id,
                torch_dtype=torch.bfloat16,
                local_files_only=True
            ).to(device)
            processor = PaliGemmaProcessor.from_pretrained(repo_id, local_files_only=True)

        model = {"model": model, "processor": processor,
                 "inference_mode": inference_mode, "mode": mode}
        return (model,)
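

# The loader hands downstream nodes a plain dict instead of a bare model, so
# the sampler can recover the matching processor and the chosen mode without
# reloading anything. Sketch of the contract (keys as produced by load_main):
#
#   pic_model = {
#       "model": ...,            # BLIP or PaliGemma model instance
#       "processor": ...,        # matching processor
#       "inference_mode": ...,   # "gpu_float16" | "gpu" | "cpu"
#       "mode": ...,             # "story" (BLIP) or "paligemma"
#   }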


class Pic2Story_Sampler:
    def __init__(self):
        pass

    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "image": ("IMAGE",),
                "model": ("PICMODEL",),
                "prompt": ("STRING", {"default": "a photography of"}),
            }
        }

    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("prompt",)
    FUNCTION = "pic_to_story"
    CATEGORY = "Pic2Story"

    def pic_to_story(self, image, model, prompt):
        processor = model.get("processor")
        mode = model.get("mode")
        inference_mode = model.get("inference_mode")
        model = model.get("model")
        pil_image = tensor_to_image(image)
        if mode == "story":
            # BLIP: with a prompt the caption is conditioned on it; without
            # one, the image is captioned unconditionally.
            if not prompt:
                inputs = processor(pil_image, return_tensors="pt")
                print("processing image without prompt")
            else:
                inputs = processor(pil_image, prompt, return_tensors="pt")
            if inference_mode == "gpu_float16":
                inputs = inputs.to("cuda", torch.float16)
            elif inference_mode == "gpu":
                inputs = inputs.to("cuda")
            out = model.generate(**inputs)
            story_out = processor.decode(out[0], skip_special_tokens=True)
        else:
            device = "cpu" if "cpu" in inference_mode else "cuda"
            if not prompt:
                prompt = "describe en\n"
            inputs = processor(text=prompt, images=pil_image,
                               padding="longest", do_convert_rgb=True,
                               return_tensors="pt").to(device)
            inputs = inputs.to(dtype=model.dtype)

            with torch.no_grad():
                output = model.generate(**inputs, max_new_tokens=128)

            story_out = processor.decode(output[0], skip_special_tokens=True)
            # PaliGemma echoes the prompt; keep only the generated last line.
            story_out = story_out.splitlines()[-1]
        return (story_out,)


NODE_CLASS_MAPPINGS = {
    "Pic2Story_Loader": Pic2Story_Loader,
    "Pic2Story_Sampler": Pic2Story_Sampler
}

NODE_DISPLAY_NAME_MAPPINGS = {
    "Pic2Story_Loader": "Pic2Story_Loader",
    "Pic2Story_Sampler": "Pic2Story_Sampler"
}
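

# End-to-end sketch for a quick test outside ComfyUI (assumptions: a local
# image file "test.jpg" exists -- hypothetical -- and a ComfyUI-style IMAGE
# input is emulated as a [1, H, W, 3] float tensor in [0, 1]).
if __name__ == "__main__":
    import numpy as np

    pil = Image.open("test.jpg").convert("RGB")  # hypothetical test image
    arr = np.asarray(pil).astype("float32") / 255.0
    image_tensor = torch.from_numpy(arr).unsqueeze(0)  # [1, H, W, 3]

    loader = Pic2Story_Loader()
    (pic_model,) = loader.load_main("abhijit2111/Pic2Story", "cpu")

    sampler = Pic2Story_Sampler()
    (caption,) = sampler.pic_to_story(image_tensor, pic_model, "a photography of")
    print(caption)
--------------------------------------------------------------------------------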