├── .github └── workflows │ └── publish.yml ├── .gitignore ├── LICENSE ├── README.md ├── VisualQueryTemplate.py ├── __init__.py ├── pyproject.toml └── requirements.txt /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Comfy registry 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | paths: 9 | - "pyproject.toml" 10 | 11 | permissions: 12 | issues: write 13 | 14 | jobs: 15 | publish-node: 16 | name: Publish Custom Node to registry 17 | runs-on: ubuntu-latest 18 | if: ${{ github.repository_owner == 'celoron' }} 19 | steps: 20 | - name: Check out code 21 | uses: actions/checkout@v4 22 | - name: Publish Custom Node 23 | uses: Comfy-Org/publish-node-action@v1 24 | with: 25 | ## Add your own personal access token to your Github Repository secrets and reference it here. 26 | personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .nox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | .pytest_cache/ 48 | cover/ 49 | 50 | # Jupyter Notebook 51 | .ipynb_checkpoints 52 | 53 | # Environments 54 | .env 55 | .venv 56 | env/ 57 | venv/ 58 | ENV/ 59 | env.bak/ 60 | venv.bak/ 61 | 62 | # PyCharm 63 | .idea/ 64 | 65 | # VS Code 66 | .vscode/ 67 | 68 | # macOS 69 | .DS_Store 70 | 71 | # Linux 72 | *.swp 73 | 74 | # Windows 75 | Thumbs.db 76 | ehthumbs.db 77 | Desktop.ini 78 | 79 | # ComfyUI specific 80 | config.json 81 | logs/ 82 | cache/ 83 | 84 | # Custom node specific 85 | node_modules/ 86 | *.log 87 | *.tmp -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Jordan Thompson (WASasquatch) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI-VisualQueryTemplate 2 | A ComfyUI node for transforming images into descriptive text using templated visual question answering. Leverages Hugging Face's VQA models with transformers 3 | 4 | ![Screenshot 2024-08-28 144142](https://github.com/user-attachments/assets/0b2f5724-bad4-4e80-9026-20503019c75c) 5 | 6 | ![image](https://github.com/user-attachments/assets/43b3ba72-9cac-4c82-bd53-74ba5baf2c13) 7 | 8 | ![image](https://github.com/user-attachments/assets/29572468-ff95-4f7d-805e-3d901b7e6299) 9 | -------------------------------------------------------------------------------- /VisualQueryTemplate.py: -------------------------------------------------------------------------------- 1 | import re 2 | from PIL import Image 3 | import numpy as np 4 | import time 5 | from transformers import pipeline 6 | import torch 7 | 8 | def tensor2pil(image): 9 | return Image.fromarray(np.clip(255. * image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8)) 10 | 11 | class VisualQueryTemplateNode: 12 | def __init__(self): 13 | pass 14 | 15 | @classmethod 16 | def INPUT_TYPES(cls): 17 | return { 18 | "required": { 19 | "images": ("IMAGE",), 20 | "model": (["Salesforce/blip-vqa-base", "Salesforce/blip-vqa-capfilt-large", "dandelin/vilt-b32-finetuned-vqa", "microsoft/git-large-vqav2"], ), 21 | "question": ("STRING", {"default": "{eye color} eyes, {hair style} {hair color} hair, {ethnicity} {gender}, {age number} years old, {facialhair}", "multiline": True, "dynamicPrompts": False}), 22 | } 23 | } 24 | 25 | RETURN_TYPES = ("STRING",) 26 | OUTPUT_IS_LIST = (True,) 27 | 28 | FUNCTION = "vqa_image" 29 | CATEGORY = "image" 30 | 31 | def vqa_image(self, images, model, question): 32 | start_time = time.time() 33 | 34 | device = 0 if torch.cuda.is_available() else -1 35 | vqa = pipeline(model=model, device=device) 36 | 37 | answers = [] 38 | 39 | for image in images: 40 | pil_image = tensor2pil(image).convert("RGB") 41 | 42 | final_answer = question 43 | 44 | matches = re.findall(r'\{([^}]*)\}', question) 45 | 46 | for match in matches: 47 | 48 | match_answers = vqa(question=match, image=pil_image) 49 | 50 | print(match, match_answers) 51 | 52 | match_answer = match_answers[0]["answer"] 53 | 54 | final_answer = final_answer.replace("{"+match+"}", match_answer) 55 | 56 | 57 | answers.append(final_answer) 58 | 59 | 60 | end_time = time.time() 61 | execution_time = end_time - start_time 62 | print(f"Execution time: {execution_time} seconds") 63 | 64 | return (answers,) 65 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .VisualQueryTemplate import VisualQueryTemplateNode 2 | 3 | NODE_CLASS_MAPPINGS = { 4 | "VisualQueryTemplateNode": VisualQueryTemplateNode 5 | } 6 | 7 | NODE_DISPLAY_NAME_MAPPINGS = { 8 | # --- MAIN NODES --- 9 | "VisualQueryTemplateNode": "Visual Query Template", 10 | } 11 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "comfyui-visualquerytemplate" 3 | description = "" 4 | version = "1.0.0" 5 | license = {file = "LICENSE"} 6 | dependencies = ["torch", "numpy", "transformers"] 7 | 8 | [project.urls] 9 | Repository = "https://github.com/celoron/ComfyUI-VisualQueryTemplate" 10 | # Used by Comfy Registry https://comfyregistry.org 11 | 12 | [tool.comfy] 13 | PublisherId = "celoron" 14 | DisplayName = "ComfyUI-VisualQueryTemplate" 15 | Icon = "" 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | numpy 3 | transformers --------------------------------------------------------------------------------