├── .gitattributes
├── .github
│   └── workflows
│       └── publish.yml
├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── examples
│   ├── dataset_captioning_example.jpg
│   ├── dataset_captioning_workflow.json
│   ├── llama_vision_bounding_box_example.jpg
│   ├── llama_vision_bounding_box_workflow.json
│   ├── molmo_count_example.jpg
│   ├── molmo_counting_workflow.json
│   ├── molmo_multi_pointing_example.jpg
│   ├── pixtral_caption_example.jpg
│   ├── pixtral_caption_workflow.json
│   └── pixtral_comparison_example.jpg
├── nodes.py
├── pyproject.toml
└── requirements.txt
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish to Comfy registry
2 | on:
3 |   workflow_dispatch:
4 |   push:
5 |     branches:
6 |       - main
7 |       - master
8 |     paths:
9 |       - "pyproject.toml"
10 |
11 | jobs:
12 |   publish-node:
13 |     name: Publish Custom Node to registry
14 |     runs-on: ubuntu-latest
15 |     # Skip the workflow if this is a forked repository.
16 |     if: github.event.repository.fork == false
17 |     steps:
18 |       - name: Check out code
19 |         uses: actions/checkout@v4
20 |       - name: Publish Custom Node
21 |         uses: Comfy-Org/publish-node-action@main
22 |         with:
23 |           ## Add your own personal access token to your Github Repository secrets and reference it here.
24 |           personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }}
25 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ComfyUI-PixtralLlamaMolmoVision
2 |
3 | For loading and running Pixtral, Llama 3.2 Vision, and Molmo models.
4 |
5 | Important change compared to the last version: models should now be placed in the `ComfyUI/models/LLM` folder for better compatibility with other LLM custom nodes. I apologize for making you move your models around if you were using the previous version.
6 |
7 | Includes nodes for loading and running VLMs:
8 | - Load Vision Model
9 | - Load Pixtral Model
10 | - Load Llama Vision Model
11 | - Load Molmo Model
12 | - Generate Text with Pixtral
13 | - Generate Text with Llama Vision
14 | - Generate Text with Molmo
15 |
16 | Along with some utility nodes for working with text:
17 | - Parse Bounding Boxes
18 | - Parse Points
19 | - Plot Points
20 | - Regex Split String
21 | - Regex Search
22 | - Regex Find All
23 | - Regex Substitution
24 | - Join String
25 | - Select Index
26 | - Slice List
27 |
28 | The Load Vision Model node can load any of these model types, but it will also list other models in the LLM folder. Other model types, such as Florence2, will not work with these nodes.
29 |
30 | The other model loading nodes are for specific model types and will filter the list to just that model type.
31 |
32 | The text generation nodes are model-specific. Pixtral seems to be the only one of these that currently supports repetition penalty. I plan to add more nodes for building prompts that follow a chat sequence.
33 |
34 | The Generate Text with Pixtral node accepts the `[IMG]` special token in the prompt; include one token for each image you want to process in a single prompt. If these tags aren't present, they will be added automatically at the beginning of your prompt. The Llama and Molmo models add the images to the beginning of the prompt automatically; while they do support processing multiple images at once, they don't support placing images at arbitrary positions in the prompt like Pixtral does.
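
For reference, outside of ComfyUI the same multi-image prompting could be sketched roughly like this with `transformers` (the model path and instruction here are placeholders, not taken from this repo):

```python
# Hypothetical sketch of Pixtral-style multi-image prompting; the model path is an assumption.
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_path = "ComfyUI/models/LLM/pixtral-12b-nf4"
processor = AutoProcessor.from_pretrained(model_path)
model = LlavaForConditionalGeneration.from_pretrained(model_path, device_map="cuda")

images = [Image.open("left.png"), Image.open("right.png")]
# One [IMG] token per image; unlike Llama/Molmo, they can sit anywhere in the instruction.
prompt = "[INST]Compare these two images:\n[IMG][IMG][/INST]"

inputs = processor(images=images, text=prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=256)
print(processor.decode(output_ids[0], skip_special_tokens=True))
```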
35 |
36 | System prompts are optional. I didn't include one for Pixtral because the current setup already uses the `[INST]` special token, so the Pixtral prompting already works more like a system prompt than a user conversation. I might change this later.
37 |
38 | Use `trust_remote_code` at your own risk. (I think Molmo looks safe, though)
39 |
40 | ## Installation
41 |
42 | Available in [ComfyUI-Manager](https://github.com/ltdrdata/ComfyUI-Manager) as ComfyUI-PixtralLlamaVision. When installed from ComfyUI-Manager, the required packages will be installed automatically. You may need to update your pytorch version.
43 |
44 | If you install by cloning this repo into your custom nodes folder, you'll need to install `transformers >= 4.45.0` to load Pixtral and Llama Vision models, and you'll also need to make sure `accelerate`, `bitsandbytes`, and `torchvision` are up to date. You can install these in the Windows portable version of ComfyUI with:
45 | `python_embeded\python.exe -m pip install -r ComfyUI\custom_nodes\ComfyUI-PixtralLlamaVision\requirements.txt`
46 |
47 | Models should be placed in the `ComfyUI/models/LLM` folder, with each model inside a folder with the `model.safetensors` file along with any config files and the tokenizer.
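
As an illustration, a model folder might look something like this (exact file names vary by model; this layout is an example, not a requirement of these nodes):

```
ComfyUI/models/LLM/
└── pixtral-12b-nf4/
    ├── config.json
    ├── model.safetensors
    ├── preprocessor_config.json
    ├── tokenizer_config.json
    └── tokenizer.json
```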
48 |
49 | You can get 4-bit quantized versions of Pixtral-12B and Llama-3.2-11B-Vision-Instruct that are compatible with these custom nodes here:
50 |
51 | [https://huggingface.co/SeanScripts/pixtral-12b-nf4](https://huggingface.co/SeanScripts/pixtral-12b-nf4)
52 |
53 | [https://huggingface.co/SeanScripts/Llama-3.2-11B-Vision-Instruct-nf4](https://huggingface.co/SeanScripts/Llama-3.2-11B-Vision-Instruct-nf4)
54 |
55 | Unfortunately, the Pixtral nf4 model has considerably degraded performance on some tasks, like OCR. The Llama Vision model seems to be better for this task.
56 |
57 | ## Examples
58 |
59 | Example Pixtral image captioning (not saving the output to a text file in this example):
60 | 
61 |
62 | All of these models should work very well for image captioning, even in 4-bit quantization. You can also customize your captioning instructions. Larger images might not work as well with Pixtral, so scaling them down to something like 512 x 512 before sending them to the text generation node might be a good idea. It's also worth noting that the nf4 Pixtral model has significantly degraded performance on images that consist mainly of text.
63 |
64 | Example Molmo dataset captioning for a LoRA:
65 | 
66 |
67 | This workflow sends a list of images to the text generation node to caption each of them sequentially, and saves image/caption pairs to a folder with matching names (`1.png`, `1.txt`, etc.) for an easy LoRA training setup.
68 |
69 | Note that to caption each image separately, this input should be a list of images, not a batch, because these models treat a batch as multiple images for a single generation. These nodes don't currently support batched text generation, but I might add that in the future. Doing one text generation task at a time is probably better for people with normal amounts of VRAM anyway.
70 |
71 | Example Pixtral image comparison:
72 | 
73 |
74 | I haven't been able to get image comparison to work well at all with Llama Vision. It doesn't give any errors, but the multi-image understanding just isn't there. The image tokens have to be **before** the question/instruction, and consecutive, for the model to even see both images at once (I found this out by looking at the image preprocessor cross-attention implementation), and even then it seems to randomly mix up which image is first or second, which is left or right, which colors belong to which, and other details. In my opinion it isn't usable for tasks that involve two images in the same message. I'm not sure whether the non-quantized model is better at this.
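
For what it's worth, here is a minimal sketch of how such a two-image Llama 3.2 Vision prompt could be built with the `transformers` processor (the model path and question are placeholders, and the message format is assumed from the standard Mllama chat template):

```python
# Hypothetical sketch: both image entries come before the text so the cross-attention
# can see both images, matching the ordering described above.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("ComfyUI/models/LLM/Llama-3.2-11B-Vision-Instruct-nf4")
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "image"},
        {"type": "text", "text": "What differs between these two images?"},
    ]},
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
print(prompt)  # Rendered prompt with two consecutive <|image|> tokens before the question.
```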
75 |
76 | Since Pixtral tokenizes the input images directly, it can handle them inline in the context, with any number of images at any aspect ratio, but it's limited by context length, since each image can take around 1,000 to 4,000 tokens.
77 |
78 | Example Llama Vision object detection with bounding box:
79 | 
80 |
81 | Both Pixtral and Llama kind of work for this, but not that well. They definitely have some understanding of the positions of objects in the image, though. Maybe it needs a better prompt. Or a non-quantized model. Or a finetune. But it does sometimes work. Surprisingly, Molmo is pretty bad at this, though it is capable of pointing and counting.
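
As a rough illustration of what parsing such output involves (a simplified sketch, not the actual Parse Bounding Boxes implementation), relative coordinates from the model's text can be scaled to pixel coordinates like this:

```python
import re

def parse_relative_bbox(text: str, width: int, height: int):
    """Turn '[(x1, y1), (x2, y2)]'-style relative coordinates into a pixel bounding box."""
    values = [float(v) for v in re.findall(r"-?\d+(?:\.\d+)?", text)]
    if len(values) < 4:
        return None
    x1, y1, x2, y2 = values[:4]
    return [int(x1 * width), int(y1 * height), int(x2 * width), int(y2 * height)]

# Example model output, scaled for a 720 x 480 image:
text = "The bounding box coordinates of the text are: [(0.05, 0.15), (0.95, 0.45)]."
print(parse_relative_bbox(text, 720, 480))  # [36, 72, 684, 216]
```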
82 |
83 | Example Molmo counting:
84 | 
85 |
86 | Example Molmo pointing, with labels:
87 | 
88 |
89 | I wasn't able to get it to point at both objects with a single prompt for some reason (it would just assign both labels to both points), but splitting it into two simple prompts like this isn't too bad.
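
For completeness, here is a small sketch of parsing Molmo-style point output (this assumes Molmo's usual `<point x="..." y="...">` tag format with coordinates given as percentages of the image size; it is an illustration, not the actual Parse Points implementation):

```python
import re

def parse_molmo_points(text: str, width: int, height: int):
    """Extract (x, y) pixel coordinates from <point x="..." y="..."> tags in Molmo output."""
    points = []
    for x, y in re.findall(r'<point[^>]*\bx="([\d.]+)"[^>]*\by="([\d.]+)"', text):
        # Coordinates are assumed to be percentages (0-100) of the image dimensions.
        points.append((float(x) / 100.0 * width, float(y) / 100.0 * height))
    return points

example = '<point x="25.5" y="40.0" alt="red ball">red ball</point>'
print(parse_molmo_points(example, 1024, 768))  # roughly [(261.1, 307.2)]
```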
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | from .nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
2 |
3 | __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
--------------------------------------------------------------------------------
/examples/dataset_captioning_example.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanScripts/ComfyUI-PixtralLlamaMolmoVision/48fab4b9814f4602528bb2144f3fd0da8b9b8c36/examples/dataset_captioning_example.jpg
--------------------------------------------------------------------------------
/examples/dataset_captioning_workflow.json:
--------------------------------------------------------------------------------
1 | {
2 | "last_node_id": 25,
3 | "last_link_id": 24,
4 | "nodes": [
5 | {
6 | "id": 17,
7 | "type": "Note",
8 | "pos": {
9 | "0": 1864,
10 | "1": 871
11 | },
12 | "size": {
13 | "0": 260.5903625488281,
14 | "1": 137.3493194580078
15 | },
16 | "flags": {},
17 | "order": 0,
18 | "mode": 0,
19 | "inputs": [],
20 | "outputs": [],
21 | "properties": {},
22 | "widgets_values": [
23 | "Folder within the ComfyUI output folder where you want the images + captions to be saved with the same name (1.png, 1.txt, etc.)"
24 | ],
25 | "color": "#432",
26 | "bgcolor": "#653"
27 | },
28 | {
29 | "id": 16,
30 | "type": "Note",
31 | "pos": {
32 | "0": 1864,
33 | "1": 431
34 | },
35 | "size": {
36 | "0": 282.4085388183594,
37 | "1": 132.3493194580078
38 | },
39 | "flags": {},
40 | "order": 1,
41 | "mode": 0,
42 | "inputs": [],
43 | "outputs": [],
44 | "properties": {},
45 | "widgets_values": [
46 | "Path here should be the same as the path below, but as a relative path from where you are running ComfyUI, or as an absolute path"
47 | ],
48 | "color": "#432",
49 | "bgcolor": "#653"
50 | },
51 | {
52 | "id": 18,
53 | "type": "Note",
54 | "pos": {
55 | "0": 267,
56 | "1": 795
57 | },
58 | "size": {
59 | "0": 279.317626953125,
60 | "1": 145.62205505371094
61 | },
62 | "flags": {},
63 | "order": 2,
64 | "mode": 0,
65 | "inputs": [],
66 | "outputs": [],
67 | "properties": {},
68 | "widgets_values": [
69 | "Path to a folder with the images you want to caption (filenames don't matter).\nimage_load_cap = 0 will load all the images from the folder."
70 | ],
71 | "color": "#432",
72 | "bgcolor": "#653"
73 | },
74 | {
75 | "id": 24,
76 | "type": "MolmoGenerateText",
77 | "pos": {
78 | "0": 990,
79 | "1": 401
80 | },
81 | "size": {
82 | "0": 400,
83 | "1": 362
84 | },
85 | "flags": {},
86 | "order": 8,
87 | "mode": 0,
88 | "inputs": [
89 | {
90 | "name": "molmo_model",
91 | "type": "VISION_MODEL",
92 | "link": 22
93 | },
94 | {
95 | "name": "images",
96 | "type": "IMAGE",
97 | "link": 23
98 | }
99 | ],
100 | "outputs": [
101 | {
102 | "name": "STRING",
103 | "type": "STRING",
104 | "links": [
105 | 24
106 | ],
107 | "slot_index": 0,
108 | "shape": 3
109 | }
110 | ],
111 | "properties": {
112 | "Node name for S&R": "MolmoGenerateText"
113 | },
114 | "widgets_values": [
115 | "",
116 | "Describe this image in detail. ",
117 | 256,
118 | true,
119 | 0.3,
120 | 0.9,
121 | 40,
122 | "<|endoftext|>",
123 | 3561946545,
124 | "randomize",
125 | false
126 | ]
127 | },
128 | {
129 | "id": 23,
130 | "type": "MolmoModelLoader",
131 | "pos": {
132 | "0": 572,
133 | "1": 407
134 | },
135 | "size": {
136 | "0": 315,
137 | "1": 58
138 | },
139 | "flags": {},
140 | "order": 3,
141 | "mode": 0,
142 | "inputs": [],
143 | "outputs": [
144 | {
145 | "name": "VISION_MODEL",
146 | "type": "VISION_MODEL",
147 | "links": [
148 | 22
149 | ],
150 | "slot_index": 0,
151 | "shape": 3
152 | }
153 | ],
154 | "properties": {
155 | "Node name for S&R": "MolmoModelLoader"
156 | },
157 | "widgets_values": [
158 | "molmo-7B-D-bnb-4bit"
159 | ]
160 | },
161 | {
162 | "id": 19,
163 | "type": "Note",
164 | "pos": {
165 | "0": 290,
166 | "1": 368
167 | },
168 | "size": {
169 | "0": 264.54547119140625,
170 | "1": 134.54547119140625
171 | },
172 | "flags": {},
173 | "order": 4,
174 | "mode": 0,
175 | "inputs": [],
176 | "outputs": [],
177 | "properties": {},
178 | "widgets_values": [
179 | "Llama 3.2 11B Vision and Molmo are probably better quality than Pixtral for this"
180 | ],
181 | "color": "#432",
182 | "bgcolor": "#653"
183 | },
184 | {
185 | "id": 8,
186 | "type": "Save Text File",
187 | "pos": {
188 | "0": 1537,
189 | "1": 407
190 | },
191 | "size": {
192 | "0": 303.0448913574219,
193 | "1": 174
194 | },
195 | "flags": {},
196 | "order": 10,
197 | "mode": 0,
198 | "inputs": [
199 | {
200 | "name": "text",
201 | "type": "STRING",
202 | "link": 24,
203 | "widget": {
204 | "name": "text"
205 | }
206 | },
207 | {
208 | "name": "filename_prefix",
209 | "type": "STRING",
210 | "link": 9,
211 | "widget": {
212 | "name": "filename_prefix"
213 | }
214 | }
215 | ],
216 | "outputs": [],
217 | "properties": {
218 | "Node name for S&R": "Save Text File"
219 | },
220 | "widgets_values": [
221 | "",
222 | ".\\ComfyUI\\output\\images_with_captions",
223 | "",
224 | "",
225 | 0,
226 | ".txt",
227 | "utf-8"
228 | ]
229 | },
230 | {
231 | "id": 4,
232 | "type": "ListCounter //Inspire",
233 | "pos": {
234 | "0": 954,
235 | "1": 864
236 | },
237 | "size": {
238 | "0": 210,
239 | "1": 58
240 | },
241 | "flags": {},
242 | "order": 7,
243 | "mode": 0,
244 | "inputs": [
245 | {
246 | "name": "signal",
247 | "type": "*",
248 | "link": 3
249 | }
250 | ],
251 | "outputs": [
252 | {
253 | "name": "INT",
254 | "type": "INT",
255 | "links": [
256 | 1
257 | ],
258 | "slot_index": 0,
259 | "shape": 3
260 | }
261 | ],
262 | "properties": {
263 | "Node name for S&R": "ListCounter //Inspire"
264 | },
265 | "widgets_values": [
266 | 1
267 | ]
268 | },
269 | {
270 | "id": 6,
271 | "type": "SomethingToString",
272 | "pos": {
273 | "0": 1223,
274 | "1": 865
275 | },
276 | "size": {
277 | "0": 210,
278 | "1": 82
279 | },
280 | "flags": {},
281 | "order": 9,
282 | "mode": 0,
283 | "inputs": [
284 | {
285 | "name": "input",
286 | "type": "*",
287 | "link": 1
288 | }
289 | ],
290 | "outputs": [
291 | {
292 | "name": "STRING",
293 | "type": "STRING",
294 | "links": [
295 | 9,
296 | 14,
297 | 15
298 | ],
299 | "slot_index": 0,
300 | "shape": 3
301 | }
302 | ],
303 | "properties": {
304 | "Node name for S&R": "SomethingToString"
305 | },
306 | "widgets_values": [
307 | "",
308 | ""
309 | ]
310 | },
311 | {
312 | "id": 1,
313 | "type": "LoadImageListFromDir //Inspire",
314 | "pos": {
315 | "0": 570,
316 | "1": 786
317 | },
318 | "size": {
319 | "0": 315,
320 | "1": 170
321 | },
322 | "flags": {},
323 | "order": 5,
324 | "mode": 0,
325 | "inputs": [],
326 | "outputs": [
327 | {
328 | "name": "IMAGE",
329 | "type": "IMAGE",
330 | "links": [
331 | 3,
332 | 13,
333 | 23
334 | ],
335 | "slot_index": 0,
336 | "shape": 6
337 | },
338 | {
339 | "name": "MASK",
340 | "type": "MASK",
341 | "links": null,
342 | "shape": 6
343 | },
344 | {
345 | "name": "FILE PATH",
346 | "type": "STRING",
347 | "links": null,
348 | "shape": 6
349 | }
350 | ],
351 | "properties": {
352 | "Node name for S&R": "LoadImageListFromDir //Inspire"
353 | },
354 | "widgets_values": [
355 | "E:\\datasets\\example",
356 | 0,
357 | 0,
358 | false
359 | ]
360 | },
361 | {
362 | "id": 13,
363 | "type": "> Save Image",
364 | "pos": {
365 | "0": 1540,
366 | "1": 790
367 | },
368 | "size": {
369 | "0": 299.77215576171875,
370 | "1": 406
371 | },
372 | "flags": {},
373 | "order": 11,
374 | "mode": 0,
375 | "inputs": [
376 | {
377 | "name": "images",
378 | "type": "IMAGE",
379 | "link": 13
380 | },
381 | {
382 | "name": "filename_opt",
383 | "type": "STRING",
384 | "link": 14,
385 | "widget": {
386 | "name": "filename_opt"
387 | }
388 | },
389 | {
390 | "name": "filename_prefix",
391 | "type": "STRING",
392 | "link": 15,
393 | "widget": {
394 | "name": "filename_prefix"
395 | }
396 | }
397 | ],
398 | "outputs": [],
399 | "properties": {
400 | "Node name for S&R": "> Save Image"
401 | },
402 | "widgets_values": [
403 | "",
404 | "images_with_captions",
405 | false,
406 | false,
407 | "png",
408 | 100,
409 | ""
410 | ]
411 | },
412 | {
413 | "id": 25,
414 | "type": "Note",
415 | "pos": {
416 | "0": 1301,
417 | "1": 1068
418 | },
419 | "size": [
420 | 221.29719695532322,
421 | 122.35482607937593
422 | ],
423 | "flags": {},
424 | "order": 6,
425 | "mode": 0,
426 | "inputs": [],
427 | "outputs": [],
428 | "properties": {},
429 | "widgets_values": [
430 | "This node is from ComfyUI_yanc, idk why \"Install Missing Custom Nodes\" doesn't work for this one"
431 | ],
432 | "color": "#432",
433 | "bgcolor": "#653"
434 | }
435 | ],
436 | "links": [
437 | [
438 | 1,
439 | 4,
440 | 0,
441 | 6,
442 | 0,
443 | "*"
444 | ],
445 | [
446 | 3,
447 | 1,
448 | 0,
449 | 4,
450 | 0,
451 | "*"
452 | ],
453 | [
454 | 9,
455 | 6,
456 | 0,
457 | 8,
458 | 1,
459 | "STRING"
460 | ],
461 | [
462 | 13,
463 | 1,
464 | 0,
465 | 13,
466 | 0,
467 | "IMAGE"
468 | ],
469 | [
470 | 14,
471 | 6,
472 | 0,
473 | 13,
474 | 1,
475 | "STRING"
476 | ],
477 | [
478 | 15,
479 | 6,
480 | 0,
481 | 13,
482 | 2,
483 | "STRING"
484 | ],
485 | [
486 | 22,
487 | 23,
488 | 0,
489 | 24,
490 | 0,
491 | "VISION_MODEL"
492 | ],
493 | [
494 | 23,
495 | 1,
496 | 0,
497 | 24,
498 | 1,
499 | "IMAGE"
500 | ],
501 | [
502 | 24,
503 | 24,
504 | 0,
505 | 8,
506 | 0,
507 | "STRING"
508 | ]
509 | ],
510 | "groups": [],
511 | "config": {},
512 | "extra": {
513 | "ds": {
514 | "scale": 0.7513148009015777,
515 | "offset": [
516 | -54.49319695532304,
517 | 20.897173920624354
518 | ]
519 | }
520 | },
521 | "version": 0.4
522 | }
--------------------------------------------------------------------------------
/examples/llama_vision_bounding_box_example.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanScripts/ComfyUI-PixtralLlamaMolmoVision/48fab4b9814f4602528bb2144f3fd0da8b9b8c36/examples/llama_vision_bounding_box_example.jpg
--------------------------------------------------------------------------------
/examples/llama_vision_bounding_box_workflow.json:
--------------------------------------------------------------------------------
1 | {"last_node_id":48,"last_link_id":66,"nodes":[{"id":30,"type":"RegexSubstitution","pos":[480,370],"size":[210,102],"flags":{},"order":4,"mode":0,"inputs":[{"name":"string","type":"STRING","link":47,"widget":{"name":"string"}},{"name":"replace","type":"STRING","link":46,"widget":{"name":"replace"}}],"outputs":[{"name":"STRING","type":"STRING","links":[64],"slot_index":0,"shape":3}],"properties":{"Node name for S&R":"RegexSubstitution"},"widgets_values":["{object}","test {object} test","test","M"]},{"id":15,"type":"DF_Text_Box","pos":[75,550],"size":[247.62606811523438,92.8243179321289],"flags":{},"order":0,"mode":0,"inputs":[],"outputs":[{"name":"STRING","type":"STRING","links":[46],"slot_index":0,"shape":3}],"properties":{"Node name for S&R":"DF_Text_Box"},"widgets_values":["text"]},{"id":1,"type":"LoadImage","pos":[366,605],"size":[315,314],"flags":{},"order":1,"mode":0,"inputs":[],"outputs":[{"name":"IMAGE","type":"IMAGE","links":[7,12,63],"slot_index":0,"shape":3},{"name":"MASK","type":"MASK","links":null,"shape":3}],"properties":{"Node name for S&R":"LoadImage"},"widgets_values":["CogVideoX-I2V_00006.png","image"]},{"id":10,"type":"ParseBoundingBoxes","pos":[1125,315],"size":[210,102],"flags":{},"order":6,"mode":0,"inputs":[{"name":"image","type":"IMAGE","link":12},{"name":"string","type":"STRING","link":65,"widget":{"name":"string"}}],"outputs":[{"name":"BBOX","type":"BBOX","links":[26,27],"slot_index":0,"shape":3}],"properties":{"Node name for S&R":"ParseBoundingBoxes"},"widgets_values":["",true,true]},{"id":12,"type":"Display Any (rgthree)","pos":[1418,251],"size":[248.57421875,85.7146224975586],"flags":{},"order":9,"mode":0,"inputs":[{"name":"source","type":"*","link":27,"dir":3}],"outputs":[],"properties":{"Node name for S&R":"Display Any (rgthree)"},"widgets_values":["[[36, 72, 648, 144]]"]},{"id":6,"type":"BboxVisualize","pos":[1431,410],"size":[210,78],"flags":{},"order":8,"mode":0,"inputs":[{"name":"images","type":"IMAGE","link":7},{"name":"bboxes","type":"BBOX","link":26}],"outputs":[{"name":"images","type":"IMAGE","links":[8],"slot_index":0,"shape":3}],"properties":{"Node name for S&R":"BboxVisualize"},"widgets_values":[3]},{"id":7,"type":"PreviewImage","pos":[1702,404],"size":[443,489],"flags":{},"order":10,"mode":0,"inputs":[{"name":"images","type":"IMAGE","link":8}],"outputs":[],"properties":{"Node name for S&R":"PreviewImage"},"widgets_values":[]},{"id":4,"type":"Display Any (rgthree)","pos":[1692,138],"size":[502.48602294921875,207.90098571777344],"flags":{},"order":7,"mode":0,"inputs":[{"name":"source","type":"*","link":66,"dir":3}],"outputs":[],"properties":{"Node name for S&R":"Display Any (rgthree)"},"widgets_values":["The bounding box coordinates of the text are: [(0.05, 0.15), (0.95, 0.45)]."]},{"id":44,"type":"LlamaVisionModelLoader","pos":[489,175],"size":[351.37103271484375,58],"flags":{},"order":2,"mode":0,"inputs":[],"outputs":[{"name":"VISION_MODEL","type":"VISION_MODEL","links":[62],"slot_index":0,"shape":3}],"properties":{"Node name for S&R":"LlamaVisionModelLoader"},"widgets_values":["Llama-3.2-11B-Vision-Instruct-nf4"]},{"id":46,"type":"LlamaVisionGenerateText","pos":[722,293],"size":[383,384],"flags":{},"order":5,"mode":0,"inputs":[{"name":"llama_vision_model","type":"VISION_MODEL","link":62},{"name":"images","type":"IMAGE","link":63,"shape":7},{"name":"prompt","type":"STRING","link":64,"widget":{"name":"prompt"}}],"outputs":[{"name":"STRING","type":"STRING","links":[65,66]}],"properties":{"Node name for 
S&R":"LlamaVisionGenerateText"},"widgets_values":["","Caption this image.",256,true,0.3,0.9,40,"<|eot_id|>",929916883,"randomize",false,false]},{"id":14,"type":"DF_Text_Box","pos":[25,132],"size":[419.071044921875,355.2226867675781],"flags":{},"order":3,"mode":0,"inputs":[],"outputs":[{"name":"STRING","type":"STRING","links":[47],"slot_index":0,"shape":3}],"properties":{"Node name for S&R":"DF_Text_Box"},"widgets_values":["Create an approximate bounding box containing the {object} in this image. Your response should only include the coordinates of the upper-left and lower-right corners of the bounding box relative to the image size, e.g. [(0.0, 0.0), (1.0, 1.0)] is a bounding box that covers the entire image. The bounding box coordinates of the {object} are:"]}],"links":[[7,1,0,6,0,"IMAGE"],[8,6,0,7,0,"IMAGE"],[12,1,0,10,0,"IMAGE"],[26,10,0,6,1,"BBOX"],[27,10,0,12,0,"*"],[46,15,0,30,1,"STRING"],[47,14,0,30,0,"STRING"],[62,44,0,46,0,"VISION_MODEL"],[63,1,0,46,1,"IMAGE"],[64,30,0,46,2,"STRING"],[65,46,0,10,1,"STRING"],[66,46,0,4,0,"*"]],"groups":[],"config":{},"extra":{"ds":{"scale":1,"offset":[191.67246566216193,168.1576626509841]},"node_versions":{"ComfyUI-PixtralLlamaMolmoVision":"01728a16308eaa501dd025cb70f14a3b07a322a1","Derfuu_ComfyUI_ModdedNodes":"d0905bed31249f2bd0814c67585cf4fe3c77c015","comfy-core":"0.3.13","rgthree-comfy":"5d771b8b56a343c24a26e8cea1f0c87c3d58102f","ComfyUI-KJNodes":"2abf557e3d6ae6618456a190044a85a52f2a585a"},"VHS_latentpreview":false,"VHS_latentpreviewrate":0},"version":0.4}
--------------------------------------------------------------------------------
/examples/molmo_count_example.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanScripts/ComfyUI-PixtralLlamaMolmoVision/48fab4b9814f4602528bb2144f3fd0da8b9b8c36/examples/molmo_count_example.jpg
--------------------------------------------------------------------------------
/examples/molmo_counting_workflow.json:
--------------------------------------------------------------------------------
1 | {
2 | "last_node_id": 58,
3 | "last_link_id": 114,
4 | "nodes": [
5 | {
6 | "id": 56,
7 | "type": "ParsePoints",
8 | "pos": {
9 | "0": 1086,
10 | "1": 335
11 | },
12 | "size": [
13 | 315,
14 | 122
15 | ],
16 | "flags": {},
17 | "order": 4,
18 | "mode": 0,
19 | "inputs": [
20 | {
21 | "name": "string",
22 | "type": "STRING",
23 | "link": 111,
24 | "widget": {
25 | "name": "string"
26 | }
27 | }
28 | ],
29 | "outputs": [
30 | {
31 | "name": "POINT",
32 | "type": "POINT",
33 | "links": [
34 | 112
35 | ],
36 | "shape": 3,
37 | "slot_index": 0
38 | },
39 | {
40 | "name": "STRING",
41 | "type": "STRING",
42 | "links": null,
43 | "shape": 3
44 | },
45 | {
46 | "name": "STRING",
47 | "type": "STRING",
48 | "links": null,
49 | "shape": 3
50 | }
51 | ],
52 | "properties": {
53 | "Node name for S&R": "ParsePoints"
54 | },
55 | "widgets_values": [
56 | "",
57 | ""
58 | ]
59 | },
60 | {
61 | "id": 54,
62 | "type": "MolmoModelLoader",
63 | "pos": {
64 | "0": 313,
65 | "1": 248
66 | },
67 | "size": {
68 | "0": 315,
69 | "1": 58
70 | },
71 | "flags": {},
72 | "order": 0,
73 | "mode": 0,
74 | "inputs": [],
75 | "outputs": [
76 | {
77 | "name": "VISION_MODEL",
78 | "type": "VISION_MODEL",
79 | "links": [
80 | 108
81 | ],
82 | "shape": 3,
83 | "slot_index": 0
84 | }
85 | ],
86 | "properties": {
87 | "Node name for S&R": "MolmoModelLoader"
88 | },
89 | "widgets_values": [
90 | "molmo-7B-D-bnb-4bit"
91 | ]
92 | },
93 | {
94 | "id": 58,
95 | "type": "PreviewImage",
96 | "pos": {
97 | "0": 1771,
98 | "1": 334
99 | },
100 | "size": [
101 | 382.1099039489918,
102 | 403.9147862091281
103 | ],
104 | "flags": {},
105 | "order": 6,
106 | "mode": 0,
107 | "inputs": [
108 | {
109 | "name": "images",
110 | "type": "IMAGE",
111 | "link": 113
112 | }
113 | ],
114 | "outputs": [],
115 | "properties": {
116 | "Node name for S&R": "PreviewImage"
117 | }
118 | },
119 | {
120 | "id": 4,
121 | "type": "LoadImage",
122 | "pos": {
123 | "0": 324,
124 | "1": 358
125 | },
126 | "size": {
127 | "0": 294.7367248535156,
128 | "1": 375.8291931152344
129 | },
130 | "flags": {},
131 | "order": 1,
132 | "mode": 0,
133 | "inputs": [],
134 | "outputs": [
135 | {
136 | "name": "IMAGE",
137 | "type": "IMAGE",
138 | "links": [
139 | 109,
140 | 114
141 | ],
142 | "slot_index": 0,
143 | "shape": 3
144 | },
145 | {
146 | "name": "MASK",
147 | "type": "MASK",
148 | "links": null,
149 | "slot_index": 1,
150 | "shape": 3
151 | }
152 | ],
153 | "properties": {
154 | "Node name for S&R": "LoadImage"
155 | },
156 | "widgets_values": [
157 | "Flux_00207_ (1).png",
158 | "image"
159 | ]
160 | },
161 | {
162 | "id": 57,
163 | "type": "PlotPoints",
164 | "pos": {
165 | "0": 1429,
166 | "1": 340
167 | },
168 | "size": {
169 | "0": 315,
170 | "1": 150
171 | },
172 | "flags": {},
173 | "order": 5,
174 | "mode": 0,
175 | "inputs": [
176 | {
177 | "name": "points",
178 | "type": "POINT",
179 | "link": 112
180 | },
181 | {
182 | "name": "image",
183 | "type": "IMAGE",
184 | "link": 114
185 | }
186 | ],
187 | "outputs": [
188 | {
189 | "name": "IMAGE",
190 | "type": "IMAGE",
191 | "links": [
192 | 113
193 | ],
194 | "shape": 3,
195 | "slot_index": 0
196 | }
197 | ],
198 | "properties": {
199 | "Node name for S&R": "PlotPoints"
200 | },
201 | "widgets_values": [
202 | 10,
203 | 40,
204 | "#ff00ff",
205 | ""
206 | ]
207 | },
208 | {
209 | "id": 55,
210 | "type": "MolmoGenerateText",
211 | "pos": {
212 | "0": 656,
213 | "1": 301
214 | },
215 | "size": {
216 | "0": 400,
217 | "1": 312
218 | },
219 | "flags": {},
220 | "order": 2,
221 | "mode": 0,
222 | "inputs": [
223 | {
224 | "name": "molmo_model",
225 | "type": "VISION_MODEL",
226 | "link": 108
227 | },
228 | {
229 | "name": "images",
230 | "type": "IMAGE",
231 | "link": 109
232 | }
233 | ],
234 | "outputs": [
235 | {
236 | "name": "STRING",
237 | "type": "STRING",
238 | "links": [
239 | 110,
240 | 111
241 | ],
242 | "shape": 3,
243 | "slot_index": 0
244 | }
245 | ],
246 | "properties": {
247 | "Node name for S&R": "MolmoGenerateText"
248 | },
249 | "widgets_values": [
250 | "Count the people in this image.",
251 | 512,
252 | true,
253 | 0.3,
254 | 0.9,
255 | 40,
256 | "<|endoftext|>",
257 | 2130269981,
258 | "randomize",
259 | false
260 | ]
261 | },
262 | {
263 | "id": 3,
264 | "type": "Display Any (rgthree)",
265 | "pos": {
266 | "0": 1109,
267 | "1": 6
268 | },
269 | "size": [
270 | 608.1341091642305,
271 | 278.6934670890276
272 | ],
273 | "flags": {},
274 | "order": 3,
275 | "mode": 0,
276 | "inputs": [
277 | {
278 | "name": "source",
279 | "type": "*",
280 | "link": 110,
281 | "dir": 3
282 | }
283 | ],
284 | "outputs": [],
285 | "properties": {
286 | "Node name for S&R": "Display Any (rgthree)"
287 | },
288 | "widgets_values": [
289 | ""
290 | ]
291 | }
292 | ],
293 | "links": [
294 | [
295 | 108,
296 | 54,
297 | 0,
298 | 55,
299 | 0,
300 | "VISION_MODEL"
301 | ],
302 | [
303 | 109,
304 | 4,
305 | 0,
306 | 55,
307 | 1,
308 | "IMAGE"
309 | ],
310 | [
311 | 110,
312 | 55,
313 | 0,
314 | 3,
315 | 0,
316 | "*"
317 | ],
318 | [
319 | 111,
320 | 55,
321 | 0,
322 | 56,
323 | 0,
324 | "STRING"
325 | ],
326 | [
327 | 112,
328 | 56,
329 | 0,
330 | 57,
331 | 0,
332 | "POINT"
333 | ],
334 | [
335 | 113,
336 | 57,
337 | 0,
338 | 58,
339 | 0,
340 | "IMAGE"
341 | ],
342 | [
343 | 114,
344 | 4,
345 | 0,
346 | 57,
347 | 1,
348 | "IMAGE"
349 | ]
350 | ],
351 | "groups": [],
352 | "config": {},
353 | "extra": {
354 | "ds": {
355 | "scale": 1.2284597357367266,
356 | "offset": [
357 | -254.51223899431685,
358 | 74.38924093465424
359 | ]
360 | }
361 | },
362 | "version": 0.4
363 | }
--------------------------------------------------------------------------------
/examples/molmo_multi_pointing_example.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanScripts/ComfyUI-PixtralLlamaMolmoVision/48fab4b9814f4602528bb2144f3fd0da8b9b8c36/examples/molmo_multi_pointing_example.jpg
--------------------------------------------------------------------------------
/examples/pixtral_caption_example.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanScripts/ComfyUI-PixtralLlamaMolmoVision/48fab4b9814f4602528bb2144f3fd0da8b9b8c36/examples/pixtral_caption_example.jpg
--------------------------------------------------------------------------------
/examples/pixtral_caption_workflow.json:
--------------------------------------------------------------------------------
1 | {
2 | "last_node_id": 19,
3 | "last_link_id": 38,
4 | "nodes": [
5 | {
6 | "id": 10,
7 | "type": "PixtralModelLoader",
8 | "pos": {
9 | "0": 364,
10 | "1": 304
11 | },
12 | "size": {
13 | "0": 315,
14 | "1": 58
15 | },
16 | "flags": {},
17 | "order": 0,
18 | "mode": 0,
19 | "inputs": [],
20 | "outputs": [
21 | {
22 | "name": "PIXTRAL_MODEL",
23 | "type": "PIXTRAL_MODEL",
24 | "links": [
25 | 36
26 | ],
27 | "slot_index": 0,
28 | "shape": 3
29 | }
30 | ],
31 | "properties": {
32 | "Node name for S&R": "PixtralModelLoader"
33 | },
34 | "widgets_values": [
35 | "pixtral-12b-nf4"
36 | ]
37 | },
38 | {
39 | "id": 4,
40 | "type": "LoadImage",
41 | "pos": {
42 | "0": 362,
43 | "1": 463
44 | },
45 | "size": {
46 | "0": 315,
47 | "1": 314
48 | },
49 | "flags": {},
50 | "order": 1,
51 | "mode": 0,
52 | "inputs": [],
53 | "outputs": [
54 | {
55 | "name": "IMAGE",
56 | "type": "IMAGE",
57 | "links": [
58 | 37
59 | ],
60 | "slot_index": 0,
61 | "shape": 3
62 | },
63 | {
64 | "name": "MASK",
65 | "type": "MASK",
66 | "links": null,
67 | "shape": 3
68 | }
69 | ],
70 | "properties": {
71 | "Node name for S&R": "LoadImage"
72 | },
73 | "widgets_values": [
74 | "test.png",
75 | "image"
76 | ]
77 | },
78 | {
79 | "id": 19,
80 | "type": "PixtralGenerateText",
81 | "pos": {
82 | "0": 835,
83 | "1": 380
84 | },
85 | "size": [
86 | 405.81818181818176,
87 | 258.090909090909
88 | ],
89 | "flags": {},
90 | "order": 2,
91 | "mode": 0,
92 | "inputs": [
93 | {
94 | "name": "pixtral_model",
95 | "type": "PIXTRAL_MODEL",
96 | "link": 36
97 | },
98 | {
99 | "name": "images",
100 | "type": "IMAGE",
101 | "link": 37
102 | }
103 | ],
104 | "outputs": [
105 | {
106 | "name": "STRING",
107 | "type": "STRING",
108 | "links": [
109 | 38
110 | ],
111 | "shape": 3,
112 | "slot_index": 0
113 | }
114 | ],
115 | "properties": {
116 | "Node name for S&R": "PixtralGenerateText"
117 | },
118 | "widgets_values": [
119 | "[INST]Caption this image:\n[IMG][/INST]",
120 | 256,
121 | true,
122 | 0.5,
123 | 3722012111,
124 | "randomize"
125 | ]
126 | },
127 | {
128 | "id": 3,
129 | "type": "Display Any (rgthree)",
130 | "pos": {
131 | "0": 1333,
132 | "1": 368
133 | },
134 | "size": [
135 | 397.81818181818176,
136 | 337.36363636363626
137 | ],
138 | "flags": {},
139 | "order": 3,
140 | "mode": 0,
141 | "inputs": [
142 | {
143 | "name": "source",
144 | "type": "*",
145 | "link": 38,
146 | "dir": 3
147 | }
148 | ],
149 | "outputs": [],
150 | "properties": {
151 | "Node name for S&R": "Display Any (rgthree)"
152 | },
153 | "widgets_values": [
154 | ""
155 | ]
156 | }
157 | ],
158 | "links": [
159 | [
160 | 36,
161 | 10,
162 | 0,
163 | 19,
164 | 0,
165 | "PIXTRAL_MODEL"
166 | ],
167 | [
168 | 37,
169 | 4,
170 | 0,
171 | 19,
172 | 1,
173 | "IMAGE"
174 | ],
175 | [
176 | 38,
177 | 19,
178 | 0,
179 | 3,
180 | 0,
181 | "*"
182 | ]
183 | ],
184 | "groups": [],
185 | "config": {},
186 | "extra": {
187 | "ds": {
188 | "scale": 1.1,
189 | "offset": [
190 | 60.090909090908944,
191 | 74.63636363636358
192 | ]
193 | }
194 | },
195 | "version": 0.4
196 | }
--------------------------------------------------------------------------------
/examples/pixtral_comparison_example.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanScripts/ComfyUI-PixtralLlamaMolmoVision/48fab4b9814f4602528bb2144f3fd0da8b9b8c36/examples/pixtral_comparison_example.jpg
--------------------------------------------------------------------------------
/nodes.py:
--------------------------------------------------------------------------------
1 | import comfy.utils
2 | import comfy.model_management as mm
3 | #from comfy.model_patcher import ModelPatcher
4 | import folder_paths
5 |
6 | from transformers import (
7 |     LlavaForConditionalGeneration,
8 |     MllamaForConditionalGeneration,
9 |     AutoModelForCausalLM,
10 |     AutoProcessor,
11 |     BitsAndBytesConfig,
12 |     GenerationConfig,
13 |     StopStringCriteria,
14 |     set_seed
15 | )
16 | from torchvision.transforms.functional import to_pil_image
17 | import numpy as np
18 | import torch
19 |
20 | import json
21 | import os
22 | from pathlib import Path
23 | from PIL import Image, ImageDraw
24 | import re
25 | import time
26 |
27 | # Using folder ComfyUI/models/LLM -- Place each model inside its own folder here, e.g. ComfyUI/models/LLM/pixtral-12b-nf4/model.safetensors
28 | # Also include other config files and tokenizer files in that same folder
29 | llm_model_dir = os.path.join(folder_paths.models_dir, "LLM")
30 | # Add LLM folder if not present
31 | if not os.path.exists(llm_model_dir):
32 |     os.makedirs(llm_model_dir)
33 |
34 | model_type_map = {
35 | "LlavaForConditionalGeneration": LlavaForConditionalGeneration,
36 | "MllamaForConditionalGeneration": MllamaForConditionalGeneration,
37 | "MolmoForCausalLM": AutoModelForCausalLM,
38 | # Other vision models can be added here as needed but will require importing
39 | "AutoModelForCausalLM": AutoModelForCausalLM,
40 | }
41 |
42 | def get_models_with_config():
43 |     models = []
44 |     for model_path in Path(llm_model_dir).iterdir():
45 |         if model_path.is_dir():
46 |             if os.path.exists(os.path.join(model_path, "config.json")):
47 |                 models.append(model_path.parts[-1])
48 |     return models
49 |
50 | def get_model_type(model_path):
51 |     config_path = os.path.join(model_path, "config.json")
52 |     if os.path.exists(config_path):
53 |         with open(config_path, 'r') as config_file:
54 |             config = json.load(config_file)
55 |         return config["architectures"][0]
56 |     print(f"Invalid model config for model {model_path}")
57 |     return "Invalid model config"
58 |
59 | def get_models_of_type(model_type):
60 |     models = []
61 |     for model_path in Path(llm_model_dir).iterdir():
62 |         if model_path.is_dir():
63 |             current_model_type = get_model_type(model_path)
64 |             if current_model_type == model_type:
65 |                 models.append(model_path.parts[-1])
66 |     return models
67 |
68 | class PixtralModelLoader:
69 | """Loads a Pixtral model. Add models as folders inside the `ComfyUI/models/LLM` folder. Each model folder should contain a standard transformers loadable safetensors model along with a tokenizer and any config files needed."""
70 | @classmethod
71 | def INPUT_TYPES(s):
72 | return {
73 | "required": {
74 | "model_name": (get_models_of_type("LlavaForConditionalGeneration"),),
75 | }
76 | }
77 |
78 | RETURN_TYPES = ("VISION_MODEL",)
79 | FUNCTION = "load_model"
80 | CATEGORY = "PixtralLlamaVision/Pixtral"
81 | TITLE = "Load Pixtral Model"
82 |
83 | def load_model(self, model_name):
84 | model_path = os.path.join(llm_model_dir, model_name)
85 | print(f"Setting Pixtral model: {model_name}")
86 | # Don't load the full model until needed for generation
87 | processor = AutoProcessor.from_pretrained(model_path)
88 | pixtral_model = {
89 | 'path': model_path,
90 | 'processor': processor,
91 | }
92 | return (pixtral_model,)
93 |
94 |
95 | class PixtralGenerateText:
96 | """Generates text using a Pixtral model. Takes a list of images and a string prompt as input. The prompt must contain an equal number of [IMG] tokens to the number of images passed in."""
97 | @classmethod
98 | def INPUT_TYPES(s):
99 | return {
100 | "optional": {
101 | "images": ("IMAGE",),
102 | },
103 | "required": {
104 | "pixtral_model": ("VISION_MODEL",),
105 | #"system_prompt": ("STRING", {"default": "", "multiline": True}),
106 | "prompt": ("STRING", {"default": "Caption this image:\n[IMG]", "multiline": True}),
107 | "max_new_tokens": ("INT", {"default": 256, "min": 1, "max": 4096}),
108 | "do_sample": ("BOOLEAN", {"default": True}),
109 | "temperature": ("FLOAT", {"default": 0.3, "min": 0, "step": 0.1}),
110 | "top_p": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 1.0, "step": 0.1}),
111 | "top_k": ("INT", {"default": 40, "min": 1}),
112 | "repetition_penalty": ("FLOAT", {"default": 1.1}),
113 | "stop_strings": ("STRING", {"default": ""}),
114 | "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffff}),
115 | "include_prompt_in_output": ("BOOLEAN", {"default": False}),
116 | "unload_after_generate": ("BOOLEAN", {"default": False}),
117 | }
118 | }
119 |
120 | RETURN_TYPES = ("STRING",)
121 | FUNCTION = "generate_text"
122 | CATEGORY = "PixtralLlamaVision/Pixtral"
123 | TITLE = "Generate Text with Pixtral"
124 |
125 |     def generate_text(self, pixtral_model, images, prompt, max_new_tokens, do_sample, temperature, top_p, top_k, repetition_penalty, stop_strings, seed, include_prompt_in_output, unload_after_generate):
126 |         # Load model now if needed
127 |         device = mm.get_torch_device()
128 |         if pixtral_model['path'] and 'model' not in pixtral_model:
129 |             pixtral_model['model'] = LlavaForConditionalGeneration.from_pretrained(
130 |                 pixtral_model['path'],
131 |                 use_safetensors=True,
132 |                 device_map=device,
133 |             )
134 |
135 |         # Converting back to numpy and then PIL isn't ideal, but Pixtral requires PIL input, and to_pil_image expects channels first for a Tensor but channels last for a numpy array
136 |         image_list = None
137 |         if images is not None and len(images) > 0:
138 |             print(f"Batch of {images.shape} images")
139 |             image_list = [to_pil_image(image.numpy()) for image in images]
140 |
141 |         # Process prompt
142 |         # Example: [INST]Caption this image:\n[IMG][/INST]
143 |         # Images can be placed anywhere, unlike the other models
144 |         image_tag_count = prompt.count("[IMG]")
145 |         added_image_tags = ""
146 |         if image_tag_count > 0 and (images is None or len(images) == 0):
147 |             print("Warning: Prompt contains image tags but no image")
148 |         elif images is not None and image_tag_count < len(images):
149 |             added_image_tags = "[IMG]"*(len(images) - image_tag_count)
150 |             print("Warning: Adding extra image tags to the beginning of the prompt")
151 |         elif images is not None and image_tag_count > len(images):
152 |             print("Warning: Too many image tags")
153 |
154 |         # Not sure how system vs user input differs for this model yet
155 |         final_prompt = ""
156 |         #if system_prompt != "":
157 |         #    final_prompt += f"[INST]{system_prompt}[/INST]"
158 |         final_prompt += f"[INST]{added_image_tags}{prompt}[/INST]"
159 |
160 |         inputs = pixtral_model['processor'](images=image_list, text=final_prompt, return_tensors="pt").to(device)
161 |         prompt_tokens = len(inputs['input_ids'][0])
162 |         print(f"Prompt tokens: {prompt_tokens}")
163 |         stop_strings_list = stop_strings.split(",")
164 |         set_seed(seed)
165 |         t0 = time.time()
166 |         generate_ids = pixtral_model['model'].generate(
167 |             **inputs,
168 |             generation_config=GenerationConfig(
169 |                 max_new_tokens=max_new_tokens,
170 |                 do_sample=do_sample,
171 |                 temperature=temperature,
172 |                 top_p=top_p,
173 |                 top_k=top_k,
174 |                 repetition_penalty=repetition_penalty,
175 |             ),
176 |             stopping_criteria=[StopStringCriteria(tokenizer=pixtral_model['processor'].tokenizer, stop_strings=stop_strings_list)],
177 |         )
178 |         t1 = time.time()
179 |         total_time = t1 - t0
180 |         generated_tokens = len(generate_ids[0]) - prompt_tokens
181 |         time_per_token = generated_tokens/total_time
182 |         print(f"Generated {generated_tokens} tokens in {total_time:.3f} s ({time_per_token:.3f} tok/s)")
183 |         output_tokens = generate_ids[0] if include_prompt_in_output else generate_ids[0][prompt_tokens:]
184 |         output = pixtral_model['processor'].decode(output_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)
185 |         print(output)
186 |
187 |         # Unload model
188 |         if unload_after_generate:
189 |             del pixtral_model['model']
190 |             torch.cuda.empty_cache()
191 |             print("Pixtral model unloaded")
192 |
193 |         return (output,)
194 |
195 |
196 | class LlamaVisionModelLoader:
197 | """Loads a Llama 3.2 Vision model. Add models as folders inside the `ComfyUI/models/LLM` folder. Each model folder should contain a standard transformers loadable safetensors model along with a tokenizer and any config files needed."""
198 | @classmethod
199 | def INPUT_TYPES(s):
200 | return {
201 | "required": {
202 | "model_name": (get_models_of_type("MllamaForConditionalGeneration"),),
203 | }
204 | }
205 |
206 | RETURN_TYPES = ("VISION_MODEL",)
207 | FUNCTION = "load_model"
208 | CATEGORY = "PixtralLlamaVision/LlamaVision"
209 | TITLE = "Load Llama Vision Model"
210 |
211 | def load_model(self, model_name):
212 | model_path = os.path.join(llm_model_dir, model_name)
213 | print(f"Setting Llama Vision model: {model_name}")
214 | # Don't load the full model until needed for generation
215 | processor = AutoProcessor.from_pretrained(model_path)
216 | llama_vision_model = {
217 | 'path': model_path,
218 | 'processor': processor,
219 | }
220 | return (llama_vision_model,)
221 |
222 |
223 | class LlamaVisionGenerateText:
224 | """Generates text using a Llama 3.2 Vision model. The prompt must contain an equal number of <|image|> tokens to the number of images passed in. Image tokens must also be sequential and before the text you want them to apply to for the image attention to work as intended."""
225 | @classmethod
226 | def INPUT_TYPES(s):
227 | return {
228 | "optional": {
229 | "images": ("IMAGE",),
230 | },
231 | "required": {
232 | "llama_vision_model": ("VISION_MODEL",),
233 | "system_prompt": ("STRING", {"default": "", "multiline": True}),
234 | "prompt": ("STRING", {"default": "Caption this image.", "multiline": True}),
235 | "max_new_tokens": ("INT", {"default": 256, "min": 1, "max": 4096}),
236 | "do_sample": ("BOOLEAN", {"default": True}),
237 | "temperature": ("FLOAT", {"default": 0.3, "min": 0.0, "step": 0.1}),
238 | "top_p": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 1.0, "step": 0.1}),
239 | "top_k": ("INT", {"default": 40, "min": 1}),
240 | # For some reason, including this causes the CUDA kernel to fail catastrophically? Didn't have this issue with Pixtral
241 | #"repetition_penalty": ("FLOAT", {"default": 1.1}),
242 | "stop_strings": ("STRING", {"default": "<|eot_id|>"}),
243 | "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffff}),
244 | "include_prompt_in_output": ("BOOLEAN", {"default": False}),
245 | "unload_after_generate": ("BOOLEAN", {"default": False}),
246 | }
247 | }
248 |
249 | RETURN_TYPES = ("STRING",)
250 | FUNCTION = "generate_text"
251 | CATEGORY = "PixtralLlamaVision/LlamaVision"
252 | TITLE = "Generate Text with Llama Vision"
253 |
254 | # TODO: Support batching
255 |
256 | def generate_text(self, llama_vision_model, system_prompt, prompt, max_new_tokens, do_sample, temperature, top_p, top_k, stop_strings, seed, include_prompt_in_output, unload_after_generate, images=None):
257 | # Load model now if needed
258 | device = mm.get_torch_device()
259 | if llama_vision_model['path'] and 'model' not in llama_vision_model:
260 | llama_vision_model['model'] = MllamaForConditionalGeneration.from_pretrained(
261 | llama_vision_model['path'],
262 | use_safetensors=True,
263 | device_map=device,
264 | )
265 |
266 | # Convert the ComfyUI image batch to PIL images; to_pil_image expects channels-first for a Tensor but channels-last for a numpy array, so go through numpy here.
267 |
268 | image_list = None
269 | if images is not None and len(images) > 0:
270 | print(f"Processing image batch of shape {images.shape}")
271 | image_list = [to_pil_image(image.numpy()) for image in images]
272 |
273 | # Process prompt
274 | image_tags = "<|image|>"*len(image_list) if image_list is not None else ""
275 | final_prompt = "<|begin_of_text|>"
276 | if system_prompt != "":
277 | final_prompt += f"<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>\n\n"
278 | final_prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{image_tags}{prompt}<|eot_id|>\n\n"
279 | final_prompt += "<|start_header_id|>assistant<|end_header_id|>\n\n"
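# With no system prompt and a single image, final_prompt comes out as:
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n<|image|>Caption this image.<|eot_id|>\n\n<|start_header_id|>assistant<|end_header_id|>\n\n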
280 |
281 | inputs = llama_vision_model['processor'](images=image_list, text=final_prompt, return_tensors="pt").to(device)
282 | prompt_tokens = len(inputs['input_ids'][0])
283 | print(f"Prompt tokens: {prompt_tokens}")
284 | stop_strings_list = stop_strings.split(",")
285 | set_seed(seed)
286 | t0 = time.time()
287 | generate_ids = llama_vision_model['model'].generate(
288 | **inputs,
289 | generation_config=GenerationConfig(
290 | max_new_tokens=max_new_tokens,
291 | do_sample=do_sample,
292 | temperature=temperature,
293 | top_p=top_p,
294 | top_k=top_k,
295 | #repetition_penalty=repetition_penalty,
296 | ),
297 | stopping_criteria=[StopStringCriteria(tokenizer=llama_vision_model['processor'].tokenizer, stop_strings=stop_strings_list)],
298 | )
299 | t1 = time.time()
300 | total_time = t1 - t0
301 | generated_tokens = len(generate_ids[0]) - prompt_tokens
302 | time_per_token = generated_tokens/total_time
303 | print(f"Generated {generated_tokens} tokens in {total_time:.3f} s ({time_per_token:.3f} tok/s)")
304 | output_tokens = generate_ids[0] if include_prompt_in_output else generate_ids[0][prompt_tokens:]
305 | output = llama_vision_model['processor'].decode(output_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)
306 | print(output)
307 |
308 | # Unload model
309 | if unload_after_generate:
310 | del llama_vision_model['model']
311 | torch.cuda.empty_cache()
312 | print("Llama vision model unloaded")
313 |
314 | return (output,)
315 |
316 |
317 | class MolmoModelLoader:
318 | """Loads a Molmo model. Add models as folders inside the `ComfyUI/models/LLM` folder. Each model folder should contain a standard transformers loadable safetensors model along with a tokenizer and any config files needed."""
319 | @classmethod
320 | def INPUT_TYPES(s):
321 | return {
322 | "required": {
323 | "model_name": (get_models_of_type("MolmoForCausalLM"),),
324 | }
325 | }
326 |
327 | RETURN_TYPES = ("VISION_MODEL",)
328 | FUNCTION = "load_model"
329 | CATEGORY = "PixtralLlamaVision/Molmo"
330 | TITLE = "Load Molmo Model"
331 |
332 | def load_model(self, model_name):
333 | model_path = os.path.join(llm_model_dir, model_name)
334 | print(f"Setting Molmo model: {model_name}")
335 | # Don't load the full model until needed for generation
336 | processor = AutoProcessor.from_pretrained(
337 | model_path,
338 | torch_dtype="auto",
339 | trust_remote_code=True,
340 | )
341 | molmo_model = {
342 | 'path': model_path,
343 | 'processor': processor,
344 | }
345 | return (molmo_model,)
346 |
347 |
348 | class MolmoGenerateText:
349 | """Generates text using a Molmo model. Takes a list of images and a string prompt as input. The prompt must contain an equal number of [IMG] tokens to the number of images passed in."""
350 | @classmethod
351 | def INPUT_TYPES(s):
352 | return {
353 | "optional": {
354 | "images": ("IMAGE",),
355 | },
356 | "required": {
357 | "molmo_model": ("VISION_MODEL",),
358 | "system_prompt": ("STRING", {"default": "", "multiline": True}),
359 | "prompt": ("STRING", {"default": "Describe this image. ", "multiline": True}),
360 | "max_new_tokens": ("INT", {"default": 256, "min": 1, "max": 4096}),
361 | "do_sample": ("BOOLEAN", {"default": True}),
362 | "temperature": ("FLOAT", {"default": 0.3, "min": 0, "step": 0.1}),
363 | "top_p": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 1.0, "step": 0.1}),
364 | "top_k": ("INT", {"default": 40, "min": 1}),
365 | # This doesn't seem to work for this model
366 | #"repetition_penalty": ("FLOAT", {"default": 1.1}),
367 | "stop_strings": ("STRING", {"default": "<|endoftext|>"}),
368 | "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffff}),
369 | "include_prompt_in_output": ("BOOLEAN", {"default": False}),
370 | "unload_after_generate": ("BOOLEAN", {"default": False}),
371 | }
372 | }
373 |
374 | RETURN_TYPES = ("STRING",)
375 | FUNCTION = "generate_text"
376 | CATEGORY = "PixtralLlamaVision/Molmo"
377 | TITLE = "Generate Text with Molmo"
378 |
379 | # TODO: Support batching
380 |
381 | def generate_text(self, molmo_model, system_prompt, prompt, max_new_tokens, do_sample, temperature, top_p, top_k, stop_strings, seed, include_prompt_in_output, unload_after_generate, images=None):
382 | # Load model now if needed
383 | device = mm.get_torch_device()
384 | if molmo_model['path'] and 'model' not in molmo_model:
385 | molmo_model['model'] = AutoModelForCausalLM.from_pretrained(
386 | molmo_model['path'],
387 | use_safetensors=True,
388 | device_map=device,
389 | torch_dtype="auto",
390 | trust_remote_code=True,
391 | )
392 |
393 | image_list = None
394 | if images is not None and len(images) > 0:
395 | print(f"Processing image batch of shape {images.shape}")
396 | image_list = [to_pil_image(image.numpy()) for image in images]
397 | # Process prompt
398 | final_prompt = ""
399 | if system_prompt != "":
400 | final_prompt += f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
401 | final_prompt += f"<|im_start|>user\n{prompt}<|im_end|>\n"
402 | final_prompt += "<|im_start|>assistant\n"
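# With no system prompt, final_prompt comes out as:
# <|im_start|>user\nDescribe this image. <|im_end|>\n<|im_start|>assistant\n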
403 |
404 | inputs = molmo_model['processor'].process(
405 | images=image_list,
406 | #system_prompt=system_prompt, # Doesn't do anything
407 | text=final_prompt,
408 | message_format="none",
409 | always_start_with_space=False,
410 | )
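# message_format="none" is used (presumably) so the processor doesn't apply its own role template,
# since the <|im_start|>/<|im_end|> chat tokens are already added manually above.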
411 | inputs = {k: v.to(device).unsqueeze(0) for k, v in inputs.items()}
412 |
413 | prompt_tokens = inputs["input_ids"].size(1)
414 | print(f"Prompt tokens: {prompt_tokens}")
415 |
416 | stop_strings_list = stop_strings.split(",")
417 |
418 | set_seed(seed)
419 | t0 = time.time()
420 | output = molmo_model['model'].generate_from_batch(
421 | inputs,
422 | generation_config=GenerationConfig(
423 | max_new_tokens=max_new_tokens,
424 | do_sample=do_sample,
425 | temperature=temperature,
426 | top_p=top_p,
427 | top_k=top_k,
428 | #repetition_penalty=repetition_penalty,
429 | ),
430 | stopping_criteria=[StopStringCriteria(tokenizer=molmo_model['processor'].tokenizer, stop_strings=stop_strings_list)],
431 | tokenizer=molmo_model['processor'].tokenizer,
432 | )
433 | t1 = time.time()
434 |
435 | total_time = t1 - t0
436 | generated_tokens = output.size(1) - prompt_tokens
437 | time_per_token = generated_tokens/total_time
438 | print(f"Generated {generated_tokens} tokens in {total_time:.3f} s ({time_per_token:.3f} tok/s)")
439 |
440 | output_tokens = output[0] if include_prompt_in_output else output[0, prompt_tokens:]
441 | generated_text = molmo_model['processor'].tokenizer.decode(output_tokens, skip_special_tokens=True)
442 | print(generated_text)
443 |
444 | # Unload model
445 | if unload_after_generate:
446 | del molmo_model['model']
447 | torch.cuda.empty_cache()
448 | print("Molmo model unloaded")
449 |
450 | return (generated_text,)
451 |
452 |
453 | class AutoVisionModelLoader:
454 | """Loads a vision model. Add models as folders inside the `ComfyUI/models/LLM` folder. Each model folder should contain a standard transformers loadable safetensors model along with a tokenizer and any config files needed. Use `trust_remote_code` at your own risk."""
455 | @classmethod
456 | def INPUT_TYPES(s):
457 | return {
458 | "required": {
459 | "model_name": (get_models_with_config(),),
460 | "trust_remote_code": ("BOOLEAN", {"default": False}), # No longer very useful. I can add a bit of code checking this when loading Pixtral/Llama Vision if there are custom finetunes of them in the future.
461 | }
462 | }
463 |
464 | RETURN_TYPES = ("VISION_MODEL",)
465 | FUNCTION = "load_model"
466 | CATEGORY = "PixtralLlamaVision/VLM"
467 | TITLE = "Load Vision Model"
468 |
469 | def load_model(self, model_name, trust_remote_code):
470 | model_path = os.path.join(llm_model_dir, model_name)
471 | device = mm.get_torch_device()
472 | # Don't load the full model until needed for generation
473 | try:
474 | model_type_name = get_model_type(model_path)
475 | print(f"Setting vision model: {model_name} of type {model_type_name}")
476 | '''
477 | model_type = model_type_map.get(model_type_name, AutoModelForCausalLM)
478 | model = model_type.from_pretrained(
479 | model_path,
480 | use_safetensors=True,
481 | device_map=device,
482 | torch_dtype="auto",
483 | trust_remote_code=trust_remote_code,
484 | )
485 | '''
486 | processor = AutoProcessor.from_pretrained(
487 | model_path,
488 | torch_dtype="auto",
489 | trust_remote_code=trust_remote_code,
490 | )
491 | vision_model = {
492 | 'path': model_path,
493 | 'model_type_name': model_type_name, # Not used yet
494 | 'trust_remote_code': trust_remote_code, # Not used yet
495 | 'processor': processor,
496 | }
497 | return (vision_model,)
498 | except Exception as e:
499 | print(f"Error loading vision model: {e}")
500 | raise
501 |
502 |
503 | # Utility for bounding boxes (I'm sure this has been done before but I just wanted to try it out to see how well Pixtral can do it)
504 | class ParseBoundingBoxes:
505 | """Uses a regular expression to find bounding boxes in a string, returning a list of bbox objects (compatible with mtb). `relative` means the bounding box uses float values between 0 and 1 if true and absolute image coordinates if false. `corners_only` means the bounding box is [(x1, y1), (x2, y2)] if true and [(x1, y1), (width, height)] if false. Parentheses are treated as optional."""
506 | @classmethod
507 | def INPUT_TYPES(s):
508 | return {
509 | "required": {
510 | "image": ("IMAGE",),
511 | "string": ("STRING",),
512 | "relative": ("BOOLEAN", {"default": True}),
513 | "corners_only": ("BOOLEAN", {"default": True}),
514 | }
515 | }
516 |
517 | RETURN_TYPES = ("BBOX",)
518 | FUNCTION = "generate_bboxes"
519 | CATEGORY = "PixtralLlamaVision/Utility"
520 | TITLE = "Parse Bounding Boxes"
521 |
522 | def generate_bboxes(self, image, string, relative, corners_only):
523 | image_width = image.shape[2]
524 | image_height = image.shape[1]
525 |
526 | bboxes = []
527 | # Ridiculous-looking regex
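# A few illustrative strings this pattern should match (not exhaustive):
#   "[(0.1, 0.2), (0.5, 0.6)]", "(12, 34), (120, 340)", "[0.1, 0.2, 0.5, 0.6]"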
528 | for match in re.findall(r"""\[?\(?([0-9\.]+),\s*([0-9\.]+)\)?,\s*\(?([0-9\.]+),\s*([0-9\.]+)\)?\]?""", string, flags=re.M):
529 | try:
530 | x1_raw = float(match[0])
531 | y1_raw = float(match[1])
532 | x2_raw = float(match[2])
533 | y2_raw = float(match[3])
534 |
535 | if relative:
536 | x1 = int(image_width*x1_raw)
537 | y1 = int(image_height*y1_raw)
538 | x2 = int(image_width*x2_raw)
539 | y2 = int(image_height*y2_raw)
540 | else:
541 | x1 = int(x1_raw)
542 | y1 = int(y1_raw)
543 | x2 = int(x2_raw)
544 | y2 = int(y2_raw)
545 |
546 | if corners_only:
547 | width = x2 - x1
548 | height = y2 - y1
549 | else:
550 | width = x2
551 | height = y2
552 |
553 | if width <= 0 or width > image_width or height <= 0 or height > image_height:
554 | print(f"Invalid bbox: ({x1}, {y1}, {width}, {height})")
555 | continue
556 | bbox = (x1, y1, width, height)
557 | bboxes.append(bbox)
558 | except Exception as e:
559 | print(f"Failed to parse bbox: {match}")
560 |
561 | return (bboxes,)
562 |
563 |
564 | class ParsePoints:
565 | """eyes"""
566 | @classmethod
567 | def INPUT_TYPES(s):
568 | return {
569 | "required": {
570 | "string": ("STRING",),
571 | "filter": ("STRING",),
572 | }
573 | }
574 |
575 | RETURN_TYPES = ("POINT", "STRING", "STRING")
576 | FUNCTION = "generate_points"
577 | CATEGORY = "PixtralLlamaVision/Utility"
578 | TITLE = "Parse Points"
579 |
580 | def generate_points(self, string, filter):
581 | point_batches = []
582 | label_batches = []
583 | alt_label_batches = []
584 | if type(string) != list:
585 | string = [string] # batch 1
586 | for s in string:
587 | points = []
588 | labels = []
589 | alt_labels = []
590 | # Tried to design this regex in a way where even if the message gets cut off by the token limit, it finds the points
591 | # Another absolutely ridiculous looking regex
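# Illustrative Molmo-style outputs this is meant to handle (assumed format; coordinates are percentages):
#   <point x="61.5" y="44.2" alt="dog">dog</point>
#   <points x1="10.1" y1="20.2" x2="30.3" y2="40.4" alt="cats">cats</points>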
592 | for match in re.findall(r"""[<\[]?points?\s*([xy\d\.="\s]*?)\s*(?:alt="([^"]*)")?(?=>|]|$)[>\]]?([^<\[]*)""", s, flags=re.M):
593 | try:
594 | data = match[0]
595 | if len(match) > 1:
596 | alt = match[1]
597 | if len(match) > 2:
598 | inner = match[2]
599 | else:
600 | inner = ""
601 | else:
602 | alt = ""
603 | inner = ""
604 |
605 | # Roughly matching
606 | if alt == "" or filter.lower() in alt.lower() or filter.lower() in inner.lower():
607 | data_parts = data.split(" ")
608 | for i in range(len(data_parts)//2):
609 | # Points from Molmo are expressed as percentages
610 | x = float(data_parts[2*i].split('"')[1])/100.0
611 | y = float(data_parts[2*i+1].split('"')[1])/100.0
612 |
613 | # Check for duplicates
614 | valid = True
615 | for point, label, alt_label in zip(points, labels, alt_labels):
616 | if point[0] == x and point[1] == y and label == inner and alt_label == alt:
617 | print(f"Duplicate point ({x}, {y}, {alt}, {inner})")
618 | valid = False
619 | break
620 | if valid:
621 | points.append([x, y])
622 | labels.append(inner)
623 | alt_labels.append(alt) # I'm not really convinced alt even matters
624 | else:
625 | print(f"Non-matching filter for {match}")
626 | except Exception as e:
627 | print(f"Failed to parse points: {match}: {e}")
628 | point_batches.append(points)
629 | label_batches.append(labels)
630 | alt_label_batches.append(alt_labels)
631 | return (np.array(point_batches), np.array(label_batches), np.array(alt_label_batches))
632 |
633 |
634 | class PlotPoints:
635 | @classmethod
636 | def INPUT_TYPES(s):
637 | return {
638 | "optional": {
639 | "labels": ("STRING",),
640 | },
641 | "required": {
642 | "points": ("POINT",),
643 | "image": ("IMAGE",),
644 | "size": ("INT", {"default": 5, "min": 1, "step": 1}),
645 | "font_size": ("INT", {"default": 40}),
646 | "color": ("STRING", {"default": "#0000ff"}),
647 | }
648 | }
649 |
650 | RETURN_TYPES = ("IMAGE",)
651 | FUNCTION = "plot_points"
652 | CATEGORY = "PixtralLlamaVision/Utility"
653 | TITLE = "Plot Points"
654 |
655 | def plot_points(self, points, labels, image, size, font_size, color):
656 | image_width = image.shape[2]
657 | image_height = image.shape[1]
658 |
659 | if labels is None or len(labels) == 0 or font_size == 0:
660 | labels = np.array([["" for point in point_list] for point_list in points])
661 |
662 | batch_size = image.shape[0]
663 | if len(points) != len(labels) or len(points) != image.shape[0]:
664 | print(f"Warning: Batch size mismatch: Image {image.shape}, points {points.shape}, labels {labels.shape}")
665 | batch_size = min(image.shape[0], len(points), len(labels))
666 |
667 | # font = ImageFont.truetype("Pillow/Tests/fonts/FreeMono.ttf", font_size)
668 | # I might have overengineered this, it doesn't seem like the model can label separate objects in a single call. But you can concatenate the strings anyway.
669 | colors = [color]
670 | if "," in color:
671 | colors = color.split(",")
672 | color_map = {"": colors[0]}
673 | for i, label in enumerate(np.unique(labels)):
674 | color_map[label] = colors[i%len(colors)]
675 |
676 | # Add points to image (which is a tensor of floats of shape (batch, height, width, channels)
677 | changed_images = []
678 | for img, point_list, label_list in zip(image, points, labels):
679 | temp_image = to_pil_image(img.numpy())
680 | draw = ImageDraw.Draw(temp_image)
681 | for point, label in zip(point_list, label_list):
682 | x = int(image_width*point[0])
683 | y = int(image_height*point[1])
684 | draw.circle((x, y), fill=color_map[label], outline=color_map[label], radius=size)
685 | if label != "":
686 | draw.text((x, y-size), label, fill=color_map[label], font_size=font_size, anchor='md')
687 | output_image = np.array(temp_image)/0xff
688 | changed_images.append(output_image)
689 | return (torch.Tensor(np.array(changed_images)),)
690 |
691 |
692 | def process_regex_flags(flags):
693 | # Workaround for Python 3.10 not having re.NOFLAG
694 | flag_value = 0 # re.NOFLAG
695 | if 'a' in flags.lower():
696 | flag_value |= re.A
697 | if 'i' in flags.lower():
698 | flag_value |= re.I
699 | if 'l' in flags.lower():
700 | flag_value |= re.L
701 | if 'm' in flags.lower():
702 | flag_value |= re.M
703 | if 's' in flags.lower():
704 | flag_value |= re.S
705 | if 'u' in flags.lower(): # u for useless
706 | flag_value |= re.U
707 | if 'x' in flags.lower():
708 | flag_value |= re.X
709 | return flag_value
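# e.g. process_regex_flags("im") == re.I | re.M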
710 |
711 | # Utility nodes that I couldn't find elsewhere, not sure why?
712 | class RegexSplitString:
713 | """Uses a regular expression to split in a string by a pattern into a list of strings"""
714 | @classmethod
715 | def INPUT_TYPES(s):
716 | return {
717 | "required": {
718 | "pattern": ("STRING",),
719 | "string": ("STRING",),
720 | "flags": ("STRING", {"default": "M"}),
721 | }
722 | }
723 |
724 | RETURN_TYPES = ("STRING",)
725 | FUNCTION = "split_string"
726 | CATEGORY = "PixtralLlamaVision/Utility"
727 | TITLE = "Regex Split String"
728 |
729 | def split_string(self, pattern, string, flags):
730 | return (re.split(pattern, string, flags=process_regex_flags(flags)),)
731 |
732 |
733 | class RegexSearch:
734 | """Uses a regular expression to search for the first occurrence of a pattern in a string, returning whether the pattern was found, the start and end positions if found, and the list of match groups if found"""
735 | @classmethod
736 | def INPUT_TYPES(s):
737 | return {
738 | "required": {
739 | "pattern": ("STRING",),
740 | "string": ("STRING",),
741 | "flags": ("STRING", {"default": "M"}),
742 | }
743 | }
744 |
745 | RETURN_TYPES = ("BOOLEAN", "INT", "INT", "STRING")
746 | FUNCTION = "search"
747 | CATEGORY = "PixtralLlamaVision/Utility"
748 | TITLE = "Regex Search"
749 |
750 | def search(self, pattern, string, flags):
751 | match = re.search(pattern, string, flags=process_regex_flags(flags))
752 | if match:
753 | span = match.span()
754 | groups = list(match.groups())
755 | return (True, span[0], span[1], groups)
756 | return (False, 0, 0, [])
757 |
758 |
759 | class RegexFindAll:
760 | """Uses a regular expression to find all matches of a pattern in a string, returning a list of match groups (which could be strings or tuples of strings if you have more than one match group)"""
761 | @classmethod
762 | def INPUT_TYPES(s):
763 | return {
764 | "required": {
765 | "pattern": ("STRING",),
766 | "string": ("STRING",),
767 | "flags": ("STRING", {"default": "M"}),
768 | }
769 | }
770 |
771 | RETURN_TYPES = ("STRING",)
772 | FUNCTION = "find_all"
773 | CATEGORY = "PixtralLlamaVision/Utility"
774 | TITLE = "Regex Find All"
775 |
776 | def find_all(self, pattern, string, flags):
777 | return (re.findall(pattern, string, flags=process_regex_flags(flags)),)
778 |
779 |
780 | # This one is also available in Derfuu_ComfyUI_ModdedNodes
781 | class RegexSubstitution:
782 | """Uses a regular expression to find and replace text in a string"""
783 | @classmethod
784 | def INPUT_TYPES(s):
785 | return {
786 | "required": {
787 | "pattern": ("STRING",),
788 | "string": ("STRING",),
789 | "replace": ("STRING",),
790 | "flags": ("STRING", {"default": "M"}),
791 | }
792 | }
793 |
794 | RETURN_TYPES = ("STRING",)
795 | FUNCTION = "sub"
796 | CATEGORY = "PixtralLlamaVision/Utility"
797 | TITLE = "Regex Substitution"
798 |
799 | def sub(self, pattern, string, replace, flags):
800 | return (re.sub(pattern, replace, string, flags=process_regex_flags(flags)),)
801 |
802 |
803 | class JoinString:
804 | """Joins a list of strings with a delimiter between them"""
805 | @classmethod
806 | def INPUT_TYPES(s):
807 | return {
808 | "required": {
809 | "string_list": ("STRING",),
810 | "delimiter": ("STRING", {"default": ", "}),
811 | }
812 | }
813 |
814 | RETURN_TYPES = ("STRING",)
815 | FUNCTION = "join_string"
816 | CATEGORY = "PixtralLlamaVision/Utility"
817 | TITLE = "Join String"
818 |
819 | def join_string(self, string_list, delimiter):
820 | # Coerce each item to str so join doesn't error on non-string inputs
821 | return (delimiter.join([str(string) for string in string_list]),)
822 |
823 |
824 | # Arbitrary data type for list/tuple indexing
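# ComfyUI validates connections with inequality comparisons on the type strings, so a str subclass
# whose __ne__ always returns False effectively acts as a wildcard that matches any type.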
825 | class AnyType(str):
826 | def __ne__(self, __value: object) -> bool:
827 | return False
828 |
829 | ANY = AnyType("*")
830 |
831 | # These ones are especially weird to not be doable in ComfyUI base
832 | class SelectIndex:
833 | """Returns list[index]"""
834 | @classmethod
835 | def INPUT_TYPES(s):
836 | return {
837 | "required": {
838 | "list": (ANY,),
839 | "index": ("INT", {"default": 0}),
840 | }
841 | }
842 |
843 | RETURN_TYPES = (ANY,)
844 | FUNCTION = "select_index"
845 | CATEGORY = "PixtralLlamaVision/Utility"
846 | TITLE = "Select Index"
847 |
848 | def select_index(self, list, index):
849 | return (list[index],)
850 |
851 | class SliceList:
852 | """Returns list[start_index:end_index]"""
853 | @classmethod
854 | def INPUT_TYPES(s):
855 | return {
856 | "required": {
857 | "list": (ANY,),
858 | "start_index": ("INT", {"default": 0}),
859 | "end_index": ("INT", {"default": 1}),
860 | }
861 | }
862 |
863 | RETURN_TYPES = (ANY,)
864 | FUNCTION = "select_index"
865 | CATEGORY = "PixtralLlamaVision/Utility"
866 | TITLE = "Slice List"
867 |
868 | def select_index(self, list, start_index, end_index):
869 | return (list[start_index:end_index],)
870 |
871 | # Batch Count works for getting list length
872 |
873 | NODE_CLASS_MAPPINGS = {
874 | "PixtralModelLoader": PixtralModelLoader,
875 | "PixtralGenerateText": PixtralGenerateText,
876 | # Not really much need to work with the image tokenization directly for something like image captioning, but might be interesting later...
877 | #"PixtralImageEncode": PixtralImageEncode,
878 | #"PixtralTextEncode": PixtralTextEncode,
879 | "LlamaVisionModelLoader": LlamaVisionModelLoader,
880 | "LlamaVisionGenerateText": LlamaVisionGenerateText,
881 | "MolmoModelLoader": MolmoModelLoader,
882 | "MolmoGenerateText": MolmoGenerateText,
883 | "AutoVisionModelLoader": AutoVisionModelLoader,
884 | "RegexSplitString": RegexSplitString,
885 | "RegexSearch": RegexSearch,
886 | "RegexFindAll": RegexFindAll,
887 | "RegexSubstitution": RegexSubstitution,
888 | "JoinString": JoinString,
889 | "ParseBoundingBoxes": ParseBoundingBoxes,
890 | "ParsePoints": ParsePoints,
891 | "PlotPoints": PlotPoints,
892 | "SelectIndex": SelectIndex,
893 | "SliceList": SliceList,
894 | }
895 |
896 | NODE_DISPLAY_NAME_MAPPINGS = {k:v.TITLE for k,v in NODE_CLASS_MAPPINGS.items()}
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "comfyui-pixtralllamamolmovision"
3 | description = "For loading and running Pixtral, Llama 3.2 Vision, and Molmo models. Put models in the models/LLM folder."
4 | version = "3.0.1"
5 | license = {file = "LICENSE"}
6 |
7 | [project.urls]
8 | Repository = "https://github.com/SeanScripts/ComfyUI-PixtralLlamaMolmoVision"
9 | # Used by Comfy Registry https://comfyregistry.org
10 |
11 | [tool.comfy]
12 | PublisherId = "seanscripts"
13 | DisplayName = "ComfyUI-PixtralLlamaMolmoVision"
14 | Icon = ""
15 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers >= 4.45.0
2 | accelerate
3 | bitsandbytes
4 | torchvision >= 0.17
5 |
--------------------------------------------------------------------------------