├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── display_text_nodes.py
├── examples
├── AnimateDiff_00002.mp4
├── Chat_with_multiple_images_workflow_legacy.json
├── Chat_with_multiple_images_workflow_legacy.png
├── Chat_with_multiple_images_workflow_polished.json
├── Chat_with_multiple_images_workflow_polished.png
├── Chat_with_single_image_workflow_legacy.json
├── Chat_with_single_image_workflow_legacy.png
├── Chat_with_single_image_workflow_polished.json
├── Chat_with_single_image_workflow_polished.png
├── Chat_with_text_workflow_legacy.json
├── Chat_with_text_workflow_legacy.png
├── Chat_with_text_workflow_polished.json
├── Chat_with_text_workflow_polished.png
├── Chat_with_video_workflow_legacy.json
├── Chat_with_video_workflow_legacy.png
├── Chat_with_video_workflow_polished.json
├── Chat_with_video_workflow_polished.png
├── ComfyUI_00508_.png
├── ComfyUI_00509_.png
└── ComfyUI_00532_.png
├── favicon.ico
├── image_nodes.py
├── nodes_legacy.py
├── nodes_polished.py
├── pyproject.toml
├── requirements.txt
├── util_nodes.py
└── web
└── js
├── displayText.js
├── multipleImagesInput.js
├── previewVideo.js
└── uploadVideo.js
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2024 OpenBMB
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ComfyUI_MiniCPM-V-2_6-int4
2 |
3 | This is an implementation of [MiniCPM-V-2_6-int4](https://github.com/OpenBMB/MiniCPM-V) by [ComfyUI](https://github.com/comfyanonymous/ComfyUI), including support for text-based queries, video queries, single-image queries, and multi-image queries to generate captions or responses.
4 |
5 | ---
6 |
7 | ## Recent Updates
8 |
9 | - Added `keep_model_loaded` parameter
10 |
11 | By default, this parameter is set to False, which indicates that the model will be unloaded from GPU memory after each prediction is made.
12 |
13 | However, if set to True, the model will remain loaded in GPU memory. This is particularly useful when multiple predictions with the same model are needed, eliminating the need to reload it between uses.
14 |
15 | - Added `seed` parameter
16 |
17 | This parameter enables the setting of a random seed for the purpose of ensuring reproducibility in results.
18 |
19 | ---
20 |
21 | ## Basic Workflow
22 |
23 | - **Text-based Query**: Users can submit textual queries to request information or generate descriptions. For instance, a user might input a description like "What is the meaning of life?"
24 |
25 | > Chat_with_text_workflow_legacy preview
26 | > 
27 | > Chat_with_text_workflow_polished preview
28 | > 
29 |
30 | - **Video Query**: When a user uploads a video, the system can analyze the content and generate a detailed caption for each frame or a summary of the entire video. For example, "Generate a caption for the given video."
31 |
32 | > Chat_with_video_workflow_legacy preview
33 | > 
34 | > Chat_with_video_workflow_polished preview
35 | > 
36 |
37 | - **Single-Image Query**: This workflow supports generating a caption for an individual image. A user could upload a photo and ask, "What does this image show?" resulting in a caption such as "A majestic lion pride relaxing on the savannah."
38 |
39 | > Chat_with_single_image_workflow_legacy preview
40 | > 
41 | > Chat_with_single_image_workflow_polished preview
42 | > 
43 |
44 | - **Multi-Image Query**: For multiple images, the system can provide a collective description or a narrative that ties the images together. For example, "Create a story from the following series of images: one of a couple at a beach, another at a wedding ceremony, and the last one at a baby's christening."
45 |
46 | > Chat_with_multiple_images_workflow_legacy preview
47 | > 
48 | > Chat_with_multiple_images_workflow_polished preview
49 | > 
50 |
51 | ## Installation
52 |
53 | - Install from [ComfyUI Manager](https://github.com/ltdrdata/ComfyUI-Manager) (search for `minicpm`)
54 |
55 | - Download or git clone this repository into the `ComfyUI\custom_nodes\` directory and run:
56 |
57 | ```bash
58 | pip install -r requirements.txt
59 | ```
60 |
61 | ## Download Models
62 |
63 | All the models will be downloaded automatically when running the workflow if they are not found in the `ComfyUI\models\prompt_generator\` directory.
64 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
# ComfyUI entry point for this package: imports the node classes and
# registers them so the ComfyUI loader can discover them on import.
from .nodes_legacy import MiniCPM_VQA
from .nodes_polished import MiniCPM_VQA_Polished
from .image_nodes import MultipleImagesInput
from .util_nodes import LoadVideo, PreviewVideo
from .display_text_nodes import DisplayText

# Directory (relative to this package) containing the frontend JS extensions.
WEB_DIRECTORY = "./web"

# A dictionary that contains all nodes you want to export with their names
# NOTE: names should be globally unique
NODE_CLASS_MAPPINGS = {
    "LoadVideo": LoadVideo,
    "PreviewVideo": PreviewVideo,
    "MultipleImagesInput": MultipleImagesInput,
    "MiniCPM_VQA": MiniCPM_VQA,
    "MiniCPM_VQA_Polished": MiniCPM_VQA_Polished,
    "DisplayText": DisplayText,
}

# A dictionary that contains the friendly/humanly readable titles for the nodes
NODE_DISPLAY_NAME_MAPPINGS = {
    "LoadVideo": "Load Video",
    "PreviewVideo": "Preview Video",
    "MultipleImagesInput": "Multiple Images Input",
    "MiniCPM_VQA": "MiniCPM VQA",
    "MiniCPM_VQA_Polished": "MiniCPM VQA Polished",
    "DisplayText": "Display Text",
}

# Explicit public API: the three module-level names ComfyUI reads on import.
__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS", "WEB_DIRECTORY"]
--------------------------------------------------------------------------------
/display_text_nodes.py:
--------------------------------------------------------------------------------
class DisplayText:
    """ComfyUI output node: renders the incoming text on the node itself
    and also forwards it downstream as a STRING output."""

    # The node consumes its input as a list and emits a list as well.
    INPUT_IS_LIST = True
    RETURN_TYPES = ("STRING",)
    OUTPUT_NODE = True
    OUTPUT_IS_LIST = (True,)
    FUNCTION = "display_text"
    CATEGORY = "Comfyui_MiniCPM-V-2_6-int4"

    @classmethod
    def INPUT_TYPES(cls):
        # forceInput makes "text" a connectable socket rather than an
        # editable widget on the node.
        text_spec = ("STRING", {"forceInput": True})
        return {"required": {"text": text_spec}}

    def display_text(self, text):
        # "ui" is what the frontend widget displays; "result" is the value
        # handed to downstream nodes.
        ui_payload = {"text": text}
        return {"ui": ui_payload, "result": (text,)}
--------------------------------------------------------------------------------
/examples/AnimateDiff_00002.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IuvenisSapiens/ComfyUI_MiniCPM-V-2_6-int4/763f95133a95b611977efaa05ad654ff410670a9/examples/AnimateDiff_00002.mp4
--------------------------------------------------------------------------------
/examples/Chat_with_multiple_images_workflow_legacy.json:
--------------------------------------------------------------------------------
1 | {
2 | "last_node_id": 50,
3 | "last_link_id": 59,
4 | "nodes": [
5 | {
6 | "id": 7,
7 | "type": "Note",
8 | "pos": [
9 | 415,
10 | -78
11 | ],
12 | "size": [
13 | 436.56812016891763,
14 | 108.88176616327235
15 | ],
16 | "flags": {
17 | "collapsed": false
18 | },
19 | "order": 3,
20 | "mode": 0,
21 | "properties": {
22 | "text": ""
23 | },
24 | "widgets_values": [
25 | "当 MiniCPM VQA 同时接收到图像和视频信息时,它会仅处理视频信息而忽略图像信息。如果您想处理图像信息,请断开视频信息的输入。\n\nWhen MiniCPM VQA simultaneously receives both image and video information, it processes only the video information while ignoring the image information. If you want to process the image information, please disconnect the input of the video information."
26 | ],
27 | "color": "#432",
28 | "bgcolor": "#653"
29 | },
30 | {
31 | "id": 43,
32 | "type": "LoadImage",
33 | "pos": [
34 | -361,
35 | -193
36 | ],
37 | "size": {
38 | "0": 315,
39 | "1": 314
40 | },
41 | "flags": {},
42 | "order": 0,
43 | "mode": 0,
44 | "outputs": [
45 | {
46 | "name": "IMAGE",
47 | "type": "IMAGE",
48 | "links": [
49 | 56
50 | ],
51 | "slot_index": 0,
52 | "shape": 3
53 | },
54 | {
55 | "name": "MASK",
56 | "type": "MASK",
57 | "links": null,
58 | "shape": 3
59 | }
60 | ],
61 | "properties": {
62 | "Node name for S&R": "LoadImage"
63 | },
64 | "widgets_values": [
65 | "ComfyUI_00509_.png",
66 | "image"
67 | ]
68 | },
69 | {
70 | "id": 45,
71 | "type": "LoadImage",
72 | "pos": [
73 | -691,
74 | 15
75 | ],
76 | "size": {
77 | "0": 315,
78 | "1": 314
79 | },
80 | "flags": {},
81 | "order": 1,
82 | "mode": 0,
83 | "outputs": [
84 | {
85 | "name": "IMAGE",
86 | "type": "IMAGE",
87 | "links": [
88 | 57
89 | ],
90 | "slot_index": 0,
91 | "shape": 3
92 | },
93 | {
94 | "name": "MASK",
95 | "type": "MASK",
96 | "links": null,
97 | "shape": 3
98 | }
99 | ],
100 | "properties": {
101 | "Node name for S&R": "LoadImage"
102 | },
103 | "widgets_values": [
104 | "ComfyUI_00532_.png",
105 | "image"
106 | ]
107 | },
108 | {
109 | "id": 47,
110 | "type": "LoadImage",
111 | "pos": [
112 | -360,
113 | 161
114 | ],
115 | "size": {
116 | "0": 315,
117 | "1": 314
118 | },
119 | "flags": {},
120 | "order": 2,
121 | "mode": 0,
122 | "outputs": [
123 | {
124 | "name": "IMAGE",
125 | "type": "IMAGE",
126 | "links": [
127 | 58
128 | ],
129 | "slot_index": 0,
130 | "shape": 3
131 | },
132 | {
133 | "name": "MASK",
134 | "type": "MASK",
135 | "links": null,
136 | "shape": 3
137 | }
138 | ],
139 | "properties": {
140 | "Node name for S&R": "LoadImage"
141 | },
142 | "widgets_values": [
143 | "ComfyUI_00508_.png",
144 | "image"
145 | ]
146 | },
147 | {
148 | "id": 48,
149 | "type": "DisplayText",
150 | "pos": [
151 | 411,
152 | 79
153 | ],
154 | "size": [
155 | 451.1885182898909,
156 | 265.19896846818176
157 | ],
158 | "flags": {},
159 | "order": 5,
160 | "mode": 0,
161 | "inputs": [
162 | {
163 | "name": "text",
164 | "type": "STRING",
165 | "link": 59,
166 | "widget": {
167 | "name": "text"
168 | }
169 | }
170 | ],
171 | "outputs": [
172 | {
173 | "name": "STRING",
174 | "type": "STRING",
175 | "links": null,
176 | "shape": 6
177 | }
178 | ],
179 | "properties": {
180 | "Node name for S&R": "DisplayText"
181 | },
182 | "widgets_values": [
183 | "",
184 | "In the provided images, there are several differences to note:\n\n1. **Image 1**: This image shows Earth as seen from space, with a focus on the Western Hemisphere (North America and South America). The background is dark space with visible stars.\n\n2. **Image 2**: In this second image, the Earth appears in the center of the frame, surrounded by a starry backdrop with a galaxy-like structure visible in the upper right corner. There is also a smaller celestial body, possibly an asteroid or moon, near the Earth's left side.\n\n3. **Image 3**: The third image depicts Earth with its rings, resembling Saturn's rings, surrounding it. The background remains dark space, similar to Image 1, but the inclusion of the rings adds a distinct astronomical element.\n\nThe primary differences lie in the depiction of celestial bodies around Earth and the complexity of the cosmic environment."
185 | ]
186 | },
187 | {
188 | "id": 50,
189 | "type": "MiniCPM_VQA",
190 | "pos": [
191 | -13,
192 | -65
193 | ],
194 | "size": {
195 | "0": 400,
196 | "1": 400
197 | },
198 | "flags": {},
199 | "order": 4,
200 | "mode": 0,
201 | "inputs": [
202 | {
203 | "name": "source_video_path",
204 | "type": "PATH",
205 | "link": null
206 | },
207 | {
208 | "name": "source_image_path_1st",
209 | "type": "IMAGE",
210 | "link": 56
211 | },
212 | {
213 | "name": "source_image_path_2nd",
214 | "type": "IMAGE",
215 | "link": 57
216 | },
217 | {
218 | "name": "source_image_path_3rd",
219 | "type": "IMAGE",
220 | "link": 58
221 | }
222 | ],
223 | "outputs": [
224 | {
225 | "name": "STRING",
226 | "type": "STRING",
227 | "links": [
228 | 59
229 | ],
230 | "shape": 3,
231 | "slot_index": 0
232 | }
233 | ],
234 | "properties": {
235 | "Node name for S&R": "MiniCPM_VQA"
236 | },
237 | "widgets_values": [
238 | "Compare image 1, image 2 and image 3, tell me about the differences among them.",
239 | "MiniCPM-V-2_6-int4",
240 | true,
241 | 0.8,
242 | 100,
243 | 0.7,
244 | 1.05,
245 | 2048,
246 | 64,
247 | 2,
248 | 576,
249 | "randomize"
250 | ]
251 | }
252 | ],
253 | "links": [
254 | [
255 | 56,
256 | 43,
257 | 0,
258 | 50,
259 | 1,
260 | "IMAGE"
261 | ],
262 | [
263 | 57,
264 | 45,
265 | 0,
266 | 50,
267 | 2,
268 | "IMAGE"
269 | ],
270 | [
271 | 58,
272 | 47,
273 | 0,
274 | 50,
275 | 3,
276 | "IMAGE"
277 | ],
278 | [
279 | 59,
280 | 50,
281 | 0,
282 | 48,
283 | 0,
284 | "STRING"
285 | ]
286 | ],
287 | "groups": [],
288 | "config": {},
289 | "extra": {
290 | "ds": {
291 | "scale": 0.8769226950000022,
292 | "offset": [
293 | 840.512281646217,
294 | 279.18423579817517
295 | ]
296 | }
297 | },
298 | "version": 0.4
299 | }
--------------------------------------------------------------------------------
/examples/Chat_with_multiple_images_workflow_legacy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IuvenisSapiens/ComfyUI_MiniCPM-V-2_6-int4/763f95133a95b611977efaa05ad654ff410670a9/examples/Chat_with_multiple_images_workflow_legacy.png
--------------------------------------------------------------------------------
/examples/Chat_with_multiple_images_workflow_polished.json:
--------------------------------------------------------------------------------
1 | {
2 | "last_node_id": 60,
3 | "last_link_id": 71,
4 | "nodes": [
5 | {
6 | "id": 7,
7 | "type": "Note",
8 | "pos": [
9 | -986,
10 | -453
11 | ],
12 | "size": {
13 | "0": 717.5083618164062,
14 | "1": 82.10267639160156
15 | },
16 | "flags": {
17 | "collapsed": false
18 | },
19 | "order": 0,
20 | "mode": 0,
21 | "properties": {
22 | "text": ""
23 | },
24 | "widgets_values": [
25 | "当 MiniCPM VQA 同时接收到图像和视频信息时,它会仅处理视频信息而忽略图像信息。如果您想处理图像信息,请断开视频信息的输入。\n\nWhen MiniCPM VQA simultaneously receives both image and video information, it processes only the video information while ignoring the image information. If you want to process the image information, please disconnect the input of the video information."
26 | ],
27 | "color": "#432",
28 | "bgcolor": "#653"
29 | },
30 | {
31 | "id": 51,
32 | "type": "DisplayText",
33 | "pos": [
34 | -697,
35 | 183
36 | ],
37 | "size": {
38 | "0": 396.3633117675781,
39 | "1": 321.38140869140625
40 | },
41 | "flags": {},
42 | "order": 7,
43 | "mode": 0,
44 | "inputs": [
45 | {
46 | "name": "text",
47 | "type": "STRING",
48 | "link": 71,
49 | "widget": {
50 | "name": "text"
51 | }
52 | }
53 | ],
54 | "outputs": [
55 | {
56 | "name": "STRING",
57 | "type": "STRING",
58 | "links": null,
59 | "shape": 6
60 | }
61 | ],
62 | "properties": {
63 | "Node name for S&R": "DisplayText"
64 | },
65 | "widgets_values": [
66 | "",
67 | "In comparing image 1, image 2 and image 3, the differences are as follows: In image 1, Earth is shown in isolation with no other celestial bodies present. In image 2, a moon has been added to the scene, positioned to the left of Earth. Finally, in image 3, Saturn's rings have been added around Earth, creating a dramatic visual effect against the backdrop of space."
68 | ]
69 | },
70 | {
71 | "id": 54,
72 | "type": "MultipleImagesInput",
73 | "pos": [
74 | -986,
75 | -136
76 | ],
77 | "size": {
78 | "0": 210,
79 | "1": 122
80 | },
81 | "flags": {},
82 | "order": 4,
83 | "mode": 0,
84 | "inputs": [
85 | {
86 | "name": "image_1",
87 | "type": "IMAGE",
88 | "link": 65
89 | },
90 | {
91 | "name": "image_2",
92 | "type": "IMAGE",
93 | "link": 66
94 | },
95 | {
96 | "name": "image_3",
97 | "type": "IMAGE",
98 | "link": 68
99 | }
100 | ],
101 | "outputs": [
102 | {
103 | "name": "images",
104 | "type": "IMAGE",
105 | "links": [
106 | 69,
107 | 70
108 | ],
109 | "slot_index": 0,
110 | "shape": 3
111 | }
112 | ],
113 | "properties": {
114 | "Node name for S&R": "MultipleImagesInput"
115 | },
116 | "widgets_values": [
117 | 3,
118 | null
119 | ]
120 | },
121 | {
122 | "id": 55,
123 | "type": "LoadImage",
124 | "pos": [
125 | -1232,
126 | -453
127 | ],
128 | "size": [
129 | 210,
130 | 314
131 | ],
132 | "flags": {},
133 | "order": 1,
134 | "mode": 0,
135 | "outputs": [
136 | {
137 | "name": "IMAGE",
138 | "type": "IMAGE",
139 | "links": [
140 | 65
141 | ],
142 | "shape": 3
143 | },
144 | {
145 | "name": "MASK",
146 | "type": "MASK",
147 | "links": null,
148 | "shape": 3
149 | }
150 | ],
151 | "properties": {
152 | "Node name for S&R": "LoadImage"
153 | },
154 | "widgets_values": [
155 | "ComfyUI_00509_.png",
156 | "image"
157 | ]
158 | },
159 | {
160 | "id": 56,
161 | "type": "LoadImage",
162 | "pos": [
163 | -1234,
164 | -122
165 | ],
166 | "size": [
167 | 214.43836975097656,
168 | 314
169 | ],
170 | "flags": {},
171 | "order": 2,
172 | "mode": 0,
173 | "outputs": [
174 | {
175 | "name": "IMAGE",
176 | "type": "IMAGE",
177 | "links": [
178 | 66
179 | ],
180 | "shape": 3
181 | },
182 | {
183 | "name": "MASK",
184 | "type": "MASK",
185 | "links": null,
186 | "shape": 3
187 | }
188 | ],
189 | "properties": {
190 | "Node name for S&R": "LoadImage"
191 | },
192 | "widgets_values": [
193 | "ComfyUI_00532_.png",
194 | "image"
195 | ]
196 | },
197 | {
198 | "id": 58,
199 | "type": "LoadImage",
200 | "pos": [
201 | -1232,
202 | 227
203 | ],
204 | "size": [
205 | 210,
206 | 314
207 | ],
208 | "flags": {},
209 | "order": 3,
210 | "mode": 0,
211 | "outputs": [
212 | {
213 | "name": "IMAGE",
214 | "type": "IMAGE",
215 | "links": [
216 | 68
217 | ],
218 | "shape": 3
219 | },
220 | {
221 | "name": "MASK",
222 | "type": "MASK",
223 | "links": null,
224 | "shape": 3
225 | }
226 | ],
227 | "properties": {
228 | "Node name for S&R": "LoadImage"
229 | },
230 | "widgets_values": [
231 | "ComfyUI_00508_.png",
232 | "image"
233 | ]
234 | },
235 | {
236 | "id": 59,
237 | "type": "PreviewImage",
238 | "pos": [
239 | -247,
240 | -450
241 | ],
242 | "size": {
243 | "0": 321.89825439453125,
244 | "1": 978.513916015625
245 | },
246 | "flags": {},
247 | "order": 5,
248 | "mode": 0,
249 | "inputs": [
250 | {
251 | "name": "images",
252 | "type": "IMAGE",
253 | "link": 69
254 | }
255 | ],
256 | "properties": {
257 | "Node name for S&R": "PreviewImage"
258 | }
259 | },
260 | {
261 | "id": 60,
262 | "type": "MiniCPM_VQA_Polished",
263 | "pos": [
264 | -697,
265 | -241
266 | ],
267 | "size": {
268 | "0": 400,
269 | "1": 360
270 | },
271 | "flags": {},
272 | "order": 6,
273 | "mode": 0,
274 | "inputs": [
275 | {
276 | "name": "source_video_path",
277 | "type": "PATH",
278 | "link": null
279 | },
280 | {
281 | "name": "source_image_path",
282 | "type": "IMAGE",
283 | "link": 70
284 | }
285 | ],
286 | "outputs": [
287 | {
288 | "name": "STRING",
289 | "type": "STRING",
290 | "links": [
291 | 71
292 | ],
293 | "shape": 3,
294 | "slot_index": 0
295 | }
296 | ],
297 | "properties": {
298 | "Node name for S&R": "MiniCPM_VQA_Polished"
299 | },
300 | "widgets_values": [
301 | "Compare image 1, image 2 and image 3, tell me about the differences among them.",
302 | "MiniCPM-V-2_6-int4",
303 | true,
304 | 0.8,
305 | 100,
306 | 0.7,
307 | 1.05,
308 | 2048,
309 | 64,
310 | 2,
311 | 1293,
312 | "randomize"
313 | ]
314 | }
315 | ],
316 | "links": [
317 | [
318 | 65,
319 | 55,
320 | 0,
321 | 54,
322 | 0,
323 | "IMAGE"
324 | ],
325 | [
326 | 66,
327 | 56,
328 | 0,
329 | 54,
330 | 1,
331 | "IMAGE"
332 | ],
333 | [
334 | 68,
335 | 58,
336 | 0,
337 | 54,
338 | 2,
339 | "IMAGE"
340 | ],
341 | [
342 | 69,
343 | 54,
344 | 0,
345 | 59,
346 | 0,
347 | "IMAGE"
348 | ],
349 | [
350 | 70,
351 | 54,
352 | 0,
353 | 60,
354 | 1,
355 | "IMAGE"
356 | ],
357 | [
358 | 71,
359 | 60,
360 | 0,
361 | 51,
362 | 0,
363 | "STRING"
364 | ]
365 | ],
366 | "groups": [],
367 | "config": {},
368 | "extra": {
369 | "ds": {
370 | "scale": 0.7247295000000027,
371 | "offset": [
372 | 1602.4864516115556,
373 | 507.1347555005483
374 | ]
375 | }
376 | },
377 | "version": 0.4
378 | }
--------------------------------------------------------------------------------
/examples/Chat_with_multiple_images_workflow_polished.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IuvenisSapiens/ComfyUI_MiniCPM-V-2_6-int4/763f95133a95b611977efaa05ad654ff410670a9/examples/Chat_with_multiple_images_workflow_polished.png
--------------------------------------------------------------------------------
/examples/Chat_with_single_image_workflow_legacy.json:
--------------------------------------------------------------------------------
1 | {
2 | "last_node_id": 55,
3 | "last_link_id": 64,
4 | "nodes": [
5 | {
6 | "id": 7,
7 | "type": "Note",
8 | "pos": [
9 | 385,
10 | -6
11 | ],
12 | "size": [
13 | 681.1074433554127,
14 | 92.63203200215605
15 | ],
16 | "flags": {
17 | "collapsed": false
18 | },
19 | "order": 1,
20 | "mode": 0,
21 | "properties": {
22 | "text": ""
23 | },
24 | "widgets_values": [
25 | "当 MiniCPM VQA 同时接收到图像和视频信息时,它会仅处理视频信息而忽略图像信息。如果您想处理图像信息,请断开视频信息的输入。\n\nWhen MiniCPM VQA simultaneously receives both image and video information, it processes only the video information while ignoring the image information. If you want to process the image information, please disconnect the input of the video information."
26 | ],
27 | "color": "#432",
28 | "bgcolor": "#653"
29 | },
30 | {
31 | "id": 51,
32 | "type": "LoadImage",
33 | "pos": [
34 | -363,
35 | -8
36 | ],
37 | "size": [
38 | 308.78882573073196,
39 | 398.59335191598814
40 | ],
41 | "flags": {},
42 | "order": 0,
43 | "mode": 0,
44 | "outputs": [
45 | {
46 | "name": "IMAGE",
47 | "type": "IMAGE",
48 | "links": [
49 | 63
50 | ],
51 | "slot_index": 0,
52 | "shape": 3
53 | },
54 | {
55 | "name": "MASK",
56 | "type": "MASK",
57 | "links": null,
58 | "shape": 3
59 | }
60 | ],
61 | "properties": {
62 | "Node name for S&R": "LoadImage"
63 | },
64 | "widgets_values": [
65 | "ComfyUI_00509_.png",
66 | "image"
67 | ]
68 | },
69 | {
70 | "id": 52,
71 | "type": "DisplayText",
72 | "pos": [
73 | 385,
74 | 128
75 | ],
76 | "size": [
77 | 682.011017760673,
78 | 262.51556121728197
79 | ],
80 | "flags": {},
81 | "order": 3,
82 | "mode": 0,
83 | "inputs": [
84 | {
85 | "name": "text",
86 | "type": "STRING",
87 | "link": 64,
88 | "widget": {
89 | "name": "text"
90 | }
91 | }
92 | ],
93 | "outputs": [
94 | {
95 | "name": "STRING",
96 | "type": "STRING",
97 | "links": null,
98 | "slot_index": 0,
99 | "shape": 6
100 | }
101 | ],
102 | "properties": {
103 | "Node name for S&R": "DisplayText"
104 | },
105 | "widgets_values": [
106 | "",
107 | "The image presents a captivating view of Earth, captured from space. The planet is beautifully illuminated by the sun's rays, casting a warm glow over its surface. The curvature of the Earth is clearly visible, emphasizing the vastness of our home planet.\n\nThe continents and oceans are distinctly outlined in shades of green and brown, respectively, providing a stark contrast against the deep blue of the surrounding space. This color palette not only highlights the natural beauty of our world but also underscores the delicate balance between land and water.\n\nThe atmosphere, depicted in hues of white and gray, appears as swirling clouds that blanket the planet. These clouds, reminiscent of cosmic dust storms, add a sense of dynamism to the otherwise serene scene.\n\nIn the backdrop, a distant star can be seen, serving as a reminder of the infinite universe beyond our own. Its presence adds depth to the image, creating a sense of scale and distance.\n\nOverall, the image provides a comprehensive view of Earth, showcasing its unique characteristics and placing it within the context of the cosmos. It's a testament to the awe-inspiring nature of our planet and the wonders of space exploration."
108 | ]
109 | },
110 | {
111 | "id": 55,
112 | "type": "MiniCPM_VQA",
113 | "pos": [
114 | -34,
115 | -9
116 | ],
117 | "size": {
118 | "0": 400,
119 | "1": 400
120 | },
121 | "flags": {},
122 | "order": 2,
123 | "mode": 0,
124 | "inputs": [
125 | {
126 | "name": "source_video_path",
127 | "type": "PATH",
128 | "link": null
129 | },
130 | {
131 | "name": "source_image_path_1st",
132 | "type": "IMAGE",
133 | "link": 63
134 | },
135 | {
136 | "name": "source_image_path_2nd",
137 | "type": "IMAGE",
138 | "link": null
139 | },
140 | {
141 | "name": "source_image_path_3rd",
142 | "type": "IMAGE",
143 | "link": null
144 | }
145 | ],
146 | "outputs": [
147 | {
148 | "name": "STRING",
149 | "type": "STRING",
150 | "links": [
151 | 64
152 | ],
153 | "shape": 3,
154 | "slot_index": 0
155 | }
156 | ],
157 | "properties": {
158 | "Node name for S&R": "MiniCPM_VQA"
159 | },
160 | "widgets_values": [
161 | "Describe the image in detail",
162 | "MiniCPM-V-2_6-int4",
163 | true,
164 | 0.8,
165 | 100,
166 | 0.7,
167 | 1.05,
168 | 2048,
169 | 64,
170 | 2,
171 | 171,
172 | "randomize"
173 | ]
174 | }
175 | ],
176 | "links": [
177 | [
178 | 63,
179 | 51,
180 | 0,
181 | 55,
182 | 1,
183 | "IMAGE"
184 | ],
185 | [
186 | 64,
187 | 55,
188 | 0,
189 | 52,
190 | 0,
191 | "STRING"
192 | ]
193 | ],
194 | "groups": [],
195 | "config": {},
196 | "extra": {
197 | "ds": {
198 | "scale": 0.9646149645000015,
199 | "offset": [
200 | 482.3653697740519,
201 | 191.776875012469
202 | ]
203 | }
204 | },
205 | "version": 0.4
206 | }
--------------------------------------------------------------------------------
/examples/Chat_with_single_image_workflow_legacy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IuvenisSapiens/ComfyUI_MiniCPM-V-2_6-int4/763f95133a95b611977efaa05ad654ff410670a9/examples/Chat_with_single_image_workflow_legacy.png
--------------------------------------------------------------------------------
/examples/Chat_with_single_image_workflow_polished.json:
--------------------------------------------------------------------------------
1 | {
2 | "last_node_id": 56,
3 | "last_link_id": 66,
4 | "nodes": [
5 | {
6 | "id": 7,
7 | "type": "Note",
8 | "pos": [
9 | 385,
10 | -6
11 | ],
12 | "size": [
13 | 681.1074433554127,
14 | 92.63203200215605
15 | ],
16 | "flags": {
17 | "collapsed": false
18 | },
19 | "order": 1,
20 | "mode": 0,
21 | "properties": {
22 | "text": ""
23 | },
24 | "widgets_values": [
25 | "当 MiniCPM VQA 同时接收到图像和视频信息时,它会仅处理视频信息而忽略图像信息。如果您想处理图像信息,请断开视频信息的输入。\n\nWhen MiniCPM VQA simultaneously receives both image and video information, it processes only the video information while ignoring the image information. If you want to process the image information, please disconnect the input of the video information."
26 | ],
27 | "color": "#432",
28 | "bgcolor": "#653"
29 | },
30 | {
31 | "id": 51,
32 | "type": "LoadImage",
33 | "pos": [
34 | -348,
35 | -2
36 | ],
37 | "size": [
38 | 293.6592954235593,
39 | 358.0474368401352
40 | ],
41 | "flags": {},
42 | "order": 0,
43 | "mode": 0,
44 | "outputs": [
45 | {
46 | "name": "IMAGE",
47 | "type": "IMAGE",
48 | "links": [
49 | 65
50 | ],
51 | "slot_index": 0,
52 | "shape": 3
53 | },
54 | {
55 | "name": "MASK",
56 | "type": "MASK",
57 | "links": null,
58 | "shape": 3
59 | }
60 | ],
61 | "properties": {
62 | "Node name for S&R": "LoadImage"
63 | },
64 | "widgets_values": [
65 | "ComfyUI_00509_.png",
66 | "image"
67 | ]
68 | },
69 | {
70 | "id": 52,
71 | "type": "DisplayText",
72 | "pos": [
73 | 385,
74 | 121
75 | ],
76 | "size": [
77 | 680.7480526602474,
78 | 233.90070457160425
79 | ],
80 | "flags": {},
81 | "order": 3,
82 | "mode": 0,
83 | "inputs": [
84 | {
85 | "name": "text",
86 | "type": "STRING",
87 | "link": 66,
88 | "widget": {
89 | "name": "text"
90 | }
91 | }
92 | ],
93 | "outputs": [
94 | {
95 | "name": "STRING",
96 | "type": "STRING",
97 | "links": null,
98 | "slot_index": 0,
99 | "shape": 6
100 | }
101 | ],
102 | "properties": {
103 | "Node name for S&R": "DisplayText"
104 | },
105 | "widgets_values": [
106 | "",
107 | "The image presents a breathtaking view of Earth from space, showcasing the planet's diverse landscapes and vibrant colors. The curvature of the Earth is clearly visible, emphasizing its spherical shape. The oceans are depicted in shades of blue, while the continents stand out with their distinct green hues.\n\nOne of the most striking features is the swirling patterns in the clouds, which create a mesmerizing effect against the backdrop of the dark cosmos. These patterns seem to dance across the sky, adding a dynamic element to the otherwise static scene.\n\nIn the top right corner, a distant star glows brightly, serving as a reminder of the vastness of space. This celestial body stands out starkly against the black expanse, providing a sense of scale and depth to the image.\n\nOverall, the image captures the awe-inspiring beauty of our home planet and the infinite universe that surrounds it. It's a testament to the wonders of nature and the mysteries of the cosmos."
108 | ]
109 | },
110 | {
111 | "id": 56,
112 | "type": "MiniCPM_VQA_Polished",
113 | "pos": [
114 | -34,
115 | -4
116 | ],
117 | "size": {
118 | "0": 400,
119 | "1": 360
120 | },
121 | "flags": {},
122 | "order": 2,
123 | "mode": 0,
124 | "inputs": [
125 | {
126 | "name": "source_video_path",
127 | "type": "PATH",
128 | "link": null
129 | },
130 | {
131 | "name": "source_image_path",
132 | "type": "IMAGE",
133 | "link": 65
134 | }
135 | ],
136 | "outputs": [
137 | {
138 | "name": "STRING",
139 | "type": "STRING",
140 | "links": [
141 | 66
142 | ],
143 | "shape": 3,
144 | "slot_index": 0
145 | }
146 | ],
147 | "properties": {
148 | "Node name for S&R": "MiniCPM_VQA_Polished"
149 | },
150 | "widgets_values": [
151 | "Describe the image in detail",
152 | "MiniCPM-V-2_6-int4",
153 | true,
154 | 0.8,
155 | 100,
156 | 0.7,
157 | 1.05,
158 | 2048,
159 | 64,
160 | 2,
161 | 1923,
162 | "randomize"
163 | ]
164 | }
165 | ],
166 | "links": [
167 | [
168 | 65,
169 | 51,
170 | 0,
171 | 56,
172 | 1,
173 | "IMAGE"
174 | ],
175 | [
176 | 66,
177 | 56,
178 | 0,
179 | 52,
180 | 0,
181 | "STRING"
182 | ]
183 | ],
184 | "groups": [],
185 | "config": {},
186 | "extra": {
187 | "ds": {
188 | "scale": 0.9646149645000016,
189 | "offset": [
190 | 465.71293314624074,
191 | 190.1005845668893
192 | ]
193 | }
194 | },
195 | "version": 0.4
196 | }
--------------------------------------------------------------------------------
/examples/Chat_with_single_image_workflow_polished.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IuvenisSapiens/ComfyUI_MiniCPM-V-2_6-int4/763f95133a95b611977efaa05ad654ff410670a9/examples/Chat_with_single_image_workflow_polished.png
--------------------------------------------------------------------------------
/examples/Chat_with_text_workflow_legacy.json:
--------------------------------------------------------------------------------
1 | {
2 | "last_node_id": 51,
3 | "last_link_id": 54,
4 | "nodes": [
5 | {
6 | "id": 7,
7 | "type": "Note",
8 | "pos": [
9 | 398,
10 | -255
11 | ],
12 | "size": [
13 | 560.1107513648193,
14 | 103.60144459895338
15 | ],
16 | "flags": {
17 | "collapsed": false
18 | },
19 | "order": 1,
20 | "mode": 0,
21 | "properties": {
22 | "text": ""
23 | },
24 | "widgets_values": [
25 | "当 MiniCPM VQA 同时接收到图像和视频信息时,它会仅处理视频信息而忽略图像信息。如果您想处理图像信息,请断开视频信息的输入。\n\nWhen MiniCPM VQA simultaneously receives both image and video information, it processes only the video information while ignoring the image information. If you want to process the image information, please disconnect the input of the video information."
26 | ],
27 | "color": "#432",
28 | "bgcolor": "#653"
29 | },
30 | {
31 | "id": 49,
32 | "type": "DisplayText",
33 | "pos": [
34 | 395,
35 | -115
36 | ],
37 | "size": [
38 | 565.5332447212019,
39 | 259.1318214920463
40 | ],
41 | "flags": {},
42 | "order": 2,
43 | "mode": 0,
44 | "inputs": [
45 | {
46 | "name": "text",
47 | "type": "STRING",
48 | "link": 54,
49 | "widget": {
50 | "name": "text"
51 | }
52 | }
53 | ],
54 | "outputs": [
55 | {
56 | "name": "STRING",
57 | "type": "STRING",
58 | "links": null,
59 | "shape": 6
60 | }
61 | ],
62 | "properties": {
63 | "Node name for S&R": "DisplayText"
64 | },
65 | "widgets_values": [
66 | "",
67 | "A quantum group is a mathematical object that generalizes the concept of a classical group to the realm of quantum mechanics. In classical mathematics, a group is a set of elements with a binary operation (such as addition or multiplication) that satisfies certain properties, such as associativity and the existence of an identity element.\nIn the context of quantum mechanics, a quantum group is a Hopf algebra that is associated with a classical group. The elements of a quantum group are represented by operators that satisfy the same algebraic rules as the elements of the corresponding classical group. However, the operators in a quantum group also have additional properties related to the uncertainty principle, which is a fundamental principle in quantum mechanics that states that it is impossible to precisely measure both the position and momentum of a particle at the same time.\nOne example of a quantum group is the quantum group associated with the special unitary group SU(2), which plays a central role in the study of quantum spin systems. Another example is the quantum group associated with the orthogonal group SO(3), which is used in the study of quantum mechanics of angular momentum.\nQuantum groups have many applications in physics and other areas of mathematics, including the study of quantum field theory, integrable systems, and knot theory. They are also important tools in the development of quantum computing, where they can be used to construct quantum algorithms for solving problems that are difficult or impossible to solve using classical computers."
68 | ]
69 | },
70 | {
71 | "id": 51,
72 | "type": "MiniCPM_VQA",
73 | "pos": [
74 | -21,
75 | -256
76 | ],
77 | "size": {
78 | "0": 400,
79 | "1": 400
80 | },
81 | "flags": {},
82 | "order": 0,
83 | "mode": 0,
84 | "inputs": [
85 | {
86 | "name": "source_video_path",
87 | "type": "PATH",
88 | "link": null
89 | },
90 | {
91 | "name": "source_image_path_1st",
92 | "type": "IMAGE",
93 | "link": null
94 | },
95 | {
96 | "name": "source_image_path_2nd",
97 | "type": "IMAGE",
98 | "link": null
99 | },
100 | {
101 | "name": "source_image_path_3rd",
102 | "type": "IMAGE",
103 | "link": null
104 | }
105 | ],
106 | "outputs": [
107 | {
108 | "name": "STRING",
109 | "type": "STRING",
110 | "links": [
111 | 54
112 | ],
113 | "shape": 3,
114 | "slot_index": 0
115 | }
116 | ],
117 | "properties": {
118 | "Node name for S&R": "MiniCPM_VQA"
119 | },
120 | "widgets_values": [
121 | "Quantum Group",
122 | "MiniCPM-V-2_6-int4",
123 | true,
124 | 0.8,
125 | 100,
126 | 0.7,
127 | 1.05,
128 | 2048,
129 | 64,
130 | 2,
131 | 1501,
132 | "randomize"
133 | ]
134 | }
135 | ],
136 | "links": [
137 | [
138 | 54,
139 | 51,
140 | 0,
141 | 49,
142 | 0,
143 | "STRING"
144 | ]
145 | ],
146 | "groups": [],
147 | "config": {},
148 | "extra": {
149 | "ds": {
150 | "scale": 1.2839025177495011,
151 | "offset": [
152 | 174.06698959850874,
153 | 351.1944051783393
154 | ]
155 | }
156 | },
157 | "version": 0.4
158 | }
--------------------------------------------------------------------------------
/examples/Chat_with_text_workflow_legacy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IuvenisSapiens/ComfyUI_MiniCPM-V-2_6-int4/763f95133a95b611977efaa05ad654ff410670a9/examples/Chat_with_text_workflow_legacy.png
--------------------------------------------------------------------------------
/examples/Chat_with_text_workflow_polished.json:
--------------------------------------------------------------------------------
1 | {
2 | "last_node_id": 52,
3 | "last_link_id": 55,
4 | "nodes": [
5 | {
6 | "id": 7,
7 | "type": "Note",
8 | "pos": [
9 | 390,
10 | -254
11 | ],
12 | "size": [
13 | 443.0672250364122,
14 | 104.75543052432553
15 | ],
16 | "flags": {
17 | "collapsed": false
18 | },
19 | "order": 1,
20 | "mode": 0,
21 | "properties": {
22 | "text": ""
23 | },
24 | "widgets_values": [
25 | "当 MiniCPM VQA 同时接收到图像和视频信息时,它会仅处理视频信息而忽略图像信息。如果您想处理图像信息,请断开视频信息的输入。\n\nWhen MiniCPM VQA simultaneously receives both image and video information, it processes only the video information while ignoring the image information. If you want to process the image information, please disconnect the input of the video information."
26 | ],
27 | "color": "#432",
28 | "bgcolor": "#653"
29 | },
30 | {
31 | "id": 49,
32 | "type": "DisplayText",
33 | "pos": [
34 | 386,
35 | -111
36 | ],
37 | "size": [
38 | 452.45997097748295,
39 | 212.76987788146022
40 | ],
41 | "flags": {},
42 | "order": 2,
43 | "mode": 0,
44 | "inputs": [
45 | {
46 | "name": "text",
47 | "type": "STRING",
48 | "link": 55,
49 | "widget": {
50 | "name": "text"
51 | }
52 | }
53 | ],
54 | "outputs": [
55 | {
56 | "name": "STRING",
57 | "type": "STRING",
58 | "links": null,
59 | "shape": 6
60 | }
61 | ],
62 | "properties": {
63 | "Node name for S&R": "DisplayText"
64 | },
65 | "widgets_values": [
66 | "",
67 | "A quantum group is a mathematical object that generalizes the concept of a group to include elements with quantum numbers. In traditional groups, each element has a unique identity and can be multiplied or divided by other elements in the group. However, in quantum groups, some elements may have multiple identities or may not be able to be multiplied or divided.\nQuantum groups are used in various areas of mathematics and physics, including representation theory, statistical mechanics, and quantum field theory. They are particularly important in the study of quantum systems, where they provide a way to describe the behavior of particles that follow quantum mechanical rules.\nOne example of a quantum group is the quantum group associated with the Lie algebra sl(2), which describes the symmetries of the hyperbolic plane. This group has several interesting properties, such as the fact that it contains both integer and non-integer elements, and that it is not commutative."
68 | ]
69 | },
70 | {
71 | "id": 52,
72 | "type": "MiniCPM_VQA_Polished",
73 | "pos": [
74 | -28,
75 | -258
76 | ],
77 | "size": {
78 | "0": 400,
79 | "1": 360
80 | },
81 | "flags": {},
82 | "order": 0,
83 | "mode": 0,
84 | "inputs": [
85 | {
86 | "name": "source_video_path",
87 | "type": "PATH",
88 | "link": null
89 | },
90 | {
91 | "name": "source_image_path",
92 | "type": "IMAGE",
93 | "link": null
94 | }
95 | ],
96 | "outputs": [
97 | {
98 | "name": "STRING",
99 | "type": "STRING",
100 | "links": [
101 | 55
102 | ],
103 | "shape": 3,
104 | "slot_index": 0
105 | }
106 | ],
107 | "properties": {
108 | "Node name for S&R": "MiniCPM_VQA_Polished"
109 | },
110 | "widgets_values": [
111 | "Quantum Group",
112 | "MiniCPM-V-2_6-int4",
113 | true,
114 | 0.8,
115 | 100,
116 | 0.7,
117 | 1.05,
118 | 2048,
119 | 64,
120 | 2,
121 | 2021,
122 | "randomize"
123 | ]
124 | }
125 | ],
126 | "links": [
127 | [
128 | 55,
129 | 52,
130 | 0,
131 | 49,
132 | 0,
133 | "STRING"
134 | ]
135 | ],
136 | "groups": [],
137 | "config": {},
138 | "extra": {
139 | "ds": {
140 | "scale": 1.4122927695244516,
141 | "offset": [
142 | 151.62360770168286,
143 | 363.80059898448
144 | ]
145 | }
146 | },
147 | "version": 0.4
148 | }
--------------------------------------------------------------------------------
/examples/Chat_with_text_workflow_polished.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IuvenisSapiens/ComfyUI_MiniCPM-V-2_6-int4/763f95133a95b611977efaa05ad654ff410670a9/examples/Chat_with_text_workflow_polished.png
--------------------------------------------------------------------------------
/examples/Chat_with_video_workflow_legacy.json:
--------------------------------------------------------------------------------
1 | {
2 | "last_node_id": 53,
3 | "last_link_id": 63,
4 | "nodes": [
5 | {
6 | "id": 7,
7 | "type": "Note",
8 | "pos": [
9 | -480,
10 | -284
11 | ],
12 | "size": {
13 | "0": 724.4190673828125,
14 | "1": 79.42505645751953
15 | },
16 | "flags": {
17 | "collapsed": false
18 | },
19 | "order": 0,
20 | "mode": 0,
21 | "properties": {
22 | "text": ""
23 | },
24 | "widgets_values": [
25 | "当 MiniCPM VQA 同时接收到图像和视频信息时,它会仅处理视频信息而忽略图像信息。如果您想处理图像信息,请断开视频信息的输入。\n\nWhen MiniCPM VQA simultaneously receives both image and video information, it processes only the video information while ignoring the image information. If you want to process the image information, please disconnect the input of the video information."
26 | ],
27 | "color": "#432",
28 | "bgcolor": "#653"
29 | },
30 | {
31 | "id": 50,
32 | "type": "LoadVideo",
33 | "pos": [
34 | -568,
35 | -158
36 | ],
37 | "size": [
38 | 469.4351950653901,
39 | 397.9642639160156
40 | ],
41 | "flags": {},
42 | "order": 1,
43 | "mode": 0,
44 | "outputs": [
45 | {
46 | "name": "PATH",
47 | "type": "PATH",
48 | "links": [
49 | 62
50 | ],
51 | "slot_index": 0,
52 | "shape": 3
53 | }
54 | ],
55 | "properties": {
56 | "Node name for S&R": "LoadVideo"
57 | },
58 | "widgets_values": [
59 | "AnimateDiff_00002.mp4",
60 | "Video",
61 | {
62 | "hidden": false,
63 | "paused": false,
64 | "params": {}
65 | }
66 | ]
67 | },
68 | {
69 | "id": 51,
70 | "type": "DisplayText",
71 | "pos": [
72 | -558,
73 | 287
74 | ],
75 | "size": [
76 | 861.4327343832845,
77 | 107.98898362105666
78 | ],
79 | "flags": {},
80 | "order": 3,
81 | "mode": 0,
82 | "inputs": [
83 | {
84 | "name": "text",
85 | "type": "STRING",
86 | "link": 63,
87 | "widget": {
88 | "name": "text"
89 | }
90 | }
91 | ],
92 | "outputs": [
93 | {
94 | "name": "STRING",
95 | "type": "STRING",
96 | "links": null,
97 | "shape": 6
98 | }
99 | ],
100 | "properties": {
101 | "Node name for S&R": "DisplayText"
102 | },
103 | "widgets_values": [
104 | "",
105 | "The video captures a serene sunset scene. The sky is painted with warm hues of orange and red, creating a dramatic backdrop as the sun descends towards the horizon. Silhouetted against this vibrant sky is a solitary tree, its bare branches reaching upwards. A flock of birds can be seen in mid-flight, their dark silhouettes contrasting sharply against the glowing orb of the setting sun. The overall atmosphere is one of tranquility and natural beauty, as the day transitions into night."
106 | ]
107 | },
108 | {
109 | "id": 53,
110 | "type": "MiniCPM_VQA",
111 | "pos": [
112 | -89,
113 | -159
114 | ],
115 | "size": {
116 | "0": 400,
117 | "1": 400
118 | },
119 | "flags": {},
120 | "order": 2,
121 | "mode": 0,
122 | "inputs": [
123 | {
124 | "name": "source_video_path",
125 | "type": "PATH",
126 | "link": 62
127 | },
128 | {
129 | "name": "source_image_path_1st",
130 | "type": "IMAGE",
131 | "link": null
132 | },
133 | {
134 | "name": "source_image_path_2nd",
135 | "type": "IMAGE",
136 | "link": null
137 | },
138 | {
139 | "name": "source_image_path_3rd",
140 | "type": "IMAGE",
141 | "link": null
142 | }
143 | ],
144 | "outputs": [
145 | {
146 | "name": "STRING",
147 | "type": "STRING",
148 | "links": [
149 | 63
150 | ],
151 | "shape": 3,
152 | "slot_index": 0
153 | }
154 | ],
155 | "properties": {
156 | "Node name for S&R": "MiniCPM_VQA"
157 | },
158 | "widgets_values": [
159 | "Describe the video in detail",
160 | "MiniCPM-V-2_6-int4",
161 | false,
162 | 0.8,
163 | 100,
164 | 0.7,
165 | 1.05,
166 | 2048,
167 | 64,
168 | 2,
169 | 1617,
170 | "randomize"
171 | ]
172 | }
173 | ],
174 | "links": [
175 | [
176 | 62,
177 | 50,
178 | 0,
179 | 53,
180 | 0,
181 | "PATH"
182 | ],
183 | [
184 | 63,
185 | 53,
186 | 0,
187 | 51,
188 | 0,
189 | "STRING"
190 | ]
191 | ],
192 | "groups": [],
193 | "config": {},
194 | "extra": {
195 | "ds": {
196 | "scale": 0.9646149645000006,
197 | "offset": [
198 | 896.8108299009335,
199 | 348.18733398490764
200 | ]
201 | }
202 | },
203 | "version": 0.4
204 | }
--------------------------------------------------------------------------------
/examples/Chat_with_video_workflow_legacy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IuvenisSapiens/ComfyUI_MiniCPM-V-2_6-int4/763f95133a95b611977efaa05ad654ff410670a9/examples/Chat_with_video_workflow_legacy.png
--------------------------------------------------------------------------------
/examples/Chat_with_video_workflow_polished.json:
--------------------------------------------------------------------------------
1 | {
2 | "last_node_id": 55,
3 | "last_link_id": 66,
4 | "nodes": [
5 | {
6 | "id": 7,
7 | "type": "Note",
8 | "pos": [
9 | -428,
10 | -256
11 | ],
12 | "size": {
13 | "0": 724.4190673828125,
14 | "1": 79.42505645751953
15 | },
16 | "flags": {
17 | "collapsed": false
18 | },
19 | "order": 1,
20 | "mode": 0,
21 | "properties": {
22 | "text": ""
23 | },
24 | "widgets_values": [
25 | "当 MiniCPM VQA 同时接收到图像和视频信息时,它会仅处理视频信息而忽略图像信息。如果您想处理图像信息,请断开视频信息的输入。\n\nWhen MiniCPM VQA simultaneously receives both image and video information, it processes only the video information while ignoring the image information. If you want to process the image information, please disconnect the input of the video information."
26 | ],
27 | "color": "#432",
28 | "bgcolor": "#653"
29 | },
30 | {
31 | "id": 51,
32 | "type": "DisplayText",
33 | "pos": [
34 | -471,
35 | 277
36 | ],
37 | "size": [
38 | 813.5119048309946,
39 | 141.5117891133587
40 | ],
41 | "flags": {},
42 | "order": 3,
43 | "mode": 0,
44 | "inputs": [
45 | {
46 | "name": "text",
47 | "type": "STRING",
48 | "link": 65,
49 | "widget": {
50 | "name": "text"
51 | }
52 | }
53 | ],
54 | "outputs": [
55 | {
56 | "name": "STRING",
57 | "type": "STRING",
58 | "links": null,
59 | "shape": 6
60 | }
61 | ],
62 | "properties": {
63 | "Node name for S&R": "DisplayText"
64 | },
65 | "widgets_values": [
66 | "",
67 | "The video captures a serene sunset scene with the sun descending towards the horizon. The sky is painted in vibrant hues of orange, yellow, and red, creating a warm gradient that transitions from fiery tones near the setting sun to softer shades as it moves upward. Silhouetted against this colorful backdrop is a solitary tree, its bare branches reaching out into the sky, adding a stark contrast to the vivid colors. A flock of birds can be seen flying across the sky, their dark shapes moving dynamically through the air, contributing to the sense of movement within the otherwise tranquil setting. The overall atmosphere is one of peacefulness and natural beauty, emphasizing the fleeting yet mesmerizing moments of twilight."
68 | ]
69 | },
70 | {
71 | "id": 54,
72 | "type": "MiniCPM_VQA_Polished",
73 | "pos": [
74 | -57,
75 | -129
76 | ],
77 | "size": {
78 | "0": 400,
79 | "1": 360
80 | },
81 | "flags": {},
82 | "order": 2,
83 | "mode": 0,
84 | "inputs": [
85 | {
86 | "name": "source_video_path",
87 | "type": "PATH",
88 | "link": 66
89 | },
90 | {
91 | "name": "source_image_path",
92 | "type": "IMAGE",
93 | "link": null
94 | }
95 | ],
96 | "outputs": [
97 | {
98 | "name": "STRING",
99 | "type": "STRING",
100 | "links": [
101 | 65
102 | ],
103 | "shape": 3,
104 | "slot_index": 0
105 | }
106 | ],
107 | "properties": {
108 | "Node name for S&R": "MiniCPM_VQA_Polished"
109 | },
110 | "widgets_values": [
111 | "Describe the video in detail",
112 | "MiniCPM-V-2_6-int4",
113 | true,
114 | 0.8,
115 | 100,
116 | 0.7,
117 | 1.05,
118 | 2048,
119 | 64,
120 | 2,
121 | 1746,
122 | "randomize"
123 | ]
124 | },
125 | {
126 | "id": 55,
127 | "type": "LoadVideo",
128 | "pos": [
129 | -479,
130 | -128
131 | ],
132 | "size": [
133 | 409.98541024642964,
134 | 358.0214538574219
135 | ],
136 | "flags": {},
137 | "order": 0,
138 | "mode": 0,
139 | "outputs": [
140 | {
141 | "name": "PATH",
142 | "type": "PATH",
143 | "links": [
144 | 66
145 | ],
146 | "shape": 3
147 | }
148 | ],
149 | "properties": {
150 | "Node name for S&R": "LoadVideo"
151 | },
152 | "widgets_values": [
153 | "AnimateDiff_00002.mp4",
154 | "Video",
155 | {
156 | "hidden": false,
157 | "paused": false,
158 | "params": {}
159 | }
160 | ]
161 | }
162 | ],
163 | "links": [
164 | [
165 | 65,
166 | 54,
167 | 0,
168 | 51,
169 | 0,
170 | "STRING"
171 | ],
172 | [
173 | 66,
174 | 55,
175 | 0,
176 | 54,
177 | 0,
178 | "PATH"
179 | ]
180 | ],
181 | "groups": [],
182 | "config": {},
183 | "extra": {
184 | "ds": {
185 | "scale": 0.9646149645000013,
186 | "offset": [
187 | 829.9398344807752,
188 | 321.3710639555439
189 | ]
190 | }
191 | },
192 | "version": 0.4
193 | }
--------------------------------------------------------------------------------
/examples/Chat_with_video_workflow_polished.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IuvenisSapiens/ComfyUI_MiniCPM-V-2_6-int4/763f95133a95b611977efaa05ad654ff410670a9/examples/Chat_with_video_workflow_polished.png
--------------------------------------------------------------------------------
/examples/ComfyUI_00508_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IuvenisSapiens/ComfyUI_MiniCPM-V-2_6-int4/763f95133a95b611977efaa05ad654ff410670a9/examples/ComfyUI_00508_.png
--------------------------------------------------------------------------------
/examples/ComfyUI_00509_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IuvenisSapiens/ComfyUI_MiniCPM-V-2_6-int4/763f95133a95b611977efaa05ad654ff410670a9/examples/ComfyUI_00509_.png
--------------------------------------------------------------------------------
/examples/ComfyUI_00532_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IuvenisSapiens/ComfyUI_MiniCPM-V-2_6-int4/763f95133a95b611977efaa05ad654ff410670a9/examples/ComfyUI_00532_.png
--------------------------------------------------------------------------------
/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IuvenisSapiens/ComfyUI_MiniCPM-V-2_6-int4/763f95133a95b611977efaa05ad654ff410670a9/favicon.ico
--------------------------------------------------------------------------------
/image_nodes.py:
--------------------------------------------------------------------------------
class MultipleImagesInput:
    """Collects a variable number of IMAGE inputs and emits one batched IMAGE."""

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "inputcount": ("INT", {"default": 2, "min": 2, "max": 1000, "step": 1}),
                "image_1": ("IMAGE",),
                "image_2": ("IMAGE",),
            },
        }

    RETURN_TYPES = ("IMAGE",)
    RETURN_NAMES = ("images",)
    FUNCTION = "combine"
    CATEGORY = "Comfyui_MiniCPM-V-2_6-int4"
    DESCRIPTION = """
Creates an image batch from multiple images.
You can set how many inputs the node has,
with the **inputcount** and clicking update.
"""

    def combine(self, inputcount, **kwargs):
        """Fold image_1 .. image_<inputcount> into one batch via ImageBatch."""
        from nodes import ImageBatch

        batcher = ImageBatch()
        combined = kwargs["image_1"]
        # Merge each remaining input into the running batch, two at a time.
        for slot in range(2, inputcount + 1):
            (combined,) = batcher.batch(combined, kwargs[f"image_{slot}"])
        return (combined,)
--------------------------------------------------------------------------------
/nodes_legacy.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import folder_paths
4 | from transformers import AutoTokenizer, AutoModel
5 | from torchvision.transforms.v2 import ToPILImage
6 | from decord import VideoReader, cpu # pip install decord
7 | from PIL import Image
8 |
9 |
class MiniCPM_VQA:
    """ComfyUI node: visual question answering with MiniCPM-V int4 models.

    Takes a text prompt plus either an optional video path or up to three
    optional images, and returns the model's chat reply as a single string.
    """

    def __init__(self):
        self.model_checkpoint = None  # resolved per-call in inference()
        self.tokenizer = None
        self.model = None
        self.device = (
            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        )
        # bf16 requires CUDA compute capability >= 8 (Ampere or newer).
        self.bf16_support = (
            torch.cuda.is_available()
            and torch.cuda.get_device_capability(self.device)[0] >= 8
        )

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "text": ("STRING", {"default": "", "multiline": True}),
                "model": (
                    ["MiniCPM-V-2_6-int4", "MiniCPM-Llama3-V-2_5-int4"],
                    {"default": "MiniCPM-V-2_6-int4"},
                ),
                "keep_model_loaded": ("BOOLEAN", {"default": False}),
                "top_p": ("FLOAT", {"default": 0.8}),
                "top_k": ("INT", {"default": 100}),
                "temperature": (
                    "FLOAT",
                    {"default": 0.7, "min": 0, "max": 1, "step": 0.1},
                ),
                "repetition_penalty": ("FLOAT", {"default": 1.05}),
                "max_new_tokens": ("INT", {"default": 2048}),
                # If CUDA runs out of memory, set a smaller number.
                "video_max_num_frames": ("INT", {"default": 64}),
                # Use 1 if CUDA OOMs and video resolution > 448*448.
                "video_max_slice_nums": ("INT", {"default": 2}),
                # -1 means "do not reseed".
                "seed": ("INT", {"default": -1}),
            },
            "optional": {
                "source_video_path": ("PATH",),
                "source_image_path_1st": ("IMAGE",),
                "source_image_path_2nd": ("IMAGE",),
                "source_image_path_3rd": ("IMAGE",),
            },
        }

    RETURN_TYPES = ("STRING",)
    FUNCTION = "inference"
    CATEGORY = "Comfyui_MiniCPM-V-2_6-int4"

    def encode_video(self, source_video_path, MAX_NUM_FRAMES):
        """Decode a video into at most MAX_NUM_FRAMES PIL images, sampled ~1 fps."""

        def uniform_sample(l, n):  # noqa: E741
            # Evenly pick n indices from l (the center of each of n buckets).
            gap = len(l) / n
            idxs = [int(i * gap + gap / 2) for i in range(n)]
            return [l[i] for i in idxs]

        vr = VideoReader(source_video_path, ctx=cpu(0))
        total_frames = len(vr)  # fix: previously printed len(vr) + 1
        print("Total frames:", total_frames)
        avg_fps = vr.get_avg_fps()
        print("Get average FPS(frame per second):", avg_fps)
        sample_fps = round(avg_fps / 1)  # step that yields ~1 sampled frame/second
        duration = len(vr) / avg_fps
        print("Total duration:", duration, "seconds")
        width = vr[0].shape[1]
        height = vr[0].shape[0]
        print("Video resolution(width x height):", width, "x", height)

        frame_idx = list(range(0, len(vr), sample_fps))
        if len(frame_idx) > MAX_NUM_FRAMES:
            frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
        frames = vr.get_batch(frame_idx).asnumpy()
        frames = [Image.fromarray(v.astype("uint8")) for v in frames]
        print("num frames:", len(frames))
        return frames

    def inference(
        self,
        text,
        model,
        keep_model_loaded,
        top_p,
        top_k,
        temperature,
        repetition_penalty,
        max_new_tokens,
        video_max_num_frames,
        video_max_slice_nums,
        seed,
        source_image_path_1st=None,
        source_image_path_2nd=None,
        source_image_path_3rd=None,
        source_video_path=None,
    ):
        """Run the model on the prompt plus optional video/images.

        Downloads the checkpoint on first use and lazily loads the tokenizer
        and model; when keep_model_loaded is False, both are released after
        the call. Returns a one-tuple with the reply string.
        """
        if seed != -1:
            torch.manual_seed(seed)
        model_id = f"openbmb/{model}"
        self.model_checkpoint = os.path.join(
            folder_paths.models_dir, "prompt_generator", os.path.basename(model_id)
        )

        if not os.path.exists(self.model_checkpoint):
            from huggingface_hub import snapshot_download

            # NOTE(review): local_dir_use_symlinks is deprecated in recent
            # huggingface_hub releases — confirm against the installed version.
            snapshot_download(
                repo_id=model_id,
                local_dir=self.model_checkpoint,
                local_dir_use_symlinks=False,
            )

        if self.tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_checkpoint,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
            )
        if self.model is None:
            self.model = AutoModel.from_pretrained(
                self.model_checkpoint,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                attn_implementation="sdpa",
                torch_dtype=torch.bfloat16 if self.bf16_support else torch.float16,
            )

        with torch.no_grad():
            if source_video_path:
                frames = self.encode_video(source_video_path, video_max_num_frames)
                msgs = [{"role": "user", "content": frames + [text]}]
            else:
                # Replaces the former eight-way if/elif ladder: keep whichever
                # image slots were supplied, in slot order, then append the text.
                # With no images this degenerates to [text], as before.
                images = [
                    ToPILImage()(img.permute([0, 3, 1, 2])[0]).convert("RGB")
                    for img in (
                        source_image_path_1st,
                        source_image_path_2nd,
                        source_image_path_3rd,
                    )
                    if img is not None
                ]
                msgs = [{"role": "user", "content": images + [text]}]

            params = {"use_image_id": False, "max_slice_nums": video_max_slice_nums}

            result = self.model.chat(
                image=None,
                msgs=msgs,
                tokenizer=self.tokenizer,
                sampling=True,
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                max_new_tokens=max_new_tokens,
                **params,
            )

        if not keep_model_loaded:
            # Drop references and flush CUDA caches so VRAM is returned.
            del self.tokenizer
            del self.model
            self.tokenizer = None
            self.model = None
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

        return (result,)
280 |
--------------------------------------------------------------------------------
/nodes_polished.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import folder_paths
4 | from transformers import AutoTokenizer, AutoModel
5 | from torchvision.transforms.v2 import ToPILImage
6 | from decord import VideoReader, cpu # pip install decord
7 | from PIL import Image
8 |
9 |
class MiniCPM_VQA_Polished:
    """ComfyUI node: MiniCPM-V VQA over a text prompt plus an optional video
    or an arbitrary-size batch of images (polished single-image-input API).
    """

    def __init__(self):
        self.model_checkpoint = None  # resolved per-call in inference()
        self.tokenizer = None
        self.model = None
        self.device = (
            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        )
        # bf16 requires CUDA compute capability >= 8 (Ampere or newer).
        self.bf16_support = (
            torch.cuda.is_available()
            and torch.cuda.get_device_capability(self.device)[0] >= 8
        )

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "text": ("STRING", {"default": "", "multiline": True}),
                "model": (
                    ["MiniCPM-V-2_6-int4", "MiniCPM-Llama3-V-2_5-int4"],
                    {"default": "MiniCPM-V-2_6-int4"},
                ),
                "keep_model_loaded": ("BOOLEAN", {"default": False}),
                "top_p": ("FLOAT", {"default": 0.8}),
                "top_k": ("INT", {"default": 100}),
                "temperature": (
                    "FLOAT",
                    {"default": 0.7, "min": 0, "max": 1, "step": 0.1},
                ),
                "repetition_penalty": ("FLOAT", {"default": 1.05}),
                "max_new_tokens": ("INT", {"default": 2048}),
                # If CUDA runs out of memory, set a smaller number.
                "video_max_num_frames": ("INT", {"default": 64}),
                # Use 1 if CUDA OOMs and video resolution > 448*448.
                "video_max_slice_nums": ("INT", {"default": 2}),
                # -1 means "do not reseed".
                "seed": ("INT", {"default": -1}),
            },
            "optional": {
                "source_video_path": ("PATH",),
                "source_image_path": ("IMAGE",),
            },
        }

    RETURN_TYPES = ("STRING",)
    FUNCTION = "inference"
    CATEGORY = "Comfyui_MiniCPM-V-2_6-int4"

    def encode_video(self, source_video_path, MAX_NUM_FRAMES):
        """Decode a video into at most MAX_NUM_FRAMES PIL images, sampled ~1 fps."""

        def uniform_sample(l, n):  # noqa: E741
            # Evenly pick n indices from l (the center of each of n buckets).
            gap = len(l) / n
            idxs = [int(i * gap + gap / 2) for i in range(n)]
            return [l[i] for i in idxs]

        vr = VideoReader(source_video_path, ctx=cpu(0))
        total_frames = len(vr)  # fix: previously printed len(vr) + 1
        print("Total frames:", total_frames)
        avg_fps = vr.get_avg_fps()
        print("Get average FPS(frame per second):", avg_fps)
        sample_fps = round(avg_fps / 1)  # step that yields ~1 sampled frame/second
        duration = len(vr) / avg_fps
        print("Total duration:", duration, "seconds")
        width = vr[0].shape[1]
        height = vr[0].shape[0]
        print("Video resolution(width x height):", width, "x", height)

        frame_idx = list(range(0, len(vr), sample_fps))
        if len(frame_idx) > MAX_NUM_FRAMES:
            frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
        frames = vr.get_batch(frame_idx).asnumpy()
        frames = [Image.fromarray(v.astype("uint8")) for v in frames]
        print("num frames:", len(frames))
        return frames

    def inference(
        self,
        text,
        model,
        keep_model_loaded,
        top_p,
        top_k,
        temperature,
        repetition_penalty,
        max_new_tokens,
        video_max_num_frames,
        video_max_slice_nums,
        seed,
        source_image_path=None,
        source_video_path=None,
    ):
        """Run the model on the prompt plus optional video/image batch.

        Downloads the checkpoint on first use and lazily loads the tokenizer
        and model; when keep_model_loaded is False, both are released after
        the call. Returns a one-tuple with the reply string.
        """
        if seed != -1:
            torch.manual_seed(seed)
        model_id = f"openbmb/{model}"
        self.model_checkpoint = os.path.join(
            folder_paths.models_dir, "prompt_generator", os.path.basename(model_id)
        )

        if not os.path.exists(self.model_checkpoint):
            from huggingface_hub import snapshot_download

            # NOTE(review): local_dir_use_symlinks is deprecated in recent
            # huggingface_hub releases — confirm against the installed version.
            snapshot_download(
                repo_id=model_id,
                local_dir=self.model_checkpoint,
                local_dir_use_symlinks=False,
            )

        if self.tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_checkpoint,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
            )

        if self.model is None:
            self.model = AutoModel.from_pretrained(
                self.model_checkpoint,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                attn_implementation="sdpa",
                torch_dtype=torch.bfloat16 if self.bf16_support else torch.float16,
            )

        with torch.no_grad():
            if source_video_path:
                print("source_video_path:", source_video_path)
                frames = self.encode_video(source_video_path, video_max_num_frames)
                msgs = [{"role": "user", "content": frames + [text]}]
            elif source_image_path is not None:
                # IMAGE tensors arrive as (batch, H, W, C); convert each frame
                # to a PIL RGB image so the whole batch goes into one message.
                images = source_image_path.permute([0, 3, 1, 2])
                images = [ToPILImage()(img).convert("RGB") for img in images]
                msgs = [{"role": "user", "content": images + [text]}]
            else:
                # Text-only chat.
                msgs = [{"role": "user", "content": [text]}]

            params = {"use_image_id": False, "max_slice_nums": video_max_slice_nums}

            result = self.model.chat(
                image=None,
                msgs=msgs,
                tokenizer=self.tokenizer,
                sampling=True,
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                max_new_tokens=max_new_tokens,
                **params,
            )

        if not keep_model_loaded:
            # Drop references and flush CUDA caches so VRAM is returned.
            del self.tokenizer
            del self.model
            self.tokenizer = None
            self.model = None
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

        return (result,)
202 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "ComfyUI_MiniCPM-V-2_6-int4"
3 | description = "This is an implementation of [MiniCPM-V-2_6-int4](https://github.com/OpenBMB/MiniCPM-V) by [ComfyUI](https://github.com/comfyanonymous/ComfyUI), including support for text-based queries, video queries, single-image queries, and multi-image queries to generate captions or responses."
4 | version = "1.0.0"
5 | license = { file = "LICENSE" }
6 | dependencies = ["torch", "torchvision", "numpy", "pillow", "huggingface_hub", "transformers", "decord", "bitsandbytes","accelerate"]
7 |
8 | [project.urls]
9 | Repository = "https://github.com/IuvenisSapiens/ComfyUI_MiniCPM-V-2_6-int4"
10 |
11 | [tool.comfy]
12 | PublisherId = "IuvenisSapiens"
13 | DisplayName = "ComfyUI_MiniCPM-V-2_6-int4"
14 | Icon = "favicon.ico"
15 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | torchvision
3 | torchaudio
4 | numpy
5 | pillow
6 | huggingface_hub
7 | transformers
8 | decord
9 | bitsandbytes
10 | accelerate
--------------------------------------------------------------------------------
/util_nodes.py:
--------------------------------------------------------------------------------
1 | import os
2 | import folder_paths
3 | current_dir = os.path.dirname(os.path.abspath(__file__))
4 | input_dir = folder_paths.get_input_directory()
5 | output_dir = folder_paths.get_output_directory()
6 |
class LoadVideo:
    """Pick a video file from the ComfyUI input directory and output its path."""

    @classmethod
    def INPUT_TYPES(s):
        # Case-insensitive extension check (fix: ".MP4" etc. were skipped before).
        video_extensions = ("mp4", "mkv", "mov", "avi", "flv", "wmv", "webm", "m4v")
        files = [
            f
            for f in os.listdir(input_dir)
            if os.path.isfile(os.path.join(input_dir, f))
            and f.rsplit(".", 1)[-1].lower() in video_extensions
        ]
        return {"required": {
            "video": (files,),
        }}

    CATEGORY = "Comfyui_MiniCPM-V-2_6-int4"
    DESCRIPTION = "Load Video"

    RETURN_TYPES = ("PATH",)

    OUTPUT_NODE = False

    FUNCTION = "load_video"

    def load_video(self, video):
        """Resolve the selected filename to a full path inside input_dir."""
        video_path = os.path.join(input_dir, video)
        return (video_path,)
27 |
class PreviewVideo:
    """Output-only node: splits a video path into UI data for the frontend
    video preview widget (see web/js/previewVideo.js)."""

    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "video": ("PATH",),
        }}

    CATEGORY = "Comfyui_MiniCPM-V-2_6-int4"
    # Fix: was "Load Video" (copy-paste from LoadVideo); this node previews.
    DESCRIPTION = "Preview Video"

    RETURN_TYPES = ()

    OUTPUT_NODE = True

    FUNCTION = "load_video"

    def load_video(self, video):
        """Return {"ui": {"video": [filename, parent-dir name]}} for the widget."""
        video_name = os.path.basename(video)
        video_path_name = os.path.basename(os.path.dirname(video))
        return {"ui": {"video": [video_name, video_path_name]}}
48 |
--------------------------------------------------------------------------------
/web/js/displayText.js:
--------------------------------------------------------------------------------
1 | const app = window.comfyAPI.app.app;
2 | const ComfyWidgets = window.comfyAPI.widgets.ComfyWidgets;
3 |
app.registerExtension({
    name: "Comfyui_MiniCPM-V-2_6-int4.DisplayTextNode",
    async beforeRegisterNodeDef(nodeType, nodeData, app) {
        if (nodeData.name === "DisplayText") {
            // Rebuild the node's read-only text widgets from `text` (an array
            // of strings), keeping widget 0 (the node's own input) intact.
            function populate(text) {
                if (this.widgets) {
                    // Tear down every widget after the first before re-adding.
                    for (let i = 1; i < this.widgets.length; i++) {
                        this.widgets[i].onRemove?.();
                    }
                    this.widgets.length = 1;
                }

                const v = [...text];
                if (!v[0]) {
                    // Drop a leading empty entry so no blank widget is created.
                    v.shift();
                }
                // One read-only multiline STRING widget per entry.
                for (const list of v) {
                    const w = ComfyWidgets["STRING"](this, "text", ["STRING", { multiline: true }], app).widget;
                    w.inputEl.readOnly = true;
                    w.inputEl.style.opacity = 0.6;
                    w.value = list;
                }

                // After layout, grow the node to fit (never shrink it).
                requestAnimationFrame(() => {
                    const sz = this.computeSize();
                    if (sz[0] < this.size[0]) {
                        sz[0] = this.size[0];
                    }
                    if (sz[1] < this.size[1]) {
                        sz[1] = this.size[1];
                    }
                    this.onResize?.(sz);
                    app.graph.setDirtyCanvas(true, false);
                });
            }

            // Display the text each execution produces.
            const onExecuted = nodeType.prototype.onExecuted;
            nodeType.prototype.onExecuted = function (message) {
                onExecuted?.apply(this, arguments);
                populate.call(this, message.text);
            };

            // Restore the displayed text when a saved workflow is loaded.
            const onConfigure = nodeType.prototype.onConfigure;
            nodeType.prototype.onConfigure = function () {
                onConfigure?.apply(this, arguments);
                if (this.widgets_values?.length) {
                    populate.call(this, this.widgets_values.slice(+this.widgets_values.length > 1));
                }
            };
        }
    },
});
56 |
--------------------------------------------------------------------------------
/web/js/multipleImagesInput.js:
--------------------------------------------------------------------------------
1 | const app = window.comfyAPI.app.app;
2 |
app.registerExtension({
    name: "Comfyui_MiniCPM-V-2_6-int4.MultipleImagesInput",
    async beforeRegisterNodeDef(nodeType, nodeData, app) {
        if (!nodeData?.category?.startsWith("Comfyui_MiniCPM-V-2_6-int4")) {
            return;
        }
        switch (nodeData.name) {
            case "MultipleImagesInput":
                nodeType.prototype.onNodeCreated = function () {
                    this._type = "IMAGE";
                    this.inputs_offset = nodeData.name.includes("selective") ? 1 : 0;
                    // Button that grows/shrinks the image_N input sockets to
                    // match the `inputcount` widget value.
                    this.addWidget("button", "Update inputs", null, () => {
                        if (!this.inputs) {
                            this.inputs = [];
                        }
                        const target_number_of_inputs = this.widgets.find(
                            (w) => w.name === "inputcount"
                        )["value"];
                        if (target_number_of_inputs === this.inputs.length) return; // already set, do nothing

                        if (target_number_of_inputs < this.inputs.length) {
                            // Fix: start at the last valid slot (length - 1);
                            // the old loop began at `length`, passing an
                            // out-of-range index to removeInput first.
                            for (
                                let i = this.inputs.length - 1;
                                i >= this.inputs_offset + target_number_of_inputs;
                                i--
                            )
                                this.removeInput(i);
                        } else {
                            for (
                                let i = this.inputs.length + 1 - this.inputs_offset;
                                i <= target_number_of_inputs;
                                ++i
                            )
                                this.addInput(`image_${i}`, this._type);
                        }
                    });
                };
                break;
        }
    },
    async setup() {
        // Keep SetNode/GetNode visible while they draw their virtual links,
        // even when offscreen culling would normally skip them.
        const originalComputeVisibleNodes =
            LGraphCanvas.prototype.computeVisibleNodes;
        LGraphCanvas.prototype.computeVisibleNodes = function () {
            const visibleNodesSet = new Set(
                originalComputeVisibleNodes.apply(this, arguments)
            );
            for (const node of this.graph._nodes) {
                if (
                    (node.type === "SetNode" || node.type === "GetNode") &&
                    node.drawConnection
                ) {
                    visibleNodesSet.add(node);
                }
            }
            return Array.from(visibleNodesSet);
        };
    },
});
62 |
--------------------------------------------------------------------------------
/web/js/previewVideo.js:
--------------------------------------------------------------------------------
1 | const app = window.comfyAPI.app.app;
2 | const api = window.comfyAPI.api.api;
3 |
function fitHeight(node) {
    // Recompute the node's height at its current width; width stays fixed.
    const width = node.size[0];
    const height = node.computeSize([node.size[0], node.size[1]])[1];
    node.setSize([width, height]);
    node?.graph?.setDirtyCanvas(true);
}
function chainCallback(object, property, callback) {
    // Append `callback` to run after any handler already on object[property],
    // preserving the original handler's return value.
    if (object == undefined) {
        //This should not happen.
        console.error("Tried to add callback to non-existant object")
        return;
    }
    if (property in object) {
        const original = object[property];
        object[property] = function (...args) {
            const result = original.apply(this, args);
            callback.apply(this, args);
            return result;
        };
    } else {
        object[property] = callback;
    }
}
25 |
// Add "Open preview" / "Save preview" entries to the node's context menu
// whenever its videopreview widget has a loaded, visible source.
function addPreviewOptions(nodeType) {
    chainCallback(nodeType.prototype, "getExtraMenuOptions", function(_, options) {
        // The intended way of appending options is returning a list of extra options,
        // but this isn't used in widgetInputs.js and would require
        // less generalization of chainCallback
        let optNew = []
        try {
            const previewWidget = this.widgets.find((w) => w.name === "videopreview");

            let url = null
            if (previewWidget.videoEl?.hidden == false && previewWidget.videoEl.src) {
                //Use full quality video
                //url = api.apiURL('/view?' + new URLSearchParams(previewWidget.value.params));
                url = previewWidget.videoEl.src
            }
            if (url) {
                optNew.push(
                    {
                        content: "Open preview",
                        callback: () => {
                            window.open(url, "_blank")
                        },
                    },
                    {
                        content: "Save preview",
                        callback: () => {
                            // Trigger a download via a temporary anchor element.
                            const a = document.createElement("a");
                            a.href = url;
                            a.setAttribute("download", new URLSearchParams(previewWidget.value.params).get("filename"));
                            document.body.append(a);
                            a.click();
                            requestAnimationFrame(() => a.remove());
                        },
                    }
                );
            }
            // Insert a separator between the new entries and existing items.
            if(options.length > 0 && options[0] != null && optNew.length > 0) {
                optNew.push(null);
            }
            options.unshift(...optNew);

        } catch (error) {
            console.log(error);
        }

    });
}
// Build a DOM <video> preview widget on `node`, streaming `file` of the given
// ComfyUI resource `type` from the /view endpoint.
function previewVideo(node, file, type) {
    var element = document.createElement("div");
    const previewNode = node;
    var previewWidget = node.addDOMWidget("videopreview", "preview", element, {
        serialize: false,
        hideOnZoom: false,
        getValue() {
            return element.value;
        },
        setValue(v) {
            element.value = v;
        },
    });
    previewWidget.computeSize = function (width) {
        if (this.aspectRatio && !this.parentEl.hidden) {
            let height = (previewNode.size[0] - 20) / this.aspectRatio + 10;
            if (!(height > 0)) {
                height = 0;
            }
            this.computedHeight = height + 10;
            return [width, height];
        }
        return [width, -4]; //no loaded src, widget should not display
    }
    // element.style['pointer-events'] = "none"
    previewWidget.value = { hidden: false, paused: false, params: {} }
    previewWidget.parentEl = document.createElement("div");
    previewWidget.parentEl.className = "video_preview";
    previewWidget.parentEl.style['width'] = "100%"
    element.appendChild(previewWidget.parentEl);
    previewWidget.videoEl = document.createElement("video");
    previewWidget.videoEl.controls = true;
    previewWidget.videoEl.loop = false;
    previewWidget.videoEl.muted = false;
    previewWidget.videoEl.style['width'] = "100%"
    previewWidget.videoEl.addEventListener("loadedmetadata", () => {
        previewWidget.aspectRatio = previewWidget.videoEl.videoWidth / previewWidget.videoEl.videoHeight;
        // Fix: `this` is not the node inside a plain function call; use the
        // captured previewNode instead.
        fitHeight(previewNode);
    });
    previewWidget.videoEl.addEventListener("error", () => {
        // Hide the widget if the video fails to load.
        previewWidget.parentEl.hidden = true;
        fitHeight(previewNode);
    });

    let params = {
        "filename": file,
        "type": type,
    }

    previewWidget.parentEl.hidden = previewWidget.value.hidden;
    previewWidget.videoEl.autoplay = !previewWidget.value.paused && !previewWidget.value.hidden;
    let target_width = 256
    if (element.style?.width) {
        //overscale to allow scrolling. Endpoint won't return higher than native
        target_width = element.style.width.slice(0, -2) * 2;
    }
    if (!params.force_size || params.force_size.includes("?") || params.force_size == "Disabled") {
        params.force_size = target_width + "x?"
    } else {
        let size = params.force_size.split("x")
        let ar = parseInt(size[0]) / parseInt(size[1])
        params.force_size = target_width + "x" + (target_width / ar)
    }

    previewWidget.videoEl.src = api.apiURL('/view?' + new URLSearchParams(params));

    previewWidget.videoEl.hidden = false;
    previewWidget.parentEl.appendChild(previewWidget.videoEl)
}
143 |
app.registerExtension({
    name: "Comfyui_MiniCPM-V-2_6-int4.VideoPreviewer",
    async beforeRegisterNodeDef(nodeType, nodeData, app) {
        // Only decorate the PreviewVideo node.
        if (nodeData?.name != "PreviewVideo") return;
        nodeType.prototype.onExecuted = function (data) {
            // data.video is [filename, type] as emitted by the Python node.
            previewVideo(this, data.video[0], data.video[1]);
        };
    },
});
154 |
--------------------------------------------------------------------------------
/web/js/uploadVideo.js:
--------------------------------------------------------------------------------
1 | const app = window.comfyAPI.app.app;
2 | const api = window.comfyAPI.api.api;
3 | const ComfyWidgets = window.comfyAPI.widgets.ComfyWidgets;
4 |
function fitHeight(node) {
    // Resize the node vertically to its computed height, preserving width.
    const computed = node.computeSize([node.size[0], node.size[1]]);
    node.setSize([node.size[0], computed[1]]);
    node?.graph?.setDirtyCanvas(true);
}
9 |
// Build (or rebuild) a DOM <video> preview widget on `node` for an uploaded
// input-directory file, replacing any previous upload preview.
function previewVideo(node, file) {
    // Drop previously-added preview widgets (first two widgets are the node's own).
    while (node.widgets.length > 2) {
        node.widgets.pop()
    }
    try {
        var el = document.getElementById("uploadVideo");
        el.remove();
    } catch (error) {
        console.log(error);
    }
    var element = document.createElement("div");
    element.id = "uploadVideo";
    const previewNode = node;
    var previewWidget = node.addDOMWidget("videopreview", "preview", element, {
        serialize: false,
        hideOnZoom: false,
        getValue() {
            return element.value;
        },
        setValue(v) {
            element.value = v;
        },
    });
    previewWidget.computeSize = function (width) {
        if (this.aspectRatio && !this.parentEl.hidden) {
            let height = (previewNode.size[0] - 20) / this.aspectRatio + 10;
            if (!(height > 0)) {
                height = 0;
            }
            this.computedHeight = height + 10;
            return [width, height];
        }
        return [width, -4]; //no loaded src, widget should not display
    }
    // element.style['pointer-events'] = "none"
    previewWidget.value = { hidden: false, paused: false, params: {} }
    previewWidget.parentEl = document.createElement("div");
    previewWidget.parentEl.className = "video_preview";
    previewWidget.parentEl.style['width'] = "100%"
    element.appendChild(previewWidget.parentEl);
    previewWidget.videoEl = document.createElement("video");
    previewWidget.videoEl.controls = true;
    previewWidget.videoEl.loop = false;
    previewWidget.videoEl.muted = false;
    previewWidget.videoEl.style['width'] = "100%"
    previewWidget.videoEl.addEventListener("loadedmetadata", () => {
        previewWidget.aspectRatio = previewWidget.videoEl.videoWidth / previewWidget.videoEl.videoHeight;
        // Fix: `this` is not the node inside a plain function call; use the
        // captured previewNode instead.
        fitHeight(previewNode);
    });
    previewWidget.videoEl.addEventListener("error", () => {
        // Hide the widget if the video fails to load.
        previewWidget.parentEl.hidden = true;
        fitHeight(previewNode);
    });

    let params = {
        "filename": file,
        "type": "input",
    }

    previewWidget.parentEl.hidden = previewWidget.value.hidden;
    previewWidget.videoEl.autoplay = !previewWidget.value.paused && !previewWidget.value.hidden;
    let target_width = 256
    if (element.style?.width) {
        //overscale to allow scrolling. Endpoint won't return higher than native
        target_width = element.style.width.slice(0, -2) * 2;
    }
    if (!params.force_size || params.force_size.includes("?") || params.force_size == "Disabled") {
        params.force_size = target_width + "x?"
    } else {
        let size = params.force_size.split("x")
        let ar = parseInt(size[0]) / parseInt(size[1])
        params.force_size = target_width + "x" + (target_width / ar)
    }

    previewWidget.videoEl.src = api.apiURL('/view?' + new URLSearchParams(params));

    previewWidget.videoEl.hidden = false;
    previewWidget.parentEl.appendChild(previewWidget.videoEl)
}
90 |
/**
 * Builds the custom "upload video" widget for a LoadVideo node.
 *
 * Wraps the node's existing "video" combo widget so reads always yield a
 * canonical "subfolder/filename [type]" string, adds a hidden
 * <input type="file"> plus a button widget that uploads the chosen file via
 * the /upload/image endpoint, and refreshes the inline video preview
 * whenever the selection changes.
 *
 * @param {*} node      the LiteGraph node the widget is attached to
 * @param {*} inputName unused; kept for the ComfyWidgets factory signature
 * @param {*} inputData unused; kept for the ComfyWidgets factory signature
 * @param {*} app       unused; kept for the ComfyWidgets factory signature
 * @returns {{widget: *}} the upload button widget, per ComfyWidgets convention
 */
function videoUpload(node, inputName, inputData, app) {
    const videoWidget = node.widgets.find((w) => w.name === "video");
    let uploadWidget;

    // Proxy the widget's value: setters store the raw value, getters
    // normalize an object form ({filename, subfolder, type}) into the
    // "subfolder/filename [type]" string the rest of the UI expects.
    var default_value = videoWidget.value;
    Object.defineProperty(videoWidget, "value", {
        set: function (value) {
            this._real_value = value;
        },

        get: function () {
            let value = "";
            if (this._real_value) {
                value = this._real_value;
            } else {
                return default_value;
            }

            if (value.filename) {
                let real_value = value;
                value = "";
                if (real_value.subfolder) {
                    value = real_value.subfolder + "/";
                }

                value += real_value.filename;

                // Non-"input" locations (e.g. output/temp) are tagged in brackets.
                if (real_value.type && real_value.type !== "input")
                    value += ` [${real_value.type}]`;
            }
            return value;
        }
    });

    // POST the file to the server; on success add it to the combo's option
    // list and (when updateNode is set) select it and refresh the preview.
    async function uploadFile(file, updateNode, pasted = false) {
        try {
            // Wrap file in formdata so it includes filename
            const body = new FormData();
            body.append("image", file);
            if (pasted) body.append("subfolder", "pasted");
            const resp = await api.fetchApi("/upload/image", {
                method: "POST",
                body,
            });

            if (resp.status === 200) {
                const data = await resp.json();
                // Add the file to the dropdown list and update the widget value
                let path = data.name;
                if (data.subfolder) path = data.subfolder + "/" + path;

                if (!videoWidget.options.values.includes(path)) {
                    videoWidget.options.values.push(path);
                }

                if (updateNode) {
                    videoWidget.value = path;
                    previewVideo(node, path);
                }
            } else {
                alert(resp.status + " - " + resp.statusText);
            }
        } catch (error) {
            alert(error);
        }
    }

    const fileInput = document.createElement("input");
    Object.assign(fileInput, {
        type: "file",
        // Extension-based accept tokens: several of the previous entries
        // ("video/mkv", "video/mov", "video/avi", ...) are not registered
        // MIME types, so browsers silently dropped them from the picker
        // filter. Extensions are matched literally and always work.
        accept: ".mp4,.mkv,.mov,.avi,.flv,.wmv,.webm,.m4v",
        style: "display: none",
        onchange: async () => {
            if (fileInput.files.length) {
                await uploadFile(fileInput.files[0], true);
                // Clear the input so re-selecting the same file fires
                // onchange again (otherwise the browser suppresses it).
                fileInput.value = "";
            }
        },
    });
    document.body.append(fileInput);

    // Create the button widget for selecting the files
    uploadWidget = node.addWidget("button", "choose video file to upload", "Video", () => {
        fileInput.click();
    });

    // The button itself carries no state worth saving into the workflow JSON.
    uploadWidget.serialize = false;

    // Show a preview for the initially-selected video, and keep it in sync
    // with combo changes while preserving any pre-existing node callback.
    previewVideo(node, videoWidget.value);
    const cb = node.callback;
    videoWidget.callback = function () {
        previewVideo(node, videoWidget.value);
        if (cb) {
            return cb.apply(this, arguments);
        }
    };

    return { widget: uploadWidget };
}
191 |
// Expose the uploader as a ComfyUI widget type so node definitions can
// request it by name.
ComfyWidgets.VIDEOPLOAD = videoUpload;

app.registerExtension({
    name: "Comfyui_MiniCPM-V-2_6-int4.UploadVideo",
    async beforeRegisterNodeDef(nodeType, nodeData, app) {
        // Only the LoadVideo node gets the upload widget attached.
        if (nodeData?.name !== "LoadVideo") {
            return;
        }
        nodeData.input.required.upload = ["VIDEOPLOAD"];
    },
});
202 |
203 |
--------------------------------------------------------------------------------