├── .github └── workflows │ └── publish.yml ├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── assets ├── meme.jpg ├── octupusy.jpg ├── teaser.webp └── woman.jpg ├── data ├── __init__.py ├── configs │ └── example.yaml ├── data_utils.py ├── dataset_base.py ├── dataset_info.py ├── distributed_iterable_dataset.py ├── interleave_datasets │ ├── __init__.py │ ├── edit_dataset.py │ └── interleave_t2i_dataset.py ├── parquet_utils.py ├── t2i_dataset.py ├── transforms.py ├── video_utils.py └── vlm_dataset.py ├── example_workflows ├── bagel_image_edit.json ├── bagel_image_edit.png ├── bagel_image_understanding.json ├── bagel_image_understanding.png ├── bagel_text_to_image.json └── bagel_text_to_image.png ├── inferencer.py ├── modeling ├── __init__.py ├── autoencoder.py ├── bagel │ ├── __init__.py │ ├── bagel.py │ ├── modeling_utils.py │ ├── qwen2_navit.py │ └── siglip_navit.py ├── qwen2 │ ├── __init__.py │ ├── configuration_qwen2.py │ ├── modeling_qwen2.py │ ├── tokenization_qwen2.py │ └── tokenization_qwen2_fast.py └── siglip │ ├── __init__.py │ ├── configuration_siglip.py │ ├── convert_siglip_to_hf.py │ ├── image_processing_siglip.py │ ├── modeling_siglip.py │ ├── processing_siglip.py │ └── tokenization_siglip.py ├── node.py ├── pyproject.toml └── requirements.txt /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Comfy registry 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | paths: 9 | - "pyproject.toml" 10 | 11 | permissions: 12 | issues: write 13 | 14 | jobs: 15 | publish-node: 16 | name: Publish Custom Node to registry 17 | runs-on: ubuntu-latest 18 | if: ${{ github.repository_owner == 'neverbiasu' }} 19 | steps: 20 | - name: Check out code 21 | uses: actions/checkout@v4 22 | - name: Publish Custom Node 23 | uses: Comfy-Org/publish-node-action@v1 24 | with: 25 | ## Add your own personal access token to your Github Repository secrets and reference it here. 26 | personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # Ruff stuff: 171 | .ruff_cache/ 172 | 173 | # PyPI configuration file 174 | .pypirc 175 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI-Bagel 2 | 3 | A ComfyUI custom node package based on the BAGEL-7B-MoT multimodal model. 4 | 5 | ## About BAGEL 6 | 7 |

8 | ![BAGEL](assets/teaser.webp)
9 |
10 | 11 | BAGEL is an open-source multimodal foundation model with 7B active parameters (14B total) that adopts a Mixture-of-Transformer-Experts (MoT) architecture. It is designed for multimodal understanding and generation tasks, outperforming top-tier open-source VLMs like Qwen2.5-VL and InternVL-2.5 on standard multimodal understanding leaderboards, and delivering text-to-image quality competitive with specialist generators such as SD3. 12 | 13 | ## Features 14 | 15 | - **Text-to-Image Generation**: Generate high-quality images using natural language prompts 16 | - **Image Editing**: Edit existing images based on textual descriptions 17 | - **Image Understanding**: Perform Q&A and analysis on images 18 | - **Reasoning Process Display**: Optionally display the model's reasoning process 19 | - **DFloat11 Quantized Model Support**: Support for DFloat11 quantized version that requires only ~22GB VRAM 20 | 21 | ## Installation 22 | 23 | ### 1. Model Selection and Download 24 | The ComfyUI-Bagel node now supports automatic model selection via dropdown: 25 | - **ByteDance-Seed/BAGEL-7B-MoT**: Original standard model (~80GB VRAM recommended) 26 | - **DFloat11/BAGEL-7B-MoT-DF11**: Quantized model (~22GB VRAM, single 24GB GPU compatible) 27 | 28 | Models will be automatically downloaded to `models/bagel/` when first selected. You can also manually download them: 29 | 30 | #### Standard Model 31 | ```bash 32 | # Clone model using git lfs (recommended) 33 | git lfs install 34 | git clone https://huggingface.co/ByteDance-Seed/BAGEL-7B-MoT models/bagel/BAGEL-7B-MoT 35 | 36 | # Or use huggingface_hub 37 | pip install huggingface_hub 38 | python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='ByteDance-Seed/BAGEL-7B-MoT', local_dir='models/bagel/BAGEL-7B-MoT')" 39 | ``` 40 | 41 | #### DFloat11 Quantized Model (Recommended for single GPU) 42 | ```bash 43 | # Clone DFloat11 quantized model 44 | git clone https://huggingface.co/DFloat11/BAGEL-7B-MoT-DF11 models/bagel/BAGEL-7B-MoT-DF11 45 | 46 | # Or use huggingface_hub 47 | python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='DFloat11/BAGEL-7B-MoT-DF11', local_dir='models/bagel/BAGEL-7B-MoT-DF11')" 48 | ``` 49 | 50 | ### 2. Install Dependencies 51 | Install the required dependencies: 52 | ```bash 53 | pip install -r requirements.txt 54 | ``` 55 | 56 | For DFloat11 quantized model support, also install: 57 | ```bash 58 | pip install dfloat11 59 | ``` 60 | 61 | ### 3. Restart ComfyUI 62 | Restart ComfyUI to load the new nodes. 63 | 64 | ## Workflows 65 | 66 | ### Text-to-Image Generation 67 | ![text to image workflow](example_workflows/bagel_text_to_image.png) 68 | Generate high-quality images from text descriptions. Suitable for creative design and content generation. 69 | 70 | ### Image Editing Workflow 71 | ![image editing workflow](example_workflows/bagel_image_edit.png) 72 | Edit existing images based on textual descriptions, supporting local modifications and style adjustments. 73 | 74 | ### Image Understanding Workflow 75 | ![image understanding workflow](example_workflows/bagel_image_understanding.png) 76 | Analyze and answer questions about image content, suitable for content understanding and information extraction. 
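These example workflows can also be queued without the browser UI. Below is a minimal sketch (not part of this package), assuming a ComfyUI server running locally on the default port 8188 and a workflow JSON exported in API format (the files in `example_workflows/` may need to be re-exported via ComfyUI's "Save (API Format)" option):

```python
# Minimal sketch: queue an example workflow against a local ComfyUI server.
# Assumes http://127.0.0.1:8188 and an API-format workflow JSON.
import json
import urllib.request

with open("example_workflows/bagel_text_to_image.json") as f:
    workflow = json.load(f)

payload = json.dumps({"prompt": workflow}).encode("utf-8")
req = urllib.request.Request(
    "http://127.0.0.1:8188/prompt",
    data=payload,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(resp.read().decode())  # the server responds with a prompt_id on success
```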
77 | 78 | ## Performance Comparison 79 | 80 | | Metric | BAGEL-7B-MoT (Standard Model) | BAGEL-7B-MoT (DFloat11 Quantized Model) | 81 | |--------|-------------------------------|-----------------------------------------| 82 | | Model Size | 29.21 GB | 19.89 GB | 83 | | Peak GPU Memory (1024x1024 image generation) | 30.07 GB | 21.76 GB | 84 | | Generation Time (on an RTX4090 GPU) | 482.95 seconds | 154.39 seconds | 85 | 86 | DFloat11 Quantized Model significantly reduces VRAM requirements and speeds up generation time, making it ideal for single GPU setups. 87 | 88 | ## Related Links 89 | 90 | - [BAGEL Official Paper](https://arxiv.org/abs/2505.14683) 91 | - [BAGEL Model Homepage](https://bagel-ai.org/) 92 | - [Hugging Face Model](https://huggingface.co/ByteDance-Seed/BAGEL-7B-MoT) 93 | - [Online Demo](https://demo.bagel-ai.org/) 94 | - [Discord Community](https://discord.gg/Z836xxzy) 95 | 96 | ## License 97 | 98 | This project is licensed under the Apache 2.0 License. Please refer to the official license terms for the use of the BAGEL model. 99 | 100 | ## Contribution 101 | 102 | Contributions are welcome! Please submit issue reports and feature requests. If you wish to contribute code, please create an issue to discuss your ideas first. 103 | 104 | ## FAQ 105 | 106 | ### 1. VRAM Requirements 107 | The official recommendation for generating a 1024×1024 image is over 80GB GPU memory. However, multi-GPU setups can distribute the memory load. For example: 108 | - **Single GPU**: A100 (40GB) takes approximately 340-380 seconds per image. 109 | - **Multi-GPU**: 3 RTX3090 GPUs (24GB each) complete the task in about 1 minute. 110 | - **Compressed Model**: Using the DFloat11 version requires only 22GB VRAM and can run on a single 24GB GPU, with peak memory usage around 21.76GB (A100) and generation time of approximately 58 seconds. 111 | 112 | For more details, visit the [GitHub issue](https://github.com/ByteDance-Seed/Bagel/issues/4). 113 | 114 | ### 2. Quantized Version 115 | A quantized version of BAGEL is currently under development, which aims to reduce VRAM requirements further. 116 | 117 | ### 3. NameError: 'Qwen2Config' is not defined 118 | This issue is likely related to environment or dependency problems. For more information, refer to [this GitHub issue](https://github.com/neverbiasu/ComfyUI-BAGEL/issues/7). 
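As a quick sanity check for this error, the import below should succeed in the Python environment that runs ComfyUI. This is an illustrative snippet only and assumes a reasonably recent `transformers` release:

```python
# Hypothetical environment check for FAQ #3: verify that the installed
# transformers build exposes Qwen2Config.
import transformers
from transformers import Qwen2Config  # an ImportError here points to an outdated install

print("transformers", transformers.__version__, "-> Qwen2Config available")
```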
119 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | ComfyUI-Bagel - ComfyUI custom node package for the BAGEL multimodal model 3 | """ 4 | 5 | from .node import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS 6 | 7 | # Export node mappings for ComfyUI 8 | __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"] 9 | -------------------------------------------------------------------------------- /assets/meme.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neverbiasu/ComfyUI-BAGEL/777a359273afd21a978ac67ae613c035f18a41a7/assets/meme.jpg -------------------------------------------------------------------------------- /assets/octupusy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neverbiasu/ComfyUI-BAGEL/777a359273afd21a978ac67ae613c035f18a41a7/assets/octupusy.jpg -------------------------------------------------------------------------------- /assets/teaser.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neverbiasu/ComfyUI-BAGEL/777a359273afd21a978ac67ae613c035f18a41a7/assets/teaser.webp -------------------------------------------------------------------------------- /assets/woman.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neverbiasu/ComfyUI-BAGEL/777a359273afd21a978ac67ae613c035f18a41a7/assets/woman.jpg -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /data/configs/example.yaml: -------------------------------------------------------------------------------- 1 | t2i_pretrain: 2 | dataset_names: 3 | - t2i 4 | image_transform_args: 5 | image_stride: 16 6 | max_image_size: 1024 7 | min_image_size: 512 8 | is_mandatory: true 9 | num_used_data: # The sum should be larger that NUM_GPUS x NUM_WORKERS 10 | - 10 11 | weight: 1 12 | 13 | unified_edit: 14 | dataset_names: 15 | - seedxedit_multi 16 | image_transform_args: 17 | image_stride: 16 18 | max_image_size: 1024 19 | min_image_size: 512 20 | vit_image_transform_args: 21 | image_stride: 14 22 | max_image_size: 518 23 | min_image_size: 224 24 | is_mandatory: false 25 | num_used_data: 26 | - 10 27 | weight: 1 28 | 29 | vlm_sft: 30 | dataset_names: 31 | - llava_ov 32 | image_transform_args: 33 | image_stride: 14 34 | max_image_size: 980 35 | min_image_size: 378 36 | max_pixels: 2_007_040 37 | frame_sampler_args: 38 | max_num_frames: 12 39 | min_num_frames: 8 40 | is_mandatory: true 41 | shuffle_lines: True 42 | shuffle_seed: 0 43 | num_used_data: 44 | - 1000 45 | weight: 1 46 | -------------------------------------------------------------------------------- /data/data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | 5 | import math 6 | import random 7 | from PIL import Image 8 | 9 | import torch 10 | from torch.nn.attention.flex_attention import or_masks, and_masks 11 | 12 | 13 | def create_sparse_mask(document_lens, split_lens, attn_modes, device): 14 | def causal_mask(b, h, q_idx, kv_idx): 15 | return q_idx >= kv_idx 16 | 17 | def full_and_noise_mask(b, h, q_idx, kv_idx): 18 | return (full_and_noise_seq_id[q_idx] == full_and_noise_seq_id[kv_idx]) & (full_and_noise_seq_id[q_idx] >= 0) 19 | 20 | def remove_noise_mask(b, h, q_idx, kv_idx): 21 | return (~((noise_seq_id[kv_idx] >= 0) & (noise_seq_id[q_idx] != noise_seq_id[kv_idx]))) 22 | 23 | def sample_mask(b, h, q_idx, kv_idx): 24 | return document_id[q_idx] == document_id[kv_idx] 25 | 26 | full_and_noise_tmp = [] 27 | noise_tmp = [] 28 | 29 | for i, (length, model) in enumerate(zip(split_lens, attn_modes)): 30 | value = i if model in ['full', 'noise'] else -1 31 | full_and_noise_tmp.extend([value] * length) 32 | value_noise = i if model == 'noise' else -1 33 | noise_tmp.extend([value_noise] * length) 34 | 35 | full_and_noise_seq_id = torch.Tensor(full_and_noise_tmp).to(device) 36 | noise_seq_id = torch.Tensor(noise_tmp).to(device) 37 | 38 | document_id = torch.cat([torch.full((l,), i) for i, l in enumerate(document_lens, start=1)]).to(device) 39 | 40 | return and_masks(or_masks(causal_mask, full_and_noise_mask), remove_noise_mask, sample_mask) 41 | 42 | 43 | def patchify(image, patch_size): 44 | p = patch_size 45 | c, h, w = image.shape 46 | assert h % p == 0 and w % p == 0 47 | image = image.reshape(c, h // p, p, w // p, p) 48 | image = torch.einsum("chpwq->hwpqc", image) 49 | image = image.reshape(-1, p**2 * c) 50 | return image 51 | 52 | 53 | def get_flattened_position_ids_extrapolate(img_h, img_w, patch_size, max_num_patches_per_side): 54 | num_patches_h, num_patches_w = img_h // patch_size, img_w // patch_size 55 | coords_h = torch.arange(0, num_patches_h) 56 | coords_w = torch.arange(0, num_patches_w) 57 | pos_ids = (coords_h[:, None] * max_num_patches_per_side + coords_w).flatten() 58 | return pos_ids 59 | 60 | 61 | def get_flattened_position_ids_interpolate(img_h, img_w, patch_size, max_num_patches_per_side): 62 | num_patches_h, num_patches_w = img_h // patch_size, img_w // patch_size 63 | boundaries = torch.arange(1 / max_num_patches_per_side, 1.0, 1 / max_num_patches_per_side) 64 | fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / num_patches_h) 65 | fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / num_patches_w) 66 | bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) 67 | bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) 68 | pos_ids = (bucket_coords_h[:, None] * max_num_patches_per_side + bucket_coords_w).flatten() 69 | return pos_ids 70 | 71 | 72 | def prepare_attention_mask_per_sample(split_lens, attn_modes, device="cpu"): 73 | """ 74 | nested_split_lens: A list of N lists of ints. Each int indicates the length of a split within 75 | a sample, where each sample contains multiple splits with different attn modes. 76 | nested_attn_modes: whether to use full attn in each split. 
77 | """ 78 | sample_len = sum(split_lens) 79 | attention_mask = torch.zeros((sample_len, sample_len), dtype=torch.bool, device=device) 80 | 81 | csum = 0 82 | for s, attn_mode in zip(split_lens, attn_modes): 83 | assert attn_mode in ['causal', 'full', 'noise'] 84 | if attn_mode == "causal": 85 | attention_mask[csum:csum + s, csum:csum + s] = torch.ones((s, s), device=device).tril() 86 | attention_mask[csum:csum + s, :csum] = 1 87 | else: 88 | attention_mask[csum:csum + s, csum:csum + s] = torch.ones((s, s)) 89 | attention_mask[csum:csum + s, :csum] = 1 90 | csum += s 91 | 92 | csum = 0 93 | for s, attn_mode in zip(split_lens, attn_modes): 94 | if attn_mode == "noise": 95 | attention_mask[:, csum : csum + s] = torch.zeros((sample_len, s)) 96 | attention_mask[csum : csum + s, csum : csum + s] = torch.ones((s, s)) 97 | csum += s 98 | 99 | attention_mask = torch.zeros_like(attention_mask, dtype=torch.float).masked_fill_( 100 | ~attention_mask, float("-inf") 101 | ) 102 | 103 | return attention_mask 104 | 105 | 106 | def split_integer_exp_decay(S, ng_sample_decay=1.0): 107 | if ng_sample_decay == 1.0: 108 | N = random.randint(1, S) 109 | else: 110 | base = (1 - ng_sample_decay) / (1 - math.pow(ng_sample_decay, S)) 111 | p = [base * math.pow(ng_sample_decay, i) for i in range(S)] 112 | N = random.choices(list(range(1, S + 1)), p, k=1)[0] 113 | cumsum = [0] + sorted(random.sample(range(1, S), N - 1)) + [S] 114 | result = [cumsum[i+1] - cumsum[i] for i in range(len(cumsum) - 1)] 115 | return result, cumsum 116 | 117 | 118 | def pil_img2rgb(image): 119 | if image.mode == "RGBA" or image.info.get("transparency", None) is not None: 120 | image = image.convert("RGBA") 121 | white = Image.new(mode="RGB", size=image.size, color=(255, 255, 255)) 122 | white.paste(image, mask=image.split()[3]) 123 | image = white 124 | else: 125 | image = image.convert("RGB") 126 | 127 | return image 128 | 129 | 130 | def add_special_tokens(tokenizer): 131 | all_special_tokens = [] 132 | for k, v in tokenizer.special_tokens_map.items(): 133 | if isinstance(v, str): 134 | all_special_tokens.append(v) 135 | elif isinstance(v, list): 136 | all_special_tokens += v 137 | 138 | new_tokens = [] 139 | 140 | if '<|im_start|>' not in all_special_tokens: 141 | new_tokens.append('<|im_start|>') 142 | 143 | if '<|im_end|>' not in all_special_tokens: 144 | new_tokens.append('<|im_end|>') 145 | 146 | if '<|vision_start|>' not in all_special_tokens: 147 | new_tokens.append('<|vision_start|>') 148 | 149 | if '<|vision_end|>' not in all_special_tokens: 150 | new_tokens.append('<|vision_end|>') 151 | 152 | num_new_tokens = tokenizer.add_tokens(new_tokens) 153 | bos_token_id = tokenizer.convert_tokens_to_ids('<|im_start|>') 154 | eos_token_id = tokenizer.convert_tokens_to_ids('<|im_end|>') 155 | start_of_image = tokenizer.convert_tokens_to_ids('<|vision_start|>') 156 | end_of_image = tokenizer.convert_tokens_to_ids('<|vision_end|>') 157 | 158 | new_token_ids = dict( 159 | bos_token_id=bos_token_id, 160 | eos_token_id=eos_token_id, 161 | start_of_image=start_of_image, 162 | end_of_image=end_of_image, 163 | ) 164 | 165 | return tokenizer, new_token_ids, num_new_tokens 166 | 167 | 168 | def len2weight(x, loss_reduction='square'): 169 | if x == 0: 170 | return x 171 | if loss_reduction == 'token': 172 | return 1 173 | if loss_reduction == 'sample': 174 | return 1 / x 175 | if loss_reduction == 'square': 176 | return 1 / (x ** 0.5) 177 | raise NotImplementedError(loss_reduction) 178 | 
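The patch utilities above can be exercised in isolation. The following is a minimal usage sketch (not part of the repository); it assumes the repository root is on `sys.path` so that `data.data_utils` is importable, and uses a dummy 3x64x64 tensor:

```python
# Minimal sketch: patchify a dummy image and compute its flattened position ids.
import torch
from data.data_utils import patchify, get_flattened_position_ids_extrapolate

image = torch.randn(3, 64, 64)            # (C, H, W); H and W divisible by patch_size
patches = patchify(image, patch_size=16)  # -> (16, 768): 4x4 patches of 16*16*3 values
pos_ids = get_flattened_position_ids_extrapolate(
    img_h=64, img_w=64, patch_size=16, max_num_patches_per_side=64
)
print(patches.shape, pos_ids.shape)       # torch.Size([16, 768]) torch.Size([16])
```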
-------------------------------------------------------------------------------- /data/dataset_info.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .interleave_datasets import UnifiedEditIterableDataset 5 | from .t2i_dataset import T2IIterableDataset 6 | from .vlm_dataset import SftJSONLIterableDataset 7 | 8 | 9 | DATASET_REGISTRY = { 10 | 't2i_pretrain': T2IIterableDataset, 11 | 'vlm_sft': SftJSONLIterableDataset, 12 | 'unified_edit': UnifiedEditIterableDataset, 13 | } 14 | 15 | 16 | DATASET_INFO = { 17 | 't2i_pretrain': { 18 | 't2i': { 19 | 'data_dir': 'your_data_path/bagel_example/t2i', # path of the parquet files 20 | 'num_files': 10, # number of data units to be sharded across all ranks and workers 21 | 'num_total_samples': 1000, # number of total samples in the dataset 22 | }, 23 | }, 24 | 'unified_edit':{ 25 | 'seedxedit_multi': { 26 | 'data_dir': 'your_data_path/bagel_example/editing/seedxedit_multi', 27 | 'num_files': 10, 28 | 'num_total_samples': 1000, 29 | "parquet_info_path": 'your_data_path/bagel_example/editing/parquet_info/seedxedit_multi_nas.json', # information of the parquet files 30 | }, 31 | }, 32 | 'vlm_sft': { 33 | 'llava_ov': { 34 | 'data_dir': 'your_data_path/bagel_example/vlm/images', 35 | 'jsonl_path': 'your_data_path/bagel_example/vlm/llava_ov_si.jsonl', 36 | 'num_total_samples': 1000 37 | }, 38 | }, 39 | } -------------------------------------------------------------------------------- /data/distributed_iterable_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import random 5 | import torch 6 | 7 | 8 | class DistributedIterableDataset(torch.utils.data.IterableDataset): 9 | def __init__(self, dataset_name, local_rank=0, world_size=1, num_workers=8): 10 | self.dataset_name = dataset_name 11 | self.local_rank = local_rank 12 | self.world_size = world_size 13 | self.num_workers = num_workers 14 | self.rng = random.Random() 15 | self.data_paths = None 16 | 17 | def get_data_paths(self, *args, **kwargs): 18 | raise NotImplementedError 19 | 20 | def set_epoch(self, seed=42): 21 | if self.data_paths is None: 22 | return 23 | 24 | if isinstance(self.data_paths[0], tuple): 25 | data_paths = sorted(self.data_paths, key=lambda x: (x[0], x[1])) 26 | elif isinstance(self.data_paths[0], str): 27 | data_paths = sorted(self.data_paths) 28 | else: 29 | raise ValueError(f"Unknown data_paths type: {type(self.data_paths[0])}") 30 | 31 | self.rng.seed(seed) 32 | self.rng.shuffle(data_paths) 33 | 34 | num_files_per_rank = len(data_paths) // self.world_size 35 | local_start = self.local_rank * num_files_per_rank 36 | local_end = (self.local_rank + 1) * num_files_per_rank 37 | self.num_files_per_rank = num_files_per_rank 38 | self.data_paths_per_rank = data_paths[local_start:local_end] 39 | 40 | def get_data_paths_per_worker(self): 41 | if self.data_paths is None: 42 | return None 43 | 44 | info = torch.utils.data.get_worker_info() 45 | if info is None: 46 | # Single worker: Use all files assigned to the rank 47 | return self.data_paths_per_rank, 0 48 | 49 | worker_id = info.id 50 | num_files_per_worker = self.num_files_per_rank // info.num_workers 51 | start = num_files_per_worker * worker_id 52 | end = num_files_per_worker * (worker_id + 1) 53 | data_paths_per_worker = 
self.data_paths_per_rank[start:end] 54 | 55 | return data_paths_per_worker[::-1], worker_id 56 | 57 | def __iter__(self): 58 | raise NotImplementedError 59 | -------------------------------------------------------------------------------- /data/interleave_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .edit_dataset import UnifiedEditIterableDataset 5 | 6 | -------------------------------------------------------------------------------- /data/interleave_datasets/edit_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import io 5 | import random 6 | from PIL import Image, ImageFile, PngImagePlugin 7 | 8 | from .interleave_t2i_dataset import InterleavedBaseIterableDataset, ParquetStandardIterableDataset 9 | from ..data_utils import pil_img2rgb 10 | 11 | 12 | Image.MAX_IMAGE_PIXELS = 200000000 13 | ImageFile.LOAD_TRUNCATED_IMAGES = True 14 | MaximumDecompressedSize = 1024 15 | MegaByte = 2 ** 20 16 | PngImagePlugin.MAX_TEXT_CHUNK = MaximumDecompressedSize * MegaByte 17 | 18 | 19 | class UnifiedEditIterableDataset(InterleavedBaseIterableDataset, ParquetStandardIterableDataset): 20 | 21 | def parse_row(self, row): 22 | image_num = len(row["image_list"]) 23 | # randomly choose start and end, return [0, 1] when only two images 24 | start_idx = random.choice(range(image_num - 1)) 25 | max_end = min(start_idx + 3, image_num) 26 | end_idx = random.choice(range(start_idx + 1, max_end)) 27 | 28 | data = self._init_data() 29 | data = self._add_image( 30 | data, 31 | pil_img2rgb(Image.open(io.BytesIO(row["image_list"][start_idx]))), 32 | need_loss=False, 33 | need_vae=True, 34 | need_vit=True, 35 | ) 36 | 37 | if end_idx - start_idx > 1 and random.random() < 0.5: # concat multiple insturction 38 | if end_idx == image_num - 1: 39 | end_idx -= 1 40 | 41 | instruction = "" 42 | for idx in range(start_idx + 1, end_idx + 1): 43 | instruction += random.choice(row["instruction_list"][idx-1]) + ". " 44 | data = self._add_text(data, instruction.rstrip(), need_loss=False) 45 | data = self._add_image( 46 | data, 47 | pil_img2rgb(Image.open(io.BytesIO(row["image_list"][end_idx]))), 48 | need_loss=True, 49 | need_vae=False, 50 | need_vit=False, 51 | ) 52 | else: 53 | for idx in range(start_idx + 1, end_idx + 1): 54 | instruction = random.choice(row["instruction_list"][idx-1]) 55 | data = self._add_text(data, instruction, need_loss=False) 56 | if idx != end_idx: 57 | data = self._add_image( 58 | data, 59 | pil_img2rgb(Image.open(io.BytesIO(row["image_list"][idx]))), 60 | need_loss=True, 61 | need_vae=True, 62 | need_vit=True, 63 | ) 64 | else: 65 | data = self._add_image( 66 | data, 67 | pil_img2rgb(Image.open(io.BytesIO(row["image_list"][idx]))), 68 | need_loss=True, 69 | need_vae=False, 70 | need_vit=False, 71 | ) 72 | return data 73 | -------------------------------------------------------------------------------- /data/interleave_datasets/interleave_t2i_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import pyarrow.parquet as pq 5 | 6 | from ..distributed_iterable_dataset import DistributedIterableDataset 7 | from ..parquet_utils import get_parquet_data_paths, init_arrow_pf_fs 8 | 9 | 10 | class InterleavedBaseIterableDataset(DistributedIterableDataset): 11 | 12 | def _init_data(self): 13 | data = { 14 | 'sequence_plan': [], 15 | 'text_ids_list': [], 16 | 'image_tensor_list': [], 17 | 'num_tokens': 0, 18 | } 19 | return data 20 | 21 | def _add_text(self, data, text, need_loss, enable_cfg=True): 22 | text_ids = self.tokenizer.encode(text) 23 | data['num_tokens'] += len(text_ids) 24 | data['text_ids_list'].append(text_ids) 25 | data['sequence_plan'].append( 26 | { 27 | 'type': 'text', 28 | 'enable_cfg': int(enable_cfg), 29 | 'loss': int(need_loss), 30 | 'special_token_loss': 0, 31 | 'special_token_label': None, 32 | } 33 | ) 34 | return data 35 | 36 | def _add_image(self, data, image, need_loss, need_vae, need_vit, enable_cfg=True): 37 | assert need_loss or need_vae or need_vit 38 | 39 | if need_loss: 40 | data['sequence_plan'].append( 41 | { 42 | 'type': 'vae_image', 43 | 'enable_cfg': 0, 44 | 'loss': 1, 45 | 'special_token_loss': 0, 46 | 'special_token_label': None, 47 | } 48 | ) 49 | 50 | image_tensor = self.transform(image) 51 | height, width = image_tensor.shape[1:] 52 | data['num_tokens'] += width * height // self.transform.stride ** 2 53 | data['image_tensor_list'].append(image_tensor) 54 | 55 | if need_vae: 56 | data['sequence_plan'].append( 57 | { 58 | 'type': 'vae_image', 59 | 'enable_cfg': int(enable_cfg), 60 | 'loss': 0, 61 | 'special_token_loss': 0, 62 | 'special_token_label': None, 63 | } 64 | ) 65 | 66 | image_tensor = self.transform(image) 67 | height, width = image_tensor.shape[1:] 68 | data['num_tokens'] += width * height // self.transform.stride ** 2 69 | data['image_tensor_list'].append(image_tensor.clone()) 70 | 71 | if need_vit: 72 | data['sequence_plan'].append( 73 | { 74 | 'type': 'vit_image', 75 | 'enable_cfg': int(enable_cfg), 76 | 'loss': 0, 77 | 'special_token_loss': 0, 78 | 'special_token_label': None, 79 | }, 80 | ) 81 | vit_image_tensor = self.vit_transform(image) 82 | height, width = vit_image_tensor.shape[1:] 83 | data['num_tokens'] += width * height // self.vit_transform.stride ** 2 84 | data['image_tensor_list'].append(vit_image_tensor) 85 | 86 | return data 87 | 88 | def _add_video(self, data, frames, frame_indexes, need_loss, need_vae, enable_cfg=True): 89 | assert int(need_loss) + int(need_vae) == 1 90 | 91 | if need_loss: 92 | for idx, (image, frame_idx) in enumerate(zip(frames, frame_indexes)): 93 | current_sequence_plan = { 94 | 'type': 'vae_image', 95 | 'enable_cfg': 0, 96 | 'loss': 1, 97 | 'special_token_loss': 0, 98 | 'special_token_label': None, 99 | 'split_start': idx == 0, 100 | 'split_end': idx == len(frames) - 1, 101 | } 102 | if idx < len(frame_indexes) - 1: 103 | current_sequence_plan['frame_delta'] = frame_indexes[idx + 1] - frame_idx 104 | data['sequence_plan'].append(current_sequence_plan) 105 | image_tensor = self.transform(image) 106 | height, width = image_tensor.shape[1:] 107 | data['image_tensor_list'].append(image_tensor) 108 | data['num_tokens'] += width * height // self.transform.stride ** 2 109 | 110 | elif need_vae: 111 | for idx, (image, frame_idx) in enumerate(zip(frames, frame_indexes)): 112 | current_sequence_plan = { 113 | 'type': 'vae_image', 114 | 'enable_cfg': int(enable_cfg), 115 | 'loss': 0, 116 | 'special_token_loss': 0, 117 | 'special_token_label': None, 118 | 
'split_start': idx == 0, 119 | 'split_end': idx == len(frames) - 1, 120 | } 121 | if idx < len(frame_indexes) - 1: 122 | current_sequence_plan['frame_delta'] = frame_indexes[idx + 1] - frame_idx 123 | data['sequence_plan'].append(current_sequence_plan) 124 | image_tensor = self.transform(image) 125 | height, width = image_tensor.shape[1:] 126 | data['image_tensor_list'].append(image_tensor) 127 | data['num_tokens'] += width * height // self.transform.stride ** 2 128 | 129 | return data 130 | 131 | 132 | class ParquetStandardIterableDataset(DistributedIterableDataset): 133 | 134 | def __init__( 135 | self, dataset_name, transform, tokenizer, vit_transform, 136 | data_dir_list, num_used_data, parquet_info, 137 | local_rank=0, world_size=1, num_workers=8, data_status=None, 138 | ): 139 | """ 140 | data_dir_list: list of data directories contains parquet files 141 | num_used_data: list of number of sampled data paths for each data directory 142 | vit_transform: input transform for vit model. 143 | """ 144 | super().__init__(dataset_name, local_rank, world_size, num_workers) 145 | self.transform = transform 146 | self.vit_transform = vit_transform 147 | self.tokenizer = tokenizer 148 | self.data_status = data_status 149 | self.data_paths = self.get_data_paths(data_dir_list, num_used_data, parquet_info) 150 | self.set_epoch() 151 | 152 | def get_data_paths(self, data_dir_list, num_used_data, parquet_info): 153 | row_groups = [] 154 | for data_dir, num_data_path in zip(data_dir_list, num_used_data): 155 | data_paths = get_parquet_data_paths([data_dir], [num_data_path]) 156 | for data_path in data_paths: 157 | if data_path in parquet_info.keys(): 158 | num_row_groups = parquet_info[data_path]['num_row_groups'] 159 | for rg_idx in range(num_row_groups): 160 | row_groups.append((data_path, rg_idx)) 161 | return row_groups 162 | 163 | def parse_row(self, row): 164 | raise NotImplementedError 165 | 166 | def __iter__(self): 167 | file_paths_per_worker, worker_id = self.get_data_paths_per_worker() 168 | if self.data_status is not None: 169 | global_row_group_start_id = self.data_status[worker_id][0] 170 | row_start_id = self.data_status[worker_id][1] + 1 171 | else: 172 | global_row_group_start_id = 0 173 | row_start_id = 0 174 | 175 | print( 176 | f"rank-{self.local_rank} worker-{worker_id} dataset-{self.dataset_name}: " 177 | f"resuming data at global_rg#{global_row_group_start_id}, row#{row_start_id}" 178 | ) 179 | 180 | while True: 181 | file_paths_per_worker_ = file_paths_per_worker[global_row_group_start_id:] 182 | for global_row_group_idx, (parquet_file_path, row_group_id) in enumerate( 183 | file_paths_per_worker_, start=global_row_group_start_id 184 | ): 185 | fs = init_arrow_pf_fs(parquet_file_path) 186 | with fs.open_input_file(parquet_file_path) as f: 187 | try: 188 | fr = pq.ParquetFile(f) 189 | df = fr.read_row_group(row_group_id).to_pandas() 190 | df = df.iloc[row_start_id:] 191 | except Exception as e: 192 | print(f'Error {e} in rg#{row_group_id}, {parquet_file_path}') 193 | continue 194 | 195 | for row_idx, row in df.iterrows(): 196 | try: 197 | data = self.parse_row(row) 198 | if len(data) == 0: 199 | continue 200 | data['data_indexes'] = { 201 | "data_indexes": [global_row_group_idx, row_idx], 202 | "worker_id": worker_id, 203 | "dataset_name": self.dataset_name, 204 | } 205 | except Exception as e: 206 | print(f'Error {e} in rg#{row_group_id}, {parquet_file_path}') 207 | continue 208 | yield data 209 | 210 | row_start_id = 0 211 | global_row_group_start_id = 0 212 | 
print(f"{self.dataset_name} repeat in rank-{self.local_rank} worker-{worker_id}") 213 | -------------------------------------------------------------------------------- /data/parquet_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | 5 | import os 6 | import xml.etree.ElementTree as ET 7 | import subprocess 8 | import logging 9 | 10 | import pyarrow.fs as pf 11 | import torch.distributed as dist 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def get_parquet_data_paths(data_dir_list, num_sampled_data_paths, rank=0, world_size=1): 17 | num_data_dirs = len(data_dir_list) 18 | if world_size > 1: 19 | chunk_size = (num_data_dirs + world_size - 1) // world_size 20 | start_idx = rank * chunk_size 21 | end_idx = min(start_idx + chunk_size, num_data_dirs) 22 | local_data_dir_list = data_dir_list[start_idx:end_idx] 23 | local_num_sampled_data_paths = num_sampled_data_paths[start_idx:end_idx] 24 | else: 25 | local_data_dir_list = data_dir_list 26 | local_num_sampled_data_paths = num_sampled_data_paths 27 | 28 | local_data_paths = [] 29 | for data_dir, num_data_path in zip(local_data_dir_list, local_num_sampled_data_paths): 30 | if data_dir.startswith("hdfs://"): 31 | files = hdfs_ls_cmd(data_dir) 32 | data_paths_per_dir = [ 33 | file for file in files if file.endswith(".parquet") 34 | ] 35 | else: 36 | files = os.listdir(data_dir) 37 | data_paths_per_dir = [ 38 | os.path.join(data_dir, name) 39 | for name in files 40 | if name.endswith(".parquet") 41 | ] 42 | repeat = num_data_path // len(data_paths_per_dir) 43 | data_paths_per_dir = data_paths_per_dir * (repeat + 1) 44 | local_data_paths.extend(data_paths_per_dir[:num_data_path]) 45 | 46 | if world_size > 1: 47 | gather_list = [None] * world_size 48 | dist.all_gather_object(gather_list, local_data_paths) 49 | 50 | combined_chunks = [] 51 | for chunk_list in gather_list: 52 | if chunk_list is not None: 53 | combined_chunks.extend(chunk_list) 54 | else: 55 | combined_chunks = local_data_paths 56 | 57 | return combined_chunks 58 | 59 | 60 | # NOTE: cumtomize this function for your cluster 61 | def get_hdfs_host(): 62 | return "hdfs://xxx" 63 | 64 | 65 | # NOTE: cumtomize this function for your cluster 66 | def get_hdfs_block_size(): 67 | return 134217728 68 | 69 | 70 | # NOTE: cumtomize this function for your cluster 71 | def get_hdfs_extra_conf(): 72 | return None 73 | 74 | 75 | def init_arrow_pf_fs(parquet_file_path): 76 | if parquet_file_path.startswith("hdfs://"): 77 | fs = pf.HadoopFileSystem( 78 | host=get_hdfs_host(), 79 | port=0, 80 | buffer_size=get_hdfs_block_size(), 81 | extra_conf=get_hdfs_extra_conf(), 82 | ) 83 | else: 84 | fs = pf.LocalFileSystem() 85 | return fs 86 | 87 | 88 | def hdfs_ls_cmd(dir): 89 | result = subprocess.run(["hdfs", "dfs", "ls", dir], capture_output=True, text=True).stdout 90 | return ['hdfs://' + i.split('hdfs://')[-1].strip() for i in result.split('\n') if 'hdfs://' in i] 91 | -------------------------------------------------------------------------------- /data/t2i_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import io 5 | import json 6 | import pyarrow.parquet as pq 7 | import random 8 | from PIL import Image 9 | 10 | from .data_utils import pil_img2rgb 11 | from .distributed_iterable_dataset import DistributedIterableDataset 12 | from .parquet_utils import get_parquet_data_paths, init_arrow_pf_fs 13 | 14 | Image.MAX_IMAGE_PIXELS = 20_000_000 15 | 16 | 17 | class T2IIterableDataset(DistributedIterableDataset): 18 | def __init__( 19 | self, dataset_name, transform, tokenizer, data_dir_list, num_used_data, 20 | local_rank=0, world_size=1, num_workers=8, data_status=None, 21 | ): 22 | """ 23 | data_dir_list: list of data directories contains parquet files 24 | num_used_data: list of number of sampled data paths for each data directory 25 | """ 26 | super().__init__(dataset_name, local_rank, world_size, num_workers) 27 | self.transform = transform 28 | self.tokenizer = tokenizer 29 | self.data_status = data_status 30 | self.data_paths = self.get_data_paths(data_dir_list, num_used_data) 31 | self.set_epoch() 32 | 33 | def get_data_paths(self, data_dir_list, num_used_data): 34 | return get_parquet_data_paths(data_dir_list, num_used_data) 35 | 36 | def __iter__(self): 37 | data_paths_per_worker, worker_id = self.get_data_paths_per_worker() 38 | if self.data_status is not None: 39 | parquet_start_id = self.data_status[worker_id][0] 40 | row_group_start_id = self.data_status[worker_id][1] 41 | row_start_id = self.data_status[worker_id][2] + 1 42 | else: 43 | parquet_start_id = 0 44 | row_group_start_id = 0 45 | row_start_id = 0 46 | transform_stride = self.transform.stride 47 | 48 | print( 49 | f"rank-{self.local_rank} worker-{worker_id} dataset-{self.dataset_name}: " 50 | f"resuming data at parquet#{parquet_start_id}, rg#{row_group_start_id}, row#{row_start_id}" 51 | ) 52 | 53 | while True: 54 | data_paths_per_worker_ = data_paths_per_worker[parquet_start_id:] 55 | for parquet_idx, parquet_file_path in enumerate(data_paths_per_worker_, start=parquet_start_id): 56 | fs = init_arrow_pf_fs(parquet_file_path) 57 | with fs.open_input_file(parquet_file_path) as f: 58 | fr = pq.ParquetFile(f) 59 | row_group_ids = list(range(fr.num_row_groups)) 60 | row_group_ids_ = row_group_ids[row_group_start_id:] 61 | 62 | for row_group_id in row_group_ids_: 63 | df = fr.read_row_group(row_group_id).to_pandas() 64 | df = df.iloc[row_start_id:] 65 | 66 | for row_idx, row in df.iterrows(): 67 | num_tokens = 0 68 | try: 69 | image_byte = row['image'] 70 | image = pil_img2rgb(Image.open(io.BytesIO(image_byte))) 71 | except Exception as e: 72 | print(f'Error: {e} in rg#{row_group_id}, {parquet_file_path}') 73 | continue 74 | image_tensor = self.transform(image) 75 | height, width = image_tensor.shape[1:] 76 | num_tokens += width * height // transform_stride ** 2 77 | 78 | try: 79 | caption_dict = row['captions'] 80 | caption_dict = json.loads(caption_dict) 81 | except Exception as e: 82 | print(f'Error: {e} in rg#{row_group_id}, {parquet_file_path}') 83 | continue 84 | 85 | caps_token = [self.tokenizer.encode(v) for _, v in caption_dict.items()] 86 | if len(caps_token) == 0: 87 | print(f'no caption in rg#{row_group_id}, {parquet_file_path}') 88 | caption_token = self.tokenizer.encode(' ') 89 | else: 90 | caption_token = random.choice(caps_token) 91 | 92 | sequence_plan, text_ids_list = [], [] 93 | text_ids = caption_token 94 | num_tokens += len(caption_token) 95 | text_ids_list.append(text_ids) 96 | sequence_plan.append({ 97 | 'type': 'text', 98 | 'enable_cfg': 1, 99 | 'loss': 
0, 100 | 'special_token_loss': 0, 101 | 'special_token_label': None, 102 | }) 103 | 104 | sequence_plan.append({ 105 | 'type': 'vae_image', 106 | 'enable_cfg': 0, 107 | 'loss': 1, 108 | 'special_token_loss': 0, 109 | 'special_token_label': None, 110 | }) 111 | 112 | sample = dict( 113 | image_tensor_list=[image_tensor], 114 | text_ids_list=text_ids_list, 115 | num_tokens=num_tokens, 116 | sequence_plan=sequence_plan, 117 | data_indexes={ 118 | "data_indexes": [parquet_idx, row_group_id, row_idx], 119 | "worker_id": worker_id, 120 | "dataset_name": self.dataset_name, 121 | } 122 | ) 123 | yield sample 124 | 125 | row_start_id = 0 126 | row_group_start_id = 0 127 | parquet_start_id = 0 128 | print(f"{self.dataset_name} repeat in rank-{self.local_rank} worker-{worker_id}") 129 | -------------------------------------------------------------------------------- /data/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import random 5 | from PIL import Image 6 | 7 | import cv2 8 | import numpy as np 9 | import torch 10 | from torchvision import transforms 11 | from torchvision.transforms import functional as F 12 | from torchvision.transforms import InterpolationMode 13 | 14 | 15 | class MaxLongEdgeMinShortEdgeResize(torch.nn.Module): 16 | """Resize the input image so that its longest side and shortest side are within a specified range, 17 | ensuring that both sides are divisible by a specified stride. 18 | 19 | Args: 20 | max_size (int): Maximum size for the longest edge of the image. 21 | min_size (int): Minimum size for the shortest edge of the image. 22 | stride (int): Value by which the height and width of the image must be divisible. 23 | max_pixels (int): Maximum pixels for the full image. 24 | interpolation (InterpolationMode): Desired interpolation enum defined by 25 | :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. 26 | If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, 27 | ``InterpolationMode.BILINEAR``, and ``InterpolationMode.BICUBIC`` are supported. 28 | The corresponding Pillow integer constants, e.g., ``PIL.Image.BILINEAR`` are also accepted. 29 | antialias (bool, optional): Whether to apply antialiasing (default is True). 30 | """ 31 | 32 | def __init__( 33 | self, 34 | max_size: int, 35 | min_size: int, 36 | stride: int, 37 | max_pixels: int, 38 | interpolation=InterpolationMode.BICUBIC, 39 | antialias=True 40 | ): 41 | super().__init__() 42 | self.max_size = max_size 43 | self.min_size = min_size 44 | self.stride = stride 45 | self.max_pixels = max_pixels 46 | self.interpolation = interpolation 47 | self.antialias = antialias 48 | 49 | def _make_divisible(self, value, stride): 50 | """Ensure the value is divisible by the stride.""" 51 | return max(stride, int(round(value / stride) * stride)) 52 | 53 | def _apply_scale(self, width, height, scale): 54 | new_width = round(width * scale) 55 | new_height = round(height * scale) 56 | new_width = self._make_divisible(new_width, self.stride) 57 | new_height = self._make_divisible(new_height, self.stride) 58 | return new_width, new_height 59 | 60 | def forward(self, img, img_num=1): 61 | """ 62 | Args: 63 | img (PIL Image): Image to be resized. 64 | img_num (int): Number of images, used to change max_tokens. 65 | Returns: 66 | PIL Image or Tensor: Rescaled image with divisible dimensions. 
67 | """ 68 | if isinstance(img, torch.Tensor): 69 | height, width = img.shape[-2:] 70 | else: 71 | width, height = img.size 72 | 73 | scale = min(self.max_size / max(width, height), 1.0) 74 | scale = max(scale, self.min_size / min(width, height)) 75 | new_width, new_height = self._apply_scale(width, height, scale) 76 | 77 | # Ensure the number of pixels does not exceed max_pixels 78 | if new_width * new_height > self.max_pixels / img_num: 79 | scale = self.max_pixels / img_num / (new_width * new_height) 80 | new_width, new_height = self._apply_scale(new_width, new_height, scale) 81 | 82 | # Ensure longest edge does not exceed max_size 83 | if max(new_width, new_height) > self.max_size: 84 | scale = self.max_size / max(new_width, new_height) 85 | new_width, new_height = self._apply_scale(new_width, new_height, scale) 86 | 87 | return F.resize(img, (new_height, new_width), self.interpolation, antialias=self.antialias) 88 | 89 | 90 | class ImageTransform: 91 | def __init__( 92 | self, 93 | max_image_size, 94 | min_image_size, 95 | image_stride, 96 | max_pixels=14*14*9*1024, 97 | image_mean=[0.5, 0.5, 0.5], 98 | image_std=[0.5, 0.5, 0.5] 99 | ): 100 | self.stride = image_stride 101 | 102 | self.resize_transform = MaxLongEdgeMinShortEdgeResize( 103 | max_size=max_image_size, 104 | min_size=min_image_size, 105 | stride=image_stride, 106 | max_pixels=max_pixels, 107 | ) 108 | self.to_tensor_transform = transforms.ToTensor() 109 | self.normalize_transform = transforms.Normalize(mean=image_mean, std=image_std, inplace=True) 110 | 111 | def __call__(self, img, img_num=1): 112 | img = self.resize_transform(img, img_num=img_num) 113 | img = self.to_tensor_transform(img) 114 | img = self.normalize_transform(img) 115 | return img 116 | 117 | 118 | def decolorization(image): 119 | gray_image = image.convert('L') 120 | return Image.merge(image.mode, [gray_image] * 3) if image.mode in ('RGB', 'L') else gray_image 121 | 122 | 123 | def downscale(image, scale_factor): 124 | new_width = int(round(image.width * scale_factor)) 125 | new_height = int(round(image.height * scale_factor)) 126 | new_width = max(1, new_width) 127 | new_height = max(1, new_height) 128 | return image.resize((new_width, new_height), resample=Image.BICUBIC) 129 | 130 | 131 | def crop(image, crop_factors): 132 | target_h, target_w = crop_factors 133 | img_w, img_h = image.size 134 | 135 | if target_h > img_h or target_w > img_w: 136 | raise ValueError("Crop size exceeds image dimensions") 137 | 138 | x = random.randint(0, img_w - target_w) 139 | y = random.randint(0, img_h - target_h) 140 | 141 | return image.crop((x, y, x + target_w, y + target_h)), [[x, y], [x + target_w, y + target_h]] 142 | 143 | 144 | def motion_blur_opencv(image, kernel_size=15, angle=0): 145 | # Build a linear (horizontal) kernel 146 | kernel = np.zeros((kernel_size, kernel_size), dtype=np.float32) 147 | kernel[kernel_size // 2, :] = np.ones(kernel_size, dtype=np.float32) 148 | 149 | # Rotate the kernel to the requested angle 150 | center = (kernel_size / 2 - 0.5, kernel_size / 2 - 0.5) 151 | M = cv2.getRotationMatrix2D(center, angle, 1) 152 | rotated_kernel = cv2.warpAffine(kernel, M, (kernel_size, kernel_size)) 153 | 154 | # Normalize the kernel 155 | rotated_kernel /= rotated_kernel.sum() if rotated_kernel.sum() != 0 else 1 156 | 157 | img = np.array(image) 158 | if img.ndim == 2: 159 | blurred = cv2.filter2D(img, -1, rotated_kernel, borderType=cv2.BORDER_REFLECT) 160 | else: 161 | # For color images, filter each channel independently 162 | blurred = np.zeros_like(img) 163 | for c in range(img.shape[2]): 164 | blurred[..., c] = cv2.filter2D(img[..., c], -1, rotated_kernel, 
borderType=cv2.BORDER_REFLECT) 165 | 166 | return Image.fromarray(blurred.astype(np.uint8)) 167 | 168 | 169 | def shuffle_patch(image, num_splits, gap_size=2): 170 | """Split the image into patches (sizes need not divide evenly), shuffle them, and reassemble with gaps between patches""" 171 | h_splits, w_splits = num_splits 172 | img_w, img_h = image.size 173 | 174 | base_patch_h = img_h // h_splits 175 | patch_heights = [base_patch_h] * (h_splits - 1) 176 | patch_heights.append(img_h - sum(patch_heights)) 177 | 178 | base_patch_w = img_w // w_splits 179 | patch_widths = [base_patch_w] * (w_splits - 1) 180 | patch_widths.append(img_w - sum(patch_widths)) 181 | 182 | patches = [] 183 | current_y = 0 184 | for i in range(h_splits): 185 | current_x = 0 186 | patch_h = patch_heights[i] 187 | for j in range(w_splits): 188 | patch_w = patch_widths[j] 189 | patch = image.crop((current_x, current_y, current_x + patch_w, current_y + patch_h)) 190 | patches.append(patch) 191 | current_x += patch_w 192 | current_y += patch_h 193 | 194 | random.shuffle(patches) 195 | 196 | total_width = sum(patch_widths) + (w_splits - 1) * gap_size 197 | total_height = sum(patch_heights) + (h_splits - 1) * gap_size 198 | new_image = Image.new(image.mode, (total_width, total_height), color=(255, 255, 255)) 199 | 200 | current_y = 0 # Starting Y coordinate of the current row 201 | patch_idx = 0 # Index of the patch currently being placed 202 | for i in range(h_splits): 203 | current_x = 0 # Starting X coordinate of the current column 204 | patch_h = patch_heights[i] # Height of the patches in the current row 205 | for j in range(w_splits): 206 | # Take the next shuffled patch 207 | patch = patches[patch_idx] 208 | patch_w = patch_widths[j] # Width of the patches in the current column 209 | # Paste the patch (top-left corner at (current_x, current_y)) 210 | new_image.paste(patch, (current_x, current_y)) 211 | # Advance X (next patch starts after the current patch width + gap) 212 | current_x += patch_w + gap_size 213 | patch_idx += 1 214 | # Advance Y (next row starts after the current row height + gap) 215 | current_y += patch_h + gap_size 216 | 217 | return new_image 218 | 219 | 220 | def inpainting(image, num_splits, blank_ratio=0.3, blank_color=(255, 255, 255)): 221 | """ 222 | Split the image into patches and randomly blank out some of them, for the inpainting task 223 | 224 | Args: 225 | image: PIL.Image, input image (RGB mode) 226 | h_splits: int, number of row splits (vertical direction) 227 | w_splits: int, number of column splits (horizontal direction) 228 | blank_ratio: float, fraction of patches to blank out (0~1) 229 | blank_color: tuple, color of the blank regions (RGB, e.g. white (255, 255, 255)) 230 | 231 | Returns: 232 | PIL.Image, the reassembled image after processing 233 | """ 234 | h_splits, w_splits = num_splits 235 | img_w, img_h = image.size 236 | 237 | base_patch_h = img_h // h_splits 238 | patch_heights = [base_patch_h] * (h_splits - 1) 239 | patch_heights.append(img_h - sum(patch_heights)) 240 | 241 | base_patch_w = img_w // w_splits 242 | patch_widths = [base_patch_w] * (w_splits - 1) 243 | patch_widths.append(img_w - sum(patch_widths)) 244 | 245 | patches = [] 246 | current_y = 0 247 | for i in range(h_splits): 248 | current_x = 0 249 | patch_h = patch_heights[i] 250 | for j in range(w_splits): 251 | patch_w = patch_widths[j] 252 | patch = image.crop((current_x, current_y, current_x + patch_w, current_y + patch_h)) 253 | patches.append(patch) 254 | current_x += patch_w 255 | current_y += patch_h 256 | 257 | total_patches = h_splits * w_splits 258 | num_blank = int(total_patches * blank_ratio) 259 | num_blank = max(0, min(num_blank, total_patches)) 260 | blank_indices = random.sample(range(total_patches), num_blank) 261 | 262 | processed_patches = [] 263 | for idx, patch in enumerate(patches): 264 | if idx in blank_indices: 265 | blank_patch = Image.new("RGB", patch.size, color=blank_color) 266 | processed_patches.append(blank_patch) 267 | else: 268 | processed_patches.append(patch) 269 | 270 | # Create the output image (same size as the original) 271 | result_image = Image.new("RGB", (img_w, img_h)) 
272 | current_y = 0 273 | patch_idx = 0 274 | for i in range(h_splits): 275 | current_x = 0 276 | patch_h = patch_heights[i] 277 | for j in range(w_splits): 278 | # Take the processed patch 279 | patch = processed_patches[patch_idx] 280 | patch_w = patch_widths[j] 281 | # Paste it back at its original position 282 | result_image.paste(patch, (current_x, current_y)) 283 | current_x += patch_w 284 | patch_idx += 1 285 | current_y += patch_h 286 | 287 | return result_image 288 | -------------------------------------------------------------------------------- /data/video_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | 13 | import io 14 | import os 15 | import random 16 | import re 17 | 18 | import numpy as np 19 | import decord 20 | from PIL import Image 21 | 22 | 23 | def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1): 24 | if sample in ['rand', 'middle']: # uniform sampling 25 | acc_samples = min(num_frames, vlen) 26 | # split the video into `acc_samples` intervals, and sample from each interval. 27 | intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int) 28 | ranges = [] 29 | for idx, interv in enumerate(intervals[:-1]): 30 | ranges.append((interv, intervals[idx + 1] - 1)) 31 | if sample == 'rand': 32 | try: 33 | frame_indices = [random.choice(range(x[0], x[1])) for x in ranges] 34 | except: 35 | frame_indices = np.random.permutation(vlen)[:acc_samples] 36 | frame_indices.sort() 37 | frame_indices = list(frame_indices) 38 | elif fix_start is not None: 39 | frame_indices = [x[0] + fix_start for x in ranges] 40 | elif sample == 'middle': 41 | frame_indices = [(x[0] + x[1]) // 2 for x in ranges] 42 | else: 43 | raise NotImplementedError 44 | 45 | if len(frame_indices) < num_frames: # padded with last frame 46 | padded_frame_indices = [frame_indices[-1]] * num_frames 47 | padded_frame_indices[:len(frame_indices)] = frame_indices 48 | frame_indices = padded_frame_indices 49 | elif 'fps' in sample: # fps0.5, sequentially sample frames at 0.5 fps 50 | output_fps = float(sample[3:]) 51 | duration = float(vlen) / input_fps 52 | delta = 1 / output_fps # gap between frames, this is also the clip length each frame represents 53 | frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta) 54 | frame_indices = np.around(frame_seconds * input_fps).astype(int) 55 | frame_indices = [e for e in frame_indices if e < vlen] 56 | if max_num_frames > 0 and len(frame_indices) > max_num_frames: 57 | frame_indices = frame_indices[:max_num_frames] 58 | else: 59 | raise ValueError 60 | return frame_indices 61 | 62 | 63 | def read_frames_decord(video_path, num_frames, sample='rand', fix_start=None, clip=None, min_num_frames=4): 64 | video_reader = decord.VideoReader(video_path, num_threads=1) 65 | vlen = len(video_reader) 66 | fps = video_reader.get_avg_fps() 67 | duration = vlen / float(fps) 68 | if clip: 69 | start, end = clip 70 | duration = end - start 71 | vlen = int(duration * fps) 72 | start_index = int(start * fps) 73 | 74 | t_num_frames = np.random.randint(min_num_frames, 
num_frames + 1) 75 | 76 | frame_indices = get_frame_indices( 77 | t_num_frames, vlen, sample=sample, fix_start=fix_start, 78 | input_fps=fps 79 | ) 80 | if clip: 81 | frame_indices = [f + start_index for f in frame_indices] 82 | frames = video_reader.get_batch(frame_indices).asnumpy() # (T, H, W, C), np.uint8 83 | frames = [Image.fromarray(frames[i]) for i in range(frames.shape[0])] 84 | return frames 85 | 86 | 87 | def extract_frame_number(filename): 88 | # Extract the numeric part from the filename using regular expressions 89 | match = re.search(r'_(\d+).jpg$', filename) 90 | return int(match.group(1)) if match else -1 91 | 92 | 93 | def sort_frames(frame_paths): 94 | # Extract filenames from each path and sort by their numeric part 95 | return sorted(frame_paths, key=lambda x: extract_frame_number(os.path.basename(x))) 96 | 97 | 98 | def read_frames_folder(video_path, num_frames, sample='rand', fix_start=None, min_num_frames=4): 99 | image_list = sort_frames(list(os.listdir(video_path))) 100 | frames = [] 101 | for image in image_list: 102 | fp = os.path.join(video_path, image) 103 | frame = Image.open(fp).convert('RGB') 104 | frames.append(frame) 105 | vlen = len(frames) 106 | 107 | t_num_frames = np.random.randint(min_num_frames, num_frames + 1) 108 | 109 | if vlen > t_num_frames: 110 | frame_indices = get_frame_indices( 111 | t_num_frames, vlen, sample=sample, fix_start=fix_start 112 | ) 113 | frames = [frames[i] for i in frame_indices] 114 | return frames 115 | 116 | 117 | class FrameSampler: 118 | def __init__(self, max_num_frames=-1, min_num_frames=8, sample='rand'): 119 | self.max_num_frames = max_num_frames 120 | self.min_num_frames = min_num_frames 121 | self.sample = sample 122 | 123 | def __call__(self, file_name): 124 | fn = read_frames_folder if file_name.endswith('/') else read_frames_decord 125 | frames = fn(file_name, num_frames=self.max_num_frames, min_num_frames=self.min_num_frames, sample=self.sample) 126 | return frames 127 | 128 | 129 | def decode_video_byte(video_bytes): 130 | video_stream = io.BytesIO(video_bytes) 131 | vr = decord.VideoReader(video_stream) 132 | return vr 133 | 134 | 135 | def sample_mp4_frames(mp4_p, n_frames=None, fps=None, return_frame_indices=False, random_sample=False): 136 | if isinstance(mp4_p, str): 137 | vr = decord.VideoReader(mp4_p, num_threads=1) 138 | elif isinstance(mp4_p, decord.video_reader.VideoReader): 139 | vr = mp4_p 140 | video_fps = vr.get_avg_fps() # get the video frame rate 141 | video_duration = len(vr) / video_fps 142 | if n_frames is not None: 143 | if random_sample: 144 | frame_indices = sorted(random.sample(range(len(vr)), n_frames)) 145 | else: 146 | frame_indices = np.linspace(0, len(vr)-1, n_frames, dtype=int).tolist() 147 | else: 148 | frame_indices = [int(i) for i in np.arange(0, len(vr)-1, video_fps/fps)] 149 | frames = vr.get_batch(frame_indices).asnumpy() # convert to a numpy array 150 | frames = [Image.fromarray(frame).convert("RGB") for frame in frames] 151 | if not return_frame_indices: 152 | return frames, video_duration 153 | else: 154 | return frames, video_duration, frame_indices 155 | 156 | 157 | def sample_mp4_frames_by_indices(mp4_p, frame_indices: list): 158 | if isinstance(mp4_p, str): 159 | vr = decord.VideoReader(mp4_p, num_threads=1) 160 | elif isinstance(mp4_p, decord.video_reader.VideoReader): 161 | vr = mp4_p 162 | # sample the frames in frame_indices 163 | frames = vr.get_batch(frame_indices).asnumpy() # convert to a numpy array 164 | frames = [Image.fromarray(frame).convert("RGB") for frame in frames] 165 | return frames 
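A minimal usage sketch for the frame-sampling helpers above, kept outside the repository files; the import path, video path, and frame counts are hypothetical and only illustrate the expected call pattern:

# Hypothetical example, not part of the repository; paths and counts are illustrative only.
from data.video_utils import FrameSampler, get_frame_indices

# 'fps0.5' samples sequentially at 0.5 frames per second: a 240-frame clip at 24 fps (10 s)
# yields indices roughly 2 s apart, then capped at max_num_frames if needed.
indices = get_frame_indices(num_frames=8, vlen=240, sample='fps0.5', input_fps=24, max_num_frames=8)

# FrameSampler routes paths ending in '/' to read_frames_folder and regular file
# paths to the decord-backed read_frames_decord.
sampler = FrameSampler(max_num_frames=16, min_num_frames=8, sample='rand')
frames = sampler('/path/to/video.mp4')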
-------------------------------------------------------------------------------- /data/vlm_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import os 6 | import traceback 7 | from PIL import Image, ImageFile, PngImagePlugin 8 | 9 | from .data_utils import pil_img2rgb 10 | from .distributed_iterable_dataset import DistributedIterableDataset 11 | 12 | 13 | Image.MAX_IMAGE_PIXELS = 200000000 14 | ImageFile.LOAD_TRUNCATED_IMAGES = True 15 | MaximumDecompressedSize = 1024 16 | MegaByte = 2 ** 20 17 | PngImagePlugin.MAX_TEXT_CHUNK = MaximumDecompressedSize * MegaByte 18 | 19 | 20 | class SftJSONLIterableDataset(DistributedIterableDataset): 21 | def __init__( 22 | self, dataset_name, transform, tokenizer, frame_sampler, 23 | jsonl_path_list, data_dir_list, num_used_data, 24 | local_rank=0, world_size=1, num_workers=8, data_status=None, 25 | shuffle_lines=False, shuffle_seed=0, 26 | ): 27 | """ 28 | jsonl_path_list: list of jsonl file paths 29 | data_dir_list: list of image directories containing the images of each jsonl file 30 | num_used_data: list of numbers of sampled data points for each jsonl 31 | """ 32 | super().__init__(dataset_name, local_rank, world_size, num_workers) 33 | self.transform = transform 34 | self.tokenizer = tokenizer 35 | self.frame_sampler = frame_sampler 36 | self.data_status = data_status 37 | self.data_paths = self.get_data_paths( 38 | jsonl_path_list, 39 | data_dir_list, 40 | num_used_data, 41 | shuffle_lines, 42 | shuffle_seed, 43 | ) 44 | self.set_epoch() 45 | 46 | def get_data_paths( 47 | self, 48 | jsonl_path_list, 49 | data_dir_list, 50 | num_used_data, 51 | shuffle_lines, 52 | shuffle_seed, 53 | ): 54 | data_paths = [] 55 | for jsonl_path, image_dir, num_data_point in zip( 56 | jsonl_path_list, data_dir_list, num_used_data 57 | ): 58 | with open(jsonl_path, 'r') as f: 59 | raw_data = f.readlines() 60 | if shuffle_lines: 61 | self.rng.seed(shuffle_seed) 62 | self.rng.shuffle(raw_data) 63 | raw_data = raw_data[:num_data_point] 64 | data_paths.extend([(json_data, image_dir) for json_data in raw_data]) 65 | return data_paths 66 | 67 | def change_format(self, data, num_images): 68 | elements = [] 69 | for conversation in data['conversations']: 70 | if conversation['from'] == 'human': 71 | if '<image>' not in conversation['value']: 72 | elements.append({ 73 | 'type': 'text', 74 | 'has_loss': 0, 75 | 'text': conversation['value'], 76 | }) 77 | else: 78 | text_list = conversation['value'].split('<image>') 79 | for idx, text in enumerate(text_list): 80 | if text.strip() != '': 81 | elements.append({ 82 | 'type': 'text', 83 | 'has_loss': 0, 84 | 'text': text.strip(), 85 | }) 86 | if (idx != len(text_list) - 1) and (idx < num_images): 87 | elements.append({'type': 'image',}) 88 | elif conversation['from'] == 'gpt': 89 | elements.append({ 90 | 'type': 'text', 91 | 'has_loss': 1, 92 | 'text': conversation['value'], 93 | }) 94 | return elements 95 | 96 | def __iter__(self): 97 | data_paths_per_worker, worker_id = self.get_data_paths_per_worker() 98 | if self.data_status is not None: 99 | row_start_id = self.data_status[worker_id] + 1 100 | else: 101 | row_start_id = 0 102 | transform_stride = self.transform.stride 103 | 104 | print( 105 | f"rank-{self.local_rank} worker-{worker_id} dataset-{self.dataset_name}: " 106 | f"resuming data at row#{row_start_id}" 107 | ) 108 | 109 | while True: 110 | data_paths_per_worker_ = 
data_paths_per_worker[row_start_id:] 111 | for row_idx, (data, image_dir) in enumerate(data_paths_per_worker_, start=row_start_id): 112 | num_tokens = 0 113 | image_tensor_list = [] 114 | text_ids_list = [] 115 | sequence_plan = [] 116 | 117 | try: 118 | data_item = json.loads(data) 119 | raw_images = None 120 | if 'image' in data_item: 121 | if type(data_item['image']) == list: 122 | raw_images = [ 123 | pil_img2rgb(Image.open(os.path.join(image_dir, image))) 124 | for image in data_item['image'] 125 | ] 126 | else: 127 | raw_images = [ 128 | pil_img2rgb(Image.open(os.path.join(image_dir, data_item['image']))) 129 | ] 130 | elif 'video' in data_item: 131 | raw_images = self.frame_sampler(os.path.join(image_dir, data_item['video'])) 132 | special_tokens = '<image>' * len(raw_images) 133 | for item in data_item['conversations']: 134 | if '