├── .github └── workflows │ ├── package.json │ ├── prerelease.yml │ └── release.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .swift-format ├── .vscode ├── launch.json ├── settings.json └── tasks.json ├── BENCHMARKS.md ├── CITATION.cff ├── CONTRIBUTING.md ├── LICENSE ├── Package.resolved ├── Package.swift ├── README.md ├── VERSION ├── assets ├── model_types_bg.png └── unum.png ├── docs ├── Makefile ├── _static │ ├── custom.css │ └── custom.js ├── benchmarks.rst ├── conf.py ├── contributing.rst ├── index.rst ├── javascript │ ├── index.rst │ └── reference.rst.txt ├── python │ ├── index.rst │ └── reference.rst └── swift │ └── index.rst ├── javascript ├── README.md ├── encoders.mjs ├── encoders_test.js ├── hub.mjs └── index.mjs ├── package-lock.json ├── package.json ├── pyproject.toml ├── python ├── README.md ├── scripts │ ├── bench_decoders.py │ ├── bench_encoders.py │ ├── export_decoders.ipynb │ ├── export_encoders.ipynb │ ├── test_decoders.py │ └── test_encoders.py └── uform │ ├── __init__.py │ ├── chat.py │ ├── gen_model.py │ ├── numpy_processors.py │ ├── onnx_encoders.py │ ├── shared.py │ ├── torch_decoders.py │ ├── torch_encoders.py │ └── torch_processors.py ├── swift ├── Encoders.swift ├── EncodersTests.swift └── README.md └── yarn.lock /.github/workflows/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "devDependencies": { 3 | "@semantic-release/exec": "github:semantic-release/exec", 4 | "@semantic-release/git": "^10.0.1", 5 | "conventional-changelog-eslint": "^3.0.9", 6 | "semantic-release": "^20.1.3" 7 | }, 8 | "release": { 9 | "branches": [ 10 | "main" 11 | ], 12 | "debug": true, 13 | "ci": true, 14 | "dryRun": false, 15 | "plugins": [ 16 | [ 17 | "@semantic-release/commit-analyzer", 18 | { 19 | "preset": "eslint", 20 | "releaseRules": [ 21 | { 22 | "tag": "Add", 23 | "release": "minor" 24 | }, 25 | { 26 | "tag": "Break", 27 | "release": "major" 28 | }, 29 | { 30 | "tag": "Improve", 31 | "release": "patch" 32 | }, 33 | { 34 | "tag": "Make", 35 | "release": "patch" 36 | }, 37 | { 38 | "tag": "Refactor", 39 | "release": false 40 | } 41 | ] 42 | } 43 | ], 44 | [ 45 | "@semantic-release/release-notes-generator", 46 | { 47 | "preset": "eslint", 48 | "releaseRules": [ 49 | { 50 | "tag": "Add", 51 | "release": "minor" 52 | }, 53 | { 54 | "tag": "Break", 55 | "release": "major" 56 | }, 57 | { 58 | "tag": "Improve", 59 | "release": "patch" 60 | }, 61 | { 62 | "tag": "Make", 63 | "release": "patch" 64 | }, 65 | { 66 | "tag": "Refactor", 67 | "release": false 68 | } 69 | ] 70 | } 71 | ], 72 | "@semantic-release/github", 73 | [ 74 | "@semantic-release/exec", 75 | { 76 | "prepareCmd": "sed -i 's/version = \".*\"/version = \"${nextRelease.version}\"/' pyproject.toml" 77 | } 78 | ], 79 | [ 80 | "@semantic-release/git", 81 | { 82 | "assets": [ 83 | "pyproject.toml" 84 | ], 85 | "message": "Build: Released ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}" 86 | } 87 | ] 88 | ] 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /.github/workflows/prerelease.yml: -------------------------------------------------------------------------------- 1 | name: Pre-Release 2 | 3 | on: 4 | push: 5 | branches: ["main-dev"] 6 | pull_request: 7 | branches: ["main-dev"] 8 | 9 | env: 10 | BUILD_TYPE: Release 11 | GH_TOKEN: ${{ secrets.SEMANTIC_RELEASE_TOKEN }} 12 | PYTHONUTF8: 1 13 | 14 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 15 | permissions: 16 | contents: read 
17 | 18 | jobs: 19 | versioning: 20 | name: Update Version 21 | runs-on: ubuntu-latest 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v4 25 | with: 26 | fetch-depth: 0 27 | persist-credentials: false 28 | - name: Run TinySemVer 29 | uses: ashvardanian/tinysemver@v2.0.7 30 | with: 31 | verbose: "true" 32 | version-file: "VERSION" 33 | update-version-in: | 34 | package.json:"version": "(\d+\.\d+\.\d+)" 35 | package-lock.json:"uform",\n\s+"version": "(\d+\.\d+\.\d+)" 36 | CITATION.cff:^version: (\d+\.\d+\.\d+) 37 | pyproject.toml:^version = "(\d+\.\d+\.\d+)" 38 | dry-run: "true" 39 | 40 | test_python: 41 | name: Test Python 42 | runs-on: ubuntu-latest 43 | 44 | steps: 45 | - uses: actions/checkout@v4 46 | - name: Set up Python 47 | uses: actions/setup-python@v5 48 | with: 49 | python-version: "3.11" 50 | 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --no-cache-dir --upgrade pip 54 | pip install -e ".[onnx]" 55 | pip install pytest 56 | 57 | # When running tests in CI, limit ourselves to the small model tests 58 | - name: Test with PyTest 59 | run: pytest python/scripts/ -s -x -Wd -v -k small 60 | 61 | test_javascript: 62 | name: Test JavaScript 63 | runs-on: ubuntu-latest 64 | 65 | steps: 66 | - uses: actions/checkout@v4 67 | - name: Set up Node.js 68 | uses: actions/setup-node@v4 69 | with: 70 | node-version: 20 71 | 72 | # TODO: JavaScript tests pass, but ONNX throws a memory error on exit 73 | # - name: Build JavaScript 74 | # run: npm ci 75 | # - name: Test JavaScript 76 | # run: npm test 77 | 78 | test_swift: 79 | name: Test Swift 80 | runs-on: macos-14 81 | 82 | steps: 83 | - uses: actions/checkout@v4 84 | - name: Build 85 | run: swift build 86 | - name: Run tests 87 | run: swift test -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | 7 | env: 8 | GH_TOKEN: ${{ secrets.SEMANTIC_RELEASE_TOKEN }} 9 | 10 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 11 | permissions: 12 | contents: read 13 | pages: write 14 | id-token: write 15 | 16 | jobs: 17 | versioning: 18 | name: Update Version 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: Checkout 22 | uses: actions/checkout@v4 23 | with: 24 | fetch-depth: 0 25 | persist-credentials: false 26 | - name: Run TinySemVer 27 | uses: ashvardanian/tinysemver@v2.0.7 28 | with: 29 | verbose: "true" 30 | version-file: "VERSION" 31 | update-version-in: | 32 | package.json:"version": "(\d+\.\d+\.\d+)" 33 | package-lock.json:"uform",\n\s+"version": "(\d+\.\d+\.\d+)" 34 | CITATION.cff:^version: (\d+\.\d+\.\d+) 35 | pyproject.toml:^version = "(\d+\.\d+\.\d+)" 36 | dry-run: "false" 37 | push: "true" 38 | create-release: "true" 39 | github-token: ${{ secrets.SEMANTIC_RELEASE_TOKEN }} 40 | 41 | rebase: 42 | name: Rebase Dev. 
Branch 43 | needs: versioning 44 | runs-on: ubuntu-latest 45 | steps: 46 | - name: Checkout the latest code 47 | uses: actions/checkout@v4 48 | with: 49 | fetch-depth: 0 50 | 51 | - name: Perform rebase 52 | run: | 53 | git fetch origin main 54 | git checkout main-dev 55 | git rebase origin/main 56 | 57 | - name: Push changes 58 | uses: CasperWA/push-protected@v2 59 | with: 60 | token: ${{ secrets.SEMANTIC_RELEASE_TOKEN }} 61 | branch: main-dev 62 | unprotect_reviews: True 63 | force: True 64 | 65 | test_python: 66 | name: Run Tests 67 | runs-on: ubuntu-latest 68 | needs: versioning 69 | steps: 70 | - uses: actions/checkout@v4 71 | with: 72 | ref: "main" 73 | 74 | - name: Set up Python 75 | uses: actions/setup-python@v5 76 | with: 77 | python-version: "3.11" 78 | 79 | - name: Install dependencies 80 | run: | 81 | python -m pip install --upgrade pip 82 | pip install -e ".[onnx]" 83 | pip install pytest 84 | 85 | - name: Run PyTest 86 | run: pytest python/scripts/ 87 | 88 | publish_python: 89 | name: Publish Python 90 | runs-on: ubuntu-latest 91 | needs: [versioning, test_python] 92 | 93 | steps: 94 | - uses: actions/checkout@v4 95 | with: 96 | ref: "main" 97 | - name: Set up Python 98 | uses: actions/setup-python@v5 99 | with: 100 | python-version: "3.11" 101 | 102 | - name: Install dependencies 103 | run: | 104 | python -m pip install --upgrade pip 105 | pip install build 106 | 107 | - name: Build package 108 | run: python -m build 109 | 110 | - name: Publish to PyPi 111 | uses: pypa/gh-action-pypi-publish@release/v1 112 | with: 113 | verbose: true 114 | print-hash: true 115 | 116 | publish_javascript: 117 | name: Publish JavaScript 118 | needs: versioning 119 | runs-on: ubuntu-22.04 120 | 121 | steps: 122 | - uses: actions/checkout@v4 123 | with: 124 | ref: "main" 125 | 126 | - name: Set up Node.js 127 | uses: actions/setup-node@v4 128 | with: 129 | node-version: 20 130 | 131 | # TODO: JavaScript tests pass, but ONNX throws a memory error on exit 132 | # - name: Build and Test 133 | # run: | 134 | # npm ci 135 | # npm test 136 | 137 | - name: Publish 138 | uses: JS-DevTools/npm-publish@v2 139 | with: 140 | token: ${{ secrets.NPM_TOKEN }} 141 | 142 | deploy_docs: 143 | name: Deploy Docs 144 | environment: 145 | name: github-pages 146 | url: ${{ steps.deployment.outputs.page_url }} 147 | runs-on: ubuntu-22.04 148 | needs: [publish_python, publish_javascript] 149 | steps: 150 | - name: Checkout 151 | uses: actions/checkout@v4 152 | with: 153 | ref: "main" 154 | - name: Install dependencies 155 | run: | 156 | sudo apt update && 157 | sudo apt install -y doxygen graphviz dia git && 158 | pip install sphinx==5.3.0 sphinx-js==3.2.1 breathe==4.35.0 furo==2023.3.27 m2r2==0.3.3.post2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery==4.1 && 159 | npm install -g jsdoc 160 | - name: Setup GitHub Pages 161 | uses: actions/configure-pages@v2 162 | - name: Install UForm from PyPi 163 | run: pip install uform 164 | - name: Build documentation 165 | run: cd docs && make html 166 | - name: Copy assets 167 | run: cp -r assets build/docs/html/ 168 | - name: Upload artifacts 169 | uses: actions/upload-pages-artifact@v1 170 | with: 171 | # Upload entire repository 172 | path: "./build/docs/html/" 173 | - name: Deploy to GitHub Pages 174 | id: deployment 175 | uses: actions/deploy-pages@v1 176 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /requirements.txt 2 | 
/dist/ 3 | /test 4 | /build/ 5 | yarn.lock 6 | *.egg-info 7 | __pycache__ 8 | .build 9 | .swiftpm 10 | .hf_token 11 | 12 | dictionary* 13 | vocab* 14 | /models/ 15 | 16 | # Tensors & ML Model 17 | *.onnx 18 | *.pt 19 | *.safetensors 20 | *.mlpackage 21 | 22 | # NodeJS 23 | node_modules 24 | node_build 25 | yarn-error.log 26 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_commit_msg: "chore(pre-commit): autofix run" 3 | autoupdate_commit_msg: "chore(pre-commit): autoupdate hooks" 4 | 5 | default_install_hook_types: 6 | - pre-commit 7 | 8 | repos: 9 | - repo: https://github.com/pre-commit/pre-commit-hooks 10 | rev: v4.5.0 11 | hooks: 12 | - id: check-toml 13 | - id: check-yaml 14 | - id: debug-statements 15 | - id: end-of-file-fixer 16 | - id: name-tests-test 17 | - id: trailing-whitespace 18 | - repo: https://github.com/pappasam/toml-sort 19 | rev: v0.23.1 20 | hooks: 21 | - id: toml-sort-fix 22 | - repo: https://github.com/asottile/add-trailing-comma 23 | rev: v3.1.0 24 | hooks: 25 | - id: add-trailing-comma 26 | - repo: https://github.com/astral-sh/ruff-pre-commit 27 | rev: v0.1.11 28 | hooks: 29 | # Run the linter 30 | - id: ruff 31 | args: [--fix] 32 | # Run the formatter 33 | - id: ruff-format 34 | -------------------------------------------------------------------------------- /.swift-format: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "lineLength": 120, 4 | "indentation": { 5 | "spaces": 4 6 | }, 7 | "maximumBlankLines": 1, 8 | "respectsExistingLineBreaks": true, 9 | "lineBreakBeforeControlFlowKeywords": true, 10 | "lineBreakBeforeEachArgument": true, 11 | "multiElementCollectionTrailingCommas": true, 12 | "spacesAroundRangeFormationOperators": true 13 | } -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python Debugger", 9 | "type": "debugpy", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | }, 14 | { 15 | "name": "PyTest Debugger", 16 | "type": "debugpy", 17 | "request": "launch", 18 | "program": "pytest", 19 | "console": "integratedTerminal", 20 | "args": [ 21 | "${file}", 22 | "-s", 23 | "-x", 24 | ], 25 | }, 26 | { 27 | "name": "NodeJS Debugger", 28 | "type": "node-terminal", 29 | "request": "launch", 30 | "command": "npm run test", 31 | } 32 | ] 33 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cSpell.words": [ 3 | "arange", 4 | "ashvardanian", 5 | "astype", 6 | "CFURL", 7 | "coreml", 8 | "crossattn", 9 | "cumsum", 10 | "dtype", 11 | "embs", 12 | "finfo", 13 | "huggingface", 14 | "keepdim", 15 | "linalg", 16 | "logits", 17 | "Matryoshka", 18 | "mlmodel", 19 | "mlpackage", 20 | "mlprogram", 21 | "multimodal", 22 | "ndarray", 23 | "numpy", 24 | "ONNX", 25 | "onnxconverter", 26 | "onnxruntime", 27 | "opset", 28 | "packbits", 29 | "preprocess", 30 | "pretrained", 31 | "probs", 32 | "pypi", 33 | "pytest", 34 | "randn", 35 | "rerank", 36 | "reranker", 37 | "reranking", 38 | "sandbeach", 39 | "sess", 40 | "SIMD", 41 | "softmax", 42 | "Tensorrt", 43 | "torchvision", 44 | "transfromers", 45 | "uform", 46 | "unimodal", 47 | "unsqueeze", 48 | "Vardanian", 49 | "whitespaces" 50 | ], 51 | "[python]": { 52 | "editor.defaultFormatter": "ms-python.black-formatter" 53 | }, 54 | "python.formatting.provider": "none", 55 | "window.autoDetectColorScheme": true, 56 | "workbench.colorTheme": "Default Dark+", 57 | "workbench.preferredDarkColorTheme": "Default Dark+" 58 | } -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | // See https://go.microsoft.com/fwlink/?LinkId=733558 3 | // for the documentation about the tasks.json format 4 | "version": "2.0.0", 5 | "tasks": [ 6 | { 7 | "label": "Publish", 8 | "type": "shell", 9 | "command": "python -m pip install build twine && python -m build && twine check dist/* && twine upload dist/*" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /BENCHMARKS.md: -------------------------------------------------------------------------------- 1 | # UForm Model Benchmarks 2 | 3 | ## Accuracy 4 | 5 | ### Embedding Models 6 | 7 | Few retrieval benchmarks exist for multimodal embeddings. 8 | The most famous ones for English are "MS-COCO" and "Flickr30k". 9 | Evaluating `uform-vl-english` model, one can expect the following numbers for search quality. 10 | 11 | | Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 | 12 | | :-------- | ---------: | ---------: | ----------: | 13 | | Flickr | 0.727 | 0.915 | 0.949 | 14 | | MS-COCO ¹ | 0.510 | 0.761 | 0.838 | 15 | 16 | For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository². 17 | Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model. 
18 | 19 | | Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers | 20 | | :-------- | -----------: | --------: | -----------: | --------: | ------------: | ---------: | -------: | 21 | | English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M | 22 | | Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M | 23 | | Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M | 24 | | Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M | 25 | | Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M | 26 | | French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M | 27 | 28 | 29 | All languages: 30 | 31 | | Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers | 32 | | :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: | 33 | | Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M | 34 | | Armenian 🇦🇲 | 5.6 | __22.0__ | 14.3 | __44.7__ | 20.2 | __56.0__ | 4 M | 35 | | Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M | 36 | | English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M | 37 | | French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M | 38 | | German 🇩🇪 | 31.7 | __35.1__ | 56.9 | __62.2__ | 67.4 | __73.3__ | 134 M | 39 | | Hebrew 🇮🇱 | 23.7 | __26.7__ | 46.3 | __51.8__ | 57.0 | __63.5__ | 9 M | 40 | | Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M | 41 | | Indonesian 🇮🇩 | 26.9 | __30.7__ | 51.4 | __57.0__ | 62.7 | __68.6__ | 199 M | 42 | | Italian 🇮🇹 | 31.3 | __34.9__ | 56.7 | __62.1__ | 67.1 | __73.1__ | 67 M | 43 | | Japanese 🇯🇵 | 27.4 | __32.6__ | 51.5 | __59.2__ | 62.6 | __70.6__ | 125 M | 44 | | Korean 🇰🇷 | 24.4 | __31.5__ | 48.1 | __57.8__ | 59.2 | __69.2__ | 81 M | 45 | | Persian 🇮🇷 | 24.0 | __28.8__ | 47.0 | __54.6__ | 57.8 | __66.2__ | 77 M | 46 | | Polish 🇵🇱 | 29.2 | __33.6__ | 53.9 | __60.1__ | 64.7 | __71.3__ | 41 M | 47 | | Portuguese 🇵🇹 | 31.6 | __32.7__ | 57.1 | __59.6__ | 67.9 | __71.0__ | 257 M | 48 | | Russian 🇷🇺 | 29.9 | __33.9__ | 54.8 | __60.9__ | 65.8 | __72.0__ | 258 M | 49 | | Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M | 50 | | Thai 🇹🇭 | 21.5 | __28.7__ | 43.0 | __54.6__ | 53.7 | __66.0__ | 61 M | 51 | | Turkish 🇹🇷 | 25.5 | __33.0__ | 49.1 | __59.6__ | 60.3 | __70.8__ | 88 M | 52 | | Ukranian 🇺🇦 | 26.0 | __30.6__ | 49.9 | __56.7__ | 60.9 | __68.1__ | 41 M | 53 | | Vietnamese 🇻🇳 | 25.4 | __28.3__ | 49.2 | __53.9__ | 60.3 | __65.5__ | 85 M | 54 | | | | | | | | | | 55 | | Mean | 26.5±6.4 | __31.8±3.5__ | 49.8±9.8 | __58.1±4.5__ | 60.4±10.6 | __69.4±4.3__ | - | 56 | | Google Translate | 27.4±6.3 | __31.5±3.5__ | 51.1±9.5 | __57.8±4.4__ | 61.7±10.3 | __69.1±4.3__ | - | 57 | | Microsoft Translator | 27.2±6.4 | __31.4±3.6__ | 50.8±9.8 | __57.7±4.7__ | 61.4±10.6 | __68.9±4.6__ | - | 58 | | Meta NLLB | 24.9±6.7 | __32.4±3.5__ | 47.5±10.3 | __58.9±4.5__ | 58.2±11.2 | __70.2±4.3__ | - | 59 | 60 | ### Generative Models 61 | 62 | | Model | LLM Size | SQA | MME | MMBench | Average¹ | 63 | | :------------------- | -------: | ---: | -----: | ------: | -------: | 64 | | UForm-Gen2-Qwen-500m | 0.5B | 45.5 | 880.1 | 42.0 | 29.31 | 65 | | MobileVLM v2 | 1.4B | 52.1 | 1302.8 | 57.7 | 36.81 | 66 | | LLaVA-Phi | 2.7B | 68.4 | 1335.1 | 59.8 | 42.95 | 67 | 68 | For captioning evaluation 
we measure CLIPScore and RefCLIPScore³. 69 | 70 | | Model | Size | Caption Length | CLIPScore | RefCLIPScore | 71 | | :---------------------------------- | ---: | -------------: | --------: | -----------: | 72 | | `llava-hf/llava-1.5-7b-hf` | 7B | Long | 0.878 | 0.529 | 73 | | `llava-hf/llava-1.5-7b-hf` | 7B | Short | 0.886 | 0.531 | 74 | | | | | | | 75 | | `Salesforce/instructblip-vicuna-7b` | 7B | Long | 0.902 | 0.534 | 76 | | `Salesforce/instructblip-vicuna-7b` | 7B | Short | 0.848 | 0.523 | 77 | | | | | | | 78 | | `unum-cloud/uform-gen` | 1.5B | Long | 0.847 | 0.523 | 79 | | `unum-cloud/uform-gen` | 1.5B | Short | 0.842 | 0.522 | 80 | | | | | | | 81 | | `unum-cloud/uform-gen-chat` | 1.5B | Long | 0.860 | 0.525 | 82 | | `unum-cloud/uform-gen-chat` | 1.5B | Short | 0.858 | 0.525 | 83 | 84 | Results for VQAv2 evaluation. 85 | 86 | | Model | Size | Accuracy | 87 | | :------------------------- | ---: | -------: | 88 | | `llava-hf/llava-1.5-7b-hf` | 7B | 78.5 | 89 | | `unum-cloud/uform-gen` | 1.5B | 66.5 | 90 | 91 |
92 | 93 | > ¹ Train split was in training data.
94 | > ² Lacking a broad enough evaluation dataset, we translated the [COCO Karpathy test split](https://www.kaggle.com/datasets/shtvkumar/karpathy-splits) with multiple public and proprietary translation services, averaging the scores across all sets, and breaking them down in the bottom section.
95 | > ³ We used `apple/DFN5B-CLIP-ViT-H-14-378` CLIP model. 96 | 97 | ## Speed 98 | 99 | ### Embedding Models 100 | 101 | UForm comes pre-packaged with speed benchmarks for the models. 102 | 103 | ```bash 104 | $ python python/scripts/bench_encoders.py --help 105 | usage: bench_encoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE] 106 | 107 | options: 108 | -h, --help show this help message and exit 109 | --filter-out FILTER_OUT 110 | Filter out models, backends, or devices with a Regular Expression. 111 | --batch-size BATCH_SIZE 112 | Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU. 113 | ``` 114 | 115 | Running that script for a fairly small batch size of 50 on an Nvidia H100 GPU and 116 | 117 | | Model Name | Device | Backend | Images Preprocessed/s | Images Encoded/s | Texts Preprocessed/s | Texts Encoded/s | 118 | | :--------------------------------------------- | :----- | :------ | --------------------: | :--------------- | :------------------- | :-------------- | 119 | | unum-cloud/uform3-image-text-english-base | cpu | torch | 23.03 | 76.57 | 15,978.03 | 562.28 | 120 | | unum-cloud/uform3-image-text-english-base | cpu | onnx | 23.11 | 77.75 | 13,880.27 | 1,067.40 | 121 | | unum-cloud/uform3-image-text-english-base | cuda | torch | 22.87 | 1,060.40 | 12,348.94 | 13,242.83 | 122 | | unum-cloud/uform3-image-text-english-large | cpu | torch | 22.41 | 10.84 | 13,350.45 | 145.12 | 123 | | unum-cloud/uform3-image-text-english-large | cpu | onnx | 23.13 | 19.60 | 18,031.85 | 960.09 | 124 | | unum-cloud/uform3-image-text-english-large | cuda | torch | 22.78 | 244.86 | 13,226.40 | 10,204.04 | 125 | | unum-cloud/uform3-image-text-english-small | cpu | torch | 20.08 | 71.68 | 12,147.05 | 249.63 | 126 | | unum-cloud/uform3-image-text-english-small | cpu | onnx | 22.84 | 195.27 | 13,636.99 | 1,385.25 | 127 | | unum-cloud/uform3-image-text-english-small | cuda | torch | 22.63 | 2,662.16 | 14,731.18 | 14,694.87 | 128 | | unum-cloud/uform3-image-text-multilingual-base | cpu | torch | 22.98 | 64.28 | 10,129.27 | 209.76 | 129 | | unum-cloud/uform3-image-text-multilingual-base | cpu | onnx | 23.06 | 66.81 | 8,963.13 | 1,104.32 | 130 | | unum-cloud/uform3-image-text-multilingual-base | cuda | torch | 22.88 | 1,051.95 | 15,639.72 | 12,416.12 | 131 | 132 | If you are interested in performance numbers on consumer grade hardware, compared to third-party models, here are some rough estimates. 133 | On Nvidia RTX 3090: 134 | 135 | | Model | Multilingual | Speed | Speedup | 136 | | :----------------------------------------------- | -----------: | ---------------------: | ---------: | 137 | | `bert-base-uncased` | No | 1'612 sequences/second | | 138 | | `distilbert-base-uncased` | No | 3'174 sequences/second | x 1.96 | 139 | | `sentence-transformers/all-MiniLM-L12-v2` | __Yes__ | 3'604 sequences/second | x 2.24 | 140 | | `unum-cloud/uform3-image-text-multilingual-base` | __Yes__ | 6'809 sequences/second | __x 4.22__ | 141 | 142 | Given the small size of the model it also work well on mobile devices. 143 | On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards. 
144 | 145 | | Device | Speed | Device TDP | Efficiency | 146 | | :--------------------- | ------------------: | ---------: | ----------------: | 147 | | Nvidia RTX 3090 | ~ 140 tokens/second | < 350W | 0.40 tokens/joule | 148 | | Apple M2 Pro unplugged | ~ 19 tokens/second | < 20W | 0.95 tokens/joule | 149 | | Apple M2 Max unplugged | ~ 38 tokens/second | < 36W | 1.06 tokens/joule | 150 | | Apple M2 Max plugged | ~ 56 tokens/second | < 89W | 0.63 tokens/joule | 151 | 152 | ### Generative Models 153 | 154 | ```bash 155 | $ python python/scripts/bench_decoders.py --help 156 | usage: bench_decoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE] 157 | 158 | options: 159 | -h, --help show this help message and exit 160 | --batch-size BATCH_SIZE 161 | Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU. 162 | --max-length MAX_LENGTH 163 | Maximum length of the generated text in tokens. 164 | ``` 165 | 166 | On Nvidia H100 GPU, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding. 167 | 168 | | Model | Size | Decoding Speed | Decoding Parallel Streams | 169 | | :---------------------------------- | ----: | -------------: | ---------------------------: | 170 | | `llava-hf/llava-1.5-7b-hf` | 7 B | ~ 141 tokens/s | ~ 4 K tokens/s (32 streams) | 171 | | `Salesforce/instructblip-vicuna-7b` | 7 B | ~ 211 tokens/s | ~ 2 K tokens/s (32 streams) | 172 | | `unum-cloud/uform-gen` | 1.5 B | ~ 252 tokens/s | ~ 3 K tokens/s (128 streams) | 173 | | `unum-cloud/uform-gen2-dpo` | 1.2 B | ~ 372 tokens/s | ~ 10 K tokens/s (64 streams) | 174 | 175 | On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding. 176 | 177 | | Model | Size | Decoding Speed | Speedup | 178 | | :---------------------------------- | ----: | -------------: | --------: | 179 | | `llava-hf/llava-1.5-7b-hf` | 7 B | ~ 40 tokens/s | | 180 | | `Salesforce/instructblip-vicuna-7b` | 7 B | ~ 40 tokens/s | | 181 | | `unum-cloud/uform-gen` | 1.5 B | ~ 140 tokens/s | __x 3.5__ | 182 | 183 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: "Kim" 5 | given-names: "Mikhail" 6 | orcid: "https://orcid.org/0009-0003-8413-3221" 7 | - family-names: "Orshulevich" 8 | given-names: "Vladimir" 9 | orcid: "https://orcid.org/0009-0007-8961-6969" 10 | - family-names: "Vardanian" 11 | given-names: "Ash" 12 | orcid: "https://orcid.org/0000-0002-4882-1815" 13 | title: "UForm by Unum Cloud" 14 | version: 3.1.1 15 | keywords: 16 | - "text-to-image retrieval" 17 | - "multimodal" 18 | - "visual-language pre-training" 19 | doi: 10.5281/zenodo.7951497 20 | date-released: 2023-01-03 21 | url: "https://github.com/unum-cloud/uform" 22 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to UForm 2 | 3 | We welcome contributions to UForm! 4 | 5 | ## Python 6 | 7 | Before submitting any changes, please make sure that the tests pass. 
8 | 9 | ```sh 10 | pip install -e ".[dev]" # For development dependencies 11 | pip install -e ".[torch]" # For PyTorch 12 | pip install -e ".[onnx]" # For ONNX on CPU 13 | pip install -e ".[onnx-gpu]" # For ONNX on GPU, available for some platforms 14 | pip install -e ".[torch,onnx,onnx-gpu,dev]" # For all 15 | 16 | pytest python/scripts/ -s -x -Wd -v 17 | pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loading Torch 18 | ``` 19 | 20 | ## Swift 21 | 22 | To build and test the Swift package, use the following command: 23 | 24 | ```bash 25 | swift build 26 | swift test 27 | ``` 28 | 29 | Swift formatting is enforced with `swift-format` default utility from Apple. 30 | To install and run it on all the files in the project, use the following command: 31 | 32 | ```bash 33 | brew install swift-format 34 | swift-format . -i -r 35 | ``` 36 | 37 | The style is controlled by the `.swift-format` JSON file in the root of the repository. 38 | As there is no standard for Swift formatting, even Apple's own `swift-format` tool and Xcode differ in their formatting rules, and available settings. 39 | 40 | ## JavaScript 41 | 42 | For rapid development you can avoid the TypeScript precompilation step: 43 | 44 | ```sh 45 | npm install -g ts-node 46 | ts-node javascript/embeddings.mts 47 | ``` 48 | 49 | Before submitting any changes, please make sure that the tests pass. 50 | 51 | ```sh 52 | npm install 53 | npm test 54 | ``` 55 | 56 | ## Benchmarking 57 | 58 | If you want to double check, how fast the model may work on your hardware, you can clone the library and repeat the benchmarks locally. 59 | The following benchmark will exclude PyTorch backend, CUDA-capable devices, and all the `-base` and `-large` models, running only the ONNX benchmarks on the CPU. 60 | 61 | ```sh 62 | git clone https://github.com/unum-cloud/uform --depth 1 # Clone the repository 63 | cd uform && pip install -e ".[torch,onnx,onnx-gpu,dev]" # Install all dependencies 64 | python python/scripts/bench_encoders.py --filter-out "torch|cuda|base|large" 65 | ``` 66 | 67 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "pins" : [ 3 | { 4 | "identity" : "swift-argument-parser", 5 | "kind" : "remoteSourceControl", 6 | "location" : "https://github.com/apple/swift-argument-parser.git", 7 | "state" : { 8 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41", 9 | "version" : "1.3.0" 10 | } 11 | }, 12 | { 13 | "identity" : "swift-transformers", 14 | "kind" : "remoteSourceControl", 15 | "location" : "https://github.com/ashvardanian/swift-transformers", 16 | "state" : { 17 | "revision" : "89fb5d97e1df347f9f588f62fc538dcad6fdb16c" 18 | } 19 | } 20 | ], 21 | "version" : 2 22 | } 23 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version:5.9 2 | import PackageDescription 3 | 4 | let package = Package( 5 | name: "UForm", 6 | platforms: [ 7 | // Linux doesn't have to be explicitly listed 8 | .iOS(.v16), // For iOS, version 16 and later 9 | .tvOS(.v16), // For tvOS, version 16 and later 10 | .macOS(.v13), // For macOS, version 13 (Ventura) and later 11 | .watchOS(.v6), // For watchOS, version 6 and later 12 | ], 13 | products: [ 14 | .library( 15 | name: "UForm", 16 | targets: ["UForm"] 17 | ) 18 | ], 19 | dependencies: [ 20 | .package( 21 | url: "https://github.com/ashvardanian/swift-transformers", 22 | revision: "89fb5d97e1df347f9f588f62fc538dcad6fdb16c" 23 | ) 24 | ], 25 | targets: [ 26 | .target( 27 | name: "UForm", 28 | dependencies: [ 29 | .product(name: "Transformers", package: "swift-transformers") 30 | ], 31 | path: "swift", 32 | exclude: ["EncodersTests.swift"] 33 | ), 34 | .testTarget( 35 | name: "UFormTests", 36 | dependencies: ["UForm"], 37 | path: "swift", 38 | sources: ["EncodersTests.swift"] 39 | ), 40 | ] 41 | ) 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

UForm

2 |

3 | Pocket-Sized Multimodal AI
4 | For Content Understanding and Generation
5 |

6 |
7 | 8 |

9 | Discord 10 |       11 | LinkedIn 12 |       13 | Twitter 14 |       15 | Blog 16 |       17 | GitHub 18 |

19 | 20 |

21 | Multimodal Embeddings from 64 to 768 Dimensions • 1B Parameter Chat 22 |
23 | Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents 24 |
25 | ONNX • CoreML • PyTorch 26 |
27 | Python 28 | • 29 | JavaScript 30 | • 31 | Swift 32 |

33 | 34 | --- 35 | 36 | ![UForm Chat Preview](https://github.com/ashvardanian/usearch-images/blob/main/assets/uform-gen-preview.jpg?raw=true) 37 | 38 | Welcome to UForm, a __multimodal__ AI library that's as versatile as it is efficient. 39 | UForm [tiny embedding models](#encoder) will help you understand and search visual and textual content across various languages. 40 | UForm [small generative models](#decoder), on the other hand, don't only support conversational and chat use-cases, but are great for fast image captioning and Visual Question Answering (VQA). 41 | With compact __custom pre-trained transformer models__, this can run anywhere from your server farm down to your smartphone. 42 | 43 | ## Features 44 | 45 | - __Tiny Embeddings__: 64-dimensional [Matryoshka][matryoshka]-style embeddings for extremely fast [search][usearch]. 46 | - __Throughput__: Thanks to the small size, the inference speed is [2-4x faster](#speed) than competitors. 47 | - __Portable__: Models come with native ONNX support, making them easy to deploy on any platform. 48 | - __Quantization Aware__: Down-cast embeddings from `f32` to `i8` without losing much recall. 49 | - __Multilingual__: Trained on a balanced dataset, the recall is great across over 20 languages. 50 | 51 | [usearch]: https://github.com/unum-cloud/usearch 52 | [matryoshka]: https://arxiv.org/abs/2205.13147 53 | 54 | ## Models 55 | 56 | For accuracy and speed benchmarks refer to the [evaluation page](https://github.com/unum-cloud/uform/blob/main/BENCHMARKS.md). 57 | 58 | ### Embedding Models 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 |
ModelParametersLanguagesArchitecture
uform3-image-text-english-large 🆕365 M112 layer BERT, ViT-L/14
uform3-image-text-english-base143 M14 layer BERT, ViT-B/16
uform3-image-text-english-small 🆕79 M14 layer BERT, ViT-S/16
uform3-image-text-multilingual-base206M2112 layer BERT, ViT-B/16
96 | 97 | ### Generative Models 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 |
ModelParametersPurposeArchitecture
uform-gen2-dpo 🆕1.2 BChat, Image Captioning, VQAqwen1.5-0.5B, ViT-H/14
uform-gen2-qwen-500m1.2 BChat, Image Captioning, VQAqwen1.5-0.5B, ViT-H/14
uform-gen ⚠️1.5 BImage Captioning, VQAllama-1.3B, ViT-B/16
129 | 130 | ## Quick Start Examples 131 | 132 | ### Embedding Models 133 | 134 | First, `pip install uform`. 135 | Then, load the model: 136 | 137 | ```py 138 | from uform import get_model, Modality 139 | 140 | processors, models = get_model('unum-cloud/uform3-image-text-english-small') 141 | 142 | model_text = models[Modality.TEXT_ENCODER] 143 | model_image = models[Modality.IMAGE_ENCODER] 144 | processor_text = processors[Modality.TEXT_ENCODER] 145 | processor_image = processors[Modality.IMAGE_ENCODER] 146 | ``` 147 | 148 | Embed images: 149 | 150 | ```py 151 | import requests 152 | from io import BytesIO 153 | from PIL import Image 154 | 155 | image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg' 156 | image = Image.open(BytesIO(requests.get(image_url).content)) 157 | image_data = processor_image(image) 158 | image_features, image_embedding = model_image.encode(image_data, return_features=True) 159 | ``` 160 | 161 | Embed queries: 162 | 163 | ```py 164 | text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background' 165 | text_data = processor_text(text) 166 | text_features, text_embedding = model_text.encode(text_data, return_features=True) 167 | ``` 168 | 169 | For more details check out: 170 | 171 | - Python docs on embedding models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#embedding-models) 172 | - JavaScript docs on embedding models in [javascript/README.md](https://github.com/unum-cloud/uform/blob/main/javascript/README.md#embedding-models) 173 | - Swift docs on embedding models in [swift/README.md](https://github.com/unum-cloud/uform/blob/main/swift/README.md#embedding-models) 174 | 175 | ### Generative Models 176 | 177 | The generative models are natively compatible with 178 | 179 | ```python 180 | from transformers import AutoModel, AutoProcessor 181 | 182 | model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True) 183 | processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True) 184 | 185 | prompt = 'Question or Instruction' 186 | image = Image.open('image.jpg') 187 | 188 | inputs = processor(text=[prompt], images=[image], return_tensors='pt') 189 | 190 | with torch.inference_mode(): 191 | output = model.generate( 192 | **inputs, 193 | do_sample=False, 194 | use_cache=True, 195 | max_new_tokens=256, 196 | eos_token_id=151645, 197 | pad_token_id=processor.tokenizer.pad_token_id 198 | ) 199 | prompt_len = inputs['input_ids'].shape[1] 200 | decoded_text = processor.batch_decode(output[:, prompt_len:])[0] 201 | ``` 202 | 203 | For more details check out: 204 | 205 | - Python docs on generative models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#generative-models) 206 | - JavaScript docs on generative models 🔜 207 | - Swift docs on generative models 🔜 208 | 209 | ## Technical Details 210 | 211 | ### Down-casting, Quantization, Matryoshka, and Slicing 212 | 213 | Depending on the application, the embeddings can be down-casted to smaller numeric representations without losing much recall. 214 | Switching from `f32` to `f16` is recommended in almost all cases, unless you are running on very old hardware without half-precision support. 215 | Switching to `i8` with linear scaling is also possible, but will be noticeable in the recall on larger collections with millions of searchable entries. 
216 | Similarly, for higher-dimensional embeddings (512 or 768), a common strategy is to quantize them into single-bit representations for faster search. 217 | 218 | ```python 219 | import numpy as np 220 | 221 | f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False) 222 | f16_embedding: np.ndarray = f32_embedding.astype(np.float16) 223 | i8_embedding: np.ndarray = (f32_embedding * 127).astype(np.int8) 224 | b1_embedding: np.ndarray = np.packbits((f32_embedding > 0).astype(np.uint8)) 225 | ``` 226 | 227 | Alternative approach to quantization is to use the Matryoshka embeddings, where the embeddings are sliced into smaller parts, and the search is performed in a hierarchical manner. 228 | 229 | ```python 230 | import numpy as np 231 | 232 | large_embedding: np.ndarray = model.encode_text(text_data, return_features=False) 233 | small_embedding: np.ndarray = large_embedding[:, :256] 234 | tiny_embedding: np.ndarray = large_embedding[:, :64] 235 | ``` 236 | 237 | Both approaches are natively supported by the [USearch][github-usearch] vector-search engine and the [SimSIMD][github-simsimd] numerics libraries. 238 | When dealing with small collections (up to millions of entries) and looking for low-latency cosine distance calculations, you can [achieve 5x-2500x performance improvement][report-simsimd] over Torch, NumPy, SciPy, and vanilla Python using SimSIMD. 239 | 240 | ```python 241 | from simsimd import cosine, hamming 242 | 243 | distance: float = cosine(f32_embedding, f32_embedding) # 32x SciPy performance on Apple M2 CPU 244 | distance: float = cosine(f16_embedding, f16_embedding) # 79x SciPy performance on Apple M2 CPU 245 | distance: float = cosine(i8_embedding, i8_embedding) # 133x SciPy performance on Apple M2 CPU 246 | distance: float = hamming(b1_embedding, b1_embedding) # 17x SciPy performance on Apple M2 CPU 247 | ``` 248 | 249 | Similarly, when dealing with large collections (up to billions of entries per server) and looking for high-throughput search, you can [achieve 100x performance improvement][report-usearch] over FAISS and other vector-search solutions using USearch. 250 | Here are a couple of examples: 251 | 252 | ```python 253 | from usearch.index import Index 254 | 255 | f32_index = Index(ndim=64, metric='cos', dtype='f32') # for Matryoshka embeddings 256 | f16_index = Index(ndim=64, metric='cos', dtype='f16') # for Matryoshka embeddings 257 | i8_index = Index(ndim=256, metric='cos', dtype='i8') # for quantized embeddings 258 | b1_index = Index(ndim=768, metric='hamming', dtype='b1') # for binary embeddings 259 | ``` 260 | 261 | [github-usearch]: https://github.com/unum-cloud/usearch 262 | [github-simsimd]: https://github.com/ashvardanian/simsimd 263 | [report-usearch]: https://www.unum.cloud/blog/2023-11-07-scaling-vector-search-with-intel 264 | [report-simsimd]: https://ashvardanian.com/posts/python-c-assembly-comparison/ 265 | 266 | ### Compact Packaging 267 | 268 | PyTorch is a heavy dependency to carry, especially if you run on Edge or IoT devices. 269 | Using vanilla ONNX runtime, one can significantly reduce memory consumption and deployment latency. 
270 | 271 | ```sh 272 | $ conda create -n uform_torch python=3.10 -y 273 | $ conda create -n uform_onnx python=3.10 -y 274 | $ conda activate uform_torch && pip install -e ".[torch]" && conda deactivate 275 | $ conda activate uform_onnx && pip install -e ".[onnx]" && conda deactivate 276 | $ du -sh $(conda info --envs | grep 'uform_torch' | awk '{print $2}') 277 | > 5.2G ~/conda/envs/uform_torch 278 | $ du -sh $(conda info --envs | grep 'uform_onnx' | awk '{print $2}') 279 | > 461M ~/conda/envs/uform_onnx 280 | ``` 281 | 282 | Most of that weight can be further reduced down to 100 MB for both the model and the runtime. 283 | You can pick one of many supported [ONNX execution providers][onnx-providers], which includes XNNPACK, CUDA and TensorRT for Nvidia GPUs, OpenVINO on Intel, DirectML on Windows, ROCm on AMD, CoreML on Apple devices, and more to come. 284 | 285 | [onnx-providers]: https://onnxruntime.ai/docs/execution-providers/ 286 | 287 | ### Multimodal Chat in CLI 288 | 289 | The generative models can be used for chat-like experiences in the command line. 290 | For that, you can use the `uform-chat` CLI tool, which is available in the UForm package. 291 | 292 | ```bash 293 | $ pip install uform 294 | $ uform-chat --model unum-cloud/uform-gen2-dpo --image=zebra.jpg 295 | $ uform-chat --model unum-cloud/uform-gen2-dpo \ 296 | > --image="https://bit.ly/3tIVg9M" \ 297 | > --device="cuda:0" \ 298 | > --fp16 299 | ``` 300 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 3.1.1 2 | -------------------------------------------------------------------------------- /assets/model_types_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unum-cloud/uform/33d5df7951cf3bee8b14d1110cc3bbae1ff6fba8/assets/model_types_bg.png -------------------------------------------------------------------------------- /assets/unum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unum-cloud/uform/33d5df7951cf3bee8b14d1110cc3bbae1ff6fba8/assets/unum.png -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = ../build/docs 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | p.caption { 2 | font-size: 0 !important; 3 | margin: 8px 0px !important; 4 | padding: 0 !important; 5 | border-bottom: 1px solid #8b7f8b12; 6 | } 7 | 8 | article>section>h1:nth-child(1) { 9 | display: none; 10 | } 11 | 12 | .sidebar-brand-text { 13 | cursor: initial; 14 | } 15 | 16 | table>tbody>tr>td { 17 | text-align: center; 18 | } 19 | 20 | table>tbody>tr>td:first-child { 21 | text-align: left; 22 | } 23 | 24 | #overview>p>a>img { 25 | height: 25px !important; 26 | } 27 | -------------------------------------------------------------------------------- /docs/_static/custom.js: -------------------------------------------------------------------------------- 1 | $(document).ready(function () { 2 | const github_logo = ` 3 | 4 | ` 5 | 6 | $(".sidebar-brand-text").html("Unum · UForm
2.1.1" + github_logo) 7 | }) 8 | -------------------------------------------------------------------------------- /docs/benchmarks.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Benchmarks 3 | ==================== 4 | 5 | .. mdinclude:: ../BENCHMARKS.md -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = "Unum · UForm" 10 | copyright = "2023, Unum" 11 | author = "Unum" 12 | release = open("../VERSION", "r").read().strip() 13 | with open("_static/custom.js", "r+") as js: 14 | content = js.read() 15 | js.seek(0) 16 | js.truncate() 17 | js.write(content.replace("$(VERSION)", release)) 18 | 19 | # -- General configuration --------------------------------------------------- 20 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 21 | 22 | extensions = [ 23 | "breathe", 24 | "m2r2", 25 | "sphinx.ext.autodoc", 26 | "sphinx_js", 27 | "sphinx.ext.autosummary", 28 | "sphinx.ext.intersphinx", 29 | "sphinx.ext.napoleon", 30 | "sphinxcontrib.jquery", 31 | "sphinxcontrib.googleanalytics", 32 | ] 33 | 34 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "*.md"] 35 | 36 | googleanalytics_id = "341385789" 37 | googleanalytics_enabled = True 38 | 39 | # -- Options for HTML output ------------------------------------------------- 40 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 41 | 42 | html_logo = "../assets/unum.png" 43 | html_theme = "furo" 44 | html_static_path = ["_static"] 45 | html_css_files = ["custom.css"] 46 | html_js_files = ["custom.js"] 47 | html_baseurl = "/docs/uform/" 48 | 49 | breathe_projects = {"UForm": "../build/xml"} 50 | breathe_default_project = "UForm" 51 | 52 | js_source_path = "../javascript/" 53 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Contributing 3 | ==================== 4 | 5 | .. mdinclude:: ../CONTRIBUTING.md -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Overview 3 | ==================== 4 | .. mdinclude:: ../README.md 5 | 6 | .. toctree:: 7 | :hidden: 8 | :caption: � 9 | 10 | python/index 11 | javascript/index 12 | swift/index 13 | 14 | .. toctree:: 15 | :hidden: 16 | :caption: � 17 | 18 | contributing 19 | benchmarks 20 | 21 | .. toctree:: 22 | :hidden: 23 | :caption: � 24 | 25 | genindex 26 | -------------------------------------------------------------------------------- /docs/javascript/index.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | JavaScript SDK 3 | ==================== 4 | 5 | 6 | .. mdinclude:: ../../javascript/README.md 7 | 8 | .. 
toctree:: 9 | :hidden: 10 | -------------------------------------------------------------------------------- /docs/javascript/reference.rst.txt: -------------------------------------------------------------------------------- 1 | API Reference 2 | ==================== 3 | 4 | ==================== 5 | Encoders 6 | ==================== 7 | 8 | .. js:autoclass:: ../javascript/encoders.TextProcessor 9 | :members: 10 | 11 | .. js:autoclass:: ../javascript/encoders.ImageProcessor 12 | :members: 13 | 14 | .. js:autoclass:: ../javascript/encoders.TextEncoder 15 | :members: 16 | 17 | .. js:autoclass:: ../javascript/encoders.ImageEncoder 18 | :members: 19 | -------------------------------------------------------------------------------- /docs/python/index.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Python SDK 3 | ==================== 4 | 5 | 6 | .. mdinclude:: ../../python/README.md 7 | 8 | .. toctree:: 9 | :hidden: 10 | 11 | reference -------------------------------------------------------------------------------- /docs/python/reference.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ==================== 3 | 4 | ==================== 5 | Root 6 | ==================== 7 | 8 | .. automodule:: uform 9 | :members: 10 | :undoc-members: 11 | 12 | ==================== 13 | Torch Encoders 14 | ==================== 15 | 16 | .. automodule:: uform.torch_encoders 17 | :members: 18 | :undoc-members: 19 | 20 | ==================== 21 | Torch Processors 22 | ==================== 23 | 24 | .. automodule:: uform.torch_processors 25 | :members: 26 | :undoc-members: 27 | 28 | ==================== 29 | ONNX Encoders 30 | ==================== 31 | 32 | .. automodule:: uform.onnx_encoders 33 | :members: 34 | :undoc-members: 35 | 36 | ==================== 37 | NumPy Processors 38 | ==================== 39 | 40 | .. automodule:: uform.numpy_processors 41 | :members: 42 | :undoc-members: 43 | -------------------------------------------------------------------------------- /docs/swift/index.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Swift SDK 3 | ==================== 4 | 5 | 6 | .. mdinclude:: ../../swift/README.md 7 | -------------------------------------------------------------------------------- /javascript/README.md: -------------------------------------------------------------------------------- 1 | # UForm for JavaScript 2 | 3 | The UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your JavaScript applications. 4 | Built around ONNX, the SDK is designed to work with most runtimes and almost any hardware. 5 | 6 | ## Installation 7 | 8 | There are several ways to install the UForm JavaScript SDK from NPM.
9 | 10 | ```bash 11 | pnpm add @unum-cloud/uform 12 | npm add @unum-cloud/uform 13 | yarn add @unum-cloud/uform 14 | ``` 15 | 16 | ## Quick Start 17 | 18 | ### Embeddings 19 | 20 | ```js 21 | import { getModel, Modality, TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from '@unum-cloud/uform'; 22 | import assert from 'assert'; 23 | 24 | const { configPath, modalityPaths, tokenizerPath } = await getModel( 25 | 'unum-cloud/uform3-image-text-english-small', 26 | [Modality.TextEncoder, Modality.ImageEncoder]); 27 | 28 | const textProcessor = new TextProcessor(configPath, tokenizerPath); 29 | await textProcessor.init(); 30 | const processedTexts = await textProcessor.process(["a small red panda in a zoo"]); 31 | 32 | const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); 33 | await textEncoder.init(); 34 | const textOutput = await textEncoder.encode(processedTexts); 35 | assert(textOutput.embeddings.dims.length === 2, "Output should be 2D"); 36 | await textEncoder.dispose(); 37 | 38 | const imageProcessor = new ImageProcessor(configPath); 39 | await imageProcessor.init(); 40 | const processedImages = await imageProcessor.process("path/to/image.png"); 41 | 42 | const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); 43 | await imageEncoder.init(); 44 | const imageOutput = await imageEncoder.encode(processedImages); 45 | assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D"); 46 | ``` 47 | 48 | The `textOutput` and `imageOutput` contain `features` and `embeddings` properties, matching the ones returned by the Python SDK. 49 | The embeddings can later be compared using cosine similarity or other distance metrics. 50 | 51 | ### Generative Models 52 | 53 | Coming soon ... 54 | 55 | ## Technical Details 56 | 57 | ### Faster Search 58 | 59 | Depending on the application, the embeddings can be down-cast to smaller numeric representations without losing much recall. 60 | Independent of the quantization level, native JavaScript functionality may be too slow for large-scale search. 61 | In such cases, consider using [USearch][github-usearch] or [SimSimD][github-simsimd]. 62 | 63 | [github-usearch]: https://github.com/unum-cloud/usearch 64 | [github-simsimd]: https://github.com/ashvardanian/simsimd 65 | -------------------------------------------------------------------------------- /javascript/encoders.mjs: -------------------------------------------------------------------------------- 1 | import { readFileSync } from 'fs'; 2 | import { InferenceSession, Tensor } from 'onnxruntime-node'; 3 | import { PreTrainedTokenizer } from '@xenova/transformers'; 4 | import sharp from 'sharp'; 5 | 6 | /** 7 | * A processor for text data that prepares input for the text encoder model. 8 | */ 9 | class TextProcessor { 10 | 11 | /** 12 | * Constructs a new TextProcessor instance. 13 | * 14 | * @param {string} configPath - The path to the configuration file for the text encoder. 15 | * @param {string} tokenizerPath - The path to the tokenizer configuration file. 16 | */ 17 | constructor(configPath, tokenizerPath) { 18 | this.configPath = configPath; 19 | this.tokenizerPath = tokenizerPath; 20 | 21 | this.maxSeqLen = 0; 22 | this.padTokenIdx = 0; 23 | this.tokenizer = null; 24 | } 25 | 26 | /** 27 | * Initializes the TextProcessor by loading configurations and setting up the tokenizer.
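* Reads `max_position_embeddings` and `padding_idx` from the encoder config and applies them to the tokenizer.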
28 | */ 29 | async init() { 30 | var config = JSON.parse(readFileSync(this.configPath, { encoding: 'utf8' })); 31 | if (config.text_encoder !== undefined) { 32 | config = config.text_encoder; 33 | } 34 | 35 | this.maxSeqLen = config.max_position_embeddings; 36 | this.padTokenIdx = config.padding_idx; 37 | 38 | const tokenizerConfig = JSON.parse(readFileSync(this.tokenizerPath, { encoding: 'utf8' })); 39 | this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config); 40 | this.tokenizer.model_max_length = this.maxSeqLen; 41 | this.tokenizer.pad_token_id = this.padTokenIdx; 42 | } 43 | 44 | /** 45 | * Processes a list of text strings into model-ready format, including padding and attention masks. 46 | * 47 | * @param {Array} texts - An array of text strings to process. 48 | * @return {Object} The processed texts as model input features. 49 | */ 50 | async process(texts) { 51 | 52 | const encoded = await this.tokenizer(texts, { 53 | add_special_tokens: true, 54 | padding: 'max_length', 55 | max_length: this.maxSeqLen, 56 | truncation: true, 57 | }); 58 | 59 | return { 60 | 'input_ids': encoded.input_ids, 61 | 'attention_mask': encoded.attention_mask, 62 | }; 63 | } 64 | } 65 | 66 | /** 67 | * An encoder for text data that uses a pre-trained model to encode text. 68 | */ 69 | class TextEncoder { 70 | 71 | /** 72 | * Constructs a new TextEncoder instance. 73 | * 74 | * @param {string} modelPath - The path to the pre-trained ONNX model. 75 | */ 76 | constructor(modelPath) { 77 | this.modelPath = modelPath; 78 | this.session = null; 79 | } 80 | 81 | /** 82 | * Initializes the ONNX session with the pre-trained model. 83 | */ 84 | async init() { 85 | this.session = await InferenceSession.create(this.modelPath); 86 | } 87 | 88 | /** 89 | * Releases the ONNX session resources. 90 | */ 91 | async dispose() { 92 | if (this.session) { 93 | await this.session.release().catch(error => console.error("Failed to release session", error)); 94 | this.session = null; 95 | } 96 | } 97 | 98 | /** 99 | * Encodes the input data using the pre-trained model. 100 | * 101 | * @param {Object} inputs - The input data containing input_ids and attention_mask. 102 | * @return {Object} The encoded outputs from the model. 
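* @throws {Error} If the ONNX session has not been initialized with `init()`.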
103 | */ 104 | async encode(inputs) { 105 | if (!this.session) { 106 | throw new Error("Session is not initialized."); 107 | } 108 | 109 | // Helper function to convert BigInt64Array to Int32Array or validate Int32Array 110 | function ensureInt32Array(data) { 111 | if (data instanceof Int32Array) { 112 | return data; // Use as is if already Int32Array 113 | } 114 | if (data instanceof BigInt64Array) { 115 | // Convert BigInt64Array to Int32Array, ensuring all values are in range 116 | return new Int32Array(Array.from(data).map(bigInt => { 117 | if (bigInt > 2147483647n || bigInt < -2147483648n) { 118 | throw new Error("Value out of range for Int32."); 119 | } 120 | return Number(bigInt); // Convert BigInt to Number 121 | })); 122 | } 123 | // Additional case: handle conversion from generic Arrays or other typed arrays to Int32Array 124 | if (Array.isArray(data) || data instanceof Uint32Array || data instanceof Uint8Array) { 125 | return new Int32Array(data); // Convert directly 126 | } 127 | throw new Error("Unsupported data type for tensor conversion."); 128 | } 129 | 130 | // Prepare tensor data 131 | const inputIDsData = ensureInt32Array(inputs.input_ids.data); 132 | const attentionMaskData = ensureInt32Array(inputs.attention_mask.data); 133 | 134 | // Create ONNX Tensors as 'int32' 135 | const inputIDs = new Tensor('int32', inputIDsData, inputs.input_ids.dims); 136 | const attentionMask = new Tensor('int32', attentionMaskData, inputs.attention_mask.dims); 137 | 138 | // Run model inference 139 | return this.session.run({ 140 | input_ids: inputIDs, 141 | attention_mask: attentionMask, 142 | }); 143 | } 144 | 145 | } 146 | 147 | /** 148 | * A processor for image data that prepares images for the image encoder model. 149 | */ 150 | class ImageProcessor { 151 | constructor(configPath) { 152 | this.configPath = configPath; 153 | } 154 | 155 | /** 156 | * Initializes the ImageProcessor by loading configuration settings for image preprocessing. 157 | */ 158 | async init() { 159 | var config = JSON.parse(readFileSync(this.configPath, 'utf8')); 160 | if (config.image_encoder !== undefined) { 161 | config = config.image_encoder; 162 | } 163 | 164 | this.imageSize = config.image_size; 165 | this.normalizationMeans = config.normalization_means; 166 | this.normalizationDeviations = config.normalization_deviations; 167 | 168 | this.imageMean = new Float32Array(this.normalizationMeans); 169 | this.imageStd = new Float32Array(this.normalizationDeviations); 170 | } 171 | /** 172 | * Processes raw image data into a model-ready format, including resizing, cropping, and normalizing. 173 | * 174 | * @param {Buffer|Array} images - A single image or an array of images to process. 175 | * @return {Array} The processed image data as an array of Float32Arrays. 
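* Each image is resized so its shorter side matches `imageSize`, center-cropped to a square, reordered to CHW, and normalized per channel.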
176 | */ 177 | async process(images) { 178 | const processSingle = async (image) => { 179 | let img = sharp(image).toColorspace('srgb'); 180 | const metadata = await img.metadata(); 181 | const scale = this.imageSize / Math.min(metadata.width, metadata.height); 182 | const scaledWidth = Math.ceil(metadata.width * scale); 183 | const scaledHeight = Math.ceil(metadata.height * scale); 184 | img = img.resize({ 185 | width: scaledWidth, 186 | height: scaledHeight, 187 | fit: sharp.fit.cover, 188 | position: sharp.strategy.entropy, 189 | options: sharp.interpolators.bicubic 190 | }).extract({ 191 | left: Math.max(0, Math.floor((scaledWidth - this.imageSize) / 2)), 192 | top: Math.max(0, Math.floor((scaledHeight - this.imageSize) / 2)), 193 | width: this.imageSize, 194 | height: this.imageSize 195 | }).removeAlpha(); 196 | 197 | let buffer = await img.raw().toBuffer(); 198 | let array = new Float32Array(buffer.length); 199 | 200 | // When we export into the `array`, we reorder the dimensions of the tensor 201 | // from HWC to CHW, and normalize the pixel values. 202 | let channelSize = this.imageSize * this.imageSize; 203 | for (let i = 0; i < this.imageSize * this.imageSize; i++) { 204 | let r = buffer[i * 3]; 205 | let g = buffer[i * 3 + 1]; 206 | let b = buffer[i * 3 + 2]; 207 | array[i] = (r / 255.0 - this.imageMean[0]) / this.imageStd[0]; 208 | array[channelSize + i] = (g / 255.0 - this.imageMean[1]) / this.imageStd[1]; 209 | array[channelSize * 2 + i] = (b / 255.0 - this.imageMean[2]) / this.imageStd[2]; 210 | } 211 | 212 | return array; 213 | }; 214 | 215 | if (Array.isArray(images)) { 216 | return Promise.all(images.map(img => processSingle(img))); 217 | } else { 218 | return [await processSingle(images)]; 219 | } 220 | } 221 | } 222 | 223 | /** 224 | * An encoder for image data that uses a pre-trained model to encode images. 225 | */ 226 | class ImageEncoder { 227 | constructor(modelPath, processor) { 228 | this.modelPath = modelPath; 229 | this.imageSize = processor.imageSize; 230 | } 231 | 232 | /** 233 | * Initializes the ONNX session with the pre-trained model. 234 | */ 235 | async init() { 236 | this.session = await InferenceSession.create(this.modelPath); 237 | } 238 | 239 | /** 240 | * Releases the ONNX session resources. 241 | */ 242 | async dispose() { 243 | if (this.session) { 244 | await this.session.release().catch(error => console.error("Failed to release session", error)); 245 | this.session = null; 246 | } 247 | } 248 | 249 | /** 250 | * Encodes the processed image data using the pre-trained model. 251 | * 252 | * @param {Float32Array|Array} images - The processed image data. 253 | * @return {Object} The encoded outputs from the model. 254 | */ 255 | async encode(images) { 256 | if (!this.session) { 257 | throw new Error("Session is not initialized."); 258 | } 259 | 260 | // Helper function to ensure data is a Float32Array. 261 | const ensureFloat32Array = (data) => { 262 | if (!(data instanceof Float32Array)) { 263 | throw new Error("Unsupported data type for tensor conversion."); 264 | } 265 | return data; 266 | }; 267 | 268 | // Helper function to concatenate multiple Float32Arrays into a single Float32Array. 
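// It is used below when a batch of pre-processed images arrives as an array of per-image tensors.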
269 | const concatFloat32Arrays = (arrays) => { 270 | const totalLength = arrays.reduce((acc, val) => acc + val.length, 0); 271 | const result = new Float32Array(totalLength); 272 | let offset = 0; 273 | for (let arr of arrays) { 274 | result.set(arr, offset); 275 | offset += arr.length; 276 | } 277 | return result; 278 | }; 279 | 280 | let imagesData; 281 | let dims; 282 | 283 | if (Array.isArray(images)) { 284 | // Assuming each image in the array is a Float32Array representing an image already processed to a fixed size. 285 | const arrays = images.map(ensureFloat32Array); 286 | imagesData = concatFloat32Arrays(arrays); 287 | const numImages = arrays.length; 288 | const numChannels = 3; 289 | const height = this.imageSize; 290 | const width = this.imageSize; 291 | dims = [numImages, numChannels, height, width]; 292 | } else { 293 | // Single image images, which is already a Float32Array. 294 | imagesData = ensureFloat32Array(images); 295 | const numChannels = 3; 296 | const height = this.imageSize; 297 | const width = this.imageSize; 298 | dims = [1, numChannels, height, width]; 299 | } 300 | 301 | // Create ONNX Tensor 302 | const imagesTensor = new Tensor('float32', imagesData, dims); 303 | 304 | // Run model inference 305 | return this.session.run({ 306 | images: imagesTensor, 307 | }); 308 | } 309 | } 310 | 311 | export { TextProcessor, TextEncoder, ImageProcessor, ImageEncoder }; 312 | -------------------------------------------------------------------------------- /javascript/encoders_test.js: -------------------------------------------------------------------------------- 1 | import { existsSync, readFileSync } from 'fs'; 2 | import { fileURLToPath } from 'url'; 3 | import path from 'path'; 4 | import assert from 'assert'; 5 | import fetch from 'node-fetch'; 6 | 7 | import { getModel, Modality, TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from './index.mjs'; 8 | 9 | // Check if the HuggingFace Hub API token is set in the environment variable. 
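// If it is missing, fall back to a local `.hf_token` file in the repository root.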
10 | let hf_token = process.env.HUGGINGFACE_HUB_TOKEN; 11 | if (!hf_token) { 12 | const dirname = path.dirname(fileURLToPath(import.meta.url)); 13 | const tokenPath = path.join(dirname, '../', '.hf_token'); 14 | if (existsSync(tokenPath)) { 15 | hf_token = readFileSync(tokenPath, 'utf8').trim(); 16 | } 17 | } 18 | 19 | async function tryGettingCheckpoint(modelId, modalities) { 20 | const { configPath, modalityPaths, tokenizerPath } = await getModel( 21 | modelId, 22 | modalities, 23 | hf_token, 24 | '.onnx' 25 | ); 26 | 27 | assert(configPath !== null, "Config path should not be null"); 28 | assert(modalityPaths !== null, "Modality paths should not be null"); 29 | assert(tokenizerPath !== null, "Tokenizer path should not be null"); 30 | 31 | // Check if the file actually exists 32 | assert(existsSync(configPath), `Config file should exist at ${configPath}`); 33 | assert(existsSync(tokenizerPath), `Tokenizer file should exist at ${tokenizerPath}`); 34 | for (const modalityPath of Object.values(modalityPaths)) { 35 | assert(existsSync(modalityPath), `Modality file should exist at ${modalityPath}`); 36 | } 37 | } 38 | 39 | async function testGetCheckpoint() { 40 | console.log("- `testGetCheckpoint`: Start"); 41 | 42 | try { 43 | const modalities = [Modality.TextEncoder, Modality.ImageEncoder]; 44 | 45 | for (const modelId of [ 46 | 'unum-cloud/uform3-image-text-english-small', 47 | 'unum-cloud/uform3-image-text-english-base', 48 | 'unum-cloud/uform3-image-text-english-large', 49 | 'unum-cloud/uform3-image-text-multilingual-base', 50 | ]) { 51 | await tryGettingCheckpoint(modelId, modalities, hf_token); 52 | } 53 | 54 | console.log("- `testGetCheckpoint`: Success"); 55 | } catch (error) { 56 | console.error("- `testGetCheckpoint`: Failed", error); 57 | } 58 | } 59 | 60 | async function tryTextEncoderForwardPass(modelId) { 61 | const modalities = [Modality.TextEncoder]; 62 | const { configPath, modalityPaths, tokenizerPath } = await getModel( 63 | modelId, 64 | modalities, 65 | hf_token, 66 | '.onnx' 67 | ); 68 | 69 | const textProcessor = new TextProcessor(configPath, tokenizerPath); 70 | await textProcessor.init(); 71 | const processedTexts = await textProcessor.process("a small red panda in a zoo"); 72 | 73 | const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); 74 | await textEncoder.init(); 75 | const textOutput = await textEncoder.encode(processedTexts); 76 | assert(textOutput.embeddings.dims.length === 2, "Output should be 2D"); 77 | 78 | await textEncoder.dispose(); 79 | } 80 | 81 | async function tryImageEncoderForwardPass(modelId) { 82 | const modalities = [Modality.ImageEncoder]; 83 | const { configPath, modalityPaths } = await getModel( 84 | modelId, 85 | modalities, 86 | hf_token, 87 | '.onnx' 88 | ); 89 | 90 | const imageProcessor = new ImageProcessor(configPath); 91 | await imageProcessor.init(); 92 | const processedImages = await imageProcessor.process("assets/unum.png"); 93 | 94 | const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); 95 | await imageEncoder.init(); 96 | const imageOutput = await imageEncoder.encode(processedImages); 97 | assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D"); 98 | 99 | await imageEncoder.dispose(); 100 | } 101 | 102 | function cosineSimilarity(vecA, vecB) { 103 | // We may be receiving a complex tensor type, so let's check if it 104 | // has an array member named `data`. 
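// ONNX Runtime tensors expose their raw values through that `data` member.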
105 | if (vecA.data) { 106 | vecA = vecA.data; 107 | } 108 | if (vecB.data) { 109 | vecB = vecB.data; 110 | } 111 | 112 | let dotProduct = 0.0; 113 | let normA = 0.0; 114 | let normB = 0.0; 115 | for (let i = 0; i < vecA.length; i++) { 116 | dotProduct += vecA[i] * 1.0 * vecB[i]; 117 | normA += vecA[i] * 1.0 * vecA[i]; 118 | normB += vecB[i] * 1.0 * vecB[i]; 119 | } 120 | if (normA === 0 || normB === 0) { 121 | return 0; 122 | } else { 123 | return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)); 124 | } 125 | } 126 | 127 | async function fetchImage(url) { 128 | const response = await fetch(url); 129 | const arrayBuffer = await response.arrayBuffer(); 130 | const buffer = Buffer.from(arrayBuffer); 131 | return buffer; 132 | } 133 | 134 | async function tryCrossReferencingImageAndText(modelId) { 135 | 136 | const modalities = [Modality.ImageEncoder, Modality.TextEncoder]; 137 | const { configPath, modalityPaths, tokenizerPath } = await getModel( 138 | modelId, 139 | modalities, 140 | hf_token, 141 | '.onnx' 142 | ); 143 | 144 | const imageProcessor = new ImageProcessor(configPath); 145 | await imageProcessor.init(); 146 | const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); 147 | await imageEncoder.init(); 148 | const textProcessor = new TextProcessor(configPath, tokenizerPath); 149 | await textProcessor.init(); 150 | const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); 151 | await textEncoder.init(); 152 | 153 | const texts = [ 154 | "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", 155 | "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", 156 | "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", 157 | "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", 158 | "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", 159 | ]; 160 | const imageUrls = [ 161 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true", 162 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true", 163 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true", 164 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true", 165 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true", 166 | ]; 167 | 168 | const textEmbeddings = []; 169 | const imageEmbeddings = []; 170 | 171 | for (let i = 0; i < texts.length; i++) { 172 | const text = texts[i]; 173 | const imageUrl = imageUrls[i]; 174 | const imageBuffer = await fetchImage(imageUrl); 175 | 176 | const processedText = await textProcessor.process(text); 177 | const processedImage = await imageProcessor.process(imageBuffer); 178 | 179 | const textEmbedding = await 
textEncoder.encode(processedText); 180 | const imageEmbedding = await imageEncoder.encode(processedImage); 181 | 182 | textEmbeddings.push(new Float32Array(textEmbedding.embeddings.data)); 183 | imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.data)); 184 | 185 | // Print-based debugging at its best :) 186 | // console.log(`Text: ${text}, Image: ${imageUrl}`); 187 | // console.log(`Text embedding first components: ${textEmbeddings[i].slice(0, 5)}`); 188 | // console.log(`Image embedding first components: ${imageEmbeddings[i].slice(0, 5)}`); 189 | console.log(`Similarity: ${cosineSimilarity(textEmbeddings[i], imageEmbeddings[i])}`) 190 | } 191 | 192 | for (let i = 0; i < texts.length; i++) { 193 | const pairSimilarity = cosineSimilarity(textEmbeddings[i], imageEmbeddings[i]); 194 | const otherTextSimilarities = textEmbeddings.map((te, idx) => idx === i ? -Infinity : cosineSimilarity(te, imageEmbeddings[i])); 195 | const otherImageSimilarities = imageEmbeddings.map((ie, idx) => idx === i ? -Infinity : cosineSimilarity(textEmbeddings[i], ie)); 196 | 197 | const maxOtherTextSimilarity = Math.max(...otherTextSimilarities); 198 | const maxOtherImageSimilarity = Math.max(...otherImageSimilarities); 199 | 200 | assert(pairSimilarity > maxOtherTextSimilarity, "Text should be more similar to its corresponding image than to other images."); 201 | assert(pairSimilarity > maxOtherImageSimilarity, "Image should be more similar to its corresponding text than to other texts."); 202 | } 203 | 204 | await textEncoder.dispose(); 205 | await imageEncoder.dispose(); 206 | } 207 | 208 | async function testEncoders() { 209 | console.log("- `testEncoders`: Start"); 210 | 211 | try { 212 | 213 | // Go through the bi-modal models 214 | for (const modelId of [ 215 | 'unum-cloud/uform3-image-text-english-small', 216 | // 'unum-cloud/uform3-image-text-english-base', 217 | // 'unum-cloud/uform3-image-text-english-large', 218 | // 'unum-cloud/uform3-image-text-multilingual-base', 219 | ]) { 220 | await tryTextEncoderForwardPass(modelId, hf_token); 221 | await tryImageEncoderForwardPass(modelId, hf_token); 222 | await tryCrossReferencingImageAndText(modelId, hf_token); 223 | } 224 | 225 | console.log("- `testEncoders`: Success"); 226 | } catch (error) { 227 | console.error("- `testEncoders`: Failed", error); 228 | } 229 | } 230 | 231 | process.on('uncaughtException', (error) => { 232 | console.error('Uncaught Exception:', error); 233 | }); 234 | 235 | testGetCheckpoint(); 236 | testEncoders(); 237 | -------------------------------------------------------------------------------- /javascript/hub.mjs: -------------------------------------------------------------------------------- 1 | import { join } from "path" 2 | import { createWriteStream, existsSync, mkdirSync, writeFileSync } from "fs"; 3 | 4 | import { downloadFile, listFiles } from "@huggingface/hub"; 5 | 6 | const Modality = { 7 | TextEncoder: "text_encoder", 8 | ImageEncoder: "image_encoder", 9 | VideoEncoder: "video_encoder", 10 | TextDecoder: "text_decoder", 11 | }; 12 | 13 | function isModality(value) { 14 | return Object.values(Modality).includes(value); 15 | } 16 | 17 | function normalizeModalities(modalities) { 18 | return modalities.map(x => { 19 | if (typeof x === "string") { 20 | if (isModality(x)) { 21 | return x; 22 | } else { 23 | throw new Error(`Invalid modality: ${x}`); 24 | } 25 | } 26 | return x; 27 | }); 28 | } 29 | 30 | async function ensureDirectoryExists(dirPath) { 31 | if (!existsSync(dirPath)) { 32 | mkdirSync(dirPath, { 
recursive: true }); 33 | } 34 | } 35 | 36 | async function getModel(modelId, modalities, token = null, format = '.onnx', saveDir = './models') { 37 | modalities = normalizeModalities(modalities); 38 | 39 | const configNames = ['config.json']; 40 | const tokenizerNames = ['tokenizer.json']; 41 | const modelFileNames = modalities.map(modality => `${modality}${format}`); 42 | const allowedPatterns = [...modelFileNames, ...configNames, ...tokenizerNames]; 43 | 44 | const repo = { type: "model", name: modelId }; 45 | const credentials = token ? { accessToken: token } : undefined; 46 | 47 | let configPath = null; 48 | let tokenizerPath = null; 49 | const modalityPaths = {}; 50 | const modelSaveDir = join(saveDir, modelId); 51 | 52 | await ensureDirectoryExists(modelSaveDir); 53 | 54 | const fileIterator = listFiles({ repo, recursive: true, credentials }); 55 | for await (const file of fileIterator) { 56 | const fileName = file.path.split('/').pop(); 57 | if (fileName && allowedPatterns.includes(fileName)) { 58 | const filePath = file.path; 59 | const savePath = join(modelSaveDir, fileName); 60 | 61 | if (configNames.includes(fileName)) { 62 | configPath = savePath; 63 | } else if (tokenizerNames.includes(fileName)) { 64 | tokenizerPath = savePath; 65 | } else { 66 | const modalityName = fileName.split('.')[0]; 67 | modalityPaths[modalityName] = savePath; 68 | } 69 | 70 | const response = await downloadFile({ repo, path: filePath, credentials }); 71 | if (response) { 72 | // HuggingFace might be defining the `env.localModelPath` variable 73 | // to store the downloaded files in a local directory. 74 | // Let's check if the file is there. 75 | // const localPath = join(env.localModelPath, repo, filePath); 76 | // if (existsSync(localPath)) { 77 | // console.log(`File already exists locally at ${localPath}`); 78 | // } 79 | 80 | if (response.body && response.body.pipe) { 81 | const fileStream = createWriteStream(savePath); 82 | response.body.pipe(fileStream); 83 | await new Promise((resolve, reject) => { 84 | fileStream.on('finish', resolve); 85 | fileStream.on('error', reject); 86 | }); 87 | } else if (response.arrayBuffer) { 88 | // Handle non-streamable response for environments like Node.js 89 | const buffer = await response.arrayBuffer(); 90 | writeFileSync(savePath, Buffer.from(buffer)); 91 | } else { 92 | console.error('Unexpected response type'); 93 | } 94 | console.log(`Downloaded ${fileName} successfully to ${savePath}`); 95 | } else { 96 | console.log('No response received for the file download request.'); 97 | } 98 | } 99 | } 100 | 101 | return { configPath, modalityPaths, tokenizerPath }; 102 | } 103 | 104 | export { getModel, Modality }; 105 | -------------------------------------------------------------------------------- /javascript/index.mjs: -------------------------------------------------------------------------------- 1 | // Re-export everything from hub.mjs 2 | export * from './hub.mjs'; 3 | 4 | // Re-export everything from encoders.mjs 5 | export * from './encoders.mjs'; 6 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@unum-cloud/uform", 3 | "type": "module", 4 | "version": "3.1.1", 5 | "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation", 6 | "dependencies": { 7 | "@huggingface/hub": "^0.14.8", 8 | "@xenova/transformers": "^2.17.0", 9 | "node-fetch": "^3.3.2", 10 | "onnxruntime-node": 
"^1.17.0", 11 | "onnxruntime-web": "^1.17.3" 12 | }, 13 | "devDependencies": { 14 | "nodemon": "^2.0.15" 15 | }, 16 | "scripts": { 17 | "start": "node javascript/encoders.mjs", 18 | "test": "node javascript/encoders_test.js" 19 | }, 20 | "main": "javascript/index.mjs", 21 | "files": [ 22 | "javascript/index.mjs", 23 | "javascript/encoders.mjs", 24 | "javascript/hub.mjs" 25 | ], 26 | "directories": { 27 | "doc": "docs" 28 | }, 29 | "keywords": [ 30 | "AI", 31 | "multimodal", 32 | "content generation", 33 | "huggingface" 34 | ], 35 | "author": "Ash Vardanian, Unum Cloud", 36 | "license": "Apache-2.0" 37 | } 38 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "setuptools.build_meta" 3 | requires = ["setuptools>=42"] 4 | 5 | [project] 6 | authors = [ 7 | {email = "ash.vardanian@unum.cloud", name = "Ash Vardanian"}, 8 | {email = "mike.kim@unum.cloud", name = "Mikhail Kim"}, 9 | {email = "vladimir.orshulevich@unum.cloud", name = "Vladimir Orshulevich"}, 10 | ] 11 | classifiers = [ 12 | "Development Status :: 5 - Production/Stable", 13 | "License :: OSI Approved :: Apache Software License", 14 | "Natural Language :: Chinese (Simplified)", 15 | "Natural Language :: English", 16 | "Natural Language :: French", 17 | "Natural Language :: German", 18 | "Natural Language :: Italian", 19 | "Natural Language :: Japanese", 20 | "Natural Language :: Korean", 21 | "Natural Language :: Polish", 22 | "Natural Language :: Russian", 23 | "Natural Language :: Spanish", 24 | "Natural Language :: Turkish", 25 | "Operating System :: OS Independent", 26 | "Programming Language :: Python :: 3", 27 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 28 | "Topic :: Scientific/Engineering :: Image Processing", 29 | "Topic :: Scientific/Engineering :: Image Recognition", 30 | ] 31 | dependencies = [ 32 | "huggingface_hub>=0.16.4", 33 | "tokenizers>=0.13.3", 34 | "pillow", 35 | "simsimd", 36 | ] 37 | description = "Pocket-Sized Multimodal AI for Content Understanding and Generation" 38 | maintainers = [ 39 | {email = "info@unum.cloud", name = "Unum Cloud"}, 40 | ] 41 | name = "uform" 42 | readme = "README.md" 43 | requires-python = ">=3.7" 44 | version = "3.1.1" 45 | 46 | [project.scripts] 47 | uform-chat = "uform.chat:main" 48 | 49 | [project.optional-dependencies] 50 | torch = ["torch>=1.13.1", "torchvision", "transformers>=4.36.2"] 51 | onnx = ["onnx>=1.15.0", "onnxruntime>=1.17.1", "numpy"] 52 | onnx-gpu = ["onnx>=1.15.0", "onnxruntime-gpu>=1.17.1", "numpy"] 53 | dev = ["pytest", "pandas"] 54 | 55 | [project.urls] 56 | "Homepage" = "https://github.com/unum-cloud/uform" 57 | 58 | [tool.setuptools.packages.find] 59 | where = ["python"] 60 | include = ["uform"] 61 | namespaces = false 62 | 63 | [tool.ruff] 64 | ignore = ["C408", "C901", "E501", "E741"] 65 | ignore-init-module-imports = true 66 | select = ["C", "E", "F", "I", "UP", "W"] 67 | 68 | [tool.ruff.isort] 69 | lines-after-imports = 2 70 | 71 | [tool.ruff.lint.isort] 72 | known-first-party = ["uform"] 73 | 74 | [tool.ruff.per-file-ignores] 75 | "__init__.py" = ["E401"] 76 | 77 | [tool.tomlsort] 78 | all = true 79 | in_place = true 80 | spaces_before_inline_comment = 2 81 | spaces_indent_inline_array = 4 82 | trailing_comma_inline_array = true 83 | 84 | # Configuration options for the Black formatter: 85 | # 
https://black.readthedocs.io/en/latest/usage_and_configuration/the_basics.html#where-black-looks-for-the-file 86 | [tool.black] 87 | line-length = 120 # Set line length to the same value as in `.clang-format` for modern wide screens 88 | target-version = ['py36', 'py312'] # Set target Python versions to 3.6 and 3.12 -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # UForm Python SDK 2 | 3 | The UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your Python applications. 4 | The SDK doesn't require any deep learning knowledge, PyTorch, or CUDA installation, and can run on almost any hardware. 5 | 6 | ## Installation 7 | 8 | There are several ways to install the UForm Python SDK, depending on the backend you want to use. 9 | PyTorch is by far the heaviest, but also the most capable. 10 | ONNX is a lightweight alternative that can run on any CPU, and on some GPUs. 11 | 12 | ```bash 13 | pip install "uform[torch]" # For PyTorch 14 | pip install "uform[onnx]" # For ONNX on CPU 15 | pip install "uform[onnx-gpu]" # For ONNX on GPU, available for some platforms 16 | pip install "uform[torch,onnx]" # For PyTorch and ONNX Python tests 17 | ``` 18 | 19 | ## Quick Start 20 | 21 | ### Embeddings 22 | 23 | Load the model: 24 | 25 | ```py 26 | from uform import get_model, Modality 27 | 28 | model_name = 'unum-cloud/uform3-image-text-english-small' 29 | modalities = [Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER] 30 | processors, models = get_model(model_name, modalities=modalities) 31 | 32 | model_text = models[Modality.TEXT_ENCODER] 33 | model_image = models[Modality.IMAGE_ENCODER] 34 | processor_text = processors[Modality.TEXT_ENCODER] 35 | processor_image = processors[Modality.IMAGE_ENCODER] 36 | ``` 37 | 38 | Embed images: 39 | 40 | ```py 41 | import requests 42 | from io import BytesIO 43 | from PIL import Image 44 | 45 | image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg' 46 | image = Image.open(BytesIO(requests.get(image_url).content)) 47 | image_data = processor_image(image) 48 | image_features, image_embedding = model_image.encode(image_data, return_features=True) 49 | ``` 50 | 51 | Embed queries: 52 | 53 | ```py 54 | text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background' 55 | text_data = processor_text(text) 56 | text_features, text_embedding = model_text.encode(text_data, return_features=True) 57 | ``` 58 | 59 | ### Generative Models 60 | 61 | UForm generative models are fully compatible with the Hugging Face Transformers library, and can be used without installing the UForm library. 62 | Those models can be used to caption images or power multimodal chat experiences.
63 | 64 | ```python 65 | import torch 66 | from PIL import Image 67 | from transformers import AutoModel, AutoProcessor 68 | 69 | model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True) 70 | processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True) 71 | 72 | prompt = 'Question or Instruction' 73 | image = Image.open('image.jpg') 74 | inputs = processor(text=[prompt], images=[image], return_tensors='pt') 75 | with torch.inference_mode(): 76 | output = model.generate( 77 | **inputs, 78 | do_sample=False, 79 | use_cache=True, 80 | max_new_tokens=256, 81 | eos_token_id=151645, 82 | pad_token_id=processor.tokenizer.pad_token_id 83 | ) 84 | prompt_len = inputs['input_ids'].shape[1] 85 | decoded_text = processor.batch_decode(output[:, prompt_len:])[0] 86 | ``` 87 | 88 | You can check examples of different prompts in our demo Gradio spaces on HuggingFace: 89 | 90 | - for [`uform-gen2-qwen-500m`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo) 91 | - for [`uform-gen2-dpo`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-dpo-demo) 92 | 93 | ## Technical Details 94 | 95 | ### Multi-GPU Parallelism 96 | 97 | To achieve higher throughput, you can launch UForm on multiple GPUs. 98 | For that, pick the encoder of the model you want to run in parallel, and wrap it in `nn.DataParallel` (or `nn.DistributedDataParallel`). 99 | 100 | ```python 101 | from uform import get_model, Modality 102 | import torch.nn as nn 103 | 104 | processors, models = get_model('unum-cloud/uform-vl-english-small', backend='torch') 105 | 106 | model_text = models[Modality.TEXT_ENCODER] 107 | model_image = models[Modality.IMAGE_ENCODER] 108 | processor_text = processors[Modality.TEXT_ENCODER] 109 | processor_image = processors[Modality.IMAGE_ENCODER] 110 | 111 | model_text.return_features = False 112 | model_image.return_features = False 113 | model_text_parallel = nn.DataParallel(model_text) 114 | model_image_parallel = nn.DataParallel(model_image) 115 | ``` 116 | 117 | Since we are now dealing with the PyTorch wrapper, make sure to use the `forward` method (instead of `encode`) to get the embeddings, and the `.detach().cpu().numpy()` sequence to bring the data back to more Pythonic NumPy arrays. 118 | 119 | ```python 120 | def get_image_embedding(images: List[Image]): 121 | preprocessed = processor_image(images) 122 | embedding = model_image_parallel.forward(preprocessed) 123 | return embedding.detach().cpu().numpy() 124 | 125 | def get_text_embedding(texts: List[str]): 126 | preprocessed = processor_text(texts) 127 | embedding = model_text_parallel.forward(preprocessed) 128 | return embedding.detach().cpu().numpy() 129 | ``` 130 | 131 | ### ONNX and CUDA 132 | 133 | The configuration process may include a few additional steps, depending on the environment. 134 | When using the CUDA and TensorRT backends with CUDA 12 or newer, make sure to [install the Nvidia toolkit][install-nvidia-toolkit] and the `onnxruntime-gpu` package from the custom repository.
135 | 136 | ```sh 137 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb 138 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 139 | sudo apt-get update 140 | sudo apt-get -y install cuda-toolkit-12 141 | pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ 142 | export CUDA_PATH="/usr/local/cuda-12/bin" 143 | export PATH="/usr/local/cuda-12/bin${PATH:+:${PATH}}" 144 | export LD_LIBRARY_PATH="/usr/local/cuda-12/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" 145 | pytest python/scripts/ -s -x -Wd -v -k onnx 146 | ``` 147 | 148 | [install-nvidia-toolkit]: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu 149 | -------------------------------------------------------------------------------- /python/scripts/bench_decoders.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from time import perf_counter 3 | from dataclasses import dataclass 4 | from typing import List 5 | import argparse 6 | 7 | import requests 8 | import torch 9 | from PIL import Image 10 | from transformers import ( 11 | AutoProcessor, 12 | InstructBlipForConditionalGeneration, 13 | InstructBlipProcessor, 14 | LlavaForConditionalGeneration, 15 | AutoModel, 16 | AutoProcessor, 17 | ) 18 | 19 | from uform.torch_decoders import VLMForCausalLM, VLMProcessor 20 | 21 | dtype = torch.bfloat16 22 | low_cpu_mem_usage = False 23 | device = "cuda:0" 24 | 25 | 26 | @dataclass 27 | class BenchmarkResult: 28 | model_name: str 29 | device_name: str 30 | backend_name: str 31 | duration_image_preprocessing: float 32 | duration_image_embedding: float 33 | duration_text_preprocessing: float 34 | duration_text_embedding: float 35 | 36 | 37 | def caption(model, processor, prompt: str, image: Image.Image, max_length: int, batch_size: int) -> List[str]: 38 | # BLIP models require the prompt to be the first argument 39 | prompt = [prompt] * batch_size 40 | image = [image] * batch_size 41 | try: 42 | inputs = processor(prompt, image, return_tensors="pt") 43 | except ValueError: 44 | inputs = processor(image, prompt, return_tensors="pt") 45 | 46 | # Downcast and move to device 47 | for possible_key in ["images", "pixel_values"]: 48 | if possible_key not in inputs: 49 | continue 50 | inputs[possible_key] = inputs[possible_key].to(dtype) # Downcast floats 51 | inputs = {k: v.to(device) for k, v in inputs.items()} # Move to the right device 52 | 53 | with torch.inference_mode(): 54 | output = model.generate( 55 | **inputs, 56 | do_sample=False, 57 | # use_cache=True, 58 | max_new_tokens=max_length, 59 | eos_token_id=32001, 60 | pad_token_id=processor.tokenizer.pad_token_id, 61 | ) 62 | prompt_len = inputs["input_ids"].shape[1] 63 | decoded_texts = processor.batch_decode( 64 | output[:, prompt_len:], 65 | skip_special_tokens=True, 66 | ) 67 | return decoded_texts 68 | 69 | 70 | def duration(callable): 71 | """Profile the duration of a callable and return the duration and the result.""" 72 | start = perf_counter() 73 | result = callable() 74 | stop = perf_counter() 75 | return stop - start, result 76 | 77 | 78 | def bench_captions( 79 | model, 80 | processor, 81 | prompt: str, 82 | images: List[Image.Image], 83 | max_length: int = 256, 84 | batch_size: int = 10, 85 | ) -> List[str]: 86 | total_duration = 0 87 | total_length = 0 88 | model = torch.compile(model) 89 | 90 | def caption_image(image): 91 | 
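        # Caption a batch built by repeating this single image `batch_size` times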
return caption( 92 | model=model, 93 | processor=processor, 94 | prompt=prompt, 95 | image=image, 96 | max_length=max_length, 97 | batch_size=batch_size, 98 | ) 99 | 100 | for image in images: 101 | seconds, captions = duration(partial(caption_image, image=image)) 102 | total_duration += seconds 103 | total_length += len(captions.strip()) if isinstance(captions, str) else sum(len(t.strip()) for t in captions) 104 | 105 | del model 106 | del processor 107 | print(f"Throughput: {total_length/total_duration:.2f} tokens/s") 108 | 109 | 110 | def main(batch_size: int = 10, max_length: int = 256): 111 | 112 | image_urls = [ 113 | "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 114 | "https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 115 | "https://images.unsplash.com/photo-1703244551371-ecffad9cc3b6?q=80&w=2859&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 116 | "https://plus.unsplash.com/premium_photo-1702910931866-2642eee270b1?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 117 | "https://plus.unsplash.com/premium_photo-1700583712241-893aded49e69?q=80&w=2942&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 118 | ] 119 | images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls] 120 | captions = [ 121 | "lonely house in a beautiful valley. house is made of white wood and black bricks. its surrounded by a green field", 122 | "grab last-mile delivery driver on a scooter grabbing a delivery in Jakarta", 123 | "monochrome picture of new york in the late 2th century on a sunny day, showing a few canonical brick buildings and the citizens bank", 124 | "asian girl sleeping in a bed. 
top down view", 125 | "a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it", 126 | ] 127 | 128 | print("UForm-Gen2") 129 | bench_captions( 130 | model=AutoModel.from_pretrained( 131 | "unum-cloud/uform-gen2-dpo", 132 | trust_remote_code=True, 133 | torch_dtype=dtype, 134 | low_cpu_mem_usage=low_cpu_mem_usage, 135 | ignore_mismatched_sizes=True, 136 | ).to(device), 137 | processor=AutoProcessor.from_pretrained( 138 | "unum-cloud/uform-gen2-dpo", 139 | trust_remote_code=True, 140 | ), 141 | prompt="Describe the picture in great detail", 142 | images=images, 143 | batch_size=batch_size, 144 | max_length=max_length, 145 | ) 146 | 147 | print("UForm-Gen") 148 | bench_captions( 149 | model=VLMForCausalLM.from_pretrained( 150 | "unum-cloud/uform-gen", 151 | torch_dtype=dtype, 152 | low_cpu_mem_usage=low_cpu_mem_usage, 153 | ignore_mismatched_sizes=True, 154 | ).to(device), 155 | processor=VLMProcessor.from_pretrained( 156 | "unum-cloud/uform-gen", 157 | ), 158 | prompt="[cap] Summarize the visual content of the image.", 159 | images=images, 160 | batch_size=batch_size, 161 | max_length=max_length, 162 | ) 163 | 164 | print("LLaVA") 165 | bench_captions( 166 | model=LlavaForConditionalGeneration.from_pretrained( 167 | "llava-hf/llava-1.5-7b-hf", 168 | torch_dtype=dtype, 169 | low_cpu_mem_usage=low_cpu_mem_usage, 170 | ).to(device), 171 | processor=AutoProcessor.from_pretrained( 172 | "llava-hf/llava-1.5-7b-hf", 173 | ), 174 | prompt="USER: \nWhat are these?\nASSISTANT:", 175 | images=images, 176 | batch_size=batch_size, 177 | max_length=max_length, 178 | ) 179 | 180 | print("InstructBLIP") 181 | bench_captions( 182 | model=InstructBlipForConditionalGeneration.from_pretrained( 183 | "Salesforce/instructblip-vicuna-7b", 184 | torch_dtype=dtype, 185 | low_cpu_mem_usage=low_cpu_mem_usage, 186 | ).to(device), 187 | processor=InstructBlipProcessor.from_pretrained( 188 | "Salesforce/instructblip-vicuna-7b", 189 | ), 190 | prompt="Summarize the visual content of the image.", 191 | images=images, 192 | batch_size=batch_size, 193 | max_length=max_length, 194 | ) 195 | 196 | 197 | if __name__ == "__main__": 198 | 199 | parser = argparse.ArgumentParser() 200 | parser.add_argument( 201 | "--batch-size", 202 | type=int, 203 | default=10, 204 | help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.", 205 | ) 206 | parser.add_argument( 207 | "--max-length", 208 | type=str, 209 | default=256, 210 | help="Maximum length of the generated text in tokens.", 211 | ) 212 | args = parser.parse_args() 213 | 214 | main(batch_size=args.batch_size, max_length=args.max_length) 215 | -------------------------------------------------------------------------------- /python/scripts/bench_encoders.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This script provides the throughput of UForm multimodal embedding models. 5 | 6 | The output of the script will cover: 7 | - Time to preprocess an image, and throughput in images/s. 8 | - Time to tokenize the text, and throughput in queries/s. 9 | - Time to encode the image, and throughput in images/s. 10 | - Time to encode the text, and throughput in queries/s. 11 | - Share of time spent on each part of the pipeline. 
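(Run it as `python python/scripts/bench_encoders.py`, optionally with `--batch-size` and `--filter-out`.)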
12 | 13 | Those numbers are presented for every model, device (cpu or gpu), backend (torch or onnx), 14 | and precision (float32 or bfloat16), producing a pretty comprehensive benchmark. 15 | 16 | Before running the script - install all available packages via `pip install -e ".[torch,onnx,onnx-gpu]"`. 17 | Before printing the numbers, a warm-up is performed to ensure the model is loaded and the cache is filled. 18 | """ 19 | 20 | from functools import partial 21 | from time import perf_counter 22 | from dataclasses import dataclass 23 | from typing import List, Tuple, Literal, Callable, Generator 24 | import re 25 | import argparse 26 | 27 | import requests 28 | from PIL import Image 29 | import pandas as pd 30 | 31 | from uform import get_model, Modality, ExecutionProviderError 32 | 33 | # Define global constants for the hardware availability 34 | torch_available = False 35 | try: 36 | import torch 37 | 38 | torch_available = True 39 | except ImportError: 40 | pass 41 | onnx_available = False 42 | try: 43 | import onnx 44 | 45 | onnx_available = True 46 | except ImportError: 47 | pass 48 | cuda_available = False 49 | try: 50 | if torch_available: 51 | cuda_available = torch.cuda.is_available() 52 | elif onnx_available: 53 | import onnxruntime 54 | 55 | cuda_available = onnxruntime.get_device() == "GPU" 56 | except ImportError: 57 | pass 58 | 59 | 60 | @dataclass 61 | class BenchmarkResult: 62 | model_name: str 63 | device_name: Literal["cpu", "cuda"] = "cpu" 64 | backend_name: Literal["torch", "onnx"] = "torch" 65 | duration_image_preprocessing: float = 0 66 | duration_image_embedding: float = 0 67 | duration_text_preprocessing: float = 0 68 | duration_text_embedding: float = 0 69 | 70 | 71 | def duration(callable, synchronize=False): 72 | """Profile the duration of a callable and return the duration and the result.""" 73 | if synchronize and torch_available and cuda_available: 74 | torch.cuda.synchronize() # Wait for CUDA operations to complete 75 | start = perf_counter() 76 | result = callable() 77 | if synchronize and torch_available and cuda_available: 78 | torch.cuda.synchronize() # Ensure all CUDA kernels have finished 79 | stop = perf_counter() 80 | return stop - start, result 81 | 82 | 83 | def get_captioned_images() -> List[Tuple[Image.Image, str]]: 84 | """Get a list of pre-downloaded and decoded images and their captions.""" 85 | image_urls = [ 86 | "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 87 | "https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 88 | "https://images.unsplash.com/photo-1703244551371-ecffad9cc3b6?q=80&w=2859&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 89 | "https://plus.unsplash.com/premium_photo-1702910931866-2642eee270b1?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 90 | "https://plus.unsplash.com/premium_photo-1700583712241-893aded49e69?q=80&w=2942&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 91 | ] 92 | images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls] 93 | captions = [ 94 | "lonely house in a beautiful valley. house is made of white wood and black bricks. 
its surrounded by a green field", 95 | "grab last-mile delivery driver on a scooter grabbing a delivery in Jakarta", 96 | "monochrome picture of new york in the late 2th century on a sunny day, showing a few canonical brick buildings and the citizens bank", 97 | "asian girl sleeping in a bed. top down view", 98 | "a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it", 99 | ] 100 | return list(zip(images, captions)) 101 | 102 | 103 | def yield_benchmarks(batch_size: int) -> Generator[Tuple[BenchmarkResult, Callable], None, None]: 104 | """Yields callable benchmarks for all supported backends of the given model.""" 105 | 106 | # Pull the content and artificially grow the batch size 107 | images, captions = zip(*get_captioned_images()) 108 | 109 | if len(images) < batch_size: 110 | import math 111 | 112 | multiplier = int(math.ceil(batch_size / len(images))) 113 | images *= multiplier 114 | captions *= multiplier 115 | images = images[:batch_size] 116 | captions = captions[:batch_size] 117 | 118 | def run(model_name: str, device: str, backend_name: str): 119 | result = BenchmarkResult( 120 | model_name=model_name, 121 | backend_name=backend_name, 122 | device_name=device, 123 | duration_image_preprocessing=0, 124 | duration_image_embedding=0, 125 | duration_text_preprocessing=0, 126 | duration_text_embedding=0, 127 | ) 128 | 129 | sync = backend_name == "torch" 130 | processors, models = get_model( 131 | model_name, 132 | device=device, 133 | modalities=[Modality.IMAGE_ENCODER, Modality.TEXT_ENCODER], 134 | backend=backend_name, 135 | ) 136 | 137 | model_text = models[Modality.TEXT_ENCODER] 138 | model_image = models[Modality.IMAGE_ENCODER] 139 | processor_text = processors[Modality.TEXT_ENCODER] 140 | processor_image = processors[Modality.IMAGE_ENCODER] 141 | 142 | # Image preprocessing 143 | total_duration = 0 144 | total_iterations = 0 145 | while total_duration < 10 and total_iterations < 100: 146 | seconds, _ = duration(lambda: processor_image(images)) 147 | total_duration += seconds 148 | total_iterations += len(images) 149 | duration_per_iteration = total_duration / total_iterations 150 | result.duration_image_preprocessing = duration_per_iteration 151 | 152 | # Image embedding 153 | total_duration = 0 154 | total_iterations = 0 155 | while total_duration < 10 and total_iterations < 100: 156 | images_data = processor_image(images) 157 | seconds, _ = duration(lambda: model_image.encode(images_data), synchronize=sync) 158 | total_duration += seconds 159 | total_iterations += len(images) 160 | duration_per_iteration = total_duration / total_iterations 161 | result.duration_image_embedding = duration_per_iteration 162 | 163 | # Text preprocessing 164 | total_duration = 0 165 | total_iterations = 0 166 | while total_duration < 10 and total_iterations < 100: 167 | seconds, _ = duration(lambda: processor_text(captions)) 168 | total_duration += seconds 169 | total_iterations += len(captions) 170 | duration_per_iteration = total_duration / total_iterations 171 | result.duration_text_preprocessing = duration_per_iteration 172 | 173 | # Text embedding 174 | total_duration = 0 175 | total_iterations = 0 176 | while total_duration < 10 and total_iterations < 100: 177 | texts_data = processor_text(captions) 178 | seconds, _ = duration(lambda: model_text.encode(texts_data), synchronize=sync) 179 | total_duration += seconds 180 | total_iterations += len(captions) 181 | duration_per_iteration = total_duration / total_iterations 182 | 
result.duration_text_embedding = duration_per_iteration 183 | 184 | return result 185 | 186 | devices = ["cpu"] 187 | if cuda_available: 188 | devices.append("cuda") 189 | backends = [] 190 | if torch_available: 191 | backends.append("torch") 192 | if onnx_available: 193 | backends.append("onnx") 194 | 195 | for device in devices: 196 | for backend_name in backends: 197 | for model_name in [ 198 | "unum-cloud/uform3-image-text-english-small", 199 | "unum-cloud/uform3-image-text-english-base", 200 | "unum-cloud/uform3-image-text-english-large", 201 | "unum-cloud/uform3-image-text-multilingual-base", 202 | ]: 203 | yield BenchmarkResult( 204 | model_name=model_name, 205 | device_name=device, 206 | backend_name=backend_name, 207 | ), partial(run, model_name, device, backend_name) 208 | 209 | 210 | def main(filter_out: str = None, batch_size: int = 10): 211 | results = [] 212 | filter_pattern = re.compile(filter_out) if filter_out else None 213 | for specs, func in yield_benchmarks(batch_size=batch_size): 214 | if filter_pattern and ( 215 | filter_pattern.search(specs.model_name) 216 | or filter_pattern.search(specs.backend_name) 217 | or filter_pattern.search(specs.device_name) 218 | ): 219 | continue 220 | 221 | try: 222 | print(f"Running `{specs.model_name}` on `{specs.device_name}` using `{specs.backend_name}` backend") 223 | result = func() 224 | results.append(result) 225 | except ExecutionProviderError as e: 226 | print(f"- skipping missing backend") 227 | print(e) 228 | 229 | results = sorted(results, key=lambda x: x.model_name) 230 | results = [x.__dict__ for x in results] 231 | 232 | df = pd.DataFrame(results) 233 | df.columns = [ 234 | "Model Name", 235 | "Device", 236 | "Backend", 237 | "Images Preprocessed/s", 238 | "Images Encoded/s", 239 | "Texts Preprocessed/s", 240 | "Texts Encoded/s", 241 | ] 242 | 243 | def inverse(x): 244 | return 1 / x if x != 0 else 0 245 | 246 | # Apply number formatting directly in the DataFrame 247 | formatted_df = df.copy() 248 | formatted_df["Images Preprocessed/s"] = df["Images Preprocessed/s"].map(inverse).map("{:,.2f}".format) 249 | formatted_df["Images Encoded/s"] = df["Images Encoded/s"].map(inverse).map("{:,.2f}".format) 250 | formatted_df["Texts Preprocessed/s"] = df["Texts Preprocessed/s"].map(inverse).map("{:,.2f}".format) 251 | formatted_df["Texts Encoded/s"] = df["Texts Encoded/s"].map(inverse).map("{:,.2f}".format) 252 | 253 | # Convert formatted DataFrame to Markdown 254 | print(formatted_df.to_markdown()) 255 | 256 | 257 | if __name__ == "__main__": 258 | 259 | parser = argparse.ArgumentParser() 260 | parser.add_argument( 261 | "--filter-out", 262 | type=str, 263 | default=None, 264 | help="Filter out models, backends, or devices with a Regular Expression.", 265 | ) 266 | parser.add_argument( 267 | "--batch-size", 268 | type=int, 269 | default=10, 270 | help="Batch size for the benchmark. Batch size 1 measures latency. 
Large batch sizes may not fit on every GPU.", 271 | ) 272 | args = parser.parse_args() 273 | 274 | main(filter_out=args.filter_out, batch_size=args.batch_size) 275 | -------------------------------------------------------------------------------- /python/scripts/export_decoders.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n", 8 | "\n", 9 | "Depending on the backend, we prefer different qunatization schemes.\n", 10 | "\n", 11 | "- For ONNX we use `uint8` quantization.\n", 12 | "- For PyTorch we use `bfloat16` quantization.\n", 13 | "- For CoreML we use `float32` representation." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "!pip install --upgrade \"uform[torch]\" coremltools" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import os\n", 32 | "model_name = \"unum-cloud/uform-gen2-dpo\"\n", 33 | "output_directory = \"../../\"" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import torch\n", 43 | "import uform\n", 44 | "from PIL import Image\n", 45 | "from transformers import AutoModel, AutoProcessor\n", 46 | "\n", 47 | "model = AutoModel.from_pretrained(model_name, trust_remote_code=True)\n", 48 | "processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)\n", 49 | "\n", 50 | "prompt = 'Describe the picture'\n", 51 | "image = Image.open('../../assets/unum.png')\n", 52 | "inputs = processor(text=[prompt], images=[image], return_tensors='pt')\n", 53 | "\n", 54 | "with torch.inference_mode():\n", 55 | " output = model.generate(\n", 56 | " **inputs,\n", 57 | " do_sample=False,\n", 58 | " use_cache=True,\n", 59 | " max_new_tokens=256,\n", 60 | " eos_token_id=151645,\n", 61 | " pad_token_id=processor.tokenizer.pad_token_id\n", 62 | " )\n", 63 | "prompt_len = inputs['input_ids'].shape[1]\n", 64 | "decoded_text = processor.batch_decode(output[:, prompt_len:])[0]\n", 65 | "\n", 66 | "print(decoded_text)" 67 | ] 68 | } 69 | ], 70 | "metadata": { 71 | "kernelspec": { 72 | "display_name": "base", 73 | "language": "python", 74 | "name": "python3" 75 | }, 76 | "language_info": { 77 | "codemirror_mode": { 78 | "name": "ipython", 79 | "version": 3 80 | }, 81 | "file_extension": ".py", 82 | "mimetype": "text/x-python", 83 | "name": "python", 84 | "nbconvert_exporter": "python", 85 | "pygments_lexer": "ipython3", 86 | "version": "3.11.5" 87 | } 88 | }, 89 | "nbformat": 4, 90 | "nbformat_minor": 2 91 | } 92 | -------------------------------------------------------------------------------- /python/scripts/test_decoders.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from PIL import Image 3 | 4 | # PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed 5 | try: 6 | import torch 7 | 8 | torch_available = True 9 | except: 10 | torch_available = False 11 | 12 | torch_hf_models = [ 13 | "unum-cloud/uform-gen2-qwen-500m", 14 | "unum-cloud/uform-gen2-dpo", 15 | ] 16 | 17 | 18 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") 19 | @pytest.mark.parametrize("model_name", torch_hf_models) 20 | def 
test_one_conversation(model_name: str): 21 | from transformers import AutoModel, AutoProcessor 22 | 23 | model = AutoModel.from_pretrained(model_name, trust_remote_code=True) 24 | processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) 25 | 26 | prompt = "Describe the image in great detail." 27 | image = Image.open("assets/unum.png") 28 | 29 | inputs = processor(text=[prompt], images=[image], return_tensors="pt") 30 | 31 | with torch.inference_mode(): 32 | output = model.generate( 33 | **inputs, 34 | do_sample=False, 35 | use_cache=True, 36 | max_new_tokens=10, 37 | pad_token_id=processor.tokenizer.pad_token_id, 38 | ) 39 | prompt_len = inputs["input_ids"].shape[1] 40 | decoded_text = processor.batch_decode(output[:, prompt_len:])[0] 41 | 42 | assert len(decoded_text), "No text was generated from the model." 43 | 44 | 45 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") 46 | @pytest.mark.parametrize("model_name", torch_hf_models) 47 | @pytest.mark.parametrize("batch_size", [1, 2]) 48 | def test_many_conversations(model_name: str, batch_size: int): 49 | 50 | from transformers import AutoModel, AutoProcessor 51 | 52 | model = AutoModel.from_pretrained(model_name, trust_remote_code=True) 53 | processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) 54 | 55 | prompt = "Describe the image in great detail." 56 | image = Image.open("assets/unum.png") 57 | 58 | texts = [prompt] * batch_size 59 | images = [image] * batch_size 60 | inputs = processor(text=texts, images=images, return_tensors="pt") 61 | 62 | with torch.inference_mode(): 63 | output = model.generate( 64 | **inputs, 65 | do_sample=False, 66 | use_cache=True, 67 | max_new_tokens=10, 68 | pad_token_id=processor.tokenizer.pad_token_id, 69 | ) 70 | prompt_len = inputs["input_ids"].shape[1] 71 | decoded_texts = processor.batch_decode(output[:, prompt_len:]) 72 | 73 | assert all(len(decoded_text) for decoded_text in decoded_texts), "No text was generated from the model." 74 | -------------------------------------------------------------------------------- /python/scripts/test_encoders.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | from typing import Tuple 3 | import requests 4 | from io import BytesIO 5 | import os 6 | 7 | import pytest 8 | import numpy as np 9 | from PIL import Image 10 | 11 | from uform import Modality, get_model, ExecutionProviderError 12 | 13 | # PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed 14 | try: 15 | import torch 16 | 17 | torch_available = True 18 | except: 19 | torch_available = False 20 | 21 | # ONNX is not a very light dependency either 22 | try: 23 | import onnx 24 | 25 | onnx_available = True 26 | except: 27 | onnx_available = False 28 | 29 | torch_models = [ 30 | "unum-cloud/uform3-image-text-english-small", 31 | "unum-cloud/uform3-image-text-english-base", 32 | "unum-cloud/uform3-image-text-english-large", 33 | "unum-cloud/uform3-image-text-multilingual-base", 34 | ] 35 | 36 | onnx_models = [ 37 | "unum-cloud/uform3-image-text-english-small", 38 | "unum-cloud/uform3-image-text-english-base", 39 | "unum-cloud/uform3-image-text-english-large", 40 | "unum-cloud/uform3-image-text-multilingual-base", 41 | ] 42 | 43 | # Let's check if the HuggingFace Hub API token is set in the environment variable. 44 | # If it's not there, check if the `.hf_token` file is present in the current working directory. 
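
The lookup below (environment variable first, then a local `.hf_token` file) typically only matters for gated or private checkpoints. A minimal standalone sketch of the same pattern, with a hypothetical `resolve_hf_token` helper name, might look like this:

```python
import os
from typing import Optional


def resolve_hf_token(env_var: str = "HUGGINGFACE_HUB_TOKEN", file_path: str = "./.hf_token") -> Optional[str]:
    """Return a Hugging Face Hub token from the environment or a local file, if either exists."""
    token = os.getenv(env_var)
    if token is None and os.path.exists(file_path):
        with open(file_path, "r") as handle:
            token = handle.read().strip() or None
    return token


# The resolved token is then forwarded to the model loaders, e.g.:
#   processors, models = get_model(model_name, token=resolve_hf_token(), backend="torch")
```
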
45 | token = os.getenv("HUGGINGFACE_HUB_TOKEN", None) 46 | if token is None: 47 | token_path = "./.hf_token" 48 | if os.path.exists(token_path): 49 | with open(token_path, "r") as file: 50 | token = file.read().strip() 51 | 52 | 53 | def skip_on(exception, reason="No good reason :)"): 54 | def decorator_func(f): 55 | @wraps(f) 56 | def wrapper(*args, **kwargs): 57 | try: 58 | # Try to run the test 59 | return f(*args, **kwargs) 60 | except exception: 61 | pytest.skip(reason) 62 | 63 | return wrapper 64 | 65 | return decorator_func 66 | 67 | 68 | def cosine_similarity(x, y) -> float: 69 | if not isinstance(x, np.ndarray): 70 | x = x.detach().numpy() 71 | if not isinstance(y, np.ndarray): 72 | y = y.detach().numpy() 73 | 74 | # Unlike NumPy, SimSIMD can properly deal with integer types 75 | x = x.astype(np.float32).flatten() 76 | y = y.astype(np.float32).flatten() 77 | return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) 78 | 79 | 80 | def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embedding, batch_size_multiple: int = 1): 81 | """Test if the embeddings of text and image are semantically similar 82 | using a small set of example text-image pairs.""" 83 | 84 | texts = [ 85 | "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", 86 | "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", 87 | "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", 88 | "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", 89 | "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", 90 | ] 91 | 92 | image_urls = [ 93 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true", 94 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true", 95 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true", 96 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true", 97 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true", 98 | ] 99 | assert len(texts) == len(image_urls), "Number of texts and images should be the same." 
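
The retrieval sanity check that follows compares each caption's similarity to its own image against its similarity to every other image in the set. Because the logic depends only on relative cosine similarities, it can be illustrated with tiny made-up embeddings; the vectors below are arbitrary and unrelated to any real model output:

```python
import numpy as np


def cosine(a: np.ndarray, b: np.ndarray) -> float:
    a, b = a.astype(np.float32).flatten(), b.astype(np.float32).flatten()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


# Three fake text/image embedding pairs; matching pairs point in similar directions.
text_vecs = np.array([[1.0, 0.1], [0.1, 1.0], [-1.0, 0.2]])
image_vecs = np.array([[0.9, 0.2], [0.2, 0.8], [-0.8, 0.1]])

for i in range(len(text_vecs)):
    pair = cosine(text_vecs[i], image_vecs[i])
    others = [cosine(text_vecs[i], image_vecs[j]) for j in range(len(image_vecs)) if j != i]
    assert pair > max(others)  # each caption is closest to its own image
```
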
100 | 101 | images = [Image.open(BytesIO(requests.get(image_url).content)) for image_url in image_urls] 102 | count_pairs = len(texts) 103 | 104 | # Ensure we have a sufficiently large batch 105 | texts = texts * batch_size_multiple 106 | images = images * batch_size_multiple 107 | 108 | # Compute the embedding in a batch fashion 109 | text_embeddings = text_to_embedding(texts) 110 | image_embeddings = image_to_embedding(images) 111 | 112 | # Evaluate cosine similarity 113 | for i in range(count_pairs): 114 | pair_similarity = cosine_similarity(text_embeddings[i], image_embeddings[i]) 115 | other_text_similarities = [ 116 | cosine_similarity(text_embeddings[j], image_embeddings[i]) for j in range(count_pairs) if j != i 117 | ] 118 | other_image_similarities = [ 119 | cosine_similarity(text_embeddings[i], image_embeddings[j]) for j in range(count_pairs) if j != i 120 | ] 121 | 122 | assert pair_similarity > max( 123 | other_text_similarities 124 | ), "Text should be more similar to its corresponding image than to other images." 125 | assert pair_similarity > max( 126 | other_image_similarities 127 | ), "Image should be more similar to its corresponding text than to other texts." 128 | 129 | 130 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") 131 | @pytest.mark.parametrize("model_name", torch_models) 132 | def test_torch_one_embedding(model_name: str): 133 | processors, models = get_model(model_name, token=token, backend="torch") 134 | model_text = models[Modality.TEXT_ENCODER] 135 | model_image = models[Modality.IMAGE_ENCODER] 136 | processor_text = processors[Modality.TEXT_ENCODER] 137 | processor_image = processors[Modality.IMAGE_ENCODER] 138 | 139 | text = "a small red panda in a zoo" 140 | image_path = "assets/unum.png" 141 | 142 | image = Image.open(image_path) 143 | image_data = processor_image(image) 144 | text_data = processor_text(text) 145 | 146 | image_features, image_embedding = model_image.encode(image_data, return_features=True) 147 | text_features, text_embedding = model_text.encode(text_data, return_features=True) 148 | 149 | assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" 150 | assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" 151 | 152 | # Test if the model outputs actually make sense 153 | cross_references_image_and_text_embeddings( 154 | lambda text: model_text(processor_text(text)), 155 | lambda image: model_image(processor_image(image)), 156 | ) 157 | 158 | 159 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") 160 | @pytest.mark.parametrize("model_name", torch_models) 161 | @pytest.mark.parametrize("batch_size", [1, 2]) 162 | def test_torch_many_embeddings(model_name: str, batch_size: int): 163 | 164 | processors, models = get_model(model_name, token=token, backend="torch") 165 | model_text = models[Modality.TEXT_ENCODER] 166 | model_image = models[Modality.IMAGE_ENCODER] 167 | processor_text = processors[Modality.TEXT_ENCODER] 168 | processor_image = processors[Modality.IMAGE_ENCODER] 169 | 170 | texts = ["a small red panda in a zoo"] * batch_size 171 | image_paths = ["assets/unum.png"] * batch_size 172 | 173 | images = [Image.open(path) for path in image_paths] 174 | image_data = processor_image(images) 175 | text_data = processor_text(texts) 176 | 177 | image_embeddings = model_image.encode(image_data, return_features=False) 178 | text_embeddings = model_text.encode(text_data, return_features=False) 179 | 180 | assert image_embeddings.shape[0] == batch_size, 
"Image embedding is unexpected" 181 | assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected" 182 | 183 | 184 | @pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") 185 | @pytest.mark.parametrize("model_name", onnx_models) 186 | @pytest.mark.parametrize("device", ["CPUExecutionProvider"]) 187 | @skip_on(ExecutionProviderError, reason="Missing execution provider") 188 | def test_onnx_one_embedding(model_name: str, device: str): 189 | 190 | processors, models = get_model(model_name, token=token, device=device, backend="onnx") 191 | model_text = models[Modality.TEXT_ENCODER] 192 | model_image = models[Modality.IMAGE_ENCODER] 193 | processor_text = processors[Modality.TEXT_ENCODER] 194 | processor_image = processors[Modality.IMAGE_ENCODER] 195 | 196 | text = "a small red panda in a zoo" 197 | image_path = "assets/unum.png" 198 | 199 | image = Image.open(image_path) 200 | image_data = processor_image(image) 201 | text_data = processor_text(text) 202 | 203 | image_features, image_embedding = model_image.encode(image_data) 204 | text_features, text_embedding = model_text.encode(text_data) 205 | 206 | assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" 207 | assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" 208 | 209 | # Nested fucntions are easier to debug, than lambdas 210 | def get_image_embedding(image_data): 211 | features, embedding = model_image.encode(processor_image(image_data)) 212 | return embedding 213 | 214 | def get_text_embedding(text_data): 215 | features, embedding = model_text.encode(processor_text(text_data)) 216 | return embedding 217 | 218 | # Test if the model outputs actually make sense 219 | cross_references_image_and_text_embeddings(get_text_embedding, get_image_embedding) 220 | 221 | 222 | @pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") 223 | @pytest.mark.parametrize("model_name", onnx_models) 224 | @pytest.mark.parametrize("batch_size", [1, 2]) 225 | @pytest.mark.parametrize("device", ["CPUExecutionProvider"]) 226 | @skip_on(ExecutionProviderError, reason="Missing execution provider") 227 | def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str): 228 | 229 | processors, models = get_model(model_name, token=token, device=device, backend="onnx") 230 | model_text = models[Modality.TEXT_ENCODER] 231 | model_image = models[Modality.IMAGE_ENCODER] 232 | processor_text = processors[Modality.TEXT_ENCODER] 233 | processor_image = processors[Modality.IMAGE_ENCODER] 234 | 235 | texts = ["a small red panda in a zoo"] * batch_size 236 | image_paths = ["assets/unum.png"] * batch_size 237 | 238 | images = [Image.open(path) for path in image_paths] 239 | image_data = processor_image(images) 240 | text_data = processor_text(texts) 241 | 242 | image_embeddings = model_image.encode(image_data, return_features=False) 243 | text_embeddings = model_text.encode(text_data, return_features=False) 244 | 245 | assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected" 246 | assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected" 247 | 248 | 249 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") 250 | @pytest.mark.parametrize("model_name", torch_models[:1]) 251 | def test_torch_multi_gpu(model_name: str): 252 | 253 | count_cuda_devices = torch.cuda.device_count() 254 | if count_cuda_devices < 2: 255 | pytest.skip("Not enough CUDA devices to run multi-GPU test") 256 | 257 | processors, models = 
get_model(model_name, token=token, backend="torch", device="cuda") 258 | model_text = models[Modality.TEXT_ENCODER] 259 | model_image = models[Modality.IMAGE_ENCODER] 260 | processor_text = processors[Modality.TEXT_ENCODER] 261 | processor_image = processors[Modality.IMAGE_ENCODER] 262 | 263 | import torch.nn as nn 264 | 265 | model_text.return_features = False 266 | model_image.return_features = False 267 | model_text_parallel = nn.DataParallel(model_text) 268 | model_image_parallel = nn.DataParallel(model_image) 269 | 270 | # Nested fucntions are easier to debug, than lambdas 271 | def get_image_embedding(image_data): 272 | preprocessed = processor_image(image_data) 273 | embedding = model_image_parallel.forward(preprocessed) 274 | return embedding.detach().cpu().numpy() 275 | 276 | def get_text_embedding(text_data): 277 | preprocessed = processor_text(text_data) 278 | embedding = model_text_parallel.forward(preprocessed) 279 | return embedding.detach().cpu().numpy() 280 | 281 | # Test if the model outputs actually make sense 282 | cross_references_image_and_text_embeddings( 283 | get_text_embedding, 284 | get_image_embedding, 285 | batch_size_multiple=count_cuda_devices, 286 | ) 287 | 288 | 289 | if __name__ == "__main__": 290 | # If you want to run this test file individually, you can do so by running: 291 | # pytest.main(["-s", "-x", __file__]) 292 | pass 293 | -------------------------------------------------------------------------------- /python/uform/__init__.py: -------------------------------------------------------------------------------- 1 | from os.path import join, exists 2 | from typing import Dict, Optional, Tuple, Literal, Union, Callable 3 | 4 | from huggingface_hub import snapshot_download, utils 5 | 6 | from uform.shared import ExecutionProviderError, Modality 7 | 8 | 9 | def _normalize_modalities(modalities: Tuple[str, Modality]) -> Tuple[Modality]: 10 | if modalities is None: 11 | return (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER, Modality.TEXT_DECODER, Modality.VIDEO_ENCODER) 12 | 13 | return tuple(x if isinstance(x, Modality) else Modality(x) for x in modalities) 14 | 15 | 16 | def get_checkpoint( 17 | model_name: str, 18 | modalities: Tuple[str, Modality], 19 | token: Optional[str] = None, 20 | format: Literal[".pt", ".onnx"] = ".pt", 21 | ) -> Tuple[str, Dict[Modality, str], Optional[str]]: 22 | """Downloads a model checkpoint from the Hugging Face Hub. 23 | 24 | :param model_name: The name of the model to download, like `unum-cloud/uform3-image-text-english-small` 25 | :param token: The Hugging Face API token, if required 26 | :param modalities: The modalities to download, like `("text_encoder", "image_encoder")` 27 | :param format: The format of the model checkpoint, either `.pt` or `.onnx` 28 | :return: A tuple of the config path, dictionary of paths to different modalities, and tokenizer path 29 | """ 30 | 31 | modalities = _normalize_modalities(modalities) 32 | 33 | # It is not recommended to use `.pth` extension when checkpointing models 34 | # because it collides with Python path (`.pth`) configuration files. 
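
To make the naming scheme below easier to follow, here is a small illustration of what the candidate file names expand to, assuming the ONNX format and the two encoder modalities; the values are just examples of the pattern, not an exhaustive list of what a given repository ships:

```python
# Hypothetical illustration of the checkpoint-name patterns used for the Hub lookup.
fmt = ".onnx"
modalities = ["text_encoder", "image_encoder"]

merged_model_names = [name + fmt for name in ["torch_weight", "weight", "model"]]
separate_modality_names = [modality + fmt for modality in modalities]

print(merged_model_names)       # ['torch_weight.onnx', 'weight.onnx', 'model.onnx']
print(separate_modality_names)  # ['text_encoder.onnx', 'image_encoder.onnx']
```
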
35 | merged_model_names = [x + format for x in ["torch_weight", "weight", "model"]] 36 | separate_modality_names = [(x.value if isinstance(x, Modality) else x) + format for x in modalities] 37 | config_names = ["torch_config.json", "config.json"] 38 | tokenizer_names = ["tokenizer.json"] 39 | 40 | old_progress_behavior = utils.are_progress_bars_disabled() 41 | utils.disable_progress_bars() 42 | 43 | # The download stats depend on the number of times the `config.json` is pulled 44 | # https://huggingface.co/docs/hub/models-download-stats 45 | model_path = snapshot_download( 46 | repo_id=model_name, 47 | token=token, 48 | allow_patterns=merged_model_names + separate_modality_names + config_names + tokenizer_names, 49 | ) 50 | 51 | if old_progress_behavior: 52 | utils.enable_progress_bars() 53 | 54 | # Find the first name in `config_names` that is present 55 | config_path = None 56 | for config_name in config_names: 57 | if exists(join(model_path, config_name)): 58 | config_path = join(model_path, config_name) 59 | break 60 | 61 | # Same for the tokenizer 62 | tokenizer_path = None 63 | for tokenizer_name in tokenizer_names: 64 | if exists(join(model_path, tokenizer_name)): 65 | tokenizer_path = join(model_path, tokenizer_name) 66 | break 67 | 68 | # Ideally, we want to separately fetch all the models. 69 | # If those aren't available, aggregate separate modalities and merge them. 70 | modality_paths = None 71 | for file_name in merged_model_names: 72 | if exists(join(model_path, file_name)): 73 | modality_paths = join(model_path, file_name) 74 | break 75 | 76 | if modality_paths is None: 77 | modality_paths = {} 78 | for separate_modality_name in separate_modality_names: 79 | if exists(join(model_path, separate_modality_name)): 80 | modality_name, _, _ = separate_modality_name.partition(".") 81 | modality_paths[Modality(modality_name)] = join(model_path, separate_modality_name) 82 | 83 | return config_path, modality_paths, tokenizer_path 84 | 85 | 86 | def get_model_torch( 87 | model_name: str, 88 | *, 89 | token: Optional[str] = None, 90 | device: Literal["cpu", "cuda"] = "cpu", 91 | modalities: Optional[Tuple[Union[str, Modality]]] = None, 92 | ) -> Tuple[Dict[Modality, Callable], Dict]: 93 | """ 94 | Fetches and constructs a PyTorch model with its processors based on provided modalities. 95 | 96 | :param model_name: The identifier of the model on the Hugging Face Hub. 97 | :param token: Optional API token for authenticated access to the model. 98 | :param device: The device to load the model onto ('cpu' or 'cuda'). 99 | :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder). 100 | :return: A tuple containing dictionaries for processors and models keyed by their respective modalities. 
101 | """ 102 | from uform.torch_encoders import TextEncoder, ImageEncoder 103 | from uform.torch_processors import TextProcessor, ImageProcessor 104 | 105 | modalities = _normalize_modalities(modalities) 106 | config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".pt") 107 | 108 | result_processors = {} 109 | result_models = {} 110 | 111 | if Modality.TEXT_ENCODER in modalities: 112 | processor = TextProcessor(config_path, tokenizer_path) 113 | encoder = TextEncoder.from_pretrained(config_path, modality_paths.get(Modality.TEXT_ENCODER)) 114 | encoder = encoder.eval().to(device) 115 | result_processors[Modality.TEXT_ENCODER] = processor 116 | result_models[Modality.TEXT_ENCODER] = encoder 117 | 118 | if Modality.IMAGE_ENCODER in modalities: 119 | processor = ImageProcessor(config_path) 120 | encoder = ImageEncoder.from_pretrained(config_path, modality_paths.get(Modality.IMAGE_ENCODER)) 121 | encoder = encoder.eval().to(device) 122 | result_processors[Modality.IMAGE_ENCODER] = processor 123 | result_models[Modality.IMAGE_ENCODER] = encoder 124 | 125 | return result_processors, result_models 126 | 127 | 128 | def get_model_onnx( 129 | model_name: str, 130 | *, 131 | device: Literal["cpu", "cuda"] = "cpu", 132 | token: Optional[str] = None, 133 | modalities: Optional[Tuple[str]] = None, 134 | ): 135 | """ 136 | Fetches and constructs an ONNX model with its processors based on provided modalities. 137 | 138 | :param model_name: The identifier of the model on the Hugging Face Hub. 139 | :param device: The device on which the model will operate ('cpu' or 'cuda'). 140 | :param token: Optional API token for authenticated access to the model. 141 | :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder). 142 | :return: A tuple containing dictionaries for processors and models keyed by their respective modalities. 143 | """ 144 | from uform.onnx_encoders import TextEncoder, ImageEncoder 145 | from uform.numpy_processors import TextProcessor, ImageProcessor 146 | 147 | modalities = _normalize_modalities(modalities) 148 | config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".onnx") 149 | 150 | result_processors = {} 151 | result_models = {} 152 | 153 | if Modality.TEXT_ENCODER in modalities: 154 | processor = TextProcessor(config_path, tokenizer_path) 155 | encoder = TextEncoder(modality_paths.get(Modality.TEXT_ENCODER), device=device) 156 | result_processors[Modality.TEXT_ENCODER] = processor 157 | result_models[Modality.TEXT_ENCODER] = encoder 158 | 159 | if Modality.IMAGE_ENCODER in modalities: 160 | processor = ImageProcessor(config_path) 161 | encoder = ImageEncoder(modality_paths.get(Modality.IMAGE_ENCODER), device=device) 162 | result_processors[Modality.IMAGE_ENCODER] = processor 163 | result_models[Modality.IMAGE_ENCODER] = encoder 164 | 165 | return result_processors, result_models 166 | 167 | 168 | def get_model( 169 | model_name: str, 170 | *, 171 | device: Literal["cpu", "cuda"] = "cpu", # change this if you have a GPU 172 | backend: Literal["onnx", "torch"] = "onnx", # lighter = better 173 | modalities: Optional[Tuple[str, Modality]] = None, # all by default 174 | token: Optional[str] = None, # optional HuggingFace Hub token for private models 175 | ) -> Tuple[Dict[Modality, Callable], Dict]: 176 | """ 177 | Fetches a model and its processors from the Hugging Face Hub, using either the ONNX or Torch backend. 
178 | 179 | :param model_name: The identifier of the model on the Hugging Face Hub. 180 | :param device: The device to load the model onto ('cpu' or 'cuda'). 181 | :param backend: The backend framework to use ('onnx' or 'torch'). 182 | :param modalities: A tuple specifying the types of model components to fetch. 183 | :param token: Optional API token for authenticated access to the model. 184 | :return: A tuple containing dictionaries for processors and models keyed by their respective modalities. 185 | """ 186 | if backend == "onnx": 187 | return get_model_onnx(model_name, device=device, token=token, modalities=modalities) 188 | elif backend == "torch": 189 | return get_model_torch(model_name, device=device, token=token, modalities=modalities) 190 | else: 191 | raise ValueError(f"Unknown backend: {backend}") 192 | -------------------------------------------------------------------------------- /python/uform/chat.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | import requests 4 | import torch 5 | from PIL import Image 6 | from transformers import TextStreamer, AutoModel, AutoProcessor 7 | 8 | 9 | def parse_args(): 10 | parser = ArgumentParser(description="Chat with UForm generative model") 11 | 12 | parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat", help="Model name or path") 13 | parser.add_argument("--image", type=str, required=True, help="Path to image or URL") 14 | parser.add_argument("--device", type=str, required=True, help="Device to run on, like `cpu` or `cuda:0`") 15 | parser.add_argument("--fp16", action="store_true", help="Use half-precision math for faster inference") 16 | 17 | return parser.parse_args() 18 | 19 | 20 | def run_chat(opts, model, processor): 21 | streamer = TextStreamer( 22 | processor.tokenizer, 23 | skip_prompt=True, 24 | skip_special_tokens=True, 25 | ) 26 | 27 | messages = [{"role": "system", "content": "You are a helpful assistant."}] 28 | is_first_message = True 29 | 30 | if opts.image.startswith("http"): 31 | image = Image.open(requests.get(opts.image, stream=True).raw) 32 | else: 33 | image = Image.open(opts.image) 34 | 35 | image = ( 36 | processor.feature_extractor(image) # 37 | .unsqueeze(0) 38 | .to(torch.bfloat16 if opts.fp16 else torch.float32) 39 | .to(opts.device) 40 | ) 41 | 42 | while True: 43 | if messages[-1]["role"] in ("system", "assistant"): 44 | message = input("User: ") 45 | if is_first_message: 46 | message = f" {message}" 47 | is_first_message = False 48 | messages.append({"role": "user", "content": message}) 49 | 50 | print() 51 | 52 | else: 53 | input_ids = processor.tokenizer.apply_chat_template( 54 | messages, 55 | return_tensors="pt", 56 | add_generation_prompt=True, 57 | ).to(opts.device) 58 | 59 | attention_mask = torch.ones( 60 | 1, 61 | input_ids.shape[1] + processor.num_image_latents - 1, 62 | ).to(opts.device) 63 | inputs = { 64 | "input_ids": input_ids, 65 | "attention_mask": attention_mask, 66 | "images": image, 67 | } 68 | 69 | print("Assistant: ", end="") 70 | with torch.inference_mode(): 71 | output = model.generate( 72 | **inputs, 73 | do_sample=False, 74 | use_cache=True, 75 | max_new_tokens=1024, 76 | eos_token_id=151645, 77 | pad_token_id=processor.tokenizer.pad_token_id, 78 | streamer=streamer, 79 | ) 80 | print() 81 | 82 | prompt_len = inputs["input_ids"].shape[1] 83 | message = processor.batch_decode(output[:, prompt_len:-1])[0] 84 | 85 | messages.append({"role": "assistant", "content": message}) 86 | 87 
| 88 | def main(): 89 | try: 90 | opts = parse_args() 91 | processor = AutoProcessor.from_pretrained(opts.model, trust_remote_code=True) 92 | model = ( 93 | AutoModel.from_pretrained( 94 | opts.model, 95 | torch_dtype=torch.bfloat16 if opts.fp16 else torch.float32, 96 | ignore_mismatched_sizes=True, 97 | trust_remote_code=True, 98 | ) 99 | .eval() 100 | .to(opts.device) 101 | ) 102 | 103 | run_chat(opts, model, processor) 104 | 105 | except KeyboardInterrupt: 106 | print("Bye!") 107 | pass 108 | 109 | 110 | if __name__ == "__main__": 111 | main() 112 | -------------------------------------------------------------------------------- /python/uform/gen_model.py: -------------------------------------------------------------------------------- 1 | from uform.torch_decoders import VLMForCausalLM, VLMProcessor # legacy path 2 | -------------------------------------------------------------------------------- /python/uform/numpy_processors.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from typing import Dict, List, Union, Sequence 3 | import json 4 | 5 | from PIL.Image import Image, BICUBIC 6 | from tokenizers import Tokenizer 7 | import numpy as np 8 | 9 | from uform.shared import read_config 10 | 11 | 12 | class TextProcessor: 13 | def __init__(self, config_path: PathLike, tokenizer_path: PathLike): 14 | """ 15 | :param config: model config 16 | :param tokenizer_path: path to tokenizer file 17 | """ 18 | 19 | config = read_config(config_path) 20 | if "text_encoder" in config: 21 | config = config["text_encoder"] 22 | 23 | self._max_seq_len = config["max_position_embeddings"] 24 | self._tokenizer = Tokenizer.from_file(tokenizer_path) 25 | self._tokenizer.no_padding() 26 | self._pad_token_idx = config["padding_idx"] 27 | 28 | def __call__(self, texts: Union[str, Sequence[str]]) -> Dict[str, np.ndarray]: 29 | """Transforms one or more strings into dictionary with tokenized strings and attention masks. 
30 | 31 | :param texts: text of list of texts to tokenizer 32 | """ 33 | if isinstance(texts, str): 34 | texts = [texts] 35 | 36 | input_ids = np.full( 37 | (len(texts), self._max_seq_len), 38 | fill_value=self._pad_token_idx, 39 | dtype=np.int32, 40 | ) 41 | 42 | attention_mask = np.zeros( 43 | (len(texts), self._max_seq_len), 44 | dtype=np.int32, 45 | ) 46 | encoded = self._tokenizer.encode_batch(texts) 47 | 48 | for i, seq in enumerate(encoded): 49 | seq_len = min(len(seq), self._max_seq_len) 50 | input_ids[i, :seq_len] = seq.ids[:seq_len] 51 | 52 | attention_mask[i, :seq_len] = 1 53 | 54 | return {"input_ids": input_ids, "attention_mask": attention_mask} 55 | 56 | 57 | class ImageProcessor: 58 | def __init__(self, config_path: PathLike, tokenizer_path: PathLike = None): 59 | """ 60 | :param config: model config 61 | :param tokenizer_path: path to tokenizer file 62 | :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) 63 | """ 64 | 65 | config = read_config(config_path) 66 | if "image_encoder" in config: 67 | config = config["image_encoder"] 68 | 69 | self._image_size = config["image_size"] 70 | self._normalization_means = config["normalization_means"] 71 | self._normalization_deviations = config["normalization_deviations"] 72 | 73 | assert isinstance(self._image_size, int) and self._image_size > 0 74 | assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list) 75 | assert len(self._normalization_means) == len(self._normalization_deviations) == 3 76 | 77 | self.image_mean = np.array(self._normalization_means, dtype=np.float32)[None, None] 78 | self.image_std = np.array(self._normalization_deviations, dtype=np.float32)[None, None] 79 | 80 | def __call__(self, images: Union[Image, Sequence[Image]]) -> np.ndarray: 81 | """Transforms one or more Pillow images into Torch Tensors. 82 | 83 | :param images: image or list of images to preprocess 84 | """ 85 | 86 | if isinstance(images, Sequence): 87 | batch_images = np.empty( 88 | (len(images), 3, self._image_size, self._image_size), 89 | dtype=np.float32, 90 | ) 91 | 92 | for i, image in enumerate(images): 93 | batch_images[i] = self._resize_crop_normalize(image) 94 | 95 | else: 96 | batch_images = self._resize_crop_normalize(images)[None] 97 | 98 | return batch_images 99 | 100 | def _resize_crop_normalize(self, image: Image): 101 | width, height = image.size 102 | 103 | if width < height: 104 | width = self._image_size 105 | height = int(height / width * self._image_size) 106 | else: 107 | width = int(width / height * self._image_size) 108 | height = self._image_size 109 | 110 | image = image.resize((width, height), resample=BICUBIC) 111 | 112 | left = (width - self._image_size) / 2 113 | top = (height - self._image_size) / 2 114 | right = (width + self._image_size) / 2 115 | bottom = (height + self._image_size) / 2 116 | 117 | image = image.convert("RGB").crop((left, top, right, bottom)) 118 | # At this point `image` is a PIL Image with RGB channels. 119 | # If you convert it to `np.ndarray` it will have shape (H, W, C) where C is the number of channels. 120 | image = (np.array(image).astype(np.float32) / 255.0 - self.image_mean) / self.image_std 121 | 122 | # To make it compatible with PyTorch, we need to transpose the image to (C, H, W). 
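
A tiny standalone sketch of that channel reordering, using a random dummy array rather than the processor's real output, shows the (H, W, C) Pillow-style layout becoming the (C, H, W) layout the encoders expect:

```python
import numpy as np

height, width, channels = 224, 224, 3
hwc_image = np.random.rand(height, width, channels).astype(np.float32)  # Pillow-style layout

chw_image = np.transpose(hwc_image, (2, 0, 1))  # channels first, as the encoder expects
assert chw_image.shape == (channels, height, width)
```
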
123 | return np.transpose(image, (2, 0, 1)) 124 | -------------------------------------------------------------------------------- /python/uform/onnx_encoders.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from typing import Dict, Optional, Tuple, Union, Literal 3 | import json 4 | 5 | import onnxruntime as ort 6 | from numpy import ndarray 7 | 8 | from uform.shared import ExecutionProviderError 9 | 10 | 11 | def available_providers(device: Optional[str]) -> Tuple[str, ...]: 12 | """Returns a tuple of available execution providers based on the requested device. 13 | https://onnxruntime.ai/docs/execution-providers/ 14 | 15 | :param device: Device name, either `cpu` or `gpu`, or a specific execution provider name. 16 | :return: Tuple of available execution providers. 17 | :raises ExecutionProviderError: If the requested device is not available. 18 | """ 19 | 20 | gpu_providers = ("CUDAExecutionProvider", "TensorrtExecutionProvider") 21 | cpu_providers = ("OpenVINOExecutionProvider", "CoreMLExecutionProvider", "CPUExecutionProvider") 22 | available = ort.get_available_providers() 23 | 24 | # If no target device is specified, let's sort all the available ones with respect to our preference 25 | if device is None: 26 | preferences = gpu_providers + cpu_providers 27 | filtered_preferences = tuple(provider for provider in preferences if provider in available) 28 | if len(filtered_preferences): 29 | return filtered_preferences 30 | if len(available): 31 | return available 32 | raise ExecutionProviderError("No execution providers are available") 33 | 34 | # If a GPU is requested, but no GPU providers are available, raise an error 35 | if device == "gpu" or device == "cuda": 36 | if all(provider not in available for provider in gpu_providers): 37 | raise ExecutionProviderError( 38 | f"GPU providers are not available, consider installing `onnxruntime-gpu` and make sure the CUDA is available on your system. Currently installed: {available}" 39 | ) 40 | return [x for x in gpu_providers if x in available] 41 | 42 | # If a CPU is requested, but no CPU providers are available, raise an error 43 | if device == "cpu": 44 | if all(provider not in available for provider in cpu_providers): 45 | raise ExecutionProviderError( 46 | f"CPU providers are not available, consider installing `onnxruntime` and make sure the OpenVINO and CoreML are available on your system. Currently installed: {available}" 47 | ) 48 | return [x for x in cpu_providers if x in available] 49 | 50 | if device not in available: 51 | available_providers = ", ".join(available) 52 | raise ExecutionProviderError( 53 | f"Execution provider {device} is not available. 
Currently installed: {available_providers}" 54 | ) 55 | 56 | return (device,) 57 | 58 | 59 | class ImageEncoder: 60 | def __init__( 61 | self, 62 | model_path: str, 63 | *, 64 | device: Literal["cpu", "cuda"] = "cpu", 65 | return_features: bool = True, 66 | ): 67 | """ 68 | :param model_path: Path to onnx model 69 | :param device: Device name, either cpu or gpu 70 | """ 71 | 72 | session_options = ort.SessionOptions() 73 | session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL 74 | 75 | self.return_features = return_features 76 | self.session = ort.InferenceSession( 77 | model_path, 78 | sess_options=session_options, 79 | providers=available_providers(device), 80 | ) 81 | 82 | def encode( 83 | self, images: ndarray, return_features: Optional[bool] = None 84 | ) -> Union[ndarray, Tuple[ndarray, ndarray]]: 85 | features, embeddings = self.session.run(None, {"images": images}) 86 | return_features = return_features if return_features is not None else self.return_features 87 | if return_features: 88 | return features, embeddings 89 | return embeddings 90 | 91 | 92 | class TextEncoder: 93 | def __init__( 94 | self, 95 | model_path: str, 96 | *, 97 | device: Literal["cpu", "cuda"] = "cpu", 98 | return_features: bool = True, 99 | ): 100 | """ 101 | :param text_encoder_path: Path to onnx of text encoder 102 | :param device: Device name, either cpu or gpu 103 | """ 104 | 105 | session_options = ort.SessionOptions() 106 | session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL 107 | 108 | self.return_features = return_features 109 | self.text_encoder_session = ort.InferenceSession( 110 | model_path, 111 | sess_options=session_options, 112 | providers=available_providers(device), 113 | ) 114 | 115 | def encode( 116 | self, 117 | x: Union[ndarray, dict], 118 | attention_mask: Optional[ndarray] = None, 119 | return_features: Optional[bool] = None, 120 | ) -> Union[ndarray, Tuple[ndarray, ndarray]]: 121 | if isinstance(x, dict): 122 | assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None" 123 | attention_mask = x["attention_mask"] 124 | input_ids = x["input_ids"] 125 | else: 126 | input_ids = x 127 | 128 | features, embeddings = self.text_encoder_session.run( 129 | None, 130 | { 131 | "input_ids": input_ids, 132 | "attention_mask": attention_mask, 133 | }, 134 | ) 135 | 136 | return_features = return_features if return_features is not None else self.return_features 137 | if return_features: 138 | return features, embeddings 139 | return embeddings 140 | -------------------------------------------------------------------------------- /python/uform/shared.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Union 3 | from os import PathLike 4 | import json 5 | 6 | 7 | class Modality(Enum): 8 | TEXT_ENCODER = "text_encoder" 9 | IMAGE_ENCODER = "image_encoder" 10 | VIDEO_ENCODER = "video_encoder" 11 | TEXT_DECODER = "text_decoder" 12 | 13 | 14 | class ExecutionProviderError(Exception): 15 | """Exception raised when a requested execution provider is not available.""" 16 | 17 | 18 | ConfigOrPath = Union[PathLike, str, object] 19 | 20 | 21 | def read_config(path_or_object: ConfigOrPath) -> object: 22 | if isinstance(path_or_object, (PathLike, str)): 23 | with open(path_or_object, "r") as f: 24 | return json.load(f) 25 | else: 26 | return path_or_object 27 | 
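
`read_config` accepts either a path to a JSON file or an already-parsed object, so callers can forward configs without re-reading them from disk. A short usage sketch, assuming the `uform` package is importable and using a throwaway temporary file:

```python
import json
import tempfile

from uform.shared import Modality, read_config

config = {"image_encoder": {"image_size": 224}}

# Passing an already-parsed object returns it unchanged.
assert read_config(config) is config

# Passing a file path loads and parses the JSON document.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as handle:
    json.dump(config, handle)
    path = handle.name
assert read_config(path) == config

# Modality members can be looked up by their string values.
assert Modality("image_encoder") is Modality.IMAGE_ENCODER
```
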
-------------------------------------------------------------------------------- /python/uform/torch_decoders.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn 6 | from torchvision.transforms import ( 7 | CenterCrop, 8 | Compose, 9 | InterpolationMode, 10 | Normalize, 11 | RandomResizedCrop, 12 | Resize, 13 | ToTensor, 14 | ) 15 | from transformers import AutoConfig, AutoTokenizer 16 | from transformers.configuration_utils import PretrainedConfig 17 | from transformers.modeling_outputs import CausalLMOutputWithPast 18 | from transformers.modeling_utils import PreTrainedModel 19 | from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM 20 | from transformers.processing_utils import ProcessorMixin 21 | from transformers.tokenization_utils_base import BatchEncoding 22 | 23 | from uform.torch_encoders import ImageEncoder 24 | 25 | IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) 26 | IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) 27 | 28 | 29 | def convert_to_rgb(image): 30 | return image.convert("RGB") 31 | 32 | 33 | class LayerScale(nn.Module): 34 | def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False): 35 | super().__init__() 36 | self.weight = nn.Parameter(init_values * torch.ones(dim)) 37 | self.inplace = inplace 38 | 39 | def forward(self, x): 40 | return x.mul_(self.weight) if self.inplace else x * self.weight 41 | 42 | 43 | class ImageFeaturesPooler(nn.Module): 44 | def __init__( 45 | self, 46 | input_size, 47 | hidden_size, 48 | num_attn_heads, 49 | intermediate_size, 50 | num_latents, 51 | initializer_range, 52 | ): 53 | super().__init__() 54 | self.projection = nn.Linear(input_size, hidden_size) 55 | 56 | self.pooler = nn.TransformerDecoderLayer( 57 | hidden_size, 58 | num_attn_heads, 59 | intermediate_size, 60 | activation=nn.functional.silu, 61 | batch_first=True, 62 | norm_first=True, 63 | ) 64 | self.image_latents = nn.Parameter( 65 | torch.randn(1, num_latents, hidden_size) * initializer_range**0.5, 66 | ) 67 | 68 | def forward(self, features): 69 | features = self.projection(features) 70 | return self.pooler( 71 | self.image_latents.expand(features.shape[0], -1, -1), 72 | features, 73 | ) 74 | 75 | 76 | class VLMConfig(PretrainedConfig): 77 | model_type = "vlm" 78 | 79 | def __init__( 80 | self, 81 | text_decoder_name_or_path: str = "", 82 | tokenizer_name_or_path: str = "", 83 | image_size: int = 224, 84 | image_encoder_hidden_size: int = 768, 85 | image_encoder_patch_size: int = 16, 86 | image_encoder_num_layers: int = 12, 87 | image_encoder_num_heads: int = 12, 88 | image_encoder_embedding_dim: int = 256, 89 | image_encoder_pooling: str = "cls", 90 | image_pooler_num_attn_heads: int = 16, 91 | image_pooler_intermediate_size: int = 5504, 92 | image_pooler_num_latents: int = 196, 93 | image_token_id: int = 32002, 94 | initializer_range: float = 0.02, 95 | use_cache: bool = True, 96 | center_crop: bool = True, 97 | **kwargs, 98 | ): 99 | self.text_decoder_name_or_path = text_decoder_name_or_path 100 | self.tokenizer_name_or_path = tokenizer_name_or_path 101 | 102 | self.image_size = image_size 103 | self.image_encoder_hidden_size = image_encoder_hidden_size 104 | self.image_encoder_patch_size = image_encoder_patch_size 105 | self.image_encoder_num_layers = image_encoder_num_layers 106 | self.image_encoder_num_heads = image_encoder_num_heads 107 | 
self.image_encoder_embedding_dim = image_encoder_embedding_dim 108 | self.image_encoder_pooling = image_encoder_pooling 109 | 110 | self.image_pooler_num_attn_heads = image_pooler_num_attn_heads 111 | self.image_pooler_intermediate_size = image_pooler_intermediate_size 112 | self.image_pooler_num_latents = image_pooler_num_latents 113 | 114 | self.image_token_id = image_token_id 115 | 116 | self.initializer_range = initializer_range 117 | self.use_cache = use_cache 118 | self.center_crop = center_crop 119 | 120 | super().__init__(**kwargs) 121 | 122 | 123 | class VLMPreTrainedModel(PreTrainedModel): 124 | config_class = VLMConfig 125 | base_model_prefix = "vlm" 126 | supports_gradient_checkpointing = True 127 | _no_split_modules = [] 128 | _skip_keys_device_placement = "past_key_values" 129 | 130 | def _init_weights(self, module): 131 | pass 132 | 133 | def _initialize_weights(self, module): 134 | pass 135 | 136 | 137 | class VLMForCausalLM(VLMPreTrainedModel): 138 | def __init__(self, config: VLMConfig): 139 | super().__init__(config) 140 | 141 | self.config = config 142 | self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path) 143 | self.text_config.vocab_size += 3 144 | self.text_decoder = AutoModelForCausalLM.from_config(self.text_config) 145 | 146 | self.image_encoder = ImageEncoder( 147 | self.config.image_encoder_hidden_size, 148 | self.config.image_encoder_patch_size, 149 | self.config.image_size, 150 | self.config.image_encoder_num_layers, 151 | self.config.image_encoder_num_heads, 152 | self.config.image_encoder_embedding_dim, 153 | self.config.image_encoder_pooling, 154 | ) 155 | 156 | # replace models' layerscales because `transformers` automatically renames keys in `state_dict` 157 | for i in range(len(self.image_encoder.blocks)): 158 | self.image_encoder.blocks[i].ls1 = LayerScale( 159 | self.image_encoder.blocks[i].ls1.dim, 160 | ) 161 | self.image_encoder.blocks[i].ls2 = LayerScale( 162 | self.image_encoder.blocks[i].ls2.dim, 163 | ) 164 | 165 | self.image_pooler = ImageFeaturesPooler( 166 | self.config.image_encoder_hidden_size, 167 | self.text_config.hidden_size, 168 | self.config.image_pooler_num_attn_heads, 169 | self.config.image_pooler_intermediate_size, 170 | self.config.image_pooler_num_latents, 171 | self.config.initializer_range, 172 | ) 173 | 174 | def get_input_embeddings(self): 175 | return self.text_decoder.get_input_embeddings() 176 | 177 | def set_input_embeddings(self, value): 178 | self.text_decoder.set_input_embeddings(value) 179 | 180 | def get_images_embeddings(self, images): 181 | features = self.image_encoder.forward_features(images) 182 | return self.image_pooler(features) 183 | 184 | def gather_continuous_embeddings( 185 | self, 186 | input_ids: torch.Tensor, 187 | word_embeddings: torch.Tensor, 188 | image_embeddings: torch.Tensor, 189 | ) -> torch.Tensor: 190 | start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1] 191 | embeddings = [] 192 | 193 | for sample_idx, start_idx in enumerate(start_indices.tolist()): 194 | embeddings.append( 195 | torch.cat( 196 | ( 197 | word_embeddings[sample_idx, :start_idx], 198 | image_embeddings[sample_idx], 199 | word_embeddings[sample_idx, start_idx + 1 :], 200 | ), 201 | dim=0, 202 | ), 203 | ) 204 | 205 | return torch.stack(embeddings, dim=0) 206 | 207 | def forward( 208 | self, 209 | input_ids: torch.LongTensor = None, 210 | images: torch.Tensor = None, 211 | attention_mask: Optional[torch.Tensor] = None, 212 | position_ids: Optional[torch.LongTensor] = None, 213 | 
past_key_values: Optional[List[torch.FloatTensor]] = None, 214 | inputs_embeds: Optional[torch.FloatTensor] = None, 215 | use_cache: Optional[bool] = None, 216 | labels: Optional[torch.Tensor] = None, 217 | output_attentions: Optional[bool] = None, 218 | output_hidden_states: Optional[bool] = None, 219 | return_dict: Optional[bool] = None, 220 | ) -> Union[dict, Tuple, CausalLMOutputWithPast]: 221 | 222 | output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions 223 | output_hidden_states = ( 224 | output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states 225 | ) 226 | use_cache = use_cache if use_cache is not None else self.config.use_cache 227 | 228 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 229 | 230 | if input_ids is not None and inputs_embeds is not None: 231 | raise ValueError( 232 | "You cannot specify both input_ids and inputs_embeds at the same time", 233 | ) 234 | elif input_ids is None and inputs_embeds is None: 235 | raise ValueError("You have to specify either input_is or inputs_embeds") 236 | 237 | if inputs_embeds is None and past_key_values is None: 238 | inputs_embeds = self.get_input_embeddings()(input_ids) 239 | 240 | if images is not None: 241 | image_embeds = self.get_images_embeddings(images) 242 | inputs_embeds = self.gather_continuous_embeddings( 243 | input_ids, 244 | inputs_embeds, 245 | image_embeds, 246 | ) 247 | 248 | if position_ids is None: 249 | seq_length = inputs_embeds.shape[1] if inputs_embeds is not None else input_ids.shape[1] 250 | past_key_values_length = 0 251 | 252 | if past_key_values is not None: 253 | past_key_values_length = past_key_values[0][0].shape[2] 254 | 255 | device = input_ids.device if input_ids is not None else inputs_embeds.device 256 | position_ids = torch.arange( 257 | past_key_values_length, 258 | seq_length + past_key_values_length, 259 | dtype=torch.long, 260 | device=device, 261 | ) 262 | position_ids = position_ids.unsqueeze(0) 263 | 264 | outputs = self.text_decoder( 265 | inputs_embeds=inputs_embeds, 266 | input_ids=input_ids if past_key_values is not None else None, 267 | attention_mask=attention_mask, 268 | labels=labels, 269 | position_ids=position_ids, 270 | past_key_values=past_key_values, 271 | output_attentions=output_attentions, 272 | output_hidden_states=output_hidden_states, 273 | use_cache=use_cache, 274 | return_dict=return_dict, 275 | ) 276 | 277 | return outputs 278 | 279 | def prepare_inputs_for_generation( 280 | self, 281 | input_ids, 282 | images=None, 283 | past_key_values=None, 284 | attention_mask=None, 285 | inputs_embeds=None, 286 | **kwargs, 287 | ): 288 | if past_key_values: 289 | input_ids = input_ids[:, -1:] 290 | 291 | position_ids = kwargs.get("position_ids", None) 292 | if attention_mask is not None and position_ids is None: 293 | # create position_ids on the fly for batch generation 294 | position_ids = attention_mask.long().cumsum(-1) - 1 295 | position_ids.masked_fill_(attention_mask == 0, 1) 296 | if past_key_values: 297 | position_ids = position_ids[:, -1].unsqueeze(-1) 298 | 299 | # if `inputs_embeds` are passed, we only want to use them in the 1st generation step 300 | if inputs_embeds is not None and past_key_values is None: 301 | model_inputs = {"inputs_embeds": inputs_embeds} 302 | else: 303 | model_inputs = {"input_ids": input_ids} 304 | 305 | if images is not None: 306 | model_inputs["images"] = images 307 | 308 | model_inputs.update( 309 | { 310 | 
"position_ids": position_ids, 311 | "past_key_values": past_key_values, 312 | "use_cache": kwargs.get("use_cache"), 313 | "attention_mask": attention_mask, 314 | "images": images if past_key_values is None else None, 315 | }, 316 | ) 317 | return model_inputs 318 | 319 | @classmethod 320 | def from_config(cls, config, **kwargs): 321 | return cls._from_config(config, **kwargs) 322 | 323 | 324 | class VLMProcessor(ProcessorMixin): 325 | def __init__(self, config, **kwargs): 326 | self.feature_extractor = None 327 | self.config = config 328 | 329 | if config.center_crop: 330 | self.image_processor = Compose( 331 | [ 332 | Resize(256, interpolation=InterpolationMode.BICUBIC), 333 | CenterCrop(config.image_size), 334 | convert_to_rgb, 335 | ToTensor(), 336 | Normalize( 337 | mean=IMAGENET_MEAN, 338 | std=IMAGENET_STD, 339 | ), 340 | ], 341 | ) 342 | else: 343 | self.image_processor = Compose( 344 | [ 345 | RandomResizedCrop( 346 | config.image_size, 347 | scale=(0.8, 1), 348 | interpolation=InterpolationMode.BICUBIC, 349 | ), 350 | convert_to_rgb, 351 | ToTensor(), 352 | Normalize( 353 | mean=IMAGENET_MEAN, 354 | std=IMAGENET_STD, 355 | ), 356 | ], 357 | ) 358 | 359 | self.tokenizer = AutoTokenizer.from_pretrained( 360 | config.tokenizer_name_or_path, 361 | additional_special_tokens=["<|im_end|>"], 362 | ) 363 | self.num_image_latents = config.image_pooler_num_latents 364 | 365 | def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs): 366 | if texts is not None: 367 | if isinstance(texts, str): 368 | texts = [texts] 369 | 370 | tokenized_texts = [] 371 | for text in texts: 372 | messages = [ 373 | {"role": "system", "content": "You are a helpful assistant."}, 374 | {"role": "user", "content": f" {text}"}, 375 | ] 376 | tokenized_prompt = self.tokenizer.apply_chat_template( 377 | messages, 378 | add_generation_prompt=True, 379 | return_tensors=return_tensors, 380 | ) 381 | 382 | tokenized_texts.append(tokenized_prompt) 383 | 384 | max_len = max(len(t[0]) for t in tokenized_texts) 385 | input_ids = torch.full( 386 | (len(tokenized_texts), max_len), 387 | fill_value=self.tokenizer.pad_token_id, 388 | dtype=torch.int64, 389 | ) 390 | attention_mask = torch.full( 391 | (len(tokenized_texts), max_len), 392 | fill_value=0, 393 | dtype=torch.int64, 394 | ) 395 | 396 | for i, tokens in enumerate(tokenized_texts): 397 | input_ids[i, -len(tokens[0]) :] = tokens[0] 398 | attention_mask[i, -len(tokens[0]) :] = 1 399 | 400 | attention_mask = F.pad( 401 | attention_mask, 402 | pad=(0, self.num_image_latents - 1), 403 | value=1, 404 | ) 405 | 406 | encoding = BatchEncoding( 407 | data={ 408 | "input_ids": input_ids, 409 | "attention_mask": attention_mask, 410 | }, 411 | ) 412 | 413 | if images is not None: 414 | if isinstance(images, (list, tuple)): 415 | image_features = torch.empty( 416 | (len(images), 3, self.config.image_size, self.config.image_size), 417 | dtype=torch.float32, 418 | ) 419 | 420 | for i, image in enumerate(images): 421 | image_features[i] = self.image_processor(image) 422 | else: 423 | image_features = self.image_processor(images).unsqueeze(0) 424 | 425 | if texts is not None and images is not None: 426 | encoding["images"] = image_features 427 | return encoding 428 | 429 | if texts is not None: 430 | return encoding 431 | 432 | return BatchEncoding( 433 | data={ 434 | "images": image_features, 435 | }, 436 | tensor_type=return_tensors, 437 | ) 438 | 439 | def batch_decode(self, *args, **kwargs): 440 | return self.tokenizer.batch_decode(*args, **kwargs) 441 | 442 | def 
decode(self, *args, **kwargs): 443 | return self.tokenizer.decode(*args, **kwargs) 444 | 445 | @classmethod 446 | def from_pretrained( 447 | cls, 448 | pretrained_model_name_or_path, 449 | cache_dir=None, 450 | force_download: bool = False, 451 | local_files_only: bool = False, 452 | token=None, 453 | revision: str = "main", 454 | **kwargs, 455 | ): 456 | config = AutoConfig.from_pretrained( 457 | pretrained_model_name_or_path, 458 | cache_dir=cache_dir, 459 | force_download=force_download, 460 | local_files_only=local_files_only, 461 | revision=revision, 462 | token=token, 463 | **kwargs, 464 | ) 465 | return cls(config) 466 | 467 | 468 | AutoConfig.register("vlm", VLMConfig) 469 | AutoModel.register(VLMConfig, VLMForCausalLM) 470 | -------------------------------------------------------------------------------- /python/uform/torch_encoders.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dataclasses import dataclass 4 | from os import PathLike 5 | from typing import Dict, Optional, Union, Mapping, Any, Tuple 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch import Tensor 11 | from PIL.Image import Image 12 | 13 | from uform.shared import read_config 14 | 15 | 16 | def _is_on_gpu(model: nn.Module) -> bool: 17 | try: 18 | return next(model.parameters()).device.type == "cuda" 19 | except StopIteration: 20 | return False 21 | 22 | 23 | @dataclass(eq=False) 24 | class Attention(nn.Module): 25 | dim: int 26 | num_heads: int 27 | dropout_prob: float = 0 28 | 29 | def __post_init__(self): 30 | super().__init__() 31 | 32 | self.use_sdp = int(torch.__version__[0]) > 1 33 | 34 | self.query = nn.Linear(self.dim, self.dim) 35 | self.key = nn.Linear(self.dim, self.dim) 36 | self.value = nn.Linear(self.dim, self.dim) 37 | self.out = nn.Linear(self.dim, self.dim) 38 | 39 | self.head_dim = self.dim // self.num_heads 40 | self.scale = self.head_dim**-0.5 41 | 42 | def forward( 43 | self, 44 | x: Tensor, 45 | attn_mask: Optional[Tensor] = None, 46 | context: Optional[Tensor] = None, 47 | is_causal: bool = False, 48 | ) -> Tensor: 49 | query = self.reshape(self.query(x)) 50 | key = self.reshape(self.key(x if context is None else context)) 51 | value = self.reshape(self.value(x if context is None else context)) 52 | 53 | if self.use_sdp: 54 | x = F.scaled_dot_product_attention( 55 | query, 56 | key, 57 | value, 58 | attn_mask, 59 | dropout_p=self.dropout_prob if self.training else 0, 60 | is_causal=is_causal, 61 | ) 62 | else: 63 | attn = query @ key.transpose(-2, -1) * self.scale 64 | if attn_mask is not None: 65 | attn += attn_mask 66 | 67 | attn = attn.softmax(dim=-1) 68 | x = attn @ value 69 | 70 | return self.out(x.transpose(2, 1).flatten(2)) 71 | 72 | def reshape(self, x: Tensor) -> Tensor: 73 | batch_size, seq_len, _ = x.shape 74 | x = x.view(batch_size, seq_len, self.num_heads, self.head_dim) 75 | return x.transpose(2, 1) 76 | 77 | 78 | @dataclass(eq=False) 79 | class MLP(nn.Module): 80 | dim: int 81 | dim_expand_factor: int = 4 82 | 83 | def __post_init__(self): 84 | super().__init__() 85 | 86 | self.hidden_layer = nn.Linear(self.dim, self.dim * self.dim_expand_factor) 87 | self.output_layer = nn.Linear(self.dim * self.dim_expand_factor, self.dim) 88 | 89 | def forward(self, x: Tensor) -> Tensor: 90 | x = F.gelu(self.hidden_layer(x)) 91 | return self.output_layer(x) 92 | 93 | 94 | @dataclass(eq=False) 95 | class LayerScale(nn.Module): 96 | dim: int 97 | 
init_values: float = 1e-5 98 | inplace: bool = False 99 | 100 | def __post_init__(self): 101 | super().__init__() 102 | self.gamma = nn.Parameter(self.init_values * torch.ones(self.dim)) 103 | 104 | def forward(self, x: Tensor) -> Tensor: 105 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 106 | 107 | 108 | @dataclass(eq=False) 109 | class TextEncoderBlock(nn.Module): 110 | dim: int 111 | num_heads: int 112 | dropout_prob: float 113 | cross_attention: bool = False 114 | 115 | def __post_init__(self): 116 | super().__init__() 117 | 118 | self.norm_attn = nn.LayerNorm(self.dim, eps=1e-12) 119 | self.attention = Attention(self.dim, self.num_heads, self.dropout_prob) 120 | 121 | if self.cross_attention: 122 | self.norm_crossattn = nn.LayerNorm(self.dim, eps=1e-12) 123 | self.crossattn = Attention(self.dim, self.num_heads, self.dropout_prob) 124 | 125 | self.norm_mlp = nn.LayerNorm(self.dim, eps=1e-12) 126 | self.mlp = MLP(self.dim) 127 | 128 | self.dropout = nn.Dropout(self.dropout_prob) 129 | 130 | def forward( 131 | self, 132 | x: Tensor, 133 | attn_mask: Tensor, 134 | context: Optional[Tensor] = None, 135 | ) -> Tensor: 136 | x = self.norm_attn(x + self.dropout(self.attention(x, attn_mask))) 137 | 138 | if self.cross_attention and context is not None: 139 | x = self.norm_crossattn( 140 | x + self.dropout(self.crossattn(x, context=context)), 141 | ) 142 | 143 | return self.norm_mlp(x + self.dropout(self.mlp(x))) 144 | 145 | 146 | @dataclass(eq=False) 147 | class ImageEncoderBlock(nn.Module): 148 | dim: int 149 | num_heads: int 150 | 151 | def __post_init__(self): 152 | super().__init__() 153 | self.norm1 = nn.LayerNorm(self.dim, eps=1e-6) 154 | self.attn = Attention(self.dim, self.num_heads) 155 | self.ls1 = LayerScale(self.dim) 156 | 157 | self.norm2 = nn.LayerNorm(self.dim, eps=1e-6) 158 | self.mlp = MLP(self.dim) 159 | self.ls2 = LayerScale(self.dim) 160 | 161 | def forward(self, x: Tensor) -> Tensor: 162 | x = x + self.ls1(self.attn(self.norm1(x))) 163 | x = x + self.ls2(self.mlp(self.norm2(x))) 164 | return x 165 | 166 | 167 | @dataclass(eq=False) 168 | class TextEncoder(nn.Module): 169 | model_type: str 170 | dim: int 171 | context_dim: int 172 | vocab_size: int 173 | padding_idx: int 174 | num_layers: int 175 | num_heads: int 176 | embedding_dim: int 177 | multimodal_layers_ids: tuple 178 | head_one_neuron: bool 179 | pooling: str = "cls" 180 | max_position_embeddings: int = 77 181 | dropout_prob: float = 0 182 | 183 | def __post_init__(self): 184 | super().__init__() 185 | 186 | self.word_embeddings = nn.Embedding( 187 | self.vocab_size, 188 | self.dim, 189 | padding_idx=self.padding_idx, 190 | ) 191 | self.position_embeddings = nn.Embedding(self.max_position_embeddings, self.dim) 192 | 193 | if self.model_type == "bert": 194 | self.register_buffer( 195 | "position_ids", 196 | torch.arange(self.max_position_embeddings).unsqueeze(0), 197 | persistent=False, 198 | ) 199 | 200 | self.layer_norm = nn.LayerNorm(self.dim, eps=1e-12) 201 | self.dropout = nn.Dropout(self.dropout_prob) 202 | 203 | self.blocks = nn.ModuleList( 204 | [ 205 | TextEncoderBlock( 206 | self.dim, 207 | self.num_heads, 208 | self.dropout_prob, 209 | layer_id in self.multimodal_layers_ids, 210 | ) 211 | for layer_id in range(self.num_layers) 212 | ], 213 | ) 214 | 215 | self.embedding_projection = nn.Linear(self.dim, self.embedding_dim, bias=False) 216 | self.matching_head = nn.Linear(self.dim, 1 if self.head_one_neuron else 2) 217 | 218 | if self.context_dim != self.dim: 219 | self.context_projection 
= nn.Linear(self.context_dim, self.dim, bias=False) 220 | else: 221 | self.context_projection = nn.Identity() 222 | self.return_features = False 223 | 224 | def forward_features(self, x: Tensor, attn_mask: Tensor) -> Tensor: 225 | x = self.embed_text(x) 226 | attn_mask = self.get_attention_mask(attn_mask, x.dtype) 227 | 228 | for block in self.blocks: 229 | if not block.cross_attention: 230 | x = block(x, attn_mask) 231 | 232 | return x 233 | 234 | def forward_embedding(self, x: Tensor, attn_mask: Tensor) -> Tensor: 235 | return self.embedding_projection(self.pool_features(x, attn_mask)) 236 | 237 | def pool_features(self, x: Tensor, attn_mask: Tensor) -> Tensor: 238 | if self.pooling == "cls": 239 | return x[:, 0] 240 | 241 | attn_mask = attn_mask.unsqueeze(2).type_as(x) 242 | return (x * attn_mask).sum(dim=1) / attn_mask.sum(dim=1) 243 | 244 | def get_attention_mask(self, attn_mask: Tensor, dtype: torch.dtype) -> Tensor: 245 | attn_mask = attn_mask.to(dtype) 246 | attn_mask = (1.0 - attn_mask) * torch.finfo(dtype).min 247 | return attn_mask.unsqueeze(1).expand(-1, attn_mask.shape[1], -1).unsqueeze(1) 248 | 249 | def get_position_ids(self, x: Tensor) -> Tensor: 250 | if self.model_type == "roberta": 251 | mask = x.ne(self.padding_idx).int() 252 | return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + self.padding_idx 253 | 254 | return self.position_ids[:, : x.shape[1]] 255 | 256 | def embed_text(self, x: Tensor) -> Tensor: 257 | positional_embedding = self.position_embeddings(self.get_position_ids(x)) 258 | x = self.word_embeddings(x) + positional_embedding 259 | return self.dropout(self.layer_norm(x)) 260 | 261 | def forward( 262 | self, 263 | x: Union[Tensor, dict], 264 | attention_mask: Optional[Tensor] = None, 265 | return_features: Optional[bool] = None, 266 | ) -> Union[Tensor, Tuple[Tensor, Tensor]]: 267 | 268 | if isinstance(x, dict): 269 | assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None" 270 | attention_mask = x["attention_mask"] 271 | x = x["input_ids"] 272 | elif attention_mask is None: 273 | # If no attention mask is provided - create one with all ones 274 | attention_mask = torch.ones_like(x) 275 | 276 | # If the model is on the GPU and the input matrices are not, shift them there 277 | if _is_on_gpu(self) and not x.is_cuda: 278 | x = x.cuda() 279 | attention_mask = attention_mask.cuda() 280 | 281 | features = self.forward_features(x, attention_mask) 282 | embeddings = self.forward_embedding(features, attention_mask) 283 | 284 | return_features = return_features if return_features is not None else self.return_features 285 | if return_features: 286 | return features, embeddings 287 | return embeddings 288 | 289 | def encode( 290 | self, 291 | x: Union[Tensor, dict], 292 | attention_mask: Optional[Tensor] = None, 293 | return_features: Optional[bool] = None, 294 | ) -> Union[Tensor, Tuple[Tensor, Tensor]]: 295 | 296 | result = self.forward(x, attention_mask, return_features) 297 | if isinstance(result, tuple): 298 | return result[0].detach(), result[1].detach() 299 | else: 300 | return result.detach() 301 | 302 | @staticmethod 303 | def from_pretrained(config: Union[PathLike, str, object], model: Union[PathLike, str]) -> TextEncoder: 304 | """Load the image encoder from the given configuration and model path. 
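
        Example (illustrative file names, assuming you exported the weights yourself;
        pair the encoder with ``TextProcessor`` from ``uform.torch_processors``)::

            from uform.torch_encoders import TextEncoder
            from uform.torch_processors import TextProcessor

            encoder = TextEncoder.from_pretrained("config.json", "text_encoder.pt")
            processor = TextProcessor("config.json", "tokenizer.json")
            embeddings = encoder.encode(processor(["a sunny beach"]))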
305 | 306 | :param config: the configuration dictionary or path to the JSON configuration file 307 | :param model: the model state dictionary or path to the `.pt` model file 308 | """ 309 | config = read_config(config) 310 | if "text_encoder" in config: 311 | config = config["text_encoder"] 312 | 313 | # We must strip all the non-member attributes before initializing the classes. 314 | text_fields = TextEncoder.__dataclass_fields__ 315 | config = {k: v for k, v in config.items() if k in text_fields} 316 | encoder = TextEncoder(**config) 317 | 318 | # Load from disk 319 | if isinstance(model, (PathLike, str)): 320 | state = torch.load(model) 321 | else: 322 | state = model 323 | if "text_encoder" in state: 324 | state = state["text_encoder"] 325 | encoder.load_state_dict(state) 326 | return encoder 327 | 328 | 329 | @dataclass(eq=False) 330 | class ImageEncoder(nn.Module): 331 | dim: int 332 | patch_size: int 333 | image_size: int 334 | num_layers: int 335 | num_heads: int 336 | embedding_dim: int 337 | pooling: str 338 | num_reg_tokens: int = 0 339 | 340 | def __post_init__(self): 341 | super().__init__() 342 | 343 | seq_len = (self.image_size // self.patch_size) ** 2 344 | self.patch_embed = nn.Conv2d(3, self.dim, self.patch_size, self.patch_size) 345 | self.pos_embed = nn.Parameter(torch.randn(1, seq_len, self.dim) * 0.02) 346 | self.cls_token = nn.Parameter(torch.zeros(1, 1, self.dim)) 347 | 348 | if self.num_reg_tokens > 0: 349 | self.reg_token = nn.Parameter(torch.zeros(1, self.num_reg_tokens, self.dim)) 350 | 351 | self.blocks = nn.Sequential( 352 | *[ImageEncoderBlock(self.dim, self.num_heads) for _ in range(self.num_layers)], 353 | ) 354 | 355 | self.norm = nn.LayerNorm(self.dim, eps=1e-6) 356 | self.embedding_projection = nn.Linear(self.dim, self.embedding_dim, bias=False) 357 | self.return_features = False 358 | 359 | def forward_features(self, x: Union[Tensor, dict]) -> Tensor: 360 | x = self.patch_embed(x).flatten(start_dim=2).transpose(2, 1) 361 | x = x + self.pos_embed 362 | special_tokens = [self.cls_token.expand(x.shape[0], -1, -1)] 363 | 364 | if self.num_reg_tokens > 0: 365 | special_tokens.append(self.reg_token.expand(x.shape[0], -1, -1)) 366 | 367 | x = torch.cat(special_tokens + [x], dim=1) 368 | x = self.blocks(x) 369 | return self.norm(x) 370 | 371 | def forward_embedding(self, x: Tensor) -> Tensor: 372 | if self.pooling == "cls": 373 | x = x[:, 0] 374 | else: 375 | x = x.mean(dim=1) 376 | 377 | return self.embedding_projection(x) 378 | 379 | def forward(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Tensor: 380 | if isinstance(x, dict): 381 | x = x["images"] 382 | 383 | # If the model is on the GPU and the input matrices are not, shift them there 384 | if _is_on_gpu(self) and not x.is_cuda: 385 | x = x.cuda() 386 | 387 | features = self.forward_features(x) 388 | embeddings = self.forward_embedding(features) 389 | return_features = return_features if return_features is not None else self.return_features 390 | if return_features: 391 | return features, embeddings 392 | return embeddings 393 | 394 | def encode(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Tensor: 395 | result = self.forward(x, return_features) 396 | if isinstance(result, tuple): 397 | return result[0].detach(), result[1].detach() 398 | else: 399 | return result.detach() 400 | 401 | @staticmethod 402 | def from_pretrained( 403 | config: Union[PathLike, str, object], 404 | model: Union[PathLike, str, Mapping[str, Any]], 405 | ) -> ImageEncoder: 406 | 
"""Load the image encoder from the given configuration and model path. 407 | 408 | :param config: the configuration dictionary or path to the JSON configuration file 409 | :param model: the model state dictionary or path to the `.pt` model file 410 | """ 411 | config = read_config(config) 412 | if "image_encoder" in config: 413 | config = config["image_encoder"] 414 | 415 | # We must strip all the non-member attributes before initializing the classes. 416 | image_fields = ImageEncoder.__dataclass_fields__ 417 | config = {k: v for k, v in config.items() if k in image_fields} 418 | encoder = ImageEncoder(**config) 419 | 420 | # Load from disk 421 | if isinstance(model, (PathLike, str)): 422 | state = torch.load(model) 423 | else: 424 | state = model 425 | if "image_encoder" in state: 426 | state = state["image_encoder"] 427 | encoder.load_state_dict(state) 428 | return encoder 429 | -------------------------------------------------------------------------------- /python/uform/torch_processors.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from typing import Dict, List, Union, Sequence 3 | import json 4 | 5 | import torch 6 | from PIL.Image import Image 7 | from tokenizers import Tokenizer 8 | from torch import Tensor 9 | from torchvision.transforms import ( 10 | CenterCrop, 11 | Compose, 12 | InterpolationMode, 13 | Normalize, 14 | Resize, 15 | ToTensor, 16 | ) 17 | 18 | from uform.shared import read_config 19 | 20 | 21 | # lambda is not pickle-able 22 | def convert_to_rgb(image): 23 | return image.convert("RGB") 24 | 25 | 26 | class TextProcessor: 27 | def __init__(self, config_path: PathLike, tokenizer_path: PathLike): 28 | """ 29 | :param config: model config 30 | :param tokenizer_path: path to tokenizer file 31 | """ 32 | 33 | config = read_config(config_path) 34 | if "text_encoder" in config: 35 | config = config["text_encoder"] 36 | 37 | self._max_seq_len = config["max_position_embeddings"] 38 | self._tokenizer = Tokenizer.from_file(tokenizer_path) 39 | self._tokenizer.no_padding() 40 | self._pad_token_idx = config["padding_idx"] 41 | 42 | def __call__(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]: 43 | """Transforms one or more strings into dictionary with tokenized strings and attention masks. 
44 | 45 | :param texts: text of list of texts to tokenizer 46 | :return: dictionary with tokenized strings and attention masks as values 47 | """ 48 | if isinstance(texts, str): 49 | texts = [texts] 50 | 51 | input_ids = torch.full( 52 | (len(texts), self._max_seq_len), 53 | fill_value=self._pad_token_idx, 54 | dtype=torch.int64, 55 | ) 56 | 57 | attention_mask = torch.zeros( 58 | len(texts), 59 | self._max_seq_len, 60 | dtype=torch.int32, 61 | ) 62 | encoded = self._tokenizer.encode_batch(texts) 63 | 64 | for i, seq in enumerate(encoded): 65 | seq_len = min(len(seq), self._max_seq_len) 66 | input_ids[i, :seq_len] = torch.LongTensor( 67 | seq.ids[:seq_len], 68 | ) 69 | attention_mask[i, :seq_len] = 1 70 | 71 | return {"input_ids": input_ids, "attention_mask": attention_mask} 72 | 73 | 74 | class ImageProcessor: 75 | def __init__(self, config_path: PathLike): 76 | """ 77 | :param config: model config 78 | """ 79 | 80 | config = read_config(config_path) 81 | if "image_encoder" in config: 82 | config = config["image_encoder"] 83 | 84 | self._image_size = config["image_size"] 85 | self._normalization_means = config["normalization_means"] 86 | self._normalization_deviations = config["normalization_deviations"] 87 | 88 | assert isinstance(self._image_size, int) and self._image_size > 0 89 | assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list) 90 | assert len(self._normalization_means) == len(self._normalization_deviations) == 3 91 | 92 | self._image_transform = Compose( 93 | [ 94 | Resize(self._image_size, interpolation=InterpolationMode.BICUBIC), 95 | convert_to_rgb, 96 | CenterCrop(self._image_size), 97 | ToTensor(), 98 | Normalize( 99 | mean=tuple(self._normalization_means), 100 | std=tuple(self._normalization_deviations), 101 | ), 102 | ], 103 | ) 104 | 105 | def __call__(self, images: Union[Image, Sequence[Image]]) -> Dict[str, Tensor]: 106 | """Transforms one or more Pillow images into Torch Tensors. 107 | 108 | :param images: image or list of images to preprocess 109 | :return: dictionary with float-represented images in tensors as values 110 | """ 111 | 112 | if isinstance(images, Sequence): 113 | batch_images = torch.empty( 114 | (len(images), 3, self._image_size, self._image_size), 115 | dtype=torch.float32, 116 | ) 117 | 118 | for i, image in enumerate(images): 119 | batch_images[i] = self._image_transform(image) 120 | 121 | else: 122 | batch_images = self._image_transform(images).unsqueeze(0) 123 | 124 | return {"images": batch_images} 125 | -------------------------------------------------------------------------------- /swift/EncodersTests.swift: -------------------------------------------------------------------------------- 1 | import CoreGraphics 2 | import Hub 3 | import ImageIO 4 | import UForm 5 | import XCTest 6 | 7 | final class TokenizerTests: XCTestCase { 8 | 9 | var hfToken: String? 10 | 11 | override func setUp() { 12 | super.setUp() 13 | // Attempt to load the Hugging Face token from the `.hf_token` file in the current directory 14 | let fileURL = URL(fileURLWithPath: FileManager.default.currentDirectoryPath).appendingPathComponent(".hf_token") 15 | if let token = try? String(contentsOf: fileURL, encoding: .utf8).trimmingCharacters(in: .whitespacesAndNewlines) 16 | { 17 | hfToken = token 18 | } 19 | 20 | hfToken = hfToken ?? ProcessInfo.processInfo.environment["HF_TOKEN"] 21 | hfToken = hfToken ?? 
"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" 22 | } 23 | 24 | func cosineSimilarity(between vectorA: [T], and vectorB: [T]) -> T { 25 | guard vectorA.count == vectorB.count else { 26 | fatalError("Vectors must be of the same length.") 27 | } 28 | 29 | let dotProduct = zip(vectorA, vectorB).reduce(T.zero) { $0 + ($1.0 * $1.1) } 30 | let magnitudeA = sqrt(vectorA.reduce(T.zero) { $0 + $1 * $1 }) 31 | let magnitudeB = sqrt(vectorB.reduce(T.zero) { $0 + $1 * $1 }) 32 | 33 | // Avoid division by zero 34 | if magnitudeA == T.zero || magnitudeB == T.zero { 35 | return T.zero 36 | } 37 | 38 | return dotProduct / (magnitudeA * magnitudeB) 39 | } 40 | 41 | func testTextEmbeddings(forModel modelName: String) async throws { 42 | 43 | let api = HubApi(hfToken: hfToken) 44 | let textModel = try await TextEncoder( 45 | modelName: "unum-cloud/uform3-image-text-english-small", 46 | hubApi: api 47 | ) 48 | 49 | let texts = [ 50 | "sunny beach with clear blue water", 51 | "crowded sandbeach under the bright sun", 52 | "dense forest with tall green trees", 53 | "quiet park in the morning light", 54 | ] 55 | 56 | var textEmbeddings: [[Float32]] = [] 57 | for text in texts { 58 | let embedding: [Float32] = try textModel.encode(text).asFloats() 59 | textEmbeddings.append(embedding) 60 | } 61 | 62 | // Now let's compute the cosine similarity between the textEmbeddings 63 | let similarityBeach = cosineSimilarity(between: textEmbeddings[0], and: textEmbeddings[1]) 64 | let similarityForest = cosineSimilarity(between: textEmbeddings[2], and: textEmbeddings[3]) 65 | let dissimilarityBetweenScenes = cosineSimilarity(between: textEmbeddings[0], and: textEmbeddings[2]) 66 | 67 | // Assert that similar texts have higher similarity scores 68 | XCTAssertTrue( 69 | similarityBeach > dissimilarityBetweenScenes, 70 | "Beach texts should be more similar to each other than to forest texts." 71 | ) 72 | XCTAssertTrue( 73 | similarityForest > dissimilarityBetweenScenes, 74 | "Forest texts should be more similar to each other than to beach texts." 75 | ) 76 | } 77 | 78 | func testTextEmbeddings() async throws { 79 | for model in [ 80 | "unum-cloud/uform3-image-text-english-small", 81 | "unum-cloud/uform3-image-text-english-base", 82 | "unum-cloud/uform3-image-text-english-large", 83 | "unum-cloud/uform3-image-text-multilingual-base", 84 | ] { 85 | try await testTextEmbeddings(forModel: model) 86 | } 87 | } 88 | 89 | func testImageEmbeddings(forModel modelName: String) async throws { 90 | 91 | // One option is to use a local model repository. 
92 | // 93 | // let root = "uform/" 94 | // let textModel = try TextEncoder( 95 | // modelPath: root + "uform-vl-english-large-text_encoder.mlpackage", 96 | // configPath: root + "uform-vl-english-large-text.json", 97 | // tokenizerPath: root + "uform-vl-english-large-text.tokenizer.json" 98 | // ) 99 | // let imageModel = try ImageEncoder( 100 | // modelPath: root + "uform-vl-english-large-image_encoder.mlpackage", 101 | // configPath: root + "uform-vl-english-large-image.json" 102 | // ) 103 | // 104 | // A better option is to fetch directly from HuggingFace, similar to how users would do that: 105 | let api = HubApi(hfToken: hfToken) 106 | let textModel = try await TextEncoder( 107 | modelName: modelName, 108 | hubApi: api 109 | ) 110 | let imageModel = try await ImageEncoder( 111 | modelName: modelName, 112 | hubApi: api 113 | ) 114 | 115 | let texts = [ 116 | "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", 117 | "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", 118 | "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", 119 | "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", 120 | "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", 121 | ] 122 | let imageURLs = [ 123 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true", 124 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true", 125 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true", 126 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true", 127 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true", 128 | ] 129 | 130 | var textEmbeddings: [[Float32]] = [] 131 | var imageEmbeddings: [[Float32]] = [] 132 | for (text, imageURL) in zip(texts, imageURLs) { 133 | guard let url = URL(string: imageURL), 134 | let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil), 135 | let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil) 136 | else { 137 | throw NSError( 138 | domain: "ImageError", 139 | code: 100, 140 | userInfo: [NSLocalizedDescriptionKey: "Could not load image from URL: \(imageURL)"] 141 | ) 142 | } 143 | 144 | let textEmbedding: [Float32] = try textModel.encode(text).asFloats() 145 | textEmbeddings.append(textEmbedding) 146 | let imageEmbedding: [Float32] = try imageModel.encode(cgImage).asFloats() 147 | imageEmbeddings.append(imageEmbedding) 148 | } 149 | 150 | // Now let's make sure that the cosine distance between image and respective text embeddings is low. 151 | // Make sure that the similarity between image and text at index `i` is higher than with other texts and images. 
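        // Put differently: for every index `i`, the matching pair must rank first in both
        // retrieval directions: text `i` against all images, and image `i` against all texts.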
152 |         for i in 0 ..< texts.count {
153 |             let pairSimilarity = cosineSimilarity(between: textEmbeddings[i], and: imageEmbeddings[i])
154 |             let otherTextSimilarities = (0 ..< texts.count).filter { $0 != i }.map {
155 |                 cosineSimilarity(between: textEmbeddings[$0], and: imageEmbeddings[i])
156 |             }
157 |             let otherImageSimilarities = (0 ..< texts.count).filter { $0 != i }.map {
158 |                 cosineSimilarity(between: textEmbeddings[i], and: imageEmbeddings[$0])
159 |             }
160 | 
161 |             XCTAssertTrue(
162 |                 pairSimilarity > otherTextSimilarities.max()!,
163 |                 "Image should be more similar to its corresponding text than to other texts."
164 |             )
165 |             XCTAssertTrue(
166 |                 pairSimilarity > otherImageSimilarities.max()!,
167 |                 "Text should be more similar to its corresponding image than to other images."
168 |             )
169 |         }
170 |     }
171 | 
172 |     func testImageEmbeddings() async throws {
173 |         for model in [
174 |             "unum-cloud/uform3-image-text-english-small",
175 |             "unum-cloud/uform3-image-text-english-base",
176 |             "unum-cloud/uform3-image-text-english-large",
177 |             "unum-cloud/uform3-image-text-multilingual-base",
178 |         ] {
179 |             try await testImageEmbeddings(forModel: model)
180 |         }
181 |     }
182 | 
183 | }
184 | 
--------------------------------------------------------------------------------
/swift/README.md:
--------------------------------------------------------------------------------
1 | # UForm Swift SDK
2 | 
3 | UForm offers first-party support for Swift.
4 | To get started, add UForm to your project using Swift Package Manager.
5 | 
6 | ```bash
7 | swift package init --type executable
8 | swift package add uform
9 | ```
10 | 
11 | Then, import UForm in your Swift code:
12 | 
13 | ```swift
14 | import UForm
15 | ```
16 | 
17 | ## Embeddings
18 | 
19 | ### Text Embeddings
20 | 
21 | ```swift
22 | let textModel = try await TextEncoder(
23 |     modelName: "unum-cloud/uform3-image-text-english-small",
24 |     computeUnits: .cpuAndNeuralEngine
25 | )
26 | let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie."
27 | let textEmbedding: Embedding = try textModel.encode(text)
28 | let textVector: [Float32] = textEmbedding.asFloats()
29 | ```
30 | 
31 | ### Image Embeddings
32 | 
33 | ```swift
34 | let imageModel = try await ImageEncoder(
35 |     modelName: "unum-cloud/uform3-image-text-english-small",
36 |     computeUnits: .cpuAndNeuralEngine
37 | )
38 | let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true"
39 | guard let url = URL(string: imageURL),
40 |     let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil),
41 |     let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil) else {
42 |     fatalError("Could not load image from URL: \(imageURL)")
43 | }
44 | 
45 | var imageEmbedding: Embedding = try imageModel.encode(cgImage)
46 | var imageVector: [Float32] = imageEmbedding.asFloats()
47 | ```
48 | 
49 | ### Choosing Target Device
50 | 
51 | Apple chips provide several functional units capable of high-throughput matrix multiplication and AI inference.
52 | Those `computeUnits` include the CPU, GPU, and Neural Engine.
53 | For maximum compatibility, the `.all` option is used by default.
54 | Sadly, Apple's scheduler is not always optimal, so it can be beneficial to specify the target device explicitly, especially if the models are pre-compiled for the Apple Neural Engine, where the gains can be significant.
55 | 
56 | | Model | GPU Text E.
| ANE Text E. | GPU Image E. | ANE Image E. |
57 | | :------------------ | ----------: | ----------: | -----------: | -----------: |
58 | | `english-small` | 2.53 ms | 0.53 ms | 6.57 ms | 1.23 ms |
59 | | `english-base` | 2.54 ms | 0.61 ms | 18.90 ms | 3.79 ms |
60 | | `english-large` | 2.30 ms | 0.61 ms | 79.68 ms | 20.94 ms |
61 | | `multilingual-base` | 2.34 ms | 0.50 ms | 18.98 ms | 3.77 ms |
62 | 
63 | > On an Apple M4 iPad, running iOS 18.2.
64 | > Batch size is 1, and the model is pre-loaded into memory.
65 | > The original encoders use `f32` single-precision numbers for maximum compatibility, and mostly rely on the __GPU__ for computation.
66 | > The quantized encoders use a mixture of `i8`, `f16`, and `f32` numbers for maximum performance, and mostly rely on the Apple Neural Engine (__ANE__) for computation.
67 | > The median latency is reported.
68 | 
69 | ### Computing Distances
70 | 
71 | Once you have the embeddings, there are several ways to compute distances between them.
72 | Naive Swift code might look like this:
73 | 
74 | ```swift
75 | func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
76 |     let dotProduct = zip(a, b).map(*).reduce(0, +)
77 |     let normA = sqrt(a.map { $0 * $0 }.reduce(0, +))
78 |     let normB = sqrt(b.map { $0 * $0 }.reduce(0, +))
79 |     return dotProduct / (normA * normB)
80 | }
81 | ```
82 | 
83 | A faster way to compute distances is to use the Accelerate framework:
84 | 
85 | ```swift
86 | import Accelerate
87 | 
88 | func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
89 |     var result: Float32 = 0
90 |     var aNorm: Float32 = 0
91 |     var bNorm: Float32 = 0
92 |     vDSP_dotpr(a, 1, b, 1, &result, vDSP_Length(a.count))
93 |     vDSP_svesq(a, 1, &aNorm, vDSP_Length(a.count))
94 |     vDSP_svesq(b, 1, &bNorm, vDSP_Length(b.count))
95 |     return result / sqrt(aNorm * bNorm)
96 | }
97 | ```
98 | 
99 | An even faster approach is to use USearch or SimSIMD, which work not only with `Float32` and `Float64`, but also with `Float16`, `Int8`, and binary embeddings.
100 | 
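If you L2-normalize every embedding once, up front, cosine similarity reduces to a plain dot product, which is cheaper when scoring one query against many stored vectors. Below is a minimal sketch of that idea using the same Accelerate routines; the helper names are illustrative and not part of the UForm API:

```swift
import Accelerate

/// Scales a vector to unit L2 norm; returns it unchanged if the norm is zero.
func normalized(_ vector: [Float32]) -> [Float32] {
    var squaredSum: Float32 = 0
    vDSP_svesq(vector, 1, &squaredSum, vDSP_Length(vector.count))
    guard squaredSum > 0 else { return vector }
    var norm = squaredSum.squareRoot()
    var result = [Float32](repeating: 0, count: vector.count)
    vDSP_vsdiv(vector, 1, &norm, &result, 1, vDSP_Length(vector.count))
    return result
}

/// For unit-length vectors, the dot product equals the cosine similarity.
func dotProduct(_ a: [Float32], _ b: [Float32]) -> Float32 {
    var result: Float32 = 0
    vDSP_dotpr(a, 1, b, 1, &result, vDSP_Length(a.count))
    return result
}
```

Normalizing at indexing time moves the square roots out of the query loop, so each later comparison costs a single `vDSP_dotpr` call.

--------------------------------------------------------------------------------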