├── .github
│   └── workflows
│       ├── package.json
│       ├── prerelease.yml
│       └── release.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .swift-format
├── .vscode
│   ├── launch.json
│   ├── settings.json
│   └── tasks.json
├── BENCHMARKS.md
├── CITATION.cff
├── CONTRIBUTING.md
├── LICENSE
├── Package.resolved
├── Package.swift
├── README.md
├── VERSION
├── assets
│   ├── model_types_bg.png
│   └── unum.png
├── docs
│   ├── Makefile
│   ├── _static
│   │   ├── custom.css
│   │   └── custom.js
│   ├── benchmarks.rst
│   ├── conf.py
│   ├── contributing.rst
│   ├── index.rst
│   ├── javascript
│   │   ├── index.rst
│   │   └── reference.rst.txt
│   ├── python
│   │   ├── index.rst
│   │   └── reference.rst
│   └── swift
│       └── index.rst
├── javascript
│   ├── README.md
│   ├── encoders.mjs
│   ├── encoders_test.js
│   ├── hub.mjs
│   └── index.mjs
├── package-lock.json
├── package.json
├── pyproject.toml
├── python
│   ├── README.md
│   ├── scripts
│   │   ├── bench_decoders.py
│   │   ├── bench_encoders.py
│   │   ├── export_decoders.ipynb
│   │   ├── export_encoders.ipynb
│   │   ├── test_decoders.py
│   │   └── test_encoders.py
│   └── uform
│       ├── __init__.py
│       ├── chat.py
│       ├── gen_model.py
│       ├── numpy_processors.py
│       ├── onnx_encoders.py
│       ├── shared.py
│       ├── torch_decoders.py
│       ├── torch_encoders.py
│       └── torch_processors.py
├── swift
│   ├── Encoders.swift
│   ├── EncodersTests.swift
│   └── README.md
└── yarn.lock
/.github/workflows/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "devDependencies": {
3 | "@semantic-release/exec": "github:semantic-release/exec",
4 | "@semantic-release/git": "^10.0.1",
5 | "conventional-changelog-eslint": "^3.0.9",
6 | "semantic-release": "^20.1.3"
7 | },
8 | "release": {
9 | "branches": [
10 | "main"
11 | ],
12 | "debug": true,
13 | "ci": true,
14 | "dryRun": false,
15 | "plugins": [
16 | [
17 | "@semantic-release/commit-analyzer",
18 | {
19 | "preset": "eslint",
20 | "releaseRules": [
21 | {
22 | "tag": "Add",
23 | "release": "minor"
24 | },
25 | {
26 | "tag": "Break",
27 | "release": "major"
28 | },
29 | {
30 | "tag": "Improve",
31 | "release": "patch"
32 | },
33 | {
34 | "tag": "Make",
35 | "release": "patch"
36 | },
37 | {
38 | "tag": "Refactor",
39 | "release": false
40 | }
41 | ]
42 | }
43 | ],
44 | [
45 | "@semantic-release/release-notes-generator",
46 | {
47 | "preset": "eslint",
48 | "releaseRules": [
49 | {
50 | "tag": "Add",
51 | "release": "minor"
52 | },
53 | {
54 | "tag": "Break",
55 | "release": "major"
56 | },
57 | {
58 | "tag": "Improve",
59 | "release": "patch"
60 | },
61 | {
62 | "tag": "Make",
63 | "release": "patch"
64 | },
65 | {
66 | "tag": "Refactor",
67 | "release": false
68 | }
69 | ]
70 | }
71 | ],
72 | "@semantic-release/github",
73 | [
74 | "@semantic-release/exec",
75 | {
76 | "prepareCmd": "sed -i 's/version = \".*\"/version = \"${nextRelease.version}\"/' pyproject.toml"
77 | }
78 | ],
79 | [
80 | "@semantic-release/git",
81 | {
82 | "assets": [
83 | "pyproject.toml"
84 | ],
85 | "message": "Build: Released ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
86 | }
87 | ]
88 | ]
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/.github/workflows/prerelease.yml:
--------------------------------------------------------------------------------
1 | name: Pre-Release
2 |
3 | on:
4 | push:
5 | branches: ["main-dev"]
6 | pull_request:
7 | branches: ["main-dev"]
8 |
9 | env:
10 | BUILD_TYPE: Release
11 | GH_TOKEN: ${{ secrets.SEMANTIC_RELEASE_TOKEN }}
12 | PYTHONUTF8: 1
13 |
14 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
15 | permissions:
16 | contents: read
17 |
18 | jobs:
19 | versioning:
20 | name: Update Version
21 | runs-on: ubuntu-latest
22 | steps:
23 | - name: Checkout
24 | uses: actions/checkout@v4
25 | with:
26 | fetch-depth: 0
27 | persist-credentials: false
28 | - name: Run TinySemVer
29 | uses: ashvardanian/tinysemver@v2.0.7
30 | with:
31 | verbose: "true"
32 | version-file: "VERSION"
33 | update-version-in: |
34 | package.json:"version": "(\d+\.\d+\.\d+)"
35 | package-lock.json:"uform",\n\s+"version": "(\d+\.\d+\.\d+)"
36 | CITATION.cff:^version: (\d+\.\d+\.\d+)
37 | pyproject.toml:^version = "(\d+\.\d+\.\d+)"
38 | dry-run: "true"
39 |
40 | test_python:
41 | name: Test Python
42 | runs-on: ubuntu-latest
43 |
44 | steps:
45 | - uses: actions/checkout@v4
46 | - name: Set up Python
47 | uses: actions/setup-python@v5
48 | with:
49 | python-version: "3.11"
50 |
51 | - name: Install dependencies
52 | run: |
53 | python -m pip install --no-cache-dir --upgrade pip
54 | pip install -e ".[onnx]"
55 | pip install pytest
56 |
57 | # When running tests in CI, limit ourselves to the small model tests
58 | - name: Test with PyTest
59 | run: pytest python/scripts/ -s -x -Wd -v -k small
60 |
61 | test_javascript:
62 | name: Test JavaScript
63 | runs-on: ubuntu-latest
64 |
65 | steps:
66 | - uses: actions/checkout@v4
67 | - name: Set up Node.js
68 | uses: actions/setup-node@v4
69 | with:
70 | node-version: 20
71 |
72 | # TODO: JavaScript tests pass, but ONNX throws a memory error on exit
73 | # - name: Build JavaScript
74 | # run: npm ci
75 | # - name: Test JavaScript
76 | # run: npm test
77 |
78 | test_swift:
79 | name: Test Swift
80 | runs-on: macos-14
81 |
82 | steps:
83 | - uses: actions/checkout@v4
84 | - name: Build
85 | run: swift build
86 | - name: Run tests
87 | run: swift test
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 | push:
5 | branches: ["main"]
6 |
7 | env:
8 | GH_TOKEN: ${{ secrets.SEMANTIC_RELEASE_TOKEN }}
9 |
10 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
11 | permissions:
12 | contents: read
13 | pages: write
14 | id-token: write
15 |
16 | jobs:
17 | versioning:
18 | name: Update Version
19 | runs-on: ubuntu-latest
20 | steps:
21 | - name: Checkout
22 | uses: actions/checkout@v4
23 | with:
24 | fetch-depth: 0
25 | persist-credentials: false
26 | - name: Run TinySemVer
27 | uses: ashvardanian/tinysemver@v2.0.7
28 | with:
29 | verbose: "true"
30 | version-file: "VERSION"
31 | update-version-in: |
32 | package.json:"version": "(\d+\.\d+\.\d+)"
33 | package-lock.json:"uform",\n\s+"version": "(\d+\.\d+\.\d+)"
34 | CITATION.cff:^version: (\d+\.\d+\.\d+)
35 | pyproject.toml:^version = "(\d+\.\d+\.\d+)"
36 | dry-run: "false"
37 | push: "true"
38 | create-release: "true"
39 | github-token: ${{ secrets.SEMANTIC_RELEASE_TOKEN }}
40 |
41 | rebase:
42 | name: Rebase Dev. Branch
43 | needs: versioning
44 | runs-on: ubuntu-latest
45 | steps:
46 | - name: Checkout the latest code
47 | uses: actions/checkout@v4
48 | with:
49 | fetch-depth: 0
50 |
51 | - name: Perform rebase
52 | run: |
53 | git fetch origin main
54 | git checkout main-dev
55 | git rebase origin/main
56 |
57 | - name: Push changes
58 | uses: CasperWA/push-protected@v2
59 | with:
60 | token: ${{ secrets.SEMANTIC_RELEASE_TOKEN }}
61 | branch: main-dev
62 | unprotect_reviews: True
63 | force: True
64 |
65 | test_python:
66 | name: Run Tests
67 | runs-on: ubuntu-latest
68 | needs: versioning
69 | steps:
70 | - uses: actions/checkout@v4
71 | with:
72 | ref: "main"
73 |
74 | - name: Set up Python
75 | uses: actions/setup-python@v5
76 | with:
77 | python-version: "3.11"
78 |
79 | - name: Install dependencies
80 | run: |
81 | python -m pip install --upgrade pip
82 | pip install -e ".[onnx]"
83 | pip install pytest
84 |
85 | - name: Run PyTest
86 | run: pytest python/scripts/
87 |
88 | publish_python:
89 | name: Publish Python
90 | runs-on: ubuntu-latest
91 | needs: [versioning, test_python]
92 |
93 | steps:
94 | - uses: actions/checkout@v4
95 | with:
96 | ref: "main"
97 | - name: Set up Python
98 | uses: actions/setup-python@v5
99 | with:
100 | python-version: "3.11"
101 |
102 | - name: Install dependencies
103 | run: |
104 | python -m pip install --upgrade pip
105 | pip install build
106 |
107 | - name: Build package
108 | run: python -m build
109 |
110 | - name: Publish to PyPi
111 | uses: pypa/gh-action-pypi-publish@release/v1
112 | with:
113 | verbose: true
114 | print-hash: true
115 |
116 | publish_javascript:
117 | name: Publish JavaScript
118 | needs: versioning
119 | runs-on: ubuntu-22.04
120 |
121 | steps:
122 | - uses: actions/checkout@v4
123 | with:
124 | ref: "main"
125 |
126 | - name: Set up Node.js
127 | uses: actions/setup-node@v4
128 | with:
129 | node-version: 20
130 |
131 | # TODO: JavaScript tests pass, but ONNX throws a memory error on exit
132 | # - name: Build and Test
133 | # run: |
134 | # npm ci
135 | # npm test
136 |
137 | - name: Publish
138 | uses: JS-DevTools/npm-publish@v2
139 | with:
140 | token: ${{ secrets.NPM_TOKEN }}
141 |
142 | deploy_docs:
143 | name: Deploy Docs
144 | environment:
145 | name: github-pages
146 | url: ${{ steps.deployment.outputs.page_url }}
147 | runs-on: ubuntu-22.04
148 | needs: [publish_python, publish_javascript]
149 | steps:
150 | - name: Checkout
151 | uses: actions/checkout@v4
152 | with:
153 | ref: "main"
154 | - name: Install dependencies
155 | run: |
156 | sudo apt update &&
157 | sudo apt install -y doxygen graphviz dia git &&
158 | pip install sphinx==5.3.0 sphinx-js==3.2.1 breathe==4.35.0 furo==2023.3.27 m2r2==0.3.3.post2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery==4.1 &&
159 | npm install -g jsdoc
160 | - name: Setup GitHub Pages
161 | uses: actions/configure-pages@v2
162 | - name: Install UForm from PyPi
163 | run: pip install uform
164 | - name: Build documentation
165 | run: cd docs && make html
166 | - name: Copy assets
167 | run: cp -r assets build/docs/html/
168 | - name: Upload artifacts
169 | uses: actions/upload-pages-artifact@v1
170 | with:
171 | # Upload entire repository
172 | path: "./build/docs/html/"
173 | - name: Deploy to GitHub Pages
174 | id: deployment
175 | uses: actions/deploy-pages@v1
176 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /requirements.txt
2 | /dist/
3 | /test
4 | /build/
5 | yarn.lock
6 | *.egg-info
7 | __pycache__
8 | .build
9 | .swiftpm
10 | .hf_token
11 |
12 | dictionary*
13 | vocab*
14 | /models/
15 |
16 | # Tensors & ML Model
17 | *.onnx
18 | *.pt
19 | *.safetensors
20 | *.mlpackage
21 |
22 | # NodeJS
23 | node_modules
24 | node_build
25 | yarn-error.log
26 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | ci:
2 | autofix_commit_msg: "chore(pre-commit): autofix run"
3 | autoupdate_commit_msg: "chore(pre-commit): autoupdate hooks"
4 |
5 | default_install_hook_types:
6 | - pre-commit
7 |
8 | repos:
9 | - repo: https://github.com/pre-commit/pre-commit-hooks
10 | rev: v4.5.0
11 | hooks:
12 | - id: check-toml
13 | - id: check-yaml
14 | - id: debug-statements
15 | - id: end-of-file-fixer
16 | - id: name-tests-test
17 | - id: trailing-whitespace
18 | - repo: https://github.com/pappasam/toml-sort
19 | rev: v0.23.1
20 | hooks:
21 | - id: toml-sort-fix
22 | - repo: https://github.com/asottile/add-trailing-comma
23 | rev: v3.1.0
24 | hooks:
25 | - id: add-trailing-comma
26 | - repo: https://github.com/astral-sh/ruff-pre-commit
27 | rev: v0.1.11
28 | hooks:
29 | # Run the linter
30 | - id: ruff
31 | args: [--fix]
32 | # Run the formatter
33 | - id: ruff-format
34 |
--------------------------------------------------------------------------------
/.swift-format:
--------------------------------------------------------------------------------
1 | {
2 | "version": 1,
3 | "lineLength": 120,
4 | "indentation": {
5 | "spaces": 4
6 | },
7 | "maximumBlankLines": 1,
8 | "respectsExistingLineBreaks": true,
9 | "lineBreakBeforeControlFlowKeywords": true,
10 | "lineBreakBeforeEachArgument": true,
11 | "multiElementCollectionTrailingCommas": true,
12 | "spacesAroundRangeFormationOperators": true
13 | }
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | // Use IntelliSense to learn about possible attributes.
3 | // Hover to view descriptions of existing attributes.
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5 | "version": "0.2.0",
6 | "configurations": [
7 | {
8 | "name": "Python Debugger",
9 | "type": "debugpy",
10 | "request": "launch",
11 | "program": "${file}",
12 | "console": "integratedTerminal",
13 | },
14 | {
15 | "name": "PyTest Debugger",
16 | "type": "debugpy",
17 | "request": "launch",
18 | "program": "pytest",
19 | "console": "integratedTerminal",
20 | "args": [
21 | "${file}",
22 | "-s",
23 | "-x",
24 | ],
25 | },
26 | {
27 | "name": "NodeJS Debugger",
28 | "type": "node-terminal",
29 | "request": "launch",
30 | "command": "npm run test",
31 | }
32 | ]
33 | }
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "cSpell.words": [
3 | "arange",
4 | "ashvardanian",
5 | "astype",
6 | "CFURL",
7 | "coreml",
8 | "crossattn",
9 | "cumsum",
10 | "dtype",
11 | "embs",
12 | "finfo",
13 | "huggingface",
14 | "keepdim",
15 | "linalg",
16 | "logits",
17 | "Matryoshka",
18 | "mlmodel",
19 | "mlpackage",
20 | "mlprogram",
21 | "multimodal",
22 | "ndarray",
23 | "numpy",
24 | "ONNX",
25 | "onnxconverter",
26 | "onnxruntime",
27 | "opset",
28 | "packbits",
29 | "preprocess",
30 | "pretrained",
31 | "probs",
32 | "pypi",
33 | "pytest",
34 | "randn",
35 | "rerank",
36 | "reranker",
37 | "reranking",
38 | "sandbeach",
39 | "sess",
40 | "SIMD",
41 | "softmax",
42 | "Tensorrt",
43 | "torchvision",
44 | "transfromers",
45 | "uform",
46 | "unimodal",
47 | "unsqueeze",
48 | "Vardanian",
49 | "whitespaces"
50 | ],
51 | "[python]": {
52 | "editor.defaultFormatter": "ms-python.black-formatter"
53 | },
54 | "python.formatting.provider": "none",
55 | "window.autoDetectColorScheme": true,
56 | "workbench.colorTheme": "Default Dark+",
57 | "workbench.preferredDarkColorTheme": "Default Dark+"
58 | }
--------------------------------------------------------------------------------
/.vscode/tasks.json:
--------------------------------------------------------------------------------
1 | {
2 | // See https://go.microsoft.com/fwlink/?LinkId=733558
3 | // for the documentation about the tasks.json format
4 | "version": "2.0.0",
5 | "tasks": [
6 | {
7 | "label": "Publish",
8 | "type": "shell",
9 | "command": "python -m pip install build twine && python -m build && twine check dist/* && twine upload dist/*"
10 | }
11 | ]
12 | }
13 |
--------------------------------------------------------------------------------
/BENCHMARKS.md:
--------------------------------------------------------------------------------
1 | # UForm Model Benchmarks
2 |
3 | ## Accuracy
4 |
5 | ### Embedding Models
6 |
7 | Few retrieval benchmarks exist for multimodal embeddings.
8 | The most famous ones for English are "MS-COCO" and "Flickr30k".
9 | Evaluating `uform-vl-english` model, one can expect the following numbers for search quality.
10 |
11 | | Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 |
12 | | :-------- | ---------: | ---------: | ----------: |
13 | | Flickr | 0.727 | 0.915 | 0.949 |
14 | | MS-COCO ¹ | 0.510 | 0.761 | 0.838 |
15 |
16 | For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository².
17 | Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model.
18 |
19 | | Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers |
20 | | :-------- | -----------: | --------: | -----------: | --------: | ------------: | ---------: | -------: |
21 | | English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M |
22 | | Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M |
23 | | Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M |
24 | | Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M |
25 | | Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M |
26 | | French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M |
27 |
28 |
29 | All languages:
30 |
31 | | Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers |
32 | | :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: |
33 | | Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M |
34 | | Armenian 🇦🇲 | 5.6 | __22.0__ | 14.3 | __44.7__ | 20.2 | __56.0__ | 4 M |
35 | | Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M |
36 | | English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M |
37 | | French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M |
38 | | German 🇩🇪 | 31.7 | __35.1__ | 56.9 | __62.2__ | 67.4 | __73.3__ | 134 M |
39 | | Hebrew 🇮🇱 | 23.7 | __26.7__ | 46.3 | __51.8__ | 57.0 | __63.5__ | 9 M |
40 | | Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M |
41 | | Indonesian 🇮🇩 | 26.9 | __30.7__ | 51.4 | __57.0__ | 62.7 | __68.6__ | 199 M |
42 | | Italian 🇮🇹 | 31.3 | __34.9__ | 56.7 | __62.1__ | 67.1 | __73.1__ | 67 M |
43 | | Japanese 🇯🇵 | 27.4 | __32.6__ | 51.5 | __59.2__ | 62.6 | __70.6__ | 125 M |
44 | | Korean 🇰🇷 | 24.4 | __31.5__ | 48.1 | __57.8__ | 59.2 | __69.2__ | 81 M |
45 | | Persian 🇮🇷 | 24.0 | __28.8__ | 47.0 | __54.6__ | 57.8 | __66.2__ | 77 M |
46 | | Polish 🇵🇱 | 29.2 | __33.6__ | 53.9 | __60.1__ | 64.7 | __71.3__ | 41 M |
47 | | Portuguese 🇵🇹 | 31.6 | __32.7__ | 57.1 | __59.6__ | 67.9 | __71.0__ | 257 M |
48 | | Russian 🇷🇺 | 29.9 | __33.9__ | 54.8 | __60.9__ | 65.8 | __72.0__ | 258 M |
49 | | Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M |
50 | | Thai 🇹🇭 | 21.5 | __28.7__ | 43.0 | __54.6__ | 53.7 | __66.0__ | 61 M |
51 | | Turkish 🇹🇷 | 25.5 | __33.0__ | 49.1 | __59.6__ | 60.3 | __70.8__ | 88 M |
52 | | Ukranian 🇺🇦 | 26.0 | __30.6__ | 49.9 | __56.7__ | 60.9 | __68.1__ | 41 M |
53 | | Vietnamese 🇻🇳 | 25.4 | __28.3__ | 49.2 | __53.9__ | 60.3 | __65.5__ | 85 M |
54 | | | | | | | | | |
55 | | Mean | 26.5±6.4 | __31.8±3.5__ | 49.8±9.8 | __58.1±4.5__ | 60.4±10.6 | __69.4±4.3__ | - |
56 | | Google Translate | 27.4±6.3 | __31.5±3.5__ | 51.1±9.5 | __57.8±4.4__ | 61.7±10.3 | __69.1±4.3__ | - |
57 | | Microsoft Translator | 27.2±6.4 | __31.4±3.6__ | 50.8±9.8 | __57.7±4.7__ | 61.4±10.6 | __68.9±4.6__ | - |
58 | | Meta NLLB | 24.9±6.7 | __32.4±3.5__ | 47.5±10.3 | __58.9±4.5__ | 58.2±11.2 | __70.2±4.3__ | - |
59 |
60 | ### Generative Models
61 |
62 | | Model | LLM Size | SQA | MME | MMBench | Average¹ |
63 | | :------------------- | -------: | ---: | -----: | ------: | -------: |
64 | | UForm-Gen2-Qwen-500m | 0.5B | 45.5 | 880.1 | 42.0 | 29.31 |
65 | | MobileVLM v2 | 1.4B | 52.1 | 1302.8 | 57.7 | 36.81 |
66 | | LLaVA-Phi | 2.7B | 68.4 | 1335.1 | 59.8 | 42.95 |
67 |
68 | For captioning evaluation we measure CLIPScore and RefCLIPScore³.
69 |
70 | | Model | Size | Caption Length | CLIPScore | RefCLIPScore |
71 | | :---------------------------------- | ---: | -------------: | --------: | -----------: |
72 | | `llava-hf/llava-1.5-7b-hf` | 7B | Long | 0.878 | 0.529 |
73 | | `llava-hf/llava-1.5-7b-hf` | 7B | Short | 0.886 | 0.531 |
74 | | | | | | |
75 | | `Salesforce/instructblip-vicuna-7b` | 7B | Long | 0.902 | 0.534 |
76 | | `Salesforce/instructblip-vicuna-7b` | 7B | Short | 0.848 | 0.523 |
77 | | | | | | |
78 | | `unum-cloud/uform-gen` | 1.5B | Long | 0.847 | 0.523 |
79 | | `unum-cloud/uform-gen` | 1.5B | Short | 0.842 | 0.522 |
80 | | | | | | |
81 | | `unum-cloud/uform-gen-chat` | 1.5B | Long | 0.860 | 0.525 |
82 | | `unum-cloud/uform-gen-chat` | 1.5B | Short | 0.858 | 0.525 |
83 |
84 | Results for VQAv2 evaluation.
85 |
86 | | Model | Size | Accuracy |
87 | | :------------------------- | ---: | -------: |
88 | | `llava-hf/llava-1.5-7b-hf` | 7B | 78.5 |
89 | | `unum-cloud/uform-gen` | 1.5B | 66.5 |
90 |
91 |
92 |
93 | > ¹ Train split was in training data.
94 | > ² Lacking a broad enough evaluation dataset, we translated the [COCO Karpathy test split](https://www.kaggle.com/datasets/shtvkumar/karpathy-splits) with multiple public and proprietary translation services, averaging the scores across all sets, and breaking them down in the bottom section.
95 | > ³ We used `apple/DFN5B-CLIP-ViT-H-14-378` CLIP model.
96 |
97 | ## Speed
98 |
99 | ### Embedding Models
100 |
101 | UForm comes pre-packaged with speed benchmarks for the models.
102 |
103 | ```bash
104 | $ python python/scripts/bench_encoders.py --help
105 | usage: bench_encoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE]
106 |
107 | options:
108 | -h, --help show this help message and exit
109 | --filter-out FILTER_OUT
110 | Filter out models, backends, or devices with a Regular Expression.
111 | --batch-size BATCH_SIZE
112 | Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.
113 | ```
114 |
115 | Running that script for a fairly small batch size of 50 on an Nvidia H100 GPU and
116 |
117 | | Model Name | Device | Backend | Images Preprocessed/s | Images Encoded/s | Texts Preprocessed/s | Texts Encoded/s |
118 | | :--------------------------------------------- | :----- | :------ | --------------------: | :--------------- | :------------------- | :-------------- |
119 | | unum-cloud/uform3-image-text-english-base | cpu | torch | 23.03 | 76.57 | 15,978.03 | 562.28 |
120 | | unum-cloud/uform3-image-text-english-base | cpu | onnx | 23.11 | 77.75 | 13,880.27 | 1,067.40 |
121 | | unum-cloud/uform3-image-text-english-base | cuda | torch | 22.87 | 1,060.40 | 12,348.94 | 13,242.83 |
122 | | unum-cloud/uform3-image-text-english-large | cpu | torch | 22.41 | 10.84 | 13,350.45 | 145.12 |
123 | | unum-cloud/uform3-image-text-english-large | cpu | onnx | 23.13 | 19.60 | 18,031.85 | 960.09 |
124 | | unum-cloud/uform3-image-text-english-large | cuda | torch | 22.78 | 244.86 | 13,226.40 | 10,204.04 |
125 | | unum-cloud/uform3-image-text-english-small | cpu | torch | 20.08 | 71.68 | 12,147.05 | 249.63 |
126 | | unum-cloud/uform3-image-text-english-small | cpu | onnx | 22.84 | 195.27 | 13,636.99 | 1,385.25 |
127 | | unum-cloud/uform3-image-text-english-small | cuda | torch | 22.63 | 2,662.16 | 14,731.18 | 14,694.87 |
128 | | unum-cloud/uform3-image-text-multilingual-base | cpu | torch | 22.98 | 64.28 | 10,129.27 | 209.76 |
129 | | unum-cloud/uform3-image-text-multilingual-base | cpu | onnx | 23.06 | 66.81 | 8,963.13 | 1,104.32 |
130 | | unum-cloud/uform3-image-text-multilingual-base | cuda | torch | 22.88 | 1,051.95 | 15,639.72 | 12,416.12 |
131 |
132 | If you are interested in performance numbers on consumer grade hardware, compared to third-party models, here are some rough estimates.
133 | On Nvidia RTX 3090:
134 |
135 | | Model | Multilingual | Speed | Speedup |
136 | | :----------------------------------------------- | -----------: | ---------------------: | ---------: |
137 | | `bert-base-uncased` | No | 1'612 sequences/second | |
138 | | `distilbert-base-uncased` | No | 3'174 sequences/second | x 1.96 |
139 | | `sentence-transformers/all-MiniLM-L12-v2` | __Yes__ | 3'604 sequences/second | x 2.24 |
140 | | `unum-cloud/uform3-image-text-multilingual-base` | __Yes__ | 6'809 sequences/second | __x 4.22__ |
141 |
142 | Given the small size of the model it also work well on mobile devices.
143 | On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards.
144 |
145 | | Device | Speed | Device TDP | Efficiency |
146 | | :--------------------- | ------------------: | ---------: | ----------------: |
147 | | Nvidia RTX 3090 | ~ 140 tokens/second | < 350W | 0.40 tokens/joule |
148 | | Apple M2 Pro unplugged | ~ 19 tokens/second | < 20W | 0.95 tokens/joule |
149 | | Apple M2 Max unplugged | ~ 38 tokens/second | < 36W | 1.06 tokens/joule |
150 | | Apple M2 Max plugged | ~ 56 tokens/second | < 89W | 0.63 tokens/joule |
151 |
152 | ### Generative Models
153 |
154 | ```bash
155 | $ python python/scripts/bench_decoders.py --help
156 | usage: bench_decoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE]
157 |
158 | options:
159 | -h, --help show this help message and exit
160 | --batch-size BATCH_SIZE
161 | Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.
162 | --max-length MAX_LENGTH
163 | Maximum length of the generated text in tokens.
164 | ```
165 |
166 | On Nvidia H100 GPU, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
167 |
168 | | Model | Size | Decoding Speed | Decoding Parallel Streams |
169 | | :---------------------------------- | ----: | -------------: | ---------------------------: |
170 | | `llava-hf/llava-1.5-7b-hf` | 7 B | ~ 141 tokens/s | ~ 4 K tokens/s (32 streams) |
171 | | `Salesforce/instructblip-vicuna-7b` | 7 B | ~ 211 tokens/s | ~ 2 K tokens/s (32 streams) |
172 | | `unum-cloud/uform-gen` | 1.5 B | ~ 252 tokens/s | ~ 3 K tokens/s (128 streams) |
173 | | `unum-cloud/uform-gen2-dpo` | 1.2 B | ~ 372 tokens/s | ~ 10 K tokens/s (64 streams) |
174 |
175 | On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
176 |
177 | | Model | Size | Decoding Speed | Speedup |
178 | | :---------------------------------- | ----: | -------------: | --------: |
179 | | `llava-hf/llava-1.5-7b-hf` | 7 B | ~ 40 tokens/s | |
180 | | `Salesforce/instructblip-vicuna-7b` | 7 B | ~ 40 tokens/s | |
181 | | `unum-cloud/uform-gen` | 1.5 B | ~ 140 tokens/s | __x 3.5__ |
182 |
183 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite it as below."
3 | authors:
4 | - family-names: "Kim"
5 | given-names: "Mikhail"
6 | orcid: "https://orcid.org/0009-0003-8413-3221"
7 | - family-names: "Orshulevich"
8 | given-names: "Vladimir"
9 | orcid: "https://orcid.org/0009-0007-8961-6969"
10 | - family-names: "Vardanian"
11 | given-names: "Ash"
12 | orcid: "https://orcid.org/0000-0002-4882-1815"
13 | title: "UForm by Unum Cloud"
14 | version: 3.1.1
15 | keywords:
16 | - "text-to-image retrieval"
17 | - "multimodal"
18 | - "visual-language pre-training"
19 | doi: 10.5281/zenodo.7951497
20 | date-released: 2023-01-03
21 | url: "https://github.com/unum-cloud/uform"
22 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to UForm
2 |
3 | We welcome contributions to UForm!
4 |
5 | ## Python
6 |
7 | Before submitting any changes, please make sure that the tests pass.
8 |
9 | ```sh
10 | pip install -e ".[dev]" # For development dependencies
11 | pip install -e ".[torch]" # For PyTorch
12 | pip install -e ".[onnx]" # For ONNX on CPU
13 | pip install -e ".[onnx-gpu]" # For ONNX on GPU, available for some platforms
14 | pip install -e ".[torch,onnx,onnx-gpu,dev]" # For all
15 |
16 | pytest python/scripts/ -s -x -Wd -v
17 | pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loading Torch
18 | ```
19 |
20 | ## Swift
21 |
22 | To build and test the Swift package, use the following command:
23 |
24 | ```bash
25 | swift build
26 | swift test
27 | ```
28 |
29 | Swift formatting is enforced with `swift-format` default utility from Apple.
30 | To install and run it on all the files in the project, use the following command:
31 |
32 | ```bash
33 | brew install swift-format
34 | swift-format . -i -r
35 | ```
36 |
37 | The style is controlled by the `.swift-format` JSON file in the root of the repository.
38 | As there is no standard for Swift formatting, even Apple's own `swift-format` tool and Xcode differ in their formatting rules, and available settings.
39 |
40 | ## JavaScript
41 |
42 | For rapid development you can avoid the TypeScript precompilation step:
43 |
44 | ```sh
45 | npm install -g ts-node
46 | ts-node javascript/embeddings.mts
47 | ```
48 |
49 | Before submitting any changes, please make sure that the tests pass.
50 |
51 | ```sh
52 | npm install
53 | npm test
54 | ```
55 |
56 | ## Benchmarking
57 |
58 | If you want to double check, how fast the model may work on your hardware, you can clone the library and repeat the benchmarks locally.
59 | The following benchmark will exclude PyTorch backend, CUDA-capable devices, and all the `-base` and `-large` models, running only the ONNX benchmarks on the CPU.
60 |
61 | ```sh
62 | git clone https://github.com/unum-cloud/uform --depth 1 # Clone the repository
63 | cd uform && pip install -e ".[torch,onnx,onnx-gpu,dev]" # Install all dependencies
64 | python python/scripts/bench_encoders.py --filter-out "torch|cuda|base|large"
65 | ```
66 |
67 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/Package.resolved:
--------------------------------------------------------------------------------
1 | {
2 | "pins" : [
3 | {
4 | "identity" : "swift-argument-parser",
5 | "kind" : "remoteSourceControl",
6 | "location" : "https://github.com/apple/swift-argument-parser.git",
7 | "state" : {
8 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41",
9 | "version" : "1.3.0"
10 | }
11 | },
12 | {
13 | "identity" : "swift-transformers",
14 | "kind" : "remoteSourceControl",
15 | "location" : "https://github.com/ashvardanian/swift-transformers",
16 | "state" : {
17 | "revision" : "89fb5d97e1df347f9f588f62fc538dcad6fdb16c"
18 | }
19 | }
20 | ],
21 | "version" : 2
22 | }
23 |
--------------------------------------------------------------------------------
/Package.swift:
--------------------------------------------------------------------------------
1 | // swift-tools-version:5.9
2 | import PackageDescription
3 |
4 | let package = Package(
5 | name: "UForm",
6 | platforms: [
7 | // Linux doesn't have to be explicitly listed
8 | .iOS(.v16), // For iOS 16 and later
9 | .tvOS(.v16), // For tvOS 16 and later
10 | .macOS(.v13), // For macOS 13 (Ventura) and later
11 | .watchOS(.v6), // For watchOS, version 6 and later
12 | ],
13 | products: [
14 | .library(
15 | name: "UForm",
16 | targets: ["UForm"]
17 | )
18 | ],
19 | dependencies: [
20 | .package(
21 | url: "https://github.com/ashvardanian/swift-transformers",
22 | revision: "89fb5d97e1df347f9f588f62fc538dcad6fdb16c"
23 | )
24 | ],
25 | targets: [
26 | .target(
27 | name: "UForm",
28 | dependencies: [
29 | .product(name: "Transformers", package: "swift-transformers")
30 | ],
31 | path: "swift",
32 | exclude: ["EncodersTests.swift"]
33 | ),
34 | .testTarget(
35 | name: "UFormTests",
36 | dependencies: ["UForm"],
37 | path: "swift",
38 | sources: ["EncodersTests.swift"]
39 | ),
40 | ]
41 | )
42 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # UForm
2 |
3 | Pocket-Sized Multimodal AI
4 | For Content Understanding and Generation
5 |
6 | [badge images and links from the original HTML header omitted]
20 |
21 | Multimodal Embeddings from 64 to 768 Dimensions • 1B Parameter Chat
22 |
23 | Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents
24 |
25 | ONNX • CoreML • PyTorch
26 |
27 | Python • JavaScript • Swift
32 |
33 |
34 | ---
35 |
36 | 
37 |
38 | Welcome to UForm, a __multimodal__ AI library that's as versatile as it is efficient.
39 | UForm's [tiny embedding models](#encoder) will help you understand and search visual and textual content across various languages.
40 | UForm's [small generative models](#decoder), on the other hand, not only support conversational and chat use cases, but also excel at fast image captioning and Visual Question Answering (VQA).
41 | With compact __custom pre-trained transformer models__, all of this can run anywhere from your server farm down to your smartphone.
42 |
43 | ## Features
44 |
45 | - __Tiny Embeddings__: 64-dimensional [Matryoshka][matryoshka]-style embeddings for extremely fast [search][usearch].
46 | - __Throughput__: Thanks to the small size, the inference speed is [2-4x faster](#speed) than competitors.
47 | - __Portable__: Models come with native ONNX support, making them easy to deploy on any platform.
48 | - __Quantization Aware__: Down-cast embeddings from `f32` to `i8` without losing much recall.
49 | - __Multilingual__: Trained on a balanced dataset, the models provide great recall across over 20 languages.
50 |
51 | [usearch]: https://github.com/unum-cloud/usearch
52 | [matryoshka]: https://arxiv.org/abs/2205.13147
53 |
54 | ## Models
55 |
56 | For accuracy and speed benchmarks refer to the [evaluation page](https://github.com/unum-cloud/uform/blob/main/BENCHMARKS.md).
57 |
58 | ### Embedding Models
59 |
60 | [HTML table of embedding models stripped during extraction]
96 |
97 | ### Generative Models
98 |
99 | | Model                | Parameters | Purpose                     | Architecture           |
100 | | :------------------- | ---------: | :-------------------------- | :--------------------- |
101 | | uform-gen2-dpo 🆕     | 1.2 B      | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 |
102 | | uform-gen2-qwen-500m | 1.2 B      | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 |
103 | | uform-gen ⚠️          | 1.5 B      | Image Captioning, VQA       | llama-1.3B, ViT-B/16   |
128 |
129 |
130 | ## Quick Start Examples
131 |
132 | ### Embedding Models
133 |
134 | First, `pip install uform`.
135 | Then, load the model:
136 |
137 | ```py
138 | from uform import get_model, Modality
139 |
140 | processors, models = get_model('unum-cloud/uform3-image-text-english-small')
141 |
142 | model_text = models[Modality.TEXT_ENCODER]
143 | model_image = models[Modality.IMAGE_ENCODER]
144 | processor_text = processors[Modality.TEXT_ENCODER]
145 | processor_image = processors[Modality.IMAGE_ENCODER]
146 | ```
147 |
148 | Embed images:
149 |
150 | ```py
151 | import requests
152 | from io import BytesIO
153 | from PIL import Image
154 |
155 | image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg'
156 | image = Image.open(BytesIO(requests.get(image_url).content))
157 | image_data = processor_image(image)
158 | image_features, image_embedding = model_image.encode(image_data, return_features=True)
159 | ```
160 |
161 | Embed queries:
162 |
163 | ```py
164 | text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background'
165 | text_data = processor_text(text)
166 | text_features, text_embedding = model_text.encode(text_data, return_features=True)
167 | ```
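The two embeddings live in the same vector space, so a plain cosine similarity is enough to compare them. The snippet below is a minimal sketch (not part of the original quick start) and assumes both encoders return NumPy-compatible arrays of the same dimensionality.

```python
import numpy as np

def cosine_similarity(a, b) -> float:
    a = np.asarray(a, dtype=np.float32).flatten()
    b = np.asarray(b, dtype=np.float32).flatten()
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

# Higher values mean the caption is a closer match for the image
print(cosine_similarity(image_embedding, text_embedding))
```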
168 |
169 | For more details check out:
170 |
171 | - Python docs on embedding models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#embedding-models)
172 | - JavaScript docs on embedding models in [javascript/README.md](https://github.com/unum-cloud/uform/blob/main/javascript/README.md#embedding-models)
173 | - Swift docs on embedding models in [swift/README.md](https://github.com/unum-cloud/uform/blob/main/swift/README.md#embedding-models)
174 |
175 | ### Generative Models
176 |
177 | The generative models are natively compatible with Hugging Face `transformers`:
178 |
179 | ```python
import torch  # needed for torch.inference_mode() below
from PIL import Image  # needed for Image.open() below
180 | from transformers import AutoModel, AutoProcessor
181 |
182 | model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
183 | processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
184 |
185 | prompt = 'Question or Instruction'
186 | image = Image.open('image.jpg')
187 |
188 | inputs = processor(text=[prompt], images=[image], return_tensors='pt')
189 |
190 | with torch.inference_mode():
191 | output = model.generate(
192 | **inputs,
193 | do_sample=False,
194 | use_cache=True,
195 | max_new_tokens=256,
196 | eos_token_id=151645,
197 | pad_token_id=processor.tokenizer.pad_token_id
198 | )
199 | prompt_len = inputs['input_ids'].shape[1]
200 | decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
201 | ```
202 |
203 | For more details check out:
204 |
205 | - Python docs on generative models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#generative-models)
206 | - JavaScript docs on generative models 🔜
207 | - Swift docs on generative models 🔜
208 |
209 | ## Technical Details
210 |
211 | ### Down-casting, Quantization, Matryoshka, and Slicing
212 |
213 | Depending on the application, the embeddings can be down-casted to smaller numeric representations without losing much recall.
214 | Switching from `f32` to `f16` is recommended in almost all cases, unless you are running on very old hardware without half-precision support.
215 | Switching to `i8` with linear scaling is also possible, but the recall drop will be noticeable on larger collections with millions of searchable entries.
216 | Similarly, for higher-dimensional embeddings (512 or 768), a common strategy is to quantize them into single-bit representations for faster search.
217 |
218 | ```python
219 | import numpy as np
220 |
221 | f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False)
222 | f16_embedding: np.ndarray = f32_embedding.astype(np.float16)
223 | i8_embedding: np.ndarray = (f32_embedding * 127).astype(np.int8)
224 | b1_embedding: np.ndarray = np.packbits((f32_embedding > 0).astype(np.uint8))
225 | ```
226 |
227 | An alternative approach to quantization is to use Matryoshka embeddings, where the embeddings are sliced into smaller parts and the search is performed in a coarse-to-fine hierarchical manner, as sketched right after the slicing example below.
228 |
229 | ```python
230 | import numpy as np
231 |
232 | large_embedding: np.ndarray = model.encode_text(text_data, return_features=False)
233 | small_embedding: np.ndarray = large_embedding[:, :256]
234 | tiny_embedding: np.ndarray = large_embedding[:, :64]
235 | ```
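To make the hierarchical search concrete, below is a hypothetical coarse-to-fine sketch (not from the original README): score every candidate with the cheap 64-dimensional slice first, then rerank a short list with the full-size embeddings. It assumes `corpus` is a 2D NumPy array of row-wise embeddings and `query` is a 1D array.

```python
import numpy as np

def hierarchical_search(query: np.ndarray, corpus: np.ndarray, coarse_dims: int = 64, top_k: int = 10) -> np.ndarray:
    # Stage 1: cosine scores on the truncated Matryoshka slice
    q, c = query[:coarse_dims], corpus[:, :coarse_dims]
    coarse_scores = (c @ q) / (np.linalg.norm(c, axis=1) * np.linalg.norm(q) + 1e-9)
    shortlist = np.argsort(-coarse_scores)[: top_k * 10]
    # Stage 2: rerank the shortlist with the full-size embeddings
    full_scores = (corpus[shortlist] @ query) / (
        np.linalg.norm(corpus[shortlist], axis=1) * np.linalg.norm(query) + 1e-9
    )
    return shortlist[np.argsort(-full_scores)[:top_k]]
```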
236 |
237 | Both approaches are natively supported by the [USearch][github-usearch] vector-search engine and the [SimSIMD][github-simsimd] numerics libraries.
238 | When dealing with small collections (up to millions of entries) and looking for low-latency cosine distance calculations, you can [achieve 5x-2500x performance improvement][report-simsimd] over Torch, NumPy, SciPy, and vanilla Python using SimSIMD.
239 |
240 | ```python
241 | from simsimd import cosine, hamming
242 |
243 | distance: float = cosine(f32_embedding, f32_embedding) # 32x SciPy performance on Apple M2 CPU
244 | distance: float = cosine(f16_embedding, f16_embedding) # 79x SciPy performance on Apple M2 CPU
245 | distance: float = cosine(i8_embedding, i8_embedding) # 133x SciPy performance on Apple M2 CPU
246 | distance: float = hamming(b1_embedding, b1_embedding) # 17x SciPy performance on Apple M2 CPU
247 | ```
248 |
249 | Similarly, when dealing with large collections (up to billions of entries per server) and looking for high-throughput search, you can [achieve 100x performance improvement][report-usearch] over FAISS and other vector-search solutions using USearch.
250 | Here are a couple of examples:
251 |
252 | ```python
253 | from usearch.index import Index
254 |
255 | f32_index = Index(ndim=64, metric='cos', dtype='f32') # for Matryoshka embeddings
256 | f16_index = Index(ndim=64, metric='cos', dtype='f16') # for Matryoshka embeddings
257 | i8_index = Index(ndim=256, metric='cos', dtype='i8') # for quantized embeddings
258 | b1_index = Index(ndim=768, metric='hamming', dtype='b1') # for binary embeddings
259 | ```
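As a usage sketch (not part of the original README), adding and querying vectors with such an index may look like this, assuming `image_embedding` and `text_embedding` are 1D `float32` vectors matching the index dimensionality:

```python
import numpy as np
from usearch.index import Index

index = Index(ndim=256, metric='cos', dtype='f32')
vector = np.asarray(image_embedding, dtype=np.float32).flatten()[:256]
index.add(42, vector)  # 42 is an arbitrary integer key
query = np.asarray(text_embedding, dtype=np.float32).flatten()[:256]
matches = index.search(query, 10)  # up to 10 nearest keys with distances
```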
260 |
261 | [github-usearch]: https://github.com/unum-cloud/usearch
262 | [github-simsimd]: https://github.com/ashvardanian/simsimd
263 | [report-usearch]: https://www.unum.cloud/blog/2023-11-07-scaling-vector-search-with-intel
264 | [report-simsimd]: https://ashvardanian.com/posts/python-c-assembly-comparison/
265 |
266 | ### Compact Packaging
267 |
268 | PyTorch is a heavy dependency to carry, especially if you run on Edge or IoT devices.
269 | Using vanilla ONNX runtime, one can significantly reduce memory consumption and deployment latency.
270 |
271 | ```sh
272 | $ conda create -n uform_torch python=3.10 -y
273 | $ conda create -n uform_onnx python=3.10 -y
274 | $ conda activate uform_torch && pip install -e ".[torch]" && conda deactivate
275 | $ conda activate uform_onnx && pip install -e ".[onnx]" && conda deactivate
276 | $ du -sh $(conda info --envs | grep 'uform_torch' | awk '{print $2}')
277 | > 5.2G ~/conda/envs/uform_torch
278 | $ du -sh $(conda info --envs | grep 'uform_onnx' | awk '{print $2}')
279 | > 461M ~/conda/envs/uform_onnx
280 | ```
281 |
282 | Most of that weight can be further reduced down to 100 MB for both the model and the runtime.
283 | You can pick one of many supported [ONNX execution providers][onnx-providers], which includes XNNPACK, CUDA and TensorRT for Nvidia GPUs, OpenVINO on Intel, DirectML on Windows, ROCm on AMD, CoreML on Apple devices, and more to come.
284 |
285 | [onnx-providers]: https://onnxruntime.ai/docs/execution-providers/
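For illustration, provider selection when driving an exported encoder directly through ONNX Runtime looks roughly like the snippet below; the `image_encoder.onnx` path is a hypothetical placeholder, not a file shipped with the package:

```python
import onnxruntime as ort

# Prefer CUDA when it is available, otherwise fall back to the default CPU provider
session = ort.InferenceSession(
    "image_encoder.onnx",  # hypothetical path to an exported UForm encoder
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
print(session.get_providers())  # shows which providers were actually activated
```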
286 |
287 | ### Multimodal Chat in CLI
288 |
289 | The generative models can be used for chat-like experiences in the command line.
290 | For that, you can use the `uform-chat` CLI tool, which is available in the UForm package.
291 |
292 | ```bash
293 | $ pip install uform
294 | $ uform-chat --model unum-cloud/uform-gen2-dpo --image=zebra.jpg
295 | $ uform-chat --model unum-cloud/uform-gen2-dpo \
296 | > --image="https://bit.ly/3tIVg9M" \
297 | > --device="cuda:0" \
298 | > --fp16
299 | ```
300 |
--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 3.1.1
2 |
--------------------------------------------------------------------------------
/assets/model_types_bg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unum-cloud/uform/33d5df7951cf3bee8b14d1110cc3bbae1ff6fba8/assets/model_types_bg.png
--------------------------------------------------------------------------------
/assets/unum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unum-cloud/uform/33d5df7951cf3bee8b14d1110cc3bbae1ff6fba8/assets/unum.png
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = ../build/docs
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/_static/custom.css:
--------------------------------------------------------------------------------
1 | p.caption {
2 | font-size: 0 !important;
3 | margin: 8px 0px !important;
4 | padding: 0 !important;
5 | border-bottom: 1px solid #8b7f8b12;
6 | }
7 |
8 | article>section>h1:nth-child(1) {
9 | display: none;
10 | }
11 |
12 | .sidebar-brand-text {
13 | cursor: initial;
14 | }
15 |
16 | table>tbody>tr>td {
17 | text-align: center;
18 | }
19 |
20 | table>tbody>tr>td:first-child {
21 | text-align: left;
22 | }
23 |
24 | #overview>p>a>img {
25 | height: 25px !important;
26 | }
27 |
--------------------------------------------------------------------------------
/docs/_static/custom.js:
--------------------------------------------------------------------------------
1 | $(document).ready(function () {
2 | const github_logo = `` // [inline SVG markup stripped during extraction]
5 |
6 | $(".sidebar-brand-text").html("Unum · UForm 2.1.1" + github_logo)
7 | })
8 |
--------------------------------------------------------------------------------
/docs/benchmarks.rst:
--------------------------------------------------------------------------------
1 | ====================
2 | Benchmarks
3 | ====================
4 |
5 | .. mdinclude:: ../BENCHMARKS.md
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # For the full list of built-in configuration values, see the documentation:
4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
5 |
6 | # -- Project information -----------------------------------------------------
7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
8 |
9 | project = "Unum · UForm"
10 | copyright = "2023, Unum"
11 | author = "Unum"
12 | release = open("../VERSION", "r").read().strip()
13 | with open("_static/custom.js", "r+") as js:
14 | content = js.read()
15 | js.seek(0)
16 | js.truncate()
17 | js.write(content.replace("$(VERSION)", release))
18 |
19 | # -- General configuration ---------------------------------------------------
20 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
21 |
22 | extensions = [
23 | "breathe",
24 | "m2r2",
25 | "sphinx.ext.autodoc",
26 | "sphinx_js",
27 | "sphinx.ext.autosummary",
28 | "sphinx.ext.intersphinx",
29 | "sphinx.ext.napoleon",
30 | "sphinxcontrib.jquery",
31 | "sphinxcontrib.googleanalytics",
32 | ]
33 |
34 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "*.md"]
35 |
36 | googleanalytics_id = "341385789"
37 | googleanalytics_enabled = True
38 |
39 | # -- Options for HTML output -------------------------------------------------
40 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
41 |
42 | html_logo = "../assets/unum.png"
43 | html_theme = "furo"
44 | html_static_path = ["_static"]
45 | html_css_files = ["custom.css"]
46 | html_js_files = ["custom.js"]
47 | html_baseurl = "/docs/uform/"
48 |
49 | breathe_projects = {"UForm": "../build/xml"}
50 | breathe_default_project = "UForm"
51 |
52 | js_source_path = "../javascript/"
53 |
--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | ====================
2 | Contributing
3 | ====================
4 |
5 | .. mdinclude:: ../CONTRIBUTING.md
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | ====================
2 | Overview
3 | ====================
4 | .. mdinclude:: ../README.md
5 |
6 | .. toctree::
7 | :hidden:
8 | :caption: �
9 |
10 | python/index
11 | javascript/index
12 | swift/index
13 |
14 | .. toctree::
15 | :hidden:
16 | :caption: �
17 |
18 | contributing
19 | benchmarks
20 |
21 | .. toctree::
22 | :hidden:
23 | :caption: �
24 |
25 | genindex
26 |
--------------------------------------------------------------------------------
/docs/javascript/index.rst:
--------------------------------------------------------------------------------
1 | ====================
2 | JavaScript SDK
3 | ====================
4 |
5 |
6 | .. mdinclude:: ../../javascript/README.md
7 |
8 | .. toctree::
9 | :hidden:
10 |
--------------------------------------------------------------------------------
/docs/javascript/reference.rst.txt:
--------------------------------------------------------------------------------
1 | API Reference
2 | ====================
3 |
4 | ====================
5 | Encoders
6 | ====================
7 |
8 | .. js:autoclass:: ../javascript/encoders.TextProcessor
9 | :members:
10 |
11 | .. js:autoclass:: ../javascript/encoders.ImageProcessor
12 | :members:
13 |
14 | .. js:autoclass:: ../javascript/encoders.TextEncoder
15 | :members:
16 |
17 | .. js:autoclass:: ../javascript/encoders.ImageEncoder
18 | :members:
19 |
--------------------------------------------------------------------------------
/docs/python/index.rst:
--------------------------------------------------------------------------------
1 | ====================
2 | Python SDK
3 | ====================
4 |
5 |
6 | .. mdinclude:: ../../python/README.md
7 |
8 | .. toctree::
9 | :hidden:
10 |
11 | reference
--------------------------------------------------------------------------------
/docs/python/reference.rst:
--------------------------------------------------------------------------------
1 | API Reference
2 | ====================
3 |
4 | ====================
5 | Root
6 | ====================
7 |
8 | .. automodule:: uform
9 | :members:
10 | :undoc-members:
11 |
12 | ====================
13 | Torch Encoders
14 | ====================
15 |
16 | .. automodule:: uform.torch_encoders
17 | :members:
18 | :undoc-members:
19 |
20 | ====================
21 | Torch Processors
22 | ====================
23 |
24 | .. automodule:: uform.torch_processors
25 | :members:
26 | :undoc-members:
27 |
28 | ====================
29 | ONNX Encoders
30 | ====================
31 |
32 | .. automodule:: uform.onnx_encoders
33 | :members:
34 | :undoc-members:
35 |
36 | ====================
37 | NumPy Processors
38 | ====================
39 |
40 | .. automodule:: uform.numpy_processors
41 | :members:
42 | :undoc-members:
43 |
--------------------------------------------------------------------------------
/docs/swift/index.rst:
--------------------------------------------------------------------------------
1 | ====================
2 | Swift SDK
3 | ====================
4 |
5 |
6 | .. mdinclude:: ../../swift/README.md
7 |
--------------------------------------------------------------------------------
/javascript/README.md:
--------------------------------------------------------------------------------
1 | # UForm for JavaScript
2 |
3 | UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your JavaScript applications.
4 | Built around ONNX, the SDK is designed to work with most runtimes and almost any hardware.
5 |
6 | ## Installation
7 |
8 | There are several ways to install the UForm JavaScript SDK from NPM.
9 |
10 | ```bash
11 | pnpm add @unum-cloud/uform
12 | npm add @unum-cloud/uform
13 | yarn add @unum-cloud/uform
14 | ```
15 |
16 | ## Quick Start
17 |
18 | ### Embeddings
19 |
20 | ```js
21 | import { getModel, Modality, TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from '@unum-cloud/uform';
22 |
23 | const { configPath, modalityPaths, tokenizerPath } = await getModel(
24 | 'unum-cloud/uform3-image-text-english-small',
25 | [Modality.TextEncoder, Modality.ImageEncoder]
26 | );
27 |
28 | const textProcessor = new TextProcessor(configPath, tokenizerPath);
29 | await textProcessor.init();
30 | const processedTexts = await textProcessor.process(["a small red panda in a zoo"]);
31 |
32 | const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
33 | await textEncoder.init();
34 | const textOutput = await textEncoder.encode(processedTexts);
35 | assert(textOutput.embeddings.dims.length === 2, "Output should be 2D");
36 | await textEncoder.dispose();
37 |
38 | const imageProcessor = new ImageProcessor(configPath);
39 | await imageProcessor.init();
40 | const processedImages = await imageProcessor.process("path/to/image.png");
41 |
42 | const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
43 | await imageEncoder.init();
44 | const imageOutput = await imageEncoder.encode(processedImages);
45 | assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D");
46 | ```
47 |
48 | The `textOutput` and `imageOutput` contain `features` and `embeddings` properties, matching the `features` and `embeddings` properties in the Python SDK.
49 | The embeddings can later be compared using cosine similarity or other distance metrics, as sketched below.
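
Below is a minimal comparison sketch, assuming single-entry batches (so `embeddings.data` holds one vector per output); it mirrors the `cosineSimilarity` helper used in `encoders_test.js`:

```js
// Cosine similarity between the text and image embeddings produced above.
const textVector = textOutput.embeddings.data;   // Float32Array
const imageVector = imageOutput.embeddings.data; // Float32Array

let dot = 0.0, normText = 0.0, normImage = 0.0;
for (let i = 0; i < textVector.length; i++) {
    dot += textVector[i] * imageVector[i];
    normText += textVector[i] * textVector[i];
    normImage += imageVector[i] * imageVector[i];
}
const similarity = dot / (Math.sqrt(normText) * Math.sqrt(normImage));
console.log(`Cosine similarity: ${similarity}`);
```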
50 |
51 | ### Generative Models
52 |
53 | Coming soon ...
54 |
55 | ## Technical Details
56 |
57 | ### Faster Search
58 |
59 | Depending on the application, the embeddings can be downcast to smaller numeric representations without losing much recall.
60 | Independent of the quantization level, native JavaScript functionality may be too slow for large-scale search.
61 | In such cases, consider using [USearch][github-usearch] or [SimSimD][github-simsimd].
62 |
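As a rough illustration of that down-casting, the `Float32Array` embeddings from the Quick Start can be mapped to an `Int8Array` before indexing. The `quantizeToInt8` helper below is a hypothetical sketch, assuming the vectors are L2-normalized so every component lies in [-1, 1]:

```js
// Hypothetical scalar int8 quantization for an L2-normalized embedding vector.
function quantizeToInt8(embedding) {
    const quantized = new Int8Array(embedding.length);
    for (let i = 0; i < embedding.length; i++) {
        // Clamp to [-1, 1] and scale to the signed 8-bit range.
        const clamped = Math.max(-1, Math.min(1, embedding[i]));
        quantized[i] = Math.round(clamped * 127);
    }
    return quantized;
}

const int8Embedding = quantizeToInt8(new Float32Array(textOutput.embeddings.data));
```

Engines like [USearch][github-usearch] can index such `i8` vectors directly.
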
63 | [github-usearch]: https://github.com/unum-cloud/usearch
64 | [github-simsimd]: https://github.com/ashvardanian/simsimd
65 |
--------------------------------------------------------------------------------
/javascript/encoders.mjs:
--------------------------------------------------------------------------------
1 | import { readFileSync } from 'fs';
2 | import { InferenceSession, Tensor } from 'onnxruntime-node';
3 | import { PreTrainedTokenizer } from '@xenova/transformers';
4 | import sharp from 'sharp';
5 |
6 | /**
7 | * A processor for text data that prepares input for the text encoder model.
8 | */
9 | class TextProcessor {
10 |
11 | /**
12 | * Constructs a new TextProcessor instance.
13 | *
14 | * @param {string} configPath - The path to the configuration file for the text encoder.
15 | * @param {string} tokenizerPath - The path to the tokenizer configuration file.
16 | */
17 | constructor(configPath, tokenizerPath) {
18 | this.configPath = configPath;
19 | this.tokenizerPath = tokenizerPath;
20 |
21 | this.maxSeqLen = 0;
22 | this.padTokenIdx = 0;
23 | this.tokenizer = null;
24 | }
25 |
26 | /**
27 | * Initializes the TextProcessor by loading configurations and setting up the tokenizer.
28 | */
29 | async init() {
30 | var config = JSON.parse(readFileSync(this.configPath, { encoding: 'utf8' }));
31 | if (config.text_encoder !== undefined) {
32 | config = config.text_encoder;
33 | }
34 |
35 | this.maxSeqLen = config.max_position_embeddings;
36 | this.padTokenIdx = config.padding_idx;
37 |
38 | const tokenizerConfig = JSON.parse(readFileSync(this.tokenizerPath, { encoding: 'utf8' }));
39 | this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config);
40 | this.tokenizer.model_max_length = this.maxSeqLen;
41 | this.tokenizer.pad_token_id = this.padTokenIdx;
42 | }
43 |
44 | /**
45 | * Processes a list of text strings into model-ready format, including padding and attention masks.
46 | *
47 | * @param {Array} texts - An array of text strings to process.
48 | * @return {Object} The processed texts as model input features.
49 | */
50 | async process(texts) {
51 |
52 | const encoded = await this.tokenizer(texts, {
53 | add_special_tokens: true,
54 | padding: 'max_length',
55 | max_length: this.maxSeqLen,
56 | truncation: true,
57 | });
58 |
59 | return {
60 | 'input_ids': encoded.input_ids,
61 | 'attention_mask': encoded.attention_mask,
62 | };
63 | }
64 | }
65 |
66 | /**
67 | * An encoder for text data that uses a pre-trained model to encode text.
68 | */
69 | class TextEncoder {
70 |
71 | /**
72 | * Constructs a new TextEncoder instance.
73 | *
74 | * @param {string} modelPath - The path to the pre-trained ONNX model.
75 | */
76 | constructor(modelPath) {
77 | this.modelPath = modelPath;
78 | this.session = null;
79 | }
80 |
81 | /**
82 | * Initializes the ONNX session with the pre-trained model.
83 | */
84 | async init() {
85 | this.session = await InferenceSession.create(this.modelPath);
86 | }
87 |
88 | /**
89 | * Releases the ONNX session resources.
90 | */
91 | async dispose() {
92 | if (this.session) {
93 | await this.session.release().catch(error => console.error("Failed to release session", error));
94 | this.session = null;
95 | }
96 | }
97 |
98 | /**
99 | * Encodes the input data using the pre-trained model.
100 | *
101 | * @param {Object} inputs - The input data containing input_ids and attention_mask.
102 | * @return {Object} The encoded outputs from the model.
103 | */
104 | async encode(inputs) {
105 | if (!this.session) {
106 | throw new Error("Session is not initialized.");
107 | }
108 |
109 | // Helper function to convert BigInt64Array to Int32Array or validate Int32Array
110 | function ensureInt32Array(data) {
111 | if (data instanceof Int32Array) {
112 | return data; // Use as is if already Int32Array
113 | }
114 | if (data instanceof BigInt64Array) {
115 | // Convert BigInt64Array to Int32Array, ensuring all values are in range
116 | return new Int32Array(Array.from(data).map(bigInt => {
117 | if (bigInt > 2147483647n || bigInt < -2147483648n) {
118 | throw new Error("Value out of range for Int32.");
119 | }
120 | return Number(bigInt); // Convert BigInt to Number
121 | }));
122 | }
123 | // Additional case: handle conversion from generic Arrays or other typed arrays to Int32Array
124 | if (Array.isArray(data) || data instanceof Uint32Array || data instanceof Uint8Array) {
125 | return new Int32Array(data); // Convert directly
126 | }
127 | throw new Error("Unsupported data type for tensor conversion.");
128 | }
129 |
130 | // Prepare tensor data
131 | const inputIDsData = ensureInt32Array(inputs.input_ids.data);
132 | const attentionMaskData = ensureInt32Array(inputs.attention_mask.data);
133 |
134 | // Create ONNX Tensors as 'int32'
135 | const inputIDs = new Tensor('int32', inputIDsData, inputs.input_ids.dims);
136 | const attentionMask = new Tensor('int32', attentionMaskData, inputs.attention_mask.dims);
137 |
138 | // Run model inference
139 | return this.session.run({
140 | input_ids: inputIDs,
141 | attention_mask: attentionMask,
142 | });
143 | }
144 |
145 | }
146 |
147 | /**
148 | * A processor for image data that prepares images for the image encoder model.
149 | */
150 | class ImageProcessor {
151 | constructor(configPath) {
152 | this.configPath = configPath;
153 | }
154 |
155 | /**
156 | * Initializes the ImageProcessor by loading configuration settings for image preprocessing.
157 | */
158 | async init() {
159 | var config = JSON.parse(readFileSync(this.configPath, 'utf8'));
160 | if (config.image_encoder !== undefined) {
161 | config = config.image_encoder;
162 | }
163 |
164 | this.imageSize = config.image_size;
165 | this.normalizationMeans = config.normalization_means;
166 | this.normalizationDeviations = config.normalization_deviations;
167 |
168 | this.imageMean = new Float32Array(this.normalizationMeans);
169 | this.imageStd = new Float32Array(this.normalizationDeviations);
170 | }
171 | /**
172 | * Processes raw image data into a model-ready format, including resizing, cropping, and normalizing.
173 | *
174 | * @param {Buffer|Array} images - A single image or an array of images to process.
175 | * @return {Array} The processed image data as an array of Float32Arrays.
176 | */
177 | async process(images) {
178 | const processSingle = async (image) => {
179 | let img = sharp(image).toColorspace('srgb');
180 | const metadata = await img.metadata();
181 | const scale = this.imageSize / Math.min(metadata.width, metadata.height);
182 | const scaledWidth = Math.ceil(metadata.width * scale);
183 | const scaledHeight = Math.ceil(metadata.height * scale);
184 | img = img.resize({
185 | width: scaledWidth,
186 | height: scaledHeight,
187 | fit: sharp.fit.cover,
188 | position: sharp.strategy.entropy,
189 | options: sharp.interpolators.bicubic
190 | }).extract({
191 | left: Math.max(0, Math.floor((scaledWidth - this.imageSize) / 2)),
192 | top: Math.max(0, Math.floor((scaledHeight - this.imageSize) / 2)),
193 | width: this.imageSize,
194 | height: this.imageSize
195 | }).removeAlpha();
196 |
197 | let buffer = await img.raw().toBuffer();
198 | let array = new Float32Array(buffer.length);
199 |
200 | // When we export into the `array`, we reorder the dimensions of the tensor
201 | // from HWC to CHW, and normalize the pixel values.
202 | let channelSize = this.imageSize * this.imageSize;
203 | for (let i = 0; i < this.imageSize * this.imageSize; i++) {
204 | let r = buffer[i * 3];
205 | let g = buffer[i * 3 + 1];
206 | let b = buffer[i * 3 + 2];
207 | array[i] = (r / 255.0 - this.imageMean[0]) / this.imageStd[0];
208 | array[channelSize + i] = (g / 255.0 - this.imageMean[1]) / this.imageStd[1];
209 | array[channelSize * 2 + i] = (b / 255.0 - this.imageMean[2]) / this.imageStd[2];
210 | }
211 |
212 | return array;
213 | };
214 |
215 | if (Array.isArray(images)) {
216 | return Promise.all(images.map(img => processSingle(img)));
217 | } else {
218 | return [await processSingle(images)];
219 | }
220 | }
221 | }
222 |
223 | /**
224 | * An encoder for image data that uses a pre-trained model to encode images.
225 | */
226 | class ImageEncoder {
227 | constructor(modelPath, processor) {
228 | this.modelPath = modelPath;
229 | this.imageSize = processor.imageSize;
230 | }
231 |
232 | /**
233 | * Initializes the ONNX session with the pre-trained model.
234 | */
235 | async init() {
236 | this.session = await InferenceSession.create(this.modelPath);
237 | }
238 |
239 | /**
240 | * Releases the ONNX session resources.
241 | */
242 | async dispose() {
243 | if (this.session) {
244 | await this.session.release().catch(error => console.error("Failed to release session", error));
245 | this.session = null;
246 | }
247 | }
248 |
249 | /**
250 | * Encodes the processed image data using the pre-trained model.
251 | *
252 | * @param {Float32Array|Array} images - The processed image data.
253 | * @return {Object} The encoded outputs from the model.
254 | */
255 | async encode(images) {
256 | if (!this.session) {
257 | throw new Error("Session is not initialized.");
258 | }
259 |
260 | // Helper function to ensure data is a Float32Array.
261 | const ensureFloat32Array = (data) => {
262 | if (!(data instanceof Float32Array)) {
263 | throw new Error("Unsupported data type for tensor conversion.");
264 | }
265 | return data;
266 | };
267 |
268 | // Helper function to concatenate multiple Float32Arrays into a single Float32Array.
269 | const concatFloat32Arrays = (arrays) => {
270 | const totalLength = arrays.reduce((acc, val) => acc + val.length, 0);
271 | const result = new Float32Array(totalLength);
272 | let offset = 0;
273 | for (let arr of arrays) {
274 | result.set(arr, offset);
275 | offset += arr.length;
276 | }
277 | return result;
278 | };
279 |
280 | let imagesData;
281 | let dims;
282 |
283 | if (Array.isArray(images)) {
284 | // Assuming each image in the array is a Float32Array representing an image already processed to a fixed size.
285 | const arrays = images.map(ensureFloat32Array);
286 | imagesData = concatFloat32Arrays(arrays);
287 | const numImages = arrays.length;
288 | const numChannels = 3;
289 | const height = this.imageSize;
290 | const width = this.imageSize;
291 | dims = [numImages, numChannels, height, width];
292 | } else {
293 | // A single image, which is already a Float32Array.
294 | imagesData = ensureFloat32Array(images);
295 | const numChannels = 3;
296 | const height = this.imageSize;
297 | const width = this.imageSize;
298 | dims = [1, numChannels, height, width];
299 | }
300 |
301 | // Create ONNX Tensor
302 | const imagesTensor = new Tensor('float32', imagesData, dims);
303 |
304 | // Run model inference
305 | return this.session.run({
306 | images: imagesTensor,
307 | });
308 | }
309 | }
310 |
311 | export { TextProcessor, TextEncoder, ImageProcessor, ImageEncoder };
312 |
--------------------------------------------------------------------------------
/javascript/encoders_test.js:
--------------------------------------------------------------------------------
1 | import { existsSync, readFileSync } from 'fs';
2 | import { fileURLToPath } from 'url';
3 | import path from 'path';
4 | import assert from 'assert';
5 | import fetch from 'node-fetch';
6 |
7 | import { getModel, Modality, TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from './index.mjs';
8 |
9 | // Check if the HuggingFace Hub API token is set in the environment variable.
10 | let hf_token = process.env.HUGGINGFACE_HUB_TOKEN;
11 | if (!hf_token) {
12 | const dirname = path.dirname(fileURLToPath(import.meta.url));
13 | const tokenPath = path.join(dirname, '../', '.hf_token');
14 | if (existsSync(tokenPath)) {
15 | hf_token = readFileSync(tokenPath, 'utf8').trim();
16 | }
17 | }
18 |
19 | async function tryGettingCheckpoint(modelId, modalities) {
20 | const { configPath, modalityPaths, tokenizerPath } = await getModel(
21 | modelId,
22 | modalities,
23 | hf_token,
24 | '.onnx'
25 | );
26 |
27 | assert(configPath !== null, "Config path should not be null");
28 | assert(modalityPaths !== null, "Modality paths should not be null");
29 | assert(tokenizerPath !== null, "Tokenizer path should not be null");
30 |
31 | // Check if the file actually exists
32 | assert(existsSync(configPath), `Config file should exist at ${configPath}`);
33 | assert(existsSync(tokenizerPath), `Tokenizer file should exist at ${tokenizerPath}`);
34 | for (const modalityPath of Object.values(modalityPaths)) {
35 | assert(existsSync(modalityPath), `Modality file should exist at ${modalityPath}`);
36 | }
37 | }
38 |
39 | async function testGetCheckpoint() {
40 | console.log("- `testGetCheckpoint`: Start");
41 |
42 | try {
43 | const modalities = [Modality.TextEncoder, Modality.ImageEncoder];
44 |
45 | for (const modelId of [
46 | 'unum-cloud/uform3-image-text-english-small',
47 | 'unum-cloud/uform3-image-text-english-base',
48 | 'unum-cloud/uform3-image-text-english-large',
49 | 'unum-cloud/uform3-image-text-multilingual-base',
50 | ]) {
51 | await tryGettingCheckpoint(modelId, modalities, hf_token);
52 | }
53 |
54 | console.log("- `testGetCheckpoint`: Success");
55 | } catch (error) {
56 | console.error("- `testGetCheckpoint`: Failed", error);
57 | }
58 | }
59 |
60 | async function tryTextEncoderForwardPass(modelId) {
61 | const modalities = [Modality.TextEncoder];
62 | const { configPath, modalityPaths, tokenizerPath } = await getModel(
63 | modelId,
64 | modalities,
65 | hf_token,
66 | '.onnx'
67 | );
68 |
69 | const textProcessor = new TextProcessor(configPath, tokenizerPath);
70 | await textProcessor.init();
71 | const processedTexts = await textProcessor.process("a small red panda in a zoo");
72 |
73 | const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
74 | await textEncoder.init();
75 | const textOutput = await textEncoder.encode(processedTexts);
76 | assert(textOutput.embeddings.dims.length === 2, "Output should be 2D");
77 |
78 | await textEncoder.dispose();
79 | }
80 |
81 | async function tryImageEncoderForwardPass(modelId) {
82 | const modalities = [Modality.ImageEncoder];
83 | const { configPath, modalityPaths } = await getModel(
84 | modelId,
85 | modalities,
86 | hf_token,
87 | '.onnx'
88 | );
89 |
90 | const imageProcessor = new ImageProcessor(configPath);
91 | await imageProcessor.init();
92 | const processedImages = await imageProcessor.process("assets/unum.png");
93 |
94 | const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
95 | await imageEncoder.init();
96 | const imageOutput = await imageEncoder.encode(processedImages);
97 | assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D");
98 |
99 | await imageEncoder.dispose();
100 | }
101 |
102 | function cosineSimilarity(vecA, vecB) {
103 | // We may be receiving a complex tensor type, so let's check if it
104 | // has an array member named `data`.
105 | if (vecA.data) {
106 | vecA = vecA.data;
107 | }
108 | if (vecB.data) {
109 | vecB = vecB.data;
110 | }
111 |
112 | let dotProduct = 0.0;
113 | let normA = 0.0;
114 | let normB = 0.0;
115 | for (let i = 0; i < vecA.length; i++) {
116 | dotProduct += vecA[i] * 1.0 * vecB[i];
117 | normA += vecA[i] * 1.0 * vecA[i];
118 | normB += vecB[i] * 1.0 * vecB[i];
119 | }
120 | if (normA === 0 || normB === 0) {
121 | return 0;
122 | } else {
123 | return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
124 | }
125 | }
126 |
127 | async function fetchImage(url) {
128 | const response = await fetch(url);
129 | const arrayBuffer = await response.arrayBuffer();
130 | const buffer = Buffer.from(arrayBuffer);
131 | return buffer;
132 | }
133 |
134 | async function tryCrossReferencingImageAndText(modelId) {
135 |
136 | const modalities = [Modality.ImageEncoder, Modality.TextEncoder];
137 | const { configPath, modalityPaths, tokenizerPath } = await getModel(
138 | modelId,
139 | modalities,
140 | hf_token,
141 | '.onnx'
142 | );
143 |
144 | const imageProcessor = new ImageProcessor(configPath);
145 | await imageProcessor.init();
146 | const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
147 | await imageEncoder.init();
148 | const textProcessor = new TextProcessor(configPath, tokenizerPath);
149 | await textProcessor.init();
150 | const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
151 | await textEncoder.init();
152 |
153 | const texts = [
154 | "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.",
155 | "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.",
156 | "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
157 | "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.",
158 | "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.",
159 | ];
160 | const imageUrls = [
161 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true",
162 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true",
163 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true",
164 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true",
165 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true",
166 | ];
167 |
168 | const textEmbeddings = [];
169 | const imageEmbeddings = [];
170 |
171 | for (let i = 0; i < texts.length; i++) {
172 | const text = texts[i];
173 | const imageUrl = imageUrls[i];
174 | const imageBuffer = await fetchImage(imageUrl);
175 |
176 | const processedText = await textProcessor.process(text);
177 | const processedImage = await imageProcessor.process(imageBuffer);
178 |
179 | const textEmbedding = await textEncoder.encode(processedText);
180 | const imageEmbedding = await imageEncoder.encode(processedImage);
181 |
182 | textEmbeddings.push(new Float32Array(textEmbedding.embeddings.data));
183 | imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.data));
184 |
185 | // Print-based debugging at its best :)
186 | // console.log(`Text: ${text}, Image: ${imageUrl}`);
187 | // console.log(`Text embedding first components: ${textEmbeddings[i].slice(0, 5)}`);
188 | // console.log(`Image embedding first components: ${imageEmbeddings[i].slice(0, 5)}`);
189 | console.log(`Similarity: ${cosineSimilarity(textEmbeddings[i], imageEmbeddings[i])}`)
190 | }
191 |
192 | for (let i = 0; i < texts.length; i++) {
193 | const pairSimilarity = cosineSimilarity(textEmbeddings[i], imageEmbeddings[i]);
194 | const otherTextSimilarities = textEmbeddings.map((te, idx) => idx === i ? -Infinity : cosineSimilarity(te, imageEmbeddings[i]));
195 | const otherImageSimilarities = imageEmbeddings.map((ie, idx) => idx === i ? -Infinity : cosineSimilarity(textEmbeddings[i], ie));
196 |
197 | const maxOtherTextSimilarity = Math.max(...otherTextSimilarities);
198 | const maxOtherImageSimilarity = Math.max(...otherImageSimilarities);
199 |
200 | assert(pairSimilarity > maxOtherTextSimilarity, "Text should be more similar to its corresponding image than to other images.");
201 | assert(pairSimilarity > maxOtherImageSimilarity, "Image should be more similar to its corresponding text than to other texts.");
202 | }
203 |
204 | await textEncoder.dispose();
205 | await imageEncoder.dispose();
206 | }
207 |
208 | async function testEncoders() {
209 | console.log("- `testEncoders`: Start");
210 |
211 | try {
212 |
213 | // Go through the bi-modal models
214 | for (const modelId of [
215 | 'unum-cloud/uform3-image-text-english-small',
216 | // 'unum-cloud/uform3-image-text-english-base',
217 | // 'unum-cloud/uform3-image-text-english-large',
218 | // 'unum-cloud/uform3-image-text-multilingual-base',
219 | ]) {
220 | await tryTextEncoderForwardPass(modelId, hf_token);
221 | await tryImageEncoderForwardPass(modelId, hf_token);
222 | await tryCrossReferencingImageAndText(modelId, hf_token);
223 | }
224 |
225 | console.log("- `testEncoders`: Success");
226 | } catch (error) {
227 | console.error("- `testEncoders`: Failed", error);
228 | }
229 | }
230 |
231 | process.on('uncaughtException', (error) => {
232 | console.error('Uncaught Exception:', error);
233 | });
234 |
235 | testGetCheckpoint();
236 | testEncoders();
237 |
--------------------------------------------------------------------------------
/javascript/hub.mjs:
--------------------------------------------------------------------------------
1 | import { join } from "path"
2 | import { createWriteStream, existsSync, mkdirSync, writeFileSync } from "fs";
3 |
4 | import { downloadFile, listFiles } from "@huggingface/hub";
5 |
6 | const Modality = {
7 | TextEncoder: "text_encoder",
8 | ImageEncoder: "image_encoder",
9 | VideoEncoder: "video_encoder",
10 | TextDecoder: "text_decoder",
11 | };
12 |
13 | function isModality(value) {
14 | return Object.values(Modality).includes(value);
15 | }
16 |
17 | function normalizeModalities(modalities) {
18 | return modalities.map(x => {
19 | if (typeof x === "string") {
20 | if (isModality(x)) {
21 | return x;
22 | } else {
23 | throw new Error(`Invalid modality: ${x}`);
24 | }
25 | }
26 | return x;
27 | });
28 | }
29 |
30 | async function ensureDirectoryExists(dirPath) {
31 | if (!existsSync(dirPath)) {
32 | mkdirSync(dirPath, { recursive: true });
33 | }
34 | }
35 |
36 | async function getModel(modelId, modalities, token = null, format = '.onnx', saveDir = './models') {
37 | modalities = normalizeModalities(modalities);
38 |
39 | const configNames = ['config.json'];
40 | const tokenizerNames = ['tokenizer.json'];
41 | const modelFileNames = modalities.map(modality => `${modality}${format}`);
42 | const allowedPatterns = [...modelFileNames, ...configNames, ...tokenizerNames];
43 |
44 | const repo = { type: "model", name: modelId };
45 | const credentials = token ? { accessToken: token } : undefined;
46 |
47 | let configPath = null;
48 | let tokenizerPath = null;
49 | const modalityPaths = {};
50 | const modelSaveDir = join(saveDir, modelId);
51 |
52 | await ensureDirectoryExists(modelSaveDir);
53 |
54 | const fileIterator = listFiles({ repo, recursive: true, credentials });
55 | for await (const file of fileIterator) {
56 | const fileName = file.path.split('/').pop();
57 | if (fileName && allowedPatterns.includes(fileName)) {
58 | const filePath = file.path;
59 | const savePath = join(modelSaveDir, fileName);
60 |
61 | if (configNames.includes(fileName)) {
62 | configPath = savePath;
63 | } else if (tokenizerNames.includes(fileName)) {
64 | tokenizerPath = savePath;
65 | } else {
66 | const modalityName = fileName.split('.')[0];
67 | modalityPaths[modalityName] = savePath;
68 | }
69 |
70 | const response = await downloadFile({ repo, path: filePath, credentials });
71 | if (response) {
72 | // HuggingFace might be defining the `env.localModelPath` variable
73 | // to store the downloaded files in a local directory.
74 | // Let's check if the file is there.
75 | // const localPath = join(env.localModelPath, repo, filePath);
76 | // if (existsSync(localPath)) {
77 | // console.log(`File already exists locally at ${localPath}`);
78 | // }
79 |
80 | if (response.body && response.body.pipe) {
81 | const fileStream = createWriteStream(savePath);
82 | response.body.pipe(fileStream);
83 | await new Promise((resolve, reject) => {
84 | fileStream.on('finish', resolve);
85 | fileStream.on('error', reject);
86 | });
87 | } else if (response.arrayBuffer) {
88 | // Handle non-streamable response for environments like Node.js
89 | const buffer = await response.arrayBuffer();
90 | writeFileSync(savePath, Buffer.from(buffer));
91 | } else {
92 | console.error('Unexpected response type');
93 | }
94 | console.log(`Downloaded ${fileName} successfully to ${savePath}`);
95 | } else {
96 | console.log('No response received for the file download request.');
97 | }
98 | }
99 | }
100 |
101 | return { configPath, modalityPaths, tokenizerPath };
102 | }
103 |
104 | export { getModel, Modality };
105 |
--------------------------------------------------------------------------------
/javascript/index.mjs:
--------------------------------------------------------------------------------
1 | // Re-export everything from hub.mjs
2 | export * from './hub.mjs';
3 |
4 | // Re-export everything from encoders.mjs
5 | export * from './encoders.mjs';
6 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "@unum-cloud/uform",
3 | "type": "module",
4 | "version": "3.1.1",
5 | "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation",
6 | "dependencies": {
7 | "@huggingface/hub": "^0.14.8",
8 | "@xenova/transformers": "^2.17.0",
9 | "node-fetch": "^3.3.2",
10 | "onnxruntime-node": "^1.17.0",
11 | "onnxruntime-web": "^1.17.3"
12 | },
13 | "devDependencies": {
14 | "nodemon": "^2.0.15"
15 | },
16 | "scripts": {
17 | "start": "node javascript/encoders.mjs",
18 | "test": "node javascript/encoders_test.js"
19 | },
20 | "main": "javascript/index.mjs",
21 | "files": [
22 | "javascript/index.mjs",
23 | "javascript/encoders.mjs",
24 | "javascript/hub.mjs"
25 | ],
26 | "directories": {
27 | "doc": "docs"
28 | },
29 | "keywords": [
30 | "AI",
31 | "multimodal",
32 | "content generation",
33 | "huggingface"
34 | ],
35 | "author": "Ash Vardanian, Unum Cloud",
36 | "license": "Apache-2.0"
37 | }
38 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | build-backend = "setuptools.build_meta"
3 | requires = ["setuptools>=42"]
4 |
5 | [project]
6 | authors = [
7 | {email = "ash.vardanian@unum.cloud", name = "Ash Vardanian"},
8 | {email = "mike.kim@unum.cloud", name = "Mikhail Kim"},
9 | {email = "vladimir.orshulevich@unum.cloud", name = "Vladimir Orshulevich"},
10 | ]
11 | classifiers = [
12 | "Development Status :: 5 - Production/Stable",
13 | "License :: OSI Approved :: Apache Software License",
14 | "Natural Language :: Chinese (Simplified)",
15 | "Natural Language :: English",
16 | "Natural Language :: French",
17 | "Natural Language :: German",
18 | "Natural Language :: Italian",
19 | "Natural Language :: Japanese",
20 | "Natural Language :: Korean",
21 | "Natural Language :: Polish",
22 | "Natural Language :: Russian",
23 | "Natural Language :: Spanish",
24 | "Natural Language :: Turkish",
25 | "Operating System :: OS Independent",
26 | "Programming Language :: Python :: 3",
27 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
28 | "Topic :: Scientific/Engineering :: Image Processing",
29 | "Topic :: Scientific/Engineering :: Image Recognition",
30 | ]
31 | dependencies = [
32 | "huggingface_hub>=0.16.4",
33 | "tokenizers>=0.13.3",
34 | "pillow",
35 | "simsimd",
36 | ]
37 | description = "Pocket-Sized Multimodal AI for Content Understanding and Generation"
38 | maintainers = [
39 | {email = "info@unum.cloud", name = "Unum Cloud"},
40 | ]
41 | name = "uform"
42 | readme = "README.md"
43 | requires-python = ">=3.7"
44 | version = "3.1.1"
45 |
46 | [project.scripts]
47 | uform-chat = "uform.chat:main"
48 |
49 | [project.optional-dependencies]
50 | torch = ["torch>=1.13.1", "torchvision", "transformers>=4.36.2"]
51 | onnx = ["onnx>=1.15.0", "onnxruntime>=1.17.1", "numpy"]
52 | onnx-gpu = ["onnx>=1.15.0", "onnxruntime-gpu>=1.17.1", "numpy"]
53 | dev = ["pytest", "pandas"]
54 |
55 | [project.urls]
56 | "Homepage" = "https://github.com/unum-cloud/uform"
57 |
58 | [tool.setuptools.packages.find]
59 | where = ["python"]
60 | include = ["uform"]
61 | namespaces = false
62 |
63 | [tool.ruff]
64 | ignore = ["C408", "C901", "E501", "E741"]
65 | ignore-init-module-imports = true
66 | select = ["C", "E", "F", "I", "UP", "W"]
67 |
68 | [tool.ruff.isort]
69 | lines-after-imports = 2
70 |
71 | [tool.ruff.lint.isort]
72 | known-first-party = ["uform"]
73 |
74 | [tool.ruff.per-file-ignores]
75 | "__init__.py" = ["E401"]
76 |
77 | [tool.tomlsort]
78 | all = true
79 | in_place = true
80 | spaces_before_inline_comment = 2
81 | spaces_indent_inline_array = 4
82 | trailing_comma_inline_array = true
83 |
84 | # Configuration options for the Black formatter:
85 | # https://black.readthedocs.io/en/latest/usage_and_configuration/the_basics.html#where-black-looks-for-the-file
86 | [tool.black]
87 | line-length = 120 # Set line length to the same value as in `.clang-format` for modern wide screens
88 | target-version = ['py36', 'py312'] # Set target Python versions to 3.6 and 3.12
--------------------------------------------------------------------------------
/python/README.md:
--------------------------------------------------------------------------------
1 | # UForm Python SDK
2 |
3 | UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your Python applications.
4 | The SDK doesn't require any deep learning knowledge, PyTorch, or CUDA installation, and can run on almost any hardware.
5 |
6 | ## Installation
7 |
8 | There are several ways to install the UForm Python SDK, depending on the backend you want to use.
9 | PyTorch is by far the heaviest, but the most capable.
10 | ONNX is a lightweight alternative that can run on any CPU, and on some GPUs.
11 |
12 | ```bash
13 | pip install "uform[torch]" # For PyTorch
14 | pip install "uform[onnx]" # For ONNX on CPU
15 | pip install "uform[onnx-gpu]" # For ONNX on GPU, available for some platforms
16 | pip install "uform[torch,onnx]" # For PyTorch and ONNX Python tests
17 | ```
18 |
19 | ## Quick Start
20 |
21 | ### Embeddings
22 |
23 | Load the model:
24 |
25 | ```py
26 | from uform import get_model, Modality
27 |
28 | model_name = 'unum-cloud/uform3-image-text-english-small'
29 | modalities = [Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER]
30 | processors, models = get_model(model_name, modalities=modalities)
31 |
32 | model_text = models[Modality.TEXT_ENCODER]
33 | model_image = models[Modality.IMAGE_ENCODER]
34 | processor_text = processors[Modality.TEXT_ENCODER]
35 | processor_image = processors[Modality.IMAGE_ENCODER]
36 | ```
37 |
38 | Embed images:
39 |
40 | ```py
41 | import requests
42 | from io import BytesIO
43 | from PIL import Image
44 |
45 | image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg'
46 | image = Image.open(BytesIO(requests.get(image_url).content))
47 | image_data = processor_image(image)
48 | image_features, image_embedding = model_image.encode(image_data, return_features=True)
49 | ```
50 |
51 | Embed queries:
52 |
53 | ```py
54 | text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background'
55 | text_data = processor_text(text)
56 | text_features, text_embedding = model_text.encode(text_data, return_features=True)
57 | ```
58 |
59 | ### Generative Models
60 |
61 | UForm generative models are fully compatible with the Hugging Face Transformers library, and can be used without installing the UForm library.
62 | Those models can be used to caption images or power multimodal chat experiences.
63 |
64 | ```python
65 | import torch
66 | from transformers import AutoModel, AutoProcessor
67 | model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
68 | processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
69 |
70 | prompt = 'Question or Instruction'
71 | image = Image.open('image.jpg')
72 |
73 | inputs = processor(text=[prompt], images=[image], return_tensors='pt')
74 |
75 | with torch.inference_mode():
76 | output = model.generate(
77 | **inputs,
78 | do_sample=False,
79 | use_cache=True,
80 | max_new_tokens=256,
81 | eos_token_id=151645,
82 | pad_token_id=processor.tokenizer.pad_token_id
83 | )
84 | prompt_len = inputs['input_ids'].shape[1]
85 | decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
86 | ```
87 |
88 | You can check examples of different prompts in our demo Gradio spaces on HuggingFace:
89 |
90 | - for [`uform-gen2-qwen-500m`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo)
91 | - for [`uform-gen2-dpo`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-dpo-demo)
92 |
93 | ## Technical Details
94 |
95 | ### Multi-GPU Parallelism
96 |
97 | To achieve higher throughput, you can launch UForm on multiple GPUs.
98 | For that, pick the encoder of the model you want to run in parallel and wrap it in `nn.DataParallel` (or `nn.DistributedDataParallel`).
99 |
100 | ```python
101 | from uform import get_model, Modality
102 | import torch.nn as nn
103 |
104 | processors, models = get_model('unum-cloud/uform-vl-english-small', backend='torch')
105 |
106 | model_text = models[Modality.TEXT_ENCODER]
107 | model_image = models[Modality.IMAGE_ENCODER]
108 | processor_text = processors[Modality.TEXT_ENCODER]
109 | processor_image = processors[Modality.IMAGE_ENCODER]
110 |
111 | model_text.return_features = False
112 | model_image.return_features = False
113 | model_text_parallel = nn.DataParallel(model_text)
114 | model_image_parallel = nn.DataParallel(model_image)
115 | ```
116 |
117 | Since we are now dealing with the PyTorch wrapper, make sure to use the `forward` method (instead of `encode`) to get the embeddings, and the `.detach().cpu().numpy()` sequence to bring the data back to more Pythonic NumPy arrays.
118 |
119 | ```python
120 | def get_image_embedding(images: List[Image.Image]):
121 | preprocessed = processor_image(images)
122 | embedding = model_image_parallel.forward(preprocessed)
123 | return embedding.detach().cpu().numpy()
124 |
125 | def get_text_embedding(texts: List[str]):
126 | preprocessed = processor_text(texts)
127 | embedding = model_text_parallel.forward(preprocessed)
128 | return embedding.detach().cpu().numpy()
129 | ```
130 |
131 | ### ONNX and CUDA
132 |
133 | The configuration process may include a few additional steps, depending on the environment.
134 | When using the CUDA and TensorRT backends with CUDA 12 or newer, make sure to [install the Nvidia toolkit][install-nvidia-toolkit] and the `onnxruntime-gpu` package from the custom repository.
135 |
136 | ```sh
137 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
138 | sudo dpkg -i cuda-keyring_1.1-1_all.deb
139 | sudo apt-get update
140 | sudo apt-get -y install cuda-toolkit-12
141 | pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
142 | export CUDA_PATH="/usr/local/cuda-12/bin"
143 | export PATH="/usr/local/cuda-12/bin${PATH:+:${PATH}}"
144 | export LD_LIBRARY_PATH="/usr/local/cuda-12/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
145 | pytest python/scripts/ -s -x -Wd -v -k onnx
146 | ```
147 |
148 | [install-nvidia-toolkit]: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu
149 |
--------------------------------------------------------------------------------
/python/scripts/bench_decoders.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | from time import perf_counter
3 | from dataclasses import dataclass
4 | from typing import List
5 | import argparse
6 |
7 | import requests
8 | import torch
9 | from PIL import Image
10 | from transformers import (
11 | AutoProcessor,
12 | InstructBlipForConditionalGeneration,
13 | InstructBlipProcessor,
14 | LlavaForConditionalGeneration,
15 | AutoModel,
16 | AutoProcessor,
17 | )
18 |
19 | from uform.torch_decoders import VLMForCausalLM, VLMProcessor
20 |
21 | dtype = torch.bfloat16
22 | low_cpu_mem_usage = False
23 | device = "cuda:0"
24 |
25 |
26 | @dataclass
27 | class BenchmarkResult:
28 | model_name: str
29 | device_name: str
30 | backend_name: str
31 | duration_image_preprocessing: float
32 | duration_image_embedding: float
33 | duration_text_preprocessing: float
34 | duration_text_embedding: float
35 |
36 |
37 | def caption(model, processor, prompt: str, image: Image.Image, max_length: int, batch_size: int) -> List[str]:
38 | # BLIP models require the prompt to be the first argument
39 | prompt = [prompt] * batch_size
40 | image = [image] * batch_size
41 | try:
42 | inputs = processor(prompt, image, return_tensors="pt")
43 | except ValueError:
44 | inputs = processor(image, prompt, return_tensors="pt")
45 |
46 | # Downcast and move to device
47 | for possible_key in ["images", "pixel_values"]:
48 | if possible_key not in inputs:
49 | continue
50 | inputs[possible_key] = inputs[possible_key].to(dtype) # Downcast floats
51 | inputs = {k: v.to(device) for k, v in inputs.items()} # Move to the right device
52 |
53 | with torch.inference_mode():
54 | output = model.generate(
55 | **inputs,
56 | do_sample=False,
57 | # use_cache=True,
58 | max_new_tokens=max_length,
59 | eos_token_id=32001,
60 | pad_token_id=processor.tokenizer.pad_token_id,
61 | )
62 | prompt_len = inputs["input_ids"].shape[1]
63 | decoded_texts = processor.batch_decode(
64 | output[:, prompt_len:],
65 | skip_special_tokens=True,
66 | )
67 | return decoded_texts
68 |
69 |
70 | def duration(callable):
71 | """Profile the duration of a callable and return the duration and the result."""
72 | start = perf_counter()
73 | result = callable()
74 | stop = perf_counter()
75 | return stop - start, result
76 |
77 |
78 | def bench_captions(
79 | model,
80 | processor,
81 | prompt: str,
82 | images: List[Image.Image],
83 | max_length: int = 256,
84 | batch_size: int = 10,
85 | ) -> List[str]:
86 | total_duration = 0
87 | total_length = 0
88 | model = torch.compile(model)
89 |
90 | def caption_image(image):
91 | return caption(
92 | model=model,
93 | processor=processor,
94 | prompt=prompt,
95 | image=image,
96 | max_length=max_length,
97 | batch_size=batch_size,
98 | )
99 |
100 | for image in images:
101 | seconds, captions = duration(partial(caption_image, image=image))
102 | total_duration += seconds
103 | total_length += len(captions.strip()) if isinstance(captions, str) else sum(len(t.strip()) for t in captions)
104 |
105 | del model
106 | del processor
107 | print(f"Throughput: {total_length/total_duration:.2f} tokens/s")
108 |
109 |
110 | def main(batch_size: int = 10, max_length: int = 256):
111 |
112 | image_urls = [
113 | "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
114 | "https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
115 | "https://images.unsplash.com/photo-1703244551371-ecffad9cc3b6?q=80&w=2859&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
116 | "https://plus.unsplash.com/premium_photo-1702910931866-2642eee270b1?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
117 | "https://plus.unsplash.com/premium_photo-1700583712241-893aded49e69?q=80&w=2942&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
118 | ]
119 | images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls]
120 | captions = [
121 | "lonely house in a beautiful valley. house is made of white wood and black bricks. its surrounded by a green field",
122 | "grab last-mile delivery driver on a scooter grabbing a delivery in Jakarta",
123 | "monochrome picture of new york in the late 2th century on a sunny day, showing a few canonical brick buildings and the citizens bank",
124 | "asian girl sleeping in a bed. top down view",
125 | "a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it",
126 | ]
127 |
128 | print("UForm-Gen2")
129 | bench_captions(
130 | model=AutoModel.from_pretrained(
131 | "unum-cloud/uform-gen2-dpo",
132 | trust_remote_code=True,
133 | torch_dtype=dtype,
134 | low_cpu_mem_usage=low_cpu_mem_usage,
135 | ignore_mismatched_sizes=True,
136 | ).to(device),
137 | processor=AutoProcessor.from_pretrained(
138 | "unum-cloud/uform-gen2-dpo",
139 | trust_remote_code=True,
140 | ),
141 | prompt="Describe the picture in great detail",
142 | images=images,
143 | batch_size=batch_size,
144 | max_length=max_length,
145 | )
146 |
147 | print("UForm-Gen")
148 | bench_captions(
149 | model=VLMForCausalLM.from_pretrained(
150 | "unum-cloud/uform-gen",
151 | torch_dtype=dtype,
152 | low_cpu_mem_usage=low_cpu_mem_usage,
153 | ignore_mismatched_sizes=True,
154 | ).to(device),
155 | processor=VLMProcessor.from_pretrained(
156 | "unum-cloud/uform-gen",
157 | ),
158 | prompt="[cap] Summarize the visual content of the image.",
159 | images=images,
160 | batch_size=batch_size,
161 | max_length=max_length,
162 | )
163 |
164 | print("LLaVA")
165 | bench_captions(
166 | model=LlavaForConditionalGeneration.from_pretrained(
167 | "llava-hf/llava-1.5-7b-hf",
168 | torch_dtype=dtype,
169 | low_cpu_mem_usage=low_cpu_mem_usage,
170 | ).to(device),
171 | processor=AutoProcessor.from_pretrained(
172 | "llava-hf/llava-1.5-7b-hf",
173 | ),
174 | prompt="USER: \nWhat are these?\nASSISTANT:",
175 | images=images,
176 | batch_size=batch_size,
177 | max_length=max_length,
178 | )
179 |
180 | print("InstructBLIP")
181 | bench_captions(
182 | model=InstructBlipForConditionalGeneration.from_pretrained(
183 | "Salesforce/instructblip-vicuna-7b",
184 | torch_dtype=dtype,
185 | low_cpu_mem_usage=low_cpu_mem_usage,
186 | ).to(device),
187 | processor=InstructBlipProcessor.from_pretrained(
188 | "Salesforce/instructblip-vicuna-7b",
189 | ),
190 | prompt="Summarize the visual content of the image.",
191 | images=images,
192 | batch_size=batch_size,
193 | max_length=max_length,
194 | )
195 |
196 |
197 | if __name__ == "__main__":
198 |
199 | parser = argparse.ArgumentParser()
200 | parser.add_argument(
201 | "--batch-size",
202 | type=int,
203 | default=10,
204 | help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.",
205 | )
206 | parser.add_argument(
207 | "--max-length",
208 | type=int,
209 | default=256,
210 | help="Maximum length of the generated text in tokens.",
211 | )
212 | args = parser.parse_args()
213 |
214 | main(batch_size=args.batch_size, max_length=args.max_length)
215 |
--------------------------------------------------------------------------------
/python/scripts/bench_encoders.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | This script provides the throughput of UForm multimodal embedding models.
5 |
6 | The output of the script will cover:
7 | - Time to preprocess an image, and throughput in images/s.
8 | - Time to tokenize the text, and throughput in queries/s.
9 | - Time to encode the image, and throughput in images/s.
10 | - Time to encode the text, and throughput in queries/s.
11 | - Share of time spent on each part of the pipeline.
12 |
13 | Those numbers are presented for every model, device (cpu or gpu), backend (torch or onnx),
14 | and precision (float32 or bfloat16), producing a pretty comprehensive benchmark.
15 |
16 | Before running the script, install all available packages via `pip install -e ".[torch,onnx,onnx-gpu]"`.
17 | Before printing the numbers, a warm-up is performed to ensure the model is loaded and the cache is filled.
18 | """
19 |
20 | from functools import partial
21 | from time import perf_counter
22 | from dataclasses import dataclass
23 | from typing import List, Tuple, Literal, Callable, Generator
24 | import re
25 | import argparse
26 |
27 | import requests
28 | from PIL import Image
29 | import pandas as pd
30 |
31 | from uform import get_model, Modality, ExecutionProviderError
32 |
33 | # Define global constants for the hardware availability
34 | torch_available = False
35 | try:
36 | import torch
37 |
38 | torch_available = True
39 | except ImportError:
40 | pass
41 | onnx_available = False
42 | try:
43 | import onnx
44 |
45 | onnx_available = True
46 | except ImportError:
47 | pass
48 | cuda_available = False
49 | try:
50 | if torch_available:
51 | cuda_available = torch.cuda.is_available()
52 | elif onnx_available:
53 | import onnxruntime
54 |
55 | cuda_available = onnxruntime.get_device() == "GPU"
56 | except ImportError:
57 | pass
58 |
59 |
60 | @dataclass
61 | class BenchmarkResult:
62 | model_name: str
63 | device_name: Literal["cpu", "cuda"] = "cpu"
64 | backend_name: Literal["torch", "onnx"] = "torch"
65 | duration_image_preprocessing: float = 0
66 | duration_image_embedding: float = 0
67 | duration_text_preprocessing: float = 0
68 | duration_text_embedding: float = 0
69 |
70 |
71 | def duration(callable, synchronize=False):
72 | """Profile the duration of a callable and return the duration and the result."""
73 | if synchronize and torch_available and cuda_available:
74 | torch.cuda.synchronize() # Wait for CUDA operations to complete
75 | start = perf_counter()
76 | result = callable()
77 | if synchronize and torch_available and cuda_available:
78 | torch.cuda.synchronize() # Ensure all CUDA kernels have finished
79 | stop = perf_counter()
80 | return stop - start, result
81 |
82 |
83 | def get_captioned_images() -> List[Tuple[Image.Image, str]]:
84 | """Get a list of pre-downloaded and decoded images and their captions."""
85 | image_urls = [
86 | "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
87 | "https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
88 | "https://images.unsplash.com/photo-1703244551371-ecffad9cc3b6?q=80&w=2859&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
89 | "https://plus.unsplash.com/premium_photo-1702910931866-2642eee270b1?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
90 | "https://plus.unsplash.com/premium_photo-1700583712241-893aded49e69?q=80&w=2942&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
91 | ]
92 | images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls]
93 | captions = [
94 | "lonely house in a beautiful valley. house is made of white wood and black bricks. its surrounded by a green field",
95 | "grab last-mile delivery driver on a scooter grabbing a delivery in Jakarta",
96 | "monochrome picture of new york in the late 2th century on a sunny day, showing a few canonical brick buildings and the citizens bank",
97 | "asian girl sleeping in a bed. top down view",
98 | "a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it",
99 | ]
100 | return list(zip(images, captions))
101 |
102 |
103 | def yield_benchmarks(batch_size: int) -> Generator[Tuple[BenchmarkResult, Callable], None, None]:
104 | """Yields callable benchmarks for all supported backends of the given model."""
105 |
106 | # Pull the content and artificially grow the batch size
107 | images, captions = zip(*get_captioned_images())
108 |
109 | if len(images) < batch_size:
110 | import math
111 |
112 | multiplier = int(math.ceil(batch_size / len(images)))
113 | images *= multiplier
114 | captions *= multiplier
115 | images = images[:batch_size]
116 | captions = captions[:batch_size]
117 |
118 | def run(model_name: str, device: str, backend_name: str):
119 | result = BenchmarkResult(
120 | model_name=model_name,
121 | backend_name=backend_name,
122 | device_name=device,
123 | duration_image_preprocessing=0,
124 | duration_image_embedding=0,
125 | duration_text_preprocessing=0,
126 | duration_text_embedding=0,
127 | )
128 |
129 | sync = backend_name == "torch"
130 | processors, models = get_model(
131 | model_name,
132 | device=device,
133 | modalities=[Modality.IMAGE_ENCODER, Modality.TEXT_ENCODER],
134 | backend=backend_name,
135 | )
136 |
137 | model_text = models[Modality.TEXT_ENCODER]
138 | model_image = models[Modality.IMAGE_ENCODER]
139 | processor_text = processors[Modality.TEXT_ENCODER]
140 | processor_image = processors[Modality.IMAGE_ENCODER]
141 |
142 | # Image preprocessing
143 | total_duration = 0
144 | total_iterations = 0
145 | while total_duration < 10 and total_iterations < 100:
146 | seconds, _ = duration(lambda: processor_image(images))
147 | total_duration += seconds
148 | total_iterations += len(images)
149 | duration_per_iteration = total_duration / total_iterations
150 | result.duration_image_preprocessing = duration_per_iteration
151 |
152 | # Image embedding
153 | total_duration = 0
154 | total_iterations = 0
155 | while total_duration < 10 and total_iterations < 100:
156 | images_data = processor_image(images)
157 | seconds, _ = duration(lambda: model_image.encode(images_data), synchronize=sync)
158 | total_duration += seconds
159 | total_iterations += len(images)
160 | duration_per_iteration = total_duration / total_iterations
161 | result.duration_image_embedding = duration_per_iteration
162 |
163 | # Text preprocessing
164 | total_duration = 0
165 | total_iterations = 0
166 | while total_duration < 10 and total_iterations < 100:
167 | seconds, _ = duration(lambda: processor_text(captions))
168 | total_duration += seconds
169 | total_iterations += len(captions)
170 | duration_per_iteration = total_duration / total_iterations
171 | result.duration_text_preprocessing = duration_per_iteration
172 |
173 | # Text embedding
174 | total_duration = 0
175 | total_iterations = 0
176 | while total_duration < 10 and total_iterations < 100:
177 | texts_data = processor_text(captions)
178 | seconds, _ = duration(lambda: model_text.encode(texts_data), synchronize=sync)
179 | total_duration += seconds
180 | total_iterations += len(captions)
181 | duration_per_iteration = total_duration / total_iterations
182 | result.duration_text_embedding = duration_per_iteration
183 |
184 | return result
185 |
186 | devices = ["cpu"]
187 | if cuda_available:
188 | devices.append("cuda")
189 | backends = []
190 | if torch_available:
191 | backends.append("torch")
192 | if onnx_available:
193 | backends.append("onnx")
194 |
195 | for device in devices:
196 | for backend_name in backends:
197 | for model_name in [
198 | "unum-cloud/uform3-image-text-english-small",
199 | "unum-cloud/uform3-image-text-english-base",
200 | "unum-cloud/uform3-image-text-english-large",
201 | "unum-cloud/uform3-image-text-multilingual-base",
202 | ]:
203 | yield BenchmarkResult(
204 | model_name=model_name,
205 | device_name=device,
206 | backend_name=backend_name,
207 | ), partial(run, model_name, device, backend_name)
208 |
209 |
210 | def main(filter_out: str = None, batch_size: int = 10):
211 | results = []
212 | filter_pattern = re.compile(filter_out) if filter_out else None
213 | for specs, func in yield_benchmarks(batch_size=batch_size):
214 | if filter_pattern and (
215 | filter_pattern.search(specs.model_name)
216 | or filter_pattern.search(specs.backend_name)
217 | or filter_pattern.search(specs.device_name)
218 | ):
219 | continue
220 |
221 | try:
222 | print(f"Running `{specs.model_name}` on `{specs.device_name}` using `{specs.backend_name}` backend")
223 | result = func()
224 | results.append(result)
225 | except ExecutionProviderError as e:
226 | print("- skipping missing backend")
227 | print(e)
228 |
229 | results = sorted(results, key=lambda x: x.model_name)
230 | results = [x.__dict__ for x in results]
231 |
232 | df = pd.DataFrame(results)
233 | df.columns = [
234 | "Model Name",
235 | "Device",
236 | "Backend",
237 | "Images Preprocessed/s",
238 | "Images Encoded/s",
239 | "Texts Preprocessed/s",
240 | "Texts Encoded/s",
241 | ]
242 |
243 | def inverse(x):
244 | return 1 / x if x != 0 else 0
245 |
246 | # Apply number formatting directly in the DataFrame
247 | formatted_df = df.copy()
248 | formatted_df["Images Preprocessed/s"] = df["Images Preprocessed/s"].map(inverse).map("{:,.2f}".format)
249 | formatted_df["Images Encoded/s"] = df["Images Encoded/s"].map(inverse).map("{:,.2f}".format)
250 | formatted_df["Texts Preprocessed/s"] = df["Texts Preprocessed/s"].map(inverse).map("{:,.2f}".format)
251 | formatted_df["Texts Encoded/s"] = df["Texts Encoded/s"].map(inverse).map("{:,.2f}".format)
252 |
253 | # Convert formatted DataFrame to Markdown
254 | print(formatted_df.to_markdown())
255 |
256 |
257 | if __name__ == "__main__":
258 |
259 | parser = argparse.ArgumentParser()
260 | parser.add_argument(
261 | "--filter-out",
262 | type=str,
263 | default=None,
264 | help="Filter out models, backends, or devices with a Regular Expression.",
265 | )
266 | parser.add_argument(
267 | "--batch-size",
268 | type=int,
269 | default=10,
270 | help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.",
271 | )
272 | args = parser.parse_args()
273 |
274 | main(filter_out=args.filter_out, batch_size=args.batch_size)
275 |
--------------------------------------------------------------------------------
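
As a point of reference, below is a minimal sketch of the timing pattern the benchmark above relies on: each stage runs in a loop until a time or iteration budget is hit, the duration is averaged per processed item, and the table reports the reciprocal as a per-second rate. The `fake_work` callable and the 1-second budget are hypothetical stand-ins, not part of the benchmark script.

from time import perf_counter

def fake_work():  # hypothetical stand-in for preprocessing or encoding a batch
    return sum(i * i for i in range(10_000))

batch_size = 10
total_duration, total_iterations = 0.0, 0
while total_duration < 1 and total_iterations < 100:  # time or iteration budget
    start = perf_counter()
    fake_work()
    total_duration += perf_counter() - start
    total_iterations += batch_size  # every call processes a whole batch

duration_per_item = total_duration / total_iterations
throughput = 1 / duration_per_item if duration_per_item else 0
print(f"{throughput:,.2f} items/s")
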
/python/scripts/export_decoders.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n",
8 | "\n",
9 | "Depending on the backend, we prefer different quantization schemes.\n",
10 | "\n",
11 | "- For ONNX we use `uint8` quantization.\n",
12 | "- For PyTorch we use `bfloat16` quantization.\n",
13 | "- For CoreML we use `float32` representation."
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "!pip install --upgrade \"uform[torch]\" coremltools"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "import os\n",
32 | "model_name = \"unum-cloud/uform-gen2-dpo\"\n",
33 | "output_directory = \"../../\""
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import torch\n",
43 | "import uform\n",
44 | "from PIL import Image\n",
45 | "from transformers import AutoModel, AutoProcessor\n",
46 | "\n",
47 | "model = AutoModel.from_pretrained(model_name, trust_remote_code=True)\n",
48 | "processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)\n",
49 | "\n",
50 | "prompt = 'Describe the picture'\n",
51 | "image = Image.open('../../assets/unum.png')\n",
52 | "inputs = processor(text=[prompt], images=[image], return_tensors='pt')\n",
53 | "\n",
54 | "with torch.inference_mode():\n",
55 | " output = model.generate(\n",
56 | " **inputs,\n",
57 | " do_sample=False,\n",
58 | " use_cache=True,\n",
59 | " max_new_tokens=256,\n",
60 | " eos_token_id=151645,\n",
61 | " pad_token_id=processor.tokenizer.pad_token_id\n",
62 | " )\n",
63 | "prompt_len = inputs['input_ids'].shape[1]\n",
64 | "decoded_text = processor.batch_decode(output[:, prompt_len:])[0]\n",
65 | "\n",
66 | "print(decoded_text)"
67 | ]
68 | }
69 | ],
70 | "metadata": {
71 | "kernelspec": {
72 | "display_name": "base",
73 | "language": "python",
74 | "name": "python3"
75 | },
76 | "language_info": {
77 | "codemirror_mode": {
78 | "name": "ipython",
79 | "version": 3
80 | },
81 | "file_extension": ".py",
82 | "mimetype": "text/x-python",
83 | "name": "python",
84 | "nbconvert_exporter": "python",
85 | "pygments_lexer": "ipython3",
86 | "version": "3.11.5"
87 | }
88 | },
89 | "nbformat": 4,
90 | "nbformat_minor": 2
91 | }
92 |
--------------------------------------------------------------------------------
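
The quantization preferences listed in the notebook above roughly map onto the following post-export steps. This is only a sketch: it assumes a full-precision ONNX export already exists at the hypothetical path `text_encoder.onnx`, and the `torch.nn.Linear` module stands in for a real UForm encoder or decoder.

import torch
from onnxruntime.quantization import QuantType, quantize_dynamic

# ONNX: dynamic uint8 quantization of an already-exported graph (hypothetical paths)
quantize_dynamic(
    model_input="text_encoder.onnx",
    model_output="text_encoder.uint8.onnx",
    weight_type=QuantType.QUInt8,
)

# PyTorch: cast the weights to bfloat16 before saving the checkpoint
model = torch.nn.Linear(256, 256)  # stand-in for a real UForm module
model = model.to(dtype=torch.bfloat16)
torch.save(model.state_dict(), "text_encoder.bf16.pt")

# CoreML exports stay in float32, so no extra quantization step is needed there.
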
/python/scripts/test_decoders.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from PIL import Image
3 |
4 | # PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed
5 | try:
6 | import torch
7 |
8 | torch_available = True
9 | except ImportError:
10 | torch_available = False
11 |
12 | torch_hf_models = [
13 | "unum-cloud/uform-gen2-qwen-500m",
14 | "unum-cloud/uform-gen2-dpo",
15 | ]
16 |
17 |
18 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
19 | @pytest.mark.parametrize("model_name", torch_hf_models)
20 | def test_one_conversation(model_name: str):
21 | from transformers import AutoModel, AutoProcessor
22 |
23 | model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
24 | processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
25 |
26 | prompt = "Describe the image in great detail."
27 | image = Image.open("assets/unum.png")
28 |
29 | inputs = processor(text=[prompt], images=[image], return_tensors="pt")
30 |
31 | with torch.inference_mode():
32 | output = model.generate(
33 | **inputs,
34 | do_sample=False,
35 | use_cache=True,
36 | max_new_tokens=10,
37 | pad_token_id=processor.tokenizer.pad_token_id,
38 | )
39 | prompt_len = inputs["input_ids"].shape[1]
40 | decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
41 |
42 | assert len(decoded_text), "No text was generated from the model."
43 |
44 |
45 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
46 | @pytest.mark.parametrize("model_name", torch_hf_models)
47 | @pytest.mark.parametrize("batch_size", [1, 2])
48 | def test_many_conversations(model_name: str, batch_size: int):
49 |
50 | from transformers import AutoModel, AutoProcessor
51 |
52 | model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
53 | processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
54 |
55 | prompt = "Describe the image in great detail."
56 | image = Image.open("assets/unum.png")
57 |
58 | texts = [prompt] * batch_size
59 | images = [image] * batch_size
60 | inputs = processor(text=texts, images=images, return_tensors="pt")
61 |
62 | with torch.inference_mode():
63 | output = model.generate(
64 | **inputs,
65 | do_sample=False,
66 | use_cache=True,
67 | max_new_tokens=10,
68 | pad_token_id=processor.tokenizer.pad_token_id,
69 | )
70 | prompt_len = inputs["input_ids"].shape[1]
71 | decoded_texts = processor.batch_decode(output[:, prompt_len:])
72 |
73 | assert all(len(decoded_text) for decoded_text in decoded_texts), "No text was generated from the model."
74 |
--------------------------------------------------------------------------------
/python/scripts/test_encoders.py:
--------------------------------------------------------------------------------
1 | from functools import wraps
2 | from typing import Tuple
3 | import requests
4 | from io import BytesIO
5 | import os
6 |
7 | import pytest
8 | import numpy as np
9 | from PIL import Image
10 |
11 | from uform import Modality, get_model, ExecutionProviderError
12 |
13 | # PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed
14 | try:
15 | import torch
16 |
17 | torch_available = True
18 | except ImportError:
19 | torch_available = False
20 |
21 | # ONNX is not a very light dependency either
22 | try:
23 | import onnx
24 |
25 | onnx_available = True
26 | except ImportError:
27 | onnx_available = False
28 |
29 | torch_models = [
30 | "unum-cloud/uform3-image-text-english-small",
31 | "unum-cloud/uform3-image-text-english-base",
32 | "unum-cloud/uform3-image-text-english-large",
33 | "unum-cloud/uform3-image-text-multilingual-base",
34 | ]
35 |
36 | onnx_models = [
37 | "unum-cloud/uform3-image-text-english-small",
38 | "unum-cloud/uform3-image-text-english-base",
39 | "unum-cloud/uform3-image-text-english-large",
40 | "unum-cloud/uform3-image-text-multilingual-base",
41 | ]
42 |
43 | # Let's check if the HuggingFace Hub API token is set in the environment variable.
44 | # If it's not there, check if the `.hf_token` file is present in the current working directory.
45 | token = os.getenv("HUGGINGFACE_HUB_TOKEN", None)
46 | if token is None:
47 | token_path = "./.hf_token"
48 | if os.path.exists(token_path):
49 | with open(token_path, "r") as file:
50 | token = file.read().strip()
51 |
52 |
53 | def skip_on(exception, reason="No good reason :)"):
54 | def decorator_func(f):
55 | @wraps(f)
56 | def wrapper(*args, **kwargs):
57 | try:
58 | # Try to run the test
59 | return f(*args, **kwargs)
60 | except exception:
61 | pytest.skip(reason)
62 |
63 | return wrapper
64 |
65 | return decorator_func
66 |
67 |
68 | def cosine_similarity(x, y) -> float:
69 | if not isinstance(x, np.ndarray):
70 | x = x.detach().numpy()
71 | if not isinstance(y, np.ndarray):
72 | y = y.detach().numpy()
73 |
74 | # Cast to float32, as NumPy (unlike SimSIMD) can't properly handle integer dot-products
75 | x = x.astype(np.float32).flatten()
76 | y = y.astype(np.float32).flatten()
77 | return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
78 |
79 |
80 | def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embedding, batch_size_multiple: int = 1):
81 | """Test if the embeddings of text and image are semantically similar
82 | using a small set of example text-image pairs."""
83 |
84 | texts = [
85 | "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.",
86 | "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.",
87 | "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
88 | "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.",
89 | "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.",
90 | ]
91 |
92 | image_urls = [
93 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true",
94 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true",
95 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true",
96 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true",
97 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true",
98 | ]
99 | assert len(texts) == len(image_urls), "Number of texts and images should be the same."
100 |
101 | images = [Image.open(BytesIO(requests.get(image_url).content)) for image_url in image_urls]
102 | count_pairs = len(texts)
103 |
104 | # Ensure we have a sufficiently large batch
105 | texts = texts * batch_size_multiple
106 | images = images * batch_size_multiple
107 |
108 | # Compute the embedding in a batch fashion
109 | text_embeddings = text_to_embedding(texts)
110 | image_embeddings = image_to_embedding(images)
111 |
112 | # Evaluate cosine similarity
113 | for i in range(count_pairs):
114 | pair_similarity = cosine_similarity(text_embeddings[i], image_embeddings[i])
115 | other_text_similarities = [
116 | cosine_similarity(text_embeddings[j], image_embeddings[i]) for j in range(count_pairs) if j != i
117 | ]
118 | other_image_similarities = [
119 | cosine_similarity(text_embeddings[i], image_embeddings[j]) for j in range(count_pairs) if j != i
120 | ]
121 |
122 | assert pair_similarity > max(
123 | other_text_similarities
124 | ), "Text should be more similar to its corresponding image than to other images."
125 | assert pair_similarity > max(
126 | other_image_similarities
127 | ), "Image should be more similar to its corresponding text than to other texts."
128 |
129 |
130 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
131 | @pytest.mark.parametrize("model_name", torch_models)
132 | def test_torch_one_embedding(model_name: str):
133 | processors, models = get_model(model_name, token=token, backend="torch")
134 | model_text = models[Modality.TEXT_ENCODER]
135 | model_image = models[Modality.IMAGE_ENCODER]
136 | processor_text = processors[Modality.TEXT_ENCODER]
137 | processor_image = processors[Modality.IMAGE_ENCODER]
138 |
139 | text = "a small red panda in a zoo"
140 | image_path = "assets/unum.png"
141 |
142 | image = Image.open(image_path)
143 | image_data = processor_image(image)
144 | text_data = processor_text(text)
145 |
146 | image_features, image_embedding = model_image.encode(image_data, return_features=True)
147 | text_features, text_embedding = model_text.encode(text_data, return_features=True)
148 |
149 | assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
150 | assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
151 |
152 | # Test if the model outputs actually make sense
153 | cross_references_image_and_text_embeddings(
154 | lambda text: model_text(processor_text(text)),
155 | lambda image: model_image(processor_image(image)),
156 | )
157 |
158 |
159 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
160 | @pytest.mark.parametrize("model_name", torch_models)
161 | @pytest.mark.parametrize("batch_size", [1, 2])
162 | def test_torch_many_embeddings(model_name: str, batch_size: int):
163 |
164 | processors, models = get_model(model_name, token=token, backend="torch")
165 | model_text = models[Modality.TEXT_ENCODER]
166 | model_image = models[Modality.IMAGE_ENCODER]
167 | processor_text = processors[Modality.TEXT_ENCODER]
168 | processor_image = processors[Modality.IMAGE_ENCODER]
169 |
170 | texts = ["a small red panda in a zoo"] * batch_size
171 | image_paths = ["assets/unum.png"] * batch_size
172 |
173 | images = [Image.open(path) for path in image_paths]
174 | image_data = processor_image(images)
175 | text_data = processor_text(texts)
176 |
177 | image_embeddings = model_image.encode(image_data, return_features=False)
178 | text_embeddings = model_text.encode(text_data, return_features=False)
179 |
180 | assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
181 | assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
182 |
183 |
184 | @pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
185 | @pytest.mark.parametrize("model_name", onnx_models)
186 | @pytest.mark.parametrize("device", ["CPUExecutionProvider"])
187 | @skip_on(ExecutionProviderError, reason="Missing execution provider")
188 | def test_onnx_one_embedding(model_name: str, device: str):
189 |
190 | processors, models = get_model(model_name, token=token, device=device, backend="onnx")
191 | model_text = models[Modality.TEXT_ENCODER]
192 | model_image = models[Modality.IMAGE_ENCODER]
193 | processor_text = processors[Modality.TEXT_ENCODER]
194 | processor_image = processors[Modality.IMAGE_ENCODER]
195 |
196 | text = "a small red panda in a zoo"
197 | image_path = "assets/unum.png"
198 |
199 | image = Image.open(image_path)
200 | image_data = processor_image(image)
201 | text_data = processor_text(text)
202 |
203 | image_features, image_embedding = model_image.encode(image_data)
204 | text_features, text_embedding = model_text.encode(text_data)
205 |
206 | assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
207 | assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
208 |
209 | # Nested functions are easier to debug than lambdas
210 | def get_image_embedding(image_data):
211 | features, embedding = model_image.encode(processor_image(image_data))
212 | return embedding
213 |
214 | def get_text_embedding(text_data):
215 | features, embedding = model_text.encode(processor_text(text_data))
216 | return embedding
217 |
218 | # Test if the model outputs actually make sense
219 | cross_references_image_and_text_embeddings(get_text_embedding, get_image_embedding)
220 |
221 |
222 | @pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
223 | @pytest.mark.parametrize("model_name", onnx_models)
224 | @pytest.mark.parametrize("batch_size", [1, 2])
225 | @pytest.mark.parametrize("device", ["CPUExecutionProvider"])
226 | @skip_on(ExecutionProviderError, reason="Missing execution provider")
227 | def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str):
228 |
229 | processors, models = get_model(model_name, token=token, device=device, backend="onnx")
230 | model_text = models[Modality.TEXT_ENCODER]
231 | model_image = models[Modality.IMAGE_ENCODER]
232 | processor_text = processors[Modality.TEXT_ENCODER]
233 | processor_image = processors[Modality.IMAGE_ENCODER]
234 |
235 | texts = ["a small red panda in a zoo"] * batch_size
236 | image_paths = ["assets/unum.png"] * batch_size
237 |
238 | images = [Image.open(path) for path in image_paths]
239 | image_data = processor_image(images)
240 | text_data = processor_text(texts)
241 |
242 | image_embeddings = model_image.encode(image_data, return_features=False)
243 | text_embeddings = model_text.encode(text_data, return_features=False)
244 |
245 | assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
246 | assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
247 |
248 |
249 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
250 | @pytest.mark.parametrize("model_name", torch_models[:1])
251 | def test_torch_multi_gpu(model_name: str):
252 |
253 | count_cuda_devices = torch.cuda.device_count()
254 | if count_cuda_devices < 2:
255 | pytest.skip("Not enough CUDA devices to run multi-GPU test")
256 |
257 | processors, models = get_model(model_name, token=token, backend="torch", device="cuda")
258 | model_text = models[Modality.TEXT_ENCODER]
259 | model_image = models[Modality.IMAGE_ENCODER]
260 | processor_text = processors[Modality.TEXT_ENCODER]
261 | processor_image = processors[Modality.IMAGE_ENCODER]
262 |
263 | import torch.nn as nn
264 |
265 | model_text.return_features = False
266 | model_image.return_features = False
267 | model_text_parallel = nn.DataParallel(model_text)
268 | model_image_parallel = nn.DataParallel(model_image)
269 |
270 | # Nested functions are easier to debug than lambdas
271 | def get_image_embedding(image_data):
272 | preprocessed = processor_image(image_data)
273 | embedding = model_image_parallel.forward(preprocessed)
274 | return embedding.detach().cpu().numpy()
275 |
276 | def get_text_embedding(text_data):
277 | preprocessed = processor_text(text_data)
278 | embedding = model_text_parallel.forward(preprocessed)
279 | return embedding.detach().cpu().numpy()
280 |
281 | # Test if the model outputs actually make sense
282 | cross_references_image_and_text_embeddings(
283 | get_text_embedding,
284 | get_image_embedding,
285 | batch_size_multiple=count_cuda_devices,
286 | )
287 |
288 |
289 | if __name__ == "__main__":
290 | # If you want to run this test file individually, you can do so by running:
291 | # pytest.main(["-s", "-x", __file__])
292 | pass
293 |
--------------------------------------------------------------------------------
/python/uform/__init__.py:
--------------------------------------------------------------------------------
1 | from os.path import join, exists
2 | from typing import Dict, Optional, Tuple, Literal, Union, Callable
3 |
4 | from huggingface_hub import snapshot_download, utils
5 |
6 | from uform.shared import ExecutionProviderError, Modality
7 |
8 |
9 | def _normalize_modalities(modalities: Optional[Tuple[Union[str, Modality], ...]]) -> Tuple[Modality, ...]:
10 | if modalities is None:
11 | return (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER, Modality.TEXT_DECODER, Modality.VIDEO_ENCODER)
12 |
13 | return tuple(x if isinstance(x, Modality) else Modality(x) for x in modalities)
14 |
15 |
16 | def get_checkpoint(
17 | model_name: str,
18 | modalities: Tuple[str, Modality],
19 | token: Optional[str] = None,
20 | format: Literal[".pt", ".onnx"] = ".pt",
21 | ) -> Tuple[str, Dict[Modality, str], Optional[str]]:
22 | """Downloads a model checkpoint from the Hugging Face Hub.
23 |
24 | :param model_name: The name of the model to download, like `unum-cloud/uform3-image-text-english-small`
25 | :param token: The Hugging Face API token, if required
26 | :param modalities: The modalities to download, like `("text_encoder", "image_encoder")`
27 | :param format: The format of the model checkpoint, either `.pt` or `.onnx`
28 | :return: A tuple of the config path, dictionary of paths to different modalities, and tokenizer path
29 | """
30 |
31 | modalities = _normalize_modalities(modalities)
32 |
33 | # It is not recommended to use `.pth` extension when checkpointing models
34 | # because it collides with Python path (`.pth`) configuration files.
35 | merged_model_names = [x + format for x in ["torch_weight", "weight", "model"]]
36 | separate_modality_names = [(x.value if isinstance(x, Modality) else x) + format for x in modalities]
37 | config_names = ["torch_config.json", "config.json"]
38 | tokenizer_names = ["tokenizer.json"]
39 |
40 | old_progress_behavior = utils.are_progress_bars_disabled()
41 | utils.disable_progress_bars()
42 |
43 | # The download stats depend on the number of times the `config.json` is pulled
44 | # https://huggingface.co/docs/hub/models-download-stats
45 | model_path = snapshot_download(
46 | repo_id=model_name,
47 | token=token,
48 | allow_patterns=merged_model_names + separate_modality_names + config_names + tokenizer_names,
49 | )
50 |
51 | if old_progress_behavior:
52 | utils.enable_progress_bars()
53 |
54 | # Find the first name in `config_names` that is present
55 | config_path = None
56 | for config_name in config_names:
57 | if exists(join(model_path, config_name)):
58 | config_path = join(model_path, config_name)
59 | break
60 |
61 | # Same for the tokenizer
62 | tokenizer_path = None
63 | for tokenizer_name in tokenizer_names:
64 | if exists(join(model_path, tokenizer_name)):
65 | tokenizer_path = join(model_path, tokenizer_name)
66 | break
67 |
68 | # Prefer a single merged checkpoint if one is present.
69 | # Otherwise, collect the separate per-modality files into a dictionary.
70 | modality_paths = None
71 | for file_name in merged_model_names:
72 | if exists(join(model_path, file_name)):
73 | modality_paths = join(model_path, file_name)
74 | break
75 |
76 | if modality_paths is None:
77 | modality_paths = {}
78 | for separate_modality_name in separate_modality_names:
79 | if exists(join(model_path, separate_modality_name)):
80 | modality_name, _, _ = separate_modality_name.partition(".")
81 | modality_paths[Modality(modality_name)] = join(model_path, separate_modality_name)
82 |
83 | return config_path, modality_paths, tokenizer_path
84 |
85 |
86 | def get_model_torch(
87 | model_name: str,
88 | *,
89 | token: Optional[str] = None,
90 | device: Literal["cpu", "cuda"] = "cpu",
91 | modalities: Optional[Tuple[Union[str, Modality]]] = None,
92 | ) -> Tuple[Dict[Modality, Callable], Dict]:
93 | """
94 | Fetches and constructs a PyTorch model with its processors based on provided modalities.
95 |
96 | :param model_name: The identifier of the model on the Hugging Face Hub.
97 | :param token: Optional API token for authenticated access to the model.
98 | :param device: The device to load the model onto ('cpu' or 'cuda').
99 | :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder).
100 | :return: A tuple containing dictionaries for processors and models keyed by their respective modalities.
101 | """
102 | from uform.torch_encoders import TextEncoder, ImageEncoder
103 | from uform.torch_processors import TextProcessor, ImageProcessor
104 |
105 | modalities = _normalize_modalities(modalities)
106 | config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".pt")
107 |
108 | result_processors = {}
109 | result_models = {}
110 |
111 | if Modality.TEXT_ENCODER in modalities:
112 | processor = TextProcessor(config_path, tokenizer_path)
113 | encoder = TextEncoder.from_pretrained(config_path, modality_paths.get(Modality.TEXT_ENCODER))
114 | encoder = encoder.eval().to(device)
115 | result_processors[Modality.TEXT_ENCODER] = processor
116 | result_models[Modality.TEXT_ENCODER] = encoder
117 |
118 | if Modality.IMAGE_ENCODER in modalities:
119 | processor = ImageProcessor(config_path)
120 | encoder = ImageEncoder.from_pretrained(config_path, modality_paths.get(Modality.IMAGE_ENCODER))
121 | encoder = encoder.eval().to(device)
122 | result_processors[Modality.IMAGE_ENCODER] = processor
123 | result_models[Modality.IMAGE_ENCODER] = encoder
124 |
125 | return result_processors, result_models
126 |
127 |
128 | def get_model_onnx(
129 | model_name: str,
130 | *,
131 | device: Literal["cpu", "cuda"] = "cpu",
132 | token: Optional[str] = None,
133 | modalities: Optional[Tuple[str]] = None,
134 | ):
135 | """
136 | Fetches and constructs an ONNX model with its processors based on provided modalities.
137 |
138 | :param model_name: The identifier of the model on the Hugging Face Hub.
139 | :param device: The device on which the model will operate ('cpu' or 'cuda').
140 | :param token: Optional API token for authenticated access to the model.
141 | :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder).
142 | :return: A tuple containing dictionaries for processors and models keyed by their respective modalities.
143 | """
144 | from uform.onnx_encoders import TextEncoder, ImageEncoder
145 | from uform.numpy_processors import TextProcessor, ImageProcessor
146 |
147 | modalities = _normalize_modalities(modalities)
148 | config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".onnx")
149 |
150 | result_processors = {}
151 | result_models = {}
152 |
153 | if Modality.TEXT_ENCODER in modalities:
154 | processor = TextProcessor(config_path, tokenizer_path)
155 | encoder = TextEncoder(modality_paths.get(Modality.TEXT_ENCODER), device=device)
156 | result_processors[Modality.TEXT_ENCODER] = processor
157 | result_models[Modality.TEXT_ENCODER] = encoder
158 |
159 | if Modality.IMAGE_ENCODER in modalities:
160 | processor = ImageProcessor(config_path)
161 | encoder = ImageEncoder(modality_paths.get(Modality.IMAGE_ENCODER), device=device)
162 | result_processors[Modality.IMAGE_ENCODER] = processor
163 | result_models[Modality.IMAGE_ENCODER] = encoder
164 |
165 | return result_processors, result_models
166 |
167 |
168 | def get_model(
169 | model_name: str,
170 | *,
171 | device: Literal["cpu", "cuda"] = "cpu", # change this if you have a GPU
172 | backend: Literal["onnx", "torch"] = "onnx", # lighter = better
173 | modalities: Optional[Tuple[str, Modality]] = None, # all by default
174 | token: Optional[str] = None, # optional HuggingFace Hub token for private models
175 | ) -> Tuple[Dict[Modality, Callable], Dict]:
176 | """
177 | Fetches a model and its processors from the Hugging Face Hub, using either the ONNX or Torch backend.
178 |
179 | :param model_name: The identifier of the model on the Hugging Face Hub.
180 | :param device: The device to load the model onto ('cpu' or 'cuda').
181 | :param backend: The backend framework to use ('onnx' or 'torch').
182 | :param modalities: A tuple specifying the types of model components to fetch.
183 | :param token: Optional API token for authenticated access to the model.
184 | :return: A tuple containing dictionaries for processors and models keyed by their respective modalities.
185 | """
186 | if backend == "onnx":
187 | return get_model_onnx(model_name, device=device, token=token, modalities=modalities)
188 | elif backend == "torch":
189 | return get_model_torch(model_name, device=device, token=token, modalities=modalities)
190 | else:
191 | raise ValueError(f"Unknown backend: {backend}")
192 |
--------------------------------------------------------------------------------
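
A minimal usage sketch of the `get_model` API defined above, mirroring what the test suite does; the model name is one of the repos listed elsewhere in this dump, and `assets/unum.png` is the sample image the repository ships with.

from PIL import Image
from uform import Modality, get_model

# Download both encoders and their processors (Torch backend; "onnx" also works)
processors, models = get_model(
    "unum-cloud/uform3-image-text-english-small",
    backend="torch",
    device="cpu",
)

text_data = processors[Modality.TEXT_ENCODER]("a small red panda in a zoo")
image_data = processors[Modality.IMAGE_ENCODER](Image.open("assets/unum.png"))

_, text_embedding = models[Modality.TEXT_ENCODER].encode(text_data, return_features=True)
_, image_embedding = models[Modality.IMAGE_ENCODER].encode(image_data, return_features=True)
print(text_embedding.shape, image_embedding.shape)
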
/python/uform/chat.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 |
3 | import requests
4 | import torch
5 | from PIL import Image
6 | from transformers import TextStreamer, AutoModel, AutoProcessor
7 |
8 |
9 | def parse_args():
10 | parser = ArgumentParser(description="Chat with UForm generative model")
11 |
12 | parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat", help="Model name or path")
13 | parser.add_argument("--image", type=str, required=True, help="Path to image or URL")
14 | parser.add_argument("--device", type=str, required=True, help="Device to run on, like `cpu` or `cuda:0`")
15 | parser.add_argument("--fp16", action="store_true", help="Use half-precision (bfloat16) math for faster inference")
16 |
17 | return parser.parse_args()
18 |
19 |
20 | def run_chat(opts, model, processor):
21 | streamer = TextStreamer(
22 | processor.tokenizer,
23 | skip_prompt=True,
24 | skip_special_tokens=True,
25 | )
26 |
27 | messages = [{"role": "system", "content": "You are a helpful assistant."}]
28 | is_first_message = True
29 |
30 | if opts.image.startswith("http"):
31 | image = Image.open(requests.get(opts.image, stream=True).raw)
32 | else:
33 | image = Image.open(opts.image)
34 |
35 | image = (
36 | processor.feature_extractor(image) #
37 | .unsqueeze(0)
38 | .to(torch.bfloat16 if opts.fp16 else torch.float32)
39 | .to(opts.device)
40 | )
41 |
42 | while True:
43 | if messages[-1]["role"] in ("system", "assistant"):
44 | message = input("User: ")
45 | if is_first_message:
46 | message = f" {message}"
47 | is_first_message = False
48 | messages.append({"role": "user", "content": message})
49 |
50 | print()
51 |
52 | else:
53 | input_ids = processor.tokenizer.apply_chat_template(
54 | messages,
55 | return_tensors="pt",
56 | add_generation_prompt=True,
57 | ).to(opts.device)
58 |
59 | attention_mask = torch.ones(
60 | 1,
61 | input_ids.shape[1] + processor.num_image_latents - 1,
62 | ).to(opts.device)
63 | inputs = {
64 | "input_ids": input_ids,
65 | "attention_mask": attention_mask,
66 | "images": image,
67 | }
68 |
69 | print("Assistant: ", end="")
70 | with torch.inference_mode():
71 | output = model.generate(
72 | **inputs,
73 | do_sample=False,
74 | use_cache=True,
75 | max_new_tokens=1024,
76 | eos_token_id=151645,
77 | pad_token_id=processor.tokenizer.pad_token_id,
78 | streamer=streamer,
79 | )
80 | print()
81 |
82 | prompt_len = inputs["input_ids"].shape[1]
83 | message = processor.batch_decode(output[:, prompt_len:-1])[0]
84 |
85 | messages.append({"role": "assistant", "content": message})
86 |
87 |
88 | def main():
89 | try:
90 | opts = parse_args()
91 | processor = AutoProcessor.from_pretrained(opts.model, trust_remote_code=True)
92 | model = (
93 | AutoModel.from_pretrained(
94 | opts.model,
95 | torch_dtype=torch.bfloat16 if opts.fp16 else torch.float32,
96 | ignore_mismatched_sizes=True,
97 | trust_remote_code=True,
98 | )
99 | .eval()
100 | .to(opts.device)
101 | )
102 |
103 | run_chat(opts, model, processor)
104 |
105 | except KeyboardInterrupt:
106 | print("Bye!")
107 | pass
108 |
109 |
110 | if __name__ == "__main__":
111 | main()
112 |
--------------------------------------------------------------------------------
/python/uform/gen_model.py:
--------------------------------------------------------------------------------
1 | from uform.torch_decoders import VLMForCausalLM, VLMProcessor # legacy path
2 |
--------------------------------------------------------------------------------
/python/uform/numpy_processors.py:
--------------------------------------------------------------------------------
1 | from os import PathLike
2 | from typing import Dict, List, Union, Sequence
3 | import json
4 |
5 | from PIL.Image import Image, BICUBIC
6 | from tokenizers import Tokenizer
7 | import numpy as np
8 |
9 | from uform.shared import read_config
10 |
11 |
12 | class TextProcessor:
13 | def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
14 | """
15 | :param config_path: path to the model config
16 | :param tokenizer_path: path to tokenizer file
17 | """
18 |
19 | config = read_config(config_path)
20 | if "text_encoder" in config:
21 | config = config["text_encoder"]
22 |
23 | self._max_seq_len = config["max_position_embeddings"]
24 | self._tokenizer = Tokenizer.from_file(tokenizer_path)
25 | self._tokenizer.no_padding()
26 | self._pad_token_idx = config["padding_idx"]
27 |
28 | def __call__(self, texts: Union[str, Sequence[str]]) -> Dict[str, np.ndarray]:
29 | """Transforms one or more strings into a dictionary with tokenized strings and attention masks.
30 |
31 | :param texts: text or list of texts to tokenize
32 | """
33 | if isinstance(texts, str):
34 | texts = [texts]
35 |
36 | input_ids = np.full(
37 | (len(texts), self._max_seq_len),
38 | fill_value=self._pad_token_idx,
39 | dtype=np.int32,
40 | )
41 |
42 | attention_mask = np.zeros(
43 | (len(texts), self._max_seq_len),
44 | dtype=np.int32,
45 | )
46 | encoded = self._tokenizer.encode_batch(texts)
47 |
48 | for i, seq in enumerate(encoded):
49 | seq_len = min(len(seq), self._max_seq_len)
50 | input_ids[i, :seq_len] = seq.ids[:seq_len]
51 |
52 | attention_mask[i, :seq_len] = 1
53 |
54 | return {"input_ids": input_ids, "attention_mask": attention_mask}
55 |
56 |
57 | class ImageProcessor:
58 | def __init__(self, config_path: PathLike, tokenizer_path: PathLike = None):
59 | """
60 | :param config_path: path to the model config
61 | :param tokenizer_path: unused, accepted only for signature compatibility
62 | (this processor always returns NumPy arrays)
63 | """
64 |
65 | config = read_config(config_path)
66 | if "image_encoder" in config:
67 | config = config["image_encoder"]
68 |
69 | self._image_size = config["image_size"]
70 | self._normalization_means = config["normalization_means"]
71 | self._normalization_deviations = config["normalization_deviations"]
72 |
73 | assert isinstance(self._image_size, int) and self._image_size > 0
74 | assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list)
75 | assert len(self._normalization_means) == len(self._normalization_deviations) == 3
76 |
77 | self.image_mean = np.array(self._normalization_means, dtype=np.float32)[None, None]
78 | self.image_std = np.array(self._normalization_deviations, dtype=np.float32)[None, None]
79 |
80 | def __call__(self, images: Union[Image, Sequence[Image]]) -> np.ndarray:
81 | """Transforms one or more Pillow images into NumPy arrays.
82 |
83 | :param images: image or list of images to preprocess
84 | """
85 |
86 | if isinstance(images, Sequence):
87 | batch_images = np.empty(
88 | (len(images), 3, self._image_size, self._image_size),
89 | dtype=np.float32,
90 | )
91 |
92 | for i, image in enumerate(images):
93 | batch_images[i] = self._resize_crop_normalize(image)
94 |
95 | else:
96 | batch_images = self._resize_crop_normalize(images)[None]
97 |
98 | return batch_images
99 |
100 | def _resize_crop_normalize(self, image: Image):
101 | width, height = image.size
102 |
103 | if width < height:
104 | height = int(height / width * self._image_size)  # scale using the original width
105 | width = self._image_size
106 | else:
107 | width = int(width / height * self._image_size)
108 | height = self._image_size
109 |
110 | image = image.resize((width, height), resample=BICUBIC)
111 |
112 | left = (width - self._image_size) / 2
113 | top = (height - self._image_size) / 2
114 | right = (width + self._image_size) / 2
115 | bottom = (height + self._image_size) / 2
116 |
117 | image = image.convert("RGB").crop((left, top, right, bottom))
118 | # At this point `image` is a PIL Image with RGB channels.
119 | # If you convert it to `np.ndarray` it will have shape (H, W, C) where C is the number of channels.
120 | image = (np.array(image).astype(np.float32) / 255.0 - self.image_mean) / self.image_std
121 |
122 | # To make it compatible with PyTorch, we need to transpose the image to (C, H, W).
123 | return np.transpose(image, (2, 0, 1))
124 |
--------------------------------------------------------------------------------
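
A short sketch of using the NumPy processors directly, assuming the checkpoint is fetched with `get_checkpoint` as defined in `__init__.py`; the shapes in the final comments follow from the code above.

from PIL import Image
from uform import Modality, get_checkpoint
from uform.numpy_processors import ImageProcessor, TextProcessor

# Fetch the ONNX flavour of the checkpoint to get the config and tokenizer paths
config_path, modality_paths, tokenizer_path = get_checkpoint(
    "unum-cloud/uform3-image-text-english-small",
    modalities=[Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER],
    format=".onnx",
)

texts_batch = TextProcessor(config_path, tokenizer_path)(["a small red panda in a zoo"])
images_batch = ImageProcessor(config_path)(Image.open("assets/unum.png"))
print(texts_batch["input_ids"].shape)  # (1, max_seq_len)
print(images_batch.shape)              # (1, 3, image_size, image_size)
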
/python/uform/onnx_encoders.py:
--------------------------------------------------------------------------------
1 | from os import PathLike
2 | from typing import Dict, Optional, Tuple, Union, Literal
3 | import json
4 |
5 | import onnxruntime as ort
6 | from numpy import ndarray
7 |
8 | from uform.shared import ExecutionProviderError
9 |
10 |
11 | def available_providers(device: Optional[str]) -> Tuple[str, ...]:
12 | """Returns a tuple of available execution providers based on the requested device.
13 | https://onnxruntime.ai/docs/execution-providers/
14 |
15 | :param device: Device name, either `cpu` or `gpu`, or a specific execution provider name.
16 | :return: Tuple of available execution providers.
17 | :raises ExecutionProviderError: If the requested device is not available.
18 | """
19 |
20 | gpu_providers = ("CUDAExecutionProvider", "TensorrtExecutionProvider")
21 | cpu_providers = ("OpenVINOExecutionProvider", "CoreMLExecutionProvider", "CPUExecutionProvider")
22 | available = ort.get_available_providers()
23 |
24 | # If no target device is specified, let's sort all the available ones with respect to our preference
25 | if device is None:
26 | preferences = gpu_providers + cpu_providers
27 | filtered_preferences = tuple(provider for provider in preferences if provider in available)
28 | if len(filtered_preferences):
29 | return filtered_preferences
30 | if len(available):
31 | return available
32 | raise ExecutionProviderError("No execution providers are available")
33 |
34 | # If a GPU is requested, but no GPU providers are available, raise an error
35 | if device == "gpu" or device == "cuda":
36 | if all(provider not in available for provider in gpu_providers):
37 | raise ExecutionProviderError(
38 | f"GPU providers are not available, consider installing `onnxruntime-gpu` and making sure CUDA is available on your system. Currently installed: {available}"
39 | )
40 | return tuple(x for x in gpu_providers if x in available)
41 |
42 | # If a CPU is requested, but no CPU providers are available, raise an error
43 | if device == "cpu":
44 | if all(provider not in available for provider in cpu_providers):
45 | raise ExecutionProviderError(
46 | f"CPU providers are not available, consider installing `onnxruntime` and making sure OpenVINO or CoreML is available on your system. Currently installed: {available}"
47 | )
48 | return tuple(x for x in cpu_providers if x in available)
49 |
50 | if device not in available:
51 | available_providers = ", ".join(available)
52 | raise ExecutionProviderError(
53 | f"Execution provider {device} is not available. Currently installed: {available_providers}"
54 | )
55 |
56 | return (device,)
57 |
58 |
59 | class ImageEncoder:
60 | def __init__(
61 | self,
62 | model_path: str,
63 | *,
64 | device: Literal["cpu", "cuda"] = "cpu",
65 | return_features: bool = True,
66 | ):
67 | """
68 | :param model_path: Path to onnx model
69 | :param device: Device name, either cpu or gpu
70 | """
71 |
72 | session_options = ort.SessionOptions()
73 | session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
74 |
75 | self.return_features = return_features
76 | self.session = ort.InferenceSession(
77 | model_path,
78 | sess_options=session_options,
79 | providers=available_providers(device),
80 | )
81 |
82 | def encode(
83 | self, images: ndarray, return_features: Optional[bool] = None
84 | ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
85 | features, embeddings = self.session.run(None, {"images": images})
86 | return_features = return_features if return_features is not None else self.return_features
87 | if return_features:
88 | return features, embeddings
89 | return embeddings
90 |
91 |
92 | class TextEncoder:
93 | def __init__(
94 | self,
95 | model_path: str,
96 | *,
97 | device: Literal["cpu", "cuda"] = "cpu",
98 | return_features: bool = True,
99 | ):
100 | """
101 | :param model_path: Path to the ONNX model of the text encoder
102 | :param device: Device name, either cpu or gpu
103 | """
104 |
105 | session_options = ort.SessionOptions()
106 | session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
107 |
108 | self.return_features = return_features
109 | self.text_encoder_session = ort.InferenceSession(
110 | model_path,
111 | sess_options=session_options,
112 | providers=available_providers(device),
113 | )
114 |
115 | def encode(
116 | self,
117 | x: Union[ndarray, dict],
118 | attention_mask: Optional[ndarray] = None,
119 | return_features: Optional[bool] = None,
120 | ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
121 | if isinstance(x, dict):
122 | assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None"
123 | attention_mask = x["attention_mask"]
124 | input_ids = x["input_ids"]
125 | else:
126 | input_ids = x
127 |
128 | features, embeddings = self.text_encoder_session.run(
129 | None,
130 | {
131 | "input_ids": input_ids,
132 | "attention_mask": attention_mask,
133 | },
134 | )
135 |
136 | return_features = return_features if return_features is not None else self.return_features
137 | if return_features:
138 | return features, embeddings
139 | return embeddings
140 |
--------------------------------------------------------------------------------
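
A sketch of how `available_providers` resolves a device name into ONNX Runtime execution providers, and how a missing provider surfaces as `ExecutionProviderError`; the printed values depend on which `onnxruntime` build is installed.

from uform.onnx_encoders import available_providers
from uform.shared import ExecutionProviderError

print(available_providers(None))   # every installed provider, GPU-first
print(available_providers("cpu"))  # CPU-class providers only

try:
    providers = available_providers("cuda")
except ExecutionProviderError as error:
    print(f"Falling back to CPU: {error}")
    providers = available_providers("cpu")

# `ImageEncoder` and `TextEncoder` call `available_providers(device)` internally,
# so passing a device string (or a concrete provider name) is usually enough.
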
/python/uform/shared.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from typing import Union
3 | from os import PathLike
4 | import json
5 |
6 |
7 | class Modality(Enum):
8 | TEXT_ENCODER = "text_encoder"
9 | IMAGE_ENCODER = "image_encoder"
10 | VIDEO_ENCODER = "video_encoder"
11 | TEXT_DECODER = "text_decoder"
12 |
13 |
14 | class ExecutionProviderError(Exception):
15 | """Exception raised when a requested execution provider is not available."""
16 |
17 |
18 | ConfigOrPath = Union[PathLike, str, object]
19 |
20 |
21 | def read_config(path_or_object: ConfigOrPath) -> object:
22 | if isinstance(path_or_object, (PathLike, str)):
23 | with open(path_or_object, "r") as f:
24 | return json.load(f)
25 | else:
26 | return path_or_object
27 |
--------------------------------------------------------------------------------
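
A tiny sketch of the helpers above: `read_config` accepts either a JSON path or an already-parsed object, and `Modality` round-trips from its string values; `config.json` is a hypothetical path.

from uform.shared import Modality, read_config

config = read_config("config.json")  # hypothetical path to a model config
same_config = read_config(config)    # already-parsed objects pass through unchanged
assert same_config is config

print(Modality("text_encoder") is Modality.TEXT_ENCODER)  # True
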
/python/uform/torch_decoders.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Tuple, Union
2 |
3 | import torch
4 | import torch.nn.functional as F
5 | from torch import nn
6 | from torchvision.transforms import (
7 | CenterCrop,
8 | Compose,
9 | InterpolationMode,
10 | Normalize,
11 | RandomResizedCrop,
12 | Resize,
13 | ToTensor,
14 | )
15 | from transformers import AutoConfig, AutoTokenizer
16 | from transformers.configuration_utils import PretrainedConfig
17 | from transformers.modeling_outputs import CausalLMOutputWithPast
18 | from transformers.modeling_utils import PreTrainedModel
19 | from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM
20 | from transformers.processing_utils import ProcessorMixin
21 | from transformers.tokenization_utils_base import BatchEncoding
22 |
23 | from uform.torch_encoders import ImageEncoder
24 |
25 | IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
26 | IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)
27 |
28 |
29 | def convert_to_rgb(image):
30 | return image.convert("RGB")
31 |
32 |
33 | class LayerScale(nn.Module):
34 | def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False):
35 | super().__init__()
36 | self.weight = nn.Parameter(init_values * torch.ones(dim))
37 | self.inplace = inplace
38 |
39 | def forward(self, x):
40 | return x.mul_(self.weight) if self.inplace else x * self.weight
41 |
42 |
43 | class ImageFeaturesPooler(nn.Module):
44 | def __init__(
45 | self,
46 | input_size,
47 | hidden_size,
48 | num_attn_heads,
49 | intermediate_size,
50 | num_latents,
51 | initializer_range,
52 | ):
53 | super().__init__()
54 | self.projection = nn.Linear(input_size, hidden_size)
55 |
56 | self.pooler = nn.TransformerDecoderLayer(
57 | hidden_size,
58 | num_attn_heads,
59 | intermediate_size,
60 | activation=nn.functional.silu,
61 | batch_first=True,
62 | norm_first=True,
63 | )
64 | self.image_latents = nn.Parameter(
65 | torch.randn(1, num_latents, hidden_size) * initializer_range**0.5,
66 | )
67 |
68 | def forward(self, features):
69 | features = self.projection(features)
70 | return self.pooler(
71 | self.image_latents.expand(features.shape[0], -1, -1),
72 | features,
73 | )
74 |
75 |
76 | class VLMConfig(PretrainedConfig):
77 | model_type = "vlm"
78 |
79 | def __init__(
80 | self,
81 | text_decoder_name_or_path: str = "",
82 | tokenizer_name_or_path: str = "",
83 | image_size: int = 224,
84 | image_encoder_hidden_size: int = 768,
85 | image_encoder_patch_size: int = 16,
86 | image_encoder_num_layers: int = 12,
87 | image_encoder_num_heads: int = 12,
88 | image_encoder_embedding_dim: int = 256,
89 | image_encoder_pooling: str = "cls",
90 | image_pooler_num_attn_heads: int = 16,
91 | image_pooler_intermediate_size: int = 5504,
92 | image_pooler_num_latents: int = 196,
93 | image_token_id: int = 32002,
94 | initializer_range: float = 0.02,
95 | use_cache: bool = True,
96 | center_crop: bool = True,
97 | **kwargs,
98 | ):
99 | self.text_decoder_name_or_path = text_decoder_name_or_path
100 | self.tokenizer_name_or_path = tokenizer_name_or_path
101 |
102 | self.image_size = image_size
103 | self.image_encoder_hidden_size = image_encoder_hidden_size
104 | self.image_encoder_patch_size = image_encoder_patch_size
105 | self.image_encoder_num_layers = image_encoder_num_layers
106 | self.image_encoder_num_heads = image_encoder_num_heads
107 | self.image_encoder_embedding_dim = image_encoder_embedding_dim
108 | self.image_encoder_pooling = image_encoder_pooling
109 |
110 | self.image_pooler_num_attn_heads = image_pooler_num_attn_heads
111 | self.image_pooler_intermediate_size = image_pooler_intermediate_size
112 | self.image_pooler_num_latents = image_pooler_num_latents
113 |
114 | self.image_token_id = image_token_id
115 |
116 | self.initializer_range = initializer_range
117 | self.use_cache = use_cache
118 | self.center_crop = center_crop
119 |
120 | super().__init__(**kwargs)
121 |
122 |
123 | class VLMPreTrainedModel(PreTrainedModel):
124 | config_class = VLMConfig
125 | base_model_prefix = "vlm"
126 | supports_gradient_checkpointing = True
127 | _no_split_modules = []
128 | _skip_keys_device_placement = "past_key_values"
129 |
130 | def _init_weights(self, module):
131 | pass
132 |
133 | def _initialize_weights(self, module):
134 | pass
135 |
136 |
137 | class VLMForCausalLM(VLMPreTrainedModel):
138 | def __init__(self, config: VLMConfig):
139 | super().__init__(config)
140 |
141 | self.config = config
142 | self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path)
143 | self.text_config.vocab_size += 3
144 | self.text_decoder = AutoModelForCausalLM.from_config(self.text_config)
145 |
146 | self.image_encoder = ImageEncoder(
147 | self.config.image_encoder_hidden_size,
148 | self.config.image_encoder_patch_size,
149 | self.config.image_size,
150 | self.config.image_encoder_num_layers,
151 | self.config.image_encoder_num_heads,
152 | self.config.image_encoder_embedding_dim,
153 | self.config.image_encoder_pooling,
154 | )
155 |
156 | # replace models' layerscales because `transformers` automatically renames keys in `state_dict`
157 | for i in range(len(self.image_encoder.blocks)):
158 | self.image_encoder.blocks[i].ls1 = LayerScale(
159 | self.image_encoder.blocks[i].ls1.dim,
160 | )
161 | self.image_encoder.blocks[i].ls2 = LayerScale(
162 | self.image_encoder.blocks[i].ls2.dim,
163 | )
164 |
165 | self.image_pooler = ImageFeaturesPooler(
166 | self.config.image_encoder_hidden_size,
167 | self.text_config.hidden_size,
168 | self.config.image_pooler_num_attn_heads,
169 | self.config.image_pooler_intermediate_size,
170 | self.config.image_pooler_num_latents,
171 | self.config.initializer_range,
172 | )
173 |
174 | def get_input_embeddings(self):
175 | return self.text_decoder.get_input_embeddings()
176 |
177 | def set_input_embeddings(self, value):
178 | self.text_decoder.set_input_embeddings(value)
179 |
180 | def get_images_embeddings(self, images):
181 | features = self.image_encoder.forward_features(images)
182 | return self.image_pooler(features)
183 |
184 | def gather_continuous_embeddings(
185 | self,
186 | input_ids: torch.Tensor,
187 | word_embeddings: torch.Tensor,
188 | image_embeddings: torch.Tensor,
189 | ) -> torch.Tensor:
190 | start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1]
191 | embeddings = []
192 |
193 | for sample_idx, start_idx in enumerate(start_indices.tolist()):
194 | embeddings.append(
195 | torch.cat(
196 | (
197 | word_embeddings[sample_idx, :start_idx],
198 | image_embeddings[sample_idx],
199 | word_embeddings[sample_idx, start_idx + 1 :],
200 | ),
201 | dim=0,
202 | ),
203 | )
204 |
205 | return torch.stack(embeddings, dim=0)
206 |
207 | def forward(
208 | self,
209 | input_ids: torch.LongTensor = None,
210 | images: torch.Tensor = None,
211 | attention_mask: Optional[torch.Tensor] = None,
212 | position_ids: Optional[torch.LongTensor] = None,
213 | past_key_values: Optional[List[torch.FloatTensor]] = None,
214 | inputs_embeds: Optional[torch.FloatTensor] = None,
215 | use_cache: Optional[bool] = None,
216 | labels: Optional[torch.Tensor] = None,
217 | output_attentions: Optional[bool] = None,
218 | output_hidden_states: Optional[bool] = None,
219 | return_dict: Optional[bool] = None,
220 | ) -> Union[dict, Tuple, CausalLMOutputWithPast]:
221 |
222 | output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
223 | output_hidden_states = (
224 | output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
225 | )
226 | use_cache = use_cache if use_cache is not None else self.config.use_cache
227 |
228 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict
229 |
230 | if input_ids is not None and inputs_embeds is not None:
231 | raise ValueError(
232 | "You cannot specify both input_ids and inputs_embeds at the same time",
233 | )
234 | elif input_ids is None and inputs_embeds is None:
235 | raise ValueError("You have to specify either input_ids or inputs_embeds")
236 |
237 | if inputs_embeds is None and past_key_values is None:
238 | inputs_embeds = self.get_input_embeddings()(input_ids)
239 |
240 | if images is not None:
241 | image_embeds = self.get_images_embeddings(images)
242 | inputs_embeds = self.gather_continuous_embeddings(
243 | input_ids,
244 | inputs_embeds,
245 | image_embeds,
246 | )
247 |
248 | if position_ids is None:
249 | seq_length = inputs_embeds.shape[1] if inputs_embeds is not None else input_ids.shape[1]
250 | past_key_values_length = 0
251 |
252 | if past_key_values is not None:
253 | past_key_values_length = past_key_values[0][0].shape[2]
254 |
255 | device = input_ids.device if input_ids is not None else inputs_embeds.device
256 | position_ids = torch.arange(
257 | past_key_values_length,
258 | seq_length + past_key_values_length,
259 | dtype=torch.long,
260 | device=device,
261 | )
262 | position_ids = position_ids.unsqueeze(0)
263 |
264 | outputs = self.text_decoder(
265 | inputs_embeds=inputs_embeds,
266 | input_ids=input_ids if past_key_values is not None else None,
267 | attention_mask=attention_mask,
268 | labels=labels,
269 | position_ids=position_ids,
270 | past_key_values=past_key_values,
271 | output_attentions=output_attentions,
272 | output_hidden_states=output_hidden_states,
273 | use_cache=use_cache,
274 | return_dict=return_dict,
275 | )
276 |
277 | return outputs
278 |
279 | def prepare_inputs_for_generation(
280 | self,
281 | input_ids,
282 | images=None,
283 | past_key_values=None,
284 | attention_mask=None,
285 | inputs_embeds=None,
286 | **kwargs,
287 | ):
288 | if past_key_values:
289 | input_ids = input_ids[:, -1:]
290 |
291 | position_ids = kwargs.get("position_ids", None)
292 | if attention_mask is not None and position_ids is None:
293 | # create position_ids on the fly for batch generation
294 | position_ids = attention_mask.long().cumsum(-1) - 1
295 | position_ids.masked_fill_(attention_mask == 0, 1)
296 | if past_key_values:
297 | position_ids = position_ids[:, -1].unsqueeze(-1)
298 |
299 | # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
300 | if inputs_embeds is not None and past_key_values is None:
301 | model_inputs = {"inputs_embeds": inputs_embeds}
302 | else:
303 | model_inputs = {"input_ids": input_ids}
304 |
305 | if images is not None:
306 | model_inputs["images"] = images
307 |
308 | model_inputs.update(
309 | {
310 | "position_ids": position_ids,
311 | "past_key_values": past_key_values,
312 | "use_cache": kwargs.get("use_cache"),
313 | "attention_mask": attention_mask,
314 | "images": images if past_key_values is None else None,
315 | },
316 | )
317 | return model_inputs
318 |
319 | @classmethod
320 | def from_config(cls, config, **kwargs):
321 | return cls._from_config(config, **kwargs)
322 |
323 |
324 | class VLMProcessor(ProcessorMixin):
325 | def __init__(self, config, **kwargs):
326 | self.feature_extractor = None
327 | self.config = config
328 |
329 | if config.center_crop:
330 | self.image_processor = Compose(
331 | [
332 | Resize(256, interpolation=InterpolationMode.BICUBIC),
333 | CenterCrop(config.image_size),
334 | convert_to_rgb,
335 | ToTensor(),
336 | Normalize(
337 | mean=IMAGENET_MEAN,
338 | std=IMAGENET_STD,
339 | ),
340 | ],
341 | )
342 | else:
343 | self.image_processor = Compose(
344 | [
345 | RandomResizedCrop(
346 | config.image_size,
347 | scale=(0.8, 1),
348 | interpolation=InterpolationMode.BICUBIC,
349 | ),
350 | convert_to_rgb,
351 | ToTensor(),
352 | Normalize(
353 | mean=IMAGENET_MEAN,
354 | std=IMAGENET_STD,
355 | ),
356 | ],
357 | )
358 |
359 | self.tokenizer = AutoTokenizer.from_pretrained(
360 | config.tokenizer_name_or_path,
361 | additional_special_tokens=["<|im_end|>"],
362 | )
363 | self.num_image_latents = config.image_pooler_num_latents
364 |
365 | def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs):
366 | if texts is not None:
367 | if isinstance(texts, str):
368 | texts = [texts]
369 |
370 | tokenized_texts = []
371 | for text in texts:
372 | messages = [
373 | {"role": "system", "content": "You are a helpful assistant."},
374 | {"role": "user", "content": f" {text}"},
375 | ]
376 | tokenized_prompt = self.tokenizer.apply_chat_template(
377 | messages,
378 | add_generation_prompt=True,
379 | return_tensors=return_tensors,
380 | )
381 |
382 | tokenized_texts.append(tokenized_prompt)
383 |
384 | max_len = max(len(t[0]) for t in tokenized_texts)
385 | input_ids = torch.full(
386 | (len(tokenized_texts), max_len),
387 | fill_value=self.tokenizer.pad_token_id,
388 | dtype=torch.int64,
389 | )
390 | attention_mask = torch.full(
391 | (len(tokenized_texts), max_len),
392 | fill_value=0,
393 | dtype=torch.int64,
394 | )
395 |
396 | for i, tokens in enumerate(tokenized_texts):
397 | input_ids[i, -len(tokens[0]) :] = tokens[0]
398 | attention_mask[i, -len(tokens[0]) :] = 1
399 |
400 | attention_mask = F.pad(
401 | attention_mask,
402 | pad=(0, self.num_image_latents - 1),
403 | value=1,
404 | )
405 |
406 | encoding = BatchEncoding(
407 | data={
408 | "input_ids": input_ids,
409 | "attention_mask": attention_mask,
410 | },
411 | )
412 |
413 | if images is not None:
414 | if isinstance(images, (list, tuple)):
415 | image_features = torch.empty(
416 | (len(images), 3, self.config.image_size, self.config.image_size),
417 | dtype=torch.float32,
418 | )
419 |
420 | for i, image in enumerate(images):
421 | image_features[i] = self.image_processor(image)
422 | else:
423 | image_features = self.image_processor(images).unsqueeze(0)
424 |
425 | if texts is not None and images is not None:
426 | encoding["images"] = image_features
427 | return encoding
428 |
429 | if texts is not None:
430 | return encoding
431 |
432 | return BatchEncoding(
433 | data={
434 | "images": image_features,
435 | },
436 | tensor_type=return_tensors,
437 | )
438 |
439 | def batch_decode(self, *args, **kwargs):
440 | return self.tokenizer.batch_decode(*args, **kwargs)
441 |
442 | def decode(self, *args, **kwargs):
443 | return self.tokenizer.decode(*args, **kwargs)
444 |
445 | @classmethod
446 | def from_pretrained(
447 | cls,
448 | pretrained_model_name_or_path,
449 | cache_dir=None,
450 | force_download: bool = False,
451 | local_files_only: bool = False,
452 | token=None,
453 | revision: str = "main",
454 | **kwargs,
455 | ):
456 | config = AutoConfig.from_pretrained(
457 | pretrained_model_name_or_path,
458 | cache_dir=cache_dir,
459 | force_download=force_download,
460 | local_files_only=local_files_only,
461 | revision=revision,
462 | token=token,
463 | **kwargs,
464 | )
465 | return cls(config)
466 |
467 |
468 | AutoConfig.register("vlm", VLMConfig)
469 | AutoModel.register(VLMConfig, VLMForCausalLM)
470 |
--------------------------------------------------------------------------------
/python/uform/torch_encoders.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from dataclasses import dataclass
4 | from os import PathLike
5 | from typing import Dict, Optional, Union, Mapping, Any, Tuple
6 |
7 | import torch
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 | from torch import Tensor
11 | from PIL.Image import Image
12 |
13 | from uform.shared import read_config
14 |
15 |
16 | def _is_on_gpu(model: nn.Module) -> bool:
17 | try:
18 | return next(model.parameters()).device.type == "cuda"
19 | except StopIteration:
20 | return False
21 |
22 |
23 | @dataclass(eq=False)
24 | class Attention(nn.Module):
25 | dim: int
26 | num_heads: int
27 | dropout_prob: float = 0
28 |
29 | def __post_init__(self):
30 | super().__init__()
31 |
32 |         self.use_sdp = int(torch.__version__[0]) > 1  # scaled_dot_product_attention is available since PyTorch 2.0
33 |
34 | self.query = nn.Linear(self.dim, self.dim)
35 | self.key = nn.Linear(self.dim, self.dim)
36 | self.value = nn.Linear(self.dim, self.dim)
37 | self.out = nn.Linear(self.dim, self.dim)
38 |
39 | self.head_dim = self.dim // self.num_heads
40 | self.scale = self.head_dim**-0.5
41 |
42 | def forward(
43 | self,
44 | x: Tensor,
45 | attn_mask: Optional[Tensor] = None,
46 | context: Optional[Tensor] = None,
47 | is_causal: bool = False,
48 | ) -> Tensor:
49 | query = self.reshape(self.query(x))
50 | key = self.reshape(self.key(x if context is None else context))
51 | value = self.reshape(self.value(x if context is None else context))
52 |
53 | if self.use_sdp:
54 | x = F.scaled_dot_product_attention(
55 | query,
56 | key,
57 | value,
58 | attn_mask,
59 | dropout_p=self.dropout_prob if self.training else 0,
60 | is_causal=is_causal,
61 | )
62 | else:
63 | attn = query @ key.transpose(-2, -1) * self.scale
64 | if attn_mask is not None:
65 | attn += attn_mask
66 |
67 | attn = attn.softmax(dim=-1)
68 | x = attn @ value
69 |
70 | return self.out(x.transpose(2, 1).flatten(2))
71 |
72 | def reshape(self, x: Tensor) -> Tensor:
73 | batch_size, seq_len, _ = x.shape
74 | x = x.view(batch_size, seq_len, self.num_heads, self.head_dim)
75 | return x.transpose(2, 1)
76 |
77 |
78 | @dataclass(eq=False)
79 | class MLP(nn.Module):
80 | dim: int
81 | dim_expand_factor: int = 4
82 |
83 | def __post_init__(self):
84 | super().__init__()
85 |
86 | self.hidden_layer = nn.Linear(self.dim, self.dim * self.dim_expand_factor)
87 | self.output_layer = nn.Linear(self.dim * self.dim_expand_factor, self.dim)
88 |
89 | def forward(self, x: Tensor) -> Tensor:
90 | x = F.gelu(self.hidden_layer(x))
91 | return self.output_layer(x)
92 |
93 |
94 | @dataclass(eq=False)
95 | class LayerScale(nn.Module):
96 | dim: int
97 | init_values: float = 1e-5
98 | inplace: bool = False
99 |
100 | def __post_init__(self):
101 | super().__init__()
102 | self.gamma = nn.Parameter(self.init_values * torch.ones(self.dim))
103 |
104 | def forward(self, x: Tensor) -> Tensor:
105 | return x.mul_(self.gamma) if self.inplace else x * self.gamma
106 |
107 |
108 | @dataclass(eq=False)
109 | class TextEncoderBlock(nn.Module):
110 | dim: int
111 | num_heads: int
112 | dropout_prob: float
113 | cross_attention: bool = False
114 |
115 | def __post_init__(self):
116 | super().__init__()
117 |
118 | self.norm_attn = nn.LayerNorm(self.dim, eps=1e-12)
119 | self.attention = Attention(self.dim, self.num_heads, self.dropout_prob)
120 |
121 | if self.cross_attention:
122 | self.norm_crossattn = nn.LayerNorm(self.dim, eps=1e-12)
123 | self.crossattn = Attention(self.dim, self.num_heads, self.dropout_prob)
124 |
125 | self.norm_mlp = nn.LayerNorm(self.dim, eps=1e-12)
126 | self.mlp = MLP(self.dim)
127 |
128 | self.dropout = nn.Dropout(self.dropout_prob)
129 |
130 | def forward(
131 | self,
132 | x: Tensor,
133 | attn_mask: Tensor,
134 | context: Optional[Tensor] = None,
135 | ) -> Tensor:
136 | x = self.norm_attn(x + self.dropout(self.attention(x, attn_mask)))
137 |
138 | if self.cross_attention and context is not None:
139 | x = self.norm_crossattn(
140 | x + self.dropout(self.crossattn(x, context=context)),
141 | )
142 |
143 | return self.norm_mlp(x + self.dropout(self.mlp(x)))
144 |
145 |
146 | @dataclass(eq=False)
147 | class ImageEncoderBlock(nn.Module):
148 | dim: int
149 | num_heads: int
150 |
151 | def __post_init__(self):
152 | super().__init__()
153 | self.norm1 = nn.LayerNorm(self.dim, eps=1e-6)
154 | self.attn = Attention(self.dim, self.num_heads)
155 | self.ls1 = LayerScale(self.dim)
156 |
157 | self.norm2 = nn.LayerNorm(self.dim, eps=1e-6)
158 | self.mlp = MLP(self.dim)
159 | self.ls2 = LayerScale(self.dim)
160 |
161 | def forward(self, x: Tensor) -> Tensor:
162 | x = x + self.ls1(self.attn(self.norm1(x)))
163 | x = x + self.ls2(self.mlp(self.norm2(x)))
164 | return x
165 |
166 |
167 | @dataclass(eq=False)
168 | class TextEncoder(nn.Module):
169 | model_type: str
170 | dim: int
171 | context_dim: int
172 | vocab_size: int
173 | padding_idx: int
174 | num_layers: int
175 | num_heads: int
176 | embedding_dim: int
177 | multimodal_layers_ids: tuple
178 | head_one_neuron: bool
179 | pooling: str = "cls"
180 | max_position_embeddings: int = 77
181 | dropout_prob: float = 0
182 |
183 | def __post_init__(self):
184 | super().__init__()
185 |
186 | self.word_embeddings = nn.Embedding(
187 | self.vocab_size,
188 | self.dim,
189 | padding_idx=self.padding_idx,
190 | )
191 | self.position_embeddings = nn.Embedding(self.max_position_embeddings, self.dim)
192 |
193 | if self.model_type == "bert":
194 | self.register_buffer(
195 | "position_ids",
196 | torch.arange(self.max_position_embeddings).unsqueeze(0),
197 | persistent=False,
198 | )
199 |
200 | self.layer_norm = nn.LayerNorm(self.dim, eps=1e-12)
201 | self.dropout = nn.Dropout(self.dropout_prob)
202 |
203 | self.blocks = nn.ModuleList(
204 | [
205 | TextEncoderBlock(
206 | self.dim,
207 | self.num_heads,
208 | self.dropout_prob,
209 | layer_id in self.multimodal_layers_ids,
210 | )
211 | for layer_id in range(self.num_layers)
212 | ],
213 | )
214 |
215 | self.embedding_projection = nn.Linear(self.dim, self.embedding_dim, bias=False)
216 | self.matching_head = nn.Linear(self.dim, 1 if self.head_one_neuron else 2)
217 |
218 | if self.context_dim != self.dim:
219 | self.context_projection = nn.Linear(self.context_dim, self.dim, bias=False)
220 | else:
221 | self.context_projection = nn.Identity()
222 | self.return_features = False
223 |
224 | def forward_features(self, x: Tensor, attn_mask: Tensor) -> Tensor:
225 | x = self.embed_text(x)
226 | attn_mask = self.get_attention_mask(attn_mask, x.dtype)
227 |
228 | for block in self.blocks:
229 | if not block.cross_attention:
230 | x = block(x, attn_mask)
231 |
232 | return x
233 |
234 | def forward_embedding(self, x: Tensor, attn_mask: Tensor) -> Tensor:
235 | return self.embedding_projection(self.pool_features(x, attn_mask))
236 |
237 | def pool_features(self, x: Tensor, attn_mask: Tensor) -> Tensor:
238 | if self.pooling == "cls":
239 | return x[:, 0]
240 |
241 | attn_mask = attn_mask.unsqueeze(2).type_as(x)
242 | return (x * attn_mask).sum(dim=1) / attn_mask.sum(dim=1)
243 |
244 | def get_attention_mask(self, attn_mask: Tensor, dtype: torch.dtype) -> Tensor:
245 | attn_mask = attn_mask.to(dtype)
246 | attn_mask = (1.0 - attn_mask) * torch.finfo(dtype).min
247 | return attn_mask.unsqueeze(1).expand(-1, attn_mask.shape[1], -1).unsqueeze(1)
248 |
249 | def get_position_ids(self, x: Tensor) -> Tensor:
250 | if self.model_type == "roberta":
251 | mask = x.ne(self.padding_idx).int()
252 | return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + self.padding_idx
253 |
254 | return self.position_ids[:, : x.shape[1]]
255 |
256 | def embed_text(self, x: Tensor) -> Tensor:
257 | positional_embedding = self.position_embeddings(self.get_position_ids(x))
258 | x = self.word_embeddings(x) + positional_embedding
259 | return self.dropout(self.layer_norm(x))
260 |
261 | def forward(
262 | self,
263 | x: Union[Tensor, dict],
264 | attention_mask: Optional[Tensor] = None,
265 | return_features: Optional[bool] = None,
266 | ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
267 |
268 | if isinstance(x, dict):
269 | assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None"
270 | attention_mask = x["attention_mask"]
271 | x = x["input_ids"]
272 | elif attention_mask is None:
273 | # If no attention mask is provided - create one with all ones
274 | attention_mask = torch.ones_like(x)
275 |
276 | # If the model is on the GPU and the input matrices are not, shift them there
277 | if _is_on_gpu(self) and not x.is_cuda:
278 | x = x.cuda()
279 | attention_mask = attention_mask.cuda()
280 |
281 | features = self.forward_features(x, attention_mask)
282 | embeddings = self.forward_embedding(features, attention_mask)
283 |
284 | return_features = return_features if return_features is not None else self.return_features
285 | if return_features:
286 | return features, embeddings
287 | return embeddings
288 |
289 | def encode(
290 | self,
291 | x: Union[Tensor, dict],
292 | attention_mask: Optional[Tensor] = None,
293 | return_features: Optional[bool] = None,
294 | ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
295 |
296 | result = self.forward(x, attention_mask, return_features)
297 | if isinstance(result, tuple):
298 | return result[0].detach(), result[1].detach()
299 | else:
300 | return result.detach()
301 |
302 | @staticmethod
303 | def from_pretrained(config: Union[PathLike, str, object], model: Union[PathLike, str]) -> TextEncoder:
304 | """Load the image encoder from the given configuration and model path.
305 |
306 | :param config: the configuration dictionary or path to the JSON configuration file
307 | :param model: the model state dictionary or path to the `.pt` model file
308 | """
309 | config = read_config(config)
310 | if "text_encoder" in config:
311 | config = config["text_encoder"]
312 |
313 | # We must strip all the non-member attributes before initializing the classes.
314 | text_fields = TextEncoder.__dataclass_fields__
315 | config = {k: v for k, v in config.items() if k in text_fields}
316 | encoder = TextEncoder(**config)
317 |
318 | # Load from disk
319 | if isinstance(model, (PathLike, str)):
320 | state = torch.load(model)
321 | else:
322 | state = model
323 | if "text_encoder" in state:
324 | state = state["text_encoder"]
325 | encoder.load_state_dict(state)
326 | return encoder
327 |
328 |
329 | @dataclass(eq=False)
330 | class ImageEncoder(nn.Module):
331 | dim: int
332 | patch_size: int
333 | image_size: int
334 | num_layers: int
335 | num_heads: int
336 | embedding_dim: int
337 | pooling: str
338 | num_reg_tokens: int = 0
339 |
340 | def __post_init__(self):
341 | super().__init__()
342 |
343 | seq_len = (self.image_size // self.patch_size) ** 2
344 | self.patch_embed = nn.Conv2d(3, self.dim, self.patch_size, self.patch_size)
345 | self.pos_embed = nn.Parameter(torch.randn(1, seq_len, self.dim) * 0.02)
346 | self.cls_token = nn.Parameter(torch.zeros(1, 1, self.dim))
347 |
348 | if self.num_reg_tokens > 0:
349 | self.reg_token = nn.Parameter(torch.zeros(1, self.num_reg_tokens, self.dim))
350 |
351 | self.blocks = nn.Sequential(
352 | *[ImageEncoderBlock(self.dim, self.num_heads) for _ in range(self.num_layers)],
353 | )
354 |
355 | self.norm = nn.LayerNorm(self.dim, eps=1e-6)
356 | self.embedding_projection = nn.Linear(self.dim, self.embedding_dim, bias=False)
357 | self.return_features = False
358 |
359 | def forward_features(self, x: Union[Tensor, dict]) -> Tensor:
360 | x = self.patch_embed(x).flatten(start_dim=2).transpose(2, 1)
361 | x = x + self.pos_embed
362 | special_tokens = [self.cls_token.expand(x.shape[0], -1, -1)]
363 |
364 | if self.num_reg_tokens > 0:
365 | special_tokens.append(self.reg_token.expand(x.shape[0], -1, -1))
366 |
367 | x = torch.cat(special_tokens + [x], dim=1)
368 | x = self.blocks(x)
369 | return self.norm(x)
370 |
371 | def forward_embedding(self, x: Tensor) -> Tensor:
372 | if self.pooling == "cls":
373 | x = x[:, 0]
374 | else:
375 | x = x.mean(dim=1)
376 |
377 | return self.embedding_projection(x)
378 |
379 | def forward(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Tensor:
380 | if isinstance(x, dict):
381 | x = x["images"]
382 |
383 | # If the model is on the GPU and the input matrices are not, shift them there
384 | if _is_on_gpu(self) and not x.is_cuda:
385 | x = x.cuda()
386 |
387 | features = self.forward_features(x)
388 | embeddings = self.forward_embedding(features)
389 | return_features = return_features if return_features is not None else self.return_features
390 | if return_features:
391 | return features, embeddings
392 | return embeddings
393 |
394 | def encode(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Tensor:
395 | result = self.forward(x, return_features)
396 | if isinstance(result, tuple):
397 | return result[0].detach(), result[1].detach()
398 | else:
399 | return result.detach()
400 |
401 | @staticmethod
402 | def from_pretrained(
403 | config: Union[PathLike, str, object],
404 | model: Union[PathLike, str, Mapping[str, Any]],
405 | ) -> ImageEncoder:
406 | """Load the image encoder from the given configuration and model path.
407 |
408 | :param config: the configuration dictionary or path to the JSON configuration file
409 | :param model: the model state dictionary or path to the `.pt` model file
410 | """
411 | config = read_config(config)
412 | if "image_encoder" in config:
413 | config = config["image_encoder"]
414 |
415 | # We must strip all the non-member attributes before initializing the classes.
416 | image_fields = ImageEncoder.__dataclass_fields__
417 | config = {k: v for k, v in config.items() if k in image_fields}
418 | encoder = ImageEncoder(**config)
419 |
420 | # Load from disk
421 | if isinstance(model, (PathLike, str)):
422 | state = torch.load(model)
423 | else:
424 | state = model
425 | if "image_encoder" in state:
426 | state = state["image_encoder"]
427 | encoder.load_state_dict(state)
428 | return encoder
429 |
--------------------------------------------------------------------------------
/python/uform/torch_processors.py:
--------------------------------------------------------------------------------
1 | from os import PathLike
2 | from typing import Dict, List, Union, Sequence
3 | import json
4 |
5 | import torch
6 | from PIL.Image import Image
7 | from tokenizers import Tokenizer
8 | from torch import Tensor
9 | from torchvision.transforms import (
10 | CenterCrop,
11 | Compose,
12 | InterpolationMode,
13 | Normalize,
14 | Resize,
15 | ToTensor,
16 | )
17 |
18 | from uform.shared import read_config
19 |
20 |
21 | # lambda is not pickle-able
22 | def convert_to_rgb(image):
23 | return image.convert("RGB")
24 |
25 |
26 | class TextProcessor:
27 | def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
28 | """
29 |         :param config_path: path to the model config file
30 | :param tokenizer_path: path to tokenizer file
31 | """
32 |
33 | config = read_config(config_path)
34 | if "text_encoder" in config:
35 | config = config["text_encoder"]
36 |
37 | self._max_seq_len = config["max_position_embeddings"]
38 | self._tokenizer = Tokenizer.from_file(tokenizer_path)
39 | self._tokenizer.no_padding()
40 | self._pad_token_idx = config["padding_idx"]
41 |
42 | def __call__(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
43 | """Transforms one or more strings into dictionary with tokenized strings and attention masks.
44 |
45 |         :param texts: text or list of texts to tokenize
46 | :return: dictionary with tokenized strings and attention masks as values
47 | """
48 | if isinstance(texts, str):
49 | texts = [texts]
50 |
51 | input_ids = torch.full(
52 | (len(texts), self._max_seq_len),
53 | fill_value=self._pad_token_idx,
54 | dtype=torch.int64,
55 | )
56 |
57 | attention_mask = torch.zeros(
58 | len(texts),
59 | self._max_seq_len,
60 | dtype=torch.int32,
61 | )
62 | encoded = self._tokenizer.encode_batch(texts)
63 |
64 | for i, seq in enumerate(encoded):
65 | seq_len = min(len(seq), self._max_seq_len)
66 | input_ids[i, :seq_len] = torch.LongTensor(
67 | seq.ids[:seq_len],
68 | )
69 | attention_mask[i, :seq_len] = 1
70 |
71 | return {"input_ids": input_ids, "attention_mask": attention_mask}
72 |
73 |
74 | class ImageProcessor:
75 | def __init__(self, config_path: PathLike):
76 | """
77 |         :param config_path: path to the model config file
78 | """
79 |
80 | config = read_config(config_path)
81 | if "image_encoder" in config:
82 | config = config["image_encoder"]
83 |
84 | self._image_size = config["image_size"]
85 | self._normalization_means = config["normalization_means"]
86 | self._normalization_deviations = config["normalization_deviations"]
87 |
88 | assert isinstance(self._image_size, int) and self._image_size > 0
89 | assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list)
90 | assert len(self._normalization_means) == len(self._normalization_deviations) == 3
91 |
92 | self._image_transform = Compose(
93 | [
94 | Resize(self._image_size, interpolation=InterpolationMode.BICUBIC),
95 | convert_to_rgb,
96 | CenterCrop(self._image_size),
97 | ToTensor(),
98 | Normalize(
99 | mean=tuple(self._normalization_means),
100 | std=tuple(self._normalization_deviations),
101 | ),
102 | ],
103 | )
104 |
105 | def __call__(self, images: Union[Image, Sequence[Image]]) -> Dict[str, Tensor]:
106 | """Transforms one or more Pillow images into Torch Tensors.
107 |
108 | :param images: image or list of images to preprocess
109 | :return: dictionary with float-represented images in tensors as values
110 | """
111 |
112 | if isinstance(images, Sequence):
113 | batch_images = torch.empty(
114 | (len(images), 3, self._image_size, self._image_size),
115 | dtype=torch.float32,
116 | )
117 |
118 | for i, image in enumerate(images):
119 | batch_images[i] = self._image_transform(image)
120 |
121 | else:
122 | batch_images = self._image_transform(images).unsqueeze(0)
123 |
124 | return {"images": batch_images}
125 |
--------------------------------------------------------------------------------
/swift/EncodersTests.swift:
--------------------------------------------------------------------------------
1 | import CoreGraphics
2 | import Hub
3 | import ImageIO
4 | import UForm
5 | import XCTest
6 |
7 | final class TokenizerTests: XCTestCase {
8 |
9 | var hfToken: String?
10 |
11 | override func setUp() {
12 | super.setUp()
13 | // Attempt to load the Hugging Face token from the `.hf_token` file in the current directory
14 | let fileURL = URL(fileURLWithPath: FileManager.default.currentDirectoryPath).appendingPathComponent(".hf_token")
15 | if let token = try? String(contentsOf: fileURL, encoding: .utf8).trimmingCharacters(in: .whitespacesAndNewlines)
16 | {
17 | hfToken = token
18 | }
19 |
20 | hfToken = hfToken ?? ProcessInfo.processInfo.environment["HF_TOKEN"]
21 | hfToken = hfToken ?? "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"
22 | }
23 |
24 |     func cosineSimilarity<T: FloatingPoint>(between vectorA: [T], and vectorB: [T]) -> T {
25 | guard vectorA.count == vectorB.count else {
26 | fatalError("Vectors must be of the same length.")
27 | }
28 |
29 | let dotProduct = zip(vectorA, vectorB).reduce(T.zero) { $0 + ($1.0 * $1.1) }
30 | let magnitudeA = sqrt(vectorA.reduce(T.zero) { $0 + $1 * $1 })
31 | let magnitudeB = sqrt(vectorB.reduce(T.zero) { $0 + $1 * $1 })
32 |
33 | // Avoid division by zero
34 | if magnitudeA == T.zero || magnitudeB == T.zero {
35 | return T.zero
36 | }
37 |
38 | return dotProduct / (magnitudeA * magnitudeB)
39 | }
40 |
41 | func testTextEmbeddings(forModel modelName: String) async throws {
42 |
43 | let api = HubApi(hfToken: hfToken)
44 | let textModel = try await TextEncoder(
45 | modelName: "unum-cloud/uform3-image-text-english-small",
46 | hubApi: api
47 | )
48 |
49 | let texts = [
50 | "sunny beach with clear blue water",
51 | "crowded sandbeach under the bright sun",
52 | "dense forest with tall green trees",
53 | "quiet park in the morning light",
54 | ]
55 |
56 | var textEmbeddings: [[Float32]] = []
57 | for text in texts {
58 | let embedding: [Float32] = try textModel.encode(text).asFloats()
59 | textEmbeddings.append(embedding)
60 | }
61 |
62 | // Now let's compute the cosine similarity between the textEmbeddings
63 | let similarityBeach = cosineSimilarity(between: textEmbeddings[0], and: textEmbeddings[1])
64 | let similarityForest = cosineSimilarity(between: textEmbeddings[2], and: textEmbeddings[3])
65 | let dissimilarityBetweenScenes = cosineSimilarity(between: textEmbeddings[0], and: textEmbeddings[2])
66 |
67 | // Assert that similar texts have higher similarity scores
68 | XCTAssertTrue(
69 | similarityBeach > dissimilarityBetweenScenes,
70 | "Beach texts should be more similar to each other than to forest texts."
71 | )
72 | XCTAssertTrue(
73 | similarityForest > dissimilarityBetweenScenes,
74 | "Forest texts should be more similar to each other than to beach texts."
75 | )
76 | }
77 |
78 | func testTextEmbeddings() async throws {
79 | for model in [
80 | "unum-cloud/uform3-image-text-english-small",
81 | "unum-cloud/uform3-image-text-english-base",
82 | "unum-cloud/uform3-image-text-english-large",
83 | "unum-cloud/uform3-image-text-multilingual-base",
84 | ] {
85 | try await testTextEmbeddings(forModel: model)
86 | }
87 | }
88 |
89 | func testImageEmbeddings(forModel modelName: String) async throws {
90 |
91 | // One option is to use a local model repository.
92 | //
93 | // let root = "uform/"
94 | // let textModel = try TextEncoder(
95 | // modelPath: root + "uform-vl-english-large-text_encoder.mlpackage",
96 | // configPath: root + "uform-vl-english-large-text.json",
97 | // tokenizerPath: root + "uform-vl-english-large-text.tokenizer.json"
98 | // )
99 | // let imageModel = try ImageEncoder(
100 | // modelPath: root + "uform-vl-english-large-image_encoder.mlpackage",
101 | // configPath: root + "uform-vl-english-large-image.json"
102 | // )
103 | //
104 | // A better option is to fetch directly from HuggingFace, similar to how users would do that:
105 | let api = HubApi(hfToken: hfToken)
106 | let textModel = try await TextEncoder(
107 | modelName: modelName,
108 | hubApi: api
109 | )
110 | let imageModel = try await ImageEncoder(
111 | modelName: modelName,
112 | hubApi: api
113 | )
114 |
115 | let texts = [
116 | "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.",
117 | "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.",
118 | "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
119 | "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.",
120 | "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.",
121 | ]
122 | let imageURLs = [
123 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true",
124 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true",
125 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true",
126 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true",
127 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true",
128 | ]
129 |
130 | var textEmbeddings: [[Float32]] = []
131 | var imageEmbeddings: [[Float32]] = []
132 | for (text, imageURL) in zip(texts, imageURLs) {
133 | guard let url = URL(string: imageURL),
134 | let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil),
135 | let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil)
136 | else {
137 | throw NSError(
138 | domain: "ImageError",
139 | code: 100,
140 | userInfo: [NSLocalizedDescriptionKey: "Could not load image from URL: \(imageURL)"]
141 | )
142 | }
143 |
144 | let textEmbedding: [Float32] = try textModel.encode(text).asFloats()
145 | textEmbeddings.append(textEmbedding)
146 | let imageEmbedding: [Float32] = try imageModel.encode(cgImage).asFloats()
147 | imageEmbeddings.append(imageEmbedding)
148 | }
149 |
150 | // Now let's make sure that the cosine distance between image and respective text embeddings is low.
151 | // Make sure that the similarity between image and text at index `i` is higher than with other texts and images.
152 | for i in 0 ..< texts.count {
153 | let pairSimilarity = cosineSimilarity(between: textEmbeddings[i], and: imageEmbeddings[i])
154 | let otherTextSimilarities = (0 ..< texts.count).filter { $0 != i }.map {
155 | cosineSimilarity(between: textEmbeddings[$0], and: imageEmbeddings[i])
156 | }
157 | let otherImageSimilarities = (0 ..< texts.count).filter { $0 != i }.map {
158 | cosineSimilarity(between: textEmbeddings[i], and: imageEmbeddings[$0])
159 | }
160 |
161 | XCTAssertTrue(
162 | pairSimilarity > otherTextSimilarities.max()!,
163 | "Text should be more similar to its corresponding image than to other images."
164 | )
165 | XCTAssertTrue(
166 | pairSimilarity > otherImageSimilarities.max()!,
167 | "Text should be more similar to its corresponding image than to other texts."
168 | )
169 | }
170 | }
171 |
172 | func testImageEmbeddings() async throws {
173 | for model in [
174 | "unum-cloud/uform3-image-text-english-small",
175 | "unum-cloud/uform3-image-text-english-base",
176 | "unum-cloud/uform3-image-text-english-large",
177 | "unum-cloud/uform3-image-text-multilingual-base",
178 | ] {
179 | try await testImageEmbeddings(forModel: model)
180 | }
181 | }
182 |
183 | }
184 |
--------------------------------------------------------------------------------
/swift/README.md:
--------------------------------------------------------------------------------
1 | # UForm Swift SDK
2 |
3 | UForm offers first-party support for Swift.
4 | To get started, add UForm to your project using Swift Package Manager.
5 |
6 | ```bash
7 | swift package init --type executable
8 | # then declare the UForm dependency in your Package.swift, as shown below
9 | ```
10 |
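As a minimal sketch, the dependency can be declared in `Package.swift` by pointing at this repository and adding the `UForm` product to your target (the URL and version below are assumptions; check the repository for the latest release):

```swift
// Package.swift (sketch): URL and version are assumptions, adjust to the latest release.
dependencies: [
    .package(url: "https://github.com/unum-cloud/uform.git", from: "2.0.0")
]
```
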
11 | Then, import UForm in your Swift code:
12 |
13 | ```swift
14 | import UForm
15 | ```
16 |
17 | ## Embeddings
18 |
19 | ### Text Embeddings
20 |
21 | ```swift
22 | let textModel = try await TextEncoder(
23 | modelName: "unum-cloud/uform3-image-text-english-small",
24 | computeUnits: .cpuAndNeuralEngine
25 | )
26 | let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie."
27 | let textEmbedding: Embedding = try textModel.encode(text)
28 | let textVector: [Float32] = textEmbedding.asFloats()
29 | ```
30 |
31 | ### Image Embeddings
32 |
33 | ```swift
34 | let imageModel = try await ImageEncoder(
35 | modelName: "unum-cloud/uform3-image-text-english-small",
36 | computeUnits: .cpuAndNeuralEngine
37 | )
38 | let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true"
39 | guard let url = URL(string: imageURL),
40 |     let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil),
41 |     let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil) else {
42 |     fatalError("Could not load image from URL: \(imageURL)")
43 | }
44 |
45 | let imageEmbedding: Embedding = try imageModel.encode(cgImage)
46 | let imageVector: [Float32] = imageEmbedding.asFloats()
47 | ```
48 |
49 | ### Choosing Target Device
50 |
51 | Apple chips provide several functional units capable of high-throughput matrix multiplication and AI inference.
52 | Those `computeUnits` include the CPU, GPU, and Neural Engine.
53 | For maximum compatibility, the `.all` option is used by default.
54 | Sadly, Apple's scheduler is not always optimal, so it can be beneficial to specify the target device explicitly; if the models are pre-compiled for the Apple Neural Engine, this may yield significant performance gains (see the snippet after the table below).
55 |
56 | | Model | GPU Text E. | ANE Text E. | GPU Image E. | ANE Image E. |
57 | | :------------------ | ----------: | ----------: | -----------: | -----------: |
58 | | `english-small` | 2.53 ms | 0.53 ms | 6.57 ms | 1.23 ms |
59 | | `english-base` | 2.54 ms | 0.61 ms | 18.90 ms | 3.79 ms |
60 | | `english-large` | 2.30 ms | 0.61 ms | 79.68 ms | 20.94 ms |
61 | | `multilingual-base` | 2.34 ms | 0.50 ms | 18.98 ms | 3.77 ms |
62 |
63 | > On Apple M4 iPad, running iOS 18.2.
64 | > Batch size is 1, and the model is pre-loaded into memory.
65 | > The original encoders use `f32` single-precision numbers for maximum compatibility, and mostly rely on __GPU__ for computation.
66 | > The quantized encoders use a mixture of `i8`, `f16`, and `f32` numbers for maximum performance, and mostly rely on the Apple Neural Engine (__ANE__) for computation.
67 | > The median latency is reported.
68 |
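To pin the encoders to a specific set of compute units instead of the default `.all`, pass `computeUnits` explicitly. A minimal sketch, reusing the initializers from the snippets above (the case names are assumed to follow Core ML's `MLComputeUnits`):

```swift
// Prefer the CPU and Neural Engine for the quantized encoders.
let textModel = try await TextEncoder(
    modelName: "unum-cloud/uform3-image-text-english-small",
    computeUnits: .cpuAndNeuralEngine
)
let imageModel = try await ImageEncoder(
    modelName: "unum-cloud/uform3-image-text-english-small",
    computeUnits: .cpuAndNeuralEngine
)
```
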
69 | ### Computing Distances
70 |
71 | There are several ways to compute distances between embeddings, once you have them.
72 | Naive Swift code might look like this:
73 |
74 | ```swift
75 | func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
76 | let dotProduct = zip(a, b).map(*).reduce(0, +)
77 | let normA = sqrt(a.map { $0 * $0 }.reduce(0, +))
78 | let normB = sqrt(b.map { $0 * $0 }.reduce(0, +))
79 | return dotProduct / (normA * normB)
80 | }
81 | ```
82 |
83 | A faster way to compute distances is to use the Accelerate framework:
84 |
85 | ```swift
86 | import Accelerate
87 |
88 | func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
89 | var result: Float32 = 0
90 | var aNorm: Float32 = 0
91 | var bNorm: Float32 = 0
92 | vDSP_dotpr(a, 1, b, 1, &result, vDSP_Length(a.count))
93 | vDSP_svesq(a, 1, &aNorm, vDSP_Length(a.count))
94 | vDSP_svesq(b, 1, &bNorm, vDSP_Length(b.count))
95 | return result / sqrt(aNorm * bNorm)
96 | }
97 | ```
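
Either version can be applied directly to the vectors produced by the encoders, for example the `textVector` and `imageVector` from the snippets above:

```swift
let similarity = cosineSimilarity(textVector, imageVector)
print("Text-to-image similarity: \(similarity)")
```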
98 |
99 | An even faster approach would be to use USearch or SimSIMD, which work not only for `Float32` and `Float64`, but also for `Float16`, `Int8`, and binary embeddings.
100 |
--------------------------------------------------------------------------------