├── .github └── workflows │ ├── package.json │ ├── prerelease.yml │ └── release.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .swift-format ├── .vscode ├── launch.json ├── settings.json └── tasks.json ├── BENCHMARKS.md ├── CITATION.cff ├── CONTRIBUTING.md ├── LICENSE ├── Package.resolved ├── Package.swift ├── README.md ├── VERSION ├── assets ├── model_types_bg.png └── unum.png ├── docs ├── Makefile ├── _static │ ├── custom.css │ └── custom.js ├── benchmarks.rst ├── conf.py ├── contributing.rst ├── index.rst ├── javascript │ ├── index.rst │ └── reference.rst.txt ├── python │ ├── index.rst │ └── reference.rst └── swift │ └── index.rst ├── javascript ├── README.md ├── encoders.mjs ├── encoders_test.js ├── hub.mjs └── index.mjs ├── package-lock.json ├── package.json ├── pyproject.toml ├── python ├── README.md ├── scripts │ ├── bench_decoders.py │ ├── bench_encoders.py │ ├── export_decoders.ipynb │ ├── export_encoders.ipynb │ ├── test_decoders.py │ └── test_encoders.py └── uform │ ├── __init__.py │ ├── chat.py │ ├── gen_model.py │ ├── numpy_processors.py │ ├── onnx_encoders.py │ ├── shared.py │ ├── torch_decoders.py │ ├── torch_encoders.py │ └── torch_processors.py ├── swift ├── Encoders.swift ├── EncodersTests.swift └── README.md └── yarn.lock /.github/workflows/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "devDependencies": { 3 | "@semantic-release/exec": "github:semantic-release/exec", 4 | "@semantic-release/git": "^10.0.1", 5 | "conventional-changelog-eslint": "^3.0.9", 6 | "semantic-release": "^20.1.3" 7 | }, 8 | "release": { 9 | "branches": [ 10 | "main" 11 | ], 12 | "debug": true, 13 | "ci": true, 14 | "dryRun": false, 15 | "plugins": [ 16 | [ 17 | "@semantic-release/commit-analyzer", 18 | { 19 | "preset": "eslint", 20 | "releaseRules": [ 21 | { 22 | "tag": "Add", 23 | "release": "minor" 24 | }, 25 | { 26 | "tag": "Break", 27 | "release": "major" 28 | }, 29 | { 30 | "tag": "Improve", 31 | "release": "patch" 32 | }, 33 | { 34 | "tag": "Make", 35 | "release": "patch" 36 | }, 37 | { 38 | "tag": "Refactor", 39 | "release": false 40 | } 41 | ] 42 | } 43 | ], 44 | [ 45 | "@semantic-release/release-notes-generator", 46 | { 47 | "preset": "eslint", 48 | "releaseRules": [ 49 | { 50 | "tag": "Add", 51 | "release": "minor" 52 | }, 53 | { 54 | "tag": "Break", 55 | "release": "major" 56 | }, 57 | { 58 | "tag": "Improve", 59 | "release": "patch" 60 | }, 61 | { 62 | "tag": "Make", 63 | "release": "patch" 64 | }, 65 | { 66 | "tag": "Refactor", 67 | "release": false 68 | } 69 | ] 70 | } 71 | ], 72 | "@semantic-release/github", 73 | [ 74 | "@semantic-release/exec", 75 | { 76 | "prepareCmd": "sed -i 's/version = \".*\"/version = \"${nextRelease.version}\"/' pyproject.toml" 77 | } 78 | ], 79 | [ 80 | "@semantic-release/git", 81 | { 82 | "assets": [ 83 | "pyproject.toml" 84 | ], 85 | "message": "Build: Released ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}" 86 | } 87 | ] 88 | ] 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /.github/workflows/prerelease.yml: -------------------------------------------------------------------------------- 1 | name: Pre-Release 2 | 3 | on: 4 | push: 5 | branches: ["main-dev"] 6 | pull_request: 7 | branches: ["main-dev"] 8 | 9 | env: 10 | BUILD_TYPE: Release 11 | GH_TOKEN: ${{ secrets.SEMANTIC_RELEASE_TOKEN }} 12 | PYTHONUTF8: 1 13 | 14 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | versioning: 20 | name: Update Version 21 | runs-on: ubuntu-latest 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v4 25 | with: 26 | fetch-depth: 0 27 | persist-credentials: false 28 | - name: Run TinySemVer 29 | uses: ashvardanian/tinysemver@v2.0.7 30 | with: 31 | verbose: "true" 32 | version-file: "VERSION" 33 | update-version-in: | 34 | package.json:"version": "(\d+\.\d+\.\d+)" 35 | package-lock.json:"uform",\n\s+"version": "(\d+\.\d+\.\d+)" 36 | CITATION.cff:^version: (\d+\.\d+\.\d+) 37 | pyproject.toml:^version = "(\d+\.\d+\.\d+)" 38 | dry-run: "true" 39 | 40 | test_python: 41 | name: Test Python 42 | runs-on: ubuntu-latest 43 | 44 | steps: 45 | - uses: actions/checkout@v4 46 | - name: Set up Python 47 | uses: actions/setup-python@v5 48 | with: 49 | python-version: "3.11" 50 | 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --no-cache-dir --upgrade pip 54 | pip install -e ".[onnx]" 55 | pip install pytest 56 | 57 | # When running tests in CI, limit ourselves to the small model tests 58 | - name: Test with PyTest 59 | run: pytest python/scripts/ -s -x -Wd -v -k small 60 | 61 | test_javascript: 62 | name: Test JavaScript 63 | runs-on: ubuntu-latest 64 | 65 | steps: 66 | - uses: actions/checkout@v4 67 | - name: Set up Node.js 68 | uses: actions/setup-node@v4 69 | with: 70 | node-version: 20 71 | 72 | # TODO: JavaScript tests pass, but ONNX throws a memory error on exit 73 | # - name: Build JavaScript 74 | # run: npm ci 75 | # - name: Test JavaScript 76 | # run: npm test 77 | 78 | test_swift: 79 | name: Test Swift 80 | runs-on: macos-14 81 | 82 | steps: 83 | - uses: actions/checkout@v4 84 | - name: Build 85 | run: swift build 86 | - name: Run tests 87 | run: swift test -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | 7 | env: 8 | GH_TOKEN: ${{ secrets.SEMANTIC_RELEASE_TOKEN }} 9 | 10 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 11 | permissions: 12 | contents: read 13 | pages: write 14 | id-token: write 15 | 16 | jobs: 17 | versioning: 18 | name: Update Version 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: Checkout 22 | uses: actions/checkout@v4 23 | with: 24 | fetch-depth: 0 25 | persist-credentials: false 26 | - name: Run TinySemVer 27 | uses: ashvardanian/tinysemver@v2.0.7 28 | with: 29 | verbose: "true" 30 | version-file: "VERSION" 31 | update-version-in: | 32 | package.json:"version": "(\d+\.\d+\.\d+)" 33 | package-lock.json:"uform",\n\s+"version": "(\d+\.\d+\.\d+)" 34 | CITATION.cff:^version: (\d+\.\d+\.\d+) 35 | pyproject.toml:^version = "(\d+\.\d+\.\d+)" 36 | dry-run: "false" 37 | push: "true" 38 | create-release: "true" 39 | github-token: ${{ secrets.SEMANTIC_RELEASE_TOKEN }} 40 | 41 | rebase: 42 | name: Rebase Dev. Branch 43 | needs: versioning 44 | runs-on: ubuntu-latest 45 | steps: 46 | - name: Checkout the latest code 47 | uses: actions/checkout@v4 48 | with: 49 | fetch-depth: 0 50 | 51 | - name: Perform rebase 52 | run: | 53 | git fetch origin main 54 | git checkout main-dev 55 | git rebase origin/main 56 | 57 | - name: Push changes 58 | uses: CasperWA/push-protected@v2 59 | with: 60 | token: ${{ secrets.SEMANTIC_RELEASE_TOKEN }} 61 | branch: main-dev 62 | unprotect_reviews: True 63 | force: True 64 | 65 | test_python: 66 | name: Run Tests 67 | runs-on: ubuntu-latest 68 | needs: versioning 69 | steps: 70 | - uses: actions/checkout@v4 71 | with: 72 | ref: "main" 73 | 74 | - name: Set up Python 75 | uses: actions/setup-python@v5 76 | with: 77 | python-version: "3.11" 78 | 79 | - name: Install dependencies 80 | run: | 81 | python -m pip install --upgrade pip 82 | pip install -e ".[onnx]" 83 | pip install pytest 84 | 85 | - name: Run PyTest 86 | run: pytest python/scripts/ 87 | 88 | publish_python: 89 | name: Publish Python 90 | runs-on: ubuntu-latest 91 | needs: [versioning, test_python] 92 | 93 | steps: 94 | - uses: actions/checkout@v4 95 | with: 96 | ref: "main" 97 | - name: Set up Python 98 | uses: actions/setup-python@v5 99 | with: 100 | python-version: "3.11" 101 | 102 | - name: Install dependencies 103 | run: | 104 | python -m pip install --upgrade pip 105 | pip install build 106 | 107 | - name: Build package 108 | run: python -m build 109 | 110 | - name: Publish to PyPi 111 | uses: pypa/gh-action-pypi-publish@release/v1 112 | with: 113 | verbose: true 114 | print-hash: true 115 | 116 | publish_javascript: 117 | name: Publish JavaScript 118 | needs: versioning 119 | runs-on: ubuntu-22.04 120 | 121 | steps: 122 | - uses: actions/checkout@v4 123 | with: 124 | ref: "main" 125 | 126 | - name: Set up Node.js 127 | uses: actions/setup-node@v4 128 | with: 129 | node-version: 20 130 | 131 | # TODO: JavaScript tests pass, but ONNX throws a memory error on exit 132 | # - name: Build and Test 133 | # run: | 134 | # npm ci 135 | # npm test 136 | 137 | - name: Publish 138 | uses: JS-DevTools/npm-publish@v2 139 | with: 140 | token: ${{ secrets.NPM_TOKEN }} 141 | 142 | deploy_docs: 143 | name: Deploy Docs 144 | environment: 145 | name: github-pages 146 | url: ${{ steps.deployment.outputs.page_url }} 147 | runs-on: ubuntu-22.04 148 | needs: [publish_python, publish_javascript] 149 | steps: 150 | - name: Checkout 151 | uses: actions/checkout@v4 152 | with: 153 | ref: "main" 154 | - name: Install dependencies 155 | run: | 156 | sudo apt update && 157 | sudo apt install -y doxygen graphviz dia git && 158 | pip install sphinx==5.3.0 sphinx-js==3.2.1 breathe==4.35.0 furo==2023.3.27 m2r2==0.3.3.post2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery==4.1 && 159 | npm install -g jsdoc 160 | - name: Setup GitHub Pages 161 | uses: actions/configure-pages@v2 162 | - name: Install UForm from PyPi 163 | run: pip install uform 164 | - name: Build documentation 165 | run: cd docs && make html 166 | - name: Copy assets 167 | run: cp -r assets build/docs/html/ 168 | - name: Upload artifacts 169 | uses: actions/upload-pages-artifact@v1 170 | with: 171 | # Upload entire repository 172 | path: "./build/docs/html/" 173 | - name: Deploy to GitHub Pages 174 | id: deployment 175 | uses: actions/deploy-pages@v1 176 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /requirements.txt 2 | /dist/ 3 | /test 4 | /build/ 5 | yarn.lock 6 | *.egg-info 7 | __pycache__ 8 | .build 9 | .swiftpm 10 | .hf_token 11 | 12 | dictionary* 13 | vocab* 14 | /models/ 15 | 16 | # Tensors & ML Model 17 | *.onnx 18 | *.pt 19 | *.safetensors 20 | *.mlpackage 21 | 22 | # NodeJS 23 | node_modules 24 | node_build 25 | yarn-error.log 26 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_commit_msg: "chore(pre-commit): autofix run" 3 | autoupdate_commit_msg: "chore(pre-commit): autoupdate hooks" 4 | 5 | default_install_hook_types: 6 | - pre-commit 7 | 8 | repos: 9 | - repo: https://github.com/pre-commit/pre-commit-hooks 10 | rev: v4.5.0 11 | hooks: 12 | - id: check-toml 13 | - id: check-yaml 14 | - id: debug-statements 15 | - id: end-of-file-fixer 16 | - id: name-tests-test 17 | - id: trailing-whitespace 18 | - repo: https://github.com/pappasam/toml-sort 19 | rev: v0.23.1 20 | hooks: 21 | - id: toml-sort-fix 22 | - repo: https://github.com/asottile/add-trailing-comma 23 | rev: v3.1.0 24 | hooks: 25 | - id: add-trailing-comma 26 | - repo: https://github.com/astral-sh/ruff-pre-commit 27 | rev: v0.1.11 28 | hooks: 29 | # Run the linter 30 | - id: ruff 31 | args: [--fix] 32 | # Run the formatter 33 | - id: ruff-format 34 | -------------------------------------------------------------------------------- /.swift-format: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "lineLength": 120, 4 | "indentation": { 5 | "spaces": 4 6 | }, 7 | "maximumBlankLines": 1, 8 | "respectsExistingLineBreaks": true, 9 | "lineBreakBeforeControlFlowKeywords": true, 10 | "lineBreakBeforeEachArgument": true, 11 | "multiElementCollectionTrailingCommas": true, 12 | "spacesAroundRangeFormationOperators": true 13 | } -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python Debugger", 9 | "type": "debugpy", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | }, 14 | { 15 | "name": "PyTest Debugger", 16 | "type": "debugpy", 17 | "request": "launch", 18 | "program": "pytest", 19 | "console": "integratedTerminal", 20 | "args": [ 21 | "${file}", 22 | "-s", 23 | "-x", 24 | ], 25 | }, 26 | { 27 | "name": "NodeJS Debugger", 28 | "type": "node-terminal", 29 | "request": "launch", 30 | "command": "npm run test", 31 | } 32 | ] 33 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cSpell.words": [ 3 | "arange", 4 | "ashvardanian", 5 | "astype", 6 | "CFURL", 7 | "coreml", 8 | "crossattn", 9 | "cumsum", 10 | "dtype", 11 | "embs", 12 | "finfo", 13 | "huggingface", 14 | "keepdim", 15 | "linalg", 16 | "logits", 17 | "Matryoshka", 18 | "mlmodel", 19 | "mlpackage", 20 | "mlprogram", 21 | "multimodal", 22 | "ndarray", 23 | "numpy", 24 | "ONNX", 25 | "onnxconverter", 26 | "onnxruntime", 27 | "opset", 28 | "packbits", 29 | "preprocess", 30 | "pretrained", 31 | "probs", 32 | "pypi", 33 | "pytest", 34 | "randn", 35 | "rerank", 36 | "reranker", 37 | "reranking", 38 | "sandbeach", 39 | "sess", 40 | "SIMD", 41 | "softmax", 42 | "Tensorrt", 43 | "torchvision", 44 | "transfromers", 45 | "uform", 46 | "unimodal", 47 | "unsqueeze", 48 | "Vardanian", 49 | "whitespaces" 50 | ], 51 | "[python]": { 52 | "editor.defaultFormatter": "ms-python.black-formatter" 53 | }, 54 | "python.formatting.provider": "none", 55 | "window.autoDetectColorScheme": true, 56 | "workbench.colorTheme": "Default Dark+", 57 | "workbench.preferredDarkColorTheme": "Default Dark+" 58 | } -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | // See https://go.microsoft.com/fwlink/?LinkId=733558 3 | // for the documentation about the tasks.json format 4 | "version": "2.0.0", 5 | "tasks": [ 6 | { 7 | "label": "Publish", 8 | "type": "shell", 9 | "command": "python -m pip install build twine && python -m build && twine check dist/* && twine upload dist/*" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /BENCHMARKS.md: -------------------------------------------------------------------------------- 1 | # UForm Model Benchmarks 2 | 3 | ## Accuracy 4 | 5 | ### Embedding Models 6 | 7 | Few retrieval benchmarks exist for multimodal embeddings. 8 | The most famous ones for English are "MS-COCO" and "Flickr30k". 9 | Evaluating `uform-vl-english` model, one can expect the following numbers for search quality. 10 | 11 | | Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 | 12 | | :-------- | ---------: | ---------: | ----------: | 13 | | Flickr | 0.727 | 0.915 | 0.949 | 14 | | MS-COCO ¹ | 0.510 | 0.761 | 0.838 | 15 | 16 | For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository². 17 | Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model. 18 | 19 | | Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers | 20 | | :-------- | -----------: | --------: | -----------: | --------: | ------------: | ---------: | -------: | 21 | | English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M | 22 | | Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M | 23 | | Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M | 24 | | Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M | 25 | | Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M | 26 | | French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M | 27 | 28 | 29 | All languages: 30 | 31 | | Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers | 32 | | :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: | 33 | | Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M | 34 | | Armenian 🇦🇲 | 5.6 | __22.0__ | 14.3 | __44.7__ | 20.2 | __56.0__ | 4 M | 35 | | Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M | 36 | | English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M | 37 | | French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M | 38 | | German 🇩🇪 | 31.7 | __35.1__ | 56.9 | __62.2__ | 67.4 | __73.3__ | 134 M | 39 | | Hebrew 🇮🇱 | 23.7 | __26.7__ | 46.3 | __51.8__ | 57.0 | __63.5__ | 9 M | 40 | | Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M | 41 | | Indonesian 🇮🇩 | 26.9 | __30.7__ | 51.4 | __57.0__ | 62.7 | __68.6__ | 199 M | 42 | | Italian 🇮🇹 | 31.3 | __34.9__ | 56.7 | __62.1__ | 67.1 | __73.1__ | 67 M | 43 | | Japanese 🇯🇵 | 27.4 | __32.6__ | 51.5 | __59.2__ | 62.6 | __70.6__ | 125 M | 44 | | Korean 🇰🇷 | 24.4 | __31.5__ | 48.1 | __57.8__ | 59.2 | __69.2__ | 81 M | 45 | | Persian 🇮🇷 | 24.0 | __28.8__ | 47.0 | __54.6__ | 57.8 | __66.2__ | 77 M | 46 | | Polish 🇵🇱 | 29.2 | __33.6__ | 53.9 | __60.1__ | 64.7 | __71.3__ | 41 M | 47 | | Portuguese 🇵🇹 | 31.6 | __32.7__ | 57.1 | __59.6__ | 67.9 | __71.0__ | 257 M | 48 | | Russian 🇷🇺 | 29.9 | __33.9__ | 54.8 | __60.9__ | 65.8 | __72.0__ | 258 M | 49 | | Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M | 50 | | Thai 🇹🇭 | 21.5 | __28.7__ | 43.0 | __54.6__ | 53.7 | __66.0__ | 61 M | 51 | | Turkish 🇹🇷 | 25.5 | __33.0__ | 49.1 | __59.6__ | 60.3 | __70.8__ | 88 M | 52 | | Ukranian 🇺🇦 | 26.0 | __30.6__ | 49.9 | __56.7__ | 60.9 | __68.1__ | 41 M | 53 | | Vietnamese 🇻🇳 | 25.4 | __28.3__ | 49.2 | __53.9__ | 60.3 | __65.5__ | 85 M | 54 | | | | | | | | | | 55 | | Mean | 26.5±6.4 | __31.8±3.5__ | 49.8±9.8 | __58.1±4.5__ | 60.4±10.6 | __69.4±4.3__ | - | 56 | | Google Translate | 27.4±6.3 | __31.5±3.5__ | 51.1±9.5 | __57.8±4.4__ | 61.7±10.3 | __69.1±4.3__ | - | 57 | | Microsoft Translator | 27.2±6.4 | __31.4±3.6__ | 50.8±9.8 | __57.7±4.7__ | 61.4±10.6 | __68.9±4.6__ | - | 58 | | Meta NLLB | 24.9±6.7 | __32.4±3.5__ | 47.5±10.3 | __58.9±4.5__ | 58.2±11.2 | __70.2±4.3__ | - | 59 | 60 | ### Generative Models 61 | 62 | | Model | LLM Size | SQA | MME | MMBench | Average¹ | 63 | | :------------------- | -------: | ---: | -----: | ------: | -------: | 64 | | UForm-Gen2-Qwen-500m | 0.5B | 45.5 | 880.1 | 42.0 | 29.31 | 65 | | MobileVLM v2 | 1.4B | 52.1 | 1302.8 | 57.7 | 36.81 | 66 | | LLaVA-Phi | 2.7B | 68.4 | 1335.1 | 59.8 | 42.95 | 67 | 68 | For captioning evaluation we measure CLIPScore and RefCLIPScore³. 69 | 70 | | Model | Size | Caption Length | CLIPScore | RefCLIPScore | 71 | | :---------------------------------- | ---: | -------------: | --------: | -----------: | 72 | | `llava-hf/llava-1.5-7b-hf` | 7B | Long | 0.878 | 0.529 | 73 | | `llava-hf/llava-1.5-7b-hf` | 7B | Short | 0.886 | 0.531 | 74 | | | | | | | 75 | | `Salesforce/instructblip-vicuna-7b` | 7B | Long | 0.902 | 0.534 | 76 | | `Salesforce/instructblip-vicuna-7b` | 7B | Short | 0.848 | 0.523 | 77 | | | | | | | 78 | | `unum-cloud/uform-gen` | 1.5B | Long | 0.847 | 0.523 | 79 | | `unum-cloud/uform-gen` | 1.5B | Short | 0.842 | 0.522 | 80 | | | | | | | 81 | | `unum-cloud/uform-gen-chat` | 1.5B | Long | 0.860 | 0.525 | 82 | | `unum-cloud/uform-gen-chat` | 1.5B | Short | 0.858 | 0.525 | 83 | 84 | Results for VQAv2 evaluation. 85 | 86 | | Model | Size | Accuracy | 87 | | :------------------------- | ---: | -------: | 88 | | `llava-hf/llava-1.5-7b-hf` | 7B | 78.5 | 89 | | `unum-cloud/uform-gen` | 1.5B | 66.5 | 90 | 91 |
92 | 93 | > ¹ Train split was in training data.
94 | > ² Lacking a broad enough evaluation dataset, we translated the [COCO Karpathy test split](https://www.kaggle.com/datasets/shtvkumar/karpathy-splits) with multiple public and proprietary translation services, averaging the scores across all sets, and breaking them down in the bottom section.
95 | > ³ We used `apple/DFN5B-CLIP-ViT-H-14-378` CLIP model. 96 | 97 | ## Speed 98 | 99 | ### Embedding Models 100 | 101 | UForm comes pre-packaged with speed benchmarks for the models. 102 | 103 | ```bash 104 | $ python python/scripts/bench_encoders.py --help 105 | usage: bench_encoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE] 106 | 107 | options: 108 | -h, --help show this help message and exit 109 | --filter-out FILTER_OUT 110 | Filter out models, backends, or devices with a Regular Expression. 111 | --batch-size BATCH_SIZE 112 | Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU. 113 | ``` 114 | 115 | Running that script for a fairly small batch size of 50 on an Nvidia H100 GPU and 116 | 117 | | Model Name | Device | Backend | Images Preprocessed/s | Images Encoded/s | Texts Preprocessed/s | Texts Encoded/s | 118 | | :--------------------------------------------- | :----- | :------ | --------------------: | :--------------- | :------------------- | :-------------- | 119 | | unum-cloud/uform3-image-text-english-base | cpu | torch | 23.03 | 76.57 | 15,978.03 | 562.28 | 120 | | unum-cloud/uform3-image-text-english-base | cpu | onnx | 23.11 | 77.75 | 13,880.27 | 1,067.40 | 121 | | unum-cloud/uform3-image-text-english-base | cuda | torch | 22.87 | 1,060.40 | 12,348.94 | 13,242.83 | 122 | | unum-cloud/uform3-image-text-english-large | cpu | torch | 22.41 | 10.84 | 13,350.45 | 145.12 | 123 | | unum-cloud/uform3-image-text-english-large | cpu | onnx | 23.13 | 19.60 | 18,031.85 | 960.09 | 124 | | unum-cloud/uform3-image-text-english-large | cuda | torch | 22.78 | 244.86 | 13,226.40 | 10,204.04 | 125 | | unum-cloud/uform3-image-text-english-small | cpu | torch | 20.08 | 71.68 | 12,147.05 | 249.63 | 126 | | unum-cloud/uform3-image-text-english-small | cpu | onnx | 22.84 | 195.27 | 13,636.99 | 1,385.25 | 127 | | unum-cloud/uform3-image-text-english-small | cuda | torch | 22.63 | 2,662.16 | 14,731.18 | 14,694.87 | 128 | | unum-cloud/uform3-image-text-multilingual-base | cpu | torch | 22.98 | 64.28 | 10,129.27 | 209.76 | 129 | | unum-cloud/uform3-image-text-multilingual-base | cpu | onnx | 23.06 | 66.81 | 8,963.13 | 1,104.32 | 130 | | unum-cloud/uform3-image-text-multilingual-base | cuda | torch | 22.88 | 1,051.95 | 15,639.72 | 12,416.12 | 131 | 132 | If you are interested in performance numbers on consumer grade hardware, compared to third-party models, here are some rough estimates. 133 | On Nvidia RTX 3090: 134 | 135 | | Model | Multilingual | Speed | Speedup | 136 | | :----------------------------------------------- | -----------: | ---------------------: | ---------: | 137 | | `bert-base-uncased` | No | 1'612 sequences/second | | 138 | | `distilbert-base-uncased` | No | 3'174 sequences/second | x 1.96 | 139 | | `sentence-transformers/all-MiniLM-L12-v2` | __Yes__ | 3'604 sequences/second | x 2.24 | 140 | | `unum-cloud/uform3-image-text-multilingual-base` | __Yes__ | 6'809 sequences/second | __x 4.22__ | 141 | 142 | Given the small size of the model it also work well on mobile devices. 143 | On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards. 144 | 145 | | Device | Speed | Device TDP | Efficiency | 146 | | :--------------------- | ------------------: | ---------: | ----------------: | 147 | | Nvidia RTX 3090 | ~ 140 tokens/second | < 350W | 0.40 tokens/joule | 148 | | Apple M2 Pro unplugged | ~ 19 tokens/second | < 20W | 0.95 tokens/joule | 149 | | Apple M2 Max unplugged | ~ 38 tokens/second | < 36W | 1.06 tokens/joule | 150 | | Apple M2 Max plugged | ~ 56 tokens/second | < 89W | 0.63 tokens/joule | 151 | 152 | ### Generative Models 153 | 154 | ```bash 155 | $ python python/scripts/bench_decoders.py --help 156 | usage: bench_decoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE] 157 | 158 | options: 159 | -h, --help show this help message and exit 160 | --batch-size BATCH_SIZE 161 | Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU. 162 | --max-length MAX_LENGTH 163 | Maximum length of the generated text in tokens. 164 | ``` 165 | 166 | On Nvidia H100 GPU, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding. 167 | 168 | | Model | Size | Decoding Speed | Decoding Parallel Streams | 169 | | :---------------------------------- | ----: | -------------: | ---------------------------: | 170 | | `llava-hf/llava-1.5-7b-hf` | 7 B | ~ 141 tokens/s | ~ 4 K tokens/s (32 streams) | 171 | | `Salesforce/instructblip-vicuna-7b` | 7 B | ~ 211 tokens/s | ~ 2 K tokens/s (32 streams) | 172 | | `unum-cloud/uform-gen` | 1.5 B | ~ 252 tokens/s | ~ 3 K tokens/s (128 streams) | 173 | | `unum-cloud/uform-gen2-dpo` | 1.2 B | ~ 372 tokens/s | ~ 10 K tokens/s (64 streams) | 174 | 175 | On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding. 176 | 177 | | Model | Size | Decoding Speed | Speedup | 178 | | :---------------------------------- | ----: | -------------: | --------: | 179 | | `llava-hf/llava-1.5-7b-hf` | 7 B | ~ 40 tokens/s | | 180 | | `Salesforce/instructblip-vicuna-7b` | 7 B | ~ 40 tokens/s | | 181 | | `unum-cloud/uform-gen` | 1.5 B | ~ 140 tokens/s | __x 3.5__ | 182 | 183 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: "Kim" 5 | given-names: "Mikhail" 6 | orcid: "https://orcid.org/0009-0003-8413-3221" 7 | - family-names: "Orshulevich" 8 | given-names: "Vladimir" 9 | orcid: "https://orcid.org/0009-0007-8961-6969" 10 | - family-names: "Vardanian" 11 | given-names: "Ash" 12 | orcid: "https://orcid.org/0000-0002-4882-1815" 13 | title: "UForm by Unum Cloud" 14 | version: 3.1.1 15 | keywords: 16 | - "text-to-image retrieval" 17 | - "multimodal" 18 | - "visual-language pre-training" 19 | doi: 10.5281/zenodo.7951497 20 | date-released: 2023-01-03 21 | url: "https://github.com/unum-cloud/uform" 22 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to UForm 2 | 3 | We welcome contributions to UForm! 4 | 5 | ## Python 6 | 7 | Before submitting any changes, please make sure that the tests pass. 8 | 9 | ```sh 10 | pip install -e ".[dev]" # For development dependencies 11 | pip install -e ".[torch]" # For PyTorch 12 | pip install -e ".[onnx]" # For ONNX on CPU 13 | pip install -e ".[onnx-gpu]" # For ONNX on GPU, available for some platforms 14 | pip install -e ".[torch,onnx,onnx-gpu,dev]" # For all 15 | 16 | pytest python/scripts/ -s -x -Wd -v 17 | pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loading Torch 18 | ``` 19 | 20 | ## Swift 21 | 22 | To build and test the Swift package, use the following command: 23 | 24 | ```bash 25 | swift build 26 | swift test 27 | ``` 28 | 29 | Swift formatting is enforced with `swift-format` default utility from Apple. 30 | To install and run it on all the files in the project, use the following command: 31 | 32 | ```bash 33 | brew install swift-format 34 | swift-format . -i -r 35 | ``` 36 | 37 | The style is controlled by the `.swift-format` JSON file in the root of the repository. 38 | As there is no standard for Swift formatting, even Apple's own `swift-format` tool and Xcode differ in their formatting rules, and available settings. 39 | 40 | ## JavaScript 41 | 42 | For rapid development you can avoid the TypeScript precompilation step: 43 | 44 | ```sh 45 | npm install -g ts-node 46 | ts-node javascript/embeddings.mts 47 | ``` 48 | 49 | Before submitting any changes, please make sure that the tests pass. 50 | 51 | ```sh 52 | npm install 53 | npm test 54 | ``` 55 | 56 | ## Benchmarking 57 | 58 | If you want to double check, how fast the model may work on your hardware, you can clone the library and repeat the benchmarks locally. 59 | The following benchmark will exclude PyTorch backend, CUDA-capable devices, and all the `-base` and `-large` models, running only the ONNX benchmarks on the CPU. 60 | 61 | ```sh 62 | git clone https://github.com/unum-cloud/uform --depth 1 # Clone the repository 63 | cd uform && pip install -e ".[torch,onnx,onnx-gpu,dev]" # Install all dependencies 64 | python python/scripts/bench_encoders.py --filter-out "torch|cuda|base|large" 65 | ``` 66 | 67 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "pins" : [ 3 | { 4 | "identity" : "swift-argument-parser", 5 | "kind" : "remoteSourceControl", 6 | "location" : "https://github.com/apple/swift-argument-parser.git", 7 | "state" : { 8 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41", 9 | "version" : "1.3.0" 10 | } 11 | }, 12 | { 13 | "identity" : "swift-transformers", 14 | "kind" : "remoteSourceControl", 15 | "location" : "https://github.com/ashvardanian/swift-transformers", 16 | "state" : { 17 | "revision" : "89fb5d97e1df347f9f588f62fc538dcad6fdb16c" 18 | } 19 | } 20 | ], 21 | "version" : 2 22 | } 23 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version:5.9 2 | import PackageDescription 3 | 4 | let package = Package( 5 | name: "UForm", 6 | platforms: [ 7 | // Linux doesn't have to be explicitly listed 8 | .iOS(.v16), // For iOS, version 13 and later 9 | .tvOS(.v16), // For tvOS, version 13 and later 10 | .macOS(.v13), // For macOS, version 10.15 (Catalina) and later 11 | .watchOS(.v6), // For watchOS, version 6 and later 12 | ], 13 | products: [ 14 | .library( 15 | name: "UForm", 16 | targets: ["UForm"] 17 | ) 18 | ], 19 | dependencies: [ 20 | .package( 21 | url: "https://github.com/ashvardanian/swift-transformers", 22 | revision: "89fb5d97e1df347f9f588f62fc538dcad6fdb16c" 23 | ) 24 | ], 25 | targets: [ 26 | .target( 27 | name: "UForm", 28 | dependencies: [ 29 | .product(name: "Transformers", package: "swift-transformers") 30 | ], 31 | path: "swift", 32 | exclude: ["EncodersTests.swift"] 33 | ), 34 | .testTarget( 35 | name: "UFormTests", 36 | dependencies: ["UForm"], 37 | path: "swift", 38 | sources: ["EncodersTests.swift"] 39 | ), 40 | ] 41 | ) 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

UForm

2 |

3 | Pocket-Sized Multimodal AI
4 | For Content Understanding and Generation
5 |

6 |
7 | 8 |

9 | Discord 10 |       11 | LinkedIn 12 |       13 | Twitter 14 |       15 | Blog 16 |       17 | GitHub 18 |

19 | 20 |

21 | Multimodal Embeddings from 64 to 768 Dimensions • 1B Parameter Chat 22 |
23 | Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents 24 |
25 | ONNX • CoreML • PyTorch 26 |
27 | Python 28 | • 29 | JavaScript 30 | • 31 | Swift 32 |

33 | 34 | --- 35 | 36 | ![UForm Chat Preview](https://github.com/ashvardanian/usearch-images/blob/main/assets/uform-gen-preview.jpg?raw=true) 37 | 38 | Welcome to UForm, a __multimodal__ AI library that's as versatile as it is efficient. 39 | UForm [tiny embedding models](#encoder) will help you understand and search visual and textual content across various languages. 40 | UForm [small generative models](#decoder), on the other hand, don't only support conversational and chat use-cases, but are great for fast image captioning and Visual Question Answering (VQA). 41 | With compact __custom pre-trained transformer models__, this can run anywhere from your server farm down to your smartphone. 42 | 43 | ## Features 44 | 45 | - __Tiny Embeddings__: 64-dimensional [Matryoshka][matryoshka]-style embeddings for extremely fast [search][usearch]. 46 | - __Throughput__: Thanks to the small size, the inference speed is [2-4x faster](#speed) than competitors. 47 | - __Portable__: Models come with native ONNX support, making them easy to deploy on any platform. 48 | - __Quantization Aware__: Down-cast embeddings from `f32` to `i8` without losing much recall. 49 | - __Multilingual__: Trained on a balanced dataset, the recall is great across over 20 languages. 50 | 51 | [usearch]: https://github.com/unum-cloud/usearch 52 | [matryoshka]: https://arxiv.org/abs/2205.13147 53 | 54 | ## Models 55 | 56 | For accuracy and speed benchmarks refer to the [evaluation page](https://github.com/unum-cloud/uform/blob/main/BENCHMARKS.md). 57 | 58 | ### Embedding Models 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 |
ModelParametersLanguagesArchitecture
uform3-image-text-english-large 🆕365 M112 layer BERT, ViT-L/14
uform3-image-text-english-base143 M14 layer BERT, ViT-B/16
uform3-image-text-english-small 🆕79 M14 layer BERT, ViT-S/16
uform3-image-text-multilingual-base206M2112 layer BERT, ViT-B/16
96 | 97 | ### Generative Models 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 |
ModelParametersPurposeArchitecture
uform-gen2-dpo 🆕1.2 BChat, Image Captioning, VQAqwen1.5-0.5B, ViT-H/14
uform-gen2-qwen-500m1.2 BChat, Image Captioning, VQAqwen1.5-0.5B, ViT-H/14
uform-gen ⚠️1.5 BImage Captioning, VQAllama-1.3B, ViT-B/16
129 | 130 | ## Quick Start Examples 131 | 132 | ### Embedding Models 133 | 134 | First, `pip install uform`. 135 | Then, load the model: 136 | 137 | ```py 138 | from uform import get_model, Modality 139 | 140 | processors, models = get_model('unum-cloud/uform3-image-text-english-small') 141 | 142 | model_text = models[Modality.TEXT_ENCODER] 143 | model_image = models[Modality.IMAGE_ENCODER] 144 | processor_text = processors[Modality.TEXT_ENCODER] 145 | processor_image = processors[Modality.IMAGE_ENCODER] 146 | ``` 147 | 148 | Embed images: 149 | 150 | ```py 151 | import requests 152 | from io import BytesIO 153 | from PIL import Image 154 | 155 | image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg' 156 | image = Image.open(BytesIO(requests.get(image_url).content)) 157 | image_data = processor_image(image) 158 | image_features, image_embedding = model_image.encode(image_data, return_features=True) 159 | ``` 160 | 161 | Embed queries: 162 | 163 | ```py 164 | text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background' 165 | text_data = processor_text(text) 166 | text_features, text_embedding = model_text.encode(text_data, return_features=True) 167 | ``` 168 | 169 | For more details check out: 170 | 171 | - Python docs on embedding models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#embedding-models) 172 | - JavaScript docs on embedding models in [javascript/README.md](https://github.com/unum-cloud/uform/blob/main/javascript/README.md#embedding-models) 173 | - Swift docs on embedding models in [swift/README.md](https://github.com/unum-cloud/uform/blob/main/swift/README.md#embedding-models) 174 | 175 | ### Generative Models 176 | 177 | The generative models are natively compatible with 178 | 179 | ```python 180 | from transformers import AutoModel, AutoProcessor 181 | 182 | model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True) 183 | processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True) 184 | 185 | prompt = 'Question or Instruction' 186 | image = Image.open('image.jpg') 187 | 188 | inputs = processor(text=[prompt], images=[image], return_tensors='pt') 189 | 190 | with torch.inference_mode(): 191 | output = model.generate( 192 | **inputs, 193 | do_sample=False, 194 | use_cache=True, 195 | max_new_tokens=256, 196 | eos_token_id=151645, 197 | pad_token_id=processor.tokenizer.pad_token_id 198 | ) 199 | prompt_len = inputs['input_ids'].shape[1] 200 | decoded_text = processor.batch_decode(output[:, prompt_len:])[0] 201 | ``` 202 | 203 | For more details check out: 204 | 205 | - Python docs on generative models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#generative-models) 206 | - JavaScript docs on generative models 🔜 207 | - Swift docs on generative models 🔜 208 | 209 | ## Technical Details 210 | 211 | ### Down-casting, Quantization, Matryoshka, and Slicing 212 | 213 | Depending on the application, the embeddings can be down-casted to smaller numeric representations without losing much recall. 214 | Switching from `f32` to `f16` is recommended in almost all cases, unless you are running on very old hardware without half-precision support. 215 | Switching to `i8` with linear scaling is also possible, but will be noticeable in the recall on larger collections with millions of searchable entries. 216 | Similarly, for higher-dimensional embeddings (512 or 768), a common strategy is to quantize them into single-bit representations for faster search. 217 | 218 | ```python 219 | import numpy as np 220 | 221 | f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False) 222 | f16_embedding: np.ndarray = f32_embedding.astype(np.float16) 223 | i8_embedding: np.ndarray = (f32_embedding * 127).astype(np.int8) 224 | b1_embedding: np.ndarray = np.packbits((f32_embedding > 0).astype(np.uint8)) 225 | ``` 226 | 227 | Alternative approach to quantization is to use the Matryoshka embeddings, where the embeddings are sliced into smaller parts, and the search is performed in a hierarchical manner. 228 | 229 | ```python 230 | import numpy as np 231 | 232 | large_embedding: np.ndarray = model.encode_text(text_data, return_features=False) 233 | small_embedding: np.ndarray = large_embedding[:, :256] 234 | tiny_embedding: np.ndarray = large_embedding[:, :64] 235 | ``` 236 | 237 | Both approaches are natively supported by the [USearch][github-usearch] vector-search engine and the [SimSIMD][github-simsimd] numerics libraries. 238 | When dealing with small collections (up to millions of entries) and looking for low-latency cosine distance calculations, you can [achieve 5x-2500x performance improvement][report-simsimd] over Torch, NumPy, SciPy, and vanilla Python using SimSIMD. 239 | 240 | ```python 241 | from simsimd import cosine, hamming 242 | 243 | distance: float = cosine(f32_embedding, f32_embedding) # 32x SciPy performance on Apple M2 CPU 244 | distance: float = cosine(f16_embedding, f16_embedding) # 79x SciPy performance on Apple M2 CPU 245 | distance: float = cosine(i8_embedding, i8_embedding) # 133x SciPy performance on Apple M2 CPU 246 | distance: float = hamming(b1_embedding, b1_embedding) # 17x SciPy performance on Apple M2 CPU 247 | ``` 248 | 249 | Similarly, when dealing with large collections (up to billions of entries per server) and looking for high-throughput search, you can [achieve 100x performance improvement][report-usearch] over FAISS and other vector-search solutions using USearch. 250 | Here are a couple of examples: 251 | 252 | ```python 253 | from usearch.index import Index 254 | 255 | f32_index = Index(ndim=64, metric='cos', dtype='f32') # for Matryoshka embeddings 256 | f16_index = Index(ndim=64, metric='cos', dtype='f16') # for Matryoshka embeddings 257 | i8_index = Index(ndim=256, metric='cos', dtype='i8') # for quantized embeddings 258 | b1_index = Index(ndim=768, metric='hamming', dtype='b1') # for binary embeddings 259 | ``` 260 | 261 | [github-usearch]: https://github.com/unum-cloud/usearch 262 | [github-simsimd]: https://github.com/ashvardanian/simsimd 263 | [report-usearch]: https://www.unum.cloud/blog/2023-11-07-scaling-vector-search-with-intel 264 | [report-simsimd]: https://ashvardanian.com/posts/python-c-assembly-comparison/ 265 | 266 | ### Compact Packaging 267 | 268 | PyTorch is a heavy dependency to carry, especially if you run on Edge or IoT devices. 269 | Using vanilla ONNX runtime, one can significantly reduce memory consumption and deployment latency. 270 | 271 | ```sh 272 | $ conda create -n uform_torch python=3.10 -y 273 | $ conda create -n uform_onnx python=3.10 -y 274 | $ conda activate uform_torch && pip install -e ".[torch]" && conda deactivate 275 | $ conda activate uform_onnx && pip install -e ".[onnx]" && conda deactivate 276 | $ du -sh $(conda info --envs | grep 'uform_torch' | awk '{print $2}') 277 | > 5.2G ~/conda/envs/uform_torch 278 | $ du -sh $(conda info --envs | grep 'uform_onnx' | awk '{print $2}') 279 | > 461M ~/conda/envs/uform_onnx 280 | ``` 281 | 282 | Most of that weight can be further reduced down to 100 MB for both the model and the runtime. 283 | You can pick one of many supported [ONNX execution providers][onnx-providers], which includes XNNPACK, CUDA and TensorRT for Nvidia GPUs, OpenVINO on Intel, DirectML on Windows, ROCm on AMD, CoreML on Apple devices, and more to come. 284 | 285 | [onnx-providers]: https://onnxruntime.ai/docs/execution-providers/ 286 | 287 | ### Multimodal Chat in CLI 288 | 289 | The generative models can be used for chat-like experiences in the command line. 290 | For that, you can use the `uform-chat` CLI tool, which is available in the UForm package. 291 | 292 | ```bash 293 | $ pip install uform 294 | $ uform-chat --model unum-cloud/uform-gen2-dpo --image=zebra.jpg 295 | $ uform-chat --model unum-cloud/uform-gen2-dpo \ 296 | > --image="https://bit.ly/3tIVg9M" \ 297 | > --device="cuda:0" \ 298 | > --fp16 299 | ``` 300 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 3.1.1 2 | -------------------------------------------------------------------------------- /assets/model_types_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unum-cloud/uform/33d5df7951cf3bee8b14d1110cc3bbae1ff6fba8/assets/model_types_bg.png -------------------------------------------------------------------------------- /assets/unum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unum-cloud/uform/33d5df7951cf3bee8b14d1110cc3bbae1ff6fba8/assets/unum.png -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = ../build/docs 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | p.caption { 2 | font-size: 0 !important; 3 | margin: 8px 0px !important; 4 | padding: 0 !important; 5 | border-bottom: 1px solid #8b7f8b12; 6 | } 7 | 8 | article>section>h1:nth-child(1) { 9 | display: none; 10 | } 11 | 12 | .sidebar-brand-text { 13 | cursor: initial; 14 | } 15 | 16 | table>tbody>tr>td { 17 | text-align: center; 18 | } 19 | 20 | table>tbody>tr>td:first-child { 21 | text-align: left; 22 | } 23 | 24 | #overview>p>a>img { 25 | height: 25px !important; 26 | } 27 | -------------------------------------------------------------------------------- /docs/_static/custom.js: -------------------------------------------------------------------------------- 1 | $(document).ready(function () { 2 | const github_logo = ` 3 | 4 | ` 5 | 6 | $(".sidebar-brand-text").html("Unum · UForm
2.1.1" + github_logo) 7 | }) 8 | -------------------------------------------------------------------------------- /docs/benchmarks.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Benchmarks 3 | ==================== 4 | 5 | .. mdinclude:: ../BENCHMARKS.md -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = "Unum · UForm" 10 | copyright = "2023, Unum" 11 | author = "Unum" 12 | release = open("../VERSION", "r").read().strip() 13 | with open("_static/custom.js", "r+") as js: 14 | content = js.read() 15 | js.seek(0) 16 | js.truncate() 17 | js.write(content.replace("$(VERSION)", release)) 18 | 19 | # -- General configuration --------------------------------------------------- 20 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 21 | 22 | extensions = [ 23 | "breathe", 24 | "m2r2", 25 | "sphinx.ext.autodoc", 26 | "sphinx_js", 27 | "sphinx.ext.autosummary", 28 | "sphinx.ext.intersphinx", 29 | "sphinx.ext.napoleon", 30 | "sphinxcontrib.jquery", 31 | "sphinxcontrib.googleanalytics", 32 | ] 33 | 34 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "*.md"] 35 | 36 | googleanalytics_id = "341385789" 37 | googleanalytics_enabled = True 38 | 39 | # -- Options for HTML output ------------------------------------------------- 40 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 41 | 42 | html_logo = "../assets/unum.png" 43 | html_theme = "furo" 44 | html_static_path = ["_static"] 45 | html_css_files = ["custom.css"] 46 | html_js_files = ["custom.js"] 47 | html_baseurl = "/docs/uform/" 48 | 49 | breathe_projects = {"UForm": "../build/xml"} 50 | breathe_default_project = "UForm" 51 | 52 | js_source_path = "../javascript/" 53 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Contributing 3 | ==================== 4 | 5 | .. mdinclude:: ../CONTRIBUTING.md -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Overview 3 | ==================== 4 | .. mdinclude:: ../README.md 5 | 6 | .. toctree:: 7 | :hidden: 8 | :caption: � 9 | 10 | python/index 11 | javascript/index 12 | swift/index 13 | 14 | .. toctree:: 15 | :hidden: 16 | :caption: � 17 | 18 | contributing 19 | benchmarks 20 | 21 | .. toctree:: 22 | :hidden: 23 | :caption: � 24 | 25 | genindex 26 | -------------------------------------------------------------------------------- /docs/javascript/index.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | JavaScript SDK 3 | ==================== 4 | 5 | 6 | .. mdinclude:: ../../javascript/README.md 7 | 8 | .. toctree:: 9 | :hidden: 10 | -------------------------------------------------------------------------------- /docs/javascript/reference.rst.txt: -------------------------------------------------------------------------------- 1 | API Reference 2 | ==================== 3 | 4 | ==================== 5 | Encoders 6 | ==================== 7 | 8 | .. js:autoclass:: ../javascript/encoders.TextProcessor 9 | :members: 10 | 11 | .. js:autoclass:: ../javascript/encoders.ImageProcessor 12 | :members: 13 | 14 | .. js:autoclass:: ../javascript/encoders.TextEncoder 15 | :members: 16 | 17 | .. js:autoclass:: ../javascript/encoders.ImageEncoder 18 | :members: 19 | -------------------------------------------------------------------------------- /docs/python/index.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Python SDK 3 | ==================== 4 | 5 | 6 | .. mdinclude:: ../../python/README.md 7 | 8 | .. toctree:: 9 | :hidden: 10 | 11 | reference -------------------------------------------------------------------------------- /docs/python/reference.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ==================== 3 | 4 | ==================== 5 | Root 6 | ==================== 7 | 8 | .. automodule:: uform 9 | :members: 10 | :undoc-members: 11 | 12 | ==================== 13 | Torch Encoreds 14 | ==================== 15 | 16 | .. automodule:: uform.torch_encoders 17 | :members: 18 | :undoc-members: 19 | 20 | ==================== 21 | Torch Processors 22 | ==================== 23 | 24 | .. automodule:: uform.torch_processors 25 | :members: 26 | :undoc-members: 27 | 28 | ==================== 29 | ONNX Encoders 30 | ==================== 31 | 32 | .. automodule:: uform.onnx_encoders 33 | :members: 34 | :undoc-members: 35 | 36 | ==================== 37 | NumPy Processors 38 | ==================== 39 | 40 | .. automodule:: uform.numpy_processors 41 | :members: 42 | :undoc-members: 43 | -------------------------------------------------------------------------------- /docs/swift/index.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Swift SDK 3 | ==================== 4 | 5 | 6 | .. mdinclude:: ../../swift/README.md 7 | -------------------------------------------------------------------------------- /javascript/README.md: -------------------------------------------------------------------------------- 1 | # UForm for JavaScript 2 | 3 | UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your JavaScript applications. 4 | Built around ONNX, the SDK is supposed to work with most runtimes and almost any hardware. 5 | 6 | ## Installation 7 | 8 | There are several ways to install the UForm JavaScript SDK from NPM. 9 | 10 | ```bash 11 | pnpm add uform 12 | npm add uform 13 | yarn add uform 14 | ``` 15 | 16 | ## Quick Start 17 | 18 | ### Embeddings 19 | 20 | ```js 21 | import { getModel, Modality, TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from '@unum-cloud/uform'; 22 | 23 | const { configPath, modalityPaths, tokenizerPath } = await getModel( 24 | modelId: 'unum-cloud/uform3-image-text-english-small', 25 | modalities: [Modality.TextEncoder, Modality.ImageEncoder] 26 | ); 27 | 28 | const textProcessor = new TextProcessor(configPath, tokenizerPath); 29 | await textProcessor.init(); 30 | const processedTexts = await textProcessor.process(["a small red panda in a zoo"]); 31 | 32 | const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); 33 | await textEncoder.init(); 34 | const textOutput = await textEncoder.encode(processedTexts); 35 | assert(textOutput.embeddings.dims.length === 2, "Output should be 2D"); 36 | await textEncoder.dispose(); 37 | 38 | const imageProcessor = new ImageProcessor(configPath); 39 | await imageProcessor.init(); 40 | const processedImages = await imageProcessor.process("path/to/image.png"); 41 | 42 | const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); 43 | await imageEncoder.init(); 44 | const imageOutput = await imageEncoder.encode(processedImages); 45 | assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D"); 46 | ``` 47 | 48 | The `textOutput` and `imageOutput` would contain `features` and `embeddings` properties, which are the same as the `features` and `embeddings` properties in the Python SDK. 49 | The embeddings can later be compared using the cosine similarity or other distance metrics. 50 | 51 | ### Generative Models 52 | 53 | Coming soon ... 54 | 55 | ## Technical Details 56 | 57 | ### Faster Search 58 | 59 | Depending on the application, the embeddings can be down-casted to smaller numeric representations without losing much recall. 60 | Independent of the quantization level, native JavaScript functionality may be too slow for large-scale search. 61 | In such cases, consider using [USearch][github-usearch] or [SimSimD][github-simsimd]. 62 | 63 | [github-usearch]: https://github.com/unum-cloud/usearch 64 | [github-simsimd]: https://github.com/ashvardanian/simsimd 65 | -------------------------------------------------------------------------------- /javascript/encoders.mjs: -------------------------------------------------------------------------------- 1 | import { readFileSync } from 'fs'; 2 | import { InferenceSession, Tensor } from 'onnxruntime-node'; 3 | import { PreTrainedTokenizer } from '@xenova/transformers'; 4 | import sharp from 'sharp'; 5 | 6 | /** 7 | * A processor for text data that prepares input for the text encoder model. 8 | */ 9 | class TextProcessor { 10 | 11 | /** 12 | * Constructs a new TextProcessor instance. 13 | * 14 | * @param {string} configPath - The path to the configuration file for the text encoder. 15 | * @param {string} tokenizerPath - The path to the tokenizer configuration file. 16 | */ 17 | constructor(configPath, tokenizerPath) { 18 | this.configPath = configPath; 19 | this.tokenizerPath = tokenizerPath; 20 | 21 | this.maxSeqLen = 0; 22 | this.padTokenIdx = 0; 23 | this.tokenizer = null; 24 | } 25 | 26 | /** 27 | * Initializes the TextProcessor by loading configurations and setting up the tokenizer. 28 | */ 29 | async init() { 30 | var config = JSON.parse(readFileSync(this.configPath, { encoding: 'utf8' })); 31 | if (config.text_encoder !== undefined) { 32 | config = config.text_encoder; 33 | } 34 | 35 | this.maxSeqLen = config.max_position_embeddings; 36 | this.padTokenIdx = config.padding_idx; 37 | 38 | const tokenizerConfig = JSON.parse(readFileSync(this.tokenizerPath, { encoding: 'utf8' })); 39 | this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config); 40 | this.tokenizer.model_max_length = this.maxSeqLen; 41 | this.tokenizer.pad_token_id = this.padTokenIdx; 42 | } 43 | 44 | /** 45 | * Processes a list of text strings into model-ready format, including padding and attention masks. 46 | * 47 | * @param {Array} texts - An array of text strings to process. 48 | * @return {Object} The processed texts as model input features. 49 | */ 50 | async process(texts) { 51 | 52 | const encoded = await this.tokenizer(texts, { 53 | add_special_tokens: true, 54 | padding: 'max_length', 55 | max_length: this.maxSeqLen, 56 | truncation: true, 57 | }); 58 | 59 | return { 60 | 'input_ids': encoded.input_ids, 61 | 'attention_mask': encoded.attention_mask, 62 | }; 63 | } 64 | } 65 | 66 | /** 67 | * An encoder for text data that uses a pre-trained model to encode text. 68 | */ 69 | class TextEncoder { 70 | 71 | /** 72 | * Constructs a new TextEncoder instance. 73 | * 74 | * @param {string} modelPath - The path to the pre-trained ONNX model. 75 | */ 76 | constructor(modelPath) { 77 | this.modelPath = modelPath; 78 | this.session = null; 79 | } 80 | 81 | /** 82 | * Initializes the ONNX session with the pre-trained model. 83 | */ 84 | async init() { 85 | this.session = await InferenceSession.create(this.modelPath); 86 | } 87 | 88 | /** 89 | * Releases the ONNX session resources. 90 | */ 91 | async dispose() { 92 | if (this.session) { 93 | await this.session.release().catch(error => console.error("Failed to release session", error)); 94 | this.session = null; 95 | } 96 | } 97 | 98 | /** 99 | * Encodes the input data using the pre-trained model. 100 | * 101 | * @param {Object} inputs - The input data containing input_ids and attention_mask. 102 | * @return {Object} The encoded outputs from the model. 103 | */ 104 | async encode(inputs) { 105 | if (!this.session) { 106 | throw new Error("Session is not initialized."); 107 | } 108 | 109 | // Helper function to convert BigInt64Array to Int32Array or validate Int32Array 110 | function ensureInt32Array(data) { 111 | if (data instanceof Int32Array) { 112 | return data; // Use as is if already Int32Array 113 | } 114 | if (data instanceof BigInt64Array) { 115 | // Convert BigInt64Array to Int32Array, ensuring all values are in range 116 | return new Int32Array(Array.from(data).map(bigInt => { 117 | if (bigInt > 2147483647n || bigInt < -2147483648n) { 118 | throw new Error("Value out of range for Int32."); 119 | } 120 | return Number(bigInt); // Convert BigInt to Number 121 | })); 122 | } 123 | // Additional case: handle conversion from generic Arrays or other typed arrays to Int32Array 124 | if (Array.isArray(data) || data instanceof Uint32Array || data instanceof Uint8Array) { 125 | return new Int32Array(data); // Convert directly 126 | } 127 | throw new Error("Unsupported data type for tensor conversion."); 128 | } 129 | 130 | // Prepare tensor data 131 | const inputIDsData = ensureInt32Array(inputs.input_ids.data); 132 | const attentionMaskData = ensureInt32Array(inputs.attention_mask.data); 133 | 134 | // Create ONNX Tensors as 'int32' 135 | const inputIDs = new Tensor('int32', inputIDsData, inputs.input_ids.dims); 136 | const attentionMask = new Tensor('int32', attentionMaskData, inputs.attention_mask.dims); 137 | 138 | // Run model inference 139 | return this.session.run({ 140 | input_ids: inputIDs, 141 | attention_mask: attentionMask, 142 | }); 143 | } 144 | 145 | } 146 | 147 | /** 148 | * A processor for image data that prepares images for the image encoder model. 149 | */ 150 | class ImageProcessor { 151 | constructor(configPath) { 152 | this.configPath = configPath; 153 | } 154 | 155 | /** 156 | * Initializes the ImageProcessor by loading configuration settings for image preprocessing. 157 | */ 158 | async init() { 159 | var config = JSON.parse(readFileSync(this.configPath, 'utf8')); 160 | if (config.image_encoder !== undefined) { 161 | config = config.image_encoder; 162 | } 163 | 164 | this.imageSize = config.image_size; 165 | this.normalizationMeans = config.normalization_means; 166 | this.normalizationDeviations = config.normalization_deviations; 167 | 168 | this.imageMean = new Float32Array(this.normalizationMeans); 169 | this.imageStd = new Float32Array(this.normalizationDeviations); 170 | } 171 | /** 172 | * Processes raw image data into a model-ready format, including resizing, cropping, and normalizing. 173 | * 174 | * @param {Buffer|Array} images - A single image or an array of images to process. 175 | * @return {Array} The processed image data as an array of Float32Arrays. 176 | */ 177 | async process(images) { 178 | const processSingle = async (image) => { 179 | let img = sharp(image).toColorspace('srgb'); 180 | const metadata = await img.metadata(); 181 | const scale = this.imageSize / Math.min(metadata.width, metadata.height); 182 | const scaledWidth = Math.ceil(metadata.width * scale); 183 | const scaledHeight = Math.ceil(metadata.height * scale); 184 | img = img.resize({ 185 | width: scaledWidth, 186 | height: scaledHeight, 187 | fit: sharp.fit.cover, 188 | position: sharp.strategy.entropy, 189 | options: sharp.interpolators.bicubic 190 | }).extract({ 191 | left: Math.max(0, Math.floor((scaledWidth - this.imageSize) / 2)), 192 | top: Math.max(0, Math.floor((scaledHeight - this.imageSize) / 2)), 193 | width: this.imageSize, 194 | height: this.imageSize 195 | }).removeAlpha(); 196 | 197 | let buffer = await img.raw().toBuffer(); 198 | let array = new Float32Array(buffer.length); 199 | 200 | // When we export into the `array`, we reorder the dimensions of the tensor 201 | // from HWC to CHW, and normalize the pixel values. 202 | let channelSize = this.imageSize * this.imageSize; 203 | for (let i = 0; i < this.imageSize * this.imageSize; i++) { 204 | let r = buffer[i * 3]; 205 | let g = buffer[i * 3 + 1]; 206 | let b = buffer[i * 3 + 2]; 207 | array[i] = (r / 255.0 - this.imageMean[0]) / this.imageStd[0]; 208 | array[channelSize + i] = (g / 255.0 - this.imageMean[1]) / this.imageStd[1]; 209 | array[channelSize * 2 + i] = (b / 255.0 - this.imageMean[2]) / this.imageStd[2]; 210 | } 211 | 212 | return array; 213 | }; 214 | 215 | if (Array.isArray(images)) { 216 | return Promise.all(images.map(img => processSingle(img))); 217 | } else { 218 | return [await processSingle(images)]; 219 | } 220 | } 221 | } 222 | 223 | /** 224 | * An encoder for image data that uses a pre-trained model to encode images. 225 | */ 226 | class ImageEncoder { 227 | constructor(modelPath, processor) { 228 | this.modelPath = modelPath; 229 | this.imageSize = processor.imageSize; 230 | } 231 | 232 | /** 233 | * Initializes the ONNX session with the pre-trained model. 234 | */ 235 | async init() { 236 | this.session = await InferenceSession.create(this.modelPath); 237 | } 238 | 239 | /** 240 | * Releases the ONNX session resources. 241 | */ 242 | async dispose() { 243 | if (this.session) { 244 | await this.session.release().catch(error => console.error("Failed to release session", error)); 245 | this.session = null; 246 | } 247 | } 248 | 249 | /** 250 | * Encodes the processed image data using the pre-trained model. 251 | * 252 | * @param {Float32Array|Array} images - The processed image data. 253 | * @return {Object} The encoded outputs from the model. 254 | */ 255 | async encode(images) { 256 | if (!this.session) { 257 | throw new Error("Session is not initialized."); 258 | } 259 | 260 | // Helper function to ensure data is a Float32Array. 261 | const ensureFloat32Array = (data) => { 262 | if (!(data instanceof Float32Array)) { 263 | throw new Error("Unsupported data type for tensor conversion."); 264 | } 265 | return data; 266 | }; 267 | 268 | // Helper function to concatenate multiple Float32Arrays into a single Float32Array. 269 | const concatFloat32Arrays = (arrays) => { 270 | const totalLength = arrays.reduce((acc, val) => acc + val.length, 0); 271 | const result = new Float32Array(totalLength); 272 | let offset = 0; 273 | for (let arr of arrays) { 274 | result.set(arr, offset); 275 | offset += arr.length; 276 | } 277 | return result; 278 | }; 279 | 280 | let imagesData; 281 | let dims; 282 | 283 | if (Array.isArray(images)) { 284 | // Assuming each image in the array is a Float32Array representing an image already processed to a fixed size. 285 | const arrays = images.map(ensureFloat32Array); 286 | imagesData = concatFloat32Arrays(arrays); 287 | const numImages = arrays.length; 288 | const numChannels = 3; 289 | const height = this.imageSize; 290 | const width = this.imageSize; 291 | dims = [numImages, numChannels, height, width]; 292 | } else { 293 | // Single image images, which is already a Float32Array. 294 | imagesData = ensureFloat32Array(images); 295 | const numChannels = 3; 296 | const height = this.imageSize; 297 | const width = this.imageSize; 298 | dims = [1, numChannels, height, width]; 299 | } 300 | 301 | // Create ONNX Tensor 302 | const imagesTensor = new Tensor('float32', imagesData, dims); 303 | 304 | // Run model inference 305 | return this.session.run({ 306 | images: imagesTensor, 307 | }); 308 | } 309 | } 310 | 311 | export { TextProcessor, TextEncoder, ImageProcessor, ImageEncoder }; 312 | -------------------------------------------------------------------------------- /javascript/encoders_test.js: -------------------------------------------------------------------------------- 1 | import { existsSync, readFileSync } from 'fs'; 2 | import { fileURLToPath } from 'url'; 3 | import path from 'path'; 4 | import assert from 'assert'; 5 | import fetch from 'node-fetch'; 6 | 7 | import { getModel, Modality, TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from './index.mjs'; 8 | 9 | // Check if the HuggingFace Hub API token is set in the environment variable. 10 | let hf_token = process.env.HUGGINGFACE_HUB_TOKEN; 11 | if (!hf_token) { 12 | const dirname = path.dirname(fileURLToPath(import.meta.url)); 13 | const tokenPath = path.join(dirname, '../', '.hf_token'); 14 | if (existsSync(tokenPath)) { 15 | hf_token = readFileSync(tokenPath, 'utf8').trim(); 16 | } 17 | } 18 | 19 | async function tryGettingCheckpoint(modelId, modalities) { 20 | const { configPath, modalityPaths, tokenizerPath } = await getModel( 21 | modelId, 22 | modalities, 23 | hf_token, 24 | '.onnx' 25 | ); 26 | 27 | assert(configPath !== null, "Config path should not be null"); 28 | assert(modalityPaths !== null, "Modality paths should not be null"); 29 | assert(tokenizerPath !== null, "Tokenizer path should not be null"); 30 | 31 | // Check if the file actually exists 32 | assert(existsSync(configPath), `Config file should exist at ${configPath}`); 33 | assert(existsSync(tokenizerPath), `Tokenizer file should exist at ${tokenizerPath}`); 34 | for (const modalityPath of Object.values(modalityPaths)) { 35 | assert(existsSync(modalityPath), `Modality file should exist at ${modalityPath}`); 36 | } 37 | } 38 | 39 | async function testGetCheckpoint() { 40 | console.log("- `testGetCheckpoint`: Start"); 41 | 42 | try { 43 | const modalities = [Modality.TextEncoder, Modality.ImageEncoder]; 44 | 45 | for (const modelId of [ 46 | 'unum-cloud/uform3-image-text-english-small', 47 | 'unum-cloud/uform3-image-text-english-base', 48 | 'unum-cloud/uform3-image-text-english-large', 49 | 'unum-cloud/uform3-image-text-multilingual-base', 50 | ]) { 51 | await tryGettingCheckpoint(modelId, modalities, hf_token); 52 | } 53 | 54 | console.log("- `testGetCheckpoint`: Success"); 55 | } catch (error) { 56 | console.error("- `testGetCheckpoint`: Failed", error); 57 | } 58 | } 59 | 60 | async function tryTextEncoderForwardPass(modelId) { 61 | const modalities = [Modality.TextEncoder]; 62 | const { configPath, modalityPaths, tokenizerPath } = await getModel( 63 | modelId, 64 | modalities, 65 | hf_token, 66 | '.onnx' 67 | ); 68 | 69 | const textProcessor = new TextProcessor(configPath, tokenizerPath); 70 | await textProcessor.init(); 71 | const processedTexts = await textProcessor.process("a small red panda in a zoo"); 72 | 73 | const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); 74 | await textEncoder.init(); 75 | const textOutput = await textEncoder.encode(processedTexts); 76 | assert(textOutput.embeddings.dims.length === 2, "Output should be 2D"); 77 | 78 | await textEncoder.dispose(); 79 | } 80 | 81 | async function tryImageEncoderForwardPass(modelId) { 82 | const modalities = [Modality.ImageEncoder]; 83 | const { configPath, modalityPaths } = await getModel( 84 | modelId, 85 | modalities, 86 | hf_token, 87 | '.onnx' 88 | ); 89 | 90 | const imageProcessor = new ImageProcessor(configPath); 91 | await imageProcessor.init(); 92 | const processedImages = await imageProcessor.process("assets/unum.png"); 93 | 94 | const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); 95 | await imageEncoder.init(); 96 | const imageOutput = await imageEncoder.encode(processedImages); 97 | assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D"); 98 | 99 | await imageEncoder.dispose(); 100 | } 101 | 102 | function cosineSimilarity(vecA, vecB) { 103 | // We may be receiving a complex tensor type, so let's check if it 104 | // has an array member named `data`. 105 | if (vecA.data) { 106 | vecA = vecA.data; 107 | } 108 | if (vecB.data) { 109 | vecB = vecB.data; 110 | } 111 | 112 | let dotProduct = 0.0; 113 | let normA = 0.0; 114 | let normB = 0.0; 115 | for (let i = 0; i < vecA.length; i++) { 116 | dotProduct += vecA[i] * 1.0 * vecB[i]; 117 | normA += vecA[i] * 1.0 * vecA[i]; 118 | normB += vecB[i] * 1.0 * vecB[i]; 119 | } 120 | if (normA === 0 || normB === 0) { 121 | return 0; 122 | } else { 123 | return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)); 124 | } 125 | } 126 | 127 | async function fetchImage(url) { 128 | const response = await fetch(url); 129 | const arrayBuffer = await response.arrayBuffer(); 130 | const buffer = Buffer.from(arrayBuffer); 131 | return buffer; 132 | } 133 | 134 | async function tryCrossReferencingImageAndText(modelId) { 135 | 136 | const modalities = [Modality.ImageEncoder, Modality.TextEncoder]; 137 | const { configPath, modalityPaths, tokenizerPath } = await getModel( 138 | modelId, 139 | modalities, 140 | hf_token, 141 | '.onnx' 142 | ); 143 | 144 | const imageProcessor = new ImageProcessor(configPath); 145 | await imageProcessor.init(); 146 | const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); 147 | await imageEncoder.init(); 148 | const textProcessor = new TextProcessor(configPath, tokenizerPath); 149 | await textProcessor.init(); 150 | const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); 151 | await textEncoder.init(); 152 | 153 | const texts = [ 154 | "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", 155 | "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", 156 | "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", 157 | "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", 158 | "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", 159 | ]; 160 | const imageUrls = [ 161 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true", 162 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true", 163 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true", 164 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true", 165 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true", 166 | ]; 167 | 168 | const textEmbeddings = []; 169 | const imageEmbeddings = []; 170 | 171 | for (let i = 0; i < texts.length; i++) { 172 | const text = texts[i]; 173 | const imageUrl = imageUrls[i]; 174 | const imageBuffer = await fetchImage(imageUrl); 175 | 176 | const processedText = await textProcessor.process(text); 177 | const processedImage = await imageProcessor.process(imageBuffer); 178 | 179 | const textEmbedding = await textEncoder.encode(processedText); 180 | const imageEmbedding = await imageEncoder.encode(processedImage); 181 | 182 | textEmbeddings.push(new Float32Array(textEmbedding.embeddings.data)); 183 | imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.data)); 184 | 185 | // Print-based debugging at its best :) 186 | // console.log(`Text: ${text}, Image: ${imageUrl}`); 187 | // console.log(`Text embedding first components: ${textEmbeddings[i].slice(0, 5)}`); 188 | // console.log(`Image embedding first components: ${imageEmbeddings[i].slice(0, 5)}`); 189 | console.log(`Similarity: ${cosineSimilarity(textEmbeddings[i], imageEmbeddings[i])}`) 190 | } 191 | 192 | for (let i = 0; i < texts.length; i++) { 193 | const pairSimilarity = cosineSimilarity(textEmbeddings[i], imageEmbeddings[i]); 194 | const otherTextSimilarities = textEmbeddings.map((te, idx) => idx === i ? -Infinity : cosineSimilarity(te, imageEmbeddings[i])); 195 | const otherImageSimilarities = imageEmbeddings.map((ie, idx) => idx === i ? -Infinity : cosineSimilarity(textEmbeddings[i], ie)); 196 | 197 | const maxOtherTextSimilarity = Math.max(...otherTextSimilarities); 198 | const maxOtherImageSimilarity = Math.max(...otherImageSimilarities); 199 | 200 | assert(pairSimilarity > maxOtherTextSimilarity, "Text should be more similar to its corresponding image than to other images."); 201 | assert(pairSimilarity > maxOtherImageSimilarity, "Image should be more similar to its corresponding text than to other texts."); 202 | } 203 | 204 | await textEncoder.dispose(); 205 | await imageEncoder.dispose(); 206 | } 207 | 208 | async function testEncoders() { 209 | console.log("- `testEncoders`: Start"); 210 | 211 | try { 212 | 213 | // Go through the bi-modal models 214 | for (const modelId of [ 215 | 'unum-cloud/uform3-image-text-english-small', 216 | // 'unum-cloud/uform3-image-text-english-base', 217 | // 'unum-cloud/uform3-image-text-english-large', 218 | // 'unum-cloud/uform3-image-text-multilingual-base', 219 | ]) { 220 | await tryTextEncoderForwardPass(modelId, hf_token); 221 | await tryImageEncoderForwardPass(modelId, hf_token); 222 | await tryCrossReferencingImageAndText(modelId, hf_token); 223 | } 224 | 225 | console.log("- `testEncoders`: Success"); 226 | } catch (error) { 227 | console.error("- `testEncoders`: Failed", error); 228 | } 229 | } 230 | 231 | process.on('uncaughtException', (error) => { 232 | console.error('Uncaught Exception:', error); 233 | }); 234 | 235 | testGetCheckpoint(); 236 | testEncoders(); 237 | -------------------------------------------------------------------------------- /javascript/hub.mjs: -------------------------------------------------------------------------------- 1 | import { join } from "path" 2 | import { createWriteStream, existsSync, mkdirSync, writeFileSync } from "fs"; 3 | 4 | import { downloadFile, listFiles } from "@huggingface/hub"; 5 | 6 | const Modality = { 7 | TextEncoder: "text_encoder", 8 | ImageEncoder: "image_encoder", 9 | VideoEncoder: "video_encoder", 10 | TextDecoder: "text_decoder", 11 | }; 12 | 13 | function isModality(value) { 14 | return Object.values(Modality).includes(value); 15 | } 16 | 17 | function normalizeModalities(modalities) { 18 | return modalities.map(x => { 19 | if (typeof x === "string") { 20 | if (isModality(x)) { 21 | return x; 22 | } else { 23 | throw new Error(`Invalid modality: ${x}`); 24 | } 25 | } 26 | return x; 27 | }); 28 | } 29 | 30 | async function ensureDirectoryExists(dirPath) { 31 | if (!existsSync(dirPath)) { 32 | mkdirSync(dirPath, { recursive: true }); 33 | } 34 | } 35 | 36 | async function getModel(modelId, modalities, token = null, format = '.onnx', saveDir = './models') { 37 | modalities = normalizeModalities(modalities); 38 | 39 | const configNames = ['config.json']; 40 | const tokenizerNames = ['tokenizer.json']; 41 | const modelFileNames = modalities.map(modality => `${modality}${format}`); 42 | const allowedPatterns = [...modelFileNames, ...configNames, ...tokenizerNames]; 43 | 44 | const repo = { type: "model", name: modelId }; 45 | const credentials = token ? { accessToken: token } : undefined; 46 | 47 | let configPath = null; 48 | let tokenizerPath = null; 49 | const modalityPaths = {}; 50 | const modelSaveDir = join(saveDir, modelId); 51 | 52 | await ensureDirectoryExists(modelSaveDir); 53 | 54 | const fileIterator = listFiles({ repo, recursive: true, credentials }); 55 | for await (const file of fileIterator) { 56 | const fileName = file.path.split('/').pop(); 57 | if (fileName && allowedPatterns.includes(fileName)) { 58 | const filePath = file.path; 59 | const savePath = join(modelSaveDir, fileName); 60 | 61 | if (configNames.includes(fileName)) { 62 | configPath = savePath; 63 | } else if (tokenizerNames.includes(fileName)) { 64 | tokenizerPath = savePath; 65 | } else { 66 | const modalityName = fileName.split('.')[0]; 67 | modalityPaths[modalityName] = savePath; 68 | } 69 | 70 | const response = await downloadFile({ repo, path: filePath, credentials }); 71 | if (response) { 72 | // HuggingFace might be defining the `env.localModelPath` variable 73 | // to store the downloaded files in a local directory. 74 | // Let's check if the file is there. 75 | // const localPath = join(env.localModelPath, repo, filePath); 76 | // if (existsSync(localPath)) { 77 | // console.log(`File already exists locally at ${localPath}`); 78 | // } 79 | 80 | if (response.body && response.body.pipe) { 81 | const fileStream = createWriteStream(savePath); 82 | response.body.pipe(fileStream); 83 | await new Promise((resolve, reject) => { 84 | fileStream.on('finish', resolve); 85 | fileStream.on('error', reject); 86 | }); 87 | } else if (response.arrayBuffer) { 88 | // Handle non-streamable response for environments like Node.js 89 | const buffer = await response.arrayBuffer(); 90 | writeFileSync(savePath, Buffer.from(buffer)); 91 | } else { 92 | console.error('Unexpected response type'); 93 | } 94 | console.log(`Downloaded ${fileName} successfully to ${savePath}`); 95 | } else { 96 | console.log('No response received for the file download request.'); 97 | } 98 | } 99 | } 100 | 101 | return { configPath, modalityPaths, tokenizerPath }; 102 | } 103 | 104 | export { getModel, Modality }; 105 | -------------------------------------------------------------------------------- /javascript/index.mjs: -------------------------------------------------------------------------------- 1 | // Re-export everything from hub.mjs 2 | export * from './hub.mjs'; 3 | 4 | // Re-export everything from encoders.mjs 5 | export * from './encoders.mjs'; 6 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@unum-cloud/uform", 3 | "type": "module", 4 | "version": "3.1.1", 5 | "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation", 6 | "dependencies": { 7 | "@huggingface/hub": "^0.14.8", 8 | "@xenova/transformers": "^2.17.0", 9 | "node-fetch": "^3.3.2", 10 | "onnxruntime-node": "^1.17.0", 11 | "onnxruntime-web": "^1.17.3" 12 | }, 13 | "devDependencies": { 14 | "nodemon": "^2.0.15" 15 | }, 16 | "scripts": { 17 | "start": "node javascript/encoders.mjs", 18 | "test": "node javascript/encoders_test.js" 19 | }, 20 | "main": "javascript/index.mjs", 21 | "files": [ 22 | "javascript/index.mjs", 23 | "javascript/encoders.mjs", 24 | "javascript/hub.mjs" 25 | ], 26 | "directories": { 27 | "doc": "docs" 28 | }, 29 | "keywords": [ 30 | "AI", 31 | "multimodal", 32 | "content generation", 33 | "huggingface" 34 | ], 35 | "author": "Ash Vardanian, Unum Cloud", 36 | "license": "Apache-2.0" 37 | } 38 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "setuptools.build_meta" 3 | requires = ["setuptools>=42"] 4 | 5 | [project] 6 | authors = [ 7 | {email = "ash.vardanian@unum.cloud", name = "Ash Vardanian"}, 8 | {email = "mike.kim@unum.cloud", name = "Mikhail Kim"}, 9 | {email = "vladimir.orshulevich@unum.cloud", name = "Vladimir Orshulevich"}, 10 | ] 11 | classifiers = [ 12 | "Development Status :: 5 - Production/Stable", 13 | "License :: OSI Approved :: Apache Software License", 14 | "Natural Language :: Chinese (Simplified)", 15 | "Natural Language :: English", 16 | "Natural Language :: French", 17 | "Natural Language :: German", 18 | "Natural Language :: Italian", 19 | "Natural Language :: Japanese", 20 | "Natural Language :: Korean", 21 | "Natural Language :: Polish", 22 | "Natural Language :: Russian", 23 | "Natural Language :: Spanish", 24 | "Natural Language :: Turkish", 25 | "Operating System :: OS Independent", 26 | "Programming Language :: Python :: 3", 27 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 28 | "Topic :: Scientific/Engineering :: Image Processing", 29 | "Topic :: Scientific/Engineering :: Image Recognition", 30 | ] 31 | dependencies = [ 32 | "huggingface_hub>=0.16.4", 33 | "tokenizers>=0.13.3", 34 | "pillow", 35 | "simsimd", 36 | ] 37 | description = "Pocket-Sized Multimodal AI for Content Understanding and Generation" 38 | maintainers = [ 39 | {email = "info@unum.cloud", name = "Unum Cloud"}, 40 | ] 41 | name = "uform" 42 | readme = "README.md" 43 | requires-python = ">=3.7" 44 | version = "3.1.1" 45 | 46 | [project.scripts] 47 | uform-chat = "uform.chat:main" 48 | 49 | [project.optional-dependencies] 50 | torch = ["torch>=1.13.1", "torchvision", "transformers>=4.36.2"] 51 | onnx = ["onnx>=1.15.0", "onnxruntime>=1.17.1", "numpy"] 52 | onnx-gpu = ["onnx>=1.15.0", "onnxruntime-gpu>=1.17.1", "numpy"] 53 | dev = ["pytest", "pandas"] 54 | 55 | [project.urls] 56 | "Homepage" = "https://github.com/unum-cloud/uform" 57 | 58 | [tool.setuptools.packages.find] 59 | where = ["python"] 60 | include = ["uform"] 61 | namespaces = false 62 | 63 | [tool.ruff] 64 | ignore = ["C408", "C901", "E501", "E741"] 65 | ignore-init-module-imports = true 66 | select = ["C", "E", "F", "I", "UP", "W"] 67 | 68 | [tool.ruff.isort] 69 | lines-after-imports = 2 70 | 71 | [tool.ruff.lint.isort] 72 | known-first-party = ["uform"] 73 | 74 | [tool.ruff.per-file-ignores] 75 | "__init__.py" = ["E401"] 76 | 77 | [tool.tomlsort] 78 | all = true 79 | in_place = true 80 | spaces_before_inline_comment = 2 81 | spaces_indent_inline_array = 4 82 | trailing_comma_inline_array = true 83 | 84 | # Configuration options for the Black formatter: 85 | # https://black.readthedocs.io/en/latest/usage_and_configuration/the_basics.html#where-black-looks-for-the-file 86 | [tool.black] 87 | line-length = 120 # Set line length to the same value as in `.clang-format` for modern wide screens 88 | target-version = ['py36', 'py312'] # Set target Python versions to 3.6 and 3.12 -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # UForm Python SDK 2 | 3 | UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your Python applications. 4 | The SDK doesn't require any deep learning knowledge, PyTorch, or CUDA installation, and can run on almost any hardware. 5 | 6 | ## Installation 7 | 8 | There are several ways to install the UForm Python SDK, depending on the backend you want to use. 9 | PyTorch is by far the heaviest, but the most capable. 10 | ONNX is a lightweight alternative that can run on any CPU, and on some GPUs. 11 | 12 | ```bash 13 | pip install "uform[torch]" # For PyTorch 14 | pip install "uform[onnx]" # For ONNX on CPU 15 | pip install "uform[onnx-gpu]" # For ONNX on GPU, available for some platforms 16 | pip install "uform[torch,onnx]" # For PyTorch and ONNX Python tests 17 | ``` 18 | 19 | ## Quick Start 20 | 21 | ### Embeddings 22 | 23 | Load the model: 24 | 25 | ```py 26 | from uform import get_model, Modality 27 | 28 | model_name = 'unum-cloud/uform3-image-text-english-small' 29 | modalities = [Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER] 30 | processors, models = get_model(model_name, modalities=modalities) 31 | 32 | model_text = models[Modality.TEXT_ENCODER] 33 | model_image = models[Modality.IMAGE_ENCODER] 34 | processor_text = processors[Modality.TEXT_ENCODER] 35 | processor_image = processors[Modality.IMAGE_ENCODER] 36 | ``` 37 | 38 | Embed images: 39 | 40 | ```py 41 | import requests 42 | from io import BytesIO 43 | from PIL import Image 44 | 45 | image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg' 46 | image_url = Image.open(BytesIO(requests.get(image_url).content)) 47 | image_data = processor_image(image) 48 | image_features, image_embedding = model_image.encode(image_data, return_features=True) 49 | ``` 50 | 51 | Embed queries: 52 | 53 | ```py 54 | text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background' 55 | text_data = processor_text(text) 56 | text_features, text_embedding = model_text.encode(text_data, return_features=True) 57 | ``` 58 | 59 | ### Generative Models 60 | 61 | UForm generative models are fully compatible with the Hugging Face Transformers library, and can be used without installing the UForm library. 62 | Those models can be used to caption images or power multimodal chat experiences. 63 | 64 | ```python 65 | from transformers import AutoModel, AutoProcessor 66 | 67 | model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True) 68 | processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True) 69 | 70 | prompt = 'Question or Instruction' 71 | image = Image.open('image.jpg') 72 | 73 | inputs = processor(text=[prompt], images=[image], return_tensors='pt') 74 | 75 | with torch.inference_mode(): 76 | output = model.generate( 77 | **inputs, 78 | do_sample=False, 79 | use_cache=True, 80 | max_new_tokens=256, 81 | eos_token_id=151645, 82 | pad_token_id=processor.tokenizer.pad_token_id 83 | ) 84 | prompt_len = inputs['input_ids'].shape[1] 85 | decoded_text = processor.batch_decode(output[:, prompt_len:])[0] 86 | ``` 87 | 88 | You can check examples of different prompts in our demo Gradio spaces on HuggingFace: 89 | 90 | - for [`uform-gen2-qwen-500m`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo) 91 | - for [`uform-gen2-dpo`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-dpo-demo) 92 | 93 | ## Technical Details 94 | 95 | ### Multi-GPU Parallelism 96 | 97 | To achieve higher throughput, you can launch UForm on multiple GPUs. 98 | For that pick the encoder of the model you want to run in parallel, and wrap it in `nn.DataParallel` (or `nn.DistributedDataParallel`). 99 | 100 | ```python 101 | from uform import get_model, Modality 102 | import torch.nn as nn 103 | 104 | encoders, processors = uform.get_model('unum-cloud/uform-vl-english-small', backend='torch') 105 | 106 | model_text = models[Modality.TEXT_ENCODER] 107 | model_image = models[Modality.IMAGE_ENCODER] 108 | processor_text = processors[Modality.TEXT_ENCODER] 109 | processor_image = processors[Modality.IMAGE_ENCODER] 110 | 111 | model_text.return_features = False 112 | model_image.return_features = False 113 | model_text_parallel = nn.DataParallel(model_text) 114 | model_image_parallel = nn.DataParallel(model_image) 115 | ``` 116 | 117 | Since we are now dealing with the PyTorch wrapper, make sure to use the `forward` method (instead of `encode`) to get the embeddings, and the `.detach().cpu().numpy()` sequence to bring the data back to more Pythonic NumPy arrays. 118 | 119 | ```python 120 | def get_image_embedding(images: List[Image]): 121 | preprocessed = processor_image(images) 122 | embedding = model_image_parallel.forward(preprocessed) 123 | return embedding.detach().cpu().numpy() 124 | 125 | def get_text_embedding(texts: List[str]): 126 | preprocessed = processor_text(texts) 127 | embedding = model_text_parallel.forward(preprocessed) 128 | return embedding.detach().cpu().numpy() 129 | ``` 130 | 131 | ### ONNX and CUDA 132 | 133 | The configuration process may include a few additional steps, depending on the environment. 134 | When using the CUDA and TensorRT backends with CUDA 12 or newer make sure to [install the Nvidia toolkit][install-nvidia-toolkit] and the `onnxruntime-gpu` package from the custom repository. 135 | 136 | ```sh 137 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb 138 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 139 | sudo apt-get update 140 | sudo apt-get -y install cuda-toolkit-12 141 | pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ 142 | export CUDA_PATH="/usr/local/cuda-12/bin" 143 | export PATH="/usr/local/cuda-12/bin${PATH:+:${PATH}}" 144 | export LD_LIBRARY_PATH="/usr/local/cuda-12/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" 145 | pytest python/scripts/ -s -x -Wd -v -k onnx 146 | ``` 147 | 148 | [install-nvidia-toolkit]: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu 149 | -------------------------------------------------------------------------------- /python/scripts/bench_decoders.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from time import perf_counter 3 | from dataclasses import dataclass 4 | from typing import List 5 | import argparse 6 | 7 | import requests 8 | import torch 9 | from PIL import Image 10 | from transformers import ( 11 | AutoProcessor, 12 | InstructBlipForConditionalGeneration, 13 | InstructBlipProcessor, 14 | LlavaForConditionalGeneration, 15 | AutoModel, 16 | AutoProcessor, 17 | ) 18 | 19 | from uform.torch_decoders import VLMForCausalLM, VLMProcessor 20 | 21 | dtype = torch.bfloat16 22 | low_cpu_mem_usage = False 23 | device = "cuda:0" 24 | 25 | 26 | @dataclass 27 | class BenchmarkResult: 28 | model_name: str 29 | device_name: str 30 | backend_name: str 31 | duration_image_preprocessing: float 32 | duration_image_embedding: float 33 | duration_text_preprocessing: float 34 | duration_text_embedding: float 35 | 36 | 37 | def caption(model, processor, prompt: str, image: Image.Image, max_length: int, batch_size: int) -> List[str]: 38 | # BLIP models require the prompt to be the first argument 39 | prompt = [prompt] * batch_size 40 | image = [image] * batch_size 41 | try: 42 | inputs = processor(prompt, image, return_tensors="pt") 43 | except ValueError: 44 | inputs = processor(image, prompt, return_tensors="pt") 45 | 46 | # Downcast and move to device 47 | for possible_key in ["images", "pixel_values"]: 48 | if possible_key not in inputs: 49 | continue 50 | inputs[possible_key] = inputs[possible_key].to(dtype) # Downcast floats 51 | inputs = {k: v.to(device) for k, v in inputs.items()} # Move to the right device 52 | 53 | with torch.inference_mode(): 54 | output = model.generate( 55 | **inputs, 56 | do_sample=False, 57 | # use_cache=True, 58 | max_new_tokens=max_length, 59 | eos_token_id=32001, 60 | pad_token_id=processor.tokenizer.pad_token_id, 61 | ) 62 | prompt_len = inputs["input_ids"].shape[1] 63 | decoded_texts = processor.batch_decode( 64 | output[:, prompt_len:], 65 | skip_special_tokens=True, 66 | ) 67 | return decoded_texts 68 | 69 | 70 | def duration(callable): 71 | """Profile the duration of a callable and return the duration and the result.""" 72 | start = perf_counter() 73 | result = callable() 74 | stop = perf_counter() 75 | return stop - start, result 76 | 77 | 78 | def bench_captions( 79 | model, 80 | processor, 81 | prompt: str, 82 | images: List[Image.Image], 83 | max_length: int = 256, 84 | batch_size: int = 10, 85 | ) -> List[str]: 86 | total_duration = 0 87 | total_length = 0 88 | model = torch.compile(model) 89 | 90 | def caption_image(image): 91 | return caption( 92 | model=model, 93 | processor=processor, 94 | prompt=prompt, 95 | image=image, 96 | max_length=max_length, 97 | batch_size=batch_size, 98 | ) 99 | 100 | for image in images: 101 | seconds, captions = duration(partial(caption_image, image=image)) 102 | total_duration += seconds 103 | total_length += len(captions.strip()) if isinstance(captions, str) else sum(len(t.strip()) for t in captions) 104 | 105 | del model 106 | del processor 107 | print(f"Throughput: {total_length/total_duration:.2f} tokens/s") 108 | 109 | 110 | def main(batch_size: int = 10, max_length: int = 256): 111 | 112 | image_urls = [ 113 | "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 114 | "https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 115 | "https://images.unsplash.com/photo-1703244551371-ecffad9cc3b6?q=80&w=2859&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 116 | "https://plus.unsplash.com/premium_photo-1702910931866-2642eee270b1?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 117 | "https://plus.unsplash.com/premium_photo-1700583712241-893aded49e69?q=80&w=2942&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 118 | ] 119 | images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls] 120 | captions = [ 121 | "lonely house in a beautiful valley. house is made of white wood and black bricks. its surrounded by a green field", 122 | "grab last-mile delivery driver on a scooter grabbing a delivery in Jakarta", 123 | "monochrome picture of new york in the late 2th century on a sunny day, showing a few canonical brick buildings and the citizens bank", 124 | "asian girl sleeping in a bed. top down view", 125 | "a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it", 126 | ] 127 | 128 | print("UForm-Gen2") 129 | bench_captions( 130 | model=AutoModel.from_pretrained( 131 | "unum-cloud/uform-gen2-dpo", 132 | trust_remote_code=True, 133 | torch_dtype=dtype, 134 | low_cpu_mem_usage=low_cpu_mem_usage, 135 | ignore_mismatched_sizes=True, 136 | ).to(device), 137 | processor=AutoProcessor.from_pretrained( 138 | "unum-cloud/uform-gen2-dpo", 139 | trust_remote_code=True, 140 | ), 141 | prompt="Describe the picture in great detail", 142 | images=images, 143 | batch_size=batch_size, 144 | max_length=max_length, 145 | ) 146 | 147 | print("UForm-Gen") 148 | bench_captions( 149 | model=VLMForCausalLM.from_pretrained( 150 | "unum-cloud/uform-gen", 151 | torch_dtype=dtype, 152 | low_cpu_mem_usage=low_cpu_mem_usage, 153 | ignore_mismatched_sizes=True, 154 | ).to(device), 155 | processor=VLMProcessor.from_pretrained( 156 | "unum-cloud/uform-gen", 157 | ), 158 | prompt="[cap] Summarize the visual content of the image.", 159 | images=images, 160 | batch_size=batch_size, 161 | max_length=max_length, 162 | ) 163 | 164 | print("LLaVA") 165 | bench_captions( 166 | model=LlavaForConditionalGeneration.from_pretrained( 167 | "llava-hf/llava-1.5-7b-hf", 168 | torch_dtype=dtype, 169 | low_cpu_mem_usage=low_cpu_mem_usage, 170 | ).to(device), 171 | processor=AutoProcessor.from_pretrained( 172 | "llava-hf/llava-1.5-7b-hf", 173 | ), 174 | prompt="USER: \nWhat are these?\nASSISTANT:", 175 | images=images, 176 | batch_size=batch_size, 177 | max_length=max_length, 178 | ) 179 | 180 | print("InstructBLIP") 181 | bench_captions( 182 | model=InstructBlipForConditionalGeneration.from_pretrained( 183 | "Salesforce/instructblip-vicuna-7b", 184 | torch_dtype=dtype, 185 | low_cpu_mem_usage=low_cpu_mem_usage, 186 | ).to(device), 187 | processor=InstructBlipProcessor.from_pretrained( 188 | "Salesforce/instructblip-vicuna-7b", 189 | ), 190 | prompt="Summarize the visual content of the image.", 191 | images=images, 192 | batch_size=batch_size, 193 | max_length=max_length, 194 | ) 195 | 196 | 197 | if __name__ == "__main__": 198 | 199 | parser = argparse.ArgumentParser() 200 | parser.add_argument( 201 | "--batch-size", 202 | type=int, 203 | default=10, 204 | help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.", 205 | ) 206 | parser.add_argument( 207 | "--max-length", 208 | type=str, 209 | default=256, 210 | help="Maximum length of the generated text in tokens.", 211 | ) 212 | args = parser.parse_args() 213 | 214 | main(batch_size=args.batch_size, max_length=args.max_length) 215 | -------------------------------------------------------------------------------- /python/scripts/bench_encoders.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This script provides the throughput of UForm multimodal embedding models. 5 | 6 | The output of the script will cover: 7 | - Time to preprocess an image, and throughput in images/s. 8 | - Time to tokenize the text, and throughput in queries/s. 9 | - Time to encode the image, and throughput in images/s. 10 | - Time to encode the text, and throughput in queries/s. 11 | - Share of time spent on each part of the pipeline. 12 | 13 | Those numbers are presented for every model, device (cpu or gpu), backend (torch or onnx), 14 | and precision (float32 or bfloat16), producing a pretty comprehensive benchmark. 15 | 16 | Before running the script - install all available packages via `pip install -e ".[torch,onnx,onnx-gpu]"`. 17 | Before printing the numbers, a warm-up is performed to ensure the model is loaded and the cache is filled. 18 | """ 19 | 20 | from functools import partial 21 | from time import perf_counter 22 | from dataclasses import dataclass 23 | from typing import List, Tuple, Literal, Callable, Generator 24 | import re 25 | import argparse 26 | 27 | import requests 28 | from PIL import Image 29 | import pandas as pd 30 | 31 | from uform import get_model, Modality, ExecutionProviderError 32 | 33 | # Define global constants for the hardware availability 34 | torch_available = False 35 | try: 36 | import torch 37 | 38 | torch_available = True 39 | except ImportError: 40 | pass 41 | onnx_available = False 42 | try: 43 | import onnx 44 | 45 | onnx_available = True 46 | except ImportError: 47 | pass 48 | cuda_available = False 49 | try: 50 | if torch_available: 51 | cuda_available = torch.cuda.is_available() 52 | elif onnx_available: 53 | import onnxruntime 54 | 55 | cuda_available = onnxruntime.get_device() == "GPU" 56 | except ImportError: 57 | pass 58 | 59 | 60 | @dataclass 61 | class BenchmarkResult: 62 | model_name: str 63 | device_name: Literal["cpu", "cuda"] = "cpu" 64 | backend_name: Literal["torch", "onnx"] = "torch" 65 | duration_image_preprocessing: float = 0 66 | duration_image_embedding: float = 0 67 | duration_text_preprocessing: float = 0 68 | duration_text_embedding: float = 0 69 | 70 | 71 | def duration(callable, synchronize=False): 72 | """Profile the duration of a callable and return the duration and the result.""" 73 | if synchronize and torch_available and cuda_available: 74 | torch.cuda.synchronize() # Wait for CUDA operations to complete 75 | start = perf_counter() 76 | result = callable() 77 | if synchronize and torch_available and cuda_available: 78 | torch.cuda.synchronize() # Ensure all CUDA kernels have finished 79 | stop = perf_counter() 80 | return stop - start, result 81 | 82 | 83 | def get_captioned_images() -> List[Tuple[Image.Image, str]]: 84 | """Get a list of pre-downloaded and decoded images and their captions.""" 85 | image_urls = [ 86 | "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 87 | "https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 88 | "https://images.unsplash.com/photo-1703244551371-ecffad9cc3b6?q=80&w=2859&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 89 | "https://plus.unsplash.com/premium_photo-1702910931866-2642eee270b1?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 90 | "https://plus.unsplash.com/premium_photo-1700583712241-893aded49e69?q=80&w=2942&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", 91 | ] 92 | images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls] 93 | captions = [ 94 | "lonely house in a beautiful valley. house is made of white wood and black bricks. its surrounded by a green field", 95 | "grab last-mile delivery driver on a scooter grabbing a delivery in Jakarta", 96 | "monochrome picture of new york in the late 2th century on a sunny day, showing a few canonical brick buildings and the citizens bank", 97 | "asian girl sleeping in a bed. top down view", 98 | "a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it", 99 | ] 100 | return list(zip(images, captions)) 101 | 102 | 103 | def yield_benchmarks(batch_size: int) -> Generator[Tuple[BenchmarkResult, Callable], None, None]: 104 | """Yields callable benchmarks for all supported backends of the given model.""" 105 | 106 | # Pull the content and artificially grow the batch size 107 | images, captions = zip(*get_captioned_images()) 108 | 109 | if len(images) < batch_size: 110 | import math 111 | 112 | multiplier = int(math.ceil(batch_size / len(images))) 113 | images *= multiplier 114 | captions *= multiplier 115 | images = images[:batch_size] 116 | captions = captions[:batch_size] 117 | 118 | def run(model_name: str, device: str, backend_name: str): 119 | result = BenchmarkResult( 120 | model_name=model_name, 121 | backend_name=backend_name, 122 | device_name=device, 123 | duration_image_preprocessing=0, 124 | duration_image_embedding=0, 125 | duration_text_preprocessing=0, 126 | duration_text_embedding=0, 127 | ) 128 | 129 | sync = backend_name == "torch" 130 | processors, models = get_model( 131 | model_name, 132 | device=device, 133 | modalities=[Modality.IMAGE_ENCODER, Modality.TEXT_ENCODER], 134 | backend=backend_name, 135 | ) 136 | 137 | model_text = models[Modality.TEXT_ENCODER] 138 | model_image = models[Modality.IMAGE_ENCODER] 139 | processor_text = processors[Modality.TEXT_ENCODER] 140 | processor_image = processors[Modality.IMAGE_ENCODER] 141 | 142 | # Image preprocessing 143 | total_duration = 0 144 | total_iterations = 0 145 | while total_duration < 10 and total_iterations < 100: 146 | seconds, _ = duration(lambda: processor_image(images)) 147 | total_duration += seconds 148 | total_iterations += len(images) 149 | duration_per_iteration = total_duration / total_iterations 150 | result.duration_image_preprocessing = duration_per_iteration 151 | 152 | # Image embedding 153 | total_duration = 0 154 | total_iterations = 0 155 | while total_duration < 10 and total_iterations < 100: 156 | images_data = processor_image(images) 157 | seconds, _ = duration(lambda: model_image.encode(images_data), synchronize=sync) 158 | total_duration += seconds 159 | total_iterations += len(images) 160 | duration_per_iteration = total_duration / total_iterations 161 | result.duration_image_embedding = duration_per_iteration 162 | 163 | # Text preprocessing 164 | total_duration = 0 165 | total_iterations = 0 166 | while total_duration < 10 and total_iterations < 100: 167 | seconds, _ = duration(lambda: processor_text(captions)) 168 | total_duration += seconds 169 | total_iterations += len(captions) 170 | duration_per_iteration = total_duration / total_iterations 171 | result.duration_text_preprocessing = duration_per_iteration 172 | 173 | # Text embedding 174 | total_duration = 0 175 | total_iterations = 0 176 | while total_duration < 10 and total_iterations < 100: 177 | texts_data = processor_text(captions) 178 | seconds, _ = duration(lambda: model_text.encode(texts_data), synchronize=sync) 179 | total_duration += seconds 180 | total_iterations += len(captions) 181 | duration_per_iteration = total_duration / total_iterations 182 | result.duration_text_embedding = duration_per_iteration 183 | 184 | return result 185 | 186 | devices = ["cpu"] 187 | if cuda_available: 188 | devices.append("cuda") 189 | backends = [] 190 | if torch_available: 191 | backends.append("torch") 192 | if onnx_available: 193 | backends.append("onnx") 194 | 195 | for device in devices: 196 | for backend_name in backends: 197 | for model_name in [ 198 | "unum-cloud/uform3-image-text-english-small", 199 | "unum-cloud/uform3-image-text-english-base", 200 | "unum-cloud/uform3-image-text-english-large", 201 | "unum-cloud/uform3-image-text-multilingual-base", 202 | ]: 203 | yield BenchmarkResult( 204 | model_name=model_name, 205 | device_name=device, 206 | backend_name=backend_name, 207 | ), partial(run, model_name, device, backend_name) 208 | 209 | 210 | def main(filter_out: str = None, batch_size: int = 10): 211 | results = [] 212 | filter_pattern = re.compile(filter_out) if filter_out else None 213 | for specs, func in yield_benchmarks(batch_size=batch_size): 214 | if filter_pattern and ( 215 | filter_pattern.search(specs.model_name) 216 | or filter_pattern.search(specs.backend_name) 217 | or filter_pattern.search(specs.device_name) 218 | ): 219 | continue 220 | 221 | try: 222 | print(f"Running `{specs.model_name}` on `{specs.device_name}` using `{specs.backend_name}` backend") 223 | result = func() 224 | results.append(result) 225 | except ExecutionProviderError as e: 226 | print(f"- skipping missing backend") 227 | print(e) 228 | 229 | results = sorted(results, key=lambda x: x.model_name) 230 | results = [x.__dict__ for x in results] 231 | 232 | df = pd.DataFrame(results) 233 | df.columns = [ 234 | "Model Name", 235 | "Device", 236 | "Backend", 237 | "Images Preprocessed/s", 238 | "Images Encoded/s", 239 | "Texts Preprocessed/s", 240 | "Texts Encoded/s", 241 | ] 242 | 243 | def inverse(x): 244 | return 1 / x if x != 0 else 0 245 | 246 | # Apply number formatting directly in the DataFrame 247 | formatted_df = df.copy() 248 | formatted_df["Images Preprocessed/s"] = df["Images Preprocessed/s"].map(inverse).map("{:,.2f}".format) 249 | formatted_df["Images Encoded/s"] = df["Images Encoded/s"].map(inverse).map("{:,.2f}".format) 250 | formatted_df["Texts Preprocessed/s"] = df["Texts Preprocessed/s"].map(inverse).map("{:,.2f}".format) 251 | formatted_df["Texts Encoded/s"] = df["Texts Encoded/s"].map(inverse).map("{:,.2f}".format) 252 | 253 | # Convert formatted DataFrame to Markdown 254 | print(formatted_df.to_markdown()) 255 | 256 | 257 | if __name__ == "__main__": 258 | 259 | parser = argparse.ArgumentParser() 260 | parser.add_argument( 261 | "--filter-out", 262 | type=str, 263 | default=None, 264 | help="Filter out models, backends, or devices with a Regular Expression.", 265 | ) 266 | parser.add_argument( 267 | "--batch-size", 268 | type=int, 269 | default=10, 270 | help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.", 271 | ) 272 | args = parser.parse_args() 273 | 274 | main(filter_out=args.filter_out, batch_size=args.batch_size) 275 | -------------------------------------------------------------------------------- /python/scripts/export_decoders.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n", 8 | "\n", 9 | "Depending on the backend, we prefer different qunatization schemes.\n", 10 | "\n", 11 | "- For ONNX we use `uint8` quantization.\n", 12 | "- For PyTorch we use `bfloat16` quantization.\n", 13 | "- For CoreML we use `float32` representation." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "!pip install --upgrade \"uform[torch]\" coremltools" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import os\n", 32 | "model_name = \"unum-cloud/uform-gen2-dpo\"\n", 33 | "output_directory = \"../../\"" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import torch\n", 43 | "import uform\n", 44 | "from PIL import Image\n", 45 | "from transformers import AutoModel, AutoProcessor\n", 46 | "\n", 47 | "model = AutoModel.from_pretrained(model_name, trust_remote_code=True)\n", 48 | "processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)\n", 49 | "\n", 50 | "prompt = 'Describe the picture'\n", 51 | "image = Image.open('../../assets/unum.png')\n", 52 | "inputs = processor(text=[prompt], images=[image], return_tensors='pt')\n", 53 | "\n", 54 | "with torch.inference_mode():\n", 55 | " output = model.generate(\n", 56 | " **inputs,\n", 57 | " do_sample=False,\n", 58 | " use_cache=True,\n", 59 | " max_new_tokens=256,\n", 60 | " eos_token_id=151645,\n", 61 | " pad_token_id=processor.tokenizer.pad_token_id\n", 62 | " )\n", 63 | "prompt_len = inputs['input_ids'].shape[1]\n", 64 | "decoded_text = processor.batch_decode(output[:, prompt_len:])[0]\n", 65 | "\n", 66 | "print(decoded_text)" 67 | ] 68 | } 69 | ], 70 | "metadata": { 71 | "kernelspec": { 72 | "display_name": "base", 73 | "language": "python", 74 | "name": "python3" 75 | }, 76 | "language_info": { 77 | "codemirror_mode": { 78 | "name": "ipython", 79 | "version": 3 80 | }, 81 | "file_extension": ".py", 82 | "mimetype": "text/x-python", 83 | "name": "python", 84 | "nbconvert_exporter": "python", 85 | "pygments_lexer": "ipython3", 86 | "version": "3.11.5" 87 | } 88 | }, 89 | "nbformat": 4, 90 | "nbformat_minor": 2 91 | } 92 | -------------------------------------------------------------------------------- /python/scripts/test_decoders.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from PIL import Image 3 | 4 | # PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed 5 | try: 6 | import torch 7 | 8 | torch_available = True 9 | except: 10 | torch_available = False 11 | 12 | torch_hf_models = [ 13 | "unum-cloud/uform-gen2-qwen-500m", 14 | "unum-cloud/uform-gen2-dpo", 15 | ] 16 | 17 | 18 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") 19 | @pytest.mark.parametrize("model_name", torch_hf_models) 20 | def test_one_conversation(model_name: str): 21 | from transformers import AutoModel, AutoProcessor 22 | 23 | model = AutoModel.from_pretrained(model_name, trust_remote_code=True) 24 | processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) 25 | 26 | prompt = "Describe the image in great detail." 27 | image = Image.open("assets/unum.png") 28 | 29 | inputs = processor(text=[prompt], images=[image], return_tensors="pt") 30 | 31 | with torch.inference_mode(): 32 | output = model.generate( 33 | **inputs, 34 | do_sample=False, 35 | use_cache=True, 36 | max_new_tokens=10, 37 | pad_token_id=processor.tokenizer.pad_token_id, 38 | ) 39 | prompt_len = inputs["input_ids"].shape[1] 40 | decoded_text = processor.batch_decode(output[:, prompt_len:])[0] 41 | 42 | assert len(decoded_text), "No text was generated from the model." 43 | 44 | 45 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") 46 | @pytest.mark.parametrize("model_name", torch_hf_models) 47 | @pytest.mark.parametrize("batch_size", [1, 2]) 48 | def test_many_conversations(model_name: str, batch_size: int): 49 | 50 | from transformers import AutoModel, AutoProcessor 51 | 52 | model = AutoModel.from_pretrained(model_name, trust_remote_code=True) 53 | processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) 54 | 55 | prompt = "Describe the image in great detail." 56 | image = Image.open("assets/unum.png") 57 | 58 | texts = [prompt] * batch_size 59 | images = [image] * batch_size 60 | inputs = processor(text=texts, images=images, return_tensors="pt") 61 | 62 | with torch.inference_mode(): 63 | output = model.generate( 64 | **inputs, 65 | do_sample=False, 66 | use_cache=True, 67 | max_new_tokens=10, 68 | pad_token_id=processor.tokenizer.pad_token_id, 69 | ) 70 | prompt_len = inputs["input_ids"].shape[1] 71 | decoded_texts = processor.batch_decode(output[:, prompt_len:]) 72 | 73 | assert all(len(decoded_text) for decoded_text in decoded_texts), "No text was generated from the model." 74 | -------------------------------------------------------------------------------- /python/scripts/test_encoders.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | from typing import Tuple 3 | import requests 4 | from io import BytesIO 5 | import os 6 | 7 | import pytest 8 | import numpy as np 9 | from PIL import Image 10 | 11 | from uform import Modality, get_model, ExecutionProviderError 12 | 13 | # PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed 14 | try: 15 | import torch 16 | 17 | torch_available = True 18 | except: 19 | torch_available = False 20 | 21 | # ONNX is not a very light dependency either 22 | try: 23 | import onnx 24 | 25 | onnx_available = True 26 | except: 27 | onnx_available = False 28 | 29 | torch_models = [ 30 | "unum-cloud/uform3-image-text-english-small", 31 | "unum-cloud/uform3-image-text-english-base", 32 | "unum-cloud/uform3-image-text-english-large", 33 | "unum-cloud/uform3-image-text-multilingual-base", 34 | ] 35 | 36 | onnx_models = [ 37 | "unum-cloud/uform3-image-text-english-small", 38 | "unum-cloud/uform3-image-text-english-base", 39 | "unum-cloud/uform3-image-text-english-large", 40 | "unum-cloud/uform3-image-text-multilingual-base", 41 | ] 42 | 43 | # Let's check if the HuggingFace Hub API token is set in the environment variable. 44 | # If it's not there, check if the `.hf_token` file is present in the current working directory. 45 | token = os.getenv("HUGGINGFACE_HUB_TOKEN", None) 46 | if token is None: 47 | token_path = "./.hf_token" 48 | if os.path.exists(token_path): 49 | with open(token_path, "r") as file: 50 | token = file.read().strip() 51 | 52 | 53 | def skip_on(exception, reason="No good reason :)"): 54 | def decorator_func(f): 55 | @wraps(f) 56 | def wrapper(*args, **kwargs): 57 | try: 58 | # Try to run the test 59 | return f(*args, **kwargs) 60 | except exception: 61 | pytest.skip(reason) 62 | 63 | return wrapper 64 | 65 | return decorator_func 66 | 67 | 68 | def cosine_similarity(x, y) -> float: 69 | if not isinstance(x, np.ndarray): 70 | x = x.detach().numpy() 71 | if not isinstance(y, np.ndarray): 72 | y = y.detach().numpy() 73 | 74 | # Unlike NumPy, SimSIMD can properly deal with integer types 75 | x = x.astype(np.float32).flatten() 76 | y = y.astype(np.float32).flatten() 77 | return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) 78 | 79 | 80 | def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embedding, batch_size_multiple: int = 1): 81 | """Test if the embeddings of text and image are semantically similar 82 | using a small set of example text-image pairs.""" 83 | 84 | texts = [ 85 | "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", 86 | "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", 87 | "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", 88 | "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", 89 | "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", 90 | ] 91 | 92 | image_urls = [ 93 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true", 94 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true", 95 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true", 96 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true", 97 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true", 98 | ] 99 | assert len(texts) == len(image_urls), "Number of texts and images should be the same." 100 | 101 | images = [Image.open(BytesIO(requests.get(image_url).content)) for image_url in image_urls] 102 | count_pairs = len(texts) 103 | 104 | # Ensure we have a sufficiently large batch 105 | texts = texts * batch_size_multiple 106 | images = images * batch_size_multiple 107 | 108 | # Compute the embedding in a batch fashion 109 | text_embeddings = text_to_embedding(texts) 110 | image_embeddings = image_to_embedding(images) 111 | 112 | # Evaluate cosine similarity 113 | for i in range(count_pairs): 114 | pair_similarity = cosine_similarity(text_embeddings[i], image_embeddings[i]) 115 | other_text_similarities = [ 116 | cosine_similarity(text_embeddings[j], image_embeddings[i]) for j in range(count_pairs) if j != i 117 | ] 118 | other_image_similarities = [ 119 | cosine_similarity(text_embeddings[i], image_embeddings[j]) for j in range(count_pairs) if j != i 120 | ] 121 | 122 | assert pair_similarity > max( 123 | other_text_similarities 124 | ), "Text should be more similar to its corresponding image than to other images." 125 | assert pair_similarity > max( 126 | other_image_similarities 127 | ), "Image should be more similar to its corresponding text than to other texts." 128 | 129 | 130 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") 131 | @pytest.mark.parametrize("model_name", torch_models) 132 | def test_torch_one_embedding(model_name: str): 133 | processors, models = get_model(model_name, token=token, backend="torch") 134 | model_text = models[Modality.TEXT_ENCODER] 135 | model_image = models[Modality.IMAGE_ENCODER] 136 | processor_text = processors[Modality.TEXT_ENCODER] 137 | processor_image = processors[Modality.IMAGE_ENCODER] 138 | 139 | text = "a small red panda in a zoo" 140 | image_path = "assets/unum.png" 141 | 142 | image = Image.open(image_path) 143 | image_data = processor_image(image) 144 | text_data = processor_text(text) 145 | 146 | image_features, image_embedding = model_image.encode(image_data, return_features=True) 147 | text_features, text_embedding = model_text.encode(text_data, return_features=True) 148 | 149 | assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" 150 | assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" 151 | 152 | # Test if the model outputs actually make sense 153 | cross_references_image_and_text_embeddings( 154 | lambda text: model_text(processor_text(text)), 155 | lambda image: model_image(processor_image(image)), 156 | ) 157 | 158 | 159 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") 160 | @pytest.mark.parametrize("model_name", torch_models) 161 | @pytest.mark.parametrize("batch_size", [1, 2]) 162 | def test_torch_many_embeddings(model_name: str, batch_size: int): 163 | 164 | processors, models = get_model(model_name, token=token, backend="torch") 165 | model_text = models[Modality.TEXT_ENCODER] 166 | model_image = models[Modality.IMAGE_ENCODER] 167 | processor_text = processors[Modality.TEXT_ENCODER] 168 | processor_image = processors[Modality.IMAGE_ENCODER] 169 | 170 | texts = ["a small red panda in a zoo"] * batch_size 171 | image_paths = ["assets/unum.png"] * batch_size 172 | 173 | images = [Image.open(path) for path in image_paths] 174 | image_data = processor_image(images) 175 | text_data = processor_text(texts) 176 | 177 | image_embeddings = model_image.encode(image_data, return_features=False) 178 | text_embeddings = model_text.encode(text_data, return_features=False) 179 | 180 | assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected" 181 | assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected" 182 | 183 | 184 | @pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") 185 | @pytest.mark.parametrize("model_name", onnx_models) 186 | @pytest.mark.parametrize("device", ["CPUExecutionProvider"]) 187 | @skip_on(ExecutionProviderError, reason="Missing execution provider") 188 | def test_onnx_one_embedding(model_name: str, device: str): 189 | 190 | processors, models = get_model(model_name, token=token, device=device, backend="onnx") 191 | model_text = models[Modality.TEXT_ENCODER] 192 | model_image = models[Modality.IMAGE_ENCODER] 193 | processor_text = processors[Modality.TEXT_ENCODER] 194 | processor_image = processors[Modality.IMAGE_ENCODER] 195 | 196 | text = "a small red panda in a zoo" 197 | image_path = "assets/unum.png" 198 | 199 | image = Image.open(image_path) 200 | image_data = processor_image(image) 201 | text_data = processor_text(text) 202 | 203 | image_features, image_embedding = model_image.encode(image_data) 204 | text_features, text_embedding = model_text.encode(text_data) 205 | 206 | assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" 207 | assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" 208 | 209 | # Nested fucntions are easier to debug, than lambdas 210 | def get_image_embedding(image_data): 211 | features, embedding = model_image.encode(processor_image(image_data)) 212 | return embedding 213 | 214 | def get_text_embedding(text_data): 215 | features, embedding = model_text.encode(processor_text(text_data)) 216 | return embedding 217 | 218 | # Test if the model outputs actually make sense 219 | cross_references_image_and_text_embeddings(get_text_embedding, get_image_embedding) 220 | 221 | 222 | @pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") 223 | @pytest.mark.parametrize("model_name", onnx_models) 224 | @pytest.mark.parametrize("batch_size", [1, 2]) 225 | @pytest.mark.parametrize("device", ["CPUExecutionProvider"]) 226 | @skip_on(ExecutionProviderError, reason="Missing execution provider") 227 | def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str): 228 | 229 | processors, models = get_model(model_name, token=token, device=device, backend="onnx") 230 | model_text = models[Modality.TEXT_ENCODER] 231 | model_image = models[Modality.IMAGE_ENCODER] 232 | processor_text = processors[Modality.TEXT_ENCODER] 233 | processor_image = processors[Modality.IMAGE_ENCODER] 234 | 235 | texts = ["a small red panda in a zoo"] * batch_size 236 | image_paths = ["assets/unum.png"] * batch_size 237 | 238 | images = [Image.open(path) for path in image_paths] 239 | image_data = processor_image(images) 240 | text_data = processor_text(texts) 241 | 242 | image_embeddings = model_image.encode(image_data, return_features=False) 243 | text_embeddings = model_text.encode(text_data, return_features=False) 244 | 245 | assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected" 246 | assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected" 247 | 248 | 249 | @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") 250 | @pytest.mark.parametrize("model_name", torch_models[:1]) 251 | def test_torch_multi_gpu(model_name: str): 252 | 253 | count_cuda_devices = torch.cuda.device_count() 254 | if count_cuda_devices < 2: 255 | pytest.skip("Not enough CUDA devices to run multi-GPU test") 256 | 257 | processors, models = get_model(model_name, token=token, backend="torch", device="cuda") 258 | model_text = models[Modality.TEXT_ENCODER] 259 | model_image = models[Modality.IMAGE_ENCODER] 260 | processor_text = processors[Modality.TEXT_ENCODER] 261 | processor_image = processors[Modality.IMAGE_ENCODER] 262 | 263 | import torch.nn as nn 264 | 265 | model_text.return_features = False 266 | model_image.return_features = False 267 | model_text_parallel = nn.DataParallel(model_text) 268 | model_image_parallel = nn.DataParallel(model_image) 269 | 270 | # Nested fucntions are easier to debug, than lambdas 271 | def get_image_embedding(image_data): 272 | preprocessed = processor_image(image_data) 273 | embedding = model_image_parallel.forward(preprocessed) 274 | return embedding.detach().cpu().numpy() 275 | 276 | def get_text_embedding(text_data): 277 | preprocessed = processor_text(text_data) 278 | embedding = model_text_parallel.forward(preprocessed) 279 | return embedding.detach().cpu().numpy() 280 | 281 | # Test if the model outputs actually make sense 282 | cross_references_image_and_text_embeddings( 283 | get_text_embedding, 284 | get_image_embedding, 285 | batch_size_multiple=count_cuda_devices, 286 | ) 287 | 288 | 289 | if __name__ == "__main__": 290 | # If you want to run this test file individually, you can do so by running: 291 | # pytest.main(["-s", "-x", __file__]) 292 | pass 293 | -------------------------------------------------------------------------------- /python/uform/__init__.py: -------------------------------------------------------------------------------- 1 | from os.path import join, exists 2 | from typing import Dict, Optional, Tuple, Literal, Union, Callable 3 | 4 | from huggingface_hub import snapshot_download, utils 5 | 6 | from uform.shared import ExecutionProviderError, Modality 7 | 8 | 9 | def _normalize_modalities(modalities: Tuple[str, Modality]) -> Tuple[Modality]: 10 | if modalities is None: 11 | return (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER, Modality.TEXT_DECODER, Modality.VIDEO_ENCODER) 12 | 13 | return tuple(x if isinstance(x, Modality) else Modality(x) for x in modalities) 14 | 15 | 16 | def get_checkpoint( 17 | model_name: str, 18 | modalities: Tuple[str, Modality], 19 | token: Optional[str] = None, 20 | format: Literal[".pt", ".onnx"] = ".pt", 21 | ) -> Tuple[str, Dict[Modality, str], Optional[str]]: 22 | """Downloads a model checkpoint from the Hugging Face Hub. 23 | 24 | :param model_name: The name of the model to download, like `unum-cloud/uform3-image-text-english-small` 25 | :param token: The Hugging Face API token, if required 26 | :param modalities: The modalities to download, like `("text_encoder", "image_encoder")` 27 | :param format: The format of the model checkpoint, either `.pt` or `.onnx` 28 | :return: A tuple of the config path, dictionary of paths to different modalities, and tokenizer path 29 | """ 30 | 31 | modalities = _normalize_modalities(modalities) 32 | 33 | # It is not recommended to use `.pth` extension when checkpointing models 34 | # because it collides with Python path (`.pth`) configuration files. 35 | merged_model_names = [x + format for x in ["torch_weight", "weight", "model"]] 36 | separate_modality_names = [(x.value if isinstance(x, Modality) else x) + format for x in modalities] 37 | config_names = ["torch_config.json", "config.json"] 38 | tokenizer_names = ["tokenizer.json"] 39 | 40 | old_progress_behavior = utils.are_progress_bars_disabled() 41 | utils.disable_progress_bars() 42 | 43 | # The download stats depend on the number of times the `config.json` is pulled 44 | # https://huggingface.co/docs/hub/models-download-stats 45 | model_path = snapshot_download( 46 | repo_id=model_name, 47 | token=token, 48 | allow_patterns=merged_model_names + separate_modality_names + config_names + tokenizer_names, 49 | ) 50 | 51 | if old_progress_behavior: 52 | utils.enable_progress_bars() 53 | 54 | # Find the first name in `config_names` that is present 55 | config_path = None 56 | for config_name in config_names: 57 | if exists(join(model_path, config_name)): 58 | config_path = join(model_path, config_name) 59 | break 60 | 61 | # Same for the tokenizer 62 | tokenizer_path = None 63 | for tokenizer_name in tokenizer_names: 64 | if exists(join(model_path, tokenizer_name)): 65 | tokenizer_path = join(model_path, tokenizer_name) 66 | break 67 | 68 | # Ideally, we want to separately fetch all the models. 69 | # If those aren't available, aggregate separate modalities and merge them. 70 | modality_paths = None 71 | for file_name in merged_model_names: 72 | if exists(join(model_path, file_name)): 73 | modality_paths = join(model_path, file_name) 74 | break 75 | 76 | if modality_paths is None: 77 | modality_paths = {} 78 | for separate_modality_name in separate_modality_names: 79 | if exists(join(model_path, separate_modality_name)): 80 | modality_name, _, _ = separate_modality_name.partition(".") 81 | modality_paths[Modality(modality_name)] = join(model_path, separate_modality_name) 82 | 83 | return config_path, modality_paths, tokenizer_path 84 | 85 | 86 | def get_model_torch( 87 | model_name: str, 88 | *, 89 | token: Optional[str] = None, 90 | device: Literal["cpu", "cuda"] = "cpu", 91 | modalities: Optional[Tuple[Union[str, Modality]]] = None, 92 | ) -> Tuple[Dict[Modality, Callable], Dict]: 93 | """ 94 | Fetches and constructs a PyTorch model with its processors based on provided modalities. 95 | 96 | :param model_name: The identifier of the model on the Hugging Face Hub. 97 | :param token: Optional API token for authenticated access to the model. 98 | :param device: The device to load the model onto ('cpu' or 'cuda'). 99 | :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder). 100 | :return: A tuple containing dictionaries for processors and models keyed by their respective modalities. 101 | """ 102 | from uform.torch_encoders import TextEncoder, ImageEncoder 103 | from uform.torch_processors import TextProcessor, ImageProcessor 104 | 105 | modalities = _normalize_modalities(modalities) 106 | config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".pt") 107 | 108 | result_processors = {} 109 | result_models = {} 110 | 111 | if Modality.TEXT_ENCODER in modalities: 112 | processor = TextProcessor(config_path, tokenizer_path) 113 | encoder = TextEncoder.from_pretrained(config_path, modality_paths.get(Modality.TEXT_ENCODER)) 114 | encoder = encoder.eval().to(device) 115 | result_processors[Modality.TEXT_ENCODER] = processor 116 | result_models[Modality.TEXT_ENCODER] = encoder 117 | 118 | if Modality.IMAGE_ENCODER in modalities: 119 | processor = ImageProcessor(config_path) 120 | encoder = ImageEncoder.from_pretrained(config_path, modality_paths.get(Modality.IMAGE_ENCODER)) 121 | encoder = encoder.eval().to(device) 122 | result_processors[Modality.IMAGE_ENCODER] = processor 123 | result_models[Modality.IMAGE_ENCODER] = encoder 124 | 125 | return result_processors, result_models 126 | 127 | 128 | def get_model_onnx( 129 | model_name: str, 130 | *, 131 | device: Literal["cpu", "cuda"] = "cpu", 132 | token: Optional[str] = None, 133 | modalities: Optional[Tuple[str]] = None, 134 | ): 135 | """ 136 | Fetches and constructs an ONNX model with its processors based on provided modalities. 137 | 138 | :param model_name: The identifier of the model on the Hugging Face Hub. 139 | :param device: The device on which the model will operate ('cpu' or 'cuda'). 140 | :param token: Optional API token for authenticated access to the model. 141 | :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder). 142 | :return: A tuple containing dictionaries for processors and models keyed by their respective modalities. 143 | """ 144 | from uform.onnx_encoders import TextEncoder, ImageEncoder 145 | from uform.numpy_processors import TextProcessor, ImageProcessor 146 | 147 | modalities = _normalize_modalities(modalities) 148 | config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".onnx") 149 | 150 | result_processors = {} 151 | result_models = {} 152 | 153 | if Modality.TEXT_ENCODER in modalities: 154 | processor = TextProcessor(config_path, tokenizer_path) 155 | encoder = TextEncoder(modality_paths.get(Modality.TEXT_ENCODER), device=device) 156 | result_processors[Modality.TEXT_ENCODER] = processor 157 | result_models[Modality.TEXT_ENCODER] = encoder 158 | 159 | if Modality.IMAGE_ENCODER in modalities: 160 | processor = ImageProcessor(config_path) 161 | encoder = ImageEncoder(modality_paths.get(Modality.IMAGE_ENCODER), device=device) 162 | result_processors[Modality.IMAGE_ENCODER] = processor 163 | result_models[Modality.IMAGE_ENCODER] = encoder 164 | 165 | return result_processors, result_models 166 | 167 | 168 | def get_model( 169 | model_name: str, 170 | *, 171 | device: Literal["cpu", "cuda"] = "cpu", # change this if you have a GPU 172 | backend: Literal["onnx", "torch"] = "onnx", # lighter = better 173 | modalities: Optional[Tuple[str, Modality]] = None, # all by default 174 | token: Optional[str] = None, # optional HuggingFace Hub token for private models 175 | ) -> Tuple[Dict[Modality, Callable], Dict]: 176 | """ 177 | Fetches a model and its processors from the Hugging Face Hub, using either the ONNX or Torch backend. 178 | 179 | :param model_name: The identifier of the model on the Hugging Face Hub. 180 | :param device: The device to load the model onto ('cpu' or 'cuda'). 181 | :param backend: The backend framework to use ('onnx' or 'torch'). 182 | :param modalities: A tuple specifying the types of model components to fetch. 183 | :param token: Optional API token for authenticated access to the model. 184 | :return: A tuple containing dictionaries for processors and models keyed by their respective modalities. 185 | """ 186 | if backend == "onnx": 187 | return get_model_onnx(model_name, device=device, token=token, modalities=modalities) 188 | elif backend == "torch": 189 | return get_model_torch(model_name, device=device, token=token, modalities=modalities) 190 | else: 191 | raise ValueError(f"Unknown backend: {backend}") 192 | -------------------------------------------------------------------------------- /python/uform/chat.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | import requests 4 | import torch 5 | from PIL import Image 6 | from transformers import TextStreamer, AutoModel, AutoProcessor 7 | 8 | 9 | def parse_args(): 10 | parser = ArgumentParser(description="Chat with UForm generative model") 11 | 12 | parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat", help="Model name or path") 13 | parser.add_argument("--image", type=str, required=True, help="Path to image or URL") 14 | parser.add_argument("--device", type=str, required=True, help="Device to run on, like `cpu` or `cuda:0`") 15 | parser.add_argument("--fp16", action="store_true", help="Use half-precision math for faster inference") 16 | 17 | return parser.parse_args() 18 | 19 | 20 | def run_chat(opts, model, processor): 21 | streamer = TextStreamer( 22 | processor.tokenizer, 23 | skip_prompt=True, 24 | skip_special_tokens=True, 25 | ) 26 | 27 | messages = [{"role": "system", "content": "You are a helpful assistant."}] 28 | is_first_message = True 29 | 30 | if opts.image.startswith("http"): 31 | image = Image.open(requests.get(opts.image, stream=True).raw) 32 | else: 33 | image = Image.open(opts.image) 34 | 35 | image = ( 36 | processor.feature_extractor(image) # 37 | .unsqueeze(0) 38 | .to(torch.bfloat16 if opts.fp16 else torch.float32) 39 | .to(opts.device) 40 | ) 41 | 42 | while True: 43 | if messages[-1]["role"] in ("system", "assistant"): 44 | message = input("User: ") 45 | if is_first_message: 46 | message = f" {message}" 47 | is_first_message = False 48 | messages.append({"role": "user", "content": message}) 49 | 50 | print() 51 | 52 | else: 53 | input_ids = processor.tokenizer.apply_chat_template( 54 | messages, 55 | return_tensors="pt", 56 | add_generation_prompt=True, 57 | ).to(opts.device) 58 | 59 | attention_mask = torch.ones( 60 | 1, 61 | input_ids.shape[1] + processor.num_image_latents - 1, 62 | ).to(opts.device) 63 | inputs = { 64 | "input_ids": input_ids, 65 | "attention_mask": attention_mask, 66 | "images": image, 67 | } 68 | 69 | print("Assistant: ", end="") 70 | with torch.inference_mode(): 71 | output = model.generate( 72 | **inputs, 73 | do_sample=False, 74 | use_cache=True, 75 | max_new_tokens=1024, 76 | eos_token_id=151645, 77 | pad_token_id=processor.tokenizer.pad_token_id, 78 | streamer=streamer, 79 | ) 80 | print() 81 | 82 | prompt_len = inputs["input_ids"].shape[1] 83 | message = processor.batch_decode(output[:, prompt_len:-1])[0] 84 | 85 | messages.append({"role": "assistant", "content": message}) 86 | 87 | 88 | def main(): 89 | try: 90 | opts = parse_args() 91 | processor = AutoProcessor.from_pretrained(opts.model, trust_remote_code=True) 92 | model = ( 93 | AutoModel.from_pretrained( 94 | opts.model, 95 | torch_dtype=torch.bfloat16 if opts.fp16 else torch.float32, 96 | ignore_mismatched_sizes=True, 97 | trust_remote_code=True, 98 | ) 99 | .eval() 100 | .to(opts.device) 101 | ) 102 | 103 | run_chat(opts, model, processor) 104 | 105 | except KeyboardInterrupt: 106 | print("Bye!") 107 | pass 108 | 109 | 110 | if __name__ == "__main__": 111 | main() 112 | -------------------------------------------------------------------------------- /python/uform/gen_model.py: -------------------------------------------------------------------------------- 1 | from uform.torch_decoders import VLMForCausalLM, VLMProcessor # legacy path 2 | -------------------------------------------------------------------------------- /python/uform/numpy_processors.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from typing import Dict, List, Union, Sequence 3 | import json 4 | 5 | from PIL.Image import Image, BICUBIC 6 | from tokenizers import Tokenizer 7 | import numpy as np 8 | 9 | from uform.shared import read_config 10 | 11 | 12 | class TextProcessor: 13 | def __init__(self, config_path: PathLike, tokenizer_path: PathLike): 14 | """ 15 | :param config: model config 16 | :param tokenizer_path: path to tokenizer file 17 | """ 18 | 19 | config = read_config(config_path) 20 | if "text_encoder" in config: 21 | config = config["text_encoder"] 22 | 23 | self._max_seq_len = config["max_position_embeddings"] 24 | self._tokenizer = Tokenizer.from_file(tokenizer_path) 25 | self._tokenizer.no_padding() 26 | self._pad_token_idx = config["padding_idx"] 27 | 28 | def __call__(self, texts: Union[str, Sequence[str]]) -> Dict[str, np.ndarray]: 29 | """Transforms one or more strings into dictionary with tokenized strings and attention masks. 30 | 31 | :param texts: text of list of texts to tokenizer 32 | """ 33 | if isinstance(texts, str): 34 | texts = [texts] 35 | 36 | input_ids = np.full( 37 | (len(texts), self._max_seq_len), 38 | fill_value=self._pad_token_idx, 39 | dtype=np.int32, 40 | ) 41 | 42 | attention_mask = np.zeros( 43 | (len(texts), self._max_seq_len), 44 | dtype=np.int32, 45 | ) 46 | encoded = self._tokenizer.encode_batch(texts) 47 | 48 | for i, seq in enumerate(encoded): 49 | seq_len = min(len(seq), self._max_seq_len) 50 | input_ids[i, :seq_len] = seq.ids[:seq_len] 51 | 52 | attention_mask[i, :seq_len] = 1 53 | 54 | return {"input_ids": input_ids, "attention_mask": attention_mask} 55 | 56 | 57 | class ImageProcessor: 58 | def __init__(self, config_path: PathLike, tokenizer_path: PathLike = None): 59 | """ 60 | :param config: model config 61 | :param tokenizer_path: path to tokenizer file 62 | :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) 63 | """ 64 | 65 | config = read_config(config_path) 66 | if "image_encoder" in config: 67 | config = config["image_encoder"] 68 | 69 | self._image_size = config["image_size"] 70 | self._normalization_means = config["normalization_means"] 71 | self._normalization_deviations = config["normalization_deviations"] 72 | 73 | assert isinstance(self._image_size, int) and self._image_size > 0 74 | assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list) 75 | assert len(self._normalization_means) == len(self._normalization_deviations) == 3 76 | 77 | self.image_mean = np.array(self._normalization_means, dtype=np.float32)[None, None] 78 | self.image_std = np.array(self._normalization_deviations, dtype=np.float32)[None, None] 79 | 80 | def __call__(self, images: Union[Image, Sequence[Image]]) -> np.ndarray: 81 | """Transforms one or more Pillow images into Torch Tensors. 82 | 83 | :param images: image or list of images to preprocess 84 | """ 85 | 86 | if isinstance(images, Sequence): 87 | batch_images = np.empty( 88 | (len(images), 3, self._image_size, self._image_size), 89 | dtype=np.float32, 90 | ) 91 | 92 | for i, image in enumerate(images): 93 | batch_images[i] = self._resize_crop_normalize(image) 94 | 95 | else: 96 | batch_images = self._resize_crop_normalize(images)[None] 97 | 98 | return batch_images 99 | 100 | def _resize_crop_normalize(self, image: Image): 101 | width, height = image.size 102 | 103 | if width < height: 104 | width = self._image_size 105 | height = int(height / width * self._image_size) 106 | else: 107 | width = int(width / height * self._image_size) 108 | height = self._image_size 109 | 110 | image = image.resize((width, height), resample=BICUBIC) 111 | 112 | left = (width - self._image_size) / 2 113 | top = (height - self._image_size) / 2 114 | right = (width + self._image_size) / 2 115 | bottom = (height + self._image_size) / 2 116 | 117 | image = image.convert("RGB").crop((left, top, right, bottom)) 118 | # At this point `image` is a PIL Image with RGB channels. 119 | # If you convert it to `np.ndarray` it will have shape (H, W, C) where C is the number of channels. 120 | image = (np.array(image).astype(np.float32) / 255.0 - self.image_mean) / self.image_std 121 | 122 | # To make it compatible with PyTorch, we need to transpose the image to (C, H, W). 123 | return np.transpose(image, (2, 0, 1)) 124 | -------------------------------------------------------------------------------- /python/uform/onnx_encoders.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from typing import Dict, Optional, Tuple, Union, Literal 3 | import json 4 | 5 | import onnxruntime as ort 6 | from numpy import ndarray 7 | 8 | from uform.shared import ExecutionProviderError 9 | 10 | 11 | def available_providers(device: Optional[str]) -> Tuple[str, ...]: 12 | """Returns a tuple of available execution providers based on the requested device. 13 | https://onnxruntime.ai/docs/execution-providers/ 14 | 15 | :param device: Device name, either `cpu` or `gpu`, or a specific execution provider name. 16 | :return: Tuple of available execution providers. 17 | :raises ExecutionProviderError: If the requested device is not available. 18 | """ 19 | 20 | gpu_providers = ("CUDAExecutionProvider", "TensorrtExecutionProvider") 21 | cpu_providers = ("OpenVINOExecutionProvider", "CoreMLExecutionProvider", "CPUExecutionProvider") 22 | available = ort.get_available_providers() 23 | 24 | # If no target device is specified, let's sort all the available ones with respect to our preference 25 | if device is None: 26 | preferences = gpu_providers + cpu_providers 27 | filtered_preferences = tuple(provider for provider in preferences if provider in available) 28 | if len(filtered_preferences): 29 | return filtered_preferences 30 | if len(available): 31 | return available 32 | raise ExecutionProviderError("No execution providers are available") 33 | 34 | # If a GPU is requested, but no GPU providers are available, raise an error 35 | if device == "gpu" or device == "cuda": 36 | if all(provider not in available for provider in gpu_providers): 37 | raise ExecutionProviderError( 38 | f"GPU providers are not available, consider installing `onnxruntime-gpu` and make sure the CUDA is available on your system. Currently installed: {available}" 39 | ) 40 | return [x for x in gpu_providers if x in available] 41 | 42 | # If a CPU is requested, but no CPU providers are available, raise an error 43 | if device == "cpu": 44 | if all(provider not in available for provider in cpu_providers): 45 | raise ExecutionProviderError( 46 | f"CPU providers are not available, consider installing `onnxruntime` and make sure the OpenVINO and CoreML are available on your system. Currently installed: {available}" 47 | ) 48 | return [x for x in cpu_providers if x in available] 49 | 50 | if device not in available: 51 | available_providers = ", ".join(available) 52 | raise ExecutionProviderError( 53 | f"Execution provider {device} is not available. Currently installed: {available_providers}" 54 | ) 55 | 56 | return (device,) 57 | 58 | 59 | class ImageEncoder: 60 | def __init__( 61 | self, 62 | model_path: str, 63 | *, 64 | device: Literal["cpu", "cuda"] = "cpu", 65 | return_features: bool = True, 66 | ): 67 | """ 68 | :param model_path: Path to onnx model 69 | :param device: Device name, either cpu or gpu 70 | """ 71 | 72 | session_options = ort.SessionOptions() 73 | session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL 74 | 75 | self.return_features = return_features 76 | self.session = ort.InferenceSession( 77 | model_path, 78 | sess_options=session_options, 79 | providers=available_providers(device), 80 | ) 81 | 82 | def encode( 83 | self, images: ndarray, return_features: Optional[bool] = None 84 | ) -> Union[ndarray, Tuple[ndarray, ndarray]]: 85 | features, embeddings = self.session.run(None, {"images": images}) 86 | return_features = return_features if return_features is not None else self.return_features 87 | if return_features: 88 | return features, embeddings 89 | return embeddings 90 | 91 | 92 | class TextEncoder: 93 | def __init__( 94 | self, 95 | model_path: str, 96 | *, 97 | device: Literal["cpu", "cuda"] = "cpu", 98 | return_features: bool = True, 99 | ): 100 | """ 101 | :param text_encoder_path: Path to onnx of text encoder 102 | :param device: Device name, either cpu or gpu 103 | """ 104 | 105 | session_options = ort.SessionOptions() 106 | session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL 107 | 108 | self.return_features = return_features 109 | self.text_encoder_session = ort.InferenceSession( 110 | model_path, 111 | sess_options=session_options, 112 | providers=available_providers(device), 113 | ) 114 | 115 | def encode( 116 | self, 117 | x: Union[ndarray, dict], 118 | attention_mask: Optional[ndarray] = None, 119 | return_features: Optional[bool] = None, 120 | ) -> Union[ndarray, Tuple[ndarray, ndarray]]: 121 | if isinstance(x, dict): 122 | assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None" 123 | attention_mask = x["attention_mask"] 124 | input_ids = x["input_ids"] 125 | else: 126 | input_ids = x 127 | 128 | features, embeddings = self.text_encoder_session.run( 129 | None, 130 | { 131 | "input_ids": input_ids, 132 | "attention_mask": attention_mask, 133 | }, 134 | ) 135 | 136 | return_features = return_features if return_features is not None else self.return_features 137 | if return_features: 138 | return features, embeddings 139 | return embeddings 140 | -------------------------------------------------------------------------------- /python/uform/shared.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Union 3 | from os import PathLike 4 | import json 5 | 6 | 7 | class Modality(Enum): 8 | TEXT_ENCODER = "text_encoder" 9 | IMAGE_ENCODER = "image_encoder" 10 | VIDEO_ENCODER = "video_encoder" 11 | TEXT_DECODER = "text_decoder" 12 | 13 | 14 | class ExecutionProviderError(Exception): 15 | """Exception raised when a requested execution provider is not available.""" 16 | 17 | 18 | ConfigOrPath = Union[PathLike, str, object] 19 | 20 | 21 | def read_config(path_or_object: ConfigOrPath) -> object: 22 | if isinstance(path_or_object, (PathLike, str)): 23 | with open(path_or_object, "r") as f: 24 | return json.load(f) 25 | else: 26 | return path_or_object 27 | -------------------------------------------------------------------------------- /python/uform/torch_decoders.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn 6 | from torchvision.transforms import ( 7 | CenterCrop, 8 | Compose, 9 | InterpolationMode, 10 | Normalize, 11 | RandomResizedCrop, 12 | Resize, 13 | ToTensor, 14 | ) 15 | from transformers import AutoConfig, AutoTokenizer 16 | from transformers.configuration_utils import PretrainedConfig 17 | from transformers.modeling_outputs import CausalLMOutputWithPast 18 | from transformers.modeling_utils import PreTrainedModel 19 | from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM 20 | from transformers.processing_utils import ProcessorMixin 21 | from transformers.tokenization_utils_base import BatchEncoding 22 | 23 | from uform.torch_encoders import ImageEncoder 24 | 25 | IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) 26 | IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) 27 | 28 | 29 | def convert_to_rgb(image): 30 | return image.convert("RGB") 31 | 32 | 33 | class LayerScale(nn.Module): 34 | def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False): 35 | super().__init__() 36 | self.weight = nn.Parameter(init_values * torch.ones(dim)) 37 | self.inplace = inplace 38 | 39 | def forward(self, x): 40 | return x.mul_(self.weight) if self.inplace else x * self.weight 41 | 42 | 43 | class ImageFeaturesPooler(nn.Module): 44 | def __init__( 45 | self, 46 | input_size, 47 | hidden_size, 48 | num_attn_heads, 49 | intermediate_size, 50 | num_latents, 51 | initializer_range, 52 | ): 53 | super().__init__() 54 | self.projection = nn.Linear(input_size, hidden_size) 55 | 56 | self.pooler = nn.TransformerDecoderLayer( 57 | hidden_size, 58 | num_attn_heads, 59 | intermediate_size, 60 | activation=nn.functional.silu, 61 | batch_first=True, 62 | norm_first=True, 63 | ) 64 | self.image_latents = nn.Parameter( 65 | torch.randn(1, num_latents, hidden_size) * initializer_range**0.5, 66 | ) 67 | 68 | def forward(self, features): 69 | features = self.projection(features) 70 | return self.pooler( 71 | self.image_latents.expand(features.shape[0], -1, -1), 72 | features, 73 | ) 74 | 75 | 76 | class VLMConfig(PretrainedConfig): 77 | model_type = "vlm" 78 | 79 | def __init__( 80 | self, 81 | text_decoder_name_or_path: str = "", 82 | tokenizer_name_or_path: str = "", 83 | image_size: int = 224, 84 | image_encoder_hidden_size: int = 768, 85 | image_encoder_patch_size: int = 16, 86 | image_encoder_num_layers: int = 12, 87 | image_encoder_num_heads: int = 12, 88 | image_encoder_embedding_dim: int = 256, 89 | image_encoder_pooling: str = "cls", 90 | image_pooler_num_attn_heads: int = 16, 91 | image_pooler_intermediate_size: int = 5504, 92 | image_pooler_num_latents: int = 196, 93 | image_token_id: int = 32002, 94 | initializer_range: float = 0.02, 95 | use_cache: bool = True, 96 | center_crop: bool = True, 97 | **kwargs, 98 | ): 99 | self.text_decoder_name_or_path = text_decoder_name_or_path 100 | self.tokenizer_name_or_path = tokenizer_name_or_path 101 | 102 | self.image_size = image_size 103 | self.image_encoder_hidden_size = image_encoder_hidden_size 104 | self.image_encoder_patch_size = image_encoder_patch_size 105 | self.image_encoder_num_layers = image_encoder_num_layers 106 | self.image_encoder_num_heads = image_encoder_num_heads 107 | self.image_encoder_embedding_dim = image_encoder_embedding_dim 108 | self.image_encoder_pooling = image_encoder_pooling 109 | 110 | self.image_pooler_num_attn_heads = image_pooler_num_attn_heads 111 | self.image_pooler_intermediate_size = image_pooler_intermediate_size 112 | self.image_pooler_num_latents = image_pooler_num_latents 113 | 114 | self.image_token_id = image_token_id 115 | 116 | self.initializer_range = initializer_range 117 | self.use_cache = use_cache 118 | self.center_crop = center_crop 119 | 120 | super().__init__(**kwargs) 121 | 122 | 123 | class VLMPreTrainedModel(PreTrainedModel): 124 | config_class = VLMConfig 125 | base_model_prefix = "vlm" 126 | supports_gradient_checkpointing = True 127 | _no_split_modules = [] 128 | _skip_keys_device_placement = "past_key_values" 129 | 130 | def _init_weights(self, module): 131 | pass 132 | 133 | def _initialize_weights(self, module): 134 | pass 135 | 136 | 137 | class VLMForCausalLM(VLMPreTrainedModel): 138 | def __init__(self, config: VLMConfig): 139 | super().__init__(config) 140 | 141 | self.config = config 142 | self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path) 143 | self.text_config.vocab_size += 3 144 | self.text_decoder = AutoModelForCausalLM.from_config(self.text_config) 145 | 146 | self.image_encoder = ImageEncoder( 147 | self.config.image_encoder_hidden_size, 148 | self.config.image_encoder_patch_size, 149 | self.config.image_size, 150 | self.config.image_encoder_num_layers, 151 | self.config.image_encoder_num_heads, 152 | self.config.image_encoder_embedding_dim, 153 | self.config.image_encoder_pooling, 154 | ) 155 | 156 | # replace models' layerscales because `transformers` automatically renames keys in `state_dict` 157 | for i in range(len(self.image_encoder.blocks)): 158 | self.image_encoder.blocks[i].ls1 = LayerScale( 159 | self.image_encoder.blocks[i].ls1.dim, 160 | ) 161 | self.image_encoder.blocks[i].ls2 = LayerScale( 162 | self.image_encoder.blocks[i].ls2.dim, 163 | ) 164 | 165 | self.image_pooler = ImageFeaturesPooler( 166 | self.config.image_encoder_hidden_size, 167 | self.text_config.hidden_size, 168 | self.config.image_pooler_num_attn_heads, 169 | self.config.image_pooler_intermediate_size, 170 | self.config.image_pooler_num_latents, 171 | self.config.initializer_range, 172 | ) 173 | 174 | def get_input_embeddings(self): 175 | return self.text_decoder.get_input_embeddings() 176 | 177 | def set_input_embeddings(self, value): 178 | self.text_decoder.set_input_embeddings(value) 179 | 180 | def get_images_embeddings(self, images): 181 | features = self.image_encoder.forward_features(images) 182 | return self.image_pooler(features) 183 | 184 | def gather_continuous_embeddings( 185 | self, 186 | input_ids: torch.Tensor, 187 | word_embeddings: torch.Tensor, 188 | image_embeddings: torch.Tensor, 189 | ) -> torch.Tensor: 190 | start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1] 191 | embeddings = [] 192 | 193 | for sample_idx, start_idx in enumerate(start_indices.tolist()): 194 | embeddings.append( 195 | torch.cat( 196 | ( 197 | word_embeddings[sample_idx, :start_idx], 198 | image_embeddings[sample_idx], 199 | word_embeddings[sample_idx, start_idx + 1 :], 200 | ), 201 | dim=0, 202 | ), 203 | ) 204 | 205 | return torch.stack(embeddings, dim=0) 206 | 207 | def forward( 208 | self, 209 | input_ids: torch.LongTensor = None, 210 | images: torch.Tensor = None, 211 | attention_mask: Optional[torch.Tensor] = None, 212 | position_ids: Optional[torch.LongTensor] = None, 213 | past_key_values: Optional[List[torch.FloatTensor]] = None, 214 | inputs_embeds: Optional[torch.FloatTensor] = None, 215 | use_cache: Optional[bool] = None, 216 | labels: Optional[torch.Tensor] = None, 217 | output_attentions: Optional[bool] = None, 218 | output_hidden_states: Optional[bool] = None, 219 | return_dict: Optional[bool] = None, 220 | ) -> Union[dict, Tuple, CausalLMOutputWithPast]: 221 | 222 | output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions 223 | output_hidden_states = ( 224 | output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states 225 | ) 226 | use_cache = use_cache if use_cache is not None else self.config.use_cache 227 | 228 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 229 | 230 | if input_ids is not None and inputs_embeds is not None: 231 | raise ValueError( 232 | "You cannot specify both input_ids and inputs_embeds at the same time", 233 | ) 234 | elif input_ids is None and inputs_embeds is None: 235 | raise ValueError("You have to specify either input_is or inputs_embeds") 236 | 237 | if inputs_embeds is None and past_key_values is None: 238 | inputs_embeds = self.get_input_embeddings()(input_ids) 239 | 240 | if images is not None: 241 | image_embeds = self.get_images_embeddings(images) 242 | inputs_embeds = self.gather_continuous_embeddings( 243 | input_ids, 244 | inputs_embeds, 245 | image_embeds, 246 | ) 247 | 248 | if position_ids is None: 249 | seq_length = inputs_embeds.shape[1] if inputs_embeds is not None else input_ids.shape[1] 250 | past_key_values_length = 0 251 | 252 | if past_key_values is not None: 253 | past_key_values_length = past_key_values[0][0].shape[2] 254 | 255 | device = input_ids.device if input_ids is not None else inputs_embeds.device 256 | position_ids = torch.arange( 257 | past_key_values_length, 258 | seq_length + past_key_values_length, 259 | dtype=torch.long, 260 | device=device, 261 | ) 262 | position_ids = position_ids.unsqueeze(0) 263 | 264 | outputs = self.text_decoder( 265 | inputs_embeds=inputs_embeds, 266 | input_ids=input_ids if past_key_values is not None else None, 267 | attention_mask=attention_mask, 268 | labels=labels, 269 | position_ids=position_ids, 270 | past_key_values=past_key_values, 271 | output_attentions=output_attentions, 272 | output_hidden_states=output_hidden_states, 273 | use_cache=use_cache, 274 | return_dict=return_dict, 275 | ) 276 | 277 | return outputs 278 | 279 | def prepare_inputs_for_generation( 280 | self, 281 | input_ids, 282 | images=None, 283 | past_key_values=None, 284 | attention_mask=None, 285 | inputs_embeds=None, 286 | **kwargs, 287 | ): 288 | if past_key_values: 289 | input_ids = input_ids[:, -1:] 290 | 291 | position_ids = kwargs.get("position_ids", None) 292 | if attention_mask is not None and position_ids is None: 293 | # create position_ids on the fly for batch generation 294 | position_ids = attention_mask.long().cumsum(-1) - 1 295 | position_ids.masked_fill_(attention_mask == 0, 1) 296 | if past_key_values: 297 | position_ids = position_ids[:, -1].unsqueeze(-1) 298 | 299 | # if `inputs_embeds` are passed, we only want to use them in the 1st generation step 300 | if inputs_embeds is not None and past_key_values is None: 301 | model_inputs = {"inputs_embeds": inputs_embeds} 302 | else: 303 | model_inputs = {"input_ids": input_ids} 304 | 305 | if images is not None: 306 | model_inputs["images"] = images 307 | 308 | model_inputs.update( 309 | { 310 | "position_ids": position_ids, 311 | "past_key_values": past_key_values, 312 | "use_cache": kwargs.get("use_cache"), 313 | "attention_mask": attention_mask, 314 | "images": images if past_key_values is None else None, 315 | }, 316 | ) 317 | return model_inputs 318 | 319 | @classmethod 320 | def from_config(cls, config, **kwargs): 321 | return cls._from_config(config, **kwargs) 322 | 323 | 324 | class VLMProcessor(ProcessorMixin): 325 | def __init__(self, config, **kwargs): 326 | self.feature_extractor = None 327 | self.config = config 328 | 329 | if config.center_crop: 330 | self.image_processor = Compose( 331 | [ 332 | Resize(256, interpolation=InterpolationMode.BICUBIC), 333 | CenterCrop(config.image_size), 334 | convert_to_rgb, 335 | ToTensor(), 336 | Normalize( 337 | mean=IMAGENET_MEAN, 338 | std=IMAGENET_STD, 339 | ), 340 | ], 341 | ) 342 | else: 343 | self.image_processor = Compose( 344 | [ 345 | RandomResizedCrop( 346 | config.image_size, 347 | scale=(0.8, 1), 348 | interpolation=InterpolationMode.BICUBIC, 349 | ), 350 | convert_to_rgb, 351 | ToTensor(), 352 | Normalize( 353 | mean=IMAGENET_MEAN, 354 | std=IMAGENET_STD, 355 | ), 356 | ], 357 | ) 358 | 359 | self.tokenizer = AutoTokenizer.from_pretrained( 360 | config.tokenizer_name_or_path, 361 | additional_special_tokens=["<|im_end|>"], 362 | ) 363 | self.num_image_latents = config.image_pooler_num_latents 364 | 365 | def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs): 366 | if texts is not None: 367 | if isinstance(texts, str): 368 | texts = [texts] 369 | 370 | tokenized_texts = [] 371 | for text in texts: 372 | messages = [ 373 | {"role": "system", "content": "You are a helpful assistant."}, 374 | {"role": "user", "content": f" {text}"}, 375 | ] 376 | tokenized_prompt = self.tokenizer.apply_chat_template( 377 | messages, 378 | add_generation_prompt=True, 379 | return_tensors=return_tensors, 380 | ) 381 | 382 | tokenized_texts.append(tokenized_prompt) 383 | 384 | max_len = max(len(t[0]) for t in tokenized_texts) 385 | input_ids = torch.full( 386 | (len(tokenized_texts), max_len), 387 | fill_value=self.tokenizer.pad_token_id, 388 | dtype=torch.int64, 389 | ) 390 | attention_mask = torch.full( 391 | (len(tokenized_texts), max_len), 392 | fill_value=0, 393 | dtype=torch.int64, 394 | ) 395 | 396 | for i, tokens in enumerate(tokenized_texts): 397 | input_ids[i, -len(tokens[0]) :] = tokens[0] 398 | attention_mask[i, -len(tokens[0]) :] = 1 399 | 400 | attention_mask = F.pad( 401 | attention_mask, 402 | pad=(0, self.num_image_latents - 1), 403 | value=1, 404 | ) 405 | 406 | encoding = BatchEncoding( 407 | data={ 408 | "input_ids": input_ids, 409 | "attention_mask": attention_mask, 410 | }, 411 | ) 412 | 413 | if images is not None: 414 | if isinstance(images, (list, tuple)): 415 | image_features = torch.empty( 416 | (len(images), 3, self.config.image_size, self.config.image_size), 417 | dtype=torch.float32, 418 | ) 419 | 420 | for i, image in enumerate(images): 421 | image_features[i] = self.image_processor(image) 422 | else: 423 | image_features = self.image_processor(images).unsqueeze(0) 424 | 425 | if texts is not None and images is not None: 426 | encoding["images"] = image_features 427 | return encoding 428 | 429 | if texts is not None: 430 | return encoding 431 | 432 | return BatchEncoding( 433 | data={ 434 | "images": image_features, 435 | }, 436 | tensor_type=return_tensors, 437 | ) 438 | 439 | def batch_decode(self, *args, **kwargs): 440 | return self.tokenizer.batch_decode(*args, **kwargs) 441 | 442 | def decode(self, *args, **kwargs): 443 | return self.tokenizer.decode(*args, **kwargs) 444 | 445 | @classmethod 446 | def from_pretrained( 447 | cls, 448 | pretrained_model_name_or_path, 449 | cache_dir=None, 450 | force_download: bool = False, 451 | local_files_only: bool = False, 452 | token=None, 453 | revision: str = "main", 454 | **kwargs, 455 | ): 456 | config = AutoConfig.from_pretrained( 457 | pretrained_model_name_or_path, 458 | cache_dir=cache_dir, 459 | force_download=force_download, 460 | local_files_only=local_files_only, 461 | revision=revision, 462 | token=token, 463 | **kwargs, 464 | ) 465 | return cls(config) 466 | 467 | 468 | AutoConfig.register("vlm", VLMConfig) 469 | AutoModel.register(VLMConfig, VLMForCausalLM) 470 | -------------------------------------------------------------------------------- /python/uform/torch_encoders.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dataclasses import dataclass 4 | from os import PathLike 5 | from typing import Dict, Optional, Union, Mapping, Any, Tuple 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch import Tensor 11 | from PIL.Image import Image 12 | 13 | from uform.shared import read_config 14 | 15 | 16 | def _is_on_gpu(model: nn.Module) -> bool: 17 | try: 18 | return next(model.parameters()).device.type == "cuda" 19 | except StopIteration: 20 | return False 21 | 22 | 23 | @dataclass(eq=False) 24 | class Attention(nn.Module): 25 | dim: int 26 | num_heads: int 27 | dropout_prob: float = 0 28 | 29 | def __post_init__(self): 30 | super().__init__() 31 | 32 | self.use_sdp = int(torch.__version__[0]) > 1 33 | 34 | self.query = nn.Linear(self.dim, self.dim) 35 | self.key = nn.Linear(self.dim, self.dim) 36 | self.value = nn.Linear(self.dim, self.dim) 37 | self.out = nn.Linear(self.dim, self.dim) 38 | 39 | self.head_dim = self.dim // self.num_heads 40 | self.scale = self.head_dim**-0.5 41 | 42 | def forward( 43 | self, 44 | x: Tensor, 45 | attn_mask: Optional[Tensor] = None, 46 | context: Optional[Tensor] = None, 47 | is_causal: bool = False, 48 | ) -> Tensor: 49 | query = self.reshape(self.query(x)) 50 | key = self.reshape(self.key(x if context is None else context)) 51 | value = self.reshape(self.value(x if context is None else context)) 52 | 53 | if self.use_sdp: 54 | x = F.scaled_dot_product_attention( 55 | query, 56 | key, 57 | value, 58 | attn_mask, 59 | dropout_p=self.dropout_prob if self.training else 0, 60 | is_causal=is_causal, 61 | ) 62 | else: 63 | attn = query @ key.transpose(-2, -1) * self.scale 64 | if attn_mask is not None: 65 | attn += attn_mask 66 | 67 | attn = attn.softmax(dim=-1) 68 | x = attn @ value 69 | 70 | return self.out(x.transpose(2, 1).flatten(2)) 71 | 72 | def reshape(self, x: Tensor) -> Tensor: 73 | batch_size, seq_len, _ = x.shape 74 | x = x.view(batch_size, seq_len, self.num_heads, self.head_dim) 75 | return x.transpose(2, 1) 76 | 77 | 78 | @dataclass(eq=False) 79 | class MLP(nn.Module): 80 | dim: int 81 | dim_expand_factor: int = 4 82 | 83 | def __post_init__(self): 84 | super().__init__() 85 | 86 | self.hidden_layer = nn.Linear(self.dim, self.dim * self.dim_expand_factor) 87 | self.output_layer = nn.Linear(self.dim * self.dim_expand_factor, self.dim) 88 | 89 | def forward(self, x: Tensor) -> Tensor: 90 | x = F.gelu(self.hidden_layer(x)) 91 | return self.output_layer(x) 92 | 93 | 94 | @dataclass(eq=False) 95 | class LayerScale(nn.Module): 96 | dim: int 97 | init_values: float = 1e-5 98 | inplace: bool = False 99 | 100 | def __post_init__(self): 101 | super().__init__() 102 | self.gamma = nn.Parameter(self.init_values * torch.ones(self.dim)) 103 | 104 | def forward(self, x: Tensor) -> Tensor: 105 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 106 | 107 | 108 | @dataclass(eq=False) 109 | class TextEncoderBlock(nn.Module): 110 | dim: int 111 | num_heads: int 112 | dropout_prob: float 113 | cross_attention: bool = False 114 | 115 | def __post_init__(self): 116 | super().__init__() 117 | 118 | self.norm_attn = nn.LayerNorm(self.dim, eps=1e-12) 119 | self.attention = Attention(self.dim, self.num_heads, self.dropout_prob) 120 | 121 | if self.cross_attention: 122 | self.norm_crossattn = nn.LayerNorm(self.dim, eps=1e-12) 123 | self.crossattn = Attention(self.dim, self.num_heads, self.dropout_prob) 124 | 125 | self.norm_mlp = nn.LayerNorm(self.dim, eps=1e-12) 126 | self.mlp = MLP(self.dim) 127 | 128 | self.dropout = nn.Dropout(self.dropout_prob) 129 | 130 | def forward( 131 | self, 132 | x: Tensor, 133 | attn_mask: Tensor, 134 | context: Optional[Tensor] = None, 135 | ) -> Tensor: 136 | x = self.norm_attn(x + self.dropout(self.attention(x, attn_mask))) 137 | 138 | if self.cross_attention and context is not None: 139 | x = self.norm_crossattn( 140 | x + self.dropout(self.crossattn(x, context=context)), 141 | ) 142 | 143 | return self.norm_mlp(x + self.dropout(self.mlp(x))) 144 | 145 | 146 | @dataclass(eq=False) 147 | class ImageEncoderBlock(nn.Module): 148 | dim: int 149 | num_heads: int 150 | 151 | def __post_init__(self): 152 | super().__init__() 153 | self.norm1 = nn.LayerNorm(self.dim, eps=1e-6) 154 | self.attn = Attention(self.dim, self.num_heads) 155 | self.ls1 = LayerScale(self.dim) 156 | 157 | self.norm2 = nn.LayerNorm(self.dim, eps=1e-6) 158 | self.mlp = MLP(self.dim) 159 | self.ls2 = LayerScale(self.dim) 160 | 161 | def forward(self, x: Tensor) -> Tensor: 162 | x = x + self.ls1(self.attn(self.norm1(x))) 163 | x = x + self.ls2(self.mlp(self.norm2(x))) 164 | return x 165 | 166 | 167 | @dataclass(eq=False) 168 | class TextEncoder(nn.Module): 169 | model_type: str 170 | dim: int 171 | context_dim: int 172 | vocab_size: int 173 | padding_idx: int 174 | num_layers: int 175 | num_heads: int 176 | embedding_dim: int 177 | multimodal_layers_ids: tuple 178 | head_one_neuron: bool 179 | pooling: str = "cls" 180 | max_position_embeddings: int = 77 181 | dropout_prob: float = 0 182 | 183 | def __post_init__(self): 184 | super().__init__() 185 | 186 | self.word_embeddings = nn.Embedding( 187 | self.vocab_size, 188 | self.dim, 189 | padding_idx=self.padding_idx, 190 | ) 191 | self.position_embeddings = nn.Embedding(self.max_position_embeddings, self.dim) 192 | 193 | if self.model_type == "bert": 194 | self.register_buffer( 195 | "position_ids", 196 | torch.arange(self.max_position_embeddings).unsqueeze(0), 197 | persistent=False, 198 | ) 199 | 200 | self.layer_norm = nn.LayerNorm(self.dim, eps=1e-12) 201 | self.dropout = nn.Dropout(self.dropout_prob) 202 | 203 | self.blocks = nn.ModuleList( 204 | [ 205 | TextEncoderBlock( 206 | self.dim, 207 | self.num_heads, 208 | self.dropout_prob, 209 | layer_id in self.multimodal_layers_ids, 210 | ) 211 | for layer_id in range(self.num_layers) 212 | ], 213 | ) 214 | 215 | self.embedding_projection = nn.Linear(self.dim, self.embedding_dim, bias=False) 216 | self.matching_head = nn.Linear(self.dim, 1 if self.head_one_neuron else 2) 217 | 218 | if self.context_dim != self.dim: 219 | self.context_projection = nn.Linear(self.context_dim, self.dim, bias=False) 220 | else: 221 | self.context_projection = nn.Identity() 222 | self.return_features = False 223 | 224 | def forward_features(self, x: Tensor, attn_mask: Tensor) -> Tensor: 225 | x = self.embed_text(x) 226 | attn_mask = self.get_attention_mask(attn_mask, x.dtype) 227 | 228 | for block in self.blocks: 229 | if not block.cross_attention: 230 | x = block(x, attn_mask) 231 | 232 | return x 233 | 234 | def forward_embedding(self, x: Tensor, attn_mask: Tensor) -> Tensor: 235 | return self.embedding_projection(self.pool_features(x, attn_mask)) 236 | 237 | def pool_features(self, x: Tensor, attn_mask: Tensor) -> Tensor: 238 | if self.pooling == "cls": 239 | return x[:, 0] 240 | 241 | attn_mask = attn_mask.unsqueeze(2).type_as(x) 242 | return (x * attn_mask).sum(dim=1) / attn_mask.sum(dim=1) 243 | 244 | def get_attention_mask(self, attn_mask: Tensor, dtype: torch.dtype) -> Tensor: 245 | attn_mask = attn_mask.to(dtype) 246 | attn_mask = (1.0 - attn_mask) * torch.finfo(dtype).min 247 | return attn_mask.unsqueeze(1).expand(-1, attn_mask.shape[1], -1).unsqueeze(1) 248 | 249 | def get_position_ids(self, x: Tensor) -> Tensor: 250 | if self.model_type == "roberta": 251 | mask = x.ne(self.padding_idx).int() 252 | return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + self.padding_idx 253 | 254 | return self.position_ids[:, : x.shape[1]] 255 | 256 | def embed_text(self, x: Tensor) -> Tensor: 257 | positional_embedding = self.position_embeddings(self.get_position_ids(x)) 258 | x = self.word_embeddings(x) + positional_embedding 259 | return self.dropout(self.layer_norm(x)) 260 | 261 | def forward( 262 | self, 263 | x: Union[Tensor, dict], 264 | attention_mask: Optional[Tensor] = None, 265 | return_features: Optional[bool] = None, 266 | ) -> Union[Tensor, Tuple[Tensor, Tensor]]: 267 | 268 | if isinstance(x, dict): 269 | assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None" 270 | attention_mask = x["attention_mask"] 271 | x = x["input_ids"] 272 | elif attention_mask is None: 273 | # If no attention mask is provided - create one with all ones 274 | attention_mask = torch.ones_like(x) 275 | 276 | # If the model is on the GPU and the input matrices are not, shift them there 277 | if _is_on_gpu(self) and not x.is_cuda: 278 | x = x.cuda() 279 | attention_mask = attention_mask.cuda() 280 | 281 | features = self.forward_features(x, attention_mask) 282 | embeddings = self.forward_embedding(features, attention_mask) 283 | 284 | return_features = return_features if return_features is not None else self.return_features 285 | if return_features: 286 | return features, embeddings 287 | return embeddings 288 | 289 | def encode( 290 | self, 291 | x: Union[Tensor, dict], 292 | attention_mask: Optional[Tensor] = None, 293 | return_features: Optional[bool] = None, 294 | ) -> Union[Tensor, Tuple[Tensor, Tensor]]: 295 | 296 | result = self.forward(x, attention_mask, return_features) 297 | if isinstance(result, tuple): 298 | return result[0].detach(), result[1].detach() 299 | else: 300 | return result.detach() 301 | 302 | @staticmethod 303 | def from_pretrained(config: Union[PathLike, str, object], model: Union[PathLike, str]) -> TextEncoder: 304 | """Load the image encoder from the given configuration and model path. 305 | 306 | :param config: the configuration dictionary or path to the JSON configuration file 307 | :param model: the model state dictionary or path to the `.pt` model file 308 | """ 309 | config = read_config(config) 310 | if "text_encoder" in config: 311 | config = config["text_encoder"] 312 | 313 | # We must strip all the non-member attributes before initializing the classes. 314 | text_fields = TextEncoder.__dataclass_fields__ 315 | config = {k: v for k, v in config.items() if k in text_fields} 316 | encoder = TextEncoder(**config) 317 | 318 | # Load from disk 319 | if isinstance(model, (PathLike, str)): 320 | state = torch.load(model) 321 | else: 322 | state = model 323 | if "text_encoder" in state: 324 | state = state["text_encoder"] 325 | encoder.load_state_dict(state) 326 | return encoder 327 | 328 | 329 | @dataclass(eq=False) 330 | class ImageEncoder(nn.Module): 331 | dim: int 332 | patch_size: int 333 | image_size: int 334 | num_layers: int 335 | num_heads: int 336 | embedding_dim: int 337 | pooling: str 338 | num_reg_tokens: int = 0 339 | 340 | def __post_init__(self): 341 | super().__init__() 342 | 343 | seq_len = (self.image_size // self.patch_size) ** 2 344 | self.patch_embed = nn.Conv2d(3, self.dim, self.patch_size, self.patch_size) 345 | self.pos_embed = nn.Parameter(torch.randn(1, seq_len, self.dim) * 0.02) 346 | self.cls_token = nn.Parameter(torch.zeros(1, 1, self.dim)) 347 | 348 | if self.num_reg_tokens > 0: 349 | self.reg_token = nn.Parameter(torch.zeros(1, self.num_reg_tokens, self.dim)) 350 | 351 | self.blocks = nn.Sequential( 352 | *[ImageEncoderBlock(self.dim, self.num_heads) for _ in range(self.num_layers)], 353 | ) 354 | 355 | self.norm = nn.LayerNorm(self.dim, eps=1e-6) 356 | self.embedding_projection = nn.Linear(self.dim, self.embedding_dim, bias=False) 357 | self.return_features = False 358 | 359 | def forward_features(self, x: Union[Tensor, dict]) -> Tensor: 360 | x = self.patch_embed(x).flatten(start_dim=2).transpose(2, 1) 361 | x = x + self.pos_embed 362 | special_tokens = [self.cls_token.expand(x.shape[0], -1, -1)] 363 | 364 | if self.num_reg_tokens > 0: 365 | special_tokens.append(self.reg_token.expand(x.shape[0], -1, -1)) 366 | 367 | x = torch.cat(special_tokens + [x], dim=1) 368 | x = self.blocks(x) 369 | return self.norm(x) 370 | 371 | def forward_embedding(self, x: Tensor) -> Tensor: 372 | if self.pooling == "cls": 373 | x = x[:, 0] 374 | else: 375 | x = x.mean(dim=1) 376 | 377 | return self.embedding_projection(x) 378 | 379 | def forward(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Tensor: 380 | if isinstance(x, dict): 381 | x = x["images"] 382 | 383 | # If the model is on the GPU and the input matrices are not, shift them there 384 | if _is_on_gpu(self) and not x.is_cuda: 385 | x = x.cuda() 386 | 387 | features = self.forward_features(x) 388 | embeddings = self.forward_embedding(features) 389 | return_features = return_features if return_features is not None else self.return_features 390 | if return_features: 391 | return features, embeddings 392 | return embeddings 393 | 394 | def encode(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Tensor: 395 | result = self.forward(x, return_features) 396 | if isinstance(result, tuple): 397 | return result[0].detach(), result[1].detach() 398 | else: 399 | return result.detach() 400 | 401 | @staticmethod 402 | def from_pretrained( 403 | config: Union[PathLike, str, object], 404 | model: Union[PathLike, str, Mapping[str, Any]], 405 | ) -> ImageEncoder: 406 | """Load the image encoder from the given configuration and model path. 407 | 408 | :param config: the configuration dictionary or path to the JSON configuration file 409 | :param model: the model state dictionary or path to the `.pt` model file 410 | """ 411 | config = read_config(config) 412 | if "image_encoder" in config: 413 | config = config["image_encoder"] 414 | 415 | # We must strip all the non-member attributes before initializing the classes. 416 | image_fields = ImageEncoder.__dataclass_fields__ 417 | config = {k: v for k, v in config.items() if k in image_fields} 418 | encoder = ImageEncoder(**config) 419 | 420 | # Load from disk 421 | if isinstance(model, (PathLike, str)): 422 | state = torch.load(model) 423 | else: 424 | state = model 425 | if "image_encoder" in state: 426 | state = state["image_encoder"] 427 | encoder.load_state_dict(state) 428 | return encoder 429 | -------------------------------------------------------------------------------- /python/uform/torch_processors.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from typing import Dict, List, Union, Sequence 3 | import json 4 | 5 | import torch 6 | from PIL.Image import Image 7 | from tokenizers import Tokenizer 8 | from torch import Tensor 9 | from torchvision.transforms import ( 10 | CenterCrop, 11 | Compose, 12 | InterpolationMode, 13 | Normalize, 14 | Resize, 15 | ToTensor, 16 | ) 17 | 18 | from uform.shared import read_config 19 | 20 | 21 | # lambda is not pickle-able 22 | def convert_to_rgb(image): 23 | return image.convert("RGB") 24 | 25 | 26 | class TextProcessor: 27 | def __init__(self, config_path: PathLike, tokenizer_path: PathLike): 28 | """ 29 | :param config: model config 30 | :param tokenizer_path: path to tokenizer file 31 | """ 32 | 33 | config = read_config(config_path) 34 | if "text_encoder" in config: 35 | config = config["text_encoder"] 36 | 37 | self._max_seq_len = config["max_position_embeddings"] 38 | self._tokenizer = Tokenizer.from_file(tokenizer_path) 39 | self._tokenizer.no_padding() 40 | self._pad_token_idx = config["padding_idx"] 41 | 42 | def __call__(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]: 43 | """Transforms one or more strings into dictionary with tokenized strings and attention masks. 44 | 45 | :param texts: text of list of texts to tokenizer 46 | :return: dictionary with tokenized strings and attention masks as values 47 | """ 48 | if isinstance(texts, str): 49 | texts = [texts] 50 | 51 | input_ids = torch.full( 52 | (len(texts), self._max_seq_len), 53 | fill_value=self._pad_token_idx, 54 | dtype=torch.int64, 55 | ) 56 | 57 | attention_mask = torch.zeros( 58 | len(texts), 59 | self._max_seq_len, 60 | dtype=torch.int32, 61 | ) 62 | encoded = self._tokenizer.encode_batch(texts) 63 | 64 | for i, seq in enumerate(encoded): 65 | seq_len = min(len(seq), self._max_seq_len) 66 | input_ids[i, :seq_len] = torch.LongTensor( 67 | seq.ids[:seq_len], 68 | ) 69 | attention_mask[i, :seq_len] = 1 70 | 71 | return {"input_ids": input_ids, "attention_mask": attention_mask} 72 | 73 | 74 | class ImageProcessor: 75 | def __init__(self, config_path: PathLike): 76 | """ 77 | :param config: model config 78 | """ 79 | 80 | config = read_config(config_path) 81 | if "image_encoder" in config: 82 | config = config["image_encoder"] 83 | 84 | self._image_size = config["image_size"] 85 | self._normalization_means = config["normalization_means"] 86 | self._normalization_deviations = config["normalization_deviations"] 87 | 88 | assert isinstance(self._image_size, int) and self._image_size > 0 89 | assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list) 90 | assert len(self._normalization_means) == len(self._normalization_deviations) == 3 91 | 92 | self._image_transform = Compose( 93 | [ 94 | Resize(self._image_size, interpolation=InterpolationMode.BICUBIC), 95 | convert_to_rgb, 96 | CenterCrop(self._image_size), 97 | ToTensor(), 98 | Normalize( 99 | mean=tuple(self._normalization_means), 100 | std=tuple(self._normalization_deviations), 101 | ), 102 | ], 103 | ) 104 | 105 | def __call__(self, images: Union[Image, Sequence[Image]]) -> Dict[str, Tensor]: 106 | """Transforms one or more Pillow images into Torch Tensors. 107 | 108 | :param images: image or list of images to preprocess 109 | :return: dictionary with float-represented images in tensors as values 110 | """ 111 | 112 | if isinstance(images, Sequence): 113 | batch_images = torch.empty( 114 | (len(images), 3, self._image_size, self._image_size), 115 | dtype=torch.float32, 116 | ) 117 | 118 | for i, image in enumerate(images): 119 | batch_images[i] = self._image_transform(image) 120 | 121 | else: 122 | batch_images = self._image_transform(images).unsqueeze(0) 123 | 124 | return {"images": batch_images} 125 | -------------------------------------------------------------------------------- /swift/EncodersTests.swift: -------------------------------------------------------------------------------- 1 | import CoreGraphics 2 | import Hub 3 | import ImageIO 4 | import UForm 5 | import XCTest 6 | 7 | final class TokenizerTests: XCTestCase { 8 | 9 | var hfToken: String? 10 | 11 | override func setUp() { 12 | super.setUp() 13 | // Attempt to load the Hugging Face token from the `.hf_token` file in the current directory 14 | let fileURL = URL(fileURLWithPath: FileManager.default.currentDirectoryPath).appendingPathComponent(".hf_token") 15 | if let token = try? String(contentsOf: fileURL, encoding: .utf8).trimmingCharacters(in: .whitespacesAndNewlines) 16 | { 17 | hfToken = token 18 | } 19 | 20 | hfToken = hfToken ?? ProcessInfo.processInfo.environment["HF_TOKEN"] 21 | hfToken = hfToken ?? "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" 22 | } 23 | 24 | func cosineSimilarity(between vectorA: [T], and vectorB: [T]) -> T { 25 | guard vectorA.count == vectorB.count else { 26 | fatalError("Vectors must be of the same length.") 27 | } 28 | 29 | let dotProduct = zip(vectorA, vectorB).reduce(T.zero) { $0 + ($1.0 * $1.1) } 30 | let magnitudeA = sqrt(vectorA.reduce(T.zero) { $0 + $1 * $1 }) 31 | let magnitudeB = sqrt(vectorB.reduce(T.zero) { $0 + $1 * $1 }) 32 | 33 | // Avoid division by zero 34 | if magnitudeA == T.zero || magnitudeB == T.zero { 35 | return T.zero 36 | } 37 | 38 | return dotProduct / (magnitudeA * magnitudeB) 39 | } 40 | 41 | func testTextEmbeddings(forModel modelName: String) async throws { 42 | 43 | let api = HubApi(hfToken: hfToken) 44 | let textModel = try await TextEncoder( 45 | modelName: "unum-cloud/uform3-image-text-english-small", 46 | hubApi: api 47 | ) 48 | 49 | let texts = [ 50 | "sunny beach with clear blue water", 51 | "crowded sandbeach under the bright sun", 52 | "dense forest with tall green trees", 53 | "quiet park in the morning light", 54 | ] 55 | 56 | var textEmbeddings: [[Float32]] = [] 57 | for text in texts { 58 | let embedding: [Float32] = try textModel.encode(text).asFloats() 59 | textEmbeddings.append(embedding) 60 | } 61 | 62 | // Now let's compute the cosine similarity between the textEmbeddings 63 | let similarityBeach = cosineSimilarity(between: textEmbeddings[0], and: textEmbeddings[1]) 64 | let similarityForest = cosineSimilarity(between: textEmbeddings[2], and: textEmbeddings[3]) 65 | let dissimilarityBetweenScenes = cosineSimilarity(between: textEmbeddings[0], and: textEmbeddings[2]) 66 | 67 | // Assert that similar texts have higher similarity scores 68 | XCTAssertTrue( 69 | similarityBeach > dissimilarityBetweenScenes, 70 | "Beach texts should be more similar to each other than to forest texts." 71 | ) 72 | XCTAssertTrue( 73 | similarityForest > dissimilarityBetweenScenes, 74 | "Forest texts should be more similar to each other than to beach texts." 75 | ) 76 | } 77 | 78 | func testTextEmbeddings() async throws { 79 | for model in [ 80 | "unum-cloud/uform3-image-text-english-small", 81 | "unum-cloud/uform3-image-text-english-base", 82 | "unum-cloud/uform3-image-text-english-large", 83 | "unum-cloud/uform3-image-text-multilingual-base", 84 | ] { 85 | try await testTextEmbeddings(forModel: model) 86 | } 87 | } 88 | 89 | func testImageEmbeddings(forModel modelName: String) async throws { 90 | 91 | // One option is to use a local model repository. 92 | // 93 | // let root = "uform/" 94 | // let textModel = try TextEncoder( 95 | // modelPath: root + "uform-vl-english-large-text_encoder.mlpackage", 96 | // configPath: root + "uform-vl-english-large-text.json", 97 | // tokenizerPath: root + "uform-vl-english-large-text.tokenizer.json" 98 | // ) 99 | // let imageModel = try ImageEncoder( 100 | // modelPath: root + "uform-vl-english-large-image_encoder.mlpackage", 101 | // configPath: root + "uform-vl-english-large-image.json" 102 | // ) 103 | // 104 | // A better option is to fetch directly from HuggingFace, similar to how users would do that: 105 | let api = HubApi(hfToken: hfToken) 106 | let textModel = try await TextEncoder( 107 | modelName: modelName, 108 | hubApi: api 109 | ) 110 | let imageModel = try await ImageEncoder( 111 | modelName: modelName, 112 | hubApi: api 113 | ) 114 | 115 | let texts = [ 116 | "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", 117 | "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", 118 | "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", 119 | "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", 120 | "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", 121 | ] 122 | let imageURLs = [ 123 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true", 124 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true", 125 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true", 126 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true", 127 | "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true", 128 | ] 129 | 130 | var textEmbeddings: [[Float32]] = [] 131 | var imageEmbeddings: [[Float32]] = [] 132 | for (text, imageURL) in zip(texts, imageURLs) { 133 | guard let url = URL(string: imageURL), 134 | let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil), 135 | let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil) 136 | else { 137 | throw NSError( 138 | domain: "ImageError", 139 | code: 100, 140 | userInfo: [NSLocalizedDescriptionKey: "Could not load image from URL: \(imageURL)"] 141 | ) 142 | } 143 | 144 | let textEmbedding: [Float32] = try textModel.encode(text).asFloats() 145 | textEmbeddings.append(textEmbedding) 146 | let imageEmbedding: [Float32] = try imageModel.encode(cgImage).asFloats() 147 | imageEmbeddings.append(imageEmbedding) 148 | } 149 | 150 | // Now let's make sure that the cosine distance between image and respective text embeddings is low. 151 | // Make sure that the similarity between image and text at index `i` is higher than with other texts and images. 152 | for i in 0 ..< texts.count { 153 | let pairSimilarity = cosineSimilarity(between: textEmbeddings[i], and: imageEmbeddings[i]) 154 | let otherTextSimilarities = (0 ..< texts.count).filter { $0 != i }.map { 155 | cosineSimilarity(between: textEmbeddings[$0], and: imageEmbeddings[i]) 156 | } 157 | let otherImageSimilarities = (0 ..< texts.count).filter { $0 != i }.map { 158 | cosineSimilarity(between: textEmbeddings[i], and: imageEmbeddings[$0]) 159 | } 160 | 161 | XCTAssertTrue( 162 | pairSimilarity > otherTextSimilarities.max()!, 163 | "Text should be more similar to its corresponding image than to other images." 164 | ) 165 | XCTAssertTrue( 166 | pairSimilarity > otherImageSimilarities.max()!, 167 | "Text should be more similar to its corresponding image than to other texts." 168 | ) 169 | } 170 | } 171 | 172 | func testImageEmbeddings() async throws { 173 | for model in [ 174 | "unum-cloud/uform3-image-text-english-small", 175 | "unum-cloud/uform3-image-text-english-base", 176 | "unum-cloud/uform3-image-text-english-large", 177 | "unum-cloud/uform3-image-text-multilingual-base", 178 | ] { 179 | try await testImageEmbeddings(forModel: model) 180 | } 181 | } 182 | 183 | } 184 | -------------------------------------------------------------------------------- /swift/README.md: -------------------------------------------------------------------------------- 1 | # UForm Swift SDK 2 | 3 | UForm offers first-party support for Swift. 4 | To get started, add UForm to your project using Swift Package Manager. 5 | 6 | ```bash 7 | swift package init --type executable 8 | swift package add uform 9 | ``` 10 | 11 | Then, import UForm in your Swift code: 12 | 13 | ```swift 14 | import UForm 15 | ``` 16 | 17 | ## Embeddings 18 | 19 | ### Text Embeddings 20 | 21 | ```swift 22 | let textModel = try await TextEncoder( 23 | modelName: "unum-cloud/uform3-image-text-english-small", 24 | computeUnits: .cpuAndNeuralEngine 25 | ) 26 | let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie." 27 | let textEmbedding: Embedding = try textModel.encode(text) 28 | let textVector: [Float32] = textEmbedding.asFloats() 29 | ``` 30 | 31 | ### Image Embeddings 32 | 33 | ```swift 34 | let imageModel = try await ImageEncoder( 35 | modelName: "unum-cloud/uform3-image-text-english-small", 36 | computeUnits: .cpuAndNeuralEngine 37 | ) 38 | let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true" 39 | guard let url = URL(string: imageURL), 40 | let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil), 41 | let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil) { 42 | throw Exception("Could not load image from URL: \(imageURL)") 43 | } 44 | 45 | var imageEmbedding: Embedding = try imageModel.encode(cgImage) 46 | var imageVector: [Float32] = embedding.asFloats() 47 | ``` 48 | 49 | ### Choosing Target Device 50 | 51 | Apple chips provide several functional units capable of high-throughput matrix multiplication and AI inference. 52 | Those `computeUnits` include the CPU, GPU, and Neural Engine. 53 | For maximum compatibility, the `.all` option is used by default. 54 | Sadly, Apple's scheduler is not always optimal, and it might be beneficial to specify the target device explicitly, especially if the models are pre-compiled for the Apple Neural Engine, as it may yield significant performance gains. 55 | 56 | | Model | GPU Text E. | ANE Text E. | GPU Image E. | ANE Image E. | 57 | | :------------------ | ----------: | ----------: | -----------: | -----------: | 58 | | `english-small` | 2.53 ms | 0.53 ms | 6.57 ms | 1.23 ms | 59 | | `english-base` | 2.54 ms | 0.61 ms | 18.90 ms | 3.79 ms | 60 | | `english-large` | 2.30 ms | 0.61 ms | 79.68 ms | 20.94 ms | 61 | | `multilingual-base` | 2.34 ms | 0.50 ms | 18.98 ms | 3.77 ms | 62 | 63 | > On Apple M4 iPad, running iOS 18.2. 64 | > Batch size is 1, and the model is pre-loaded into memory. 65 | > The original encoders use `f32` single-precision numbers for maximum compatibility, and mostly rely on __GPU__ for computation. 66 | > The quantized encoders use a mixture of `i8`, `f16`, and `f32` numbers for maximum performance, and mostly rely on the Apple Neural Engine (__ANE__) for computation. 67 | > The median latency is reported. 68 | 69 | ### Computing Distances 70 | 71 | There are several ways to compute distances between embeddings, once you have them. 72 | Naive Swift code might look like this: 73 | 74 | ```swift 75 | func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 { 76 | let dotProduct = zip(a, b).map(*).reduce(0, +) 77 | let normA = sqrt(a.map { $0 * $0 }.reduce(0, +)) 78 | let normB = sqrt(b.map { $0 * $0 }.reduce(0, +)) 79 | return dotProduct / (normA * normB) 80 | } 81 | ``` 82 | 83 | A faster way to compute distances is to use the Accelerate framework: 84 | 85 | ```swift 86 | import Accelerate 87 | 88 | func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 { 89 | var result: Float32 = 0 90 | var aNorm: Float32 = 0 91 | var bNorm: Float32 = 0 92 | vDSP_dotpr(a, 1, b, 1, &result, vDSP_Length(a.count)) 93 | vDSP_svesq(a, 1, &aNorm, vDSP_Length(a.count)) 94 | vDSP_svesq(b, 1, &bNorm, vDSP_Length(b.count)) 95 | return result / sqrt(aNorm * bNorm) 96 | } 97 | ``` 98 | 99 | An even faster approach would be to use USearch or SimSIMD, that work not only for `Float32` and `Float64`, but also for `Float16`, `Int8`, and binary embeddings. 100 | --------------------------------------------------------------------------------