├── Export_CLIP_to_ONNX_tflite_tfjs_tf_saved_model.ipynb ├── LICENSE ├── ONNX_float16_to_float32.ipynb ├── README.md ├── deno-example.js ├── enable-threads.js ├── nodejs-example.js ├── onnx-image-demo.html ├── onnx-text-demo.html └── tfjs-text-demo.html /Export_CLIP_to_ONNX_tflite_tfjs_tf_saved_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Export CLIP to ONNX/tflite/tfjs/tf saved model.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "accelerator": "GPU" 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "AtWchYtU0Dtv" 21 | }, 22 | "source": [ 23 | "# IMPORTANT: Make sure you're using a GPU runtime!" 24 | ], 25 | "execution_count": null, 26 | "outputs": [] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "metadata": { 31 | "id": "h_zSu-EKxlBP" 32 | }, 33 | "source": [ 34 | "# Based on this notebook: https://colab.research.google.com/github/openai/clip/blob/master/notebooks/Interacting_with_CLIP.ipynb" 35 | ], 36 | "execution_count": null, 37 | "outputs": [] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "metadata": { 42 | "id": "Ur4qmQUIBxwe" 43 | }, 44 | "source": [ 45 | "!git clone https://github.com/openai/CLIP\n", 46 | "%cd CLIP" 47 | ], 48 | "execution_count": null, 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "metadata": { 54 | "id": "Rbzq8REAy73u" 55 | }, 56 | "source": [ 57 | "# This is SUPER hacky because I don't know a better way (that's quick). Basically the vision model is ready to export as-is, like this:\n", 58 | "# torch.onnx.export(model.vision, ...)\n", 59 | "# but the text model has a couple of pre-processing steps (like converting tokens to embeddings), and I'd like to have all that\n", 60 | "# processing contained within the onnx file for the text encoder. The `torch.onnx.export` function seems to only be able to\n", 61 | "# take a *model* as an input, and not a function (like `model.encode_text`), so I'm hackily renaming `model.encode_text` to\n", 62 | "# `model.forward` so that I can then write:\n", 63 | "# torch.onnx.export(model, ...)\n", 64 | "# to export the text encoder. I'm sure there's a much better way to do this. If this stops working, note that\n", 65 | "# it was working at the following commit hash, so you can clone this to get it working: https://github.com/openai/CLIP/tree/573315e83f07b53a61ff5098757e8fc885f1703e\n", 66 | "!sed -i -e 's/def forward(self, image, text):/def old_forward(self, image, text):/g' ./clip/model.py\n", 67 | "!sed -i -e 's/def encode_text(self, text):/def forward(self, text):/g' ./clip/model.py" 68 | ], 69 | "execution_count": 2, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "0BpdJkdBssk9" 76 | }, 77 | "source": [ 78 | "! 
pip install ftfy regex tqdm" 79 | ], 80 | "execution_count": null, 81 | "outputs": [] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "metadata": { 86 | "id": "uLFS29hnhlY4" 87 | }, 88 | "source": [ 89 | "import numpy as np\n", 90 | "import torch\n", 91 | "import clip\n", 92 | "\n", 93 | "clip.available_models()" 94 | ], 95 | "execution_count": null, 96 | "outputs": [] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "metadata": { 101 | "id": "IBRVTY9lbGm8" 102 | }, 103 | "source": [ 104 | "model, preprocess = clip.load(\"ViT-B/32\")\n", 105 | "model.cuda().eval()\n", 106 | "input_resolution = model.visual.input_resolution\n", 107 | "context_length = model.context_length\n", 108 | "vocab_size = model.vocab_size\n", 109 | "\n", 110 | "print(\"Model parameters:\", f\"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}\")\n", 111 | "print(\"Input resolution:\", input_resolution)\n", 112 | "print(\"Context length:\", context_length)\n", 113 | "print(\"Vocab size:\", vocab_size)" 114 | ], 115 | "execution_count": null, 116 | "outputs": [] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "metadata": { 121 | "id": "qGom156-i2kL" 122 | }, 123 | "source": [ 124 | "clip.tokenize(\"Hello World!\")" 125 | ], 126 | "execution_count": null, 127 | "outputs": [] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "metadata": { 132 | "id": "tMc1AXzBlhzm" 133 | }, 134 | "source": [ 135 | "import os\n", 136 | "import skimage\n", 137 | "from PIL import Image\n", 138 | "import numpy as np\n", 139 | "\n", 140 | "from collections import OrderedDict\n", 141 | "import torch\n", 142 | "\n", 143 | "# images in skimage to use and their textual descriptions\n", 144 | "descriptions = {\n", 145 | " \"astronaut\": \"a portrait of an astronaut with the American flag\",\n", 146 | "}" 147 | ], 148 | "execution_count": 7, 149 | "outputs": [] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "metadata": { 154 | "id": "NSSrLY185jSf" 155 | }, 156 | "source": [ 157 | "original_images = []\n", 158 | "images = []\n", 159 | "texts = []\n", 160 | "\n", 161 | "for filename in [filename for filename in os.listdir(skimage.data_dir) if filename.endswith(\".png\") or filename.endswith(\".jpg\")]:\n", 162 | " name = os.path.splitext(filename)[0]\n", 163 | " if name not in descriptions:\n", 164 | " continue\n", 165 | "\n", 166 | " image = Image.open(os.path.join(skimage.data_dir, filename)).convert(\"RGB\")\n", 167 | " original_images.append(image)\n", 168 | " images.append(preprocess(image))\n", 169 | " texts.append(descriptions[name])" 170 | ], 171 | "execution_count": 8, 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "metadata": { 177 | "id": "HBgCanxi8JKw" 178 | }, 179 | "source": [ 180 | "image_input = torch.tensor(np.stack(images)).half().cuda()\n", 181 | "text_tokens = clip.tokenize([\"This is \" + desc for desc in texts]).cuda()" 182 | ], 183 | "execution_count": 9, 184 | "outputs": [] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "source": [ 189 | "model.visual(image_input)[0] # astronaut pic embedding" 190 | ], 191 | "metadata": { 192 | "id": "g0o8mDN6wq_L" 193 | }, 194 | "execution_count": null, 195 | "outputs": [] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "source": [ 200 | "model(text_tokens)[0] # astronaut text embedding" 201 | ], 202 | "metadata": { 203 | "id": "qEPHMWwN0Puv" 204 | }, 205 | "execution_count": null, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "metadata": { 211 | "id": "kDmmi0vMI9WY" 212 | }, 213 | "source": [ 214 | "torch.onnx.export(model, 
text_tokens, \"clip-text-vit-32.onnx\", export_params=True, opset_version=12, do_constant_folding=True, input_names = ['input'], output_names = ['output'], dynamic_axes={'input' : {0 : 'batch_size'}, 'output' : {0 : 'batch_size'}})" 215 | ], 216 | "execution_count": null, 217 | "outputs": [] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "metadata": { 222 | "id": "BLSGVjueonP0" 223 | }, 224 | "source": [ 225 | "torch.onnx.export(model.visual, image_input, \"clip-image-vit-32.onnx\", export_params=True, opset_version=12, do_constant_folding=True, input_names = ['input'], output_names = ['output'], dynamic_axes={'input' : {0 : 'batch_size'}, 'output' : {0 : 'batch_size'}})" 226 | ], 227 | "execution_count": 12, 228 | "outputs": [] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "metadata": { 233 | "id": "X0I6iPCOxB9M" 234 | }, 235 | "source": [ 236 | "# use this option in the above torch.onnx.export calls if you get a \"Unable to cast from non-held to held instance (T& to Holder)\" error:\n", 237 | "# operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "OhCoVnSo2XFr" 246 | }, 247 | "source": [ 248 | "# The onnx model files are now in the /content/CLIP directory." 249 | ], 250 | "execution_count": null, 251 | "outputs": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "source": [ 256 | "# Attempt at quantizing model to uint8 (doesn't seem to work? no errors, but onnx file is same size)\n", 257 | "# Reference: https://github.com/minimaxir/imgbeddings/blob/36fb4d7ac6b82694d109cef6f887d4cb9c49da0f/imgbeddings/models.py#L94\n", 258 | "# Here's the model the above code generates: https://huggingface.co/minimaxir/imgbeddings/blob/main/patch32_v1.onnx\n", 259 | "# Here's a demo of the above ONNX model with ORT Web: https://jsbin.com/nupehazaju/edit?html,output <-- seems to work, but this model doesn't have the projection head that squashes 768 vec to 512 elements (so can be compared to text embeddings of same length)\n", 260 | "!pip install onnxruntime\n", 261 | "!pip install onnx\n", 262 | "from onnxruntime.quantization import quantize_dynamic, QuantType\n", 263 | "quantize_dynamic(\"clip-image-vit-32.onnx\", \"clip-image-vit-32-uint8.onnx\", weight_type=QuantType.QUInt8, extra_options={\"MatMulConstBOnly\":False}) # I added the MatMulConstBOnly as a guess due to warnings that it outputs without it" 264 | ], 265 | "metadata": { 266 | "id": "24LcAdP2doTx" 267 | }, 268 | "execution_count": 22, 269 | "outputs": [] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "metadata": { 274 | "id": "44jzFoZzxPrf" 275 | }, 276 | "source": [ 277 | "# The code below is for converting to tflite, tfjs and tf saved model:" 278 | ], 279 | "execution_count": null, 280 | "outputs": [] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "metadata": { 285 | "id": "r2VoXSsyyFu-" 286 | }, 287 | "source": [ 288 | "!pip install git+https://github.com/onnx/onnx-tensorflow.git" 289 | ], 290 | "execution_count": null, 291 | "outputs": [] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "metadata": { 296 | "id": "n0axzSah0_h4" 297 | }, 298 | "source": [ 299 | "!onnx-tf convert -i clip-image-vit-32.onnx -o clip-image-vit-32-tf\n", 300 | "!onnx-tf convert -i clip-text-vit-32.onnx -o clip-text-vit-32-tf" 301 | ], 302 | "execution_count": null, 303 | "outputs": [] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "metadata": { 308 | "id": "2kDc0sPILbQu" 309 | }, 310 | 
"source": [ 311 | "!pip install tensorflowjs" 312 | ], 313 | "execution_count": null, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "metadata": { 319 | "id": "WXFWVZACLUR8" 320 | }, 321 | "source": [ 322 | "!tensorflowjs_converter --input_format tf_saved_model ./clip-image-vit-32-tf ./clip-image-vit-32-tfjs\n", 323 | "!tensorflowjs_converter --input_format tf_saved_model ./clip-text-vit-32-tf ./clip-text-vit-32-tfjs" 324 | ], 325 | "execution_count": null, 326 | "outputs": [] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "metadata": { 331 | "id": "g1Ub_dsaKqO8" 332 | }, 333 | "source": [ 334 | "import tensorflow as tf\n", 335 | "\n", 336 | "# image encoder:\n", 337 | "converter = tf.lite.TFLiteConverter.from_saved_model(\"./clip-image-vit-32-tf\")\n", 338 | "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS] # This line is needed because: https://github.com/tensorflow/tfjs/issues/5844\n", 339 | "tflite_model = converter.convert()\n", 340 | "with open('clip-image-vit-32.tflite', 'wb') as f:\n", 341 | " f.write(tflite_model)\n", 342 | "\n", 343 | "# text encoder:\n", 344 | "converter = tf.lite.TFLiteConverter.from_saved_model(\"./clip-text-vit-32-tf\")\n", 345 | "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS] # This line is needed because: https://github.com/tensorflow/tfjs/issues/5844\n", 346 | "tflite_model = converter.convert()\n", 347 | "with open('clip-text-vit-32.tflite', 'wb') as f:\n", 348 | " f.write(tflite_model)" 349 | ], 350 | "execution_count": null, 351 | "outputs": [] 352 | } 353 | ] 354 | } 355 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 josephrocca 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /ONNX_float16_to_float32.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "ONNX float16 to float32.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "zDyM0tPjW0KD" 23 | }, 24 | "source": [ 25 | "!git clone https://github.com/josephrocca/onnx-typecast # based on: https://github.com/aadhithya/onnx-typecast\n", 26 | "%cd onnx-typecast" 27 | ], 28 | "execution_count": null, 29 | "outputs": [] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "metadata": { 34 | "id": "v-sR4H6eXAHM" 35 | }, 36 | "source": [ 37 | "!pip install -r requirements.txt" 38 | ], 39 | "execution_count": null, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "id": "7pdZu8W7XITw" 46 | }, 47 | "source": [ 48 | "!python3 convert-float16-to-float32.py ./path/to/input.onnx ./path/to/output.onnx" 49 | ], 50 | "execution_count": null, 51 | "outputs": [] 52 | } 53 | ] 54 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **NOTE (Sept 7th 2023)**: At this point you *may* want to use [Transformers.js](https://github.com/xenova/transformers.js) instead since it's well-maintained and supports quantized models which are much smaller. That said, if you don't want to include the whole Transformers.js library in your app (as of writing I'm not sure if [tree-shaking](https://developer.mozilla.org/en-US/docs/Glossary/Tree_shaking) is supported yet), then you can still directly use ONNX Runtime Web with the quantized models produced by the [Transformers.js conversion scripts](https://huggingface.co/docs/transformers.js/custom_usage#convert-your-models-to-onnx). 2 | 3 | Here are the relevant modules for Transformers.js: 4 | 5 | * https://huggingface.co/docs/transformers.js/api/models#module_models.CLIPTextModelWithProjection 6 | * https://huggingface.co/docs/transformers.js/api/models#module_models.CLIPVisionModelWithProjection 7 | 8 | Here's a full working example that uses Transformers.js: 9 | ```js 10 | let quantized = false; // change to `true` for a much smaller model (e.g. 
87mb vs 345mb for image model), but lower accuracy 11 | let { AutoProcessor, CLIPVisionModelWithProjection, RawImage, AutoTokenizer, CLIPTextModelWithProjection } = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.5.4/dist/transformers.js'); 12 | let imageProcessor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16'); 13 | let visionModel = await CLIPVisionModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16', {quantized}); 14 | let tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch16'); 15 | let textModel = await CLIPTextModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16', {quantized}); 16 | 17 | function cosineSimilarity(A, B) { 18 | if(A.length !== B.length) throw new Error("A.length !== B.length"); 19 | let dotProduct = 0, mA = 0, mB = 0; 20 | for(let i = 0; i < A.length; i++){ 21 | dotProduct += A[i] * B[i]; 22 | mA += A[i] * A[i]; 23 | mB += B[i] * B[i]; 24 | } 25 | mA = Math.sqrt(mA); 26 | mB = Math.sqrt(mB); 27 | let similarity = dotProduct / (mA * mB); 28 | return similarity; 29 | } 30 | 31 | // get image embedding: 32 | let image = await RawImage.read('https://i.imgur.com/RKsLoNB.png'); 33 | let imageInputs = await imageProcessor(image); 34 | let { image_embeds } = await visionModel(imageInputs); 35 | console.log(image_embeds.data); 36 | 37 | // get text embedding: 38 | let texts = ['a photo of an astronaut']; 39 | let textInputs = tokenizer(texts, { padding: true, truncation: true }); 40 | let { text_embeds } = await textModel(textInputs); 41 | console.log(text_embeds.data); 42 | 43 | let similarity = cosineSimilarity(image_embeds.data, text_embeds.data); 44 | console.log(similarity); 45 | ``` 46 | Note that the above code uses `clip-vit-base-patch16` instead of what's used in this repo, `clip-vit-base-patch32` - not sure which is best, but you can change `patch16` to `patch32` in the above code if you want to test it. Also note that you'll see some `GET`/`404` errors in the console - that's expected, since Transformers.js tries to load models locally first. There's probably a way to disable this. 47 | 48 | Transformers.js also has a **ton** of other models available, and it's quite easy to use. E.g. here's an example of a text embedding / retrieval model: 49 | ```js 50 | let { pipeline } = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.5.4/dist/transformers.js'); 51 | let extractor = await pipeline('feature-extraction', 'Xenova/e5-large-v2'); 52 | let dotProduct = (vec1, vec2) => vec1.reduce((sum, val, i) => sum + val * vec2[i], 0); 53 | 54 | let passage1 = await extractor('passage: She likes carrots and celery.', { pooling: 'mean', normalize: true }); 55 | let passage2 = await extractor('passage: This is a good calculus guide.', { pooling: 'mean', normalize: true }); 56 | let query = await extractor('query: Taking care of rabbits', { pooling: 'mean', normalize: true }); 57 | 58 | let similarity1 = dotProduct(query.data, passage1.data); 59 | let similarity2 = dotProduct(query.data, passage2.data); 60 | ``` 61 | 62 | --- 63 | 64 | # OpenAI CLIP JavaScript 65 | OpenAI's CLIP model ported to JavaScript using the ONNX web runtime. I also got the LiT models working [here](https://github.com/josephrocca/lit-encoder-js). 
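If you want to skip Transformers.js and call ONNX Runtime Web directly (which is what the demos linked below and `deno-example.js`/`nodejs-example.js` in this repo do), the flow looks roughly like the sketch below. It's a minimal browser-side sketch, not a drop-in implementation: it assumes `ort` is already loaded (e.g. via the same `onnxruntime-web` jsDelivr script that `deno-example.js` uses) and that `img` is a loaded `<img>` element you want to embed; the model URL, input/output names, tensor shape, and normalization constants are the ones used by the examples in this repo.

```js
// Load the float32 image encoder from the HuggingFace repo (wasm backend):
const session = await ort.InferenceSession.create(
  "https://huggingface.co/rocca/openai-clip-js/resolve/main/clip-image-vit-32-float32.onnx",
  { executionProviders: ["wasm"] },
);

// Draw the image at 224x224 (note: canvas resizing isn't bicubic - see the notes further down):
const canvas = document.createElement("canvas");
canvas.width = canvas.height = 224;
const ctx = canvas.getContext("2d");
ctx.drawImage(img, 0, 0, 224, 224);
const { data } = ctx.getImageData(0, 0, 224, 224);

// Convert RGBA pixels to planar float32 RGB, normalized with CLIP's mean/std:
const mean = [0.48145466, 0.4578275, 0.40821073];
const std = [0.26862954, 0.26130258, 0.27577711];
const rgb = new Float32Array(3 * 224 * 224);
for (let i = 0; i < 224 * 224; i++) {
  for (let c = 0; c < 3; c++) {
    rgb[c * 224 * 224 + i] = (data[i * 4 + c] / 255 - mean[c]) / std[c];
  }
}

// Run inference; the exported model uses "input" and "output" as its tensor names:
const results = await session.run({ input: new ort.Tensor("float32", rgb, [1, 3, 224, 224]) });
console.log(results.output.data); // image embedding (512 floats for ViT-B/32)
```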
66 | 
67 | **Minimal demos**:
68 | * Image model: https://josephrocca.github.io/openai-clip-js/onnx-image-demo.html
69 | * Text model: https://josephrocca.github.io/openai-clip-js/onnx-text-demo.html
70 | 
71 | **Example applications**:
72 | * Sorting/searching a local folder of images using a text prompt: https://github.com/josephrocca/clip-image-sorter
73 | 
74 | **Server side**:
75 | * Deno: https://github.com/josephrocca/openai-clip-js/blob/main/deno-example.js
76 | * Node.js: https://github.com/josephrocca/openai-clip-js/blob/main/nodejs-example.js
77 | 
78 | **Notes:**
79 | 
80 | * The model files are about **4x** larger than they actually need to be - params are float32 instead of uint8. If you're using CLIP in a "real" web app, you should probably quantize it. [@minimaxir](https://github.com/minimaxir) has done it ([1](https://github.com/minimaxir/imgbeddings/blob/36fb4d7ac6b82694d109cef6f887d4cb9c49da0f/imgbeddings/models.py#L94), [2](https://huggingface.co/minimaxir/imgbeddings/blob/main/patch32_v1.onnx)), and that model [worked first try](https://jsbin.com/nupehazaju/edit?html,output) with ORT Web (which is amazing), but it outputs a 768-element vector instead of 512, which I think is because @minimaxir's model is missing the final projection head that puts image embeddings into the same-sized space as the text embeddings. I had a quick attempt at it in [the ONNX export notebook](https://colab.research.google.com/github/josephrocca/openai-clip-js/blob/main/Export_CLIP_to_ONNX_tflite_tfjs_tf_saved_model.ipynb) (see the cell after the ONNX conversion), but it doesn't seem to be working. If you investigate this and get it working, please open an issue. Thanks to [@congraIiIso](https://twitter.com/congraIiIso) on Twitter for bringing the uint8 quantization to my attention! **Edit**: I've managed to get quantization "working", but the embeddings that the quantized models produce don't seem to be close enough to the correct embeddings. See [this comment](https://github.com/josephrocca/openai-clip-js/issues/3#issuecomment-1221482824) for details.
81 | * You should use bicubic resizing of images to get the most accurate embeddings. [Here's a simple](https://gist.github.com/josephrocca/d97e0532f34e1205f4006d45ca909024) copy-paste JavaScript bicubic resize + center crop function that uses [wasm-vips](https://github.com/kleisauke/wasm-vips).
82 | * More info: The above-linked image model demo uses the default HTML5 canvas resize algorithm when pre-processing the input image. This is apparently not bicubic (which is what OpenAI's CLIP repo uses), so the embeddings come out a bit different from what Pytorch gives. I'm not sure if this will end up mattering in practical usage, but in case it matters to you, you should not use canvas resizing, and should instead use an actual bicubic resizer. For example, [this astronaut pic](https://i.imgur.com/ec4Ao4s.png) has this embedding with the Pytorch model: `[0.3181,0.3054,-0.1548,0.0767,-0.1699,0.1320,-0.2974,-0.1940,-0.3052,0.2299,0.1995, -0.3025,0.3108,-0.2305,0.2368, ...]` and ONNX Runtime Web (wasm backend) gives: `[0.3635,0.3301,-0.1093,0.0598,-0.1526,0.1127,-0.3373,-0.1544,-0.2627,0.2372,-0.2012,-0.3182,0.3022,-0.2940,0.2227, ...]`. If you pre-resize the image with a bicubic algorithm ([like this](https://i.imgur.com/RKsLoNB.png) - the default image used in the demo), then the embeddings are basically the same.
83 | * The ONNX text model produces embeddings that seem to be close enough to the Pytorch model based on "eyeballing" some image/text matching tasks, but note that there are some non-trivial-looking differences. Again, I don't know whether these differences are enough to significantly affect real-world usage. Please feel free to open an issue if you manage to run some proper tests. Here are the embeddings for "a portrait of an astronaut with the American flag" in Pytorch and ONNX: 84 | * Pytorch: `[-0.16650, 0.05167, -0.15320, 0.44922, 0.20642, -0.29565, 0.04041, -0.41064, -0.15015, 0.31934, -0.06842, -0.25464, 0.12311, -0.09509, 0.24109, -0.04883, 0.26074, 0.00045, 0.20972, 0.36987, ...]` 85 | * ONNX: `[-0.19535, 0.01808, -0.09647, 0.61671, 0.17760, -0.30735, -0.03580, -0.31977, -0.21485, 0.38863, 0.05983, -0.24685, 0.17829, -0.16579, 0.17799, -0.07826, 0.28496, -0.02429, 0.11830, 0.37698, ...]` 86 | * Models are served to the browser directly from [this HuggingFace 🤗 repo](https://huggingface.co/rocca/openai-clip-js/tree/main). 87 | * Regarding model conversion: 88 | * I used [this Colab notebook](https://colab.research.google.com/github/josephrocca/openai-clip-js/blob/main/Export_CLIP_to_ONNX_tflite_tfjs_tf_saved_model.ipynb) to convert the Pytorch models to ONNX/tfjs/etc. 89 | * I used [this Colab notebook](https://colab.research.google.com/github/josephrocca/openai-clip-js/blob/main/ONNX_float16_to_float32.ipynb) to convert weights from float16 to float32 because the ONNX web runtime doesn't currently support float16. This means that the model files are twice as big as they should be ([issue](https://github.com/microsoft/onnxruntime/issues/9758)). 90 | * See the comment at the top of [this file](https://github.com/josephrocca/onnx-typecast/blob/master/fix-clip-text-vit-32-float32---scratch.py) for an extra conversion step that needs to be applied to the text model to avoid [this error](https://github.com/microsoft/onnxruntime/issues/9760#issue-1053052192). 91 | 92 | 93 | **Todo (maybe):** 94 | * Try tfjs runtime if [this issue](https://github.com/tensorflow/tfjs/issues/5847) gets resolved. 95 | * Try to get tflite model exporting and working. 
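**Text model example:** For reference, here's a minimal sketch of getting a text embedding server-side, mirroring the commented-out text path in `deno-example.js`. The tokenizer package, model URL, and tensor names/shape are the ones referenced there; treat the rest (e.g. running it top-level in Deno) as an untested starting point rather than a verified recipe.

```js
// Deno sketch - `ort` is exposed as a global by the onnxruntime-web script, as in deno-example.js
import "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.15.1/dist/ort.js";
import Tokenizer from "https://deno.land/x/clip_bpe@v0.0.6/mod.js";

ort.env.wasm.wasmPaths = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.15.1/dist/";

const textSession = await ort.InferenceSession.create(
  "https://huggingface.co/rocca/openai-clip-js/resolve/main/clip-text-vit-32-float32-int32.onnx",
  { executionProviders: ["wasm"] },
);
const tokenizer = new Tokenizer();

async function embedText(text) {
  // encodeForCLIP produces the 77-token context window that the exported text model expects
  const tokens = Int32Array.from(tokenizer.encodeForCLIP(text));
  const feeds = { input: new ort.Tensor("int32", tokens, [1, 77]) };
  const results = await textSession.run(feeds);
  return results.output.data; // text embedding, same length as the image embeddings
}

console.log(await embedText("a portrait of an astronaut with the American flag"));
```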
96 | -------------------------------------------------------------------------------- /deno-example.js: -------------------------------------------------------------------------------- 1 | import { createCanvas, loadImage } from "https://deno.land/x/canvas@v1.4.1/mod.ts"; 2 | import { serve } from "https://deno.land/std@0.144.0/http/server.ts"; 3 | import "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.15.1/dist/ort.js"; 4 | 5 | ort.env.wasm.wasmPaths = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.15.1/dist/"; 6 | 7 | let onnxImageSession = await ort.InferenceSession.create("https://huggingface.co/rocca/openai-clip-js/resolve/main/clip-image-vit-32-float32.onnx", { executionProviders: ["wasm"] }); 8 | 9 | // let onnxTextSession = await ort.InferenceSession.create("https://huggingface.co/rocca/openai-clip-js/resolve/main/clip-text-vit-32-float32-int32.onnx", { executionProviders: ["wasm"] }); 10 | // let Tokenizer = (await import("https://deno.land/x/clip_bpe@v0.0.6/mod.js")).default; 11 | // let textTokenizer = new Tokenizer(); 12 | 13 | console.log("Finished loading CLIP image model."); 14 | 15 | await serve(async request => { 16 | if(!URL.canParse(request.url)) return new Response("Invalid URL."); 17 | 18 | const urlData = new URL(request.url); 19 | const params = Object.fromEntries(urlData.searchParams.entries()); 20 | const path = urlData.pathname; 21 | const ip = request.headers.get('CF-Connecting-IP'); 22 | 23 | if(path === "/api/image") { 24 | console.log("params.imageUrl", params.imageUrl); 25 | let imageUrl = params.imageUrl ?? (await request.json()).imageUrl; 26 | let embedding = await embedImage(imageUrl); 27 | return new Response(JSON.stringify([...embedding])); 28 | } 29 | 30 | return new Response("Not found.", {status:404}); 31 | }, {port: Deno.env.get("PORT")}); 32 | 33 | async function embedImage(url) { 34 | let rgbData = await getRgbData(url); 35 | 36 | const feeds = {'input': new ort.Tensor('float32', rgbData, [1,3,224,224])}; 37 | 38 | let t = Date.now(); 39 | console.log("Running inference..."); 40 | const results = await onnxImageSession.run(feeds); 41 | console.log(`Finished inference in ${Date.now()-t}ms`); 42 | 43 | const data = results["output"].data; 44 | // console.log(`data of result tensor 'output'`, data); 45 | return data; 46 | } 47 | 48 | // async function embedText(text) { 49 | // let textTokens = textTokenizer.encodeForCLIP(text); 50 | // textTokens = Int32Array.from(textTokens); 51 | // const feeds = {input: new ort.Tensor('int32', textTokens, [1, 77])}; 52 | // const results = await onnxTextSession.run(feeds); 53 | // return [...results["output"].data]; 54 | // } 55 | 56 | async function getRgbData(imgUrl) { 57 | let img = await loadImage(imgUrl); 58 | let canvas = createCanvas(224, 224); 59 | let ctx = canvas.getContext("2d"); 60 | ctx.drawImage(img, 0, 0, canvas.width, canvas.height); 61 | let imageData = ctx.getImageData(0, 0, canvas.width, canvas.height); 62 | 63 | let rgbData = [[], [], []]; // [r, g, b] 64 | // remove alpha and put into correct shape: 65 | let d = imageData.data; 66 | for(let i = 0; i < d.length; i += 4) { 67 | let x = (i/4) % canvas.width; 68 | let y = Math.floor((i/4) / canvas.width) 69 | if(!rgbData[0][y]) rgbData[0][y] = []; 70 | if(!rgbData[1][y]) rgbData[1][y] = []; 71 | if(!rgbData[2][y]) rgbData[2][y] = []; 72 | rgbData[0][y][x] = d[i+0]/255; 73 | rgbData[1][y][x] = d[i+1]/255; 74 | rgbData[2][y][x] = d[i+2]/255; 75 | // From CLIP repo: Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 
0.27577711))
76 |     rgbData[0][y][x] = (rgbData[0][y][x] - 0.48145466) / 0.26862954;
77 |     rgbData[1][y][x] = (rgbData[1][y][x] - 0.4578275) / 0.26130258;
78 |     rgbData[2][y][x] = (rgbData[2][y][x] - 0.40821073) / 0.27577711;
79 |   }
80 |   rgbData = Float32Array.from(rgbData.flat().flat());
81 |   return rgbData;
82 | }
83 | 
--------------------------------------------------------------------------------
/enable-threads.js:
--------------------------------------------------------------------------------
 1 | // NOTE: This file creates a service worker that cross-origin-isolates the page (read more here: https://web.dev/coop-coep/) which allows us to use wasm threads.
 2 | // Normally you would set the COOP and COEP headers on the server to do this, but GitHub Pages doesn't allow this, so this is a hack to do that.
 3 | 
 4 | /* Edited version of: coi-serviceworker v0.1.6 - Guido Zuidhof, licensed under MIT */
 5 | // From here: https://github.com/gzuidhof/coi-serviceworker
 6 | if(typeof window === 'undefined') {
 7 |   self.addEventListener("install", () => self.skipWaiting());
 8 |   self.addEventListener("activate", e => e.waitUntil(self.clients.claim()));
 9 | 
10 |   async function handleFetch(request) {
11 |     if(request.cache === "only-if-cached" && request.mode !== "same-origin") {
12 |       return;
13 |     }
14 | 
15 |     if(request.mode === "no-cors") { // We need to set `credentials` to "omit" for no-cors requests, per this comment: https://bugs.chromium.org/p/chromium/issues/detail?id=1309901#c7
16 |       request = new Request(request.url, {
17 |         cache: request.cache,
18 |         credentials: "omit",
19 |         headers: request.headers,
20 |         integrity: request.integrity,
21 |         destination: request.destination,
22 |         keepalive: request.keepalive,
23 |         method: request.method,
24 |         mode: request.mode,
25 |         redirect: request.redirect,
26 |         referrer: request.referrer,
27 |         referrerPolicy: request.referrerPolicy,
28 |         signal: request.signal,
29 |       });
30 |     }
31 | 
32 |     let r = await fetch(request).catch(e => console.error(e));
33 | 
34 |     if(r.status === 0) {
35 |       return r;
36 |     }
37 | 
38 |     const headers = new Headers(r.headers);
39 |     headers.set("Cross-Origin-Embedder-Policy", "credentialless"); // or: require-corp
40 |     headers.set("Cross-Origin-Opener-Policy", "same-origin");
41 | 
42 |     return new Response(r.body, { status: r.status, statusText: r.statusText, headers });
43 |   }
44 | 
45 |   self.addEventListener("fetch", function(e) {
46 |     e.respondWith(handleFetch(e.request)); // respondWith must be executed synchronously (but can be passed a Promise)
47 |   });
48 | 
49 | } else {
50 |   (async function() {
51 |     if(window.crossOriginIsolated !== false) return;
52 | 
53 |     let registration = await navigator.serviceWorker.register(window.document.currentScript.src).catch(e => console.error("COOP/COEP Service Worker failed to register:", e));
54 |     if(registration) {
55 |       console.log("COOP/COEP Service Worker registered", registration.scope);
56 | 
57 |       registration.addEventListener("updatefound", () => {
58 |         console.log("Reloading page to make use of updated COOP/COEP Service Worker.");
59 |         window.location.reload();
60 |       });
61 | 
62 |       // If the registration is active, but it's not controlling the page
63 |       if(registration.active && !navigator.serviceWorker.controller) {
64 |         console.log("Reloading page to make use of COOP/COEP Service Worker.");
65 |         window.location.reload();
66 |       }
67 |     }
68 |   })();
69 | }
70 | 
71 | // Code to deregister:
72 | // let registrations = await navigator.serviceWorker.getRegistrations();
73 | // for(let registration of
registrations) { 74 | // await registration.unregister(); 75 | // } 76 | -------------------------------------------------------------------------------- /nodejs-example.js: -------------------------------------------------------------------------------- 1 | // npm install canvas onnxruntime-web 2 | const { createCanvas, loadImage } = require('canvas'); 3 | const ort = require('onnxruntime-web'); 4 | 5 | ort.env.wasm.numThreads = 1; // otherwise for some reason I get "TypeError [ERR_WORKER_PATH]: The worker script or module filename must be an absolute path" 6 | 7 | let onnxImageSession; 8 | 9 | (async function() { 10 | console.log("loading clip model..."); 11 | onnxImageSession = await ort.InferenceSession.create("https://huggingface.co/rocca/openai-clip-js/resolve/main/clip-image-vit-32-float32.onnx", { executionProviders: ["wasm"] }); 12 | console.log("loaded. now running inference..."); 13 | await embedImage("https://i.imgur.com/RKsLoNB.png"); // can also pass it a dataURL 14 | })(); 15 | 16 | async function embedImage(url) { 17 | let rgbData = await getRgbData(url); 18 | 19 | const feeds = {'input': new ort.Tensor('float32', rgbData, [1,3,224,224])}; 20 | 21 | let t = Date.now(); 22 | console.log("Running inference..."); 23 | const results = await onnxImageSession.run(feeds); 24 | console.log(`Finished inference in ${Date.now()-t}ms`); 25 | 26 | const data = results["output"].data; 27 | // console.log(`data of result tensor 'output'`, data); 28 | return data; 29 | } 30 | 31 | async function embedText(text) { 32 | let textTokens = textTokenizer.encodeForCLIP(text); 33 | textTokens = Int32Array.from(textTokens); 34 | const feeds = {input: new ort.Tensor('int32', textTokens, [1, 77])}; 35 | const results = await onnxTextSession.run(feeds); 36 | return [...results["output"].data]; 37 | } 38 | 39 | async function getRgbData(imgUrl) { 40 | let img = await loadImage(imgUrl); 41 | let canvas = createCanvas(224, 224); 42 | let ctx = canvas.getContext("2d"); 43 | ctx.drawImage(img, 0, 0, canvas.width, canvas.height); 44 | let imageData = ctx.getImageData(0, 0, canvas.width, canvas.height); 45 | 46 | let rgbData = [[], [], []]; // [r, g, b] 47 | // remove alpha and put into correct shape: 48 | let d = imageData.data; 49 | for(let i = 0; i < d.length; i += 4) { 50 | let x = (i/4) % canvas.width; 51 | let y = Math.floor((i/4) / canvas.width) 52 | if(!rgbData[0][y]) rgbData[0][y] = []; 53 | if(!rgbData[1][y]) rgbData[1][y] = []; 54 | if(!rgbData[2][y]) rgbData[2][y] = []; 55 | rgbData[0][y][x] = d[i+0]/255; 56 | rgbData[1][y][x] = d[i+1]/255; 57 | rgbData[2][y][x] = d[i+2]/255; 58 | // From CLIP repo: Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711)) 59 | rgbData[0][y][x] = (rgbData[0][y][x] - 0.48145466) / 0.26862954; 60 | rgbData[1][y][x] = (rgbData[1][y][x] - 0.4578275) / 0.26130258; 61 | rgbData[2][y][x] = (rgbData[2][y][x] - 0.40821073) / 0.27577711; 62 | } 63 | rgbData = Float32Array.from(rgbData.flat().flat()); 64 | return rgbData; 65 | } 66 | -------------------------------------------------------------------------------- /onnx-image-demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | OpenAI CLIP JavaScript - Image Demo - ONNX Web Runtime 5 | 6 | 7 | 8 | 9 | 10 |
11 | imgur.com url (ideally 224x224): 12 | 13 | 14 |
15 | backend: 19 |
20 | quantized: 24 |
25 | 26 |
27 |

github repo - huggingface repo

28 | 29 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /onnx-text-demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | OpenAI CLIP JavaScript - Text Demo - ONNX Web Runtime 5 | 6 | 7 | 8 | 9 | 10 |
11 | input text 12 |
13 | quantized: 17 |
18 | 19 |
20 |

github repo

21 | 22 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /tfjs-text-demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | OpenAI CLIP JavaScript - Text Demo - tfjs 5 | 6 | 7 | 8 | 9 | 10 |

Note: To run this you need to clone this repo, and then download the tfjs model folder from here and name the folder "clip-text-vit-32-tfjs", and then run a static file server in the repo directory.

11 | 12 |
13 | input text 14 | backend: 18 | 19 |
20 |

github repo - huggingface repo

21 | 22 | 50 | 51 | 52 | --------------------------------------------------------------------------------