├── FineTuning_LayoutLMv3_Trainer_HF_DocLayNet.ipynb ├── Fine_tune_KOSMOS_2_for_multimodal_grounding.ipynb ├── LayoutLMv3_Inference.ipynb ├── README.md ├── SAM_DocLayNet.ipynb ├── UDOPEncoderModel_fine_tune_DocLayNet.ipynb ├── UDOP_DocLayNet_Inference.ipynb └── test.png /Fine_tune_KOSMOS_2_for_multimodal_grounding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "PPs11Mw5p1RA" 17 | }, 18 | "source": [ 19 | "## Inference with KOSMOS-2 for multimodal grounding and referral\n", 20 | "\n", 21 | "In this notebook, we'll perform inference with Microsoft's new impressive multimodal large language model (LLM) called [KOSMOS-2](https://huggingface.co/docs/transformers/main/en/model_doc/kosmos-2).\n", 22 | "\n", 23 | "\n", 24 | "https://github.com/huggingface/transformers/blob/main/src/transformers/trainer.py#L619\n", 25 | "\n", 26 | "https://github.com/huggingface/transformers/blob/main/src/transformers/trainer.py#L2924\n", 27 | "\n", 28 | "https://discuss.huggingface.co/t/how-is-the-data-shifted-by-one-token-during-causallm-fine-tuning/36386\n", 29 | "\n", 30 | "https://github.com/huggingface/transformers/blob/b2748a6efd045dd771f8fd48e8b309cbc061c618/src/transformers/models/kosmos2/__init__.py\n", 31 | "\n", 32 | "https://github.com/microsoft/unilm/blob/master/kosmos-2/fairseq/fairseq/logging/metrics.py\n", 33 | "\n", 34 | "https://github.com/huggingface/transformers/blob/main/src/transformers/trainer_pt_utils.py#L482" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "gUTmGeUXngno" 41 | }, 42 | "source": [ 43 | "## Set-up environment\n", 44 | "\n", 45 | "Let's start by installing 🤗 Transformers. We install from main here since the model is brand new at the time of writing. We also install Accelerate and Bitsandbytes since those will provide [4-bit inference](https://huggingface.co/blog/4bit-transformers-bitsandbytes), greatly reducing the memory requirements to load the model (without those I wouldn't be able to load the model in Google Colab)." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "id": "NIZ4b6kQnewl", 53 | "colab": { 54 | "base_uri": "https://localhost:8080/" 55 | }, 56 | "outputId": "3b606ff3-e773-4008-d24c-d57a71a296aa" 57 | }, 58 | "outputs": [ 59 | { 60 | "output_type": "stream", 61 | "name": "stdout", 62 | "text": [ 63 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.8/8.8 MB\u001b[0m \u001b[31m29.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 64 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m290.1/290.1 kB\u001b[0m \u001b[31m30.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 65 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m102.2/102.2 MB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 66 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 67 | "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 68 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 69 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m510.5/510.5 kB\u001b[0m \u001b[31m41.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 70 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m14.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 71 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m20.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 72 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m16.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 73 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m23.7/23.7 MB\u001b[0m \u001b[31m42.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 74 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m823.6/823.6 kB\u001b[0m \u001b[31m64.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 75 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.1/14.1 MB\u001b[0m \u001b[31m54.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 76 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m731.7/731.7 MB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 77 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m410.6/410.6 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 78 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.6/121.6 MB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 79 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.5/56.5 MB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 80 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.2/124.2 MB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 81 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m196.0/196.0 MB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 82 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m166.0/166.0 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 83 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m99.1/99.1 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 84 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 85 | "\u001b[?25h Building wheel for seqeval (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "# install required libaries\n", 91 | "!pip install -q -U transformers accelerate bitsandbytes seqeval evaluate" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": { 98 | "id": "91psL2aw4uN_" 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "from transformers import AutoProcessor, AutoModelForVision2Seq\n", 103 | "import requests\n", 104 | "from datasets import load_dataset\n", 105 | "from datasets.features import ClassLabel\n", 106 | "import re\n", 107 | "from PIL import Image, ImageDraw, ImageFont\n", 108 | "import math\n", 109 | "import random\n", 110 | "from transformers import Kosmos2Config, Kosmos2Model" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": { 116 | "id": "W8E--vZukZT3" 117 | }, 118 | "source": [ 119 | "\n", 120 | "\n", 121 | "> The image resolution is set to 1280×1280 and the patch size is 10×10. We divide the width and height of the image into 256 bins, with each bin consisting of 5×5 pixels. A total of 256×256 location tokens are added to the vocabulary.\n", 122 | "\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "id": "Dku7xH2hlTRg" 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "## Config of Kosmos2 changed just to demonstrate fine tuning\n", 134 | "# configuration For to test fine tuning code\n", 135 | "configuration = Kosmos2Config(\n", 136 | " text_config = {\"layers\" : 4},\n", 137 | " vision_config = {\"num_hidden_layers\" : 4}\n", 138 | ")\n", 139 | "'''\n", 140 | "# configuration for actual fine-tuning\n", 141 | "configuration = Kosmos2Config()\n", 142 | "'''" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "id": "73Ik8HzakeBN", 150 | "colab": { 151 | "base_uri": "https://localhost:8080/" 152 | }, 153 | "outputId": "50e53879-426d-413d-b145-93b25a80a7d0" 154 | }, 155 | "outputs": [ 156 | { 157 | "output_type": "stream", 158 | "name": "stderr", 159 | "text": [ 160 | "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", 161 | "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", 162 | "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", 163 | "You will be able to reuse this secret in all of your notebooks.\n", 164 | "Please note that authentication is recommended but still optional to access public models or datasets.\n", 165 | " warnings.warn(\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "from transformers import Kosmos2ForConditionalGeneration\n", 171 | "from transformers import Kosmos2Config, Kosmos2Model, AutoProcessor\n", 172 | "\n", 173 | "model = Kosmos2ForConditionalGeneration.from_pretrained(\"microsoft/kosmos-2-patch14-224\", device_map=\"auto\", config = configuration)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "id": "A4TyIx7kkTrg", 181 | "colab": { 182 | "base_uri": "https://localhost:8080/" 183 | }, 184 | "outputId": "59fc80bb-ed75-4121-ac1a-0472431d0f4c" 185 | }, 186 | "outputs": [ 187 | { 188 | "output_type": "stream", 189 | "name": "stderr", 190 | "text": [ 191 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" 192 | ] 193 | } 194 | ], 195 | 
"source": [ 196 | "processor = AutoProcessor.from_pretrained(\"microsoft/kosmos-2-patch14-224\", add_eos_token=True, device_map=\"auto\")" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": { 202 | "id": "2LkHlPBtpUX8" 203 | }, 204 | "source": [ 205 | "## Load model\n", 206 | "\n", 207 | "Next, let's load the model along with its processor. We specify `load_in_4bit=True` to reduce the size of the weights to be able to load the model in Google Colab. This is all thanks to the magic of bitsandbytes' integration in the Transformers library (see [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes) for all info). We also specify to place the model on the GPU (with id=0, meaning the first GPU on our system)." 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "colab": { 215 | "base_uri": "https://localhost:8080/" 216 | }, 217 | "id": "N8kVsop21jgE", 218 | "outputId": "2c7217eb-5237-4821-af69-8f2a49327c64" 219 | }, 220 | "outputs": [ 221 | { 222 | "output_type": "stream", 223 | "name": "stdout", 224 | "text": [ 225 | "Train dataset size: 4\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "# dataset_id =\"pierreguillou/DocLayNet-small\"\n", 231 | "# This dataset is takes from DocLayNet dataset\n", 232 | "## This finetuning was done for the layout detection in the any image. Task was to find table/ header/ footer... from the any given image.\n", 233 | "dataset_id = \"Mit1208/test_dataset\"\n", 234 | "\n", 235 | "dataset = load_dataset(dataset_id, trust_remote_code=True)\n", 236 | "\n", 237 | "print(f\"Train dataset size: {len(dataset['train'])}\")\n", 238 | "# print(f\"Test dataset size: {len(dataset['test'])}\")" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "id": "oQkH4NpT10nF" 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "# Remove data which has no text\n", 250 | "# https://github.com/huggingface/transformers/blob/main/src/transformers/models/kosmos2/processing_kosmos2.py#L154\n", 251 | "dataset = dataset.filter(lambda example: len(example['texts']) > 0)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "source": [ 257 | "## Define variables\n", 258 | "\n", 259 | "below part is to defind id2label and label2id, some of the code is for creating visualization of layouts on the image (you can ignore color part)." 
260 | ], 261 | "metadata": { 262 | "id": "6LY4J4LBACVy" 263 | } 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "id": "RqL5-AhlEyjd" 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "features = dataset[\"train\"].features\n", 274 | "column_names = dataset[\"train\"].column_names\n", 275 | "image_column_name = \"image\"\n", 276 | "text_column_name = \"texts\"\n", 277 | "boxes_column_name = \"bboxes_block\"\n", 278 | "label_column_name = \"categories\"\n", 279 | "\n", 280 | "# In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the\n", 281 | "# unique labels.\n", 282 | "def get_label_list(labels):\n", 283 | " unique_labels = set()\n", 284 | " for label in labels:\n", 285 | " unique_labels = unique_labels | set(label)\n", 286 | " label_list = list(unique_labels)\n", 287 | " label_list.sort()\n", 288 | " return label_list\n", 289 | "\n", 290 | "if isinstance(features[label_column_name].feature, ClassLabel):\n", 291 | " label_list = features[label_column_name].feature.names\n", 292 | " # No need to convert the labels since they are already ints.\n", 293 | " id2label = {k: v for k,v in enumerate(label_list)}\n", 294 | " label2id = {v: k for k,v in enumerate(label_list)}\n", 295 | "else:\n", 296 | " label_list = get_label_list(dataset[\"train\"][label_column_name])\n", 297 | " id2label = {k: v for k,v in enumerate(label_list)}\n", 298 | " label2id = {v: k for k,v in enumerate(label_list)}\n", 299 | "num_labels = len(label_list)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": { 306 | "colab": { 307 | "base_uri": "https://localhost:8080/" 308 | }, 309 | "id": "O4KB67IxD8IP", 310 | "outputId": "12efe284-43c5-49a0-98b1-5b4e72660899" 311 | }, 312 | "outputs": [ 313 | { 314 | "output_type": "execute_result", 315 | "data": { 316 | "text/plain": [ 317 | "{0: 'Caption',\n", 318 | " 1: 'Footnote',\n", 319 | " 2: 'Formula',\n", 320 | " 3: 'List-item',\n", 321 | " 4: 'Page-footer',\n", 322 | " 5: 'Page-header',\n", 323 | " 6: 'Picture',\n", 324 | " 7: 'Section-header',\n", 325 | " 8: 'Table',\n", 326 | " 9: 'Text',\n", 327 | " 10: 'Title'}" 328 | ] 329 | }, 330 | "metadata": {}, 331 | "execution_count": 8 332 | } 333 | ], 334 | "source": [ 335 | "id2label" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": { 342 | "id": "g9tB7iibOoNm" 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "# Define colors for all labels\n", 347 | "get_colors = lambda n: list(map(lambda i: \"#\" + \"%06x\" % random.randint(0, 0xFFFFFF),range(n)))\n", 348 | "colors = get_colors(len(label_list))\n", 349 | "font = ImageFont.load_default()\n", 350 | "label2color = {label: colors[idx] for idx, label in enumerate(label_list)}" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": { 357 | "id": "cW58lqieR8sZ" 358 | }, 359 | "outputs": [], 360 | "source": [ 361 | "# Normalize box diamentions to range 0 to 1000\n", 362 | "def normalized_box(box, image_width=1025, image_height=1025):\n", 363 | " return [\n", 364 | " round(float(box[0] / image_width), 6),\n", 365 | " round(float(box[1] / image_height), 6),\n", 366 | " round(float(box[2] / image_width), 6),\n", 367 | " round(float(box[3] / image_height), 6),\n", 368 | " ]\n", 369 | "\n", 370 | "def convert_box(bbox):\n", 371 | " x, y, w, h = tuple(bbox) # Box coordinates are in (left, top, width, height) format\n", 372 | " return 
[x, y, x+w, y+h] # we need to convert it into (x1, y1, x2, y2) which is (left, top, left+widght, top+height)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": { 379 | "id": "wMSbhg-vSRcw" 380 | }, 381 | "outputs": [], 382 | "source": [ 383 | "example = dataset[\"train\"][0]\n", 384 | "# This function remove duplicate entries from the dataset\n", 385 | "def set_cat_box(example):\n", 386 | " list1_tuples = [tuple(inner_list) for inner_list in example['bboxes_block']]\n", 387 | "\n", 388 | " # Create unique pairs\n", 389 | " unique_pairs = set(zip(list1_tuples, example['categories']))\n", 390 | "\n", 391 | " # Separate the unique pairs back into lists\n", 392 | " result_list1, result_list2 = zip(*unique_pairs)\n", 393 | " return result_list1, result_list2\n", 394 | "\n", 395 | "# set_boxs, set_categories = set_cat_box(example)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "id": "x_34sFXnNbPH" 403 | }, 404 | "outputs": [], 405 | "source": [ 406 | "import pandas as pd\n", 407 | "from tqdm import tqdm\n", 408 | "tqdm.pandas()\n", 409 | "\n", 410 | "train_df = pd.DataFrame(dataset['train'])" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": { 417 | "id": "297s45-RNmIR" 418 | }, 419 | "outputs": [], 420 | "source": [ 421 | "train_df['type'] = 'train'\n", 422 | "all_df = train_df" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": { 429 | "colab": { 430 | "base_uri": "https://localhost:8080/" 431 | }, 432 | "id": "C26yzCDNNt5F", 433 | "outputId": "381399fb-b8cb-4382-f043-5b8372778d51" 434 | }, 435 | "outputs": [ 436 | { 437 | "output_type": "execute_result", 438 | "data": { 439 | "text/plain": [ 440 | "DatasetDict({\n", 441 | " train: Dataset({\n", 442 | " features: ['id', 'texts', 'bboxes_block', 'bboxes_line', 'categories', 'image', 'page_hash', 'original_filename', 'page_no', 'num_pages', 'original_width', 'original_height', 'coco_width', 'coco_height', 'collection', 'doc_category'],\n", 443 | " num_rows: 4\n", 444 | " })\n", 445 | "})" 446 | ] 447 | }, 448 | "metadata": {}, 449 | "execution_count": 16 450 | } 451 | ], 452 | "source": [ 453 | "dataset" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": { 460 | "id": "W7GrLRqoFweg" 461 | }, 462 | "outputs": [], 463 | "source": [ 464 | "## Create proper prompt which has grounding labels and it's location.\n", 465 | "def pre_process_data(example_df):\n", 466 | "\n", 467 | " set_boxs, set_categories = set_cat_box(example_df)\n", 468 | " example_df['float_val'] = [tuple(normalized_box(convert_box(i))) for i in set_boxs]\n", 469 | " example_df['text'] = ' This image is type of ' + example_df['doc_category'] + '. 
It has multiple page layouts ' + \", \".join([\"\" + id2label[i] +\"\" for i in set_categories]) + 'in it.'\n", 470 | "\n", 471 | " # print(encoding)\n", 472 | " return example_df" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": { 479 | "id": "YvzgeNUgNl_X", 480 | "colab": { 481 | "base_uri": "https://localhost:8080/" 482 | }, 483 | "outputId": "fae7d25f-f137-4f0c-f2e2-a4f153b71cda" 484 | }, 485 | "outputs": [ 486 | { 487 | "output_type": "stream", 488 | "name": "stderr", 489 | "text": [ 490 | "100%|██████████| 4/4 [00:00<00:00, 373.47it/s]\n" 491 | ] 492 | } 493 | ], 494 | "source": [ 495 | "all_df = all_df.progress_apply(pre_process_data, axis=1)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": { 502 | "id": "m6BrA0aAlHbC" 503 | }, 504 | "outputs": [], 505 | "source": [ 506 | "import torch\n", 507 | "device = 'cuda' if torch.cuda.is_available() else 'cpu'" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": { 514 | "id": "PYF1XTaENl1_" 515 | }, 516 | "outputs": [], 517 | "source": [ 518 | "from datasets import Dataset\n", 519 | "## process prompt. Note: this will convert bounding box to required text and then convert it to number\n", 520 | "inputs = processor(images = all_df['image'].to_list(), text = all_df['text'].to_list(), bboxes = all_df['float_val'].to_list(), padding=True, truncation= True, return_tensors=\"pt\")\n", 521 | "labels = inputs['input_ids'].clone()\n", 522 | "labels[inputs['input_ids'] == 1] = -100\n", 523 | "inputs['labels'] = labels\n", 524 | "\n", 525 | "dataset = Dataset.from_dict(inputs)\n", 526 | "train_test_split = dataset.train_test_split(test_size=0.3)" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": { 533 | "colab": { 534 | "base_uri": "https://localhost:8080/" 535 | }, 536 | "id": "RxUi6Ug9NlhK", 537 | "outputId": "be42bf88-cdc3-4ec3-df11-d6fc46a0ee14" 538 | }, 539 | "outputs": [ 540 | { 541 | "output_type": "execute_result", 542 | "data": { 543 | "text/plain": [ 544 | "DatasetDict({\n", 545 | " train: Dataset({\n", 546 | " features: ['pixel_values', 'input_ids', 'attention_mask', 'image_embeds_position_mask', 'labels'],\n", 547 | " num_rows: 2\n", 548 | " })\n", 549 | " test: Dataset({\n", 550 | " features: ['pixel_values', 'input_ids', 'attention_mask', 'image_embeds_position_mask', 'labels'],\n", 551 | " num_rows: 2\n", 552 | " })\n", 553 | "})" 554 | ] 555 | }, 556 | "metadata": {}, 557 | "execution_count": 18 558 | } 559 | ], 560 | "source": [ 561 | "train_test_split" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": null, 567 | "metadata": { 568 | "id": "9YNbyTP4o4GC" 569 | }, 570 | "outputs": [], 571 | "source": [ 572 | "train_dataset = train_test_split['train']\n", 573 | "test_dataset = train_test_split['test']" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": { 580 | "id": "wi-QA0eNtoy1" 581 | }, 582 | "outputs": [], 583 | "source": [ 584 | "train_dataset.set_format(\"torch\")\n", 585 | "test_dataset.set_format(\"torch\")" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": null, 591 | "metadata": { 592 | "colab": { 593 | "base_uri": "https://localhost:8080/" 594 | }, 595 | "id": "9Knwkv6cQ1K6", 596 | "outputId": "861ec678-84b5-4914-be84-eb57da0c0367" 597 | }, 598 | "outputs": [ 599 | { 600 | "output_type": "stream", 601 | "name": "stdout", 
602 | "text": [ 603 | "pixel_values torch.Size([3, 224, 224])\n", 604 | "input_ids torch.Size([221])\n", 605 | "attention_mask torch.Size([221])\n", 606 | "image_embeds_position_mask torch.Size([221])\n", 607 | "labels torch.Size([221])\n" 608 | ] 609 | } 610 | ], 611 | "source": [ 612 | "import torch\n", 613 | "\n", 614 | "example = train_test_split['train'][0]\n", 615 | "for k,v in example.items():\n", 616 | " print(k,v.shape)" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "metadata": { 623 | "id": "KNWDM6DmciNC" 624 | }, 625 | "outputs": [], 626 | "source": [ 627 | "from huggingface_hub import notebook_login" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "metadata": { 634 | "colab": { 635 | "base_uri": "https://localhost:8080/", 636 | "height": 145, 637 | "referenced_widgets": [ 638 | "6675760ed4764f2ea3919417116abc53", 639 | "c9c834b4a4a74cae90469d0232e9bcb2", 640 | "257d63ff695043ee8c3299f03df92634", 641 | "14558fa6407b4f1fa9ed108e486eac0c", 642 | "c0bdf76e44e44b1fa04c138c5e383eaa", 643 | "82a33816d1954d26bc0ff6ea3576f513", 644 | "c018269ce79d4437907dea34e202427c", 645 | "7aa20c1dc69f48a792381c01f5071b8d", 646 | "62afc0a26592489ba0ed297bde5d4357", 647 | "1b7101c5d9dd440b846e60aa4c26fb72", 648 | "cef699be72da4aaeb370f71a6ac356a9", 649 | "50f583f944274cb9a49c0362b7acd713", 650 | "55f4ee25f1484576bb33cf476e2aa038", 651 | "85495656479c4323bd62279b8ef544d3", 652 | "afed23dafec44c079cda02c1268e8a6d", 653 | "7f87103d999a40dcaa4efe8d75843da2", 654 | "d89b8da9708744bf92eb71c5b5ca48a5", 655 | "23552beda4c94263bafbe834ed283921", 656 | "a59dc124d8104c0ab29574e281885dd9", 657 | "e2f52d9a90764140bbfcc4c6f6c4589c", 658 | "123e74f8e0a8412f9bbf8b2e918d99d8", 659 | "523f4a3541094d38a8dbc4dd3ef9c928", 660 | "b20db9890b464247a9ef7e4fb1adb732", 661 | "b3984a95687d4b978a25fa153ec0a877", 662 | "a3b62f55344340fa9f0db744ff2c8c70", 663 | "e4ee7889597f4f078504df0a19976c69", 664 | "7a98f305582c46d890a3f599f4dc9cf3", 665 | "803b81733b7e4bafa94100201436ceb9", 666 | "bc2570f343ed47e7bd296162db9e43a7", 667 | "7efc9dc702fd40469875f367a90c126a", 668 | "985f228d52e34ac39be0fd52a2dce7c3", 669 | "7879ea9286004824b34798805700d5b7" 670 | ] 671 | }, 672 | "id": "TPxza5wpcjqA", 673 | "outputId": "83a96fe5-cccb-432a-be84-658a10524f34" 674 | }, 675 | "outputs": [ 676 | { 677 | "output_type": "display_data", 678 | "data": { 679 | "text/plain": [ 680 | "VBox(children=(HTML(value='
" 798 | ], 799 | "text/html": [ 800 | "\n", 801 | "
\n", 802 | " \n", 803 | " \n", 804 | " [1000/1000 16:24, Epoch 1000/1000]\n", 805 | "
\n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | "
Step | Training Loss | Validation Loss
100  | No log        | 1.612360
200  | No log        | 1.566668
300  | 2.393000      | 1.660224
400  | 2.393000      | 1.736173
500  | 0.004200      | 1.764009
600  | 0.004200      | 1.800707
700  | 0.004200      | 1.779289
800  | 0.001600      | 1.784427
900  | 0.001600      | 1.802788
1000 | 0.001000      | 1.804111
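The table above is the Trainer's progress output; the traceback that follows shows the run aborting with `SafetensorError: ... No space left on device` while a checkpoint is being written. The notebook's actual `TrainingArguments` are not visible in this dump, so the sketch below uses placeholder values; the only point is `save_steps`/`save_total_limit`, which bound how much checkpoint data accumulates on a small Colab disk.

```python
from transformers import TrainingArguments

# Minimal sketch with placeholder values (not the notebook's real settings).
# save_total_limit=1 makes the Trainer delete older checkpoints, so only the
# most recent one occupies disk space during training.
training_args = TrainingArguments(
    output_dir="kosmos2-doclaynet-demo",  # placeholder output path
    max_steps=1000,
    per_device_train_batch_size=1,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=1,
)
```

Setting `save_strategy="no"` instead skips intermediate checkpoints entirely, leaving only an explicit `trainer.save_model(...)` call at the end of training to write weights.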

" 867 | ] 868 | }, 869 | "metadata": {} 870 | }, 871 | { 872 | "output_type": "error", 873 | "ename": "SafetensorError", 874 | "evalue": "Error while serializing: IoError(Os { code: 28, kind: StorageFull, message: \"No space left on device\" })", 875 | "traceback": [ 876 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 877 | "\u001b[0;31mSafetensorError\u001b[0m Traceback (most recent call last)", 878 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 879 | "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1769\u001b[0m \u001b[0;31m# Disable progress bars when uploading models during checkpoints to avoid polluting stdout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1770\u001b[0m \u001b[0mhf_hub_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdisable_progress_bars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1771\u001b[0;31m return inner_training_loop(\n\u001b[0m\u001b[1;32m 1772\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1773\u001b[0m \u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 880 | "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 2191\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcallback_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_step_end\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2192\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2193\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_log_save_evaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtr_loss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_norm\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrial\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepoch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_keys_for_eval\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2194\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2195\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcallback_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_substep_end\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 881 | "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_maybe_log_save_evaluate\u001b[0;34m(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 2586\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2587\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshould_save\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2588\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_save_checkpoint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrial\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmetrics\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2589\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcallback_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_save\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2590\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 882 | "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_save_checkpoint\u001b[0;34m(self, model, trial, metrics)\u001b[0m\n\u001b[1;32m 2654\u001b[0m \u001b[0mrun_dir\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_output_dir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrial\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrial\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2655\u001b[0m \u001b[0moutput_dir\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcheckpoint_folder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2656\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_internal_call\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2657\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2658\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_only_model\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 883 | 
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36msave_model\u001b[0;34m(self, output_dir, _internal_call)\u001b[0m\n\u001b[1;32m 3148\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3149\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshould_save\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3150\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_save\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3151\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3152\u001b[0m \u001b[0;31m# Push to the Hub when `save_model` is called by the user.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 884 | "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_save\u001b[0;34m(self, output_dir, state_dict)\u001b[0m\n\u001b[1;32m 3223\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mWEIGHTS_NAME\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3224\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3225\u001b[0;31m self.model.save_pretrained(\n\u001b[0m\u001b[1;32m 3226\u001b[0m \u001b[0moutput_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstate_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msafe_serialization\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_safetensors\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3227\u001b[0m )\n", 885 | "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py\u001b[0m in \u001b[0;36msave_pretrained\u001b[0;34m(self, save_directory, is_main_process, state_dict, save_function, push_to_hub, max_shard_size, safe_serialization, variant, token, save_peft_format, **kwargs)\u001b[0m\n\u001b[1;32m 2466\u001b[0m \u001b[0;31m# At some point we will need to deal better with save_function (used for TPU and other distributed\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2467\u001b[0m \u001b[0;31m# joyfulness), but for now this enough.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2468\u001b[0;31m \u001b[0msafe_save_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mshard\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msave_directory\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshard_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"format\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"pt\"\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2469\u001b[0m 
\u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2470\u001b[0m \u001b[0msave_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mshard\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msave_directory\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshard_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 886 | "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/safetensors/torch.py\u001b[0m in \u001b[0;36msave_file\u001b[0;34m(tensors, filename, metadata)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 280\u001b[0m \"\"\"\n\u001b[0;32m--> 281\u001b[0;31m \u001b[0mserialize_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_flatten\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmetadata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 282\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 283\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 887 | "\u001b[0;31mSafetensorError\u001b[0m: Error while serializing: IoError(Os { code: 28, kind: StorageFull, message: \"No space left on device\" })" 888 | ] 889 | } 890 | ] 891 | }, 892 | { 893 | "cell_type": "code", 894 | "execution_count": null, 895 | "metadata": { 896 | "id": "AXzFb4mDfcnk" 897 | }, 898 | "outputs": [], 899 | "source": [] 900 | }, 901 | { 902 | "cell_type": "code", 903 | "execution_count": null, 904 | "metadata": { 905 | "id": "JzMvoc5xfccW" 906 | }, 907 | "outputs": [], 908 | "source": [] 909 | }, 910 | { 911 | "cell_type": "markdown", 912 | "source": [ 913 | "## Only useful if patch size is different then 224" 914 | ], 915 | "metadata": { 916 | "id": "_0dbr6HkAYPY" 917 | } 918 | }, 919 | { 920 | "cell_type": "code", 921 | "source": [ 922 | "'''\n", 923 | "# Initializing a Kosmos-2 kosmos-2-patch14-224 style configuration\n", 924 | "configuration = Kosmos2Config(\n", 925 | " text_config = {\"max_position_embeddings\" : 2048*2, \"attention_heads\" : 32*4},\n", 926 | " vision_config = {\"image_size\" : 1280, \"patch_size\" : 256}\n", 927 | " )\n", 928 | "# configuration = Kosmos2Config(latent_query_num = 64 * 4)\n", 929 | "# model = Kosmos2ForConditionalGeneration.from_pretrained(\"microsoft/kosmos-2-patch14-224\", config = configuration, ignore_mismatched_sizes=True)\n", 930 | "# num_patches_per_side = 32*math.sqrt(total_tokens_increase_by)\n", 931 | "# total_tokens_increase_by = 64\n", 932 | "# # , num_patch_index_tokens = 1024 * total_tokens_increase_by\n", 933 | "'''" 934 | ], 935 | "metadata": { 936 | "id": "GlAvDEl5AhEM" 937 | }, 938 | "execution_count": null, 939 | "outputs": [] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": null, 944 | "metadata": { 945 | "id": "Kpp8VEnsfcZq" 946 | }, 947 | "outputs": [], 948 | "source": [ 949 | "# copied from https://github.com/microsoft/unilm/blob/97e4923e97d3ee10b57e97013556e3fd0d207a9b/kosmos-2/demo/decode_string.py#L35C1-L75C38\n", 950 | "# (with format modifications)\n", 951 | "def patch_index_to_coordinate(ul_idx: int, lr_idx: int, num_patches_per_side: int):\n", 952 | " # Compute the size of each cell in the 
grid\n", 953 | " cell_size = 1.0 / num_patches_per_side\n", 954 | "\n", 955 | " # Compute the x and y indices of the upper-left and lower-right corners of the bounding box\n", 956 | " ul_x = ul_idx % num_patches_per_side\n", 957 | " ul_y = ul_idx // num_patches_per_side\n", 958 | "\n", 959 | " lr_x = lr_idx % num_patches_per_side\n", 960 | " lr_y = lr_idx // num_patches_per_side\n", 961 | "\n", 962 | " # Compute the normalized coordinates of the bounding box\n", 963 | " if ul_idx == lr_idx:\n", 964 | " x1 = ul_x * cell_size\n", 965 | " y1 = ul_y * cell_size\n", 966 | " x2 = lr_x * cell_size + cell_size\n", 967 | " y2 = lr_y * cell_size + cell_size\n", 968 | " elif ul_x == lr_x or ul_y == lr_y:\n", 969 | " x1 = ul_x * cell_size\n", 970 | " y1 = ul_y * cell_size\n", 971 | " x2 = lr_x * cell_size + cell_size\n", 972 | " y2 = lr_y * cell_size + cell_size\n", 973 | " else:\n", 974 | " x1 = ul_x * cell_size + cell_size / 2\n", 975 | " y1 = ul_y * cell_size + cell_size / 2\n", 976 | " x2 = lr_x * cell_size + cell_size / 2\n", 977 | " y2 = lr_y * cell_size + cell_size / 2\n", 978 | "\n", 979 | " return x1, y1, x2, y2\n", 980 | "\n", 981 | "\n", 982 | "# copied from https://github.com/microsoft/unilm/blob/97e4923e97d3ee10b57e97013556e3fd0d207a9b/kosmos-2/demo/decode_string.py#L4-L33\n", 983 | "# (with format modifications)\n", 984 | "def extract_entities_with_patch_indices(text):\n", 985 | " # The regular expression pattern for matching the required formats\n", 986 | " pattern = r\"(?:(([^<]+)))?((?:)*)\"\n", 987 | "\n", 988 | " # Find all matches in the given string\n", 989 | " matches = re.finditer(pattern, text)\n", 990 | "\n", 991 | " # Initialize an empty list to store the valid patch_index combinations\n", 992 | " entities_with_patch_indices = []\n", 993 | "\n", 994 | " for match in matches:\n", 995 | " # span of a `phrase` that is between and \n", 996 | " span = match.span(2)\n", 997 | " phrase_tag, phrase, match_content = match.groups()\n", 998 | " if not phrase_tag:\n", 999 | " phrase = None\n", 1000 | " # We take the starting position of ``\n", 1001 | " span = (match.span(0)[0], match.span(0)[0])\n", 1002 | "\n", 1003 | " # Split the match_content by the delimiter to get individual patch_index pairs\n", 1004 | " patch_index_pairs = match_content.split(\"\")\n", 1005 | "\n", 1006 | " entity_bboxes = []\n", 1007 | " for pair in patch_index_pairs:\n", 1008 | " # Extract the xxxx and yyyy values from the patch_index pair\n", 1009 | " x = re.search(r\"\", pair)\n", 1010 | " y = re.search(r\"\", pair[1:])\n", 1011 | "\n", 1012 | " if x and y:\n", 1013 | " if phrase:\n", 1014 | " entity_bboxes.append((int(x.group(1)), int(y.group(1))))\n", 1015 | " else:\n", 1016 | " entity_bboxes.append((int(x.group(1)), int(y.group(1))))\n", 1017 | "\n", 1018 | " if phrase:\n", 1019 | " entities_with_patch_indices.append((phrase, span, entity_bboxes))\n", 1020 | " else:\n", 1021 | " for bbox in entity_bboxes:\n", 1022 | " # fake entity name\n", 1023 | " entity = f\"\"\n", 1024 | " entities_with_patch_indices.append((entity, span, [bbox]))\n", 1025 | "\n", 1026 | " return entities_with_patch_indices\n", 1027 | "\n", 1028 | "\n", 1029 | "def adjust_entity_positions(entity, text):\n", 1030 | " \"\"\"Adjust the positions of the entities in `text` to be relative to the text with special fields removed.\"\"\"\n", 1031 | " entity_name, (start, end) = entity\n", 1032 | " # computed the length of strings with special fields (tag tokens, patch index tokens, etc.) 
removed\n", 1033 | " adjusted_start = len(re.sub(\"<.*?>\", \"\", text[:start]))\n", 1034 | " adjusted_end = len(re.sub(\"<.*?>\", \"\", text[:end]))\n", 1035 | " adjusted_entity = (entity_name, (adjusted_start, adjusted_end))\n", 1036 | " return adjusted_entity\n", 1037 | "\n", 1038 | "\n", 1039 | "def _cleanup_spaces(text, entities):\n", 1040 | " \"\"\"Remove the spaces around the text and the entities in it.\"\"\"\n", 1041 | " new_text = text.strip()\n", 1042 | " leading_spaces = len(text) - len(text.lstrip())\n", 1043 | "\n", 1044 | " new_entities = []\n", 1045 | " for entity_name, (start, end), bboxes in entities:\n", 1046 | " entity_name_leading_spaces = len(entity_name) - len(entity_name.lstrip())\n", 1047 | " entity_name_trailing_spaces = len(entity_name) - len(entity_name.rstrip())\n", 1048 | "\n", 1049 | " start = start - leading_spaces + entity_name_leading_spaces\n", 1050 | " end = end - leading_spaces - entity_name_trailing_spaces\n", 1051 | " entity_name = entity_name.strip()\n", 1052 | "\n", 1053 | " new_entities.append((entity_name, (start, end), bboxes))\n", 1054 | "\n", 1055 | " return new_text, new_entities\n", 1056 | "\n", 1057 | "\n", 1058 | "# copied from https://github.com/microsoft/unilm/blob/97e4923e97d3ee10b57e97013556e3fd0d207a9b/kosmos-2/demo/decode_string.py#L77-L87\n", 1059 | "# (with format modifications)\n", 1060 | "def clean_text_and_extract_entities_with_bboxes(text, num_patches_per_side=32):\n", 1061 | " # remove special fields (tag tokens, patch index tokens, etc.)\n", 1062 | " processed_text = re.sub(\"<.*?>\", \"\", text)\n", 1063 | "\n", 1064 | " entities_with_patch_indices = extract_entities_with_patch_indices(text)\n", 1065 | " entities = []\n", 1066 | " for item in entities_with_patch_indices:\n", 1067 | " entity, bboxes = item[0:2], item[2]\n", 1068 | " adjusted_entity = adjust_entity_positions(entity, text)\n", 1069 | " bboxes_in_coords = [patch_index_to_coordinate(bbox[0], bbox[1], num_patches_per_side) for bbox in bboxes]\n", 1070 | "\n", 1071 | " entities.append(adjusted_entity + (bboxes_in_coords,))\n", 1072 | "\n", 1073 | " return _cleanup_spaces(processed_text, entities)" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "code", 1078 | "source": [], 1079 | "metadata": { 1080 | "id": "jc9bMQNnAcli" 1081 | }, 1082 | "execution_count": null, 1083 | "outputs": [] 1084 | } 1085 | ], 1086 | "metadata": { 1087 | "accelerator": "GPU", 1088 | "colab": { 1089 | "gpuType": "T4", 1090 | "provenance": [], 1091 | "include_colab_link": true 1092 | }, 1093 | "kernelspec": { 1094 | "display_name": "Python 3", 1095 | "name": "python3" 1096 | }, 1097 | "language_info": { 1098 | "name": "python" 1099 | }, 1100 | "widgets": { 1101 | "application/vnd.jupyter.widget-state+json": { 1102 | "6675760ed4764f2ea3919417116abc53": { 1103 | "model_module": "@jupyter-widgets/controls", 1104 | "model_name": "VBoxModel", 1105 | "model_module_version": "1.5.0", 1106 | "state": { 1107 | "_dom_classes": [], 1108 | "_model_module": "@jupyter-widgets/controls", 1109 | "_model_module_version": "1.5.0", 1110 | "_model_name": "VBoxModel", 1111 | "_view_count": null, 1112 | "_view_module": "@jupyter-widgets/controls", 1113 | "_view_module_version": "1.5.0", 1114 | "_view_name": "VBoxView", 1115 | "box_style": "", 1116 | "children": [ 1117 | "IPY_MODEL_123e74f8e0a8412f9bbf8b2e918d99d8", 1118 | "IPY_MODEL_523f4a3541094d38a8dbc4dd3ef9c928", 1119 | "IPY_MODEL_b20db9890b464247a9ef7e4fb1adb732", 1120 | "IPY_MODEL_b3984a95687d4b978a25fa153ec0a877" 1121 | ], 1122 | "layout": 
"IPY_MODEL_c018269ce79d4437907dea34e202427c" 1123 | } 1124 | }, 1125 | "c9c834b4a4a74cae90469d0232e9bcb2": { 1126 | "model_module": "@jupyter-widgets/controls", 1127 | "model_name": "HTMLModel", 1128 | "model_module_version": "1.5.0", 1129 | "state": { 1130 | "_dom_classes": [], 1131 | "_model_module": "@jupyter-widgets/controls", 1132 | "_model_module_version": "1.5.0", 1133 | "_model_name": "HTMLModel", 1134 | "_view_count": null, 1135 | "_view_module": "@jupyter-widgets/controls", 1136 | "_view_module_version": "1.5.0", 1137 | "_view_name": "HTMLView", 1138 | "description": "", 1139 | "description_tooltip": null, 1140 | "layout": "IPY_MODEL_7aa20c1dc69f48a792381c01f5071b8d", 1141 | "placeholder": "​", 1142 | "style": "IPY_MODEL_62afc0a26592489ba0ed297bde5d4357", 1143 | "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" 1144 | } 1145 | }, 1146 | "257d63ff695043ee8c3299f03df92634": { 1147 | "model_module": "@jupyter-widgets/controls", 1148 | "model_name": "PasswordModel", 1149 | "model_module_version": "1.5.0", 1150 | "state": { 1151 | "_dom_classes": [], 1152 | "_model_module": "@jupyter-widgets/controls", 1153 | "_model_module_version": "1.5.0", 1154 | "_model_name": "PasswordModel", 1155 | "_view_count": null, 1156 | "_view_module": "@jupyter-widgets/controls", 1157 | "_view_module_version": "1.5.0", 1158 | "_view_name": "PasswordView", 1159 | "continuous_update": true, 1160 | "description": "Token:", 1161 | "description_tooltip": null, 1162 | "disabled": false, 1163 | "layout": "IPY_MODEL_1b7101c5d9dd440b846e60aa4c26fb72", 1164 | "placeholder": "​", 1165 | "style": "IPY_MODEL_cef699be72da4aaeb370f71a6ac356a9", 1166 | "value": "" 1167 | } 1168 | }, 1169 | "14558fa6407b4f1fa9ed108e486eac0c": { 1170 | "model_module": "@jupyter-widgets/controls", 1171 | "model_name": "CheckboxModel", 1172 | "model_module_version": "1.5.0", 1173 | "state": { 1174 | "_dom_classes": [], 1175 | "_model_module": "@jupyter-widgets/controls", 1176 | "_model_module_version": "1.5.0", 1177 | "_model_name": "CheckboxModel", 1178 | "_view_count": null, 1179 | "_view_module": "@jupyter-widgets/controls", 1180 | "_view_module_version": "1.5.0", 1181 | "_view_name": "CheckboxView", 1182 | "description": "Add token as git credential?", 1183 | "description_tooltip": null, 1184 | "disabled": false, 1185 | "indent": true, 1186 | "layout": "IPY_MODEL_50f583f944274cb9a49c0362b7acd713", 1187 | "style": "IPY_MODEL_55f4ee25f1484576bb33cf476e2aa038", 1188 | "value": true 1189 | } 1190 | }, 1191 | "c0bdf76e44e44b1fa04c138c5e383eaa": { 1192 | "model_module": "@jupyter-widgets/controls", 1193 | "model_name": "ButtonModel", 1194 | "model_module_version": "1.5.0", 1195 | "state": { 1196 | "_dom_classes": [], 1197 | "_model_module": "@jupyter-widgets/controls", 1198 | "_model_module_version": "1.5.0", 1199 | "_model_name": "ButtonModel", 1200 | "_view_count": null, 1201 | "_view_module": "@jupyter-widgets/controls", 1202 | "_view_module_version": "1.5.0", 1203 | "_view_name": "ButtonView", 1204 | "button_style": "", 1205 | "description": "Login", 1206 | "disabled": false, 1207 | "icon": "", 1208 | "layout": "IPY_MODEL_85495656479c4323bd62279b8ef544d3", 1209 | "style": "IPY_MODEL_afed23dafec44c079cda02c1268e8a6d", 1210 | "tooltip": "" 1211 | } 1212 | }, 1213 | "82a33816d1954d26bc0ff6ea3576f513": { 1214 | "model_module": "@jupyter-widgets/controls", 1215 | "model_name": "HTMLModel", 1216 | "model_module_version": "1.5.0", 1217 | "state": { 1218 | "_dom_classes": [], 1219 | "_model_module": "@jupyter-widgets/controls", 1220 | "_model_module_version": "1.5.0", 1221 | "_model_name": "HTMLModel", 1222 | "_view_count": null, 1223 | "_view_module": "@jupyter-widgets/controls", 1224 | "_view_module_version": "1.5.0", 1225 | "_view_name": "HTMLView", 1226 | "description": "", 1227 | "description_tooltip": null, 1228 | "layout": "IPY_MODEL_7f87103d999a40dcaa4efe8d75843da2", 1229 | "placeholder": "​", 1230 | "style": "IPY_MODEL_d89b8da9708744bf92eb71c5b5ca48a5", 1231 | "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. 
" 1232 | } 1233 | }, 1234 | "c018269ce79d4437907dea34e202427c": { 1235 | "model_module": "@jupyter-widgets/base", 1236 | "model_name": "LayoutModel", 1237 | "model_module_version": "1.2.0", 1238 | "state": { 1239 | "_model_module": "@jupyter-widgets/base", 1240 | "_model_module_version": "1.2.0", 1241 | "_model_name": "LayoutModel", 1242 | "_view_count": null, 1243 | "_view_module": "@jupyter-widgets/base", 1244 | "_view_module_version": "1.2.0", 1245 | "_view_name": "LayoutView", 1246 | "align_content": null, 1247 | "align_items": "center", 1248 | "align_self": null, 1249 | "border": null, 1250 | "bottom": null, 1251 | "display": "flex", 1252 | "flex": null, 1253 | "flex_flow": "column", 1254 | "grid_area": null, 1255 | "grid_auto_columns": null, 1256 | "grid_auto_flow": null, 1257 | "grid_auto_rows": null, 1258 | "grid_column": null, 1259 | "grid_gap": null, 1260 | "grid_row": null, 1261 | "grid_template_areas": null, 1262 | "grid_template_columns": null, 1263 | "grid_template_rows": null, 1264 | "height": null, 1265 | "justify_content": null, 1266 | "justify_items": null, 1267 | "left": null, 1268 | "margin": null, 1269 | "max_height": null, 1270 | "max_width": null, 1271 | "min_height": null, 1272 | "min_width": null, 1273 | "object_fit": null, 1274 | "object_position": null, 1275 | "order": null, 1276 | "overflow": null, 1277 | "overflow_x": null, 1278 | "overflow_y": null, 1279 | "padding": null, 1280 | "right": null, 1281 | "top": null, 1282 | "visibility": null, 1283 | "width": "50%" 1284 | } 1285 | }, 1286 | "7aa20c1dc69f48a792381c01f5071b8d": { 1287 | "model_module": "@jupyter-widgets/base", 1288 | "model_name": "LayoutModel", 1289 | "model_module_version": "1.2.0", 1290 | "state": { 1291 | "_model_module": "@jupyter-widgets/base", 1292 | "_model_module_version": "1.2.0", 1293 | "_model_name": "LayoutModel", 1294 | "_view_count": null, 1295 | "_view_module": "@jupyter-widgets/base", 1296 | "_view_module_version": "1.2.0", 1297 | "_view_name": "LayoutView", 1298 | "align_content": null, 1299 | "align_items": null, 1300 | "align_self": null, 1301 | "border": null, 1302 | "bottom": null, 1303 | "display": null, 1304 | "flex": null, 1305 | "flex_flow": null, 1306 | "grid_area": null, 1307 | "grid_auto_columns": null, 1308 | "grid_auto_flow": null, 1309 | "grid_auto_rows": null, 1310 | "grid_column": null, 1311 | "grid_gap": null, 1312 | "grid_row": null, 1313 | "grid_template_areas": null, 1314 | "grid_template_columns": null, 1315 | "grid_template_rows": null, 1316 | "height": null, 1317 | "justify_content": null, 1318 | "justify_items": null, 1319 | "left": null, 1320 | "margin": null, 1321 | "max_height": null, 1322 | "max_width": null, 1323 | "min_height": null, 1324 | "min_width": null, 1325 | "object_fit": null, 1326 | "object_position": null, 1327 | "order": null, 1328 | "overflow": null, 1329 | "overflow_x": null, 1330 | "overflow_y": null, 1331 | "padding": null, 1332 | "right": null, 1333 | "top": null, 1334 | "visibility": null, 1335 | "width": null 1336 | } 1337 | }, 1338 | "62afc0a26592489ba0ed297bde5d4357": { 1339 | "model_module": "@jupyter-widgets/controls", 1340 | "model_name": "DescriptionStyleModel", 1341 | "model_module_version": "1.5.0", 1342 | "state": { 1343 | "_model_module": "@jupyter-widgets/controls", 1344 | "_model_module_version": "1.5.0", 1345 | "_model_name": "DescriptionStyleModel", 1346 | "_view_count": null, 1347 | "_view_module": "@jupyter-widgets/base", 1348 | "_view_module_version": "1.2.0", 1349 | "_view_name": "StyleView", 1350 | 
"description_width": "" 1351 | } 1352 | }, 1353 | "1b7101c5d9dd440b846e60aa4c26fb72": { 1354 | "model_module": "@jupyter-widgets/base", 1355 | "model_name": "LayoutModel", 1356 | "model_module_version": "1.2.0", 1357 | "state": { 1358 | "_model_module": "@jupyter-widgets/base", 1359 | "_model_module_version": "1.2.0", 1360 | "_model_name": "LayoutModel", 1361 | "_view_count": null, 1362 | "_view_module": "@jupyter-widgets/base", 1363 | "_view_module_version": "1.2.0", 1364 | "_view_name": "LayoutView", 1365 | "align_content": null, 1366 | "align_items": null, 1367 | "align_self": null, 1368 | "border": null, 1369 | "bottom": null, 1370 | "display": null, 1371 | "flex": null, 1372 | "flex_flow": null, 1373 | "grid_area": null, 1374 | "grid_auto_columns": null, 1375 | "grid_auto_flow": null, 1376 | "grid_auto_rows": null, 1377 | "grid_column": null, 1378 | "grid_gap": null, 1379 | "grid_row": null, 1380 | "grid_template_areas": null, 1381 | "grid_template_columns": null, 1382 | "grid_template_rows": null, 1383 | "height": null, 1384 | "justify_content": null, 1385 | "justify_items": null, 1386 | "left": null, 1387 | "margin": null, 1388 | "max_height": null, 1389 | "max_width": null, 1390 | "min_height": null, 1391 | "min_width": null, 1392 | "object_fit": null, 1393 | "object_position": null, 1394 | "order": null, 1395 | "overflow": null, 1396 | "overflow_x": null, 1397 | "overflow_y": null, 1398 | "padding": null, 1399 | "right": null, 1400 | "top": null, 1401 | "visibility": null, 1402 | "width": null 1403 | } 1404 | }, 1405 | "cef699be72da4aaeb370f71a6ac356a9": { 1406 | "model_module": "@jupyter-widgets/controls", 1407 | "model_name": "DescriptionStyleModel", 1408 | "model_module_version": "1.5.0", 1409 | "state": { 1410 | "_model_module": "@jupyter-widgets/controls", 1411 | "_model_module_version": "1.5.0", 1412 | "_model_name": "DescriptionStyleModel", 1413 | "_view_count": null, 1414 | "_view_module": "@jupyter-widgets/base", 1415 | "_view_module_version": "1.2.0", 1416 | "_view_name": "StyleView", 1417 | "description_width": "" 1418 | } 1419 | }, 1420 | "50f583f944274cb9a49c0362b7acd713": { 1421 | "model_module": "@jupyter-widgets/base", 1422 | "model_name": "LayoutModel", 1423 | "model_module_version": "1.2.0", 1424 | "state": { 1425 | "_model_module": "@jupyter-widgets/base", 1426 | "_model_module_version": "1.2.0", 1427 | "_model_name": "LayoutModel", 1428 | "_view_count": null, 1429 | "_view_module": "@jupyter-widgets/base", 1430 | "_view_module_version": "1.2.0", 1431 | "_view_name": "LayoutView", 1432 | "align_content": null, 1433 | "align_items": null, 1434 | "align_self": null, 1435 | "border": null, 1436 | "bottom": null, 1437 | "display": null, 1438 | "flex": null, 1439 | "flex_flow": null, 1440 | "grid_area": null, 1441 | "grid_auto_columns": null, 1442 | "grid_auto_flow": null, 1443 | "grid_auto_rows": null, 1444 | "grid_column": null, 1445 | "grid_gap": null, 1446 | "grid_row": null, 1447 | "grid_template_areas": null, 1448 | "grid_template_columns": null, 1449 | "grid_template_rows": null, 1450 | "height": null, 1451 | "justify_content": null, 1452 | "justify_items": null, 1453 | "left": null, 1454 | "margin": null, 1455 | "max_height": null, 1456 | "max_width": null, 1457 | "min_height": null, 1458 | "min_width": null, 1459 | "object_fit": null, 1460 | "object_position": null, 1461 | "order": null, 1462 | "overflow": null, 1463 | "overflow_x": null, 1464 | "overflow_y": null, 1465 | "padding": null, 1466 | "right": null, 1467 | "top": null, 1468 | "visibility": null, 
1469 | "width": null 1470 | } 1471 | }, 1472 | "55f4ee25f1484576bb33cf476e2aa038": { 1473 | "model_module": "@jupyter-widgets/controls", 1474 | "model_name": "DescriptionStyleModel", 1475 | "model_module_version": "1.5.0", 1476 | "state": { 1477 | "_model_module": "@jupyter-widgets/controls", 1478 | "_model_module_version": "1.5.0", 1479 | "_model_name": "DescriptionStyleModel", 1480 | "_view_count": null, 1481 | "_view_module": "@jupyter-widgets/base", 1482 | "_view_module_version": "1.2.0", 1483 | "_view_name": "StyleView", 1484 | "description_width": "" 1485 | } 1486 | }, 1487 | "85495656479c4323bd62279b8ef544d3": { 1488 | "model_module": "@jupyter-widgets/base", 1489 | "model_name": "LayoutModel", 1490 | "model_module_version": "1.2.0", 1491 | "state": { 1492 | "_model_module": "@jupyter-widgets/base", 1493 | "_model_module_version": "1.2.0", 1494 | "_model_name": "LayoutModel", 1495 | "_view_count": null, 1496 | "_view_module": "@jupyter-widgets/base", 1497 | "_view_module_version": "1.2.0", 1498 | "_view_name": "LayoutView", 1499 | "align_content": null, 1500 | "align_items": null, 1501 | "align_self": null, 1502 | "border": null, 1503 | "bottom": null, 1504 | "display": null, 1505 | "flex": null, 1506 | "flex_flow": null, 1507 | "grid_area": null, 1508 | "grid_auto_columns": null, 1509 | "grid_auto_flow": null, 1510 | "grid_auto_rows": null, 1511 | "grid_column": null, 1512 | "grid_gap": null, 1513 | "grid_row": null, 1514 | "grid_template_areas": null, 1515 | "grid_template_columns": null, 1516 | "grid_template_rows": null, 1517 | "height": null, 1518 | "justify_content": null, 1519 | "justify_items": null, 1520 | "left": null, 1521 | "margin": null, 1522 | "max_height": null, 1523 | "max_width": null, 1524 | "min_height": null, 1525 | "min_width": null, 1526 | "object_fit": null, 1527 | "object_position": null, 1528 | "order": null, 1529 | "overflow": null, 1530 | "overflow_x": null, 1531 | "overflow_y": null, 1532 | "padding": null, 1533 | "right": null, 1534 | "top": null, 1535 | "visibility": null, 1536 | "width": null 1537 | } 1538 | }, 1539 | "afed23dafec44c079cda02c1268e8a6d": { 1540 | "model_module": "@jupyter-widgets/controls", 1541 | "model_name": "ButtonStyleModel", 1542 | "model_module_version": "1.5.0", 1543 | "state": { 1544 | "_model_module": "@jupyter-widgets/controls", 1545 | "_model_module_version": "1.5.0", 1546 | "_model_name": "ButtonStyleModel", 1547 | "_view_count": null, 1548 | "_view_module": "@jupyter-widgets/base", 1549 | "_view_module_version": "1.2.0", 1550 | "_view_name": "StyleView", 1551 | "button_color": null, 1552 | "font_weight": "" 1553 | } 1554 | }, 1555 | "7f87103d999a40dcaa4efe8d75843da2": { 1556 | "model_module": "@jupyter-widgets/base", 1557 | "model_name": "LayoutModel", 1558 | "model_module_version": "1.2.0", 1559 | "state": { 1560 | "_model_module": "@jupyter-widgets/base", 1561 | "_model_module_version": "1.2.0", 1562 | "_model_name": "LayoutModel", 1563 | "_view_count": null, 1564 | "_view_module": "@jupyter-widgets/base", 1565 | "_view_module_version": "1.2.0", 1566 | "_view_name": "LayoutView", 1567 | "align_content": null, 1568 | "align_items": null, 1569 | "align_self": null, 1570 | "border": null, 1571 | "bottom": null, 1572 | "display": null, 1573 | "flex": null, 1574 | "flex_flow": null, 1575 | "grid_area": null, 1576 | "grid_auto_columns": null, 1577 | "grid_auto_flow": null, 1578 | "grid_auto_rows": null, 1579 | "grid_column": null, 1580 | "grid_gap": null, 1581 | "grid_row": null, 1582 | "grid_template_areas": null, 1583 | 
"grid_template_columns": null, 1584 | "grid_template_rows": null, 1585 | "height": null, 1586 | "justify_content": null, 1587 | "justify_items": null, 1588 | "left": null, 1589 | "margin": null, 1590 | "max_height": null, 1591 | "max_width": null, 1592 | "min_height": null, 1593 | "min_width": null, 1594 | "object_fit": null, 1595 | "object_position": null, 1596 | "order": null, 1597 | "overflow": null, 1598 | "overflow_x": null, 1599 | "overflow_y": null, 1600 | "padding": null, 1601 | "right": null, 1602 | "top": null, 1603 | "visibility": null, 1604 | "width": null 1605 | } 1606 | }, 1607 | "d89b8da9708744bf92eb71c5b5ca48a5": { 1608 | "model_module": "@jupyter-widgets/controls", 1609 | "model_name": "DescriptionStyleModel", 1610 | "model_module_version": "1.5.0", 1611 | "state": { 1612 | "_model_module": "@jupyter-widgets/controls", 1613 | "_model_module_version": "1.5.0", 1614 | "_model_name": "DescriptionStyleModel", 1615 | "_view_count": null, 1616 | "_view_module": "@jupyter-widgets/base", 1617 | "_view_module_version": "1.2.0", 1618 | "_view_name": "StyleView", 1619 | "description_width": "" 1620 | } 1621 | }, 1622 | "23552beda4c94263bafbe834ed283921": { 1623 | "model_module": "@jupyter-widgets/controls", 1624 | "model_name": "LabelModel", 1625 | "model_module_version": "1.5.0", 1626 | "state": { 1627 | "_dom_classes": [], 1628 | "_model_module": "@jupyter-widgets/controls", 1629 | "_model_module_version": "1.5.0", 1630 | "_model_name": "LabelModel", 1631 | "_view_count": null, 1632 | "_view_module": "@jupyter-widgets/controls", 1633 | "_view_module_version": "1.5.0", 1634 | "_view_name": "LabelView", 1635 | "description": "", 1636 | "description_tooltip": null, 1637 | "layout": "IPY_MODEL_a59dc124d8104c0ab29574e281885dd9", 1638 | "placeholder": "​", 1639 | "style": "IPY_MODEL_e2f52d9a90764140bbfcc4c6f6c4589c", 1640 | "value": "Connecting..." 
1641 | } 1642 | }, 1643 | "a59dc124d8104c0ab29574e281885dd9": { 1644 | "model_module": "@jupyter-widgets/base", 1645 | "model_name": "LayoutModel", 1646 | "model_module_version": "1.2.0", 1647 | "state": { 1648 | "_model_module": "@jupyter-widgets/base", 1649 | "_model_module_version": "1.2.0", 1650 | "_model_name": "LayoutModel", 1651 | "_view_count": null, 1652 | "_view_module": "@jupyter-widgets/base", 1653 | "_view_module_version": "1.2.0", 1654 | "_view_name": "LayoutView", 1655 | "align_content": null, 1656 | "align_items": null, 1657 | "align_self": null, 1658 | "border": null, 1659 | "bottom": null, 1660 | "display": null, 1661 | "flex": null, 1662 | "flex_flow": null, 1663 | "grid_area": null, 1664 | "grid_auto_columns": null, 1665 | "grid_auto_flow": null, 1666 | "grid_auto_rows": null, 1667 | "grid_column": null, 1668 | "grid_gap": null, 1669 | "grid_row": null, 1670 | "grid_template_areas": null, 1671 | "grid_template_columns": null, 1672 | "grid_template_rows": null, 1673 | "height": null, 1674 | "justify_content": null, 1675 | "justify_items": null, 1676 | "left": null, 1677 | "margin": null, 1678 | "max_height": null, 1679 | "max_width": null, 1680 | "min_height": null, 1681 | "min_width": null, 1682 | "object_fit": null, 1683 | "object_position": null, 1684 | "order": null, 1685 | "overflow": null, 1686 | "overflow_x": null, 1687 | "overflow_y": null, 1688 | "padding": null, 1689 | "right": null, 1690 | "top": null, 1691 | "visibility": null, 1692 | "width": null 1693 | } 1694 | }, 1695 | "e2f52d9a90764140bbfcc4c6f6c4589c": { 1696 | "model_module": "@jupyter-widgets/controls", 1697 | "model_name": "DescriptionStyleModel", 1698 | "model_module_version": "1.5.0", 1699 | "state": { 1700 | "_model_module": "@jupyter-widgets/controls", 1701 | "_model_module_version": "1.5.0", 1702 | "_model_name": "DescriptionStyleModel", 1703 | "_view_count": null, 1704 | "_view_module": "@jupyter-widgets/base", 1705 | "_view_module_version": "1.2.0", 1706 | "_view_name": "StyleView", 1707 | "description_width": "" 1708 | } 1709 | }, 1710 | "123e74f8e0a8412f9bbf8b2e918d99d8": { 1711 | "model_module": "@jupyter-widgets/controls", 1712 | "model_name": "LabelModel", 1713 | "model_module_version": "1.5.0", 1714 | "state": { 1715 | "_dom_classes": [], 1716 | "_model_module": "@jupyter-widgets/controls", 1717 | "_model_module_version": "1.5.0", 1718 | "_model_name": "LabelModel", 1719 | "_view_count": null, 1720 | "_view_module": "@jupyter-widgets/controls", 1721 | "_view_module_version": "1.5.0", 1722 | "_view_name": "LabelView", 1723 | "description": "", 1724 | "description_tooltip": null, 1725 | "layout": "IPY_MODEL_a3b62f55344340fa9f0db744ff2c8c70", 1726 | "placeholder": "​", 1727 | "style": "IPY_MODEL_e4ee7889597f4f078504df0a19976c69", 1728 | "value": "Token is valid (permission: write)." 
1729 | } 1730 | }, 1731 | "523f4a3541094d38a8dbc4dd3ef9c928": { 1732 | "model_module": "@jupyter-widgets/controls", 1733 | "model_name": "LabelModel", 1734 | "model_module_version": "1.5.0", 1735 | "state": { 1736 | "_dom_classes": [], 1737 | "_model_module": "@jupyter-widgets/controls", 1738 | "_model_module_version": "1.5.0", 1739 | "_model_name": "LabelModel", 1740 | "_view_count": null, 1741 | "_view_module": "@jupyter-widgets/controls", 1742 | "_view_module_version": "1.5.0", 1743 | "_view_name": "LabelView", 1744 | "description": "", 1745 | "description_tooltip": null, 1746 | "layout": "IPY_MODEL_7a98f305582c46d890a3f599f4dc9cf3", 1747 | "placeholder": "​", 1748 | "style": "IPY_MODEL_803b81733b7e4bafa94100201436ceb9", 1749 | "value": "Your token has been saved in your configured git credential helpers (store)." 1750 | } 1751 | }, 1752 | "b20db9890b464247a9ef7e4fb1adb732": { 1753 | "model_module": "@jupyter-widgets/controls", 1754 | "model_name": "LabelModel", 1755 | "model_module_version": "1.5.0", 1756 | "state": { 1757 | "_dom_classes": [], 1758 | "_model_module": "@jupyter-widgets/controls", 1759 | "_model_module_version": "1.5.0", 1760 | "_model_name": "LabelModel", 1761 | "_view_count": null, 1762 | "_view_module": "@jupyter-widgets/controls", 1763 | "_view_module_version": "1.5.0", 1764 | "_view_name": "LabelView", 1765 | "description": "", 1766 | "description_tooltip": null, 1767 | "layout": "IPY_MODEL_bc2570f343ed47e7bd296162db9e43a7", 1768 | "placeholder": "​", 1769 | "style": "IPY_MODEL_7efc9dc702fd40469875f367a90c126a", 1770 | "value": "Your token has been saved to /root/.cache/huggingface/token" 1771 | } 1772 | }, 1773 | "b3984a95687d4b978a25fa153ec0a877": { 1774 | "model_module": "@jupyter-widgets/controls", 1775 | "model_name": "LabelModel", 1776 | "model_module_version": "1.5.0", 1777 | "state": { 1778 | "_dom_classes": [], 1779 | "_model_module": "@jupyter-widgets/controls", 1780 | "_model_module_version": "1.5.0", 1781 | "_model_name": "LabelModel", 1782 | "_view_count": null, 1783 | "_view_module": "@jupyter-widgets/controls", 1784 | "_view_module_version": "1.5.0", 1785 | "_view_name": "LabelView", 1786 | "description": "", 1787 | "description_tooltip": null, 1788 | "layout": "IPY_MODEL_985f228d52e34ac39be0fd52a2dce7c3", 1789 | "placeholder": "​", 1790 | "style": "IPY_MODEL_7879ea9286004824b34798805700d5b7", 1791 | "value": "Login successful" 1792 | } 1793 | }, 1794 | "a3b62f55344340fa9f0db744ff2c8c70": { 1795 | "model_module": "@jupyter-widgets/base", 1796 | "model_name": "LayoutModel", 1797 | "model_module_version": "1.2.0", 1798 | "state": { 1799 | "_model_module": "@jupyter-widgets/base", 1800 | "_model_module_version": "1.2.0", 1801 | "_model_name": "LayoutModel", 1802 | "_view_count": null, 1803 | "_view_module": "@jupyter-widgets/base", 1804 | "_view_module_version": "1.2.0", 1805 | "_view_name": "LayoutView", 1806 | "align_content": null, 1807 | "align_items": null, 1808 | "align_self": null, 1809 | "border": null, 1810 | "bottom": null, 1811 | "display": null, 1812 | "flex": null, 1813 | "flex_flow": null, 1814 | "grid_area": null, 1815 | "grid_auto_columns": null, 1816 | "grid_auto_flow": null, 1817 | "grid_auto_rows": null, 1818 | "grid_column": null, 1819 | "grid_gap": null, 1820 | "grid_row": null, 1821 | "grid_template_areas": null, 1822 | "grid_template_columns": null, 1823 | "grid_template_rows": null, 1824 | "height": null, 1825 | "justify_content": null, 1826 | "justify_items": null, 1827 | "left": null, 1828 | "margin": null, 1829 | "max_height": 
null, 1830 | "max_width": null, 1831 | "min_height": null, 1832 | "min_width": null, 1833 | "object_fit": null, 1834 | "object_position": null, 1835 | "order": null, 1836 | "overflow": null, 1837 | "overflow_x": null, 1838 | "overflow_y": null, 1839 | "padding": null, 1840 | "right": null, 1841 | "top": null, 1842 | "visibility": null, 1843 | "width": null 1844 | } 1845 | }, 1846 | "e4ee7889597f4f078504df0a19976c69": { 1847 | "model_module": "@jupyter-widgets/controls", 1848 | "model_name": "DescriptionStyleModel", 1849 | "model_module_version": "1.5.0", 1850 | "state": { 1851 | "_model_module": "@jupyter-widgets/controls", 1852 | "_model_module_version": "1.5.0", 1853 | "_model_name": "DescriptionStyleModel", 1854 | "_view_count": null, 1855 | "_view_module": "@jupyter-widgets/base", 1856 | "_view_module_version": "1.2.0", 1857 | "_view_name": "StyleView", 1858 | "description_width": "" 1859 | } 1860 | }, 1861 | "7a98f305582c46d890a3f599f4dc9cf3": { 1862 | "model_module": "@jupyter-widgets/base", 1863 | "model_name": "LayoutModel", 1864 | "model_module_version": "1.2.0", 1865 | "state": { 1866 | "_model_module": "@jupyter-widgets/base", 1867 | "_model_module_version": "1.2.0", 1868 | "_model_name": "LayoutModel", 1869 | "_view_count": null, 1870 | "_view_module": "@jupyter-widgets/base", 1871 | "_view_module_version": "1.2.0", 1872 | "_view_name": "LayoutView", 1873 | "align_content": null, 1874 | "align_items": null, 1875 | "align_self": null, 1876 | "border": null, 1877 | "bottom": null, 1878 | "display": null, 1879 | "flex": null, 1880 | "flex_flow": null, 1881 | "grid_area": null, 1882 | "grid_auto_columns": null, 1883 | "grid_auto_flow": null, 1884 | "grid_auto_rows": null, 1885 | "grid_column": null, 1886 | "grid_gap": null, 1887 | "grid_row": null, 1888 | "grid_template_areas": null, 1889 | "grid_template_columns": null, 1890 | "grid_template_rows": null, 1891 | "height": null, 1892 | "justify_content": null, 1893 | "justify_items": null, 1894 | "left": null, 1895 | "margin": null, 1896 | "max_height": null, 1897 | "max_width": null, 1898 | "min_height": null, 1899 | "min_width": null, 1900 | "object_fit": null, 1901 | "object_position": null, 1902 | "order": null, 1903 | "overflow": null, 1904 | "overflow_x": null, 1905 | "overflow_y": null, 1906 | "padding": null, 1907 | "right": null, 1908 | "top": null, 1909 | "visibility": null, 1910 | "width": null 1911 | } 1912 | }, 1913 | "803b81733b7e4bafa94100201436ceb9": { 1914 | "model_module": "@jupyter-widgets/controls", 1915 | "model_name": "DescriptionStyleModel", 1916 | "model_module_version": "1.5.0", 1917 | "state": { 1918 | "_model_module": "@jupyter-widgets/controls", 1919 | "_model_module_version": "1.5.0", 1920 | "_model_name": "DescriptionStyleModel", 1921 | "_view_count": null, 1922 | "_view_module": "@jupyter-widgets/base", 1923 | "_view_module_version": "1.2.0", 1924 | "_view_name": "StyleView", 1925 | "description_width": "" 1926 | } 1927 | }, 1928 | "bc2570f343ed47e7bd296162db9e43a7": { 1929 | "model_module": "@jupyter-widgets/base", 1930 | "model_name": "LayoutModel", 1931 | "model_module_version": "1.2.0", 1932 | "state": { 1933 | "_model_module": "@jupyter-widgets/base", 1934 | "_model_module_version": "1.2.0", 1935 | "_model_name": "LayoutModel", 1936 | "_view_count": null, 1937 | "_view_module": "@jupyter-widgets/base", 1938 | "_view_module_version": "1.2.0", 1939 | "_view_name": "LayoutView", 1940 | "align_content": null, 1941 | "align_items": null, 1942 | "align_self": null, 1943 | "border": null, 1944 | 
"bottom": null, 1945 | "display": null, 1946 | "flex": null, 1947 | "flex_flow": null, 1948 | "grid_area": null, 1949 | "grid_auto_columns": null, 1950 | "grid_auto_flow": null, 1951 | "grid_auto_rows": null, 1952 | "grid_column": null, 1953 | "grid_gap": null, 1954 | "grid_row": null, 1955 | "grid_template_areas": null, 1956 | "grid_template_columns": null, 1957 | "grid_template_rows": null, 1958 | "height": null, 1959 | "justify_content": null, 1960 | "justify_items": null, 1961 | "left": null, 1962 | "margin": null, 1963 | "max_height": null, 1964 | "max_width": null, 1965 | "min_height": null, 1966 | "min_width": null, 1967 | "object_fit": null, 1968 | "object_position": null, 1969 | "order": null, 1970 | "overflow": null, 1971 | "overflow_x": null, 1972 | "overflow_y": null, 1973 | "padding": null, 1974 | "right": null, 1975 | "top": null, 1976 | "visibility": null, 1977 | "width": null 1978 | } 1979 | }, 1980 | "7efc9dc702fd40469875f367a90c126a": { 1981 | "model_module": "@jupyter-widgets/controls", 1982 | "model_name": "DescriptionStyleModel", 1983 | "model_module_version": "1.5.0", 1984 | "state": { 1985 | "_model_module": "@jupyter-widgets/controls", 1986 | "_model_module_version": "1.5.0", 1987 | "_model_name": "DescriptionStyleModel", 1988 | "_view_count": null, 1989 | "_view_module": "@jupyter-widgets/base", 1990 | "_view_module_version": "1.2.0", 1991 | "_view_name": "StyleView", 1992 | "description_width": "" 1993 | } 1994 | }, 1995 | "985f228d52e34ac39be0fd52a2dce7c3": { 1996 | "model_module": "@jupyter-widgets/base", 1997 | "model_name": "LayoutModel", 1998 | "model_module_version": "1.2.0", 1999 | "state": { 2000 | "_model_module": "@jupyter-widgets/base", 2001 | "_model_module_version": "1.2.0", 2002 | "_model_name": "LayoutModel", 2003 | "_view_count": null, 2004 | "_view_module": "@jupyter-widgets/base", 2005 | "_view_module_version": "1.2.0", 2006 | "_view_name": "LayoutView", 2007 | "align_content": null, 2008 | "align_items": null, 2009 | "align_self": null, 2010 | "border": null, 2011 | "bottom": null, 2012 | "display": null, 2013 | "flex": null, 2014 | "flex_flow": null, 2015 | "grid_area": null, 2016 | "grid_auto_columns": null, 2017 | "grid_auto_flow": null, 2018 | "grid_auto_rows": null, 2019 | "grid_column": null, 2020 | "grid_gap": null, 2021 | "grid_row": null, 2022 | "grid_template_areas": null, 2023 | "grid_template_columns": null, 2024 | "grid_template_rows": null, 2025 | "height": null, 2026 | "justify_content": null, 2027 | "justify_items": null, 2028 | "left": null, 2029 | "margin": null, 2030 | "max_height": null, 2031 | "max_width": null, 2032 | "min_height": null, 2033 | "min_width": null, 2034 | "object_fit": null, 2035 | "object_position": null, 2036 | "order": null, 2037 | "overflow": null, 2038 | "overflow_x": null, 2039 | "overflow_y": null, 2040 | "padding": null, 2041 | "right": null, 2042 | "top": null, 2043 | "visibility": null, 2044 | "width": null 2045 | } 2046 | }, 2047 | "7879ea9286004824b34798805700d5b7": { 2048 | "model_module": "@jupyter-widgets/controls", 2049 | "model_name": "DescriptionStyleModel", 2050 | "model_module_version": "1.5.0", 2051 | "state": { 2052 | "_model_module": "@jupyter-widgets/controls", 2053 | "_model_module_version": "1.5.0", 2054 | "_model_name": "DescriptionStyleModel", 2055 | "_view_count": null, 2056 | "_view_module": "@jupyter-widgets/base", 2057 | "_view_module_version": "1.2.0", 2058 | "_view_name": "StyleView", 2059 | "description_width": "" 2060 | } 2061 | } 2062 | } 2063 | } 2064 | }, 2065 | 
"nbformat": 4, 2066 | "nbformat_minor": 0 2067 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Document-AI -------------------------------------------------------------------------------- /test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit1208/Document-AI/f096be50902e501b2ef3ad9144eea79f702907b1/test.png --------------------------------------------------------------------------------