├── CodeGemma_colab.ipynb ├── Complete_guide_to_audio_datasets.ipynb ├── Infer_Whisper_🤗transformers_edition.ipynb ├── README.md ├── RecurrentGemma_colab.ipynb ├── SmolVLM_500M_inference.ipynb ├── Whisper_Large_8bit_loading_w_bnb.ipynb ├── Whisper_transformers_timestamps.ipynb ├── Whisper_translate_with_🤗transformers_pipeline.ipynb ├── Whisper_w_PEFT.ipynb ├── dduf_my_repo_colab.ipynb ├── deepseek_r1_distill_qwen1_5B_transformers.ipynb ├── gemma_2_9b_colab.ipynb ├── hf_gguf_convert.ipynb ├── insanely_fast_whisper_colab.ipynb ├── kokoro_tts.ipynb ├── mathstral_7b_colab.ipynb ├── orpheus-pretrained-inference-demo.ipynb ├── stable_audio_open_colab.ipynb ├── text_to_music_with_spectrogram_diffusion_and_diffusers.ipynb ├── text_to_sound_with_audioLDM_and_diffusers.ipynb ├── transformers_autoawq_colab.ipynb ├── transformers_whisper_ckpt_to_OAI.ipynb ├── translate_w_seamless_m4tv2.ipynb ├── use_encodec_w_transformers.ipynb ├── whisper_turbo_in_transformers.ipynb ├── zephyr_assisted_musicgen_generations.ipynb └── zero_to_asr_101.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Notebooks 2 | -------------------------------------------------------------------------------- /RecurrentGemma_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU", 18 | "widgets": { 19 | "application/vnd.jupyter.widget-state+json": { 20 | "e6f2b94e3bb345859811226dc345a6e3": { 21 | "model_module": "@jupyter-widgets/controls", 22 | "model_name": "HBoxModel", 23 | "model_module_version": "1.5.0", 24 | "state": { 25 | "_dom_classes": [], 26 | "_model_module": 
"@jupyter-widgets/controls", 27 | "_model_module_version": "1.5.0", 28 | "_model_name": "HBoxModel", 29 | "_view_count": null, 30 | "_view_module": "@jupyter-widgets/controls", 31 | "_view_module_version": "1.5.0", 32 | "_view_name": "HBoxView", 33 | "box_style": "", 34 | "children": [ 35 | "IPY_MODEL_ce12a18f666740eabe1f71be5152e7d8", 36 | "IPY_MODEL_76a4282443194092af1039de43c523c6", 37 | "IPY_MODEL_3775878f2e484e84b6cb0971d2d35a7f" 38 | ], 39 | "layout": "IPY_MODEL_df936531829f4c1580c04719de77059c" 40 | } 41 | }, 42 | "ce12a18f666740eabe1f71be5152e7d8": { 43 | "model_module": "@jupyter-widgets/controls", 44 | "model_name": "HTMLModel", 45 | "model_module_version": "1.5.0", 46 | "state": { 47 | "_dom_classes": [], 48 | "_model_module": "@jupyter-widgets/controls", 49 | "_model_module_version": "1.5.0", 50 | "_model_name": "HTMLModel", 51 | "_view_count": null, 52 | "_view_module": "@jupyter-widgets/controls", 53 | "_view_module_version": "1.5.0", 54 | "_view_name": "HTMLView", 55 | "description": "", 56 | "description_tooltip": null, 57 | "layout": "IPY_MODEL_ab60c88077984531aa08150c72fa7ab5", 58 | "placeholder": "​", 59 | "style": "IPY_MODEL_ad9901a17d3d4b1b898941d16c16f1cb", 60 | "value": "tokenizer_config.json: 100%" 61 | } 62 | }, 63 | "76a4282443194092af1039de43c523c6": { 64 | "model_module": "@jupyter-widgets/controls", 65 | "model_name": "FloatProgressModel", 66 | "model_module_version": "1.5.0", 67 | "state": { 68 | "_dom_classes": [], 69 | "_model_module": "@jupyter-widgets/controls", 70 | "_model_module_version": "1.5.0", 71 | "_model_name": "FloatProgressModel", 72 | "_view_count": null, 73 | "_view_module": "@jupyter-widgets/controls", 74 | "_view_module_version": "1.5.0", 75 | "_view_name": "ProgressView", 76 | "bar_style": "success", 77 | "description": "", 78 | "description_tooltip": null, 79 | "layout": "IPY_MODEL_f2fbf227d64e451681083a0ca189405c", 80 | "max": 40529, 81 | "min": 0, 82 | "orientation": "horizontal", 83 | "style": 
"IPY_MODEL_72364dfb4d994115bfd049dae5f53423", 84 | "value": 40529 85 | } 86 | }, 87 | "3775878f2e484e84b6cb0971d2d35a7f": { 88 | "model_module": "@jupyter-widgets/controls", 89 | "model_name": "HTMLModel", 90 | "model_module_version": "1.5.0", 91 | "state": { 92 | "_dom_classes": [], 93 | "_model_module": "@jupyter-widgets/controls", 94 | "_model_module_version": "1.5.0", 95 | "_model_name": "HTMLModel", 96 | "_view_count": null, 97 | "_view_module": "@jupyter-widgets/controls", 98 | "_view_module_version": "1.5.0", 99 | "_view_name": "HTMLView", 100 | "description": "", 101 | "description_tooltip": null, 102 | "layout": "IPY_MODEL_fcbe52122eeb466d91b44011f7f9bc47", 103 | "placeholder": "​", 104 | "style": "IPY_MODEL_c0915cfb93634ede86568eface4115d8", 105 | "value": " 40.5k/40.5k [00:00<00:00, 670kB/s]" 106 | } 107 | }, 108 | "df936531829f4c1580c04719de77059c": { 109 | "model_module": "@jupyter-widgets/base", 110 | "model_name": "LayoutModel", 111 | "model_module_version": "1.2.0", 112 | "state": { 113 | "_model_module": "@jupyter-widgets/base", 114 | "_model_module_version": "1.2.0", 115 | "_model_name": "LayoutModel", 116 | "_view_count": null, 117 | "_view_module": "@jupyter-widgets/base", 118 | "_view_module_version": "1.2.0", 119 | "_view_name": "LayoutView", 120 | "align_content": null, 121 | "align_items": null, 122 | "align_self": null, 123 | "border": null, 124 | "bottom": null, 125 | "display": null, 126 | "flex": null, 127 | "flex_flow": null, 128 | "grid_area": null, 129 | "grid_auto_columns": null, 130 | "grid_auto_flow": null, 131 | "grid_auto_rows": null, 132 | "grid_column": null, 133 | "grid_gap": null, 134 | "grid_row": null, 135 | "grid_template_areas": null, 136 | "grid_template_columns": null, 137 | "grid_template_rows": null, 138 | "height": null, 139 | "justify_content": null, 140 | "justify_items": null, 141 | "left": null, 142 | "margin": null, 143 | "max_height": null, 144 | "max_width": null, 145 | "min_height": null, 146 | "min_width": 
null, 147 | "object_fit": null, 148 | "object_position": null, 149 | "order": null, 150 | "overflow": null, 151 | "overflow_x": null, 152 | "overflow_y": null, 153 | "padding": null, 154 | "right": null, 155 | "top": null, 156 | "visibility": null, 157 | "width": null 158 | } 159 | }, 160 | "ab60c88077984531aa08150c72fa7ab5": { 161 | "model_module": "@jupyter-widgets/base", 162 | "model_name": "LayoutModel", 163 | "model_module_version": "1.2.0", 164 | "state": { 165 | "_model_module": "@jupyter-widgets/base", 166 | "_model_module_version": "1.2.0", 167 | "_model_name": "LayoutModel", 168 | "_view_count": null, 169 | "_view_module": "@jupyter-widgets/base", 170 | "_view_module_version": "1.2.0", 171 | "_view_name": "LayoutView", 172 | "align_content": null, 173 | "align_items": null, 174 | "align_self": null, 175 | "border": null, 176 | "bottom": null, 177 | "display": null, 178 | "flex": null, 179 | "flex_flow": null, 180 | "grid_area": null, 181 | "grid_auto_columns": null, 182 | "grid_auto_flow": null, 183 | "grid_auto_rows": null, 184 | "grid_column": null, 185 | "grid_gap": null, 186 | "grid_row": null, 187 | "grid_template_areas": null, 188 | "grid_template_columns": null, 189 | "grid_template_rows": null, 190 | "height": null, 191 | "justify_content": null, 192 | "justify_items": null, 193 | "left": null, 194 | "margin": null, 195 | "max_height": null, 196 | "max_width": null, 197 | "min_height": null, 198 | "min_width": null, 199 | "object_fit": null, 200 | "object_position": null, 201 | "order": null, 202 | "overflow": null, 203 | "overflow_x": null, 204 | "overflow_y": null, 205 | "padding": null, 206 | "right": null, 207 | "top": null, 208 | "visibility": null, 209 | "width": null 210 | } 211 | }, 212 | "ad9901a17d3d4b1b898941d16c16f1cb": { 213 | "model_module": "@jupyter-widgets/controls", 214 | "model_name": "DescriptionStyleModel", 215 | "model_module_version": "1.5.0", 216 | "state": { 217 | "_model_module": "@jupyter-widgets/controls", 218 | 
"_model_module_version": "1.5.0", 219 | "_model_name": "DescriptionStyleModel", 220 | "_view_count": null, 221 | "_view_module": "@jupyter-widgets/base", 222 | "_view_module_version": "1.2.0", 223 | "_view_name": "StyleView", 224 | "description_width": "" 225 | } 226 | }, 227 | "f2fbf227d64e451681083a0ca189405c": { 228 | "model_module": "@jupyter-widgets/base", 229 | "model_name": "LayoutModel", 230 | "model_module_version": "1.2.0", 231 | "state": { 232 | "_model_module": "@jupyter-widgets/base", 233 | "_model_module_version": "1.2.0", 234 | "_model_name": "LayoutModel", 235 | "_view_count": null, 236 | "_view_module": "@jupyter-widgets/base", 237 | "_view_module_version": "1.2.0", 238 | "_view_name": "LayoutView", 239 | "align_content": null, 240 | "align_items": null, 241 | "align_self": null, 242 | "border": null, 243 | "bottom": null, 244 | "display": null, 245 | "flex": null, 246 | "flex_flow": null, 247 | "grid_area": null, 248 | "grid_auto_columns": null, 249 | "grid_auto_flow": null, 250 | "grid_auto_rows": null, 251 | "grid_column": null, 252 | "grid_gap": null, 253 | "grid_row": null, 254 | "grid_template_areas": null, 255 | "grid_template_columns": null, 256 | "grid_template_rows": null, 257 | "height": null, 258 | "justify_content": null, 259 | "justify_items": null, 260 | "left": null, 261 | "margin": null, 262 | "max_height": null, 263 | "max_width": null, 264 | "min_height": null, 265 | "min_width": null, 266 | "object_fit": null, 267 | "object_position": null, 268 | "order": null, 269 | "overflow": null, 270 | "overflow_x": null, 271 | "overflow_y": null, 272 | "padding": null, 273 | "right": null, 274 | "top": null, 275 | "visibility": null, 276 | "width": null 277 | } 278 | }, 279 | "72364dfb4d994115bfd049dae5f53423": { 280 | "model_module": "@jupyter-widgets/controls", 281 | "model_name": "ProgressStyleModel", 282 | "model_module_version": "1.5.0", 283 | "state": { 284 | "_model_module": "@jupyter-widgets/controls", 285 | 
"_model_module_version": "1.5.0", 286 | "_model_name": "ProgressStyleModel", 287 | "_view_count": null, 288 | "_view_module": "@jupyter-widgets/base", 289 | "_view_module_version": "1.2.0", 290 | "_view_name": "StyleView", 291 | "bar_color": null, 292 | "description_width": "" 293 | } 294 | }, 295 | "fcbe52122eeb466d91b44011f7f9bc47": { 296 | "model_module": "@jupyter-widgets/base", 297 | "model_name": "LayoutModel", 298 | "model_module_version": "1.2.0", 299 | "state": { 300 | "_model_module": "@jupyter-widgets/base", 301 | "_model_module_version": "1.2.0", 302 | "_model_name": "LayoutModel", 303 | "_view_count": null, 304 | "_view_module": "@jupyter-widgets/base", 305 | "_view_module_version": "1.2.0", 306 | "_view_name": "LayoutView", 307 | "align_content": null, 308 | "align_items": null, 309 | "align_self": null, 310 | "border": null, 311 | "bottom": null, 312 | "display": null, 313 | "flex": null, 314 | "flex_flow": null, 315 | "grid_area": null, 316 | "grid_auto_columns": null, 317 | "grid_auto_flow": null, 318 | "grid_auto_rows": null, 319 | "grid_column": null, 320 | "grid_gap": null, 321 | "grid_row": null, 322 | "grid_template_areas": null, 323 | "grid_template_columns": null, 324 | "grid_template_rows": null, 325 | "height": null, 326 | "justify_content": null, 327 | "justify_items": null, 328 | "left": null, 329 | "margin": null, 330 | "max_height": null, 331 | "max_width": null, 332 | "min_height": null, 333 | "min_width": null, 334 | "object_fit": null, 335 | "object_position": null, 336 | "order": null, 337 | "overflow": null, 338 | "overflow_x": null, 339 | "overflow_y": null, 340 | "padding": null, 341 | "right": null, 342 | "top": null, 343 | "visibility": null, 344 | "width": null 345 | } 346 | }, 347 | "c0915cfb93634ede86568eface4115d8": { 348 | "model_module": "@jupyter-widgets/controls", 349 | "model_name": "DescriptionStyleModel", 350 | "model_module_version": "1.5.0", 351 | "state": { 352 | "_model_module": "@jupyter-widgets/controls", 353 
| "_model_module_version": "1.5.0", 354 | "_model_name": "DescriptionStyleModel", 355 | "_view_count": null, 356 | "_view_module": "@jupyter-widgets/base", 357 | "_view_module_version": "1.2.0", 358 | "_view_name": "StyleView", 359 | "description_width": "" 360 | } 361 | }, 362 | "401cecca00fb42b29f1ec3fd5cfa4396": { 363 | "model_module": "@jupyter-widgets/controls", 364 | "model_name": "HBoxModel", 365 | "model_module_version": "1.5.0", 366 | "state": { 367 | "_dom_classes": [], 368 | "_model_module": "@jupyter-widgets/controls", 369 | "_model_module_version": "1.5.0", 370 | "_model_name": "HBoxModel", 371 | "_view_count": null, 372 | "_view_module": "@jupyter-widgets/controls", 373 | "_view_module_version": "1.5.0", 374 | "_view_name": "HBoxView", 375 | "box_style": "", 376 | "children": [ 377 | "IPY_MODEL_e98c3904a7e346a5870c5ac768cd6a98", 378 | "IPY_MODEL_d908df5a5a8945dc88f8d0f147245bbd", 379 | "IPY_MODEL_f1a29b9608244a9db3cc919ad149ef48" 380 | ], 381 | "layout": "IPY_MODEL_d19c6905802d46c4beb6fe8886cb6e8c" 382 | } 383 | }, 384 | "e98c3904a7e346a5870c5ac768cd6a98": { 385 | "model_module": "@jupyter-widgets/controls", 386 | "model_name": "HTMLModel", 387 | "model_module_version": "1.5.0", 388 | "state": { 389 | "_dom_classes": [], 390 | "_model_module": "@jupyter-widgets/controls", 391 | "_model_module_version": "1.5.0", 392 | "_model_name": "HTMLModel", 393 | "_view_count": null, 394 | "_view_module": "@jupyter-widgets/controls", 395 | "_view_module_version": "1.5.0", 396 | "_view_name": "HTMLView", 397 | "description": "", 398 | "description_tooltip": null, 399 | "layout": "IPY_MODEL_f7d7bc20d2ba40eeb576e8865cdbb8ec", 400 | "placeholder": "​", 401 | "style": "IPY_MODEL_1a431c4a814941169c8feff1b4741052", 402 | "value": "Downloading shards: 100%" 403 | } 404 | }, 405 | "d908df5a5a8945dc88f8d0f147245bbd": { 406 | "model_module": "@jupyter-widgets/controls", 407 | "model_name": "FloatProgressModel", 408 | "model_module_version": "1.5.0", 409 | "state": { 410 | 
"_dom_classes": [], 411 | "_model_module": "@jupyter-widgets/controls", 412 | "_model_module_version": "1.5.0", 413 | "_model_name": "FloatProgressModel", 414 | "_view_count": null, 415 | "_view_module": "@jupyter-widgets/controls", 416 | "_view_module_version": "1.5.0", 417 | "_view_name": "ProgressView", 418 | "bar_style": "success", 419 | "description": "", 420 | "description_tooltip": null, 421 | "layout": "IPY_MODEL_21a66f11f21e4913a1a5a975727916f0", 422 | "max": 2, 423 | "min": 0, 424 | "orientation": "horizontal", 425 | "style": "IPY_MODEL_b931a5080c154b7dbbdcebf1a48aa9a3", 426 | "value": 2 427 | } 428 | }, 429 | "f1a29b9608244a9db3cc919ad149ef48": { 430 | "model_module": "@jupyter-widgets/controls", 431 | "model_name": "HTMLModel", 432 | "model_module_version": "1.5.0", 433 | "state": { 434 | "_dom_classes": [], 435 | "_model_module": "@jupyter-widgets/controls", 436 | "_model_module_version": "1.5.0", 437 | "_model_name": "HTMLModel", 438 | "_view_count": null, 439 | "_view_module": "@jupyter-widgets/controls", 440 | "_view_module_version": "1.5.0", 441 | "_view_name": "HTMLView", 442 | "description": "", 443 | "description_tooltip": null, 444 | "layout": "IPY_MODEL_6cc6f7e129fc47c9ac57d38f713c50ea", 445 | "placeholder": "​", 446 | "style": "IPY_MODEL_896598f7441c4e84b4c7963b520d6daf", 447 | "value": " 2/2 [00:00<00:00,  7.70it/s]" 448 | } 449 | }, 450 | "d19c6905802d46c4beb6fe8886cb6e8c": { 451 | "model_module": "@jupyter-widgets/base", 452 | "model_name": "LayoutModel", 453 | "model_module_version": "1.2.0", 454 | "state": { 455 | "_model_module": "@jupyter-widgets/base", 456 | "_model_module_version": "1.2.0", 457 | "_model_name": "LayoutModel", 458 | "_view_count": null, 459 | "_view_module": "@jupyter-widgets/base", 460 | "_view_module_version": "1.2.0", 461 | "_view_name": "LayoutView", 462 | "align_content": null, 463 | "align_items": null, 464 | "align_self": null, 465 | "border": null, 466 | "bottom": null, 467 | "display": null, 468 | "flex": 
null, 469 | "flex_flow": null, 470 | "grid_area": null, 471 | "grid_auto_columns": null, 472 | "grid_auto_flow": null, 473 | "grid_auto_rows": null, 474 | "grid_column": null, 475 | "grid_gap": null, 476 | "grid_row": null, 477 | "grid_template_areas": null, 478 | "grid_template_columns": null, 479 | "grid_template_rows": null, 480 | "height": null, 481 | "justify_content": null, 482 | "justify_items": null, 483 | "left": null, 484 | "margin": null, 485 | "max_height": null, 486 | "max_width": null, 487 | "min_height": null, 488 | "min_width": null, 489 | "object_fit": null, 490 | "object_position": null, 491 | "order": null, 492 | "overflow": null, 493 | "overflow_x": null, 494 | "overflow_y": null, 495 | "padding": null, 496 | "right": null, 497 | "top": null, 498 | "visibility": null, 499 | "width": null 500 | } 501 | }, 502 | "f7d7bc20d2ba40eeb576e8865cdbb8ec": { 503 | "model_module": "@jupyter-widgets/base", 504 | "model_name": "LayoutModel", 505 | "model_module_version": "1.2.0", 506 | "state": { 507 | "_model_module": "@jupyter-widgets/base", 508 | "_model_module_version": "1.2.0", 509 | "_model_name": "LayoutModel", 510 | "_view_count": null, 511 | "_view_module": "@jupyter-widgets/base", 512 | "_view_module_version": "1.2.0", 513 | "_view_name": "LayoutView", 514 | "align_content": null, 515 | "align_items": null, 516 | "align_self": null, 517 | "border": null, 518 | "bottom": null, 519 | "display": null, 520 | "flex": null, 521 | "flex_flow": null, 522 | "grid_area": null, 523 | "grid_auto_columns": null, 524 | "grid_auto_flow": null, 525 | "grid_auto_rows": null, 526 | "grid_column": null, 527 | "grid_gap": null, 528 | "grid_row": null, 529 | "grid_template_areas": null, 530 | "grid_template_columns": null, 531 | "grid_template_rows": null, 532 | "height": null, 533 | "justify_content": null, 534 | "justify_items": null, 535 | "left": null, 536 | "margin": null, 537 | "max_height": null, 538 | "max_width": null, 539 | "min_height": null, 540 | 
"min_width": null, 541 | "object_fit": null, 542 | "object_position": null, 543 | "order": null, 544 | "overflow": null, 545 | "overflow_x": null, 546 | "overflow_y": null, 547 | "padding": null, 548 | "right": null, 549 | "top": null, 550 | "visibility": null, 551 | "width": null 552 | } 553 | }, 554 | "1a431c4a814941169c8feff1b4741052": { 555 | "model_module": "@jupyter-widgets/controls", 556 | "model_name": "DescriptionStyleModel", 557 | "model_module_version": "1.5.0", 558 | "state": { 559 | "_model_module": "@jupyter-widgets/controls", 560 | "_model_module_version": "1.5.0", 561 | "_model_name": "DescriptionStyleModel", 562 | "_view_count": null, 563 | "_view_module": "@jupyter-widgets/base", 564 | "_view_module_version": "1.2.0", 565 | "_view_name": "StyleView", 566 | "description_width": "" 567 | } 568 | }, 569 | "21a66f11f21e4913a1a5a975727916f0": { 570 | "model_module": "@jupyter-widgets/base", 571 | "model_name": "LayoutModel", 572 | "model_module_version": "1.2.0", 573 | "state": { 574 | "_model_module": "@jupyter-widgets/base", 575 | "_model_module_version": "1.2.0", 576 | "_model_name": "LayoutModel", 577 | "_view_count": null, 578 | "_view_module": "@jupyter-widgets/base", 579 | "_view_module_version": "1.2.0", 580 | "_view_name": "LayoutView", 581 | "align_content": null, 582 | "align_items": null, 583 | "align_self": null, 584 | "border": null, 585 | "bottom": null, 586 | "display": null, 587 | "flex": null, 588 | "flex_flow": null, 589 | "grid_area": null, 590 | "grid_auto_columns": null, 591 | "grid_auto_flow": null, 592 | "grid_auto_rows": null, 593 | "grid_column": null, 594 | "grid_gap": null, 595 | "grid_row": null, 596 | "grid_template_areas": null, 597 | "grid_template_columns": null, 598 | "grid_template_rows": null, 599 | "height": null, 600 | "justify_content": null, 601 | "justify_items": null, 602 | "left": null, 603 | "margin": null, 604 | "max_height": null, 605 | "max_width": null, 606 | "min_height": null, 607 | "min_width": null, 
608 | "object_fit": null, 609 | "object_position": null, 610 | "order": null, 611 | "overflow": null, 612 | "overflow_x": null, 613 | "overflow_y": null, 614 | "padding": null, 615 | "right": null, 616 | "top": null, 617 | "visibility": null, 618 | "width": null 619 | } 620 | }, 621 | "b931a5080c154b7dbbdcebf1a48aa9a3": { 622 | "model_module": "@jupyter-widgets/controls", 623 | "model_name": "ProgressStyleModel", 624 | "model_module_version": "1.5.0", 625 | "state": { 626 | "_model_module": "@jupyter-widgets/controls", 627 | "_model_module_version": "1.5.0", 628 | "_model_name": "ProgressStyleModel", 629 | "_view_count": null, 630 | "_view_module": "@jupyter-widgets/base", 631 | "_view_module_version": "1.2.0", 632 | "_view_name": "StyleView", 633 | "bar_color": null, 634 | "description_width": "" 635 | } 636 | }, 637 | "6cc6f7e129fc47c9ac57d38f713c50ea": { 638 | "model_module": "@jupyter-widgets/base", 639 | "model_name": "LayoutModel", 640 | "model_module_version": "1.2.0", 641 | "state": { 642 | "_model_module": "@jupyter-widgets/base", 643 | "_model_module_version": "1.2.0", 644 | "_model_name": "LayoutModel", 645 | "_view_count": null, 646 | "_view_module": "@jupyter-widgets/base", 647 | "_view_module_version": "1.2.0", 648 | "_view_name": "LayoutView", 649 | "align_content": null, 650 | "align_items": null, 651 | "align_self": null, 652 | "border": null, 653 | "bottom": null, 654 | "display": null, 655 | "flex": null, 656 | "flex_flow": null, 657 | "grid_area": null, 658 | "grid_auto_columns": null, 659 | "grid_auto_flow": null, 660 | "grid_auto_rows": null, 661 | "grid_column": null, 662 | "grid_gap": null, 663 | "grid_row": null, 664 | "grid_template_areas": null, 665 | "grid_template_columns": null, 666 | "grid_template_rows": null, 667 | "height": null, 668 | "justify_content": null, 669 | "justify_items": null, 670 | "left": null, 671 | "margin": null, 672 | "max_height": null, 673 | "max_width": null, 674 | "min_height": null, 675 | "min_width": null, 
676 | "object_fit": null, 677 | "object_position": null, 678 | "order": null, 679 | "overflow": null, 680 | "overflow_x": null, 681 | "overflow_y": null, 682 | "padding": null, 683 | "right": null, 684 | "top": null, 685 | "visibility": null, 686 | "width": null 687 | } 688 | }, 689 | "896598f7441c4e84b4c7963b520d6daf": { 690 | "model_module": "@jupyter-widgets/controls", 691 | "model_name": "DescriptionStyleModel", 692 | "model_module_version": "1.5.0", 693 | "state": { 694 | "_model_module": "@jupyter-widgets/controls", 695 | "_model_module_version": "1.5.0", 696 | "_model_name": "DescriptionStyleModel", 697 | "_view_count": null, 698 | "_view_module": "@jupyter-widgets/base", 699 | "_view_module_version": "1.2.0", 700 | "_view_name": "StyleView", 701 | "description_width": "" 702 | } 703 | }, 704 | "b391b63b204848009b051b9c9a5062a3": { 705 | "model_module": "@jupyter-widgets/controls", 706 | "model_name": "HBoxModel", 707 | "model_module_version": "1.5.0", 708 | "state": { 709 | "_dom_classes": [], 710 | "_model_module": "@jupyter-widgets/controls", 711 | "_model_module_version": "1.5.0", 712 | "_model_name": "HBoxModel", 713 | "_view_count": null, 714 | "_view_module": "@jupyter-widgets/controls", 715 | "_view_module_version": "1.5.0", 716 | "_view_name": "HBoxView", 717 | "box_style": "", 718 | "children": [ 719 | "IPY_MODEL_0eee8063610d46139d7576ef02ddc228", 720 | "IPY_MODEL_5d3bd17d0aa44d84a91d3ac8255dc296", 721 | "IPY_MODEL_84981495b59f46009bde2cdbec478a5f" 722 | ], 723 | "layout": "IPY_MODEL_f51d9c821e3b4f558ad76706f99d76a6" 724 | } 725 | }, 726 | "0eee8063610d46139d7576ef02ddc228": { 727 | "model_module": "@jupyter-widgets/controls", 728 | "model_name": "HTMLModel", 729 | "model_module_version": "1.5.0", 730 | "state": { 731 | "_dom_classes": [], 732 | "_model_module": "@jupyter-widgets/controls", 733 | "_model_module_version": "1.5.0", 734 | "_model_name": "HTMLModel", 735 | "_view_count": null, 736 | "_view_module": "@jupyter-widgets/controls", 737 | 
"_view_module_version": "1.5.0", 738 | "_view_name": "HTMLView", 739 | "description": "", 740 | "description_tooltip": null, 741 | "layout": "IPY_MODEL_07e7af89197e489b877e309189e6ea53", 742 | "placeholder": "​", 743 | "style": "IPY_MODEL_66e1d6e6e45146a7a56d3e935f56ad51", 744 | "value": "Loading checkpoint shards: 100%" 745 | } 746 | }, 747 | "5d3bd17d0aa44d84a91d3ac8255dc296": { 748 | "model_module": "@jupyter-widgets/controls", 749 | "model_name": "FloatProgressModel", 750 | "model_module_version": "1.5.0", 751 | "state": { 752 | "_dom_classes": [], 753 | "_model_module": "@jupyter-widgets/controls", 754 | "_model_module_version": "1.5.0", 755 | "_model_name": "FloatProgressModel", 756 | "_view_count": null, 757 | "_view_module": "@jupyter-widgets/controls", 758 | "_view_module_version": "1.5.0", 759 | "_view_name": "ProgressView", 760 | "bar_style": "success", 761 | "description": "", 762 | "description_tooltip": null, 763 | "layout": "IPY_MODEL_85b6c5d6fdc745d4a533004de3c97408", 764 | "max": 2, 765 | "min": 0, 766 | "orientation": "horizontal", 767 | "style": "IPY_MODEL_9f5ddcf6583246af9ec1ebe7f23446d6", 768 | "value": 2 769 | } 770 | }, 771 | "84981495b59f46009bde2cdbec478a5f": { 772 | "model_module": "@jupyter-widgets/controls", 773 | "model_name": "HTMLModel", 774 | "model_module_version": "1.5.0", 775 | "state": { 776 | "_dom_classes": [], 777 | "_model_module": "@jupyter-widgets/controls", 778 | "_model_module_version": "1.5.0", 779 | "_model_name": "HTMLModel", 780 | "_view_count": null, 781 | "_view_module": "@jupyter-widgets/controls", 782 | "_view_module_version": "1.5.0", 783 | "_view_name": "HTMLView", 784 | "description": "", 785 | "description_tooltip": null, 786 | "layout": "IPY_MODEL_58b61a060918476c82be882ed6d5cc10", 787 | "placeholder": "​", 788 | "style": "IPY_MODEL_7a2fce54921c4062a739fb690387f156", 789 | "value": " 2/2 [00:19<00:00,  8.21s/it]" 790 | } 791 | }, 792 | "f51d9c821e3b4f558ad76706f99d76a6": { 793 | "model_module": 
"@jupyter-widgets/base", 794 | "model_name": "LayoutModel", 795 | "model_module_version": "1.2.0", 796 | "state": { 797 | "_model_module": "@jupyter-widgets/base", 798 | "_model_module_version": "1.2.0", 799 | "_model_name": "LayoutModel", 800 | "_view_count": null, 801 | "_view_module": "@jupyter-widgets/base", 802 | "_view_module_version": "1.2.0", 803 | "_view_name": "LayoutView", 804 | "align_content": null, 805 | "align_items": null, 806 | "align_self": null, 807 | "border": null, 808 | "bottom": null, 809 | "display": null, 810 | "flex": null, 811 | "flex_flow": null, 812 | "grid_area": null, 813 | "grid_auto_columns": null, 814 | "grid_auto_flow": null, 815 | "grid_auto_rows": null, 816 | "grid_column": null, 817 | "grid_gap": null, 818 | "grid_row": null, 819 | "grid_template_areas": null, 820 | "grid_template_columns": null, 821 | "grid_template_rows": null, 822 | "height": null, 823 | "justify_content": null, 824 | "justify_items": null, 825 | "left": null, 826 | "margin": null, 827 | "max_height": null, 828 | "max_width": null, 829 | "min_height": null, 830 | "min_width": null, 831 | "object_fit": null, 832 | "object_position": null, 833 | "order": null, 834 | "overflow": null, 835 | "overflow_x": null, 836 | "overflow_y": null, 837 | "padding": null, 838 | "right": null, 839 | "top": null, 840 | "visibility": null, 841 | "width": null 842 | } 843 | }, 844 | "07e7af89197e489b877e309189e6ea53": { 845 | "model_module": "@jupyter-widgets/base", 846 | "model_name": "LayoutModel", 847 | "model_module_version": "1.2.0", 848 | "state": { 849 | "_model_module": "@jupyter-widgets/base", 850 | "_model_module_version": "1.2.0", 851 | "_model_name": "LayoutModel", 852 | "_view_count": null, 853 | "_view_module": "@jupyter-widgets/base", 854 | "_view_module_version": "1.2.0", 855 | "_view_name": "LayoutView", 856 | "align_content": null, 857 | "align_items": null, 858 | "align_self": null, 859 | "border": null, 860 | "bottom": null, 861 | "display": null, 862 | 
"flex": null, 863 | "flex_flow": null, 864 | "grid_area": null, 865 | "grid_auto_columns": null, 866 | "grid_auto_flow": null, 867 | "grid_auto_rows": null, 868 | "grid_column": null, 869 | "grid_gap": null, 870 | "grid_row": null, 871 | "grid_template_areas": null, 872 | "grid_template_columns": null, 873 | "grid_template_rows": null, 874 | "height": null, 875 | "justify_content": null, 876 | "justify_items": null, 877 | "left": null, 878 | "margin": null, 879 | "max_height": null, 880 | "max_width": null, 881 | "min_height": null, 882 | "min_width": null, 883 | "object_fit": null, 884 | "object_position": null, 885 | "order": null, 886 | "overflow": null, 887 | "overflow_x": null, 888 | "overflow_y": null, 889 | "padding": null, 890 | "right": null, 891 | "top": null, 892 | "visibility": null, 893 | "width": null 894 | } 895 | }, 896 | "66e1d6e6e45146a7a56d3e935f56ad51": { 897 | "model_module": "@jupyter-widgets/controls", 898 | "model_name": "DescriptionStyleModel", 899 | "model_module_version": "1.5.0", 900 | "state": { 901 | "_model_module": "@jupyter-widgets/controls", 902 | "_model_module_version": "1.5.0", 903 | "_model_name": "DescriptionStyleModel", 904 | "_view_count": null, 905 | "_view_module": "@jupyter-widgets/base", 906 | "_view_module_version": "1.2.0", 907 | "_view_name": "StyleView", 908 | "description_width": "" 909 | } 910 | }, 911 | "85b6c5d6fdc745d4a533004de3c97408": { 912 | "model_module": "@jupyter-widgets/base", 913 | "model_name": "LayoutModel", 914 | "model_module_version": "1.2.0", 915 | "state": { 916 | "_model_module": "@jupyter-widgets/base", 917 | "_model_module_version": "1.2.0", 918 | "_model_name": "LayoutModel", 919 | "_view_count": null, 920 | "_view_module": "@jupyter-widgets/base", 921 | "_view_module_version": "1.2.0", 922 | "_view_name": "LayoutView", 923 | "align_content": null, 924 | "align_items": null, 925 | "align_self": null, 926 | "border": null, 927 | "bottom": null, 928 | "display": null, 929 | "flex": null, 930 | 
"flex_flow": null, 931 | "grid_area": null, 932 | "grid_auto_columns": null, 933 | "grid_auto_flow": null, 934 | "grid_auto_rows": null, 935 | "grid_column": null, 936 | "grid_gap": null, 937 | "grid_row": null, 938 | "grid_template_areas": null, 939 | "grid_template_columns": null, 940 | "grid_template_rows": null, 941 | "height": null, 942 | "justify_content": null, 943 | "justify_items": null, 944 | "left": null, 945 | "margin": null, 946 | "max_height": null, 947 | "max_width": null, 948 | "min_height": null, 949 | "min_width": null, 950 | "object_fit": null, 951 | "object_position": null, 952 | "order": null, 953 | "overflow": null, 954 | "overflow_x": null, 955 | "overflow_y": null, 956 | "padding": null, 957 | "right": null, 958 | "top": null, 959 | "visibility": null, 960 | "width": null 961 | } 962 | }, 963 | "9f5ddcf6583246af9ec1ebe7f23446d6": { 964 | "model_module": "@jupyter-widgets/controls", 965 | "model_name": "ProgressStyleModel", 966 | "model_module_version": "1.5.0", 967 | "state": { 968 | "_model_module": "@jupyter-widgets/controls", 969 | "_model_module_version": "1.5.0", 970 | "_model_name": "ProgressStyleModel", 971 | "_view_count": null, 972 | "_view_module": "@jupyter-widgets/base", 973 | "_view_module_version": "1.2.0", 974 | "_view_name": "StyleView", 975 | "bar_color": null, 976 | "description_width": "" 977 | } 978 | }, 979 | "58b61a060918476c82be882ed6d5cc10": { 980 | "model_module": "@jupyter-widgets/base", 981 | "model_name": "LayoutModel", 982 | "model_module_version": "1.2.0", 983 | "state": { 984 | "_model_module": "@jupyter-widgets/base", 985 | "_model_module_version": "1.2.0", 986 | "_model_name": "LayoutModel", 987 | "_view_count": null, 988 | "_view_module": "@jupyter-widgets/base", 989 | "_view_module_version": "1.2.0", 990 | "_view_name": "LayoutView", 991 | "align_content": null, 992 | "align_items": null, 993 | "align_self": null, 994 | "border": null, 995 | "bottom": null, 996 | "display": null, 997 | "flex": null, 998 | 
"flex_flow": null, 999 | "grid_area": null, 1000 | "grid_auto_columns": null, 1001 | "grid_auto_flow": null, 1002 | "grid_auto_rows": null, 1003 | "grid_column": null, 1004 | "grid_gap": null, 1005 | "grid_row": null, 1006 | "grid_template_areas": null, 1007 | "grid_template_columns": null, 1008 | "grid_template_rows": null, 1009 | "height": null, 1010 | "justify_content": null, 1011 | "justify_items": null, 1012 | "left": null, 1013 | "margin": null, 1014 | "max_height": null, 1015 | "max_width": null, 1016 | "min_height": null, 1017 | "min_width": null, 1018 | "object_fit": null, 1019 | "object_position": null, 1020 | "order": null, 1021 | "overflow": null, 1022 | "overflow_x": null, 1023 | "overflow_y": null, 1024 | "padding": null, 1025 | "right": null, 1026 | "top": null, 1027 | "visibility": null, 1028 | "width": null 1029 | } 1030 | }, 1031 | "7a2fce54921c4062a739fb690387f156": { 1032 | "model_module": "@jupyter-widgets/controls", 1033 | "model_name": "DescriptionStyleModel", 1034 | "model_module_version": "1.5.0", 1035 | "state": { 1036 | "_model_module": "@jupyter-widgets/controls", 1037 | "_model_module_version": "1.5.0", 1038 | "_model_name": "DescriptionStyleModel", 1039 | "_view_count": null, 1040 | "_view_module": "@jupyter-widgets/base", 1041 | "_view_module_version": "1.2.0", 1042 | "_view_name": "StyleView", 1043 | "description_width": "" 1044 | } 1045 | } 1046 | } 1047 | } 1048 | }, 1049 | "cells": [ 1050 | { 1051 | "cell_type": "markdown", 1052 | "metadata": { 1053 | "id": "view-in-github", 1054 | "colab_type": "text" 1055 | }, 1056 | "source": [ 1057 | "\"Open" 1058 | ] 1059 | }, 1060 | { 1061 | "cell_type": "markdown", 1062 | "source": [ 1063 | "# RecurrentGemma - 2B & 2B-it\n", 1064 | "\n", 1065 | "RecurrentGemma is a family of open language models built on a novel recurrent architecture developed at Google. 
Both pre-trained (2B) and instruction-tuned (2B-it) versions are available in English.\n", 1066 | "\n", 1067 | "Like Gemma, [RecurrentGemma](https://huggingface.co/google/recurrentgemma-2b-it) models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. Because of its novel architecture, RecurrentGemma requires less memory than Gemma and achieves faster inference when generating long sequences." 1068 | ], 1069 | "metadata": { 1070 | "id": "MVkIfH6Cg7Fx" 1071 | } 1072 | }, 1073 | { 1074 | "cell_type": "code", 1075 | "execution_count": null, 1076 | "metadata": { 1077 | "colab": { 1078 | "base_uri": "https://localhost:8080/" 1079 | }, 1080 | "id": "ahVaTC6rEIVI", 1081 | "outputId": "2036392c-b381-4ca0-80ba-16ba8c87cde3" 1082 | }, 1083 | "outputs": [ 1084 | { 1085 | "output_type": "stream", 1086 | "name": "stdout", 1087 | "text": [ 1088 | "Collecting transformers==4.40.0.dev0\n", 1089 | " Downloading https://huggingface.co/datasets/reach-vb/random-wheels/resolve/main/transformers-4.40.0.dev0-py3-none-any.whl (8.8 MB)\n", 1090 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.8/8.8 MB\u001b[0m \u001b[31m30.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1091 | "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.40.0.dev0) (3.13.3)\n", 1092 | "Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /usr/local/lib/python3.10/dist-packages (from transformers==4.40.0.dev0) (0.20.3)\n", 1093 | "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.40.0.dev0) (1.25.2)\n", 1094 | "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.40.0.dev0) (24.0)\n", 1095 | "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.40.0.dev0) (6.0.1)\n", 1096 | 
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.40.0.dev0) (2023.12.25)\n", 1097 | "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.40.0.dev0) (2.31.0)\n", 1098 | "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers==4.40.0.dev0) (0.15.2)\n", 1099 | "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.40.0.dev0) (0.4.2)\n", 1100 | "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.40.0.dev0) (4.66.2)\n", 1101 | "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.19.3->transformers==4.40.0.dev0) (2023.6.0)\n", 1102 | "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.19.3->transformers==4.40.0.dev0) (4.10.0)\n", 1103 | "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.40.0.dev0) (3.3.2)\n", 1104 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.40.0.dev0) (3.6)\n", 1105 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.40.0.dev0) (2.0.7)\n", 1106 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.40.0.dev0) (2024.2.2)\n", 1107 | "Installing collected packages: transformers\n", 1108 | " Attempting uninstall: transformers\n", 1109 | " Found existing installation: transformers 4.38.2\n", 1110 | " Uninstalling transformers-4.38.2:\n", 1111 | " Successfully uninstalled transformers-4.38.2\n", 1112 | "Successfully 
installed transformers-4.40.0.dev0\n" 1113 | ] 1114 | } 1115 | ], 1116 | "source": [ 1117 | "!pip install git+https://github.com/huggingface/transformers.git" 1118 | ] 1119 | }, 1120 | { 1121 | "cell_type": "markdown", 1122 | "source": [ 1123 | "## Load the model checkpoints\n", 1124 | "\n", 1125 | "Make sure to accept the terms and conditions for the model before running the code further here: https://huggingface.co/google/recurrentgemma-2b-it.\n" 1126 | ], 1127 | "metadata": { 1128 | "id": "FZK4T_zHhL9Q" 1129 | } 1130 | }, 1131 | { 1132 | "cell_type": "code", 1133 | "source": [ 1134 | "import torch\n", 1135 | "from transformers import AutoTokenizer, AutoModelForCausalLM\n", 1136 | "\n", 1137 | "tokenizer = AutoTokenizer.from_pretrained(\"google/recurrentgemma-2b-it\")\n", 1138 | "model = AutoModelForCausalLM.from_pretrained(\"google/recurrentgemma-2b-it\", torch_dtype=torch.float16).to(\"cuda:0\")" 1139 | ], 1140 | "metadata": { 1141 | "colab": { 1142 | "base_uri": "https://localhost:8080/", 1143 | "height": 129, 1144 | "referenced_widgets": [ 1145 | "e6f2b94e3bb345859811226dc345a6e3", 1146 | "ce12a18f666740eabe1f71be5152e7d8", 1147 | "76a4282443194092af1039de43c523c6", 1148 | "3775878f2e484e84b6cb0971d2d35a7f", 1149 | "df936531829f4c1580c04719de77059c", 1150 | "ab60c88077984531aa08150c72fa7ab5", 1151 | "ad9901a17d3d4b1b898941d16c16f1cb", 1152 | "f2fbf227d64e451681083a0ca189405c", 1153 | "72364dfb4d994115bfd049dae5f53423", 1154 | "fcbe52122eeb466d91b44011f7f9bc47", 1155 | "c0915cfb93634ede86568eface4115d8", 1156 | "401cecca00fb42b29f1ec3fd5cfa4396", 1157 | "e98c3904a7e346a5870c5ac768cd6a98", 1158 | "d908df5a5a8945dc88f8d0f147245bbd", 1159 | "f1a29b9608244a9db3cc919ad149ef48", 1160 | "d19c6905802d46c4beb6fe8886cb6e8c", 1161 | "f7d7bc20d2ba40eeb576e8865cdbb8ec", 1162 | "1a431c4a814941169c8feff1b4741052", 1163 | "21a66f11f21e4913a1a5a975727916f0", 1164 | "b931a5080c154b7dbbdcebf1a48aa9a3", 1165 | "6cc6f7e129fc47c9ac57d38f713c50ea", 1166 | 
"896598f7441c4e84b4c7963b520d6daf", 1167 | "b391b63b204848009b051b9c9a5062a3", 1168 | "0eee8063610d46139d7576ef02ddc228", 1169 | "5d3bd17d0aa44d84a91d3ac8255dc296", 1170 | "84981495b59f46009bde2cdbec478a5f", 1171 | "f51d9c821e3b4f558ad76706f99d76a6", 1172 | "07e7af89197e489b877e309189e6ea53", 1173 | "66e1d6e6e45146a7a56d3e935f56ad51", 1174 | "85b6c5d6fdc745d4a533004de3c97408", 1175 | "9f5ddcf6583246af9ec1ebe7f23446d6", 1176 | "58b61a060918476c82be882ed6d5cc10", 1177 | "7a2fce54921c4062a739fb690387f156" 1178 | ] 1179 | }, 1180 | "id": "XItA_HZ-EPIR", 1181 | "outputId": "22b1edbc-c6d7-4ad0-b992-0f59682a30ce" 1182 | }, 1183 | "execution_count": null, 1184 | "outputs": [ 1185 | { 1186 | "output_type": "display_data", 1187 | "data": { 1188 | "text/plain": [ 1189 | "tokenizer_config.json: 0%| | 0.00/40.5k [00:00user\\nWrite a hello world program\\nmodel\\n```python\\nprint(\"Hello, world!\")\\n```\\n\\nThis program will print the message \"Hello, world!\" to the console.\\n\\n**Explanation:**\\n\\n* `print()` is a built-in Python function that prints the given argument to the console.\\n* `\"Hello, world!\"` is the string that will be printed.\\n\\n**Output:**\\n\\n```\\nHello, world!\\n```']\n" 1305 | ] 1306 | } 1307 | ] 1308 | }, 1309 | { 1310 | "cell_type": "markdown", 1311 | "source": [ 1312 | "Enjoy! There's much more you can do to maximise the output of your generation. 
Check out this guide: https://huggingface.co/docs/transformers/generation_strategies" 1313 | ], 1314 | "metadata": { 1315 | "id": "rkpXJ5sHwmMH" 1316 | } 1317 | } 1318 | ] 1319 | } -------------------------------------------------------------------------------- /SmolVLM_500M_inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4", 8 | "authorship_tag": "ABX9TyM1i88r1apXFhIZpD959OWG", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | }, 18 | "accelerator": "GPU", 19 | "widgets": { 20 | "application/vnd.jupyter.widget-state+json": { 21 | "79fbb885952040c18d4aa8ab66baa034": { 22 | "model_module": "@jupyter-widgets/controls", 23 | "model_name": "HBoxModel", 24 | "model_module_version": "1.5.0", 25 | "state": { 26 | "_dom_classes": [], 27 | "_model_module": "@jupyter-widgets/controls", 28 | "_model_module_version": "1.5.0", 29 | "_model_name": "HBoxModel", 30 | "_view_count": null, 31 | "_view_module": "@jupyter-widgets/controls", 32 | "_view_module_version": "1.5.0", 33 | "_view_name": "HBoxView", 34 | "box_style": "", 35 | "children": [ 36 | "IPY_MODEL_bf0e759d410a482cbd4d3e2eaf9665a2", 37 | "IPY_MODEL_ac87ee6d1c7e4595a2c362be34e52b3e", 38 | "IPY_MODEL_794101aa3d194073ac116c66ca73fd41" 39 | ], 40 | "layout": "IPY_MODEL_cd24a5374c29462c9d8d70f19892dd56" 41 | } 42 | }, 43 | "bf0e759d410a482cbd4d3e2eaf9665a2": { 44 | "model_module": "@jupyter-widgets/controls", 45 | "model_name": "HTMLModel", 46 | "model_module_version": "1.5.0", 47 | "state": { 48 | "_dom_classes": [], 49 | "_model_module": "@jupyter-widgets/controls", 50 | "_model_module_version": "1.5.0", 51 | "_model_name": "HTMLModel", 52 | "_view_count": null, 53 | "_view_module": "@jupyter-widgets/controls", 54 | 
"_view_module_version": "1.5.0", 55 | "_view_name": "HTMLView", 56 | "description": "", 57 | "description_tooltip": null, 58 | "layout": "IPY_MODEL_2222a49475bf4a9b89dd760bd057c89a", 59 | "placeholder": "​", 60 | "style": "IPY_MODEL_6a9083cd5884400a95aebf432d236c67", 61 | "value": "generation_config.json: 100%" 62 | } 63 | }, 64 | "ac87ee6d1c7e4595a2c362be34e52b3e": { 65 | "model_module": "@jupyter-widgets/controls", 66 | "model_name": "FloatProgressModel", 67 | "model_module_version": "1.5.0", 68 | "state": { 69 | "_dom_classes": [], 70 | "_model_module": "@jupyter-widgets/controls", 71 | "_model_module_version": "1.5.0", 72 | "_model_name": "FloatProgressModel", 73 | "_view_count": null, 74 | "_view_module": "@jupyter-widgets/controls", 75 | "_view_module_version": "1.5.0", 76 | "_view_name": "ProgressView", 77 | "bar_style": "success", 78 | "description": "", 79 | "description_tooltip": null, 80 | "layout": "IPY_MODEL_c0f685a98c274cb39ebf5ec6d1883906", 81 | "max": 136, 82 | "min": 0, 83 | "orientation": "horizontal", 84 | "style": "IPY_MODEL_3bf5037740d3416bbe7c53882e67a447", 85 | "value": 136 86 | } 87 | }, 88 | "794101aa3d194073ac116c66ca73fd41": { 89 | "model_module": "@jupyter-widgets/controls", 90 | "model_name": "HTMLModel", 91 | "model_module_version": "1.5.0", 92 | "state": { 93 | "_dom_classes": [], 94 | "_model_module": "@jupyter-widgets/controls", 95 | "_model_module_version": "1.5.0", 96 | "_model_name": "HTMLModel", 97 | "_view_count": null, 98 | "_view_module": "@jupyter-widgets/controls", 99 | "_view_module_version": "1.5.0", 100 | "_view_name": "HTMLView", 101 | "description": "", 102 | "description_tooltip": null, 103 | "layout": "IPY_MODEL_3ed28d63ed164447b6fcb61ead12b509", 104 | "placeholder": "​", 105 | "style": "IPY_MODEL_43746e3c771e49369d1a573506876bc4", 106 | "value": " 136/136 [00:00<00:00, 4.65kB/s]" 107 | } 108 | }, 109 | "cd24a5374c29462c9d8d70f19892dd56": { 110 | "model_module": "@jupyter-widgets/base", 111 | "model_name": 
"LayoutModel", 112 | "model_module_version": "1.2.0", 113 | "state": { 114 | "_model_module": "@jupyter-widgets/base", 115 | "_model_module_version": "1.2.0", 116 | "_model_name": "LayoutModel", 117 | "_view_count": null, 118 | "_view_module": "@jupyter-widgets/base", 119 | "_view_module_version": "1.2.0", 120 | "_view_name": "LayoutView", 121 | "align_content": null, 122 | "align_items": null, 123 | "align_self": null, 124 | "border": null, 125 | "bottom": null, 126 | "display": null, 127 | "flex": null, 128 | "flex_flow": null, 129 | "grid_area": null, 130 | "grid_auto_columns": null, 131 | "grid_auto_flow": null, 132 | "grid_auto_rows": null, 133 | "grid_column": null, 134 | "grid_gap": null, 135 | "grid_row": null, 136 | "grid_template_areas": null, 137 | "grid_template_columns": null, 138 | "grid_template_rows": null, 139 | "height": null, 140 | "justify_content": null, 141 | "justify_items": null, 142 | "left": null, 143 | "margin": null, 144 | "max_height": null, 145 | "max_width": null, 146 | "min_height": null, 147 | "min_width": null, 148 | "object_fit": null, 149 | "object_position": null, 150 | "order": null, 151 | "overflow": null, 152 | "overflow_x": null, 153 | "overflow_y": null, 154 | "padding": null, 155 | "right": null, 156 | "top": null, 157 | "visibility": null, 158 | "width": null 159 | } 160 | }, 161 | "2222a49475bf4a9b89dd760bd057c89a": { 162 | "model_module": "@jupyter-widgets/base", 163 | "model_name": "LayoutModel", 164 | "model_module_version": "1.2.0", 165 | "state": { 166 | "_model_module": "@jupyter-widgets/base", 167 | "_model_module_version": "1.2.0", 168 | "_model_name": "LayoutModel", 169 | "_view_count": null, 170 | "_view_module": "@jupyter-widgets/base", 171 | "_view_module_version": "1.2.0", 172 | "_view_name": "LayoutView", 173 | "align_content": null, 174 | "align_items": null, 175 | "align_self": null, 176 | "border": null, 177 | "bottom": null, 178 | "display": null, 179 | "flex": null, 180 | "flex_flow": null, 181 | 
"grid_area": null, 182 | "grid_auto_columns": null, 183 | "grid_auto_flow": null, 184 | "grid_auto_rows": null, 185 | "grid_column": null, 186 | "grid_gap": null, 187 | "grid_row": null, 188 | "grid_template_areas": null, 189 | "grid_template_columns": null, 190 | "grid_template_rows": null, 191 | "height": null, 192 | "justify_content": null, 193 | "justify_items": null, 194 | "left": null, 195 | "margin": null, 196 | "max_height": null, 197 | "max_width": null, 198 | "min_height": null, 199 | "min_width": null, 200 | "object_fit": null, 201 | "object_position": null, 202 | "order": null, 203 | "overflow": null, 204 | "overflow_x": null, 205 | "overflow_y": null, 206 | "padding": null, 207 | "right": null, 208 | "top": null, 209 | "visibility": null, 210 | "width": null 211 | } 212 | }, 213 | "6a9083cd5884400a95aebf432d236c67": { 214 | "model_module": "@jupyter-widgets/controls", 215 | "model_name": "DescriptionStyleModel", 216 | "model_module_version": "1.5.0", 217 | "state": { 218 | "_model_module": "@jupyter-widgets/controls", 219 | "_model_module_version": "1.5.0", 220 | "_model_name": "DescriptionStyleModel", 221 | "_view_count": null, 222 | "_view_module": "@jupyter-widgets/base", 223 | "_view_module_version": "1.2.0", 224 | "_view_name": "StyleView", 225 | "description_width": "" 226 | } 227 | }, 228 | "c0f685a98c274cb39ebf5ec6d1883906": { 229 | "model_module": "@jupyter-widgets/base", 230 | "model_name": "LayoutModel", 231 | "model_module_version": "1.2.0", 232 | "state": { 233 | "_model_module": "@jupyter-widgets/base", 234 | "_model_module_version": "1.2.0", 235 | "_model_name": "LayoutModel", 236 | "_view_count": null, 237 | "_view_module": "@jupyter-widgets/base", 238 | "_view_module_version": "1.2.0", 239 | "_view_name": "LayoutView", 240 | "align_content": null, 241 | "align_items": null, 242 | "align_self": null, 243 | "border": null, 244 | "bottom": null, 245 | "display": null, 246 | "flex": null, 247 | "flex_flow": null, 248 | "grid_area": null, 
249 | "grid_auto_columns": null, 250 | "grid_auto_flow": null, 251 | "grid_auto_rows": null, 252 | "grid_column": null, 253 | "grid_gap": null, 254 | "grid_row": null, 255 | "grid_template_areas": null, 256 | "grid_template_columns": null, 257 | "grid_template_rows": null, 258 | "height": null, 259 | "justify_content": null, 260 | "justify_items": null, 261 | "left": null, 262 | "margin": null, 263 | "max_height": null, 264 | "max_width": null, 265 | "min_height": null, 266 | "min_width": null, 267 | "object_fit": null, 268 | "object_position": null, 269 | "order": null, 270 | "overflow": null, 271 | "overflow_x": null, 272 | "overflow_y": null, 273 | "padding": null, 274 | "right": null, 275 | "top": null, 276 | "visibility": null, 277 | "width": null 278 | } 279 | }, 280 | "3bf5037740d3416bbe7c53882e67a447": { 281 | "model_module": "@jupyter-widgets/controls", 282 | "model_name": "ProgressStyleModel", 283 | "model_module_version": "1.5.0", 284 | "state": { 285 | "_model_module": "@jupyter-widgets/controls", 286 | "_model_module_version": "1.5.0", 287 | "_model_name": "ProgressStyleModel", 288 | "_view_count": null, 289 | "_view_module": "@jupyter-widgets/base", 290 | "_view_module_version": "1.2.0", 291 | "_view_name": "StyleView", 292 | "bar_color": null, 293 | "description_width": "" 294 | } 295 | }, 296 | "3ed28d63ed164447b6fcb61ead12b509": { 297 | "model_module": "@jupyter-widgets/base", 298 | "model_name": "LayoutModel", 299 | "model_module_version": "1.2.0", 300 | "state": { 301 | "_model_module": "@jupyter-widgets/base", 302 | "_model_module_version": "1.2.0", 303 | "_model_name": "LayoutModel", 304 | "_view_count": null, 305 | "_view_module": "@jupyter-widgets/base", 306 | "_view_module_version": "1.2.0", 307 | "_view_name": "LayoutView", 308 | "align_content": null, 309 | "align_items": null, 310 | "align_self": null, 311 | "border": null, 312 | "bottom": null, 313 | "display": null, 314 | "flex": null, 315 | "flex_flow": null, 316 | "grid_area": null, 
317 | "grid_auto_columns": null, 318 | "grid_auto_flow": null, 319 | "grid_auto_rows": null, 320 | "grid_column": null, 321 | "grid_gap": null, 322 | "grid_row": null, 323 | "grid_template_areas": null, 324 | "grid_template_columns": null, 325 | "grid_template_rows": null, 326 | "height": null, 327 | "justify_content": null, 328 | "justify_items": null, 329 | "left": null, 330 | "margin": null, 331 | "max_height": null, 332 | "max_width": null, 333 | "min_height": null, 334 | "min_width": null, 335 | "object_fit": null, 336 | "object_position": null, 337 | "order": null, 338 | "overflow": null, 339 | "overflow_x": null, 340 | "overflow_y": null, 341 | "padding": null, 342 | "right": null, 343 | "top": null, 344 | "visibility": null, 345 | "width": null 346 | } 347 | }, 348 | "43746e3c771e49369d1a573506876bc4": { 349 | "model_module": "@jupyter-widgets/controls", 350 | "model_name": "DescriptionStyleModel", 351 | "model_module_version": "1.5.0", 352 | "state": { 353 | "_model_module": "@jupyter-widgets/controls", 354 | "_model_module_version": "1.5.0", 355 | "_model_name": "DescriptionStyleModel", 356 | "_view_count": null, 357 | "_view_module": "@jupyter-widgets/base", 358 | "_view_module_version": "1.2.0", 359 | "_view_name": "StyleView", 360 | "description_width": "" 361 | } 362 | } 363 | } 364 | } 365 | }, 366 | "cells": [ 367 | { 368 | "cell_type": "markdown", 369 | "metadata": { 370 | "id": "view-in-github", 371 | "colab_type": "text" 372 | }, 373 | "source": [ 374 | "\"Open" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "source": [ 380 | "# Smollest VLM out there: SmolVLM 256M & 500M 🔥\n", 381 | "\n", 382 | "Check out the model checkpoints and spaces [here](https://huggingface.co/collections/HuggingFaceTB/smolvlm-256m-and-500m-6791fafc5bb0ab8acc960fb0)" 383 | ], 384 | "metadata": { 385 | "id": "B1Enw9pfXN50" 386 | } 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 1, 391 | "metadata": { 392 | "id": "XmSco3okWA0q" 393 | }, 
394 | "outputs": [], 395 | "source": [ 396 | "import torch\n", 397 | "from PIL import Image\n", 398 | "from transformers import AutoProcessor, AutoModelForVision2Seq\n", 399 | "from transformers.image_utils import load_image\n", 400 | "\n", 401 | "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "source": [ 407 | "## Load the Processor & the Model" 408 | ], 409 | "metadata": { 410 | "id": "ZMNgWos8ZyWp" 411 | } 412 | }, 413 | { 414 | "cell_type": "code", 415 | "source": [ 416 | "# Load images\n", 417 | "image = load_image(\"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg\")\n", 418 | "\n", 419 | "# Initialize processor and model\n", 420 | "processor = AutoProcessor.from_pretrained(\"HuggingFaceTB/SmolVLM-500M-Instruct\")\n", 421 | "model = AutoModelForVision2Seq.from_pretrained(\n", 422 | " \"HuggingFaceTB/SmolVLM-500M-Instruct\",\n", 423 | " torch_dtype=torch.bfloat16,\n", 424 | ").to(DEVICE)\n" 425 | ], 426 | "metadata": { 427 | "colab": { 428 | "base_uri": "https://localhost:8080/", 429 | "height": 74, 430 | "referenced_widgets": [ 431 | "79fbb885952040c18d4aa8ab66baa034", 432 | "bf0e759d410a482cbd4d3e2eaf9665a2", 433 | "ac87ee6d1c7e4595a2c362be34e52b3e", 434 | "794101aa3d194073ac116c66ca73fd41", 435 | "cd24a5374c29462c9d8d70f19892dd56", 436 | "2222a49475bf4a9b89dd760bd057c89a", 437 | "6a9083cd5884400a95aebf432d236c67", 438 | "c0f685a98c274cb39ebf5ec6d1883906", 439 | "3bf5037740d3416bbe7c53882e67a447", 440 | "3ed28d63ed164447b6fcb61ead12b509", 441 | "43746e3c771e49369d1a573506876bc4" 442 | ] 443 | }, 444 | "id": "3V_VnjSJWPOh", 445 | "outputId": "39bdd135-e37e-4187-8237-2c1483388a10" 446 | }, 447 | "execution_count": 3, 448 | "outputs": [ 449 | { 450 | "output_type": "stream", 451 | "name": "stderr", 452 | "text": [ 453 | "Some kwargs in processor config are unused and will not have any effect: image_seq_len. 
\n" 454 | ] 455 | }, 456 | { 457 | "output_type": "display_data", 458 | "data": { 459 | "text/plain": [ 460 | "generation_config.json: 0%| | 0.00/136 [00:00
Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file. " 64 | } 65 | }, 66 | "83e7115a09d144ab9ef10dd0c60c6a68": { 67 | "model_module": "@jupyter-widgets/controls", 68 | "model_name": "PasswordModel", 69 | "model_module_version": "1.5.0", 70 | "state": { 71 | "_dom_classes": [], 72 | "_model_module": "@jupyter-widgets/controls", 73 | "_model_module_version": "1.5.0", 74 | "_model_name": "PasswordModel", 75 | "_view_count": null, 76 | "_view_module": "@jupyter-widgets/controls", 77 | "_view_module_version": "1.5.0", 78 | "_view_name": "PasswordView", 79 | "continuous_update": true, 80 | "description": "Token:", 81 | "description_tooltip": null, 82 | "disabled": false, 83 | "layout": "IPY_MODEL_b1f2b510eb2b4e63b4ccc5e0c963da3d", 84 | "placeholder": "​", 85 | "style": "IPY_MODEL_fd6bd029bfeb4028b34b4ffa848e5ee7", 86 | "value": "" 87 | } 88 | }, 89 | "85b25d75e78b4cda8546259be94235b1": { 90 | "model_module": "@jupyter-widgets/controls", 91 | "model_name": "CheckboxModel", 92 | "model_module_version": "1.5.0", 93 | "state": { 94 | "_dom_classes": [], 95 | "_model_module": "@jupyter-widgets/controls", 96 | "_model_module_version": "1.5.0", 97 | "_model_name": "CheckboxModel", 98 | "_view_count": null, 99 | "_view_module": "@jupyter-widgets/controls", 100 | "_view_module_version": "1.5.0", 101 | "_view_name": "CheckboxView", 102 | "description": "Add token as git credential?", 103 | "description_tooltip": null, 104 | "disabled": false, 105 | "indent": true, 106 | "layout": "IPY_MODEL_7788f987f5ee48dbb63d3634b87309cd", 107 | "style": "IPY_MODEL_a28a135050b641b19d5d7bb12d666ae7", 108 | "value": true 109 | } 110 | }, 111 | "4a69efc93763470eb2afd6fe1c258076": { 112 | "model_module": "@jupyter-widgets/controls", 113 | "model_name": "ButtonModel", 114 | "model_module_version": "1.5.0", 115 | "state": { 116 | "_dom_classes": [], 117 | "_model_module": "@jupyter-widgets/controls", 118 | "_model_module_version": "1.5.0", 119 | 
"_model_name": "ButtonModel", 120 | "_view_count": null, 121 | "_view_module": "@jupyter-widgets/controls", 122 | "_view_module_version": "1.5.0", 123 | "_view_name": "ButtonView", 124 | "button_style": "", 125 | "description": "Login", 126 | "disabled": false, 127 | "icon": "", 128 | "layout": "IPY_MODEL_05ffd42668424ac09d052c325dbad8d3", 129 | "style": "IPY_MODEL_cb1a9f6bf78047ca88d90d31ed9ab1ee", 130 | "tooltip": "" 131 | } 132 | }, 133 | "b32ea47b26f44f0582aecf7cd1526f0e": { 134 | "model_module": "@jupyter-widgets/controls", 135 | "model_name": "HTMLModel", 136 | "model_module_version": "1.5.0", 137 | "state": { 138 | "_dom_classes": [], 139 | "_model_module": "@jupyter-widgets/controls", 140 | "_model_module_version": "1.5.0", 141 | "_model_name": "HTMLModel", 142 | "_view_count": null, 143 | "_view_module": "@jupyter-widgets/controls", 144 | "_view_module_version": "1.5.0", 145 | "_view_name": "HTMLView", 146 | "description": "", 147 | "description_tooltip": null, 148 | "layout": "IPY_MODEL_bea114e4787c4318a99072f76ab641bc", 149 | "placeholder": "​", 150 | "style": "IPY_MODEL_87390232ac4649ba9681b65beb48d17a", 151 | "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. 
" 152 | } 153 | }, 154 | "4b818f15db4d47ce910c4987c64501c7": { 155 | "model_module": "@jupyter-widgets/base", 156 | "model_name": "LayoutModel", 157 | "model_module_version": "1.2.0", 158 | "state": { 159 | "_model_module": "@jupyter-widgets/base", 160 | "_model_module_version": "1.2.0", 161 | "_model_name": "LayoutModel", 162 | "_view_count": null, 163 | "_view_module": "@jupyter-widgets/base", 164 | "_view_module_version": "1.2.0", 165 | "_view_name": "LayoutView", 166 | "align_content": null, 167 | "align_items": "center", 168 | "align_self": null, 169 | "border": null, 170 | "bottom": null, 171 | "display": "flex", 172 | "flex": null, 173 | "flex_flow": "column", 174 | "grid_area": null, 175 | "grid_auto_columns": null, 176 | "grid_auto_flow": null, 177 | "grid_auto_rows": null, 178 | "grid_column": null, 179 | "grid_gap": null, 180 | "grid_row": null, 181 | "grid_template_areas": null, 182 | "grid_template_columns": null, 183 | "grid_template_rows": null, 184 | "height": null, 185 | "justify_content": null, 186 | "justify_items": null, 187 | "left": null, 188 | "margin": null, 189 | "max_height": null, 190 | "max_width": null, 191 | "min_height": null, 192 | "min_width": null, 193 | "object_fit": null, 194 | "object_position": null, 195 | "order": null, 196 | "overflow": null, 197 | "overflow_x": null, 198 | "overflow_y": null, 199 | "padding": null, 200 | "right": null, 201 | "top": null, 202 | "visibility": null, 203 | "width": "50%" 204 | } 205 | }, 206 | "3ea81ae0965840ed8ad532ee02606d05": { 207 | "model_module": "@jupyter-widgets/base", 208 | "model_name": "LayoutModel", 209 | "model_module_version": "1.2.0", 210 | "state": { 211 | "_model_module": "@jupyter-widgets/base", 212 | "_model_module_version": "1.2.0", 213 | "_model_name": "LayoutModel", 214 | "_view_count": null, 215 | "_view_module": "@jupyter-widgets/base", 216 | "_view_module_version": "1.2.0", 217 | "_view_name": "LayoutView", 218 | "align_content": null, 219 | "align_items": null, 220 | 
"align_self": null, 221 | "border": null, 222 | "bottom": null, 223 | "display": null, 224 | "flex": null, 225 | "flex_flow": null, 226 | "grid_area": null, 227 | "grid_auto_columns": null, 228 | "grid_auto_flow": null, 229 | "grid_auto_rows": null, 230 | "grid_column": null, 231 | "grid_gap": null, 232 | "grid_row": null, 233 | "grid_template_areas": null, 234 | "grid_template_columns": null, 235 | "grid_template_rows": null, 236 | "height": null, 237 | "justify_content": null, 238 | "justify_items": null, 239 | "left": null, 240 | "margin": null, 241 | "max_height": null, 242 | "max_width": null, 243 | "min_height": null, 244 | "min_width": null, 245 | "object_fit": null, 246 | "object_position": null, 247 | "order": null, 248 | "overflow": null, 249 | "overflow_x": null, 250 | "overflow_y": null, 251 | "padding": null, 252 | "right": null, 253 | "top": null, 254 | "visibility": null, 255 | "width": null 256 | } 257 | }, 258 | "a1bfa88b478542a3a4b8773693355bb2": { 259 | "model_module": "@jupyter-widgets/controls", 260 | "model_name": "DescriptionStyleModel", 261 | "model_module_version": "1.5.0", 262 | "state": { 263 | "_model_module": "@jupyter-widgets/controls", 264 | "_model_module_version": "1.5.0", 265 | "_model_name": "DescriptionStyleModel", 266 | "_view_count": null, 267 | "_view_module": "@jupyter-widgets/base", 268 | "_view_module_version": "1.2.0", 269 | "_view_name": "StyleView", 270 | "description_width": "" 271 | } 272 | }, 273 | "b1f2b510eb2b4e63b4ccc5e0c963da3d": { 274 | "model_module": "@jupyter-widgets/base", 275 | "model_name": "LayoutModel", 276 | "model_module_version": "1.2.0", 277 | "state": { 278 | "_model_module": "@jupyter-widgets/base", 279 | "_model_module_version": "1.2.0", 280 | "_model_name": "LayoutModel", 281 | "_view_count": null, 282 | "_view_module": "@jupyter-widgets/base", 283 | "_view_module_version": "1.2.0", 284 | "_view_name": "LayoutView", 285 | "align_content": null, 286 | "align_items": null, 287 | "align_self": null, 
288 | "border": null, 289 | "bottom": null, 290 | "display": null, 291 | "flex": null, 292 | "flex_flow": null, 293 | "grid_area": null, 294 | "grid_auto_columns": null, 295 | "grid_auto_flow": null, 296 | "grid_auto_rows": null, 297 | "grid_column": null, 298 | "grid_gap": null, 299 | "grid_row": null, 300 | "grid_template_areas": null, 301 | "grid_template_columns": null, 302 | "grid_template_rows": null, 303 | "height": null, 304 | "justify_content": null, 305 | "justify_items": null, 306 | "left": null, 307 | "margin": null, 308 | "max_height": null, 309 | "max_width": null, 310 | "min_height": null, 311 | "min_width": null, 312 | "object_fit": null, 313 | "object_position": null, 314 | "order": null, 315 | "overflow": null, 316 | "overflow_x": null, 317 | "overflow_y": null, 318 | "padding": null, 319 | "right": null, 320 | "top": null, 321 | "visibility": null, 322 | "width": null 323 | } 324 | }, 325 | "fd6bd029bfeb4028b34b4ffa848e5ee7": { 326 | "model_module": "@jupyter-widgets/controls", 327 | "model_name": "DescriptionStyleModel", 328 | "model_module_version": "1.5.0", 329 | "state": { 330 | "_model_module": "@jupyter-widgets/controls", 331 | "_model_module_version": "1.5.0", 332 | "_model_name": "DescriptionStyleModel", 333 | "_view_count": null, 334 | "_view_module": "@jupyter-widgets/base", 335 | "_view_module_version": "1.2.0", 336 | "_view_name": "StyleView", 337 | "description_width": "" 338 | } 339 | }, 340 | "7788f987f5ee48dbb63d3634b87309cd": { 341 | "model_module": "@jupyter-widgets/base", 342 | "model_name": "LayoutModel", 343 | "model_module_version": "1.2.0", 344 | "state": { 345 | "_model_module": "@jupyter-widgets/base", 346 | "_model_module_version": "1.2.0", 347 | "_model_name": "LayoutModel", 348 | "_view_count": null, 349 | "_view_module": "@jupyter-widgets/base", 350 | "_view_module_version": "1.2.0", 351 | "_view_name": "LayoutView", 352 | "align_content": null, 353 | "align_items": null, 354 | "align_self": null, 355 | "border": 
null, 356 | "bottom": null, 357 | "display": null, 358 | "flex": null, 359 | "flex_flow": null, 360 | "grid_area": null, 361 | "grid_auto_columns": null, 362 | "grid_auto_flow": null, 363 | "grid_auto_rows": null, 364 | "grid_column": null, 365 | "grid_gap": null, 366 | "grid_row": null, 367 | "grid_template_areas": null, 368 | "grid_template_columns": null, 369 | "grid_template_rows": null, 370 | "height": null, 371 | "justify_content": null, 372 | "justify_items": null, 373 | "left": null, 374 | "margin": null, 375 | "max_height": null, 376 | "max_width": null, 377 | "min_height": null, 378 | "min_width": null, 379 | "object_fit": null, 380 | "object_position": null, 381 | "order": null, 382 | "overflow": null, 383 | "overflow_x": null, 384 | "overflow_y": null, 385 | "padding": null, 386 | "right": null, 387 | "top": null, 388 | "visibility": null, 389 | "width": null 390 | } 391 | }, 392 | "a28a135050b641b19d5d7bb12d666ae7": { 393 | "model_module": "@jupyter-widgets/controls", 394 | "model_name": "DescriptionStyleModel", 395 | "model_module_version": "1.5.0", 396 | "state": { 397 | "_model_module": "@jupyter-widgets/controls", 398 | "_model_module_version": "1.5.0", 399 | "_model_name": "DescriptionStyleModel", 400 | "_view_count": null, 401 | "_view_module": "@jupyter-widgets/base", 402 | "_view_module_version": "1.2.0", 403 | "_view_name": "StyleView", 404 | "description_width": "" 405 | } 406 | }, 407 | "05ffd42668424ac09d052c325dbad8d3": { 408 | "model_module": "@jupyter-widgets/base", 409 | "model_name": "LayoutModel", 410 | "model_module_version": "1.2.0", 411 | "state": { 412 | "_model_module": "@jupyter-widgets/base", 413 | "_model_module_version": "1.2.0", 414 | "_model_name": "LayoutModel", 415 | "_view_count": null, 416 | "_view_module": "@jupyter-widgets/base", 417 | "_view_module_version": "1.2.0", 418 | "_view_name": "LayoutView", 419 | "align_content": null, 420 | "align_items": null, 421 | "align_self": null, 422 | "border": null, 423 | 
"bottom": null, 424 | "display": null, 425 | "flex": null, 426 | "flex_flow": null, 427 | "grid_area": null, 428 | "grid_auto_columns": null, 429 | "grid_auto_flow": null, 430 | "grid_auto_rows": null, 431 | "grid_column": null, 432 | "grid_gap": null, 433 | "grid_row": null, 434 | "grid_template_areas": null, 435 | "grid_template_columns": null, 436 | "grid_template_rows": null, 437 | "height": null, 438 | "justify_content": null, 439 | "justify_items": null, 440 | "left": null, 441 | "margin": null, 442 | "max_height": null, 443 | "max_width": null, 444 | "min_height": null, 445 | "min_width": null, 446 | "object_fit": null, 447 | "object_position": null, 448 | "order": null, 449 | "overflow": null, 450 | "overflow_x": null, 451 | "overflow_y": null, 452 | "padding": null, 453 | "right": null, 454 | "top": null, 455 | "visibility": null, 456 | "width": null 457 | } 458 | }, 459 | "cb1a9f6bf78047ca88d90d31ed9ab1ee": { 460 | "model_module": "@jupyter-widgets/controls", 461 | "model_name": "ButtonStyleModel", 462 | "model_module_version": "1.5.0", 463 | "state": { 464 | "_model_module": "@jupyter-widgets/controls", 465 | "_model_module_version": "1.5.0", 466 | "_model_name": "ButtonStyleModel", 467 | "_view_count": null, 468 | "_view_module": "@jupyter-widgets/base", 469 | "_view_module_version": "1.2.0", 470 | "_view_name": "StyleView", 471 | "button_color": null, 472 | "font_weight": "" 473 | } 474 | }, 475 | "bea114e4787c4318a99072f76ab641bc": { 476 | "model_module": "@jupyter-widgets/base", 477 | "model_name": "LayoutModel", 478 | "model_module_version": "1.2.0", 479 | "state": { 480 | "_model_module": "@jupyter-widgets/base", 481 | "_model_module_version": "1.2.0", 482 | "_model_name": "LayoutModel", 483 | "_view_count": null, 484 | "_view_module": "@jupyter-widgets/base", 485 | "_view_module_version": "1.2.0", 486 | "_view_name": "LayoutView", 487 | "align_content": null, 488 | "align_items": null, 489 | "align_self": null, 490 | "border": null, 491 | 
"bottom": null, 492 | "display": null, 493 | "flex": null, 494 | "flex_flow": null, 495 | "grid_area": null, 496 | "grid_auto_columns": null, 497 | "grid_auto_flow": null, 498 | "grid_auto_rows": null, 499 | "grid_column": null, 500 | "grid_gap": null, 501 | "grid_row": null, 502 | "grid_template_areas": null, 503 | "grid_template_columns": null, 504 | "grid_template_rows": null, 505 | "height": null, 506 | "justify_content": null, 507 | "justify_items": null, 508 | "left": null, 509 | "margin": null, 510 | "max_height": null, 511 | "max_width": null, 512 | "min_height": null, 513 | "min_width": null, 514 | "object_fit": null, 515 | "object_position": null, 516 | "order": null, 517 | "overflow": null, 518 | "overflow_x": null, 519 | "overflow_y": null, 520 | "padding": null, 521 | "right": null, 522 | "top": null, 523 | "visibility": null, 524 | "width": null 525 | } 526 | }, 527 | "87390232ac4649ba9681b65beb48d17a": { 528 | "model_module": "@jupyter-widgets/controls", 529 | "model_name": "DescriptionStyleModel", 530 | "model_module_version": "1.5.0", 531 | "state": { 532 | "_model_module": "@jupyter-widgets/controls", 533 | "_model_module_version": "1.5.0", 534 | "_model_name": "DescriptionStyleModel", 535 | "_view_count": null, 536 | "_view_module": "@jupyter-widgets/base", 537 | "_view_module_version": "1.2.0", 538 | "_view_name": "StyleView", 539 | "description_width": "" 540 | } 541 | } 542 | } 543 | } 544 | }, 545 | "cells": [ 546 | { 547 | "cell_type": "markdown", 548 | "metadata": { 549 | "id": "view-in-github", 550 | "colab_type": "text" 551 | }, 552 | "source": [ 553 | "\"Open" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "source": [ 559 | "# Whisper Large inference in 8-bit mode\n", 560 | "\n", 561 | "For faster and memory efficient inference for large models. 
Read more about it [here](https://huggingface.co/blog/hf-bitsandbytes-integration)\n", 562 | "\n", 563 | "Compiled by: [Vaibhav (VB) Srivastav](https://twitter.com/reach_vb)" 564 | ], 565 | "metadata": { 566 | "id": "YnVYTxOBJMmR" 567 | } 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "source": [ 572 | "We'll first install the necessary packages. We need ffmpeg to decode `mp3` files from the CV11 dataset and transformers, bnb and accelerate to load the model in 8bit mode." 573 | ], 574 | "metadata": { 575 | "id": "KgGPly6_Lrhm" 576 | } 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": { 582 | "colab": { 583 | "base_uri": "https://localhost:8080/" 584 | }, 585 | "id": "JKOGmCPxrfJ4", 586 | "outputId": "b4a4a392-53ab-44aa-8616-047507375c87" 587 | }, 588 | "outputs": [ 589 | { 590 | "output_type": "stream", 591 | "name": "stdout", 592 | "text": [ 593 | "\r0% [Working]\r \rHit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease\n", 594 | "\r0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.\r0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [Connecting to security.ubu\r \rHit:2 http://archive.ubuntu.com/ubuntu bionic InRelease\n", 595 | "\r0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [Waiting for headers] [Wait\r \rHit:3 http://security.ubuntu.com/ubuntu bionic-security InRelease\n", 596 | "\r0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [Waiting for headers] [Wait\r \rHit:4 http://archive.ubuntu.com/ubuntu bionic-updates InRelease\n", 597 | "Hit:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease\n", 598 | "Hit:6 http://archive.ubuntu.com/ubuntu bionic-backports InRelease\n", 599 | "Ign:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease\n", 600 | "Hit:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease\n", 601 | "Hit:9 
https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release\n", 602 | "Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease\n", 603 | "Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease\n", 604 | "Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease\n", 605 | "Hit:13 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic InRelease\n", 606 | "Reading package lists... Done\n", 607 | "Hit:1 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease\n", 608 | "Hit:2 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease\n", 609 | "Hit:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease\n", 610 | "Hit:4 http://security.ubuntu.com/ubuntu bionic-security InRelease\n", 611 | "Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease\n", 612 | "Hit:6 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease\n", 613 | "Hit:7 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease\n", 614 | "Hit:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease\n", 615 | "Ign:9 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease\n", 616 | "Hit:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease\n", 617 | "Hit:11 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release\n", 618 | "Hit:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease\n", 619 | "Hit:13 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic InRelease\n", 620 | "Reading package lists... Done\n", 621 | "Building dependency tree \n", 622 | "Reading state information... Done\n", 623 | "32 packages can be upgraded. Run 'apt list --upgradable' to see them.\n", 624 | "Reading package lists... Done\n", 625 | "Building dependency tree \n", 626 | "Reading state information... 
Done\n", 627 | "ffmpeg is already the newest version (7:4.3.2-0york0~18.04).\n", 628 | "The following package was automatically installed and is no longer required:\n", 629 | " libnvidia-common-460\n", 630 | "Use 'apt autoremove' to remove it.\n", 631 | "0 upgraded, 0 newly installed, 0 to remove and 32 not upgraded.\n", 632 | " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", 633 | " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", 634 | " Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n" 635 | ] 636 | } 637 | ], 638 | "source": [ 639 | "!add-apt-repository -y ppa:jonathonf/ffmpeg-4 && apt update && apt install -y ffmpeg\n", 640 | "!pip install --quiet datasets git+https://github.com/huggingface/transformers evaluate huggingface_hub jiwer bitsandbytes accelerate" 641 | ] 642 | }, 643 | { 644 | "cell_type": "markdown", 645 | "source": [ 646 | "Since we will be running inference on the CV11 dataset, we need to authenticate ourselves (CV11 requires accepting its Terms and Conditions)." 647 | ], 648 | "metadata": { 649 | "id": "NvR6u52ZL9yb" 650 | } 651 | }, 652 | { 653 | "cell_type": "code", 654 | "source": [ 655 | "!git config --global credential.helper store\n", 656 | "from huggingface_hub import login\n", 657 | "\n", 658 | "login()" 659 | ], 660 | "metadata": { 661 | "colab": { 662 | "base_uri": "https://localhost:8080/", 663 | "height": 331, 664 | "referenced_widgets": [ 665 | "f7a390ea8adc42b3917a18aa20ac7b08", 666 | "9d22b9fb75264a64905c196405996b1f", 667 | "83e7115a09d144ab9ef10dd0c60c6a68", 668 | "85b25d75e78b4cda8546259be94235b1", 669 | "4a69efc93763470eb2afd6fe1c258076", 670 | "b32ea47b26f44f0582aecf7cd1526f0e", 671 | "4b818f15db4d47ce910c4987c64501c7", 672 | "3ea81ae0965840ed8ad532ee02606d05", 673 | "a1bfa88b478542a3a4b8773693355bb2", 674 | "b1f2b510eb2b4e63b4ccc5e0c963da3d", 675 | "fd6bd029bfeb4028b34b4ffa848e5ee7", 676 | "7788f987f5ee48dbb63d3634b87309cd", 677 | "a28a135050b641b19d5d7bb12d666ae7", 678 |
"05ffd42668424ac09d052c325dbad8d3", 679 | "cb1a9f6bf78047ca88d90d31ed9ab1ee", 680 | "bea114e4787c4318a99072f76ab641bc", 681 | "87390232ac4649ba9681b65beb48d17a" 682 | ] 683 | }, 684 | "id": "tBSPoZggrtc8", 685 | "outputId": "1d9fed6b-2345-4eb1-923d-c4624373cc7d" 686 | }, 687 | "execution_count": null, 688 | "outputs": [ 689 | { 690 | "output_type": "stream", 691 | "name": "stdout", 692 | "text": [ 693 | "Token is valid.\n", 694 | "Your token has been saved in your configured git credential helpers (store).\n", 695 | "Your token has been saved to /root/.huggingface/token\n", 696 | "Login successful\n" 697 | ] 698 | } 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "source": [ 704 | "To reduce the memory and time overhead, we'll load the dataset in streaming fashion. During the time of inference we'll stream one data point at a time. This is specially useful for larger datasets." 705 | ], 706 | "metadata": { 707 | "id": "nmQdxHxXMPNL" 708 | } 709 | }, 710 | { 711 | "cell_type": "code", 712 | "source": [ 713 | "from datasets import load_dataset\n", 714 | "\n", 715 | "dataset = load_dataset(\n", 716 | " \"mozilla-foundation/common_voice_11_0\", \"en\", revision=\"streaming\", split=\"test\", streaming=True, use_auth_token=True\n", 717 | ")" 718 | ], 719 | "metadata": { 720 | "id": "_MeHLH1Qrv6_" 721 | }, 722 | "execution_count": null, 723 | "outputs": [] 724 | }, 725 | { 726 | "cell_type": "markdown", 727 | "source": [ 728 | "Loading the model and processor in 8bit mode with `load_in_8bit=True`\n", 729 | "\n", 730 | "Note: This is the only change you need to make in order for you to run the model in 8bit mode." 
731 | ], 732 | "metadata": { 733 | "id": "Jk-efqH5MeFg" 734 | } 735 | }, 736 | { 737 | "cell_type": "code", 738 | "source": [ 739 | "import torch\n", 740 | "from transformers import WhisperForConditionalGeneration, WhisperProcessor\n", 741 | "\n", 742 | "model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-large\", device_map=\"auto\", load_in_8bit=True)\n", 743 | "processor = WhisperProcessor.from_pretrained(\"openai/whisper-large\", load_in_8bit=True)" 744 | ], 745 | "metadata": { 746 | "id": "UdiPC_3w_UyR" 747 | }, 748 | "execution_count": null, 749 | "outputs": [] 750 | }, 751 | { 752 | "cell_type": "markdown", 753 | "source": [ 754 | "Preprocess the dataset to be sampled at 16 kHz, since Whisper expects 16 kHz input." 755 | ], 756 | "metadata": { 757 | "id": "IJtaj3IpMwWF" 758 | } 759 | }, 760 | { 761 | "cell_type": "code", 762 | "source": [ 763 | "from datasets import Audio\n", 764 | "\n", 765 | "dataset = dataset.take(10)\n", 766 | "\n", 767 | "# resample to 16kHz\n", 768 | "dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=16000))" 769 | ], 770 | "metadata": { 771 | "id": "NN39fYPbEbkC" 772 | }, 773 | "execution_count": null, 774 | "outputs": [] 775 | }, 776 | { 777 | "cell_type": "markdown", 778 | "source": [ 779 | "Voila! Time to run the inference loop!"
780 | ], 781 | "metadata": { 782 | "id": "ZyXEYmF3M4C5" 783 | } 784 | }, 785 | { 786 | "cell_type": "code", 787 | "source": [ 788 | "%%time\n", 789 | "\n", 790 | "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", 791 | "\n", 792 | "for data in dataset:\n", 793 | " inputs = processor.feature_extractor(data[\"audio\"][\"array\"], return_tensors=\"pt\", sampling_rate=16_000).input_features.half().to(device)\n", 794 | " forced_decoder_ids = processor.get_decoder_prompt_ids(language=\"en\", task=\"transcribe\")\n", 795 | " predicted_ids = model.generate(inputs, forced_decoder_ids=forced_decoder_ids)\n", 796 | " text = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True, normalize=False)[0]\n", 797 | " print(text)" 798 | ], 799 | "metadata": { 800 | "colab": { 801 | "base_uri": "https://localhost:8080/" 802 | }, 803 | "id": "Sbwt2HOPEVpn", 804 | "outputId": "bfea6799-9226-4b80-dab1-39dbf936d5ab" 805 | }, 806 | "execution_count": null, 807 | "outputs": [ 808 | { 809 | "output_type": "stream", 810 | "name": "stderr", 811 | "text": [ 812 | "Reading metadata...: 16354it [00:00, 66038.23it/s]\n" 813 | ] 814 | }, 815 | { 816 | "output_type": "stream", 817 | "name": "stdout", 818 | "text": [ 819 | " Joe Keaton disapproved of films and Buster also had reservations about the medium.\n", 820 | " She'll be alright.\n", 821 | " Six.\n", 822 | " All is well that ends well.\n", 823 | " It is a busy market town that serves a large, surrounded area.\n", 824 | " the team had Olympic champion Carolina Marin in the squad for the season\n", 825 | " Do you mean it?\n", 826 | " The new patch is less invasive than the old one, but still causes regression.\n", 827 | " How is Mozilla going to handle ambiguities like Q and Q?\n", 828 | " Wish you a safe and happy holiday.\n", 829 | "CPU times: user 42.3 s, sys: 1.27 s, total: 43.6 s\n", 830 | "Wall time: 43.7 s\n" 831 | ] 832 | } 833 | ] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "source": [ 838 | 
"!nvidia-smi" 839 | ], 840 | "metadata": { 841 | "id": "Lq91uTlgM-Xp", 842 | "outputId": "c7e6aab2-9996-40e2-8855-669b6c69b705", 843 | "colab": { 844 | "base_uri": "https://localhost:8080/" 845 | } 846 | }, 847 | "execution_count": null, 848 | "outputs": [ 849 | { 850 | "output_type": "stream", 851 | "name": "stdout", 852 | "text": [ 853 | "Wed Dec 7 15:28:37 2022 \n", 854 | "+-----------------------------------------------------------------------------+\n", 855 | "| NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 856 | "|-------------------------------+----------------------+----------------------+\n", 857 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 858 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 859 | "| | | MIG M. |\n", 860 | "|===============================+======================+======================|\n", 861 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 862 | "| N/A 57C P0 29W / 70W | 6410MiB / 15109MiB | 0% Default |\n", 863 | "| | | N/A |\n", 864 | "+-------------------------------+----------------------+----------------------+\n", 865 | " \n", 866 | "+-----------------------------------------------------------------------------+\n", 867 | "| Processes: |\n", 868 | "| GPU GI CI PID Type Process name GPU Memory |\n", 869 | "| ID ID Usage |\n", 870 | "|=============================================================================|\n", 871 | "+-----------------------------------------------------------------------------+\n" 872 | ] 873 | } 874 | ] 875 | } 876 | ] 877 | } -------------------------------------------------------------------------------- /Whisper_translate_with_🤗transformers_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyNc6C55bNNEjls6hK10Usqh", 8 | "include_colab_link": true 9 
| }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "colab": { 34 | "base_uri": "https://localhost:8080/" 35 | }, 36 | "id": "oW1KgNjEgtvZ", 37 | "outputId": "403edb60-8922-4d1b-ff04-d2b291efc89d" 38 | }, 39 | "outputs": [ 40 | { 41 | "output_type": "stream", 42 | "name": "stdout", 43 | "text": [ 44 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 45 | "Requirement already satisfied: transformers in /usr/local/lib/python3.8/dist-packages (4.26.1)\n", 46 | "Requirement already satisfied: datasets in /usr/local/lib/python3.8/dist-packages (2.9.0)\n", 47 | "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.8/dist-packages (from transformers) (1.21.6)\n", 48 | "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.8/dist-packages (from transformers) (6.0)\n", 49 | "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.8/dist-packages (from transformers) (0.12.0)\n", 50 | "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.8/dist-packages (from transformers) (2022.6.2)\n", 51 | "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.8/dist-packages (from transformers) (0.13.2)\n", 52 | "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.8/dist-packages (from transformers) (4.64.1)\n", 53 | "Requirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from transformers) (2.25.1)\n", 54 | "Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from transformers) 
(3.9.0)\n", 55 | "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.8/dist-packages (from transformers) (23.0)\n", 56 | "Requirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.8/dist-packages (from datasets) (9.0.0)\n", 57 | "Requirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.8/dist-packages (from datasets) (2023.1.0)\n", 58 | "Requirement already satisfied: dill<0.3.7 in /usr/local/lib/python3.8/dist-packages (from datasets) (0.3.6)\n", 59 | "Requirement already satisfied: multiprocess in /usr/local/lib/python3.8/dist-packages (from datasets) (0.70.14)\n", 60 | "Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from datasets) (1.3.5)\n", 61 | "Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.8/dist-packages (from datasets) (0.18.0)\n", 62 | "Requirement already satisfied: aiohttp in /usr/local/lib/python3.8/dist-packages (from datasets) (3.8.3)\n", 63 | "Requirement already satisfied: xxhash in /usr/local/lib/python3.8/dist-packages (from datasets) (3.2.0)\n", 64 | "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.8.2)\n", 65 | "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (22.2.0)\n", 66 | "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (2.1.1)\n", 67 | "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (4.0.2)\n", 68 | "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.1)\n", 69 | "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.3)\n", 70 | "Requirement already satisfied: multidict<7.0,>=4.5 in 
/usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (6.0.4)\n", 71 | "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.8/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.4.0)\n", 72 | "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (1.26.14)\n", 73 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (2022.12.7)\n", 74 | "Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (4.0.0)\n", 75 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (2.10)\n", 76 | "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets) (2022.7.1)\n", 77 | "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets) (2.8.2)\n", 78 | "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "!pip install transformers datasets" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "source": [ 89 | "from transformers import pipeline\n", 90 | "from datasets import load_dataset" 91 | ], 92 | "metadata": { 93 | "id": "BLCTDydZgwvJ" 94 | }, 95 | "execution_count": 25, 96 | "outputs": [] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "source": [ 101 | "pipe = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-small\", generate_kwargs={\"task\": \"translate\", \"language\": \"german\"})" 102 | ], 103 | "metadata": { 104 | "id": "_fGRIMlMhF_A" 105 | }, 106 | "execution_count": 26, 107 | "outputs": [] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "source": [ 112 | "cv11 = 
load_dataset(\"mozilla-foundation/common_voice_11_0\", \"de\", streaming=True, split=\"test\")" 113 | ], 114 | "metadata": { 115 | "id": "GbIJMykEEuVO" 116 | }, 117 | "execution_count": 27, 118 | "outputs": [] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "source": [ 123 | "next(iter(cv11))" 124 | ], 125 | "metadata": { 126 | "colab": { 127 | "base_uri": "https://localhost:8080/" 128 | }, 129 | "id": "-xP3eHKDLVwS", 130 | "outputId": "f147a031-3cc4-4e1f-de2a-424bea9e4a05" 131 | }, 132 | "execution_count": 31, 133 | "outputs": [ 134 | { 135 | "output_type": "stream", 136 | "name": "stderr", 137 | "text": [ 138 | "Reading metadata...: 16082it [00:00, 24809.25it/s]\n" 139 | ] 140 | }, 141 | { 142 | "output_type": "execute_result", 143 | "data": { 144 | "text/plain": [ 145 | "{'client_id': '0052c07533a6976233ad5926d950b523002c4d8cdd9ae8726dbfec385951bd22aa707a742c49afe20c7d6cb9515dbaddac5b4d6fe8ebddcfbec46a2d3180a3a1',\n", 146 | " 'path': 'common_voice_de_17922420.mp3',\n", 147 | " 'audio': {'path': 'common_voice_de_17922420.mp3',\n", 148 | " 'array': array([ 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,\n", 149 | " -9.0749630e-12, 5.6385865e-09, 7.3282314e-09], dtype=float32),\n", 150 | " 'sampling_rate': 48000},\n", 151 | " 'sentence': 'Zieht euch bitte draußen die Schuhe aus.',\n", 152 | " 'up_votes': 2,\n", 153 | " 'down_votes': 0,\n", 154 | " 'age': '',\n", 155 | " 'gender': '',\n", 156 | " 'accent': '',\n", 157 | " 'locale': 'de',\n", 158 | " 'segment': ''}" 159 | ] 160 | }, 161 | "metadata": {}, 162 | "execution_count": 31 163 | } 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "source": [ 169 | "test_speech = {\"raw\": next(iter(cv11))[\"audio\"][\"array\"],\n", 170 | " \"sampling_rate\": next(iter(cv11))[\"audio\"][\"sampling_rate\"]}" 171 | ], 172 | "metadata": { 173 | "colab": { 174 | "base_uri": "https://localhost:8080/" 175 | }, 176 | "id": "ZZlzI4iWFD7C", 177 | "outputId": "f390e005-9720-4e8d-8d65-61f3d3714b7b" 178 | }, 179 | 
"execution_count": 28, 180 | "outputs": [ 181 | { 182 | "output_type": "stream", 183 | "name": "stderr", 184 | "text": [ 185 | "Reading metadata...: 16082it [00:00, 23627.15it/s]\n", 186 | "Reading metadata...: 16082it [00:00, 32748.99it/s]\n" 187 | ] 188 | } 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "source": [ 194 | "pipe(test_speech, return_timestamps=True, chunk_length_s=30, stride_length_s=[6,0])" 195 | ], 196 | "metadata": { 197 | "colab": { 198 | "base_uri": "https://localhost:8080/" 199 | }, 200 | "id": "1RDKFmrViDGP", 201 | "outputId": "84d38b06-8639-4285-86d2-7b40f0497ca1" 202 | }, 203 | "execution_count": 30, 204 | "outputs": [ 205 | { 206 | "output_type": "execute_result", 207 | "data": { 208 | "text/plain": [ 209 | "{'text': ' Please take off your shoes.',\n", 210 | " 'chunks': [{'text': ' Please take off your shoes.', 'timestamp': (0.0, 3.0)}]}" 211 | ] 212 | }, 213 | "metadata": {}, 214 | "execution_count": 30 215 | } 216 | ] 217 | } 218 | ] 219 | } -------------------------------------------------------------------------------- /deepseek_r1_distill_qwen1_5B_transformers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4", 8 | "authorship_tag": "ABX9TyMaHXlZf4FF/2AbgPjQfxrR", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | }, 18 | "accelerator": "GPU" 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "view-in-github", 25 | "colab_type": "text" 26 | }, 27 | "source": [ 28 | "\"Open" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "source": [ 34 | "# Run DeepSeek R1 Distill Qwen 1.5B in FREE Google Colab\n", 35 | "\n", 36 | "Powered by Transformers and DeepSeek! 
❤️" 37 | ], 38 | "metadata": { 39 | "id": "6dxqPSkyeDoO" 40 | } 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "source": [ 45 | "## Download the model checkpoint\n", 46 | "\n", 47 | "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" 48 | ], 49 | "metadata": { 50 | "id": "4vwR5z2LeN8h" 51 | } 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 7, 56 | "metadata": { 57 | "id": "uU4FwZWVbgdO" 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n", 62 | "\n", 63 | "model_name = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n", 64 | "\n", 65 | "model = AutoModelForCausalLM.from_pretrained(\n", 66 | " model_name,).to(\"cuda\")\n", 67 | "tokenizer = AutoTokenizer.from_pretrained(model_name)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "source": [ 73 | "## Provide a prompt & generation parameters" 74 | ], 75 | "metadata": { 76 | "id": "TH2jLGUHeSty" 77 | } 78 | }, 79 | { 80 | "cell_type": "code", 81 | "source": [ 82 | "prompt = \"write an efficient alogirthm for sorting a 2 dimensional array\"\n", 83 | "messages = [\n", 84 | " {\"role\": \"system\", \"content\": \"You are an extremely focused and to the point assistant.\"},\n", 85 | " {\"role\": \"user\", \"content\": prompt}\n", 86 | "]\n", 87 | "text = tokenizer.apply_chat_template(\n", 88 | " messages,\n", 89 | " tokenize=False,\n", 90 | " add_generation_prompt=True\n", 91 | ")\n", 92 | "model_inputs = tokenizer([text], return_tensors=\"pt\").to(model.device)" 93 | ], 94 | "metadata": { 95 | "id": "A122cuXIcAUu" 96 | }, 97 | "execution_count": 8, 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "source": [ 103 | "## Generate text" 104 | ], 105 | "metadata": { 106 | "id": "ImQgJ188eZ3X" 107 | } 108 | }, 109 | { 110 | "cell_type": "code", 111 | "source": [ 112 | "generated_ids = model.generate(\n", 113 | " **model_inputs,\n", 114 | " max_new_tokens=2048\n", 115 | ")\n", 116 | "generated_ids = 
[\n", 117 | " output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)\n", 118 | "]" 119 | ], 120 | "metadata": { 121 | "colab": { 122 | "base_uri": "https://localhost:8080/" 123 | }, 124 | "id": "pGg5GtgCcDCM", 125 | "outputId": "c6eeafd7-b1c3-4eaa-ad8e-f98bc77ca534" 126 | }, 127 | "execution_count": 9, 128 | "outputs": [ 129 | { 130 | "output_type": "stream", 131 | "name": "stderr", 132 | "text": [ 133 | "Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.\n" 134 | ] 135 | } 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "source": [ 141 | "## Decode response" 142 | ], 143 | "metadata": { 144 | "id": "8aIGWhNwefE-" 145 | } 146 | }, 147 | { 148 | "cell_type": "code", 149 | "source": [ 150 | "response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n" 151 | ], 152 | "metadata": { 153 | "id": "EfunHq9IcEH0" 154 | }, 155 | "execution_count": 11, 156 | "outputs": [] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "source": [ 161 | "## Voila, enjoy the response!" 162 | ], 163 | "metadata": { 164 | "id": "RjUJoLlTehLA" 165 | } 166 | }, 167 | { 168 | "cell_type": "code", 169 | "source": [ 170 | "print(response)" 171 | ], 172 | "metadata": { 173 | "colab": { 174 | "base_uri": "https://localhost:8080/" 175 | }, 176 | "id": "_dQ4sBtoct_R", 177 | "outputId": "8295138b-3d63-45d8-8090-f2e32eb718b0" 178 | }, 179 | "execution_count": 12, 180 | "outputs": [ 181 | { 182 | "output_type": "stream", 183 | "name": "stdout", 184 | "text": [ 185 | "\n", 186 | "Okay, so I need to figure out how to write an efficient algorithm for sorting a 2-dimensional array. Hmm, let's start by understanding what exactly is being asked here. The user wants an algorithm that can sort a 2D array, but I'm not entirely sure if they mean a 2D array of numbers or something else. Maybe it's a list of lists, where each sublist is a row. 
I should clarify that in my response.\n", 187 | "\n", 188 | "Alright, assuming it's a 2D array where each element is a number, I need to think about the best sorting algorithms for this. I know that for a single list, the most efficient sorting algorithms are typically O(n log n), like merge sort or quicksort. But since this is a 2D array, I have to consider how to sort it efficiently.\n", 189 | "\n", 190 | "One approach is to sort each row individually. That would mean applying a sorting algorithm to each sublist. But if the rows are of different lengths, that could cause issues. Wait, in a 2D array, are all rows of the same length? I think in most cases, yes, but I should consider that possibility.\n", 191 | "\n", 192 | "Another idea is to sort the entire array as a 2D structure. That would involve comparing elements across rows and columns. For example, sorting based on the first element, then the second, and so on. This is similar to how you sort a list of tuples in Python using the default sort, which compares elements lexicographically.\n", 193 | "\n", 194 | "I should also think about the time complexity. If I sort each row individually, the time complexity would be O(m * n log n), where m is the number of rows and n is the average number of elements per row. If the rows are of varying lengths, this could be inefficient. On the other hand, sorting the entire array as a 2D structure would have a time complexity of O(m * n^2 log n), which is worse.\n", 195 | "\n", 196 | "So, which approach is better? If the rows are of similar lengths and the sorting is done element-wise, sorting each row individually might be more efficient. But if the rows are of different lengths, the entire array approach would be better.\n", 197 | "\n", 198 | "Wait, the user didn't specify whether the array is a list of lists or a single list. I should clarify that. If it's a single list, then the algorithm would be O(n log n). 
If it's a 2D array, then it depends on the structure.\n", 199 | "\n", 200 | "I think the user is referring to a 2D array, so I should proceed with that assumption. Therefore, the algorithm should be able to handle a 2D array and sort it efficiently. I'll outline the steps for both approaches: sorting each row individually and sorting the entire array as a 2D structure.\n", 201 | "\n", 202 | "I should also mention that the choice between the two depends on the specific requirements, like the size of the array and the desired time complexity. For most cases, sorting each row individually might be sufficient and easier to implement.\n", 203 | "\n", 204 | "Finally, I'll provide a code example for both methods to illustrate how they can be implemented in Python. This way, the user can choose the one that best fits their needs.\n", 205 | "\n", 206 | "\n", 207 | "To sort a 2-dimensional array efficiently, you can choose between two approaches: sorting each row individually or sorting the entire array as a 2D structure. Here's how you can implement each method:\n", 208 | "\n", 209 | "### 1. Sort Each Row Individually\n", 210 | "This approach involves applying a sorting algorithm to each sublist (row) of the 2D array. This is efficient if the rows are of similar lengths and the sorting is done element-wise.\n", 211 | "\n", 212 | "**Algorithm:**\n", 213 | "1. For each row in the 2D array:\n", 214 | " - Apply a sorting algorithm (e.g., quicksort, mergesort, or a built-in sort function) to the row.\n", 215 | "2. Return the modified 2D array.\n", 216 | "\n", 217 | "**Python Code Example:**\n", 218 | "```python\n", 219 | "def sort_rows(arr):\n", 220 | " if not arr:\n", 221 | " return []\n", 222 | " for row in arr:\n", 223 | " row.sort()\n", 224 | " return arr\n", 225 | "\n", 226 | "# Example usage:\n", 227 | "arr = [[3, 1, 2], [4, 5, 6], [7, 8, 9]]\n", 228 | "sorted_arr = sort_rows(arr)\n", 229 | "print(sorted_arr)\n", 230 | "```\n", 231 | "\n", 232 | "### 2. 
Sort the Entire 2D Array\n", 233 | "This approach involves sorting the entire array as a 2D structure, which can be done lexicographically (element-wise comparison).\n", 234 | "\n", 235 | "**Algorithm:**\n", 236 | "1. Sort the entire 2D array using a sorting algorithm that compares elements across rows and columns.\n", 237 | "2. Return the sorted 2D array.\n", 238 | "\n", 239 | "**Python Code Example:**\n", 240 | "```python\n", 241 | "def sort_2d_array(arr):\n", 242 | " return sorted(arr)\n", 243 | "\n", 244 | "# Example usage:\n", 245 | "arr = [[3, 1, 2], [4, 5, 6], [7, 8, 9]]\n", 246 | "sorted_arr = sort_2d_array(arr)\n", 247 | "print(sorted_arr)\n", 248 | "```\n", 249 | "\n", 250 | "### Choosing the Appropriate Method\n", 251 | "- **Sorting Each Row Individually:** More efficient if rows are of similar lengths and the sorting is done element-wise.\n", 252 | "- **Sorting the Entire 2D Array:** More efficient if the rows are of varying lengths and the entire array needs to be sorted lexicographically.\n", 253 | "\n", 254 | "### Conclusion\n", 255 | "The choice between the two methods depends on the specific requirements of your use case. If rows are of similar lengths and element-wise sorting is sufficient, sorting each row individually is more efficient. 
If the rows are of varying lengths and a lexicographic sort is needed, sorting the entire array as a 2D structure is more appropriate.\n" 256 | ] 257 | } 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "source": [], 263 | "metadata": { 264 | "id": "YQQXrzuBcvo-" 265 | }, 266 | "execution_count": null, 267 | "outputs": [] 268 | } 269 | ] 270 | } -------------------------------------------------------------------------------- /insanely_fast_whisper_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4", 8 | "authorship_tag": "ABX9TyNO3mkZ+HMQrvkMHRtFpKvj", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | }, 18 | "accelerator": "GPU" 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "view-in-github", 25 | "colab_type": "text" 26 | }, 27 | "source": [ 28 | "\"Open" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "source": [ 34 | "# [Insanely Fast Whisper](https://github.com/Vaibhavs10/insanely-fast-whisper)\n", 35 | "\n", 36 | "By VB (https://twitter.com/reach_vb)\n", 37 | "\n", 38 | "P.S. 
Make sure you're on a GPU run-time 🤗" 39 | ], 40 | "metadata": { 41 | "id": "q0MBgZKbhdII" 42 | } 43 | }, 44 | { 45 | "cell_type": "code", 46 | "source": [ 47 | "!pip install -q pipx && apt install python3.10-venv" 48 | ], 49 | "metadata": { 50 | "colab": { 51 | "base_uri": "https://localhost:8080/" 52 | }, 53 | "id": "VF-qp-FWJmyD", 54 | "outputId": "10712868-be6e-4b82-b8c2-95e43c591173" 55 | }, 56 | "execution_count": 1, 57 | "outputs": [ 58 | { 59 | "output_type": "stream", 60 | "name": "stdout", 61 | "text": [ 62 | "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/57.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.8/57.8 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 63 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.7/41.7 kB\u001b[0m \u001b[31m5.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 64 | "Reading package lists... Done\n", 65 | "Building dependency tree... Done\n", 66 | "Reading state information... 
Done\n", 67 | "The following additional packages will be installed:\n", 68 | " python3-pip-whl python3-setuptools-whl\n", 69 | "The following NEW packages will be installed:\n", 70 | " python3-pip-whl python3-setuptools-whl python3.10-venv\n", 71 | "0 upgraded, 3 newly installed, 0 to remove and 9 not upgraded.\n", 72 | "Need to get 2,473 kB of archives.\n", 73 | "After this operation, 2,884 kB of additional disk space will be used.\n", 74 | "Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 python3-pip-whl all 22.0.2+dfsg-1ubuntu0.4 [1,680 kB]\n", 75 | "Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 python3-setuptools-whl all 59.6.0-1.2ubuntu0.22.04.1 [788 kB]\n", 76 | "Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 python3.10-venv amd64 3.10.12-1~22.04.2 [5,724 B]\n", 77 | "Fetched 2,473 kB in 2s (1,635 kB/s)\n", 78 | "Selecting previously unselected package python3-pip-whl.\n", 79 | "(Reading database ... 120880 files and directories currently installed.)\n", 80 | "Preparing to unpack .../python3-pip-whl_22.0.2+dfsg-1ubuntu0.4_all.deb ...\n", 81 | "Unpacking python3-pip-whl (22.0.2+dfsg-1ubuntu0.4) ...\n", 82 | "Selecting previously unselected package python3-setuptools-whl.\n", 83 | "Preparing to unpack .../python3-setuptools-whl_59.6.0-1.2ubuntu0.22.04.1_all.deb ...\n", 84 | "Unpacking python3-setuptools-whl (59.6.0-1.2ubuntu0.22.04.1) ...\n", 85 | "Selecting previously unselected package python3.10-venv.\n", 86 | "Preparing to unpack .../python3.10-venv_3.10.12-1~22.04.2_amd64.deb ...\n", 87 | "Unpacking python3.10-venv (3.10.12-1~22.04.2) ...\n", 88 | "Setting up python3-setuptools-whl (59.6.0-1.2ubuntu0.22.04.1) ...\n", 89 | "Setting up python3-pip-whl (22.0.2+dfsg-1ubuntu0.4) ...\n", 90 | "Setting up python3.10-venv (3.10.12-1~22.04.2) ...\n" 91 | ] 92 | } 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "source": [ 98 | "!pipx run insanely-fast-whisper --file-name 
https://huggingface.co/datasets/reach-vb/random-audios/resolve/main/ted_60.wav" 99 | ], 100 | "metadata": { 101 | "colab": { 102 | "base_uri": "https://localhost:8080/" 103 | }, 104 | "id": "i_H9Dm89Jj0-", 105 | "outputId": "f737b9fd-d625-4ccd-d8a1-1895cdf1b22f" 106 | }, 107 | "execution_count": 2, 108 | "outputs": [ 109 | { 110 | "output_type": "stream", 111 | "name": "stdout", 112 | "text": [ 113 | "config.json: 100% 1.25k/1.25k [00:00<00:00, 6.33MB/s]\n", 114 | "model.safetensors: 100% 3.09G/3.09G [00:12<00:00, 242MB/s]\n", 115 | "generation_config.json: 100% 3.87k/3.87k [00:00<00:00, 17.3MB/s]\n", 116 | "tokenizer_config.json: 100% 283k/283k [00:00<00:00, 2.15MB/s]\n", 117 | "vocab.json: 100% 1.04M/1.04M [00:00<00:00, 5.28MB/s]\n", 118 | "tokenizer.json: 100% 2.48M/2.48M [00:00<00:00, 9.49MB/s]\n", 119 | "merges.txt: 100% 494k/494k [00:00<00:00, 3.74MB/s]\n", 120 | "normalizer.json: 100% 52.7k/52.7k [00:00<00:00, 97.3MB/s]\n", 121 | "added_tokens.json: 100% 34.6k/34.6k [00:00<00:00, 110MB/s]\n", 122 | "special_tokens_map.json: 100% 2.07k/2.07k [00:00<00:00, 8.95MB/s]\n", 123 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", 124 | "preprocessor_config.json: 100% 340/340 [00:00<00:00, 1.98MB/s]\n", 125 | "The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. 
Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.\n", 126 | "\u001b[2K🤗 \u001b[33mTranscribing...\u001b[0m \u001b[37m━\u001b[0m\u001b[37m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[37m━\u001b[0m\u001b[37m━\u001b[0m\u001b[37m━\u001b[0m\u001b[37m━\u001b[0m\u001b[37m━\u001b[0m\u001b[37m━\u001b[0m\u001b[37m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[93m━\u001b[0m\u001b[37m━\u001b[0m\u001b[37m━\u001b[0m\u001b[37m━\u001b[0m\u001b[37m━\u001b[0m\u001b[37m━\u001b[0m \u001b[33m0:00:09\u001b[0m\n", 127 | "\u001b[?25hVoila! Your file has been transcribed go check it out over here! output.json\n" 128 | ] 129 | } 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "source": [ 135 | "!head output.json" 136 | ], 137 | "metadata": { 138 | "colab": { 139 | "base_uri": "https://localhost:8080/" 140 | }, 141 | "id": "NDFrydpsvu57", 142 | "outputId": "de3d9635-5cf1-46ca-d401-e6c78c5659dc" 143 | }, 144 | "execution_count": 4, 145 | "outputs": [ 146 | { 147 | "output_type": "stream", 148 | "name": "stdout", 149 | "text": [ 150 | "{\"text\": \" So in college, I was a government major, which means I had to write a lot of papers. Now, when a normal student writes a paper, they might spread the work out a little like this. So, you know, you get started maybe a little slowly, but you get enough done in the first week that with some heavier days later on, everything gets done and things stay civil. And I would want to do that, like that. That would be the plan. 
I would have it all ready to go, but then actually the paper would come along, and then I would kind of do this. And that would happen to every single paper. But then came my 90-page senior thesis, a paper you're supposed to spend a year on. I knew for a paper like that, my normal workflow was not an option. It was way too big a project. So I planned things out, and I decided it kind of had to go something like this. This is how the year would go. So I'd start off light,\", \"chunks\": [{\"timestamp\": [0.0, 4.48], \"text\": \" So in college, I was a government major,\"}, {\"timestamp\": [4.88, 6.62], \"text\": \" which means I had to write a lot of papers.\"}, {\"timestamp\": [7.42, 8.86], \"text\": \" Now, when a normal student writes a paper,\"}, {\"timestamp\": [8.94, 10.6], \"text\": \" they might spread the work out a little like this.\"}, {\"timestamp\": [11.74, 16.3], \"text\": \" So, you know, you get started maybe a little slowly,\"}, {\"timestamp\": [16.36, 17.86], \"text\": \" but you get enough done in the first week\"}, {\"timestamp\": [17.86, 19.76], \"text\": \" that with some heavier days later on,\"}, {\"timestamp\": [20.28, 21.98], \"text\": \" everything gets done and things stay civil.\"}, {\"timestamp\": [23.64, 25.8], \"text\": \" And I would want to do that, like that.\"}, {\"timestamp\": [26.12, 26.94], \"text\": \" That would be the plan.\"}, {\"timestamp\": [27.22, 29.84], \"text\": \" I would have it all ready to go,\"}, {\"timestamp\": [29.96, 32.42], \"text\": \" but then actually the paper would come along,\"}, {\"timestamp\": [32.46, 33.6], \"text\": \" and then I would kind of do this.\"}, {\"timestamp\": [36.48, 38.44], \"text\": \" And that would happen to every single paper.\"}, {\"timestamp\": [39.32, 43.04], \"text\": \" But then came my 90-page senior thesis,\"}, {\"timestamp\": [43.54, 46.0], \"text\": \" a paper you're supposed to spend a year on.\"}, {\"timestamp\": [46.0, 50.0], \"text\": \" I knew for a paper like that, 
my normal workflow was not an option.\"}, {\"timestamp\": [50.0, 52.0], \"text\": \" It was way too big a project.\"}, {\"timestamp\": [52.0, 56.0], \"text\": \" So I planned things out, and I decided it kind of had to go something like this.\"}, {\"timestamp\": [56.0, 58.0], \"text\": \" This is how the year would go.\"}, {\"timestamp\": [58.0, 60.0], \"text\": \" So I'd start off light,\"}]}" 151 | ] 152 | } 153 | ] 154 | } 155 | ] 156 | } -------------------------------------------------------------------------------- /orpheus-pretrained-inference-demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "source": [ 16 | "model_name = \"canopylabs/orpheus-3b-0.1-pretrained\"\n", 17 | "\n", 18 | "print(\"*** Change the model you use here\")" 19 | ], 20 | "metadata": { 21 | "id": "my_UA_HRu2tK", 22 | "colab": { 23 | "base_uri": "https://localhost:8080/" 24 | }, 25 | "outputId": "ae9d7e55-dcad-481a-8b8c-b1d71ad5565c" 26 | }, 27 | "execution_count": null, 28 | "outputs": [ 29 | { 30 | "output_type": "stream", 31 | "name": "stdout", 32 | "text": [ 33 | "*** Change the model you use here\n" 34 | ] 35 | } 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "id": "lHWzRUCDcyMx", 43 | "cellView": "form" 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "#@title Installation & Setup\n", 48 | "%%capture\n", 49 | "!pip install snac ipywebrtc\n", 50 | "!pip install datasets\n", 51 | "from snac import SNAC\n", 52 | "import torch\n", 53 | "import torch\n", 54 | "from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer\n", 55 | "import numpy as np\n", 56 | "import soundfile as sf\n", 57 | "import IPython.display as ipd\n", 58 | "import librosa\n", 59 
| "from ipywebrtc import AudioRecorder, Audio\n", 60 | "from IPython.display import display\n", 61 | "import ipywidgets as widgets\n", 62 | "from huggingface_hub import snapshot_download\n", 63 | "import torchaudio.transforms as T\n", 64 | "import librosa\n", 65 | "import torch\n", 66 | "from IPython.display import Audio, display\n", 67 | "\n", 68 | "model_name = \"canopylabs/orpheus-tts-0.1-pretrained\"\n", 69 | "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", 70 | "\n", 71 | "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", 72 | "snac_model = SNAC.from_pretrained(\"hubertsiuzdak/snac_24khz\")\n", 73 | "\n", 74 | "\n", 75 | "# Download only model config and safetensors\n", 76 | "model_path = snapshot_download(\n", 77 | " repo_id=model_name,\n", 78 | " allow_patterns=[\n", 79 | " \"config.json\",\n", 80 | " \"*.safetensors\",\n", 81 | " \"model.safetensors.index.json\",\n", 82 | " ],\n", 83 | " ignore_patterns=[\n", 84 | " \"optimizer.pt\",\n", 85 | " \"pytorch_model.bin\",\n", 86 | " \"training_args.bin\",\n", 87 | " \"scheduler.pt\",\n", 88 | " \"tokenizer.json\",\n", 89 | " \"tokenizer_config.json\",\n", 90 | " \"special_tokens_map.json\",\n", 91 | " \"vocab.json\",\n", 92 | " \"merges.txt\",\n", 93 | " \"tokenizer.*\"\n", 94 | " ]\n", 95 | ")\n", 96 | "\n", 97 | "model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)\n", 98 | "model.cuda()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "source": [ 104 | "### CHANGE THIS TO YOUR OWN FILE AND TEXT\n", 105 | "\n", 106 | "my_wav_file_is = \"X.wav\"\n", 107 | "and_the_transcript_is = \"Something or the other\"\n", 108 | "\n", 109 | "the_model_should_say = [\n", 110 | " \"I finally got into the university of my dreams! I can't believe all this hard work actually paid off!\",\n", 111 | " \"Why is your frickin' Waymo blocking the frickin' road? 
GET OUT OF THE WAY!\",\n", 112 | " \"I'm so sorry to hear about your pet, but you know, he'll pull through.\",\n", 113 | " \"Conversational, uhm, systems, tend to speak pretty robotically, because- because they don't, really understand how, uhm, humans talk.\"\n", 114 | "\n", 115 | "]" 116 | ], 117 | "metadata": { 118 | "id": "P81EElEWvg2J" 119 | }, 120 | "execution_count": null, 121 | "outputs": [] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "source": [ 126 | "#@title Tokenising your stuff for the prompt\n", 127 | "%%capture\n", 128 | "\n", 129 | "''' Here we tokenise the reference audio and transcript you gave us, as well as the prompts you want the model to say\n", 130 | "\n", 131 | "The template is:\n", 132 | "\n", 133 | "start_of_human, start_of_text, text, end_of_text, end_of_human, start_of_ai, start_of_speech, speech, end_of_speech, end_of_ai, start_of_human, start_of_text, text, end_of_text, end_of_human, start_of_ai, start_of_speech and then generate from here\n", 134 | "\n", 135 | "'''\n", 136 | "\n", 137 | "\n", 138 | "filename = my_wav_file_is\n", 139 | "\n", 140 | "audio_array, sample_rate = librosa.load(filename, sr=24000)\n", 141 | "\n", 142 | "def tokenise_audio(waveform):\n", 143 | " waveform = torch.from_numpy(waveform).unsqueeze(0)\n", 144 | " waveform = waveform.to(dtype=torch.float32)\n", 145 | "\n", 146 | "\n", 147 | " waveform = waveform.unsqueeze(0)\n", 148 | "\n", 149 | " with torch.inference_mode():\n", 150 | " codes = snac_model.encode(waveform)\n", 151 | "\n", 152 | " all_codes = []\n", 153 | " for i in range(codes[0].shape[1]):\n", 154 | " all_codes.append(codes[0][0][i].item()+128266)\n", 155 | " all_codes.append(codes[1][0][2*i].item()+128266+4096)\n", 156 | " all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))\n", 157 | " all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))\n", 158 | " all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))\n", 159 | " all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))\n", 160 | " all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))\n", 161 
| "\n", 162 | "\n", 163 | " return all_codes\n", 164 | "\n", 165 | "myts = tokenise_audio(audio_array)\n", 166 | "start_tokens = torch.tensor([[ 128259]], dtype=torch.int64)\n", 167 | "end_tokens = torch.tensor([[128009, 128260, 128261, 128257]], dtype=torch.int64)\n", 168 | "final_tokens = torch.tensor([[128258, 128262]], dtype=torch.int64)\n", 169 | "voice_prompt = and_the_transcript_is\n", 170 | "prompt_tokked = tokenizer(voice_prompt, return_tensors=\"pt\")\n", 171 | "\n", 172 | "input_ids = prompt_tokked[\"input_ids\"]\n", 173 | "\n", 174 | "zeroprompt_input_ids = torch.cat([start_tokens, input_ids, end_tokens, torch.tensor([myts]), final_tokens], dim=1) # SOH SOT Text EOT EOH SOA SOS Speech EOS EOA\n", 175 | "\n", 176 | "prompts = the_model_should_say\n", 177 | "\n", 178 | "all_modified_input_ids = []\n", 179 | "for prompt in prompts:\n", 180 | " input_ids = tokenizer(prompt, return_tensors=\"pt\").input_ids\n", 181 | " second_input_ids = torch.cat([zeroprompt_input_ids, start_tokens, input_ids, end_tokens], dim=1)\n", 182 | " all_modified_input_ids.append(second_input_ids)\n", 183 | "\n", 184 | "\n", 185 | "all_padded_tensors = []\n", 186 | "all_attention_masks = []\n", 187 | "\n", 188 | "max_length = max([modified_input_ids.shape[1] for modified_input_ids in all_modified_input_ids])\n", 189 | "\n", 190 | "for modified_input_ids in all_modified_input_ids:\n", 191 | " padding = max_length - modified_input_ids.shape[1]\n", 192 | " padded_tensor = torch.cat([torch.full((1, padding), 128263, dtype=torch.int64), modified_input_ids], dim=1)\n", 193 | " attention_mask = torch.cat([torch.zeros((1, padding), dtype=torch.int64), torch.ones((1, modified_input_ids.shape[1]), dtype=torch.int64)], dim=1)\n", 194 | " all_padded_tensors.append(padded_tensor)\n", 195 | " all_attention_masks.append(attention_mask)\n", 196 | "\n", 197 | "all_padded_tensors = torch.cat(all_padded_tensors, dim=0)\n", 198 | "all_attention_masks = torch.cat(all_attention_masks, dim=0)\n", 199 | "\n", 200 | "input_ids = 
all_padded_tensors.to(\"cuda\")\n", 201 | "attention_mask = all_attention_masks.to(\"cuda\")\n" 202 | ], 203 | "metadata": { 204 | "id": "cXgZmdclbfk_", 205 | "cellView": "form" 206 | }, 207 | "execution_count": null, 208 | "outputs": [] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "id": "J_D2LtYw9gkl", 215 | "colab": { 216 | "base_uri": "https://localhost:8080/" 217 | }, 218 | "outputId": "73b937d8-720f-4445-ebdd-70063a065e6b", 219 | "cellView": "form" 220 | }, 221 | "outputs": [ 222 | { 223 | "output_type": "stream", 224 | "name": "stderr", 225 | "text": [ 226 | "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", 227 | "Setting `pad_token_id` to `eos_token_id`:128258 for open-end generation.\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "#@title Run Inference\n", 233 | "\n", 234 | "with torch.no_grad():\n", 235 | " generated_ids = model.generate(\n", 236 | " input_ids=input_ids,\n", 237 | " # attention_mask=attention_mask,\n", 238 | " max_new_tokens=990,\n", 239 | " do_sample=True,\n", 240 | " temperature=0.5,\n", 241 | " # top_k=40,\n", 242 | " top_p=0.9,\n", 243 | " repetition_penalty=1.1,\n", 244 | " num_return_sequences=1,\n", 245 | " eos_token_id=128258,\n", 246 | " # end_token_id=128009\n", 247 | " )\n", 248 | "\n", 249 | "# generated_ids = torch.cat([generated_ids, torch.tensor([[128262]]).to(\"cuda\")], dim=1) # EOAI" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "source": [ 255 | "#@title Convert output to speech\n", 256 | "%%capture\n", 257 | "token_to_find = 128257\n", 258 | "token_to_remove = 128258\n", 259 | "\n", 260 | "# Check if the token exists in the tensor\n", 261 | "token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)\n", 262 | "\n", 263 | "if len(token_indices[1]) > 0:\n", 264 | " last_occurrence_idx = 
token_indices[1][-1].item()\n", 265 | " cropped_tensor = generated_ids[:, last_occurrence_idx+1:]\n", 266 | "else:\n", 267 | " cropped_tensor = generated_ids\n", 268 | "\n", 269 | "mask = cropped_tensor != token_to_remove\n", 270 | "processed_rows = []\n", 271 | "for row in cropped_tensor:\n", 272 | " # Apply the mask to each row\n", 273 | " masked_row = row[row != token_to_remove]\n", 274 | " processed_rows.append(masked_row)\n", 275 | "\n", 276 | "code_lists = []\n", 277 | "for row in processed_rows:\n", 278 | " # row is a 1D tensor with its own length\n", 279 | " row_length = row.size(0)\n", 280 | " new_length = (row_length // 7) * 7 # largest multiple of 7 that fits in this row\n", 281 | " trimmed_row = row[:new_length]\n", 282 | " trimmed_row = [t - 128266 for t in trimmed_row]\n", 283 | " code_lists.append(trimmed_row)\n", 284 | "\n", 285 | "def redistribute_codes(code_list):\n", 286 | " layer_1 = []\n", 287 | " layer_2 = []\n", 288 | " layer_3 = []\n", 289 | " for i in range((len(code_list)+1)//7):\n", 290 | " layer_1.append(code_list[7*i])\n", 291 | " layer_2.append(code_list[7*i+1]-4096)\n", 292 | " layer_3.append(code_list[7*i+2]-(2*4096))\n", 293 | " layer_3.append(code_list[7*i+3]-(3*4096))\n", 294 | " layer_2.append(code_list[7*i+4]-(4*4096))\n", 295 | " layer_3.append(code_list[7*i+5]-(5*4096))\n", 296 | " layer_3.append(code_list[7*i+6]-(6*4096))\n", 297 | " codes = [torch.tensor(layer_1).unsqueeze(0),\n", 298 | " torch.tensor(layer_2).unsqueeze(0),\n", 299 | " torch.tensor(layer_3).unsqueeze(0)]\n", 300 | " audio_hat = snac_model.decode(codes)\n", 301 | " return audio_hat\n", 302 | "\n", 303 | "my_samples = []\n", 304 | "for code_list in code_lists:\n", 305 | " samples = redistribute_codes(code_list)\n", 306 | " my_samples.append(samples)" 307 | ], 308 | "metadata": { 309 | "id": "lV49oiPFpbXL", 310 | "cellView": "form" 311 | }, 312 | "execution_count": null, 313 | "outputs": [] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "source": [ 318 | 
"#@title Display Speech\n", 319 | "from IPython.display import Audio, display\n", 320 | "for samples in my_samples:\n", 321 | " display(Audio(samples.detach().squeeze().to(\"cpu\").numpy(), rate=24000))" 322 | ], 323 | "metadata": { 324 | "colab": { 325 | "base_uri": "https://localhost:8080/", 326 | "height": 162 327 | }, 328 | "id": "JuwkHqU4piMJ", 329 | "outputId": "c27f4f08-f57e-44be-9b21-3d381b3dd452", 330 | "cellView": "form" 331 | }, 332 | "execution_count": null, 333 | "outputs": [ 334 | { 335 | "output_type": "error", 336 | "ename": "NameError", 337 | "evalue": "name 'my_samples' is not defined", 338 | "traceback": [ 339 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 340 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 341 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0msamples\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmy_samples\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mAudio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msamples\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdetach\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msqueeze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"cpu\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnumpy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrate\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m24000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 342 | "\u001b[0;31mNameError\u001b[0m: name 'my_samples' is not defined" 343 | ] 344 | } 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "source": [], 350 | "metadata": { 
351 | "id": "uQoWOaFC1EDi" 352 | }, 353 | "execution_count": null, 354 | "outputs": [] 355 | } 356 | ], 357 | "metadata": { 358 | "accelerator": "GPU", 359 | "colab": { 360 | "gpuType": "A100", 361 | "machine_shape": "hm", 362 | "provenance": [], 363 | "include_colab_link": true 364 | }, 365 | "kernelspec": { 366 | "display_name": "Python 3", 367 | "name": "python3" 368 | }, 369 | "language_info": { 370 | "name": "python" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 0 375 | } -------------------------------------------------------------------------------- /stable_audio_open_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4", 8 | "authorship_tag": "ABX9TyOQLO4/4GSQapVQeg1vezzz", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | }, 18 | "accelerator": "GPU", 19 | "widgets": { 20 | "application/vnd.jupyter.widget-state+json": { 21 | "d621e5ab73d24c0f85aca1e0a20d3e89": { 22 | "model_module": "@jupyter-widgets/controls", 23 | "model_name": "HBoxModel", 24 | "model_module_version": "1.5.0", 25 | "state": { 26 | "_dom_classes": [], 27 | "_model_module": "@jupyter-widgets/controls", 28 | "_model_module_version": "1.5.0", 29 | "_model_name": "HBoxModel", 30 | "_view_count": null, 31 | "_view_module": "@jupyter-widgets/controls", 32 | "_view_module_version": "1.5.0", 33 | "_view_name": "HBoxView", 34 | "box_style": "", 35 | "children": [ 36 | "IPY_MODEL_6c7808a91a3e4f72a1248832b985020c", 37 | "IPY_MODEL_13e9dac474494304b279d662de87d8f4", 38 | "IPY_MODEL_5d116c165c3841248f360b00ba35fffe" 39 | ], 40 | "layout": "IPY_MODEL_d700622dab8e41e4b3d3acc10ff42ba2" 41 | } 42 | }, 43 | "6c7808a91a3e4f72a1248832b985020c": { 44 | "model_module": "@jupyter-widgets/controls", 45 | 
"model_name": "HTMLModel", 46 | "model_module_version": "1.5.0", 47 | "state": { 48 | "_dom_classes": [], 49 | "_model_module": "@jupyter-widgets/controls", 50 | "_model_module_version": "1.5.0", 51 | "_model_name": "HTMLModel", 52 | "_view_count": null, 53 | "_view_module": "@jupyter-widgets/controls", 54 | "_view_module_version": "1.5.0", 55 | "_view_name": "HTMLView", 56 | "description": "", 57 | "description_tooltip": null, 58 | "layout": "IPY_MODEL_9d88d7dc67534a1fb6e1e15c9c5b937b", 59 | "placeholder": "​", 60 | "style": "IPY_MODEL_480f4371362742d2952ffbbd986ec729", 61 | "value": "100%" 62 | } 63 | }, 64 | "13e9dac474494304b279d662de87d8f4": { 65 | "model_module": "@jupyter-widgets/controls", 66 | "model_name": "FloatProgressModel", 67 | "model_module_version": "1.5.0", 68 | "state": { 69 | "_dom_classes": [], 70 | "_model_module": "@jupyter-widgets/controls", 71 | "_model_module_version": "1.5.0", 72 | "_model_name": "FloatProgressModel", 73 | "_view_count": null, 74 | "_view_module": "@jupyter-widgets/controls", 75 | "_view_module_version": "1.5.0", 76 | "_view_name": "ProgressView", 77 | "bar_style": "success", 78 | "description": "", 79 | "description_tooltip": null, 80 | "layout": "IPY_MODEL_f6aef59a2db049c39ee874ba3cfc4798", 81 | "max": 100, 82 | "min": 0, 83 | "orientation": "horizontal", 84 | "style": "IPY_MODEL_54b55dbf5a5d4a2e9afe5c7dc1e26d7d", 85 | "value": 100 86 | } 87 | }, 88 | "5d116c165c3841248f360b00ba35fffe": { 89 | "model_module": "@jupyter-widgets/controls", 90 | "model_name": "HTMLModel", 91 | "model_module_version": "1.5.0", 92 | "state": { 93 | "_dom_classes": [], 94 | "_model_module": "@jupyter-widgets/controls", 95 | "_model_module_version": "1.5.0", 96 | "_model_name": "HTMLModel", 97 | "_view_count": null, 98 | "_view_module": "@jupyter-widgets/controls", 99 | "_view_module_version": "1.5.0", 100 | "_view_name": "HTMLView", 101 | "description": "", 102 | "description_tooltip": null, 103 | "layout": 
"IPY_MODEL_e5b2378a037846528a84c0a39547ffc2", 104 | "placeholder": "​", 105 | "style": "IPY_MODEL_f7659f9f7fe14dff8577d6f904e3a108", 106 | "value": " 100/100 [00:34<00:00,  2.91it/s]" 107 | } 108 | }, 109 | "d700622dab8e41e4b3d3acc10ff42ba2": { 110 | "model_module": "@jupyter-widgets/base", 111 | "model_name": "LayoutModel", 112 | "model_module_version": "1.2.0", 113 | "state": { 114 | "_model_module": "@jupyter-widgets/base", 115 | "_model_module_version": "1.2.0", 116 | "_model_name": "LayoutModel", 117 | "_view_count": null, 118 | "_view_module": "@jupyter-widgets/base", 119 | "_view_module_version": "1.2.0", 120 | "_view_name": "LayoutView", 121 | "align_content": null, 122 | "align_items": null, 123 | "align_self": null, 124 | "border": null, 125 | "bottom": null, 126 | "display": null, 127 | "flex": null, 128 | "flex_flow": null, 129 | "grid_area": null, 130 | "grid_auto_columns": null, 131 | "grid_auto_flow": null, 132 | "grid_auto_rows": null, 133 | "grid_column": null, 134 | "grid_gap": null, 135 | "grid_row": null, 136 | "grid_template_areas": null, 137 | "grid_template_columns": null, 138 | "grid_template_rows": null, 139 | "height": null, 140 | "justify_content": null, 141 | "justify_items": null, 142 | "left": null, 143 | "margin": null, 144 | "max_height": null, 145 | "max_width": null, 146 | "min_height": null, 147 | "min_width": null, 148 | "object_fit": null, 149 | "object_position": null, 150 | "order": null, 151 | "overflow": null, 152 | "overflow_x": null, 153 | "overflow_y": null, 154 | "padding": null, 155 | "right": null, 156 | "top": null, 157 | "visibility": null, 158 | "width": null 159 | } 160 | }, 161 | "9d88d7dc67534a1fb6e1e15c9c5b937b": { 162 | "model_module": "@jupyter-widgets/base", 163 | "model_name": "LayoutModel", 164 | "model_module_version": "1.2.0", 165 | "state": { 166 | "_model_module": "@jupyter-widgets/base", 167 | "_model_module_version": "1.2.0", 168 | "_model_name": "LayoutModel", 169 | "_view_count": null, 170 | 
"_view_module": "@jupyter-widgets/base", 171 | "_view_module_version": "1.2.0", 172 | "_view_name": "LayoutView", 173 | "align_content": null, 174 | "align_items": null, 175 | "align_self": null, 176 | "border": null, 177 | "bottom": null, 178 | "display": null, 179 | "flex": null, 180 | "flex_flow": null, 181 | "grid_area": null, 182 | "grid_auto_columns": null, 183 | "grid_auto_flow": null, 184 | "grid_auto_rows": null, 185 | "grid_column": null, 186 | "grid_gap": null, 187 | "grid_row": null, 188 | "grid_template_areas": null, 189 | "grid_template_columns": null, 190 | "grid_template_rows": null, 191 | "height": null, 192 | "justify_content": null, 193 | "justify_items": null, 194 | "left": null, 195 | "margin": null, 196 | "max_height": null, 197 | "max_width": null, 198 | "min_height": null, 199 | "min_width": null, 200 | "object_fit": null, 201 | "object_position": null, 202 | "order": null, 203 | "overflow": null, 204 | "overflow_x": null, 205 | "overflow_y": null, 206 | "padding": null, 207 | "right": null, 208 | "top": null, 209 | "visibility": null, 210 | "width": null 211 | } 212 | }, 213 | "480f4371362742d2952ffbbd986ec729": { 214 | "model_module": "@jupyter-widgets/controls", 215 | "model_name": "DescriptionStyleModel", 216 | "model_module_version": "1.5.0", 217 | "state": { 218 | "_model_module": "@jupyter-widgets/controls", 219 | "_model_module_version": "1.5.0", 220 | "_model_name": "DescriptionStyleModel", 221 | "_view_count": null, 222 | "_view_module": "@jupyter-widgets/base", 223 | "_view_module_version": "1.2.0", 224 | "_view_name": "StyleView", 225 | "description_width": "" 226 | } 227 | }, 228 | "f6aef59a2db049c39ee874ba3cfc4798": { 229 | "model_module": "@jupyter-widgets/base", 230 | "model_name": "LayoutModel", 231 | "model_module_version": "1.2.0", 232 | "state": { 233 | "_model_module": "@jupyter-widgets/base", 234 | "_model_module_version": "1.2.0", 235 | "_model_name": "LayoutModel", 236 | "_view_count": null, 237 | "_view_module": 
"@jupyter-widgets/base", 238 | "_view_module_version": "1.2.0", 239 | "_view_name": "LayoutView", 240 | "align_content": null, 241 | "align_items": null, 242 | "align_self": null, 243 | "border": null, 244 | "bottom": null, 245 | "display": null, 246 | "flex": null, 247 | "flex_flow": null, 248 | "grid_area": null, 249 | "grid_auto_columns": null, 250 | "grid_auto_flow": null, 251 | "grid_auto_rows": null, 252 | "grid_column": null, 253 | "grid_gap": null, 254 | "grid_row": null, 255 | "grid_template_areas": null, 256 | "grid_template_columns": null, 257 | "grid_template_rows": null, 258 | "height": null, 259 | "justify_content": null, 260 | "justify_items": null, 261 | "left": null, 262 | "margin": null, 263 | "max_height": null, 264 | "max_width": null, 265 | "min_height": null, 266 | "min_width": null, 267 | "object_fit": null, 268 | "object_position": null, 269 | "order": null, 270 | "overflow": null, 271 | "overflow_x": null, 272 | "overflow_y": null, 273 | "padding": null, 274 | "right": null, 275 | "top": null, 276 | "visibility": null, 277 | "width": null 278 | } 279 | }, 280 | "54b55dbf5a5d4a2e9afe5c7dc1e26d7d": { 281 | "model_module": "@jupyter-widgets/controls", 282 | "model_name": "ProgressStyleModel", 283 | "model_module_version": "1.5.0", 284 | "state": { 285 | "_model_module": "@jupyter-widgets/controls", 286 | "_model_module_version": "1.5.0", 287 | "_model_name": "ProgressStyleModel", 288 | "_view_count": null, 289 | "_view_module": "@jupyter-widgets/base", 290 | "_view_module_version": "1.2.0", 291 | "_view_name": "StyleView", 292 | "bar_color": null, 293 | "description_width": "" 294 | } 295 | }, 296 | "e5b2378a037846528a84c0a39547ffc2": { 297 | "model_module": "@jupyter-widgets/base", 298 | "model_name": "LayoutModel", 299 | "model_module_version": "1.2.0", 300 | "state": { 301 | "_model_module": "@jupyter-widgets/base", 302 | "_model_module_version": "1.2.0", 303 | "_model_name": "LayoutModel", 304 | "_view_count": null, 305 | "_view_module": 
"@jupyter-widgets/base", 306 | "_view_module_version": "1.2.0", 307 | "_view_name": "LayoutView", 308 | "align_content": null, 309 | "align_items": null, 310 | "align_self": null, 311 | "border": null, 312 | "bottom": null, 313 | "display": null, 314 | "flex": null, 315 | "flex_flow": null, 316 | "grid_area": null, 317 | "grid_auto_columns": null, 318 | "grid_auto_flow": null, 319 | "grid_auto_rows": null, 320 | "grid_column": null, 321 | "grid_gap": null, 322 | "grid_row": null, 323 | "grid_template_areas": null, 324 | "grid_template_columns": null, 325 | "grid_template_rows": null, 326 | "height": null, 327 | "justify_content": null, 328 | "justify_items": null, 329 | "left": null, 330 | "margin": null, 331 | "max_height": null, 332 | "max_width": null, 333 | "min_height": null, 334 | "min_width": null, 335 | "object_fit": null, 336 | "object_position": null, 337 | "order": null, 338 | "overflow": null, 339 | "overflow_x": null, 340 | "overflow_y": null, 341 | "padding": null, 342 | "right": null, 343 | "top": null, 344 | "visibility": null, 345 | "width": null 346 | } 347 | }, 348 | "f7659f9f7fe14dff8577d6f904e3a108": { 349 | "model_module": "@jupyter-widgets/controls", 350 | "model_name": "DescriptionStyleModel", 351 | "model_module_version": "1.5.0", 352 | "state": { 353 | "_model_module": "@jupyter-widgets/controls", 354 | "_model_module_version": "1.5.0", 355 | "_model_name": "DescriptionStyleModel", 356 | "_view_count": null, 357 | "_view_module": "@jupyter-widgets/base", 358 | "_view_module_version": "1.2.0", 359 | "_view_name": "StyleView", 360 | "description_width": "" 361 | } 362 | } 363 | } 364 | } 365 | }, 366 | "cells": [ 367 | { 368 | "cell_type": "markdown", 369 | "metadata": { 370 | "id": "view-in-github", 371 | "colab_type": "text" 372 | }, 373 | "source": [ 374 | "\"Open" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "source": [ 380 | "!pip install -q einops stable_audio_tools" 381 | ], 382 | "metadata": { 383 | "id": "ULbF5y9eoeCN" 
384 | }, 385 | "execution_count": 6, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 8, 391 | "metadata": { 392 | "id": "GM27lXQuobHw" 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "import torch\n", 397 | "import torchaudio\n", 398 | "from einops import rearrange\n", 399 | "from stable_audio_tools import get_pretrained_model\n", 400 | "from stable_audio_tools.inference.generation import generate_diffusion_cond" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "source": [ 406 | "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", 407 | "\n", 408 | "# Download model\n", 409 | "model, model_config = get_pretrained_model(\"stabilityai/stable-audio-open-1.0\")\n", 410 | "sample_rate = model_config[\"sample_rate\"]\n", 411 | "sample_size = model_config[\"sample_size\"]\n", 412 | "\n", 413 | "model = model.to(device)" 414 | ], 415 | "metadata": { 416 | "colab": { 417 | "base_uri": "https://localhost:8080/" 418 | }, 419 | "id": "JQSox2_JrjSm", 420 | "outputId": "d08c9d02-4223-4ea3-b078-ce1ae384fb34" 421 | }, 422 | "execution_count": 9, 423 | "outputs": [ 424 | { 425 | "output_type": "stream", 426 | "name": "stderr", 427 | "text": [ 428 | "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", 429 | " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n" 430 | ] 431 | } 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "source": [ 437 | "# Set up text and timing conditioning\n", 438 | "conditioning = [{\n", 439 | " \"prompt\": \"The sound of the piano keys being pressed, the soft melody that follows, and the gentle hum of the bass create a soothing atmosphere that envelops the listener. It's as if the music is a warm embrace, inviting you to relax and unwind. 
The rhythm is slow and steady, like a heartbeat, and the notes dance together in perfect harmony. It's a symphony of peace and tranquility, a lullaby for the soul.\",\n", 440 | " \"seconds_start\": 0,\n", 441 | " \"seconds_total\": 45\n", 442 | "}]" 443 | ], 444 | "metadata": { 445 | "id": "K87sVH9jtBB9" 446 | }, 447 | "execution_count": 10, 448 | "outputs": [] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "source": [ 453 | "# Generate stereo audio\n", 454 | "output = generate_diffusion_cond(\n", 455 | " model,\n", 456 | " steps=100,\n", 457 | " cfg_scale=7,\n", 458 | " conditioning=conditioning,\n", 459 | " sample_size=sample_size,\n", 460 | " sigma_min=0.3,\n", 461 | " sigma_max=500,\n", 462 | " sampler_type=\"dpmpp-3m-sde\",\n", 463 | " device=device\n", 464 | ")" 465 | ], 466 | "metadata": { 467 | "colab": { 468 | "base_uri": "https://localhost:8080/", 469 | "height": 181, 470 | "referenced_widgets": [ 471 | "d621e5ab73d24c0f85aca1e0a20d3e89", 472 | "6c7808a91a3e4f72a1248832b985020c", 473 | "13e9dac474494304b279d662de87d8f4", 474 | "5d116c165c3841248f360b00ba35fffe", 475 | "d700622dab8e41e4b3d3acc10ff42ba2", 476 | "9d88d7dc67534a1fb6e1e15c9c5b937b", 477 | "480f4371362742d2952ffbbd986ec729", 478 | "f6aef59a2db049c39ee874ba3cfc4798", 479 | "54b55dbf5a5d4a2e9afe5c7dc1e26d7d", 480 | "e5b2378a037846528a84c0a39547ffc2", 481 | "f7659f9f7fe14dff8577d6f904e3a108" 482 | ] 483 | }, 484 | "id": "vV1V-15ktC07", 485 | "outputId": "e319f4a7-8db7-4c0c-e582-a0777f90868c" 486 | }, 487 | "execution_count": 11, 488 | "outputs": [ 489 | { 490 | "output_type": "stream", 491 | "name": "stdout", 492 | "text": [ 493 | "384734133\n" 494 | ] 495 | }, 496 | { 497 | "output_type": "display_data", 498 | "data": { 499 | "text/plain": [ 500 | " 0%| | 0/100 [00:00 d (b n)\")" 527 | ], 528 | "metadata": { 529 | "id": "jNH-zKlZtEk5" 530 | }, 531 | "execution_count": 12, 532 | "outputs": [] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "source": [ 537 | "# Peak normalize, clip, convert to 
int16, and save to file\n", 538 | "output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()\n", 539 | "torchaudio.save(\"output.wav\", output, sample_rate)" 540 | ], 541 | "metadata": { 542 | "id": "iwvWXYMjtF0B" 543 | }, 544 | "execution_count": 13, 545 | "outputs": [] 546 | } 547 | ] 548 | } -------------------------------------------------------------------------------- /transformers_whisper_ckpt_to_OAI.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyOwqcZA9X/v0IMR2JdFVrpM", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "widgets": { 18 | "application/vnd.jupyter.widget-state+json": { 19 | "745588c98f5743ddb72d83824270f5c5": { 20 | "model_module": "@jupyter-widgets/controls", 21 | "model_name": "HBoxModel", 22 | "model_module_version": "1.5.0", 23 | "state": { 24 | "_dom_classes": [], 25 | "_model_module": "@jupyter-widgets/controls", 26 | "_model_module_version": "1.5.0", 27 | "_model_name": "HBoxModel", 28 | "_view_count": null, 29 | "_view_module": "@jupyter-widgets/controls", 30 | "_view_module_version": "1.5.0", 31 | "_view_name": "HBoxView", 32 | "box_style": "", 33 | "children": [ 34 | "IPY_MODEL_ef501fa7fc204e2bb70fbd9b61484f1f", 35 | "IPY_MODEL_c55c2b2459e044f6890e0851ea862d21", 36 | "IPY_MODEL_0c9d723b0fad406fac28e8955f492cb7" 37 | ], 38 | "layout": "IPY_MODEL_6989f75bbb3746fba2f10bf1b9581ead" 39 | } 40 | }, 41 | "ef501fa7fc204e2bb70fbd9b61484f1f": { 42 | "model_module": "@jupyter-widgets/controls", 43 | "model_name": "HTMLModel", 44 | "model_module_version": "1.5.0", 45 | "state": { 46 | "_dom_classes": [], 47 | "_model_module": "@jupyter-widgets/controls", 48 | "_model_module_version": 
"1.5.0", 49 | "_model_name": "HTMLModel", 50 | "_view_count": null, 51 | "_view_module": "@jupyter-widgets/controls", 52 | "_view_module_version": "1.5.0", 53 | "_view_name": "HTMLView", 54 | "description": "", 55 | "description_tooltip": null, 56 | "layout": "IPY_MODEL_6de3345bd4a74f74a0c53b5ebe1f3187", 57 | "placeholder": "​", 58 | "style": "IPY_MODEL_672d9de7616c46f083c6bf9c4f9caff3", 59 | "value": "Downloading: 100%" 60 | } 61 | }, 62 | "c55c2b2459e044f6890e0851ea862d21": { 63 | "model_module": "@jupyter-widgets/controls", 64 | "model_name": "FloatProgressModel", 65 | "model_module_version": "1.5.0", 66 | "state": { 67 | "_dom_classes": [], 68 | "_model_module": "@jupyter-widgets/controls", 69 | "_model_module_version": "1.5.0", 70 | "_model_name": "FloatProgressModel", 71 | "_view_count": null, 72 | "_view_module": "@jupyter-widgets/controls", 73 | "_view_module_version": "1.5.0", 74 | "_view_name": "ProgressView", 75 | "bar_style": "success", 76 | "description": "", 77 | "description_tooltip": null, 78 | "layout": "IPY_MODEL_e49c2e613b93405a9366f759e36ccb43", 79 | "max": 1985, 80 | "min": 0, 81 | "orientation": "horizontal", 82 | "style": "IPY_MODEL_6043edb9df9e47d0a28ac8fbafca28b1", 83 | "value": 1985 84 | } 85 | }, 86 | "0c9d723b0fad406fac28e8955f492cb7": { 87 | "model_module": "@jupyter-widgets/controls", 88 | "model_name": "HTMLModel", 89 | "model_module_version": "1.5.0", 90 | "state": { 91 | "_dom_classes": [], 92 | "_model_module": "@jupyter-widgets/controls", 93 | "_model_module_version": "1.5.0", 94 | "_model_name": "HTMLModel", 95 | "_view_count": null, 96 | "_view_module": "@jupyter-widgets/controls", 97 | "_view_module_version": "1.5.0", 98 | "_view_name": "HTMLView", 99 | "description": "", 100 | "description_tooltip": null, 101 | "layout": "IPY_MODEL_fa5debe4f46b4dd796528715a962b738", 102 | "placeholder": "​", 103 | "style": "IPY_MODEL_c0bef8c62caf4e149a250ce7a5fe245b", 104 | "value": " 1.99k/1.99k [00:00<00:00, 33.8kB/s]" 105 | } 106 | }, 107 | 
"6989f75bbb3746fba2f10bf1b9581ead": { 108 | "model_module": "@jupyter-widgets/base", 109 | "model_name": "LayoutModel", 110 | "model_module_version": "1.2.0", 111 | "state": { 112 | "_model_module": "@jupyter-widgets/base", 113 | "_model_module_version": "1.2.0", 114 | "_model_name": "LayoutModel", 115 | "_view_count": null, 116 | "_view_module": "@jupyter-widgets/base", 117 | "_view_module_version": "1.2.0", 118 | "_view_name": "LayoutView", 119 | "align_content": null, 120 | "align_items": null, 121 | "align_self": null, 122 | "border": null, 123 | "bottom": null, 124 | "display": null, 125 | "flex": null, 126 | "flex_flow": null, 127 | "grid_area": null, 128 | "grid_auto_columns": null, 129 | "grid_auto_flow": null, 130 | "grid_auto_rows": null, 131 | "grid_column": null, 132 | "grid_gap": null, 133 | "grid_row": null, 134 | "grid_template_areas": null, 135 | "grid_template_columns": null, 136 | "grid_template_rows": null, 137 | "height": null, 138 | "justify_content": null, 139 | "justify_items": null, 140 | "left": null, 141 | "margin": null, 142 | "max_height": null, 143 | "max_width": null, 144 | "min_height": null, 145 | "min_width": null, 146 | "object_fit": null, 147 | "object_position": null, 148 | "order": null, 149 | "overflow": null, 150 | "overflow_x": null, 151 | "overflow_y": null, 152 | "padding": null, 153 | "right": null, 154 | "top": null, 155 | "visibility": null, 156 | "width": null 157 | } 158 | }, 159 | "6de3345bd4a74f74a0c53b5ebe1f3187": { 160 | "model_module": "@jupyter-widgets/base", 161 | "model_name": "LayoutModel", 162 | "model_module_version": "1.2.0", 163 | "state": { 164 | "_model_module": "@jupyter-widgets/base", 165 | "_model_module_version": "1.2.0", 166 | "_model_name": "LayoutModel", 167 | "_view_count": null, 168 | "_view_module": "@jupyter-widgets/base", 169 | "_view_module_version": "1.2.0", 170 | "_view_name": "LayoutView", 171 | "align_content": null, 172 | "align_items": null, 173 | "align_self": null, 174 | "border": 
null, 175 | "bottom": null, 176 | "display": null, 177 | "flex": null, 178 | "flex_flow": null, 179 | "grid_area": null, 180 | "grid_auto_columns": null, 181 | "grid_auto_flow": null, 182 | "grid_auto_rows": null, 183 | "grid_column": null, 184 | "grid_gap": null, 185 | "grid_row": null, 186 | "grid_template_areas": null, 187 | "grid_template_columns": null, 188 | "grid_template_rows": null, 189 | "height": null, 190 | "justify_content": null, 191 | "justify_items": null, 192 | "left": null, 193 | "margin": null, 194 | "max_height": null, 195 | "max_width": null, 196 | "min_height": null, 197 | "min_width": null, 198 | "object_fit": null, 199 | "object_position": null, 200 | "order": null, 201 | "overflow": null, 202 | "overflow_x": null, 203 | "overflow_y": null, 204 | "padding": null, 205 | "right": null, 206 | "top": null, 207 | "visibility": null, 208 | "width": null 209 | } 210 | }, 211 | "672d9de7616c46f083c6bf9c4f9caff3": { 212 | "model_module": "@jupyter-widgets/controls", 213 | "model_name": "DescriptionStyleModel", 214 | "model_module_version": "1.5.0", 215 | "state": { 216 | "_model_module": "@jupyter-widgets/controls", 217 | "_model_module_version": "1.5.0", 218 | "_model_name": "DescriptionStyleModel", 219 | "_view_count": null, 220 | "_view_module": "@jupyter-widgets/base", 221 | "_view_module_version": "1.2.0", 222 | "_view_name": "StyleView", 223 | "description_width": "" 224 | } 225 | }, 226 | "e49c2e613b93405a9366f759e36ccb43": { 227 | "model_module": "@jupyter-widgets/base", 228 | "model_name": "LayoutModel", 229 | "model_module_version": "1.2.0", 230 | "state": { 231 | "_model_module": "@jupyter-widgets/base", 232 | "_model_module_version": "1.2.0", 233 | "_model_name": "LayoutModel", 234 | "_view_count": null, 235 | "_view_module": "@jupyter-widgets/base", 236 | "_view_module_version": "1.2.0", 237 | "_view_name": "LayoutView", 238 | "align_content": null, 239 | "align_items": null, 240 | "align_self": null, 241 | "border": null, 242 | 
"bottom": null, 243 | "display": null, 244 | "flex": null, 245 | "flex_flow": null, 246 | "grid_area": null, 247 | "grid_auto_columns": null, 248 | "grid_auto_flow": null, 249 | "grid_auto_rows": null, 250 | "grid_column": null, 251 | "grid_gap": null, 252 | "grid_row": null, 253 | "grid_template_areas": null, 254 | "grid_template_columns": null, 255 | "grid_template_rows": null, 256 | "height": null, 257 | "justify_content": null, 258 | "justify_items": null, 259 | "left": null, 260 | "margin": null, 261 | "max_height": null, 262 | "max_width": null, 263 | "min_height": null, 264 | "min_width": null, 265 | "object_fit": null, 266 | "object_position": null, 267 | "order": null, 268 | "overflow": null, 269 | "overflow_x": null, 270 | "overflow_y": null, 271 | "padding": null, 272 | "right": null, 273 | "top": null, 274 | "visibility": null, 275 | "width": null 276 | } 277 | }, 278 | "6043edb9df9e47d0a28ac8fbafca28b1": { 279 | "model_module": "@jupyter-widgets/controls", 280 | "model_name": "ProgressStyleModel", 281 | "model_module_version": "1.5.0", 282 | "state": { 283 | "_model_module": "@jupyter-widgets/controls", 284 | "_model_module_version": "1.5.0", 285 | "_model_name": "ProgressStyleModel", 286 | "_view_count": null, 287 | "_view_module": "@jupyter-widgets/base", 288 | "_view_module_version": "1.2.0", 289 | "_view_name": "StyleView", 290 | "bar_color": null, 291 | "description_width": "" 292 | } 293 | }, 294 | "fa5debe4f46b4dd796528715a962b738": { 295 | "model_module": "@jupyter-widgets/base", 296 | "model_name": "LayoutModel", 297 | "model_module_version": "1.2.0", 298 | "state": { 299 | "_model_module": "@jupyter-widgets/base", 300 | "_model_module_version": "1.2.0", 301 | "_model_name": "LayoutModel", 302 | "_view_count": null, 303 | "_view_module": "@jupyter-widgets/base", 304 | "_view_module_version": "1.2.0", 305 | "_view_name": "LayoutView", 306 | "align_content": null, 307 | "align_items": null, 308 | "align_self": null, 309 | "border": null, 310 | 
"bottom": null, 311 | "display": null, 312 | "flex": null, 313 | "flex_flow": null, 314 | "grid_area": null, 315 | "grid_auto_columns": null, 316 | "grid_auto_flow": null, 317 | "grid_auto_rows": null, 318 | "grid_column": null, 319 | "grid_gap": null, 320 | "grid_row": null, 321 | "grid_template_areas": null, 322 | "grid_template_columns": null, 323 | "grid_template_rows": null, 324 | "height": null, 325 | "justify_content": null, 326 | "justify_items": null, 327 | "left": null, 328 | "margin": null, 329 | "max_height": null, 330 | "max_width": null, 331 | "min_height": null, 332 | "min_width": null, 333 | "object_fit": null, 334 | "object_position": null, 335 | "order": null, 336 | "overflow": null, 337 | "overflow_x": null, 338 | "overflow_y": null, 339 | "padding": null, 340 | "right": null, 341 | "top": null, 342 | "visibility": null, 343 | "width": null 344 | } 345 | }, 346 | "c0bef8c62caf4e149a250ce7a5fe245b": { 347 | "model_module": "@jupyter-widgets/controls", 348 | "model_name": "DescriptionStyleModel", 349 | "model_module_version": "1.5.0", 350 | "state": { 351 | "_model_module": "@jupyter-widgets/controls", 352 | "_model_module_version": "1.5.0", 353 | "_model_name": "DescriptionStyleModel", 354 | "_view_count": null, 355 | "_view_module": "@jupyter-widgets/base", 356 | "_view_module_version": "1.2.0", 357 | "_view_name": "StyleView", 358 | "description_width": "" 359 | } 360 | }, 361 | "7f891763dcbd47b187945359fa20e037": { 362 | "model_module": "@jupyter-widgets/controls", 363 | "model_name": "HBoxModel", 364 | "model_module_version": "1.5.0", 365 | "state": { 366 | "_dom_classes": [], 367 | "_model_module": "@jupyter-widgets/controls", 368 | "_model_module_version": "1.5.0", 369 | "_model_name": "HBoxModel", 370 | "_view_count": null, 371 | "_view_module": "@jupyter-widgets/controls", 372 | "_view_module_version": "1.5.0", 373 | "_view_name": "HBoxView", 374 | "box_style": "", 375 | "children": [ 376 | "IPY_MODEL_fbe40f8a39ed4b1ba41b04f3761a2e7e", 377 
| "IPY_MODEL_f05e0141670c4460976e16fa7ce72dfd", 378 | "IPY_MODEL_ddc73b7194c144ee9c7c8d6abe953ebd" 379 | ], 380 | "layout": "IPY_MODEL_fc6ccbdf580145c3b17aef8c1c06405c" 381 | } 382 | }, 383 | "fbe40f8a39ed4b1ba41b04f3761a2e7e": { 384 | "model_module": "@jupyter-widgets/controls", 385 | "model_name": "HTMLModel", 386 | "model_module_version": "1.5.0", 387 | "state": { 388 | "_dom_classes": [], 389 | "_model_module": "@jupyter-widgets/controls", 390 | "_model_module_version": "1.5.0", 391 | "_model_name": "HTMLModel", 392 | "_view_count": null, 393 | "_view_module": "@jupyter-widgets/controls", 394 | "_view_module_version": "1.5.0", 395 | "_view_name": "HTMLView", 396 | "description": "", 397 | "description_tooltip": null, 398 | "layout": "IPY_MODEL_7290d13ad54749faa28ebd4442c279f3", 399 | "placeholder": "​", 400 | "style": "IPY_MODEL_b6d4f50b005f43a38910569aa3cc11c1", 401 | "value": "Downloading: 100%" 402 | } 403 | }, 404 | "f05e0141670c4460976e16fa7ce72dfd": { 405 | "model_module": "@jupyter-widgets/controls", 406 | "model_name": "FloatProgressModel", 407 | "model_module_version": "1.5.0", 408 | "state": { 409 | "_dom_classes": [], 410 | "_model_module": "@jupyter-widgets/controls", 411 | "_model_module_version": "1.5.0", 412 | "_model_name": "FloatProgressModel", 413 | "_view_count": null, 414 | "_view_module": "@jupyter-widgets/controls", 415 | "_view_module_version": "1.5.0", 416 | "_view_name": "ProgressView", 417 | "bar_style": "success", 418 | "description": "", 419 | "description_tooltip": null, 420 | "layout": "IPY_MODEL_aec08700fe794e5db392755ea3447a99", 421 | "max": 967102601, 422 | "min": 0, 423 | "orientation": "horizontal", 424 | "style": "IPY_MODEL_5c24c6d84dd64f0da47b0ac8217747fb", 425 | "value": 967102601 426 | } 427 | }, 428 | "ddc73b7194c144ee9c7c8d6abe953ebd": { 429 | "model_module": "@jupyter-widgets/controls", 430 | "model_name": "HTMLModel", 431 | "model_module_version": "1.5.0", 432 | "state": { 433 | "_dom_classes": [], 434 | 
"_model_module": "@jupyter-widgets/controls", 435 | "_model_module_version": "1.5.0", 436 | "_model_name": "HTMLModel", 437 | "_view_count": null, 438 | "_view_module": "@jupyter-widgets/controls", 439 | "_view_module_version": "1.5.0", 440 | "_view_name": "HTMLView", 441 | "description": "", 442 | "description_tooltip": null, 443 | "layout": "IPY_MODEL_0ef0dc2df03d4aeb898e87d550f787cb", 444 | "placeholder": "​", 445 | "style": "IPY_MODEL_6c1d179f465c4a8ab8c9f87169c55e6e", 446 | "value": " 967M/967M [00:28<00:00, 34.4MB/s]" 447 | } 448 | }, 449 | "fc6ccbdf580145c3b17aef8c1c06405c": { 450 | "model_module": "@jupyter-widgets/base", 451 | "model_name": "LayoutModel", 452 | "model_module_version": "1.2.0", 453 | "state": { 454 | "_model_module": "@jupyter-widgets/base", 455 | "_model_module_version": "1.2.0", 456 | "_model_name": "LayoutModel", 457 | "_view_count": null, 458 | "_view_module": "@jupyter-widgets/base", 459 | "_view_module_version": "1.2.0", 460 | "_view_name": "LayoutView", 461 | "align_content": null, 462 | "align_items": null, 463 | "align_self": null, 464 | "border": null, 465 | "bottom": null, 466 | "display": null, 467 | "flex": null, 468 | "flex_flow": null, 469 | "grid_area": null, 470 | "grid_auto_columns": null, 471 | "grid_auto_flow": null, 472 | "grid_auto_rows": null, 473 | "grid_column": null, 474 | "grid_gap": null, 475 | "grid_row": null, 476 | "grid_template_areas": null, 477 | "grid_template_columns": null, 478 | "grid_template_rows": null, 479 | "height": null, 480 | "justify_content": null, 481 | "justify_items": null, 482 | "left": null, 483 | "margin": null, 484 | "max_height": null, 485 | "max_width": null, 486 | "min_height": null, 487 | "min_width": null, 488 | "object_fit": null, 489 | "object_position": null, 490 | "order": null, 491 | "overflow": null, 492 | "overflow_x": null, 493 | "overflow_y": null, 494 | "padding": null, 495 | "right": null, 496 | "top": null, 497 | "visibility": null, 498 | "width": null 499 | } 500 | }, 
501 | "7290d13ad54749faa28ebd4442c279f3": { 502 | "model_module": "@jupyter-widgets/base", 503 | "model_name": "LayoutModel", 504 | "model_module_version": "1.2.0", 505 | "state": { 506 | "_model_module": "@jupyter-widgets/base", 507 | "_model_module_version": "1.2.0", 508 | "_model_name": "LayoutModel", 509 | "_view_count": null, 510 | "_view_module": "@jupyter-widgets/base", 511 | "_view_module_version": "1.2.0", 512 | "_view_name": "LayoutView", 513 | "align_content": null, 514 | "align_items": null, 515 | "align_self": null, 516 | "border": null, 517 | "bottom": null, 518 | "display": null, 519 | "flex": null, 520 | "flex_flow": null, 521 | "grid_area": null, 522 | "grid_auto_columns": null, 523 | "grid_auto_flow": null, 524 | "grid_auto_rows": null, 525 | "grid_column": null, 526 | "grid_gap": null, 527 | "grid_row": null, 528 | "grid_template_areas": null, 529 | "grid_template_columns": null, 530 | "grid_template_rows": null, 531 | "height": null, 532 | "justify_content": null, 533 | "justify_items": null, 534 | "left": null, 535 | "margin": null, 536 | "max_height": null, 537 | "max_width": null, 538 | "min_height": null, 539 | "min_width": null, 540 | "object_fit": null, 541 | "object_position": null, 542 | "order": null, 543 | "overflow": null, 544 | "overflow_x": null, 545 | "overflow_y": null, 546 | "padding": null, 547 | "right": null, 548 | "top": null, 549 | "visibility": null, 550 | "width": null 551 | } 552 | }, 553 | "b6d4f50b005f43a38910569aa3cc11c1": { 554 | "model_module": "@jupyter-widgets/controls", 555 | "model_name": "DescriptionStyleModel", 556 | "model_module_version": "1.5.0", 557 | "state": { 558 | "_model_module": "@jupyter-widgets/controls", 559 | "_model_module_version": "1.5.0", 560 | "_model_name": "DescriptionStyleModel", 561 | "_view_count": null, 562 | "_view_module": "@jupyter-widgets/base", 563 | "_view_module_version": "1.2.0", 564 | "_view_name": "StyleView", 565 | "description_width": "" 566 | } 567 | }, 568 | 
"aec08700fe794e5db392755ea3447a99": { 569 | "model_module": "@jupyter-widgets/base", 570 | "model_name": "LayoutModel", 571 | "model_module_version": "1.2.0", 572 | "state": { 573 | "_model_module": "@jupyter-widgets/base", 574 | "_model_module_version": "1.2.0", 575 | "_model_name": "LayoutModel", 576 | "_view_count": null, 577 | "_view_module": "@jupyter-widgets/base", 578 | "_view_module_version": "1.2.0", 579 | "_view_name": "LayoutView", 580 | "align_content": null, 581 | "align_items": null, 582 | "align_self": null, 583 | "border": null, 584 | "bottom": null, 585 | "display": null, 586 | "flex": null, 587 | "flex_flow": null, 588 | "grid_area": null, 589 | "grid_auto_columns": null, 590 | "grid_auto_flow": null, 591 | "grid_auto_rows": null, 592 | "grid_column": null, 593 | "grid_gap": null, 594 | "grid_row": null, 595 | "grid_template_areas": null, 596 | "grid_template_columns": null, 597 | "grid_template_rows": null, 598 | "height": null, 599 | "justify_content": null, 600 | "justify_items": null, 601 | "left": null, 602 | "margin": null, 603 | "max_height": null, 604 | "max_width": null, 605 | "min_height": null, 606 | "min_width": null, 607 | "object_fit": null, 608 | "object_position": null, 609 | "order": null, 610 | "overflow": null, 611 | "overflow_x": null, 612 | "overflow_y": null, 613 | "padding": null, 614 | "right": null, 615 | "top": null, 616 | "visibility": null, 617 | "width": null 618 | } 619 | }, 620 | "5c24c6d84dd64f0da47b0ac8217747fb": { 621 | "model_module": "@jupyter-widgets/controls", 622 | "model_name": "ProgressStyleModel", 623 | "model_module_version": "1.5.0", 624 | "state": { 625 | "_model_module": "@jupyter-widgets/controls", 626 | "_model_module_version": "1.5.0", 627 | "_model_name": "ProgressStyleModel", 628 | "_view_count": null, 629 | "_view_module": "@jupyter-widgets/base", 630 | "_view_module_version": "1.2.0", 631 | "_view_name": "StyleView", 632 | "bar_color": null, 633 | "description_width": "" 634 | } 635 | }, 636 | 
"0ef0dc2df03d4aeb898e87d550f787cb": { 637 | "model_module": "@jupyter-widgets/base", 638 | "model_name": "LayoutModel", 639 | "model_module_version": "1.2.0", 640 | "state": { 641 | "_model_module": "@jupyter-widgets/base", 642 | "_model_module_version": "1.2.0", 643 | "_model_name": "LayoutModel", 644 | "_view_count": null, 645 | "_view_module": "@jupyter-widgets/base", 646 | "_view_module_version": "1.2.0", 647 | "_view_name": "LayoutView", 648 | "align_content": null, 649 | "align_items": null, 650 | "align_self": null, 651 | "border": null, 652 | "bottom": null, 653 | "display": null, 654 | "flex": null, 655 | "flex_flow": null, 656 | "grid_area": null, 657 | "grid_auto_columns": null, 658 | "grid_auto_flow": null, 659 | "grid_auto_rows": null, 660 | "grid_column": null, 661 | "grid_gap": null, 662 | "grid_row": null, 663 | "grid_template_areas": null, 664 | "grid_template_columns": null, 665 | "grid_template_rows": null, 666 | "height": null, 667 | "justify_content": null, 668 | "justify_items": null, 669 | "left": null, 670 | "margin": null, 671 | "max_height": null, 672 | "max_width": null, 673 | "min_height": null, 674 | "min_width": null, 675 | "object_fit": null, 676 | "object_position": null, 677 | "order": null, 678 | "overflow": null, 679 | "overflow_x": null, 680 | "overflow_y": null, 681 | "padding": null, 682 | "right": null, 683 | "top": null, 684 | "visibility": null, 685 | "width": null 686 | } 687 | }, 688 | "6c1d179f465c4a8ab8c9f87169c55e6e": { 689 | "model_module": "@jupyter-widgets/controls", 690 | "model_name": "DescriptionStyleModel", 691 | "model_module_version": "1.5.0", 692 | "state": { 693 | "_model_module": "@jupyter-widgets/controls", 694 | "_model_module_version": "1.5.0", 695 | "_model_name": "DescriptionStyleModel", 696 | "_view_count": null, 697 | "_view_module": "@jupyter-widgets/base", 698 | "_view_module_version": "1.2.0", 699 | "_view_name": "StyleView", 700 | "description_width": "" 701 | } 702 | } 703 | } 704 | } 705 | }, 
706 | "cells": [ 707 | { 708 | "cell_type": "markdown", 709 | "metadata": { 710 | "id": "view-in-github", 711 | "colab_type": "text" 712 | }, 713 | "source": [ 714 | "\"Open" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "source": [ 720 | "!pip install git+https://github.com/huggingface/transformers" 721 | ], 722 | "metadata": { 723 | "colab": { 724 | "base_uri": "https://localhost:8080/" 725 | }, 726 | "id": "uPXNgQVu_gQi", 727 | "outputId": "93d1eb58-8609-43d2-af59-06a6e33ebfd1" 728 | }, 729 | "execution_count": null, 730 | "outputs": [ 731 | { 732 | "output_type": "stream", 733 | "name": "stdout", 734 | "text": [ 735 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 736 | "Collecting git+https://github.com/huggingface/transformers\n", 737 | " Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-nksmoml9\n", 738 | " Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-nksmoml9\n", 739 | " Resolved https://github.com/huggingface/transformers to commit d0f324f1e13b2813d4571f446795b15f01cda056\n", 740 | " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", 741 | " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", 742 | " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", 743 | "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.8/dist-packages (from transformers==4.26.0.dev0) (0.13.2)\n", 744 | "Requirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from transformers==4.26.0.dev0) (2.25.1)\n", 745 | "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.8/dist-packages (from transformers==4.26.0.dev0) (1.21.6)\n", 746 | "Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from transformers==4.26.0.dev0) (3.8.2)\n", 747 | "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.8/dist-packages (from transformers==4.26.0.dev0) (2022.6.2)\n", 748 | "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.8/dist-packages (from transformers==4.26.0.dev0) (6.0)\n", 749 | "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.8/dist-packages (from transformers==4.26.0.dev0) (4.64.1)\n", 750 | "Requirement already satisfied: huggingface-hub<1.0,>=0.10.0 in /usr/local/lib/python3.8/dist-packages (from transformers==4.26.0.dev0) (0.11.1)\n", 751 | "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.8/dist-packages (from transformers==4.26.0.dev0) (21.3)\n", 752 | "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.8/dist-packages (from huggingface-hub<1.0,>=0.10.0->transformers==4.26.0.dev0) (4.4.0)\n", 753 | "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.8/dist-packages (from packaging>=20.0->transformers==4.26.0.dev0) (3.0.9)\n", 754 | "Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests->transformers==4.26.0.dev0) (4.0.0)\n", 755 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests->transformers==4.26.0.dev0) (2.10)\n", 756 | "Requirement already satisfied: 
urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests->transformers==4.26.0.dev0) (1.24.3)\n", 757 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests->transformers==4.26.0.dev0) (2022.12.7)\n" 758 | ] 759 | } 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": null, 765 | "metadata": { 766 | "id": "iV8RL4Oy_F3Q" 767 | }, 768 | "outputs": [], 769 | "source": [ 770 | "from copy import deepcopy\n", 771 | "import torch\n", 772 | "from transformers import WhisperForConditionalGeneration" 773 | ] 774 | }, 775 | { 776 | "cell_type": "code", 777 | "source": [ 778 | "WHISPER_MAPPING = {\n", 779 | " \"layers\": \"blocks\",\n", 780 | " \"fc1\": \"mlp.0\",\n", 781 | " \"fc2\": \"mlp.2\",\n", 782 | " \"final_layer_norm\": \"mlp_ln\",\n", 783 | " \"layers\": \"blocks\",\n", 784 | " \".self_attn.q_proj\": \".attn.query\",\n", 785 | " \".self_attn.k_proj\": \".attn.key\",\n", 786 | " \".self_attn.v_proj\": \".attn.value\",\n", 787 | " \".self_attn_layer_norm\": \".attn_ln\",\n", 788 | " \".self_attn.out_proj\": \".attn.out\",\n", 789 | " \".encoder_attn.q_proj\": \".cross_attn.query\",\n", 790 | " \".encoder_attn.k_proj\": \".cross_attn.key\",\n", 791 | " \".encoder_attn.v_proj\": \".cross_attn.value\",\n", 792 | " \".encoder_attn_layer_norm\": \".cross_attn_ln\",\n", 793 | " \".encoder_attn.out_proj\": \".cross_attn.out\",\n", 794 | " \"decoder.layer_norm.\": \"decoder.ln.\",\n", 795 | " \"encoder.layer_norm.\": \"encoder.ln_post.\",\n", 796 | " \"embed_tokens\": \"token_embedding\",\n", 797 | " \"encoder.embed_positions.weight\": \"encoder.positional_embedding\",\n", 798 | " \"decoder.embed_positions.weight\": \"decoder.positional_embedding\",\n", 799 | " \"layer_norm\": \"ln_post\",\n", 800 | "}\n", 801 | "\n", 802 | "\n", 803 | "def rename_keys(s_dict):\n", 804 | " keys = list(s_dict.keys())\n", 805 | " for key in keys:\n", 806 | " new_key = key\n", 807 | " for k, v in 
WHISPER_MAPPING.items():\n", 808 | " if k in key:\n", 809 | " new_key = new_key.replace(k, v)\n", 810 | "\n", 811 | " s_dict[new_key] = s_dict.pop(key)\n", 812 | " return s_dict" 813 | ], 814 | "metadata": { 815 | "id": "ZvlNRPssAAoa" 816 | }, 817 | "execution_count": null, 818 | "outputs": [] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "source": [ 823 | "def convert_hf_ckpt_to_whisper_ckpt(hf_model_name_or_path, whisper_ckpt_save_path):\n", 824 | " transformer_model = WhisperForConditionalGeneration.from_pretrained(hf_model_name_or_path)\n", 825 | " config = transformer_model.config\n", 826 | "\n", 827 | " dims = {\n", 828 | " 'n_mels': config.num_mel_bins,\n", 829 | " 'n_vocab': config.vocab_size,\n", 830 | " 'n_audio_ctx': config.max_source_positions,\n", 831 | " 'n_audio_state': config.d_model,\n", 832 | " 'n_audio_head': config.encoder_attention_heads,\n", 833 | " 'n_audio_layer': config.encoder_layers,\n", 834 | " 'n_text_ctx': config.max_target_positions,\n", 835 | " 'n_text_state': config.d_model,\n", 836 | " 'n_text_head': config.decoder_attention_heads,\n", 837 | " 'n_text_layer': config.decoder_layers\n", 838 | " }\n", 839 | "\n", 840 | " state_dict = deepcopy(transformer_model.model.state_dict())\n", 841 | " state_dict = rename_keys(state_dict)\n", 842 | "\n", 843 | " torch.save({\"dims\": dims, \"model_state_dict\": state_dict}, whisper_ckpt_save_path)" 844 | ], 845 | "metadata": { 846 | "id": "CEeChPKkAn6f" 847 | }, 848 | "execution_count": null, 849 | "outputs": [] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "source": [ 854 | "convert_hf_ckpt_to_whisper_ckpt(\"flozi00/whisper-small-german\", \"flozi00_whisper-small-german_OAI\")" 855 | ], 856 | "metadata": { 857 | "colab": { 858 | "base_uri": "https://localhost:8080/", 859 | "height": 81, 860 | "referenced_widgets": [ 861 | "745588c98f5743ddb72d83824270f5c5", 862 | "ef501fa7fc204e2bb70fbd9b61484f1f", 863 | "c55c2b2459e044f6890e0851ea862d21", 864 | "0c9d723b0fad406fac28e8955f492cb7", 865 | 
"6989f75bbb3746fba2f10bf1b9581ead", 866 | "6de3345bd4a74f74a0c53b5ebe1f3187", 867 | "672d9de7616c46f083c6bf9c4f9caff3", 868 | "e49c2e613b93405a9366f759e36ccb43", 869 | "6043edb9df9e47d0a28ac8fbafca28b1", 870 | "fa5debe4f46b4dd796528715a962b738", 871 | "c0bef8c62caf4e149a250ce7a5fe245b", 872 | "7f891763dcbd47b187945359fa20e037", 873 | "fbe40f8a39ed4b1ba41b04f3761a2e7e", 874 | "f05e0141670c4460976e16fa7ce72dfd", 875 | "ddc73b7194c144ee9c7c8d6abe953ebd", 876 | "fc6ccbdf580145c3b17aef8c1c06405c", 877 | "7290d13ad54749faa28ebd4442c279f3", 878 | "b6d4f50b005f43a38910569aa3cc11c1", 879 | "aec08700fe794e5db392755ea3447a99", 880 | "5c24c6d84dd64f0da47b0ac8217747fb", 881 | "0ef0dc2df03d4aeb898e87d550f787cb", 882 | "6c1d179f465c4a8ab8c9f87169c55e6e" 883 | ] 884 | }, 885 | "id": "86N7BfR-BQYA", 886 | "outputId": "b149cb02-579f-44c9-8d26-651db72ce79a" 887 | }, 888 | "execution_count": null, 889 | "outputs": [ 890 | { 891 | "output_type": "display_data", 892 | "data": { 893 | "text/plain": [ 894 | "Downloading: 0%| | 0.00/1.99k [00:00