├── LICENSE ├── OpenAI_Whisper_ASR_Demo.ipynb ├── README.md └── cpu_app.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 amrrs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /OpenAI_Whisper_ASR_Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "id": "ZsJUxc0aRsAf", 8 | "outputId": "8f75d6cd-8935-4a6e-eef9-2001c8e71191", 9 | "colab": { 10 | "base_uri": "https://localhost:8080/" 11 | } 12 | }, 13 | "outputs": [ 14 | { 15 | "output_type": "stream", 16 | "name": "stdout", 17 | "text": [ 18 | "\u001b[K |████████████████████████████████| 4.9 MB 8.1 MB/s \n", 19 | "\u001b[K |████████████████████████████████| 6.6 MB 41.5 MB/s \n", 20 | "\u001b[K |████████████████████████████████| 120 kB 65.0 MB/s \n", 21 | "\u001b[?25h Building wheel for whisper (setup.py) ... \u001b[?25l\u001b[?25hdone\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "! pip install git+https://github.com/openai/whisper.git -q" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "source": [ 32 | "import whisper\n", 33 | "\n", 34 | "model = whisper.load_model(\"base\")\n" 35 | ], 36 | "metadata": { 37 | "id": "Kr5faKybKi4p", 38 | "outputId": "2a607999-33ac-4158-a9c6-0a7d7d4ba722", 39 | "colab": { 40 | "base_uri": "https://localhost:8080/", 41 | "height": 123, 42 | "referenced_widgets": [ 43 | "8be0ab44dc1b42ab93831f9babf2e6bc", 44 | "c5f1fffed5de4384a13bea1977894683", 45 | "e884f3a6e2c44e469d8ca5249e813fab", 46 | "5732293799cf42bd979f75b71b2bf1d2", 47 | "1143ae57534d4ce9a86dc7a9294819f7", 48 | "729f9914f10c44f8a887f19030246078", 49 | "0acff2776082461da3386814b1ce5db6", 50 | "1912c3335dfd40b4805a7e70d65e9654", 51 | "a419fdcb24a04a7c96356dd096b4e68a", 52 | "01dbcbbbc3c04035b4a2bab45f81b02e", 53 | "84cdb69f8b06492bb51c2ded0c1d4687" 54 | ] 55 | } 56 | }, 57 | "execution_count": 2, 58 | "outputs": [ 59 | { 60 | "output_type": "stream", 61 | "name": "stderr", 62 | "text": [ 63 | "The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. 
You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.\n" 64 | ] 65 | }, 66 | { 67 | "output_type": "stream", 68 | "name": "stdout", 69 | "text": [ 70 | "Moving 0 files to the new cache system\n" 71 | ] 72 | }, 73 | { 74 | "output_type": "display_data", 75 | "data": { 76 | "text/plain": [ 77 | "0it [00:00, ?it/s]" 78 | ], 79 | "application/vnd.jupyter.widget-view+json": { 80 | "version_major": 2, 81 | "version_minor": 0, 82 | "model_id": "8be0ab44dc1b42ab93831f9babf2e6bc" 83 | } 84 | }, 85 | "metadata": {} 86 | }, 87 | { 88 | "output_type": "stream", 89 | "name": "stderr", 90 | "text": [ 91 | "100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 55.7MiB/s]\n" 92 | ] 93 | } 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "source": [ 99 | "model.device" 100 | ], 101 | "metadata": { 102 | "colab": { 103 | "base_uri": "https://localhost:8080/" 104 | }, 105 | "id": "u_6_s2iHboR4", 106 | "outputId": "74b1a342-2b05-4e4f-da0c-9a889ea366a5" 107 | }, 108 | "execution_count": 3, 109 | "outputs": [ 110 | { 111 | "output_type": "execute_result", 112 | "data": { 113 | "text/plain": [ 114 | "device(type='cuda', index=0)" 115 | ] 116 | }, 117 | "metadata": {}, 118 | "execution_count": 3 119 | } 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "source": [ 125 | "from IPython.display import Audio\n", 126 | "Audio(\"/content/defines.mp3\")" 127 | ], 128 | "metadata": { 129 | "colab": { 130 | "base_uri": "https://localhost:8080/", 131 | "height": 75 132 | }, 133 | "id": "fhLths-Nfn5Z", 134 | "outputId": "d14f2679-8404-482a-9315-6f42040e0750" 135 | }, 136 | "execution_count": 18, 137 | "outputs": [ 138 | { 139 | "output_type": "execute_result", 140 | "data": { 141 | "text/plain": [ 142 | "" 143 | ], 144 | "text/html": [ 145 | "\n", 146 | " \n", 150 | " " 151 | ] 152 | }, 153 | "metadata": {}, 154 | "execution_count": 18 155 | } 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "source": [ 161 | "from IPython.display import Audio\n", 162 | "Audio(\"/content/pokkiri-message-3920.mp3\")" 163 | ], 164 | "metadata": { 165 | "colab": { 166 | "base_uri": "https://localhost:8080/", 167 | "height": 75 168 | }, 169 | "id": "_8vKBR6cfxm6", 170 | "outputId": "5c7a3e4d-708e-4ec9-cfaf-b39ffeb1f156" 171 | }, 172 | "execution_count": 19, 173 | "outputs": [ 174 | { 175 | "output_type": "execute_result", 176 | "data": { 177 | "text/plain": [ 178 | "" 179 | ], 180 | "text/html": [ 181 | "\n", 182 | " \n", 186 | " " 187 | ] 188 | }, 189 | "metadata": {}, 190 | "execution_count": 19 191 | } 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "source": [ 197 | "\n", 198 | "# load audio and pad/trim it to fit 30 seconds\n", 199 | "audio = whisper.load_audio(\"/content/pokkiri-message-3920.mp3\")\n", 200 | "audio = whisper.pad_or_trim(audio)\n", 201 | "\n", 202 | "# make log-Mel spectrogram and move to the same device as the model\n", 203 | "mel = whisper.log_mel_spectrogram(audio).to(model.device)\n", 204 | "\n", 205 | "# detect the spoken language\n", 206 | "_, probs = model.detect_language(mel)\n", 207 | "print(f\"Detected language: {max(probs, key=probs.get)}\")\n", 208 | "\n", 209 | "# decode the audio\n", 210 | "options = whisper.DecodingOptions()\n", 211 | "result = whisper.decode(model, mel, options)\n", 212 | "\n", 213 | "# print the recognized text\n", 214 | "print(result.text)" 215 | ], 216 | "metadata": { 217 | "id": "bDXgLIprIsAj", 218 | "outputId": "6df60b80-acc9-4a7f-f570-18ba2515e9c1", 219 | "colab": { 220 | "base_uri": 
"https://localhost:8080/" 221 | } 222 | }, 223 | "execution_count": 21, 224 | "outputs": [ 225 | { 226 | "output_type": "stream", 227 | "name": "stdout", 228 | "text": [ 229 | "Detected language: ta\n", 230 | "ஒரு வாட்டி முடியுப் பணிட்டான் என் பேசினானே கேட்கமாட்டான்.\n" 231 | ] 232 | } 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "source": [ 238 | "! pip install gradio -q" 239 | ], 240 | "metadata": { 241 | "id": "fjM27tWsI4dH", 242 | "colab": { 243 | "base_uri": "https://localhost:8080/" 244 | }, 245 | "outputId": "d9b9f822-c104-432f-c9d2-13b90761e486" 246 | }, 247 | "execution_count": 4, 248 | "outputs": [ 249 | { 250 | "output_type": "stream", 251 | "name": "stdout", 252 | "text": [ 253 | "\u001b[K |████████████████████████████████| 5.3 MB 8.5 MB/s \n", 254 | "\u001b[K |████████████████████████████████| 212 kB 69.4 MB/s \n", 255 | "\u001b[K |████████████████████████████████| 54 kB 3.3 MB/s \n", 256 | "\u001b[K |████████████████████████████████| 112 kB 53.5 MB/s \n", 257 | "\u001b[K |████████████████████████████████| 270 kB 53.8 MB/s \n", 258 | "\u001b[K |████████████████████████████████| 2.3 MB 37.9 MB/s \n", 259 | "\u001b[K |████████████████████████████████| 55 kB 3.4 MB/s \n", 260 | "\u001b[K |████████████████████████████████| 57 kB 4.8 MB/s \n", 261 | "\u001b[K |████████████████████████████████| 84 kB 3.6 MB/s \n", 262 | "\u001b[K |████████████████████████████████| 84 kB 4.2 MB/s \n", 263 | "\u001b[K |████████████████████████████████| 63 kB 2.5 MB/s \n", 264 | "\u001b[K |████████████████████████████████| 80 kB 10.8 MB/s \n", 265 | "\u001b[K |████████████████████████████████| 68 kB 7.1 MB/s \n", 266 | "\u001b[K |████████████████████████████████| 43 kB 2.3 MB/s \n", 267 | "\u001b[K |████████████████████████████████| 856 kB 53.5 MB/s \n", 268 | "\u001b[K |████████████████████████████████| 4.0 MB 60.6 MB/s \n", 269 | "\u001b[K |████████████████████████████████| 594 kB 73.5 MB/s \n", 270 | "\u001b[?25h Building wheel for ffmpy (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 271 | " Building wheel for python-multipart (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" 272 | ] 273 | } 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "source": [ 279 | "import gradio as gr \n", 280 | "import time" 281 | ], 282 | "metadata": { 283 | "id": "ILFOYNnTcYe8" 284 | }, 285 | "execution_count": 6, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "source": [ 291 | "def transcribe(audio):\n", 292 | " \n", 293 | " #time.sleep(3)\n", 294 | " # load audio and pad/trim it to fit 30 seconds\n", 295 | " audio = whisper.load_audio(audio)\n", 296 | " audio = whisper.pad_or_trim(audio)\n", 297 | "\n", 298 | " # make log-Mel spectrogram and move to the same device as the model\n", 299 | " mel = whisper.log_mel_spectrogram(audio).to(model.device)\n", 300 | "\n", 301 | " # detect the spoken language\n", 302 | " _, probs = model.detect_language(mel)\n", 303 | " print(f\"Detected language: {max(probs, key=probs.get)}\")\n", 304 | "\n", 305 | " # decode the audio\n", 306 | " options = whisper.DecodingOptions()\n", 307 | " result = whisper.decode(model, mel, options)\n", 308 | " return result.text\n" 309 | ], 310 | "metadata": { 311 | "id": "JtTvvQQPcOZZ" 312 | }, 313 | "execution_count": 11, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "source": [ 319 | "\n", 320 | "gr.Interface(\n", 321 | " title = 'OpenAI Whisper ASR Gradio Web UI', \n", 322 | " fn=transcribe, \n", 323 | " inputs=[\n", 324 | " gr.inputs.Audio(source=\"microphone\", type=\"filepath\")\n", 325 | " ],\n", 326 | " outputs=[\n", 327 | " \"textbox\"\n", 328 | " ],\n", 329 | " live=True).launch()" 330 | ], 331 | "metadata": { 332 | "colab": { 333 | "base_uri": "https://localhost:8080/", 334 | "height": 776 335 | }, 336 | "id": "deSAVvfJcWBo", 337 | "outputId": "6ead8f3e-f1d0-4180-a94c-8825e410968c" 338 | }, 339 | "execution_count": 16, 340 | "outputs": [ 341 | { 342 | "output_type": "stream", 343 | "name": "stderr", 344 | "text": [ 345 | "/usr/local/lib/python3.7/dist-packages/gradio/inputs.py:319: UserWarning: Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your components from gradio.components\n", 346 | " \"Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your components from gradio.components\",\n", 347 | "/usr/local/lib/python3.7/dist-packages/gradio/deprecation.py:40: UserWarning: `optional` parameter is deprecated, and it has no effect\n", 348 | " warnings.warn(value)\n" 349 | ] 350 | }, 351 | { 352 | "output_type": "stream", 353 | "name": "stdout", 354 | "text": [ 355 | "Hint: Set streaming=True for Audio component to use live streaming.\n", 356 | "Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`\n", 357 | "Your interface requires microphone or webcam permissions - this may cause issues in Colab. Use the External URL in case of issues.\n", 358 | "Running on public URL: https://16570.gradio.app\n", 359 | "\n", 360 | "This share link expires in 72 hours. For free permanent hosting, check out Spaces: https://huggingface.co/spaces\n" 361 | ] 362 | }, 363 | { 364 | "output_type": "display_data", 365 | "data": { 366 | "text/plain": [ 367 | "" 368 | ], 369 | "text/html": [ 370 | "
" 371 | ] 372 | }, 373 | "metadata": {} 374 | }, 375 | { 376 | "output_type": "execute_result", 377 | "data": { 378 | "text/plain": [ 379 | "(,\n", 380 | " 'http://127.0.0.1:7865/',\n", 381 | " 'https://16570.gradio.app')" 382 | ] 383 | }, 384 | "metadata": {}, 385 | "execution_count": 16 386 | } 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "source": [], 392 | "metadata": { 393 | "id": "y2Zid2MKdPxK" 394 | }, 395 | "execution_count": null, 396 | "outputs": [] 397 | } 398 | ], 399 | "metadata": { 400 | "accelerator": "GPU", 401 | "colab": { 402 | "collapsed_sections": [], 403 | "provenance": [] 404 | }, 405 | "gpuClass": "standard", 406 | "kernelspec": { 407 | "display_name": "Python 3 (ipykernel)", 408 | "language": "python", 409 | "name": "python3" 410 | }, 411 | "language_info": { 412 | "codemirror_mode": { 413 | "name": "ipython", 414 | "version": 3 415 | }, 416 | "file_extension": ".py", 417 | "mimetype": "text/x-python", 418 | "name": "python", 419 | "nbconvert_exporter": "python", 420 | "pygments_lexer": "ipython3", 421 | "version": "3.9.9" 422 | }, 423 | "widgets": { 424 | "application/vnd.jupyter.widget-state+json": { 425 | "8be0ab44dc1b42ab93831f9babf2e6bc": { 426 | "model_module": "@jupyter-widgets/controls", 427 | "model_name": "HBoxModel", 428 | "model_module_version": "1.5.0", 429 | "state": { 430 | "_dom_classes": [], 431 | "_model_module": "@jupyter-widgets/controls", 432 | "_model_module_version": "1.5.0", 433 | "_model_name": "HBoxModel", 434 | "_view_count": null, 435 | "_view_module": "@jupyter-widgets/controls", 436 | "_view_module_version": "1.5.0", 437 | "_view_name": "HBoxView", 438 | "box_style": "", 439 | "children": [ 440 | "IPY_MODEL_c5f1fffed5de4384a13bea1977894683", 441 | "IPY_MODEL_e884f3a6e2c44e469d8ca5249e813fab", 442 | "IPY_MODEL_5732293799cf42bd979f75b71b2bf1d2" 443 | ], 444 | "layout": "IPY_MODEL_1143ae57534d4ce9a86dc7a9294819f7" 445 | } 446 | }, 447 | "c5f1fffed5de4384a13bea1977894683": { 448 | "model_module": "@jupyter-widgets/controls", 449 | "model_name": "HTMLModel", 450 | "model_module_version": "1.5.0", 451 | "state": { 452 | "_dom_classes": [], 453 | "_model_module": "@jupyter-widgets/controls", 454 | "_model_module_version": "1.5.0", 455 | "_model_name": "HTMLModel", 456 | "_view_count": null, 457 | "_view_module": "@jupyter-widgets/controls", 458 | "_view_module_version": "1.5.0", 459 | "_view_name": "HTMLView", 460 | "description": "", 461 | "description_tooltip": null, 462 | "layout": "IPY_MODEL_729f9914f10c44f8a887f19030246078", 463 | "placeholder": "​", 464 | "style": "IPY_MODEL_0acff2776082461da3386814b1ce5db6", 465 | "value": "" 466 | } 467 | }, 468 | "e884f3a6e2c44e469d8ca5249e813fab": { 469 | "model_module": "@jupyter-widgets/controls", 470 | "model_name": "FloatProgressModel", 471 | "model_module_version": "1.5.0", 472 | "state": { 473 | "_dom_classes": [], 474 | "_model_module": "@jupyter-widgets/controls", 475 | "_model_module_version": "1.5.0", 476 | "_model_name": "FloatProgressModel", 477 | "_view_count": null, 478 | "_view_module": "@jupyter-widgets/controls", 479 | "_view_module_version": "1.5.0", 480 | "_view_name": "ProgressView", 481 | "bar_style": "success", 482 | "description": "", 483 | "description_tooltip": null, 484 | "layout": "IPY_MODEL_1912c3335dfd40b4805a7e70d65e9654", 485 | "max": 1, 486 | "min": 0, 487 | "orientation": "horizontal", 488 | "style": "IPY_MODEL_a419fdcb24a04a7c96356dd096b4e68a", 489 | "value": 0 490 | } 491 | }, 492 | "5732293799cf42bd979f75b71b2bf1d2": { 493 | "model_module": 
"@jupyter-widgets/controls", 494 | "model_name": "HTMLModel", 495 | "model_module_version": "1.5.0", 496 | "state": { 497 | "_dom_classes": [], 498 | "_model_module": "@jupyter-widgets/controls", 499 | "_model_module_version": "1.5.0", 500 | "_model_name": "HTMLModel", 501 | "_view_count": null, 502 | "_view_module": "@jupyter-widgets/controls", 503 | "_view_module_version": "1.5.0", 504 | "_view_name": "HTMLView", 505 | "description": "", 506 | "description_tooltip": null, 507 | "layout": "IPY_MODEL_01dbcbbbc3c04035b4a2bab45f81b02e", 508 | "placeholder": "​", 509 | "style": "IPY_MODEL_84cdb69f8b06492bb51c2ded0c1d4687", 510 | "value": " 0/0 [00:00<?, ?it/s]" 511 | } 512 | }, 513 | "1143ae57534d4ce9a86dc7a9294819f7": { 514 | "model_module": "@jupyter-widgets/base", 515 | "model_name": "LayoutModel", 516 | "model_module_version": "1.2.0", 517 | "state": { 518 | "_model_module": "@jupyter-widgets/base", 519 | "_model_module_version": "1.2.0", 520 | "_model_name": "LayoutModel", 521 | "_view_count": null, 522 | "_view_module": "@jupyter-widgets/base", 523 | "_view_module_version": "1.2.0", 524 | "_view_name": "LayoutView", 525 | "align_content": null, 526 | "align_items": null, 527 | "align_self": null, 528 | "border": null, 529 | "bottom": null, 530 | "display": null, 531 | "flex": null, 532 | "flex_flow": null, 533 | "grid_area": null, 534 | "grid_auto_columns": null, 535 | "grid_auto_flow": null, 536 | "grid_auto_rows": null, 537 | "grid_column": null, 538 | "grid_gap": null, 539 | "grid_row": null, 540 | "grid_template_areas": null, 541 | "grid_template_columns": null, 542 | "grid_template_rows": null, 543 | "height": null, 544 | "justify_content": null, 545 | "justify_items": null, 546 | "left": null, 547 | "margin": null, 548 | "max_height": null, 549 | "max_width": null, 550 | "min_height": null, 551 | "min_width": null, 552 | "object_fit": null, 553 | "object_position": null, 554 | "order": null, 555 | "overflow": null, 556 | "overflow_x": null, 557 | "overflow_y": null, 558 | "padding": null, 559 | "right": null, 560 | "top": null, 561 | "visibility": null, 562 | "width": null 563 | } 564 | }, 565 | "729f9914f10c44f8a887f19030246078": { 566 | "model_module": "@jupyter-widgets/base", 567 | "model_name": "LayoutModel", 568 | "model_module_version": "1.2.0", 569 | "state": { 570 | "_model_module": "@jupyter-widgets/base", 571 | "_model_module_version": "1.2.0", 572 | "_model_name": "LayoutModel", 573 | "_view_count": null, 574 | "_view_module": "@jupyter-widgets/base", 575 | "_view_module_version": "1.2.0", 576 | "_view_name": "LayoutView", 577 | "align_content": null, 578 | "align_items": null, 579 | "align_self": null, 580 | "border": null, 581 | "bottom": null, 582 | "display": null, 583 | "flex": null, 584 | "flex_flow": null, 585 | "grid_area": null, 586 | "grid_auto_columns": null, 587 | "grid_auto_flow": null, 588 | "grid_auto_rows": null, 589 | "grid_column": null, 590 | "grid_gap": null, 591 | "grid_row": null, 592 | "grid_template_areas": null, 593 | "grid_template_columns": null, 594 | "grid_template_rows": null, 595 | "height": null, 596 | "justify_content": null, 597 | "justify_items": null, 598 | "left": null, 599 | "margin": null, 600 | "max_height": null, 601 | "max_width": null, 602 | "min_height": null, 603 | "min_width": null, 604 | "object_fit": null, 605 | "object_position": null, 606 | "order": null, 607 | "overflow": null, 608 | "overflow_x": null, 609 | "overflow_y": null, 610 | "padding": null, 611 | "right": null, 612 | "top": null, 613 | "visibility": null, 614 
| "width": null 615 | } 616 | }, 617 | "0acff2776082461da3386814b1ce5db6": { 618 | "model_module": "@jupyter-widgets/controls", 619 | "model_name": "DescriptionStyleModel", 620 | "model_module_version": "1.5.0", 621 | "state": { 622 | "_model_module": "@jupyter-widgets/controls", 623 | "_model_module_version": "1.5.0", 624 | "_model_name": "DescriptionStyleModel", 625 | "_view_count": null, 626 | "_view_module": "@jupyter-widgets/base", 627 | "_view_module_version": "1.2.0", 628 | "_view_name": "StyleView", 629 | "description_width": "" 630 | } 631 | }, 632 | "1912c3335dfd40b4805a7e70d65e9654": { 633 | "model_module": "@jupyter-widgets/base", 634 | "model_name": "LayoutModel", 635 | "model_module_version": "1.2.0", 636 | "state": { 637 | "_model_module": "@jupyter-widgets/base", 638 | "_model_module_version": "1.2.0", 639 | "_model_name": "LayoutModel", 640 | "_view_count": null, 641 | "_view_module": "@jupyter-widgets/base", 642 | "_view_module_version": "1.2.0", 643 | "_view_name": "LayoutView", 644 | "align_content": null, 645 | "align_items": null, 646 | "align_self": null, 647 | "border": null, 648 | "bottom": null, 649 | "display": null, 650 | "flex": null, 651 | "flex_flow": null, 652 | "grid_area": null, 653 | "grid_auto_columns": null, 654 | "grid_auto_flow": null, 655 | "grid_auto_rows": null, 656 | "grid_column": null, 657 | "grid_gap": null, 658 | "grid_row": null, 659 | "grid_template_areas": null, 660 | "grid_template_columns": null, 661 | "grid_template_rows": null, 662 | "height": null, 663 | "justify_content": null, 664 | "justify_items": null, 665 | "left": null, 666 | "margin": null, 667 | "max_height": null, 668 | "max_width": null, 669 | "min_height": null, 670 | "min_width": null, 671 | "object_fit": null, 672 | "object_position": null, 673 | "order": null, 674 | "overflow": null, 675 | "overflow_x": null, 676 | "overflow_y": null, 677 | "padding": null, 678 | "right": null, 679 | "top": null, 680 | "visibility": null, 681 | "width": "20px" 682 | } 683 | }, 684 | "a419fdcb24a04a7c96356dd096b4e68a": { 685 | "model_module": "@jupyter-widgets/controls", 686 | "model_name": "ProgressStyleModel", 687 | "model_module_version": "1.5.0", 688 | "state": { 689 | "_model_module": "@jupyter-widgets/controls", 690 | "_model_module_version": "1.5.0", 691 | "_model_name": "ProgressStyleModel", 692 | "_view_count": null, 693 | "_view_module": "@jupyter-widgets/base", 694 | "_view_module_version": "1.2.0", 695 | "_view_name": "StyleView", 696 | "bar_color": null, 697 | "description_width": "" 698 | } 699 | }, 700 | "01dbcbbbc3c04035b4a2bab45f81b02e": { 701 | "model_module": "@jupyter-widgets/base", 702 | "model_name": "LayoutModel", 703 | "model_module_version": "1.2.0", 704 | "state": { 705 | "_model_module": "@jupyter-widgets/base", 706 | "_model_module_version": "1.2.0", 707 | "_model_name": "LayoutModel", 708 | "_view_count": null, 709 | "_view_module": "@jupyter-widgets/base", 710 | "_view_module_version": "1.2.0", 711 | "_view_name": "LayoutView", 712 | "align_content": null, 713 | "align_items": null, 714 | "align_self": null, 715 | "border": null, 716 | "bottom": null, 717 | "display": null, 718 | "flex": null, 719 | "flex_flow": null, 720 | "grid_area": null, 721 | "grid_auto_columns": null, 722 | "grid_auto_flow": null, 723 | "grid_auto_rows": null, 724 | "grid_column": null, 725 | "grid_gap": null, 726 | "grid_row": null, 727 | "grid_template_areas": null, 728 | "grid_template_columns": null, 729 | "grid_template_rows": null, 730 | "height": null, 731 | "justify_content": 
null, 732 | "justify_items": null, 733 | "left": null, 734 | "margin": null, 735 | "max_height": null, 736 | "max_width": null, 737 | "min_height": null, 738 | "min_width": null, 739 | "object_fit": null, 740 | "object_position": null, 741 | "order": null, 742 | "overflow": null, 743 | "overflow_x": null, 744 | "overflow_y": null, 745 | "padding": null, 746 | "right": null, 747 | "top": null, 748 | "visibility": null, 749 | "width": null 750 | } 751 | }, 752 | "84cdb69f8b06492bb51c2ded0c1d4687": { 753 | "model_module": "@jupyter-widgets/controls", 754 | "model_name": "DescriptionStyleModel", 755 | "model_module_version": "1.5.0", 756 | "state": { 757 | "_model_module": "@jupyter-widgets/controls", 758 | "_model_module_version": "1.5.0", 759 | "_model_name": "DescriptionStyleModel", 760 | "_view_count": null, 761 | "_view_module": "@jupyter-widgets/base", 762 | "_view_module_version": "1.2.0", 763 | "_view_name": "StyleView", 764 | "description_width": "" 765 | } 766 | } 767 | } 768 | } 769 | }, 770 | "nbformat": 4, 771 | "nbformat_minor": 0 772 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # openai-whisper-webapp 2 | Code for OpenAI Whisper Web App Demo 3 | 4 | 5 | Open In Colab 6 | 7 | 8 | Credits: 9 | 10 | * OpenAI Whisper https://github.com/openai/whisper 11 | 12 | ![image](https://user-images.githubusercontent.com/5347322/191598847-c133d891-399c-4737-be08-18d21a27db95.png) 13 | -------------------------------------------------------------------------------- /cpu_app.py: -------------------------------------------------------------------------------- 1 | import whisper 2 | import gradio as gr 3 | 4 | model = whisper.load_model("small") 5 | 6 | def transcribe(audio): 7 | 8 | #time.sleep(3) 9 | # load audio and pad/trim it to fit 30 seconds 10 | audio = whisper.load_audio(audio) 11 | audio = whisper.pad_or_trim(audio) 12 | 13 | # make log-Mel spectrogram and move to the same device as the model 14 | mel = whisper.log_mel_spectrogram(audio).to(model.device) 15 | 16 | # detect the spoken language 17 | _, probs = model.detect_language(mel) 18 | print(f"Detected language: {max(probs, key=probs.get)}") 19 | 20 | # decode the audio 21 | options = whisper.DecodingOptions(fp16 = False) 22 | result = whisper.decode(model, mel, options) 23 | return result.text 24 | 25 | 26 | 27 | gr.Interface( 28 | title = 'OpenAI Whisper ASR Gradio Web UI', 29 | fn=transcribe, 30 | inputs=[ 31 | gr.inputs.Audio(source="microphone", type="filepath") 32 | ], 33 | outputs=[ 34 | "textbox" 35 | ], 36 | live=True).launch() 37 | --------------------------------------------------------------------------------