├── .github
    └── FUNDING.yml
├── LICENSE
├── README.md
├── whisper_jax_large_colab.ipynb
├── whisper_jax_medium_colab.ipynb
└── whisper_jax_medium_gradio_colab.ipynb


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 4 | patreon: camenduru
 5 | open_collective: # Replace with a single Open Collective username
 6 | ko_fi: camenduru
 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
14 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | This is free and unencumbered software released into the public domain.
 2 | 
 3 | Anyone is free to copy, modify, publish, use, compile, sell, or
 4 | distribute this software, either in source code form or as a compiled
 5 | binary, for any purpose, commercial or non-commercial, and by any
 6 | means.
 7 | 
 8 | In jurisdictions that recognize copyright laws, the author or authors
 9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | For more information, please refer to <https://unlicense.org>
25 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 🐣 Please follow me for new updates https://twitter.com/camenduru <br />
 2 | 🔥 Please join our Discord server https://discord.gg/k5BwmmvJJU <br />
 3 | 🥳 Please join my Patreon community https://patreon.com/camenduru <br />
 4 | 
 5 | ## 🚦 WIP 🚦
 6 | 
 7 | ### 🦒 Colab
 8 | 
 9 | | Colab | Info |
10 | | --- | --- |
11 | | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/whisper-jax-colab/blob/main/whisper_jax_large_colab.ipynb) | openai/whisper-large-v2 Pro Colab (T4), High-RAM required 😐 |
12 | | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/whisper-jax-colab/blob/main/whisper_jax_medium_colab.ipynb) | openai/whisper-medium Free Colab (T4) |
13 | | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/whisper-jax-colab/blob/main/whisper_jax_medium_gradio_colab.ipynb) | openai/whisper-medium Free Colab (T4) Gradio |
14 | 
15 | ### Main Repo
16 | https://github.com/openai/whisper <br />
17 | https://github.com/sanchit-gandhi/whisper-jax <br />
18 | 
19 | ### Paper
20 | https://arxiv.org/abs/2212.04356
21 | 


--------------------------------------------------------------------------------
/whisper_jax_large_colab.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |   "cells": [
 3 |     {
 4 |       "cell_type": "markdown",
 5 |       "metadata": {
 6 |         "id": "view-in-github"
 7 |       },
 8 |       "source": [
 9 |         "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/whisper-jax-colab/blob/main/whisper_jax_large_colab.ipynb)"
10 |       ]
11 |     },
12 |     {
13 |       "cell_type": "code",
14 |       "execution_count": null,
15 |       "metadata": {
16 |         "id": "ngr4pZeHCM1C"
17 |       },
18 |       "outputs": [],
19 |       "source": [
20 |         "# https://github.com/sanchit-gandhi/whisper-jax/blob/main/README.md modified\n",
21 |         "\n",
22 |         "!pip install -q transformers==4.31.0\n",
23 |         "!pip install -U flax==0.7.2 \"jax[cuda11_local]==0.4.13\" \"jaxlib[cuda11_local]==0.4.13\" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html\n",
24 |         "!pip install -q git+https://github.com/camenduru/whisper-jax.git datasets soundfile librosa yt_dlp cached_property\n",
25 |         "\n",
26 |         "import jax\n",
27 |         "from whisper_jax import FlaxWhisperPipline\n",
28 |         "import jax.numpy as jnp\n",
29 |         "\n",
30 |         "pipeline = FlaxWhisperPipline(\"openai/whisper-large-v2\", dtype=jnp.float16)\n",
31 |         "from jax.experimental.compilation_cache import compilation_cache as cc\n",
32 |         "cc.initialize_cache(\"/content/jax_cache\")"
33 |       ]
34 |     },
35 |     {
36 |       "cell_type": "code",
37 |       "execution_count": null,
38 |       "metadata": {},
39 |       "outputs": [],
40 |       "source": [
41 |         "from yt_dlp import YoutubeDL\n",
42 |         "with YoutubeDL({'overwrites':True, 'format':'bestaudio[ext=m4a]', 'outtmpl':'/content/audio.m4a'}) as ydl:\n",
43 |         "    ydl.download(\"https://youtu.be/LXEAkeh7OTE\")"
44 |       ]
45 |     },
46 |     {
47 |       "cell_type": "code",
48 |       "execution_count": null,
49 |       "metadata": {},
50 |       "outputs": [],
51 |       "source": [
52 |         "outputs = pipeline(\"/content/audio.m4a\", return_timestamps=True)\n",
53 |         "text = outputs[\"text\"]\n",
54 |         "chunks = outputs[\"chunks\"]\n",
55 |         "\n",
56 |         "print(text)"
57 |       ]
58 |     },
59 |     {
60 |       "cell_type": "code",
61 |       "execution_count": null,
62 |       "metadata": {},
63 |       "outputs": [],
64 |       "source": [
65 |         "print(chunks)"
66 |       ]
67 |     }
68 |   ],
69 |   "metadata": {
70 |     "accelerator": "GPU",
71 |     "colab": {
72 |       "machine_shape": "hm",
73 |       "provenance": []
74 |     },
75 |     "gpuClass": "standard",
76 |     "kernelspec": {
77 |       "display_name": "Python 3",
78 |       "name": "python3"
79 |     },
80 |     "language_info": {
81 |       "name": "python"
82 |     }
83 |   },
84 |   "nbformat": 4,
85 |   "nbformat_minor": 0
86 | }
87 | 


--------------------------------------------------------------------------------
/whisper_jax_medium_colab.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |   "cells": [
 3 |     {
 4 |       "cell_type": "markdown",
 5 |       "metadata": {
 6 |         "id": "view-in-github"
 7 |       },
 8 |       "source": [
 9 |         "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/whisper-jax-colab/blob/main/whisper_jax_medium_colab.ipynb)"
10 |       ]
11 |     },
12 |     {
13 |       "cell_type": "code",
14 |       "execution_count": null,
15 |       "metadata": {
16 |         "id": "ngr4pZeHCM1C"
17 |       },
18 |       "outputs": [],
19 |       "source": [
20 |         "# https://github.com/sanchit-gandhi/whisper-jax/blob/main/README.md modified\n",
21 |         "\n",
22 |         "!pip install -q transformers==4.31.0\n",
23 |         "!pip install -U flax==0.7.2 \"jax[cuda11_local]==0.4.13\" \"jaxlib[cuda11_local]==0.4.13\" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html\n",
24 |         "!pip install -q git+https://github.com/camenduru/whisper-jax.git datasets soundfile librosa yt_dlp cached_property\n",
25 |         "\n",
26 |         "import jax\n",
27 |         "from whisper_jax import FlaxWhisperPipline\n",
28 |         "import jax.numpy as jnp\n",
29 |         "\n",
30 |         "pipeline = FlaxWhisperPipline(\"openai/whisper-medium\", dtype=jnp.float16)\n",
31 |         "from jax.experimental.compilation_cache import compilation_cache as cc\n",
32 |         "cc.initialize_cache(\"/content/jax_cache\")"
33 |       ]
34 |     },
35 |     {
36 |       "cell_type": "code",
37 |       "execution_count": null,
38 |       "metadata": {},
39 |       "outputs": [],
40 |       "source": [
41 |         "from yt_dlp import YoutubeDL\n",
42 |         "with YoutubeDL({'overwrites':True, 'format':'bestaudio[ext=m4a]', 'outtmpl':'/content/audio.m4a'}) as ydl:\n",
43 |         "    ydl.download(\"https://youtu.be/LXEAkeh7OTE\")"
44 |       ]
45 |     },
46 |     {
47 |       "cell_type": "code",
48 |       "execution_count": null,
49 |       "metadata": {},
50 |       "outputs": [],
51 |       "source": [
52 |         "outputs = pipeline(\"/content/audio.m4a\")\n",
53 |         "text = outputs[\"text\"]\n",
54 |         "\n",
55 |         "print(text)"
56 |       ]
57 |     }
58 |   ],
59 |   "metadata": {
60 |     "accelerator": "GPU",
61 |     "colab": {
62 |       "provenance": []
63 |     },
64 |     "gpuClass": "standard",
65 |     "kernelspec": {
66 |       "display_name": "Python 3",
67 |       "name": "python3"
68 |     },
69 |     "language_info": {
70 |       "name": "python"
71 |     }
72 |   },
73 |   "nbformat": 4,
74 |   "nbformat_minor": 0
75 | }
76 | 


--------------------------------------------------------------------------------
/whisper_jax_medium_gradio_colab.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {
  6 |         "id": "view-in-github"
  7 |       },
  8 |       "source": [
  9 |         "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/whisper-jax-colab/blob/main/whisper_jax_medium_gradio_colab.ipynb)"
 10 |       ]
 11 |     },
 12 |     {
 13 |       "cell_type": "code",
 14 |       "execution_count": null,
 15 |       "metadata": {
 16 |         "id": "ngr4pZeHCM1C"
 17 |       },
 18 |       "outputs": [],
 19 |       "source": [
 20 |         "!pip install -q transformers==4.31.0\n",
 21 |         "!pip install -U flax==0.7.2 \"jax[cuda11_local]==0.4.13\" \"jaxlib[cuda11_local]==0.4.13\" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html\n",
 22 |         "!pip install -q git+https://github.com/camenduru/whisper-jax.git@dev datasets soundfile librosa yt_dlp gradio cached_property\n",
 23 |         "\n",
 24 |         "import gradio as gr\n",
 25 |         "from yt_dlp import YoutubeDL\n",
 26 |         "import os\n",
 27 |         "import jax\n",
 28 |         "from whisper_jax import FlaxWhisperPipline\n",
 29 |         "import jax.numpy as jnp\n",
 30 |         "\n",
 31 |         "pipeline = FlaxWhisperPipline(\"openai/whisper-medium\", dtype=jnp.float16)\n",
 32 |         "from jax.experimental.compilation_cache import compilation_cache as cc\n",
 33 |         "cc.initialize_cache(\"/content/jax_cache\")\n",
 34 |         "    \n",
 35 |         "def download_video(url):\n",
 36 |         "  ydl_opts = {'overwrites':True, 'format':'bestaudio[ext=m4a]', 'outtmpl':'/content/audio.m4a'}\n",
 37 |         "  with YoutubeDL(ydl_opts) as ydl:\n",
 38 |         "    ydl.download(url)\n",
 39 |         "    return f\"/content/audio.m4a\"\n",
 40 |         "\n",
 41 |         "# Copied from https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/utils.py#L50\n",
 42 |         "def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = \".\"):\n",
 43 |         "    if seconds is not None:\n",
 44 |         "        milliseconds = round(seconds * 1000.0)\n",
 45 |         "        hours = milliseconds // 3_600_000\n",
 46 |         "        milliseconds -= hours * 3_600_000\n",
 47 |         "        minutes = milliseconds // 60_000\n",
 48 |         "        milliseconds -= minutes * 60_000\n",
 49 |         "        seconds = milliseconds // 1_000\n",
 50 |         "        milliseconds -= seconds * 1_000\n",
 51 |         "        hours_marker = f\"{hours:02d}:\" if always_include_hours or hours > 0 else \"\"\n",
 52 |         "        return f\"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}\"\n",
 53 |         "    else:\n",
 54 |         "        return seconds\n",
 55 |         "  \n",
 56 |         "def transcribe(audio_in):\n",
 57 |         "    outputs = pipeline(\"/content/audio.m4a\", return_timestamps=True)\n",
 58 |         "    text = outputs[\"text\"]\n",
 59 |         "    chunks = outputs[\"chunks\"]\n",
 60 |         "    output = \"\"\n",
 61 |         "    # https://huggingface.co/spaces/jeffistyping/Youtube-Whisperer/blob/main/app.py modified\n",
 62 |         "    for i, chunk in enumerate(chunks):\n",
 63 |         "      output += f\"{i+1}\\n\"\n",
 64 |         "      output += f\"{format_timestamp(chunk['timestamp'][0])} --> {format_timestamp(chunk['timestamp'][1])}\\n\"\n",
 65 |         "      output += f\"{chunk['text']}\\n\\n\"\n",
 66 |         "    return text, output\n",
 67 |         "\n",
 68 |         "app = gr.Blocks()\n",
 69 |         "with app:\n",
 70 |         "  with gr.Row():\n",
 71 |         "    with gr.Column():\n",
 72 |         "      input_text = gr.Textbox(show_label=False, value=\"https://www.youtube.com/watch?v=SN2sak8Tp70\")\n",
 73 |         "      input_download_button = gr.Button(value=\"Download from YouTube or Twitch\")\n",
 74 |         "      input_transcribe_button = gr.Button(value=\"Transcribe\")\n",
 75 |         "    with gr.Column():\n",
 76 |         "        audio_out = gr.Audio(label=\"Output Audio\")\n",
 77 |         "        text_out = gr.Textbox(label=\"Output Text\")\n",
 78 |         "        chunks_out = gr.Textbox(label=\"Output SRT\")\n",
 79 |         "    input_download_button.click(download_video, inputs=[input_text], outputs=[audio_out])\n",
 80 |         "    input_transcribe_button.click(transcribe, inputs=[audio_out], outputs=[text_out, chunks_out])\n",
 81 |         "  \n",
 82 |         "app.launch(debug=True)"
 83 |       ]
 84 |     }
 85 |   ],
 86 |   "metadata": {
 87 |     "accelerator": "GPU",
 88 |     "colab": {
 89 |       "provenance": []
 90 |     },
 91 |     "gpuClass": "standard",
 92 |     "kernelspec": {
 93 |       "display_name": "Python 3",
 94 |       "name": "python3"
 95 |     },
 96 |     "language_info": {
 97 |       "name": "python"
 98 |     }
 99 |   },
100 |   "nbformat": 4,
101 |   "nbformat_minor": 0
102 | }
103 | 


--------------------------------------------------------------------------------