├── .github └── FUNDING.yml ├── LICENSE ├── README.md ├── whisper_jax_large_colab.ipynb ├── whisper_jax_medium_colab.ipynb └── whisper_jax_medium_gradio_colab.ipynb /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: camenduru 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: camenduru 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. 
We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 🐣 Please follow me for new updates https://twitter.com/camenduru
2 | 🔥 Please join our Discord server https://discord.gg/k5BwmmvJJU
3 | 🥳 Please join my Patreon community https://patreon.com/camenduru
4 | 5 | ## 🚦 WIP 🚦 6 | 7 | ### 🦒 Colab 8 | 9 | | Colab | Info 10 | | --- | --- | 11 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/whisper-jax-colab/blob/main/whisper_jax_large_colab.ipynb) | openai/whisper-large-v2 Pro Colab (T4) we need High-RAM 😐 12 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/whisper-jax-colab/blob/main/whisper_jax_medium_colab.ipynb) | openai/whisper-medium Free Colab (T4) 13 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/whisper-jax-colab/blob/main/whisper_jax_medium_gradio_colab.ipynb) | openai/whisper-medium Free Colab (T4) Gradio 14 | 15 | ### Main Repo 16 | https://github.com/openai/whisper
17 | https://github.com/sanchit-gandhi/whisper-jax
18 | 19 | ### Paper 20 | https://arxiv.org/abs/2212.04356 21 | -------------------------------------------------------------------------------- /whisper_jax_large_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github" 7 | }, 8 | "source": [ 9 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/whisper-jax-colab/blob/main/whisper_jax_large_colab.ipynb)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "id": "ngr4pZeHCM1C" 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "# https://github.com/sanchit-gandhi/whisper-jax/blob/main/README.md modified\n", 21 | "\n", 22 | "!pip install -q transformers==4.31.0\n", 23 | "!pip install -U flax==0.7.2 \"jax[cuda11_local]==0.4.13\" \"jaxlib[cuda11_local]==0.4.13\" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html\n", 24 | "!pip install -q git+https://github.com/camenduru/whisper-jax.git datasets soundfile librosa yt_dlp cached_property\n", 25 | "\n", 26 | "import jax\n", 27 | "from whisper_jax import FlaxWhisperPipline\n", 28 | "import jax.numpy as jnp\n", 29 | "\n", 30 | "pipeline = FlaxWhisperPipline(\"openai/whisper-large-v2\", dtype=jnp.float16)\n", 31 | "from jax.experimental.compilation_cache import compilation_cache as cc\n", 32 | "cc.initialize_cache(\"/content/jax_cache\")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from yt_dlp import YoutubeDL\n", 42 | "with YoutubeDL({'overwrites':True, 'format':'bestaudio[ext=m4a]', 'outtmpl':'/content/audio.m4a'}) as ydl:\n", 43 | " ydl.download(\"https://youtu.be/LXEAkeh7OTE\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 
51 | "source": [ 52 | "outputs = pipeline(\"/content/audio.m4a\", return_timestamps=True)\n", 53 | "text = outputs[\"text\"]\n", 54 | "chunks = outputs[\"chunks\"]\n", 55 | "\n", 56 | "print(text)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "print(chunks)" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "accelerator": "GPU", 71 | "colab": { 72 | "machine_shape": "hm", 73 | "provenance": [] 74 | }, 75 | "gpuClass": "standard", 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "name": "python" 82 | } 83 | }, 84 | "nbformat": 4, 85 | "nbformat_minor": 0 86 | } 87 | -------------------------------------------------------------------------------- /whisper_jax_medium_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github" 7 | }, 8 | "source": [ 9 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/whisper-jax-colab/blob/main/whisper_jax_medium_colab.ipynb)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "id": "ngr4pZeHCM1C" 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "# https://github.com/sanchit-gandhi/whisper-jax/blob/main/README.md modified\n", 21 | "\n", 22 | "!pip install -q transformers==4.31.0\n", 23 | "!pip install -U flax==0.7.2 \"jax[cuda11_local]==0.4.13\" \"jaxlib[cuda11_local]==0.4.13\" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html\n", 24 | "!pip install -q git+https://github.com/camenduru/whisper-jax.git datasets soundfile librosa yt_dlp cached_property\n", 25 | "\n", 26 | "import jax\n", 27 | "from whisper_jax import FlaxWhisperPipline\n", 28 | "import jax.numpy as jnp\n", 29 | "\n", 30 | "pipeline = 
FlaxWhisperPipline(\"openai/whisper-medium\", dtype=jnp.float16)\n", 31 | "from jax.experimental.compilation_cache import compilation_cache as cc\n", 32 | "cc.initialize_cache(\"/content/jax_cache\")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from yt_dlp import YoutubeDL\n", 42 | "with YoutubeDL({'overwrites':True, 'format':'bestaudio[ext=m4a]', 'outtmpl':'/content/audio.m4a'}) as ydl:\n", 43 | " ydl.download(\"https://youtu.be/LXEAkeh7OTE\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "outputs = pipeline(\"/content/audio.m4a\")\n", 53 | "text = outputs[\"text\"]\n", 54 | "\n", 55 | "print(text)" 56 | ] 57 | } 58 | ], 59 | "metadata": { 60 | "accelerator": "GPU", 61 | "colab": { 62 | "provenance": [] 63 | }, 64 | "gpuClass": "standard", 65 | "kernelspec": { 66 | "display_name": "Python 3", 67 | "name": "python3" 68 | }, 69 | "language_info": { 70 | "name": "python" 71 | } 72 | }, 73 | "nbformat": 4, 74 | "nbformat_minor": 0 75 | } 76 | -------------------------------------------------------------------------------- /whisper_jax_medium_gradio_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github" 7 | }, 8 | "source": [ 9 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/whisper-jax-colab/blob/main/whisper_jax_medium_gradio_colab.ipynb)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "id": "ngr4pZeHCM1C" 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "!pip install -q transformers==4.31.0\n", 21 | "!pip install -U flax==0.7.2 \"jax[cuda11_local]==0.4.13\" \"jaxlib[cuda11_local]==0.4.13\" -f 
https://storage.googleapis.com/jax-releases/jax_cuda_releases.html\n", 22 | "!pip install -q git+https://github.com/camenduru/whisper-jax.git@dev datasets soundfile librosa yt_dlp gradio cached_property\n", 23 | "\n", 24 | "import gradio as gr\n", 25 | "from yt_dlp import YoutubeDL\n", 26 | "import os\n", 27 | "import jax\n", 28 | "from whisper_jax import FlaxWhisperPipline\n", 29 | "import jax.numpy as jnp\n", 30 | "\n", 31 | "pipeline = FlaxWhisperPipline(\"openai/whisper-medium\", dtype=jnp.float16)\n", 32 | "from jax.experimental.compilation_cache import compilation_cache as cc\n", 33 | "cc.initialize_cache(\"/content/jax_cache\")\n", 34 | " \n", 35 | "def download_video(url):\n", 36 | " ydl_opts = {'overwrites':True, 'format':'bestaudio[ext=m4a]', 'outtmpl':'/content/audio.m4a'}\n", 37 | " with YoutubeDL(ydl_opts) as ydl:\n", 38 | " ydl.download(url)\n", 39 | " return f\"/content/audio.m4a\"\n", 40 | "\n", 41 | "# Copied from https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/utils.py#L50\n", 42 | "def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = \".\"):\n", 43 | " if seconds is not None:\n", 44 | " milliseconds = round(seconds * 1000.0)\n", 45 | " hours = milliseconds // 3_600_000\n", 46 | " milliseconds -= hours * 3_600_000\n", 47 | " minutes = milliseconds // 60_000\n", 48 | " milliseconds -= minutes * 60_000\n", 49 | " seconds = milliseconds // 1_000\n", 50 | " milliseconds -= seconds * 1_000\n", 51 | " hours_marker = f\"{hours:02d}:\" if always_include_hours or hours > 0 else \"\"\n", 52 | " return f\"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}\"\n", 53 | " else:\n", 54 | " return seconds\n", 55 | " \n", 56 | "def transcribe(audio_in):\n", 57 | " outputs = pipeline(\"/content/audio.m4a\", return_timestamps=True)\n", 58 | " text = outputs[\"text\"]\n", 59 | " chunks = outputs[\"chunks\"]\n", 60 | " output = \"\"\n", 61 | " # 
https://huggingface.co/spaces/jeffistyping/Youtube-Whisperer/blob/main/app.py modified\n", 62 | " for i, chunk in enumerate(chunks):\n", 63 | " output += f\"{i+1}\\n\"\n", 64 | " output += f\"{format_timestamp(chunk['timestamp'][0])} --> {format_timestamp(chunk['timestamp'][1])}\\n\"\n", 65 | " output += f\"{chunk['text']}\\n\\n\"\n", 66 | " return text, output\n", 67 | "\n", 68 | "app = gr.Blocks()\n", 69 | "with app:\n", 70 | " with gr.Row():\n", 71 | " with gr.Column():\n", 72 | " input_text = gr.Textbox(show_label=False, value=\"https://www.youtube.com/watch?v=SN2sak8Tp70\")\n", 73 | " input_download_button = gr.Button(value=\"Download from YouTube or Twitch\")\n", 74 | " input_transcribe_button = gr.Button(value=\"Transcribe\")\n", 75 | " with gr.Column():\n", 76 | " audio_out = gr.Audio(label=\"Output Audio\")\n", 77 | " text_out = gr.Textbox(label=\"Output Text\")\n", 78 | " chunks_out = gr.Textbox(label=\"Output SRT\")\n", 79 | " input_download_button.click(download_video, inputs=[input_text], outputs=[audio_out])\n", 80 | " input_transcribe_button.click(transcribe, inputs=[audio_out], outputs=[text_out, chunks_out])\n", 81 | " \n", 82 | "app.launch(debug=True)" 83 | ] 84 | } 85 | ], 86 | "metadata": { 87 | "accelerator": "GPU", 88 | "colab": { 89 | "provenance": [] 90 | }, 91 | "gpuClass": "standard", 92 | "kernelspec": { 93 | "display_name": "Python 3", 94 | "name": "python3" 95 | }, 96 | "language_info": { 97 | "name": "python" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 0 102 | } 103 | --------------------------------------------------------------------------------