├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── decipher ├── __init__.py ├── __main__.py ├── action.py └── gui.py ├── notebook.ipynb ├── requirements.txt └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .idea/ 132 | logs/ 133 | tmp/ 134 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 dsymbol 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Decipher 📺️ 2 | 3 | AI-generated transcription subtitles are a way to automatically add subtitles to your videos by using artificial intelligence to transcribe the audio from the video. This eliminates the need for manual transcription and can make your videos more accessible to a wider audience. Decipher uses [whisper](#what-is-whisper) to transcribe the audio taken from the video and create subtitles 4 | 5 | #### What is whisper? 6 | [Whisper](https://github.com/openai/whisper) is an automatic State-of-the-Art speech recognition system from OpenAI that has been trained on 680,000 hours 7 | of multilingual and multitask supervised data collected from the web. This large and diverse dataset leads to improved 8 | robustness to accents, background noise and technical language. 9 | 10 | ## Getting Started 11 | 12 | There are two different ways to begin using decipher, depending on your preferences: 13 | 14 | * [`Google Colab`](#google-colab) 15 | * [`Manual`](#manual) 16 | 17 | ### Google Colab 18 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dsymbol/decipher/blob/main/notebook.ipynb) 19 | 20 | Notes: 21 | - Requires a (free) Google account 22 | - Instructions are embedded in the Colab Notebook 23 | 24 | Google Colab is a cloud-based platform for machine learning and data science, for free without the need for a powerful GPU of your own. It offers the option to borrow a powerful GPU (Tesla K80, T4, P4, or P100) on their server for free for a maximum of 12 hours per session. For those who require even more powerful GPUs and longer runtimes, Colab Pro/Pro+ options are available. 
25 | 26 | ### Manual 27 | 28 | #### Dependencies 29 | 30 | - [`Python`](https://www.python.org/downloads/) 31 | - [`ffmpeg`](https://ffmpeg.org/) 32 | 33 | #### Installation 34 | 35 | ``` 36 | pip install git+https://github.com/dsymbol/decipher 37 | ``` 38 | 39 | or 40 | 41 | ``` 42 | git clone https://github.com/dsymbol/decipher 43 | cd decipher && pip install . 44 | ``` 45 | *Note: Do NOT use 'pip install decipher'. It installs a different package.* 46 | 47 | #### GUI (gradio) usage 48 | 49 | ```bash 50 | decipher gui 51 | # or 52 | python -m decipher gui 53 | ``` 54 | 55 | #### Command-line usage 56 | 57 | The `transcribe` subcommand allows you to transcribe a video file into a SubRip Subtitle (SRT) file. 58 | It also has the option to automatically add the generated subtitles to the video. 59 | 60 | The `subtitle` subcommand allows you to add subtitles to a video using an already existing SRT file. 61 | This subcommand does not perform transcription, but rather assumes that the SRT file has already been created. 62 | It is typically used by people who want to validate the accuracy of a transcription generated by the transcribe subcommand. 
import argparse
import sys


def cli(argv=None):
    """Parse command line arguments for the decipher CLI.

    Args:
        argv: Optional explicit argument list (useful for tests). When None,
            argparse falls back to ``sys.argv[1:]`` — the original behavior.

    Returns:
        argparse.Namespace whose ``action`` attribute is one of
        "transcribe", "subtitle" or "gui", plus that subcommand's options.
    """
    parser = argparse.ArgumentParser(
        prog="decipher", description="Transcribe videos easily using openai whisper"
    )
    # required=True: argparse exits with an error when no subcommand is given.
    subparsers = parser.add_subparsers(required=True, dest="action")

    t = subparsers.add_parser("transcribe", help="transcribe a video")
    t.add_argument(
        "-i",
        "--input",
        required=True,
        type=str,
        help="input video file path e.g. video.mp4",
    )
    t.add_argument(
        "-o", "--output_dir", type=str, default=".", help="output directory path"
    )
    t.add_argument(
        "--model",
        default="medium",
        type=str,
        help="name of the whisper model to use https://huggingface.co/openai/whisper-large-v3#model-details",
    )
    t.add_argument(
        "--language", type=str, default=None, help="language spoken in the audio"
    )
    t.add_argument(
        "--task",
        type=str,
        default="transcribe",
        choices=["transcribe", "translate"],
        help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')",
    )
    t.add_argument(
        "--batch_size",
        required=False,
        type=int,
        default=24,
        help="Number of parallel batches reduce if you face out of memory errors",
    )
    t.add_argument(
        "-a",
        "--subtitle_action",
        type=str,
        default=None,
        choices=["add", "burn"],
        help="whether to perform subtitle add or burn action",
    )

    s = subparsers.add_parser("subtitle", help="subtitle a video")
    s.add_argument(
        "-i",
        "--input",
        required=True,
        type=str,
        help="input video file path e.g. video.mp4",
    )
    s.add_argument(
        "-o", "--output_dir", type=str, default=".", help="output directory path"
    )
    s.add_argument(
        "-s",
        "--subtitle_file",
        required=True,
        type=str,
        help="input subtitles path e.g. subtitle.srt",
    )
    s.add_argument(
        "-a",
        "--subtitle_action",
        type=str,
        default="burn",
        choices=["add", "burn"],
        help="whether to perform subtitle add or burn action",
    )

    g = subparsers.add_parser("gui", help="launch a gradio gui")
    g.add_argument(
        "--share",
        action="store_true",
        default=False,
        help="create a publicly shareable link for the interface (default: False)",
    )
    return parser.parse_args(argv)


def main():
    """Entry point: parse arguments, then dispatch to the chosen action."""
    args = cli()

    # Imported lazily so that `decipher --help` (and bad-argument errors) stay
    # fast and do not pull in torch/gradio or trigger the ffmpeg bootstrap.
    from ffutils import get_ffmpeg_exe

    get_ffmpeg_exe()  # make sure an ffmpeg binary is available before any work

    if args.action == "transcribe":
        from .action import transcribe

        output = transcribe(
            args.input,
            args.output_dir,
            args.model,
            args.language,
            args.task,
            args.batch_size,
            args.subtitle_action,
        )
    elif args.action == "subtitle":
        from .action import subtitle

        output = subtitle(
            args.input, args.subtitle_file, args.output_dir, args.subtitle_action
        )
    elif args.action == "gui":
        from .gui import ui

        # The gui branch returns directly; there is no PathStore to print.
        return ui().launch(share=args.share)
    print(f"Result -> {output.output_dir}")


if __name__ == "__main__":
    sys.exit(main())
from dataclasses import dataclass
from pathlib import Path
from tempfile import TemporaryDirectory

import stable_whisper
import torch
from ffutils import ffprog

root = Path(__file__).parent


@dataclass
class PathStore:
    """Paths produced by :func:`transcribe` / :func:`subtitle`."""

    # Directory holding every generated artifact.
    output_dir: Path
    # The SRT file that was generated (transcribe) or supplied (subtitle).
    subtitle_file: Path
    # Subtitled output video; stays None when no subtitle action was performed.
    video_file: Path = None


def audio_to_srt(
    audio_file,
    model="medium",
    task="transcribe",
    language=None,
    batch_size=24,
):
    """Transcribe an audio file with whisper and return subtitles as SRT text.

    Args:
        audio_file: Path of the audio file to transcribe.
        model: Whisper model name (e.g. "tiny" ... "large-v3").
        task: "transcribe" (X->X) or "translate" (X->English).
        language: Language spoken in the audio; None lets whisper auto-detect.
        batch_size: Number of parallel batches; reduce on out-of-memory errors.

    Returns:
        str: Segment-level subtitles in SRT format.
    """
    # Pick the best available accelerator: CUDA, then Apple-silicon MPS, else CPU.
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    print(f"{device.upper()} is being used for this transcription.")

    model = stable_whisper.load_hf_whisper(model, device=device)
    result = model.transcribe(
        audio_file, language=language, task=task, batch_size=batch_size
    )
    # word_level=False emits one caption per segment rather than per word.
    return result.to_srt_vtt(word_level=False)


def transcribe(
    video_in,
    output_dir=".",
    model="medium",
    language=None,
    task="transcribe",
    batch_size=24,
    subtitle_action=None,
) -> PathStore:
    """Extract audio from a video, transcribe it, and write an SRT file.

    Args:
        video_in: Input video path.
        output_dir: Directory for the SRT (and optional video); created if missing.
        model: Whisper model name.
        language: Language spoken in the audio; None for auto-detection.
        task: "transcribe" or "translate".
        batch_size: Number of parallel batches for whisper.
        subtitle_action: Optional "add"/"burn" — when given, the generated SRT
            is immediately applied to the video via :func:`subtitle`.

    Returns:
        PathStore with the output directory, SRT path and (optionally) video.

    Raises:
        FileNotFoundError: If *video_in* does not exist.
    """
    video_in = Path(video_in).absolute()
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`.
    if not video_in.exists():
        raise FileNotFoundError(f"File {video_in} does not exist")

    output_dir = Path(output_dir).absolute()
    output_dir.mkdir(parents=True, exist_ok=True)

    with TemporaryDirectory() as _tempdir:
        tempdir = Path(_tempdir)
        audio_file = str(tempdir / "audio.aac")

        # Drop the video stream (-vn) and encode the audio track as AAC.
        ffprog(
            ["ffmpeg", "-y", "-i", str(video_in), "-vn", "-c:a", "aac", audio_file],
            desc="Extracting audio from video",
        )

        srt_text = audio_to_srt(audio_file, model, task, language, batch_size)

    # The temp audio is no longer needed; write the SRT to the real output dir.
    srt_file = output_dir / f"{video_in.stem}.srt"
    with open(srt_file, "w", encoding="utf-8") as f:
        f.write(srt_text)

    if subtitle_action:
        store = subtitle(video_in, srt_file, output_dir, subtitle_action)
    else:
        store = PathStore(output_dir, srt_file)

    return store


def subtitle(video_in, subtitle_file, output_dir=".", action="burn") -> PathStore:
    """Add or burn an existing SRT file into a video.

    Args:
        video_in: Input video path.
        subtitle_file: Existing SRT file path.
        output_dir: Directory for the output video; created if missing.
        action: "burn" hard-renders the subtitles into the frames; any other
            value muxes them as a soft mov_text track ("add").

    Returns:
        PathStore with the output directory, the SRT used, and the new video.

    Raises:
        FileNotFoundError: If the video or the subtitle file does not exist.
    """
    video_in = Path(video_in).absolute()
    subtitle_file = Path(subtitle_file).absolute()
    if not video_in.exists():
        raise FileNotFoundError(f"File {video_in} does not exist")
    if not subtitle_file.exists():
        raise FileNotFoundError(f"File {subtitle_file} does not exist")

    output_dir = Path(output_dir).absolute()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Both actions write to the same output name.
    video_out = output_dir / f"{video_in.stem}_out.mp4"

    if action == "burn":
        ffprog(
            [
                "ffmpeg",
                "-y",
                "-i",
                str(video_in),
                "-vf",
                f"subtitles={str(subtitle_file.name)}:force_style='Fontname=Arial,Fontsize=16,OutlineColour=&H80000000,BorderStyle=4,"
                "BackColour=&H80000000,Outline=0,Shadow=0,MarginV=10,Alignment=2,Bold=-1'",
                str(video_out),
            ],
            # Run from the SRT's directory so the filter receives a bare
            # filename; the subtitles filter mishandles full/escaped paths
            # (https://trac.ffmpeg.org/ticket/3334).
            cwd=str(subtitle_file.parent),
            desc="Burning subtitles into video",
        )
    else:
        ffprog(
            [
                "ffmpeg",
                "-y",
                "-i",
                str(video_in),
                "-i",
                str(subtitle_file),
                "-c:s",
                "mov_text",
                str(video_out),
            ],
            desc="Adding subtitles to video",
        )

    return PathStore(output_dir, subtitle_file, video_out)
import os

import gradio as gr
from decipher import action

from tempfile import mkstemp, gettempdir


def __transcribe(video_in, model, language, task, batch_size, subs):
    """Gradio callback: transcribe the uploaded video.

    Returns a (subtitles text, output video path) pair for the two output
    components of the Transcribe tab.
    """
    result = action.transcribe(
        video_in,
        gettempdir(),
        model,
        language if language else None,
        task.lower(),
        batch_size,
        subs.lower() if subs else None,
    )
    with open(result.subtitle_file, "r", encoding="utf-8") as f:
        subtitles = f.read()
    # video_file is None when no subtitle action was chosen; hand gradio None
    # instead of the literal string "None" so the Video output stays empty.
    video_out = str(result.video_file) if result.video_file is not None else None
    return str(subtitles), video_out


def __subtitle(video_in, subs, task):
    """Gradio callback: apply the pasted SRT text to the uploaded video."""
    # mkstemp instead of the deprecated, race-prone tempfile.mktemp: the file
    # is created atomically and we get an open descriptor back.
    fd, temp_srt = mkstemp(suffix=".srt")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(subs)
        result = action.subtitle(video_in, temp_srt, gettempdir(), task.lower())
    finally:
        os.remove(temp_srt)  # clean up even if subtitling fails
    return str(result.video_file)


# Whisper model choices exposed in the GUI dropdown.
MODELS = ["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"]


def ui():
    """Build and return the gradio Blocks interface (call .launch() to serve)."""
    with gr.Blocks() as demo:
        with gr.Tab("Transcribe"):
            with gr.Row():
                with gr.Column():
                    ti_video = gr.Video(label="Video", sources=["upload"])
                    ti_model = gr.Dropdown(choices=MODELS, value="medium", label="Model")
                    ti_language = gr.Textbox(
                        label="Language", placeholder="English",
                        info="Language spoken in the audio leave empty for detection"
                    )
                    ti_task = gr.Radio(
                        choices=["Transcribe", "Translate"], value="Transcribe", label="Task",
                        info="Whether to perform X->X speech recognition or X->English translation"
                    )
                    ti_subtitles = gr.Radio(
                        label="Subtitle video", choices=["Add", "Burn"],
                        info="Whether to perform subtitle add or burn action leave empty for none"
                    )
                    ti_batch_size = gr.Slider(
                        0, 24, value=24, step=1, label="Batch Size",
                        info="Number of parallel batches reduce if you face out of memory errors"
                    )
                with gr.Column():
                    to_subtitles = gr.Textbox(label="Subtitles", lines=15, show_copy_button=True, autoscroll=False)
                    to_video = gr.Video(label="Video")
            transcribe_btn = gr.Button("Transcribe")
            transcribe_btn.click(fn=__transcribe,
                                 inputs=[ti_video, ti_model, ti_language, ti_task, ti_batch_size, ti_subtitles],
                                 outputs=[to_subtitles, to_video])

        with gr.Tab("Subtitle"):
            with gr.Row():
                with gr.Column():
                    si_video = gr.Video(label="Video", sources=["upload"])
                    si_subtitles = gr.Textbox(label="Subtitles", lines=15, show_copy_button=True)
                    si_task = gr.Radio(
                        label="Subtitle video", choices=["Add", "Burn"], value="Burn",
                        info="Whether to perform subtitle add or burn action leave empty for none"
                    )
                with gr.Column():
                    so_video = gr.Video(label="Video")

            subtitle_btn = gr.Button("Subtitle")
            subtitle_btn.click(fn=__subtitle, inputs=[si_video, si_subtitles, si_task], outputs=so_video)

    return demo
14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "cellView": "form", 21 | "id": "0OTuWHHYKbvp" 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "#@title ## Install and Run Decipher\n", 26 | "\n", 27 | "#@markdown Make sure the Hardware accelerator is set to GPU, go to Runtime > Change runtime type and select GPU in the \"Hardware accelerator\" dropdown menu then click \"Save\".\n", 28 | "\n", 29 | "#@markdown Once installation is done you will get a link to gradio.app, open it.\n", 30 | "!nvidia-smi\n", 31 | "!apt update && apt install ffmpeg\n", 32 | "!pip install git+https://github.com/dsymbol/decipher\n", 33 | "!python -m decipher gui --share" 34 | ] 35 | } 36 | ], 37 | "metadata": { 38 | "accelerator": "GPU", 39 | "colab": { 40 | "private_outputs": true, 41 | "provenance": [] 42 | }, 43 | "gpuClass": "standard", 44 | "kernelspec": { 45 | "display_name": "Python 3", 46 | "name": "python3" 47 | }, 48 | "language_info": { 49 | "name": "python" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 0 54 | } 55 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | . -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='decipher', 5 | url='https://github.com/dsymbol/decipher', 6 | author='dsymbol', 7 | install_requires=[ 8 | 'transformers', 9 | 'optimum', 10 | 'accelerate', 11 | 'tqdm', 12 | 'gradio', 13 | 'ffutils', 14 | 'stable-ts==2.17.3', 15 | ], 16 | packages=find_packages(), 17 | entry_points={ 18 | 'console_scripts': [ 19 | 'decipher = decipher.__main__:main' 20 | ] 21 | } 22 | ) 23 | --------------------------------------------------------------------------------