├── .gitignore ├── Makefile ├── README.md ├── config.py ├── core ├── __init__.py ├── audio.py ├── client.py ├── downloader.py ├── prompts.py ├── schema.py ├── utils.py └── video.py ├── main.py ├── poetry.lock ├── poetry.toml ├── pyproject.toml ├── src └── fonts │ └── PlaypenSans.ttf ├── tests └── __init__.py ├── vt └── __init__.py └── webui.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | .idea/ 161 | .download 162 | output 163 | .DS_Store 164 | local_settings.py 165 | .env -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYTHON_FILES=. 2 | lint format: PYTHON_FILES=. 3 | 4 | lint: 5 | poetry run ruff . 
6 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff 7 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I $(PYTHON_FILES) 8 | [ "$(PYTHON_FILES)" = "" ] || poetry run mypy $(PYTHON_FILES) 9 | 10 | format: 11 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) 12 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I --fix $(PYTHON_FILES) 13 | 14 | ###################### 15 | # HELP 16 | ###################### 17 | 18 | help: 19 | @echo '====================' 20 | @echo 'format - run code formatters' 21 | @echo 'lint - run linters' 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## video-translation 2 | 3 | video-translation is an ongoing project leveraging [OpenAI Whisper](https://github.com/openai/whisper) and the OpenAI API ([TTS](https://platform.openai.com/docs/guides/text-to-speech)) to accomplish the following objectives: 4 | 5 | 1. Video Download: ✅ 6 | 2. Extract Audio from Video: Default format is mp3. ✅ 7 | 3. Generate Subtitles from Audio and Translate: ✅ 8 | 4. Embed Hard Subtitles into Videos: ✅ 9 | 5. Support for Video Production in Different Languages based on Subtitles: In progress 10 | 11 | Hence, vt autonomously adds subtitles in multiple languages to both online and local videos. Future capabilities aim to automatically convert video voices into various languages. Feel free to follow and contribute to this project. 12 | 13 | ## Setup 14 | 15 | Begin by installing and updating using poetry: 16 | 17 | ```bash 18 | poetry install 19 | ``` 20 | 21 | Additionally, ensure you have the command-line tool `ffmpeg` installed on your system. 
It's available via most package managers: 22 | ```bash 23 | # on Ubuntu or Debian 24 | sudo apt update && sudo apt install ffmpeg 25 | 26 | # on Arch Linux 27 | sudo pacman -S ffmpeg 28 | 29 | # on MacOS using Homebrew (https://brew.sh/) 30 | brew install ffmpeg 31 | 32 | # on Windows using Chocolatey (https://chocolatey.org/) 33 | choco install ffmpeg 34 | 35 | # on Windows using Scoop (https://scoop.sh/) 36 | scoop install ffmpeg 37 | ``` 38 | 39 | ## Streamlit-based Web UI 40 | 41 | Our project's Web UI, powered by Streamlit, allows users to add subtitles to videos through an intuitive web interface. This section outlines the primary features of this functionality: 42 | 43 | 1. **Configurable Options**: Users can specify multiple settings such as the Whisper model for audio processing, language preferences, video file types, and subtitle styles to suit various requirements. 44 | 2. **Automatic Subtitle Generation**: The tool supports generating subtitles directly from the video content, leveraging advanced speech-to-text technology for accuracy. 45 | 3. **Subtitle Preview Mode**: After adding subtitles, users can engage a preview mode to see how subtitles appear on the actual video, ensuring proper synchronization and styling. 46 | 4. **Editable Subtitles with Auto Save**: This feature allows users to edit subtitles and see changes in real-time. The 'Auto Save' option can be toggled to apply modifications instantly. 47 | 5. **Subtitle File Download**: Once subtitles are created or edited, users have the option to download these files in different formats (e.g., SRT, VTT), making them compatible with various media players and platforms. 48 | 6. **Export Video with Subtitles**: Users can export the final video with subtitles embedded, ready for sharing on different platforms or for personal use. 
49 | 50 | Launching the Web UI: 51 | 52 | ```bash 53 | poetry run streamlit run webui.py 54 | ``` 55 | 56 | Then open your web browser and go to the following address to access the Web UI: `http://localhost:8501/` 57 | 58 | ## Command-line usage 59 | 60 | There are two primary command-line usage types: 61 | 62 | 1. **Generate Subtitle Files**: Generates subtitle files (srt, vtt and other formats) from the video content. You can use video tools or website functions to add these subtitles to videos. 63 | 2. **Directly Add Subtitles to Videos**: Currently supports three methods of subtitle addition. 64 | 65 | ### Generate subtitle files (supports the srt, vtt, json, txt, tsv and all formats) 66 | 67 | ```bash 68 | poetry run vt subtitle --path='https://www.bilibili.com/video/BV1tb4y1L7yA' --language=en --method=whisper --format=vtt --output=/Users/169/Downloads 69 | ``` 70 | 71 | ### Download video online and add subtitle 72 | 73 | ```bash 74 | poetry run python main.py web --url='https://www.youtube.com/watch?v=CqRrByI-ONE' 75 | ``` 76 | 77 | The `web` subcommand also supports more options, as follows: 78 | 79 | ```bash 80 | poetry run vt web --help 81 | Usage: vt web [OPTIONS] 82 | 83 | Options: 84 | -u, --url TEXT Video url. [required] 85 | -o, --output TEXT Generated video dir. 86 | -b, --bilingual TEXT Use bilingual subtitle. Notice: this option is mutually exclusive 87 | with subtitles. 88 | -s, --subtitles TEXT Subtitle languages. split by ",". Notice: this option is mutually 89 | exclusive with bilingual. 90 | --help Show this message and exit. 91 | ``` 92 | 93 | ### Use local video and add subtitle 94 | 95 | ```bash 96 | poetry run vt local --path='/Users/169/Movies/test.mov' 97 | ``` 98 | 99 | The `local` subcommand also supports more options, as follows: 100 | 101 | ```bash 102 | poetry run vt local --help 103 | Usage: vt local [OPTIONS] 104 | 105 | Options: 106 | -p, --path TEXT Local file path. [required] 107 | -o, --output TEXT Generated video dir. 
108 | -b, --bilingual TEXT Use bilingual subtitle. you can specify the subtitle 109 | language by `--subtitles`. Notice: this option is 110 | mutually exclusive with subtitles. 111 | -s, --subtitles TEXT Subtitle languages. split by ",". Notice: this option 112 | is mutually exclusive with bilingual. 113 | --help Show this message and exit. 114 | ``` 115 | 116 | ### Add bilingual subtitles to local video 117 | 118 | Use the `--bilingual` option to create bilingual subtitles. Separate the two languages with commas, with the first language on top and the latter at the bottom. 119 | 120 | ```bash 121 | poetry run vt local --path="/Users/169/videos/Langchain C3_L6.mp4" --bilingual="cn,en" --output="/Users/169/Downloads" 122 | ``` 123 | 124 | ## Custom config 125 | 126 | All configuration items reside in [config.py](https://github.com/169/video-translation/blob/main/config.py). 127 | 128 | The project also supports limited custom configuration. Add a file named `local_settings.py` within the project and include the settings you wish to override. 
129 | 130 | For instance, `DEBUG` defaults to `False` in `config.py`; you can enable debug mode by adding `DEBUG = True` to `local_settings.py` (the name must match the one in `config.py` exactly, including case): 131 | 132 | ```bash 133 | cat local_settings.py 134 | DEBUG = True 135 | ``` 136 | 137 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | DEBUG = False 4 | HERE = Path(__file__).parent.absolute() 5 | DOWNLOAD_DIR: str | Path = HERE / ".download" 6 | OUTPUT_DIR: str | Path = HERE / "output" 7 | WHISPER_MODEL = "large" 8 | FFMPEG_BIN = "ffmpeg" 9 | FFMPEG_PREFIX_OPTS = "-hide_banner -loglevel error -y" 10 | FFMPEG_FORMAT_OPTS = "-c:v libx264 -preset fast -qp 0 -c:a aac -vf format=yuv420p" 11 | ZH_FONT_NAME = "Yuanti SC" 12 | EN_FONT_NAME = "Playpen Sans" 13 | BI_FONT_NAME = "Songti SC" 14 | ZH_FONT_SIZE = 15 15 | EN_FONT_SIZE = 20 16 | BI_FONT_SIZE = 12 17 | # https://wamingo.net/rgbbgr/ 18 | FONT_COLOR = "&H00FFFFFF" 19 | OUTLINE_COLOR = "&H00504eff" 20 | MISMATCH_LIMIT = 5 21 | TEXT_LIMIT = 20 22 | 23 | OPENAI_API_KEY = "" 24 | OPENAI_MODEL = "gpt-4o" 25 | INITIAL_PROMPT_MAP = { 26 | "zh": "以下是普通话的句子,请以简体输出。", 27 | } 28 | 29 | ZH_FORCE_STYLE = f"Fontname={ZH_FONT_NAME},PrimaryColour={FONT_COLOR},OutlineColour={OUTLINE_COLOR},BorderStyle=3,Fontsize={ZH_FONT_SIZE}" 30 | EN_FORCE_STYLE = f"Fontname={EN_FONT_NAME},PrimaryColour={FONT_COLOR},OutlineColour={OUTLINE_COLOR},BorderStyle=3,Fontsize={EN_FONT_SIZE}" 31 | BI_FORCE_STYLE = f"Fontname={EN_FONT_NAME},PrimaryColour={FONT_COLOR},OutlineColour={OUTLINE_COLOR},BorderStyle=3,Fontsize={BI_FONT_SIZE}" 32 | 33 | try: 34 | from local_settings import * # noqa: F403 35 | except ImportError: 36 | ... 
def transcribe(
    audio: str, language: str = "en", model_name: str = WHISPER_MODEL
) -> dict:
    """Transcribe an audio file with a locally loaded Whisper model.

    Args:
        audio: Path of the audio file handed to Whisper.
        language: Language code passed to Whisper. It also selects the
            optional initial prompt from ``INITIAL_PROMPT_MAP``.
        model_name: Name of the Whisper model to load.

    Returns:
        The dictionary returned by ``model.transcribe``. Callers in this
        module read its ``"segments"`` and ``"language"`` entries.
    """
    logger.info(f"Transcribe: {audio} ")
    # Silence library warnings only for the duration of the Whisper calls.
    # ``catch_warnings`` restores the caller's filters on exit, including when
    # Whisper raises, instead of overwriting the process-wide filter list with
    # "ignore" and then "default".
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model = whisper.load_model(model_name)
        result = model.transcribe(
            audio,
            language=language,
            verbose=DEBUG,
            initial_prompt=INITIAL_PROMPT_MAP.get(language),
        )
    logger.info(f"Transcribed: {audio} ")
    return result
def generate_vtt_from_api(
    audio: str, title_language: str, other_language: str
) -> list[list[str]]:
    """Build a bilingual VTT file from an OpenAI API transcript of *audio*.

    The transcript is parsed into segments, translated in batches of
    ``TEXT_LIMIT`` lines, and every segment receives a ``title`` (top line)
    and a ``subtitle`` (bottom line).

    Args:
        audio: Path of the audio file to transcribe.
        title_language: Language shown on the top line. When it is ``"en"``
            the transcript text goes on top, otherwise the translation does.
        other_language: Target language named in the translation prompt.

    Returns:
        A one-element list ``[[vtt_path, "bilingual"]]``.
    """
    logger.info(f"Generate {title_language} vtt from OpenAI API: {audio}")
    api = Client()
    parsed = [segment.model_dump() for segment in parse_vtt(api.transcribe(audio))]
    transcript_on_top = title_language == "en"

    merged = []
    for chunk in batched(parsed, TEXT_LIMIT):
        lines = [entry["text"] for entry in chunk]
        prompt = render_translate_prompt("\n".join(lines), other_language)
        translations = assign_texts(api.translate(prompt), lines)
        for entry in chunk:
            original = entry["text"]
            # Lines the model did not return a translation for become "".
            translated = translations.get(original.strip(), "")
            if transcript_on_top:
                entry["title"], entry["subtitle"] = f"{original}", translated
            else:
                entry["title"], entry["subtitle"] = f"{translated}", original
            merged.append(entry)

    vtt_path = write_bilingual_vtt(merged, audio)
    return [[vtt_path, "bilingual"]]
{srts[0][0]}") 115 | 116 | for dist_language in subtitles_: 117 | if dist_language != source_language: 118 | result = transcribe(audio, language=dist_language) 119 | srts.append( 120 | [write_vtt(result["segments"], audio, dist_language), dist_language] 121 | ) 122 | logger.info(f"Generate {dist_language} vtt: {srts[-1][0]}") 123 | if bilingual: 124 | title_language, subtitle_language = bilingual.split(",") 125 | if source_language in (title_language, subtitle_language): 126 | if "en" in (title_language, subtitle_language): 127 | result = transcribe(audio, language="en") 128 | other_language = ( 129 | subtitle_language if title_language == "en" else title_language 130 | ) 131 | segments = [] 132 | for lst in batched(result["segments"], TEXT_LIMIT): 133 | lst = list(lst) 134 | texts = [seg["text"] for seg in lst] 135 | content = "\n".join(texts) 136 | text_map = client.translate( 137 | render_translate_prompt(content, other_language) 138 | ) 139 | need_use_api = check_fallback_to_openai(text_map, texts) 140 | if need_use_api: 141 | try: 142 | return generate_vtt_from_api( 143 | audio, title_language, other_language 144 | ) 145 | except (openai.APIStatusError, tenacity.RetryError): 146 | ... 147 | text_map = assign_texts(text_map, texts) 148 | for seg in lst: 149 | text = seg["text"] 150 | trans = text_map.get(text.strip(), "") 151 | if title_language == "en": 152 | seg["title"] = f"{text}" 153 | seg["subtitle"] = trans 154 | else: 155 | seg["title"] = f"{trans}" 156 | seg["subtitle"] = text 157 | segments.append(seg) 158 | srts.append( 159 | [write_bilingual_vtt(segments, audio), "bilingual"], 160 | ) 161 | logger.info(f"Generate {bilingual} bilingual vtt: {srts[0][0]}") 162 | else: 163 | # TODO: support other languages 164 | ... 
class Client:
    """Thin wrapper around the OpenAI SDK for translation and transcription."""

    def __init__(self):
        """Create the underlying ``OpenAI`` client from ``OPENAI_API_KEY``.

        Raises:
            ValueError: If ``OPENAI_API_KEY`` is empty.
        """
        if not OPENAI_API_KEY:
            # pydantic's ValidationError cannot be constructed from a plain
            # message, so raising it here surfaced as an unrelated TypeError.
            raise ValueError(
                "`OPENAI_API_KEY` is required. please add it to `local_settings.py`. "
            )
        self.client = OpenAI(api_key=OPENAI_API_KEY)

    def translate(self, prompt) -> dict:
        """Send *prompt* as a single user message and decode the JSON reply.

        Args:
            prompt: Full prompt text; it must ask for JSON because the request
                sets ``response_format={"type": "json_object"}``.

        Returns:
            The parsed JSON object from the first completion choice.
        """
        if DEBUG:
            logger.debug(prompt)
        chat_completion = self.client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=OPENAI_MODEL,
            response_format={"type": "json_object"},
        )

        return json.loads(chat_completion.choices[0].message.content)  # type: ignore[arg-type]

    @retry(stop=stop_after_attempt(3))
    def transcribe(
        self,
        audio: str,
        response_format: Literal["json", "text", "srt", "vtt"] = "vtt",
        language="en",
    ) -> str:
        """Transcribe *audio* with the hosted ``whisper-1`` model.

        The call is attempted up to three times by the ``retry`` decorator.

        Args:
            audio: Path of the audio file to upload.
            response_format: Output format requested from the API. The default
                is the string ``"vtt"``; it used to be the typing object
                ``Literal["vtt"]``, which is not a valid format value.
            language: Language code of the audio.

        Returns:
            The transcript as returned by the SDK for the requested format.
            NOTE(review): annotated as ``str``; confirm what the SDK returns
            for ``"json"``.
        """
        # Close the upload handle deterministically, also when the request fails.
        with open(audio, "rb") as audio_file:
            transcript = self.client.audio.transcriptions.create(
                model="whisper-1",
                language=language,
                file=audio_file,
                response_format=response_format,  # type: ignore[arg-type]
            )
        return transcript  # type: ignore
36 | except FileNotFoundError: 37 | ... 38 | logger.info(f"Downloaded: {requested_downloads['filename']}") 39 | return DownloadMedia( 40 | title=r["title"], 41 | video=requested_downloads["filename"], 42 | audio=requested_downloads["filepath"], 43 | ) 44 | -------------------------------------------------------------------------------- /core/prompts.py: -------------------------------------------------------------------------------- 1 | from string import Template 2 | 3 | translate_prompt = """Translate to $other_language language strictly line by line. return json data 4 | 5 | Input: Hello, my name is John. 6 | I'm from the United States. 7 | Output: {"Hello, my name is John.": "你好,我的名字叫约翰.", "I'm from the United States.": "我来自美国."} 8 | Input: ${content} 9 | Output: 10 | """ 11 | 12 | 13 | def render_translate_prompt(content: str, other_language: str) -> str: 14 | return Template(translate_prompt).substitute( 15 | content=content, other_language=other_language 16 | ) 17 | -------------------------------------------------------------------------------- /core/schema.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class DownloadMedia(BaseModel): 7 | title: str 8 | video: str 9 | audio: str 10 | 11 | 12 | class Segment(BaseModel): 13 | text: str 14 | timestamp: Optional[str] 15 | start: Optional[float | None] = None 16 | end: Optional[float | None] = None 17 | title: Optional[str | None] = None 18 | subtitle: Optional[str | None] = None 19 | -------------------------------------------------------------------------------- /core/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from itertools import islice 3 | from typing import Generator 4 | 5 | from config import MISMATCH_LIMIT, TEXT_LIMIT 6 | from core.schema import Segment 7 | 8 | 9 | def format_timestamp( 10 | seconds: float, 11 | ): 12 | 
assert seconds >= 0, "non-negative timestamp expected" 13 | milliseconds = round(seconds * 1000.0) 14 | 15 | hours = milliseconds // 3_600_000 16 | milliseconds -= hours * 3_600_000 17 | 18 | minutes = milliseconds // 60_000 19 | milliseconds -= minutes * 60_000 20 | 21 | seconds = milliseconds // 1_000 22 | milliseconds -= seconds * 1_000 23 | 24 | return f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}" 25 | 26 | 27 | def write_vtt( 28 | transcript: list[dict | Segment], audio: str, language: str | None = None 29 | ) -> str: 30 | if language is not None: 31 | vtt_filename = f"{audio.removesuffix('.mp3')}.{language}.vtt" 32 | else: 33 | vtt_filename = f"{audio.removesuffix('.mp3')}.vtt" 34 | return _write_vtt(transcript, vtt_filename) 35 | 36 | 37 | def _write_vtt(transcript: list[dict | Segment], vtt_filename: str) -> str: 38 | with open(vtt_filename, "w") as f: 39 | f.write("WEBVTT\n") 40 | for segment in transcript: 41 | if isinstance(segment, Segment): 42 | segment = segment.model_dump() 43 | start = format_timestamp(segment["start"]) 44 | end = format_timestamp(segment["end"]) 45 | f.write(f"\n{start} --> {end}\n{segment['text'].strip()}\n") 46 | 47 | return vtt_filename 48 | 49 | 50 | def write_bilingual_vtt(transcript: list[dict], audio: str) -> str: 51 | srt_filename = f"{audio.removesuffix('.mp3')}.bilingual.vtt" 52 | 53 | with open(srt_filename, "w") as f: 54 | f.write("WEBVTT\n") 55 | for segment in transcript: 56 | if "start" in segment and segment["start"] is not None: 57 | start = format_timestamp(segment["start"]) 58 | end = format_timestamp(segment["end"]) 59 | timestamp = f"{start} --> {end}" 60 | else: 61 | timestamp = segment["timestamp"] 62 | title = segment["title"].strip() 63 | subtitle = segment["subtitle"].strip() 64 | f.write(f"\n{timestamp}\n{title}\n{subtitle}\n") 65 | 66 | return srt_filename 67 | 68 | 69 | def batched(iterable, n) -> Generator: 70 | # batched('ABCDEFG', 3) --> ABC DEF G 71 | if n < 1: 72 | raise ValueError("n must be 
def check_fallback_to_openai(text_map: dict, texts: list[str]) -> bool:
    """Report whether a translation reply strays too far from the input lines.

    Args:
        text_map: Mapping returned by the translation call (source line to
            translation).
        texts: The source lines that were sent for translation.

    Returns:
        ``True`` when more than ``MISMATCH_LIMIT`` distinct keys of
        *text_map* (compared after stripping whitespace) are not among
        *texts*.
    """
    expected = {text.strip() for text in texts}
    # Only the keys matter for the count; the values are not inspected, so a
    # non-string translation can no longer crash this check.
    unexpected = {key.strip() for key in text_map} - expected
    return len(unexpected) > MISMATCH_LIMIT


def get_seconds(time_str: str) -> float:
    """Convert a ``MM:SS.mmm`` or ``HH:MM:SS.mmm`` timestamp to seconds.

    Raises:
        ValueError: If a field is not numeric or there are more than three
            colon-separated fields.
    """
    fields = time_str.strip().split(":")
    if len(fields) > 3:
        raise ValueError(f"Unsupported timestamp: {time_str!r}")
    total = 0.0
    for field in fields:
        # Each field to the left is worth 60 times the one after it.
        total = total * 60 + float(field)
    return total


def iter_vtt_cues(text: str) -> Generator[tuple[str, str], None, None]:
    """Yield ``(timing_line, cue_text)`` pairs from WebVTT *text*.

    A cue starts at a line containing ``-->`` and ends at the next blank line,
    the next timing line, or the end of input. Lines outside a cue (the
    ``WEBVTT`` header, cue identifiers) are skipped. Multi-line cue text is
    joined with single spaces.
    """
    timing: str | None = None
    payload: list[str] = []
    for line in text.splitlines():
        if "-->" in line:
            if timing is not None:
                yield timing, " ".join(payload)
            timing, payload = line, []
        elif not line.strip():
            if timing is not None:
                yield timing, " ".join(payload)
            timing, payload = None, []
        elif timing is not None:
            payload.append(line)
    if timing is not None:
        yield timing, " ".join(payload)


def parse_vtt(text: str) -> "list[Segment]":
    """Parse WebVTT *text* into ``Segment`` objects.

    Each segment keeps the raw timing line in ``timestamp`` and the parsed
    offsets in ``start`` and ``end``. Cue settings that follow the end time
    on the timing line are ignored.

    Raises:
        ValueError: If a timing line has no end time or a timestamp cannot
            be parsed.
    """
    segments = []
    for timestamp, cue_text in iter_vtt_cues(text):
        start, _, rest = timestamp.partition("-->")
        end_fields = rest.split()
        if not end_fields:
            raise ValueError(f"Malformed cue timing: {timestamp!r}")
        segments.append(
            Segment(
                timestamp=timestamp,
                start=get_seconds(start),
                end=get_seconds(end_fields[0]),
                text=cue_text,
            )
        )
    return segments
def generate_video(
    video: str | Path, vtts: list[list[str]], output_dir: str
) -> Generator:
    """Burn each subtitle file into a copy of *video* with ffmpeg.

    Args:
        video: Source video path.
        vtts: ``[vtt_path, language]`` pairs. The language picks the style
            from ``STYLE_MAP`` (falling back to the ``"en"`` style) and is
            part of the output file name.
        output_dir: Directory that receives the rendered videos.

    Yields:
        The path of each rendered video, right after ffmpeg finishes it.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Function-scope import keeps this block self-contained.
    import shlex

    if not isinstance(video, Path):
        video = Path(video)

    # .webm sources are written out as .mp4; other suffixes are kept.
    suffix = ".mp4" if video.suffix == ".webm" else video.suffix
    stem = video.parts[-1].removesuffix(video.suffix)

    for vtt, language in vtts:
        output = f"{output_dir}/{stem}.{language}{suffix}"
        logger.info(f"Transcoding: {output}")
        style = STYLE_MAP.get(language, STYLE_MAP["en"])
        # Pass an argv list instead of a shell string: file names containing
        # quotes, `$` or backticks used to break (or inject into) the command.
        # The configured option strings are split as a shell would split them.
        # NOTE(review): FFMPEG_FORMAT_OPTS in config.py already carries
        # `-vf format=yuv420p`, and a second `-vf` follows here; confirm which
        # filter ffmpeg applies. A single quote inside `vtt` still needs
        # ffmpeg filter-level escaping.
        cmd = [
            *shlex.split(FFMPEG_BIN),
            *shlex.split(FFMPEG_PREFIX_OPTS),
            "-i",
            video.as_posix(),
            *shlex.split(FFMPEG_FORMAT_OPTS),
            "-vf",
            f"subtitles='{vtt}':fontsdir=./src/fonts/:force_style='{style}'",
            output,
        ]
        if DEBUG:
            logger.debug(f"CMD: {shlex.join(cmd)}")
        subprocess.check_call(cmd)
        logger.info(f"Video generated: {output}")
        yield output
class Mutex(click.Option): 16 | def __init__(self, *args, **kwargs): 17 | self.not_required_if: list = kwargs.pop("not_required_if", []) 18 | 19 | assert self.not_required_if, "'not_required_if' parameter required" 20 | kwargs["help"] = ( 21 | kwargs.get("help", "") 22 | + ". Notice: this option is mutually exclusive with " 23 | + ", ".join(self.not_required_if) 24 | + "." 25 | ).strip() 26 | super(Mutex, self).__init__(*args, **kwargs) 27 | 28 | def handle_parse_result(self, ctx, opts, args): 29 | current_opt: bool = self.name in opts 30 | for mutex_opt in self.not_required_if: 31 | if mutex_opt in opts: 32 | if current_opt: 33 | raise click.UsageError( 34 | "Illegal usage: '" 35 | + str(self.name) 36 | + "' is mutually exclusive with " 37 | + str(mutex_opt) 38 | + "." 39 | ) 40 | else: 41 | self.prompt = None 42 | if current_opt and not opts[self.name]: 43 | raise click.UsageError( 44 | f"Illegal usage: The value of '{self.name}' cannot be empty." 45 | ) 46 | return super(Mutex, self).handle_parse_result(ctx, opts, args) 47 | 48 | 49 | @click.group() 50 | def cli(): 51 | """`vt` is a tool for adding subtitle to videos and generating videos in other languages.""" 52 | 53 | 54 | @cli.command() 55 | @click.option("-p", "--path", required=True, help="Local file path or video url.") 56 | @click.option("-o", "--output", default=OUTPUT_DIR, help="Generated video dir.") 57 | @click.option( 58 | "-l", 59 | "--language", 60 | default="en", 61 | type=click.Choice(LANGUAGES.keys(), case_sensitive=False), 62 | help="Subtitle language.", 63 | ) 64 | @click.option( 65 | "-m", 66 | "--model", 67 | default=WHISPER_MODEL, 68 | type=click.Choice(_MODELS.keys(), case_sensitive=False), 69 | help="Whisper model name.", 70 | ) 71 | @click.option( 72 | "-m", 73 | "--method", 74 | default="whisper", 75 | type=click.Choice(["whisper", "openai_api"], case_sensitive=False), 76 | help="Method for generating subtitle files.", 77 | ) 78 | @click.option( 79 | "-f", 80 | "--format", 81 | 
default="vtt", 82 | help="Subtitle format.", 83 | ) 84 | def subtitle(path, output, language, model, method, format): 85 | if method == "whisper" and format not in WHISPER_SUPPORT_TYPES: 86 | raise click.UsageError( 87 | f"Illegal usage: `whisper` method only support: {WHISPER_SUPPORT_TYPES}." 88 | ) 89 | elif method == "openai_api" and format not in OPENAI_API_SUPPORT_TYPES: 90 | raise click.UsageError( 91 | f"Illegal usage: `openai_api` method only support: {OPENAI_API_SUPPORT_TYPES}." 92 | ) 93 | 94 | if path.startswith("http"): 95 | media = download(path) 96 | audio = media.audio 97 | else: 98 | audio = generate_audio(path) 99 | generate_subtitle(audio, method, language, format, output, model_name=model) 100 | 101 | 102 | @cli.command() 103 | @click.option("-p", "--path", required=True, help="Local file path.") 104 | @click.option("-o", "--output", default=OUTPUT_DIR, help="Generated video dir.") 105 | @click.option( 106 | "-b", 107 | "--bilingual", 108 | default="", 109 | cls=Mutex, 110 | not_required_if=["subtitles"], 111 | help="Use bilingual subtitle. you can specify the subtitle language by `--subtitles`", 112 | ) 113 | @click.option( 114 | "-s", 115 | "--subtitles", 116 | default="", 117 | cls=Mutex, 118 | not_required_if=["bilingual"], 119 | help='Subtitle languages. 
split by ","', 120 | ) 121 | def local(path, output, bilingual, subtitles): 122 | if bilingual and "," not in bilingual: 123 | raise click.UsageError( 124 | "Illegal usage: `--bilingual` requires 2 language subtitles, you can use `cn,en` or `en,cn`" 125 | ) 126 | audio = generate_audio(path) 127 | vtts = generate_vtt(audio, bilingual, subtitles) 128 | if not vtts: 129 | logger.warning("No subtitles generated.") 130 | for video in generate_video(path, vtts, output): 131 | logger.info(f"Generated: {video}") 132 | 133 | 134 | @cli.command() 135 | @click.option("-u", "--url", required=True, help="Video url.") 136 | @click.option("-o", "--output", default=OUTPUT_DIR, help="Generated video dir.") 137 | @click.option( 138 | "-b", 139 | "--bilingual", 140 | default="", 141 | cls=Mutex, 142 | not_required_if=["subtitles"], 143 | help="Use bilingual subtitle", 144 | ) 145 | @click.option( 146 | "-s", 147 | "--subtitles", 148 | default="zh,en", 149 | cls=Mutex, 150 | not_required_if=["bilingual"], 151 | help='Subtitle languages. 
split by ","', 152 | ) 153 | def web(url, output, bilingual, subtitles): 154 | if bilingual and "," not in bilingual: 155 | raise click.UsageError( 156 | "Illegal usage: `--bilingual` requires 2 language subtitles, you can use `cn,en` or `en,cn`" 157 | ) 158 | media = download(url) 159 | vtts = generate_vtt(media.audio, bilingual, subtitles) 160 | if not vtts: 161 | logger.warning("No subtitles generated.") 162 | for video in generate_video(media.video, vtts, output): 163 | logger.info(f"Generated: {video}") 164 | 165 | 166 | if __name__ == "__main__": 167 | cli() 168 | -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | 4 | [installer] 5 | modern-installation = false -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "vt" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Bob Lin "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.11" 10 | yt-dlp = "^2023.11.16" 11 | pydantic = "^2.5.2" 12 | click = {version = "^8.1.7"} 13 | ruff = "^0.1.7" 14 | mypy = "^1.7.1" 15 | openai-whisper = {git = "https://github.com/169/whisper.git", rev = "marker"} 16 | loguru = "^0.7.2" 17 | streamlit = "^1.29.0" 18 | openai = "^1.4.0" 19 | tenacity = "^8.2.3" 20 | watchdog = "^3.0.0" 21 | streamlit-component-video = "^0.3.0" 22 | 23 | [tool.mypy] 24 | ignore_missing_imports = true 25 | check_untyped_defs = true 26 | 27 | [build-system] 28 | requires = ["poetry-core"] 29 | build-backend = "poetry.core.masonry.api" 30 | 31 | [tool.poetry.scripts] 32 | vt = "main:cli" -------------------------------------------------------------------------------- /src/fonts/PlaypenSans.ttf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/169/intelli-video/52f880e53c0c2208953c212557fecd53988b2255/src/fonts/PlaypenSans.ttf -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/169/intelli-video/52f880e53c0c2208953c212557fecd53988b2255/tests/__init__.py -------------------------------------------------------------------------------- /vt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/169/intelli-video/52f880e53c0c2208953c212557fecd53988b2255/vt/__init__.py -------------------------------------------------------------------------------- /webui.py: -------------------------------------------------------------------------------- 1 | # TODO: 2 | # 1. auto save 3 | import mimetypes 4 | import os.path 5 | 6 | import streamlit as st 7 | import whisper 8 | from streamlit_component_video import streamlit_component_video 9 | from whisper.tokenizer import LANGUAGES 10 | 11 | from config import ( 12 | DOWNLOAD_DIR, 13 | EN_FONT_NAME, 14 | EN_FONT_SIZE, 15 | FONT_COLOR, 16 | OUTLINE_COLOR, 17 | OUTPUT_DIR, 18 | WHISPER_MODEL, 19 | ZH_FONT_NAME, 20 | ZH_FONT_SIZE, 21 | ) 22 | from core.audio import generate_audio, generate_subtitle 23 | from core.downloader import download 24 | from core.utils import _write_vtt, parse_vtt 25 | from core.video import generate_video 26 | 27 | if "subtitle_path" not in st.session_state: 28 | st.session_state.update( 29 | { 30 | "segments": [], 31 | "replaced_segments": [], 32 | "current_seg_index": 0, 33 | "current_time": 0, 34 | "vtt_content": "", 35 | "subtitle_path": "", 36 | "widget_values": {}, 37 | "video": { 38 | "mimetype": "video/mp4", 39 | "path": "", 40 | "track": "", 41 | "current_time": 0, 42 | }, 43 | } 44 | ) 45 | 46 | 
47 | def make_recording_widget(f): 48 | def wrapper(label, *args, **kwargs): 49 | widget_value = f(*args, **kwargs) 50 | st.session_state.widget_values[label] = widget_value 51 | return widget_value 52 | 53 | return wrapper 54 | 55 | 56 | with st.sidebar: 57 | st.info("🎈Configure") 58 | 59 | st.write("## File") 60 | mimetype = st.selectbox( 61 | "Video MIMETYPE", 62 | [v for k, v in mimetypes.types_map.items() if "video" in v], 63 | index=2, 64 | ) 65 | 66 | download_dir: str = st.text_input("Download Dir", DOWNLOAD_DIR) or "" 67 | output_dir: str = st.text_input("Output Dir", OUTPUT_DIR) or "" 68 | 69 | st.write("## Audio") 70 | 71 | try: 72 | index = whisper.available_models().index(WHISPER_MODEL) 73 | except ValueError: 74 | index = 0 75 | 76 | model: str = ( 77 | st.selectbox( 78 | "Whisper model", 79 | whisper.available_models(), 80 | index=index, 81 | ) 82 | or "" 83 | ) 84 | 85 | st.write("## Subtitle") 86 | 87 | language: str = ( 88 | st.selectbox( 89 | "Language", 90 | LANGUAGES.keys(), 91 | index=0, 92 | ) 93 | or "" 94 | ) 95 | method: str = ( 96 | st.selectbox( 97 | "Method", 98 | ["whisper", "openai_api"], 99 | index=0, 100 | ) 101 | or "" 102 | ) 103 | zh_font_name = st.text_input("ZH Font Name", ZH_FONT_NAME) 104 | en_font_name = st.text_input("ZH Font Name", EN_FONT_NAME) 105 | en_font_size = st.text_input("ZH Font Size", ZH_FONT_SIZE) 106 | zh_font_size = st.text_input("ZH Font Size", EN_FONT_SIZE) 107 | font_color = st.text_input("Font Color", FONT_COLOR) 108 | outline_color = st.text_input("ZH Font Name", OUTLINE_COLOR) 109 | 110 | st.write("## Video") 111 | 112 | video_path = st.text_input("Video Path or URL") 113 | subtitle_path = st.text_input("VTT Path or URL", value=st.session_state.subtitle_path) 114 | 115 | if subtitle_path and not st.session_state.segments: 116 | with open(subtitle_path) as f: 117 | content = f.read() 118 | st.session_state.vtt_content = content 119 | if not st.session_state.segments: 120 | st.session_state.segments = 
parse_vtt(content) 121 | st.session_state.replaced_segments = parse_vtt(content) 122 | 123 | 124 | def get_current_vtt_content() -> str: 125 | values = st.session_state["widget_values"] 126 | if not values: 127 | return "" 128 | if not values["video_component"]: 129 | return "" 130 | current_time = values["video_component"]["current_time"] 131 | for index, seg in enumerate(st.session_state.segments): 132 | if seg.start <= current_time <= seg.end: 133 | st.session_state["current_seg_index"] = index 134 | return seg.text 135 | return "" 136 | 137 | 138 | def subtitle_callback(path: str) -> None: 139 | if not path: 140 | st.error("Video Path or URL is required.") 141 | return 142 | with st.status("Generate subtitle..."): 143 | if path.startswith("http"): 144 | st.write("Download video...") 145 | media = download(path) 146 | audio = media.audio 147 | else: 148 | st.write("Generating audio...") 149 | audio = generate_audio(path) 150 | st.write("Generating subtitle...") 151 | st.session_state.subtitle_path = generate_subtitle( 152 | audio, method, language, "vtt", output_dir, model_name=model 153 | ) 154 | 155 | 156 | def save_callback() -> None: 157 | if not subtitle_path: 158 | st.error("Subtitle is required.") 159 | else: 160 | _write_vtt(st.session_state.replaced_segments, subtitle_path) 161 | st.session_state.segments = st.session_state.replaced_segments 162 | 163 | 164 | def preview_callback() -> None: 165 | track = subtitle_path or st.session_state.subtitle_path 166 | if not track: 167 | st.error("Subtitle is required.") 168 | elif not video_path: 169 | st.error("Video Path or URL is required.") 170 | else: 171 | st.session_state["video"] = dict( 172 | path=video_path, 173 | mimetype="video/mp4", 174 | track=track, 175 | current_time="", 176 | ) 177 | 178 | 179 | def generate_callback() -> None: 180 | if not video_path: 181 | st.error("Video Path or URL is required.") 182 | elif not subtitle_path: 183 | st.error("Subtitle is required.") 184 | else: 185 | vtts = 
[[subtitle_path, language]] 186 | for video in generate_video(video_path, vtts, output_dir): 187 | st.success(f"Video Generated: {video}") 188 | 189 | 190 | col1, col2, col3, col4, col5 = st.columns(5) 191 | 192 | with col1: 193 | st.button("Preview", on_click=preview_callback) 194 | 195 | with col2: 196 | st.button( 197 | "Generate VTT", 198 | on_click=subtitle_callback, 199 | args=(video_path,), 200 | ) 201 | 202 | with col3: 203 | st.button( 204 | "Save VTT", 205 | on_click=save_callback, 206 | ) 207 | with col4: 208 | st.download_button( 209 | "Download VTT", st.session_state.vtt_content, os.path.basename(subtitle_path) 210 | ) 211 | 212 | with col5: 213 | st.button("Generate", on_click=generate_callback) 214 | 215 | make_recording_widget(streamlit_component_video)( 216 | label="video_component", 217 | path=st.session_state["video"]["path"], 218 | mimetype=st.session_state["video"]["mimetype"], 219 | track=st.session_state["video"]["track"], 220 | current_time=st.session_state["current_time"], 221 | ) 222 | 223 | current_vtt = st.text_input("Subtitle", value=get_current_vtt_content()) 224 | if st.session_state.segments and current_vtt: 225 | st.session_state.replaced_segments[ 226 | st.session_state.current_seg_index 227 | ].text = current_vtt 228 | --------------------------------------------------------------------------------