├── .dockerignore ├── tess-data ├── configs │ ├── pdf │ ├── quiet │ ├── tsv │ ├── alto │ ├── api_config │ ├── get.images │ ├── logfile │ ├── lstmbox │ ├── makebox │ ├── wordstrbox │ ├── digits │ ├── hocr │ ├── unlv │ ├── inter │ ├── rebox │ ├── linebox │ ├── kannada │ ├── lstmdebug │ ├── bazaar │ ├── bigram │ ├── txt │ ├── ambigs.train │ ├── lstm.train │ ├── box.train │ ├── box.train.stderr │ ├── Makefile.am │ ├── strokewidth │ └── testspace ├── README.md └── eng.traineddata ├── .isort.cfg ├── LICENSE ├── Dockerfile ├── .gitignore ├── cartonizer.py ├── README.md ├── cowocr.py └── milksync.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .env 3 | -------------------------------------------------------------------------------- /tess-data/configs/pdf: -------------------------------------------------------------------------------- 1 | tessedit_create_pdf 1 2 | -------------------------------------------------------------------------------- /tess-data/configs/quiet: -------------------------------------------------------------------------------- 1 | debug_file /dev/null 2 | -------------------------------------------------------------------------------- /tess-data/configs/tsv: -------------------------------------------------------------------------------- 1 | tessedit_create_tsv 1 2 | -------------------------------------------------------------------------------- /tess-data/configs/alto: -------------------------------------------------------------------------------- 1 | tessedit_create_alto 1 2 | -------------------------------------------------------------------------------- /tess-data/configs/api_config: -------------------------------------------------------------------------------- 1 | tessedit_zero_rejection T 2 | -------------------------------------------------------------------------------- /tess-data/configs/get.images: -------------------------------------------------------------------------------- 1 | tessedit_write_images T 2 | -------------------------------------------------------------------------------- /tess-data/configs/logfile: -------------------------------------------------------------------------------- 1 | debug_file tesseract.log 2 | -------------------------------------------------------------------------------- /tess-data/configs/lstmbox: -------------------------------------------------------------------------------- 1 | tessedit_create_lstmbox 1 2 | -------------------------------------------------------------------------------- /tess-data/configs/makebox: -------------------------------------------------------------------------------- 1 | tessedit_create_boxfile 1 2 | -------------------------------------------------------------------------------- /tess-data/configs/wordstrbox: -------------------------------------------------------------------------------- 1 | tessedit_create_wordstrbox 1 2 | -------------------------------------------------------------------------------- /tess-data/configs/digits: -------------------------------------------------------------------------------- 1 | tessedit_char_whitelist 0123456789-. 
2 | -------------------------------------------------------------------------------- /tess-data/configs/hocr: -------------------------------------------------------------------------------- 1 | tessedit_create_hocr 1 2 | hocr_font_info 0 3 | -------------------------------------------------------------------------------- /tess-data/configs/unlv: -------------------------------------------------------------------------------- 1 | tessedit_write_unlv 1 2 | unlv_tilde_crunching T 3 | -------------------------------------------------------------------------------- /tess-data/configs/inter: -------------------------------------------------------------------------------- 1 | interactive_display_mode T 2 | tessedit_display_outwords T 3 | -------------------------------------------------------------------------------- /tess-data/configs/rebox: -------------------------------------------------------------------------------- 1 | tessedit_resegment_from_boxes 1 2 | tessedit_make_boxes_from_boxes 1 3 | -------------------------------------------------------------------------------- /tess-data/README.md: -------------------------------------------------------------------------------- 1 | Copied directly from the Tesseract project and is under their respective license. -------------------------------------------------------------------------------- /tess-data/configs/linebox: -------------------------------------------------------------------------------- 1 | tessedit_resegment_from_line_boxes 1 2 | tessedit_make_boxes_from_boxes 1 3 | -------------------------------------------------------------------------------- /tess-data/eng.traineddata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JohnDoee/the-cute-collection/HEAD/tess-data/eng.traineddata -------------------------------------------------------------------------------- /tess-data/configs/kannada: -------------------------------------------------------------------------------- 1 | textord_skewsmooth_offset 8 2 | textord_skewsmooth_offset2 8 3 | textord_merge_desc 0.5 4 | textord_no_rejects 1 5 | -------------------------------------------------------------------------------- /tess-data/configs/lstmdebug: -------------------------------------------------------------------------------- 1 | stopper_debug_level 1 2 | classify_debug_level 1 3 | segsearch_debug_level 1 4 | language_model_debug_level 3 5 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | multi_line_output=3 3 | include_trailing_comma=True 4 | force_grid_wrap=0 5 | use_parentheses=True 6 | line_length=88 7 | -------------------------------------------------------------------------------- /tess-data/configs/bazaar: -------------------------------------------------------------------------------- 1 | load_system_dawg F 2 | load_freq_dawg F 3 | user_words_suffix user-words 4 | user_patterns_suffix user-patterns 5 | -------------------------------------------------------------------------------- /tess-data/configs/bigram: -------------------------------------------------------------------------------- 1 | load_bigram_dawg True 2 | tessedit_enable_bigram_correction True 3 | tessedit_bigram_debug 3 4 | save_raw_choices True 5 | save_alt_choices True 6 | -------------------------------------------------------------------------------- /tess-data/configs/txt: 
-------------------------------------------------------------------------------- 1 | # This config file should be used with other cofig files which creates renderers. 2 | # usage example: tesseract eurotext.tif eurotext txt hocr pdf 3 | tessedit_create_txt 1 4 | -------------------------------------------------------------------------------- /tess-data/configs/ambigs.train: -------------------------------------------------------------------------------- 1 | tessedit_ambigs_training 1 2 | load_freq_dawg 0 3 | load_punc_dawg 0 4 | load_system_dawg 0 5 | load_number_dawg 0 6 | ambigs_debug_level 3 7 | load_fixed_length_dawgs 0 8 | -------------------------------------------------------------------------------- /tess-data/configs/lstm.train: -------------------------------------------------------------------------------- 1 | file_type .bl 2 | textord_fast_pitch_test T 3 | tessedit_zero_rejection T 4 | tessedit_minimal_rejection F 5 | tessedit_write_rep_codes F 6 | edges_children_fix F 7 | edges_childarea 0.65 8 | edges_boxarea 0.9 9 | tessedit_train_line_recognizer T 10 | textord_no_rejects T 11 | tessedit_init_config_only T 12 | -------------------------------------------------------------------------------- /tess-data/configs/box.train: -------------------------------------------------------------------------------- 1 | disable_character_fragments T 2 | file_type .bl 3 | textord_fast_pitch_test T 4 | tessedit_zero_rejection T 5 | tessedit_minimal_rejection F 6 | tessedit_write_rep_codes F 7 | edges_children_fix F 8 | edges_childarea 0.65 9 | edges_boxarea 0.9 10 | tessedit_resegment_from_boxes T 11 | tessedit_train_from_boxes T 12 | textord_no_rejects T 13 | -------------------------------------------------------------------------------- /tess-data/configs/box.train.stderr: -------------------------------------------------------------------------------- 1 | file_type .bl 2 | #tessedit_use_nn F 3 | textord_fast_pitch_test T 4 | tessedit_zero_rejection T 5 | tessedit_minimal_rejection F 6 | tessedit_write_rep_codes F 7 | edges_children_fix F 8 | edges_childarea 0.65 9 | edges_boxarea 0.9 10 | tessedit_resegment_from_boxes T 11 | tessedit_train_from_boxes T 12 | #textord_repeat_extraction F 13 | textord_no_rejects T 14 | -------------------------------------------------------------------------------- /tess-data/configs/Makefile.am: -------------------------------------------------------------------------------- 1 | datadir = @datadir@/tessdata/configs 2 | data_DATA = inter makebox box.train unlv ambigs.train lstm.train lstmdebug 3 | data_DATA += api_config kannada box.train.stderr quiet logfile digits get.images 4 | data_DATA += lstmbox wordstrbox 5 | # Configurations for OCR output. 
6 | data_DATA += alto hocr pdf tsv txt 7 | data_DATA += linebox rebox strokewidth bigram 8 | EXTRA_DIST = $(data_DATA) 9 | -------------------------------------------------------------------------------- /tess-data/configs/strokewidth: -------------------------------------------------------------------------------- 1 | textord_show_blobs 0 2 | textord_debug_tabfind 3 3 | textord_tabfind_show_partitions 1 4 | textord_tabfind_show_initial_partitions 1 5 | textord_tabfind_show_columns 1 6 | textord_tabfind_show_blocks 1 7 | textord_tabfind_show_initialtabs 1 8 | textord_tabfind_show_finaltabs 1 9 | textord_tabfind_show_strokewidths 1 10 | textord_tabfind_show_vlines 0 11 | textord_tabfind_show_images 1 12 | tessedit_dump_pageseg_images 0 13 | -------------------------------------------------------------------------------- /tess-data/configs/testspace: -------------------------------------------------------------------------------- 1 | chop_ok_split 1000 2 | tosp_rep_space 16 3 | textord_words_default_minspace 0.3 4 | textord_space_size_is_variable 1 5 | gapmap_use_ends 1 6 | textord_linespace_iqrlimit 0.8 7 | textord_overlap_x 1.375 8 | textord_words_width_ile 0.8 9 | textord_words_maxspace 16 10 | textord_words_default_maxspace 1.5 11 | textord_words_default_minspace 0.1 12 | textord_words_min_minspace 0.7 13 | textord_words_default_nonspace 0.1 14 | words_default_prop_nonspace 0.65 15 | words_default_fixed_space 0.95 16 | textord_spacesize_ratiofp 4.8 17 | textord_spacesize_ratioprop 1 18 | debug_fix_space_level 1 19 | tosp_enough_space_samples_for_median 8 20 | tosp_short_row 30 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The cute collection can combine subtitles from various files and sync them. 2 | Copyright (C) 2021 Anders Jensen 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU Affero General Public License as 6 | published by the Free Software Foundation, either version 3 of the 7 | License, or (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU Affero General Public License for more details. 13 | 14 | You should have received a copy of the GNU Affero General Public License 15 | along with this program. If not, see . 
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-buster 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | 5 | RUN apt-get update \ 6 | && apt-get install -y tesseract-ocr ffmpeg \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | RUN pip install -U setuptools pip wheel 10 | RUN pip install -U ffmpeg-python click guessit opencv-python librosa \ 11 | pysubs2 scikit-image jinja2 lxml tqdm pyxdameraulevenshtein \ 12 | textblob jinja2 pytesseract lxml annoy 13 | 14 | RUN mkdir /code 15 | ADD tess-data /code/tess-data 16 | COPY cartonizer.py cowocr.py milksync.py /code/ 17 | 18 | RUN echo '#!/bin/bash\npython3 /code/cartonizer.py "$@"' > /usr/bin/cartonizer && \ 19 | echo '#!/bin/bash\npython3 /code/cowocr.py "$@"' > /usr/bin/cowocr && \ 20 | echo '#!/bin/bash\npython3 /code/milksync.py "$@"' > /usr/bin/milksync && \ 21 | chmod +x /usr/bin/cartonizer /usr/bin/cowocr /usr/bin/milksync 22 | 23 | RUN mkdir /workdir 24 | WORKDIR /workdir -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | .env 141 | 142 | milk-temp 143 | cow-temp 144 | -------------------------------------------------------------------------------- /cartonizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import shlex 5 | import sys 6 | from pathlib import Path 7 | from pprint import pprint 8 | 9 | import click 10 | import ffmpeg 11 | import guessit 12 | 13 | KNOWN_SUBTITLE_EXTENSIONS = [".ass"] 14 | KNOWN_EXTENSIONS = [".mp4", ".mkv", ".ogm", ".avi"] 15 | VIDEO_MAPPING = { 16 | ("hevc", "Main 10"): "HEVC 10-bit", 17 | ("hevc", "Rext"): "HEVC 12-bit", 18 | ("h264", "High"): "h264", 19 | ("h264", "Main"): "h264", 20 | ("h264", "High 10"): "h264 10-bit", 21 | } 22 | VIDEO_RESOLUTION_MAPPING = { 23 | 1080: "1080p", 24 | 1088: "1080p", 25 | } 26 | AUDIO_MAPPING = {"flac": "FLAC", "aac": "AAC", "dts": "DTS-HDMA", "ac3": "AC3"} 27 | SOURCE_MAPPING = {"Blu-ray": "BD", "DVD": "DVD"} 28 | 29 | OP_CHAPTER_NAMES = ["OP", "Episode"] 30 | ED_CHAPTER_NAMES = ["ED", "Preview"] 31 | 32 | CHROMA_GENERATE_PARAM = "--only-generate-chroma" 33 | 34 | 35 | def map_episode_files(paths): 36 | episode_mapping = {} 37 | for path in paths: 38 | for f in path.iterdir(): 39 | if not f.is_file(): 40 | continue 41 | if not f.suffix.lower() in KNOWN_EXTENSIONS: 42 | continue 43 | info = guessit.guessit(f.name) 44 | episode = info.get("episode") 45 | if episode is None: 46 | episode = info.get("episode_title") 47 | if episode is not None: 48 | episode = int(episode.split(" ")[0].split("v")[0]) 49 | 50 | if episode is None: 51 | re_episode = re.findall("第(\d+)話", f.name) 52 | if not re_episode: 53 | continue 54 | episode = int(re_episode[0]) 55 | if isinstance(episode, list): 56 | episode = episode[-1] 57 | episode_mapping.setdefault(episode, []).append(f) 58 | return episode_mapping 59 | 60 | 61 | @click.group() 62 | def cli(): 63 | pass 64 | 65 | 66 | @cli.command() 67 | @click.argument("path", type=click.Path(exists=True), nargs=-1, required=True) 68 | @click.option("--input-subtitle-path", type=click.Path(exists=True)) 69 | @click.option("--op-ed-path", type=click.Path(exists=True), multiple=True) 70 | @click.option("--group", type=str) 71 | @click.option("--source", type=str) 72 | @click.option("--audio", type=str) 73 | @click.option("--title", type=str) 74 | @click.option("--dual-audio", is_flag=True) 75 | @click.option("--skip-chapters", is_flag=True) 76 | @click.option("--pre-generate-chroma", is_flag=True) 77 | @click.option("--skip-copy-oped", is_flag=True) 78 | @click.option("--additional-params", type=str) 79 | @click.option("--folder-name", type=str) 80 | @click.option("--file-name-template", type=str) 81 | @click.option("--output-subtitles-path", type=click.Path()) 82 | def sync( 83 | path, 84 | input_subtitle_path, 85 | op_ed_path, 86 | 
group, 87 | source, 88 | audio, 89 | title, 90 | dual_audio, 91 | skip_chapters, 92 | pre_generate_chroma, 93 | skip_copy_oped, 94 | additional_params, 95 | folder_name, 96 | file_name_template, 97 | output_subtitles_path, 98 | ): 99 | command_path = ( 100 | f"{sys.executable} {(Path(__file__).parent / 'milksync.py').absolute()}" 101 | ) 102 | 103 | if output_subtitles_path: 104 | output_subtitles_path = Path(output_subtitles_path) 105 | output_subtitles_path.mkdir(parents=True, exist_ok=True) 106 | 107 | external_subtitles = {} 108 | if input_subtitle_path is not None: 109 | for s in Path(input_subtitle_path).iterdir(): 110 | if s.suffix.lower() not in KNOWN_SUBTITLE_EXTENSIONS: 111 | continue 112 | external_subtitles[s.stem] = s 113 | 114 | paths = [Path(p) for p in path] 115 | if op_ed_path: 116 | op_ed_paths = [Path(p) for p in op_ed_path] 117 | else: 118 | op_ed_paths = [] 119 | 120 | episode_mapping = map_episode_files(paths) 121 | first_episode = sorted(episode_mapping.items())[0][1] 122 | probe_result = ffmpeg.probe(first_episode[-1]) 123 | release_name = { 124 | "show_name": title or guessit.guessit(first_episode[0].name)["title"], 125 | } 126 | if source: 127 | release_name["source"] = source 128 | else: 129 | release_name["source"] = SOURCE_MAPPING[ 130 | guessit.guessit(first_episode[-1].name)["source"] 131 | ] 132 | if audio: 133 | release_name["audio"] = audio 134 | for stream in probe_result["streams"]: 135 | if stream["codec_type"] == "video" and "video" not in release_name: 136 | key = (stream["codec_name"], stream["profile"]) 137 | if key not in VIDEO_MAPPING: 138 | click.echo(f"Unknown video key {key=}") 139 | quit(1) 140 | release_name["video"] = VIDEO_MAPPING[key] 141 | release_name["video_resolution"] = VIDEO_RESOLUTION_MAPPING.get( 142 | stream["coded_height"], 143 | f"{stream['coded_width']}x{stream['coded_height']}", 144 | ) 145 | elif stream["codec_type"] == "audio" and "audio" not in release_name: 146 | key = stream["codec_name"] 147 | if key not in AUDIO_MAPPING: 148 | click.echo(f"Unknown audio key {key=}") 149 | quit(1) 150 | release_name["audio"] = AUDIO_MAPPING[key] 151 | 152 | if not folder_name: 153 | folder_name = f"{group and '[' + group + '] ' or ''}{release_name['show_name']} ({release_name['source']} {release_name['video_resolution']} {release_name['video']} {release_name['audio']}{dual_audio and ' Dual-Audio' or ''})" 154 | if not file_name_template: 155 | file_name_template = f"{group and '[' + group + '] ' or ''}{release_name['show_name']} - %s ({release_name['source']} {release_name['video_resolution']} {release_name['video']} {release_name['audio']}{dual_audio and ' Dual-Audio' or ''})" 156 | click.echo(f"Folder name: {folder_name}") 157 | click.echo(f"File name template: {file_name_template}") 158 | 159 | copy_files = [] 160 | 161 | endings = [] 162 | openings = [] 163 | for op_ed_path in op_ed_paths: 164 | for f in op_ed_path.iterdir(): 165 | if "NCOP" in f.name: 166 | click.echo(f"Found OP {f.name}") 167 | openings.append(f) 168 | elif "NCED" in f.name: 169 | click.echo(f"Found ED {f.name}") 170 | endings.append(f) 171 | 172 | op_ed_chapter_command = [] 173 | if openings or endings: 174 | for i, opening in enumerate(sorted(openings, key=lambda f: f.name), 1): 175 | if not skip_chapters: 176 | op_ed_chapter_command.append( 177 | f"--chapter-segment-file '{str(opening)}' --chapter-segment-name-start '{OP_CHAPTER_NAMES[0]}' --chapter-segment-name-end '{OP_CHAPTER_NAMES[1]}'" 178 | ) 179 | name = "NCOP" 180 | if len(openings) > 1: 181 | name += 
str(i) 182 | if not skip_copy_oped: 183 | copy_files.append( 184 | (str(opening), f"{folder_name}/{file_name_template % name}.mkv") 185 | ) 186 | 187 | for i, ending in enumerate(sorted(endings, key=lambda f: f.name), 1): 188 | if not skip_chapters: 189 | op_ed_chapter_command.append( 190 | f"--chapter-segment-file '{str(ending)}' --chapter-segment-name-start '{ED_CHAPTER_NAMES[0]}' --chapter-segment-name-end '{ED_CHAPTER_NAMES[1]}'" 191 | ) 192 | name = "NCED" 193 | if len(endings) > 1: 194 | name += str(i) 195 | if not skip_copy_oped: 196 | copy_files.append( 197 | (str(ending), f"{folder_name}/{file_name_template % name}.mkv") 198 | ) 199 | 200 | op_ed_chapter_command = "".join([f" {cmd} \\\n" for cmd in op_ed_chapter_command]) 201 | episode_num_length = max(max(len(str(k)) for k in episode_mapping.keys()), 2) 202 | 203 | output_file = [] 204 | 205 | chroma_files = [] 206 | for files in episode_mapping.values(): 207 | for f in files: 208 | chroma_files.append(f"'{str(f)}'") 209 | if pre_generate_chroma: 210 | output_file.append("echo 'Generating chroma'") 211 | output_file.append( 212 | f"{command_path} {CHROMA_GENERATE_PARAM} {' '.join(chroma_files)}" 213 | ) 214 | output_file.append(f"mkdir -p '{folder_name}'") 215 | if additional_params: 216 | additional_params = f" {additional_params} \\\n" 217 | else: 218 | additional_params = "" 219 | 220 | for episode, files in sorted(episode_mapping.items()): 221 | if len(files) < 2: 222 | click.echo(f"Skipping episode {episode}") 223 | continue 224 | external_subtitle = "" 225 | if files[0].stem in external_subtitles: 226 | external_subtitle = f" --input-external-subtitle-track {shlex.quote(str(external_subtitles[files[0].stem]))} \\\n" 227 | output_file.append("echo ''") 228 | output_file.append(f"echo 'Handling episode {episode}'") 229 | if output_subtitles_path: 230 | output_subtitles = f" --output-subtitle {shlex.quote(str(output_subtitles_path / files[-1].with_suffix('.subtitle').name))} \\\n" 231 | else: 232 | output_subtitles = "" 233 | files = "".join([f" {shlex.quote(str(f))} \\\n" for f in files]) 234 | output_file.append( 235 | f"{command_path} \\\n{files}{op_ed_chapter_command}{external_subtitle}{additional_params}{output_subtitles} --output '{folder_name}/{file_name_template % str(episode).zfill(episode_num_length)}.mkv'" 236 | ) 237 | output_file.append("echo ''") 238 | output_file.append("echo 'Copying files'") 239 | for (src_f, dst_f) in copy_files: 240 | if not src_f.lower().endswith(".mkv"): 241 | click.echo("Copy file is not an mkv") 242 | quit(1) 243 | output_file.append(f"cp {shlex.quote(src_f)} {shlex.quote(dst_f)}") 244 | 245 | Path("create_release.sh").write_text("\n".join(output_file)) 246 | click.echo("Release file created, run: bash create_release.sh") 247 | 248 | 249 | @cli.command() 250 | @click.argument("subbed_path", type=click.Path(exists=True), required=True) 251 | @click.argument("unsubbed_path", type=click.Path(exists=True), required=True) 252 | @click.option("--additional-params", type=str) 253 | def ocr( 254 | subbed_path, 255 | unsubbed_path, 256 | additional_params, 257 | ): 258 | paths = [Path(subbed_path), Path(unsubbed_path)] 259 | command_path = ( 260 | f"{sys.executable} {(Path(__file__).parent / 'cowocr.py').absolute()}" 261 | ) 262 | 263 | output_file = [] 264 | 265 | episode_mapping = map_episode_files(paths) 266 | 267 | for episode, files in sorted(episode_mapping.items()): 268 | if len(files) < 2: 269 | click.echo(f"Skipping episode {episode}") 270 | continue 271 | output_file.append("echo ''") 
272 | output_file.append(f"echo 'Handling episode {episode}'") 273 | files = "".join([f" {shlex.quote(str(f))} \\\n" for f in files]) 274 | output_file.append( 275 | f"{command_path} \\\n{files} extract-subtitles \\\n {additional_params or ''}" 276 | ) 277 | output_file.append(f"{command_path} \\\n{files} create-report") 278 | 279 | output_file.append("") 280 | Path("ocr_release.sh").write_text("\n".join(output_file)) 281 | print("OCR script file created, run: bash ocr_release.sh") 282 | 283 | 284 | if __name__ == "__main__": 285 | cli() 286 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The Cute Collection 2 | 3 | This is a bunch of tools to sync subtitles and audio tracks automatically. 4 | 5 | ## Requirements 6 | 7 | * Linux 8 | * 10GB of available memory 9 | * ffmpeg and ffprobe in environment path 10 | * Python 3.8+ (might work with earlier versions, not tested though) 11 | * Tesseract in environment path if you want to OCR 12 | 13 | ## Installation Instructions 14 | 15 | ```bash 16 | cd ~ 17 | git clone https://github.com/JohnDoee/the-cute-collection.git the-cute-collection 18 | cd the-cute-collection 19 | 20 | python3 -m venv .env 21 | 22 | .env/bin/pip install -U setuptools pip wheel 23 | .env/bin/pip install ffmpeg-python click guessit opencv-python librosa pysubs2 scikit-image jinja2 lxml tqdm pyxdameraulevenshtein textblob jinja2 pytesseract lxml annoy 24 | ``` 25 | 26 | ## Docker Installation and Usage Instructions 27 | 28 | A Docker image is available too; the commands look like this: 29 | 30 | cartonizer: ```docker run -ti --rm -v `pwd`:/workdir/ johndoee/the-cute-collection cartonizer``` 31 | cowocr: ```docker run -ti --rm -v `pwd`:/workdir/ johndoee/the-cute-collection cowocr``` 32 | milksync: ```docker run -ti --rm -v `pwd`:/workdir/ johndoee/the-cute-collection milksync``` 33 | 34 | To execute a command file generated by cartonizer, use this syntax: 35 | 36 | ```docker run -ti --rm -v `pwd`:/workdir/ johndoee/the-cute-collection bash ocr_release.sh``` 37 | 38 | ## Cartonizer 39 | 40 | Generates a script for milksync so you can do bulk operations instead of running files one by one. 41 | 42 | ### How to use for automatic subtitle sync (milksync) 43 | 44 | The most basic usage is: 45 | 46 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cartonizer.py sync path-to-subbed path-to-unsubbed` 47 | 48 | Make sure the files are correctly matched; sometimes it picks up files it should not. 49 | 50 | This will generate a bash script called `create_release.sh`; just run it with `bash create_release.sh` and wait. 51 | 52 | Below is a description of all arguments and how and when to use them. 53 | 54 | #### --op-ed-path 55 | 56 | Look for OP and ED in the given path and use them to auto-generate chapters and copy them to the result path. 57 | 58 | Example: `--op-ed-path Unsubbed-Files/NC-OP-ED-Folder/` - looks for OP/ED in the specified folder. 59 | 60 | #### --group 61 | 62 | Put a group name in the result folder name. 63 | 64 | Example: `--group Horse` - Sets the group name to Horse and prefixes the folder name and files with it. 65 | 66 | #### --source 67 | 68 | Specify the source of the video track, e.g. BD for Blu-ray. Will be auto-detected if not specified. 69 | 70 | Example: `--source VHS` - sets the source to the text string VHS. 71 | 72 | #### --audio 73 | 74 | Same as source but for the audio track.
75 | 76 | Example: `--audio Opus` - sets the audio source to the text string Opus. 77 | 78 | #### --title 79 | 80 | Sets the title of the release. 81 | 82 | Example: `--title Big Buck Bunny` - sets the title to the text string Big Buck Bunny. 83 | 84 | #### --dual-audio 85 | 86 | Marks the release as Dual-Audio. 87 | 88 | Example: `--dual-audio` - Adds the text Dual-Audio to the release. 89 | 90 | #### --skip-chapters 91 | 92 | Skips adding chapters based on the OP-ED specified with `--op-ed-path`. 93 | This is useful if you want to copy the NC-OP-ED files but copy the chapters from a release. 94 | 95 | Example: `--skip-chapters` - Instructs milksync to not assign chapters from OP & ED. 96 | 97 | #### --pre-generate-chroma 98 | 99 | Pre-generate chromas; this can sometimes speed up the overall process but is not recommended. 100 | 101 | Example: `--pre-generate-chroma` - Adds a line to the script that pre-generates chromas. 102 | 103 | #### --skip-copy-oped 104 | 105 | Skips copying the OP-ED specified with `--op-ed-path`. This is useful if you created the files yourself just to assign the chapters. 106 | 107 | Example: `--skip-copy-oped` - Cartonizer does not add the line to copy the files to the release folder. 108 | 109 | #### --additional-params 110 | 111 | Pass additional arguments to `milksync.py`. 112 | 113 | Example: `--additional-params '--chapter-beginning Intro'` - Tells milksync to add a chapter to the beginning of the file. 114 | 115 | See milksync arguments for more. 116 | 117 | #### --folder-name 118 | 119 | Instead of auto-generating a folder name, use this name. 120 | 121 | Example: `--folder-name 'Happy Bunnies Riding The Wave (DVD)'` 122 | 123 | #### --file-name-template 124 | 125 | Instead of auto-generating a file name template, use this template. 126 | Must have a %s where the episode number is placed. 127 | 128 | Example: `--file-name-template 'Happy Bunnies Riding The Wave (DVD) %s'` 129 | 130 | ### How to use for ocr (cowocr) 131 | 132 | The most basic usage is: 133 | 134 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cartonizer.py ocr path-to-subbed path-to-unsubbed` 135 | 136 | Make sure the files are correctly matched; sometimes it picks up files it should not. 137 | 138 | This will generate a bash script called `ocr_release.sh`; just run it with `bash ocr_release.sh` and wait. 139 | 140 | Below is a description of all arguments and how and when to use them. 141 | 142 | #### --additional-params 143 | 144 | Pass additional arguments to `cowocr.py`. 145 | 146 | Example: `--additional-params '--threads 1 --run-subregions-in-parallel'` - Tells cowocr to use one thread per subtitle region and run every subregion in parallel. 147 | 148 | See cowocr arguments for more. 149 | 150 | ### FAQ 151 | 152 | #### There is no OP/ED to assign chapters from (or it fails to use existing OP/ED), what do I do?
153 | 154 | The easiest way right now is to extract them manually. For example, if you have a file named `Big Buck Bunny 01.mkv` and the chapters in the file are: 155 | 156 | * Opening: starts at 00:01:27.062 and stops at 00:02:57.123 157 | * Ending: starts at 00:22:11.362 and stops at 00:23:33.333 158 | 159 | Extract them with: 160 | ``` 161 | mkdir extracted 162 | ffmpeg -i 'Big Buck Bunny 01.mkv' -ss 00:01:27.062 -to 00:02:57.123 -map 0:a:0 extracted/NCOP-01.mkv 163 | ffmpeg -i 'Big Buck Bunny 01.mkv' -ss 00:22:11.362 -to 00:23:33.333 -map 0:a:0 extracted/NCED-01.mkv 164 | ``` 165 | 166 | These can then be used with `--op-ed-path extracted/ --skip-copy-oped` 167 | 168 | ## Milksync 169 | 170 | Compare audio tracks between two files and take subtitles and audio tracks from one and add them to another. The intention is to remove the tedious work of manually aligning subtitles to a new file and give a far more exact 171 | result. 172 | 173 | ### How to use 174 | 175 | The most basic usage is: 176 | 177 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/milksync.py path-to-subbed/episode-01.mkv path-to-unsubbed/episode-01.mkv --output merged-episode-01.mkv` 178 | 179 | This will take video and audio from the last file and put subtitles from the first file into the merged file. 180 | 181 | The command prints out information about what is going on, e.g. where chapters are placed and how much the subtitles are moved. 182 | Make sure to check the result, especially around the breakpoints. WARNINGs can also be a hint about what might be wrong with the resulting file. 183 | 184 | Remember, you can always modify a command and just run it again to see what happens. The second run is normally faster than the first, too. 185 | Sometimes experimenting can help you on your way. 186 | 187 | Below is a description of all arguments and how and when to use them. 188 | 189 | #### --only-generate-chroma 190 | 191 | Only extract audio from the file and generate the chroma index; this can sometimes be used to speed up the overall process, but it is not recommended. (A conceptual sketch of chroma-based alignment is included after the `--subtitle-cutoff` section below.) 192 | 193 | Example: `--only-generate-chroma` - Quits after extracting chroma. 194 | 195 | #### --sync-using-subtitle-audio 196 | 197 | Use the audio where the subtitles run to sync a specific line. Good when the video is partial or re-arranged. Bad for audio syncs. 198 | 199 | Example: `--sync-using-subtitle-audio` - Enable the sync feature. 200 | 201 | #### --skip-subtitles 202 | 203 | Do not copy any subtitles; can be used for e.g. dub-only releases or subtitles from another source that are not to be synced this way. 204 | 205 | Example: `--skip-subtitles` - No subtitles are copied. 206 | 207 | #### --skip-shift-point 208 | 209 | The script prints out the points it uses to shift the subtitles; sometimes one or more of them might be bad, or you want to see what happens with them removed. They are index based and you have to count the index yourself from the milksync output. 210 | 211 | Generally not used. 212 | 213 | Example: `--skip-shift-point 2,3` - Skips shift points 2 and 3. 214 | 215 | #### --subtitle-cutoff 216 | 217 | If the subtitles start too early or run too long, this command can cut off subtitles to prevent this. The command takes a number in seconds that can be both positive (count from the beginning of the video result file) and negative (count from the end of the video result file). 218 | 219 | Example: `--subtitle-cutoff -50` - The last 50 seconds of the result will not have any subtitles. 220 | Example: `--subtitle-cutoff 30` - The first 30 seconds of the result will not have any subtitles.
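Here is a rough, hypothetical sketch of what chroma-based audio alignment looks like, to make the `--only-generate-chroma` index and the shift points a bit more concrete. This is not milksync's actual implementation; the WAV file names are placeholders for audio tracks already extracted with ffmpeg, and the real tool computes several shift points across the file rather than one global offset.

```python
# Illustrative sketch only -- not milksync's real algorithm.
# Assumes librosa and numpy (both in the project's requirements) and two
# audio tracks already extracted to WAV with ffmpeg (hypothetical file names).
import librosa
import numpy as np


def estimate_offset_seconds(reference_wav, target_wav, hop_length=512):
    """Estimate how far the target audio is shifted relative to the reference."""
    ref, sr = librosa.load(reference_wav, sr=22050, mono=True)
    tgt, _ = librosa.load(target_wav, sr=22050, mono=True)

    # Chroma features summarise the harmonic content of each frame and are
    # fairly robust to encoding differences between two releases of the same audio.
    ref_chroma = librosa.feature.chroma_stft(y=ref, sr=sr, hop_length=hop_length)
    tgt_chroma = librosa.feature.chroma_stft(y=tgt, sr=sr, hop_length=hop_length)

    # Cross-correlate the per-frame chroma energy to find the best lag.
    ref_sig = ref_chroma.mean(axis=0) - ref_chroma.mean()
    tgt_sig = tgt_chroma.mean(axis=0) - tgt_chroma.mean()
    corr = np.correlate(ref_sig, tgt_sig, mode="full")
    lag_frames = int(corr.argmax()) - (len(tgt_sig) - 1)

    return lag_frames * hop_length / sr


print(estimate_offset_seconds("subbed-audio.wav", "unsubbed-audio.wav"))
```

Milksync finds several such alignment points across the file and handles the bookkeeping for you, so treat this only as an illustration of the idea behind the printed shift points.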
221 | 222 | #### --only-delta 223 | 224 | Instead of putting subtitles into buckets and adjusting them to fit, just modify the timestamps on the subtitles. 225 | This one is very useful if one input runs faster or slower than the other. This can often be seen in the milksync output as a lot of sync points that either decrease or increase in delta. 226 | 227 | Example: `--only-delta` - Enable delta mode instead of subtitle bucket mode. 228 | 229 | #### --align-framerate 230 | 231 | Align the source framerate to the target video framerate, for when speedup/slowdown was used as the technique to change framerate. 232 | 233 | Example: `--align-framerate` - Enable the feature and change the source framerate to the target framerate. 234 | 235 | #### --align-frames-too 236 | 237 | When using `--only-delta` it can be helpful to look at frames too to find a better difference. 238 | 239 | Example: `--only-delta --align-frames-too` - Enables frame alignment. 240 | 241 | #### --preserve-silence 242 | 243 | When extracting chroma from the files, the audio at the end is trimmed to prevent silence from blocking alignment; this disables that feature. 244 | 245 | Example: `--preserve-silence` - Preserves silence. 246 | 247 | #### --temp-folder 248 | 249 | Where to save temporary files; this includes extracted audio and subtitle tracks as well as chroma generated from the audio files. 250 | 251 | Example: `--temp-folder '/tmp/milk-temp/'` - Saves temp files to the specified folder. 252 | 253 | #### --audio-tracks 254 | 255 | Define which audio tracks to use for syncing audio tracks. Milksync only works when the audio tracks in the input files are the same, i.e. the same language. If you take e.g. English and Japanese audio tracks and try to use them, the results will vary quite a bit and likely not be very good. 256 | 257 | The input file index is defined by the order the files are given to milksync. 258 | 259 | Example: `--audio-tracks 0:1,1:0` - Use audio track 1 from input file 0 and audio track 0 from input file 1. 260 | 261 | #### --adjust-shift-point 262 | 263 | Manually change a shift point. Can be used if the auto detection is not good enough, or to modify a point so it works correctly. This is mostly used for debugging. 264 | 265 | Example: `--adjust-shift-point 0.3:10.3:1.3:11.3` - Set the first shift point to the specified values. The order is the same as printed by milksync. 266 | 267 | #### --adjust-delay 268 | 269 | Manually adjust the delay applied to all points. Can be used for debugging. 270 | 271 | Example: `--adjust-delay 0.3` - Adds 0.3 seconds to every subtitle. 272 | 273 | #### --sync-non-dialogue-to-video 274 | 275 | Sometimes the audio has been resynced to the video, which means the speech subtitles and the sign subtitles must be synced independently. 276 | This flag tries to align signs to the video and speech to the audio, which can be useful when the target is e.g. remastered. It can be very slow and quality can vary; the result is printed and you should check if the signs are positioned correctly. 277 | 278 | Example: `--sync-non-dialogue-to-video 0-1000` - Enables this feature for the given range of seconds. 279 | 280 | #### --chapter-source 281 | 282 | Specify which source file index to pull chapters from; these are synced in the same way as the audio tracks. 283 | 284 | If nothing chapter-related is specified, they are pulled from the video source, i.e. the last file. 285 | 286 | Example: `--chapter-source 0` - Take chapters from input file 0. 287 | 288 | #### --chapter-beginning 289 | 290 | Add a chapter to the beginning of the result.
This means every part of the result will be part of a chapter. 291 | 292 | Example: `--chapter-beginning Beginning` - The first chapter at 00:00 is named Beginning. 293 | 294 | #### --chapter-segment-file 295 | 296 | Source file to generate a chapter from; this is a part of the video that is searched for in the target file. Useful for e.g. openings or endings. 297 | 298 | This is used in conjunction with `--chapter-segment-name-start` and `--chapter-segment-name-end`. Order matters and each `--chapter-segment-file` must have a `--chapter-segment-name-start` and `--chapter-segment-name-end`. 299 | 300 | Example: `--chapter-segment-file NCED-01.mkv` - Match the content of NCED-01.mkv to the result video and add chapters if found. 301 | 302 | #### --chapter-segment-name-start 303 | 304 | Name of the chapter starting where the beginning of `--chapter-segment-file` is matched. 305 | 306 | Example: `--chapter-segment-name-start End` - Names the chapter that matches the beginning of `--chapter-segment-file` End. 307 | 308 | #### --chapter-segment-name-end 309 | 310 | Name of the chapter starting where the end of `--chapter-segment-file` is matched. 311 | 312 | Example: `--chapter-segment-name-end 'After End'` - Names the chapter that matches the end of `--chapter-segment-file` After End. 313 | 314 | #### --chapter-segment-required 315 | 316 | Enforces that every chapter segment must be matched. 317 | 318 | Example: `--chapter-segment-required` - If a chapter segment is not matched, it will quit with an error. 319 | 320 | #### --metadata-audio-track 321 | 322 | Manually set metadata for an audio track; this is passed directly to ffmpeg. These match the output mapping and not the input mapping. 323 | 324 | Example: `--metadata-audio-track 0=language=jpn --metadata-audio-track 0=title='Japanese' --metadata-audio-track 1=language=fra --metadata-audio-track 1=title='Bad french'` - Sets the first output audio track metadata to Japanese with a matching title and the second audio track to French with a matching title. 325 | 326 | #### --metadata-subtitle-track 327 | 328 | Manually set metadata for a subtitle track; this is passed directly to ffmpeg. These match the output mapping and not the input mapping. 329 | 330 | Example: `--metadata-subtitle-track 0=language=jpn --metadata-subtitle-track 0=title='Japanese' --metadata-subtitle-track 1=language=fra --metadata-subtitle-track 1=title='Bad french'` - Sets the first output subtitle track metadata to Japanese with a matching title and the second subtitle track to French with a matching title. 331 | 332 | #### --subtitle-min-font-size 333 | 334 | Increase the font size to at least this value. Sometimes subtitles are unreadably small in the source. 335 | 336 | Example: `--subtitle-min-font-size 26` - Sets the font size to a minimum of 26. 337 | 338 | #### --input-external-subtitle-track 339 | 340 | Use a specific external subtitle in the output; it is assumed to match video input 0. 341 | 342 | Example: `--input-external-subtitle-track subtitles.ass` - Assumes the subtitle matches input 0 and syncs it to the output. 343 | 344 | #### --output-video-file-index 345 | 346 | Which file to pull video data from; this defaults to the last specified file and is normally not needed. 347 | 348 | Example: `--output-video-file-index 1` - Pull video data from the second input file. 349 | 350 | #### --output-audio-mapping 351 | 352 | Define which audio tracks the output has and where to pull them from. Defaults to using only the first audio track from the last input file, the same source as the video.
353 | 354 | Example: `--output-audio-mapping 0:0,1:2` - Takes the first audio track from the first input file and the third audio track from the second input file. The result file's first audio track is 0:0 and the second is 1:2. 355 | 356 | #### --output-subtitle-mapping 357 | 358 | Define which subtitle tracks the output has and where to pull them from. Defaults to using only the first subtitle track from the first input file. 359 | 360 | Example: `--output-subtitle-mapping 1:1,1:0` - Takes the first and the second subtitle track from the second input file. The order is as specified, i.e. the tracks are flipped. 361 | 362 | #### --output 363 | 364 | Where to save the result. 365 | 366 | Example: `--output Result-EP01.mkv` - Saves the complete file to Result-EP01.mkv. 367 | 368 | #### --output-subtitle 369 | 370 | Save the synced subtitles. 371 | 372 | Example: `--output-subtitle Result-EP01.ass` - Saves the subtitle file to Result-EP01.ass. 373 | 374 | ## CowOCR 375 | 376 | Compare two video tracks and look for differences. The intention is to find differences as they will indicate e.g. subtitles and signs. 377 | 378 | The output is an .ass file and a report that can be used to verify and correct the output. 379 | 380 | ### How it works 381 | 382 | The basic assumption that CowOCR relies on is finding the differences between the source and destination video. To do this it goes through a few steps. 383 | 384 | The initial differences are found by running the ORB algorithm against both the source and target video; keypoints found in the source but not in the target are assumed to be differences. 385 | 386 | We now have a region we can assume is different, and we look for text in it. A threshold algorithm is run against the source and matching white areas are extracted. 387 | 388 | To determine what is part of the text, the color of every found area is extracted and grouped using k-means. Areas with colors close enough to the majority color found are considered part of the text. Additionally, the border color is used in the same way. 389 | 390 | A brute-force search is performed here to find the best text mask by cycling through the colors. 391 | 392 | With the text found and a mask matching the text (where it is in the picture), it is now time to figure out when it starts and ends. This is done by looping through the frames before and after the current frame and checking whether the colors match the extracted text, i.e. whether the same text is present in the frames before and after the current frame. 393 | 394 | ### How to use 395 | 396 | The most basic usage of extract-subtitles is: 397 | 398 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py path-to-subbed/episode-01.mkv path-to-unsubbed/episode-01.mkv extract-subtitles` 399 | 400 | This will compare the two video files and try to extract the subtitles. 401 | 402 | After the subtitles are extracted, a report plus an .ass file can be created from the output with the create-report command. 403 | 404 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py path-to-subbed/episode-01.mkv path-to-unsubbed/episode-01.mkv create-report` 405 | 406 | The report and subtitle are, by default, located in the cow-temp folder, which is created relative to where the command was executed. 407 | In this example, that is the folder that contains the path-to-subbed and path-to-unsubbed folders. 408 | 409 | Verify the subtitles and we're almost done; now we just need them merged.
For this we can use cartonizer with just one additional parameter, `--input-subtitle-path cow-temp/` - that will pull the subtitles from the .ass files instead of the source video file. 410 | 411 | This is likely not how the actual workflow will look. See further down for a realistic workflow. 412 | 413 | Below is a description of all arguments and how and when to use them. 414 | 415 | ### extract-subtitles arguments 416 | 417 | This command extracts the subtitles from the video. 418 | 419 | #### --threads 420 | 421 | How many threads to extract subtitles with. Unless specified, it runs one subtitle region at a time. 422 | 423 | Example: `--threads 1` - Use only one thread. 424 | 425 | #### --tesseract-data-path 426 | 427 | Path to the Tesseract data directory. 428 | 429 | Example: `--tesseract-data-path tess-data/` - Read data from the tess-data folder. 430 | 431 | #### --frame-diff 432 | 433 | When comparing source and target video it can sometimes be necessary to specify the frame difference. It should be sufficient to rely 434 | on the auto detection though. 435 | 436 | Example: `--frame-diff 8` - The target is 8 frames ahead of the source. 437 | 438 | #### --frame-range 439 | 440 | Specify which frames of the source to extract subtitles from; can be useful to e.g. skip the OP/ED. 441 | 442 | Example: `--frame-range 1000-5000` - Extracts subtitles from frame 1000 to 5000. 443 | 444 | #### --ignore-diff-fps 445 | 446 | As it uses frame differences to find subtitles, the FPS must be the same. Sometimes this can be ignored (e.g. if the source just runs faster but has the same frames). This option makes it ignore the criterion. 447 | 448 | Example: `--ignore-diff-fps` - Ignores FPS differences. 449 | 450 | #### --run-subregions-in-parallel 451 | 452 | Run extraction for each subtitle region in parallel. Each thread will run for each subtitle region, so the total number 453 | of threads will be threads times the subtitle region count. 454 | 455 | Example: `--run-subregions-in-parallel` - Run every subtitle region in parallel. 456 | 457 | #### --fix-broken-frame-alignment 458 | 459 | Sometimes frames drift a bit differently, so while the FPS is the same, one of the video files might have ghost frames or other annoyances. This tries to alleviate that issue. 460 | 461 | Example: `--fix-broken-frame-alignment` - Enable the frame alignment fix. 462 | 463 | #### --debug-frame 464 | 465 | While you are editing the subtitle region configuration it is necessary to try to extract a specific frame to see the result. 466 | This is the command for that. It will run the current subtitle region configuration for the given frame and save an array of outputs 467 | to the temp-folder/debug. 468 | 469 | It will also print out what the various files contain. 470 | 471 | Example: `--debug-frame 1000` - Try to extract subtitles from source frame 1000. 472 | 473 | #### --debug-subregion 474 | 475 | In combination with --debug-frame it will use a specific subtitle region. If not specified, defaults to the first subtitle region. 476 | 477 | Example: `--debug-subregion bottom` - Extract using the subtitle region named bottom. 478 | 479 | ### create-report arguments 480 | 481 | This command turns the extracted subtitles into a report and an .ass file. 482 | 483 | A report contains information for each subtitle region; this explanation is for the default config. 484 | The report is an HTML file you should open in your web browser, e.g. `cow-temp/Episode 1.avi-report/index.html`. In that report each region has two sections, "subtitle lines" and "missing regions".
485 | 486 | The "subtitle lines" are the found lines and these are reflected in the .ass file. 487 | With the bottom subtitles there are a few things: 488 | 489 | - A start and end timestamp of the subtitle 490 | - Start and end frame and the initial discovery frame. 491 | - The subtitle text 492 | - Four frames used to check if the timing is correct: before the first frame, the first frame, the last frame and after the last frame. If before-first or after-last contains the matching text, then the timing is off. 493 | 494 | The "missing regions" part contains images of areas where there are differences between source and target but the tool was unable to discover what exactly. Sometimes it is short words or un-OCRable subtitles. 495 | 496 | A subtitle-region scan does not yield the same type of results as it is unable to merge subtitle lines in the same way. It also contains green squares for matched text under the "subtitle signs" section. 497 | 498 | Make sure to browse through the "missing regions" section; no tool is perfect. 499 | 500 | #### --output-report-path 501 | 502 | Where to save the generated report. Defaults to the temp-dir. 503 | 504 | Example: `--output-report-path /mnt/sdd/subtitle-temp-reports` - Save the report to the specified path. 505 | 506 | #### --output-subtitle-path 507 | 508 | Where the .ass subtitle file is saved. Defaults to the temp-dir. 509 | 510 | Example: `--output-subtitle-path /mnt/sdd/subtitle-temp-subs` - Save the subtitles to the specified path. 511 | 512 | ### subtitle_regions.json 513 | 514 | This file is generated in the temp folder when the command is first run. Any video that uses a specific temp folder will use the same 515 | subtitle region file. A description of all available options can be found here. 516 | 517 | #### name 518 | 519 | Name of the subtitle region. Used with e.g. the --debug-subregion parameter. 520 | 521 | #### scan_mode 522 | 523 | Specify how to find subtitles in a region; there are two choices, `bottom_center` and `search_slice`. 524 | 525 | `bottom_center` looks for subtitles in the middle of the region and assumes there is at most one subtitle in the given region. 526 | Useful for normal subtitles at the bottom of the screen. 527 | 528 | `search_slice` looks around for differences that contain text, useful for e.g. signs. It cannot merge similar regions and can create a lot of duplicate lines. 529 | 530 | #### y, h, x, w, margin 531 | 532 | Specifies the dimensions of a subtitle region; it starts at `x`, `y` and ends at `x+w`, `y+h`. If you run with --debug-frame it will show where the regions are. 533 | 534 | The `margin` is the part of the region that cannot contain subtitles; any object that is part of it will be removed. Useful for the `bottom_center` scan mode where normal subtitles are not in the margin. 535 | 536 | #### area_min, area_max, area_min_density 537 | 538 | Minimum (`area_min`) and maximum (`area_max`) number of pixels a letter can contain, and the minimum density (`area_min_density`) a letter has. 539 | 540 | These can be useful to remove things that most certainly cannot be letters. 541 | 542 | #### max_w, max_h 543 | 544 | Maximum size of a letter in pixels. 545 | 546 | #### min_stroke_width, max_stroke_width 547 | 548 | Minimum and maximum stroke width a letter can have. These are measured at the thickest spot of a letter. 549 | 550 | For example, a long thin line will have a stroke width of 1px while a filled circle will have a width of its radius. 551 | 552 | #### border_size 553 | 554 | Assumed size of the border.
555 | 556 | This will often be either 1 or 2; it depends a bit on how the "Threshold" debug image looks, e.g. whether it consumes a lot of the border or not. 557 | 558 | See "How it works" to understand what it is useful for. 559 | 560 | #### max_text_diff, max_border_diff 561 | 562 | Maximum difference for text and border to be assumed part of the same text line. 563 | 564 | This depends a bit on how well the text is marked and extracted; if it finds too few letters it might be smart to turn these up, and vice versa if it finds too many. 565 | 566 | See "How it works" to understand what it is useful for. 567 | 568 | #### percent_good_border 569 | 570 | How much of the border of a given figure must be good for it to be assumed part of the text. 571 | 572 | See "How it works" to understand what it is useful for. 573 | 574 | #### edge_threshold 575 | 576 | Used when finding the difference between source and target frames. Should probably not be touched. 577 | 578 | See "How it works" to understand what it might be useful for. 579 | 580 | #### threshold_mode, threshold_value 581 | 582 | Method and value to threshold with. There are two modes, `adaptive` and `static`. 583 | 584 | `adaptive` finds out which pixel should be black and which should be white depending on the pixels around it. Can be useful if the inner text on the subtitles varies but is always bright. An example `threshold_value` for this could be 27, which will prevent most noise too. 585 | 586 | `static` is an absolute cutoff, useful if the inner subtitle text is always bright and the same color. An example `threshold_value` could be 200, which is the brightness cutoff. 587 | 588 | See "How it works" to understand what it is useful for and https://docs.opencv.org/4.5.2/d7/d4d/tutorial_py_thresholding.html for information about thresholding generally. 589 | 590 | #### ass_style_name 591 | 592 | Style name to use for text found here. 593 | 594 | #### invert_mode 595 | 596 | Not implemented, no effect. 597 | 598 | ### A realistic workflow 599 | 600 | In this example we have a set of 12 episodes we want to OCR; the source is 640x480, which matches the default subtitle region. 601 | Source files are located in `source-video` and the target files are in `target-video`. 602 | 603 | First we run cartonizer to create a batch script. 604 | 605 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cartonizer.py ocr source-video target-video --additional-params '--threads 1 --run-subregions-in-parallel'` 606 | 607 | This creates a file named `ocr_release.sh`, and it will be the script we run once we have modified `subtitle_regions.json` enough. 608 | 609 | We open up the `ocr_release.sh` file and find the OCR command for the first episode. We need the temp folder and configuration created before we can OCR it all. 610 | 611 | ``` 612 | ~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py \ 613 | 'source-video/Episode 1.mkv' \ 614 | 'target-video/Episode 1.mkv' \ 615 | extract-subtitles \ 616 | --threads 1 --run-subregions-in-parallel 617 | ``` 618 | 619 | That is the command that extracts subtitles from the first episode; it will be the one we use while modifying `subtitle_regions.json`. 620 | 621 | Let's see how good the default config is by running it against part of the episode; 5000 frames should suffice (that is about 3.5 minutes at 23.976 fps).
622 | 623 | ``` 624 | ~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py \ 625 | 'source-video/Episode 1.mkv' \ 626 | 'target-video/Episode 1.mkv' \ 627 | extract-subtitles \ 628 | --threads 1 --run-subregions-in-parallel \ 629 | --frame-range 5000-10000 # frame range we use for the test run 630 | ``` 631 | 632 | After it is done, create a report and see the result with: 633 | 634 | ``` 635 | ~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py \ 636 | 'source-video/Episode 1.mkv' \ 637 | 'target-video/Episode 1.mkv' \ 638 | create-report 639 | ``` 640 | 641 | The report is in the cow-temp folder in this example, along with an .ass file and the `subtitle_regions.json` file. 642 | 643 | To modify and test changes to `subtitle_regions.json` we find a good subtitle frame number in the report and use that. 644 | 645 | ``` 646 | ~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py \ 647 | 'source-video/Episode 1.mkv' \ 648 | 'target-video/Episode 1.mkv' \ 649 | extract-subtitles \ 650 | --threads 1 --run-subregions-in-parallel \ 651 | --debug-frame 13754 --debug-subregion bottom 652 | ``` 653 | 654 | We can then run the initial 5000 frames again and see if the result is good enough. If it is, then just run the whole `ocr_release.sh`. 655 | 656 | When it is done, the .ass file in the cow-temp folder must be corrected while following the report. I do this by loading the subtitle file and the source episode file into Aegisub. 657 | 658 | The subtitles must now be synced with the video, chapters added and so on. This can be done with Milksync. 659 | 660 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cartonizer.py sync source-video target-video --input-subtitle-path cow-temp/` 661 | 662 | Then run `create_release.sh` and you have a fully synced video.
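If you end up tweaking `subtitle_regions.json` repeatedly between `--debug-frame` runs, it can help to script the changes instead of editing by hand. The sketch below is only an illustration: it assumes the generated file is a JSON list of region objects using the keys described in the subtitle_regions.json section above, which may not match the exact layout CowOCR writes, and every number is a placeholder rather than a recommended value.

```python
# Hypothetical sketch: adjust one region in cow-temp/subtitle_regions.json.
# The real file layout may differ (a JSON list of region objects is assumed
# here); all numbers are placeholders, not recommended values.
import json
from pathlib import Path

config_path = Path("cow-temp/subtitle_regions.json")
regions = json.loads(config_path.read_text())

for region in regions:
    if region.get("name") == "bottom":
        # Tighten the scanned box and switch to a static brightness cutoff.
        region["y"] = 380
        region["h"] = 96
        region["margin"] = 8
        region["threshold_mode"] = "static"
        region["threshold_value"] = 200

config_path.write_text(json.dumps(regions, indent=2))
```

After each change, re-run the `--debug-frame` command above and check the images written to the temp folder's debug directory before committing to a full `ocr_release.sh` run.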
663 | 664 | # License 665 | 666 | AGPL -------------------------------------------------------------------------------- /cowocr.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import dataclasses 3 | import json 4 | import math 5 | import re 6 | import shlex 7 | import threading 8 | import time 9 | import traceback 10 | from collections import namedtuple 11 | from pathlib import Path 12 | 13 | import click 14 | import cv2 15 | import jinja2 16 | import lxml.html 17 | import matplotlib 18 | import matplotlib.pyplot as plt 19 | import numpy as np 20 | import pysubs2 21 | import pytesseract 22 | from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance 23 | from scipy.spatial.distance import cdist, cosine 24 | from skimage.metrics import structural_similarity 25 | from textblob import TextBlob 26 | from tqdm import tqdm, trange 27 | 28 | BASE_ASS = r"""[Script Info] 29 | ; Script generated by Aegisub 3.2.2 30 | ; http://www.aegisub.org/ 31 | Title: CowOCR 32 | ScriptType: v4.00+ 33 | WrapStyle: 0 34 | ScaledBorderAndShadow: yes 35 | PlayResX: 640 36 | PlayResY: 480 37 | 38 | [V4+ Styles] 39 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 40 | Style: Default,Open Sans Semibold,26.0,&H00FFFFFF,&H000000FF,&H00020713,&H00000000,-1,0,0,0,100.0,100.0,0.0,0.0,1,1.7,0.0,2,0,0,28,1 41 | Style: Sign,Open Sans Semibold,26.0,&H00FFFFFF,&H000000FF,&H00020713,&H00000000,-1,0,0,0,100.0,100.0,0.0,0.0,1,1.7,0.0,2,0,0,28,1 42 | Style: Note,Open Sans Semibold,20.0,&H00FFFFFF,&H000000FF,&H00020713,&H00000000,-1,0,0,0,100.0,100.0,0.0,0.0,1,1.7,0.0,8,0,0,28,1 43 | 44 | [Events] 45 | Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 46 | """ 47 | 48 | HTML_BASE = r""" 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | {{ title }} 59 | 60 | 61 | 77 | %(body)s 78 | 79 | 80 | """ 81 | 82 | HTML_INDEX = HTML_BASE % { 83 | "body": r""" 84 |

OCR Report for {{ video_filename }}

85 | {% for sr in subtitle_regions %} 86 |
87 |

Subtitle Region: {{sr.name}}

88 | {% if sr.scan_mode == 'bottom_center' %} 89 | Subtitle lines ({{ subtitle_region_data[sr.name]['subtitle_lines']|length }}) 90 | Missing regions ({{ subtitle_region_data[sr.name]['missing_regions']|length }}) 91 | {% endif %} 92 | {% if sr.scan_mode == 'search_slice' %} 93 | Subtitle signs ({{ subtitle_region_data[sr.name]['subtitle_signs']|length }}) 94 | Missing regions ({{ subtitle_region_data[sr.name]['missing_regions']|length }}) 95 | {% endif %} 96 |
97 | {% endfor %} 98 | """ 99 | } 100 | 101 | HTML_SUBTITLE_LINES = HTML_BASE % { 102 | "body": r""" 103 |

Subtitle lines for: {{ sr.name }}

104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | {% for subtitle_line in subtitle_lines %} 115 | 116 | 117 | 121 | 124 | 130 | 131 | {% endfor %} 132 | 133 |
#TimestampsTextFrames
{{loop.index}} 118 | {{ subtitle_line['from_frame_no']|totimestamp }}
{{ subtitle_line['to_frame_no']|totimestamp }}
119 | {{ subtitle_line['from_frame_no'] }}
{{ subtitle_line['to_frame_no'] }}
{{ subtitle_line['initial_frame_no'] }} 120 |
122 | {% autoescape false %}{{ subtitle_line['subtitle_text'] | replace('\n', '
') }}{% endautoescape %}
123 |
125 |
126 |
127 |
128 | 129 |
134 | """ 135 | } 136 | 137 | HTML_MISSING_REGIONS = HTML_BASE % { 138 | "body": r""" 139 |

Missed regions for: {{ sr.name }}

140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | {% for missing_region in missing_regions %} 150 | 151 | 152 | 153 | 154 | 155 | {% endfor %} 156 | 157 |
#TimestampsFrame
{{loop.index}}{{ missing_region['frame_no']|totimestamp }}
{{ missing_region['frame_no'] }}
158 | 159 |

Short subtitle signs

160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | {% for subtitle_sign in short_subtitle_signs %} 170 | 171 | 172 | 176 | 177 | 178 | {% endfor %} 179 | 180 |
#TimestampsFrame
{{loop.index}} 173 | {{ subtitle_sign['from_frame_no']|totimestamp }}
{{ subtitle_sign['to_frame_no']|totimestamp }}
174 | {{ subtitle_sign['from_frame_no'] }}
{{ subtitle_sign['to_frame_no'] }}
{{ subtitle_sign['initial_frame_no'] }} 175 |
181 | """ 182 | } 183 | 184 | HTML_SUBTITLE_SIGNS = HTML_BASE % { 185 | "body": r""" 186 |

Subtitle signs for: {{ sr.name }}

187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | {% for subtitle_sign in subtitle_signs %} 197 | 198 | 199 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | {% endfor %} 210 | 211 |
#TimestampsFrame
{{loop.index}} 200 | {{ subtitle_sign['from_frame_no']|totimestamp }}
{{ subtitle_sign['to_frame_no']|totimestamp }}
201 | {{ subtitle_sign['from_frame_no'] }}
{{ subtitle_sign['to_frame_no'] }}
{{ subtitle_sign['initial_frame_no'] }} 202 |
{% autoescape false %}{{ subtitle_sign['subtitle_text'] | replace('\n', '
') }}{% endautoescape %}
212 | """ 213 | } 214 | 215 | 216 | @dataclasses.dataclass 217 | class SubtitleRegion: 218 | class ScanMode: 219 | BOTTOM_CENTER = "bottom_center" 220 | SEARCH_SLICE = "search_slice" 221 | 222 | class InvertMode: 223 | NO_INVERT = "no_invert" 224 | INVERT_ONLY = "invert_only" 225 | BOTH_INVERT = "both_invert" 226 | 227 | name: str 228 | scan_mode: ScanMode 229 | y: int 230 | h: int 231 | x: int 232 | w: int 233 | margin: int 234 | area_min: int 235 | area_max: int 236 | area_min_density: float 237 | max_w: int 238 | max_h: int 239 | min_stroke_width: int 240 | max_stroke_width: int 241 | border_size: int 242 | max_text_diff: int 243 | max_border_diff: int 244 | percent_good_border: float 245 | edge_threshold: int 246 | threshold_mode: str 247 | threshold_value: int 248 | ass_style_name: str 249 | invert_mode: InvertMode 250 | 251 | 252 | default_subtitle_regions = [ 253 | SubtitleRegion( 254 | name="bottom", 255 | scan_mode=SubtitleRegion.ScanMode.BOTTOM_CENTER, 256 | y=376, 257 | h=100, 258 | x=0, 259 | w=640, 260 | margin=4, 261 | area_min=6, 262 | area_max=1000, 263 | area_min_density=0.2, 264 | max_w=160, 265 | max_h=40, 266 | min_stroke_width=2, 267 | max_stroke_width=7, 268 | border_size=2, 269 | max_text_diff=60, 270 | max_border_diff=60, 271 | percent_good_border=25, 272 | edge_threshold=8, 273 | threshold_mode="static", 274 | threshold_value=200, 275 | ass_style_name="Default", 276 | invert_mode=SubtitleRegion.InvertMode.NO_INVERT, 277 | ), 278 | SubtitleRegion( 279 | name="region-scan", 280 | scan_mode=SubtitleRegion.ScanMode.SEARCH_SLICE, 281 | y=0, 282 | h=380, 283 | x=0, 284 | w=640, 285 | margin=0, 286 | area_min=6, 287 | area_max=1000, 288 | area_min_density=0.2, 289 | max_w=160, 290 | max_h=50, 291 | min_stroke_width=2, 292 | max_stroke_width=7, 293 | border_size=2, 294 | max_text_diff=60, 295 | max_border_diff=60, 296 | percent_good_border=25, 297 | edge_threshold=3, 298 | threshold_mode="static", 299 | threshold_value=200, 300 | ass_style_name="Sign", 301 | invert_mode=SubtitleRegion.InvertMode.NO_INVERT, 302 | ), 303 | ] 304 | 305 | 306 | def crop(image): 307 | y_nonzero, x_nonzero, _ = np.nonzero(image) 308 | if len(y_nonzero) == 0: 309 | return None 310 | crop_space = 4 311 | y, x, = max( 312 | np.min(y_nonzero) - crop_space, 0 313 | ), max(np.min(x_nonzero) - crop_space, 0) 314 | h, w = np.max(y_nonzero) + crop_space - y, np.max(x_nonzero) + crop_space - x 315 | return image[y : y + h, x : x + w], (y, h, x, w) 316 | 317 | 318 | def ocr_region(frame_region, tesseract_data_path): 319 | custom_oem_psm_config = f"--psm 6 --tessdata-dir {shlex.quote(tesseract_data_path)}" # TODO, get datapath 320 | return pytesseract.image_to_pdf_or_hocr( 321 | frame_region, extension="hocr", config=custom_oem_psm_config, lang="eng" 322 | ) 323 | 324 | 325 | def ocr_region_from_mask( 326 | frame, mask, tesseract_data_path, gather_images=None, inverted=False 327 | ): 328 | mask = cv2.dilate( 329 | mask, cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)), iterations=1 330 | ) 331 | choppable_frame = frame & cv2.cvtColor(mask, cv2.COLOR_GRAY2RGB) 332 | if inverted: 333 | choppable_frame[mask == 0] = [255, 255, 255] 334 | choppable_frame = cv2.bitwise_not(choppable_frame) 335 | 336 | crop_result = crop(choppable_frame) 337 | if crop_result is None: 338 | return None, None, None 339 | image, crop_region = crop_result 340 | image = cv2.bitwise_not(image) 341 | image = cv2.resize(image, (image.shape[1] * 2, image.shape[0] * 2)) 342 | image = cv2.blur(image, (2, 2)) 343 | if gather_images is not 
None: 344 | gather_images.append(("Image for OCR", image)) 345 | return image, crop_region, ocr_region(image, tesseract_data_path) 346 | 347 | 348 | def frame_has_mask(src_frame, dst_frame, mask, min_percent=0.1, max_offset=12): 349 | orig_mask = mask.copy() 350 | orig_mask = cv2.cvtColor(orig_mask, cv2.COLOR_GRAY2BGR) 351 | mask = mask.astype("bool") 352 | c = np.abs(src_frame[mask].flatten() - dst_frame[mask].flatten()).astype("uint8") 353 | c += max_offset 354 | return len(c[c > (max_offset * 2)]) < int(min_percent * len(c)) 355 | 356 | 357 | def find_frames_with_mask( 358 | video, 359 | initial_frame_no, 360 | src_frame, 361 | mask, 362 | shape, 363 | region, 364 | max_frames_backward=18, 365 | max_frames_forward=600, 366 | max_frames=None, 367 | ): 368 | frames_with_mask = [] 369 | for r in [range(0, -max_frames_backward - 1, -1), range(1, max_frames_forward)]: 370 | for i in r: 371 | frame_no = initial_frame_no + i 372 | if frame_no >= max_frames: 373 | break 374 | if frame_no > 0: 375 | dst_frame = get_frame(video, frame_no, shape, region=region) 376 | else: 377 | dst_frame = None 378 | if dst_frame is None or not frame_has_mask(src_frame, dst_frame, mask): 379 | break 380 | frames_with_mask.append(frame_no) 381 | if not frames_with_mask: 382 | return None, None 383 | 384 | return min(frames_with_mask), max(frames_with_mask) 385 | 386 | 387 | def remove_smaller_cc(mask): 388 | dilated_mask = cv2.dilate( 389 | mask.copy(), cv2.getStructuringElement(cv2.MORPH_RECT, (3, 5)), iterations=6 390 | ) 391 | numLabels, labels, stats, centroids = cv2.connectedComponentsWithStats( 392 | dilated_mask, connectivity=4 393 | ) 394 | sizes = stats[:, -1] 395 | i = np.argmax(sizes[1:]) + 1 396 | labels[labels != i] = 0 397 | labels[labels == i] = 255 398 | return mask & labels.astype("uint8"), sizes[i] 399 | 400 | 401 | def estimate_line_width(labels): 402 | labels = labels.copy() 403 | labels = np.repeat(labels, 2, axis=0) 404 | labels = np.repeat(labels, 2, axis=1) 405 | mask = labels.copy() 406 | mask[mask > 0] = 1 407 | mask = mask.astype("uint8") 408 | label_line_width = {} 409 | i = 0 410 | current_labels = set(np.unique(labels)) 411 | 412 | label_count = {} 413 | for label, count in zip(*np.unique(labels, return_counts=True)): 414 | label_count[label] = [count] 415 | while True: 416 | mask = cv2.erode( 417 | mask, cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)), iterations=1 418 | ) 419 | i += 1 420 | new_labels = set(np.unique(labels * mask)) 421 | label_line_width[i] = current_labels - new_labels 422 | if len(new_labels) <= 1: 423 | break 424 | current_labels = new_labels 425 | for label, count in zip(*np.unique(labels * mask, return_counts=True)): 426 | label_count[label].append(count) 427 | return label_line_width 428 | 429 | 430 | def find_potential_text_block_areas(orig_labels, iterations=7): 431 | mask = orig_labels.copy() 432 | mask[mask > 0] = 255 433 | mask = mask.astype("uint8") 434 | mask = cv2.dilate( 435 | mask, cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)), iterations=iterations 436 | ) 437 | numLabels, labels, stats, centroids = cv2.connectedComponentsWithStats( 438 | mask, connectivity=4 439 | ) 440 | shape = labels.shape 441 | mid = shape[1] // 2 442 | for i, (x, y, w, h, area) in enumerate(stats): 443 | if x < mid and x + w > mid and h >= 20: 444 | labels[labels == i] = 0 445 | region_space = 16 446 | source_region = np.zeros(mask.shape, dtype="bool") 447 | source_region[0 : mask.shape[0], mid - region_space : mid + region_space] = True 448 | return 
np.unique(orig_labels[labels.astype("bool")]), source_region 449 | 450 | 451 | def do_threshold(sr, frame): 452 | if sr.threshold_mode == "adaptive": 453 | thresh = cv2.adaptiveThreshold( 454 | cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY), 455 | 255, 456 | cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 457 | cv2.THRESH_BINARY, 458 | sr.threshold_value, 459 | 2, 460 | ) 461 | elif sr.threshold_mode == "static": 462 | thresh = cv2.threshold( 463 | cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY), 464 | sr.threshold_value, 465 | 255, 466 | cv2.THRESH_BINARY, 467 | )[1] 468 | return thresh 469 | 470 | 471 | def extract_text_subregion( 472 | frame, sr, tesseract_data_path, gather_images=None, inverted=False 473 | ): 474 | thresh = do_threshold(sr, frame) 475 | if inverted: 476 | thresh = cv2.bitwise_not(thresh) 477 | numLabels, labels = cv2.connectedComponents(thresh, connectivity=4) 478 | for label, count in zip(*np.unique(labels, return_counts=True)): 479 | if count <= sr.area_max: 480 | labels[labels == label] = 0 481 | labels[labels > 0] = 255 482 | thresh = thresh - labels.astype("uint8") 483 | 484 | numLabels, labels, stats, centroids = cv2.connectedComponentsWithStats( 485 | thresh, connectivity=4 486 | ) 487 | subtitle_region_labels = labels.copy() 488 | if gather_images is not None: 489 | gather_images.append(("Thresh", thresh)) 490 | 491 | label_stats = {} 492 | for i, (x, y, w, h, area) in enumerate(stats): 493 | if i == 0: 494 | continue 495 | if ( 496 | sr.max_h >= h 497 | and sr.max_w >= w 498 | and sr.area_min <= area 499 | and sr.area_max >= area 500 | and sr.margin <= y 501 | and frame.shape[0] - sr.margin >= y + h # TODO: margin remove 502 | and sr.margin <= x 503 | and frame.shape[1] - sr.margin >= x + w 504 | and sr.area_min_density < (area / (h * w)) 505 | ): 506 | label_stats[i] = (x, y, w, h, area) 507 | else: 508 | labels[labels == i] = 0 509 | 510 | dust_labels, starting_area = find_potential_text_block_areas(labels) 511 | for i in dust_labels: 512 | if i == 0: 513 | continue 514 | labels[labels == i] = 0 515 | del label_stats[i] 516 | 517 | estimated_line_width = estimate_line_width(labels) 518 | correct_stroke_width = [ 519 | vv 520 | for v in [ 521 | v 522 | for (k, v) in estimated_line_width.items() 523 | if k <= sr.max_stroke_width and k >= sr.min_stroke_width 524 | ] 525 | for vv in v 526 | ] 527 | labels[np.isin(labels, correct_stroke_width, invert=True)] = 0 528 | 529 | if gather_images is not None: 530 | gather_images.append(("final mask", labels)) 531 | 532 | mask = labels.astype("bool") 533 | 534 | pixels = frame[mask] 535 | if len(pixels) == 1: 536 | return None, None 537 | pixels = np.float32(pixels) 538 | n_colors = min(20, len(pixels)) 539 | if n_colors == 0: 540 | return None, None 541 | criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200, 0.1) 542 | flags = cv2.KMEANS_PP_CENTERS 543 | _, color_labels, palette = cv2.kmeans(pixels, n_colors, None, criteria, 10, flags) 544 | _, counts = np.unique(color_labels, return_counts=True) 545 | 546 | starting_area_labels = np.full(labels.shape, 255, dtype="uint8") 547 | starting_area_labels[mask] = color_labels.flatten() 548 | 549 | starting_labels, starting_labels_counts = np.unique( 550 | starting_area_labels[starting_area], return_counts=True 551 | ) 552 | starting_labels_label_enum = sorted( 553 | [(l, c) for (l, c) in zip(starting_labels, starting_labels_counts) if l != 255], 554 | key=lambda x: x[1], 555 | reverse=True, 556 | ) 557 | starting_labels = [x[0] for x in starting_labels_label_enum] 558 | 559 | consumed_labels 
= set() 560 | label_enum = sorted(enumerate(counts), key=lambda l: l[1], reverse=True) 561 | region_candidates = [] 562 | for label, _ in label_enum: 563 | if label not in starting_labels: 564 | continue 565 | if label in consumed_labels: 566 | continue 567 | dominant_labels = [label] 568 | dominant = palette[label] 569 | for i, color in enumerate(palette): 570 | if i == label: 571 | continue 572 | if np.linalg.norm(dominant - color) < sr.max_text_diff: 573 | dominant_labels.append(i) 574 | 575 | consumed_labels |= set(dominant_labels) 576 | labels_inner = labels.copy() 577 | dominant_color_labels = color_labels.copy() 578 | dominant_color_labels[ 579 | np.isin(dominant_color_labels, list(dominant_labels), invert=True) 580 | ] = 0 581 | dominant_color_labels[dominant_color_labels >= 1] = 1 582 | 583 | labels_subset = np.unique( 584 | labels[mask][dominant_color_labels.astype("bool").flatten()] 585 | ) 586 | labels_subset = labels_subset[labels_subset != 0] 587 | 588 | labels_inner[np.isin(labels_inner, labels_subset, invert=True)] = 0 589 | labels_inner[labels_inner > 0] = 255 590 | labels_inner = labels_inner.astype("uint8") 591 | 592 | labels_border = ( 593 | cv2.dilate( 594 | labels_inner, 595 | cv2.getStructuringElement( 596 | cv2.MORPH_RECT, (sr.border_size * 2 + 1, sr.border_size * 2 + 1) 597 | ), 598 | iterations=1, 599 | ) 600 | - labels_inner 601 | ) 602 | border_pixels = frame[labels_border.astype("bool")] 603 | border_pixels = np.float32(border_pixels) 604 | border_n_colors = min(20, len(border_pixels)) 605 | 606 | if border_n_colors == 0: 607 | continue 608 | 609 | border_criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200, 0.1) 610 | border_flags = cv2.KMEANS_PP_CENTERS 611 | _, border_color_labels, border_palette = cv2.kmeans( 612 | border_pixels, border_n_colors, None, border_criteria, 10, border_flags 613 | ) 614 | _, border_counts = np.unique(border_color_labels, return_counts=True) 615 | 616 | starting_area_border_labels = np.full(labels.shape, 255, dtype="uint8") 617 | starting_area_border_labels[ 618 | labels_border.astype("bool") 619 | ] = border_color_labels.flatten() 620 | 621 | consumed_border_labels = set() 622 | border_label_enum = sorted( 623 | enumerate(border_counts), key=lambda l: l[1], reverse=True 624 | ) 625 | for label, _ in border_label_enum: 626 | if label in consumed_border_labels: 627 | continue 628 | consumed_border_labels.add(label) 629 | dominant_labels_border = [label] 630 | dominant = border_palette[label] 631 | for i, color in enumerate(border_palette): 632 | if i in consumed_border_labels: 633 | continue 634 | if i == label: 635 | continue 636 | if np.linalg.norm(dominant - color) < sr.max_border_diff: 637 | # print(f"Merging {i=} {dominant=} and {color=}") 638 | dominant_labels_border.append(i) 639 | consumed_border_labels.add(i) 640 | break 641 | 642 | frame_border = np.zeros(frame.shape[:2], dtype="uint8") 643 | frame_border[labels_border.astype("bool")] = border_color_labels.flatten() + 1 644 | 645 | border_labels_subset = set(labels_subset) 646 | for i, (x, y, w, h, area) in enumerate(stats): 647 | if i not in border_labels_subset: 648 | continue 649 | mask_label = labels.copy() 650 | mask_label[mask_label != i] = 0 651 | mask_label[mask_label == i] = 255 652 | mask_label = mask_label.astype("uint8") 653 | mask_label_border = ( 654 | cv2.dilate( 655 | mask_label, 656 | cv2.getStructuringElement( 657 | cv2.MORPH_RECT, (sr.border_size * 2 + 1, sr.border_size * 2 + 1) 658 | ), 659 | iterations=1, 660 | ) 661 | - mask_label 662 
| ) 663 | border_frame = frame_border & mask_label_border 664 | border_labels, border_counts = np.unique(border_frame, return_counts=True) 665 | border_labels = border_labels[1:] - 1 666 | border_counts = border_counts[1:] 667 | good, bad = 0, 0 668 | for border_label, border_count in zip(border_labels, border_counts): 669 | if border_label in dominant_labels_border: 670 | good += border_count 671 | else: 672 | bad += border_count 673 | if round((good / (good + bad)) * 100) < sr.percent_good_border: 674 | border_labels_subset.remove(i) 675 | 676 | border_labels_subset = np.array(list(border_labels_subset)) 677 | result_mask = labels.copy() 678 | result_mask[np.isin(result_mask, border_labels_subset, invert=True)] = 0 679 | result_mask[result_mask > 0] = 255 680 | result_mask = result_mask.astype("uint8") 681 | # result_mask = cv2.dilate(result_mask, cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)), iterations=1) 682 | if gather_images is not None: 683 | gather_images.append(("final region label", result_mask)) 684 | image, crop_region, hocr = ocr_region_from_mask( 685 | frame, 686 | result_mask, 687 | tesseract_data_path, 688 | gather_images=gather_images, 689 | inverted=inverted, 690 | ) 691 | if hocr is not None and hocr_to_text(hocr): 692 | if gather_images is not None: 693 | print("Found HOCR") 694 | print(hocr.decode()) 695 | region_candidates.append((result_mask, hocr)) 696 | return region_candidates 697 | 698 | 699 | def find_diff_regions( 700 | src_frame, 701 | dst_frame, 702 | gather_images=None, 703 | nfeatures=2000, 704 | min_kp_region=100, 705 | edge_threshold=14, 706 | ): 707 | orb = cv2.ORB_create( 708 | edgeThreshold=edge_threshold, patchSize=edge_threshold, nfeatures=nfeatures 709 | ) 710 | src_kp = orb.detect(src_frame, None) 711 | if gather_images is not None: 712 | gather_images.append( 713 | ( 714 | "Source keypoints", 715 | cv2.drawKeypoints(src_frame, src_kp, None, color=(0, 255, 0), flags=0), 716 | ) 717 | ) 718 | 719 | orb = cv2.ORB_create( 720 | edgeThreshold=edge_threshold, patchSize=edge_threshold, nfeatures=50000 721 | ) 722 | dst_kp = orb.detect(dst_frame, None) 723 | if gather_images is not None: 724 | gather_images.append( 725 | ( 726 | "Destination keypoints", 727 | cv2.drawKeypoints(dst_frame, dst_kp, None, color=(0, 255, 0), flags=0), 728 | ) 729 | ) 730 | 731 | kp_mask = np.zeros(src_frame.shape[:2], dtype="uint8") 732 | 733 | src_kp_vector = np.array([k.pt + (k.angle / 5,) for k in src_kp]) 734 | dst_kp_vector = np.array([k.pt + (k.angle / 5,) for k in dst_kp]) 735 | if len(src_kp_vector) == 0: 736 | return kp_mask 737 | 738 | if len(dst_kp_vector) == 0: 739 | good_kps = list(src_kp) 740 | else: 741 | C = cdist(dst_kp_vector, src_kp_vector, metric="euclidean") 742 | good_kps = [] 743 | for k, cost in zip(src_kp, np.min(C, axis=0)): 744 | if cost > 6.0: 745 | good_kps.append(k) 746 | 747 | if gather_images is not None: 748 | kp_diff_src = cv2.drawKeypoints( 749 | src_frame, good_kps, None, color=(0, 255, 0), flags=0 750 | ) 751 | gather_images.append(("KP Diff Source", kp_diff_src)) 752 | 753 | for kp in good_kps: 754 | x, y = kp.pt 755 | kp_mask[int(y), int(x)] = 255 756 | 757 | mask = cv2.dilate( 758 | kp_mask, cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3)), iterations=6 759 | ) 760 | numLabels, labels = cv2.connectedComponents(mask, connectivity=8) 761 | for i in range(1, numLabels): 762 | label_mask = labels.copy() 763 | label_mask[label_mask != i] = 0 764 | label_mask = kp_mask & label_mask.astype(bool) 765 | if len(label_mask[label_mask > 0]) 
< min_kp_region: 766 | labels[labels == i] = 0 767 | 768 | labels[labels > 0] = 255 769 | mask = labels.astype("uint8") 770 | if gather_images is not None: 771 | gather_images.append(("Mask", mask)) 772 | 773 | return mask 774 | 775 | 776 | def evaluate_if_inverted(sr, frame, mask): 777 | thresh = do_threshold(sr, frame) 778 | 779 | numLabels, labels = cv2.connectedComponents(thresh & mask, connectivity=4) 780 | numLabelsInverted, labelsInverted = cv2.connectedComponents( 781 | cv2.bitwise_not(thresh) & mask, connectivity=4 782 | ) 783 | 784 | return numLabels < numLabelsInverted 785 | 786 | 787 | def get_frame(video, frame_no, shape=None, region=None, skip_align=False): 788 | if frame_no < 0: 789 | raise Exception(f"Trying to get {frame_no}") 790 | 791 | video.set(cv2.CAP_PROP_POS_FRAMES, frame_no) 792 | ret, frame = video.read() 793 | 794 | if not ret: 795 | raise Exception(f"Failed to get frame {frame_no}") 796 | 797 | if skip_align: 798 | return frame 799 | 800 | if shape is not None: 801 | sy, sx, sz = shape 802 | fy, fx, fz = frame.shape 803 | frame_aspect = fx / fy 804 | 805 | if frame_aspect > sx / sy: 806 | new_fx = frame_aspect * sy 807 | slice_each_x = int((fx - new_fx) / 2) 808 | frame = frame[0:fy, slice_each_x : (fx - slice_each_x)] 809 | elif frame_aspect < sx / sy: 810 | new_fx = (sx / sy) * fy 811 | append_each_x = int((new_fx - fx) / 2) 812 | if append_each_x > 0: 813 | frame = cv2.copyMakeBorder( 814 | frame, 815 | 0, 816 | 0, 817 | append_each_x, 818 | append_each_x, 819 | cv2.BORDER_CONSTANT, 820 | None, 821 | (0, 0, 0), 822 | ) 823 | 824 | if frame.shape > shape: 825 | frame = cv2.resize(frame, (sx, sy)) 826 | 827 | if region is not None: 828 | frame = frame[region[0] : region[1], region[2] : region[3]] 829 | 830 | return frame 831 | 832 | 833 | def hocr_to_text(hocr): 834 | def parse_hocr_title(title): 835 | result = {} 836 | for entry in title.split(";"): 837 | if not entry.strip(): 838 | continue 839 | entry_key, entry_value = entry.strip().split(" ", 1) 840 | if entry_key in ["bbox", "x_ascenders", "x_wconf"]: 841 | result[entry_key] = [ 842 | int(float(x)) for x in entry_value.strip().split(" ") 843 | ] 844 | elif entry_key in ["x_size", "x_descenders", "baseline"]: 845 | result[entry_key] = [float(x) for x in entry_value.strip().split(" ")] 846 | return result 847 | 848 | tree = lxml.html.fromstring(hocr) 849 | lines, score, line_consumptions = [], [], [] 850 | for line in tree.xpath(r"//span[@class='ocr_line']"): 851 | line_specs = parse_hocr_title(line.attrib["title"]) 852 | line_consumption = [] 853 | line_text = [] 854 | for word in line.xpath(r"./span[@class='ocrx_word']"): 855 | specs = parse_hocr_title(word.attrib["title"]) 856 | score.append(specs["x_wconf"]) 857 | line_consumption.append(specs["bbox"][2] - specs["bbox"][0]) 858 | line_text.append(word.text) 859 | lines.append(" ".join(line_text)) 860 | line_consumptions.append( 861 | min(sum(line_consumption), line_specs["bbox"][2] - line_specs["bbox"][0]) 862 | ) 863 | if not lines: 864 | return 0, "", [] 865 | return np.average(np.array(score)), "\n".join(lines), line_consumptions 866 | 867 | 868 | def get_subtitle_mask( 869 | frame, sr, tesseract_data_path, gather_images=None, inverted=False 870 | ): 871 | subregion_candidates = extract_text_subregion( 872 | frame, sr, tesseract_data_path, gather_images=gather_images, inverted=inverted 873 | ) 874 | if subregion_candidates and subregion_candidates[0] is not None: 875 | candidate_scores = [] 876 | for result_mask, hocr in subregion_candidates: 877 | 
score, text, line_consumptions = hocr_to_text(hocr) 878 | if not text or not line_consumptions: 879 | continue 880 | percent_filled = sorted(line_consumptions, reverse=True)[0] / frame.shape[1] 881 | candidate_scores.append( 882 | (score * percent_filled, score, percent_filled, result_mask, text) 883 | ) 884 | if score > 65 and percent_filled > 0.8: 885 | return result_mask, text 886 | if candidate_scores: 887 | return sorted(candidate_scores, reverse=True, key=lambda x: x[0])[0][3:] 888 | return None, None 889 | 890 | 891 | def slice_mask_regions( 892 | frame, 893 | mask, 894 | sr, 895 | orig_region, 896 | tesseract_data_path, 897 | gather_images=None, 898 | inverted=False, 899 | ): 900 | subtitles = [] 901 | numLabels, labels, stats, centroids = cv2.connectedComponentsWithStats( 902 | mask, connectivity=4 903 | ) 904 | for i, (x, y, w, h, area) in enumerate(stats): 905 | if i == 0: 906 | continue 907 | region = (max(y - 4, 0), y + h + 4, max(x - 4, 4), x + w + 4) 908 | sliced_frame = frame[region[0] : region[1], region[2] : region[3]].copy() 909 | subtitle_mask, subtitle_text = get_subtitle_mask( 910 | sliced_frame, 911 | sr, 912 | tesseract_data_path, 913 | gather_images=gather_images, 914 | inverted=inverted, 915 | ) 916 | if subtitle_text: 917 | subtitle_region = ( 918 | region[0] + orig_region[0], 919 | region[0] + sliced_frame.shape[0] + orig_region[0], 920 | region[2] + orig_region[2], 921 | region[2] + sliced_frame.shape[1] + orig_region[2], 922 | ) 923 | subtitles.append( 924 | (subtitle_mask, subtitle_text, sliced_frame, subtitle_region) 925 | ) 926 | else: 927 | subtitles.append((subtitle_mask, None, sliced_frame, None)) 928 | return subtitles 929 | 930 | 931 | def loop_frames( 932 | src_video, 933 | dst_video, 934 | start_frame_no, 935 | end_frame_no, 936 | frame_diff, 937 | sr, 938 | tesseract_data_path, 939 | frame_steps=9, 940 | fix_broken_frame_alignment=False, 941 | edge_threshold=14, 942 | gather_images=None, 943 | max_frames=None, 944 | ): 945 | src_video.set(cv2.CAP_PROP_POS_FRAMES, 0) 946 | _, src_frame = src_video.read() 947 | 948 | shape = src_frame.shape 949 | region = (sr.y, sr.y + sr.h, sr.x, sr.x + sr.w) 950 | min_frame_no = 0 951 | 952 | if frame_diff < 0: 953 | start_frame_no = max(start_frame_no - frame_diff, 0) 954 | 955 | for frame_no in tqdm( 956 | range(start_frame_no, end_frame_no, frame_steps), 957 | desc=f"Region: {sr.name} Framerange: {start_frame_no}-{end_frame_no}", 958 | ): 959 | if frame_no < min_frame_no: 960 | continue 961 | if fix_broken_frame_alignment: 962 | last_frame_no = frame_no + 2 963 | if max_frames is not None: 964 | last_frame_no = min(last_frame_no, max_frames) 965 | frame_no_range = range(max(frame_no - 1, 0, -frame_diff), last_frame_no) 966 | else: 967 | frame_no_range = range(frame_no, frame_no + 1) 968 | 969 | frames = [] 970 | 971 | for frame_no_actual in frame_no_range: 972 | src_frame = get_frame(src_video, frame_no_actual, shape, region=region) 973 | dst_frame = get_frame( 974 | dst_video, frame_no_actual + frame_diff, shape, region=region 975 | ) 976 | 977 | frames.append( 978 | ( 979 | src_frame, 980 | dst_frame, 981 | frame_no_actual, 982 | structural_similarity(src_frame, dst_frame, multichannel=True), 983 | ) 984 | ) 985 | 986 | src_frame, dst_frame, frame_no = sorted( 987 | frames, key=lambda x: x[3], reverse=True 988 | )[0][:3] 989 | if gather_images is not None: 990 | gather_images.append((f"Source frame {frame_no}", src_frame)) 991 | gather_images.append((f"Destination frame {frame_no}", dst_frame)) 992 | 993 | mask 
= find_diff_regions( 994 | src_frame, 995 | dst_frame, 996 | gather_images=gather_images, 997 | edge_threshold=edge_threshold, 998 | ) 999 | values, counts = np.unique(mask, return_counts=True) 1000 | 1001 | if len(counts) > 1 and counts[1] > 150: 1002 | inverted = evaluate_if_inverted(sr, src_frame, mask) 1003 | found_to_frames = [] 1004 | subtitles = [] 1005 | if sr.scan_mode == SubtitleRegion.ScanMode.SEARCH_SLICE: 1006 | subtitles += slice_mask_regions( 1007 | src_frame, 1008 | mask, 1009 | sr, 1010 | region, 1011 | tesseract_data_path, 1012 | gather_images=gather_images, 1013 | inverted=inverted, 1014 | ) 1015 | else: 1016 | subtitles.append( 1017 | get_subtitle_mask( 1018 | src_frame, 1019 | sr, 1020 | tesseract_data_path, 1021 | gather_images=gather_images, 1022 | inverted=inverted, 1023 | ) 1024 | + ( 1025 | src_frame, 1026 | region, 1027 | ) 1028 | ) 1029 | 1030 | for ( 1031 | subtitle_mask, 1032 | subtitle_text, 1033 | subtitle_src_frame, 1034 | subtitle_region, 1035 | ) in subtitles: 1036 | if gather_images is not None: 1037 | gather_images.append( 1038 | (f"Frame: {frame_no} - text: {subtitle_text}", subtitle_mask) 1039 | ) 1040 | 1041 | if subtitle_text: 1042 | from_frame_no, to_frame_no = find_frames_with_mask( 1043 | src_video, 1044 | frame_no, 1045 | subtitle_src_frame, 1046 | subtitle_mask, 1047 | shape, 1048 | subtitle_region, 1049 | max_frames=max_frames, 1050 | ) 1051 | if from_frame_no is None: 1052 | continue 1053 | found_to_frames.append(to_frame_no) 1054 | if sr.scan_mode == SubtitleRegion.ScanMode.SEARCH_SLICE: 1055 | yield { 1056 | "type": "subtitle_sign", 1057 | "from_frame_no": from_frame_no, 1058 | "to_frame_no": to_frame_no, 1059 | "initial_frame_no": frame_no, 1060 | "subtitle_text": subtitle_text, 1061 | "region": sr.name, 1062 | "position": [int(p) for p in subtitle_region], 1063 | } 1064 | else: 1065 | yield { 1066 | "type": "subtitle", 1067 | "from_frame_no": from_frame_no, 1068 | "to_frame_no": to_frame_no, 1069 | "initial_frame_no": frame_no, 1070 | "subtitle_text": subtitle_text, 1071 | "region": sr.name, 1072 | } 1073 | else: 1074 | yield { 1075 | "type": "missed_region", 1076 | "frame_no": frame_no, 1077 | "region": sr.name, 1078 | } 1079 | 1080 | if found_to_frames: 1081 | min_frame_no = min(found_to_frames) 1082 | 1083 | 1084 | def frame_generator(start_i): 1085 | for i in range(1, 300): 1086 | if i >= start_i: 1087 | continue 1088 | yield start_i + i 1089 | yield start_i - i 1090 | 1091 | 1092 | def find_good_frame_breakpoint(video, current_frame): # TODO: do binary search instead? 
1093 | compare_frame_size = (32, 32) 1094 | frame_cache = {} 1095 | 1096 | def get_frame(frame_no): 1097 | if frame_no not in frame_cache: 1098 | video.set(cv2.CAP_PROP_POS_FRAMES, frame_no) 1099 | frame_cache[frame_no] = cv2.cvtColor( 1100 | cv2.resize(video.read()[1], compare_frame_size), cv2.COLOR_BGR2GRAY 1101 | ) 1102 | return frame_cache[frame_no] 1103 | 1104 | best_score = 1.0 1105 | best_frame = current_frame 1106 | for frame_no in frame_generator(current_frame): 1107 | score = structural_similarity(get_frame(frame_no), get_frame(frame_no + 1)) 1108 | if score < best_score: 1109 | best_score = score 1110 | best_frame = frame_no 1111 | if score < 0.65: 1112 | return frame_no - current_frame 1113 | return best_frame - current_frame 1114 | 1115 | 1116 | def estimate_video_frame_diff( 1117 | source_video, target_video, current_source_frame, current_target_frame 1118 | ): 1119 | frame_index_size = (64, 64) 1120 | compare_frame_count = 5 1121 | spread_frame_count = 14 1122 | ret, source_frame = source_video.read() 1123 | ret, target_frame = target_video.read() 1124 | 1125 | sy, sx, sz = source_frame.shape 1126 | ty, tx, tz = target_frame.shape 1127 | 1128 | s_aspect = sx / sy 1129 | t_aspect = tx / ty 1130 | 1131 | source_frames = [] 1132 | target_frames = [] 1133 | 1134 | source_from_frame = current_source_frame - (compare_frame_count // 2) 1135 | source_to_frame = source_from_frame + compare_frame_count 1136 | 1137 | source_video.set(cv2.CAP_PROP_POS_FRAMES, source_from_frame) 1138 | for _ in range(source_from_frame, source_to_frame): 1139 | frame_no = source_video.get(cv2.CAP_PROP_POS_FRAMES) 1140 | source_frames.append( 1141 | ( 1142 | cv2.cvtColor( 1143 | cv2.resize(source_video.read()[1], frame_index_size), 1144 | cv2.COLOR_BGR2GRAY, 1145 | ), 1146 | frame_no, 1147 | ) 1148 | ) 1149 | 1150 | target_from_frame = current_target_frame - spread_frame_count 1151 | target_to_frame = target_from_frame + (spread_frame_count * 2) 1152 | 1153 | target_video.set(cv2.CAP_PROP_POS_FRAMES, target_from_frame) 1154 | for _ in range(target_from_frame, target_to_frame): 1155 | frame_no = target_video.get(cv2.CAP_PROP_POS_FRAMES) 1156 | target_frame = target_video.read()[1] 1157 | if s_aspect > t_aspect: 1158 | new_tx = (tx / sx) * sy 1159 | slice_each_x = int((tx - new_tx) / 2) 1160 | target_frame = target_frame[0:ty, slice_each_x : (tx - slice_each_x)] 1161 | 1162 | target_frame = cv2.resize(target_frame, frame_index_size) 1163 | target_frames.append((cv2.cvtColor(target_frame, cv2.COLOR_BGR2GRAY), frame_no)) 1164 | best_diff = 0 1165 | best_frame_diff = None 1166 | for i in range(len(target_frames) - len(source_frames)): 1167 | v = target_frames[i : i + len(source_frames)] 1168 | diffs = [] 1169 | frame_nos = [] 1170 | for sf, tf in zip(source_frames, v): 1171 | sf, sfn = sf 1172 | tf, tfn = tf 1173 | frame_nos.append((sfn, tfn)) 1174 | diffs.append(structural_similarity(sf, tf, multichannel=False)) 1175 | diffs = np.square(np.array(diffs) * 100) 1176 | if sum(diffs) > best_diff: 1177 | best_diff = sum(diffs) 1178 | best_frame_diff = (target_from_frame + i) - source_from_frame 1179 | 1180 | return best_frame_diff 1181 | 1182 | 1183 | def totimestamp(fps, frame_count): 1184 | s = frame_count / fps 1185 | m, s = divmod(s, 60) 1186 | h, m = divmod(m, 60) 1187 | return f"{int(h):02}:{int(m):02}:{(s):06.3f}" 1188 | 1189 | 1190 | def make_time(frames, fps, pos=None): 1191 | ms = pysubs2.time.frames_to_ms(frames, fps) 1192 | if pos == "end": 1193 | ms = math.ceil((ms / 10)) * 10 1194 | elif pos == 
"start": 1195 | ms -= 10 1196 | actual_frames = pysubs2.time.ms_to_frames(ms, fps) 1197 | return ms 1198 | 1199 | 1200 | def save_frame_and_return_path(output_path, video, frame_no, position, region): 1201 | fn = f"frame-{frame_no:05}" 1202 | if position: 1203 | fn += "-" + "-".join([str(i) for i in position]) 1204 | if region: 1205 | fn += f"-r-{region.y}-{region.y + region.h}-{region.x}-{region.x + region.w}" 1206 | fn += ".jpg" 1207 | output_path.mkdir(exist_ok=True) 1208 | output_file = output_path / fn 1209 | if not output_file.exists(): 1210 | video.set(cv2.CAP_PROP_POS_FRAMES, frame_no) 1211 | ret, frame = video.read() 1212 | if position: 1213 | cv2.rectangle( 1214 | frame, 1215 | (position[2], position[0]), 1216 | (position[3], position[1]), 1217 | (0, 255, 0), 1218 | 3, 1219 | ) 1220 | if region: 1221 | frame = frame[ 1222 | region.y : region.y + region.h, region.x : region.x + region.w 1223 | ] 1224 | if frame.shape[1] > 640: 1225 | frame = cv2.resize( 1226 | frame, (640, int(frame.shape[0] / (frame.shape[1] / 640))) 1227 | ) 1228 | cv2.imwrite(str(output_file), frame) 1229 | 1230 | return f"{output_file.parent.name}/{output_file.name}" 1231 | 1232 | 1233 | def cleanup_text(text): 1234 | text = text.replace("|", "I") 1235 | return text 1236 | 1237 | 1238 | class Config: 1239 | def __init__(self, temp_folder, video_path): 1240 | self.lock = threading.Lock() 1241 | self.path = temp_folder / f"{Path(video_path).name}.conf.json" 1242 | if self.path.exists(): 1243 | self.config = json.loads(self.path.read_text()) 1244 | else: 1245 | self.config = {} 1246 | 1247 | def get_frame_diff(self): 1248 | with self.lock: 1249 | return self.config.get("frame_diff") 1250 | 1251 | def set_frame_diff(self, frame_diff): 1252 | with self.lock: 1253 | self.config["frame_diff"] = frame_diff 1254 | self._flush() 1255 | 1256 | def _flush(self): 1257 | self.path.write_text(json.dumps(self.config, indent=2)) 1258 | 1259 | def add_text_line(self, scan_mode, frame_range, progress, line=None): 1260 | with self.lock: 1261 | key = f"{scan_mode}-{frame_range[0]}-{frame_range[1]}" 1262 | if "progress" not in self.config: 1263 | self.config["progress"] = {} 1264 | self.config["progress"][key] = progress 1265 | if line: 1266 | self.config.setdefault("lines", []).append(line) 1267 | self._flush() 1268 | 1269 | def read_text_lines(self): 1270 | with self.lock: 1271 | return json.loads(self.path.read_text()).get("lines", []) 1272 | 1273 | 1274 | def pick_best_text(text_1, text_2): 1275 | t_text_1 = TextBlob(text_1) 1276 | t_text_2 = TextBlob(text_2) 1277 | 1278 | text_1_diff = normalized_damerau_levenshtein_distance(t_text_1, t_text_1.correct()) 1279 | text_2_diff = normalized_damerau_levenshtein_distance(t_text_2, t_text_2.correct()) 1280 | 1281 | if text_2_diff < text_1_diff: 1282 | return text_2 1283 | else: 1284 | return text_1 1285 | 1286 | 1287 | class FrameRangeParamType(click.ParamType): 1288 | name = "framerange" 1289 | 1290 | def convert(self, value, param, ctx): 1291 | value = value.split(":") 1292 | if len(value) != 2: 1293 | self.fail( 1294 | "Missing arguments, syntax is start_frame:end_frame - can be negative" 1295 | ) 1296 | 1297 | try: 1298 | start_frame = int(value[0]) 1299 | end_frame = int(value[1]) 1300 | except ValueError: 1301 | self.fail(f"Value is wrong type, must be int") 1302 | 1303 | return start_frame, end_frame 1304 | 1305 | 1306 | FRAME_RANGE = FrameRangeParamType() 1307 | 1308 | 1309 | @click.group() 1310 | @click.argument("subtitled-file", type=click.Path(exists=True), required=True) 
1311 | @click.argument("unsubtitled-file", type=click.Path(exists=True), required=True) 1312 | @click.option( 1313 | "--temp-folder", 1314 | type=click.Path(), 1315 | default="cow-temp", 1316 | help="Temp folder to store various files in.", 1317 | ) 1318 | @click.option( 1319 | "--subtitle-region-file", 1320 | type=click.Path(), 1321 | help="Subtitle region file.", 1322 | ) 1323 | @click.pass_context 1324 | def cli(ctx, subtitled_file, unsubtitled_file, temp_folder, subtitle_region_file): 1325 | temp_folder = Path(temp_folder) 1326 | temp_folder.mkdir(exist_ok=True) 1327 | 1328 | ctx.ensure_object(dict) 1329 | ctx.obj["subtitled_file"] = subtitled_file 1330 | ctx.obj["unsubtitled_file"] = unsubtitled_file 1331 | ctx.obj["temp_folder"] = temp_folder 1332 | ctx.obj["config"] = Config(temp_folder, subtitled_file) 1333 | if not subtitle_region_file: 1334 | subtitle_region_file = temp_folder / "subtitle_regions.json" 1335 | else: 1336 | subtitle_region_file = Path(subtitle_region_file) 1337 | 1338 | if not subtitle_region_file.exists(): 1339 | subtitle_region_file.write_text( 1340 | json.dumps( 1341 | [dataclasses.asdict(sr) for sr in default_subtitle_regions], indent=2 1342 | ) 1343 | ) 1344 | 1345 | ctx.obj["subtitle_regions"] = [ 1346 | SubtitleRegion(**sr) for sr in json.loads(subtitle_region_file.read_text()) 1347 | ] 1348 | 1349 | 1350 | @cli.command() 1351 | @click.option( 1352 | "--threads", 1353 | type=int, 1354 | default=3, 1355 | help="Number of threads to parse video with.", 1356 | ) 1357 | @click.option( 1358 | "--tesseract-data-path", 1359 | type=click.Path(exists=True), 1360 | required=False, 1361 | ) 1362 | @click.option( 1363 | "--frame-diff", 1364 | type=int, 1365 | help="Set a frame diff manually", 1366 | ) 1367 | @click.option( 1368 | "--frame-range", 1369 | type=FRAME_RANGE, 1370 | help="Specify frame range to use.", 1371 | ) 1372 | @click.option( 1373 | "--ignore-diff-fps", 1374 | is_flag=True, 1375 | help="Ignore that FPS differ.", 1376 | ) 1377 | @click.option( 1378 | "--run-subregions-in-parallel", 1379 | is_flag=True, 1380 | help="Run all the subtitle regions in parallel instead of one at a time.", 1381 | ) 1382 | @click.option( 1383 | "--fix-broken-frame-alignment", 1384 | is_flag=True, 1385 | help="Sometimes one of the videos are broken frame-wise, try a few frames to see if we find a good match.", 1386 | ) 1387 | @click.option( 1388 | "--debug-frame", 1389 | type=int, 1390 | help="Debug a frame, output an image to show debug information.", 1391 | ) 1392 | @click.option( 1393 | "--debug-subregion", 1394 | type=str, 1395 | help="Choose a specific subtitle region, use in combination with debug-frame.", 1396 | ) 1397 | @click.pass_context 1398 | def extract_subtitles( 1399 | ctx, 1400 | threads, 1401 | tesseract_data_path, 1402 | frame_diff, 1403 | frame_range, 1404 | ignore_diff_fps, 1405 | run_subregions_in_parallel, 1406 | fix_broken_frame_alignment, 1407 | debug_frame, 1408 | debug_subregion, 1409 | ): # subtitle_overwrite_region 1410 | subtitle_regions = ctx.obj["subtitle_regions"] 1411 | if tesseract_data_path is None: 1412 | tesseract_data_path = (Path(__file__).parent / "tess-data").absolute() 1413 | else: 1414 | tesseract_data_path = Path(tesseract_data_path).absolute() 1415 | tesseract_data_path = str(tesseract_data_path) 1416 | 1417 | config = ctx.obj["config"] 1418 | src_video = cv2.VideoCapture(ctx.obj["subtitled_file"]) 1419 | dst_video = cv2.VideoCapture(ctx.obj["unsubtitled_file"]) 1420 | 1421 | src_fps = fps = round(src_video.get(cv2.CAP_PROP_FPS), 3) 
1422 | dst_fps = round(dst_video.get(cv2.CAP_PROP_FPS), 3) 1423 | if not ignore_diff_fps and src_fps != dst_fps: 1424 | click.echo( 1425 | f"Source and destination FPS are differnet {src_fps=} {dst_fps=} - this is not supported right now - use --ignore-diff-fps to ignore this" 1426 | ) 1427 | quit(1) 1428 | 1429 | src_frame_count = src_video.get(cv2.CAP_PROP_FRAME_COUNT) 1430 | dst_frame_count = src_video.get(cv2.CAP_PROP_FRAME_COUNT) 1431 | max_frames = min( 1432 | int(src_video.get(cv2.CAP_PROP_FRAME_COUNT)), 1433 | int(dst_video.get(cv2.CAP_PROP_FRAME_COUNT)), 1434 | ) 1435 | 1436 | click.echo("Looking for video file frame difference") 1437 | if frame_diff is None: 1438 | frame_diff = config.get_frame_diff() 1439 | if frame_diff is None: # TODO: use DTW to find big diffs 1440 | click.echo("No frame diff found in cache, detecting...") 1441 | source_frame = int(max_frames * 0.6) 1442 | target_frame = source_frame 1443 | 1444 | good_breakpoint_diff = find_good_frame_breakpoint(src_video, source_frame) 1445 | frame_diff = estimate_video_frame_diff( 1446 | src_video, 1447 | dst_video, 1448 | source_frame + good_breakpoint_diff, 1449 | target_frame + good_breakpoint_diff, 1450 | ) 1451 | config.set_frame_diff(frame_diff) 1452 | click.echo(f"Using frame diff {frame_diff}") 1453 | 1454 | if debug_frame is not None: 1455 | debug_folder = ctx.obj["temp_folder"] / "debug" 1456 | debug_folder.mkdir(exist_ok=True) 1457 | start_frame_no = debug_frame 1458 | end_frame_no = start_frame_no + 1 1459 | if frame_diff < 0: 1460 | end_frame_no -= frame_diff 1461 | 1462 | img = get_frame(src_video, start_frame_no) 1463 | for sr in subtitle_regions: 1464 | cv2.rectangle(img, (sr.x, sr.y), (sr.x + sr.w, sr.y + sr.h), (0, 255, 0), 2) 1465 | cv2.putText( 1466 | img, 1467 | sr.name, 1468 | (20, sr.y + sr.h - 20), 1469 | cv2.FONT_HERSHEY_SIMPLEX, 1470 | 2, 1471 | (0, 255, 0), 1472 | 2, 1473 | cv2.LINE_AA, 1474 | ) 1475 | fn = debug_folder / f"subtitle_regions.png" 1476 | cv2.imwrite(str(fn.absolute()), img) 1477 | 1478 | if debug_subregion is not None: 1479 | sr = [sr for sr in subtitle_regions if sr.name == debug_subregion][0] 1480 | else: 1481 | sr = subtitle_regions[0] 1482 | 1483 | gather_images = [] 1484 | for result in loop_frames( 1485 | src_video, 1486 | dst_video, 1487 | start_frame_no, 1488 | end_frame_no, 1489 | frame_diff, 1490 | sr, 1491 | tesseract_data_path, 1492 | fix_broken_frame_alignment=fix_broken_frame_alignment, 1493 | gather_images=gather_images, 1494 | edge_threshold=sr.edge_threshold, 1495 | max_frames=max_frames, 1496 | ): 1497 | for i, (title, img) in enumerate(gather_images): 1498 | fn = debug_folder / f"{i:02}.png" 1499 | print(title, str(fn)) 1500 | if img is None or not img.shape: 1501 | print(f"Skipping image {title}") 1502 | continue 1503 | cv2.imwrite(str(fn.absolute()), img) 1504 | print(result) 1505 | break 1506 | else: 1507 | click.echo("Starting to extracting actual subtitles") 1508 | 1509 | def ocr_in_thread(start_frame_no, end_frame_no, frame_diff, sr): 1510 | src_video = cv2.VideoCapture(ctx.obj["subtitled_file"]) 1511 | dst_video = cv2.VideoCapture(ctx.obj["unsubtitled_file"]) 1512 | for result in loop_frames( 1513 | src_video, 1514 | dst_video, 1515 | start_frame_no, 1516 | end_frame_no, 1517 | frame_diff, 1518 | sr, 1519 | tesseract_data_path, 1520 | fix_broken_frame_alignment=fix_broken_frame_alignment, 1521 | gather_images=None, 1522 | edge_threshold=sr.edge_threshold, 1523 | max_frames=max_frames, 1524 | ): 1525 | config.add_text_line( 1526 | sr.scan_mode, 1527 | 
(start_frame_no, end_frame_no), 1528 | result.get("initial_frame_no", result.get("frame_no", 0)), 1529 | result, 1530 | ) 1531 | 1532 | max_workers = threads 1533 | if run_subregions_in_parallel: 1534 | max_workers *= len(subtitle_regions) 1535 | with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: 1536 | if frame_range is None: 1537 | frame_range = (0, max_frames) 1538 | 1539 | if frame_range[1] < 0: 1540 | frame_range = (frame_range[0], max_frames - frame_range[1]) 1541 | 1542 | frames_per_thread = math.ceil((frame_range[1] - frame_range[0]) / threads) 1543 | jobs = {} 1544 | for sr in subtitle_regions: 1545 | for i in range(threads): 1546 | start_frame_no = frame_range[0] + (frames_per_thread * i) 1547 | end_frame_no = frame_range[0] + ( 1548 | min(frames_per_thread * (i + 1), max_frames) 1549 | ) 1550 | jobs[ 1551 | executor.submit( 1552 | ocr_in_thread, start_frame_no, end_frame_no, frame_diff, sr 1553 | ) 1554 | ] = (sr, start_frame_no, end_frame_no) 1555 | 1556 | for future in concurrent.futures.as_completed(jobs): 1557 | sr, start_frame_no, end_frame_no = jobs[future] 1558 | try: 1559 | future.result() 1560 | except Exception as exc: 1561 | click.echo( 1562 | f"Failed job {sr.name} {start_frame_no=} {end_frame_no=}" 1563 | ) 1564 | traceback.print_exc() 1565 | # else: 1566 | # click.echo(f"Done with job {sr.name} {start_frame_no=} {end_frame_no=}") 1567 | click.echo(f"Done generating subtitle json from: {str(ctx.obj['subtitled_file'])}") 1568 | 1569 | 1570 | @cli.command() 1571 | @click.option( 1572 | "--output-report-path", 1573 | type=click.Path(exists=True), 1574 | required=False, 1575 | help="Folder to save reports to", 1576 | ) 1577 | @click.option( 1578 | "--output-subtitle-path", 1579 | type=click.Path(exists=True), 1580 | required=False, 1581 | help="Folder to save the subtitles to", 1582 | ) 1583 | @click.pass_context 1584 | def create_report(ctx, output_report_path, output_subtitle_path): 1585 | src_video = cv2.VideoCapture(ctx.obj["subtitled_file"]) 1586 | max_frames = src_video.get(cv2.CAP_PROP_FRAME_COUNT) - 1 1587 | fps = src_video.get(cv2.CAP_PROP_FPS) 1588 | env = jinja2.Environment() 1589 | env.filters["totimestamp"] = lambda frame_count: totimestamp(fps, frame_count) 1590 | if output_report_path is None: 1591 | output_report_path = ( 1592 | ctx.obj["temp_folder"] / f"{Path(ctx.obj['subtitled_file']).name}-report" 1593 | ) 1594 | else: 1595 | output_report_path = Path(output_report_path) 1596 | 1597 | output_report_path.mkdir(exist_ok=True) 1598 | 1599 | if output_subtitle_path is None: 1600 | output_subtitle_path = ( 1601 | ctx.obj["temp_folder"] 1602 | / f"{Path(ctx.obj['subtitled_file']).with_suffix('.ass').name}" 1603 | ) 1604 | else: 1605 | output_subtitle_path = Path(output_subtitle_path) 1606 | 1607 | config = ctx.obj["config"] 1608 | 1609 | def _get_frame(frame_no, position=None, region=None): 1610 | frame_no = min(max_frames, frame_no) 1611 | return save_frame_and_return_path( 1612 | output_report_path / "images", src_video, frame_no, position, region 1613 | ) 1614 | 1615 | env.globals["get_frame"] = _get_frame 1616 | 1617 | subs = pysubs2.SSAFile.from_string(BASE_ASS) 1618 | subs.info["PlayResX"] = int(src_video.get(cv2.CAP_PROP_FRAME_WIDTH)) 1619 | subs.info["PlayResY"] = int(src_video.get(cv2.CAP_PROP_FRAME_HEIGHT)) 1620 | 1621 | subtitle_regions = ctx.obj["subtitle_regions"] 1622 | 1623 | subtitle_region_data = {} 1624 | for sr in subtitle_regions: 1625 | subtitle_region_data[sr.name] = { 1626 | "subtitle_lines": [], 1627 | 
"subtitle_signs": [], 1628 | "missing_regions": [], 1629 | } 1630 | missing_regions, subtitles, subtitle_signs = ( 1631 | subtitle_region_data[sr.name]["missing_regions"], 1632 | subtitle_region_data[sr.name]["subtitle_lines"], 1633 | subtitle_region_data[sr.name]["subtitle_signs"], 1634 | ) 1635 | for l in config.read_text_lines(): 1636 | if l["region"] != sr.name: 1637 | continue 1638 | if "to_frame_no" in l: 1639 | l["to_frame_no"] = min(l["to_frame_no"], max_frames) 1640 | l["from_frame_no"] = min(l["to_frame_no"], l["from_frame_no"]) 1641 | if l["type"] == "subtitle": 1642 | subtitles.append(l) 1643 | elif l["type"] == "subtitle_sign": 1644 | subtitle_signs.append(l) 1645 | elif l["type"] == "missed_region": 1646 | missing_regions.append(l) 1647 | else: 1648 | click.echo(f"unknown type {l['type']}") 1649 | 1650 | missing_regions = sorted(missing_regions, key=lambda x: x["frame_no"]) 1651 | subtitles = sorted(subtitles, key=lambda x: x["from_frame_no"]) 1652 | subtitle_signs = sorted(subtitle_signs, key=lambda x: x["from_frame_no"]) 1653 | 1654 | missing_frames = set() 1655 | for missing_region in missing_regions: 1656 | missing_frames.add(missing_region["frame_no"]) 1657 | 1658 | cleaned_subtitles = [] 1659 | last_line = None 1660 | for line in subtitles: 1661 | line["subtitle_text"] = cleanup_text(line["subtitle_text"]) 1662 | if last_line is not None: 1663 | if ( 1664 | last_line["to_frame_no"] > line["from_frame_no"] - 3 1665 | and normalized_damerau_levenshtein_distance( 1666 | last_line["subtitle_text"], line["subtitle_text"] 1667 | ) 1668 | < 0.2 1669 | ): 1670 | # TODO: make sure all intersecting lines are removed too 1671 | last_line["from_frame_no"] = min( 1672 | last_line["from_frame_no"], line["from_frame_no"] 1673 | ) 1674 | last_line["to_frame_no"] = max( 1675 | last_line["to_frame_no"], line["to_frame_no"] 1676 | ) 1677 | last_line["subtitle_text"] = pick_best_text( 1678 | last_line["subtitle_text"], line["subtitle_text"] 1679 | ) 1680 | click.echo(f"Merging: {line} {last_line}") 1681 | continue 1682 | for frame_no in range(line["from_frame_no"], line["to_frame_no"] + 1): 1683 | if frame_no in missing_frames: 1684 | missing_frames.remove(frame_no) 1685 | click.echo(f"Frame {frame_no} found in missing frames") 1686 | cleaned_subtitles.append(line) 1687 | last_line = line 1688 | 1689 | if sr.scan_mode == SubtitleRegion.ScanMode.BOTTOM_CENTER: 1690 | short_subtitle_signs = [ 1691 | line 1692 | for line in cleaned_subtitles 1693 | if line["to_frame_no"] - line["from_frame_no"] <= 4 1694 | ] 1695 | cleaned_subtitles = [ 1696 | line 1697 | for line in cleaned_subtitles 1698 | if line["to_frame_no"] - line["from_frame_no"] > 4 1699 | ] 1700 | (output_report_path / f"{sr.name}-subtitles.html").write_text( 1701 | env.from_string(HTML_SUBTITLE_LINES).render( 1702 | subtitle_lines=cleaned_subtitles, sr=sr, title="CowOCR - Subtitles" 1703 | ) 1704 | ) 1705 | 1706 | for line in cleaned_subtitles: 1707 | subtitle_text = line["subtitle_text"].replace("\n", "\\N") 1708 | subs.append( 1709 | pysubs2.SSAEvent( 1710 | start=make_time( 1711 | frames=line["from_frame_no"], fps=fps, pos="start" 1712 | ), 1713 | end=make_time(frames=line["to_frame_no"], fps=fps, pos="end"), 1714 | text=subtitle_text, 1715 | style=sr.ass_style_name, 1716 | ) 1717 | ) 1718 | 1719 | if sr.scan_mode == SubtitleRegion.ScanMode.SEARCH_SLICE: 1720 | short_subtitle_signs = [ 1721 | subtitle_sign 1722 | for subtitle_sign in subtitle_signs 1723 | if subtitle_sign["to_frame_no"] - subtitle_sign["from_frame_no"] <= 10 
1724 | ] 1725 | missing_frames = [ 1726 | missing_region 1727 | for missing_region in missing_regions 1728 | if missing_region["frame_no"] in missing_frames 1729 | ] 1730 | (output_report_path / f"{sr.name}-missing-regions.html").write_text( 1731 | env.from_string(HTML_MISSING_REGIONS).render( 1732 | missing_regions=missing_frames, 1733 | sr=sr, 1734 | short_subtitle_signs=short_subtitle_signs, 1735 | title="CowOCR - Missing regions", 1736 | ) 1737 | ) 1738 | 1739 | if sr.scan_mode == SubtitleRegion.ScanMode.SEARCH_SLICE: 1740 | subtitle_signs = [ 1741 | subtitle_sign 1742 | for subtitle_sign in subtitle_signs 1743 | if subtitle_sign["to_frame_no"] - subtitle_sign["from_frame_no"] > 10 1744 | ] 1745 | for subtitle_sign in subtitle_signs: 1746 | subtitle_sign["subtitle_text"] = cleanup_text( 1747 | subtitle_sign["subtitle_text"] 1748 | ) 1749 | (output_report_path / f"{sr.name}-subtitle-signs.html").write_text( 1750 | env.from_string(HTML_SUBTITLE_SIGNS).render( 1751 | subtitle_signs=subtitle_signs, 1752 | sr=sr, 1753 | title="CowOCR - Subtitle Signs", 1754 | ) 1755 | ) 1756 | 1757 | for subtitle_sign in subtitle_signs: 1758 | subtitle_text = subtitle_sign["subtitle_text"].replace("\n", "\\N") 1759 | subtitle_text = ( 1760 | "{\pos(%s,%s)}" 1761 | % ( 1762 | int( 1763 | ( 1764 | subtitle_sign["position"][2] 1765 | + subtitle_sign["position"][3] 1766 | ) 1767 | / 2 1768 | ), 1769 | subtitle_sign["position"][1], 1770 | ) 1771 | ) + subtitle_text 1772 | subs.append( 1773 | pysubs2.SSAEvent( 1774 | start=make_time( 1775 | frames=subtitle_sign["from_frame_no"], fps=fps, pos="start" 1776 | ), 1777 | end=make_time( 1778 | frames=subtitle_sign["to_frame_no"], fps=fps, pos="end" 1779 | ), 1780 | text=subtitle_text, 1781 | style=sr.ass_style_name, 1782 | ) 1783 | ) 1784 | 1785 | (output_report_path / "index.html").write_text( 1786 | env.from_string(HTML_INDEX).render( 1787 | subtitle_regions=subtitle_regions, 1788 | subtitle_region_data=subtitle_region_data, 1789 | title="CowOCR - Index", 1790 | ) 1791 | ) 1792 | 1793 | subs.save(str(output_subtitle_path.absolute())) 1794 | 1795 | click.echo(f"Saved report to: {str(output_report_path)}") 1796 | click.echo(f"Saved subtitles to: {str(output_subtitle_path)}") 1797 | 1798 | 1799 | if __name__ == "__main__": 1800 | cli() 1801 | -------------------------------------------------------------------------------- /milksync.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import json 3 | import logging 4 | import math 5 | import os 6 | import pickle 7 | import shlex 8 | import shutil 9 | import subprocess 10 | import tempfile 11 | import uuid 12 | import warnings 13 | from collections import Counter 14 | from concurrent.futures import ThreadPoolExecutor 15 | from decimal import Decimal 16 | from pathlib import Path 17 | from re import sub 18 | 19 | import click 20 | import cv2 21 | import ffmpeg 22 | import librosa 23 | import numpy as np 24 | import pysubs2 25 | from annoy import AnnoyIndex 26 | from numpy.lib.stride_tricks import sliding_window_view 27 | from scipy.spatial.distance import cdist, cosine, pdist 28 | from skimage.metrics import structural_similarity 29 | 30 | logger = logging.getLogger(__name__) 31 | 32 | # HOP_LENGTH = 1024 33 | # HOP_LENGTH = 512 34 | HOP_LENGTH = None 35 | 36 | 37 | class Video: 38 | _cv2_video = None 39 | _cv2_video_info = None 40 | _ffmpeg_probe = None 41 | 42 | def __init__(self, filepath): 43 | self.filepath = filepath 44 | 45 | @property 46 | def 
video_capture(self): 47 | if not self._cv2_video: 48 | self._cv2_video = cv2.VideoCapture(self.filepath) 49 | 50 | return self._cv2_video 51 | 52 | @property 53 | def video_info(self): 54 | if not self._cv2_video_info: 55 | cap = self.video_capture 56 | self._cv2_video_info = { 57 | "fps": cap.get(cv2.CAP_PROP_FPS), 58 | "frame_count": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 59 | } 60 | self._cv2_video_info["duration"] = Decimal( 61 | self._cv2_video_info["frame_count"] 62 | ) / Decimal(self._cv2_video_info["fps"]) 63 | 64 | return self._cv2_video_info 65 | 66 | @property 67 | def probe(self): 68 | if self._ffmpeg_probe is None: 69 | self._ffmpeg_probe = ffmpeg.probe(self.filepath) 70 | return self._ffmpeg_probe 71 | 72 | @property 73 | def subtitle_streams(self): 74 | return [s for s in self.probe["streams"] if s["codec_type"] == "subtitle"] 75 | 76 | @property 77 | def audio_streams(self): 78 | return [s for s in self.probe["streams"] if s["codec_type"] == "audio"] 79 | 80 | @property 81 | def chapters(self): 82 | chapters = ( 83 | json.loads( 84 | subprocess.check_output( 85 | [ 86 | "ffprobe", 87 | "-loglevel", 88 | "error", 89 | "-hide_banner", 90 | "-of", 91 | "json", 92 | "-show_chapters", 93 | self.filepath, 94 | ], 95 | stdin=subprocess.PIPE, 96 | stderr=subprocess.DEVNULL, 97 | ) 98 | ) 99 | or {} 100 | ) 101 | return chapters.get("chapters", []) 102 | 103 | def create_ffmpeg(self): 104 | return ffmpeg.input(self.filepath) 105 | 106 | def extract_subtitle_metadata(self, track_id): 107 | stream = self.subtitle_streams[track_id] 108 | metadata = [] 109 | for k, v in stream.get("tags", {}).items(): 110 | if k in ["language", "title"]: 111 | metadata.append((k, v)) 112 | return metadata 113 | 114 | def extract_audio_metadata(self, track_id): 115 | stream = self.audio_streams[track_id] 116 | metadata = [] 117 | for k, v in stream.get("tags", {}).items(): 118 | if k in ["language", "title"]: 119 | metadata.append((k, v)) 120 | return metadata 121 | 122 | 123 | def estimate_audio_shift_points_from_subtitles( 124 | x_1_chroma, 125 | x_2_chroma, 126 | fs, 127 | video_file, 128 | track_id, 129 | n_chroma, 130 | adjust_delay=None, 131 | framerate_align=None, 132 | max_ms_cutoff=110, 133 | external_subtitle_file=None, 134 | ): 135 | # subtitle, subtitle_format = extract_subtitle_data(video_file, track_id, framerate_align) 136 | if external_subtitle_file is not None: 137 | subtitle, subtitle_format = import_subtitle_data( 138 | external_subtitle_file, framerate_align 139 | ) 140 | else: 141 | subtitle, subtitle_format = extract_subtitle_data( 142 | video_file, track_id, framerate_align 143 | ) 144 | index = AnnoyIndex(n_chroma, "euclidean") 145 | for i, c in enumerate(x_2_chroma): 146 | index.add_item(i, c) 147 | index.build(10) 148 | 149 | def align_subtitle(subtitle_line, min_datapoint_percent=0.2, min_avg=0.05, n=10): 150 | start_i = int(subtitle_line.start * fs / HOP_LENGTH / 1000) 151 | end_i = int(subtitle_line.end * fs / HOP_LENGTH / 1000) 152 | if end_i >= len(x_1_chroma): 153 | return [] 154 | found_indexes = {} 155 | for i in range(end_i - start_i): 156 | source_vector = list(x_1_chroma[start_i + i]) 157 | for vector_index in index.get_nns_by_vector(source_vector, n): 158 | target_vector = index.get_item_vector(vector_index) 159 | found_indexes.setdefault(vector_index - i, []).append( 160 | cosine(source_vector, target_vector) 161 | ) 162 | candidates = [] 163 | for k, v in sorted( 164 | found_indexes.items(), key=lambda x: len(x[1]), reverse=True 165 | ): 166 | if len(v) < (end_i - 
start_i) * min_datapoint_percent: 167 | v = v + found_indexes.get(k + 1, []) + found_indexes.get(k - 1, []) 168 | if len(v) < (end_i - start_i) * min_datapoint_percent: 169 | break 170 | if np.average(v) > min_avg: 171 | continue 172 | 173 | candidates.append(k) 174 | return candidates 175 | 176 | subtitle_matches = [] 177 | for i, s in enumerate(subtitle): 178 | candidates = align_subtitle(s) 179 | if candidates: 180 | subtitle_matches.append( 181 | [ 182 | int( 183 | librosa.frames_to_time(candidate, sr=fs, hop_length=HOP_LENGTH) 184 | * 1000 185 | ) 186 | - s.start 187 | for candidate in candidates 188 | ] 189 | ) 190 | else: 191 | subtitle_matches.append(None) 192 | 193 | def generate_best_chains(subtitle_matches): 194 | subtitle_groups = [] 195 | for i, subtitle_match in enumerate(subtitle_matches): 196 | if subtitle_match is None: 197 | continue 198 | ts_diff = subtitle_match[0] 199 | if ( 200 | not subtitle_groups 201 | or abs( 202 | ts_diff - np.median([t for (t, _, _) in subtitle_groups[-1][:20]]) 203 | ) 204 | > max_ms_cutoff 205 | ): 206 | print(f"Creating new group with {ts_diff=}") 207 | subtitle_groups.append([(ts_diff, i, subtitle_match)]) 208 | else: 209 | subtitle_groups[-1].append((ts_diff, i, subtitle_match)) 210 | 211 | group_size_cutoff = 5 212 | has_modified = True 213 | while has_modified: 214 | has_modified = False 215 | previous_subtitle_group = None 216 | for i, subtitle_group in enumerate(subtitle_groups): 217 | if len(subtitle_group) > group_size_cutoff: 218 | ts_diff = np.median([t for (t, _, _) in subtitle_group[:20]]) 219 | if i >= len(subtitle_groups): 220 | next_subtitle_group = subtitle_groups[i + 1] 221 | else: 222 | next_subtitle_group = None 223 | if previous_subtitle_group is not None: 224 | for entry in list(previous_subtitle_group[::-1]): 225 | matched_ts = sorted( 226 | [(abs(ts - ts_diff), ts) for ts in entry[2]] 227 | ) 228 | if matched_ts[0][0] <= max_ms_cutoff: 229 | print(f"Moving entry forward {entry}") 230 | previous_subtitle_group.pop(-1) 231 | subtitle_group.insert( 232 | 0, (matched_ts[0][1], entry[1], entry[2]) 233 | ) 234 | has_modified = True 235 | else: 236 | print(f"Breaking at {entry}") 237 | break 238 | if next_subtitle_group is not None: 239 | for entry in list(next_subtitle_group): 240 | matched_ts = sorted( 241 | [(abs(ts - ts_diff), ts) for ts in entry[2]] 242 | ) 243 | if matched_ts[0][0] <= max_ms_cutoff: 244 | print(f"Moving entry back {entry}") 245 | next_subtitle_group.pop(0) 246 | subtitle_group.append( 247 | (matched_ts[0][1], entry[1], entry[2]) 248 | ) 249 | has_modified = True 250 | else: 251 | print(f"Breaking at back {entry}") 252 | break 253 | 254 | previous_subtitle_group = None 255 | else: 256 | previous_subtitle_group = subtitle_group 257 | 258 | to_remove_groups = [] 259 | for i, subtitle_group in enumerate(subtitle_groups): 260 | if len(subtitle_group) == 0: 261 | to_remove_groups.append(i) 262 | 263 | for i in to_remove_groups[::-1]: 264 | print(f"Removing group {i}") 265 | del subtitle_groups[i] 266 | 267 | has_modified = True 268 | while has_modified: 269 | has_modified = False 270 | to_remove_groups = [] 271 | for i, subtitle_group in enumerate(subtitle_groups): 272 | if len(subtitle_group) <= 2: 273 | to_remove_groups.append(i) 274 | 275 | for i in to_remove_groups[::-1]: 276 | del subtitle_groups[i] 277 | print(f"Deleting {i}") 278 | has_modified = True 279 | 280 | for i, subtitle_group in enumerate(subtitle_groups): 281 | if not subtitle_group or i == 0: 282 | continue 283 | previous_subtitle_group = 
subtitle_groups[i - 1] 284 | previous_ts_diff = np.median( 285 | [t for (t, _, _) in previous_subtitle_group] 286 | ) 287 | ts_diff = np.median([t for (t, _, _) in subtitle_group]) 288 | if abs(previous_ts_diff - ts_diff) < max_ms_cutoff: 289 | print(f"Merging into {i}") 290 | subtitle_groups[i] = previous_subtitle_group + subtitle_groups[i] 291 | subtitle_groups[i - 1] = [] 292 | has_modified = True 293 | 294 | return subtitle_groups 295 | 296 | audio_shift_points, sync_buckets, delete_buckets = [], [], [] 297 | 298 | previous_start_timestamp, previous_end_timestamp = None, None 299 | for subtitle_group in generate_best_chains(subtitle_matches): 300 | delta = np.median([t for (t, _, _) in subtitle_group if t is not None]) / 1000 301 | subtitle_group_indexes = set([i for (_, i, _) in subtitle_group]) 302 | timestamps = [] 303 | for i, subtitle_line in enumerate(subtitle): 304 | if i not in subtitle_group_indexes: 305 | continue 306 | timestamps += [subtitle_line.start, subtitle_line.end] 307 | start_timestamp = min(timestamps) / 1000 308 | end_timestamp = max(timestamps) / 1000 309 | 310 | slice_buffer_length = 51200 // HOP_LENGTH 311 | x_1_start_i = int((start_timestamp * fs) / HOP_LENGTH) 312 | x_1_end_i = ( 313 | int((end_timestamp * fs) / HOP_LENGTH) 314 | - slice_buffer_length 315 | - slice_buffer_length 316 | ) 317 | min_slice_length = 204800 // HOP_LENGTH 318 | if x_1_end_i - x_1_start_i > min_slice_length: 319 | x_2_start_i = int(((start_timestamp + delta) * fs) / HOP_LENGTH) 320 | x_2_end_i = int(((end_timestamp + delta) * fs) / HOP_LENGTH) 321 | 322 | x_1_chroma_slice = x_1_chroma[ 323 | x_1_start_i 324 | + slice_buffer_length : x_1_start_i 325 | + slice_buffer_length 326 | + min_slice_length 327 | ] 328 | x_2_chroma_slice = x_2_chroma[ 329 | x_2_start_i : x_2_start_i 330 | + min_slice_length 331 | + slice_buffer_length 332 | + slice_buffer_length 333 | ] 334 | 335 | C = cdist(x_1_chroma_slice, x_2_chroma_slice, metric="cosine") 336 | C = np.nan_to_num(C, copy=False) 337 | 338 | smallest_i, smallest_value = None, None 339 | for i in range(len(x_2_chroma_slice) - len(x_1_chroma_slice)): 340 | cost_diagonal = np.flip(np.diagonal(C, offset=i)) 341 | total_cost = np.sum(cost_diagonal) 342 | if smallest_value is None or smallest_value > total_cost: 343 | smallest_value = total_cost 344 | smallest_i = i 345 | print(f"Additional buffer change: {smallest_i - slice_buffer_length}") 346 | delta = librosa.frames_to_time( 347 | [(x_2_start_i - x_1_start_i) + (smallest_i - slice_buffer_length)], 348 | sr=fs, 349 | hop_length=HOP_LENGTH, 350 | )[0] 351 | # delta += librosa.frames_to_time([smallest_i - slice_buffer_length], sr=fs, hop_length=HOP_LENGTH)[0] 352 | delta += adjust_delay or 0 353 | 354 | print(f"sync points {start_timestamp} {end_timestamp} {delta=}") 355 | if not audio_shift_points and start_timestamp > 0 and delta > 0: 356 | audio_shift_points.append((0.0, delta, delta)) 357 | delete_buckets.append((0, start_timestamp - 0.001)) 358 | 359 | if previous_end_timestamp is not None: 360 | audio_shift_points.append( 361 | ( 362 | previous_end_timestamp + 0.01 + 100_000_000, 363 | previous_end_timestamp + 0.01, 364 | -100_000_000, 365 | ) 366 | ) 367 | delete_buckets.append( 368 | (previous_end_timestamp + 0.001, start_timestamp - 0.001) 369 | ) 370 | 371 | sync_buckets.append((start_timestamp, end_timestamp, delta)) 372 | audio_shift_points.append( 373 | (start_timestamp - 0.01 + delta, start_timestamp - 0.01, delta) 374 | ) 375 | previous_start_timestamp, previous_end_timestamp = ( 
376 | start_timestamp, 377 | end_timestamp, 378 | ) 379 | if previous_end_timestamp is not None: 380 | delete_buckets.append((previous_end_timestamp + 0.001, 100_000)) 381 | 382 | return audio_shift_points, sync_buckets, delete_buckets 383 | 384 | 385 | def estimate_audio_shift_points( 386 | x_1_chroma, 387 | x_2_chroma, 388 | fs, 389 | max_cost_matrix_size=200_000_000, 390 | only_delta=False, 391 | adjust_delay=None, 392 | sliding_window_size=300, 393 | ): 394 | expected_matrix_size = len(x_1_chroma) * len(x_2_chroma) 395 | 396 | print(f"Expected cost-matrix size is {expected_matrix_size=}") 397 | 398 | if expected_matrix_size > max_cost_matrix_size: 399 | print( 400 | f"Since our cost-matrix is bigger than max allowed cost {max_cost_matrix_size=} we will slice it." 401 | ) 402 | chroma_slice_size = int(math.sqrt(max_cost_matrix_size)) 403 | chroma_slice_step = int(chroma_slice_size * 0.8) 404 | else: 405 | print("Memory can fully fit our cost-matrix") 406 | chroma_slice_size = 100_000_000 407 | chroma_slice_step = 100_000_000 408 | 409 | all_diffs = None 410 | all_timestamps = None 411 | 412 | for i in range( 413 | 0, max([len(x_1_chroma), len(x_2_chroma)]), chroma_slice_step 414 | ): # TODO 415 | start_i = max( 416 | min( 417 | len(x_1_chroma) - chroma_slice_step, 418 | len(x_2_chroma) - chroma_slice_step, 419 | i, 420 | ), 421 | 0, 422 | ) 423 | 424 | x_1_chroma_slice = x_1_chroma[start_i : start_i + chroma_slice_size] 425 | x_2_chroma_slice = x_2_chroma[start_i : start_i + chroma_slice_size] 426 | if start_i: 427 | wp_offset = librosa.frames_to_time([start_i], sr=fs, hop_length=HOP_LENGTH)[ 428 | 0 429 | ] 430 | else: 431 | wp_offset = 0 432 | print( 433 | f"Doing chroma slices x1={len(x_1_chroma_slice)} x2={len(x_2_chroma_slice)} {wp_offset=}" 434 | ) 435 | 436 | C = cdist(x_2_chroma_slice, x_1_chroma_slice, metric="cosine") 437 | C = np.nan_to_num(C, copy=False) 438 | D, wp = librosa.sequence.dtw(C=C) 439 | wp_s = np.flip( 440 | librosa.frames_to_time(wp, sr=fs, hop_length=HOP_LENGTH) + wp_offset, axis=0 441 | ) 442 | 443 | diffs = [] 444 | timestamps = [] 445 | 446 | t1_already_seen = set() 447 | t2_already_seen = set() 448 | 449 | for t1, t2 in wp_s: 450 | should_skip = t1 in t1_already_seen or t2 in t2_already_seen 451 | t1_already_seen.add(t1) 452 | t2_already_seen.add(t2) 453 | if should_skip: 454 | continue 455 | 456 | diff = np.round(t1 - t2, 3) 457 | diffs.append(diff) 458 | timestamps.append((t1, t2)) 459 | 460 | if all_timestamps: 461 | at1, at2 = all_timestamps[-1] 462 | t1, t2 = timestamps[0] 463 | 464 | cutoff_t = t1 + ((at1 - t1) / 2) 465 | for timestamp_index, (t1, t2) in enumerate(timestamps): 466 | if t1 > cutoff_t: 467 | break 468 | for all_timestamp_index, (t1, t2) in enumerate(all_timestamps[::-1]): 469 | if t1 <= cutoff_t: 470 | break 471 | all_timestamps = ( 472 | all_timestamps[:-all_timestamp_index] + timestamps[timestamp_index:] 473 | ) 474 | all_diffs = all_diffs[:-all_timestamp_index] + diffs[timestamp_index:] 475 | else: 476 | all_timestamps = timestamps 477 | all_diffs = diffs 478 | 479 | if only_delta: 480 | min_abs_diff = 0.03 481 | else: 482 | min_abs_diff = 0.06 483 | # sliding_window_size = 300 TODO: make it change near the end to detect changes while keeping it higher before. 
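    # Sketch of what follows (descriptive note, not part of the original run
    # output): all_diffs holds the offsets (t1 - t2, target minus source, in
    # seconds) recovered along the DTW warping path. A window of
    # sliding_window_size samples slides over it and the most common value in
    # each window is taken as the locally dominant offset; a new shift point
    # (t1, t2, offset) is recorded whenever that dominant offset moves by more
    # than min_abs_diff. With illustrative (assumed) values, a diff stream like
    # [0.50, 0.50, 0.50, 2.10, 2.10, 2.10] would yield one shift point at
    # 0.50 s and a second one where 2.10 s takes over.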
484 | shift_points = [] 485 | last_most_common = None 486 | for i, v in enumerate( 487 | sliding_window_view(all_diffs, window_shape=sliding_window_size) 488 | ): 489 | most_common, most_common_count = Counter(v).most_common(1)[0] 490 | if ( 491 | last_most_common is None 492 | or abs(most_common - last_most_common) > min_abs_diff 493 | ): 494 | j = list(v).index(most_common) 495 | t1, t2 = all_timestamps[i + j] 496 | if adjust_delay: 497 | adjusted_most_common = most_common + adjust_delay 498 | else: 499 | adjusted_most_common = most_common 500 | shift_points.append((t1, t2, adjusted_most_common)) 501 | last_most_common = most_common 502 | print( 503 | f"Found sync point source_timestamp={t2} target_timestamp={t1} delta={most_common} delta_count={most_common_count} delta_average={np.average(v)} delta_median={np.median(v)}" 504 | ) 505 | 506 | new_shift_points = [] 507 | is_first_point = True 508 | for t1, t2, delta in shift_points: 509 | x_2_compare_point = int((t1 * fs) / HOP_LENGTH) 510 | x_1_compare_point = int((t2 * fs) / HOP_LENGTH) 511 | 512 | step_back = min( 513 | int((max(abs(delta * 0.3), 1) * fs) / HOP_LENGTH), 514 | x_1_compare_point, 515 | x_2_compare_point, 516 | ) # TODO: do not float into previous shift point 517 | range_end = min( 518 | int((5 * fs) / HOP_LENGTH), 519 | len(x_2_chroma) - x_2_compare_point, 520 | len(x_1_chroma) - x_1_compare_point, 521 | ) 522 | 523 | C = cdist( 524 | x_2_chroma[x_2_compare_point - step_back : x_2_compare_point + range_end], 525 | x_1_chroma[x_1_compare_point - step_back : x_1_compare_point + range_end], 526 | metric="cosine", 527 | ) 528 | C = np.nan_to_num(C, copy=False) 529 | cost_diagonal = np.flip(np.diagonal(C)) 530 | print( 531 | f"Trying to align range source_timestamp={t2} target_timestamp={t1} {delta=} {step_back=} {range_end=} {x_1_compare_point=} {x_2_compare_point=}" 532 | ) 533 | max_cost = np.max(cost_diagonal[:range_end]) * 1.15 534 | print(f"{max_cost=}") 535 | for i, cost in enumerate(cost_diagonal): 536 | if cost > max_cost: 537 | print(f"Found breakpoint at additional delta {i - range_end}") 538 | seconds = ((i - range_end) * HOP_LENGTH) / fs 539 | new_shift_points.append((t1 - seconds, t2 - seconds, delta)) 540 | break 541 | else: 542 | if is_first_point: 543 | new_shift_points.append((t1 - min(t1, t2), t2 - min(t1, t2), delta)) 544 | print( 545 | "No breakpoint found, moving all the way back (if this is the first point)" 546 | ) 547 | else: 548 | new_shift_points.append((t1, t2, delta)) 549 | print( 550 | "No breakpoint found, assume cost problems et.al. and just adding the current" 551 | ) 552 | 553 | is_first_point = False 554 | 555 | zero_shifting_delta = 8.0 556 | t1, t2, delta = new_shift_points[0] 557 | if 0 < t1 < zero_shifting_delta or 0 < t2 < zero_shifting_delta: 558 | print(f"Zero-shifting initial delta {t1=} {t2=} {delta}") 559 | shift_delta = min(t1, t2) 560 | t1 -= shift_delta 561 | t2 -= shift_delta 562 | new_shift_points[0] = (t1, t2, delta) 563 | 564 | return new_shift_points 565 | 566 | 567 | class TrackMappingParamType(click.ParamType): 568 | name = "trackmapping" 569 | 570 | def convert(self, value, param, ctx): 571 | converted_mapping = [] 572 | for mapping in value.split(","): 573 | try: 574 | mapping = [int(v) for v in mapping.split(":")] 575 | except ValueError: 576 | self.fail( 577 | "Failed to convert mappings to numbers. Syntax is file:track e.g. 0:1" 578 | ) 579 | 580 | if len(mapping) != 2: 581 | self.fail("Mappings must be 2 long. Syntax is file:track e.g. 
0:1") 582 | 583 | converted_mapping.append(mapping) 584 | return converted_mapping 585 | 586 | 587 | TRACK_MAPPING = TrackMappingParamType() 588 | 589 | 590 | class MetadataMappingParamType(click.ParamType): 591 | name = "metadatamapping" 592 | 593 | def convert(self, value, param, ctx): 594 | value = value.split("=", 2) 595 | if len(value) != 3: 596 | self.fail("Missing arguments, syntax is track_id=key=value") 597 | 598 | track_id, key, value = value 599 | 600 | try: 601 | track_id = int(track_id) 602 | except ValueError: 603 | self.fail("Track ID must be an ID") 604 | 605 | return (track_id, key, value) 606 | 607 | 608 | METADATA_MAPPING = MetadataMappingParamType() 609 | 610 | 611 | class IntegerRangeParamType(click.ParamType): 612 | name = "integerrange" 613 | 614 | def convert(self, value, param, ctx): 615 | value = value.split("-") 616 | if len(value) != 2: 617 | self.fail("Missing arguments, syntax is start_second-end_second") 618 | 619 | start_second, end_second = value 620 | 621 | try: 622 | start_second, end_second = int(start_second), int(end_second) 623 | except ValueError: 624 | self.fail("Range must be integers") 625 | 626 | return (start_second, end_second) 627 | 628 | 629 | INTEGER_RANGE = IntegerRangeParamType() 630 | 631 | 632 | def generate_chroma_cqt( 633 | source_file, 634 | audio_track_id, 635 | target_file, 636 | n_chroma=12, 637 | framerate_align=None, 638 | preserve_silence=False, 639 | ): 640 | target_wav_file = target_file.with_suffix(".wav") 641 | 642 | cmd = [ 643 | "ffmpeg", 644 | "-y", 645 | "-i", 646 | str(source_file), 647 | "-map", 648 | f"0:a:{audio_track_id}", 649 | "-ar", 650 | "22050", 651 | ] 652 | 653 | # -filter:a "atempo=2.0" 654 | if framerate_align: 655 | cmd += [ 656 | "-filter:a", 657 | f"atempo={framerate_align[1] / framerate_align[0]}", 658 | ] 659 | 660 | cmd += [str(target_wav_file)] 661 | 662 | subprocess.check_call( 663 | cmd, 664 | stdout=subprocess.DEVNULL, 665 | stderr=subprocess.DEVNULL, 666 | stdin=subprocess.PIPE, 667 | ) 668 | 669 | src, fs = librosa.load(str(target_wav_file)) 670 | y = src 671 | if not preserve_silence: 672 | _, (ltrim, rtrim) = librosa.effects.trim(src) 673 | print( 674 | f"Trimming {(len(src) - rtrim) / fs}s silence from end from {source_file.name}" 675 | ) 676 | y = y[:rtrim] 677 | chroma = librosa.feature.chroma_cqt( 678 | y=y, sr=fs, hop_length=HOP_LENGTH, n_chroma=n_chroma 679 | ).T 680 | target_file.write_bytes(pickle.dumps((chroma, fs))) 681 | target_wav_file.unlink() 682 | return chroma, fs 683 | 684 | 685 | def find_and_align_chapter(x_1_chroma, x_2_chroma, fs, min_match_value=60): 686 | """Find x_1_chroma in x_2_chroma""" 687 | best_equals = 0 688 | C = cdist(x_1_chroma[42:], x_2_chroma, metric="cosine") 689 | C = np.nan_to_num(C, copy=False) 690 | smallest_sum = None 691 | location = None 692 | smallest_found = 9999999 693 | for i, v in enumerate( 694 | sliding_window_view(C.T, window_shape=len(x_1_chroma[42:]), axis=0) 695 | ): 696 | s = np.sum(np.diagonal(v)) 697 | if smallest_found > s: 698 | # print(s) 699 | smallest_found = s 700 | if s > min_match_value: 701 | continue 702 | if smallest_sum is None or s < smallest_sum: 703 | smallest_sum = s 704 | location = i 705 | 706 | if smallest_sum is None: 707 | return None 708 | 709 | return librosa.frames_to_time( 710 | [location, location + len(x_1_chroma)], sr=fs, hop_length=HOP_LENGTH 711 | ) 712 | 713 | 714 | def humanize_seconds(s): 715 | m, s = divmod(s, 60) 716 | h, m = divmod(m, 60) 717 | return f"{int(h):02}:{int(m):02}:{(s):06.3f}" 718 | 719 | 
720 | def turn_audio_shift_points_to_audio_segments(audio_shift_points): 721 | sync_buckets = [] 722 | delete_buckets = [] 723 | 724 | if len(audio_shift_points) > 1: 725 | for i in range(len(audio_shift_points)): 726 | local_audio_shift_points = audio_shift_points[i : i + 2] 727 | if len(local_audio_shift_points) > 1: 728 | (st1, st2, sdelta), (et1, et2, edelta) = local_audio_shift_points 729 | 730 | from_delete_time = (et2 + edelta) - sdelta 731 | # from_delete_delay = max(0.0, et2 - from_delete_time) 732 | if from_delete_time < et2: 733 | delete_buckets.append((from_delete_time, et2)) 734 | 735 | # sync_buckets.append(((st2 - min(from_delete_delay, to_delete_max_reuse)), st2 + (et1 - st1), sdelta)) 736 | sync_buckets.append( 737 | (st2, min(st2 + (et1 - st1), et2), sdelta) 738 | ) # TODO: make sure the et2 stuff is correct 739 | 740 | t1, t2, delta = audio_shift_points[-1] 741 | # sync_buckets.append((t2 - min(from_delete_delay, to_delete_max_reuse), 1_000_000, delta)) 742 | sync_buckets.append((t2, 1_000_000, delta)) 743 | print(f"Delete buckets {delete_buckets}") 744 | 745 | return sync_buckets, delete_buckets 746 | 747 | 748 | def find_good_frame_breakpoint(video, current_frame): # do binary search instead? 749 | compare_frame_size = (32, 32) 750 | frame_cache = {} 751 | 752 | def get_frame(frame_no): 753 | if frame_no not in frame_cache: 754 | video.set(cv2.CAP_PROP_POS_FRAMES, frame_no) 755 | frame_cache[frame_no] = cv2.cvtColor( 756 | cv2.resize(video.read()[1], compare_frame_size), cv2.COLOR_BGR2GRAY 757 | ) 758 | return frame_cache[frame_no] 759 | 760 | best_score = 1.0 761 | best_frame = current_frame 762 | for frame_no in frame_generator(current_frame): 763 | score = structural_similarity(get_frame(frame_no), get_frame(frame_no + 1)) 764 | if score < best_score: 765 | best_score = score 766 | best_frame = frame_no 767 | if score < 0.65: 768 | return frame_no - current_frame 769 | return best_frame - current_frame 770 | 771 | 772 | def frame_generator(start_i): 773 | for i in range(1, 300): 774 | if i >= start_i: 775 | continue 776 | yield start_i + i 777 | yield start_i - i 778 | 779 | 780 | def estimate_frame_diff( 781 | source_video, target_video, current_source_frame, current_target_frame 782 | ): 783 | frame_index_size = (64, 64) 784 | compare_frame_count = 5 785 | spread_frame_count = 14 786 | ret, source_frame = source_video.read() 787 | ret, target_frame = target_video.read() 788 | 789 | sy, sx, sz = source_frame.shape 790 | ty, tx, tz = target_frame.shape 791 | 792 | s_aspect = sx / sy 793 | t_aspect = tx / ty 794 | 795 | source_frames = [] 796 | target_frames = [] 797 | 798 | source_from_frame = current_source_frame - (compare_frame_count // 2) 799 | source_to_frame = source_from_frame + compare_frame_count 800 | 801 | source_video.set(cv2.CAP_PROP_POS_FRAMES, source_from_frame) 802 | for _ in range(source_from_frame, source_to_frame): 803 | frame_no = source_video.get(cv2.CAP_PROP_POS_FRAMES) 804 | source_frames.append( 805 | ( 806 | cv2.cvtColor( 807 | cv2.resize(source_video.read()[1], frame_index_size), 808 | cv2.COLOR_BGR2GRAY, 809 | ), 810 | frame_no, 811 | ) 812 | ) 813 | 814 | target_from_frame = current_target_frame - spread_frame_count 815 | target_to_frame = target_from_frame + (spread_frame_count * 2) 816 | 817 | target_video.set(cv2.CAP_PROP_POS_FRAMES, target_from_frame) 818 | for _ in range(target_from_frame, target_to_frame): 819 | frame_no = target_video.get(cv2.CAP_PROP_POS_FRAMES) 820 | target_frame = target_video.read()[1] 821 | if s_aspect > 
t_aspect: 822 | new_tx = (tx / sx) * sy 823 | slice_each_x = int((tx - new_tx) / 2) 824 | target_frame = target_frame[0:ty, slice_each_x : (tx - slice_each_x)] 825 | 826 | target_frame = cv2.resize(target_frame, frame_index_size) 827 | target_frames.append((cv2.cvtColor(target_frame, cv2.COLOR_BGR2GRAY), frame_no)) 828 | best_diff = 0 829 | best_frame_diff = None 830 | for i in range(len(target_frames) - len(source_frames)): 831 | v = target_frames[i : i + len(source_frames)] 832 | diffs = [] 833 | frame_nos = [] 834 | for sf, tf in zip(source_frames, v): 835 | sf, sfn = sf 836 | tf, tfn = tf 837 | frame_nos.append((sfn, tfn)) 838 | diffs.append(structural_similarity(sf, tf, multichannel=False)) 839 | diffs = np.square(np.array(diffs) * 100) 840 | if sum(diffs) > best_diff: 841 | best_diff = sum(diffs) 842 | best_frame_diff = (target_from_frame + i) - source_from_frame 843 | 844 | return best_frame_diff 845 | 846 | 847 | def frame_align_video(source_video, target_video, line_start, delta): 848 | print(f"Frame aligning video at {line_start=} {delta=}") 849 | source_frame_no = math.ceil((line_start * source_video.video_info["fps"]) / 1000) 850 | target_frame_no = math.ceil( 851 | ((line_start + delta) * target_video.video_info["fps"]) / 1000 852 | ) 853 | 854 | good_breakpoint = find_good_frame_breakpoint( 855 | source_video.video_capture, source_frame_no 856 | ) 857 | 858 | frame_diff = estimate_frame_diff( 859 | source_video.video_capture, 860 | target_video.video_capture, 861 | source_frame_no + good_breakpoint, 862 | target_frame_no + good_breakpoint, 863 | ) 864 | # frame_diff_delta = math.ceil((frame_diff / target_video.video_info["fps"]) * 1000) 865 | best_target_frame = source_frame_no + frame_diff 866 | best_target_frame_time = math.ceil( 867 | best_target_frame * 1000 / target_video.video_info["fps"] 868 | ) 869 | # actual_delta = line.start + frame_diff_delta 870 | actual_delta = best_target_frame_time - line_start 871 | if actual_delta != delta: 872 | print( 873 | f"Sign delta is different {delta=} {actual_delta=} {line_start=} {best_target_frame=} {best_target_frame_time=} {humanize_seconds((line_start + actual_delta)/1000)}" 874 | ) 875 | 876 | return actual_delta 877 | 878 | 879 | def frame_align_sync_bucket(source_video, target_video, sync_bucket): 880 | start_timestamp, end_timestamp, delta = sync_bucket 881 | delta = round(delta * 1000) 882 | actual_end_timestamp = min( 883 | end_timestamp, float(source_video.video_info["duration"]) 884 | ) 885 | line_start = int(((actual_end_timestamp - start_timestamp) * 1000) / 2) 886 | new_delta = frame_align_video(source_video, target_video, line_start, delta) 887 | if delta != new_delta: 888 | print(f"Changed delta from {delta=} {new_delta=}") 889 | return (start_timestamp, end_timestamp, new_delta / 1000) 890 | 891 | 892 | def extract_subtitle_data(video_file, track_id, framerate_align=None): 893 | track_stream = video_file.subtitle_streams[track_id] 894 | codec_name_mapping = { 895 | "ass": "ass", 896 | "subrip": "srt", 897 | } 898 | subtitle_format = codec_name_mapping[track_stream["codec_name"]] 899 | subtitles_data = ( 900 | video_file.create_ffmpeg()[f"s:{track_id}"] 901 | .output("pipe:", format=subtitle_format) 902 | .run(capture_stdout=True, quiet=True)[0] 903 | .decode("utf-8") 904 | ) 905 | subtitles = pysubs2.SSAFile.from_string(subtitles_data) 906 | if framerate_align is not None: 907 | subtitles.transform_framerate(framerate_align[0], framerate_align[1]) 908 | return subtitles, subtitle_format 909 | 910 | 911 | def 
import_subtitle_data(external_subtitle_file, framerate_align=None): 912 | subtitle_format = external_subtitle_file.split(".")[-1] 913 | subtitles = pysubs2.load(external_subtitle_file) 914 | if framerate_align is not None: 915 | subtitles.transform_framerate(framerate_align[0], framerate_align[1]) 916 | return subtitles, subtitle_format 917 | 918 | 919 | def extract_and_sync_subtitles( 920 | video_file, 921 | track_id, 922 | video_duration, 923 | only_delta, 924 | audio_shift_points, 925 | subtitle_sync_buckets, 926 | subtitle_delete_buckets, 927 | output_file, 928 | subtitle_cutoff, 929 | sync_non_dialogue_to_video, 930 | output_video_file, 931 | framerate_align=None, 932 | external_subtitle_file=None, 933 | subtitle_min_font_size=None, 934 | ): 935 | video_align_cache = {} 936 | if external_subtitle_file is not None: 937 | subtitle, subtitle_format = import_subtitle_data( 938 | external_subtitle_file, framerate_align 939 | ) 940 | else: 941 | subtitle, subtitle_format = extract_subtitle_data( 942 | video_file, track_id, framerate_align 943 | ) 944 | # TODO: if target video is missing some stuff, make sure we do not shift into that part and have double subtitles. 945 | 946 | new_subtitles = [] 947 | video_duration = int(video_duration * 1000) 948 | subtitle_sync_buckets = [ 949 | (round(t1 * 1000), round(t2 * 1000), round(delta * 1000)) 950 | for (t1, t2, delta) in subtitle_sync_buckets 951 | ] 952 | subtitle_delete_buckets = [ 953 | (round(t1 * 1000), round(t2 * 1000)) for (t1, t2) in subtitle_delete_buckets 954 | ] 955 | print(f"Sync buckets {subtitle_sync_buckets=} {subtitle_delete_buckets=}") 956 | to_delete_lines = set() 957 | for i, line in enumerate(subtitle): 958 | is_dialogue = True 959 | if ( 960 | sync_non_dialogue_to_video 961 | and line.start >= sync_non_dialogue_to_video[0] * 1000 962 | and line.start <= sync_non_dialogue_to_video[1] * 1000 963 | and subtitle.format == "ass" 964 | and line.type == "Dialogue" 965 | and ( 966 | "\\pos(" in line.text or "\\move(" in line.text 967 | ) # a bit lazy way to figure out if sign 968 | ): 969 | is_dialogue = False 970 | skip_sync = False 971 | for t1, t2 in subtitle_delete_buckets: 972 | if ( 973 | not only_delta 974 | and line.start > t1 975 | and line.start < t2 976 | or line.end > t1 977 | and line.end < t2 978 | ): 979 | print(f"DELETING LINE {line}") 980 | to_delete_lines.add(i) 981 | skip_sync = True 982 | break 983 | if skip_sync: 984 | continue 985 | for ( 986 | t1, 987 | t2, 988 | delta, 989 | ) in subtitle_sync_buckets: # TODO: sync to new time and move to at least 0.00 990 | current_line_length = line.end - line.start 991 | if current_line_length < 1000: 992 | min_line_length = current_line_length 993 | elif current_line_length < 5000: 994 | min_line_length = int(current_line_length * 0.75) 995 | else: 996 | min_line_length = 5000 997 | if (line.start >= t1 and line.start <= t2) or ( 998 | (max(t1 + delta, 0) - delta) >= line.start 999 | and (max(t2 + delta, 0) - delta) <= line.start 1000 | ): # TODO: what does the extra part fix 1001 | if not only_delta and line.end >= t1 and line.end > t2: 1002 | print( 1003 | f"WARNING, we are floating outside bounds with end: {line.start} {line.end} - {line}" 1004 | ) 1005 | # print(f"Matching with t1={humanize_seconds(t1/1000)} t2={humanize_seconds(t2/1000)} {delta=} {line=} start") 1006 | if not is_dialogue: 1007 | if line.start not in video_align_cache: 1008 | video_align_cache[line.start] = frame_align_video( 1009 | video_file, output_video_file, line.start, delta 1010 | ) 1011 | 
delta = video_align_cache[line.start] 1012 | line.start += delta 1013 | if only_delta: 1014 | line.end += delta 1015 | else: 1016 | line.end = min( 1017 | line.end + delta, t2 1018 | ) # TODO: allow floating outside if it doesn't hit anything 1019 | if not only_delta and line.end - line.start < min_line_length: 1020 | print(f"WARNING, line is too short {line} - {min_line_length=}") 1021 | break 1022 | elif (line.end >= t1 and line.end <= t2) or ( 1023 | (max(t1 + delta, 0) - delta) >= line.end 1024 | and (max(t2 + delta, 0) - delta) <= line.end 1025 | ): # TODO: what does the extra part fix 1026 | if not only_delta and line.start >= t1 and line.start > t2: 1027 | print( 1028 | f"WARNING, we are floating outside bounds with start: {line.start} {line.end} - {line}" 1029 | ) 1030 | # print(f"Matching with t1={humanize_seconds(t1/1000)} t2={humanize_seconds(t2/1000)} {delta=} {line=} end") 1031 | if not is_dialogue: 1032 | if line.start not in video_align_cache: 1033 | video_align_cache[line.start] = frame_align_video( 1034 | video_file, output_video_file, line.start, delta 1035 | ) 1036 | delta = video_align_cache[line.start] 1037 | if only_delta: 1038 | line.start += delta 1039 | else: 1040 | line.start = min( 1041 | line.start + delta, t1 1042 | ) # TODO: allow floating outside if it doesn't hit anything 1043 | line.end += delta 1044 | if not only_delta and line.end - line.start < min_line_length: 1045 | print(f"WARNING, line is too short {line} - {min_line_length=}") 1046 | break 1047 | else: 1048 | print(f"Unable to find place for {line}") 1049 | to_delete_lines.add(i) 1050 | 1051 | for i, line in enumerate(subtitle): 1052 | if i in to_delete_lines: 1053 | continue 1054 | if subtitle_cutoff is not None and line.start > int(subtitle_cutoff * 1000): 1055 | print(f"Removing line {line} because it is after {subtitle_cutoff=}") 1056 | to_delete_lines.add(i) 1057 | elif line.start > video_duration: 1058 | print(f"Removing line {line} because it starts after end") 1059 | to_delete_lines.add(i) 1060 | elif line.end < 0: 1061 | print(f"Removing line {line} because it ends after the video") 1062 | to_delete_lines.add(i) 1063 | elif line.end > video_duration: 1064 | print(f"Moving line {line} end to end of video because it ended after") 1065 | line.end = video_duration 1066 | elif line.start < 0: 1067 | print( 1068 | f"Moving line {line} start to beginning of video because it started before" 1069 | ) 1070 | line.start = 0 1071 | 1072 | for i in sorted(to_delete_lines, reverse=True): 1073 | print(f"Deleting line {i}") 1074 | del subtitle[i] 1075 | 1076 | if subtitle_min_font_size is not None: 1077 | for style_name, style in subtitle.styles.items(): 1078 | if style.fontsize < subtitle_min_font_size: 1079 | print(f"Setting font size from {style.fontsize} for {style_name}") 1080 | style.fontsize = subtitle_min_font_size 1081 | 1082 | output_file = output_file.with_suffix("." 
+ subtitle_format) 1083 | subtitle.save(output_file) 1084 | return output_file 1085 | 1086 | 1087 | def extract_and_sync_audio( 1088 | video_file, 1089 | track_id, 1090 | output_video_duration, 1091 | audio_shift_points, 1092 | sync_buckets, 1093 | delete_buckets, 1094 | audio_output_file, 1095 | ): 1096 | audio_stream = video_file.audio_streams[track_id] 1097 | audio_file_list = [] 1098 | segment_id = 1 1099 | already_added_delta = 0.0 1100 | for t1, t2, delta in sync_buckets: 1101 | expected_delta = delta - already_added_delta 1102 | # delay_cmd = [] 1103 | 1104 | if expected_delta > 0: 1105 | print( 1106 | f"Adding {expected_delta=} with absolute {delta=} silence audio from {t1=}" 1107 | ) 1108 | 1109 | silence_segment_output_file = audio_output_file.with_suffix( 1110 | f".s.{segment_id}.mkv" 1111 | ) 1112 | audio_file_list.append(silence_segment_output_file) 1113 | cmd = [ 1114 | "ffmpeg", 1115 | "-y", 1116 | "-f", 1117 | "lavfi", 1118 | "-i", 1119 | f"anullsrc=channel_layout={audio_stream['channel_layout']}:sample_rate={audio_stream['sample_rate']}", 1120 | "-t", 1121 | str(expected_delta), 1122 | "-c:a", 1123 | audio_stream["codec_name"], 1124 | str(silence_segment_output_file), 1125 | ] 1126 | subprocess.check_call( 1127 | cmd, 1128 | stdin=subprocess.PIPE, 1129 | stdout=subprocess.DEVNULL, 1130 | stderr=subprocess.DEVNULL, 1131 | ) 1132 | elif expected_delta < 0: 1133 | print(f"Removing {expected_delta=} with absolute {delta=} audio from {t1=}") 1134 | pass # we should cut the beginning of this track and cut from t2 to tt2 1135 | 1136 | print(f"Copying audio segment {t1=} {t2=} with cut {expected_delta=}") 1137 | segment_output_file = audio_output_file.with_suffix(f".{segment_id}.mkv") 1138 | audio_file_list.append(segment_output_file) 1139 | 1140 | cmd = [ 1141 | "ffmpeg", 1142 | "-y", 1143 | "-i", 1144 | video_file.filepath, 1145 | "-map", 1146 | f"a:{track_id}", 1147 | "-c", 1148 | "copy", 1149 | # ] + delay_cmd + [ 1150 | "-ss", 1151 | str(t1), 1152 | "-t", 1153 | str(t2 - t1), 1154 | str(segment_output_file), 1155 | ] 1156 | subprocess.check_call( 1157 | cmd, 1158 | stdin=subprocess.PIPE, 1159 | stdout=subprocess.DEVNULL, 1160 | stderr=subprocess.DEVNULL, 1161 | ) 1162 | 1163 | already_added_delta += expected_delta # TODO: measure actual length of file instead of assuming it is 100% correct. 
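        # Bookkeeping note: each bucket's delta is an absolute offset, so only
        # the difference against already_added_delta is inserted as new
        # silence here; the running total then records how much padding the
        # concatenated output already contains before the next bucket. The
        # per-bucket files (silence plus stream-copied source spans) are
        # joined further down with ffmpeg's concat demuxer, so small drift is
        # possible because "-c copy" cuts on packet boundaries rather than at
        # exact timestamps (hence the TODO above).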
1164 | segment_id += 1 1165 | 1166 | input_file = audio_output_file.with_suffix(".txt") 1167 | input_file.write_text("\n".join(f"file '{p.name}'" for p in audio_file_list)) 1168 | actual_audio_output_file = audio_output_file.with_suffix(f".mkv") 1169 | print(f"Combining audio segments for {str(actual_audio_output_file)}") 1170 | cmd = [ 1171 | "ffmpeg", 1172 | "-y", 1173 | "-f", 1174 | "concat", 1175 | "-safe", 1176 | "0", 1177 | "-i", 1178 | str(input_file), 1179 | "-c", 1180 | "copy", 1181 | str(actual_audio_output_file), 1182 | ] 1183 | subprocess.check_call( 1184 | cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL 1185 | ) 1186 | 1187 | return actual_audio_output_file 1188 | 1189 | 1190 | def extract_and_sync_chapters(video_file, video_duration, audio_shift_points): 1191 | chapters = [] 1192 | for chapter in video_file.chapters: 1193 | start_time = float(chapter["start_time"]) 1194 | title = chapter["tags"]["title"] 1195 | for t1, t2, delta in audio_shift_points: 1196 | if t2 < 5.0: 1197 | t2 = 0 1198 | if start_time >= t2: 1199 | new_start_time = max(start_time + delta, 0.0) 1200 | if new_start_time - 5.0 > video_duration: 1201 | print( 1202 | f"Skipping chapter {start_time=} {title=} because it floats after end" 1203 | ) 1204 | break 1205 | new_start_time = min(new_start_time, video_duration) 1206 | if new_start_time < 3.0: 1207 | new_start_time = 0 1208 | chapters.append((new_start_time, title)) 1209 | break 1210 | else: 1211 | print(f"Unable to find place for {start_time=} {title=}") 1212 | return chapters 1213 | 1214 | 1215 | @click.command() 1216 | @click.argument("file", type=click.Path(exists=True), nargs=-1, required=True) 1217 | @click.option( 1218 | "--only-generate-chroma", is_flag=True, help="Quit after chroma is generated" 1219 | ) 1220 | @click.option( 1221 | "--sync-using-subtitle-audio", 1222 | is_flag=True, 1223 | help="Extract audio from source where subtitles are and align with target. Good when video is partial or re-arranged. Bad for audio syncs.", 1224 | ) 1225 | @click.option("--skip-subtitles", is_flag=True, help="Do not align subtitles.") 1226 | @click.option("--skip-shift-point", type=str, help="List of sync points to skip") 1227 | @click.option( 1228 | "--subtitle-cutoff", 1229 | type=float, 1230 | help="Subtitle cutoff where everything after is removed", 1231 | ) 1232 | @click.option( 1233 | "--only-delta", 1234 | is_flag=True, 1235 | help="Only do delta shifts, not group alignment (warning, subtitles might overlap?)", 1236 | ) 1237 | @click.option( 1238 | "--align-framerate", 1239 | is_flag=True, 1240 | help="Align source framerate to target video framerate, when speedup/slowdown used as technique to change framerate.", 1241 | ) 1242 | @click.option( 1243 | "--align-frames-too", 1244 | is_flag=True, 1245 | help="Align using frames when the delta is discovered.", 1246 | ) 1247 | @click.option( 1248 | "--preserve-silence", 1249 | is_flag=True, 1250 | help="Preserve silence at the end of the video instead of trimming it", 1251 | ) 1252 | @click.option( 1253 | "--temp-folder", 1254 | type=click.Path(), 1255 | default="milk-temp", 1256 | help="Temp folder to store various files in.", 1257 | ) 1258 | @click.option( 1259 | "--audio-tracks", type=TRACK_MAPPING, help="Specify audio tracks to compare with." 
1260 | )
1261 | @click.option(
1262 |     "--adjust-shift-point",
1263 |     type=str,
1264 |     help="Manually adjust an audio shift point.",
1265 |     multiple=True,
1266 | )
1267 | @click.option(
1268 |     "--adjust-delay", type=float, help="Manually adjust delay."
1269 | )  # TODO: define audio track to do it to
1270 | @click.option(
1271 |     "--sync-non-dialogue-to-video",
1272 |     type=INTEGER_RANGE,
1273 |     help="Sync non-dialogue using frames instead of audio, good for e.g. remastered releases where the audio might be re-aligned",
1274 | )
1275 | @click.option(
1276 |     "--chapter-source",
1277 |     type=int,
1278 |     help="Input file index where to extract chapters from.",
1279 | )
1280 | @click.option(
1281 |     "--chapter-beginning",
1282 |     type=str,
1283 |     help="Name of chapter from the beginning of the file",
1284 | )
1285 | @click.option(
1286 |     "--chapter-segment-file",
1287 |     type=click.Path(exists=True),
1288 |     help="Files to try to automatically generate chapters from.",
1289 |     multiple=True,
1290 | )
1291 | @click.option(
1292 |     "--chapter-segment-name-start",
1293 |     type=str,
1294 |     help="Name start of chapter given in --chapter-segment-file, handled in same order",
1295 |     multiple=True,
1296 | )
1297 | @click.option(
1298 |     "--chapter-segment-name-end",
1299 |     type=str,
1300 |     help="Name end of chapter given in --chapter-segment-file, handled in same order",
1301 |     multiple=True,
1302 | )
1303 | @click.option(
1304 |     "--chapter-segment-required",
1305 |     is_flag=True,
1306 |     help="Error out if not all chapters are found",
1307 | )
1308 | @click.option(
1309 |     "--metadata-audio-track",
1310 |     type=METADATA_MAPPING,
1311 |     help="Set metadata for an audio track, syntax track_id=key=value",
1312 |     multiple=True,
1313 | )
1314 | @click.option(
1315 |     "--metadata-subtitle-track",
1316 |     type=METADATA_MAPPING,
1317 |     help="Set metadata for a subtitle track, syntax track_id=key=value",
1318 |     multiple=True,
1319 | )
1320 | @click.option(
1321 |     "--subtitle-min-font-size",
1322 |     type=int,
1323 |     help="Set the minimum font size",
1324 | )
1325 | @click.option(
1326 |     "--input-external-subtitle-track",
1327 |     type=click.Path(exists=True),
1328 |     help="External subtitle track, presumed to be part of first input file",
1329 | )
1330 | @click.option(
1331 |     "--output-video-file-index", type=int, help="Which file to pull video track from"
1332 | )
1333 | @click.option(
1334 |     "--output-audio-mapping",
1335 |     type=TRACK_MAPPING,
1336 |     help="Which audio tracks to include in output",
1337 | )
1338 | @click.option(
1339 |     "--output-subtitle-mapping",
1340 |     type=TRACK_MAPPING,
1341 |     help="Which subtitle tracks to include in output",
1342 | )
1343 | @click.option("--output", type=click.Path(exists=False), help="Output file.")
1344 | @click.option(
1345 |     "--output-subtitle", type=click.Path(exists=False), help="Output subtitle file."
1346 | ) 1347 | def main( 1348 | file, 1349 | only_generate_chroma, 1350 | sync_using_subtitle_audio, 1351 | skip_subtitles, 1352 | skip_shift_point, 1353 | subtitle_cutoff, 1354 | only_delta, 1355 | align_framerate, 1356 | align_frames_too, 1357 | preserve_silence, 1358 | temp_folder, 1359 | audio_tracks, 1360 | adjust_shift_point, 1361 | adjust_delay, 1362 | sync_non_dialogue_to_video, 1363 | chapter_source, 1364 | chapter_beginning, 1365 | chapter_segment_file, 1366 | chapter_segment_name_start, 1367 | chapter_segment_name_end, 1368 | chapter_segment_required, 1369 | metadata_audio_track, 1370 | metadata_subtitle_track, 1371 | subtitle_min_font_size, 1372 | input_external_subtitle_track, 1373 | output_video_file_index, 1374 | output_audio_mapping, 1375 | output_subtitle_mapping, 1376 | output, 1377 | output_subtitle, 1378 | ): 1379 | # logging.basicConfig(level=logging.DEBUG) 1380 | global HOP_LENGTH 1381 | 1382 | if output_video_file_index is None: 1383 | output_video_file_index = len(file) - 1 1384 | 1385 | if output_audio_mapping is None: 1386 | output_audio_mapping = [[output_video_file_index, 0]] 1387 | 1388 | if output_subtitle_mapping is None: 1389 | output_subtitle_mapping = [[0, 0]] 1390 | 1391 | if skip_shift_point: 1392 | skip_shift_point = sorted( 1393 | [int(i) for i in skip_shift_point.split(",")], reverse=True 1394 | ) 1395 | else: 1396 | skip_shift_point = [] 1397 | 1398 | if metadata_audio_track is None: 1399 | metadata_audio_track = [] 1400 | 1401 | if metadata_subtitle_track is None: 1402 | metadata_subtitle_track = [] 1403 | 1404 | if chapter_source is None: 1405 | chapter_source = output_video_file_index 1406 | 1407 | if adjust_shift_point is None: 1408 | adjust_shift_point = [] 1409 | 1410 | adjust_shift_points = [] 1411 | for point in adjust_shift_point: 1412 | point = point.split(":") 1413 | adjust_shift_points.append( 1414 | (int(point[0]), int(point[1]), int(point[2]), float(point[3])) 1415 | ) 1416 | 1417 | if sync_using_subtitle_audio: 1418 | n_chroma = 36 1419 | HOP_LENGTH = 512 1420 | else: 1421 | n_chroma = 12 1422 | HOP_LENGTH = 1024 1423 | 1424 | mapped_metadata_audio_track = {} 1425 | for track_id, key, value in metadata_audio_track: 1426 | mapped_metadata_audio_track.setdefault(track_id, {})[key] = value 1427 | 1428 | mapped_metadata_subtitle_track = {} 1429 | for track_id, key, value in metadata_subtitle_track: 1430 | mapped_metadata_subtitle_track.setdefault(track_id, {})[key] = value 1431 | 1432 | chapter_segment_name_start = list(chapter_segment_name_start or []) 1433 | chapter_segment_name_start += [""] * len(chapter_segment_file) 1434 | 1435 | chapter_segment_name_end = list(chapter_segment_name_end or []) 1436 | chapter_segment_name_end += [""] * len(chapter_segment_file) 1437 | 1438 | chapter_segment_files = {} 1439 | if chapter_segment_file: 1440 | for i, csf in enumerate(chapter_segment_file): 1441 | chapter_segment_files[Path(csf)] = ( 1442 | chapter_segment_name_start[i], 1443 | chapter_segment_name_end[i], 1444 | ) 1445 | 1446 | temp_folder = Path(temp_folder) 1447 | temp_folder.mkdir(exist_ok=True) 1448 | files = [Path(f) for f in file] 1449 | video_files = [Video(f) for f in file] 1450 | output_video_file = video_files[output_video_file_index] 1451 | output_video_duration = float(output_video_file.video_info["duration"]) 1452 | 1453 | if subtitle_cutoff and subtitle_cutoff < 0: 1454 | subtitle_cutoff += output_video_duration 1455 | print(f"Setting cutoff from negative to {subtitle_cutoff}") 1456 | 1457 | sync_audio_track_mapping = [0] * 
len(files) 1458 | if audio_tracks is not None: 1459 | for file_id, audio_track_id in audio_tracks: 1460 | sync_audio_track_mapping[file_id] = audio_track_id 1461 | 1462 | chromas = {} 1463 | 1464 | framerate_aligns = {} 1465 | if align_framerate: 1466 | target_framerate = output_video_file.video_info["fps"] 1467 | for i, (f, video_file) in enumerate(zip(files, video_files)): 1468 | if i == output_video_file_index: 1469 | continue 1470 | video_framerate = video_file.video_info["fps"] 1471 | if target_framerate != video_framerate: 1472 | framerate_aligns[i] = (video_framerate, target_framerate) 1473 | 1474 | with ThreadPoolExecutor(max_workers=8) as executor: 1475 | jobqueue = {} 1476 | for i, (f, audio_track_id) in enumerate(zip(files, sync_audio_track_mapping)): 1477 | framerate_align = framerate_aligns.get(i) 1478 | framerate_align_filename = ( 1479 | framerate_align 1480 | and f".{str(framerate_align[0]).replace('.', '_')}-{str(framerate_align[1]).replace('.', '_')}" 1481 | or "" 1482 | ) 1483 | preserve_silence_filename = preserve_silence and ".ps" or "" 1484 | 1485 | audio_chroma_output_file = temp_folder / ( 1486 | f.stem 1487 | + f"{framerate_align_filename}{preserve_silence_filename}.{HOP_LENGTH}.{n_chroma}.{audio_track_id}.chroma" 1488 | ) 1489 | if audio_chroma_output_file.exists(): 1490 | print(f"Loading chroma from {audio_chroma_output_file}") 1491 | chromas[f] = pickle.loads(audio_chroma_output_file.read_bytes()) 1492 | else: 1493 | print(f"Extracting chroma from {f.name}") 1494 | future = executor.submit( 1495 | generate_chroma_cqt, 1496 | f, 1497 | sync_audio_track_mapping[i], 1498 | audio_chroma_output_file, 1499 | n_chroma=n_chroma, 1500 | framerate_align=framerate_align, 1501 | preserve_silence=preserve_silence, 1502 | ) 1503 | jobqueue[future] = f 1504 | 1505 | for f in chapter_segment_files: 1506 | preserve_silence_filename = preserve_silence and ".ps" or "" 1507 | audio_chroma_output_file = temp_folder / ( 1508 | f.stem + f"{preserve_silence_filename}.{HOP_LENGTH}.{n_chroma}.c.chroma" 1509 | ) 1510 | if audio_chroma_output_file.exists(): 1511 | print(f"Loading chroma from {audio_chroma_output_file}") 1512 | chromas[f] = pickle.loads(audio_chroma_output_file.read_bytes()) 1513 | else: 1514 | print(f"Extracting chroma from {f.name}") 1515 | future = executor.submit( 1516 | generate_chroma_cqt, 1517 | f, 1518 | 0, 1519 | audio_chroma_output_file, 1520 | n_chroma=n_chroma, 1521 | preserve_silence=preserve_silence, 1522 | ) 1523 | jobqueue[future] = f 1524 | 1525 | for future in concurrent.futures.as_completed(jobqueue): 1526 | chromas[jobqueue[future]] = future.result() 1527 | 1528 | if only_generate_chroma: 1529 | print("Done generating chroma") 1530 | quit(0) 1531 | 1532 | x_2_chroma, fs = chromas[ 1533 | files[output_video_file_index] 1534 | ] # x_2_chroma is always target video, the one we align everything with 1535 | 1536 | chapter_timestamps = [] 1537 | for f, (start_name, end_name) in chapter_segment_files.items(): 1538 | print(f"Looking for chapter matching {f}") 1539 | x_1_chroma, fs = chromas[f] 1540 | chapter_specifications = find_and_align_chapter( 1541 | x_1_chroma, x_2_chroma, fs, min_match_value=60 + (n_chroma * 3) 1542 | ) 1543 | if chapter_specifications is None: 1544 | if chapter_segment_required: 1545 | print("Did not find required chapters") 1546 | quit(1) 1547 | else: 1548 | continue 1549 | 1550 | chapter_timestamps.append((chapter_specifications[0], start_name)) 1551 | chapter_timestamps.append((chapter_specifications[1], end_name)) 1552 | 1553 | if 
chapter_timestamps: 1554 | min_chapter_delay = 4.0 1555 | previous_chapter_time = None 1556 | 1557 | new_chapter_timestamps = [] 1558 | for chapter_timestamp, name in sorted(chapter_timestamps): 1559 | if chapter_timestamp < 3.0: 1560 | chapter_timestamp = 0.0 1561 | if ( 1562 | previous_chapter_time is not None 1563 | and chapter_timestamp - previous_chapter_time < min_chapter_delay 1564 | ): 1565 | print( 1566 | f"Skipping chapter '{name}' because it is too close to previous chapter" 1567 | ) 1568 | continue 1569 | 1570 | if output_video_duration - chapter_timestamp < min_chapter_delay: 1571 | print( 1572 | f"Skipping chapter '{name}' because it is too close to end of video" 1573 | ) 1574 | continue 1575 | 1576 | previous_chapter_time = chapter_timestamp 1577 | new_chapter_timestamps.append((chapter_timestamp, name)) 1578 | 1579 | chapter_timestamps = new_chapter_timestamps 1580 | 1581 | if chapter_beginning and chapter_timestamps[0][0] > 0.0: 1582 | print("Injecting chapter at beginning") 1583 | chapter_timestamps.insert(0, (0.0, chapter_beginning)) 1584 | 1585 | print("Found chapters:") 1586 | for chapter_timestamp, name in chapter_timestamps: 1587 | print(f" {humanize_seconds(chapter_timestamp)} - {name}") 1588 | 1589 | attachment_files = set() 1590 | subtitle_files = [] 1591 | audio_files = [] 1592 | 1593 | for i, (f, video_file) in enumerate(zip(files, video_files)): 1594 | if i == output_video_file_index: 1595 | continue 1596 | 1597 | x_1_chroma, fs = chromas[f] 1598 | if sync_using_subtitle_audio: 1599 | for j, (file_index, track_id) in enumerate(output_subtitle_mapping): 1600 | if file_index != i: 1601 | continue 1602 | break 1603 | else: 1604 | print( 1605 | "No subtitle track found to sync with, please specify a mapping to use this feature" 1606 | ) 1607 | quit(1) 1608 | 1609 | ( 1610 | audio_shift_points, 1611 | sync_buckets, 1612 | delete_buckets, 1613 | ) = estimate_audio_shift_points_from_subtitles( 1614 | x_1_chroma, 1615 | x_2_chroma, 1616 | fs, 1617 | video_file, 1618 | j, 1619 | n_chroma, 1620 | adjust_delay=adjust_delay, 1621 | framerate_align=framerate_aligns.get(i), 1622 | external_subtitle_file=input_external_subtitle_track, 1623 | ) 1624 | else: 1625 | audio_shift_points = estimate_audio_shift_points( 1626 | x_1_chroma, 1627 | x_2_chroma, 1628 | fs, 1629 | only_delta=only_delta, 1630 | adjust_delay=adjust_delay, 1631 | sliding_window_size=300, 1632 | ) 1633 | 1634 | for skip_point in skip_shift_point: 1635 | if len(audio_shift_points) - 1 >= skip_point: 1636 | print(f"Skipping shift point {audio_shift_points[skip_point]}") 1637 | del audio_shift_points[skip_point] 1638 | 1639 | for point in adjust_shift_points: 1640 | if point[0] != i: 1641 | continue 1642 | print(f"Changing shift point {point=} {audio_shift_points[point[1]]}") 1643 | p = list(audio_shift_points[point[1]]) 1644 | p[point[2]] = point[3] 1645 | audio_shift_points[point[1]] = tuple(p) 1646 | print(audio_shift_points) 1647 | 1648 | sync_buckets, delete_buckets = turn_audio_shift_points_to_audio_segments( 1649 | audio_shift_points 1650 | ) 1651 | print( 1652 | f"Found audio shift points {audio_shift_points=} {sync_buckets=} {delete_buckets=}" 1653 | ) 1654 | 1655 | if align_frames_too: 1656 | print("Frame aligning buckets") 1657 | sync_buckets = [ 1658 | frame_align_sync_bucket(video_file, output_video_file, sync_bucket) 1659 | for sync_bucket in sync_buckets 1660 | ] 1661 | 1662 | if not skip_subtitles: 1663 | for j, (file_index, track_id) in enumerate(output_subtitle_mapping): 1664 | if file_index != 
i: 1665 | continue 1666 | 1667 | subtitle_output_file = temp_folder / (f.stem + f".{track_id}.unknown") 1668 | subtitle_output_file = extract_and_sync_subtitles( 1669 | video_file, 1670 | track_id, 1671 | output_video_duration, 1672 | only_delta, 1673 | audio_shift_points, 1674 | sync_buckets, 1675 | delete_buckets, 1676 | subtitle_output_file, 1677 | subtitle_cutoff, 1678 | sync_non_dialogue_to_video, 1679 | output_video_file, 1680 | framerate_align=framerate_aligns.get(i), 1681 | external_subtitle_file=input_external_subtitle_track, 1682 | subtitle_min_font_size=subtitle_min_font_size, 1683 | ) 1684 | 1685 | if input_external_subtitle_track: 1686 | subtitle_files.append((j, subtitle_output_file, [])) 1687 | elif subtitle_output_file: 1688 | subtitle_metadata = video_file.extract_subtitle_metadata(track_id) 1689 | subtitle_files.append((j, subtitle_output_file, subtitle_metadata)) 1690 | attachment_files.add(i) 1691 | 1692 | for j, (file_index, track_id) in enumerate(output_audio_mapping): 1693 | if file_index != i: 1694 | continue 1695 | 1696 | audio_output_file = temp_folder / (f.stem + f".{track_id}.unknown") 1697 | audio_output_file = extract_and_sync_audio( 1698 | video_file, 1699 | track_id, 1700 | output_video_duration, 1701 | audio_shift_points, 1702 | sync_buckets, 1703 | delete_buckets, 1704 | audio_output_file, 1705 | ) 1706 | 1707 | if audio_output_file: 1708 | audio_metadata = video_file.extract_audio_metadata(track_id) 1709 | audio_files.append((j, audio_output_file, audio_metadata)) 1710 | 1711 | if not chapter_timestamps and i == chapter_source: 1712 | chapter_timestamps = extract_and_sync_chapters( 1713 | video_file, output_video_duration, audio_shift_points 1714 | ) 1715 | 1716 | if output_subtitle: 1717 | if subtitle_files: 1718 | subtitle_file = subtitle_files[0][1] 1719 | subtitle_output_file = Path(output_subtitle).with_suffix( 1720 | subtitle_file.suffix 1721 | ) 1722 | print(f"Wrote the first subtitle track to {subtitle_output_file}") 1723 | shutil.copy(subtitle_file, subtitle_output_file) 1724 | else: 1725 | print("No subtitle file to output, skipping") 1726 | 1727 | if not output: 1728 | print("No output defined, quitting here") 1729 | quit(1) 1730 | 1731 | print("Combining everything") 1732 | temp_output_file = temp_folder / (f.stem + f".temp.mkv") 1733 | 1734 | ffmpeg_inputs = [] 1735 | ffmpeg_options = [] 1736 | subtitle_track_count = 0 1737 | audio_track_count = 0 1738 | 1739 | for f in files: 1740 | ffmpeg_inputs.append(str(f)) 1741 | 1742 | for i, (f, video_file) in enumerate(zip(files, video_files)): 1743 | if i != output_video_file_index: 1744 | continue 1745 | 1746 | if not skip_subtitles: 1747 | for ( 1748 | file_index, 1749 | track_id, 1750 | ) in output_subtitle_mapping: # TODO, inject at correct point 1751 | if file_index != i: 1752 | continue 1753 | ffmpeg_options += ["-map", f"{i}:s:{track_id}", "-c", "copy"] 1754 | subtitle_track_count += 1 1755 | attachment_files.add(i) 1756 | 1757 | for ( 1758 | file_index, 1759 | track_id, 1760 | ) in output_audio_mapping: # TODO, inject at correct point 1761 | if file_index != i: 1762 | continue 1763 | ffmpeg_options += ["-map", f"{i}:a:{track_id}", "-c", "copy"] 1764 | 1765 | for tag_key, tag_value in mapped_metadata_audio_track.get(0, {}).items(): 1766 | ffmpeg_options += [f"-metadata:s:a:{0}", f"{tag_key}={tag_value}"] 1767 | audio_track_count += 1 1768 | 1769 | for i in sorted(attachment_files): 1770 | ffmpeg_options += [ 1771 | "-map", 1772 | f"{i}:d?", 1773 | "-c", 1774 | "copy", 1775 | "-map", 1776 | 
f"{i}:t?", 1777 | "-c", 1778 | "copy", 1779 | ] 1780 | 1781 | ffmpeg_options += ["-map", f"{output_video_file_index}:v", "-c", "copy"] 1782 | if not chapter_timestamps: 1783 | ffmpeg_options += ["-map_chapters", str(chapter_source)] 1784 | else: 1785 | ffmpeg_options += ["-map_chapters", "-1"] 1786 | 1787 | for j, subtitle_file, subtitle_metadata in sorted(subtitle_files): 1788 | input_index = len(ffmpeg_inputs) 1789 | ffmpeg_options += ["-map", f"{input_index}:s", "-c", "copy"] 1790 | seen_tag_keys = set() 1791 | for tag_key, tag_value in subtitle_metadata: 1792 | seen_tag_keys.add(tag_key) 1793 | tag_value = mapped_metadata_subtitle_track.get(j, {}).get( 1794 | tag_key, tag_value 1795 | ) 1796 | ffmpeg_options += [ 1797 | f"-metadata:s:s:{subtitle_track_count}", 1798 | f"{tag_key}={tag_value}", 1799 | ] 1800 | for tag_key, tag_value in mapped_metadata_subtitle_track.get(j, {}).items(): 1801 | if tag_key in seen_tag_keys: 1802 | continue 1803 | ffmpeg_options += [ 1804 | f"-metadata:s:s:{subtitle_track_count}", 1805 | f"{tag_key}={tag_value}", 1806 | ] 1807 | ffmpeg_inputs.append(str(subtitle_file)) 1808 | subtitle_track_count += 1 1809 | 1810 | for j, audio_file, audio_metadata in sorted(audio_files): 1811 | input_index = len(ffmpeg_inputs) 1812 | ffmpeg_options += ["-map", f"{input_index}:a", "-c", "copy"] 1813 | seen_tag_keys = set() 1814 | for tag_key, tag_value in audio_metadata: 1815 | seen_tag_keys.add(tag_key) 1816 | tag_value = mapped_metadata_audio_track.get(j, {}).get(tag_key, tag_value) 1817 | ffmpeg_options += [ 1818 | f"-metadata:s:a:{audio_track_count}", 1819 | f"{tag_key}={tag_value}", 1820 | ] 1821 | for tag_key, tag_value in mapped_metadata_audio_track.get(j, {}).items(): 1822 | if tag_key in seen_tag_keys: 1823 | continue 1824 | ffmpeg_options += [ 1825 | f"-metadata:s:a:{audio_track_count}", 1826 | f"{tag_key}={tag_value}", 1827 | ] 1828 | # for tag_key, tag_value in audio_metadata: 1829 | # tag_value = mapped_metadata_audio_track.get(j, {}).get(tag_key, tag_value) 1830 | # ffmpeg_options += [f"-metadata:s:a:{audio_track_count}", f"{tag_key}={tag_value}"] 1831 | ffmpeg_inputs.append(str(audio_file)) 1832 | audio_track_count += 1 1833 | 1834 | if output_subtitle_mapping: 1835 | ffmpeg_options += ["-disposition:s:0", "default"] 1836 | 1837 | cmd = ["ffmpeg", "-y"] 1838 | for ffmpeg_input in ffmpeg_inputs: 1839 | cmd += ["-i", ffmpeg_input] 1840 | cmd += ffmpeg_options 1841 | cmd += [str(temp_output_file)] 1842 | print(f"Running: {shlex.join(cmd)}") 1843 | subprocess.check_call( 1844 | cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL 1845 | ) 1846 | 1847 | cmd = ["mkvmerge", "--no-global-tags", "-o", str(output)] 1848 | 1849 | if chapter_timestamps: 1850 | chapter_file = temp_folder / (f.stem + f".chapters.txt") 1851 | chapter_file.write_text( 1852 | "\n".join( 1853 | f"CHAPTER{i:02}={humanize_seconds(chapter_timestamp)}\nCHAPTER{i:02}NAME={name}" 1854 | for (i, (chapter_timestamp, name)) in enumerate(chapter_timestamps) 1855 | ) 1856 | ) 1857 | cmd += ["--chapters", str(chapter_file)] 1858 | 1859 | cmd += [str(temp_output_file)] 1860 | print(f"Creating final MKV file: {output}") 1861 | subprocess.check_call( 1862 | cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL 1863 | ) 1864 | os.unlink(temp_output_file) 1865 | 1866 | 1867 | if __name__ == "__main__": 1868 | warnings.filterwarnings("ignore") 1869 | main() 1870 | --------------------------------------------------------------------------------