├── .dockerignore ├── tess-data ├── configs │ ├── pdf │ ├── quiet │ ├── tsv │ ├── alto │ ├── api_config │ ├── get.images │ ├── logfile │ ├── lstmbox │ ├── makebox │ ├── wordstrbox │ ├── digits │ ├── hocr │ ├── unlv │ ├── inter │ ├── rebox │ ├── linebox │ ├── kannada │ ├── lstmdebug │ ├── bazaar │ ├── bigram │ ├── txt │ ├── ambigs.train │ ├── lstm.train │ ├── box.train │ ├── box.train.stderr │ ├── Makefile.am │ ├── strokewidth │ └── testspace ├── README.md └── eng.traineddata ├── .isort.cfg ├── LICENSE ├── Dockerfile ├── .gitignore ├── cartonizer.py ├── README.md ├── cowocr.py └── milksync.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .env 3 | -------------------------------------------------------------------------------- /tess-data/configs/pdf: -------------------------------------------------------------------------------- 1 | tessedit_create_pdf 1 2 | -------------------------------------------------------------------------------- /tess-data/configs/quiet: -------------------------------------------------------------------------------- 1 | debug_file /dev/null 2 | -------------------------------------------------------------------------------- /tess-data/configs/tsv: -------------------------------------------------------------------------------- 1 | tessedit_create_tsv 1 2 | -------------------------------------------------------------------------------- /tess-data/configs/alto: -------------------------------------------------------------------------------- 1 | tessedit_create_alto 1 2 | -------------------------------------------------------------------------------- /tess-data/configs/api_config: -------------------------------------------------------------------------------- 1 | tessedit_zero_rejection T 2 | -------------------------------------------------------------------------------- /tess-data/configs/get.images: -------------------------------------------------------------------------------- 1 | tessedit_write_images T 2 | -------------------------------------------------------------------------------- /tess-data/configs/logfile: -------------------------------------------------------------------------------- 1 | debug_file tesseract.log 2 | -------------------------------------------------------------------------------- /tess-data/configs/lstmbox: -------------------------------------------------------------------------------- 1 | tessedit_create_lstmbox 1 2 | -------------------------------------------------------------------------------- /tess-data/configs/makebox: -------------------------------------------------------------------------------- 1 | tessedit_create_boxfile 1 2 | -------------------------------------------------------------------------------- /tess-data/configs/wordstrbox: -------------------------------------------------------------------------------- 1 | tessedit_create_wordstrbox 1 2 | -------------------------------------------------------------------------------- /tess-data/configs/digits: -------------------------------------------------------------------------------- 1 | tessedit_char_whitelist 0123456789-. 
2 | -------------------------------------------------------------------------------- /tess-data/configs/hocr: -------------------------------------------------------------------------------- 1 | tessedit_create_hocr 1 2 | hocr_font_info 0 3 | -------------------------------------------------------------------------------- /tess-data/configs/unlv: -------------------------------------------------------------------------------- 1 | tessedit_write_unlv 1 2 | unlv_tilde_crunching T 3 | -------------------------------------------------------------------------------- /tess-data/configs/inter: -------------------------------------------------------------------------------- 1 | interactive_display_mode T 2 | tessedit_display_outwords T 3 | -------------------------------------------------------------------------------- /tess-data/configs/rebox: -------------------------------------------------------------------------------- 1 | tessedit_resegment_from_boxes 1 2 | tessedit_make_boxes_from_boxes 1 3 | -------------------------------------------------------------------------------- /tess-data/README.md: -------------------------------------------------------------------------------- 1 | Copied directly from the Tesseract project and is under their respective license. -------------------------------------------------------------------------------- /tess-data/configs/linebox: -------------------------------------------------------------------------------- 1 | tessedit_resegment_from_line_boxes 1 2 | tessedit_make_boxes_from_boxes 1 3 | -------------------------------------------------------------------------------- /tess-data/eng.traineddata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JohnDoee/the-cute-collection/HEAD/tess-data/eng.traineddata -------------------------------------------------------------------------------- /tess-data/configs/kannada: -------------------------------------------------------------------------------- 1 | textord_skewsmooth_offset 8 2 | textord_skewsmooth_offset2 8 3 | textord_merge_desc 0.5 4 | textord_no_rejects 1 5 | -------------------------------------------------------------------------------- /tess-data/configs/lstmdebug: -------------------------------------------------------------------------------- 1 | stopper_debug_level 1 2 | classify_debug_level 1 3 | segsearch_debug_level 1 4 | language_model_debug_level 3 5 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | multi_line_output=3 3 | include_trailing_comma=True 4 | force_grid_wrap=0 5 | use_parentheses=True 6 | line_length=88 7 | -------------------------------------------------------------------------------- /tess-data/configs/bazaar: -------------------------------------------------------------------------------- 1 | load_system_dawg F 2 | load_freq_dawg F 3 | user_words_suffix user-words 4 | user_patterns_suffix user-patterns 5 | -------------------------------------------------------------------------------- /tess-data/configs/bigram: -------------------------------------------------------------------------------- 1 | load_bigram_dawg True 2 | tessedit_enable_bigram_correction True 3 | tessedit_bigram_debug 3 4 | save_raw_choices True 5 | save_alt_choices True 6 | -------------------------------------------------------------------------------- /tess-data/configs/txt: 
-------------------------------------------------------------------------------- 1 | # This config file should be used with other cofig files which creates renderers. 2 | # usage example: tesseract eurotext.tif eurotext txt hocr pdf 3 | tessedit_create_txt 1 4 | -------------------------------------------------------------------------------- /tess-data/configs/ambigs.train: -------------------------------------------------------------------------------- 1 | tessedit_ambigs_training 1 2 | load_freq_dawg 0 3 | load_punc_dawg 0 4 | load_system_dawg 0 5 | load_number_dawg 0 6 | ambigs_debug_level 3 7 | load_fixed_length_dawgs 0 8 | -------------------------------------------------------------------------------- /tess-data/configs/lstm.train: -------------------------------------------------------------------------------- 1 | file_type .bl 2 | textord_fast_pitch_test T 3 | tessedit_zero_rejection T 4 | tessedit_minimal_rejection F 5 | tessedit_write_rep_codes F 6 | edges_children_fix F 7 | edges_childarea 0.65 8 | edges_boxarea 0.9 9 | tessedit_train_line_recognizer T 10 | textord_no_rejects T 11 | tessedit_init_config_only T 12 | -------------------------------------------------------------------------------- /tess-data/configs/box.train: -------------------------------------------------------------------------------- 1 | disable_character_fragments T 2 | file_type .bl 3 | textord_fast_pitch_test T 4 | tessedit_zero_rejection T 5 | tessedit_minimal_rejection F 6 | tessedit_write_rep_codes F 7 | edges_children_fix F 8 | edges_childarea 0.65 9 | edges_boxarea 0.9 10 | tessedit_resegment_from_boxes T 11 | tessedit_train_from_boxes T 12 | textord_no_rejects T 13 | -------------------------------------------------------------------------------- /tess-data/configs/box.train.stderr: -------------------------------------------------------------------------------- 1 | file_type .bl 2 | #tessedit_use_nn F 3 | textord_fast_pitch_test T 4 | tessedit_zero_rejection T 5 | tessedit_minimal_rejection F 6 | tessedit_write_rep_codes F 7 | edges_children_fix F 8 | edges_childarea 0.65 9 | edges_boxarea 0.9 10 | tessedit_resegment_from_boxes T 11 | tessedit_train_from_boxes T 12 | #textord_repeat_extraction F 13 | textord_no_rejects T 14 | -------------------------------------------------------------------------------- /tess-data/configs/Makefile.am: -------------------------------------------------------------------------------- 1 | datadir = @datadir@/tessdata/configs 2 | data_DATA = inter makebox box.train unlv ambigs.train lstm.train lstmdebug 3 | data_DATA += api_config kannada box.train.stderr quiet logfile digits get.images 4 | data_DATA += lstmbox wordstrbox 5 | # Configurations for OCR output. 
6 | data_DATA += alto hocr pdf tsv txt 7 | data_DATA += linebox rebox strokewidth bigram 8 | EXTRA_DIST = $(data_DATA) 9 | -------------------------------------------------------------------------------- /tess-data/configs/strokewidth: -------------------------------------------------------------------------------- 1 | textord_show_blobs 0 2 | textord_debug_tabfind 3 3 | textord_tabfind_show_partitions 1 4 | textord_tabfind_show_initial_partitions 1 5 | textord_tabfind_show_columns 1 6 | textord_tabfind_show_blocks 1 7 | textord_tabfind_show_initialtabs 1 8 | textord_tabfind_show_finaltabs 1 9 | textord_tabfind_show_strokewidths 1 10 | textord_tabfind_show_vlines 0 11 | textord_tabfind_show_images 1 12 | tessedit_dump_pageseg_images 0 13 | -------------------------------------------------------------------------------- /tess-data/configs/testspace: -------------------------------------------------------------------------------- 1 | chop_ok_split 1000 2 | tosp_rep_space 16 3 | textord_words_default_minspace 0.3 4 | textord_space_size_is_variable 1 5 | gapmap_use_ends 1 6 | textord_linespace_iqrlimit 0.8 7 | textord_overlap_x 1.375 8 | textord_words_width_ile 0.8 9 | textord_words_maxspace 16 10 | textord_words_default_maxspace 1.5 11 | textord_words_default_minspace 0.1 12 | textord_words_min_minspace 0.7 13 | textord_words_default_nonspace 0.1 14 | words_default_prop_nonspace 0.65 15 | words_default_fixed_space 0.95 16 | textord_spacesize_ratiofp 4.8 17 | textord_spacesize_ratioprop 1 18 | debug_fix_space_level 1 19 | tosp_enough_space_samples_for_median 8 20 | tosp_short_row 30 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The cute collection can combine subtitles from various files and sync them. 2 | Copyright (C) 2021 Anders Jensen 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU Affero General Public License as 6 | published by the Free Software Foundation, either version 3 of the 7 | License, or (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU Affero General Public License for more details. 13 | 14 | You should have received a copy of the GNU Affero General Public License 15 | along with this program. If not, see . 
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-buster 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | 5 | RUN apt-get update \ 6 | && apt-get install -y tesseract-ocr ffmpeg \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | RUN pip install -U setuptools pip wheel 10 | RUN pip install -U ffmpeg-python click guessit opencv-python librosa \ 11 | pysubs2 scikit-image jinja2 lxml tqdm pyxdameraulevenshtein \ 12 | textblob jinja2 pytesseract lxml annoy 13 | 14 | RUN mkdir /code 15 | ADD tess-data /code/tess-data 16 | COPY cartonizer.py cowocr.py milksync.py /code/ 17 | 18 | RUN echo '#!/bin/bash\npython3 /code/cartonizer.py "$@"' > /usr/bin/cartonizer && \ 19 | echo '#!/bin/bash\npython3 /code/cowocr.py "$@"' > /usr/bin/cowocr && \ 20 | echo '#!/bin/bash\npython3 /code/milksync.py "$@"' > /usr/bin/milksync && \ 21 | chmod +x /usr/bin/cartonizer /usr/bin/cowocr /usr/bin/milksync 22 | 23 | RUN mkdir /workdir 24 | WORKDIR /workdir -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | .env 141 | 142 | milk-temp 143 | cow-temp 144 | -------------------------------------------------------------------------------- /cartonizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import shlex 5 | import sys 6 | from pathlib import Path 7 | from pprint import pprint 8 | 9 | import click 10 | import ffmpeg 11 | import guessit 12 | 13 | KNOWN_SUBTITLE_EXTENSIONS = [".ass"] 14 | KNOWN_EXTENSIONS = [".mp4", ".mkv", ".ogm", ".avi"] 15 | VIDEO_MAPPING = { 16 | ("hevc", "Main 10"): "HEVC 10-bit", 17 | ("hevc", "Rext"): "HEVC 12-bit", 18 | ("h264", "High"): "h264", 19 | ("h264", "Main"): "h264", 20 | ("h264", "High 10"): "h264 10-bit", 21 | } 22 | VIDEO_RESOLUTION_MAPPING = { 23 | 1080: "1080p", 24 | 1088: "1080p", 25 | } 26 | AUDIO_MAPPING = {"flac": "FLAC", "aac": "AAC", "dts": "DTS-HDMA", "ac3": "AC3"} 27 | SOURCE_MAPPING = {"Blu-ray": "BD", "DVD": "DVD"} 28 | 29 | OP_CHAPTER_NAMES = ["OP", "Episode"] 30 | ED_CHAPTER_NAMES = ["ED", "Preview"] 31 | 32 | CHROMA_GENERATE_PARAM = "--only-generate-chroma" 33 | 34 | 35 | def map_episode_files(paths): 36 | episode_mapping = {} 37 | for path in paths: 38 | for f in path.iterdir(): 39 | if not f.is_file(): 40 | continue 41 | if not f.suffix.lower() in KNOWN_EXTENSIONS: 42 | continue 43 | info = guessit.guessit(f.name) 44 | episode = info.get("episode") 45 | if episode is None: 46 | episode = info.get("episode_title") 47 | if episode is not None: 48 | episode = int(episode.split(" ")[0].split("v")[0]) 49 | 50 | if episode is None: 51 | re_episode = re.findall("第(\d+)話", f.name) 52 | if not re_episode: 53 | continue 54 | episode = int(re_episode[0]) 55 | if isinstance(episode, list): 56 | episode = episode[-1] 57 | episode_mapping.setdefault(episode, []).append(f) 58 | return episode_mapping 59 | 60 | 61 | @click.group() 62 | def cli(): 63 | pass 64 | 65 | 66 | @cli.command() 67 | @click.argument("path", type=click.Path(exists=True), nargs=-1, required=True) 68 | @click.option("--input-subtitle-path", type=click.Path(exists=True)) 69 | @click.option("--op-ed-path", type=click.Path(exists=True), multiple=True) 70 | @click.option("--group", type=str) 71 | @click.option("--source", type=str) 72 | @click.option("--audio", type=str) 73 | @click.option("--title", type=str) 74 | @click.option("--dual-audio", is_flag=True) 75 | @click.option("--skip-chapters", is_flag=True) 76 | @click.option("--pre-generate-chroma", is_flag=True) 77 | @click.option("--skip-copy-oped", is_flag=True) 78 | @click.option("--additional-params", type=str) 79 | @click.option("--folder-name", type=str) 80 | @click.option("--file-name-template", type=str) 81 | @click.option("--output-subtitles-path", type=click.Path()) 82 | def sync( 83 | path, 84 | input_subtitle_path, 85 | op_ed_path, 86 | 
group, 87 | source, 88 | audio, 89 | title, 90 | dual_audio, 91 | skip_chapters, 92 | pre_generate_chroma, 93 | skip_copy_oped, 94 | additional_params, 95 | folder_name, 96 | file_name_template, 97 | output_subtitles_path, 98 | ): 99 | command_path = ( 100 | f"{sys.executable} {(Path(__file__).parent / 'milksync.py').absolute()}" 101 | ) 102 | 103 | if output_subtitles_path: 104 | output_subtitles_path = Path(output_subtitles_path) 105 | output_subtitles_path.mkdir(parents=True, exist_ok=True) 106 | 107 | external_subtitles = {} 108 | if input_subtitle_path is not None: 109 | for s in Path(input_subtitle_path).iterdir(): 110 | if s.suffix.lower() not in KNOWN_SUBTITLE_EXTENSIONS: 111 | continue 112 | external_subtitles[s.stem] = s 113 | 114 | paths = [Path(p) for p in path] 115 | if op_ed_path: 116 | op_ed_paths = [Path(p) for p in op_ed_path] 117 | else: 118 | op_ed_paths = [] 119 | 120 | episode_mapping = map_episode_files(paths) 121 | first_episode = sorted(episode_mapping.items())[0][1] 122 | probe_result = ffmpeg.probe(first_episode[-1]) 123 | release_name = { 124 | "show_name": title or guessit.guessit(first_episode[0].name)["title"], 125 | } 126 | if source: 127 | release_name["source"] = source 128 | else: 129 | release_name["source"] = SOURCE_MAPPING[ 130 | guessit.guessit(first_episode[-1].name)["source"] 131 | ] 132 | if audio: 133 | release_name["audio"] = audio 134 | for stream in probe_result["streams"]: 135 | if stream["codec_type"] == "video" and "video" not in release_name: 136 | key = (stream["codec_name"], stream["profile"]) 137 | if key not in VIDEO_MAPPING: 138 | click.echo(f"Unknown video key {key=}") 139 | quit(1) 140 | release_name["video"] = VIDEO_MAPPING[key] 141 | release_name["video_resolution"] = VIDEO_RESOLUTION_MAPPING.get( 142 | stream["coded_height"], 143 | f"{stream['coded_width']}x{stream['coded_height']}", 144 | ) 145 | elif stream["codec_type"] == "audio" and "audio" not in release_name: 146 | key = stream["codec_name"] 147 | if key not in AUDIO_MAPPING: 148 | click.echo(f"Unknown audio key {key=}") 149 | quit(1) 150 | release_name["audio"] = AUDIO_MAPPING[key] 151 | 152 | if not folder_name: 153 | folder_name = f"{group and '[' + group + '] ' or ''}{release_name['show_name']} ({release_name['source']} {release_name['video_resolution']} {release_name['video']} {release_name['audio']}{dual_audio and ' Dual-Audio' or ''})" 154 | if not file_name_template: 155 | file_name_template = f"{group and '[' + group + '] ' or ''}{release_name['show_name']} - %s ({release_name['source']} {release_name['video_resolution']} {release_name['video']} {release_name['audio']}{dual_audio and ' Dual-Audio' or ''})" 156 | click.echo(f"Folder name: {folder_name}") 157 | click.echo(f"File name template: {file_name_template}") 158 | 159 | copy_files = [] 160 | 161 | endings = [] 162 | openings = [] 163 | for op_ed_path in op_ed_paths: 164 | for f in op_ed_path.iterdir(): 165 | if "NCOP" in f.name: 166 | click.echo(f"Found OP {f.name}") 167 | openings.append(f) 168 | elif "NCED" in f.name: 169 | click.echo(f"Found ED {f.name}") 170 | endings.append(f) 171 | 172 | op_ed_chapter_command = [] 173 | if openings or endings: 174 | for i, opening in enumerate(sorted(openings, key=lambda f: f.name), 1): 175 | if not skip_chapters: 176 | op_ed_chapter_command.append( 177 | f"--chapter-segment-file '{str(opening)}' --chapter-segment-name-start '{OP_CHAPTER_NAMES[0]}' --chapter-segment-name-end '{OP_CHAPTER_NAMES[1]}'" 178 | ) 179 | name = "NCOP" 180 | if len(openings) > 1: 181 | name += 
str(i) 182 | if not skip_copy_oped: 183 | copy_files.append( 184 | (str(opening), f"{folder_name}/{file_name_template % name}.mkv") 185 | ) 186 | 187 | for i, ending in enumerate(sorted(endings, key=lambda f: f.name), 1): 188 | if not skip_chapters: 189 | op_ed_chapter_command.append( 190 | f"--chapter-segment-file '{str(ending)}' --chapter-segment-name-start '{ED_CHAPTER_NAMES[0]}' --chapter-segment-name-end '{ED_CHAPTER_NAMES[1]}'" 191 | ) 192 | name = "NCED" 193 | if len(endings) > 1: 194 | name += str(i) 195 | if not skip_copy_oped: 196 | copy_files.append( 197 | (str(ending), f"{folder_name}/{file_name_template % name}.mkv") 198 | ) 199 | 200 | op_ed_chapter_command = "".join([f" {cmd} \\\n" for cmd in op_ed_chapter_command]) 201 | episode_num_length = max(max(len(str(k)) for k in episode_mapping.keys()), 2) 202 | 203 | output_file = [] 204 | 205 | chroma_files = [] 206 | for files in episode_mapping.values(): 207 | for f in files: 208 | chroma_files.append(f"'{str(f)}'") 209 | if pre_generate_chroma: 210 | output_file.append("echo 'Generating chroma'") 211 | output_file.append( 212 | f"{command_path} {CHROMA_GENERATE_PARAM} {' '.join(chroma_files)}" 213 | ) 214 | output_file.append(f"mkdir -p '{folder_name}'") 215 | if additional_params: 216 | additional_params = f" {additional_params} \\\n" 217 | else: 218 | additional_params = "" 219 | 220 | for episode, files in sorted(episode_mapping.items()): 221 | if len(files) < 2: 222 | click.echo(f"Skipping episode {episode}") 223 | continue 224 | external_subtitle = "" 225 | if files[0].stem in external_subtitles: 226 | external_subtitle = f" --input-external-subtitle-track {shlex.quote(str(external_subtitles[files[0].stem]))} \\\n" 227 | output_file.append("echo ''") 228 | output_file.append(f"echo 'Handling episode {episode}'") 229 | if output_subtitles_path: 230 | output_subtitles = f" --output-subtitle {shlex.quote(str(output_subtitles_path / files[-1].with_suffix('.subtitle').name))} \\\n" 231 | else: 232 | output_subtitles = "" 233 | files = "".join([f" {shlex.quote(str(f))} \\\n" for f in files]) 234 | output_file.append( 235 | f"{command_path} \\\n{files}{op_ed_chapter_command}{external_subtitle}{additional_params}{output_subtitles} --output '{folder_name}/{file_name_template % str(episode).zfill(episode_num_length)}.mkv'" 236 | ) 237 | output_file.append("echo ''") 238 | output_file.append("echo 'Copying files'") 239 | for (src_f, dst_f) in copy_files: 240 | if not src_f.lower().endswith(".mkv"): 241 | click.echo("Copy file is not an mkv") 242 | quit(1) 243 | output_file.append(f"cp {shlex.quote(src_f)} {shlex.quote(dst_f)}") 244 | 245 | Path("create_release.sh").write_text("\n".join(output_file)) 246 | click.echo("Release file created, run: bash create_release.sh") 247 | 248 | 249 | @cli.command() 250 | @click.argument("subbed_path", type=click.Path(exists=True), required=True) 251 | @click.argument("unsubbed_path", type=click.Path(exists=True), required=True) 252 | @click.option("--additional-params", type=str) 253 | def ocr( 254 | subbed_path, 255 | unsubbed_path, 256 | additional_params, 257 | ): 258 | paths = [Path(subbed_path), Path(unsubbed_path)] 259 | command_path = ( 260 | f"{sys.executable} {(Path(__file__).parent / 'cowocr.py').absolute()}" 261 | ) 262 | 263 | output_file = [] 264 | 265 | episode_mapping = map_episode_files(paths) 266 | 267 | for episode, files in sorted(episode_mapping.items()): 268 | if len(files) < 2: 269 | click.echo(f"Skipping episode {episode}") 270 | continue 271 | output_file.append("echo ''") 
272 | output_file.append(f"echo 'Handling episode {episode}'") 273 | files = "".join([f" {shlex.quote(str(f))} \\\n" for f in files]) 274 | output_file.append( 275 | f"{command_path} \\\n{files} extract-subtitles \\\n {additional_params or ''}" 276 | ) 277 | output_file.append(f"{command_path} \\\n{files} create-report") 278 | 279 | output_file.append("") 280 | Path("ocr_release.sh").write_text("\n".join(output_file)) 281 | print("OCR script file created, run: bash ocr_release.sh") 282 | 283 | 284 | if __name__ == "__main__": 285 | cli() 286 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The Cute Collection 2 | 3 | This is a bunch of tools to sync subtitles and audio tracks automatically. 4 | 5 | ## Requirements 6 | 7 | * Linux 8 | * 10GB of available memory 9 | * ffmpeg and ffprobe in environment path 10 | * Python 3.8+ (might work with earlier versions, not tested though) 11 | * Tesseract in environment path if you want to OCR 12 | 13 | ## Installation Instructions 14 | 15 | ```bash 16 | cd ~ 17 | git clone https://github.com/JohnDoee/the-cute-collection.git the-cute-collection 18 | cd the-cute-collection 19 | 20 | python3 -m venv .env 21 | 22 | .env/bin/pip install -U setuptools pip wheel 23 | .env/bin/pip install ffmpeg-python click guessit opencv-python librosa pysubs2 scikit-image jinja2 lxml tqdm pyxdameraulevenshtein textblob jinja2 pytesseract lxml annoy 24 | ``` 25 | 26 | ## Docker Installation and Usage Instructions 27 | 28 | A Docker image is available too; the commands look like this: 29 | 30 | cartonizer: ```docker run -ti --rm -v `pwd`:/workdir/ johndoee/the-cute-collection cartonizer``` 31 | cowocr: ```docker run -ti --rm -v `pwd`:/workdir/ johndoee/the-cute-collection cowocr``` 32 | milksync: ```docker run -ti --rm -v `pwd`:/workdir/ johndoee/the-cute-collection milksync``` 33 | 34 | To execute a command file generated by cartonizer, use this syntax: 35 | 36 | ```docker run -ti --rm -v `pwd`:/workdir/ johndoee/the-cute-collection bash ocr_release.sh``` 37 | 38 | ## Cartonizer 39 | 40 | Generates a script for milksync so you can do bulk operations instead of running files one by one. 41 | 42 | ### How to use for automatic subtitle sync (milksync) 43 | 44 | The most basic usage is: 45 | 46 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cartonizer.py sync path-to-subbed path-to-unsubbed` 47 | 48 | Make sure the files are correctly matched; sometimes it picks up files it should not. 49 | 50 | This will generate a bash script called `create_release.sh`; just run it with `bash create_release.sh` and wait. 51 | 52 | Below is a description of all arguments and how and when to use them. 53 | 54 | #### --op-ed-path 55 | 56 | Look for OP and ED in the given path and use them to auto-generate chapters and copy them to the result path. 57 | 58 | Example: `--op-ed-path Unsubbed-Files/NC-OP-ED-Folder/` - looks for OP/ED in the specified folder. 59 | 60 | #### --group 61 | 62 | Put a group name in the result folder name. 63 | 64 | Example: `--group Horse` - Sets the group name to Horse and prefixes the folder name and files with it. 65 | 66 | #### --source 67 | 68 | Specify the source of the video track, e.g. BD for Blu-ray. Will be auto-detected if not specified. 69 | 70 | Example: `--source VHS` - sets the source to the text string VHS. 71 | 72 | #### --audio 73 | 74 | Same as source but for the audio track.
75 | 76 | Example: `--audio Opus` - sets the audio source to the text string Opus. 77 | 78 | #### --title 79 | 80 | Sets the title of the release. 81 | 82 | Example: `--title Big Buck Bunny` - sets the title to the text string Big Buck Bunny. 83 | 84 | #### --dual-audio 85 | 86 | Marks the release as Dual-Audio. 87 | 88 | Example: `--dual-audio` - Adds the text Dual-Audio to the release. 89 | 90 | #### --skip-chapters 91 | 92 | Skips adding chapters based on the OP-ED specified with `--op-ed-path`. 93 | This is useful if you want to copy the NC-OP-ED files but copy the chapters from a release. 94 | 95 | Example: `--skip-chapters` - Instructs milksync to not assign chapters from OP & ED. 96 | 97 | #### --pre-generate-chroma 98 | 99 | Pre-generate chromas; this can sometimes speed up the overall process but is not recommended. 100 | 101 | Example: `--pre-generate-chroma` - Adds a line to the script that pre-generates chromas. 102 | 103 | #### --skip-copy-oped 104 | 105 | Skips copying the OP-ED specified with `--op-ed-path`. This is useful if you created the files yourself just to assign the chapters. 106 | 107 | Example: `--skip-copy-oped` - Cartonizer does not add the line to copy the files to the release folder. 108 | 109 | #### --additional-params 110 | 111 | Pass additional arguments to `milksync.py`. 112 | 113 | Example: `--additional-params '--chapter-beginning Intro'` - Tells milksync to add a chapter to the beginning of the file. 114 | 115 | See milksync arguments for more. 116 | 117 | #### --folder-name 118 | 119 | Instead of auto-generating a folder name, use this name. 120 | 121 | Example: `--folder-name 'Happy Bunnies Riding The Wave (DVD)'` 122 | 123 | #### --file-name-template 124 | 125 | Instead of auto-generating a file name template, use this template. 126 | Must have a %s where the episode number is placed. 127 | 128 | Example: `--file-name-template 'Happy Bunnies Riding The Wave (DVD) %s'` 129 | 130 | ### How to use for ocr (cowocr) 131 | 132 | The most basic usage is: 133 | 134 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cartonizer.py ocr path-to-subbed path-to-unsubbed` 135 | 136 | Make sure the files are correctly matched; sometimes it picks up files it should not. 137 | 138 | This will generate a bash script called `ocr_release.sh`; just run it with `bash ocr_release.sh` and wait. 139 | 140 | Below is a description of all arguments and how and when to use them. 141 | 142 | #### --additional-params 143 | 144 | Pass additional arguments to `cowocr.py`. 145 | 146 | Example: `--additional-params '--threads 1 --run-subregions-in-parallel'` - Tells cowocr to use one thread per subtitle region and run every subregion in parallel. 147 | 148 | See cowocr arguments for more. 149 | 150 | ### FAQ 151 | 152 | #### There is no OP/ED to assign chapters from (or it fails to use existing OP/ED), what do I do?
153 | 154 | The easiest way right now is to extract them manually. For example, if you have a file named `Big Buck Bunny 01.mkv` and the chapters in the file are: 155 | 156 | * Opening: starts at 00:01:27.062 and stops at 00:02:57.123 157 | * Ending: starts at 00:22:11.362 and stops at 00:23:33.333 158 | 159 | Extract them with: 160 | ``` 161 | mkdir extracted 162 | ffmpeg -i 'Big Buck Bunny 01.mkv' -ss 00:01:27.062 -to 00:02:57.123 -map 0:a:0 extracted/NCOP-01.mkv 163 | ffmpeg -i 'Big Buck Bunny 01.mkv' -ss 00:22:11.362 -to 00:23:33.333 -map 0:a:0 extracted/NCED-01.mkv 164 | ``` 165 | 166 | These can then be used with `--op-ed-path extracted/ --skip-copy-oped` 167 | 168 | ## Milksync 169 | 170 | Compare audio tracks between two files and take subtitles and audio tracks from one and add them to another. The intention is to remove the tedious work of manually aligning subtitles to a new file and give a far more exact 171 | result. 172 | 173 | ### How to use 174 | 175 | The most basic usage is: 176 | 177 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/milksync.py path-to-subbed/episode-01.mkv path-to-unsubbed/episode-01.mkv --output merged-episode-01.mkv` 178 | 179 | This will take video and audio from the last file and put subtitles from the first file into the merged file. 180 | 181 | The command prints out information about what is going on, e.g. where chapters are placed and how much the subtitles are moved. 182 | Make sure to check the result, especially around the breakpoints. WARNINGs can also be a hint about what might be wrong with the resulting file. 183 | 184 | Remember, you can always modify a command and just run it again to see what happens. The second run is normally faster than the first, too. 185 | Sometimes experimenting can help you on your way. 186 | 187 | Below is a description of all arguments and how and when to use them. 188 | 189 | #### --only-generate-chroma 190 | 191 | Only extract audio from the file and generate the chroma index; this can sometimes be used to speed up the overall process, but it is not recommended. (A conceptual sketch of chroma-based alignment is included after the `--subtitle-cutoff` section below.) 192 | 193 | Example: `--only-generate-chroma` - Quits after extracting chroma. 194 | 195 | #### --sync-using-subtitle-audio 196 | 197 | Use the audio where the subtitles run to sync a specific line. Good when the video is partial or re-arranged. Bad for audio syncs. 198 | 199 | Example: `--sync-using-subtitle-audio` - Enable the sync feature. 200 | 201 | #### --skip-subtitles 202 | 203 | Do not copy any subtitles; can be used for e.g. dub-only releases or subtitles from another source that are not to be synced this way. 204 | 205 | Example: `--skip-subtitles` - No subtitles are copied. 206 | 207 | #### --skip-shift-point 208 | 209 | The script prints out the points it uses to shift the subtitles; sometimes one or more of them might be bad, or you want to see what happens with them removed. They are index based and you have to count the index yourself from the milksync output. 210 | 211 | Generally not used. 212 | 213 | Example: `--skip-shift-point 2,3` - Skips shift points 2 and 3. 214 | 215 | #### --subtitle-cutoff 216 | 217 | If the subtitles start too early or run too long, this command can cut off subtitles to prevent this. The command takes a number in seconds that can be both positive (count from the beginning of the video result file) and negative (count from the end of the video result file). 218 | 219 | Example: `--subtitle-cutoff -50` - The last 50 seconds of the result will not have any subtitles. 220 | Example: `--subtitle-cutoff 30` - The first 30 seconds of the result will not have any subtitles.
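Here is a rough, hypothetical sketch of what chroma-based audio alignment looks like, to make the `--only-generate-chroma` index and the shift points a bit more concrete. This is not milksync's actual implementation; the WAV file names are placeholders for audio tracks already extracted with ffmpeg, and the real tool computes several shift points across the file rather than one global offset.

```python
# Illustrative sketch only -- not milksync's real algorithm.
# Assumes librosa and numpy (both in the project's requirements) and two
# audio tracks already extracted to WAV with ffmpeg (hypothetical file names).
import librosa
import numpy as np


def estimate_offset_seconds(reference_wav, target_wav, hop_length=512):
    """Estimate how far the target audio is shifted relative to the reference."""
    ref, sr = librosa.load(reference_wav, sr=22050, mono=True)
    tgt, _ = librosa.load(target_wav, sr=22050, mono=True)

    # Chroma features summarise the harmonic content of each frame and are
    # fairly robust to encoding differences between two releases of the same audio.
    ref_chroma = librosa.feature.chroma_stft(y=ref, sr=sr, hop_length=hop_length)
    tgt_chroma = librosa.feature.chroma_stft(y=tgt, sr=sr, hop_length=hop_length)

    # Cross-correlate the per-frame chroma energy to find the best lag.
    ref_sig = ref_chroma.mean(axis=0) - ref_chroma.mean()
    tgt_sig = tgt_chroma.mean(axis=0) - tgt_chroma.mean()
    corr = np.correlate(ref_sig, tgt_sig, mode="full")
    lag_frames = int(corr.argmax()) - (len(tgt_sig) - 1)

    return lag_frames * hop_length / sr


print(estimate_offset_seconds("subbed-audio.wav", "unsubbed-audio.wav"))
```

Milksync finds several such alignment points across the file and handles the bookkeeping for you, so treat this only as an illustration of the idea behind the printed shift points.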
221 | 222 | #### --only-delta 223 | 224 | Instead of putting subtitles into buckets and adjusting them to fit, just modify the timestamps on the subtitles. 225 | This one is very useful if one input runs faster or slower than the other. This can often be seen in the milksync output as a lot of sync points that either decrease or increase in delta. 226 | 227 | Example: `--only-delta` - Enable delta mode instead of subtitle bucket mode. 228 | 229 | #### --align-framerate 230 | 231 | Align the source framerate to the target video framerate, for when speedup/slowdown was used as the technique to change framerate. 232 | 233 | Example: `--align-framerate` - Enable the feature and change the source framerate to the target framerate. 234 | 235 | #### --align-frames-too 236 | 237 | When using `--only-delta` it can be helpful to look at frames too to find a better difference. 238 | 239 | Example: `--only-delta --align-frames-too` - Enables frame alignment. 240 | 241 | #### --preserve-silence 242 | 243 | When extracting chroma from the files, the audio at the end is trimmed to prevent silence from blocking alignment; this disables that feature. 244 | 245 | Example: `--preserve-silence` - Preserves silence. 246 | 247 | #### --temp-folder 248 | 249 | Where to save temporary files; this includes extracted audio and subtitle tracks as well as chroma generated from the audio files. 250 | 251 | Example: `--temp-folder '/tmp/milk-temp/'` - Saves temp files to the specified folder. 252 | 253 | #### --audio-tracks 254 | 255 | Define which audio tracks to use for syncing audio tracks. Milksync only works when the audio tracks in the input files are the same, i.e. the same language. If you take e.g. English and Japanese audio tracks and try to use them, the results will vary quite a bit and likely not be very good. 256 | 257 | The input file index is defined by the order the files are given to milksync. 258 | 259 | Example: `--audio-tracks 0:1,1:0` - Use audio track 1 from input file 0 and audio track 0 from input file 1. 260 | 261 | #### --adjust-shift-point 262 | 263 | Manually change a shift point. Can be used if the auto detection is not good enough, or to modify a point so it works correctly. This is mostly used for debugging. 264 | 265 | Example: `--adjust-shift-point 0.3:10.3:1.3:11.3` - Set the first shift point to the specified values. The order is the same as printed by milksync. 266 | 267 | #### --adjust-delay 268 | 269 | Manually adjust the delay applied to all points. Can be used for debugging. 270 | 271 | Example: `--adjust-delay 0.3` - Adds 0.3 seconds to every subtitle. 272 | 273 | #### --sync-non-dialogue-to-video 274 | 275 | Sometimes the audio has been resynced to the video, which means the speech subtitles and the sign subtitles must be synced independently. 276 | This flag tries to align signs to the video and speech to the audio, which can be useful when the target is e.g. remastered. It can be very slow and quality can vary; the result is printed and you should check if the signs are positioned correctly. 277 | 278 | Example: `--sync-non-dialogue-to-video 0-1000` - Enables this feature for the given range of seconds. 279 | 280 | #### --chapter-source 281 | 282 | Specify which source file index to pull chapters from; these are synced in the same way as the audio tracks. 283 | 284 | If nothing chapter-related is specified, they are pulled from the video source, i.e. the last file. 285 | 286 | Example: `--chapter-source 0` - Take chapters from input file 0. 287 | 288 | #### --chapter-beginning 289 | 290 | Add a chapter to the beginning of the result.
This means every part of the result will be part of a chapter. 291 | 292 | Example: `--chapter-beginning Beginning` - The first chapter at 00:00 is named Beginning. 293 | 294 | #### --chapter-segment-file 295 | 296 | Source file to generate a chapter from; this is a part of the video that is searched for in the target file. Useful for e.g. openings or endings. 297 | 298 | This is used in conjunction with `--chapter-segment-name-start` and `--chapter-segment-name-end`. Order matters and each `--chapter-segment-file` must have a `--chapter-segment-name-start` and `--chapter-segment-name-end`. 299 | 300 | Example: `--chapter-segment-file NCED-01.mkv` - Match the content of NCED-01.mkv to the result video and add chapters if found. 301 | 302 | #### --chapter-segment-name-start 303 | 304 | Name of the chapter starting where the beginning of `--chapter-segment-file` is matched. 305 | 306 | Example: `--chapter-segment-name-start End` - Names the chapter that matches the beginning of `--chapter-segment-file` End. 307 | 308 | #### --chapter-segment-name-end 309 | 310 | Name of the chapter starting where the end of `--chapter-segment-file` is matched. 311 | 312 | Example: `--chapter-segment-name-end 'After End'` - Names the chapter that matches the end of `--chapter-segment-file` After End. 313 | 314 | #### --chapter-segment-required 315 | 316 | Enforces that every chapter segment must be matched. 317 | 318 | Example: `--chapter-segment-required` - If a chapter segment is not matched, it will quit with an error. 319 | 320 | #### --metadata-audio-track 321 | 322 | Manually set metadata for an audio track; this is passed directly to ffmpeg. These match the output mapping and not the input mapping. 323 | 324 | Example: `--metadata-audio-track 0=language=jpn --metadata-audio-track 0=title='Japanese' --metadata-audio-track 1=language=fra --metadata-audio-track 1=title='Bad french'` - Sets the first output audio track metadata to Japanese with a matching title and the second audio track to French with a matching title. 325 | 326 | #### --metadata-subtitle-track 327 | 328 | Manually set metadata for a subtitle track; this is passed directly to ffmpeg. These match the output mapping and not the input mapping. 329 | 330 | Example: `--metadata-subtitle-track 0=language=jpn --metadata-subtitle-track 0=title='Japanese' --metadata-subtitle-track 1=language=fra --metadata-subtitle-track 1=title='Bad french'` - Sets the first output subtitle track metadata to Japanese with a matching title and the second subtitle track to French with a matching title. 331 | 332 | #### --subtitle-min-font-size 333 | 334 | Increase the font size to at least this value. Sometimes subtitles are unreadably small in the source. 335 | 336 | Example: `--subtitle-min-font-size 26` - Sets the font size to a minimum of 26. 337 | 338 | #### --input-external-subtitle-track 339 | 340 | Use a specific external subtitle in the output; it is assumed to match video input 0. 341 | 342 | Example: `--input-external-subtitle-track subtitles.ass` - Assumes the subtitle matches input 0 and syncs it to the output. 343 | 344 | #### --output-video-file-index 345 | 346 | Which file to pull video data from; this defaults to the last specified file and is normally not needed. 347 | 348 | Example: `--output-video-file-index 1` - Pull video data from the second input file. 349 | 350 | #### --output-audio-mapping 351 | 352 | Define which audio tracks the output has and where to pull them from. Defaults to using only the first audio track from the last input file, the same source as the video.
353 | 354 | Example: `--output-audio-mapping 0:0,1:2` - Takes the first audio track from the first input file and the third audio track from the second input file. The result file's first audio track is 0:0 and the second is 1:2. 355 | 356 | #### --output-subtitle-mapping 357 | 358 | Define which subtitle tracks the output has and where to pull them from. Defaults to using only the first subtitle track from the first input file. 359 | 360 | Example: `--output-subtitle-mapping 1:1,1:0` - Takes the first and the second subtitle track from the second input file. The order is as specified, i.e. the tracks are flipped. 361 | 362 | #### --output 363 | 364 | Where to save the result. 365 | 366 | Example: `--output Result-EP01.mkv` - Saves the complete file to Result-EP01.mkv. 367 | 368 | #### --output-subtitle 369 | 370 | Save the synced subtitles. 371 | 372 | Example: `--output-subtitle Result-EP01.ass` - Saves the subtitle file to Result-EP01.ass. 373 | 374 | ## CowOCR 375 | 376 | Compare two video tracks and look for differences. The intention is to find differences as they will indicate e.g. subtitles and signs. 377 | 378 | The output is an .ass file and a report that can be used to verify and correct the output. 379 | 380 | ### How it works 381 | 382 | The basic assumption that CowOCR relies on is finding the differences between the source and destination video. To do this it goes through a few steps. 383 | 384 | The initial differences are found by running the ORB algorithm against both the source and target video; keypoints found in the source but not in the target are assumed to be differences. 385 | 386 | We now have a region we can assume is different, and we look for text in it. A threshold algorithm is run against the source and matching white areas are extracted. 387 | 388 | To determine what is part of the text, the color of every found area is extracted and grouped using k-means. Areas with colors close enough to the majority color found are considered part of the text. Additionally, the border color is used in the same way. 389 | 390 | A brute-force search is performed here to find the best text mask by cycling through the colors. 391 | 392 | With the text found and a mask matching the text (where it is in the picture), it is now time to figure out when it starts and ends. This is done by looping through the frames before and after the current frame and checking whether the colors match the extracted text, i.e. whether the same text is present in the frames before and after the current frame. 393 | 394 | ### How to use 395 | 396 | The most basic usage of extract-subtitles is: 397 | 398 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py path-to-subbed/episode-01.mkv path-to-unsubbed/episode-01.mkv extract-subtitles` 399 | 400 | This will compare the two video files and try to extract the subtitles. 401 | 402 | After the subtitles are extracted, a report plus an .ass file can be created from the output with the create-report command. 403 | 404 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py path-to-subbed/episode-01.mkv path-to-unsubbed/episode-01.mkv create-report` 405 | 406 | The report and subtitle are, by default, located in the cow-temp folder, which is created relative to where the command was executed. 407 | In this example, that is the folder that contains the path-to-subbed and path-to-unsubbed folders. 408 | 409 | Verify the subtitles and we're almost done; now we just need them merged.
For this we can use cartonizer with just one additional parameter, `--input-subtitle-path cow-temp/` - that will pull the subtitles from the .ass files instead of the source video file. 410 | 411 | This is likely not how the actual workflow will look. See further down for a realistic workflow. 412 | 413 | Below is a description of all arguments and how and when to use them. 414 | 415 | ### extract-subtitles arguments 416 | 417 | This command extracts the subtitles from the video. 418 | 419 | #### --threads 420 | 421 | How many threads to extract subtitles with. Unless specified, it runs one subtitle region at a time. 422 | 423 | Example: `--threads 1` - Use only one thread. 424 | 425 | #### --tesseract-data-path 426 | 427 | Path to the Tesseract data directory. 428 | 429 | Example: `--tesseract-data-path tess-data/` - Read data from the tess-data folder. 430 | 431 | #### --frame-diff 432 | 433 | When comparing source and target video it can sometimes be necessary to specify the frame difference. It should be sufficient to rely 434 | on the auto detection though. 435 | 436 | Example: `--frame-diff 8` - The target is 8 frames ahead of the source. 437 | 438 | #### --frame-range 439 | 440 | Specify which frames of the source to extract subtitles from; can be useful to e.g. skip the OP/ED. 441 | 442 | Example: `--frame-range 1000-5000` - Extracts subtitles from frame 1000 to 5000. 443 | 444 | #### --ignore-diff-fps 445 | 446 | As it uses frame differences to find subtitles, the FPS must be the same. Sometimes this can be ignored (e.g. if the source just runs faster but has the same frames). This option makes it ignore the criterion. 447 | 448 | Example: `--ignore-diff-fps` - Ignores FPS differences. 449 | 450 | #### --run-subregions-in-parallel 451 | 452 | Run extraction for each subtitle region in parallel. Each thread will run for each subtitle region, so the total number 453 | of threads will be threads times the subtitle region count. 454 | 455 | Example: `--run-subregions-in-parallel` - Run every subtitle region in parallel. 456 | 457 | #### --fix-broken-frame-alignment 458 | 459 | Sometimes frames drift a bit differently, so while the FPS is the same, one of the video files might have ghost frames or other annoyances. This tries to alleviate that issue. 460 | 461 | Example: `--fix-broken-frame-alignment` - Enable the frame alignment fix. 462 | 463 | #### --debug-frame 464 | 465 | While you are editing the subtitle region configuration it is necessary to try to extract a specific frame to see the result. 466 | This is the command for that. It will run the current subtitle region configuration for the given frame and save an array of outputs 467 | to the temp-folder/debug. 468 | 469 | It will also print out what the various files contain. 470 | 471 | Example: `--debug-frame 1000` - Try to extract subtitles from source frame 1000. 472 | 473 | #### --debug-subregion 474 | 475 | In combination with --debug-frame it will use a specific subtitle region. If not specified, defaults to the first subtitle region. 476 | 477 | Example: `--debug-subregion bottom` - Extract using the subtitle region named bottom. 478 | 479 | ### create-report arguments 480 | 481 | This command turns the extracted subtitles into a report and an .ass file. 482 | 483 | A report contains information for each subtitle region; this explanation is for the default config. 484 | The report is an HTML file you should open in your web browser, e.g. `cow-temp/Episode 1.avi-report/index.html`. In that report each region has two sections, "subtitle lines" and "missing regions".
485 | 486 | The "subtitle lines" are the found lines and these are reflected in the .ass file. 487 | With the bottom subtitles there are a few things: 488 | 489 | - A start and end timestamp of the subtitle 490 | - Start and end frame and the initial discovery frame. 491 | - The subtitle text 492 | - Four frames used to check if the timing is correct: before the first frame, the first frame, the last frame and after the last frame. If before-first or after-last contains the matching text, then the timing is off. 493 | 494 | The "missing regions" part contains images of areas where there are differences between source and target but the tool was unable to discover what exactly. Sometimes it is short words or un-OCRable subtitles. 495 | 496 | A subtitle-region scan does not yield the same type of results as it is unable to merge subtitle lines in the same way. It also contains green squares for matched text under the "subtitle signs" section. 497 | 498 | Make sure to browse through the "missing regions" section; no tool is perfect. 499 | 500 | #### --output-report-path 501 | 502 | Where to save the generated report. Defaults to the temp-dir. 503 | 504 | Example: `--output-report-path /mnt/sdd/subtitle-temp-reports` - Save the report to the specified path. 505 | 506 | #### --output-subtitle-path 507 | 508 | Where the .ass subtitle file is saved. Defaults to the temp-dir. 509 | 510 | Example: `--output-subtitle-path /mnt/sdd/subtitle-temp-subs` - Save the subtitles to the specified path. 511 | 512 | ### subtitle_regions.json 513 | 514 | This file is generated in the temp folder when the command is first run. Any video that uses a specific temp folder will use the same 515 | subtitle region file. A description of all available options can be found here. 516 | 517 | #### name 518 | 519 | Name of the subtitle region. Used with e.g. the --debug-subregion parameter. 520 | 521 | #### scan_mode 522 | 523 | Specify how to find subtitles in a region; there are two choices, `bottom_center` and `search_slice`. 524 | 525 | `bottom_center` looks for subtitles in the middle of the region and assumes there is at most one subtitle in the given region. 526 | Useful for normal subtitles at the bottom of the screen. 527 | 528 | `search_slice` looks around for differences that contain text, useful for e.g. signs. It cannot merge similar regions and can create a lot of duplicate lines. 529 | 530 | #### y, h, x, w, margin 531 | 532 | Specifies the dimensions of a subtitle region; it starts at `x`, `y` and ends at `x+w`, `y+h`. If you run with --debug-frame it will show where the regions are. 533 | 534 | The `margin` is the part of the region that cannot contain subtitles; any object that is part of it will be removed. Useful for the `bottom_center` scan mode where normal subtitles are not in the margin. 535 | 536 | #### area_min, area_max, area_min_density 537 | 538 | Minimum (`area_min`) and maximum (`area_max`) number of pixels a letter can contain, and the minimum density (`area_min_density`) a letter has. 539 | 540 | These can be useful to remove things that most certainly cannot be letters. 541 | 542 | #### max_w, max_h 543 | 544 | Maximum size of a letter in pixels. 545 | 546 | #### min_stroke_width, max_stroke_width 547 | 548 | Minimum and maximum stroke width a letter can have. These are measured at the thickest spot of a letter. 549 | 550 | For example, a long thin line will have a stroke width of 1px while a filled circle will have a width of its radius. 551 | 552 | #### border_size 553 | 554 | Assumed size of the border.
555 | 556 | This will often be either 1 or 2; it depends a bit on how the "Threshold" debug image looks, e.g. whether it consumes a lot of the border or not. 557 | 558 | See "How it works" to understand what it is useful for. 559 | 560 | #### max_text_diff, max_border_diff 561 | 562 | Maximum difference for text and border to be assumed part of the same text line. 563 | 564 | This depends a bit on how well the text is marked and extracted; if it finds too few letters it might be smart to turn these up, and vice versa if it finds too many. 565 | 566 | See "How it works" to understand what it is useful for. 567 | 568 | #### percent_good_border 569 | 570 | How much of the border of a given figure must be good for it to be assumed part of the text. 571 | 572 | See "How it works" to understand what it is useful for. 573 | 574 | #### edge_threshold 575 | 576 | Used when finding the difference between source and target frames. Should probably not be touched. 577 | 578 | See "How it works" to understand what it might be useful for. 579 | 580 | #### threshold_mode, threshold_value 581 | 582 | Method and value to threshold with. There are two modes, `adaptive` and `static`. 583 | 584 | `adaptive` finds out which pixel should be black and which should be white depending on the pixels around it. Can be useful if the inner text on the subtitles varies but is always bright. An example `threshold_value` for this could be 27, which will prevent most noise too. 585 | 586 | `static` is an absolute cutoff, useful if the inner subtitle text is always bright and the same color. An example `threshold_value` could be 200, which is the brightness cutoff. 587 | 588 | See "How it works" to understand what it is useful for and https://docs.opencv.org/4.5.2/d7/d4d/tutorial_py_thresholding.html for information about thresholding generally. 589 | 590 | #### ass_style_name 591 | 592 | Style name to use for text found here. 593 | 594 | #### invert_mode 595 | 596 | Not implemented, no effect. 597 | 598 | ### A realistic workflow 599 | 600 | In this example we have a set of 12 episodes we want to OCR; the source is 640x480, which matches the default subtitle region. 601 | Source files are located in `source-video` and the target files are in `target-video`. 602 | 603 | First we run cartonizer to create a batch script. 604 | 605 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cartonizer.py ocr source-video target-video --additional-params '--threads 1 --run-subregions-in-parallel'` 606 | 607 | This creates a file named `ocr_release.sh`, and it will be the script we run once we have modified `subtitle_regions.json` enough. 608 | 609 | We open up the `ocr_release.sh` file and find the OCR command for the first episode. We need the temp folder and configuration created before we can OCR it all. 610 | 611 | ``` 612 | ~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py \ 613 | 'source-video/Episode 1.mkv' \ 614 | 'target-video/Episode 1.mkv' \ 615 | extract-subtitles \ 616 | --threads 1 --run-subregions-in-parallel 617 | ``` 618 | 619 | That is the command that extracts subtitles from the first episode; it will be the one we use while modifying `subtitle_regions.json`. 620 | 621 | Let's see how good the default config is by running it against part of the episode; 5000 frames should suffice (that is about 3.5 minutes at 23.976 fps).
622 | 623 | ``` 624 | ~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py \ 625 | 'source-video/Episode 1.mkv' \ 626 | 'target-video/Episode 1.mkv' \ 627 | extract-subtitles \ 628 | --threads 1 --run-subregions-in-parallel \ 629 | --frame-range 5000-10000 # frame range we use for the test run 630 | ``` 631 | 632 | After it is done, create a report and see the result with: 633 | 634 | ``` 635 | ~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py \ 636 | 'source-video/Episode 1.mkv' \ 637 | 'target-video/Episode 1.mkv' \ 638 | create-report 639 | ``` 640 | 641 | The report is in the cow-temp folder in this example, along with an .ass file and the `subtitle_regions.json` file. 642 | 643 | To modify and test changes to `subtitle_regions.json` we find a good subtitle frame number in the report and use that. 644 | 645 | ``` 646 | ~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py \ 647 | 'source-video/Episode 1.mkv' \ 648 | 'target-video/Episode 1.mkv' \ 649 | extract-subtitles \ 650 | --threads 1 --run-subregions-in-parallel \ 651 | --debug-frame 13754 --debug-subregion bottom 652 | ``` 653 | 654 | We can then run the initial 5000 frames again and see if the result is good enough. If it is, then just run the whole `ocr_release.sh`. 655 | 656 | When it is done, the .ass file in the cow-temp folder must be corrected while following the report. I do this by loading the subtitle file and the source episode file into Aegisub. 657 | 658 | The subtitles must now be synced with the video, chapters added and so on. This can be done with Milksync. 659 | 660 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cartonizer.py sync source-video target-video --input-subtitle-path cow-temp/` 661 | 662 | Then run `create_release.sh` and you have a fully synced video.
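If you end up tweaking `subtitle_regions.json` repeatedly between `--debug-frame` runs, it can help to script the changes instead of editing by hand. The sketch below is only an illustration: it assumes the generated file is a JSON list of region objects using the keys described in the subtitle_regions.json section above, which may not match the exact layout CowOCR writes, and every number is a placeholder rather than a recommended value.

```python
# Hypothetical sketch: adjust one region in cow-temp/subtitle_regions.json.
# The real file layout may differ (a JSON list of region objects is assumed
# here); all numbers are placeholders, not recommended values.
import json
from pathlib import Path

config_path = Path("cow-temp/subtitle_regions.json")
regions = json.loads(config_path.read_text())

for region in regions:
    if region.get("name") == "bottom":
        # Tighten the scanned box and switch to a static brightness cutoff.
        region["y"] = 380
        region["h"] = 96
        region["margin"] = 8
        region["threshold_mode"] = "static"
        region["threshold_value"] = 200

config_path.write_text(json.dumps(regions, indent=2))
```

After each change, re-run the `--debug-frame` command above and check the images written to the temp folder's debug directory before committing to a full `ocr_release.sh` run.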
663 | 664 | # License 665 | 666 | AGPL -------------------------------------------------------------------------------- /cowocr.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import dataclasses 3 | import json 4 | import math 5 | import re 6 | import shlex 7 | import threading 8 | import time 9 | import traceback 10 | from collections import namedtuple 11 | from pathlib import Path 12 | 13 | import click 14 | import cv2 15 | import jinja2 16 | import lxml.html 17 | import matplotlib 18 | import matplotlib.pyplot as plt 19 | import numpy as np 20 | import pysubs2 21 | import pytesseract 22 | from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance 23 | from scipy.spatial.distance import cdist, cosine 24 | from skimage.metrics import structural_similarity 25 | from textblob import TextBlob 26 | from tqdm import tqdm, trange 27 | 28 | BASE_ASS = r"""[Script Info] 29 | ; Script generated by Aegisub 3.2.2 30 | ; http://www.aegisub.org/ 31 | Title: CowOCR 32 | ScriptType: v4.00+ 33 | WrapStyle: 0 34 | ScaledBorderAndShadow: yes 35 | PlayResX: 640 36 | PlayResY: 480 37 | 38 | [V4+ Styles] 39 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 40 | Style: Default,Open Sans Semibold,26.0,&H00FFFFFF,&H000000FF,&H00020713,&H00000000,-1,0,0,0,100.0,100.0,0.0,0.0,1,1.7,0.0,2,0,0,28,1 41 | Style: Sign,Open Sans Semibold,26.0,&H00FFFFFF,&H000000FF,&H00020713,&H00000000,-1,0,0,0,100.0,100.0,0.0,0.0,1,1.7,0.0,2,0,0,28,1 42 | Style: Note,Open Sans Semibold,20.0,&H00FFFFFF,&H000000FF,&H00020713,&H00000000,-1,0,0,0,100.0,100.0,0.0,0.0,1,1.7,0.0,8,0,0,28,1 43 | 44 | [Events] 45 | Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 46 | """ 47 | 48 | HTML_BASE = r""" 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | {{ title }} 59 | 60 | 61 | 77 | %(body)s 78 | 79 | 80 | """ 81 | 82 | HTML_INDEX = HTML_BASE % { 83 | "body": r""" 84 |

OCR Report for {{ video_filename }}

85 | {% for sr in subtitle_regions %} 86 |
87 |

Subtitle Region: {{sr.name}}

88 | {% if sr.scan_mode == 'bottom_center' %} 89 | Subtitle lines ({{ subtitle_region_data[sr.name]['subtitle_lines']|length }}) 90 | Missing regions ({{ subtitle_region_data[sr.name]['missing_regions']|length }}) 91 | {% endif %} 92 | {% if sr.scan_mode == 'search_slice' %} 93 | Subtitle signs ({{ subtitle_region_data[sr.name]['subtitle_signs']|length }}) 94 | Missing regions ({{ subtitle_region_data[sr.name]['missing_regions']|length }}) 95 | {% endif %} 96 |
97 | {% endfor %} 98 | """ 99 | } 100 | 101 | HTML_SUBTITLE_LINES = HTML_BASE % { 102 | "body": r""" 103 |

Subtitle lines for: {{ sr.name }}

104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | {% for subtitle_line in subtitle_lines %} 115 | 116 | 117 | 121 | 124 | 130 | 131 | {% endfor %} 132 | 133 |
#TimestampsTextFrames
{{loop.index}} 118 | {{ subtitle_line['from_frame_no']|totimestamp }}
{{ subtitle_line['to_frame_no']|totimestamp }}
119 | {{ subtitle_line['from_frame_no'] }}
{{ subtitle_line['to_frame_no'] }}
{{ subtitle_line['initial_frame_no'] }} 120 |
122 | {% autoescape false %}{{ subtitle_line['subtitle_text'] | replace('\n', '
') }}{% endautoescape %}
123 |
125 |
126 |
127 |
128 | 129 |
134 | """ 135 | } 136 | 137 | HTML_MISSING_REGIONS = HTML_BASE % { 138 | "body": r""" 139 |

Missed regions for: {{ sr.name }}

140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | {% for missing_region in missing_regions %} 150 | 151 | 152 | 153 | 154 | 155 | {% endfor %} 156 | 157 |
#TimestampsFrame
{{loop.index}}{{ missing_region['frame_no']|totimestamp }}
{{ missing_region['frame_no'] }}
158 | 159 |

Short subtitle signs

160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | {% for subtitle_sign in short_subtitle_signs %} 170 | 171 | 172 | 176 | 177 | 178 | {% endfor %} 179 | 180 |
#TimestampsFrame
{{loop.index}} 173 | {{ subtitle_sign['from_frame_no']|totimestamp }}
{{ subtitle_sign['to_frame_no']|totimestamp }}
174 | {{ subtitle_sign['from_frame_no'] }}
{{ subtitle_sign['to_frame_no'] }}
{{ subtitle_sign['initial_frame_no'] }} 175 |
181 | """ 182 | } 183 | 184 | HTML_SUBTITLE_SIGNS = HTML_BASE % { 185 | "body": r""" 186 |

Subtitle signs for: {{ sr.name }}

187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | {% for subtitle_sign in subtitle_signs %} 197 | 198 | 199 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | {% endfor %} 210 | 211 |
#TimestampsFrame
{{loop.index}} 200 | {{ subtitle_sign['from_frame_no']|totimestamp }}
{{ subtitle_sign['to_frame_no']|totimestamp }}
201 | {{ subtitle_sign['from_frame_no'] }}
{{ subtitle_sign['to_frame_no'] }}
{{ subtitle_sign['initial_frame_no'] }} 202 |
{% autoescape false %}{{ subtitle_sign['subtitle_text'] | replace('\n', '
') }}{% endautoescape %}
212 | """ 213 | } 214 | 215 | 216 | @dataclasses.dataclass 217 | class SubtitleRegion: 218 | class ScanMode: 219 | BOTTOM_CENTER = "bottom_center" 220 | SEARCH_SLICE = "search_slice" 221 | 222 | class InvertMode: 223 | NO_INVERT = "no_invert" 224 | INVERT_ONLY = "invert_only" 225 | BOTH_INVERT = "both_invert" 226 | 227 | name: str 228 | scan_mode: ScanMode 229 | y: int 230 | h: int 231 | x: int 232 | w: int 233 | margin: int 234 | area_min: int 235 | area_max: int 236 | area_min_density: float 237 | max_w: int 238 | max_h: int 239 | min_stroke_width: int 240 | max_stroke_width: int 241 | border_size: int 242 | max_text_diff: int 243 | max_border_diff: int 244 | percent_good_border: float 245 | edge_threshold: int 246 | threshold_mode: str 247 | threshold_value: int 248 | ass_style_name: str 249 | invert_mode: InvertMode 250 | 251 | 252 | default_subtitle_regions = [ 253 | SubtitleRegion( 254 | name="bottom", 255 | scan_mode=SubtitleRegion.ScanMode.BOTTOM_CENTER, 256 | y=376, 257 | h=100, 258 | x=0, 259 | w=640, 260 | margin=4, 261 | area_min=6, 262 | area_max=1000, 263 | area_min_density=0.2, 264 | max_w=160, 265 | max_h=40, 266 | min_stroke_width=2, 267 | max_stroke_width=7, 268 | border_size=2, 269 | max_text_diff=60, 270 | max_border_diff=60, 271 | percent_good_border=25, 272 | edge_threshold=8, 273 | threshold_mode="static", 274 | threshold_value=200, 275 | ass_style_name="Default", 276 | invert_mode=SubtitleRegion.InvertMode.NO_INVERT, 277 | ), 278 | SubtitleRegion( 279 | name="region-scan", 280 | scan_mode=SubtitleRegion.ScanMode.SEARCH_SLICE, 281 | y=0, 282 | h=380, 283 | x=0, 284 | w=640, 285 | margin=0, 286 | area_min=6, 287 | area_max=1000, 288 | area_min_density=0.2, 289 | max_w=160, 290 | max_h=50, 291 | min_stroke_width=2, 292 | max_stroke_width=7, 293 | border_size=2, 294 | max_text_diff=60, 295 | max_border_diff=60, 296 | percent_good_border=25, 297 | edge_threshold=3, 298 | threshold_mode="static", 299 | threshold_value=200, 300 | ass_style_name="Sign", 301 | invert_mode=SubtitleRegion.InvertMode.NO_INVERT, 302 | ), 303 | ] 304 | 305 | 306 | def crop(image): 307 | y_nonzero, x_nonzero, _ = np.nonzero(image) 308 | if len(y_nonzero) == 0: 309 | return None 310 | crop_space = 4 311 | y, x, = max( 312 | np.min(y_nonzero) - crop_space, 0 313 | ), max(np.min(x_nonzero) - crop_space, 0) 314 | h, w = np.max(y_nonzero) + crop_space - y, np.max(x_nonzero) + crop_space - x 315 | return image[y : y + h, x : x + w], (y, h, x, w) 316 | 317 | 318 | def ocr_region(frame_region, tesseract_data_path): 319 | custom_oem_psm_config = f"--psm 6 --tessdata-dir {shlex.quote(tesseract_data_path)}" # TODO, get datapath 320 | return pytesseract.image_to_pdf_or_hocr( 321 | frame_region, extension="hocr", config=custom_oem_psm_config, lang="eng" 322 | ) 323 | 324 | 325 | def ocr_region_from_mask( 326 | frame, mask, tesseract_data_path, gather_images=None, inverted=False 327 | ): 328 | mask = cv2.dilate( 329 | mask, cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)), iterations=1 330 | ) 331 | choppable_frame = frame & cv2.cvtColor(mask, cv2.COLOR_GRAY2RGB) 332 | if inverted: 333 | choppable_frame[mask == 0] = [255, 255, 255] 334 | choppable_frame = cv2.bitwise_not(choppable_frame) 335 | 336 | crop_result = crop(choppable_frame) 337 | if crop_result is None: 338 | return None, None, None 339 | image, crop_region = crop_result 340 | image = cv2.bitwise_not(image) 341 | image = cv2.resize(image, (image.shape[1] * 2, image.shape[0] * 2)) 342 | image = cv2.blur(image, (2, 2)) 343 | if gather_images is not 
None: 344 | gather_images.append(("Image for OCR", image)) 345 | return image, crop_region, ocr_region(image, tesseract_data_path) 346 | 347 | 348 | def frame_has_mask(src_frame, dst_frame, mask, min_percent=0.1, max_offset=12): 349 | orig_mask = mask.copy() 350 | orig_mask = cv2.cvtColor(orig_mask, cv2.COLOR_GRAY2BGR) 351 | mask = mask.astype("bool") 352 | c = np.abs(src_frame[mask].flatten() - dst_frame[mask].flatten()).astype("uint8") 353 | c += max_offset 354 | return len(c[c > (max_offset * 2)]) < int(min_percent * len(c)) 355 | 356 | 357 | def find_frames_with_mask( 358 | video, 359 | initial_frame_no, 360 | src_frame, 361 | mask, 362 | shape, 363 | region, 364 | max_frames_backward=18, 365 | max_frames_forward=600, 366 | max_frames=None, 367 | ): 368 | frames_with_mask = [] 369 | for r in [range(0, -max_frames_backward - 1, -1), range(1, max_frames_forward)]: 370 | for i in r: 371 | frame_no = initial_frame_no + i 372 | if frame_no >= max_frames: 373 | break 374 | if frame_no > 0: 375 | dst_frame = get_frame(video, frame_no, shape, region=region) 376 | else: 377 | dst_frame = None 378 | if dst_frame is None or not frame_has_mask(src_frame, dst_frame, mask): 379 | break 380 | frames_with_mask.append(frame_no) 381 | if not frames_with_mask: 382 | return None, None 383 | 384 | return min(frames_with_mask), max(frames_with_mask) 385 | 386 | 387 | def remove_smaller_cc(mask): 388 | dilated_mask = cv2.dilate( 389 | mask.copy(), cv2.getStructuringElement(cv2.MORPH_RECT, (3, 5)), iterations=6 390 | ) 391 | numLabels, labels, stats, centroids = cv2.connectedComponentsWithStats( 392 | dilated_mask, connectivity=4 393 | ) 394 | sizes = stats[:, -1] 395 | i = np.argmax(sizes[1:]) + 1 396 | labels[labels != i] = 0 397 | labels[labels == i] = 255 398 | return mask & labels.astype("uint8"), sizes[i] 399 | 400 | 401 | def estimate_line_width(labels): 402 | labels = labels.copy() 403 | labels = np.repeat(labels, 2, axis=0) 404 | labels = np.repeat(labels, 2, axis=1) 405 | mask = labels.copy() 406 | mask[mask > 0] = 1 407 | mask = mask.astype("uint8") 408 | label_line_width = {} 409 | i = 0 410 | current_labels = set(np.unique(labels)) 411 | 412 | label_count = {} 413 | for label, count in zip(*np.unique(labels, return_counts=True)): 414 | label_count[label] = [count] 415 | while True: 416 | mask = cv2.erode( 417 | mask, cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)), iterations=1 418 | ) 419 | i += 1 420 | new_labels = set(np.unique(labels * mask)) 421 | label_line_width[i] = current_labels - new_labels 422 | if len(new_labels) <= 1: 423 | break 424 | current_labels = new_labels 425 | for label, count in zip(*np.unique(labels * mask, return_counts=True)): 426 | label_count[label].append(count) 427 | return label_line_width 428 | 429 | 430 | def find_potential_text_block_areas(orig_labels, iterations=7): 431 | mask = orig_labels.copy() 432 | mask[mask > 0] = 255 433 | mask = mask.astype("uint8") 434 | mask = cv2.dilate( 435 | mask, cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)), iterations=iterations 436 | ) 437 | numLabels, labels, stats, centroids = cv2.connectedComponentsWithStats( 438 | mask, connectivity=4 439 | ) 440 | shape = labels.shape 441 | mid = shape[1] // 2 442 | for i, (x, y, w, h, area) in enumerate(stats): 443 | if x < mid and x + w > mid and h >= 20: 444 | labels[labels == i] = 0 445 | region_space = 16 446 | source_region = np.zeros(mask.shape, dtype="bool") 447 | source_region[0 : mask.shape[0], mid - region_space : mid + region_space] = True 448 | return 
np.unique(orig_labels[labels.astype("bool")]), source_region 449 | 450 | 451 | def do_threshold(sr, frame): 452 | if sr.threshold_mode == "adaptive": 453 | thresh = cv2.adaptiveThreshold( 454 | cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY), 455 | 255, 456 | cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 457 | cv2.THRESH_BINARY, 458 | sr.threshold_value, 459 | 2, 460 | ) 461 | elif sr.threshold_mode == "static": 462 | thresh = cv2.threshold( 463 | cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY), 464 | sr.threshold_value, 465 | 255, 466 | cv2.THRESH_BINARY, 467 | )[1] 468 | return thresh 469 | 470 | 471 | def extract_text_subregion( 472 | frame, sr, tesseract_data_path, gather_images=None, inverted=False 473 | ): 474 | thresh = do_threshold(sr, frame) 475 | if inverted: 476 | thresh = cv2.bitwise_not(thresh) 477 | numLabels, labels = cv2.connectedComponents(thresh, connectivity=4) 478 | for label, count in zip(*np.unique(labels, return_counts=True)): 479 | if count <= sr.area_max: 480 | labels[labels == label] = 0 481 | labels[labels > 0] = 255 482 | thresh = thresh - labels.astype("uint8") 483 | 484 | numLabels, labels, stats, centroids = cv2.connectedComponentsWithStats( 485 | thresh, connectivity=4 486 | ) 487 | subtitle_region_labels = labels.copy() 488 | if gather_images is not None: 489 | gather_images.append(("Thresh", thresh)) 490 | 491 | label_stats = {} 492 | for i, (x, y, w, h, area) in enumerate(stats): 493 | if i == 0: 494 | continue 495 | if ( 496 | sr.max_h >= h 497 | and sr.max_w >= w 498 | and sr.area_min <= area 499 | and sr.area_max >= area 500 | and sr.margin <= y 501 | and frame.shape[0] - sr.margin >= y + h # TODO: margin remove 502 | and sr.margin <= x 503 | and frame.shape[1] - sr.margin >= x + w 504 | and sr.area_min_density < (area / (h * w)) 505 | ): 506 | label_stats[i] = (x, y, w, h, area) 507 | else: 508 | labels[labels == i] = 0 509 | 510 | dust_labels, starting_area = find_potential_text_block_areas(labels) 511 | for i in dust_labels: 512 | if i == 0: 513 | continue 514 | labels[labels == i] = 0 515 | del label_stats[i] 516 | 517 | estimated_line_width = estimate_line_width(labels) 518 | correct_stroke_width = [ 519 | vv 520 | for v in [ 521 | v 522 | for (k, v) in estimated_line_width.items() 523 | if k <= sr.max_stroke_width and k >= sr.min_stroke_width 524 | ] 525 | for vv in v 526 | ] 527 | labels[np.isin(labels, correct_stroke_width, invert=True)] = 0 528 | 529 | if gather_images is not None: 530 | gather_images.append(("final mask", labels)) 531 | 532 | mask = labels.astype("bool") 533 | 534 | pixels = frame[mask] 535 | if len(pixels) == 1: 536 | return None, None 537 | pixels = np.float32(pixels) 538 | n_colors = min(20, len(pixels)) 539 | if n_colors == 0: 540 | return None, None 541 | criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200, 0.1) 542 | flags = cv2.KMEANS_PP_CENTERS 543 | _, color_labels, palette = cv2.kmeans(pixels, n_colors, None, criteria, 10, flags) 544 | _, counts = np.unique(color_labels, return_counts=True) 545 | 546 | starting_area_labels = np.full(labels.shape, 255, dtype="uint8") 547 | starting_area_labels[mask] = color_labels.flatten() 548 | 549 | starting_labels, starting_labels_counts = np.unique( 550 | starting_area_labels[starting_area], return_counts=True 551 | ) 552 | starting_labels_label_enum = sorted( 553 | [(l, c) for (l, c) in zip(starting_labels, starting_labels_counts) if l != 255], 554 | key=lambda x: x[1], 555 | reverse=True, 556 | ) 557 | starting_labels = [x[0] for x in starting_labels_label_enum] 558 | 559 | consumed_labels 
= set() 560 | label_enum = sorted(enumerate(counts), key=lambda l: l[1], reverse=True) 561 | region_candidates = [] 562 | for label, _ in label_enum: 563 | if label not in starting_labels: 564 | continue 565 | if label in consumed_labels: 566 | continue 567 | dominant_labels = [label] 568 | dominant = palette[label] 569 | for i, color in enumerate(palette): 570 | if i == label: 571 | continue 572 | if np.linalg.norm(dominant - color) < sr.max_text_diff: 573 | dominant_labels.append(i) 574 | 575 | consumed_labels |= set(dominant_labels) 576 | labels_inner = labels.copy() 577 | dominant_color_labels = color_labels.copy() 578 | dominant_color_labels[ 579 | np.isin(dominant_color_labels, list(dominant_labels), invert=True) 580 | ] = 0 581 | dominant_color_labels[dominant_color_labels >= 1] = 1 582 | 583 | labels_subset = np.unique( 584 | labels[mask][dominant_color_labels.astype("bool").flatten()] 585 | ) 586 | labels_subset = labels_subset[labels_subset != 0] 587 | 588 | labels_inner[np.isin(labels_inner, labels_subset, invert=True)] = 0 589 | labels_inner[labels_inner > 0] = 255 590 | labels_inner = labels_inner.astype("uint8") 591 | 592 | labels_border = ( 593 | cv2.dilate( 594 | labels_inner, 595 | cv2.getStructuringElement( 596 | cv2.MORPH_RECT, (sr.border_size * 2 + 1, sr.border_size * 2 + 1) 597 | ), 598 | iterations=1, 599 | ) 600 | - labels_inner 601 | ) 602 | border_pixels = frame[labels_border.astype("bool")] 603 | border_pixels = np.float32(border_pixels) 604 | border_n_colors = min(20, len(border_pixels)) 605 | 606 | if border_n_colors == 0: 607 | continue 608 | 609 | border_criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200, 0.1) 610 | border_flags = cv2.KMEANS_PP_CENTERS 611 | _, border_color_labels, border_palette = cv2.kmeans( 612 | border_pixels, border_n_colors, None, border_criteria, 10, border_flags 613 | ) 614 | _, border_counts = np.unique(border_color_labels, return_counts=True) 615 | 616 | starting_area_border_labels = np.full(labels.shape, 255, dtype="uint8") 617 | starting_area_border_labels[ 618 | labels_border.astype("bool") 619 | ] = border_color_labels.flatten() 620 | 621 | consumed_border_labels = set() 622 | border_label_enum = sorted( 623 | enumerate(border_counts), key=lambda l: l[1], reverse=True 624 | ) 625 | for label, _ in border_label_enum: 626 | if label in consumed_border_labels: 627 | continue 628 | consumed_border_labels.add(label) 629 | dominant_labels_border = [label] 630 | dominant = border_palette[label] 631 | for i, color in enumerate(border_palette): 632 | if i in consumed_border_labels: 633 | continue 634 | if i == label: 635 | continue 636 | if np.linalg.norm(dominant - color) < sr.max_border_diff: 637 | # print(f"Merging {i=} {dominant=} and {color=}") 638 | dominant_labels_border.append(i) 639 | consumed_border_labels.add(i) 640 | break 641 | 642 | frame_border = np.zeros(frame.shape[:2], dtype="uint8") 643 | frame_border[labels_border.astype("bool")] = border_color_labels.flatten() + 1 644 | 645 | border_labels_subset = set(labels_subset) 646 | for i, (x, y, w, h, area) in enumerate(stats): 647 | if i not in border_labels_subset: 648 | continue 649 | mask_label = labels.copy() 650 | mask_label[mask_label != i] = 0 651 | mask_label[mask_label == i] = 255 652 | mask_label = mask_label.astype("uint8") 653 | mask_label_border = ( 654 | cv2.dilate( 655 | mask_label, 656 | cv2.getStructuringElement( 657 | cv2.MORPH_RECT, (sr.border_size * 2 + 1, sr.border_size * 2 + 1) 658 | ), 659 | iterations=1, 660 | ) 661 | - mask_label 662 
| ) 663 | border_frame = frame_border & mask_label_border 664 | border_labels, border_counts = np.unique(border_frame, return_counts=True) 665 | border_labels = border_labels[1:] - 1 666 | border_counts = border_counts[1:] 667 | good, bad = 0, 0 668 | for border_label, border_count in zip(border_labels, border_counts): 669 | if border_label in dominant_labels_border: 670 | good += border_count 671 | else: 672 | bad += border_count 673 | if round((good / (good + bad)) * 100) < sr.percent_good_border: 674 | border_labels_subset.remove(i) 675 | 676 | border_labels_subset = np.array(list(border_labels_subset)) 677 | result_mask = labels.copy() 678 | result_mask[np.isin(result_mask, border_labels_subset, invert=True)] = 0 679 | result_mask[result_mask > 0] = 255 680 | result_mask = result_mask.astype("uint8") 681 | # result_mask = cv2.dilate(result_mask, cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)), iterations=1) 682 | if gather_images is not None: 683 | gather_images.append(("final region label", result_mask)) 684 | image, crop_region, hocr = ocr_region_from_mask( 685 | frame, 686 | result_mask, 687 | tesseract_data_path, 688 | gather_images=gather_images, 689 | inverted=inverted, 690 | ) 691 | if hocr is not None and hocr_to_text(hocr): 692 | if gather_images is not None: 693 | print("Found HOCR") 694 | print(hocr.decode()) 695 | region_candidates.append((result_mask, hocr)) 696 | return region_candidates 697 | 698 | 699 | def find_diff_regions( 700 | src_frame, 701 | dst_frame, 702 | gather_images=None, 703 | nfeatures=2000, 704 | min_kp_region=100, 705 | edge_threshold=14, 706 | ): 707 | orb = cv2.ORB_create( 708 | edgeThreshold=edge_threshold, patchSize=edge_threshold, nfeatures=nfeatures 709 | ) 710 | src_kp = orb.detect(src_frame, None) 711 | if gather_images is not None: 712 | gather_images.append( 713 | ( 714 | "Source keypoints", 715 | cv2.drawKeypoints(src_frame, src_kp, None, color=(0, 255, 0), flags=0), 716 | ) 717 | ) 718 | 719 | orb = cv2.ORB_create( 720 | edgeThreshold=edge_threshold, patchSize=edge_threshold, nfeatures=50000 721 | ) 722 | dst_kp = orb.detect(dst_frame, None) 723 | if gather_images is not None: 724 | gather_images.append( 725 | ( 726 | "Destination keypoints", 727 | cv2.drawKeypoints(dst_frame, dst_kp, None, color=(0, 255, 0), flags=0), 728 | ) 729 | ) 730 | 731 | kp_mask = np.zeros(src_frame.shape[:2], dtype="uint8") 732 | 733 | src_kp_vector = np.array([k.pt + (k.angle / 5,) for k in src_kp]) 734 | dst_kp_vector = np.array([k.pt + (k.angle / 5,) for k in dst_kp]) 735 | if len(src_kp_vector) == 0: 736 | return kp_mask 737 | 738 | if len(dst_kp_vector) == 0: 739 | good_kps = list(src_kp) 740 | else: 741 | C = cdist(dst_kp_vector, src_kp_vector, metric="euclidean") 742 | good_kps = [] 743 | for k, cost in zip(src_kp, np.min(C, axis=0)): 744 | if cost > 6.0: 745 | good_kps.append(k) 746 | 747 | if gather_images is not None: 748 | kp_diff_src = cv2.drawKeypoints( 749 | src_frame, good_kps, None, color=(0, 255, 0), flags=0 750 | ) 751 | gather_images.append(("KP Diff Source", kp_diff_src)) 752 | 753 | for kp in good_kps: 754 | x, y = kp.pt 755 | kp_mask[int(y), int(x)] = 255 756 | 757 | mask = cv2.dilate( 758 | kp_mask, cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3)), iterations=6 759 | ) 760 | numLabels, labels = cv2.connectedComponents(mask, connectivity=8) 761 | for i in range(1, numLabels): 762 | label_mask = labels.copy() 763 | label_mask[label_mask != i] = 0 764 | label_mask = kp_mask & label_mask.astype(bool) 765 | if len(label_mask[label_mask > 0]) 
< min_kp_region: 766 | labels[labels == i] = 0 767 | 768 | labels[labels > 0] = 255 769 | mask = labels.astype("uint8") 770 | if gather_images is not None: 771 | gather_images.append(("Mask", mask)) 772 | 773 | return mask 774 | 775 | 776 | def evaluate_if_inverted(sr, frame, mask): 777 | thresh = do_threshold(sr, frame) 778 | 779 | numLabels, labels = cv2.connectedComponents(thresh & mask, connectivity=4) 780 | numLabelsInverted, labelsInverted = cv2.connectedComponents( 781 | cv2.bitwise_not(thresh) & mask, connectivity=4 782 | ) 783 | 784 | return numLabels < numLabelsInverted 785 | 786 | 787 | def get_frame(video, frame_no, shape=None, region=None, skip_align=False): 788 | if frame_no < 0: 789 | raise Exception(f"Trying to get {frame_no}") 790 | 791 | video.set(cv2.CAP_PROP_POS_FRAMES, frame_no) 792 | ret, frame = video.read() 793 | 794 | if not ret: 795 | raise Exception(f"Failed to get frame {frame_no}") 796 | 797 | if skip_align: 798 | return frame 799 | 800 | if shape is not None: 801 | sy, sx, sz = shape 802 | fy, fx, fz = frame.shape 803 | frame_aspect = fx / fy 804 | 805 | if frame_aspect > sx / sy: 806 | new_fx = frame_aspect * sy 807 | slice_each_x = int((fx - new_fx) / 2) 808 | frame = frame[0:fy, slice_each_x : (fx - slice_each_x)] 809 | elif frame_aspect < sx / sy: 810 | new_fx = (sx / sy) * fy 811 | append_each_x = int((new_fx - fx) / 2) 812 | if append_each_x > 0: 813 | frame = cv2.copyMakeBorder( 814 | frame, 815 | 0, 816 | 0, 817 | append_each_x, 818 | append_each_x, 819 | cv2.BORDER_CONSTANT, 820 | None, 821 | (0, 0, 0), 822 | ) 823 | 824 | if frame.shape > shape: 825 | frame = cv2.resize(frame, (sx, sy)) 826 | 827 | if region is not None: 828 | frame = frame[region[0] : region[1], region[2] : region[3]] 829 | 830 | return frame 831 | 832 | 833 | def hocr_to_text(hocr): 834 | def parse_hocr_title(title): 835 | result = {} 836 | for entry in title.split(";"): 837 | if not entry.strip(): 838 | continue 839 | entry_key, entry_value = entry.strip().split(" ", 1) 840 | if entry_key in ["bbox", "x_ascenders", "x_wconf"]: 841 | result[entry_key] = [ 842 | int(float(x)) for x in entry_value.strip().split(" ") 843 | ] 844 | elif entry_key in ["x_size", "x_descenders", "baseline"]: 845 | result[entry_key] = [float(x) for x in entry_value.strip().split(" ")] 846 | return result 847 | 848 | tree = lxml.html.fromstring(hocr) 849 | lines, score, line_consumptions = [], [], [] 850 | for line in tree.xpath(r"//span[@class='ocr_line']"): 851 | line_specs = parse_hocr_title(line.attrib["title"]) 852 | line_consumption = [] 853 | line_text = [] 854 | for word in line.xpath(r"./span[@class='ocrx_word']"): 855 | specs = parse_hocr_title(word.attrib["title"]) 856 | score.append(specs["x_wconf"]) 857 | line_consumption.append(specs["bbox"][2] - specs["bbox"][0]) 858 | line_text.append(word.text) 859 | lines.append(" ".join(line_text)) 860 | line_consumptions.append( 861 | min(sum(line_consumption), line_specs["bbox"][2] - line_specs["bbox"][0]) 862 | ) 863 | if not lines: 864 | return 0, "", [] 865 | return np.average(np.array(score)), "\n".join(lines), line_consumptions 866 | 867 | 868 | def get_subtitle_mask( 869 | frame, sr, tesseract_data_path, gather_images=None, inverted=False 870 | ): 871 | subregion_candidates = extract_text_subregion( 872 | frame, sr, tesseract_data_path, gather_images=gather_images, inverted=inverted 873 | ) 874 | if subregion_candidates and subregion_candidates[0] is not None: 875 | candidate_scores = [] 876 | for result_mask, hocr in subregion_candidates: 877 | 
score, text, line_consumptions = hocr_to_text(hocr) 878 | if not text or not line_consumptions: 879 | continue 880 | percent_filled = sorted(line_consumptions, reverse=True)[0] / frame.shape[1] 881 | candidate_scores.append( 882 | (score * percent_filled, score, percent_filled, result_mask, text) 883 | ) 884 | if score > 65 and percent_filled > 0.8: 885 | return result_mask, text 886 | if candidate_scores: 887 | return sorted(candidate_scores, reverse=True, key=lambda x: x[0])[0][3:] 888 | return None, None 889 | 890 | 891 | def slice_mask_regions( 892 | frame, 893 | mask, 894 | sr, 895 | orig_region, 896 | tesseract_data_path, 897 | gather_images=None, 898 | inverted=False, 899 | ): 900 | subtitles = [] 901 | numLabels, labels, stats, centroids = cv2.connectedComponentsWithStats( 902 | mask, connectivity=4 903 | ) 904 | for i, (x, y, w, h, area) in enumerate(stats): 905 | if i == 0: 906 | continue 907 | region = (max(y - 4, 0), y + h + 4, max(x - 4, 4), x + w + 4) 908 | sliced_frame = frame[region[0] : region[1], region[2] : region[3]].copy() 909 | subtitle_mask, subtitle_text = get_subtitle_mask( 910 | sliced_frame, 911 | sr, 912 | tesseract_data_path, 913 | gather_images=gather_images, 914 | inverted=inverted, 915 | ) 916 | if subtitle_text: 917 | subtitle_region = ( 918 | region[0] + orig_region[0], 919 | region[0] + sliced_frame.shape[0] + orig_region[0], 920 | region[2] + orig_region[2], 921 | region[2] + sliced_frame.shape[1] + orig_region[2], 922 | ) 923 | subtitles.append( 924 | (subtitle_mask, subtitle_text, sliced_frame, subtitle_region) 925 | ) 926 | else: 927 | subtitles.append((subtitle_mask, None, sliced_frame, None)) 928 | return subtitles 929 | 930 | 931 | def loop_frames( 932 | src_video, 933 | dst_video, 934 | start_frame_no, 935 | end_frame_no, 936 | frame_diff, 937 | sr, 938 | tesseract_data_path, 939 | frame_steps=9, 940 | fix_broken_frame_alignment=False, 941 | edge_threshold=14, 942 | gather_images=None, 943 | max_frames=None, 944 | ): 945 | src_video.set(cv2.CAP_PROP_POS_FRAMES, 0) 946 | _, src_frame = src_video.read() 947 | 948 | shape = src_frame.shape 949 | region = (sr.y, sr.y + sr.h, sr.x, sr.x + sr.w) 950 | min_frame_no = 0 951 | 952 | if frame_diff < 0: 953 | start_frame_no = max(start_frame_no - frame_diff, 0) 954 | 955 | for frame_no in tqdm( 956 | range(start_frame_no, end_frame_no, frame_steps), 957 | desc=f"Region: {sr.name} Framerange: {start_frame_no}-{end_frame_no}", 958 | ): 959 | if frame_no < min_frame_no: 960 | continue 961 | if fix_broken_frame_alignment: 962 | last_frame_no = frame_no + 2 963 | if max_frames is not None: 964 | last_frame_no = min(last_frame_no, max_frames) 965 | frame_no_range = range(max(frame_no - 1, 0, -frame_diff), last_frame_no) 966 | else: 967 | frame_no_range = range(frame_no, frame_no + 1) 968 | 969 | frames = [] 970 | 971 | for frame_no_actual in frame_no_range: 972 | src_frame = get_frame(src_video, frame_no_actual, shape, region=region) 973 | dst_frame = get_frame( 974 | dst_video, frame_no_actual + frame_diff, shape, region=region 975 | ) 976 | 977 | frames.append( 978 | ( 979 | src_frame, 980 | dst_frame, 981 | frame_no_actual, 982 | structural_similarity(src_frame, dst_frame, multichannel=True), 983 | ) 984 | ) 985 | 986 | src_frame, dst_frame, frame_no = sorted( 987 | frames, key=lambda x: x[3], reverse=True 988 | )[0][:3] 989 | if gather_images is not None: 990 | gather_images.append((f"Source frame {frame_no}", src_frame)) 991 | gather_images.append((f"Destination frame {frame_no}", dst_frame)) 992 | 993 | mask 
= find_diff_regions( 994 | src_frame, 995 | dst_frame, 996 | gather_images=gather_images, 997 | edge_threshold=edge_threshold, 998 | ) 999 | values, counts = np.unique(mask, return_counts=True) 1000 | 1001 | if len(counts) > 1 and counts[1] > 150: 1002 | inverted = evaluate_if_inverted(sr, src_frame, mask) 1003 | found_to_frames = [] 1004 | subtitles = [] 1005 | if sr.scan_mode == SubtitleRegion.ScanMode.SEARCH_SLICE: 1006 | subtitles += slice_mask_regions( 1007 | src_frame, 1008 | mask, 1009 | sr, 1010 | region, 1011 | tesseract_data_path, 1012 | gather_images=gather_images, 1013 | inverted=inverted, 1014 | ) 1015 | else: 1016 | subtitles.append( 1017 | get_subtitle_mask( 1018 | src_frame, 1019 | sr, 1020 | tesseract_data_path, 1021 | gather_images=gather_images, 1022 | inverted=inverted, 1023 | ) 1024 | + ( 1025 | src_frame, 1026 | region, 1027 | ) 1028 | ) 1029 | 1030 | for ( 1031 | subtitle_mask, 1032 | subtitle_text, 1033 | subtitle_src_frame, 1034 | subtitle_region, 1035 | ) in subtitles: 1036 | if gather_images is not None: 1037 | gather_images.append( 1038 | (f"Frame: {frame_no} - text: {subtitle_text}", subtitle_mask) 1039 | ) 1040 | 1041 | if subtitle_text: 1042 | from_frame_no, to_frame_no = find_frames_with_mask( 1043 | src_video, 1044 | frame_no, 1045 | subtitle_src_frame, 1046 | subtitle_mask, 1047 | shape, 1048 | subtitle_region, 1049 | max_frames=max_frames, 1050 | ) 1051 | if from_frame_no is None: 1052 | continue 1053 | found_to_frames.append(to_frame_no) 1054 | if sr.scan_mode == SubtitleRegion.ScanMode.SEARCH_SLICE: 1055 | yield { 1056 | "type": "subtitle_sign", 1057 | "from_frame_no": from_frame_no, 1058 | "to_frame_no": to_frame_no, 1059 | "initial_frame_no": frame_no, 1060 | "subtitle_text": subtitle_text, 1061 | "region": sr.name, 1062 | "position": [int(p) for p in subtitle_region], 1063 | } 1064 | else: 1065 | yield { 1066 | "type": "subtitle", 1067 | "from_frame_no": from_frame_no, 1068 | "to_frame_no": to_frame_no, 1069 | "initial_frame_no": frame_no, 1070 | "subtitle_text": subtitle_text, 1071 | "region": sr.name, 1072 | } 1073 | else: 1074 | yield { 1075 | "type": "missed_region", 1076 | "frame_no": frame_no, 1077 | "region": sr.name, 1078 | } 1079 | 1080 | if found_to_frames: 1081 | min_frame_no = min(found_to_frames) 1082 | 1083 | 1084 | def frame_generator(start_i): 1085 | for i in range(1, 300): 1086 | if i >= start_i: 1087 | continue 1088 | yield start_i + i 1089 | yield start_i - i 1090 | 1091 | 1092 | def find_good_frame_breakpoint(video, current_frame): # TODO: do binary search instead? 
1093 | compare_frame_size = (32, 32) 1094 | frame_cache = {} 1095 | 1096 | def get_frame(frame_no): 1097 | if frame_no not in frame_cache: 1098 | video.set(cv2.CAP_PROP_POS_FRAMES, frame_no) 1099 | frame_cache[frame_no] = cv2.cvtColor( 1100 | cv2.resize(video.read()[1], compare_frame_size), cv2.COLOR_BGR2GRAY 1101 | ) 1102 | return frame_cache[frame_no] 1103 | 1104 | best_score = 1.0 1105 | best_frame = current_frame 1106 | for frame_no in frame_generator(current_frame): 1107 | score = structural_similarity(get_frame(frame_no), get_frame(frame_no + 1)) 1108 | if score < best_score: 1109 | best_score = score 1110 | best_frame = frame_no 1111 | if score < 0.65: 1112 | return frame_no - current_frame 1113 | return best_frame - current_frame 1114 | 1115 | 1116 | def estimate_video_frame_diff( 1117 | source_video, target_video, current_source_frame, current_target_frame 1118 | ): 1119 | frame_index_size = (64, 64) 1120 | compare_frame_count = 5 1121 | spread_frame_count = 14 1122 | ret, source_frame = source_video.read() 1123 | ret, target_frame = target_video.read() 1124 | 1125 | sy, sx, sz = source_frame.shape 1126 | ty, tx, tz = target_frame.shape 1127 | 1128 | s_aspect = sx / sy 1129 | t_aspect = tx / ty 1130 | 1131 | source_frames = [] 1132 | target_frames = [] 1133 | 1134 | source_from_frame = current_source_frame - (compare_frame_count // 2) 1135 | source_to_frame = source_from_frame + compare_frame_count 1136 | 1137 | source_video.set(cv2.CAP_PROP_POS_FRAMES, source_from_frame) 1138 | for _ in range(source_from_frame, source_to_frame): 1139 | frame_no = source_video.get(cv2.CAP_PROP_POS_FRAMES) 1140 | source_frames.append( 1141 | ( 1142 | cv2.cvtColor( 1143 | cv2.resize(source_video.read()[1], frame_index_size), 1144 | cv2.COLOR_BGR2GRAY, 1145 | ), 1146 | frame_no, 1147 | ) 1148 | ) 1149 | 1150 | target_from_frame = current_target_frame - spread_frame_count 1151 | target_to_frame = target_from_frame + (spread_frame_count * 2) 1152 | 1153 | target_video.set(cv2.CAP_PROP_POS_FRAMES, target_from_frame) 1154 | for _ in range(target_from_frame, target_to_frame): 1155 | frame_no = target_video.get(cv2.CAP_PROP_POS_FRAMES) 1156 | target_frame = target_video.read()[1] 1157 | if s_aspect > t_aspect: 1158 | new_tx = (tx / sx) * sy 1159 | slice_each_x = int((tx - new_tx) / 2) 1160 | target_frame = target_frame[0:ty, slice_each_x : (tx - slice_each_x)] 1161 | 1162 | target_frame = cv2.resize(target_frame, frame_index_size) 1163 | target_frames.append((cv2.cvtColor(target_frame, cv2.COLOR_BGR2GRAY), frame_no)) 1164 | best_diff = 0 1165 | best_frame_diff = None 1166 | for i in range(len(target_frames) - len(source_frames)): 1167 | v = target_frames[i : i + len(source_frames)] 1168 | diffs = [] 1169 | frame_nos = [] 1170 | for sf, tf in zip(source_frames, v): 1171 | sf, sfn = sf 1172 | tf, tfn = tf 1173 | frame_nos.append((sfn, tfn)) 1174 | diffs.append(structural_similarity(sf, tf, multichannel=False)) 1175 | diffs = np.square(np.array(diffs) * 100) 1176 | if sum(diffs) > best_diff: 1177 | best_diff = sum(diffs) 1178 | best_frame_diff = (target_from_frame + i) - source_from_frame 1179 | 1180 | return best_frame_diff 1181 | 1182 | 1183 | def totimestamp(fps, frame_count): 1184 | s = frame_count / fps 1185 | m, s = divmod(s, 60) 1186 | h, m = divmod(m, 60) 1187 | return f"{int(h):02}:{int(m):02}:{(s):06.3f}" 1188 | 1189 | 1190 | def make_time(frames, fps, pos=None): 1191 | ms = pysubs2.time.frames_to_ms(frames, fps) 1192 | if pos == "end": 1193 | ms = math.ceil((ms / 10)) * 10 1194 | elif pos == 
"start": 1195 | ms -= 10 1196 | actual_frames = pysubs2.time.ms_to_frames(ms, fps) 1197 | return ms 1198 | 1199 | 1200 | def save_frame_and_return_path(output_path, video, frame_no, position, region): 1201 | fn = f"frame-{frame_no:05}" 1202 | if position: 1203 | fn += "-" + "-".join([str(i) for i in position]) 1204 | if region: 1205 | fn += f"-r-{region.y}-{region.y + region.h}-{region.x}-{region.x + region.w}" 1206 | fn += ".jpg" 1207 | output_path.mkdir(exist_ok=True) 1208 | output_file = output_path / fn 1209 | if not output_file.exists(): 1210 | video.set(cv2.CAP_PROP_POS_FRAMES, frame_no) 1211 | ret, frame = video.read() 1212 | if position: 1213 | cv2.rectangle( 1214 | frame, 1215 | (position[2], position[0]), 1216 | (position[3], position[1]), 1217 | (0, 255, 0), 1218 | 3, 1219 | ) 1220 | if region: 1221 | frame = frame[ 1222 | region.y : region.y + region.h, region.x : region.x + region.w 1223 | ] 1224 | if frame.shape[1] > 640: 1225 | frame = cv2.resize( 1226 | frame, (640, int(frame.shape[0] / (frame.shape[1] / 640))) 1227 | ) 1228 | cv2.imwrite(str(output_file), frame) 1229 | 1230 | return f"{output_file.parent.name}/{output_file.name}" 1231 | 1232 | 1233 | def cleanup_text(text): 1234 | text = text.replace("|", "I") 1235 | return text 1236 | 1237 | 1238 | class Config: 1239 | def __init__(self, temp_folder, video_path): 1240 | self.lock = threading.Lock() 1241 | self.path = temp_folder / f"{Path(video_path).name}.conf.json" 1242 | if self.path.exists(): 1243 | self.config = json.loads(self.path.read_text()) 1244 | else: 1245 | self.config = {} 1246 | 1247 | def get_frame_diff(self): 1248 | with self.lock: 1249 | return self.config.get("frame_diff") 1250 | 1251 | def set_frame_diff(self, frame_diff): 1252 | with self.lock: 1253 | self.config["frame_diff"] = frame_diff 1254 | self._flush() 1255 | 1256 | def _flush(self): 1257 | self.path.write_text(json.dumps(self.config, indent=2)) 1258 | 1259 | def add_text_line(self, scan_mode, frame_range, progress, line=None): 1260 | with self.lock: 1261 | key = f"{scan_mode}-{frame_range[0]}-{frame_range[1]}" 1262 | if "progress" not in self.config: 1263 | self.config["progress"] = {} 1264 | self.config["progress"][key] = progress 1265 | if line: 1266 | self.config.setdefault("lines", []).append(line) 1267 | self._flush() 1268 | 1269 | def read_text_lines(self): 1270 | with self.lock: 1271 | return json.loads(self.path.read_text()).get("lines", []) 1272 | 1273 | 1274 | def pick_best_text(text_1, text_2): 1275 | t_text_1 = TextBlob(text_1) 1276 | t_text_2 = TextBlob(text_2) 1277 | 1278 | text_1_diff = normalized_damerau_levenshtein_distance(t_text_1, t_text_1.correct()) 1279 | text_2_diff = normalized_damerau_levenshtein_distance(t_text_2, t_text_2.correct()) 1280 | 1281 | if text_2_diff < text_1_diff: 1282 | return text_2 1283 | else: 1284 | return text_1 1285 | 1286 | 1287 | class FrameRangeParamType(click.ParamType): 1288 | name = "framerange" 1289 | 1290 | def convert(self, value, param, ctx): 1291 | value = value.split(":") 1292 | if len(value) != 2: 1293 | self.fail( 1294 | "Missing arguments, syntax is start_frame:end_frame - can be negative" 1295 | ) 1296 | 1297 | try: 1298 | start_frame = int(value[0]) 1299 | end_frame = int(value[1]) 1300 | except ValueError: 1301 | self.fail(f"Value is wrong type, must be int") 1302 | 1303 | return start_frame, end_frame 1304 | 1305 | 1306 | FRAME_RANGE = FrameRangeParamType() 1307 | 1308 | 1309 | @click.group() 1310 | @click.argument("subtitled-file", type=click.Path(exists=True), required=True) 
1311 | @click.argument("unsubtitled-file", type=click.Path(exists=True), required=True) 1312 | @click.option( 1313 | "--temp-folder", 1314 | type=click.Path(), 1315 | default="cow-temp", 1316 | help="Temp folder to store various files in.", 1317 | ) 1318 | @click.option( 1319 | "--subtitle-region-file", 1320 | type=click.Path(), 1321 | help="Subtitle region file.", 1322 | ) 1323 | @click.pass_context 1324 | def cli(ctx, subtitled_file, unsubtitled_file, temp_folder, subtitle_region_file): 1325 | temp_folder = Path(temp_folder) 1326 | temp_folder.mkdir(exist_ok=True) 1327 | 1328 | ctx.ensure_object(dict) 1329 | ctx.obj["subtitled_file"] = subtitled_file 1330 | ctx.obj["unsubtitled_file"] = unsubtitled_file 1331 | ctx.obj["temp_folder"] = temp_folder 1332 | ctx.obj["config"] = Config(temp_folder, subtitled_file) 1333 | if not subtitle_region_file: 1334 | subtitle_region_file = temp_folder / "subtitle_regions.json" 1335 | else: 1336 | subtitle_region_file = Path(subtitle_region_file) 1337 | 1338 | if not subtitle_region_file.exists(): 1339 | subtitle_region_file.write_text( 1340 | json.dumps( 1341 | [dataclasses.asdict(sr) for sr in default_subtitle_regions], indent=2 1342 | ) 1343 | ) 1344 | 1345 | ctx.obj["subtitle_regions"] = [ 1346 | SubtitleRegion(**sr) for sr in json.loads(subtitle_region_file.read_text()) 1347 | ] 1348 | 1349 | 1350 | @cli.command() 1351 | @click.option( 1352 | "--threads", 1353 | type=int, 1354 | default=3, 1355 | help="Number of threads to parse video with.", 1356 | ) 1357 | @click.option( 1358 | "--tesseract-data-path", 1359 | type=click.Path(exists=True), 1360 | required=False, 1361 | ) 1362 | @click.option( 1363 | "--frame-diff", 1364 | type=int, 1365 | help="Set a frame diff manually", 1366 | ) 1367 | @click.option( 1368 | "--frame-range", 1369 | type=FRAME_RANGE, 1370 | help="Specify frame range to use.", 1371 | ) 1372 | @click.option( 1373 | "--ignore-diff-fps", 1374 | is_flag=True, 1375 | help="Ignore that FPS differ.", 1376 | ) 1377 | @click.option( 1378 | "--run-subregions-in-parallel", 1379 | is_flag=True, 1380 | help="Run all the subtitle regions in parallel instead of one at a time.", 1381 | ) 1382 | @click.option( 1383 | "--fix-broken-frame-alignment", 1384 | is_flag=True, 1385 | help="Sometimes one of the videos are broken frame-wise, try a few frames to see if we find a good match.", 1386 | ) 1387 | @click.option( 1388 | "--debug-frame", 1389 | type=int, 1390 | help="Debug a frame, output an image to show debug information.", 1391 | ) 1392 | @click.option( 1393 | "--debug-subregion", 1394 | type=str, 1395 | help="Choose a specific subtitle region, use in combination with debug-frame.", 1396 | ) 1397 | @click.pass_context 1398 | def extract_subtitles( 1399 | ctx, 1400 | threads, 1401 | tesseract_data_path, 1402 | frame_diff, 1403 | frame_range, 1404 | ignore_diff_fps, 1405 | run_subregions_in_parallel, 1406 | fix_broken_frame_alignment, 1407 | debug_frame, 1408 | debug_subregion, 1409 | ): # subtitle_overwrite_region 1410 | subtitle_regions = ctx.obj["subtitle_regions"] 1411 | if tesseract_data_path is None: 1412 | tesseract_data_path = (Path(__file__).parent / "tess-data").absolute() 1413 | else: 1414 | tesseract_data_path = Path(tesseract_data_path).absolute() 1415 | tesseract_data_path = str(tesseract_data_path) 1416 | 1417 | config = ctx.obj["config"] 1418 | src_video = cv2.VideoCapture(ctx.obj["subtitled_file"]) 1419 | dst_video = cv2.VideoCapture(ctx.obj["unsubtitled_file"]) 1420 | 1421 | src_fps = fps = round(src_video.get(cv2.CAP_PROP_FPS), 3) 
1422 | dst_fps = round(dst_video.get(cv2.CAP_PROP_FPS), 3) 1423 | if not ignore_diff_fps and src_fps != dst_fps: 1424 | click.echo( 1425 | f"Source and destination FPS are differnet {src_fps=} {dst_fps=} - this is not supported right now - use --ignore-diff-fps to ignore this" 1426 | ) 1427 | quit(1) 1428 | 1429 | src_frame_count = src_video.get(cv2.CAP_PROP_FRAME_COUNT) 1430 | dst_frame_count = src_video.get(cv2.CAP_PROP_FRAME_COUNT) 1431 | max_frames = min( 1432 | int(src_video.get(cv2.CAP_PROP_FRAME_COUNT)), 1433 | int(dst_video.get(cv2.CAP_PROP_FRAME_COUNT)), 1434 | ) 1435 | 1436 | click.echo("Looking for video file frame difference") 1437 | if frame_diff is None: 1438 | frame_diff = config.get_frame_diff() 1439 | if frame_diff is None: # TODO: use DTW to find big diffs 1440 | click.echo("No frame diff found in cache, detecting...") 1441 | source_frame = int(max_frames * 0.6) 1442 | target_frame = source_frame 1443 | 1444 | good_breakpoint_diff = find_good_frame_breakpoint(src_video, source_frame) 1445 | frame_diff = estimate_video_frame_diff( 1446 | src_video, 1447 | dst_video, 1448 | source_frame + good_breakpoint_diff, 1449 | target_frame + good_breakpoint_diff, 1450 | ) 1451 | config.set_frame_diff(frame_diff) 1452 | click.echo(f"Using frame diff {frame_diff}") 1453 | 1454 | if debug_frame is not None: 1455 | debug_folder = ctx.obj["temp_folder"] / "debug" 1456 | debug_folder.mkdir(exist_ok=True) 1457 | start_frame_no = debug_frame 1458 | end_frame_no = start_frame_no + 1 1459 | if frame_diff < 0: 1460 | end_frame_no -= frame_diff 1461 | 1462 | img = get_frame(src_video, start_frame_no) 1463 | for sr in subtitle_regions: 1464 | cv2.rectangle(img, (sr.x, sr.y), (sr.x + sr.w, sr.y + sr.h), (0, 255, 0), 2) 1465 | cv2.putText( 1466 | img, 1467 | sr.name, 1468 | (20, sr.y + sr.h - 20), 1469 | cv2.FONT_HERSHEY_SIMPLEX, 1470 | 2, 1471 | (0, 255, 0), 1472 | 2, 1473 | cv2.LINE_AA, 1474 | ) 1475 | fn = debug_folder / f"subtitle_regions.png" 1476 | cv2.imwrite(str(fn.absolute()), img) 1477 | 1478 | if debug_subregion is not None: 1479 | sr = [sr for sr in subtitle_regions if sr.name == debug_subregion][0] 1480 | else: 1481 | sr = subtitle_regions[0] 1482 | 1483 | gather_images = [] 1484 | for result in loop_frames( 1485 | src_video, 1486 | dst_video, 1487 | start_frame_no, 1488 | end_frame_no, 1489 | frame_diff, 1490 | sr, 1491 | tesseract_data_path, 1492 | fix_broken_frame_alignment=fix_broken_frame_alignment, 1493 | gather_images=gather_images, 1494 | edge_threshold=sr.edge_threshold, 1495 | max_frames=max_frames, 1496 | ): 1497 | for i, (title, img) in enumerate(gather_images): 1498 | fn = debug_folder / f"{i:02}.png" 1499 | print(title, str(fn)) 1500 | if img is None or not img.shape: 1501 | print(f"Skipping image {title}") 1502 | continue 1503 | cv2.imwrite(str(fn.absolute()), img) 1504 | print(result) 1505 | break 1506 | else: 1507 | click.echo("Starting to extracting actual subtitles") 1508 | 1509 | def ocr_in_thread(start_frame_no, end_frame_no, frame_diff, sr): 1510 | src_video = cv2.VideoCapture(ctx.obj["subtitled_file"]) 1511 | dst_video = cv2.VideoCapture(ctx.obj["unsubtitled_file"]) 1512 | for result in loop_frames( 1513 | src_video, 1514 | dst_video, 1515 | start_frame_no, 1516 | end_frame_no, 1517 | frame_diff, 1518 | sr, 1519 | tesseract_data_path, 1520 | fix_broken_frame_alignment=fix_broken_frame_alignment, 1521 | gather_images=None, 1522 | edge_threshold=sr.edge_threshold, 1523 | max_frames=max_frames, 1524 | ): 1525 | config.add_text_line( 1526 | sr.scan_mode, 1527 | 
(start_frame_no, end_frame_no), 1528 | result.get("initial_frame_no", result.get("frame_no", 0)), 1529 | result, 1530 | ) 1531 | 1532 | max_workers = threads 1533 | if run_subregions_in_parallel: 1534 | max_workers *= len(subtitle_regions) 1535 | with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: 1536 | if frame_range is None: 1537 | frame_range = (0, max_frames) 1538 | 1539 | if frame_range[1] < 0: 1540 | frame_range = (frame_range[0], max_frames - frame_range[1]) 1541 | 1542 | frames_per_thread = math.ceil((frame_range[1] - frame_range[0]) / threads) 1543 | jobs = {} 1544 | for sr in subtitle_regions: 1545 | for i in range(threads): 1546 | start_frame_no = frame_range[0] + (frames_per_thread * i) 1547 | end_frame_no = frame_range[0] + ( 1548 | min(frames_per_thread * (i + 1), max_frames) 1549 | ) 1550 | jobs[ 1551 | executor.submit( 1552 | ocr_in_thread, start_frame_no, end_frame_no, frame_diff, sr 1553 | ) 1554 | ] = (sr, start_frame_no, end_frame_no) 1555 | 1556 | for future in concurrent.futures.as_completed(jobs): 1557 | sr, start_frame_no, end_frame_no = jobs[future] 1558 | try: 1559 | future.result() 1560 | except Exception as exc: 1561 | click.echo( 1562 | f"Failed job {sr.name} {start_frame_no=} {end_frame_no=}" 1563 | ) 1564 | traceback.print_exc() 1565 | # else: 1566 | # click.echo(f"Done with job {sr.name} {start_frame_no=} {end_frame_no=}") 1567 | click.echo(f"Done generating subtitle json from: {str(ctx.obj['subtitled_file'])}") 1568 | 1569 | 1570 | @cli.command() 1571 | @click.option( 1572 | "--output-report-path", 1573 | type=click.Path(exists=True), 1574 | required=False, 1575 | help="Folder to save reports to", 1576 | ) 1577 | @click.option( 1578 | "--output-subtitle-path", 1579 | type=click.Path(exists=True), 1580 | required=False, 1581 | help="Folder to save the subtitles to", 1582 | ) 1583 | @click.pass_context 1584 | def create_report(ctx, output_report_path, output_subtitle_path): 1585 | src_video = cv2.VideoCapture(ctx.obj["subtitled_file"]) 1586 | max_frames = src_video.get(cv2.CAP_PROP_FRAME_COUNT) - 1 1587 | fps = src_video.get(cv2.CAP_PROP_FPS) 1588 | env = jinja2.Environment() 1589 | env.filters["totimestamp"] = lambda frame_count: totimestamp(fps, frame_count) 1590 | if output_report_path is None: 1591 | output_report_path = ( 1592 | ctx.obj["temp_folder"] / f"{Path(ctx.obj['subtitled_file']).name}-report" 1593 | ) 1594 | else: 1595 | output_report_path = Path(output_report_path) 1596 | 1597 | output_report_path.mkdir(exist_ok=True) 1598 | 1599 | if output_subtitle_path is None: 1600 | output_subtitle_path = ( 1601 | ctx.obj["temp_folder"] 1602 | / f"{Path(ctx.obj['subtitled_file']).with_suffix('.ass').name}" 1603 | ) 1604 | else: 1605 | output_subtitle_path = Path(output_subtitle_path) 1606 | 1607 | config = ctx.obj["config"] 1608 | 1609 | def _get_frame(frame_no, position=None, region=None): 1610 | frame_no = min(max_frames, frame_no) 1611 | return save_frame_and_return_path( 1612 | output_report_path / "images", src_video, frame_no, position, region 1613 | ) 1614 | 1615 | env.globals["get_frame"] = _get_frame 1616 | 1617 | subs = pysubs2.SSAFile.from_string(BASE_ASS) 1618 | subs.info["PlayResX"] = int(src_video.get(cv2.CAP_PROP_FRAME_WIDTH)) 1619 | subs.info["PlayResY"] = int(src_video.get(cv2.CAP_PROP_FRAME_HEIGHT)) 1620 | 1621 | subtitle_regions = ctx.obj["subtitle_regions"] 1622 | 1623 | subtitle_region_data = {} 1624 | for sr in subtitle_regions: 1625 | subtitle_region_data[sr.name] = { 1626 | "subtitle_lines": [], 1627 | 
"subtitle_signs": [], 1628 | "missing_regions": [], 1629 | } 1630 | missing_regions, subtitles, subtitle_signs = ( 1631 | subtitle_region_data[sr.name]["missing_regions"], 1632 | subtitle_region_data[sr.name]["subtitle_lines"], 1633 | subtitle_region_data[sr.name]["subtitle_signs"], 1634 | ) 1635 | for l in config.read_text_lines(): 1636 | if l["region"] != sr.name: 1637 | continue 1638 | if "to_frame_no" in l: 1639 | l["to_frame_no"] = min(l["to_frame_no"], max_frames) 1640 | l["from_frame_no"] = min(l["to_frame_no"], l["from_frame_no"]) 1641 | if l["type"] == "subtitle": 1642 | subtitles.append(l) 1643 | elif l["type"] == "subtitle_sign": 1644 | subtitle_signs.append(l) 1645 | elif l["type"] == "missed_region": 1646 | missing_regions.append(l) 1647 | else: 1648 | click.echo(f"unknown type {l['type']}") 1649 | 1650 | missing_regions = sorted(missing_regions, key=lambda x: x["frame_no"]) 1651 | subtitles = sorted(subtitles, key=lambda x: x["from_frame_no"]) 1652 | subtitle_signs = sorted(subtitle_signs, key=lambda x: x["from_frame_no"]) 1653 | 1654 | missing_frames = set() 1655 | for missing_region in missing_regions: 1656 | missing_frames.add(missing_region["frame_no"]) 1657 | 1658 | cleaned_subtitles = [] 1659 | last_line = None 1660 | for line in subtitles: 1661 | line["subtitle_text"] = cleanup_text(line["subtitle_text"]) 1662 | if last_line is not None: 1663 | if ( 1664 | last_line["to_frame_no"] > line["from_frame_no"] - 3 1665 | and normalized_damerau_levenshtein_distance( 1666 | last_line["subtitle_text"], line["subtitle_text"] 1667 | ) 1668 | < 0.2 1669 | ): 1670 | # TODO: make sure all intersecting lines are removed too 1671 | last_line["from_frame_no"] = min( 1672 | last_line["from_frame_no"], line["from_frame_no"] 1673 | ) 1674 | last_line["to_frame_no"] = max( 1675 | last_line["to_frame_no"], line["to_frame_no"] 1676 | ) 1677 | last_line["subtitle_text"] = pick_best_text( 1678 | last_line["subtitle_text"], line["subtitle_text"] 1679 | ) 1680 | click.echo(f"Merging: {line} {last_line}") 1681 | continue 1682 | for frame_no in range(line["from_frame_no"], line["to_frame_no"] + 1): 1683 | if frame_no in missing_frames: 1684 | missing_frames.remove(frame_no) 1685 | click.echo(f"Frame {frame_no} found in missing frames") 1686 | cleaned_subtitles.append(line) 1687 | last_line = line 1688 | 1689 | if sr.scan_mode == SubtitleRegion.ScanMode.BOTTOM_CENTER: 1690 | short_subtitle_signs = [ 1691 | line 1692 | for line in cleaned_subtitles 1693 | if line["to_frame_no"] - line["from_frame_no"] <= 4 1694 | ] 1695 | cleaned_subtitles = [ 1696 | line 1697 | for line in cleaned_subtitles 1698 | if line["to_frame_no"] - line["from_frame_no"] > 4 1699 | ] 1700 | (output_report_path / f"{sr.name}-subtitles.html").write_text( 1701 | env.from_string(HTML_SUBTITLE_LINES).render( 1702 | subtitle_lines=cleaned_subtitles, sr=sr, title="CowOCR - Subtitles" 1703 | ) 1704 | ) 1705 | 1706 | for line in cleaned_subtitles: 1707 | subtitle_text = line["subtitle_text"].replace("\n", "\\N") 1708 | subs.append( 1709 | pysubs2.SSAEvent( 1710 | start=make_time( 1711 | frames=line["from_frame_no"], fps=fps, pos="start" 1712 | ), 1713 | end=make_time(frames=line["to_frame_no"], fps=fps, pos="end"), 1714 | text=subtitle_text, 1715 | style=sr.ass_style_name, 1716 | ) 1717 | ) 1718 | 1719 | if sr.scan_mode == SubtitleRegion.ScanMode.SEARCH_SLICE: 1720 | short_subtitle_signs = [ 1721 | subtitle_sign 1722 | for subtitle_sign in subtitle_signs 1723 | if subtitle_sign["to_frame_no"] - subtitle_sign["from_frame_no"] <= 10 
1724 | ] 1725 | missing_frames = [ 1726 | missing_region 1727 | for missing_region in missing_regions 1728 | if missing_region["frame_no"] in missing_frames 1729 | ] 1730 | (output_report_path / f"{sr.name}-missing-regions.html").write_text( 1731 | env.from_string(HTML_MISSING_REGIONS).render( 1732 | missing_regions=missing_frames, 1733 | sr=sr, 1734 | short_subtitle_signs=short_subtitle_signs, 1735 | title="CowOCR - Missing regions", 1736 | ) 1737 | ) 1738 | 1739 | if sr.scan_mode == SubtitleRegion.ScanMode.SEARCH_SLICE: 1740 | subtitle_signs = [ 1741 | subtitle_sign 1742 | for subtitle_sign in subtitle_signs 1743 | if subtitle_sign["to_frame_no"] - subtitle_sign["from_frame_no"] > 10 1744 | ] 1745 | for subtitle_sign in subtitle_signs: 1746 | subtitle_sign["subtitle_text"] = cleanup_text( 1747 | subtitle_sign["subtitle_text"] 1748 | ) 1749 | (output_report_path / f"{sr.name}-subtitle-signs.html").write_text( 1750 | env.from_string(HTML_SUBTITLE_SIGNS).render( 1751 | subtitle_signs=subtitle_signs, 1752 | sr=sr, 1753 | title="CowOCR - Subtitle Signs", 1754 | ) 1755 | ) 1756 | 1757 | for subtitle_sign in subtitle_signs: 1758 | subtitle_text = subtitle_sign["subtitle_text"].replace("\n", "\\N") 1759 | subtitle_text = ( 1760 | "{\pos(%s,%s)}" 1761 | % ( 1762 | int( 1763 | ( 1764 | subtitle_sign["position"][2] 1765 | + subtitle_sign["position"][3] 1766 | ) 1767 | / 2 1768 | ), 1769 | subtitle_sign["position"][1], 1770 | ) 1771 | ) + subtitle_text 1772 | subs.append( 1773 | pysubs2.SSAEvent( 1774 | start=make_time( 1775 | frames=subtitle_sign["from_frame_no"], fps=fps, pos="start" 1776 | ), 1777 | end=make_time( 1778 | frames=subtitle_sign["to_frame_no"], fps=fps, pos="end" 1779 | ), 1780 | text=subtitle_text, 1781 | style=sr.ass_style_name, 1782 | ) 1783 | ) 1784 | 1785 | (output_report_path / "index.html").write_text( 1786 | env.from_string(HTML_INDEX).render( 1787 | subtitle_regions=subtitle_regions, 1788 | subtitle_region_data=subtitle_region_data, 1789 | title="CowOCR - Index", 1790 | ) 1791 | ) 1792 | 1793 | subs.save(str(output_subtitle_path.absolute())) 1794 | 1795 | click.echo(f"Saved report to: {str(output_report_path)}") 1796 | click.echo(f"Saved subtitles to: {str(output_subtitle_path)}") 1797 | 1798 | 1799 | if __name__ == "__main__": 1800 | cli() 1801 | -------------------------------------------------------------------------------- /milksync.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import json 3 | import logging 4 | import math 5 | import os 6 | import pickle 7 | import shlex 8 | import shutil 9 | import subprocess 10 | import tempfile 11 | import uuid 12 | import warnings 13 | from collections import Counter 14 | from concurrent.futures import ThreadPoolExecutor 15 | from decimal import Decimal 16 | from pathlib import Path 17 | from re import sub 18 | 19 | import click 20 | import cv2 21 | import ffmpeg 22 | import librosa 23 | import numpy as np 24 | import pysubs2 25 | from annoy import AnnoyIndex 26 | from numpy.lib.stride_tricks import sliding_window_view 27 | from scipy.spatial.distance import cdist, cosine, pdist 28 | from skimage.metrics import structural_similarity 29 | 30 | logger = logging.getLogger(__name__) 31 | 32 | # HOP_LENGTH = 1024 33 | # HOP_LENGTH = 512 34 | HOP_LENGTH = None 35 | 36 | 37 | class Video: 38 | _cv2_video = None 39 | _cv2_video_info = None 40 | _ffmpeg_probe = None 41 | 42 | def __init__(self, filepath): 43 | self.filepath = filepath 44 | 45 | @property 46 | def 
video_capture(self): 47 | if not self._cv2_video: 48 | self._cv2_video = cv2.VideoCapture(self.filepath) 49 | 50 | return self._cv2_video 51 | 52 | @property 53 | def video_info(self): 54 | if not self._cv2_video_info: 55 | cap = self.video_capture 56 | self._cv2_video_info = { 57 | "fps": cap.get(cv2.CAP_PROP_FPS), 58 | "frame_count": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 59 | } 60 | self._cv2_video_info["duration"] = Decimal( 61 | self._cv2_video_info["frame_count"] 62 | ) / Decimal(self._cv2_video_info["fps"]) 63 | 64 | return self._cv2_video_info 65 | 66 | @property 67 | def probe(self): 68 | if self._ffmpeg_probe is None: 69 | self._ffmpeg_probe = ffmpeg.probe(self.filepath) 70 | return self._ffmpeg_probe 71 | 72 | @property 73 | def subtitle_streams(self): 74 | return [s for s in self.probe["streams"] if s["codec_type"] == "subtitle"] 75 | 76 | @property 77 | def audio_streams(self): 78 | return [s for s in self.probe["streams"] if s["codec_type"] == "audio"] 79 | 80 | @property 81 | def chapters(self): 82 | chapters = ( 83 | json.loads( 84 | subprocess.check_output( 85 | [ 86 | "ffprobe", 87 | "-loglevel", 88 | "error", 89 | "-hide_banner", 90 | "-of", 91 | "json", 92 | "-show_chapters", 93 | self.filepath, 94 | ], 95 | stdin=subprocess.PIPE, 96 | stderr=subprocess.DEVNULL, 97 | ) 98 | ) 99 | or {} 100 | ) 101 | return chapters.get("chapters", []) 102 | 103 | def create_ffmpeg(self): 104 | return ffmpeg.input(self.filepath) 105 | 106 | def extract_subtitle_metadata(self, track_id): 107 | stream = self.subtitle_streams[track_id] 108 | metadata = [] 109 | for k, v in stream.get("tags", {}).items(): 110 | if k in ["language", "title"]: 111 | metadata.append((k, v)) 112 | return metadata 113 | 114 | def extract_audio_metadata(self, track_id): 115 | stream = self.audio_streams[track_id] 116 | metadata = [] 117 | for k, v in stream.get("tags", {}).items(): 118 | if k in ["language", "title"]: 119 | metadata.append((k, v)) 120 | return metadata 121 | 122 | 123 | def estimate_audio_shift_points_from_subtitles( 124 | x_1_chroma, 125 | x_2_chroma, 126 | fs, 127 | video_file, 128 | track_id, 129 | n_chroma, 130 | adjust_delay=None, 131 | framerate_align=None, 132 | max_ms_cutoff=110, 133 | external_subtitle_file=None, 134 | ): 135 | # subtitle, subtitle_format = extract_subtitle_data(video_file, track_id, framerate_align) 136 | if external_subtitle_file is not None: 137 | subtitle, subtitle_format = import_subtitle_data( 138 | external_subtitle_file, framerate_align 139 | ) 140 | else: 141 | subtitle, subtitle_format = extract_subtitle_data( 142 | video_file, track_id, framerate_align 143 | ) 144 | index = AnnoyIndex(n_chroma, "euclidean") 145 | for i, c in enumerate(x_2_chroma): 146 | index.add_item(i, c) 147 | index.build(10) 148 | 149 | def align_subtitle(subtitle_line, min_datapoint_percent=0.2, min_avg=0.05, n=10): 150 | start_i = int(subtitle_line.start * fs / HOP_LENGTH / 1000) 151 | end_i = int(subtitle_line.end * fs / HOP_LENGTH / 1000) 152 | if end_i >= len(x_1_chroma): 153 | return [] 154 | found_indexes = {} 155 | for i in range(end_i - start_i): 156 | source_vector = list(x_1_chroma[start_i + i]) 157 | for vector_index in index.get_nns_by_vector(source_vector, n): 158 | target_vector = index.get_item_vector(vector_index) 159 | found_indexes.setdefault(vector_index - i, []).append( 160 | cosine(source_vector, target_vector) 161 | ) 162 | candidates = [] 163 | for k, v in sorted( 164 | found_indexes.items(), key=lambda x: len(x[1]), reverse=True 165 | ): 166 | if len(v) < (end_i - 
start_i) * min_datapoint_percent: 167 | v = v + found_indexes.get(k + 1, []) + found_indexes.get(k - 1, []) 168 | if len(v) < (end_i - start_i) * min_datapoint_percent: 169 | break 170 | if np.average(v) > min_avg: 171 | continue 172 | 173 | candidates.append(k) 174 | return candidates 175 | 176 | subtitle_matches = [] 177 | for i, s in enumerate(subtitle): 178 | candidates = align_subtitle(s) 179 | if candidates: 180 | subtitle_matches.append( 181 | [ 182 | int( 183 | librosa.frames_to_time(candidate, sr=fs, hop_length=HOP_LENGTH) 184 | * 1000 185 | ) 186 | - s.start 187 | for candidate in candidates 188 | ] 189 | ) 190 | else: 191 | subtitle_matches.append(None) 192 | 193 | def generate_best_chains(subtitle_matches): 194 | subtitle_groups = [] 195 | for i, subtitle_match in enumerate(subtitle_matches): 196 | if subtitle_match is None: 197 | continue 198 | ts_diff = subtitle_match[0] 199 | if ( 200 | not subtitle_groups 201 | or abs( 202 | ts_diff - np.median([t for (t, _, _) in subtitle_groups[-1][:20]]) 203 | ) 204 | > max_ms_cutoff 205 | ): 206 | print(f"Creating new group with {ts_diff=}") 207 | subtitle_groups.append([(ts_diff, i, subtitle_match)]) 208 | else: 209 | subtitle_groups[-1].append((ts_diff, i, subtitle_match)) 210 | 211 | group_size_cutoff = 5 212 | has_modified = True 213 | while has_modified: 214 | has_modified = False 215 | previous_subtitle_group = None 216 | for i, subtitle_group in enumerate(subtitle_groups): 217 | if len(subtitle_group) > group_size_cutoff: 218 | ts_diff = np.median([t for (t, _, _) in subtitle_group[:20]]) 219 | if i >= len(subtitle_groups): 220 | next_subtitle_group = subtitle_groups[i + 1] 221 | else: 222 | next_subtitle_group = None 223 | if previous_subtitle_group is not None: 224 | for entry in list(previous_subtitle_group[::-1]): 225 | matched_ts = sorted( 226 | [(abs(ts - ts_diff), ts) for ts in entry[2]] 227 | ) 228 | if matched_ts[0][0] <= max_ms_cutoff: 229 | print(f"Moving entry forward {entry}") 230 | previous_subtitle_group.pop(-1) 231 | subtitle_group.insert( 232 | 0, (matched_ts[0][1], entry[1], entry[2]) 233 | ) 234 | has_modified = True 235 | else: 236 | print(f"Breaking at {entry}") 237 | break 238 | if next_subtitle_group is not None: 239 | for entry in list(next_subtitle_group): 240 | matched_ts = sorted( 241 | [(abs(ts - ts_diff), ts) for ts in entry[2]] 242 | ) 243 | if matched_ts[0][0] <= max_ms_cutoff: 244 | print(f"Moving entry back {entry}") 245 | next_subtitle_group.pop(0) 246 | subtitle_group.append( 247 | (matched_ts[0][1], entry[1], entry[2]) 248 | ) 249 | has_modified = True 250 | else: 251 | print(f"Breaking at back {entry}") 252 | break 253 | 254 | previous_subtitle_group = None 255 | else: 256 | previous_subtitle_group = subtitle_group 257 | 258 | to_remove_groups = [] 259 | for i, subtitle_group in enumerate(subtitle_groups): 260 | if len(subtitle_group) == 0: 261 | to_remove_groups.append(i) 262 | 263 | for i in to_remove_groups[::-1]: 264 | print(f"Removing group {i}") 265 | del subtitle_groups[i] 266 | 267 | has_modified = True 268 | while has_modified: 269 | has_modified = False 270 | to_remove_groups = [] 271 | for i, subtitle_group in enumerate(subtitle_groups): 272 | if len(subtitle_group) <= 2: 273 | to_remove_groups.append(i) 274 | 275 | for i in to_remove_groups[::-1]: 276 | del subtitle_groups[i] 277 | print(f"Deleting {i}") 278 | has_modified = True 279 | 280 | for i, subtitle_group in enumerate(subtitle_groups): 281 | if not subtitle_group or i == 0: 282 | continue 283 | previous_subtitle_group = 
subtitle_groups[i - 1] 284 | previous_ts_diff = np.median( 285 | [t for (t, _, _) in previous_subtitle_group] 286 | ) 287 | ts_diff = np.median([t for (t, _, _) in subtitle_group]) 288 | if abs(previous_ts_diff - ts_diff) < max_ms_cutoff: 289 | print(f"Merging into {i}") 290 | subtitle_groups[i] = previous_subtitle_group + subtitle_groups[i] 291 | subtitle_groups[i - 1] = [] 292 | has_modified = True 293 | 294 | return subtitle_groups 295 | 296 | audio_shift_points, sync_buckets, delete_buckets = [], [], [] 297 | 298 | previous_start_timestamp, previous_end_timestamp = None, None 299 | for subtitle_group in generate_best_chains(subtitle_matches): 300 | delta = np.median([t for (t, _, _) in subtitle_group if t is not None]) / 1000 301 | subtitle_group_indexes = set([i for (_, i, _) in subtitle_group]) 302 | timestamps = [] 303 | for i, subtitle_line in enumerate(subtitle): 304 | if i not in subtitle_group_indexes: 305 | continue 306 | timestamps += [subtitle_line.start, subtitle_line.end] 307 | start_timestamp = min(timestamps) / 1000 308 | end_timestamp = max(timestamps) / 1000 309 | 310 | slice_buffer_length = 51200 // HOP_LENGTH 311 | x_1_start_i = int((start_timestamp * fs) / HOP_LENGTH) 312 | x_1_end_i = ( 313 | int((end_timestamp * fs) / HOP_LENGTH) 314 | - slice_buffer_length 315 | - slice_buffer_length 316 | ) 317 | min_slice_length = 204800 // HOP_LENGTH 318 | if x_1_end_i - x_1_start_i > min_slice_length: 319 | x_2_start_i = int(((start_timestamp + delta) * fs) / HOP_LENGTH) 320 | x_2_end_i = int(((end_timestamp + delta) * fs) / HOP_LENGTH) 321 | 322 | x_1_chroma_slice = x_1_chroma[ 323 | x_1_start_i 324 | + slice_buffer_length : x_1_start_i 325 | + slice_buffer_length 326 | + min_slice_length 327 | ] 328 | x_2_chroma_slice = x_2_chroma[ 329 | x_2_start_i : x_2_start_i 330 | + min_slice_length 331 | + slice_buffer_length 332 | + slice_buffer_length 333 | ] 334 | 335 | C = cdist(x_1_chroma_slice, x_2_chroma_slice, metric="cosine") 336 | C = np.nan_to_num(C, copy=False) 337 | 338 | smallest_i, smallest_value = None, None 339 | for i in range(len(x_2_chroma_slice) - len(x_1_chroma_slice)): 340 | cost_diagonal = np.flip(np.diagonal(C, offset=i)) 341 | total_cost = np.sum(cost_diagonal) 342 | if smallest_value is None or smallest_value > total_cost: 343 | smallest_value = total_cost 344 | smallest_i = i 345 | print(f"Additional buffer change: {smallest_i - slice_buffer_length}") 346 | delta = librosa.frames_to_time( 347 | [(x_2_start_i - x_1_start_i) + (smallest_i - slice_buffer_length)], 348 | sr=fs, 349 | hop_length=HOP_LENGTH, 350 | )[0] 351 | # delta += librosa.frames_to_time([smallest_i - slice_buffer_length], sr=fs, hop_length=HOP_LENGTH)[0] 352 | delta += adjust_delay or 0 353 | 354 | print(f"sync points {start_timestamp} {end_timestamp} {delta=}") 355 | if not audio_shift_points and start_timestamp > 0 and delta > 0: 356 | audio_shift_points.append((0.0, delta, delta)) 357 | delete_buckets.append((0, start_timestamp - 0.001)) 358 | 359 | if previous_end_timestamp is not None: 360 | audio_shift_points.append( 361 | ( 362 | previous_end_timestamp + 0.01 + 100_000_000, 363 | previous_end_timestamp + 0.01, 364 | -100_000_000, 365 | ) 366 | ) 367 | delete_buckets.append( 368 | (previous_end_timestamp + 0.001, start_timestamp - 0.001) 369 | ) 370 | 371 | sync_buckets.append((start_timestamp, end_timestamp, delta)) 372 | audio_shift_points.append( 373 | (start_timestamp - 0.01 + delta, start_timestamp - 0.01, delta) 374 | ) 375 | previous_start_timestamp, previous_end_timestamp = ( 
376 | start_timestamp, 377 | end_timestamp, 378 | ) 379 | if previous_end_timestamp is not None: 380 | delete_buckets.append((previous_end_timestamp + 0.001, 100_000)) 381 | 382 | return audio_shift_points, sync_buckets, delete_buckets 383 | 384 | 385 | def estimate_audio_shift_points( 386 | x_1_chroma, 387 | x_2_chroma, 388 | fs, 389 | max_cost_matrix_size=200_000_000, 390 | only_delta=False, 391 | adjust_delay=None, 392 | sliding_window_size=300, 393 | ): 394 | expected_matrix_size = len(x_1_chroma) * len(x_2_chroma) 395 | 396 | print(f"Expected cost-matrix size is {expected_matrix_size=}") 397 | 398 | if expected_matrix_size > max_cost_matrix_size: 399 | print( 400 | f"Since our cost-matrix is bigger than max allowed cost {max_cost_matrix_size=} we will slice it." 401 | ) 402 | chroma_slice_size = int(math.sqrt(max_cost_matrix_size)) 403 | chroma_slice_step = int(chroma_slice_size * 0.8) 404 | else: 405 | print("Memory can fully fit our cost-matrix") 406 | chroma_slice_size = 100_000_000 407 | chroma_slice_step = 100_000_000 408 | 409 | all_diffs = None 410 | all_timestamps = None 411 | 412 | for i in range( 413 | 0, max([len(x_1_chroma), len(x_2_chroma)]), chroma_slice_step 414 | ): # TODO 415 | start_i = max( 416 | min( 417 | len(x_1_chroma) - chroma_slice_step, 418 | len(x_2_chroma) - chroma_slice_step, 419 | i, 420 | ), 421 | 0, 422 | ) 423 | 424 | x_1_chroma_slice = x_1_chroma[start_i : start_i + chroma_slice_size] 425 | x_2_chroma_slice = x_2_chroma[start_i : start_i + chroma_slice_size] 426 | if start_i: 427 | wp_offset = librosa.frames_to_time([start_i], sr=fs, hop_length=HOP_LENGTH)[ 428 | 0 429 | ] 430 | else: 431 | wp_offset = 0 432 | print( 433 | f"Doing chroma slices x1={len(x_1_chroma_slice)} x2={len(x_2_chroma_slice)} {wp_offset=}" 434 | ) 435 | 436 | C = cdist(x_2_chroma_slice, x_1_chroma_slice, metric="cosine") 437 | C = np.nan_to_num(C, copy=False) 438 | D, wp = librosa.sequence.dtw(C=C) 439 | wp_s = np.flip( 440 | librosa.frames_to_time(wp, sr=fs, hop_length=HOP_LENGTH) + wp_offset, axis=0 441 | ) 442 | 443 | diffs = [] 444 | timestamps = [] 445 | 446 | t1_already_seen = set() 447 | t2_already_seen = set() 448 | 449 | for t1, t2 in wp_s: 450 | should_skip = t1 in t1_already_seen or t2 in t2_already_seen 451 | t1_already_seen.add(t1) 452 | t2_already_seen.add(t2) 453 | if should_skip: 454 | continue 455 | 456 | diff = np.round(t1 - t2, 3) 457 | diffs.append(diff) 458 | timestamps.append((t1, t2)) 459 | 460 | if all_timestamps: 461 | at1, at2 = all_timestamps[-1] 462 | t1, t2 = timestamps[0] 463 | 464 | cutoff_t = t1 + ((at1 - t1) / 2) 465 | for timestamp_index, (t1, t2) in enumerate(timestamps): 466 | if t1 > cutoff_t: 467 | break 468 | for all_timestamp_index, (t1, t2) in enumerate(all_timestamps[::-1]): 469 | if t1 <= cutoff_t: 470 | break 471 | all_timestamps = ( 472 | all_timestamps[:-all_timestamp_index] + timestamps[timestamp_index:] 473 | ) 474 | all_diffs = all_diffs[:-all_timestamp_index] + diffs[timestamp_index:] 475 | else: 476 | all_timestamps = timestamps 477 | all_diffs = diffs 478 | 479 | if only_delta: 480 | min_abs_diff = 0.03 481 | else: 482 | min_abs_diff = 0.06 483 | # sliding_window_size = 300 TODO: make it change near the end to detect changes while keeping it higher before. 
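    # Sketch of what follows (descriptive note, not part of the original run
    # output): all_diffs holds the offsets (t1 - t2, target minus source, in
    # seconds) recovered along the DTW warping path. A window of
    # sliding_window_size samples slides over it and the most common value in
    # each window is taken as the locally dominant offset; a new shift point
    # (t1, t2, offset) is recorded whenever that dominant offset moves by more
    # than min_abs_diff. With illustrative (assumed) values, a diff stream like
    # [0.50, 0.50, 0.50, 2.10, 2.10, 2.10] would yield one shift point at
    # 0.50 s and a second one where 2.10 s takes over.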
484 | shift_points = [] 485 | last_most_common = None 486 | for i, v in enumerate( 487 | sliding_window_view(all_diffs, window_shape=sliding_window_size) 488 | ): 489 | most_common, most_common_count = Counter(v).most_common(1)[0] 490 | if ( 491 | last_most_common is None 492 | or abs(most_common - last_most_common) > min_abs_diff 493 | ): 494 | j = list(v).index(most_common) 495 | t1, t2 = all_timestamps[i + j] 496 | if adjust_delay: 497 | adjusted_most_common = most_common + adjust_delay 498 | else: 499 | adjusted_most_common = most_common 500 | shift_points.append((t1, t2, adjusted_most_common)) 501 | last_most_common = most_common 502 | print( 503 | f"Found sync point source_timestamp={t2} target_timestamp={t1} delta={most_common} delta_count={most_common_count} delta_average={np.average(v)} delta_median={np.median(v)}" 504 | ) 505 | 506 | new_shift_points = [] 507 | is_first_point = True 508 | for t1, t2, delta in shift_points: 509 | x_2_compare_point = int((t1 * fs) / HOP_LENGTH) 510 | x_1_compare_point = int((t2 * fs) / HOP_LENGTH) 511 | 512 | step_back = min( 513 | int((max(abs(delta * 0.3), 1) * fs) / HOP_LENGTH), 514 | x_1_compare_point, 515 | x_2_compare_point, 516 | ) # TODO: do not float into previous shift point 517 | range_end = min( 518 | int((5 * fs) / HOP_LENGTH), 519 | len(x_2_chroma) - x_2_compare_point, 520 | len(x_1_chroma) - x_1_compare_point, 521 | ) 522 | 523 | C = cdist( 524 | x_2_chroma[x_2_compare_point - step_back : x_2_compare_point + range_end], 525 | x_1_chroma[x_1_compare_point - step_back : x_1_compare_point + range_end], 526 | metric="cosine", 527 | ) 528 | C = np.nan_to_num(C, copy=False) 529 | cost_diagonal = np.flip(np.diagonal(C)) 530 | print( 531 | f"Trying to align range source_timestamp={t2} target_timestamp={t1} {delta=} {step_back=} {range_end=} {x_1_compare_point=} {x_2_compare_point=}" 532 | ) 533 | max_cost = np.max(cost_diagonal[:range_end]) * 1.15 534 | print(f"{max_cost=}") 535 | for i, cost in enumerate(cost_diagonal): 536 | if cost > max_cost: 537 | print(f"Found breakpoint at additional delta {i - range_end}") 538 | seconds = ((i - range_end) * HOP_LENGTH) / fs 539 | new_shift_points.append((t1 - seconds, t2 - seconds, delta)) 540 | break 541 | else: 542 | if is_first_point: 543 | new_shift_points.append((t1 - min(t1, t2), t2 - min(t1, t2), delta)) 544 | print( 545 | "No breakpoint found, moving all the way back (if this is the first point)" 546 | ) 547 | else: 548 | new_shift_points.append((t1, t2, delta)) 549 | print( 550 | "No breakpoint found, assume cost problems et.al. and just adding the current" 551 | ) 552 | 553 | is_first_point = False 554 | 555 | zero_shifting_delta = 8.0 556 | t1, t2, delta = new_shift_points[0] 557 | if 0 < t1 < zero_shifting_delta or 0 < t2 < zero_shifting_delta: 558 | print(f"Zero-shifting initial delta {t1=} {t2=} {delta}") 559 | shift_delta = min(t1, t2) 560 | t1 -= shift_delta 561 | t2 -= shift_delta 562 | new_shift_points[0] = (t1, t2, delta) 563 | 564 | return new_shift_points 565 | 566 | 567 | class TrackMappingParamType(click.ParamType): 568 | name = "trackmapping" 569 | 570 | def convert(self, value, param, ctx): 571 | converted_mapping = [] 572 | for mapping in value.split(","): 573 | try: 574 | mapping = [int(v) for v in mapping.split(":")] 575 | except ValueError: 576 | self.fail( 577 | "Failed to convert mappings to numbers. Syntax is file:track e.g. 0:1" 578 | ) 579 | 580 | if len(mapping) != 2: 581 | self.fail("Mappings must be 2 long. Syntax is file:track e.g. 
0:1") 582 | 583 | converted_mapping.append(mapping) 584 | return converted_mapping 585 | 586 | 587 | TRACK_MAPPING = TrackMappingParamType() 588 | 589 | 590 | class MetadataMappingParamType(click.ParamType): 591 | name = "metadatamapping" 592 | 593 | def convert(self, value, param, ctx): 594 | value = value.split("=", 2) 595 | if len(value) != 3: 596 | self.fail("Missing arguments, syntax is track_id=key=value") 597 | 598 | track_id, key, value = value 599 | 600 | try: 601 | track_id = int(track_id) 602 | except ValueError: 603 | self.fail("Track ID must be an ID") 604 | 605 | return (track_id, key, value) 606 | 607 | 608 | METADATA_MAPPING = MetadataMappingParamType() 609 | 610 | 611 | class IntegerRangeParamType(click.ParamType): 612 | name = "integerrange" 613 | 614 | def convert(self, value, param, ctx): 615 | value = value.split("-") 616 | if len(value) != 2: 617 | self.fail("Missing arguments, syntax is start_second-end_second") 618 | 619 | start_second, end_second = value 620 | 621 | try: 622 | start_second, end_second = int(start_second), int(end_second) 623 | except ValueError: 624 | self.fail("Range must be integers") 625 | 626 | return (start_second, end_second) 627 | 628 | 629 | INTEGER_RANGE = IntegerRangeParamType() 630 | 631 | 632 | def generate_chroma_cqt( 633 | source_file, 634 | audio_track_id, 635 | target_file, 636 | n_chroma=12, 637 | framerate_align=None, 638 | preserve_silence=False, 639 | ): 640 | target_wav_file = target_file.with_suffix(".wav") 641 | 642 | cmd = [ 643 | "ffmpeg", 644 | "-y", 645 | "-i", 646 | str(source_file), 647 | "-map", 648 | f"0:a:{audio_track_id}", 649 | "-ar", 650 | "22050", 651 | ] 652 | 653 | # -filter:a "atempo=2.0" 654 | if framerate_align: 655 | cmd += [ 656 | "-filter:a", 657 | f"atempo={framerate_align[1] / framerate_align[0]}", 658 | ] 659 | 660 | cmd += [str(target_wav_file)] 661 | 662 | subprocess.check_call( 663 | cmd, 664 | stdout=subprocess.DEVNULL, 665 | stderr=subprocess.DEVNULL, 666 | stdin=subprocess.PIPE, 667 | ) 668 | 669 | src, fs = librosa.load(str(target_wav_file)) 670 | y = src 671 | if not preserve_silence: 672 | _, (ltrim, rtrim) = librosa.effects.trim(src) 673 | print( 674 | f"Trimming {(len(src) - rtrim) / fs}s silence from end from {source_file.name}" 675 | ) 676 | y = y[:rtrim] 677 | chroma = librosa.feature.chroma_cqt( 678 | y=y, sr=fs, hop_length=HOP_LENGTH, n_chroma=n_chroma 679 | ).T 680 | target_file.write_bytes(pickle.dumps((chroma, fs))) 681 | target_wav_file.unlink() 682 | return chroma, fs 683 | 684 | 685 | def find_and_align_chapter(x_1_chroma, x_2_chroma, fs, min_match_value=60): 686 | """Find x_1_chroma in x_2_chroma""" 687 | best_equals = 0 688 | C = cdist(x_1_chroma[42:], x_2_chroma, metric="cosine") 689 | C = np.nan_to_num(C, copy=False) 690 | smallest_sum = None 691 | location = None 692 | smallest_found = 9999999 693 | for i, v in enumerate( 694 | sliding_window_view(C.T, window_shape=len(x_1_chroma[42:]), axis=0) 695 | ): 696 | s = np.sum(np.diagonal(v)) 697 | if smallest_found > s: 698 | # print(s) 699 | smallest_found = s 700 | if s > min_match_value: 701 | continue 702 | if smallest_sum is None or s < smallest_sum: 703 | smallest_sum = s 704 | location = i 705 | 706 | if smallest_sum is None: 707 | return None 708 | 709 | return librosa.frames_to_time( 710 | [location, location + len(x_1_chroma)], sr=fs, hop_length=HOP_LENGTH 711 | ) 712 | 713 | 714 | def humanize_seconds(s): 715 | m, s = divmod(s, 60) 716 | h, m = divmod(m, 60) 717 | return f"{int(h):02}:{int(m):02}:{(s):06.3f}" 718 | 719 | 
720 | def turn_audio_shift_points_to_audio_segments(audio_shift_points): 721 | sync_buckets = [] 722 | delete_buckets = [] 723 | 724 | if len(audio_shift_points) > 1: 725 | for i in range(len(audio_shift_points)): 726 | local_audio_shift_points = audio_shift_points[i : i + 2] 727 | if len(local_audio_shift_points) > 1: 728 | (st1, st2, sdelta), (et1, et2, edelta) = local_audio_shift_points 729 | 730 | from_delete_time = (et2 + edelta) - sdelta 731 | # from_delete_delay = max(0.0, et2 - from_delete_time) 732 | if from_delete_time < et2: 733 | delete_buckets.append((from_delete_time, et2)) 734 | 735 | # sync_buckets.append(((st2 - min(from_delete_delay, to_delete_max_reuse)), st2 + (et1 - st1), sdelta)) 736 | sync_buckets.append( 737 | (st2, min(st2 + (et1 - st1), et2), sdelta) 738 | ) # TODO: make sure the et2 stuff is correct 739 | 740 | t1, t2, delta = audio_shift_points[-1] 741 | # sync_buckets.append((t2 - min(from_delete_delay, to_delete_max_reuse), 1_000_000, delta)) 742 | sync_buckets.append((t2, 1_000_000, delta)) 743 | print(f"Delete buckets {delete_buckets}") 744 | 745 | return sync_buckets, delete_buckets 746 | 747 | 748 | def find_good_frame_breakpoint(video, current_frame): # do binary search instead? 749 | compare_frame_size = (32, 32) 750 | frame_cache = {} 751 | 752 | def get_frame(frame_no): 753 | if frame_no not in frame_cache: 754 | video.set(cv2.CAP_PROP_POS_FRAMES, frame_no) 755 | frame_cache[frame_no] = cv2.cvtColor( 756 | cv2.resize(video.read()[1], compare_frame_size), cv2.COLOR_BGR2GRAY 757 | ) 758 | return frame_cache[frame_no] 759 | 760 | best_score = 1.0 761 | best_frame = current_frame 762 | for frame_no in frame_generator(current_frame): 763 | score = structural_similarity(get_frame(frame_no), get_frame(frame_no + 1)) 764 | if score < best_score: 765 | best_score = score 766 | best_frame = frame_no 767 | if score < 0.65: 768 | return frame_no - current_frame 769 | return best_frame - current_frame 770 | 771 | 772 | def frame_generator(start_i): 773 | for i in range(1, 300): 774 | if i >= start_i: 775 | continue 776 | yield start_i + i 777 | yield start_i - i 778 | 779 | 780 | def estimate_frame_diff( 781 | source_video, target_video, current_source_frame, current_target_frame 782 | ): 783 | frame_index_size = (64, 64) 784 | compare_frame_count = 5 785 | spread_frame_count = 14 786 | ret, source_frame = source_video.read() 787 | ret, target_frame = target_video.read() 788 | 789 | sy, sx, sz = source_frame.shape 790 | ty, tx, tz = target_frame.shape 791 | 792 | s_aspect = sx / sy 793 | t_aspect = tx / ty 794 | 795 | source_frames = [] 796 | target_frames = [] 797 | 798 | source_from_frame = current_source_frame - (compare_frame_count // 2) 799 | source_to_frame = source_from_frame + compare_frame_count 800 | 801 | source_video.set(cv2.CAP_PROP_POS_FRAMES, source_from_frame) 802 | for _ in range(source_from_frame, source_to_frame): 803 | frame_no = source_video.get(cv2.CAP_PROP_POS_FRAMES) 804 | source_frames.append( 805 | ( 806 | cv2.cvtColor( 807 | cv2.resize(source_video.read()[1], frame_index_size), 808 | cv2.COLOR_BGR2GRAY, 809 | ), 810 | frame_no, 811 | ) 812 | ) 813 | 814 | target_from_frame = current_target_frame - spread_frame_count 815 | target_to_frame = target_from_frame + (spread_frame_count * 2) 816 | 817 | target_video.set(cv2.CAP_PROP_POS_FRAMES, target_from_frame) 818 | for _ in range(target_from_frame, target_to_frame): 819 | frame_no = target_video.get(cv2.CAP_PROP_POS_FRAMES) 820 | target_frame = target_video.read()[1] 821 | if s_aspect > 
t_aspect: 822 | new_tx = (tx / sx) * sy 823 | slice_each_x = int((tx - new_tx) / 2) 824 | target_frame = target_frame[0:ty, slice_each_x : (tx - slice_each_x)] 825 | 826 | target_frame = cv2.resize(target_frame, frame_index_size) 827 | target_frames.append((cv2.cvtColor(target_frame, cv2.COLOR_BGR2GRAY), frame_no)) 828 | best_diff = 0 829 | best_frame_diff = None 830 | for i in range(len(target_frames) - len(source_frames)): 831 | v = target_frames[i : i + len(source_frames)] 832 | diffs = [] 833 | frame_nos = [] 834 | for sf, tf in zip(source_frames, v): 835 | sf, sfn = sf 836 | tf, tfn = tf 837 | frame_nos.append((sfn, tfn)) 838 | diffs.append(structural_similarity(sf, tf, multichannel=False)) 839 | diffs = np.square(np.array(diffs) * 100) 840 | if sum(diffs) > best_diff: 841 | best_diff = sum(diffs) 842 | best_frame_diff = (target_from_frame + i) - source_from_frame 843 | 844 | return best_frame_diff 845 | 846 | 847 | def frame_align_video(source_video, target_video, line_start, delta): 848 | print(f"Frame aligning video at {line_start=} {delta=}") 849 | source_frame_no = math.ceil((line_start * source_video.video_info["fps"]) / 1000) 850 | target_frame_no = math.ceil( 851 | ((line_start + delta) * target_video.video_info["fps"]) / 1000 852 | ) 853 | 854 | good_breakpoint = find_good_frame_breakpoint( 855 | source_video.video_capture, source_frame_no 856 | ) 857 | 858 | frame_diff = estimate_frame_diff( 859 | source_video.video_capture, 860 | target_video.video_capture, 861 | source_frame_no + good_breakpoint, 862 | target_frame_no + good_breakpoint, 863 | ) 864 | # frame_diff_delta = math.ceil((frame_diff / target_video.video_info["fps"]) * 1000) 865 | best_target_frame = source_frame_no + frame_diff 866 | best_target_frame_time = math.ceil( 867 | best_target_frame * 1000 / target_video.video_info["fps"] 868 | ) 869 | # actual_delta = line.start + frame_diff_delta 870 | actual_delta = best_target_frame_time - line_start 871 | if actual_delta != delta: 872 | print( 873 | f"Sign delta is different {delta=} {actual_delta=} {line_start=} {best_target_frame=} {best_target_frame_time=} {humanize_seconds((line_start + actual_delta)/1000)}" 874 | ) 875 | 876 | return actual_delta 877 | 878 | 879 | def frame_align_sync_bucket(source_video, target_video, sync_bucket): 880 | start_timestamp, end_timestamp, delta = sync_bucket 881 | delta = round(delta * 1000) 882 | actual_end_timestamp = min( 883 | end_timestamp, float(source_video.video_info["duration"]) 884 | ) 885 | line_start = int(((actual_end_timestamp - start_timestamp) * 1000) / 2) 886 | new_delta = frame_align_video(source_video, target_video, line_start, delta) 887 | if delta != new_delta: 888 | print(f"Changed delta from {delta=} {new_delta=}") 889 | return (start_timestamp, end_timestamp, new_delta / 1000) 890 | 891 | 892 | def extract_subtitle_data(video_file, track_id, framerate_align=None): 893 | track_stream = video_file.subtitle_streams[track_id] 894 | codec_name_mapping = { 895 | "ass": "ass", 896 | "subrip": "srt", 897 | } 898 | subtitle_format = codec_name_mapping[track_stream["codec_name"]] 899 | subtitles_data = ( 900 | video_file.create_ffmpeg()[f"s:{track_id}"] 901 | .output("pipe:", format=subtitle_format) 902 | .run(capture_stdout=True, quiet=True)[0] 903 | .decode("utf-8") 904 | ) 905 | subtitles = pysubs2.SSAFile.from_string(subtitles_data) 906 | if framerate_align is not None: 907 | subtitles.transform_framerate(framerate_align[0], framerate_align[1]) 908 | return subtitles, subtitle_format 909 | 910 | 911 | def 
import_subtitle_data(external_subtitle_file, framerate_align=None): 912 | subtitle_format = external_subtitle_file.split(".")[-1] 913 | subtitles = pysubs2.load(external_subtitle_file) 914 | if framerate_align is not None: 915 | subtitles.transform_framerate(framerate_align[0], framerate_align[1]) 916 | return subtitles, subtitle_format 917 | 918 | 919 | def extract_and_sync_subtitles( 920 | video_file, 921 | track_id, 922 | video_duration, 923 | only_delta, 924 | audio_shift_points, 925 | subtitle_sync_buckets, 926 | subtitle_delete_buckets, 927 | output_file, 928 | subtitle_cutoff, 929 | sync_non_dialogue_to_video, 930 | output_video_file, 931 | framerate_align=None, 932 | external_subtitle_file=None, 933 | subtitle_min_font_size=None, 934 | ): 935 | video_align_cache = {} 936 | if external_subtitle_file is not None: 937 | subtitle, subtitle_format = import_subtitle_data( 938 | external_subtitle_file, framerate_align 939 | ) 940 | else: 941 | subtitle, subtitle_format = extract_subtitle_data( 942 | video_file, track_id, framerate_align 943 | ) 944 | # TODO: if target video is missing some stuff, make sure we do not shift into that part and have double subtitles. 945 | 946 | new_subtitles = [] 947 | video_duration = int(video_duration * 1000) 948 | subtitle_sync_buckets = [ 949 | (round(t1 * 1000), round(t2 * 1000), round(delta * 1000)) 950 | for (t1, t2, delta) in subtitle_sync_buckets 951 | ] 952 | subtitle_delete_buckets = [ 953 | (round(t1 * 1000), round(t2 * 1000)) for (t1, t2) in subtitle_delete_buckets 954 | ] 955 | print(f"Sync buckets {subtitle_sync_buckets=} {subtitle_delete_buckets=}") 956 | to_delete_lines = set() 957 | for i, line in enumerate(subtitle): 958 | is_dialogue = True 959 | if ( 960 | sync_non_dialogue_to_video 961 | and line.start >= sync_non_dialogue_to_video[0] * 1000 962 | and line.start <= sync_non_dialogue_to_video[1] * 1000 963 | and subtitle.format == "ass" 964 | and line.type == "Dialogue" 965 | and ( 966 | "\\pos(" in line.text or "\\move(" in line.text 967 | ) # a bit lazy way to figure out if sign 968 | ): 969 | is_dialogue = False 970 | skip_sync = False 971 | for t1, t2 in subtitle_delete_buckets: 972 | if ( 973 | not only_delta 974 | and line.start > t1 975 | and line.start < t2 976 | or line.end > t1 977 | and line.end < t2 978 | ): 979 | print(f"DELETING LINE {line}") 980 | to_delete_lines.add(i) 981 | skip_sync = True 982 | break 983 | if skip_sync: 984 | continue 985 | for ( 986 | t1, 987 | t2, 988 | delta, 989 | ) in subtitle_sync_buckets: # TODO: sync to new time and move to at least 0.00 990 | current_line_length = line.end - line.start 991 | if current_line_length < 1000: 992 | min_line_length = current_line_length 993 | elif current_line_length < 5000: 994 | min_line_length = int(current_line_length * 0.75) 995 | else: 996 | min_line_length = 5000 997 | if (line.start >= t1 and line.start <= t2) or ( 998 | (max(t1 + delta, 0) - delta) >= line.start 999 | and (max(t2 + delta, 0) - delta) <= line.start 1000 | ): # TODO: what does the extra part fix 1001 | if not only_delta and line.end >= t1 and line.end > t2: 1002 | print( 1003 | f"WARNING, we are floating outside bounds with end: {line.start} {line.end} - {line}" 1004 | ) 1005 | # print(f"Matching with t1={humanize_seconds(t1/1000)} t2={humanize_seconds(t2/1000)} {delta=} {line=} start") 1006 | if not is_dialogue: 1007 | if line.start not in video_align_cache: 1008 | video_align_cache[line.start] = frame_align_video( 1009 | video_file, output_video_file, line.start, delta 1010 | ) 1011 | 
delta = video_align_cache[line.start] 1012 | line.start += delta 1013 | if only_delta: 1014 | line.end += delta 1015 | else: 1016 | line.end = min( 1017 | line.end + delta, t2 1018 | ) # TODO: allow floating outside if it doesn't hit anything 1019 | if not only_delta and line.end - line.start < min_line_length: 1020 | print(f"WARNING, line is too short {line} - {min_line_length=}") 1021 | break 1022 | elif (line.end >= t1 and line.end <= t2) or ( 1023 | (max(t1 + delta, 0) - delta) >= line.end 1024 | and (max(t2 + delta, 0) - delta) <= line.end 1025 | ): # TODO: what does the extra part fix 1026 | if not only_delta and line.start >= t1 and line.start > t2: 1027 | print( 1028 | f"WARNING, we are floating outside bounds with start: {line.start} {line.end} - {line}" 1029 | ) 1030 | # print(f"Matching with t1={humanize_seconds(t1/1000)} t2={humanize_seconds(t2/1000)} {delta=} {line=} end") 1031 | if not is_dialogue: 1032 | if line.start not in video_align_cache: 1033 | video_align_cache[line.start] = frame_align_video( 1034 | video_file, output_video_file, line.start, delta 1035 | ) 1036 | delta = video_align_cache[line.start] 1037 | if only_delta: 1038 | line.start += delta 1039 | else: 1040 | line.start = min( 1041 | line.start + delta, t1 1042 | ) # TODO: allow floating outside if it doesn't hit anything 1043 | line.end += delta 1044 | if not only_delta and line.end - line.start < min_line_length: 1045 | print(f"WARNING, line is too short {line} - {min_line_length=}") 1046 | break 1047 | else: 1048 | print(f"Unable to find place for {line}") 1049 | to_delete_lines.add(i) 1050 | 1051 | for i, line in enumerate(subtitle): 1052 | if i in to_delete_lines: 1053 | continue 1054 | if subtitle_cutoff is not None and line.start > int(subtitle_cutoff * 1000): 1055 | print(f"Removing line {line} because it is after {subtitle_cutoff=}") 1056 | to_delete_lines.add(i) 1057 | elif line.start > video_duration: 1058 | print(f"Removing line {line} because it starts after end") 1059 | to_delete_lines.add(i) 1060 | elif line.end < 0: 1061 | print(f"Removing line {line} because it ends after the video") 1062 | to_delete_lines.add(i) 1063 | elif line.end > video_duration: 1064 | print(f"Moving line {line} end to end of video because it ended after") 1065 | line.end = video_duration 1066 | elif line.start < 0: 1067 | print( 1068 | f"Moving line {line} start to beginning of video because it started before" 1069 | ) 1070 | line.start = 0 1071 | 1072 | for i in sorted(to_delete_lines, reverse=True): 1073 | print(f"Deleting line {i}") 1074 | del subtitle[i] 1075 | 1076 | if subtitle_min_font_size is not None: 1077 | for style_name, style in subtitle.styles.items(): 1078 | if style.fontsize < subtitle_min_font_size: 1079 | print(f"Setting font size from {style.fontsize} for {style_name}") 1080 | style.fontsize = subtitle_min_font_size 1081 | 1082 | output_file = output_file.with_suffix("." 
+ subtitle_format) 1083 | subtitle.save(output_file) 1084 | return output_file 1085 | 1086 | 1087 | def extract_and_sync_audio( 1088 | video_file, 1089 | track_id, 1090 | output_video_duration, 1091 | audio_shift_points, 1092 | sync_buckets, 1093 | delete_buckets, 1094 | audio_output_file, 1095 | ): 1096 | audio_stream = video_file.audio_streams[track_id] 1097 | audio_file_list = [] 1098 | segment_id = 1 1099 | already_added_delta = 0.0 1100 | for t1, t2, delta in sync_buckets: 1101 | expected_delta = delta - already_added_delta 1102 | # delay_cmd = [] 1103 | 1104 | if expected_delta > 0: 1105 | print( 1106 | f"Adding {expected_delta=} with absolute {delta=} silence audio from {t1=}" 1107 | ) 1108 | 1109 | silence_segment_output_file = audio_output_file.with_suffix( 1110 | f".s.{segment_id}.mkv" 1111 | ) 1112 | audio_file_list.append(silence_segment_output_file) 1113 | cmd = [ 1114 | "ffmpeg", 1115 | "-y", 1116 | "-f", 1117 | "lavfi", 1118 | "-i", 1119 | f"anullsrc=channel_layout={audio_stream['channel_layout']}:sample_rate={audio_stream['sample_rate']}", 1120 | "-t", 1121 | str(expected_delta), 1122 | "-c:a", 1123 | audio_stream["codec_name"], 1124 | str(silence_segment_output_file), 1125 | ] 1126 | subprocess.check_call( 1127 | cmd, 1128 | stdin=subprocess.PIPE, 1129 | stdout=subprocess.DEVNULL, 1130 | stderr=subprocess.DEVNULL, 1131 | ) 1132 | elif expected_delta < 0: 1133 | print(f"Removing {expected_delta=} with absolute {delta=} audio from {t1=}") 1134 | pass # we should cut the beginning of this track and cut from t2 to tt2 1135 | 1136 | print(f"Copying audio segment {t1=} {t2=} with cut {expected_delta=}") 1137 | segment_output_file = audio_output_file.with_suffix(f".{segment_id}.mkv") 1138 | audio_file_list.append(segment_output_file) 1139 | 1140 | cmd = [ 1141 | "ffmpeg", 1142 | "-y", 1143 | "-i", 1144 | video_file.filepath, 1145 | "-map", 1146 | f"a:{track_id}", 1147 | "-c", 1148 | "copy", 1149 | # ] + delay_cmd + [ 1150 | "-ss", 1151 | str(t1), 1152 | "-t", 1153 | str(t2 - t1), 1154 | str(segment_output_file), 1155 | ] 1156 | subprocess.check_call( 1157 | cmd, 1158 | stdin=subprocess.PIPE, 1159 | stdout=subprocess.DEVNULL, 1160 | stderr=subprocess.DEVNULL, 1161 | ) 1162 | 1163 | already_added_delta += expected_delta # TODO: measure actual length of file instead of assuming it is 100% correct. 
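        # Bookkeeping note: each bucket's delta is an absolute offset, so only
        # the difference against already_added_delta is inserted as new
        # silence here; the running total then records how much padding the
        # concatenated output already contains before the next bucket. The
        # per-bucket files (silence plus stream-copied source spans) are
        # joined further down with ffmpeg's concat demuxer, so small drift is
        # possible because "-c copy" cuts on packet boundaries rather than at
        # exact timestamps (hence the TODO above).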
1164 | segment_id += 1 1165 | 1166 | input_file = audio_output_file.with_suffix(".txt") 1167 | input_file.write_text("\n".join(f"file '{p.name}'" for p in audio_file_list)) 1168 | actual_audio_output_file = audio_output_file.with_suffix(f".mkv") 1169 | print(f"Combining audio segments for {str(actual_audio_output_file)}") 1170 | cmd = [ 1171 | "ffmpeg", 1172 | "-y", 1173 | "-f", 1174 | "concat", 1175 | "-safe", 1176 | "0", 1177 | "-i", 1178 | str(input_file), 1179 | "-c", 1180 | "copy", 1181 | str(actual_audio_output_file), 1182 | ] 1183 | subprocess.check_call( 1184 | cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL 1185 | ) 1186 | 1187 | return actual_audio_output_file 1188 | 1189 | 1190 | def extract_and_sync_chapters(video_file, video_duration, audio_shift_points): 1191 | chapters = [] 1192 | for chapter in video_file.chapters: 1193 | start_time = float(chapter["start_time"]) 1194 | title = chapter["tags"]["title"] 1195 | for t1, t2, delta in audio_shift_points: 1196 | if t2 < 5.0: 1197 | t2 = 0 1198 | if start_time >= t2: 1199 | new_start_time = max(start_time + delta, 0.0) 1200 | if new_start_time - 5.0 > video_duration: 1201 | print( 1202 | f"Skipping chapter {start_time=} {title=} because it floats after end" 1203 | ) 1204 | break 1205 | new_start_time = min(new_start_time, video_duration) 1206 | if new_start_time < 3.0: 1207 | new_start_time = 0 1208 | chapters.append((new_start_time, title)) 1209 | break 1210 | else: 1211 | print(f"Unable to find place for {start_time=} {title=}") 1212 | return chapters 1213 | 1214 | 1215 | @click.command() 1216 | @click.argument("file", type=click.Path(exists=True), nargs=-1, required=True) 1217 | @click.option( 1218 | "--only-generate-chroma", is_flag=True, help="Quit after chroma is generated" 1219 | ) 1220 | @click.option( 1221 | "--sync-using-subtitle-audio", 1222 | is_flag=True, 1223 | help="Extract audio from source where subtitles are and align with target. Good when video is partial or re-arranged. Bad for audio syncs.", 1224 | ) 1225 | @click.option("--skip-subtitles", is_flag=True, help="Do not align subtitles.") 1226 | @click.option("--skip-shift-point", type=str, help="List of sync points to skip") 1227 | @click.option( 1228 | "--subtitle-cutoff", 1229 | type=float, 1230 | help="Subtitle cutoff where everything after is removed", 1231 | ) 1232 | @click.option( 1233 | "--only-delta", 1234 | is_flag=True, 1235 | help="Only do delta shifts, not group alignment (warning, subtitles might overlap?)", 1236 | ) 1237 | @click.option( 1238 | "--align-framerate", 1239 | is_flag=True, 1240 | help="Align source framerate to target video framerate, when speedup/slowdown used as technique to change framerate.", 1241 | ) 1242 | @click.option( 1243 | "--align-frames-too", 1244 | is_flag=True, 1245 | help="Align using frames when the delta is discovered.", 1246 | ) 1247 | @click.option( 1248 | "--preserve-silence", 1249 | is_flag=True, 1250 | help="Preserve silence at the end of the video instead of trimming it", 1251 | ) 1252 | @click.option( 1253 | "--temp-folder", 1254 | type=click.Path(), 1255 | default="milk-temp", 1256 | help="Temp folder to store various files in.", 1257 | ) 1258 | @click.option( 1259 | "--audio-tracks", type=TRACK_MAPPING, help="Specify audio tracks to compare with." 
1260 | )
1261 | @click.option(
1262 |     "--adjust-shift-point",
1263 |     type=str,
1264 |     help="Manually adjust an audio shift point.",
1265 |     multiple=True,
1266 | )
1267 | @click.option(
1268 |     "--adjust-delay", type=float, help="Manually adjust delay."
1269 | )  # TODO: define audio track to do it to
1270 | @click.option(
1271 |     "--sync-non-dialogue-to-video",
1272 |     type=INTEGER_RANGE,
1273 |     help="Sync non-dialogue using frames instead of audio, good for e.g. remastered releases where the audio might be re-aligned",
1274 | )
1275 | @click.option(
1276 |     "--chapter-source",
1277 |     type=int,
1278 |     help="Input file index where to extract chapters from.",
1279 | )
1280 | @click.option(
1281 |     "--chapter-beginning",
1282 |     type=str,
1283 |     help="Name of chapter from the beginning of the file",
1284 | )
1285 | @click.option(
1286 |     "--chapter-segment-file",
1287 |     type=click.Path(exists=True),
1288 |     help="Files to try to automatically generate chapters from.",
1289 |     multiple=True,
1290 | )
1291 | @click.option(
1292 |     "--chapter-segment-name-start",
1293 |     type=str,
1294 |     help="Name start of chapter given in --chapter-segment-file, handled in same order",
1295 |     multiple=True,
1296 | )
1297 | @click.option(
1298 |     "--chapter-segment-name-end",
1299 |     type=str,
1300 |     help="Name end of chapter given in --chapter-segment-file, handled in same order",
1301 |     multiple=True,
1302 | )
1303 | @click.option(
1304 |     "--chapter-segment-required",
1305 |     is_flag=True,
1306 |     help="Error out if not all chapters are found",
1307 | )
1308 | @click.option(
1309 |     "--metadata-audio-track",
1310 |     type=METADATA_MAPPING,
1311 |     help="Set metadata for an audio track, syntax track_id=key=value",
1312 |     multiple=True,
1313 | )
1314 | @click.option(
1315 |     "--metadata-subtitle-track",
1316 |     type=METADATA_MAPPING,
1317 |     help="Set metadata for a subtitle track, syntax track_id=key=value",
1318 |     multiple=True,
1319 | )
1320 | @click.option(
1321 |     "--subtitle-min-font-size",
1322 |     type=int,
1323 |     help="Set the minimum font size",
1324 | )
1325 | @click.option(
1326 |     "--input-external-subtitle-track",
1327 |     type=click.Path(exists=True),
1328 |     help="External subtitle track, presumed to be part of first input file",
1329 | )
1330 | @click.option(
1331 |     "--output-video-file-index", type=int, help="Which file to pull video track from"
1332 | )
1333 | @click.option(
1334 |     "--output-audio-mapping",
1335 |     type=TRACK_MAPPING,
1336 |     help="Which audio tracks to include in output",
1337 | )
1338 | @click.option(
1339 |     "--output-subtitle-mapping",
1340 |     type=TRACK_MAPPING,
1341 |     help="Which subtitle tracks to include in output",
1342 | )
1343 | @click.option("--output", type=click.Path(exists=False), help="Output file.")
1344 | @click.option(
1345 |     "--output-subtitle", type=click.Path(exists=False), help="Output subtitle file."
1346 | ) 1347 | def main( 1348 | file, 1349 | only_generate_chroma, 1350 | sync_using_subtitle_audio, 1351 | skip_subtitles, 1352 | skip_shift_point, 1353 | subtitle_cutoff, 1354 | only_delta, 1355 | align_framerate, 1356 | align_frames_too, 1357 | preserve_silence, 1358 | temp_folder, 1359 | audio_tracks, 1360 | adjust_shift_point, 1361 | adjust_delay, 1362 | sync_non_dialogue_to_video, 1363 | chapter_source, 1364 | chapter_beginning, 1365 | chapter_segment_file, 1366 | chapter_segment_name_start, 1367 | chapter_segment_name_end, 1368 | chapter_segment_required, 1369 | metadata_audio_track, 1370 | metadata_subtitle_track, 1371 | subtitle_min_font_size, 1372 | input_external_subtitle_track, 1373 | output_video_file_index, 1374 | output_audio_mapping, 1375 | output_subtitle_mapping, 1376 | output, 1377 | output_subtitle, 1378 | ): 1379 | # logging.basicConfig(level=logging.DEBUG) 1380 | global HOP_LENGTH 1381 | 1382 | if output_video_file_index is None: 1383 | output_video_file_index = len(file) - 1 1384 | 1385 | if output_audio_mapping is None: 1386 | output_audio_mapping = [[output_video_file_index, 0]] 1387 | 1388 | if output_subtitle_mapping is None: 1389 | output_subtitle_mapping = [[0, 0]] 1390 | 1391 | if skip_shift_point: 1392 | skip_shift_point = sorted( 1393 | [int(i) for i in skip_shift_point.split(",")], reverse=True 1394 | ) 1395 | else: 1396 | skip_shift_point = [] 1397 | 1398 | if metadata_audio_track is None: 1399 | metadata_audio_track = [] 1400 | 1401 | if metadata_subtitle_track is None: 1402 | metadata_subtitle_track = [] 1403 | 1404 | if chapter_source is None: 1405 | chapter_source = output_video_file_index 1406 | 1407 | if adjust_shift_point is None: 1408 | adjust_shift_point = [] 1409 | 1410 | adjust_shift_points = [] 1411 | for point in adjust_shift_point: 1412 | point = point.split(":") 1413 | adjust_shift_points.append( 1414 | (int(point[0]), int(point[1]), int(point[2]), float(point[3])) 1415 | ) 1416 | 1417 | if sync_using_subtitle_audio: 1418 | n_chroma = 36 1419 | HOP_LENGTH = 512 1420 | else: 1421 | n_chroma = 12 1422 | HOP_LENGTH = 1024 1423 | 1424 | mapped_metadata_audio_track = {} 1425 | for track_id, key, value in metadata_audio_track: 1426 | mapped_metadata_audio_track.setdefault(track_id, {})[key] = value 1427 | 1428 | mapped_metadata_subtitle_track = {} 1429 | for track_id, key, value in metadata_subtitle_track: 1430 | mapped_metadata_subtitle_track.setdefault(track_id, {})[key] = value 1431 | 1432 | chapter_segment_name_start = list(chapter_segment_name_start or []) 1433 | chapter_segment_name_start += [""] * len(chapter_segment_file) 1434 | 1435 | chapter_segment_name_end = list(chapter_segment_name_end or []) 1436 | chapter_segment_name_end += [""] * len(chapter_segment_file) 1437 | 1438 | chapter_segment_files = {} 1439 | if chapter_segment_file: 1440 | for i, csf in enumerate(chapter_segment_file): 1441 | chapter_segment_files[Path(csf)] = ( 1442 | chapter_segment_name_start[i], 1443 | chapter_segment_name_end[i], 1444 | ) 1445 | 1446 | temp_folder = Path(temp_folder) 1447 | temp_folder.mkdir(exist_ok=True) 1448 | files = [Path(f) for f in file] 1449 | video_files = [Video(f) for f in file] 1450 | output_video_file = video_files[output_video_file_index] 1451 | output_video_duration = float(output_video_file.video_info["duration"]) 1452 | 1453 | if subtitle_cutoff and subtitle_cutoff < 0: 1454 | subtitle_cutoff += output_video_duration 1455 | print(f"Setting cutoff from negative to {subtitle_cutoff}") 1456 | 1457 | sync_audio_track_mapping = [0] * 
len(files) 1458 | if audio_tracks is not None: 1459 | for file_id, audio_track_id in audio_tracks: 1460 | sync_audio_track_mapping[file_id] = audio_track_id 1461 | 1462 | chromas = {} 1463 | 1464 | framerate_aligns = {} 1465 | if align_framerate: 1466 | target_framerate = output_video_file.video_info["fps"] 1467 | for i, (f, video_file) in enumerate(zip(files, video_files)): 1468 | if i == output_video_file_index: 1469 | continue 1470 | video_framerate = video_file.video_info["fps"] 1471 | if target_framerate != video_framerate: 1472 | framerate_aligns[i] = (video_framerate, target_framerate) 1473 | 1474 | with ThreadPoolExecutor(max_workers=8) as executor: 1475 | jobqueue = {} 1476 | for i, (f, audio_track_id) in enumerate(zip(files, sync_audio_track_mapping)): 1477 | framerate_align = framerate_aligns.get(i) 1478 | framerate_align_filename = ( 1479 | framerate_align 1480 | and f".{str(framerate_align[0]).replace('.', '_')}-{str(framerate_align[1]).replace('.', '_')}" 1481 | or "" 1482 | ) 1483 | preserve_silence_filename = preserve_silence and ".ps" or "" 1484 | 1485 | audio_chroma_output_file = temp_folder / ( 1486 | f.stem 1487 | + f"{framerate_align_filename}{preserve_silence_filename}.{HOP_LENGTH}.{n_chroma}.{audio_track_id}.chroma" 1488 | ) 1489 | if audio_chroma_output_file.exists(): 1490 | print(f"Loading chroma from {audio_chroma_output_file}") 1491 | chromas[f] = pickle.loads(audio_chroma_output_file.read_bytes()) 1492 | else: 1493 | print(f"Extracting chroma from {f.name}") 1494 | future = executor.submit( 1495 | generate_chroma_cqt, 1496 | f, 1497 | sync_audio_track_mapping[i], 1498 | audio_chroma_output_file, 1499 | n_chroma=n_chroma, 1500 | framerate_align=framerate_align, 1501 | preserve_silence=preserve_silence, 1502 | ) 1503 | jobqueue[future] = f 1504 | 1505 | for f in chapter_segment_files: 1506 | preserve_silence_filename = preserve_silence and ".ps" or "" 1507 | audio_chroma_output_file = temp_folder / ( 1508 | f.stem + f"{preserve_silence_filename}.{HOP_LENGTH}.{n_chroma}.c.chroma" 1509 | ) 1510 | if audio_chroma_output_file.exists(): 1511 | print(f"Loading chroma from {audio_chroma_output_file}") 1512 | chromas[f] = pickle.loads(audio_chroma_output_file.read_bytes()) 1513 | else: 1514 | print(f"Extracting chroma from {f.name}") 1515 | future = executor.submit( 1516 | generate_chroma_cqt, 1517 | f, 1518 | 0, 1519 | audio_chroma_output_file, 1520 | n_chroma=n_chroma, 1521 | preserve_silence=preserve_silence, 1522 | ) 1523 | jobqueue[future] = f 1524 | 1525 | for future in concurrent.futures.as_completed(jobqueue): 1526 | chromas[jobqueue[future]] = future.result() 1527 | 1528 | if only_generate_chroma: 1529 | print("Done generating chroma") 1530 | quit(0) 1531 | 1532 | x_2_chroma, fs = chromas[ 1533 | files[output_video_file_index] 1534 | ] # x_2_chroma is always target video, the one we align everything with 1535 | 1536 | chapter_timestamps = [] 1537 | for f, (start_name, end_name) in chapter_segment_files.items(): 1538 | print(f"Looking for chapter matching {f}") 1539 | x_1_chroma, fs = chromas[f] 1540 | chapter_specifications = find_and_align_chapter( 1541 | x_1_chroma, x_2_chroma, fs, min_match_value=60 + (n_chroma * 3) 1542 | ) 1543 | if chapter_specifications is None: 1544 | if chapter_segment_required: 1545 | print("Did not find required chapters") 1546 | quit(1) 1547 | else: 1548 | continue 1549 | 1550 | chapter_timestamps.append((chapter_specifications[0], start_name)) 1551 | chapter_timestamps.append((chapter_specifications[1], end_name)) 1552 | 1553 | if 
chapter_timestamps: 1554 | min_chapter_delay = 4.0 1555 | previous_chapter_time = None 1556 | 1557 | new_chapter_timestamps = [] 1558 | for chapter_timestamp, name in sorted(chapter_timestamps): 1559 | if chapter_timestamp < 3.0: 1560 | chapter_timestamp = 0.0 1561 | if ( 1562 | previous_chapter_time is not None 1563 | and chapter_timestamp - previous_chapter_time < min_chapter_delay 1564 | ): 1565 | print( 1566 | f"Skipping chapter '{name}' because it is too close to previous chapter" 1567 | ) 1568 | continue 1569 | 1570 | if output_video_duration - chapter_timestamp < min_chapter_delay: 1571 | print( 1572 | f"Skipping chapter '{name}' because it is too close to end of video" 1573 | ) 1574 | continue 1575 | 1576 | previous_chapter_time = chapter_timestamp 1577 | new_chapter_timestamps.append((chapter_timestamp, name)) 1578 | 1579 | chapter_timestamps = new_chapter_timestamps 1580 | 1581 | if chapter_beginning and chapter_timestamps[0][0] > 0.0: 1582 | print("Injecting chapter at beginning") 1583 | chapter_timestamps.insert(0, (0.0, chapter_beginning)) 1584 | 1585 | print("Found chapters:") 1586 | for chapter_timestamp, name in chapter_timestamps: 1587 | print(f" {humanize_seconds(chapter_timestamp)} - {name}") 1588 | 1589 | attachment_files = set() 1590 | subtitle_files = [] 1591 | audio_files = [] 1592 | 1593 | for i, (f, video_file) in enumerate(zip(files, video_files)): 1594 | if i == output_video_file_index: 1595 | continue 1596 | 1597 | x_1_chroma, fs = chromas[f] 1598 | if sync_using_subtitle_audio: 1599 | for j, (file_index, track_id) in enumerate(output_subtitle_mapping): 1600 | if file_index != i: 1601 | continue 1602 | break 1603 | else: 1604 | print( 1605 | "No subtitle track found to sync with, please specify a mapping to use this feature" 1606 | ) 1607 | quit(1) 1608 | 1609 | ( 1610 | audio_shift_points, 1611 | sync_buckets, 1612 | delete_buckets, 1613 | ) = estimate_audio_shift_points_from_subtitles( 1614 | x_1_chroma, 1615 | x_2_chroma, 1616 | fs, 1617 | video_file, 1618 | j, 1619 | n_chroma, 1620 | adjust_delay=adjust_delay, 1621 | framerate_align=framerate_aligns.get(i), 1622 | external_subtitle_file=input_external_subtitle_track, 1623 | ) 1624 | else: 1625 | audio_shift_points = estimate_audio_shift_points( 1626 | x_1_chroma, 1627 | x_2_chroma, 1628 | fs, 1629 | only_delta=only_delta, 1630 | adjust_delay=adjust_delay, 1631 | sliding_window_size=300, 1632 | ) 1633 | 1634 | for skip_point in skip_shift_point: 1635 | if len(audio_shift_points) - 1 >= skip_point: 1636 | print(f"Skipping shift point {audio_shift_points[skip_point]}") 1637 | del audio_shift_points[skip_point] 1638 | 1639 | for point in adjust_shift_points: 1640 | if point[0] != i: 1641 | continue 1642 | print(f"Changing shift point {point=} {audio_shift_points[point[1]]}") 1643 | p = list(audio_shift_points[point[1]]) 1644 | p[point[2]] = point[3] 1645 | audio_shift_points[point[1]] = tuple(p) 1646 | print(audio_shift_points) 1647 | 1648 | sync_buckets, delete_buckets = turn_audio_shift_points_to_audio_segments( 1649 | audio_shift_points 1650 | ) 1651 | print( 1652 | f"Found audio shift points {audio_shift_points=} {sync_buckets=} {delete_buckets=}" 1653 | ) 1654 | 1655 | if align_frames_too: 1656 | print("Frame aligning buckets") 1657 | sync_buckets = [ 1658 | frame_align_sync_bucket(video_file, output_video_file, sync_bucket) 1659 | for sync_bucket in sync_buckets 1660 | ] 1661 | 1662 | if not skip_subtitles: 1663 | for j, (file_index, track_id) in enumerate(output_subtitle_mapping): 1664 | if file_index != 
i: 1665 | continue 1666 | 1667 | subtitle_output_file = temp_folder / (f.stem + f".{track_id}.unknown") 1668 | subtitle_output_file = extract_and_sync_subtitles( 1669 | video_file, 1670 | track_id, 1671 | output_video_duration, 1672 | only_delta, 1673 | audio_shift_points, 1674 | sync_buckets, 1675 | delete_buckets, 1676 | subtitle_output_file, 1677 | subtitle_cutoff, 1678 | sync_non_dialogue_to_video, 1679 | output_video_file, 1680 | framerate_align=framerate_aligns.get(i), 1681 | external_subtitle_file=input_external_subtitle_track, 1682 | subtitle_min_font_size=subtitle_min_font_size, 1683 | ) 1684 | 1685 | if input_external_subtitle_track: 1686 | subtitle_files.append((j, subtitle_output_file, [])) 1687 | elif subtitle_output_file: 1688 | subtitle_metadata = video_file.extract_subtitle_metadata(track_id) 1689 | subtitle_files.append((j, subtitle_output_file, subtitle_metadata)) 1690 | attachment_files.add(i) 1691 | 1692 | for j, (file_index, track_id) in enumerate(output_audio_mapping): 1693 | if file_index != i: 1694 | continue 1695 | 1696 | audio_output_file = temp_folder / (f.stem + f".{track_id}.unknown") 1697 | audio_output_file = extract_and_sync_audio( 1698 | video_file, 1699 | track_id, 1700 | output_video_duration, 1701 | audio_shift_points, 1702 | sync_buckets, 1703 | delete_buckets, 1704 | audio_output_file, 1705 | ) 1706 | 1707 | if audio_output_file: 1708 | audio_metadata = video_file.extract_audio_metadata(track_id) 1709 | audio_files.append((j, audio_output_file, audio_metadata)) 1710 | 1711 | if not chapter_timestamps and i == chapter_source: 1712 | chapter_timestamps = extract_and_sync_chapters( 1713 | video_file, output_video_duration, audio_shift_points 1714 | ) 1715 | 1716 | if output_subtitle: 1717 | if subtitle_files: 1718 | subtitle_file = subtitle_files[0][1] 1719 | subtitle_output_file = Path(output_subtitle).with_suffix( 1720 | subtitle_file.suffix 1721 | ) 1722 | print(f"Wrote the first subtitle track to {subtitle_output_file}") 1723 | shutil.copy(subtitle_file, subtitle_output_file) 1724 | else: 1725 | print("No subtitle file to output, skipping") 1726 | 1727 | if not output: 1728 | print("No output defined, quitting here") 1729 | quit(1) 1730 | 1731 | print("Combining everything") 1732 | temp_output_file = temp_folder / (f.stem + f".temp.mkv") 1733 | 1734 | ffmpeg_inputs = [] 1735 | ffmpeg_options = [] 1736 | subtitle_track_count = 0 1737 | audio_track_count = 0 1738 | 1739 | for f in files: 1740 | ffmpeg_inputs.append(str(f)) 1741 | 1742 | for i, (f, video_file) in enumerate(zip(files, video_files)): 1743 | if i != output_video_file_index: 1744 | continue 1745 | 1746 | if not skip_subtitles: 1747 | for ( 1748 | file_index, 1749 | track_id, 1750 | ) in output_subtitle_mapping: # TODO, inject at correct point 1751 | if file_index != i: 1752 | continue 1753 | ffmpeg_options += ["-map", f"{i}:s:{track_id}", "-c", "copy"] 1754 | subtitle_track_count += 1 1755 | attachment_files.add(i) 1756 | 1757 | for ( 1758 | file_index, 1759 | track_id, 1760 | ) in output_audio_mapping: # TODO, inject at correct point 1761 | if file_index != i: 1762 | continue 1763 | ffmpeg_options += ["-map", f"{i}:a:{track_id}", "-c", "copy"] 1764 | 1765 | for tag_key, tag_value in mapped_metadata_audio_track.get(0, {}).items(): 1766 | ffmpeg_options += [f"-metadata:s:a:{0}", f"{tag_key}={tag_value}"] 1767 | audio_track_count += 1 1768 | 1769 | for i in sorted(attachment_files): 1770 | ffmpeg_options += [ 1771 | "-map", 1772 | f"{i}:d?", 1773 | "-c", 1774 | "copy", 1775 | "-map", 1776 | 
f"{i}:t?", 1777 | "-c", 1778 | "copy", 1779 | ] 1780 | 1781 | ffmpeg_options += ["-map", f"{output_video_file_index}:v", "-c", "copy"] 1782 | if not chapter_timestamps: 1783 | ffmpeg_options += ["-map_chapters", str(chapter_source)] 1784 | else: 1785 | ffmpeg_options += ["-map_chapters", "-1"] 1786 | 1787 | for j, subtitle_file, subtitle_metadata in sorted(subtitle_files): 1788 | input_index = len(ffmpeg_inputs) 1789 | ffmpeg_options += ["-map", f"{input_index}:s", "-c", "copy"] 1790 | seen_tag_keys = set() 1791 | for tag_key, tag_value in subtitle_metadata: 1792 | seen_tag_keys.add(tag_key) 1793 | tag_value = mapped_metadata_subtitle_track.get(j, {}).get( 1794 | tag_key, tag_value 1795 | ) 1796 | ffmpeg_options += [ 1797 | f"-metadata:s:s:{subtitle_track_count}", 1798 | f"{tag_key}={tag_value}", 1799 | ] 1800 | for tag_key, tag_value in mapped_metadata_subtitle_track.get(j, {}).items(): 1801 | if tag_key in seen_tag_keys: 1802 | continue 1803 | ffmpeg_options += [ 1804 | f"-metadata:s:s:{subtitle_track_count}", 1805 | f"{tag_key}={tag_value}", 1806 | ] 1807 | ffmpeg_inputs.append(str(subtitle_file)) 1808 | subtitle_track_count += 1 1809 | 1810 | for j, audio_file, audio_metadata in sorted(audio_files): 1811 | input_index = len(ffmpeg_inputs) 1812 | ffmpeg_options += ["-map", f"{input_index}:a", "-c", "copy"] 1813 | seen_tag_keys = set() 1814 | for tag_key, tag_value in audio_metadata: 1815 | seen_tag_keys.add(tag_key) 1816 | tag_value = mapped_metadata_audio_track.get(j, {}).get(tag_key, tag_value) 1817 | ffmpeg_options += [ 1818 | f"-metadata:s:a:{audio_track_count}", 1819 | f"{tag_key}={tag_value}", 1820 | ] 1821 | for tag_key, tag_value in mapped_metadata_audio_track.get(j, {}).items(): 1822 | if tag_key in seen_tag_keys: 1823 | continue 1824 | ffmpeg_options += [ 1825 | f"-metadata:s:a:{audio_track_count}", 1826 | f"{tag_key}={tag_value}", 1827 | ] 1828 | # for tag_key, tag_value in audio_metadata: 1829 | # tag_value = mapped_metadata_audio_track.get(j, {}).get(tag_key, tag_value) 1830 | # ffmpeg_options += [f"-metadata:s:a:{audio_track_count}", f"{tag_key}={tag_value}"] 1831 | ffmpeg_inputs.append(str(audio_file)) 1832 | audio_track_count += 1 1833 | 1834 | if output_subtitle_mapping: 1835 | ffmpeg_options += ["-disposition:s:0", "default"] 1836 | 1837 | cmd = ["ffmpeg", "-y"] 1838 | for ffmpeg_input in ffmpeg_inputs: 1839 | cmd += ["-i", ffmpeg_input] 1840 | cmd += ffmpeg_options 1841 | cmd += [str(temp_output_file)] 1842 | print(f"Running: {shlex.join(cmd)}") 1843 | subprocess.check_call( 1844 | cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL 1845 | ) 1846 | 1847 | cmd = ["mkvmerge", "--no-global-tags", "-o", str(output)] 1848 | 1849 | if chapter_timestamps: 1850 | chapter_file = temp_folder / (f.stem + f".chapters.txt") 1851 | chapter_file.write_text( 1852 | "\n".join( 1853 | f"CHAPTER{i:02}={humanize_seconds(chapter_timestamp)}\nCHAPTER{i:02}NAME={name}" 1854 | for (i, (chapter_timestamp, name)) in enumerate(chapter_timestamps) 1855 | ) 1856 | ) 1857 | cmd += ["--chapters", str(chapter_file)] 1858 | 1859 | cmd += [str(temp_output_file)] 1860 | print(f"Creating final MKV file: {output}") 1861 | subprocess.check_call( 1862 | cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL 1863 | ) 1864 | os.unlink(temp_output_file) 1865 | 1866 | 1867 | if __name__ == "__main__": 1868 | warnings.filterwarnings("ignore") 1869 | main() 1870 | --------------------------------------------------------------------------------