├── .dockerignore
├── tess-data
├── configs
│ ├── pdf
│ ├── quiet
│ ├── tsv
│ ├── alto
│ ├── api_config
│ ├── get.images
│ ├── logfile
│ ├── lstmbox
│ ├── makebox
│ ├── wordstrbox
│ ├── digits
│ ├── hocr
│ ├── unlv
│ ├── inter
│ ├── rebox
│ ├── linebox
│ ├── kannada
│ ├── lstmdebug
│ ├── bazaar
│ ├── bigram
│ ├── txt
│ ├── ambigs.train
│ ├── lstm.train
│ ├── box.train
│ ├── box.train.stderr
│ ├── Makefile.am
│ ├── strokewidth
│ └── testspace
├── README.md
└── eng.traineddata
├── .isort.cfg
├── LICENSE
├── Dockerfile
├── .gitignore
├── cartonizer.py
├── README.md
├── cowocr.py
└── milksync.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | .env
3 |
--------------------------------------------------------------------------------
/tess-data/configs/pdf:
--------------------------------------------------------------------------------
1 | tessedit_create_pdf 1
2 |
--------------------------------------------------------------------------------
/tess-data/configs/quiet:
--------------------------------------------------------------------------------
1 | debug_file /dev/null
2 |
--------------------------------------------------------------------------------
/tess-data/configs/tsv:
--------------------------------------------------------------------------------
1 | tessedit_create_tsv 1
2 |
--------------------------------------------------------------------------------
/tess-data/configs/alto:
--------------------------------------------------------------------------------
1 | tessedit_create_alto 1
2 |
--------------------------------------------------------------------------------
/tess-data/configs/api_config:
--------------------------------------------------------------------------------
1 | tessedit_zero_rejection T
2 |
--------------------------------------------------------------------------------
/tess-data/configs/get.images:
--------------------------------------------------------------------------------
1 | tessedit_write_images T
2 |
--------------------------------------------------------------------------------
/tess-data/configs/logfile:
--------------------------------------------------------------------------------
1 | debug_file tesseract.log
2 |
--------------------------------------------------------------------------------
/tess-data/configs/lstmbox:
--------------------------------------------------------------------------------
1 | tessedit_create_lstmbox 1
2 |
--------------------------------------------------------------------------------
/tess-data/configs/makebox:
--------------------------------------------------------------------------------
1 | tessedit_create_boxfile 1
2 |
--------------------------------------------------------------------------------
/tess-data/configs/wordstrbox:
--------------------------------------------------------------------------------
1 | tessedit_create_wordstrbox 1
2 |
--------------------------------------------------------------------------------
/tess-data/configs/digits:
--------------------------------------------------------------------------------
1 | tessedit_char_whitelist 0123456789-.
2 |
--------------------------------------------------------------------------------
/tess-data/configs/hocr:
--------------------------------------------------------------------------------
1 | tessedit_create_hocr 1
2 | hocr_font_info 0
3 |
--------------------------------------------------------------------------------
/tess-data/configs/unlv:
--------------------------------------------------------------------------------
1 | tessedit_write_unlv 1
2 | unlv_tilde_crunching T
3 |
--------------------------------------------------------------------------------
/tess-data/configs/inter:
--------------------------------------------------------------------------------
1 | interactive_display_mode T
2 | tessedit_display_outwords T
3 |
--------------------------------------------------------------------------------
/tess-data/configs/rebox:
--------------------------------------------------------------------------------
1 | tessedit_resegment_from_boxes 1
2 | tessedit_make_boxes_from_boxes 1
3 |
--------------------------------------------------------------------------------
/tess-data/README.md:
--------------------------------------------------------------------------------
1 | These files are copied directly from the Tesseract project and remain under its license.
--------------------------------------------------------------------------------
/tess-data/configs/linebox:
--------------------------------------------------------------------------------
1 | tessedit_resegment_from_line_boxes 1
2 | tessedit_make_boxes_from_boxes 1
3 |
--------------------------------------------------------------------------------
/tess-data/eng.traineddata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JohnDoee/the-cute-collection/HEAD/tess-data/eng.traineddata
--------------------------------------------------------------------------------
/tess-data/configs/kannada:
--------------------------------------------------------------------------------
1 | textord_skewsmooth_offset 8
2 | textord_skewsmooth_offset2 8
3 | textord_merge_desc 0.5
4 | textord_no_rejects 1
5 |
--------------------------------------------------------------------------------
/tess-data/configs/lstmdebug:
--------------------------------------------------------------------------------
1 | stopper_debug_level 1
2 | classify_debug_level 1
3 | segsearch_debug_level 1
4 | language_model_debug_level 3
5 |
--------------------------------------------------------------------------------
/.isort.cfg:
--------------------------------------------------------------------------------
1 | [settings]
2 | multi_line_output=3
3 | include_trailing_comma=True
4 | force_grid_wrap=0
5 | use_parentheses=True
6 | line_length=88
7 |
--------------------------------------------------------------------------------
/tess-data/configs/bazaar:
--------------------------------------------------------------------------------
1 | load_system_dawg F
2 | load_freq_dawg F
3 | user_words_suffix user-words
4 | user_patterns_suffix user-patterns
5 |
--------------------------------------------------------------------------------
/tess-data/configs/bigram:
--------------------------------------------------------------------------------
1 | load_bigram_dawg True
2 | tessedit_enable_bigram_correction True
3 | tessedit_bigram_debug 3
4 | save_raw_choices True
5 | save_alt_choices True
6 |
--------------------------------------------------------------------------------
/tess-data/configs/txt:
--------------------------------------------------------------------------------
1 | # This config file should be used with other config files which create renderers.
2 | # usage example: tesseract eurotext.tif eurotext txt hocr pdf
3 | tessedit_create_txt 1
4 |
--------------------------------------------------------------------------------
/tess-data/configs/ambigs.train:
--------------------------------------------------------------------------------
1 | tessedit_ambigs_training 1
2 | load_freq_dawg 0
3 | load_punc_dawg 0
4 | load_system_dawg 0
5 | load_number_dawg 0
6 | ambigs_debug_level 3
7 | load_fixed_length_dawgs 0
8 |
--------------------------------------------------------------------------------
/tess-data/configs/lstm.train:
--------------------------------------------------------------------------------
1 | file_type .bl
2 | textord_fast_pitch_test T
3 | tessedit_zero_rejection T
4 | tessedit_minimal_rejection F
5 | tessedit_write_rep_codes F
6 | edges_children_fix F
7 | edges_childarea 0.65
8 | edges_boxarea 0.9
9 | tessedit_train_line_recognizer T
10 | textord_no_rejects T
11 | tessedit_init_config_only T
12 |
--------------------------------------------------------------------------------
/tess-data/configs/box.train:
--------------------------------------------------------------------------------
1 | disable_character_fragments T
2 | file_type .bl
3 | textord_fast_pitch_test T
4 | tessedit_zero_rejection T
5 | tessedit_minimal_rejection F
6 | tessedit_write_rep_codes F
7 | edges_children_fix F
8 | edges_childarea 0.65
9 | edges_boxarea 0.9
10 | tessedit_resegment_from_boxes T
11 | tessedit_train_from_boxes T
12 | textord_no_rejects T
13 |
--------------------------------------------------------------------------------
/tess-data/configs/box.train.stderr:
--------------------------------------------------------------------------------
1 | file_type .bl
2 | #tessedit_use_nn F
3 | textord_fast_pitch_test T
4 | tessedit_zero_rejection T
5 | tessedit_minimal_rejection F
6 | tessedit_write_rep_codes F
7 | edges_children_fix F
8 | edges_childarea 0.65
9 | edges_boxarea 0.9
10 | tessedit_resegment_from_boxes T
11 | tessedit_train_from_boxes T
12 | #textord_repeat_extraction F
13 | textord_no_rejects T
14 |
--------------------------------------------------------------------------------
/tess-data/configs/Makefile.am:
--------------------------------------------------------------------------------
1 | datadir = @datadir@/tessdata/configs
2 | data_DATA = inter makebox box.train unlv ambigs.train lstm.train lstmdebug
3 | data_DATA += api_config kannada box.train.stderr quiet logfile digits get.images
4 | data_DATA += lstmbox wordstrbox
5 | # Configurations for OCR output.
6 | data_DATA += alto hocr pdf tsv txt
7 | data_DATA += linebox rebox strokewidth bigram
8 | EXTRA_DIST = $(data_DATA)
9 |
--------------------------------------------------------------------------------
/tess-data/configs/strokewidth:
--------------------------------------------------------------------------------
1 | textord_show_blobs 0
2 | textord_debug_tabfind 3
3 | textord_tabfind_show_partitions 1
4 | textord_tabfind_show_initial_partitions 1
5 | textord_tabfind_show_columns 1
6 | textord_tabfind_show_blocks 1
7 | textord_tabfind_show_initialtabs 1
8 | textord_tabfind_show_finaltabs 1
9 | textord_tabfind_show_strokewidths 1
10 | textord_tabfind_show_vlines 0
11 | textord_tabfind_show_images 1
12 | tessedit_dump_pageseg_images 0
13 |
--------------------------------------------------------------------------------
/tess-data/configs/testspace:
--------------------------------------------------------------------------------
1 | chop_ok_split 1000
2 | tosp_rep_space 16
3 | textord_words_default_minspace 0.3
4 | textord_space_size_is_variable 1
5 | gapmap_use_ends 1
6 | textord_linespace_iqrlimit 0.8
7 | textord_overlap_x 1.375
8 | textord_words_width_ile 0.8
9 | textord_words_maxspace 16
10 | textord_words_default_maxspace 1.5
11 | textord_words_default_minspace 0.1
12 | textord_words_min_minspace 0.7
13 | textord_words_default_nonspace 0.1
14 | words_default_prop_nonspace 0.65
15 | words_default_fixed_space 0.95
16 | textord_spacesize_ratiofp 4.8
17 | textord_spacesize_ratioprop 1
18 | debug_fix_space_level 1
19 | tosp_enough_space_samples_for_median 8
20 | tosp_short_row 30
21 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The cute collection can combine subtitles from various files and sync them.
2 | Copyright (C) 2021 Anders Jensen
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU Affero General Public License as
6 | published by the Free Software Foundation, either version 3 of the
7 | License, or (at your option) any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Image bundling cartonizer, cowocr and milksync together with their
# system dependencies (tesseract and ffmpeg).
FROM python:3.8-buster

ENV DEBIAN_FRONTEND=noninteractive

# Install system packages and drop the apt lists in the same layer so they
# do not end up in the image.
RUN apt-get update \
    && apt-get install -y tesseract-ocr ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image layers.
RUN pip install --no-cache-dir -U setuptools pip wheel
# Each package is listed once (jinja2 and lxml used to appear twice).
RUN pip install --no-cache-dir -U ffmpeg-python click guessit opencv-python librosa \
    pysubs2 scikit-image jinja2 lxml tqdm pyxdameraulevenshtein \
    textblob pytesseract annoy

RUN mkdir /code
# COPY is sufficient for plain local files; ADD's archive/URL handling is not needed.
COPY tess-data /code/tess-data
COPY cartonizer.py cowocr.py milksync.py /code/

# Create small wrapper scripts so the tools can be called by name.
# printf interprets \n in every shell, unlike echo whose escape handling varies.
RUN printf '#!/bin/bash\npython3 /code/cartonizer.py "$@"\n' > /usr/bin/cartonizer && \
    printf '#!/bin/bash\npython3 /code/cowocr.py "$@"\n' > /usr/bin/cowocr && \
    printf '#!/bin/bash\npython3 /code/milksync.py "$@"\n' > /usr/bin/milksync && \
    chmod +x /usr/bin/cartonizer /usr/bin/cowocr /usr/bin/milksync

RUN mkdir /workdir
WORKDIR /workdir
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
137 | # Cython debug symbols
138 | cython_debug/
139 |
140 | .env
141 |
142 | milk-temp
143 | cow-temp
144 |
--------------------------------------------------------------------------------
/cartonizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import re
4 | import shlex
5 | import sys
6 | from pathlib import Path
7 | from pprint import pprint
8 |
9 | import click
10 | import ffmpeg
11 | import guessit
12 |
# Suffixes (lower-case) of external subtitle files picked up from
# --input-subtitle-path.
KNOWN_SUBTITLE_EXTENSIONS = [".ass"]
# Suffixes (lower-case) of the files map_episode_files() treats as episodes.
KNOWN_EXTENSIONS = [".mp4", ".mkv", ".ogm", ".avi"]
# (codec_name, profile) of the probed video stream -> label used in the
# generated release name.
VIDEO_MAPPING = {
    ("hevc", "Main 10"): "HEVC 10-bit",
    ("hevc", "Rext"): "HEVC 12-bit",
    ("h264", "High"): "h264",
    ("h264", "Main"): "h264",
    ("h264", "High 10"): "h264 10-bit",
}
# coded_height of the probed video stream -> resolution label. Heights not
# listed here are rendered as "<coded_width>x<coded_height>" by sync().
# NOTE(review): 1088 is presumably the padded coded height of 1080p video - confirm.
VIDEO_RESOLUTION_MAPPING = {
    1080: "1080p",
    1088: "1080p",
}
# codec_name of the probed audio stream -> label used in the release name.
AUDIO_MAPPING = {"flac": "FLAC", "aac": "AAC", "dts": "DTS-HDMA", "ac3": "AC3"}
# guessit "source" value -> short tag used in the release name.
SOURCE_MAPPING = {"Blu-ray": "BD", "DVD": "DVD"}

# Passed to milksync.py as --chapter-segment-name-start ([0]) and
# --chapter-segment-name-end ([1]) for opening and ending segment files.
OP_CHAPTER_NAMES = ["OP", "Episode"]
ED_CHAPTER_NAMES = ["ED", "Preview"]

# milksync.py flag emitted by sync() when --pre-generate-chroma is set.
CHROMA_GENERATE_PARAM = "--only-generate-chroma"
33 |
34 |
def map_episode_files(paths):
    """Group the video files found in ``paths`` by episode number.

    Every directory in ``paths`` is scanned (non-recursively) for regular
    files whose lower-cased suffix is in ``KNOWN_EXTENSIONS``. The episode
    number is taken from, in order:

    1. guessit's ``episode`` field,
    2. the leading number of guessit's ``episode_title`` (a trailing
       ``v<N>`` version marker is cut off),
    3. a ``第<N>話`` marker in the file name.

    Files for which none of these yields a number are skipped.

    Args:
        paths: Iterable of ``pathlib.Path`` directories to scan.

    Returns:
        dict mapping each episode number to the list of matching files, in
        discovery order.
    """
    episode_mapping = {}
    for path in paths:
        for f in path.iterdir():
            if not f.is_file():
                continue
            if f.suffix.lower() not in KNOWN_EXTENSIONS:
                continue

            info = guessit.guessit(f.name)
            episode = info.get("episode")
            if episode is None:
                episode_title = info.get("episode_title")
                if episode_title is not None:
                    try:
                        episode = int(episode_title.split(" ")[0].split("v")[0])
                    except (AttributeError, ValueError):
                        # A non-numeric title must not abort the whole scan;
                        # fall through to the file-name pattern below.
                        episode = None

            if episode is None:
                # Raw string: "\d" is an invalid escape in a normal literal.
                re_episode = re.findall(r"第(\d+)話", f.name)
                if not re_episode:
                    continue
                episode = int(re_episode[0])

            # guessit reports several episodes as a list; keep the last one.
            if isinstance(episode, list):
                episode = episode[-1]
            episode_mapping.setdefault(episode, []).append(f)
    return episode_mapping
59 |
60 |
# Root click command group; the `sync` and `ocr` subcommands register
# themselves on it through @cli.command().
@click.group()
def cli():
    pass
64 |
65 |
@cli.command()
@click.argument("path", type=click.Path(exists=True), nargs=-1, required=True)
@click.option("--input-subtitle-path", type=click.Path(exists=True))
@click.option("--op-ed-path", type=click.Path(exists=True), multiple=True)
@click.option("--group", type=str)
@click.option("--source", type=str)
@click.option("--audio", type=str)
@click.option("--title", type=str)
@click.option("--dual-audio", is_flag=True)
@click.option("--skip-chapters", is_flag=True)
@click.option("--pre-generate-chroma", is_flag=True)
@click.option("--skip-copy-oped", is_flag=True)
@click.option("--additional-params", type=str)
@click.option("--folder-name", type=str)
@click.option("--file-name-template", type=str)
@click.option("--output-subtitles-path", type=click.Path())
def sync(
    path,
    input_subtitle_path,
    op_ed_path,
    group,
    source,
    audio,
    title,
    dual_audio,
    skip_chapters,
    pre_generate_chroma,
    skip_copy_oped,
    additional_params,
    folder_name,
    file_name_template,
    output_subtitles_path,
):
    """Generate create_release.sh, a bash script that runs milksync.py per episode.

    PATH is one or more folders with video files. Files are grouped by
    episode number and every episode with at least two files gets one
    milksync.py invocation in the generated script.
    """
    # Everything interpolated into the script goes through shlex.quote so that
    # spaces and apostrophes in paths or titles cannot break the shell syntax.
    milksync_path = (Path(__file__).parent / "milksync.py").absolute()
    command_path = f"{shlex.quote(sys.executable)} {shlex.quote(str(milksync_path))}"

    if output_subtitles_path:
        output_subtitles_path = Path(output_subtitles_path)
        output_subtitles_path.mkdir(parents=True, exist_ok=True)

    # External subtitle files keyed by stem, matched later against the stem of
    # the first file of each episode.
    external_subtitles = {}
    if input_subtitle_path is not None:
        for s in Path(input_subtitle_path).iterdir():
            if s.suffix.lower() in KNOWN_SUBTITLE_EXTENSIONS:
                external_subtitles[s.stem] = s

    paths = [Path(p) for p in path]
    op_ed_dirs = [Path(p) for p in op_ed_path or ()]

    episode_mapping = map_episode_files(paths)
    if not episode_mapping:
        click.echo("No episode files found")
        sys.exit(1)

    # The lowest-numbered episode supplies the metadata for the release name.
    first_episode = episode_mapping[min(episode_mapping)]
    probe_result = ffmpeg.probe(str(first_episode[-1]))
    release_name = {
        "show_name": title or guessit.guessit(first_episode[0].name)["title"],
    }
    if source:
        release_name["source"] = source
    else:
        guessed_source = guessit.guessit(first_episode[-1].name).get("source")
        try:
            release_name["source"] = SOURCE_MAPPING[guessed_source]
        except (KeyError, TypeError):
            click.echo(f"Unknown source {guessed_source!r}, specify it with --source")
            sys.exit(1)
    if audio:
        release_name["audio"] = audio

    # Only the first video stream and the first audio stream are considered.
    for stream in probe_result["streams"]:
        codec_type = stream.get("codec_type")
        if codec_type == "video" and "video" not in release_name:
            key = (stream.get("codec_name"), stream.get("profile"))
            if key not in VIDEO_MAPPING:
                click.echo(f"Unknown video key {key=}")
                sys.exit(1)
            release_name["video"] = VIDEO_MAPPING[key]
            release_name["video_resolution"] = VIDEO_RESOLUTION_MAPPING.get(
                stream["coded_height"],
                f"{stream['coded_width']}x{stream['coded_height']}",
            )
        elif codec_type == "audio" and "audio" not in release_name:
            key = stream.get("codec_name")
            if key not in AUDIO_MAPPING:
                click.echo(f"Unknown audio key {key=}")
                sys.exit(1)
            release_name["audio"] = AUDIO_MAPPING[key]

    if not folder_name or not file_name_template:
        # The auto-generated names need every detected field.
        missing = [
            field
            for field in ("video_resolution", "video", "audio")
            if field not in release_name
        ]
        if missing:
            click.echo(
                f"Could not detect {', '.join(missing)} from {first_episode[-1].name}"
            )
            sys.exit(1)

        group_prefix = f"[{group}] " if group else ""
        dual_audio_suffix = " Dual-Audio" if dual_audio else ""
        name_head = f"{group_prefix}{release_name['show_name']}"
        name_tags = (
            f"({release_name['source']} {release_name['video_resolution']} "
            f"{release_name['video']} {release_name['audio']}{dual_audio_suffix})"
        )
        if not folder_name:
            folder_name = f"{name_head} {name_tags}"
        if not file_name_template:
            # The template is %-formatted later, so escape literal percent
            # signs coming from the title or the tags.
            file_name_template = (
                f"{name_head.replace('%', '%%')} - %s {name_tags.replace('%', '%%')}"
            )
    click.echo(f"Folder name: {folder_name}")
    click.echo(f"File name template: {file_name_template}")

    # Fail early with a readable message instead of a TypeError further down.
    try:
        file_name_template % "00"
    except (TypeError, ValueError):
        click.echo("File name template must contain a single %s placeholder")
        sys.exit(1)

    copy_files = []

    endings = []
    openings = []
    for op_ed_dir in op_ed_dirs:
        for f in op_ed_dir.iterdir():
            if "NCOP" in f.name:
                click.echo(f"Found OP {f.name}")
                openings.append(f)
            elif "NCED" in f.name:
                click.echo(f"Found ED {f.name}")
                endings.append(f)

    # Openings first, then endings; each kind sorted by file name.
    op_ed_chapter_command = []
    for base_name, segment_files, chapter_names in (
        ("NCOP", openings, OP_CHAPTER_NAMES),
        ("NCED", endings, ED_CHAPTER_NAMES),
    ):
        ordered = sorted(segment_files, key=lambda f: f.name)
        for i, segment_file in enumerate(ordered, 1):
            if not skip_chapters:
                op_ed_chapter_command.append(
                    f"--chapter-segment-file {shlex.quote(str(segment_file))}"
                    f" --chapter-segment-name-start {shlex.quote(chapter_names[0])}"
                    f" --chapter-segment-name-end {shlex.quote(chapter_names[1])}"
                )
            # Number the copies only when there is more than one of a kind.
            name = base_name
            if len(segment_files) > 1:
                name += str(i)
            if not skip_copy_oped:
                copy_files.append(
                    (
                        str(segment_file),
                        f"{folder_name}/{file_name_template % name}.mkv",
                    )
                )

    op_ed_chapter_command = "".join(f" {cmd} \\\n" for cmd in op_ed_chapter_command)
    # Zero-pad episode numbers to at least two digits.
    episode_num_length = max(max(len(str(k)) for k in episode_mapping), 2)

    output_file = []

    if pre_generate_chroma:
        chroma_files = [
            shlex.quote(str(f)) for files in episode_mapping.values() for f in files
        ]
        output_file.append("echo 'Generating chroma'")
        output_file.append(
            f"{command_path} {CHROMA_GENERATE_PARAM} {' '.join(chroma_files)}"
        )
    output_file.append(f"mkdir -p {shlex.quote(folder_name)}")

    # additional_params is passed through verbatim: it is a ready-made fragment
    # of milksync.py arguments supplied by the user.
    if additional_params:
        additional_params = f" {additional_params} \\\n"
    else:
        additional_params = ""

    for episode, files in sorted(episode_mapping.items()):
        if len(files) < 2:
            click.echo(f"Skipping episode {episode}")
            continue
        external_subtitle = ""
        if files[0].stem in external_subtitles:
            subtitle_file = str(external_subtitles[files[0].stem])
            external_subtitle = (
                f" --input-external-subtitle-track {shlex.quote(subtitle_file)} \\\n"
            )
        output_file.append("echo ''")
        output_file.append(f"echo 'Handling episode {episode}'")
        if output_subtitles_path:
            subtitle_target = str(
                output_subtitles_path / files[-1].with_suffix(".subtitle").name
            )
            output_subtitles = f" --output-subtitle {shlex.quote(subtitle_target)} \\\n"
        else:
            output_subtitles = ""
        file_args = "".join(f" {shlex.quote(str(f))} \\\n" for f in files)
        episode_number = str(episode).zfill(episode_num_length)
        output_path = f"{folder_name}/{file_name_template % episode_number}.mkv"
        output_file.append(
            f"{command_path} \\\n{file_args}{op_ed_chapter_command}{external_subtitle}"
            f"{additional_params}{output_subtitles} --output {shlex.quote(output_path)}"
        )

    output_file.append("echo ''")
    output_file.append("echo 'Copying files'")
    for src_f, dst_f in copy_files:
        if not src_f.lower().endswith(".mkv"):
            click.echo("Copy file is not an mkv")
            sys.exit(1)
        output_file.append(f"cp {shlex.quote(src_f)} {shlex.quote(dst_f)}")

    # End the script with a newline, as the ocr command does.
    Path("create_release.sh").write_text("\n".join(output_file) + "\n")
    click.echo("Release file created, run: bash create_release.sh")
247 |
248 |
@cli.command()
@click.argument("subbed_path", type=click.Path(exists=True), required=True)
@click.argument("unsubbed_path", type=click.Path(exists=True), required=True)
@click.option("--additional-params", type=str)
def ocr(
    subbed_path,
    unsubbed_path,
    additional_params,
):
    """Generate ocr_release.sh, a bash script that runs cowocr.py per episode.

    Files from SUBBED_PATH and UNSUBBED_PATH are grouped by episode number;
    every episode with at least two files gets an extract-subtitles and a
    create-report invocation.
    """
    paths = [Path(subbed_path), Path(unsubbed_path)]
    # Quote both parts so an interpreter or checkout path containing spaces
    # still yields a valid shell command.
    cowocr_path = (Path(__file__).parent / "cowocr.py").absolute()
    command_path = f"{shlex.quote(sys.executable)} {shlex.quote(str(cowocr_path))}"

    output_file = []

    episode_mapping = map_episode_files(paths)

    for episode, files in sorted(episode_mapping.items()):
        if len(files) < 2:
            click.echo(f"Skipping episode {episode}")
            continue
        output_file.append("echo ''")
        output_file.append(f"echo 'Handling episode {episode}'")
        file_args = "".join(f" {shlex.quote(str(f))} \\\n" for f in files)

        extract_command = f"{command_path} \\\n{file_args} extract-subtitles"
        # Add a continuation line only when there is something to put on it.
        # additional_params is passed through verbatim as a fragment of
        # cowocr.py arguments supplied by the user.
        if additional_params:
            extract_command += f" \\\n {additional_params}"
        output_file.append(extract_command)
        output_file.append(f"{command_path} \\\n{file_args} create-report")

    # The trailing empty entry makes the script end with a newline.
    output_file.append("")
    Path("ocr_release.sh").write_text("\n".join(output_file))
    click.echo("OCR script file created, run: bash ocr_release.sh")
282 |
283 |
# Dispatch to the click command group only when run as a script.
if __name__ == "__main__":
    cli()
286 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # The Cute Collection
2 |
3 | This is a bunch of tools to sync subtitles and audio tracks automatically.
4 |
5 | ## Requirements
6 |
7 | * Linux
8 | * 10GB of available memory
9 | * ffmpeg and ffprobe in environment path
10 | * Python 3.8+ (might work with earlier, not tested though)
11 | * Tesseract in environment path if you want to OCR
12 |
13 | ## Installation Instructions
14 |
15 | ```bash
16 | cd ~
17 | git clone https://github.com/JohnDoee/the-cute-collection.git the-cute-collection
18 | cd the-cute-collection
19 |
20 | python3 -m venv .env
21 |
22 | .env/bin/pip install -U setuptools pip wheel
23 | .env/bin/pip install ffmpeg-python click guessit opencv-python librosa pysubs2 scikit-image jinja2 lxml tqdm pyxdameraulevenshtein textblob jinja2 pytesseract lxml annoy
24 | ```
25 |
26 | ## Docker Installation and Usage Instructions
27 |
28 | A docker image is available too, the commands look like this:
29 |
30 | cartonizer: ```docker run -ti --rm -v `pwd`:/workdir/ johndoee/the-cute-collection cartonizer```
31 | cowocr: ```docker run -ti --rm -v `pwd`:/workdir/ johndoee/the-cute-collection cowocr```
32 | milksync: ```docker run -ti --rm -v `pwd`:/workdir/ johndoee/the-cute-collection milksync```
33 |
34 | To execute a command file generated by cartonizer (`create_release.sh` or `ocr_release.sh`), use this syntax:
35 |
36 | ```docker run -ti --rm -v `pwd`:/workdir/ johndoee/the-cute-collection bash ocr_release.sh```
37 |
38 | ## Cartonizer
39 |
40 | Generate a script for milksync to do bulk operations instead of one by one.
41 |
42 | ### How to use for automatic subtitle sync (milksync)
43 |
44 | The most basic usage is:
45 |
46 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cartonizer.py sync path-to-subbed path-to-unsubbed`
47 |
48 | Make sure the files are correctly matched; sometimes the matcher picks up multiple files that it should not.
49 |
50 | This will generate a bash script called `create_release.sh`, you just have to run it with `bash create_release.sh` and wait.
51 |
52 | A description of all arguments and how and when to use them.
53 |
54 | #### --op-ed-path
55 |
56 | Look for OP and ED in the given path and use them to auto-generate chapters and copy them to the result path.
57 |
58 | Example: `--op-ed-path Unsubbed-Files/NC-OP-ED-Folder/` - looks for OP ED in the specified folder.
59 |
60 | #### --group
61 |
62 | Put a group name in the result folder name.
63 |
64 | Example: `--group Horse` - Sets group name to Horse and prefixes the folder name and files with it.
65 |
66 | #### --source
67 |
68 | Specify the source of the video track, e.g. BD for Blu-ray. Will be auto-detected if not specified.
69 |
70 | Example: `--source VHS` - sets the source to the text string VHS.
71 |
72 | #### --audio
73 |
74 | Same as source but for the audio track.
75 |
76 | Example: `--audio Opus` - sets the audio source to the text string Opus.
77 |
78 | #### --title
79 |
80 | Sets the title of the release.
81 |
82 | Example: `--title Big Buck Bunny` - sets the title to the text string Big Buck Bunny.
83 |
84 | #### --dual-audio
85 |
86 | Marks the release as Dual-Audio
87 |
88 | Example: `--dual-audio` - Adds the text Dual-Audio to the release.
89 |
90 | #### --skip-chapters
91 |
92 | Skips adding chapters based on OP-ED specified with `--op-ed-path`.
93 | This is useful if you want to copy the NC-OP-ED files but copy the chapters from a release.
94 |
95 | Example: `--skip-chapters` - Instructs milksync to not assign chapters from OP & ED.
96 |
97 | #### --pre-generate-chroma
98 |
99 | Pre-generate chromas. This can sometimes reduce the total run time but is not recommended.
100 |
101 | Example: `--pre-generate-chroma` - Adds a line to the script that pre-generates chromas.
102 |
103 | #### --skip-copy-oped
104 |
105 | Skips copying OP-ED specified with `--op-ed-path`. This is useful if you created the files yourself just to assign the chapters.
106 |
107 | Example: `--skip-copy-oped` - Cartonizer does not add the line to copy the files to the release folder.
108 |
109 | #### --additional-params
110 |
111 | Pass additional arguments to `milksync.py`.
112 |
113 | Example: `--additional-params '--chapter-beginning Intro'` - Tells milksync to add a chapter to the beginning of the file.
114 |
115 | See milksync arguments for more.
116 |
117 | #### --folder-name
118 |
119 | Instead of auto-generating a foldername, use this name.
120 |
121 | Example: `--folder-name 'Happy Bunnies Riding The Wave (DVD)'`
122 |
123 | #### --file-name-template
124 |
125 | Instead of auto-generating a file name template, use this template.
126 | Must have a %s where the episode number is placed.
127 |
128 | Example: `--file-name-template 'Happy Bunnies Riding The Wave (DVD) %s'`
129 |
130 | ### How to use for ocr (cowocr)
131 |
132 | The most basic usage is:
133 |
134 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cartonizer.py ocr path-to-subbed path-to-unsubbed`
135 |
136 | Make sure the files are correctly matched; sometimes the matcher picks up multiple files that it should not.
137 |
138 | This will generate a bash script called `ocr_release.sh`, you just have to run it with `bash ocr_release.sh` and wait.
139 |
140 | A description of all arguments and how and when to use them.
141 |
142 | #### --additional-params
143 |
144 | Pass additional arguments to `cowocr.py`.
145 |
146 | Example: `--additional-params '--threads 1 --run-subregions-in-parallel'` - Tells cowocr to use 1 thread and run every subregion in parallel.
147 |
148 | See cowocr arguments for more.
149 |
150 | ### FAQ
151 |
152 | #### There is no OP/ED to assign chapters from (or it fails to use existing OP/ED) what do I do?
153 |
154 | The easiest way right now is to extract them manually, if you have a file named `Big Buck Bunny 01.mkv` and the chapters are like this in the file:
155 |
156 | * Opening: starts at 00:01:27.062 and stops at 00:02:57.123
157 | * Ending: starts at 00:22:11.362 and stops at 00:23:33.333
158 |
159 | Extract them with:
160 | ```
161 | mkdir extracted
162 | ffmpeg -i 'Big Buck Bunny 01.mkv' -ss 00:01:27.062 -to 00:02:57.123 -map a:0 extracted/NCOP-01.mkv
163 | ffmpeg -i 'Big Buck Bunny 01.mkv' -ss 00:22:11.362 -to 00:23:33.333 -map a:0 extracted/NCED-01.mkv
164 | ```
165 |
166 | These can then be used with `--op-ed-path extracted/ --skip-copy-oped`
167 |
168 | ## Milksync
169 |
170 | Compare audio tracks between two files and take subtitles and audio tracks from one and add to another. The intention is to remove the tedious work of manually aligning subtitles to a new file and give a far more exact
171 | result.
172 |
173 | ### How to use
174 |
175 | The most basic usage is:
176 |
177 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/milksync.py path-to-subbed/episode-01.mkv path-to-unsubbed/episode-01.mkv --output merged-episode-01.mkv`
178 |
179 | This will take video and audio from the last file and put subtitles from the first file into the merged file.
180 |
181 | The command prints out information about what is going on, e.g. where chapters are placed and how much subtitles are moved.
182 | Make sure to check the result, especially around the breakpoints. WARNINGs can also be a hint about what might be wrong with the resulting file.
183 |
184 | Remember, you can always modify a command and just run it again to see what happens. Second time is normally faster than the first too.
185 | Sometimes experimenting can help you on your way.
186 |
187 | A description of all arguments and how and when to use them.
188 |
189 | #### --only-generate-chroma
190 |
191 | Only extract audio from the file and generate index, this can sometimes be used to speed up the overall progress, not recommended.
192 |
193 | Example: `--only-generate-chroma` - Quits after extracting chroma
194 |
195 | #### --sync-using-subtitle-audio
196 |
197 | Use the audio where the subtitles run to sync a specific line. Good when video is partial or re-arranged. Bad for audio syncs.
198 |
199 | Example: `--sync-using-subtitle-audio` - Enable the sync feature.
200 |
201 | #### --skip-subtitles
202 |
203 | Do not copy any subtitles, can be used for e.g. dubs only releases or subtitles from another source that are not to be synced this way.
204 |
205 | Example: `--skip-subtitles` - No subtitles copied
206 |
207 | #### --skip-shift-point
208 |
209 | The script prints out the points it uses to shift the subtitles, sometimes one or more of them might be bad or you want to see what happens with them removed. They are index based and you have to count it yourself from the milksync output.
210 |
211 | Generally not used.
212 |
213 | Example: `--skip-shift-point 2,3` - Skips shift point 2 and 3.
214 |
215 | #### --subtitle-cutoff
216 |
217 | If the subtitles start too early or run too long, this command can cut off subtitles to prevent this. The command takes a number in seconds that can be both positive (count from beginning of video result file) and negative (count from end of video result file)
218 |
219 | Example: `--subtitle-cutoff -50` - The last 50 seconds of the result will not have any subtitles.
220 | Example: `--subtitle-cutoff 30` - The first 30 seconds of the result will not have any subtitles.
221 |
222 | #### --only-delta
223 |
224 | Instead of putting subtitles into buckets and adjusting them to fit in it, just modify the timestamp on the subtitles.
225 | This one is very useful if one input runs faster or slower than the other. This can often be seen in the milksync output as a lot of sync points that either decrease or increase in delta.
226 |
227 | Example: `--only-delta` - Enable delta mode instead of subtitle bucket mode.
228 |
229 | #### --align-framerate
230 |
231 | Align source framerate to target video framerate, when speedup/slowdown used as technique to change framerate.
232 |
233 | Example: `--align-framerate` - Enable the feature and change source framerate to target framerate.
234 |
235 | #### --align-frames-too
236 |
237 | When using `--only-delta` it can be helpful to look at frames too to find a better difference.
238 |
239 | Example: `--only-delta --align-frames-too` - Enables frame alignment.
240 |
241 | #### --preserve-silence
242 |
243 | When extracting chroma from the files the audio at the end is trimmed to prevent silence from blocking alignment, this disables that feature.
244 |
245 | Example: `--preserve-silence` - Preserves silence.
246 |
247 | #### --temp-folder
248 |
249 | Where to save temporary files, this includes extracted audio and subtitle tracks including chroma generated from audio files.
250 |
251 | Example: `--temp-folder '/tmp/milk-temp/'` - Saves temp files to the specified folder
252 |
253 | #### --audio-tracks
254 |
255 | Define which audio tracks to use for syncing audio tracks. Milksync only works when the audio tracks in the input files are the same, i.e. same language. If you take e.g. english and japanese audio tracks and try to use them, the results will vary quite a bit and likely not be very good.
256 |
257 | Input file index is defined as the order they are given to milksync.
258 |
259 | Example: `--audio-tracks 0:1,1:0` - Use audio track 1 from input file 0 and audio track 0 from input file 1.
260 |
261 | #### --adjust-shift-point
262 |
263 | Manually change a shift point. Can be used to see if the auto detect is not good enough or just modify it to work correctly. This is mostly used for debugging.
264 |
265 | Example: `--adjust-shift-point 0.3:10.3:1.3:11.3` - Set the first shift point to the specified values. Order is same as printed in milksync.
266 |
267 | #### --adjust-delay
268 |
269 | Manually adjust the delay to all points. Can be used for debugging.
270 |
271 | Example: `--adjust-delay 0.3` - Adds 0.3 second to every subtitle
272 |
273 | #### --sync-non-dialogue-to-video
274 |
275 | Sometimes the audio has been resynced to the video which means the speech subtitles and the sign subtitles must be synced independently.
276 | This flag tries to align signs to the video and speech to the audio, can be useful when the target is e.g. remastered. It can be very slow and quality can vary, the result is printed and you should check if the signs are positioned correctly.
277 |
278 | Example: `--sync-non-dialogue-to-video 0-1000` - Enables this feature for the given range of seconds.
279 |
280 | #### --chapter-source
281 |
282 | Specify which source file index to pull chapters from, these are synced in the same way as the audio tracks.
283 |
284 | If nothing chapter-related is specified, they are pulled from video source, i.e. last file.
285 |
286 | Example: `--chapter-source 0` - Take chapters from input file 0.
287 |
288 | #### --chapter-beginning
289 |
290 | Add a chapter to the beginning of the result. This means every part of the result will be part of a chapter.
291 |
292 | Example: `--chapter-beginning Beginning` - The first chapter at 00:00 is named Beginning.
293 |
294 | #### --chapter-segment-file
295 |
296 | Source file to generate chapter from, this is a part of the video that is sought for in the file. Useful for e.g. openings or endings.
297 |
298 | This is used in conjunction with `--chapter-segment-name-start` and `--chapter-segment-name-end`. Order matters and each `--chapter-segment-file` must have a `--chapter-segment-name-start` and `--chapter-segment-name-end`.
299 |
300 | Example: `--chapter-segment-file NCED-01.mkv` - Match content of NCED-01.mkv to the result video and add chapters if found.
301 |
302 | #### --chapter-segment-name-start
303 |
304 | Name of the chapter starting where the beginning of `--chapter-segment-file` is matched.
305 |
306 | Example: `--chapter-segment-name-start End` - Names the chapter that matches the beginning of `--chapter-segment-file` End.
307 |
308 | #### --chapter-segment-name-end
309 |
310 | Name of the chapter starting where the end of `--chapter-segment-file` is matched.
311 |
312 | Example: `--chapter-segment-name-end 'After End'` - Names the chapter that matches the end of `--chapter-segment-file` After End.
313 |
314 | #### --chapter-segment-required
315 |
316 | Enforces that every chapter segment must be matched.
317 |
318 | Example: `--chapter-segment-required` - If a chapter segment is not matched, it will quit with an error.
319 |
320 | #### --metadata-audio-track
321 |
322 | Manually set metadata for an audio track, this is passed directly to ffmpeg. These match the output mapping and not the input mapping.
323 |
324 | Example: `--metadata-audio-track 0=language=jpn --metadata-audio-track 0=title='Japanese' --metadata-audio-track 1=language=fra --metadata-audio-track 1=title='Bad french'` - Sets the first output audio track metadata to japanese with a matching title and the second audio track to french with a matching title.
325 |
326 | #### --metadata-subtitle-track
327 |
328 | Manually set metadata for a subtitle track, this is passed directly to ffmpeg. These match the output mapping and not the input mapping.
329 |
330 | Example: `--metadata-subtitle-track 0=language=jpn --metadata-subtitle-track 0=title='Japanese' --metadata-subtitle-track 1=language=fra --metadata-subtitle-track 1=title='Bad french'` - Sets the first output subtitle track metadata to japanese with a matching title and the second subtitle track to french with a matching title.
331 |
332 | #### --subtitle-min-font-size
333 |
334 | Increase font-size to minimum this. Sometimes subtitles are unreadable on the source.
335 |
336 | Example: `--subtitle-min-font-size 26` - Sets the font-size to, minimum, 26.
337 |
338 | #### --input-external-subtitle-track
339 |
340 | Use a specific external subtitle in output, it is assumed it matches video input 0.
341 |
342 | Example: `--input-external-subtitle-track subtitles.ass` - Assumes the subtitle matches input 0 and syncs it to output.
343 |
344 | #### --output-video-file-index
345 |
346 | Which file to pull video data from, this is normally the last specified file and is normally not used.
347 |
348 | Example: `--output-video-file-index 1` - Pull video data from the second input file.
349 |
350 | #### --output-audio-mapping
351 |
352 | Define which audio tracks the output has and where to pull them from. Defaults to using only first audio from the last input file, same source as video.
353 |
354 | Example: `--output-audio-mapping 0:0,1:2` - Takes the first audio track from the first input file and the third audio track from the second input file. The result file first audio track is 0:0 and the second is 1:2.
355 |
356 | #### --output-subtitle-mapping
357 |
358 | Define which subtitle tracks the output has and where to pull them from. Defaults to using only first subtitle from the first input file.
359 |
360 | Example: `--output-subtitle-mapping 1:1,1:0` - Takes the first and the second subtitle track from the second input file. The order is as specified, i.e. the tracks are flipped.
361 |
362 | #### --output
363 |
364 | Where to save the result.
365 |
366 | Example: `--output Result-EP01.mkv` - Saves the complete file to Result-EP01.mkv
367 |
368 | #### --output-subtitle
369 |
370 | Save the synced subtitles.
371 |
372 | Example: `--output-subtitle Result-EP01.ass` - Saves the subtitle file to Result-EP01.ass
373 |
374 | ## CowOCR
375 |
376 | Compare two video tracks and look for differences. The intention is to find differences as they will indicate e.g. subtitles and signs.
377 |
378 | The output is an .ass file and a report that can be used to verify and correct the output.
379 |
380 | ### How it works
381 |
382 | The base assumption that CowOCR relies on is to find the differences between the source and destination video. To do this it goes through a few steps.
383 |
384 | The initial differences are found by running the ORB algorithm against both source and target video, keypoints found at source but not found at target are assumed to be differences.
385 |
386 | We now have a region we can assume is different, we look for text in that one. Threshold algorithm is run against the source and matching white areas are extracted.
387 |
388 | To make sure what is part of the text the color of all found area is extracted and grouped using k-means. Areas with colors close enough to the majority color found are considered part of the text. Additionally the border color is used in the same way.
389 |
390 | A bruteforce is performed here to find the best text mask by cycling through the colors.
391 |
392 | With text found and a mask matching the text (where it is in the picture) it is now time to figure out when it starts and ends. This is done by looping through the frames before and after the current frame and see if the colors match the extracted text, i.e. is the same text in the frames before and after the current frame.
393 |
394 | ### How to use
395 |
396 | The most basic usage to extract-subtitles is:
397 |
398 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py path-to-subbed/episode-01.mkv path-to-unsubbed/episode-01.mkv extract-subtitles`
399 |
400 | This will compare the two video files and try to extract the subtitles.
401 |
402 | After the subtitles are extracted, a report plus an .ass file can be created from the output with the create-report command.
403 |
404 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py path-to-subbed/episode-01.mkv path-to-unsubbed/episode-01.mkv create-report`
405 |
406 | The report and subtitle is, per default, located in the cow-temp folder which is created relative to where the command was executed.
407 | In this example, in the folder that contains the path-to-subbed and path-to-unsubbed folder.
408 |
409 | Verify the subtitles and we're all done, now we just need them merged. For this we can use milksync with just one additional parameter, `--input-subtitle-path cow-temp/` - that will pull the subtitles from the .ass file instead of the source video file.
410 |
411 | This is likely not how the actual workflow will be. See further down for an actual workflow.
412 |
413 | A description of all arguments and how and when to use them.
414 |
415 | ### extract-subtitles arguments
416 |
417 | This command extracts the subtitles from the video
418 |
419 | #### --threads
420 |
421 | How many threads to extract subtitles with. Unless specified it runs a subtitle region at a time.
422 |
423 | Example: `--threads 1` - Use only one thread
424 |
425 | #### --tesseract-data-path
426 |
427 | Path to tesseract data path.
428 |
429 | Example: `--tesseract-data-path tess-data/` - Read data from the tess-data folder.
430 |
431 | #### --frame-diff
432 |
433 | When comparing source and target video it can sometimes be necessary to specify frame differences. It should be sufficient to rely
434 | on the auto detection though.
435 |
436 | Example: `--frame-diff 8` - The target is 8 frames ahead of the source.
437 |
438 | #### --frame-range
439 |
440 | Specify frames on the source to extract subtitles from, can be useful to e.g. skip OP/ED
441 |
442 | Example: `--frame-range 1000-5000` - Extracts subtitles from frame 1000 to 5000
443 |
444 | #### --ignore-diff-fps
445 |
446 | As it uses frame differences to find subtitles the FPS must be the same. Sometimes it can be ignored (e.g. if the source just runs faster but has the same frames). This option makes it ignore the criteria
447 |
448 | Example: `--ignore-diff-fps` - Ignores FPS differences
449 |
450 | #### --run-subregions-in-parallel
451 |
452 | Run extraction for each subtitle region in parallel. Each thread will run for each subtitle region so the total number
453 | of threads will be threads times subtitle region count.
454 |
455 | Example: `--run-subregions-in-parallel` - Run every subtitle region in parallel
456 |
457 | #### --fix-broken-frame-alignment
458 |
459 | Sometimes frames drift a bit differently so while the FPS is the same, one of the video files might have ghost frames or other annoyances. This tries to alleviate that issue.
460 |
461 | Example: `--fix-broken-frame-alignment` - Enable frame alignment fix.
462 |
463 | #### --debug-frame
464 |
465 | While you are editing the subtitle region configuration it is necessary to try and extract a specific frame to see the result.
466 | This is the command for that. It will run the current subtitle region configuration for the given frame and save an array of outputs
467 | to the temp-folder/debug.
468 |
469 | It will also print out what the various files contain.
470 |
471 | Example: `--debug-frame 1000` - Try to extract subtitles from source frame 1000.
472 |
473 | #### --debug-subregion
474 |
475 | In combination with --debug-frame it will use a specific subtitle region. If not specified, defaults to the first subtitle region.
476 |
477 | Example: `--debug-subregion bottom` - Extract using the subtitle region named bottom.
478 |
479 | ### create-report arguments
480 |
481 | This command turns the extracted subtitles into a report and an .ass file
482 |
483 | A report contains information for each subtitle region, this explanation is for the default config.
484 | The report is an html file you should open in your webbrowser, e.g. `cow-temp/Episode 1.avi-report/index.html`. In that report each region has two sections, "subtitle lines" and "missing regions".
485 |
486 | The "subtitle lines" are the found lines and these are reflected in the .ass file.
487 | With the bottom subtitles there are a few things:
488 |
489 | - A start and end timestamp of the subtitle
490 | - Start and end frame and the initial discovery frame.
491 | - The subtitle text
492 | - Four frames used to check if timing is correct, before first frame, first frame, last frame and after last frame. If before first or after last contains the matching text, then timing is off.
493 |
494 | The "missing regions" part contains images of stuff where there are differences between source and target but it was unable to discover what exactly. Sometimes it is short words or un-ocrable subtitles.
495 |
496 | A subtitle-region scan does not yield the same type of results as it is unable to merge subtitle lines in the same way. It also contains green squares for matched text under "subtitle signs" section.
497 |
498 | Make sure to browse through the "missing regions" section, no tool is perfect.
499 |
500 | #### --output-report-path
501 |
502 | Where to save the report generated. Defaults to the temp-dir.
503 |
504 | Example: `--output-report-path /mnt/sdd/subtitle-temp-reports` - Save the report to the specified path.
505 |
506 | #### --output-subtitle-path
507 |
508 | Where to save the .ass subtitle file. Defaults to the temp-dir.
509 |
510 | Example: `--output-subtitle-path /mnt/sdd/subtitle-temp-subs` - Save the subtitles to the specified path.
511 |
512 | ### subtitle_regions.json
513 |
514 | This file is generated in the temp folder when the command is first run. Any video that uses a specific temp folder will use the same
515 | subtitle region file. A description of all available options can be found here.
516 |
517 | #### name
518 |
519 | Name of the subtitle region. Used with e.g. --debug-subregion parameter.
520 |
521 | #### scan_mode
522 |
523 | Specify how to find subtitles in a region, there are two choices, `bottom_center` and `search_slice`.
524 |
525 | `bottom_center` looks for subtitles in the middle of the region and assumes there is, max, one subtitle in the given region.
526 | Useful for normal subtitles at the bottom of the screen.
527 |
528 | `search_slice` looks around for differences that contains text, useful for e.g. signs. Cannot merge similar regions and can create a lot of duplicate lines.
529 |
530 | #### y, h, x, w, margin
531 |
532 | Specifies the dimension of a subtitle region, it starts at `x`, `y` and ends at `x+w`, `y+h`. If you run with --debug-frame it will show where the regions are.
533 |
534 | The `margin` is part of the region that cannot contain subtitles and any object that are part of it will be removed, useful for `bottom_center` scan mode where normal subtitles are not in the margin.
535 |
536 | #### area_min, area_max, area_min_density
537 |
538 | Minimum `area_min` and maximum `area_max` number of pixels a letter can contain. Minimum density `area_min_density` a letter has.
539 |
540 | These can be useful to remove things that most certainly cannot be letters.
541 |
542 | #### max_w, max_h
543 |
544 | Maximum size of a letter in pixels.
545 |
546 | #### min_stroke_width, max_stroke_width
547 |
548 | Minimum and maximum stroke width a letter can have. These are measured at the thickest spot of a letter.
549 |
550 | Examples could be, a long thin line will have a width of 1px while a circle will have a width of its radius.
551 |
552 | #### border_size
553 |
554 | Assumed size of border.
555 |
556 | This will often be either 1 or 2, it depends a bit on how the "Threshold" debug image looks, e.g. does it consume lots of the border or not.
557 |
558 | See "How it works" to understand what it is useful for.
559 |
560 | #### max_text_diff, max_border_diff
561 |
562 | Maximum difference for text and border to be assumed part of the same text line.
563 |
564 | This depends a bit on how well the text is marked and extracted, if it finds too few letters it might be smart to turn them up and vice-versa if it finds too much.
565 |
566 | See "How it works" to understand what it is useful for.
567 |
568 | #### percent_good_border
569 |
570 | How much of a border of a given figure must be good to be assumed part of the text.
571 |
572 | See "How it works" to understand what it is useful for.
573 |
574 | #### edge_threshold
575 |
576 | Used in relation with finding the difference between source and target frames. Should probably not be touched.
577 |
578 | See "How it works" to understand what it might be useful for.
579 |
580 | #### threshold_mode, threshold_value
581 |
582 | Method and value to look for threshold with. There are two modes `adaptive` and `static`.
583 |
584 | `adaptive` finds out which pixel should be black and which should be white depending on the pixels around it. Can be useful if the inner text on the subtitles varies but is always bright. An example `threshold_value` for this could be 27, that will prevent most noise too.
585 |
586 | `static` is an absolute way of finding them, useful if the inner subtitle text is always bright and same color. An example `threshold_value` could be 200, which is the brightness cutoff.
587 |
588 | See "How it works" to understand what it is useful for and https://docs.opencv.org/4.5.2/d7/d4d/tutorial_py_thresholding.html for information about thresholds generally.
589 |
590 | #### ass_style_name
591 |
592 | Style name to use with text found here.
593 |
594 | #### invert_mode
595 |
596 | Not implemented, no effect.
597 |
598 | ### A realistic workflow
599 |
600 | In this example we have a set of 12 episodes we want to OCR, the source is 640x480 which matches the default subtitle region.
601 | Source files are located in `source-video` and the target files are in `target-video`.
602 |
603 | First we run cartonizer to create a batch script.
604 |
605 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cartonizer.py ocr source-video target-video --additional-params '--threads 1 --run-subregions-in-parallel'`
606 |
607 | This creates a file named `ocr_release.sh` and it will be the script we run when we have modified `subtitle_regions.json` enough.
608 |
609 | We open up the `ocr_release.sh` file and find OCR of the first episode. We need temp folder and configuration created first before we can OCR it all.
610 |
611 | ```
612 | ~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py \
613 | 'source-video/Episode 1.mkv' \
614 | 'target-video/Episode 1.mkv' \
615 | extract-subtitles \
616 | --threads 1 --run-subregions-in-parallel
617 | ```
618 |
619 | That is the command that extracts subtitles from the first episode, it will be the one we use for modifying `subtitle_regions.json`.
620 |
621 | Let's see how good the default config is by running it against part of the episode, 5000 frames should suffice (that is 3.5 minutes at 23.976 fps).
622 |
623 | ```
624 | ~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py \
625 | 'source-video/Episode 1.mkv' \
626 | 'target-video/Episode 1.mkv' \
627 | extract-subtitles \
628 | --threads 1 --run-subregions-in-parallel \
629 | --frame-range 5000-10000 # framerange we use to see
630 | ```
631 |
632 | After it is done, create a report and see the result with
633 |
634 | ```
635 | ~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py \
636 | 'source-video/Episode 1.mkv' \
637 | 'target-video/Episode 1.mkv' \
638 | create-report
639 | ```
640 |
641 | The report is in the cow-temp folder in this example including an .ass file and the `subtitle_regions.json` file.
642 |
643 | To modify and test changes to `subtitle_regions.json` we find a good subtitle frame number in the report and use that.
644 |
645 | ```
646 | ~/the-cute-collection/.env/bin/python ~/the-cute-collection/cowocr.py \
647 | 'source-video/Episode 1.mkv' \
648 | 'target-video/Episode 1.mkv' \
649 | extract-subtitles \
650 | --threads 1 --run-subregions-in-parallel \
651 | --debug-frame 13754 --debug-subregion bottom
652 | ```
653 |
654 | We can then run the initial 5000 frames again and see if the result is good enough. If it is, then just run the whole `ocr_release.sh`.
655 |
656 | When it is done the .ass in the cow-temp folder must be modified and the report followed. I do this by loading the subtitle file and source episode file into Aegisub.
657 |
658 | The subtitles must now be synced with the video, chapters added and other stuff. This can be done with MilkSync.
659 |
660 | `~/the-cute-collection/.env/bin/python ~/the-cute-collection/cartonizer.py sync source-video target-video --additional-params '--external-subtitles cow-temp/'`
661 |
662 | Then run `create_release.sh` and you got a fully synced video.
663 |
664 | # License
665 |
666 | AGPL
--------------------------------------------------------------------------------
/cowocr.py:
--------------------------------------------------------------------------------
1 | import concurrent.futures
2 | import dataclasses
3 | import json
4 | import math
5 | import re
6 | import shlex
7 | import threading
8 | import time
9 | import traceback
10 | from collections import namedtuple
11 | from pathlib import Path
12 |
13 | import click
14 | import cv2
15 | import jinja2
16 | import lxml.html
17 | import matplotlib
18 | import matplotlib.pyplot as plt
19 | import numpy as np
20 | import pysubs2
21 | import pytesseract
22 | from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance
23 | from scipy.spatial.distance import cdist, cosine
24 | from skimage.metrics import structural_similarity
25 | from textblob import TextBlob
26 | from tqdm import tqdm, trange
27 |
28 | BASE_ASS = r"""[Script Info]
29 | ; Script generated by Aegisub 3.2.2
30 | ; http://www.aegisub.org/
31 | Title: CowOCR
32 | ScriptType: v4.00+
33 | WrapStyle: 0
34 | ScaledBorderAndShadow: yes
35 | PlayResX: 640
36 | PlayResY: 480
37 |
38 | [V4+ Styles]
39 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
40 | Style: Default,Open Sans Semibold,26.0,&H00FFFFFF,&H000000FF,&H00020713,&H00000000,-1,0,0,0,100.0,100.0,0.0,0.0,1,1.7,0.0,2,0,0,28,1
41 | Style: Sign,Open Sans Semibold,26.0,&H00FFFFFF,&H000000FF,&H00020713,&H00000000,-1,0,0,0,100.0,100.0,0.0,0.0,1,1.7,0.0,2,0,0,28,1
42 | Style: Note,Open Sans Semibold,20.0,&H00FFFFFF,&H000000FF,&H00020713,&H00000000,-1,0,0,0,100.0,100.0,0.0,0.0,1,1.7,0.0,8,0,0,28,1
43 |
44 | [Events]
45 | Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
46 | """
47 |
48 | HTML_BASE = r"""
49 |
50 |