├── .github ├── FUNDING.yml └── workflows │ └── ci.yml ├── main.py ├── tests ├── samples │ ├── the_godfather_1972.da.srt │ ├── the_godfather_1972.es.srt │ ├── the_godfather_1972.he.srt │ ├── the_godfather_1972.it.srt │ ├── the_godfather_1972.ko.srt │ └── the_godfather_1972.pt.srt ├── __init__.py ├── tests_stream.py └── tests_analysis.py ├── publish.bat ├── .gitignore ├── .editorconfig ├── requirements.txt ├── test.py ├── curator ├── plans │ ├── __init__.py │ ├── link.py │ ├── sync.py │ ├── rename.py │ ├── tag.py │ ├── convert.py │ └── merge.py ├── __init__.py ├── database.py ├── databases │ ├── __init__.py │ ├── omdb.py │ ├── tmdb.py │ └── imdb.py ├── util.py ├── task.py ├── analysis.py ├── plan.py ├── media.py ├── cli.py ├── tui.py └── stream.py ├── docs ├── index.md └── images │ ├── curator-tag.svg │ ├── curator-rename.svg │ └── curator-merge.svg ├── setup.py ├── scripts └── generate_screenshots.py ├── README.md └── LICENSE /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [AlexAltea] 2 | patreon: AlexAltea 3 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from curator.cli import main 4 | 5 | if __name__ == '__main__': 6 | main() 7 | -------------------------------------------------------------------------------- /tests/samples/the_godfather_1972.da.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/curator/HEAD/tests/samples/the_godfather_1972.da.srt -------------------------------------------------------------------------------- /tests/samples/the_godfather_1972.es.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/curator/HEAD/tests/samples/the_godfather_1972.es.srt 
-------------------------------------------------------------------------------- /tests/samples/the_godfather_1972.he.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/curator/HEAD/tests/samples/the_godfather_1972.he.srt -------------------------------------------------------------------------------- /tests/samples/the_godfather_1972.it.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/curator/HEAD/tests/samples/the_godfather_1972.it.srt -------------------------------------------------------------------------------- /tests/samples/the_godfather_1972.ko.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/curator/HEAD/tests/samples/the_godfather_1972.ko.srt -------------------------------------------------------------------------------- /tests/samples/the_godfather_1972.pt.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/curator/HEAD/tests/samples/the_godfather_1972.pt.srt -------------------------------------------------------------------------------- /publish.bat: -------------------------------------------------------------------------------- 1 | del dist\* 2 | python setup.py bdist_wheel --universal 3 | gpg --detach-sign -u FA31DF0C -a dist/* 4 | twine upload dist/* 5 | pause 6 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Curator. 
5 | """ 6 | 7 | # Imports 8 | from .tests_analysis import * 9 | from .tests_stream import * 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IDEs 2 | .idea 3 | .vscode 4 | 5 | # Python 6 | .venv 7 | __pycache__ 8 | *.pyc 9 | *.pyo 10 | *.pyd 11 | 12 | # Package 13 | /*egg-info 14 | /build 15 | /dist 16 | 17 | temp* 18 | private -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig 2 | # http://editorconfig.org 3 | 4 | root = true 5 | 6 | [**.{py}] 7 | indent_style = space 8 | indent_size = 4 9 | trim_trailing_whitespace = true 10 | insert_final_newline = true 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | arrow==1.2.3 2 | chardet==3.0.4 3 | iso639-lang==2.1.0 4 | langid==1.1.6 5 | milli==1.0.0 6 | numpy>=1.21.6 7 | openai-whisper==20230918 8 | pandas==1.5.3 9 | pysrt==1.1.2 10 | requests==2.28.2 11 | textdistance==4.5.0 12 | textual==0.40.0 13 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Curator. 
5 | """ 6 | 7 | from tests import * 8 | 9 | def test(): 10 | test_analysis() 11 | test_stream() 12 | print('All tests passed successfully.') 13 | 14 | if __name__ == '__main__': 15 | test() 16 | -------------------------------------------------------------------------------- /curator/plans/__init__.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | from .convert import * 3 | from .link import * 4 | from .merge import * 5 | from .rename import * 6 | from .sync import * 7 | from .tag import * 8 | 9 | # Prevent polluting namespace 10 | del convert 11 | del link 12 | del merge 13 | del rename 14 | del sync 15 | del tag 16 | -------------------------------------------------------------------------------- /curator/__init__.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | from .analysis import * 3 | from .database import * 4 | from .media import * 5 | from .plan import * 6 | from .stream import * 7 | from .task import * 8 | 9 | # Prevent polluting namespace 10 | del analysis 11 | del database 12 | del media 13 | del plan 14 | del stream 15 | del task 16 | -------------------------------------------------------------------------------- /curator/database.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class Database: 4 | def __init__(self, name): 5 | self.name = name 6 | 7 | # Set cache directory 8 | cache_root = os.getenv("XDG_CACHE_HOME", 9 | os.path.join(os.path.expanduser("~"), ".cache")) 10 | self.cache = os.path.join(cache_root, "curator", name) 11 | os.makedirs(self.cache, exist_ok=True) 12 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | Curator documentation 2 | ===================== 3 | 4 | ## Plan 5 | 6 | Collection of one or more *Tasks*. 7 | 8 | *TODO*. 
9 | 10 | ## Task 11 | 12 | Atomic operation over one or more *Media* or *Streams*. 13 | 14 | *TODO*. 15 | 16 | ## Media 17 | 18 | File or link to a file that contains *Streams*. 19 | 20 | *TODO*. 21 | 22 | ## Stream 23 | 24 | Video, audio, subtitle or data tracks pertaining to some *Media*. 25 | 26 | *TODO*. 27 | -------------------------------------------------------------------------------- /curator/databases/__init__.py: -------------------------------------------------------------------------------- 1 | # Import and return database 2 | def get_database(name, *args, **kwargs): 3 | if name == 'imdb': 4 | from .imdb import ImdbDatabase 5 | return ImdbDatabase(*args, **kwargs) 6 | if name == 'omdb': 7 | from .omdb import OmdbDatabase 8 | return OmdbDatabase(*args, **kwargs) 9 | if name == 'tmdb': 10 | from .tmdb import TmdbDatabase 11 | return TmdbDatabase(*args, **kwargs) 12 | else: 13 | raise ValueError("Unknown database name") 14 | -------------------------------------------------------------------------------- /curator/plans/link.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from curator import Plan, Task, Media 4 | 5 | class LinkPlan(Plan): 6 | def columns(self): 7 | return [ 8 | { 'name': 'Name', 'width': '100%' }, 9 | ] 10 | 11 | class LinkTask(Task): 12 | def __init__(self, input, output): 13 | super().__init__([input], [output]) 14 | assert(output.type == Media.TYPE_LINK) 15 | 16 | def view(self): 17 | return [(self.inputs[0].name,)] 18 | 19 | def apply(self): 20 | src = self.inputs[0].path 21 | lnk = self.outputs[0].path 22 | os.symlink(src, lnk) 23 | 24 | def plan_link(media, output): 25 | plan = LinkPlan() 26 | for m in media: 27 | path = os.path.join(output, m.name) 28 | link = Media(path, Media.TYPE_LINK) 29 | task = LinkTask(m, link) 30 | plan.add_task(task) 31 | return plan 32 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: 
-------------------------------------------------------------------------------- 1 | name: ci 2 | on: [push] 3 | jobs: 4 | build: 5 | runs-on: ubuntu-latest 6 | strategy: 7 | matrix: 8 | python-version: ["3.8"] 9 | 10 | steps: 11 | - uses: actions/checkout@v3 12 | 13 | # Install and cache FFmpeg 14 | - uses: FedericoCarboni/setup-ffmpeg@v2 15 | name: Set up FFmpeg 16 | 17 | # Install and cache Python dependencies 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | cache: 'pip' 23 | - name: Install Python dependencies 24 | run: pip install -r requirements.txt 25 | 26 | # Package tests 27 | - name: Test package 28 | run: | 29 | python test.py 30 | python setup.py check --strict --metadata 31 | - name: Install package 32 | run: | 33 | pip install . 34 | -------------------------------------------------------------------------------- /tests/tests_stream.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Curator. 
5 | """ 6 | 7 | from curator.media import * 8 | from curator.stream import * 9 | 10 | def test_detect_subtitle_language(): 11 | srt_lang = lambda path: Media(path).get_streams()[0].detect_subtitle_language() 12 | assert(srt_lang("tests/samples/the_godfather_1972.da.srt") == 'dan') 13 | assert(srt_lang("tests/samples/the_godfather_1972.en.srt") == 'eng') 14 | assert(srt_lang("tests/samples/the_godfather_1972.es.srt") == 'spa') 15 | assert(srt_lang("tests/samples/the_godfather_1972.fr.srt") == 'fra') 16 | assert(srt_lang("tests/samples/the_godfather_1972.he.srt") == 'heb') 17 | assert(srt_lang("tests/samples/the_godfather_1972.it.srt") == 'ita') 18 | assert(srt_lang("tests/samples/the_godfather_1972.ko.srt") == 'kor') 19 | assert(srt_lang("tests/samples/the_godfather_1972.pl.srt") == 'pol') 20 | assert(srt_lang("tests/samples/the_godfather_1972.pt.srt") == 'por') 21 | assert(srt_lang("tests/samples/the_godfather_1972.zh.srt") == 'zho') 22 | 23 | def test_stream(): 24 | test_detect_subtitle_language() 25 | -------------------------------------------------------------------------------- /curator/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | 5 | from collections.abc import Iterable 6 | 7 | def confirm(question, default="yes"): 8 | valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False} 9 | if default is None: 10 | prompt = " [y/n] " 11 | elif default == "yes": 12 | prompt = " [Y/n] " 13 | elif default == "no": 14 | prompt = " [y/N] " 15 | else: 16 | raise ValueError(f"Invalid default answer: '{default}'") 17 | 18 | while True: 19 | sys.stdout.write(question + prompt) 20 | choice = input().lower() 21 | if default is not None and choice == "": 22 | return valid[default] 23 | elif choice in valid: 24 | return valid[choice] 25 | else: 26 | sys.stdout.write("Please respond 'yes' or 'no' ('y' or 'n').\n") 27 | 28 | def flatten(xs): 29 | for x in xs: 30 | if 
isinstance(x, Iterable) and not isinstance(x, (str, bytes)): 31 | yield from flatten(x) 32 | else: 33 | yield x 34 | 35 | def find_executable(name, hints=[]): 36 | path = shutil.which(name) 37 | if path: 38 | return path 39 | for hint in hints: 40 | if os.path.exists(hint): 41 | return hint 42 | return None 43 | -------------------------------------------------------------------------------- /curator/task.py: -------------------------------------------------------------------------------- 1 | class Task: 2 | def __init__(self, inputs=[], outputs=[]): 3 | self.inputs = inputs 4 | self.outputs = outputs 5 | self.enabled = True 6 | self.warnings = set() 7 | self.errors = set() 8 | self.id = None 9 | self.failed = False 10 | 11 | def add_warning(self, warning): 12 | self.warnings.add(warning) 13 | 14 | def add_error(self, error): 15 | self.errors.add(error) 16 | self.enabled = False 17 | 18 | def combine(self, other): 19 | assert(self.inputs == other.inputs) 20 | assert(self.outputs == other.outputs) 21 | assert(self.enabled == other.enabled) 22 | self.warnings |= other.warnings 23 | self.errors |= other.errors 24 | 25 | def concat(self, other): 26 | assert(self.outputs == self.inputs) 27 | raise Exception("Unimplemented") 28 | 29 | # Helpers 30 | def input_streams(self): 31 | for media in self.inputs: 32 | for stream in media.get_streams(): 33 | yield stream 34 | 35 | def input_video_streams(self): 36 | for stream in self.input_streams(): 37 | if stream.is_video(): 38 | yield stream 39 | 40 | def input_audio_streams(self): 41 | for stream in self.input_streams(): 42 | if stream.is_audio(): 43 | yield stream 44 | 45 | def input_subtitle_streams(self): 46 | for stream in self.input_streams(): 47 | if stream.is_subtitle(): 48 | yield stream 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 
| 4 | import setuptools 5 | 6 | from pkg_resources import parse_requirements 7 | 8 | CURATOR_VERSION = '0.1.1' 9 | CURATOR_REPOSITORY_URL = 'https://github.com/AlexAltea/curator' 10 | CURATOR_DOWNLOAD_URL = 'https://github.com/AlexAltea/curator/tarball/' + CURATOR_VERSION 11 | 12 | # Description 13 | CURATOR_DESCRIPTION = """Curator 14 | ========= 15 | 16 | .. image:: https://github.com/AlexAltea/curator/actions/workflows/ci.yml/badge.svg 17 | :target: https://github.com/AlexAltea/curator/actions/workflows/ci.yml 18 | 19 | Automated normalization and curating of media collections. Written in Python 3.x. 20 | 21 | More information at: https://github.com/AlexAltea/curator 22 | """ 23 | 24 | with open('requirements.txt', 'r') as f: 25 | requirements = [str(req) for req in parse_requirements(f)] 26 | 27 | setuptools.setup( 28 | name='curator', 29 | version=CURATOR_VERSION, 30 | description='Automated normalization and curating of media collections', 31 | long_description=CURATOR_DESCRIPTION, 32 | license='Apache-2.0', 33 | author='Alexandro Sanchez Bach', 34 | author_email='alexandro@phi.nz', 35 | url=CURATOR_REPOSITORY_URL, 36 | download_url=CURATOR_DOWNLOAD_URL, 37 | packages=['curator', 'curator.databases', 'curator.plans'], 38 | entry_points={ 39 | 'console_scripts': ['curator=curator.cli:main'], 40 | }, 41 | install_requires=requirements, 42 | classifiers=[ 43 | 'Intended Audience :: Developers', 44 | 'License :: OSI Approved :: Apache Software License v2.0', 45 | 'Programming Language :: Python :: 3.8', 46 | 'Natural Language :: English', 47 | ], 48 | ) 49 | -------------------------------------------------------------------------------- /curator/analysis.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | YEAR_MIN = 1800 4 | YEAR_MAX = 2030 5 | 6 | def detect_year(name): 7 | """ 8 | Extract the movie year given the file name. 
9 | Assumptions: 10 | - Year is a 4-digit number, optionally surrounded by non-word characters. 11 | - Year is interpreted as a Gregorian calendar integer in range [YEAR_MIN, YEAR_MAX]. 12 | - Year is the rightmost string satisfying these two conditions. 13 | - Year is never found at the beginning of the file name. 14 | """ 15 | matches = re.finditer(r'(?:\b|_)(\d{4})(?:\b|_)', name) 16 | for match in reversed(list(matches)): 17 | if match.start() == 0: 18 | return None 19 | year = int(match.group(1)) 20 | if YEAR_MIN <= year <= YEAR_MAX: 21 | return year 22 | return None 23 | 24 | def detect_name(name, year=None): 25 | """ 26 | Extract the movie name given the file name. 27 | Optionally provide the movie release year, as tokenization hint. 28 | Assumptions: 29 | - Name appears before the year. 30 | - Name does not contatain parenthesis or brackets. 31 | """ 32 | # Trim anything after year 33 | if year is None: 34 | year = detect_year(name) 35 | if year: 36 | name = name[:name.rfind(str(year))] 37 | # Normalize scene releases 38 | if not ' ' in name: 39 | name = name.replace('.', ' ') 40 | name = name.replace('_', ' ') 41 | # Extract matching left-starting pattern as name 42 | match = re.match(r'[\w\s\,\.\-\'\&]+', name) 43 | if match: 44 | return match[0].strip() 45 | return None 46 | 47 | def detect_tags(name): 48 | """ 49 | Extract the file tags in the file name. 50 | Assumptions: 51 | - Tags are surrounded by square brackets. 52 | - Tags do not contain any kind of brackets within. 53 | - Multiple tags can exist. 
54 | """ 55 | matches = re.findall(r'\[([\w\-\,\.]+)\]', name) 56 | return matches 57 | -------------------------------------------------------------------------------- /curator/plan.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | # Configuration 5 | DEFAULT_UI_BACKEND = 'tui' 6 | 7 | class Plan: 8 | def __init__(self): 9 | self.tasks = [] 10 | self.last_id = 0 11 | 12 | def __iter__(self): 13 | for task in self.tasks: 14 | yield task 15 | 16 | def __len__(self): 17 | return len(self.tasks) 18 | 19 | def __getitem__(self, index): 20 | return self.tasks[index] 21 | 22 | def is_empty(self): 23 | return len(self.tasks) == 0 24 | 25 | def add_task(self, task): 26 | self.last_id += 1 27 | self.tasks.append(task) 28 | task.id = self.last_id 29 | 30 | def optimize(self): 31 | logging.debug('This plan does not support optimizations') 32 | 33 | def validate(self): 34 | outputs = set() 35 | for task in self.tasks: 36 | for output in task.outputs: 37 | path = output.path 38 | if path in outputs: 39 | task.add_error(f'Output {path} already exists in the plan') 40 | if os.path.exists(path): 41 | task.add_error(f'Output {path} already exists in the filesystem') 42 | outputs.add(path) 43 | 44 | def apply(self): 45 | for task in self.tasks: 46 | if task.enabled: 47 | try: 48 | task.apply() 49 | except Exception as e: 50 | task.failed = True 51 | print(f'Task #{task.id} with input {task.inputs[0]} failed:\n{e}') 52 | 53 | def show(self): 54 | from .tui import print_plan 55 | thead, tbody = self.show_tasks() 56 | tbody = list(map(lambda row: tuple(map(str, row)), tbody)) 57 | print_plan(thead, tbody) 58 | 59 | def show_tasks(self): 60 | thead = tuple(map(lambda c: c['name'], self.columns())) 61 | tbody = [] 62 | for task in self.tasks: 63 | tbody += task.view() 64 | return thead, tbody 65 | 66 | def edit(self, backend=DEFAULT_UI_BACKEND): 67 | from .tui import EditorApp 68 | if backend == 'tui': 69 | app = 
EditorApp(self) 70 | app.run() 71 | -------------------------------------------------------------------------------- /curator/plans/sync.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | from curator import Plan, Task, Media 5 | 6 | class SyncPlan(Plan): 7 | def columns(self): 8 | return [ 9 | { 'name': 'Input', 'width': '100%' }, 10 | { 'name': 'Old start', 'width': '9' }, 11 | { 'name': '+', 'width': '1' }, 12 | { 'name': 'Delta', 'width': '9' }, 13 | { 'name': '→', 'width': '1' }, 14 | { 'name': "New start", 'width': '9' }, 15 | ] 16 | 17 | class SyncTask(Task): 18 | def __init__(self, input, output, start, delta): 19 | super().__init__([input], [output]) 20 | self.start = start # Just for debugging 21 | self.delta = delta 22 | 23 | def view(self): 24 | t0 = self.start 25 | t1 = self.start + self.delta 26 | dt = self.delta 27 | return [(self.inputs[0].name, t0, "+", dt, "→", t1)] 28 | 29 | def apply(self): 30 | si = self.inputs[0] 31 | so = self.outputs[0] 32 | 33 | # Build ffmpeg command 34 | cmd = ['ffmpeg'] 35 | cmd += ['-i', si.media.path] 36 | cmd += ['-itsoffset', str(self.delta)] 37 | cmd += ['-i', si.media.path] 38 | cmd += ['-c:v', 'copy'] 39 | cmd += ['-c:a', 'copy'] 40 | cmd += ['-c:s', 'copy'] 41 | 42 | # Select streams respecting input order 43 | for i in range(si.media.num_streams()): 44 | if i == si.index: 45 | cmd += ['-map', f'1:{i}'] 46 | else: 47 | cmd += ['-map', f'0:{i}'] 48 | 49 | # Generate and replace from temporary directory 50 | with tempfile.TemporaryDirectory(dir=si.media.dir, prefix='.temp-curator-') as tmp: 51 | output = os.path.join(tmp, f'output.{si.media.ext}') 52 | cmd += [output] 53 | result = subprocess.run(cmd, capture_output=True) 54 | if result.returncode != 0: 55 | errors = result.stderr.decode('utf-8') 56 | raise Exception(f"Failed to sync {self.outputs[0].name} with ffmpeg:\n{errors}") 57 | os.replace(output, so.media.path) 58 | 59 | def 
plan_sync(media): 60 | plan = SyncPlan() 61 | for m in media: 62 | pass # TODO 63 | return plan 64 | -------------------------------------------------------------------------------- /tests/tests_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Curator. 5 | """ 6 | 7 | from curator.analysis import * 8 | 9 | def test_analysis_years(): 10 | # Scene-syntax testing 11 | assert(1990 == detect_year('Jacobs.Ladder.1990.720p.BluRay.x264.YIFY')) 12 | assert(1968 == detect_year('2001.A.Space.Odyssey.1968.1080p.BluRay.x264-[YTS.AM]')) 13 | assert(2008 == detect_year('10000.BC.2008_HDRip_[scarabey.org]')) 14 | assert(2014 == detect_year('Interstellar.2014.4K.UltraHD.BluRay.2160p.x264.DTS-HD.MA.5.1.AAC.5.1-POOP')) 15 | 16 | # Custom-syntax testing 17 | assert(2013 == detect_year('Coherence (2013) [English]')) 18 | assert(2003 == detect_year('Bad Santa (Extended Cut) (2003) [English]')) 19 | assert(1984 == detect_year('1984 (1984) [English]')) 20 | assert(None == detect_year('Ani-Kuri 15 [Japanese]')) 21 | 22 | # Stress testing 23 | assert(None == detect_year('2000')) 24 | assert(None == detect_year('x2000')) 25 | assert(2000 == detect_year('1234 2000')) 26 | assert(2000 == detect_year('1234 2000 1080')) 27 | assert(2000 == detect_year('1234 2000 1080 x1999')) 28 | assert(2000 == detect_year('1234 2000 1080 1999x')) 29 | assert(None == detect_year('1234')) 30 | assert(None == detect_year('')) 31 | 32 | def test_analysis_names(): 33 | # Scene-syntax testing 34 | assert(detect_name('Jacobs.Ladder.1990.720p.BluRay.x264.YIFY') 35 | == 'Jacobs Ladder') 36 | assert(detect_name('2001.A.Space.Odyssey.1968.1080p.BluRay.x264-[YTS.AM]') 37 | == '2001 A Space Odyssey') 38 | assert(detect_name('10000.BC.2008_HDRip_[scarabey.org]') 39 | == '10000 BC') 40 | assert(detect_name('Interstellar.2014.4K.UltraHD.BluRay.2160p.x264.DTS-HD.MA.5.1.AAC.5.1-POOP') 41 | == 'Interstellar') 42 | 43 | 
# Custom-syntax testing 44 | assert(detect_name('Coherence (2013) [English]') 45 | == 'Coherence') 46 | assert(detect_name('Bad Santa (Extended Cut) (2003) [English]') 47 | == 'Bad Santa') 48 | assert(detect_name('1984 (1984) [English]') 49 | == '1984') 50 | assert(detect_name('Ani-Kuri 15 [Japanese]') 51 | == 'Ani-Kuri 15') 52 | 53 | def test_analysis(): 54 | test_analysis_years() 55 | test_analysis_names() 56 | -------------------------------------------------------------------------------- /curator/databases/omdb.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import csv 3 | import io 4 | import logging 5 | import math 6 | import os 7 | 8 | import arrow 9 | import milli 10 | import requests 11 | 12 | from curator import Database 13 | 14 | class OmdbDatabase(Database): 15 | def __init__(self): 16 | super().__init__("omdb") 17 | 18 | # Check if cached index exists 19 | suffix = arrow.utcnow().format('YYYY-MM-DD') 20 | cache_name = f'index_milli_{suffix}' 21 | cache_path = os.path.join(self.cache, cache_name) 22 | if os.path.exists(cache_path): 23 | self.ix = milli.Index(cache_path, 1024*1024*1024) # 1 GiB 24 | return 25 | 26 | # Otherwise create one 27 | logging.info("Creating movie index...") 28 | csv1 = self.get_omdb_dataset('all_movies') 29 | csv2 = self.get_omdb_dataset('all_movie_aliases_iso') 30 | movies = {} 31 | for row in csv1: 32 | movie_id = int(row['id']) 33 | movies[movie_id] = { 34 | 'id': movie_id, 35 | 'name': row['name'], 36 | 'year': row['date'][:4], 37 | 'aliases': [], 38 | } 39 | for row in csv2: 40 | movie_id = int(row['movie_id']) 41 | movie = movies.setdefault(movie_id, { 'id': movie_id, 'aliases': [] }) 42 | movie['aliases'].append(row['name']) 43 | os.mkdir(cache_path) 44 | self.ix = milli.Index(cache_path, 1024*1024*1024) # 1 GiB 45 | self.ix.add_documents(list(movies.values())) 46 | 47 | def get_omdb_dataset(self, name): 48 | suffix = arrow.utcnow().format('MM_DD_YYYY') 49 | 
cache_name = f'{name}_{suffix}.csv.bz2' 50 | cache_path = os.path.join(self.cache, cache_name) 51 | if not os.path.exists(cache_path): 52 | r = requests.get(f'http://www.omdb.org/data/{name}.csv.bz2') 53 | with open(cache_path, 'wb') as f: 54 | f.write(r.content) 55 | 56 | # Parse compressed CSV dataset 57 | with open(cache_path, 'rb') as f: 58 | data = bz2.decompress(f.read()) 59 | text = data.decode('utf-8') 60 | return csv.DictReader(io.StringIO(text), delimiter=',') 61 | 62 | def query(self, name, year=None): 63 | results = self.ix.search(name) 64 | if not results: 65 | return None 66 | movie = self.ix.get_document(results[0]) 67 | return [{ 68 | 'name': name, 69 | 'oname': movie.get('name'), 70 | 'year': movie.get('year'), 71 | }] 72 | -------------------------------------------------------------------------------- /curator/databases/tmdb.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import math 3 | import os 4 | import sqlite3 5 | 6 | import arrow 7 | import pandas as pd 8 | import requests 9 | from textdistance import levenshtein 10 | 11 | from curator import Database 12 | 13 | class TmdbDatabase(Database): 14 | def __init__(self): 15 | super().__init__("tmdb") 16 | 17 | # Get movie IDs from TMDB, and cache them 18 | suffix = arrow.utcnow().shift(days=-1).format('MM_DD_YYYY') 19 | cache_name = f'movie_ids_{suffix}.json.gz' 20 | cache_path = os.path.join(self.cache, cache_name) 21 | if not os.path.exists(cache_path): 22 | r = requests.get(f'http://files.tmdb.org/p/exports/{cache_name}') 23 | with open(cache_path, 'wb') as f: 24 | f.write(r.content) 25 | 26 | # Parse movie IDs table 27 | with open(cache_path, 'rb') as f: 28 | data = gzip.decompress(f.read()) 29 | text = data.decode('utf-8') 30 | df = pd.read_json(text, lines=True) 31 | 32 | # Convert to FTS5-enabled SQLite databse 33 | db = sqlite3.connect(':memory:') 34 | db.execute('CREATE VIRTUAL TABLE movie_ids USING fts5(id, original_title, 
popularity, adult, video);') 35 | df.to_sql('movie_ids', db, if_exists='append', index=False) 36 | self.db = db 37 | 38 | def get_year(self, id): 39 | return None # TODO 40 | 41 | def query_exact(self, name, year=None): 42 | results = self.db.execute( 43 | f'''SELECT original_title, popularity, id FROM movie_ids 44 | WHERE original_title = "{name}" 45 | ORDER BY popularity''').fetchall() 46 | if year: 47 | results = list(filter(lambda r: year == self.get_year(r[2]), results)) 48 | if not results: 49 | return None 50 | r = max(results, key=lambda r: r[1]) 51 | return { 52 | 'name': r[0], 53 | 'year': self.get_year(r[2]), 54 | } 55 | 56 | def query_fuzzy(self, name, year=None): 57 | results = self.db.execute( 58 | f'''SELECT original_title, popularity, id FROM movie_ids 59 | WHERE original_title MATCH "{name}" AND popularity >= 0.61 60 | ORDER BY popularity''').fetchall() 61 | def score(record): 62 | original_title, popularity = record[:2] 63 | distance = levenshtein.distance(original_title, name) 64 | if distance == 0: 65 | return math.inf 66 | return popularity * 1/distance 67 | r = max(results, key=score) 68 | return [{ 69 | 'name': r[0], 70 | 'year': self.get_year(r[2]), 71 | }] 72 | 73 | def query(self, name, year=None): 74 | match = self.query_exact(name, year) 75 | if match: 76 | return match 77 | match = self.query_fuzzy(name, year) 78 | if match: 79 | return match 80 | return None 81 | -------------------------------------------------------------------------------- /scripts/generate_screenshots.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import glob 4 | import os 5 | import shutil 6 | import subprocess 7 | import tempfile 8 | 9 | # Configuration 10 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 11 | PROJECT_ROOT = os.path.join(SCRIPT_DIR, "..") 12 | PROJECT_DOCS = os.path.join(PROJECT_ROOT, "docs") 13 | 14 | # Data 15 | # TODO: Generate the screenshots from actual output 
16 | TERM_CURATOR_MERGE = '''$ curator merge -f mkv ./movies/The* 17 | ┌───┬──────────────────────────────────┬───┬────────────────────────────────┐ 18 | │ # │ Inputs │ → │ Output │ 19 | ├───┼──────────────────────────────────┼───┼────────────────────────────────┤ 20 | │ 1 │ The Social Network (2010).mkv │ → │ The Social Network (2010).mkv │ 21 | │ │ The Social Network (2010).es.ac3 │ ↗ │ │ 22 | │ │ The Social Network (2010).en.srt │ ↗ │ │ 23 | │ │ The Social Network (2010).es.srt │ ↗ │ │ 24 | │ │ The Social Network (2010).de.srt │ ↗ │ │ 25 | │ 2 │ There Will Be Blood (2007).mp4 │ → │ There Will Be Blood (2007).mkv │ 26 | │ │ There Will Be Blood (2007).srt │ ↗ │ │ 27 | └───┴──────────────────────────────────┴───┴────────────────────────────────┘ 28 | Continue? (y/N) ''' 29 | 30 | TERM_CURATOR_RENAME = '''$ curator rename -f "@name (@year).@ext" ./downloads/* 31 | ┌───┬────────────────────────────────────────────────────┬───┬─────────────────────────────────┐ 32 | │ # │ Old │ → │ New │ 33 | ├───┼────────────────────────────────────────────────────┼───┼─────────────────────────────────┤ 34 | │ 1 │ 10000.BC.2008_HDRip_[scarabey.org].mp4 │ → │ 10000 BC (2008).mp4 │ 35 | │ 2 │ 2001.A.Space.Odyssey.1968.BluRay.x264-[YTS.AM].mp4 │ → │ 2001 A Space Odyssey (1968).mp4 │ 36 | │ 3 │ Jacobs.Ladder.1990.720p.BluRay.x264.YIFY.mkv │ → │ Jacobs Ladder (1990).mkv │ 37 | │ 4 │ Venom.2018.HDTS.XViD.AC3-ETRG.mkv │ → │ Venom (2018).mkv │ 38 | └───┴────────────────────────────────────────────────────┴───┴─────────────────────────────────┘ 39 | Continue? (y/N) ''' 40 | 41 | TERM_CURATOR_TAG = '''$ curator tag -s audio -t language --only-macrolanguages . 
42 | ┌───┬────────────────────────────────┬────────┬─────┬───┬─────┐ 43 | │ # │ Name │ Stream │ Old │ → │ New │ 44 | ├───┼────────────────────────────────┼────────┼─────┼───┼─────┤ 45 | │ 1 │ El Bola (2000).avi │ 1 │ │ → │ spa │ 46 | │ 2 │ Perfect Blue (1997).mkv │ 1 │ │ → │ jpn │ 47 | │ 3 │ Perfect Blue (1997).mkv │ 2 │ │ → │ eng │ 48 | │ 4 │ Saving Private Ryan (1998).mp4 │ 1 │ │ → │ eng │ 49 | │ 5 │ The Innocents (2021).mkv │ 1 │ │ → │ nor │ 50 | │ 6 │ Three-Body (2023) - S01E01.mkv │ 1 │ chi │ → │ zho │ 51 | └───┴────────────────────────────────┴────────┴─────┴───┴─────┘ 52 | Continue? (y/N) ''' 53 | 54 | def termtosvg(text, output): 55 | term_w = 100 56 | term_h = text.count('\n') + 1 57 | cmd = ['termtosvg'] 58 | cmd += ['-t', 'window_frame'] 59 | cmd += ['--screen-geometry', f'{term_w}x{term_h}'] 60 | cmd += ['--still-frames'] 61 | cmd += ['--command', f'echo -n -e {repr(text)}'] 62 | with tempfile.TemporaryDirectory() as tmp: 63 | cmd += [tmp] 64 | result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) 65 | assert(result.returncode == 0) 66 | files = glob.glob(os.path.join(tmp, '*')) 67 | last = sorted(files)[-1] 68 | shutil.move(last, output) 69 | 70 | def main(): 71 | termtosvg(TERM_CURATOR_MERGE, 72 | os.path.join(PROJECT_DOCS, 'images/curator-merge.svg')) 73 | termtosvg(TERM_CURATOR_RENAME, 74 | os.path.join(PROJECT_DOCS, 'images/curator-rename.svg')) 75 | termtosvg(TERM_CURATOR_TAG, 76 | os.path.join(PROJECT_DOCS, 'images/curator-tag.svg')) 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Curator 2 | ======= 3 | 4 | [![ci-badge](https://github.com/AlexAltea/curator/actions/workflows/ci.yml/badge.svg)](https://github.com/AlexAltea/curator/actions/workflows/ci.yml) 5 | 6 | Automated normalization and curating of media collections. 
Written in Python 3.x. 7 | 8 | Curator is a collection of stateless CLI tools, following the [Unix philosophy](https://en.wikipedia.org/wiki/Unix_philosophy), to organize large collections of heterogeneous media. Each tool creates a *plan* made of *tasks* with clearly defined input and output files, which the user can optionally review before applying. 9 | 10 | Install the package via: 11 | 12 | ```sh 13 | pip install git+https://github.com/AlexAltea/curator.git 14 | ``` 15 | 16 | ## Credits 17 | 18 | Acknowledgements to people who contributed code/ideas to the project: 19 | 20 | - [Victor Garcia Herrero](https://github.com/VictGH): Mathematician, Machine Learning expert and tamer of scoring functions. 21 | 22 | ## Features 23 | 24 | Curator can automatically rename and link media files, edit container metadata, remux and merge streams. Reducing manual labor and achieve reliable results across different media from potentially different sources, some tools rely on signal processing and machine learning (e.g. [Whisper](https://openai.com/blog/whisper/), [LangID](https://github.com/saffsd/langid.py)). 25 | 26 | Highlighted use cases (current and planned): 27 | 28 | - [x] Filter media by container and stream metadata (all). 29 | - [x] Rename files based on existing filenames ([`curator-rename`](#rename)). 30 | - [x] Merge streams from multiple related containers ([`curator-merge`](#merge)). 31 | - [x] Detect audio/subtitle language from sound and text data ([`curator-tag`](#tag)). 32 | - [ ] Rename files based on existing metadata and databases ([`curator-rename`](#rename)). 33 | - [ ] Synchronize audio/subtitle streams ([`curator-merge`](#merge) and [`curator-sync`](#sync)). 34 | - [ ] Remove scene banners from subtitles ([`curator-clean`](#clean)). 35 | - [ ] Detect watermarks in video streams ([`curator-clean`](#clean) and [`curator-merge`](#merge)). 36 | - [ ] Select highest quality audio/video streams ([`curator-merge`](#merge)). 
37 | 38 | Below you can find a description and examples of all tools provided by Curator: 39 | 40 | ### Auto 41 | 42 | ```mermaid 43 | flowchart LR 44 | Convert --> Merge --> Sync --> Tag --> Rename 45 | ``` 46 | 47 | ### Merge 48 | 49 | Merges all streams with identical names into a single container, except for: 50 | 51 | - Video streams, if one already exists. 52 | - Audio streams, if one with the same `language` tag already exists. 53 | 54 | Requires all video containers to be MKV. 55 | 56 | ![example-curator-merge](./docs/images/curator-merge.svg) 57 | 58 | ### Rename 59 | 60 | Update filenames according to a pattern made of the following variables: 61 | 62 | | Key | Description | 63 | |----------|-------------| 64 | | `@ext` | File extension of the input media. | 65 | | `@dbid` | When using a database, the ID of the match, e.g. `imdbid-tt12345678`. | 66 | | `@name` | Localized name of the media. | 67 | | `@oname` | Original name of the media (needs database). | 68 | | `@tags` | Tags present in the input media filename enclosed by square brackets, if any. | 69 | | `@year` | Year the media was released. | 70 | 71 | ![example-curator-rename](./docs/images/curator-rename.svg) 72 | 73 | ### Sync 74 | 75 | Synchronize streams via data cross-correlation. 76 | 77 | Every synchronization task involves (A) a reference stream, and (B) the stream we want to synchronize. We name this relationship as *A ← B*. Curator can only handle the following types of synchronization tasks: 78 | 79 | - [ ] *Video ← Audio*:\ 80 | Comparing lip movement timestamps with ASR timestamps. 81 | - [ ] *Audio ← Audio*:\ 82 | Comparing sound data. 83 | - [ ] *Audio ← Subtitle*:\ 84 | Comparing ASR timestamps with uniquely matching text timestamps. 85 | - [ ] *Subtitle ← Subtitle*:\ 86 | Comparing text timestamps. 87 | 88 | The synchronization plan (`SyncPlan`) will create a tree of synchronization tasks (`SyncTask`) for every media file it processes. 
For example, with an input `Media("movie.mkv")` with streams: `#0` (video), `#1` (audio:eng), `#2` (audio:spa), `#3` (subtitle:eng), `#4` (subtitle:spa), it will genarate the following sync proposals: 89 | 90 | 1. `#0` ← `#1` 91 | 2. `#1` ← `#2` 92 | 3. `#1` ← `#3` 93 | 4. `#3` ← `#4` 94 | 95 | ### Tag 96 | 97 | ![example-curator-tag](./docs/images/curator-tag.svg) 98 | -------------------------------------------------------------------------------- /curator/plans/rename.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from curator.analysis import * 5 | from curator.databases import * 6 | from curator import Plan, Task, Media 7 | 8 | class RenamePlan(Plan): 9 | def columns(self): 10 | return [ 11 | { 'name': 'Old', 'width': '50%' }, 12 | { 'name': 'Source', 'width': '8' }, 13 | { 'name': '→', 'width': '1' }, 14 | { 'name': "New", 'width': '50%' }, 15 | ] 16 | 17 | class RenameTask(Task): 18 | def __init__(self, input, name, source, alternatives=[]): 19 | super().__init__([input]) 20 | self.update_output(name) 21 | self.source = source 22 | self.alternatives = alternatives 23 | 24 | def update_output(self, name): 25 | input = self.inputs[0] 26 | output_path = os.path.join(os.path.dirname(input.path), name) 27 | output_media = Media(output_path, Media.TYPE_FILE) 28 | self.outputs = [output_media] 29 | 30 | def view(self): 31 | name_input = self.inputs[0].name 32 | name_output = self.outputs[0].name 33 | return [(name_input, self.source, "→", name_output)] 34 | 35 | def apply(self): 36 | src = self.inputs[0].path 37 | dst = self.outputs[0].path 38 | if not os.path.exists(dst): 39 | os.rename(src, dst) 40 | 41 | def normalize(filename): 42 | replacements = [ 43 | (r'([\w!]): ', r'\1 - '), # Remove colons when used as separators 44 | (r'\.\.\.', r''), # Remove ellipsis 45 | (r' vs\. 
def plan_rename(media, format, db=None):
    """Build a RenamePlan that renames `media` files according to `format`.

    Args:
        media: Iterable of Media objects to consider for renaming.
        format: Filename pattern using @name/@oname/@year/@dbid/@ext/@tags placeholders.
        db: Optional movie database used to refine the detected name/year and to
            provide @dbid/@oname values plus alternative matches.

    Returns:
        A RenamePlan with one RenameTask per file whose name would change.
    """
    plan = RenamePlan()
    for m in media:
        # Detect name, year and tags from the existing filename
        name = detect_name(m.name)
        year = detect_year(m.name)
        tags = detect_tags(m.name)
        dbid = None
        oname = None
        source = "analysis"
        # Fix: `entries` must always be bound. Previously it was only assigned by
        # the walrus inside `if db and (entries := ...)`, so with no database it
        # was undefined (NameError below), and a None query result leaked through
        # to `map()` (TypeError).
        entries = (db.query(name, year) if db else None) or []
        if entries:
            entry = entries[0]
            name = entry.get('name')
            year = entry.get('year')
            dbid = entry.get('dbid')
            oname = entry.get('oname')
            source = db.name
        if '@name' in format and not name:
            logging.warning(f"Could not rename: {m.name} (name not detected)")
            continue
        if '@year' in format and not year:
            logging.warning(f"Could not rename: {m.name} (year not detected)")
            continue
        if '@dbid' in format and not dbid:
            logging.warning(f"Could not rename: {m.name} (database id not detected)")
            continue
        if '@oname' in format and not oname:
            logging.warning(f"Could not rename: {m.name} (original name not found)")
            continue

        # Generate the new filename, reusing format_entry instead of duplicating
        # the placeholder-substitution logic inline.
        best = {'name': name, 'year': year, 'dbid': dbid, 'oname': oname}
        filename = format_entry(format, best, tags, m.ext.lower())

        if filename != m.name:
            alternatives = [format_entry(format, e, tags, m.ext.lower()) for e in entries]
            task = RenameTask(m, filename, source, alternatives)
            plan.add_task(task)
    return plan
(y/N) 47 | -------------------------------------------------------------------------------- /docs/images/curator-rename.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 18 | 41 | 42 | 43 | 44 | 45 | 46 | $ curator rename -f "@name (@year).@ext" ./downloads/*┌───┬────────────────────────────────────────────────────┬───┬─────────────────────────────────┐│ # │ Old │ → │ New │├───┼────────────────────────────────────────────────────┼───┼─────────────────────────────────┤│ 1 │ 10000.BC.2008_HDRip_[scarabey.org].mp4 │ → │ 10000 BC (2008).mp4 ││ 2 │ 2001.A.Space.Odyssey.1968.BluRay.x264-[YTS.AM].mp4 │ → │ 2001 A Space Odyssey (1968).mp4 ││ 3 │ Jacobs.Ladder.1990.720p.BluRay.x264.YIFY.mkv │ → │ Jacobs Ladder (1990).mkv ││ 4 │ Venom.2018.HDTS.XViD.AC3-ETRG.mkv │ → │ Venom (2018).mkv │└───┴────────────────────────────────────────────────────┴───┴─────────────────────────────────┘Continue? (y/N) 47 | -------------------------------------------------------------------------------- /docs/images/curator-merge.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 18 | 41 | 42 | 43 | 44 | 45 | 46 | $ curator merge -f mkv ./movies/The*┌───┬──────────────────────────────────┬───┬────────────────────────────────┐│ # │ Inputs │ → │ Output │├───┼──────────────────────────────────┼───┼────────────────────────────────┤│ 1 │ The Social Network (2010).mkv │ → │ The Social Network (2010).mkv ││ │ The Social Network (2010).es.ac3 │ ↗ │ ││ │ The Social Network (2010).en.srt │ ↗ │ ││ │ The Social Network (2010).es.srt │ ↗ │ ││ │ The Social Network (2010).de.srt │ ↗ │ ││ 2 │ There Will Be Blood (2007).mp4 │ → │ There Will Be Blood (2007).mkv ││ │ There Will Be Blood (2007).srt │ ↗ │ │└───┴──────────────────────────────────┴───┴────────────────────────────────┘Continue? 
(y/N) 47 | -------------------------------------------------------------------------------- /curator/plans/tag.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import os 3 | import subprocess 4 | import tempfile 5 | import shutil 6 | 7 | from curator import Plan, Task, Media 8 | from curator.util import * 9 | 10 | class TagPlan(Plan): 11 | def columns(self): 12 | return [ 13 | { 'name': 'Name', 'width': '50%' }, 14 | { 'name': 'Stream', 'width': '6' }, 15 | { 'name': "Old", 'width': '25%' }, 16 | { 'name': '→', 'width': '1' }, 17 | { 'name': "New", 'width': '25%' }, 18 | ] 19 | 20 | def optimize(self): 21 | tasks = [] 22 | last_id = 0 23 | last_task = None 24 | for task in self.tasks: 25 | if last_task and last_task.inputs == task.inputs: 26 | last_task.combine(task) 27 | else: 28 | last_id += 1 29 | last_task = task 30 | tasks.append(task) 31 | task.id = last_id 32 | self.tasks = tasks 33 | 34 | class TagTask(Task): 35 | TagUpdate = collections.namedtuple('TagUpdate', ('index', 'tag', 'old', 'new')) 36 | 37 | def __init__(self, input): 38 | super().__init__([input], []) 39 | self.updates = [] 40 | self.use_mkvpropedit = False 41 | self.path_mkvpropedit = None 42 | 43 | def combine(self, other): 44 | super().combine(other) 45 | self.updates += other.updates 46 | 47 | def view(self): 48 | rows = [] 49 | for update in self.updates: 50 | rows.append([self.inputs[0].name, str(update.index), str(update.old), "→", str(update.new)]) 51 | return rows 52 | 53 | def add_update(self, index, tag, old, new=None): 54 | self.updates.append(self.TagUpdate(index, tag, old, new)) 55 | 56 | def apply(self): 57 | if self.use_mkvpropedit and self.path_mkvpropedit: 58 | self.apply_with_mkvpropedit() 59 | return 60 | 61 | m = self.inputs[0] 62 | cmd = ['ffmpeg'] 63 | cmd += ['-i', m.path] 64 | cmd += ['-c:v', 'copy'] 65 | cmd += ['-c:a', 'copy'] 66 | cmd += ['-c:s', 'copy'] 67 | cmd += ['-map', '0'] 68 | cmd += 
def plan_tag(media, stype, tag, value=None, skip_tagged=False, opts=None):
    """Build a TagPlan that sets `tag` on audio/subtitle streams of `media`.

    Args:
        media: Iterable of Media objects to process.
        stype: Stream type filter: 'audio', 'subtitle' or 'all'.
        tag: Name of the tag to update (e.g. 'language').
        value: Explicit new value; when None the value is detected per stream.
        skip_tagged: Skip streams that already carry a value for `tag`.
        opts: Extra options forwarded to the per-stream detector.

    Returns:
        A TagPlan containing one TagTask per stream (combined later by optimize()).
    """
    path_mkvpropedit = find_executable('mkvpropedit', [
        'C:/Program Files/MKVToolNix/mkvpropedit.exe',
        'C:/Program Files (x86)/MKVToolNix/mkvpropedit.exe',
    ])
    plan = TagPlan()
    for m in media:
        # Skip files with formats that do not support tagging
        if m.is_format('subviewer'):
            continue

        for stream in m.get_streams():
            # Filter streams and get old tag value
            stream_info = stream.get_info()
            if stream_info['codec_type'] not in ('audio', 'subtitle'):
                continue
            if stype != 'all' and stream_info['codec_type'] != stype:
                continue
            # Tags may be stored with arbitrary casing depending on the muxer.
            stream_value = stream_info['tags'].get(tag) or \
                stream_info['tags'].get(tag.lower()) or \
                stream_info['tags'].get(tag.upper())
            if skip_tagged and stream_value is not None:
                continue

            # Create tag update task
            task = TagTask(m)
            if m.is_format('avi'):
                task.add_warning("Modifying AVI metadata might affect stream synchronization.")
                # Fix: was `tag == 'languge'` (typo), so this out-of-range error
                # for RIFF IASx tags was never reported.
                if tag == 'language' and stream.get_info()['codec_type'] == 'audio' and stream.audio_index() > 8:
                    task.add_error("Cannot change AVI audio stream using IASx tags. Index out of range.")
            if m.is_format('matroska'):
                task.use_mkvpropedit = True
                task.path_mkvpropedit = path_mkvpropedit
            old_value = stream_value
            new_value = value if value is not None else tag_value(stream, tag, opts)
            if old_value != new_value and new_value is not None:
                task.add_update(stream.index, tag, old_value, new_value)
            plan.add_task(task)
    return plan
    def __init__(self, cache_days):
        """Open (or build) the local IMDb movie index.

        Args:
            cache_days: Reuse a cached index up to this many days old before
                rebuilding from freshly downloaded IMDb datasets.
        """
        super().__init__("imdb")

        # Common suffix
        today = arrow.utcnow().format('YYYY-MM-DD')

        # Check IMDb terms
        # Acceptance is recorded under a filename keyed by a hash of the
        # disclaimer text, so editing the disclaimer re-triggers the prompt.
        terms_hash = hashlib.sha256(IMDB_DISCLAIMER.encode('utf-8')).hexdigest()[:32]
        terms_path = os.path.join(self.cache, f'terms_{terms_hash}')
        if not os.path.exists(terms_path):
            print(IMDB_DISCLAIMER)
            if confirm("Do you agree/confirm this?", default='no'):
                with open(terms_path, 'w') as f:
                    f.write(f'```\n{IMDB_DISCLAIMER}```\n\nAgreed on {today}.\n')
            else:
                print("Curator cannot continue without accepting the IMDb terms.")
                exit(0)
        print(IMDB_ACKNOWLEDGEMENT)

        # Check if cached index exists
        for day in list(range(cache_days)) + [0]:
            day = arrow.utcnow().shift(days=-day).format('YYYY-MM-DD')
            cache_name = f'index_milli_{day}'
            cache_path = os.path.join(self.cache, cache_name)
            if os.path.exists(cache_path):
                logging.info(f"Using cached movie index from {day}")
                self.ix = milli.Index(cache_path, 4*1024*1024*1024) # 4 GiB
                return
        # NOTE: the trailing `+ [0]` above guarantees the loop's last iteration
        # is day 0, so `cache_path` below points at today's index directory.

        # Otherwise create one
        logging.info("Creating movie index...")
        title_akas = self.get_imdb_dataset('title.akas')
        title_basics = self.get_imdb_dataset('title.basics')
        title_ratings = self.get_imdb_dataset('title.ratings')
        movies = {}
        # Base records: keep only movies with a known release year
        # ('\N' marks a missing value in IMDb TSV dumps).
        for row in title_basics:
            if row['titleType'] != 'movie' or row['startYear'] == '\\N':
                continue
            movie_id = row['tconst']
            movies[movie_id] = {
                'id': movie_id,
                'name': row['primaryTitle'],
                'year': row['startYear'],
                'akas': [],
                'votes': 0,
            }
        # Attach alternative titles and vote counts to the known movies;
        # rows referencing non-movie titles are skipped.
        for row in title_akas:
            movie_id = row['titleId']
            movie = movies.get(movie_id)
            if movie is None:
                continue
            movie['akas'].append(row['title'])
        for row in title_ratings:
            movie_id = row['tconst']
            movie = movies.get(movie_id)
            if movie is None:
                continue
            movie['votes'] = int(row['numVotes'])
        os.mkdir(cache_path)
        self.ix = milli.Index(cache_path, 4*1024*1024*1024) # 4 GiB
        self.ix.add_documents(list(movies.values()))
math.log10(movie['votes'] + 1) 126 | movie['score'] = popularity - distance 127 | # Return closest matches 128 | movies = [{ 129 | 'name': name, 130 | 'oname': movie.get('name'), 131 | 'year': movie.get('year'), 132 | 'dbid': 'imdbid-' + movie.get('id'), 133 | } for movie in sorted(movies, key=lambda m: m['score'], reverse=True)][:10] 134 | return movies 135 | 136 | def query_id(self, name): 137 | match = re.search(r'tt\d{7,9}', name) 138 | if not match: 139 | return None 140 | id = match.group() 141 | results = self.ix.search(name) 142 | if not results: 143 | return None 144 | movies = self.ix.get_documents(results) 145 | for movie in movies: 146 | if movie.get('id') == id: 147 | return [{ 148 | 'name': name, 149 | 'oname': movie.get('name'), 150 | 'year': movie.get('year'), 151 | 'dbid': 'imdbid-' + movie.get('id'), 152 | }] 153 | return None 154 | -------------------------------------------------------------------------------- /curator/plans/convert.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import subprocess 4 | import tempfile 5 | 6 | from curator import Plan, Task, Media 7 | from curator.util import flatten 8 | 9 | class ConvertPlan(Plan): 10 | def columns(self): 11 | return [ 12 | { 'name': 'Inputs', 'width': '50%' }, 13 | { 'name': '→', 'width': '1' }, 14 | { 'name': "Output", 'width': '50%' }, 15 | ] 16 | 17 | class ConvertTask(Task): 18 | def __init__(self, input, output, format, delete=False): 19 | super().__init__([input], [output]) 20 | assert(output.type == Media.TYPE_FILE) 21 | self.format = format 22 | self.delete = delete 23 | self.fflags = set() 24 | self.cflags = set() 25 | self.mflags = set() 26 | self.unpack_bframes = False 27 | self.skip_subtitles = False 28 | 29 | def view(self): 30 | return [(self.inputs[0].name, "→", self.outputs[0].name)] 31 | 32 | def apply(self): 33 | temp = None 34 | # Solve conflict when -fflags +genpts and -bsf:v mpeg4_unpack_bframes are 
both enabled 35 | input_media = self.inputs[0].path 36 | if self.unpack_bframes and '+genpts' in self.fflags: 37 | temp = tempfile.TemporaryDirectory(dir=self.inputs[0].dir, prefix='.temp-curator-') 38 | fixed_media = os.path.join(temp.name, "media.avi") 39 | cmd = ['ffmpeg'] 40 | cmd += ['-i', input_media] 41 | cmd += ['-c:v', 'copy'] 42 | cmd += ['-c:a', 'copy'] 43 | cmd += ['-bsf:v', 'mpeg4_unpack_bframes'] 44 | cmd += ['-map', '0'] 45 | if self.skip_subtitles: 46 | cmd += ['-map', '-0:s'] 47 | cmd += ['-map_metadata', '0'] 48 | cmd += ['-movflags', 'use_metadata_tags'] 49 | cmd += [fixed_media] 50 | result = subprocess.run(cmd, capture_output=True) 51 | if result.returncode != 0: 52 | errors = result.stderr.decode('utf-8') 53 | raise Exception(f"Failed to generate PTS for {self.inputs[0].name} with ffmpeg:\n{errors}") 54 | input_media = fixed_media 55 | 56 | # Build ffmpeg command 57 | cmd = ['ffmpeg'] 58 | if self.fflags: 59 | cmd += ['-fflags', ''.join(self.fflags)] 60 | cmd += ['-i', input_media] 61 | cmd += ['-c:v', 'copy'] 62 | cmd += ['-c:a', 'copy'] 63 | cmd += ['-c:s', 'copy'] 64 | cmd += ['-c:d', 'copy'] 65 | cmd += ['-c:t', 'copy'] 66 | if self.cflags: 67 | cmd += flatten(self.cflags) 68 | if self.unpack_bframes and '+genpts' not in self.fflags: 69 | cmd += ['-bsf:v', 'mpeg4_unpack_bframes'] 70 | cmd += ['-map', '0'] 71 | if self.skip_subtitles: 72 | cmd += ['-map', '-0:s'] 73 | if self.mflags: 74 | cmd += flatten(self.mflags) 75 | cmd += ['-map_metadata', '0'] 76 | cmd += ['-movflags', 'use_metadata_tags'] 77 | 78 | # Create output file 79 | output = self.outputs[0].path 80 | cmd += [output] 81 | result = subprocess.run(cmd, capture_output=True) 82 | if temp: 83 | temp.cleanup() 84 | if result.returncode != 0: 85 | if os.path.exists(output): 86 | os.remove(output) 87 | errors = result.stderr.decode('utf-8') 88 | raise Exception(f"Failed to convert to {output} with ffmpeg:\n{errors}") 89 | if self.delete: 90 | os.remove(self.inputs[0].path) 91 | 92 | 
def plan_convert(media, format, delete=False):
    """Build a ConvertPlan that remuxes each file in `media` to `format`.

    Args:
        media: Iterable of Media objects to convert.
        format: Target container extension (e.g. 'mkv').
        delete: Delete the input file after a successful conversion.

    Returns:
        A ConvertPlan with one ConvertTask per file whose output does not exist yet.
    """
    plan = ConvertPlan()
    for m in media:
        root, ext = os.path.splitext(m.path)
        output_path = f'{root}.{format}'
        if os.path.exists(output_path):
            logging.debug(f'Skipping existing output: {output_path}')
            continue
        output_media = Media(output_path, Media.TYPE_FILE)
        task = ConvertTask(m, output_media, format, delete)

        # Tweaks for mismatching formats (hoist the repeated format lookup)
        format_name = m.get_info()['format_name']
        if format_name in ('avi', 'ogg') and m.has_video():
            task.add_warning('Media contains packets without PTS data.')
            task.add_fflag('+genpts')
        if format_name == 'avi' and m.has_video_codec('h264'):
            task.add_error('AVI contains H264 stream. Unpacking is required, but not supported.')
        if format_name == 'avi' and m.has_subtitle():
            task.skip_subtitles = True
            task.add_warning('AVI contains subtitles. Conversion is not supported.')
        if m.has_packed_bframes():
            task.unpack_bframes = True
            task.add_warning('Media contains packed B-frames. Unpacking is required.')
        if format == 'mkv':
            # Merged the two duplicated `if format == 'mkv'` stream loops.
            for stream in m.get_streams():
                stream_info = stream.get_info()
                codec_type = stream_info['codec_type']
                # Reencode MP4/TX3G to MKV/SRT
                if codec_type == "subtitle" and stream_info['codec_name'] == "mov_text":
                    task.add_warning(f'Conversion requires reencoding {stream}. Styles will be removed.')
                    task.add_cflag(('-c:s', 'text'))
                # Drop MP4/TMCD
                # Fix: the three warnings below were plain strings missing the
                # `f` prefix, so they printed a literal "{stream}".
                if codec_type == "data" and stream_info['tags'].get('handler_name') == "Time Code Media Handler":
                    task.add_warning(f'Chapters have been included in {stream}. Stream will be dropped.')
                    task.add_mflag(('-map', f'-0:{stream.index}'))
                if codec_type == "data" and stream_info['tags'].get('handler_name') == "SubtitleHandler":
                    task.add_warning(f'Chapters have been included in {stream}. Stream will be dropped, but chapters might be carried over by ffmpeg.')
                    task.add_mflag(('-map', f'-0:{stream.index}'))
                # Drop binary data
                if codec_type == "data" and stream_info['codec_name'] == "bin_data":
                    task.add_warning(f'Binary data has been included in {stream}. Stream will be dropped.')
                    task.add_mflag(('-map', f'-0:{stream.index}'))
        plan.add_task(task)
    return plan
has_video_ext(self): 47 | return self.ext in VIDEO_EXTENSIONS 48 | 49 | def has_audio_ext(self): 50 | return self.ext in AUDIO_EXTENSIONS 51 | 52 | def has_subtitle_ext(self): 53 | return self.ext in TEXTS_EXTENSIONS 54 | 55 | def has_video(self): 56 | return any(map(lambda s: s.is_video(), self.get_streams())) 57 | 58 | def has_audio(self): 59 | return any(map(lambda s: s.is_audio(), self.get_streams())) 60 | 61 | def has_subtitle(self): 62 | return any(map(lambda s: s.is_subtitle(), self.get_streams())) 63 | 64 | def has_video_codec(self, codec_name): 65 | return any(map(lambda s: s.is_video() and s.get_info()['codec_name'] == codec_name, 66 | self.get_streams())) 67 | 68 | def has_subtitle_codec(self, codec_name): 69 | return any(map(lambda s: s.is_subtitle() and s.get_info()['codec_name'] == codec_name, 70 | self.get_streams())) 71 | 72 | def has_packed_bframes(self): 73 | if self.get_info()['format_name'] != 'avi': 74 | return False 75 | for stream in self.get_streams(): 76 | if stream.is_video() and stream.has_packed_bframes(): 77 | return True 78 | return False 79 | 80 | def is_format(self, name): 81 | return name in self.get_info()['format_name'].split(',') 82 | 83 | def get_info(self): 84 | if self.info: 85 | return self.info 86 | cmd = ['ffprobe', self.path] 87 | cmd += ['-show_format'] 88 | cmd += ['-of', 'json'] 89 | result = subprocess.run(cmd, capture_output=True) 90 | if result.returncode != 0: 91 | errors = result.stderr.decode('utf-8') 92 | raise Exception(f"Failed to get info from {self.path} with ffmpeg:\n{errors}") 93 | output = result.stdout.decode('utf-8') 94 | self.info = json.loads(output)['format'] 95 | return self.info 96 | 97 | def get_packets(self): 98 | if self.packets: 99 | return self.packets 100 | cmd = ['ffprobe', self.path] 101 | cmd += ['-show_packets'] 102 | cmd += ['-of', 'json'] 103 | result = subprocess.run(cmd, capture_output=True) 104 | if result.returncode != 0: 105 | errors = result.stderr.decode('utf-8') 106 | raise 
Exception(f"Failed to get packets from {self.path} with ffmpeg:\n{errors}") 107 | output = result.stdout.decode('utf-8') 108 | self.packets = json.loads(output)['packets'] 109 | return self.packets 110 | 111 | def get_streams(self): 112 | if self.streams is not None: 113 | return self.streams 114 | 115 | # Obtain information about streams within media 116 | cmd = ['ffprobe', self.path] 117 | cmd += ['-show_streams'] 118 | cmd += ['-of', 'json'] 119 | result = subprocess.run(cmd, capture_output=True) 120 | if result.returncode != 0: 121 | errors = result.stderr.decode('utf-8') 122 | raise Exception(f"Failed get info from {self.path} with ffmpeg:\n{errors}") 123 | output = result.stdout.decode('utf-8') 124 | streams_info = json.loads(output)['streams'] 125 | 126 | # Create and return stream objects 127 | streams = [] 128 | for stream_info in streams_info: 129 | stream_info.setdefault('tags', {}) 130 | stream = Stream(self, stream_info['index'], stream_info) 131 | streams.append(stream) 132 | self.streams = streams 133 | return streams 134 | 135 | def num_streams(): 136 | return len(get_streams()) 137 | 138 | 139 | def parse_query(query): 140 | lhs, rhs = query.split('=') 141 | path = lhs.split('.') 142 | return { 'lhs_path': path, 'op': operator.eq, 'rhs_value': rhs } 143 | 144 | def filter_streams(streams, query): 145 | results = [] 146 | query = parse_query(query) 147 | for stream in streams: 148 | try: 149 | lhs = functools.reduce(dict.get, query['lhs_path'], stream.get_info()) 150 | except TypeError: 151 | continue 152 | rhs = query['rhs_value'] 153 | if query['op'](lhs, rhs): 154 | results.append(stream) 155 | return results 156 | 157 | def filter_check(media, queries): 158 | if not queries: 159 | return True 160 | streams = media.get_streams() 161 | for query in queries: 162 | streams = filter_streams(streams, query) 163 | if len(streams) == 0: 164 | return False 165 | return True 166 | 167 | def media_input(paths, recursive=False, queries=[]): 168 | media = [] 
169 | for path in paths: 170 | # Add files 171 | if os.path.isfile(path): 172 | m = Media(path) 173 | if filter_check(m, queries): 174 | media.append(m) 175 | # Add directories 176 | elif os.path.isdir(path): 177 | path = os.path.join(path, '*') 178 | for path in glob.glob(path, recursive=recursive): 179 | if os.path.isfile(path): 180 | m = Media(path) 181 | if filter_check(m, queries): 182 | media.append(m) 183 | # Add wildcards (needed for Windows) 184 | elif '*' in path: 185 | for path in glob.glob(path, recursive=recursive): 186 | if os.path.isfile(path): 187 | m = Media(path) 188 | if filter_check(m, queries): 189 | media.append(m) 190 | return media 191 | -------------------------------------------------------------------------------- /curator/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import pathlib 7 | import sys 8 | 9 | import curator 10 | from curator.databases import get_database 11 | from curator.util import confirm 12 | 13 | # Helpers 14 | def curator_argparser(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('input', nargs='+', type=str) 17 | parser.add_argument('--log', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], default='WARNING') 18 | parser.add_argument('-q', '--query', action='append', help="metadata filter(s), e.g. 
def curator_apply_plan(plan):
    """Optimize and execute the plan, then report the outcome per task."""
    plan.optimize()
    plan.apply()
    failed = [t for t in plan if t.failed]
    if failed:
        print('Some tasks failed:')
        for t in failed:
            print(f'- Task #{t.id} with input {t.inputs[0]} failed')
    else:
        print('All tasks completed successfully')
def curator_link(argv):
    """Handle `curator link`: plan symbolic links into an output directory."""
    parser = curator_argparser()
    parser.add_argument('-o', '--output', required=True)
    args = curator_args(parser, argv)

    # Import lazily, matching the other command handlers
    from curator.plans import plan_link
    inputs = curator_input(args)
    link_plan = plan_link(inputs, args.output)
    curator_handle_plan(link_plan, args)
default=None, 118 | help='Try to discard video streams above this bitrate') 119 | 120 | args = curator_args(parser, argv) 121 | if args.min_video_resolution or args.min_video_bitrate or \ 122 | args.max_video_resolution or args.max_video_bitrate: 123 | raise Exception("Unsupported argument") 124 | select = lambda *keys: { k: vars(args)[k] for k in keys } 125 | opts = select( 126 | 'try_video_criteria', 127 | 'try_video_codecs', 128 | 'min_video_resolution', 129 | 'max_video_resolution', 130 | 'min_video_bitrate', 131 | 'max_video_bitrate') 132 | for k in ('try_video_criteria', 'try_video_codecs'): 133 | opts[k] = opts[k].split(',') 134 | 135 | from curator.plans import plan_merge 136 | media = curator_input(args) 137 | plan = plan_merge(media, args.format, args.delete, opts) 138 | curator_handle_plan(plan, args) 139 | 140 | def curator_rename(argv): 141 | parser = curator_argparser() 142 | parser.add_argument('-f', '--format', default="@name (@year).@ext") 143 | parser.add_argument('-d', '--db', required=False) 144 | parser.add_argument('--db-cache-days', required=False, type=int, default=30, 145 | help='Update database if older than N days. 
def curator_tag(argv):
    """Handle `curator tag`: plan and apply stream metadata/tag updates."""
    parser = curator_argparser()
    parser.add_argument('-s', '--streams', default="all", choices=["all", "audio", "subtitle"])
    parser.add_argument('-t', '--tag', required=True, choices=["language"])
    parser.add_argument('-v', '--value', required=False)
    parser.add_argument('--skip-tagged', action='store_true',
        help='skip streams if a valid tag already exists')
    # Tag-specific options
    parser.add_argument('--only-macrolanguages', action='store_true',
        help='when detecting languages, consider only macrolanguages. ' +
            'e.g. this will map `nno`/`nnb` detections into `nor`.')
    parser.add_argument('--max-audio-samples', type=int, default=10,
        help='when detecting languages in audio, max number of samples to extract.')
    # BUG FIX: help text was copy-pasted from --max-audio-samples
    parser.add_argument('--min-score', type=float, default=0.8,
        help='when detecting languages in audio, minimum score required to accept a detection.')
    args = curator_args(parser, argv)

    # Select relevant options for the chosen tag
    opts = {}
    select = lambda *keys: { k: vars(args)[k] for k in keys }
    if args.tag == 'language':
        opts = select('only_macrolanguages', 'max_audio_samples', 'min_score')

    from curator.plans import plan_tag
    media = curator_input(args)
    plan = plan_tag(media, args.streams, args.tag, args.value, args.skip_tagged, opts)
    curator_handle_plan(plan, args)
# Default options for merge planning. Criteria lists are ordered by
# decreasing importance; codec lists by decreasing preference.
DEF_OPTS_MERGE = {
    # Video selection
    # BUG FIX: 'length' was removed from the default criteria — it is not
    # implemented by select_video_stream, which raises "Unknown video
    # selection criterion" for unrecognized entries, so the defaults could
    # crash whenever resolution/codec/fps all tied. The CLI default
    # ('resolution,codec,fps') already matches this list.
    'try_video_criteria': ['resolution', 'codec', 'fps'],
    'try_video_codecs': ['hevc', 'h264', 'mpeg4'],
    'min_video_resolution': None,
    'max_video_resolution': None,
    'min_video_bitrate': None,
    'max_video_bitrate': None,

    # Audio selection
    'try_audio_criteria': ['codec', 'bitrate', 'channels'],
    'try_audio_codecs': ['flac', 'dts', 'eac3', 'ac3', 'mp3'],
    'min_audio_bitrate': None,
    'max_audio_bitrate': None,

    # Subtitle selection
    'try_subtitle_criteria': [],
}
def view(self):
    """Render this task as table rows: inputs on the left, output on the right.

    Inputs contributing no selected streams are hidden; when only a strict
    subset of an input's streams was selected, those streams are listed
    individually below it.
    """
    rows = []
    for source in self.inputs:
        picked = [s for s in self.selected_streams if s.media == source]
        if not picked:
            continue  # this input contributes nothing to the merge
        first = not rows
        rows.append((source.name,
                     '→' if first else '↗',
                     self.outputs[0].name if first else ' '))
        # List individual streams only when a strict subset was selected
        if len(picked) < len(source.get_streams()):
            rows.extend((f' - {s}', '↗', '') for s in picked)
    return rows
def select_codec(s1, s2, codec_list):
    """Pick the stream whose codec ranks higher in codec_list; None on a tie.

    Codecs are scored by position (earlier = higher). Codecs absent from
    the list score zero and trigger a warning.
    """
    rank = {name: len(codec_list) - i for i, name in enumerate(codec_list)}
    scores = []
    for stream in (s1, s2):
        codec = stream.get_info()['codec_name']
        score = rank.get(codec, 0)
        if score == 0:
            logging.warning(f"Select criteria does not consider codec {codec} in stream {stream}")
        scores.append(score)
    if scores[0] > scores[1]:
        return s1
    if scores[1] > scores[0]:
        return s2
    return None
def select_audio_stream(s1, s2, opts=DEF_OPTS_MERGE):
    """Select the better of two audio streams.

    Criteria from opts['try_audio_criteria'] are tried in order; the first
    one that discriminates wins. Falls back to s1 (with a warning) when no
    criterion can decide.
    """
    for criterion in opts['try_audio_criteria']:
        # Codec: earlier entry in opts['try_audio_codecs'] wins
        if criterion == 'codec':
            stream = select_codec(s1, s2, opts['try_audio_codecs'])
            if stream is not None:
                return stream
        # Bitrate: higher wins
        elif criterion == 'bitrate':
            if int(s1.get_info()['bit_rate']) > int(s2.get_info()['bit_rate']):
                return s1
            if int(s2.get_info()['bit_rate']) > int(s1.get_info()['bit_rate']):
                return s2
        # Channels: more wins
        elif criterion == 'channels':
            if s1.get_info()['channels'] > s2.get_info()['channels']:
                return s1
            if s2.get_info()['channels'] > s1.get_info()['channels']:
                return s2
        else:
            # BUG FIX: message previously said "video" — copy-paste from
            # select_video_stream; this is the audio selector.
            raise Exception(f"Unknown audio selection criterion: {criterion}")
    logging.warning(f'Audio criteria could not select between {s1} and {s2}')
    return s1
def plan_merge(media, format, delete=False, opts=DEF_OPTS_MERGE):
    """Build a MergePlan merging each matroska file with its related media
    (extra audio/subtitle files) into a single container.

    :param media: list of Media objects to consider
    :param format: output container extension (e.g. 'mkv')
    :param delete: delete the inputs after a successful merge
    :param opts: stream selection options (see DEF_OPTS_MERGE)
    """
    plan = MergePlan()
    # Identify related files
    for m in media:
        if not m.is_format('matroska'):
            continue
        basepath, _ = os.path.splitext(m.path)
        output = Media(f'{basepath}.{format}', Media.TYPE_FILE)
        related = find_related(m, media)
        if len(related) >= 1:
            task = MergeTask([m] + related, output, format, delete)
            plan.add_task(task)
    # Choose which streams to preserve starting with video
    for task in plan:
        video_stream = None
        for s in task.input_video_streams():
            # BUG FIX: forward `opts` so CLI-provided selection options take
            # effect (previously the DEF_OPTS_MERGE defaults were always used).
            video_stream = select_video_stream(video_stream, s, opts)
        # Then audio: keep every audio stream; per-language deduplication is
        # currently disabled (see FIXME below).
        audio_streams = []
        for curr in task.input_audio_streams():
            inserted = False
            for index, prev in enumerate(audio_streams):
                curr_lang = curr.get_info()['tags'].get('language')
                prev_lang = prev.get_info()['tags'].get('language')
                # FIXME: Do not remove anything while merging
                if False and curr_lang == prev_lang != None:
                    audio_streams[index] = select_audio_stream(prev, curr, opts)
                    inserted = True
                    break
            if not inserted:
                audio_streams.append(curr)
        # Then subtitles (same disabled deduplication as audio)
        subtitle_streams = []
        for curr in task.input_subtitle_streams():
            inserted = False
            for index, prev in enumerate(subtitle_streams):
                curr_lang = curr.get_info()['tags'].get('language')
                prev_lang = prev.get_info()['tags'].get('language')
                # FIXME: Do not remove anything while merging
                if False and curr_lang == prev_lang != None:
                    subtitle_streams[index] = select_subtitle_stream(prev, curr, opts)
                    inserted = True
                    break
            if not inserted:
                subtitle_streams.append(curr)
        task.select_streams(video_stream, audio_streams, subtitle_streams)
    return plan
def compute_column_widths(self, w):
    """Distribute a total width of `w` cells among the plan's columns.

    Absolute widths are reserved first; percentage widths then share the
    remainder, rescaled after each allocation so the ratios stay relative
    to the original total. Any rounding remainder lands on the last column.
    """
    columns = self.get_columns()
    # First reserve absolute (non-percentage) widths
    for col in columns:
        if not col['width'].endswith('%'):
            fixed = int(col['width'])
            col['width'] = fixed
            w -= fixed
    # Then distribute the remaining width among percentage columns
    scale = 1
    for col in columns:
        if isinstance(col['width'], str):
            ratio = float(col['width'][:-1]) / 100
            portion = round(w * ratio * scale)
            col['width'] = portion
            remaining = (1 - ratio * scale)
            scale = remaining and scale / remaining or math.inf  # Avoid division by zero
            w -= portion
    # Absorb any rounding remainder into the last column
    if w != 0:
        col['width'] += w
    return columns
# HACK: Overriding DataTable's private renderer to inject per-task row styles.
# Textual exposes no public hook for per-row background styling, so the base
# style is swapped before delegating to the parent implementation.
def _render_line_in_row(self, row_key, line_no, base_style, cursor_location, hover_location):
    # Map the row key back to its position in the table.
    index = self._row_locations.get(row_key)
    # Default component style; unknown rows keep it.
    style = "taskflow--task-odd"
    if index is not None:
        if index % 2:
            style = "taskflow--task-even"
        # Disabled tasks are dimmed regardless of odd/even parity.
        if not self.plan[index].enabled:
            style = "taskflow--task-disabled"
    # Resolve the component class name into a concrete rich style.
    style = self.get_component_styles(style).rich_style
    # NOTE(review): `base_style` is intentionally discarded and replaced —
    # confirm this stays compatible with DataTable's signature across
    # textual versions.
    return super()._render_line_in_row(row_key, line_no, style, cursor_location, hover_location)
def clear_line_cache(self, Widget):
    """Force a repaint of the given widget by clearing its cached lines.

    `Widget` is a widget *class* (e.g. TaskFlow) handed to query_one,
    not an instance.
    """
    # HACK: Without this styles don't refresh
    # TODO: Find a better approach
    table = self.query_one(Widget)
    # NOTE(review): this sets the flag on the App (`self`), while
    # TaskFlow.on_resize sets the same flag on the table itself — confirm
    # whether `table._require_update_dimensions` was intended here.
    self._require_update_dimensions = True
    # Reach into DataTable internals to drop memoized line renders.
    table._line_cache.clear()
# Text alignment modes for print_field()
ALIGN_LEFT = 1
ALIGN_RIGHT = 2

# Helpers
def print_field(string, length, align=ALIGN_LEFT):
    """Format `string` into a fixed-width table cell of `length` characters,
    plus one space of padding on each side.

    Strings longer than `length` are truncated with a '...' suffix.
    """
    lpad = ' '
    rpad = ' '
    if len(string) <= length:
        if align == ALIGN_LEFT:
            rpad += ' ' * (length - len(string))
        if align == ALIGN_RIGHT:
            # BUG FIX: previously appended to an undefined name `padr`,
            # raising NameError; right-aligned fields pad on the left.
            lpad += ' ' * (length - len(string))
        return lpad + string + rpad
    else:
        return lpad + string[:length-3] + '...' + rpad
def __init__(self, media, index, info=None):
    """A single ffprobe stream within `media`, identified by its index.

    media -- owning Media object
    index -- stream index within the container
    info  -- optional pre-fetched ffprobe stream info dict; when omitted,
             it is probed lazily by get_info()
    """
    self.media = media
    self.index = index

    # Cache stream information
    self.info = info       # per-stream ffprobe info (lazy; see get_info)
    self.frames = None     # ffprobe frame list (lazy; see get_frames)
    self.packets = None    # ffprobe packet list (lazy; see get_packets)

    # Store warnings about the stream
    self.warnings = set()
def get_info(self):
    """Return the ffprobe info dict for this stream, probing on first use.

    The result is cached in self.info and always contains a 'tags' key.
    Raises Exception when ffprobe fails.
    """
    if self.info:
        return self.info
    cmd = ['ffprobe', self.media.path]
    cmd += ['-show_streams']
    cmd += ['-select_streams', str(self.index)]
    cmd += ['-of', 'json']
    result = subprocess.run(cmd, capture_output=True)
    if result.returncode != 0:
        errors = result.stderr.decode('utf-8')
        raise Exception(f"Failed to get info from {self} with ffmpeg:\n{errors}")
    output = result.stdout.decode('utf-8')
    # BUG FIX: ffprobe returns a *list* under 'streams'; with -select_streams
    # it holds exactly one entry. The previous code cached the list itself,
    # so the setdefault() below (and callers indexing into the dict, e.g.
    # is_video()) failed whenever info was not supplied to the constructor.
    self.info = json.loads(output)['streams'][0]
    self.info.setdefault('tags', {})
    return self.info
def get_packets(self):
    """Return (and cache) the ffprobe packet list for this stream."""
    if self.packets:
        return self.packets
    probe_cmd = [
        'ffprobe', self.media.path,
        '-show_packets',
        '-select_streams', str(self.index),
        '-of', 'json',
    ]
    proc = subprocess.run(probe_cmd, capture_output=True)
    if proc.returncode != 0:
        errors = proc.stderr.decode('utf-8')
        raise Exception(f"Failed to get packets from {self} with ffmpeg:\n{errors}")
    self.packets = json.loads(proc.stdout.decode('utf-8'))['packets']
    return self.packets
157 | """ 158 | assert(self.is_audio()) 159 | debug = logging.getLogger().level == logging.DEBUG 160 | logging.debug(f'Detecting audio language in stream #{self.index} of media: "{self.media.name}"') 161 | 162 | import whisper 163 | from whisper.audio import CHUNK_LENGTH 164 | model = whisper.load_model("base") 165 | 166 | # Calculate number of samples 167 | duration = self.get_duration() 168 | len_samples = float(CHUNK_LENGTH) 169 | num_samples = min(opts['max_audio_samples'], int(duration / len_samples)) 170 | 171 | results = {} 172 | with tempfile.TemporaryDirectory() as tmp: 173 | ext = self.media.ext 174 | err_samples = 0 175 | for index in range(num_samples): 176 | # Extract sample 177 | sample = os.path.join(tmp, f'sample{index:04d}.{ext}') 178 | cmd = ['ffmpeg', '-i', self.media.path, '-map', f'0:{self.index}'] 179 | cmd += ['-c:a', 'copy'] 180 | cmd += ['-ss', str(index * duration / num_samples)] 181 | cmd += ['-t', str(len_samples)] 182 | cmd += [sample] 183 | result = subprocess.run(cmd, capture_output=True) 184 | if result.returncode != 0: 185 | errors = result.stderr.decode('utf-8') 186 | raise Exception(f"Failed to extract audio sample from {self.media.path} with ffmpeg:\n{errors}") 187 | 188 | # Detect language in sample 189 | try: 190 | audio = whisper.load_audio(sample) 191 | audio = whisper.pad_or_trim(audio) 192 | mel = whisper.log_mel_spectrogram(audio).to(model.device) 193 | _, probs = model.detect_language(mel) 194 | except Exception as e: 195 | logging.warning(f'Failed to detect language in {sample}:\n{e}') 196 | err_samples += 1 197 | continue 198 | # Process language detection results 199 | if debug: 200 | highest_probs = dict(collections.Counter(probs).most_common(5)) 201 | highest_probs_rounded = { k: f'{v:.4f}' for k, v in highest_probs.items() } 202 | logging.debug(f'Sample #{index:02d}: {highest_probs_rounded}') 203 | lang = max(probs, key=probs.get) 204 | prob = probs[lang] 205 | if opts['min_score'] <= prob: 206 | 
results.setdefault(lang, []).append(prob) 207 | 208 | # Compute final scores as votes+avg(prob) if more than half succeeded 209 | if err_samples > num_samples / 2: 210 | return None 211 | results = { k: len(v) + sum(v)/len(v) for k, v in results.items() } 212 | if not results: 213 | return None 214 | 215 | # Rename keys since OpenAI Whisper does not fully adhere to ISO 639-1 216 | replacements = [('jw', 'jv')] 217 | for old, new in replacements: 218 | if old in results: 219 | results[new] = results.pop(old) 220 | 221 | # Optionally merge into ISO 639-3 macrolanguages and return highest ocurring 222 | if opts['only_macrolanguages']: 223 | macro_results = {} 224 | for key, value in results.items(): 225 | part3 = iso639.Lang(pt1=key).pt3 226 | macro = iso639.Lang(pt1=key).macro() 227 | lang = macro.pt3 if macro else part3 228 | macro_results[lang] = macro_results.get(lang, 0) + value 229 | lang = max(macro_results, key=macro_results.get) 230 | return lang 231 | 232 | # Get highest occurring language and convert ISO 639-1 to ISO 639-3 233 | lang = max(results, key=results.get) 234 | lang = iso639.Lang(pt1=lang).pt3 235 | return lang 236 | 237 | def detect_subtitle_language(self, opts=DEF_OPTS_LANGUAGE): 238 | """ 239 | Detect subtitle language copying/converting to SRT, 240 | extracting the raw text and detecting its language. 
241 | """ 242 | assert(self.is_subtitle()) 243 | 244 | # Cannot detect language in bitmap subtitles 245 | if self.get_info()['codec_name'] == 'hdmv_pgs_subtitle': 246 | return None 247 | 248 | # Detect subtitle language 249 | def srt_language(path): 250 | with open(path, 'rb') as f: 251 | enc = chardet.detect(f.read())['encoding'] 252 | if enc == 'Windows-1254': 253 | enc = None # Often false positive, let PySRT auto-detect 254 | subs = pysrt.open(path, encoding=enc) 255 | text = ' '.join(map(lambda x: x.text, subs)) 256 | lang = langid.classify(text)[0] 257 | lang = iso639.Lang(pt1=lang).pt3 258 | return lang 259 | 260 | # Check if the parent media is already an SRT file 261 | path = self.media.path 262 | if self.media.ext == 'srt': 263 | return srt_language(path) 264 | 265 | # Otherwise extract subtitle stream, converting to SRT 266 | with tempfile.TemporaryDirectory() as tmp: 267 | output = os.path.join(tmp, 'output.srt') 268 | cmd = ['ffmpeg', '-i', path, '-map', f'0:{self.index}'] 269 | if self.get_info()['codec_name'] in ('srt', 'subrip'): 270 | cmd += ['-c:s', 'copy'] 271 | cmd += [output] 272 | result = subprocess.run(cmd, capture_output=True) 273 | if result.returncode != 0: 274 | errors = result.stderr.decode('utf-8') 275 | raise Exception(f"Failed to extract subtitles from {path} with ffmpeg:\n{errors}") 276 | return srt_language(output) 277 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------