├── .github └── workflows │ └── build-image.yml ├── Dockerfile ├── LICENSE ├── README.md ├── app.py └── renovate.json /.github/workflows/build-image.yml: -------------------------------------------------------------------------------- 1 | name: 'Build Image' 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - '**.md' 7 | branches-ignore: 8 | - 'renovate/**' 9 | 10 | release: 11 | types: 12 | - published 13 | 14 | workflow_dispatch: 15 | 16 | permissions: 17 | contents: read 18 | packages: write 19 | actions: read 20 | security-events: write 21 | 22 | jobs: 23 | build: 24 | uses: SlashNephy/.github/.github/workflows/docker-build.yml@master 25 | with: 26 | image-name: ghcr.io/slashnephy/m2ts-classifier 27 | image-platforms: linux/amd64 28 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11.3-alpine3.16@sha256:9efc6e155f287eb424ede74aeff198be75ae04504b1e42e87ec9f221e7410f2d 2 | 3 | RUN apk add --update --no-cache --virtual .build-deps \ 4 | build-base \ 5 | && pip install --no-cache-dir \ 6 | python-Levenshtein \ 7 | && apk del --purge .build-deps 8 | 9 | COPY ./app.py /app.py 10 | ENTRYPOINT [ "python", "-u", "app.py" ] 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Nep 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice 
and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # m2ts-classifier 2 | 3 | 🧹 m2ts ファイルのシンボリックリンクを作品ごとに作成し, 整理します 4 | 5 | ![screenshot1.png](https://i.imgur.com/XSjwXO7.png) 6 | 7 | ![screenshot2.png](https://i.imgur.com/YgZv3ko.png) 8 | 9 | ## ロジック 10 | 11 | - ファイル名のレーベンシュタイン距離を計算します。距離が小さいほど, タイトルの類似性が高いため同じ作品としてみなします。 12 | - 同じ作品としてみなしたファイル名から共通文字列を算出します。共通文字列が各作品のフォルダになります。 13 | - 実際のファイルを移動させるのではなく, シンボリックリンクを貼るため高速かつ安全に整理を行えます。 14 | 15 | ## docker-compose 16 | 17 | ```yml 18 | services: 19 | classifier: 20 | container_name: m2ts-classifier 21 | image: ghcr.io/slashnephy/m2ts-classifier:master 22 | volumes: 23 | - /mnt:/mnt:ro 24 | - /mnt/links:/mnt/links 25 | environment: 26 | # 対象とする拡張子, ファイル名の比較をしてるだけなので m2ts じゃなくても使えます 27 | TARGET_EXTENSION: m2ts 28 | # リンクの作成場所 29 | OUTPUT_DIRECTORY: /mnt/links 30 | # m2ts の保存場所 31 | MOUNT_POINTS: /mnt 32 | 33 | # 各閾値, 詳しくはソースコード参照 34 | # ファイル名の編集距離の許容値, これを下回ったものだけが同じ作品とみなされる 35 | LD_THRESHOLD: 0.5 36 | # マッチ数の許容値, これ以上になった際に作品ごとのフォルダが作られる 37 | MATCH_THRESHOLD: 4 38 | # 共通文字列の長さの許容値, これ以上になった際にフォルダが作られる 39 | SEQUENCE_THRESHOLD: 4 40 | 41 | # https://github.com/SlashNephy/comskip-tvtplay と併用するかどうか 42 | # .chapter ファイルのシンボリックリンクも作成されるようになる 43 | SUPPORT_COMSKIP_TVTPLAY: 1 44 | ``` 45 |
# --------------------------------------------------------------------------------
# /app.py
# --------------------------------------------------------------------------------
"""Organize recorded video files into per-title directories of symlinks.

On every pass the script:

1. collects all ``*.<TARGET_EXTENSION>`` files below the configured mount
   points,
2. normalizes each file stem (NFKC, bracketed tags / prefixes / suffixes
   stripped),
3. groups files whose normalized edit distance is below ``LD_THRESHOLD``,
4. names each group after the shortest common substring of its members and
   symlinks the members into ``OUTPUT_DIRECTORY/<group name>``,
5. symlinks every file that ended up in no group directly into
   ``OUTPUT_DIRECTORY``,
6. removes broken links, empty directories and top-level links that are
   duplicated inside a group directory.

Real files are never moved or modified; only symbolic links are created.
"""
import os
import re
import time
import unicodedata
from dataclasses import dataclass
from difflib import SequenceMatcher
from pathlib import Path
from pprint import pprint

try:
    import Levenshtein
except ImportError:
    # The C extension is optional: _edit_distance() falls back to a (slower)
    # pure-Python implementation when it is not installed.
    Levenshtein = None

# File extension (without the dot) of the files to classify.
TARGET_EXTENSION = os.getenv("TARGET_EXTENSION", "m2ts")
# Directory in which group directories and symlinks are created.
OUTPUT_DIRECTORY = os.getenv("OUTPUT_DIRECTORY")
# Every environment variable whose NAME starts with "MOUNT_POINTS"
# contributes one directory to scan.
MOUNT_POINTS = [v for k, v in os.environ.items() if k.startswith("MOUNT_POINTS")]
# Two names belong together when distance / max(len) is strictly below this.
LD_THRESHOLD = float(os.getenv("LD_THRESHOLD", "0.5"))
# Minimum number of matching files (the file itself included) for a group.
MATCH_THRESHOLD = int(os.getenv("MATCH_THRESHOLD", "4"))
# Minimum length of the common substring used as the group directory name.
SEQUENCE_THRESHOLD = int(os.getenv("SEQUENCE_THRESHOLD", "4"))
# Pause between two passes of the main loop.
INTERVAL_SECONDS = int(os.getenv("INTERVAL_SECONDS", "900"))
PREFIXES_PATTERN = re.compile(os.getenv("PREFIXES_PATTERN", r"^(アニメ\s|アニメA・|アニメギルド|アニメ26)"))
SUFFIXES_PATTERN = re.compile(os.getenv("SUFFIXES_PATTERN", r"(第\d*|#\d*|\(\d+\)|ほか|[\(「])\s*$"))
BRACKETS_PATTERN = re.compile(os.getenv("BRACKETS_PATTERN", r"(\[.+?\]|【.+?】|「.+?」)"))
# When "1", matching ".chapter" files are symlinked next to the videos.
SUPPORT_COMSKIP_TVTPLAY = os.getenv("SUPPORT_COMSKIP_TVTPLAY") == "1"


def enumerate_paths():
    """Return every regular, non-symlink target file below the mount points."""
    return [
        path
        for mp in MOUNT_POINTS
        for path in Path(mp).glob(f"**/*.{TARGET_EXTENSION}")
        if path.is_file() and not path.is_symlink()
    ]


def enumerate_broken_links():
    """Return symlinks below OUTPUT_DIRECTORY that do not resolve to a file."""
    return [
        path
        for path in Path(OUTPUT_DIRECTORY).glob("**/*")
        if path.is_symlink() and not path.is_file()
    ]


def enumerate_empty_directories():
    """Return directories below OUTPUT_DIRECTORY that contain no entries."""
    return [
        path
        for path in Path(OUTPUT_DIRECTORY).glob("**/*")
        if path.is_dir() and not any(path.iterdir())
    ]


def enumerate_toplevel_links():
    """Return working file symlinks located directly in OUTPUT_DIRECTORY."""
    return [
        path
        for path in Path(OUTPUT_DIRECTORY).glob("*")
        if path.is_symlink() and path.is_file()
    ]


def remove_brackets(text):
    """Return *text* with every BRACKETS_PATTERN match removed."""
    return BRACKETS_PATTERN.sub("", text)


def remove_prefix(text):
    """Return *text* with every PREFIXES_PATTERN match removed."""
    return PREFIXES_PATTERN.sub("", text)


def remove_suffix(text):
    """Return *text* with every SUFFIXES_PATTERN match removed."""
    return SUFFIXES_PATTERN.sub("", text)


windows_special_characters_pattern = re.compile(r"[<>:\"/\\|\?\*]")


def remove_windows_special_characters(text):
    """Return *text* without the characters ``< > : " / \\ | ? *``."""
    return windows_special_characters_pattern.sub("", text)


@dataclass(frozen=True)
class Entry:
    """A source file together with its normalized title.

    Frozen so that instances are hashable and can be used as dict keys and
    set members while grouping.
    """

    path: Path  # location of the real file
    name: str   # normalized stem used for similarity comparison


def _edit_distance(a, b):
    """Return the Levenshtein distance between strings *a* and *b*.

    Uses the ``Levenshtein`` extension when it could be imported, otherwise
    a two-row dynamic-programming implementation.
    """
    if Levenshtein is not None:
        return Levenshtein.distance(a, b)

    if len(a) < len(b):
        a, b = b, a  # keep the rows as short as possible
    previous = list(range(len(b) + 1))
    for i, char_a in enumerate(a, start=1):
        current = [i]
        for j, char_b in enumerate(b, start=1):
            current.append(
                min(
                    previous[j] + 1,                        # deletion
                    current[j - 1] + 1,                     # insertion
                    previous[j - 1] + (char_a != char_b),   # substitution
                )
            )
        previous = current
    return previous[-1]


def find_common_sequence(a, b):
    """Return the longest contiguous substring shared by *a* and *b*.

    ``autojunk`` is disabled: with the default heuristic difflib ignores
    characters that occur frequently once *b* is 200 characters or longer,
    which can hide the real longest match.
    """
    matcher = SequenceMatcher(None, a, b, autojunk=False)
    match = matcher.find_longest_match(0, len(a), 0, len(b))
    return a[match.a: match.a + match.size]


def find_chapter_path(path):
    """Return the ``.chapter`` file belonging to *path*, or ``None``.

    Looked up first next to *path*, then in a sibling ``chapters`` directory.
    """
    chapter_name = path.with_suffix(".chapter").name
    for candidate in (path.parent / chapter_name, path.parent / "chapters" / chapter_name):
        if candidate.exists():
            return candidate
    return None


def create_directory(name):
    """Ensure ``OUTPUT_DIRECTORY/name`` exists and return it."""
    directory = Path(OUTPUT_DIRECTORY) / name
    directory.mkdir(exist_ok=True)
    return directory


def _ensure_symlink(link_path, target):
    """Create *link_path* -> *target* unless something already occupies it."""
    # is_symlink() also covers dangling links, exists() covers regular files.
    if link_path.is_symlink() or link_path.exists():
        return
    link_path.symlink_to(target)
    print(f"create symlink: {link_path}")


def create_link(directory, src):
    """Symlink *src* (and, if enabled, its chapter file) into *directory*.

    The call is idempotent: links that already exist are left untouched, so
    it is safe to run on every pass. A chapter file that appears after the
    video link was created is picked up on a later pass.
    """
    link_path = directory / src.name
    _ensure_symlink(link_path, src)

    if not SUPPORT_COMSKIP_TVTPLAY:
        return

    chapter_path = find_chapter_path(src)
    if chapter_path is None:
        return

    chapters_directory = link_path.parent / "chapters"
    chapters_directory.mkdir(exist_ok=True)
    chapter_link_path = chapters_directory / link_path.with_suffix(".chapter").name
    _ensure_symlink(chapter_link_path, chapter_path)


def _normalize_name(stem):
    """Reduce a file stem to the title text used for comparison."""
    name = unicodedata.normalize("NFKC", stem)
    name = remove_brackets(name)
    name = remove_windows_special_characters(name)
    name = remove_prefix(name)
    name = remove_suffix(name)
    return name.strip()


def create_links():
    """Group similar files and create the symlink tree for one pass."""
    entries = sorted(
        (Entry(path, name=_normalize_name(path.stem)) for path in enumerate_paths()),
        key=lambda entry: entry.name,
    )
    # Entries whose name normalizes to "" cannot be compared meaningfully.
    named_entries = [entry for entry in entries if entry.name]
    checked_entries = set()

    for e1 in entries:
        if e1 in checked_entries or not e1.name:
            continue

        print(e1)

        # Normalized edit distance to every named entry, keeping only the
        # ones below the threshold (e1 itself matches with distance 0).
        filtered_lds = {}
        for e2 in named_entries:
            ld = _edit_distance(e1.name, e2.name) / max(len(e1.name), len(e2.name))
            if ld < LD_THRESHOLD:
                filtered_lds[e2] = ld

        pprint(filtered_lds)
        if not filtered_lds or len(filtered_lds) < MATCH_THRESHOLD:
            continue

        # The shortest of the pairwise common substrings becomes the
        # directory name.
        sequences = [find_common_sequence(e1.name, e2.name) for e2 in filtered_lds]
        min_sequence = min(sequences, key=len)
        common_sequence = remove_suffix(remove_prefix(min_sequence)).strip()

        print(common_sequence)
        if len(common_sequence) < SEQUENCE_THRESHOLD:
            continue

        link_dir = create_directory(common_sequence)
        for e2 in filtered_lds:
            create_link(link_dir, e2.path)

        checked_entries.update(filtered_lds)

    # Everything that did not end up in a group is linked at the top level.
    output_root = Path(OUTPUT_DIRECTORY)
    for entry in entries:
        if entry not in checked_entries:
            create_link(output_root, entry.path)


def cleanup_links():
    """Remove stale links, empty directories and duplicated top-level links."""
    # Remove broken symlinks.
    for path in enumerate_broken_links():
        path.unlink()
        print(f"remove symlink: {path}")

    # Remove empty directories.
    for directory in enumerate_empty_directories():
        directory.rmdir()
        print(f"remove directory: {directory}")

    # Remove top-level links that also exist inside one of the directories.
    directories = [x for x in Path(OUTPUT_DIRECTORY).iterdir() if x.is_dir()]
    for path in enumerate_toplevel_links():
        if any((directory / path.name).exists() for directory in directories):
            path.unlink()
            print(f"remove symlink: {path}")


def main():
    """Validate the configuration, then link and clean up forever."""
    if not OUTPUT_DIRECTORY or not MOUNT_POINTS:
        raise RuntimeError("OUTPUT_DIRECTORY or MOUNT_POINTS is not defined.")

    while True:
        create_links()
        cleanup_links()

        time.sleep(INTERVAL_SECONDS)


if __name__ == "__main__":
    main()

# --------------------------------------------------------------------------------
# /renovate.json  (next file of the original dump, content preserved verbatim)
# --------------------------------------------------------------------------------
# {
#   "$schema": "https://docs.renovatebot.com/renovate-schema.json",
#   "extends": [
#     "github>SlashNephy/.github:renovate-config"
#   ]
# }
# --------------------------------------------------------------------------------