└── subtitle_embeddings ├── README ├── LICENSE ├── llm.py ├── utils.py ├── generate.py ├── choose.py └── kdenlive └── proj0.kdenlive.ass /subtitle_embeddings/README: -------------------------------------------------------------------------------- 1 | https://www.bilibili.com/video/BV1CyjzzCEvA/ 2 | -------------------------------------------------------------------------------- /subtitle_embeddings/LICENSE: -------------------------------------------------------------------------------- 1 | For all files except proj0.kdenlive.ass, they are in public domain and you could use them freely. 2 | 3 | The translators hold the copyright/license to the subtitle file. 4 | -------------------------------------------------------------------------------- /subtitle_embeddings/llm.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | 3 | llmkey_path = "/home/taoky/scripts/bin/llmkey.py" 4 | spec = importlib.util.spec_from_file_location("llmkey", llmkey_path) 5 | llmkey_module = importlib.util.module_from_spec(spec) 6 | spec.loader.exec_module(llmkey_module) 7 | -------------------------------------------------------------------------------- /subtitle_embeddings/utils.py: -------------------------------------------------------------------------------- 1 | def miliseconds_to_time(ms): 2 | """Convert milliseconds to a time string in the format HH:MM:SS.mmm""" 3 | hours = ms // 3600000 4 | ms %= 3600000 5 | minutes = ms // 60000 6 | ms %= 60000 7 | seconds = ms // 1000 8 | milliseconds = ms % 1000 9 | return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}" 10 | -------------------------------------------------------------------------------- /subtitle_embeddings/generate.py: -------------------------------------------------------------------------------- 1 | from llm import llmkey_module 2 | import argparse 3 | import numpy as np 4 | from pathlib import Path 5 | import pysubs2 6 | from collections import defaultdict 7 | import openai 8 | import tqdm.asyncio 9 | import json 10 | import asyncio 11 | from utils import miliseconds_to_time 12 | 13 | 14 | parser = argparse.ArgumentParser(description="Generate data files from subtitle files") 15 | parser.add_argument("files", nargs="+", type=Path) 16 | args = parser.parse_args() 17 | 18 | 19 | lines_mapping = defaultdict(list) 20 | 21 | for file in args.files: 22 | file: Path 23 | subs = pysubs2.load(file) 24 | for line in subs: 25 | if line.type == "Comment": 26 | continue 27 | lines_mapping[line.plaintext].append( 28 | { 29 | "filename": file.name, 30 | "start": miliseconds_to_time(line.start), 31 | "end": miliseconds_to_time(line.end), 32 | } 33 | ) 34 | 35 | text_lines = list(lines_mapping.keys()) 36 | embeddings = np.zeros((len(lines_mapping), 1024)) 37 | infos = [None] * len(lines_mapping) 38 | 39 | semaphore = asyncio.Semaphore(50) 40 | 41 | 42 | async def fetch_embedding(ai: openai.AsyncClient, index, text): 43 | async with semaphore: 44 | try: 45 | response = await ai.embeddings.create(model="BAAI/bge-m3", input=text) 46 | embedding = np.array(response.data[0].embedding, dtype=np.float32) 47 | info = [text, lines_mapping[text]] 48 | return index, embedding, info 49 | except Exception as e: 50 | print(f"[Error] Failed to embed text at index {index}: {e}") 51 | return index, None, None 52 | 53 | 54 | async def main(): 55 | ai = await llmkey_module.aget_ai() 56 | ai: openai.AsyncClient 57 | tasks = [fetch_embedding(ai, i, text) for i, text in enumerate(text_lines)] 58 | for future in tqdm.asyncio.tqdm.as_completed(tasks, total=len(tasks)): 59 | index, embedding, info = await future 60 | if embedding is not None: 61 | embeddings[index] = embedding 62 | infos[index] = info 63 | 64 | np.save("embeddings.npy", embeddings) 65 | with open("infos.json", "w") as f: 66 | json.dump(infos, f, ensure_ascii=False, indent=4) 67 | 68 | 69 | if __name__ == "__main__": 70 | asyncio.run(main()) 71 | -------------------------------------------------------------------------------- /subtitle_embeddings/choose.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | from llm import llmkey_module 4 | import openai 5 | import subprocess 6 | import readline # noqa: F401 7 | import re 8 | import argparse 9 | from pathlib import Path 10 | from utils import miliseconds_to_time 11 | 12 | parser = argparse.ArgumentParser(description="Choose a subtitle line based on query") 13 | parser.add_argument("video_folder", type=Path, help="Path to the video folder") 14 | parser.add_argument("--index", type=int, default=None, help="Index of the subtitle line to choose") 15 | args = parser.parse_args() 16 | 17 | ai = llmkey_module.get_ai() 18 | ai: openai.Client 19 | 20 | embeddings = np.load("embeddings.npy") 21 | with open("infos.json", "r") as f: 22 | infos = json.load(f) 23 | 24 | 25 | # choose top20 (cosine) 26 | if args.index is None: 27 | query = input("Enter your query: ") 28 | query_embedding = ( 29 | ai.embeddings.create(model="BAAI/bge-m3", input=query).data[0].embedding 30 | ) 31 | top_k = 20 32 | cosine_similarities = np.dot(embeddings, query_embedding) / ( 33 | np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_embedding) 34 | ) 35 | top_indices = np.argsort(cosine_similarities)[-top_k:][::-1] 36 | print(f"Top {top_k} results for query '{query}':") 37 | for index in top_indices: 38 | text, lines = infos[index] 39 | print(f"({index}) Text: {text}") 40 | for line in lines: 41 | print(f" File: {line['filename']}, Start: {line['start']}, End: {line['end']}") 42 | print() 43 | 44 | while True: 45 | try: 46 | index = int(input("Your choice: ")) 47 | text, lines = infos[index] 48 | break 49 | except ValueError: 50 | pass 51 | else: 52 | index = args.index 53 | text, lines = infos[index] 54 | 55 | 56 | if len(lines) > 1: 57 | while True: 58 | try: 59 | line_index = int( 60 | input(f"Multiple lines found, choose one (0-{len(lines) - 1}): ") 61 | ) 62 | if 0 <= line_index < len(lines): 63 | break 64 | except ValueError: 65 | pass 66 | else: 67 | line_index = 0 68 | line = lines[line_index] 69 | 70 | print(text, line) 71 | 72 | def calc_duration(start, end): 73 | """Calculate the duration between start and end times.""" 74 | start_ms = int(sum(float(x) * 60 ** i * 1000 for i, x in enumerate(reversed(start.split(":"))))) 75 | end_ms = int(sum(float(x) * 60 ** i * 1000 for i, x in enumerate(reversed(end.split(":"))))) 76 | return miliseconds_to_time(end_ms - start_ms) 77 | 78 | line["duration"] = calc_duration(line["start"], line["end"]) 79 | 80 | SUBTITLE_FILENAME = re.compile(r".+\.(\d+)\..+") 81 | 82 | ep = SUBTITLE_FILENAME.search(line["filename"]).group(1) 83 | video_filename = f"Hanayamata - {ep} [BDRip 1920x1080 x264 FLAC].mkv" 84 | video_path = args.video_folder / video_filename 85 | 86 | # call ffmpeg to extract audio segment and export to m4a 87 | subprocess.run( 88 | [ 89 | "ffmpeg", 90 | "-ss", 91 | line["start"], 92 | "-t", 93 | line["duration"], 94 | "-i", 95 | video_path, 96 | "-vn", 97 | "-c:a", 98 | "aac", 99 | f"{text}-{ep}-{line_index}.m4a", 100 | ], 101 | check=True, 102 | ) 103 | -------------------------------------------------------------------------------- /subtitle_embeddings/kdenlive/proj0.kdenlive.ass: -------------------------------------------------------------------------------- 1 | [Script Info] 2 | ; Script generated by Kdenlive 25.04.1 3 | LayoutResX: 1920 4 | LayoutResY: 1080 5 | PlayResX: 1920 6 | PlayResY: 1080 7 | ScaledBorderAndShadow: yes 8 | ScriptType: v4.00+ 9 | WrapStyle: 0 10 | YCbCr Matrix: TV.709 11 | 12 | [Kdenlive Extradata] 13 | MaxLayer: 2 14 | DefaultStyles: Default,Default,Default,Default 15 | 16 | [V4+ Styles] 17 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 18 | Style: Dial_CH,Noto Sans CJK SC,70.00,&H14FFFFFF,&H000000FF,&H14211711,&H00000000,-1,0,0,0,100.00,100.00,0.00,0.00,1,2.00,0.00,2,10,10,30,1 19 | Style: EDCN,Noto Sans CJK SC,45.00,&H00FFFFFF,&HF0000000,&H00D6A977,&H407D5323,-1,0,0,0,100.00,100.00,6.00,0.00,1,3.00,0.00,2,10,10,30,134 20 | 21 | [Events] 22 | Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 23 | Dialogue: 0,00:00:31.24,00:00:31.99,Dial_CH,,0,0,0,,小祥 24 | Dialogue: 0,00:00:32.57,00:00:34.24,Dial_CH,,0,0,0,,太好了 你来了 25 | Dialogue: 0,00:00:34.66,00:00:36.62,Dial_CH,,0,0,0,,都湿了啊 你没事吧 26 | Dialogue: 0,00:00:37.08,00:00:39.96,Dial_CH,,0,0,0,,怎么没来 为什么没反应了啊 27 | Dialogue: 0,00:00:42.12,00:00:43.75,Dial_CH,,0,0,0,,我有件想让你们做的事 28 | Dialogue: 0,00:00:46.67,00:00:49.76,Dial_CH,,0,0,0,,对 解散 那么告辞了 29 | Dialogue: 0,00:00:56.22,00:00:56.68,Dial_CH,,0,0,0,,等等 30 | Dialogue: 0,00:00:58.39,00:00:59.52,Dial_CH,,0,0,0,,稍微冷静一下 31 | Dialogue: 0,00:01:01.69,00:01:03.73,Dial_CH,,0,0,0,,果然是发生了什么 32 | Dialogue: 0,00:01:04.15,00:01:05.90,Dial_CH,,0,0,0,,却什么都没对我说 33 | Dialogue: 0,00:01:06.02,00:01:08.15,Dial_CH,,0,0,0,,还是说是因为讨厌我们了… 34 | Dialogue: 0,00:01:08.23,00:01:11.90,Dial_CH,,0,0,0,,我也一直想改变的来着 35 | Dialogue: 0,00:01:12.28,00:01:14.03,Dial_CH,,0,0,0,,这是我家里的问题 36 | Dialogue: 0,00:01:16.08,00:01:16.70,Dial_CH,,0,0,0,,Why 37 | Dialogue: 0,00:01:17.41,00:01:19.62,Dial_CH,,0,0,0,,夜来舞很开心的哦 38 | Dialogue: 0,00:01:19.87,00:01:21.87,Dial_CH,,0,0,0,,说什么其实有其他想干的事情 39 | Dialogue: 0,00:01:22.04,00:01:23.04,Dial_CH,,0,0,0,,我都说了 40 | Dialogue: 0,00:01:23.29,00:01:24.88,Dial_CH,,0,0,0,,是吗… 41 | Dialogue: 0,00:01:27.38,00:01:30.30,Dial_CH,,0,0,0,,但是我想快点成立夜来舞社 42 | Dialogue: 0,00:01:30.88,00:01:32.47,Dial_CH,,0,0,0,,不…不要 43 | Dialogue: 0,00:01:36.30,00:01:37.47,Dial_CH,,0,0,0,,是因为你要走了吗 44 | Dialogue: 0,00:01:37.76,00:01:38.93,Dial_CH,,0,0,0,,为什么要逃跑啊 45 | Dialogue: 0,00:01:40.35,00:01:41.81,Dial_CH,,0,0,0,,你在干什么啊 46 | Dialogue: 0,00:01:42.23,00:01:45.31,Dial_CH,,0,0,0,,完全不负责任 只考虑自己的事情 47 | Dialogue: 0,00:01:45.77,00:01:46.86,Dial_CH,,0,0,0,,大家说话啊 48 | Dialogue: 0,00:01:46.86,00:01:47.86,Dial_CH,,0,0,0,,别说了! 49 | Dialogue: 0,00:01:47.86,00:01:49.32,Dial_CH,,0,0,0,,不要擅自给我选啊 50 | Dialogue: 0,00:01:49.69,00:01:51.15,Dial_CH,,0,0,0,,还在找你呢 51 | Dialogue: 0,00:01:51.24,00:01:51.90,Dial_CH,,0,0,0,,给我闭嘴 52 | Dialogue: 0,00:01:54.45,00:01:55.70,Dial_CH,,0,0,0,,少了一个人 53 | Dialogue: 0,00:01:56.82,00:01:58.41,Dial_CH,,0,0,0,,(也)不要逃自主练习 54 | Dialogue: 0,00:01:59.37,00:02:01.87,Dial_CH,,0,0,0,,可不能在这里停滞不前 55 | Dialogue: 0,00:02:02.08,00:02:04.62,Dial_CH,,0,0,0,,一直都是毫无责任心 只顾自己 56 | Dialogue: 0,00:02:05.04,00:02:07.29,Dial_CH,,0,0,0,,但是不努力的话 57 | Dialogue: 0,00:02:07.58,00:02:09.50,Dial_CH,,0,0,0,,不负责又任性的人啊 58 | Dialogue: 0,00:02:09.88,00:02:12.09,Dial_CH,,0,0,0,,练习要按计划继续进行 59 | Dialogue: 0,00:02:12.38,00:02:13.92,Dial_CH,,0,0,0,,已经没多少时间了 60 | Dialogue: 0,00:02:15.88,00:02:17.68,Dial_CH,,0,0,0,,大家都在等着 61 | Dialogue: 0,00:02:18.26,00:02:18.51,Dial_CH,,0,0,0,,等一下 62 | Dialogue: 0,00:02:19.30,00:02:21.35,Dial_CH,,0,0,0,,这点小事你如实地说出来就好嘛 63 | Dialogue: 0,00:02:21.81,00:02:23.85,Dial_CH,,0,0,0,,其实是 非常开心的 64 | Dialogue: 0,00:02:24.27,00:02:26.48,Dial_CH,,0,0,0,,是吧 鸣也是这么想的吧 65 | Dialogue: 0,00:02:29.90,00:02:30.86,Dial_CH,,0,0,0,,我… 66 | Dialogue: 0,00:02:32.44,00:02:35.49,Dial_CH,,0,0,0,,已经不喜欢了 67 | Dialogue: 0,00:03:27.92,00:03:33.75,EDCN,,0,0,0,,{\fad(250,250)\blur3\3c&HD15CAE&\4c&H712158&}樱花拂风 风摇长发之时 68 | Dialogue: 0,00:03:34.05,00:03:37.17,EDCN,,0,0,0,,{\fad(250,250)\blur3\3c&HD15CAE&\4c&H712158&}藏不住小小叹息 69 | Dialogue: 0,00:03:37.67,00:03:40.51,EDCN,,0,0,0,,{\fad(250,250)\blur3\3c&HD15CAE&\4c&H712158&}一人行走在归途 70 | Dialogue: 0,00:03:41.05,00:03:47.10,EDCN,,0,0,0,,{\fad(250,250)\blur3\3c&HD15CAE&\4c&H712158&}在失去我梦想方向的那个夜晚 71 | Dialogue: 0,00:03:47.23,00:03:50.35,EDCN,,0,0,0,,{\fad(250,250)\blur3\3c&HD15CAE&\4c&H712158&}即使逞强安慰自己 72 | Dialogue: 0,00:03:50.48,00:03:53.82,EDCN,,0,0,0,,{\fad(250,250)\blur3\3c&HD15CAE&\4c&H712158&\t(500,2000,\3c&HD6A977&\4c&H7D5323&)}仍然无法弥补心中的空缺 73 | Dialogue: 0,00:03:53.98,00:03:57.24,EDCN,,0,0,0,,{\fad(250,250)\blur3}听我说 请听我说 月亮婆婆 74 | Dialogue: 0,00:03:57.28,00:04:00.41,EDCN,,0,0,0,,{\fad(250,250)\blur3}抬起头忍住眼泪 75 | Dialogue: 0,00:04:00.57,00:04:07.54,EDCN,,0,0,0,,{\fad(250,250)\blur3}泪光下那无数日夜的尽头 终有你身影 76 | Dialogue: 0,00:04:08.33,00:04:15.25,EDCN,,0,0,0,,{\fad(250,250)\blur3\t(350,650,\3c&H8F89EC&\4c&H37318E&)}如花飘 肩负花雪之歌 童话中的一国公主 77 | Dialogue: 0,00:04:15.55,00:04:21.47,EDCN,,0,0,0,,{\fad(250,250)\blur3\3c&H8F89EC&\4c&H37318E&}无暇般飞舞的长袖中飘落的花瓣 那众多的轨迹 78 | Dialogue: 0,00:04:21.51,00:04:28.52,EDCN,,0,0,0,,{\fad(250,250)\blur3\3c&H8F89EC&\4c&H37318E&\t(6200,6400,\3c&HD6A977&\4c&H7D5323&)}在伸出手的掌心中 是那曾经梦想的旋律 79 | Dialogue: 0,00:04:28.68,00:04:35.19,EDCN,,0,0,0,,{\fad(250,250)\blur3}心愿交织 愿永远如此 愿持续永恒 80 | --------------------------------------------------------------------------------