├── .gitignore
├── AdjustLrc.py
├── AdjustSrt.py
├── README.md
├── Sample
    └── sampleLrc.lrc
└── assets
    ├── a238efc8e01a2e5add3f61aca7ac7e9.jpg
    └── a62c7ecbd2822d514a87ec97eb95555.jpg


/.gitignore:
--------------------------------------------------------------------------------
1 | config.json
2 | Sample/*


--------------------------------------------------------------------------------
/AdjustLrc.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import os
  3 | import zhipuai
  4 | import json
  5 | import requests
  6 | 
if_need_LLM_add_punctuation = True
# Whether an LLM should be called to add punctuation to the text before it is
# processed; mainly intended for Chinese transcripts that have no punctuation.
# When True, the API credentials read below must be present in config.json.
# If if_need_LLM_add_punctuation is True, load the API settings from config.json
# (opened relative to the current working directory).
if if_need_LLM_add_punctuation:
    with open("config.json", "r") as f:
        config = json.load(f)
        zhipuai_api_key = config["zhipuai_api_key"]
        openai_api_key =config["openai_api_key"]
        openai_api_url = config["openai_api_url"]
        deepseek_api_key = config["deepseek_api_key"]
 17 | 
 18 | # 智谱，垃圾
 19 | def add_punctuation_zhipuai(inputText):
 20 |     zhipuai.api_key = zhipuai_api_key
 21 |     response = zhipuai.model_api.invoke(
 22 |         model="chatglm_std",
 23 |         prompt=[
 24 |             {"role": "user", "content": f"{inputText}\n\n这是一个音频的转文字结果，以 .lrc 格式存储。请结合语义和上下文为每句话添加标点符号，禁止删除或新增时间点，只需返回修改后的 lrc 文本。"},
 25 |         ],
 26 |         temperature=0.9, # 值越小结果越稳定
 27 |         top_p = 0.7 
 28 |     )
 29 |     # Sample response:{'code': 200, 'msg': '操作成功', 'data': {'request_id': '7941314437787463250', 'task_id': '7941314437787463250', 'task_status': 'SUCCESS', 'choices': [{'role': 'assistant', 'content': '" 说你们想听哪些历史人物？"'}], 'usage': {'total_tokens': 9}}, 'success': True}
 30 |     # 直接打印 choices 中的第一个 content
 31 |     # print(response)
 32 |     # 检验是否成功
 33 |     if response["success"] == False:
 34 |         print("请求失败")
 35 |         return None
 36 |     else:
 37 |         print("智谱 AI Token 数量：{}，花费{}元".format(response["data"]["usage"]["total_tokens"], response["data"]["usage"]["total_tokens"]/1000*0.002))
 38 |         outputText = response["data"]["choices"][0]["content"]
 39 |         # 检查 outputText 最外层是否有双引号，如果有则去掉
 40 |         if outputText.startswith('"') and outputText.endswith('"'):
 41 |             outputText = outputText[1:-1]
 42 |         return outputText
 43 | 
 44 | # 调用 gpt3.5 为输入的文本添加标点符号。使用自定义的 api url。函数功能和 add_punctuation_zhipuai 相同。
 45 | def add_punctuation_openai(inputText):
 46 |     headers = {
 47 |         "Content-Type": "application/json",
 48 |         "Authorization": f"Bearer {openai_api_key}"
 49 |     }
 50 | 
 51 |     data = {
 52 |         "model": "gpt-3.5-turbo",
 53 |         "messages": [
 54 |             {"role": "system", "content": "你是为音频转录生成的 LRC 格式字幕添加标点符号的专家。保留原始单词，仅插入必要的标点符号，例如句号、逗号、大写字母、美元符号或百分号等符号以及格式。如果结合下一行判断此行无需添加标点，则可以不添加标点。仅使用提供的上下文，返回添加标点后的 LRC 格式字幕文本"},
 55 |             {"role": "user", "content": f"{inputText}"}
 56 |         ],
 57 |         "frequency_penalty":0,
 58 |         "presence_penalty":0,
 59 |         "temperature":0.6,
 60 |         "top_p":1
 61 |     }
 62 | 
 63 |     response = requests.post(f"{openai_api_url}/v1/chat/completions", headers=headers, json=data)
 64 |     response_data = response.json()
 65 | 
 66 |     if "choices" in response_data and len(response_data["choices"]) > 0:
 67 |         # $0.0005 为 prompt_tokens 的单价，$0.0015 为 completion_tokens 的单价,根据单价计算本次请求的总花费：prompt_tokens*0.0015 + completion_tokens*0.002
 68 |         print("openai api Token 数量：{}，花费{}元".format(response_data["usage"]["total_tokens"], response_data["usage"]["prompt_tokens"]/1000*0.0005*7.2 + response_data["usage"]["completion_tokens"]/1000*0.0015*7.2))
 69 |         outputText = response_data["choices"][0]["message"]["content"]
 70 |         print('##outputText:\n', outputText)
 71 |         return outputText
 72 |     else:
 73 |         print("response_data", response_data)
 74 |         # 抛出错误
 75 |         raise Exception("openai api 返回的数据不正确")
 76 | 
 77 | # 使用 deepseek api 添加标点 
 78 | def add_punctuation_deepseek(inputText):
 79 |     # https://platform.deepseek.com/api-docs/
 80 |     headers = {
 81 |         "Content-Type": "application/json",
 82 |         "Authorization": f"Bearer {deepseek_api_key}"
 83 |     }
 84 | 
 85 |     data = {
 86 |         "model": "deepseek-chat",
 87 |         "messages": [
 88 |             {"role": "system", "content": "你是为音频转录生成的 LRC 格式字幕添加标点符号的专家。保留原始单词，仅插入必要的标点符号，例如句号、逗号、大写字母、美元符号或百分号等符号以及格式。如果结合下一行判断此行无需添加标点，则可以不添加标点。仅使用提供的上下文，返回添加标点后的 LRC 格式字幕文本"},
 89 |             {"role": "user", "content": f"{inputText}"}
 90 |         ],
 91 |         "temperature":0.6
 92 |     }
 93 | 
 94 |     response = requests.post("https://api.deepseek.com/chat/completions", headers=headers, json=data)
 95 |     response_data = response.json()
 96 | 
 97 |     if "choices" in response_data and len(response_data["choices"]) > 0:
 98 |         outputText = response_data["choices"][0]["message"]["content"]
 99 |         print('##outputText:\n', outputText)
100 |         return outputText
101 |     else:
102 |         print("response_data", response_data)
103 |         # 抛出错误
104 |         raise Exception("deepseek api 返回的数据不正确")
105 |     
# Add punctuation to subtitle text with one of the supported LLM backends
# (Zhipu AI, OpenAI-compatible, or DeepSeek).
def add_punctuation_service(captionList,service='deepseek'):
    """Punctuate a list of LRC lines by sending them to an LLM in chunks.

    The lines are grouped into newline-joined chunks whose length stays within
    ``max_text_length_per_request`` characters (a single line longer than the
    limit still gets a chunk of its own), each chunk is sent to the selected
    backend, and the replies are joined with newlines.

    Args:
        captionList: list of LRC lines (one subtitle per item).
        service: backend name: ``'zhipuai'``, ``'openai'`` or ``'deepseek'``.

    Returns:
        The punctuated text of all chunks joined by ``'\\n'``; an empty string
        when ``captionList`` is empty (no request is made in that case).

    Raises:
        ValueError: if ``service`` is not one of the supported names.
        Exception: if a backend returns ``None`` for a chunk.
    """
    if service not in ('zhipuai', 'openai', 'deepseek'):
        # Previously an unknown name surfaced later as an UnboundLocalError.
        raise ValueError(f"unsupported punctuation service: {service!r}")

    # Maximum characters per request (the original note estimates this as
    # max_token / 2.5 for Chinese text).
    max_text_length_per_request = 1500

    # Build the chunks. When adding a line would exceed the limit, the current
    # chunk is closed and the line STARTS the next chunk, so no line is lost.
    text_list = []
    current_lines = []
    current_length = 0
    for caption in captionList:
        caption_length = len(caption) + 1  # +1 for the joining newline
        if current_lines and current_length + caption_length > max_text_length_per_request:
            text_list.append('\n'.join(current_lines))
            current_lines = []
            current_length = 0
        current_lines.append(caption)
        current_length += caption_length
    if current_lines:
        text_list.append('\n'.join(current_lines))

    # Send every chunk to the chosen backend and collect the replies in order.
    new_text_list = []
    for text in text_list:
        if service == 'zhipuai':
            new_text = add_punctuation_zhipuai(text)
        elif service == 'openai':
            new_text = add_punctuation_openai(text)
        else:
            new_text = add_punctuation_deepseek(text)
        if new_text is None:
            # add_punctuation_zhipuai returns None on a failed request;
            # fail clearly instead of crashing inside str.join below.
            raise Exception(f"{service} returned no text for a chunk")
        new_text_list.append(new_text)

    new_text = '\n'.join(new_text_list)
    print('文本处理后：',new_text)
    return new_text
133 | 
def generate_output_file_path(lrc_file):
    """Return the path of the punctuated copy of ``lrc_file``.

    The result lives in the same directory and inserts ``_add_punctuation``
    before the file extension (``a.lrc`` -> ``a_add_punctuation.lrc``). The
    extension is located with ``os.path.splitext`` so that only the real
    suffix is touched and the result never equals the input path, whatever
    the extension's spelling or case.

    Args:
        lrc_file: path of the source LRC file.

    Returns:
        The output path; it is also printed.
    """
    directory = os.path.dirname(lrc_file)
    stem, extension = os.path.splitext(os.path.basename(lrc_file))
    output_file = os.path.join(directory, f"{stem}_add_punctuation{extension}")
    print(output_file)
    return output_file
142 | 
def lrc2srt_file_path(lrc_file):
    """Return the ``.srt`` path that corresponds to ``lrc_file``.

    The file keeps its directory and base name; only the final extension is
    replaced with ``.srt``. ``os.path.splitext`` is used so that a ``.lrc``
    occurring elsewhere in the name is left alone and a differently spelled
    extension (e.g. ``.LRC``) cannot make the result equal to the input path.

    Args:
        lrc_file: path of the LRC file.

    Returns:
        The SRT path; it is also printed.
    """
    directory = os.path.dirname(lrc_file)
    stem, _extension = os.path.splitext(os.path.basename(lrc_file))
    srt_file = os.path.join(directory, f"{stem}.srt")
    print(srt_file)
    return srt_file
151 | 
# Helper for the LRC -> SRT conversion: parse one LRC timestamp.
def parse_lrc_timestamp(timestamp):
    """Convert an LRC timestamp string to seconds.

    Accepts ``mm:ss.fff`` and ``hh:mm:ss.fff``. The fractional part is scaled
    by its own number of digits, so the 2-digit hundredths form common in LRC
    files (``00:12.34`` -> 12.34 s) and the 3-digit millisecond form
    (``00:12.340``) both parse correctly.

    Args:
        timestamp: timestamp text without the surrounding brackets.

    Returns:
        The time in seconds as a float, or ``None`` if the text does not have
        two or three colon-separated parts, lacks a single ``.`` fraction, or
        contains a non-integer field.
    """
    try:
        parts = timestamp.split(':')
        if len(parts) == 3:
            hours, minutes, seconds = parts
        elif len(parts) == 2:
            hours = 0
            minutes, seconds = parts
        else:
            return None

        # Unpacking raises ValueError when there is not exactly one '.'.
        seconds, fraction = seconds.split('.')
        hours, minutes, seconds = int(hours), int(minutes), int(seconds)
        # Scale by the digit count: "34" -> 0.34, "400" -> 0.4.
        fraction_seconds = int(fraction) / 10 ** len(fraction)
        return hours * 3600 + minutes * 60 + seconds + fraction_seconds
    except ValueError:
        return None
169 | 
def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds first and then decomposed with
    integer arithmetic, so the fractional part is preserved and a value such
    as 59.9996 carries cleanly into ``00:01:00,000``.

    Args:
        seconds: non-negative time in seconds (int or float).

    Returns:
        The formatted timestamp string.
    """
    total_milliseconds = int(round(seconds * 1000))
    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
    total_minutes, whole_seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
176 | 
def lrc_to_srt(lrc_content):
    """Convert LRC subtitle text to SRT text.

    Every ``[timestamp]text`` line becomes one SRT cue. A cue ends where the
    next LRC line starts; the last cue (or one whose successor cannot be
    parsed) lasts one second. Cues are numbered from 1 and separated by a
    blank line, as the SRT format requires.

    Args:
        lrc_content: full text of an LRC file.

    Returns:
        The SRT text, or an empty string if no LRC line was found.
    """
    pattern = r'\[(\d+:\d+:\d+\.\d+|\d+:\d+\.\d+)\](.*)'
    matches = re.findall(pattern, lrc_content)
    subs = []
    for idx, (timestamp, text) in enumerate(matches):
        start_time = parse_lrc_timestamp(timestamp)
        if start_time is None:
            continue
        end_time = None
        if idx + 1 < len(matches):
            end_time = parse_lrc_timestamp(matches[idx + 1][0])
        if end_time is None:
            # No usable successor: fall back to a one-second cue.
            end_time = start_time + 1
        subs.append(f"{len(subs) + 1}\n{format_time(start_time)} --> {format_time(end_time)}\n{text.strip()}\n")
    # Each cue already ends with a newline; joining with another newline
    # yields the blank separator line between cues.
    return '\n'.join(subs)
188 | 
# Entry point: read the LRC file, (optionally) punctuate it through
# add_punctuation_service, write the result to a new LRC file, and convert
# that result to SRT.
def main():
    """Copy the configured LRC file to its output path and write an SRT version.

    The LLM punctuation step is currently commented out, so the LRC text is
    written to the output file unchanged.
    """
    lrc_file = "D:/我的坚果云/中国城夜总会剿匪记.lrc"
    # LLM_service = 'deepseek'
    LLM_service = 'openai'

    # Load the LRC text and split it into one subtitle per list item.
    with open(lrc_file, "r", encoding="utf-8") as source:
        lrc = source.read()
        captionList = lrc.split("\n")
    # print("captionList", captionList)
    # newCaption = add_punctuation_service(captionList,LLM_service)
    newCaption = lrc

    # Write the (possibly punctuated) LRC text next to the source file.
    output_file = generate_output_file_path(lrc_file)
    with open(output_file, "w", encoding="utf-8") as target:
        target.write(newCaption)

    # Convert to SRT and store it under the matching .srt name.
    srt = lrc_to_srt(newCaption)
    srt_path = lrc2srt_file_path(output_file)
    with open(srt_path, "w", encoding="utf-8") as target:
        target.write(srt)
211 | 
212 | if __name__ == "__main__":
213 |     main()


--------------------------------------------------------------------------------
/AdjustSrt.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import re
  4 | 
  5 | import requests
  6 | import zhipuai
  7 | 
# Minimum number of characters per subtitle line; only used when adjust_mode is '3'.
# Settings for English transcripts:
# min_length = 120
# max_length = 220
# if_need_spilt = True
# if_need_spilt: whether to first split subtitles at non-comma punctuation. The
# timestamps are divided in proportion to character length, so they are not
# necessarily accurate. Experimental; recommended when many consecutive
# subtitles do not end with punctuation (mainly for punctuated English text).
# Settings used for the Chinese transcription test:
min_length = 120
max_length = 220
if_need_spilt = False

srt_file = 'D:/我的坚果云/中国城夜总会剿匪记_add_punctuation.srt' 
# Connector inserted between merged texts: a space for English, nothing for Chinese.
text_connector = ''

adjust_mode = '3'
# How the subtitles are adjusted:
# 1: merge sentences that were broken across lines;
# 2: on top of 1, make sure every line ends with something other than a comma;
# 3: on top of 1-2, keep the length of every line between min_length and max_length.
 28 | 
 29 | 
def main():
    """Run the adjustment on the module-level ``srt_file`` using ``adjust_mode``."""
    adjust_srt_file(srt_file, adjust_mode)
 32 | 
 33 | def generate_output_file_path(srt_file):
 34 |     # 生成 output_file 的路径和文件名
 35 |     directory = os.path.dirname(srt_file)
 36 |     filename = os.path.basename(srt_file)
 37 |     new_filename = filename.replace('.srt', '_adjusted.srt')
 38 |     output_file = os.path.join(directory, new_filename)
 39 |     print(output_file)
 40 |     return output_file
 41 | 
 42 | #读取srt字幕，按空行分割成不同的组，返回一个列表，列表中的每个元素是一个组，组内包含编号、时间段和字幕文本
 43 | def read_and_split_srt_file(srt_file):
 44 |     # 读取srt文件内容并将内容按空行分割成不同的组
 45 |     with open(srt_file, 'r',encoding="utf-8") as file:
 46 |         content = file.read()
 47 |     old_groups = re.split(r'\n\s*\n', content)
 48 |     # 去除 old_groups 中的空字符串
 49 |     old_groups = [group for group in old_groups if group != '']
 50 |     return old_groups
 51 | 
 52 | # 拆分每行字幕。如果一行字幕内容里含有非逗号的标点，则拆分成两行字幕。同时时间戳按照开头到标点的长度占总长度的比例进行拆分。
 53 | def split_srt_content(old_groups):
 54 |     # 用于记录当前该处理 old_groups 中的第几个组
 55 |     current_number = 0
 56 |     # 用于在 new_groups 中的索引
 57 |     index = 0
 58 |     new_groups = []
 59 | 
 60 |     for i in range(len(old_groups)):
 61 |         # 如果 i 不等于 current_number 就跳过
 62 |         if(i == current_number):
 63 |             # 提取编号、时间段和字幕文本
 64 |             group = old_groups[i].strip().split('\n')
 65 | 
 66 |             if len(group) >= 3:
 67 |                 time_range = group[1]
 68 |                 text = ' '.join(group[2:])
 69 |                 # 检查当前字幕是否含有非逗号的标点，如果有则拆分成两行字幕。text part 取小于 1 的数字，保证不会取到最后一个标点。
 70 |                 text_part = text[:int(len(text)*4/5)]
 71 |                 if re.search(r'[^\w,，\s]', text_part):
 72 |                     # 获取当前字幕最后一个标点符号的索引
 73 |                     last_punctuation_index = re.search(r'[^\w,，\s]', text_part).span()[1]
 74 |                     # print(i, last_punctuation_index)
 75 |                     # 根据最后一个标点将当前字幕拆分成两行字幕
 76 |                     text_1 = text[:last_punctuation_index]
 77 |                     text_2 = text[last_punctuation_index:]
 78 |                     # 计算原字幕的持续时间，然后根据拆分后的两行字幕的长度占原字幕长度的比例，计算出两行字幕的持续时间。
 79 |                     time_range_1, time_range_2 = split_time_range(time_range, len(text_1) / len(text))
 80 |                     new_groups.append(f"{index}\n{time_range_1}\n{text_1}")
 81 |                     new_groups.append(f"{index+1}\n{time_range_2}\n{text_2}")
 82 |                     current_number += 1
 83 |                     index += 2
 84 |                 else:
 85 |                     new_groups.append(f"{index}\n{time_range}\n{text}")
 86 |                     current_number += 1
 87 |                     index += 1
 88 |     return new_groups
 89 | 
 90 | # 拆分时间段的函数，输入原时间段和拆分比例，返回拆分后的时间段。中间过程精确到毫秒。
 91 | def split_time_range(time_range, split_ratio):
 92 |     # 获取原时间段的开始时间和结束时间
 93 |     time_range_start = re.search(r'(\d{2}:\d{2}:\d{2},\d{3})', time_range).group(1)
 94 |     time_range_end = re.findall(r'(\d{2}:\d{2}:\d{2},\d{3})', time_range)[1]
 95 |     # 计算原时间段的持续时间，然后根据拆分后的两行字幕的长度占原字幕长度的比例，计算出两行字幕的持续时间。此处需要注意 00:06:33,319 中有毫秒。
 96 |     time_range_start_seconds = int(time_range_start.split(':')[0]) * 3600 + int(time_range_start.split(':')[1]) * 60 + int(time_range_start.split(':')[2].split(',')[0]) + int(time_range_start.split(':')[2].split(',')[1]) / 1000
 97 |     time_range_end_seconds = int(time_range_end.split(':')[0]) * 3600 + int(time_range_end.split(':')[1]) * 60 + int(time_range_end.split(':')[2].split(',')[0]) + int(time_range_end.split(':')[2].split(',')[1]) / 1000
 98 |     # 计算出拆分的时间，转换为 00:00:00,000 的格式
 99 |     split_seconds = (time_range_end_seconds - time_range_start_seconds) * split_ratio
100 |     split_time = time_range_start_seconds + split_seconds
101 |     split_time_hour = int(split_time / 3600)
102 |     split_time_minute = int((split_time - split_time_hour * 3600) / 60)
103 |     split_time_second = int(split_time - split_time_hour * 3600 - split_time_minute * 60)
104 |     split_time_millisecond = int((split_time - split_time_hour * 3600 - split_time_minute * 60 - split_time_second) * 1000)
105 |     split_time_range = f"{split_time_hour:02d}:{split_time_minute:02d}:{split_time_second:02d},{split_time_millisecond:03d}"
106 |     # 返回两个拼接后的时间端 time_range_start -> split_time_range 和 split_time_range -> time_range_end
107 |     return f"{time_range_start} --> {split_time_range}", f"{split_time_range} --> {time_range_end}"
108 | 
109 | 
# Merge sentences that were broken across several cues into one cue.
def adjust_srt_content(old_groups):
    """Merge each cue that does not end with punctuation into its successors.

    A cue whose text does not end with one of the recognised punctuation
    marks absorbs the following cues (joined with the module-level
    ``text_connector``) until the merged text ends with punctuation or the
    input is exhausted. The merged cue runs from the start of the first cue
    to the end of the last absorbed cue. The final cue of the file can be
    absorbed, and a trailing unpunctuated cue keeps its own time range.
    Groups with fewer than three lines are skipped.

    Args:
        old_groups: list of raw cue strings.

    Returns:
        A new list of cue strings, renumbered from 0.
    """
    ending_punctuation = ('.', ',', ':', ';', '?', '!', '，', '。', '：', '；', '？', '！')
    timestamp_pattern = r'(\d{2}:\d{2}:\d{2},\d{3})'
    total = len(old_groups)

    new_groups = []
    index = 0            # cue number to assign next in new_groups
    current_number = 0   # position of the next unprocessed group in old_groups

    while current_number < total:
        i = current_number
        # Extract the index line, the time range and the subtitle text.
        group = old_groups[i].strip().split('\n')
        if len(group) < 3:
            current_number += 1
            continue

        time_range = group[1]
        text = ' '.join(group[2:])
        print(i,text)

        # move_times counts the groups consumed by this cue (itself included).
        move_times = 1
        while not text.endswith(ending_punctuation) and i + move_times < total:
            next_text = ' '.join(old_groups[i + move_times].strip().split('\n')[2:])
            text = f"{text}{text_connector}{next_text}"
            move_times += 1

        if move_times == 1:
            new_time_range = time_range
        else:
            print(i, move_times)
            # Start of the first cue, end of the last absorbed cue.
            time_range_start = re.search(timestamp_pattern, time_range).group(1)
            next_time_range_end = re.findall(timestamp_pattern, old_groups[i + move_times - 1])[1]
            new_time_range = f"{time_range_start} --> {next_time_range_end}"

        new_groups.append(f"{index}\n{new_time_range}\n{text}")
        index += 1
        current_number += move_times
    return new_groups
153 | 
# Make sure no cue ends with a comma; long runs of comma-terminated lines are
# awkward (they show up with Whisper's repetition hallucinations).
# Merging stops once the merged text would exceed max_length.
def adjust_srt_content_end_with_no_comma(old_groups):
    """Merge cues that end with a comma into the cues that follow them.

    A cue whose text ends with ``,`` or ``，`` absorbs the following cues
    (joined with the module-level ``text_connector``) until the text no
    longer ends with a comma, the input is exhausted, or adding the next cue
    would make the text longer than the module-level ``max_length``. The
    length check uses the character length of the next cue's text. The final
    cue of the file can be absorbed, and a trailing comma-terminated cue
    keeps its own time range. Groups with fewer than three lines are skipped.

    Args:
        old_groups: list of raw cue strings.

    Returns:
        A new list of cue strings, renumbered from 0.
    """
    timestamp_pattern = r'(\d{2}:\d{2}:\d{2},\d{3})'
    total = len(old_groups)

    new_groups = []
    index = 0            # cue number to assign next in new_groups
    current_number = 0   # position of the next unprocessed group in old_groups

    while current_number < total:
        i = current_number
        # Extract the index line, the time range and the subtitle text.
        group = old_groups[i].strip().split('\n')
        if len(group) < 3:
            current_number += 1
            continue

        time_range = group[1]
        text = ' '.join(group[2:])

        # move_times counts the groups consumed by this cue (itself included).
        move_times = 1
        while text.endswith((',', '，')) and i + move_times < total:
            next_text = ' '.join(old_groups[i + move_times].strip().split('\n')[2:])
            # Stop merging when the result would exceed max_length.
            if len(text) + len(text_connector) + len(next_text) > max_length:
                break
            text = f"{text}{text_connector}{next_text}"
            move_times += 1

        if move_times == 1:
            new_time_range = time_range
        else:
            # Start of the first cue, end of the last absorbed cue.
            time_range_start = re.search(timestamp_pattern, time_range).group(1)
            next_time_range_end = re.findall(timestamp_pattern, old_groups[i + move_times - 1])[1]
            new_time_range = f"{time_range_start} --> {next_time_range_end}"

        new_groups.append(f"{index}\n{new_time_range}\n{text}")
        index += 1
        current_number += move_times
    return new_groups
199 | 
# Keep the length of every cue between min_length and max_length.
def adjust_srt_content_with_min_max(old_groups):
    """Merge short cues until they reach ``min_length`` characters.

    A cue shorter than the module-level ``min_length`` absorbs the following
    cues until it is long enough, the input is exhausted, or adding the next
    cue would exceed the module-level ``max_length``. Texts are joined with
    the module-level ``text_connector``, like the other merge functions in
    this file. Cues that are already long enough are copied unchanged (cues
    longer than ``max_length`` are not split). Groups with fewer than three
    lines are skipped.

    Args:
        old_groups: list of raw cue strings.

    Returns:
        A new list of cue strings, renumbered from 0.
    """
    timestamp_pattern = r'(\d{2}:\d{2}:\d{2},\d{3})'
    total = len(old_groups)

    new_groups = []
    index = 0            # cue number to assign next in new_groups
    current_number = 0   # position of the next unprocessed group in old_groups

    print('len(old_groups)', total)
    while current_number < total:
        i = current_number
        # Extract the index line, the time range and the subtitle text.
        group = old_groups[i].strip().split('\n')
        if len(group) < 3:
            current_number += 1
            continue

        time_range = group[1]
        text = ' '.join(group[2:])
        print(i,text)

        if len(text) >= min_length:
            # Long enough already: keep the cue as it is.
            new_groups.append(f"{index}\n{time_range}\n{text}")
            index += 1
            current_number += 1
            continue

        # move_times counts the groups consumed by this cue (itself included).
        move_times = 1
        while len(text) < min_length:
            if i + move_times >= total:
                print('out of range,len(old_groups):', total)
                break
            next_text = ' '.join(old_groups[i + move_times].strip().split('\n')[2:])
            print("next_text:",next_text)
            # Stop merging when the result would exceed max_length.
            if len(text) + len(text_connector) + len(next_text) > max_length:
                break
            text = f"{text}{text_connector}{next_text}"
            move_times += 1
        print("current_number:",current_number,"move_times:", move_times,"len(text):",len(text),"\n\n")

        # Start of the first cue, end of the last absorbed cue.
        time_range_start = re.search(timestamp_pattern, time_range).group(1)
        next_time_range_end = re.findall(timestamp_pattern, old_groups[i + move_times - 1])[1]
        new_time_range = f"{time_range_start} --> {next_time_range_end}"

        new_groups.append(f"{index}\n{new_time_range}\n{text}")
        index += 1
        current_number += move_times
    return new_groups
248 | 
def write_new_srt_file(output_file, new_groups):
    """Write the cue strings to ``output_file`` as UTF-8, separated by blank lines.

    Args:
        output_file: destination path; an existing file is overwritten.
        new_groups: list of cue strings (index, time range and text each).
    """
    with open(output_file, 'w', encoding="utf-8") as target:
        # One blank line between consecutive cues, none after the last one.
        srt_text = '\n\n'.join(new_groups)
        target.write(srt_text)
253 | 
254 | 
255 | 
def adjust_srt_file(srt_file, adjust_mode):
    """Adjust an SRT file and write the result next to it.

    Steps: read the cues; optionally split them at non-comma punctuation
    (module-level ``if_need_spilt``); always merge broken sentences; then,
    depending on ``adjust_mode``, remove comma endings ('2') or remove comma
    endings and enforce the min/max length ('3'). Any other mode stops after
    the sentence merge.

    Args:
        srt_file: path of the SRT file to adjust.
        adjust_mode: '1', '2' or '3' (string).
    """
    groups = read_and_split_srt_file(srt_file)
    if if_need_spilt:
        print('adjust_mode is 0，先根据非逗号拆分行')
        groups = split_srt_content(groups)

    # Merging broken sentences is the baseline step for every mode.
    adjusted = adjust_srt_content(groups)

    if adjust_mode == '3':
        print('adjust_mode is 3,min_length is ', min_length, 'max_length is ', max_length)
        # First remove comma endings, then enforce the min/max line length.
        adjusted = adjust_srt_content_end_with_no_comma(adjusted)
        adjusted = adjust_srt_content_with_min_max(adjusted)
    elif adjust_mode == '2':
        print('adjust_mode is 2，保证每行以非逗号结尾')
        adjusted = adjust_srt_content_end_with_no_comma(adjusted)

    destination = generate_output_file_path(srt_file)
    write_new_srt_file(destination, adjusted)
273 | 
274 | if __name__ == '__main__':
275 |     main()
276 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## 背景
 2 | 
 3 | 在音频转文字的准确率上，OpenAI 的 Whisper + Prompt 远超一般的模型服务(飞书妙记、通义听悟、讯飞...)，尤其是在专业名词较多的领域。
 4 | 
 5 | 但 Whisper 的转写断句能力较差(Large v2 模型)，经常会出现一句话被分隔成两行字幕的情况。
 6 | 
 7 | 根据 [openai-cookbook/examples/Whisper_prompting_guide.ipynb at main · openai/openai-cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb) 所言，Whisper 的 Prompt 和 GPT 的 Prompt 并不完全一致——即 Whisper 的 prompt 并不理解 Prompt 所蕴含的指令含义，它只是在**模仿 Prompt 的语言风格和词汇拼写**。
 8 | 
 9 | 所以通过 Prompt 很难完美解决 Whisper 转写句子被拆分的问题。
10 | 
11 | 目前思考这个问题有两种解决方案：
12 | 
13 | 1.  上传 srt 给 claude，让其返回合并句子后的 srt 文件。
14 | 2.  通过代码按某些规则来合并 srt 文件中的句子。
15 | 
16 | 1 是一种比较简单且理想的方案，因为大模型对语义有理解分析，其断句分段把握的比较合适。
17 | 
18 | 但 1 的问题在于大模型有字符长度限制，实测超过 30 min 的转录文本进行合并分段会超出字符限制。除此之外，这种方案还有等待时间较长，可能会出现幻觉等问题。
19 | 
20 | 我日常转录的是 3h 的课程录音，所以我选择采用方案 2 做断句修正。
21 | 
22 | ## 实现策略
23 | 
24 | 方案 2 的原理上比较简单：
25 | 
26 | > 判断当前行结尾是否有标点符号，如果没有则和下一组字幕文本合并。
27 | 
28 | ## 脚本使用
29 | 
30 | Clone 到本地后安装依赖，每次修改 `srt_file` 的路径即可使用。
31 | 
32 | 默认修改的字幕文件和源字幕文件在同一个文件夹之内，文件名会加 `_adjusted` 后缀。
33 | 
34 | 目前有三种模式：
35 | 
36 | - 1 为合并被断行的句子，
37 | 
38 | - 2 在 1 的基础上，保证每行以非逗号结尾
39 | 
40 | - 3 在 1-2 的基础上，保证每一行的字数不小于目标值(需要修改代码头部 min_length 和 max_length)
41 | 
42 | ![未处理前](https://gcore.jsdelivr.net/gh/zj1123581321/Adjust_SubTitle/assets/a238efc8e01a2e5add3f61aca7ac7e9.jpg)
43 | 
44 | ![处理后](https://github.com/zj1123581321/Adjust_SubTitle/raw/main/assets/a62c7ecbd2822d514a87ec97eb95555.jpg)


--------------------------------------------------------------------------------
/Sample/sampleLrc.lrc:
--------------------------------------------------------------------------------
  1 | [00:00.400]刚创业那会儿
  2 | [00:01.240]我非常害怕跟别人谈商业合作
  3 | [00:03.680]第一是因为自己确实没什么实力
  4 | [00:05.639]第二是因为对方总是显得那么从容
  5 | [00:07.919]气场死死地压制着我
  6 | [00:09.720]他们话里话外动不动就几百万用户
  7 | [00:12.000]几千万生意
  8 | [00:12.919]各种跟他们合作过的明星网红的名字
  9 | [00:14.960]往外蹦
 10 | [00:15.439]我连话都插不上
 11 | [00:16.760]一般遇上这种特别能吹逼的人
 12 | [00:18.559]我第一感觉就是很不靠谱
 13 | [00:20.440]但同时呢
 14 | [00:21.239]我又会怀疑自己
 15 | [00:22.480]可能他真的很牛
 16 | [00:23.440]只是我境界太低
 17 | [00:24.600]看不懂听不懂
 18 | [00:25.839]我猜
 19 | [00:26.359]淡粉内向且不善言辞的人
 20 | [00:28.239]应该都有过这类纠结
 21 | [00:29.719]一方面反感这种华而不实的人
 22 | [00:32.079]一方面又希望自己也能拥有这份自信
 23 | [00:35.159]那么有没有什么办法
 24 | [00:36.280]能迅速判断出一个人的实力
 25 | [00:38.479]以避免被忽悠呢
 26 | [00:39.719]有没有什么办法
 27 | [00:40.600]能迅速美化和展现自己的实力
 28 | [00:42.759]在别人面前呈现一个自信的自己
 29 | [00:45.200]作为一个在商业世界打拼7年的奸商
 30 | [00:47.640]这方面我还是有一丢丢心得的
 31 | [00:49.679]今天抛砖引玉
 32 | [00:51.039]分享一下我的经历
 33 | [00:52.640]首先作为一个小公司的老板
 34 | [00:57.399]我来说一个老板们都心照不宣的共识
 35 | [01:00.000]没有哪个老板是不吹牛的
 36 | [01:02.439]只不过有的老板比较豪放
 37 | [01:04.120]100分的实力吹出1000分的效果
 38 | [01:06.439]而有的老板比较收敛
 39 | [01:07.719]100分的实力吹出120分的效果
 40 | [01:10.120]创业圈基础操作就是
 41 | [01:11.640]公司用户500万会吹成千万用户
 42 | [01:13.879]融资1000万估值5000万
 43 | [01:15.599]首期融资到账200万
 44 | [01:17.159]会直接吹成公司估值一个亿
 45 | [01:19.159]让别人觉得是在跟一个
 46 | [01:20.439]身家过亿的人说话
 47 | [01:21.879]当老板呢
 48 | [01:22.519]为了一个数字能更好看
 49 | [01:23.760]都非常擅长四舍五入
 50 | [01:25.920]包括我自己
 51 | [01:26.799]我在往期视频说
 52 | [01:27.719]我之前带领过百人销售团队
 53 | [01:29.879]其实这个是有水分的
 54 | [01:31.439]我团队最高峰人数是85人左右
 55 | [01:33.840]我吹成了100人
 56 | [01:35.120]因为百人销售团队听起来更厉害是吧
 57 | [01:37.480]85人就没那么好听了
 58 | [01:39.400]再比如我发这个B站视频的时候
 59 | [01:41.319]B站大概48万粉
 60 | [01:42.920]等它到了50万粉
 61 | [01:44.280]我就能对外吹嘘说
 62 | [01:45.560]在下不才
 63 | [01:46.640]B站萌新up主粉丝大几十万吧
 64 | [01:49.280]实际上如果我再大胆一些
 65 | [01:50.879]我把我公众号7万粉加
 66 | [01:52.400]某乎的13万粉全都算上
 67 | [01:54.280]我就可以说
 68 | [01:55.280]现在我全网粉丝接近百万
 69 | [01:57.519]当然我不会针对外这么说
 70 | [01:59.079]我只是向大家展示
 71 | [02:00.319]这个牛逼一般是怎么吹起来的
 72 | [02:02.079]而且我可以告诉你
 73 | [02:03.040]像我这种六七十万吹成百万
 74 | [02:05.040]在商人的圈子里
 75 | [02:06.120]横向对比已经算是
 76 | [02:07.239]非常良心和低调的吹逼了
 77 | [02:09.759]我平常进的一些社群里面
 78 | [02:11.400]大佬们动不动就全网一亿粉丝
 79 | [02:13.879]我也不敢问这一亿粉丝是怎么算出来的
 80 | [02:15.919]可能人家真有那么厉害吧
 81 | [02:17.280]我说这也只是想让你意识到
 82 | [02:18.879]老板们都是吹牛逼的
 83 | [02:20.199]不用因为他吹牛逼
 84 | [02:21.479]就立马判断这个人不靠谱
 85 | [02:23.159]哪个老板没给员工画过大饼呢
 86 | [02:24.960]是吧
 87 | [02:25.280]老板不光给员工画大饼
 88 | [02:26.879]最擅长给自己画大饼
 89 | [02:28.560]一个大目标先立起来
 90 | [02:29.919]甭管能不能完成
 91 | [02:31.159]放倒那儿就是一个吉利
 92 | [02:32.680]创业初期是很需要这种
 93 | [02:34.240]盲目的乐观主义精神的
 94 | [02:35.719]是需要给自己洗洗脑的
 95 | [02:37.560]不然一些坎真的过不去
 96 | [02:39.159]所以不要以是否吹牛逼画大饼
 97 | [02:41.159]判断一个人的实力
 98 | [02:42.120]有句话叫做不要看他说了什么
 99 | [02:43.960]而要看他做了什么
100 | [02:45.319]我们可以通过一个人动作的细节
101 | [02:47.360]来判断他的实力


--------------------------------------------------------------------------------
/assets/a238efc8e01a2e5add3f61aca7ac7e9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zj1123581321/Adjust_SubTitle/3a5d4d3e759d4d98cd35a7274b96651c43b5f350/assets/a238efc8e01a2e5add3f61aca7ac7e9.jpg


--------------------------------------------------------------------------------
/assets/a62c7ecbd2822d514a87ec97eb95555.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zj1123581321/Adjust_SubTitle/3a5d4d3e759d4d98cd35a7274b96651c43b5f350/assets/a62c7ecbd2822d514a87ec97eb95555.jpg


--------------------------------------------------------------------------------