├── LICENSE
├── README.md
├── ban.txt
├── clean_list.py
├── input
    └── example.srt
├── merge.py
├── pack.py
└── split.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 AliceNavigator
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Alice_split_toolset
Split audio according to its .srt subtitle file, clean up the transcriptions, then merge and package the result into a fairly standard dataset format suitable for Bert-VITS2.   
 3 | 使用.srt文件分割音频并清洗标注，合并封装至适用于bert-vits2的一个较为标准的格式
 4 | 
 5 | ### usage 使用
 6 | - 将同名wav和srt文件放入input，依照顺序执行，更多详细参数见-h
- Place the wav and srt files that share the same base name into the 'input' folder, run the scripts below in order, and pass -h to any script to see its detailed parameters.
 8 | ```bash
 9 | python split.py --mono
10 | python clean_list.py --filter_english
11 | python merge.py
12 | python pack.py baki
13 | ```
14 | 


--------------------------------------------------------------------------------
/ban.txt:
--------------------------------------------------------------------------------
1 | 啧


--------------------------------------------------------------------------------
/clean_list.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | import re
 4 | 
 5 | 
 6 | def process_mapping(mapping_path, filter_english, ban_file):
 7 |     with open(mapping_path, 'r', encoding='utf-8') as file:
 8 |         lines = file.readlines()
 9 | 
10 |     banned_phrases = []
11 |     if ban_file and os.path.exists(ban_file):
12 |         with open(ban_file, 'r', encoding='utf-8') as bf:
13 |             banned_phrases = [line.strip() for line in bf.readlines()]
14 | 
15 |     clean_mapping = []
16 | 
17 |     for line in lines:
18 |         filename, text = line.strip().split("|")
19 | 
20 |         if filter_english and re.search(r"[a-zA-Z]", text):
21 |             print(f'drop non-kanji text : {text}')
22 |             continue
23 | 
24 |         if any(ban_phrase in text for ban_phrase in banned_phrases):
25 |             print(f'drop ban text : {text}')
26 |             continue
27 | 
28 |         clean_mapping.append(line)
29 | 
30 |     with open(f'{mapping_path[:-12]}/clean_mapping.list', 'w', encoding='utf-8') as file:
31 |         for line in clean_mapping:
32 |             file.write(line)
33 | 
34 | 
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process and clean mapping.list based on criteria")
    parser.add_argument("--filter_english", action="store_true", default=False, help="Remove entries with English text")
    parser.add_argument("--ban_file", default="ban.txt", help="Path to file with banned phrases")
    args = parser.parse_args()

    output_root = "output"
    # Only the immediate sub-folders of output/ are project folders. A
    # recursive walk would also yield nested folder names and resolve them
    # against output/ itself, pointing at paths that do not exist.
    if os.path.isdir(output_root):
        for folder in sorted(os.listdir(output_root)):
            mapping_path = os.path.join(output_root, folder, "mapping.list")
            if not os.path.isfile(mapping_path):
                # e.g. split.py run with --use_subtitle_as_name writes no mapping.list
                print(f'skip {folder} : no mapping.list found')
                continue
            process_mapping(mapping_path, args.filter_english, args.ban_file)
44 | 


--------------------------------------------------------------------------------
/input/example.srt:
--------------------------------------------------------------------------------
1 | 1
2 | 00:00:00,200 --> 00:00:01,200
3 | 直播搞好
4 | 
5 | 2
6 | 00:00:01,566 --> 00:00:03,866
7 | 先要把我推流搞定


--------------------------------------------------------------------------------
/merge.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | from pydub import AudioSegment
 4 | from tqdm import tqdm
 5 | import shutil
 6 | 
 7 | def merge_segments(mapping_path, folder_name, max_length):
 8 |     merge_path = f"merge/{folder_name}"
 9 |     if not os.path.exists(merge_path):
10 |         os.makedirs(merge_path)
11 |     else:
12 |         print(f'检测到{merge_path}已存在，执行删除')
13 |         shutil.rmtree(merge_path)
14 |         os.makedirs(merge_path)
15 | 
16 |     with open(mapping_path, 'r', encoding='utf-8') as file:
17 |         lines = file.readlines()
18 | 
19 |     segments_to_merge = []
20 |     current_text_length = 0
21 |     new_mapping = []
22 | 
23 |     for line in tqdm(lines, desc=f"Processing {folder_name}", unit="line"):
24 |         filename, text = line.strip().split("|")
25 |         current_text_length += len(text)
26 | 
27 |         segments_to_merge.append((filename, text))
28 | 
29 |         if current_text_length > max_length:
30 |             merged_audio = AudioSegment.empty()
31 |             merged_text = []
32 | 
33 |             for seg_file, seg_text in segments_to_merge:
34 |                 audio_path = os.path.join(os.path.dirname(mapping_path), seg_file)
35 |                 segment_audio = AudioSegment.from_wav(audio_path)
36 |                 merged_audio += segment_audio
37 |                 merged_text.append(seg_text)
38 | 
39 |             merged_filename = f"{segments_to_merge[0][0]}_to_{segments_to_merge[-1][0]}"
40 |             merged_audio.export(os.path.join(merge_path, merged_filename), format="wav")
41 |             new_mapping.append(f"{merged_filename}|{','.join(merged_text)}")
42 | 
43 |             segments_to_merge = []
44 |             current_text_length = 0
45 | 
46 |     if segments_to_merge:
47 |         merged_audio = AudioSegment.empty()
48 |         merged_text = []
49 | 
50 |         for seg_file, seg_text in segments_to_merge:
51 |             audio_path = os.path.join(os.path.dirname(mapping_path), seg_file)
52 |             segment_audio = AudioSegment.from_wav(audio_path)
53 |             merged_audio += segment_audio
54 |             merged_text.append(seg_text)
55 | 
56 |         merged_filename = f"{segments_to_merge[0][0]}_to_{segments_to_merge[-1][0]}"
57 |         merged_audio.export(os.path.join(merge_path, merged_filename), format="wav")
58 |         new_mapping.append(f"{merged_filename}|{' '.join(merged_text)}")
59 | 
60 |     with open(os.path.join(merge_path, "new_mapping.list"), 'w', encoding='utf-8') as file:
61 |         for line in new_mapping:
62 |             file.write(line + "\n")
63 | 
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Merge short segments from mapping.list")
    parser.add_argument("--max", type=int, default=20, help="Maximum text length for a segment")
    args = parser.parse_args()

    output_root = "output"
    # Only the immediate sub-folders of output/ are project folders. A
    # recursive walk would also yield nested folder names and resolve them
    # against output/ itself, pointing at paths that do not exist.
    folders = sorted(os.listdir(output_root)) if os.path.isdir(output_root) else []
    for folder in tqdm(folders, desc="Merging folders", unit="folder"):
        mapping_path = os.path.join(output_root, folder, "clean_mapping.list")
        if not os.path.isfile(mapping_path):
            # Nothing to merge until clean_list.py has produced clean_mapping.list.
            print(f'skip {folder} : no clean_mapping.list found')
            continue
        merge_segments(mapping_path, folder, args.max)
72 | 


--------------------------------------------------------------------------------
/pack.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import shutil
 4 | from shutil import copyfile
 5 | from tqdm import tqdm
 6 | 
 7 | 
 8 | def process_and_rename(character_name):
 9 |     dataset_path = f"./dataset/{character_name}"
10 |     if not os.path.exists(dataset_path):
11 |         os.makedirs(dataset_path)
12 |     else:
13 |         print(f'检测到"{dataset_path}" 已存在，执行删除')
14 |         shutil.rmtree(dataset_path)
15 |         os.makedirs(dataset_path)
16 | 
17 |     counter = 1
18 | 
19 |     all_folders = [folder for r, d, f in os.walk('./merge') for folder in d]
20 |     for folder in tqdm(all_folders, desc="Processing folders", unit="folder"):
21 |         mapping_path = os.path.join('./merge', folder, "new_mapping.list")
22 |         if os.path.exists(mapping_path):
23 |             with open(mapping_path, 'r', encoding='utf-8') as file:
24 |                 lines = file.readlines()
25 | 
26 |             for line in tqdm(lines, desc=f"Processing files in {folder}", unit="file", leave=False):
27 |                 old_filename, text = line.strip().split("|")
28 |                 old_filepath = os.path.join('./merge', folder, old_filename)
29 | 
30 |                 new_filename = f"{character_name}_{counter}.wav"
31 |                 new_filepath = os.path.join(dataset_path, new_filename)
32 |                 new_mapping_entry = f"./dataset/{character_name}/{new_filename}|{character_name}|ZH|{text}"
33 | 
34 |                 # Copy and rename the file
35 |                 copyfile(old_filepath, new_filepath)
36 | 
37 |                 with open(os.path.join(dataset_path, "dataset_mapping.list"), 'a', encoding='utf-8') as dataset_file:
38 |                     dataset_file.write(new_mapping_entry + "\n")
39 | 
40 |                 counter += 1
41 | 
42 | 
if __name__ == "__main__":
    # Command-line entry point: the only argument is the character name that
    # the dataset folder and its files are named after.
    cli = argparse.ArgumentParser(description="Rename and restructure files based on character_name")
    cli.add_argument("character_name", type=str, help="Name of the character for restructuring")
    options = cli.parse_args()

    process_and_rename(options.character_name)
49 | 


--------------------------------------------------------------------------------
/split.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import shutil
 4 | 
 5 | from pydub import AudioSegment
 6 | from tqdm import tqdm
 7 | 
 8 | 
 9 | def time_to_milliseconds(time_str):
10 |     h, m, s = map(float, time_str.split(":"))
11 |     return int(h * 3600000 + m * 60000 + s * 1000)
12 | 
13 | 
def sanitize_filename(filename):
    """Make ``filename`` safe to use as a file name on Windows.

    Every character that Windows does not allow in a file name is replaced by
    an underscore, and the result is cut to 247 characters so that a suffix
    and an index can still be appended afterwards.
    """
    forbidden = '<>:"/\\|?*'
    cleaned = "".join("_" if ch in forbidden else ch for ch in filename)
    return cleaned[:247]
20 | 
21 | 
def split_wav_by_srt(srt_path, wav_path, output_folder, sample_rate, mono, use_subtitle_as_name):
    """Cut one WAV file into per-subtitle clips according to an SRT file.

    The clips are written to ``<output_folder>/<project>``, where ``<project>``
    is the WAV file name without its extension. Only that project folder is
    reset when it already exists, so clips of other projects that share the
    same ``output_folder`` are preserved.

    Unless ``use_subtitle_as_name`` is set, clips are named
    ``<start_ms>_<end_ms>.wav`` and a ``mapping.list`` with one
    ``<filename>|<subtitle>`` line per clip is written into the project folder.

    Args:
        srt_path: Path of the UTF-8 encoded SRT file.
        wav_path: Path of the WAV file the subtitles belong to.
        output_folder: Root folder that receives the project folder.
        sample_rate: Frame rate applied to every clip; skipped when falsy.
        mono: When true, every clip is converted to a single channel.
        use_subtitle_as_name: When true, clips are named after their
            (sanitized) subtitle text and no ``mapping.list`` is written.
    """
    prj_name = os.path.splitext(os.path.basename(wav_path))[0]
    project_folder = os.path.join(output_folder, prj_name)

    # Reset this project's folder only. Wiping the whole output folder would
    # delete the clips of every project processed earlier in the same run.
    if os.path.exists(project_folder):
        print(f'检测到"{project_folder}" 已存在，执行删除')
        shutil.rmtree(project_folder)
    os.makedirs(project_folder)

    with open(srt_path, 'r', encoding='utf-8') as file:
        content = file.read()

    audio = AudioSegment.from_wav(wav_path)
    mapping = []

    blocks = content.strip().split("\n\n")
    for block in tqdm(blocks, desc=f"Processing {prj_name}"):
        # Stripping tolerates extra blank lines between subtitle blocks.
        lines = block.strip().split("\n")
        if len(lines) < 2 or "-->" not in lines[1]:
            if block.strip():
                print(f'skip malformed subtitle block : {block.strip()!r}')
            continue

        # lines[0] is the subtitle index, lines[1] the timing, the rest the text.
        times = lines[1].split("-->")
        start_time, end_time = [time_to_milliseconds(t.strip().replace(",", ".")) for t in times]
        subtitle = " ".join(lines[2:])

        segment = audio[start_time:end_time]

        if mono:
            segment = segment.set_channels(1)

        if sample_rate:
            segment = segment.set_frame_rate(sample_rate)

        if use_subtitle_as_name:
            base_name = sanitize_filename(subtitle)
            filename = base_name + ".wav"
            idx = 1
            # Identical subtitles get a numeric suffix instead of overwriting each other.
            while os.path.exists(os.path.join(project_folder, filename)):
                filename = base_name + f"_{idx}.wav"
                idx += 1
        else:
            filename = f"{start_time}_{end_time}.wav"
            mapping.append(f"{filename}|{subtitle}")

        # "-sample_fmt s16" is forwarded to the exporter as an extra encoder option.
        segment.export(os.path.join(project_folder, filename), format="wav", parameters=["-sample_fmt", "s16"])

    if not use_subtitle_as_name:
        # The project folder was freshly created above, so this always starts a new file.
        with open(os.path.join(project_folder, "mapping.list"), "w", encoding="utf-8") as f:
            for line in mapping:
                f.write(line + "\n")
70 | 
71 | 
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Split WAVs based on SRT timings in a folder")
    parser.add_argument("--input_folder", type=str, default="input", help="Path to the input folder containing SRT and WAV files")
    parser.add_argument("--output_folder", type=str, default="output", help="Output folder path")
    parser.add_argument("--sample_rate", type=int, default=44100, help="Sample rate for output WAVs")
    parser.add_argument("--mono", action="store_true", help="Convert to mono")
    parser.add_argument("--use_subtitle_as_name", action="store_true", help="Use subtitle as filename")

    args = parser.parse_args()

    srt_suffix = ".srt"
    for root, dirs, files in os.walk(args.input_folder):
        available = set(files)
        for file in files:
            if not file.endswith(srt_suffix):
                continue

            # Swap only the trailing extension; str.replace would also rewrite
            # a ".srt" that happens to appear earlier in the file name.
            wav_file = file[:-len(srt_suffix)] + ".wav"
            if wav_file in available:
                split_wav_by_srt(os.path.join(root, file), os.path.join(root, wav_file), args.output_folder,
                                 args.sample_rate, args.mono, args.use_subtitle_as_name)
89 | 


--------------------------------------------------------------------------------