├── README.md ├── img └── img.png └── top └── lukeewin ├── app.py ├── download_model.py └── startup.bat /README.md: -------------------------------------------------------------------------------- 1 | # 0. 效果演示 2 | ![img.png](img/img.png) \ 3 | 演示音频分割功能可以访问 https://www.bilibili.com/video/BV1oxrcYuELK \ 4 | 演示视频分割功能可以访问 https://www.bilibili.com/video/BV1xYweeKEvZ \ 5 | 如果是不懂代码的人想要使用本项目,可以使用我打包好的程序,我是在Windows 11系统上打包的,不确定Windows其它版本是否能用,如果是非Windows系统,请使用源码方式运行。 \ 6 | [点击这里跳转到打包好的可执行程序](https://item.taobao.com/item.htm?ft=t&id=853452834970) 7 | # 1. 说明 8 | 这是基于开源的 FunASR 实现的说话人分离的 GUI 项目,可以在支持图形界面中的任意 PC 端运行 \ 9 | 要求 python version >= 3.8 \ 10 | 支持运行在 Windows、MacOS、Linux 系统 \ 11 | 本项目适合个人电脑使用,如果要在生产服务器中部署,并且需要并发处理,可到我博客中联系我 12 | # 2. 开发日志 13 | 2023-11-14 对选择的多个音频分离不同的人声 \ 14 | 2024-01-04 保存每个说话人对应的内容 \ 15 | 2024-01-09 增加合并相同说话人功能 \ 16 | 2024-01-22 增加视频切片功能 \ 17 | 2024-02-25 新增允许控制每个音频片段切割的字符数 18 | # 3. 安装 19 | 执行下面命令来安装依赖 20 | ```shell 21 | pip install -U funasr modelscope ffmpeg-python pydub 22 | ``` 23 | 此外还需要安装 torch,可以到 torch 官方中根据自己电脑情况安装不同版本的 torch \ 24 | 安装 ffmpeg,可以到 github 中搜索 ffmpeg,下载解压后,配置环境变量 \ 25 | 如果不会安装 torch 和 ffmpeg,可以参考我之前发布到博客中的一篇[文章](https://blog.lukeewin.top/archives/windows-an-zhuang-whisper#toc-head-1)。 26 | # 4. 功能 27 | 1. 支持对指定的单个或者多个音频中不同的说话人讲的话进行分离,分别归类到不同的目录中 28 | 2. 保存每个说话人对应的包含时间戳的文本内容 29 | 3. 支持视频切片,根据说话人声音进行视频切片 30 | 31 | # 5. 模型下载 32 | 执行下面程序,会自动下载模型到当前用户 .cache/modelscope/hub/models/iic/ 目录中 33 | ```shell 34 | python download_model.py 35 | ``` 36 | # 6. 联系 37 | 可以添加交流群 746213237 \ 38 | 个人技术分享博客:https://blog.lukeewin.top \ 39 | 如果是小白,不懂代码,可以[点击这里](https://item.taobao.com/item.htm?ft=t&id=853452834970) 40 | 41 | -------------------------------------------------------------------------------- /img/img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukeewin/AudioSeparationGUI/e0c0925d350eca2b5dee9ae66c52625638491986/img/img.png -------------------------------------------------------------------------------- /top/lukeewin/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import threading 3 | import tkinter as tk 4 | import queue 5 | from datetime import timedelta, datetime 6 | from pydub import AudioSegment 7 | import ffmpeg 8 | from tkinter import filedialog, messagebox 9 | from funasr import AutoModel 10 | 11 | spk_txt_queue = queue.Queue() 12 | 13 | # 创建窗口 14 | root = tk.Tk() 15 | root.title("说话人分离 https://blog.lukeewin.top") 16 | 17 | # 获取屏幕宽度和高度 18 | screen_width = root.winfo_screenwidth() 19 | screen_height = root.winfo_screenheight() 20 | 21 | # 设置窗口大小 22 | window_width = 400 23 | window_height = 200 24 | 25 | # 计算居中位置 26 | x_coordinate = (screen_width // 2) - (window_width // 2) 27 | y_coordinate = (screen_height // 2) - (window_height // 2) 28 | 29 | # 设置窗口大小和位置 30 | root.geometry(f"{window_width}x{window_height}+{x_coordinate}+{y_coordinate}") 31 | 32 | home_directory = os.path.expanduser("~") 33 | asr_model_path = os.path.join(home_directory, ".cache", "modelscope", "hub", "models", "iic", "speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch") 34 | asr_model_revision = "v2.0.4" 35 | vad_model_path = os.path.join(home_directory, ".cache", "modelscope", "hub", "models", "iic", "speech_fsmn_vad_zh-cn-16k-common-pytorch") 36 | vad_model_revision = "v2.0.4" 37 | punc_model_path = os.path.join(home_directory, ".cache", "modelscope", "hub", "models", "iic", "punc_ct-transformer_zh-cn-common-vocab272727-pytorch") 38 | punc_model_revision = "v2.0.4" 39 | spk_model_path = os.path.join(home_directory, ".cache", "modelscope", "hub", "models", "iic", "speech_campplus_sv_zh-cn_16k-common") 40 | spk_model_revision = "v2.0.4" 41 | ngpu = 1 42 | device = "cuda" 43 | ncpu = 4 44 | 45 | # ASR 模型 46 | model = AutoModel(model=asr_model_path, 47 | model_revision=asr_model_revision, 48 | vad_model=vad_model_path, 49 | vad_model_revision=vad_model_revision, 50 | punc_model=punc_model_path, 51 | punc_model_revision=punc_model_revision, 52 | spk_model=spk_model_path, 53 | spk_model_revision = spk_model_revision, 54 | ngpu=ngpu, 55 | ncpu=ncpu, 56 | device=device, 57 | disable_pbar=True, 58 | disable_log=True, 59 | disable_update=True 60 | ) 61 | 62 | # 创建一个队列,用于线程间通信 63 | result_queue = queue.Queue() 64 | # 音频合并队列 65 | audio_concat_queue = queue.Queue() 66 | 67 | # 支持的音视频格式 68 | support_audio_format = ['.mp3', '.m4a', '.aac', '.ogg', '.wav', '.flac', '.wma', '.aif'] 69 | support_video_format = ['.mp4', '.avi', '.mov', '.mkv'] 70 | 71 | input_frame = tk.Frame(root) 72 | input_frame.pack(side=tk.TOP, padx=10, pady=2) 73 | output_frame = tk.Frame(root) 74 | output_frame.pack(side=tk.TOP, padx=10, pady=2) 75 | start_trans_frame = tk.Frame(root) 76 | start_trans_frame.pack(side=tk.TOP, padx=10, pady=2) 77 | show_frame = tk.Frame(root) 78 | show_frame.pack(side=tk.TOP,padx=10, pady=2) 79 | 80 | selected_file_list = [] 81 | # 选择需要分离的音频 82 | def select_multi_file(): 83 | selected_file_list.clear() 84 | selected_files = filedialog.askopenfilenames(title='选择多个文件', filetypes=[('音频文件', '*.mp3 *.wav *.ogg *.flac *.aac *.m4a *.aif *.wma'), ('视频文件', '*.mp4 *.avi *.mov *.mkv')]) 85 | selected_file_count = len(selected_files) 86 | for tmp_file in selected_files: 87 | selected_file_list.append(tmp_file) 88 | print(f"选择的音频或视频:{tmp_file}") 89 | show_input_info.config(text=f"已选择 {selected_file_count} 个文件") 90 | select_input_file_button = tk.Button(input_frame, text='选择音视频', command=select_multi_file) 91 | select_input_file_button.pack(side=tk.LEFT, padx=10, pady=2) 92 | show_input_info = tk.Label(input_frame, text='') 93 | show_input_info.pack(side=tk.LEFT, padx=10, pady=2) 94 | 95 | # 指定转写后的保存路径 96 | output_label = tk.Label(output_frame, text="保存路径") 97 | output_label.pack(side=tk.LEFT, padx=10, pady=2) 98 | 99 | save_path = tk.StringVar(None) 100 | # 指定保存路径 101 | def save_dir(): 102 | save_directory = filedialog.askdirectory(title='选择保存路径') 103 | if save_directory: 104 | save_path.set(save_directory) 105 | output_label.config(text=save_directory) 106 | tk.Button(output_frame, text='选择保存目录', command=save_dir).pack(side=tk.LEFT, padx=10, pady=2) 107 | 108 | def copy_output_path(): 109 | # 获取label中的文本内容 110 | text_to_copy = output_label.cget("text") 111 | # 清空剪贴板 112 | root.clipboard_clear() 113 | # 将文本内容添加到剪贴板 114 | root.clipboard_append(text_to_copy) 115 | 116 | # 复制 117 | copy_button = tk.Button(output_frame, text="复制路径", command=copy_output_path) 118 | copy_button.pack(side=tk.RIGHT, padx=10, pady=2) 119 | 120 | # 分离字数 121 | split_number = tk.Entry(start_trans_frame, width=2) 122 | split_number.insert(0, str(10)) 123 | split_number.pack(side=tk.LEFT, padx=5, pady=2) 124 | 125 | def to_date(milliseconds): 126 | """将时间戳转换为SRT格式的时间""" 127 | time_obj = timedelta(milliseconds=milliseconds) 128 | return f"{time_obj.seconds // 3600:02d}:{(time_obj.seconds // 60) % 60:02d}:{time_obj.seconds % 60:02d}.{time_obj.microseconds // 1000:03d}" 129 | 130 | 131 | def to_milliseconds(time_str): 132 | time_obj = datetime.strptime(time_str, "%H:%M:%S.%f") 133 | time_delta = time_obj - datetime(1900, 1, 1) 134 | milliseconds = int(time_delta.total_seconds() * 1000) 135 | return milliseconds 136 | 137 | # 转写获取时间戳,根据时间戳进行切分,然后根据 spk id 进行分类 138 | # audio: 音频 139 | # return 切分后按照 spk id 的地址 140 | def trans(): 141 | if len(selected_file_list) != 0 and save_path.get() != '' and save_path.get() is not None: 142 | for audio in selected_file_list: 143 | if os.path.exists(audio): 144 | audio_name = os.path.splitext(os.path.basename(audio))[0] 145 | _, audio_extension = os.path.splitext(audio) 146 | show_info_label.config(text=f'正在执行中,请勿关闭程序。{audio}') 147 | speaker_audios = {} # 每个说话人作为 key,value 为列表,列表中为当前说话人对应的每个音频片段 148 | # 音频预处理 149 | try: 150 | audio_bytes, _ = ( 151 | ffmpeg.input(audio, threads=0, hwaccel='cuda') 152 | .output("-", format="wav", acodec="pcm_s16le", ac=1, ar=16000) 153 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 154 | ) 155 | res = model.generate(input=audio_bytes, batch_size_s=300, is_final=True, sentence_timestamp=True) 156 | rec_result = res[0] 157 | asr_result_text = rec_result['text'] 158 | if asr_result_text != '': 159 | sentences = [] 160 | for sentence in rec_result["sentence_info"]: 161 | start = to_date(sentence["start"]) 162 | end = to_date(sentence["end"]) 163 | if sentences and sentence["spk"] == sentences[-1]["spk"] and len(sentences[-1]["text"]) < int(split_number.get()): 164 | sentences[-1]["text"] += "" + sentence["text"] 165 | sentences[-1]["end"] = end 166 | else: 167 | sentences.append( 168 | {"text": sentence["text"], "start": start, "end": end, "spk": sentence["spk"]} 169 | ) 170 | 171 | # 剪切音频或视频片段 172 | i = 0 173 | for stn in sentences: 174 | stn_txt = stn['text'] 175 | start = stn['start'] 176 | end = stn['end'] 177 | # tmp_start = to_milliseconds(start) 178 | # tmp_end = to_milliseconds(end) 179 | # duration = round((tmp_end - tmp_start) / 1000, 3) 180 | spk = stn['spk'] 181 | 182 | # 根据文件名和 spk 创建目录 183 | date = datetime.now().strftime("%Y-%m-%d") 184 | final_save_path = os.path.join(save_path.get(), date, audio_name, str(spk)) 185 | os.makedirs(final_save_path, exist_ok=True) 186 | # 获取音视频后缀 187 | file_ext = os.path.splitext(audio)[-1] 188 | final_save_file = os.path.join(final_save_path, str(i)+file_ext) 189 | spk_txt_path = os.path.join(save_path.get(), date, audio_name) 190 | spk_txt_file = os.path.join(spk_txt_path, f'spk{spk}.txt') 191 | spk_txt_queue.put({'spk_txt_file': spk_txt_file, 'spk_txt': stn_txt, 'start': start, 'end': end}) 192 | i += 1 193 | try: 194 | if file_ext in support_audio_format: 195 | ( 196 | ffmpeg.input(audio, threads=0, ss=start, to=end, hwaccel='cuda') 197 | .output(final_save_file) 198 | .run(cmd=["ffmpeg", "-nostdin"], overwrite_output=True, capture_stdout=True, 199 | capture_stderr=True) 200 | ) 201 | elif file_ext in support_video_format: 202 | final_save_file = os.path.join(final_save_path, str(i)+'.mp4') 203 | ( 204 | ffmpeg.input(audio, threads=0, ss=start, to=end, hwaccel='cuda') 205 | .output(final_save_file, vcodec='libx264', crf=23, acodec='aac', ab='128k') 206 | .run(cmd=["ffmpeg", "-nostdin"], overwrite_output=True, capture_stdout=True, 207 | capture_stderr=True) 208 | ) 209 | else: 210 | print(f'{audio}不支持') 211 | except ffmpeg.Error as e: 212 | print(f"剪切音频发生错误,错误信息:{e}") 213 | # 记录说话人和对应的音频片段,用于合并音频片段 214 | if spk not in speaker_audios: 215 | speaker_audios[spk] = [] # 列表中存储音频片段 216 | speaker_audios[spk].append({'file': final_save_file, 'audio_name': audio_name}) 217 | ret = {"text": asr_result_text, "sentences": sentences} 218 | print(f'{audio} 切分完成') 219 | result_queue.put(f'{audio} 切分完成') 220 | show_info_label.config(text=f'{audio} 切分完成') 221 | print(f'转写结果:{ret}') 222 | # 存入合并队列 223 | audio_concat_queue.put(speaker_audios) 224 | else: 225 | print("没有转写结果") 226 | except Exception as e: 227 | print(f"转写异常:{e}") 228 | else: 229 | print("输入的文件不存在") 230 | messagebox.showinfo("提醒", "输入的文件不存在") 231 | else: 232 | print("没有填写输入输出") 233 | messagebox.showinfo("提醒", "没有填写选择文件或保存路径") 234 | 235 | 236 | def start_transcription_thread(): 237 | # 创建并启动转写线程 238 | thread = threading.Thread(target=trans) 239 | thread.start() 240 | 241 | 242 | btn_start = tk.Button(start_trans_frame, text="分离", command=start_transcription_thread) 243 | btn_start.pack(side=tk.LEFT, padx=10, pady=2) 244 | 245 | # 显示分离情况 246 | show_info_label = tk.Label(show_frame, text="") 247 | show_info_label.pack(side=tk.LEFT, padx=10, pady=2) 248 | 249 | 250 | def show_info(): 251 | res = result_queue.get() 252 | show_info_label.config(text=res) 253 | 254 | 255 | threading.Thread(target=show_info).start() 256 | 257 | 258 | def write_txt(): 259 | while True: 260 | item = spk_txt_queue.get() 261 | spk_txt_file = item['spk_txt_file'] 262 | spk_txt = item['spk_txt'] 263 | spk_start = item['start'] 264 | spk_end = item['end'] 265 | dir_path = os.path.dirname(spk_txt_file) 266 | os.makedirs(dir_path, exist_ok=True) 267 | with open(spk_txt_file, 'a', encoding='utf-8') as f: 268 | f.write(f"{spk_start} --> {spk_end}\n{spk_txt}\n\n") 269 | 270 | 271 | threading.Thread(target=write_txt).start() 272 | 273 | 274 | def audio_concat_worker(): 275 | while True: 276 | speaker_audios_tmp = audio_concat_queue.get() 277 | for spk, audio_segments in speaker_audios_tmp.items(): 278 | # 合并每个说话人的音频片段 279 | audio_name = audio_segments[0]['audio_name'] 280 | output_file = os.path.join(save_path.get(), datetime.now().strftime("%Y-%m-%d"), audio_name, f"{spk}.mp3") 281 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 282 | inputs = [seg['file'] for seg in audio_segments] 283 | concat_audio = AudioSegment.from_file(inputs[0]) 284 | for i in range(1, len(inputs)): 285 | concat_audio = concat_audio + AudioSegment.from_file(inputs[i]) 286 | concat_audio.export(output_file, format="mp3") 287 | print(f"已将 {spk} 的音频合并到 {output_file}") 288 | audio_concat_queue.task_done() 289 | 290 | 291 | # 创建一个线程用于消费音频合并队列中的内容 292 | audio_concat_thread = threading.Thread(target=audio_concat_worker) 293 | audio_concat_thread.daemon = True 294 | audio_concat_thread.start() 295 | 296 | 297 | if __name__ in '__main__': 298 | print("项目源码:https://github.com/lukeewin/AudioSeparationGUI") 299 | root.mainloop() 300 | -------------------------------------------------------------------------------- /top/lukeewin/download_model.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # _*_ coding utf-8 _*_ 3 | # @Time: 2025/1/4 17:05 4 | # @Author: Luke Ewin 5 | # @Blog: https://blog.lukeewin.top 6 | from modelscope import snapshot_download 7 | snapshot_download('iic/speech_campplus_sv_zh-cn_16k-common') 8 | snapshot_download('iic/speech_fsmn_vad_zh-cn-16k-common-pytorch') 9 | snapshot_download('iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch') 10 | snapshot_download('iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch') 11 | -------------------------------------------------------------------------------- /top/lukeewin/startup.bat: -------------------------------------------------------------------------------- 1 | call "D:\Software\office\python\Anaconda\Scripts\activate.bat" "D:\Software\office\python\Anaconda" 2 | call conda activate funasr 3 | python app.py 4 | pause --------------------------------------------------------------------------------