├── README.md
├── img
    └── img.png
└── top
    └── lukeewin
        ├── app.py
        ├── download_model.py
        └── startup.bat


/README.md:
--------------------------------------------------------------------------------
 1 | # 0. 效果演示
 2 | ![img.png](img/img.png) \
 3 | 演示音频分割功能可以访问 https://www.bilibili.com/video/BV1oxrcYuELK \
 4 | 演示视频分割功能可以访问 https://www.bilibili.com/video/BV1xYweeKEvZ \
 5 | 如果是不懂代码的人想要使用本项目，可以使用我打包好的程序，我是在Windows 11系统上打包的，不确定Windows其它版本是否能用，如果是非Windows系统，请使用源码方式运行。 \
 6 | [点击这里跳转到打包好的可执行程序](https://item.taobao.com/item.htm?ft=t&id=853452834970)
 7 | # 1. 说明
 8 | 这是基于开源的 FunASR 实现的说话人分离 GUI 项目，可以在任意支持图形界面的 PC 上运行 \
 9 | 要求 python version >= 3.8 \
10 | 支持运行在 Windows、MacOS、Linux 系统 \
11 | 本项目适合个人电脑使用，如果要在生产服务器中部署，并且需要并发处理，可到我博客中联系我
12 | # 2. 开发日志
13 | 2023-11-14 对选择的多个音频分离不同的人声 \
14 | 2024-01-04 保存每个说话人对应的内容 \
15 | 2024-01-09 增加合并相同说话人功能 \
16 | 2024-01-22 增加视频切片功能 \
17 | 2024-02-25 新增允许控制每个音频片段切割的字符数
18 | # 3. 安装
19 | 执行下面命令来安装依赖
20 | ```shell
21 | pip install -U funasr modelscope ffmpeg-python pydub
22 | ```
23 | 此外还需要安装 torch，可以到 PyTorch 官网根据自己电脑的情况选择并安装对应版本的 torch \
24 | 安装 ffmpeg，可以到 github 中搜索 ffmpeg，下载解压后，配置环境变量 \
25 | 如果不会安装 torch 和 ffmpeg，可以参考我之前发布到博客中的一篇[文章](https://blog.lukeewin.top/archives/windows-an-zhuang-whisper#toc-head-1)。
26 | # 4. 功能
27 | 1. 支持对指定的单个或者多个音频中不同的说话人讲的话进行分离，分别归类到不同的目录中
28 | 2. 保存每个说话人对应的包含时间戳的文本内容
29 | 3. 支持视频切片，根据说话人声音进行视频切片 
30 | 
31 | # 5. 模型下载
32 | 执行下面的程序，会自动下载模型到当前用户主目录下的 .cache/modelscope/hub/models/iic/ 目录中
33 | ```shell
34 | python download_model.py
35 | ```
36 | # 6. 联系
37 | 可以添加交流群 746213237 \
38 | 个人技术分享博客：https://blog.lukeewin.top \
39 | 如果是小白，不懂代码，可以[点击这里](https://item.taobao.com/item.htm?ft=t&id=853452834970)
40 | 
41 | 


--------------------------------------------------------------------------------
/img/img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukeewin/AudioSeparationGUI/e0c0925d350eca2b5dee9ae66c52625638491986/img/img.png


--------------------------------------------------------------------------------
/top/lukeewin/app.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import threading
  3 | import tkinter as tk
  4 | import queue
  5 | from datetime import timedelta, datetime
  6 | from pydub import AudioSegment
  7 | import ffmpeg
  8 | from tkinter import filedialog, messagebox
  9 | from funasr import AutoModel
 10 | 
 11 | spk_txt_queue = queue.Queue()
 12 | 
 13 | # 创建窗口
 14 | root = tk.Tk()
 15 | root.title("说话人分离 https://blog.lukeewin.top")
 16 | 
 17 | # 获取屏幕宽度和高度
 18 | screen_width = root.winfo_screenwidth()
 19 | screen_height = root.winfo_screenheight()
 20 | 
 21 | # 设置窗口大小
 22 | window_width = 400
 23 | window_height = 200
 24 | 
 25 | # 计算居中位置
 26 | x_coordinate = (screen_width // 2) - (window_width // 2)
 27 | y_coordinate = (screen_height // 2) - (window_height // 2)
 28 | 
 29 | # 设置窗口大小和位置
 30 | root.geometry(f"{window_width}x{window_height}+{x_coordinate}+{y_coordinate}")
 31 | 
 32 | home_directory = os.path.expanduser("~")
 33 | asr_model_path = os.path.join(home_directory, ".cache", "modelscope", "hub", "models", "iic", "speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
 34 | asr_model_revision = "v2.0.4"
 35 | vad_model_path = os.path.join(home_directory, ".cache", "modelscope", "hub", "models", "iic", "speech_fsmn_vad_zh-cn-16k-common-pytorch")
 36 | vad_model_revision = "v2.0.4"
 37 | punc_model_path = os.path.join(home_directory, ".cache", "modelscope", "hub", "models", "iic", "punc_ct-transformer_zh-cn-common-vocab272727-pytorch")
 38 | punc_model_revision = "v2.0.4"
 39 | spk_model_path = os.path.join(home_directory, ".cache", "modelscope", "hub", "models", "iic", "speech_campplus_sv_zh-cn_16k-common")
 40 | spk_model_revision = "v2.0.4"
 41 | ngpu = 1
 42 | device = "cuda"
 43 | ncpu = 4
 44 | 
 45 | # ASR 模型
 46 | model = AutoModel(model=asr_model_path,
 47 |                   model_revision=asr_model_revision,
 48 |                   vad_model=vad_model_path,
 49 |                   vad_model_revision=vad_model_revision,
 50 |                   punc_model=punc_model_path,
 51 |                   punc_model_revision=punc_model_revision,
 52 |                   spk_model=spk_model_path,
 53 |                   spk_model_revision = spk_model_revision,
 54 |                   ngpu=ngpu,
 55 |                   ncpu=ncpu,
 56 |                   device=device,
 57 |                   disable_pbar=True,
 58 |                   disable_log=True,
 59 |                   disable_update=True
 60 |                   )
 61 | 
# Queue created for passing status messages between threads
result_queue = queue.Queue()
# Queue of per-file speaker -> clip lists waiting to be merged
audio_concat_queue = queue.Queue()

# Supported audio and video file extensions
support_audio_format = ['.mp3', '.m4a', '.aac', '.ogg', '.wav', '.flac', '.wma', '.aif']
support_video_format = ['.mp4', '.avi', '.mov', '.mkv']

# One frame per UI row; each is packed with side=TOP, so they stack in this order
input_frame = tk.Frame(root)
input_frame.pack(side=tk.TOP, padx=10, pady=2)
output_frame = tk.Frame(root)
output_frame.pack(side=tk.TOP, padx=10, pady=2)
start_trans_frame = tk.Frame(root)
start_trans_frame.pack(side=tk.TOP, padx=10, pady=2)
show_frame = tk.Frame(root)
show_frame.pack(side=tk.TOP,padx=10, pady=2)

# Paths picked in the file dialog; refilled by select_multi_file
selected_file_list = []
 81 | # 选择需要分离的音频
 82 | def select_multi_file():
 83 |     selected_file_list.clear()
 84 |     selected_files = filedialog.askopenfilenames(title='选择多个文件', filetypes=[('音频文件', '*.mp3 *.wav *.ogg *.flac *.aac *.m4a *.aif *.wma'), ('视频文件', '*.mp4 *.avi *.mov *.mkv')])
 85 |     selected_file_count = len(selected_files)
 86 |     for tmp_file in selected_files:
 87 |         selected_file_list.append(tmp_file)
 88 |         print(f"选择的音频或视频：{tmp_file}")
 89 |     show_input_info.config(text=f"已选择 {selected_file_count} 个文件")
# Button that opens the file dialog, and the label showing how many files were chosen
select_input_file_button = tk.Button(input_frame, text='选择音视频', command=select_multi_file)
select_input_file_button.pack(side=tk.LEFT, padx=10, pady=2)
show_input_info = tk.Label(input_frame, text='')
show_input_info.pack(side=tk.LEFT, padx=10, pady=2)

# Label for the output location; save_dir replaces its text with the chosen directory
output_label = tk.Label(output_frame, text="保存路径")
output_label.pack(side=tk.LEFT, padx=10, pady=2)

# Holds the chosen output directory; empty until save_dir sets it
save_path = tk.StringVar(None)
# Choose the output directory
def save_dir():
    """Ask for an output directory; if one is chosen, store and display it.

    Cancelling the dialog leaves ``save_path`` and ``output_label`` untouched.
    """
    save_directory = filedialog.askdirectory(title='选择保存路径')
    if not save_directory:
        return
    save_path.set(save_directory)
    output_label.config(text=save_directory)


tk.Button(
    output_frame, text='选择保存目录', command=save_dir
).pack(side=tk.LEFT, padx=10, pady=2)
107 | 
def copy_output_path():
    """Replace the clipboard contents with the text currently shown in ``output_label``."""
    text_to_copy = output_label.cget("text")
    root.clipboard_clear()
    root.clipboard_append(text_to_copy)
115 | 
# Button that copies the output label's text to the clipboard
copy_button = tk.Button(output_frame, text="复制路径", command=copy_output_path)
copy_button.pack(side=tk.RIGHT, padx=10, pady=2)

# Character threshold used by trans when merging consecutive sentences of the
# same speaker; pre-filled with 10
split_number = tk.Entry(start_trans_frame, width=2)
split_number.insert(0, str(10))
split_number.pack(side=tk.LEFT, padx=5, pady=2)
124 | 
def to_date(milliseconds):
    """Format a millisecond offset as ``HH:MM:SS.mmm``.

    The separator before the milliseconds is a dot, not the comma that SRT
    uses. Hours are not wrapped at 24, so an offset of 25 hours is rendered
    as ``25:00:00.000`` instead of silently restarting at ``01:00:00.000``.

    Args:
        milliseconds: Non-negative offset in milliseconds (int or float).

    Returns:
        The formatted timestamp string.
    """
    time_obj = timedelta(milliseconds=milliseconds)
    # timedelta.seconds only covers the part below one day; fold the days
    # back into the hour count so long recordings keep increasing.
    hours = time_obj.days * 24 + time_obj.seconds // 3600
    minutes = (time_obj.seconds // 60) % 60
    seconds = time_obj.seconds % 60
    millis = time_obj.microseconds // 1000
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}"
129 | 
130 | 
def to_milliseconds(time_str):
    """Convert an ``HH:MM:SS.fff`` timestamp into whole milliseconds.

    Args:
        time_str: Timestamp accepted by ``strptime`` with the format
            ``%H:%M:%S.%f`` (hours 00-23).

    Returns:
        The offset from ``00:00:00.000`` as an int number of milliseconds.

    Raises:
        ValueError: If ``time_str`` does not match the expected format.
    """
    time_obj = datetime.strptime(time_str, "%H:%M:%S.%f")
    # strptime fills in 1900-01-01 as the date, so subtracting it leaves
    # only the time-of-day part.
    time_delta = time_obj - datetime(1900, 1, 1)
    # Integer division of timedeltas avoids the float rounding of
    # int(total_seconds() * 1000), which turns e.g. 1.001 s into 1000 ms.
    return time_delta // timedelta(milliseconds=1)
136 | 
def _merge_sentences(sentence_info, max_chars):
    """Merge consecutive sentences of one speaker while the merged text is shorter than ``max_chars``."""
    sentences = []
    for sentence in sentence_info:
        start = to_date(sentence["start"])
        end = to_date(sentence["end"])
        previous = sentences[-1] if sentences else None
        if previous is not None and sentence["spk"] == previous["spk"] and len(previous["text"]) < max_chars:
            previous["text"] += sentence["text"]
            previous["end"] = end
        else:
            sentences.append({"text": sentence["text"], "start": start, "end": end, "spk": sentence["spk"]})
    return sentences


def _cut_clip(source, clip_dir, index, file_ext, start, end):
    """Cut ``start``..``end`` out of ``source`` into ``clip_dir``; return the clip path, or None if no clip was written."""
    # Compare case-insensitively so files such as "talk.MP3" are accepted.
    ext_key = file_ext.lower()
    if ext_key in support_audio_format:
        clip_file = os.path.join(clip_dir, str(index) + file_ext)
        output_options = {}
    elif ext_key in support_video_format:
        # Video clips are always re-encoded to mp4 (H.264 + AAC).
        clip_file = os.path.join(clip_dir, str(index) + '.mp4')
        output_options = {'vcodec': 'libx264', 'crf': 23, 'acodec': 'aac', 'ab': '128k'}
    else:
        print(f'{source}不支持')
        return None
    try:
        (
            ffmpeg.input(source, threads=0, ss=start, to=end, hwaccel='cuda')
            .output(clip_file, **output_options)
            .run(cmd=["ffmpeg", "-nostdin"], overwrite_output=True, capture_stdout=True,
                 capture_stderr=True)
        )
    except ffmpeg.Error as e:
        print(f"剪切音频发生错误，错误信息：{e}")
        return None
    return clip_file


def trans():
    """Transcribe every selected file and cut it into per-speaker clips.

    For each path in ``selected_file_list``:

    1. ffmpeg decodes the input to 16 kHz mono 16-bit WAV bytes.
    2. ``model.generate`` produces the text and per-sentence info.
    3. Consecutive sentences of the same speaker are merged while the merged
       text is shorter than the number typed into ``split_number``.
    4. Each merged sentence is cut from the source file into
       ``<save_path>/<date>/<file name>/<spk>/<index><ext>`` (video clips are
       written as ``.mp4``), and its text is queued on ``spk_txt_queue``.
    5. The map of speaker -> successfully written clips is queued on
       ``audio_concat_queue``.

    Clips that could not be written (ffmpeg failure or unsupported
    extension) are not queued for merging. Takes no arguments and returns
    nothing; problems are reported through ``print`` and message boxes.

    NOTE(review): this runs on a worker thread but touches Tk widgets and
    variables directly, as the original code did.
    """
    output_root = save_path.get()
    if not selected_file_list or not output_root:
        print("没有填写输入输出")
        messagebox.showinfo("提醒", "没有填写选择文件或保存路径")
        return

    # Validate the threshold once up front instead of re-parsing it for
    # every sentence and reporting a bad value as a transcription error.
    try:
        max_chars = int(split_number.get())
    except ValueError:
        print("分离字数必须是整数")
        messagebox.showinfo("提醒", "分离字数必须是整数")
        return

    # Iterate over a snapshot: the user may re-open the file dialog, which
    # clears selected_file_list, while this thread is still running.
    for audio in list(selected_file_list):
        if not os.path.exists(audio):
            print("输入的文件不存在")
            messagebox.showinfo("提醒", "输入的文件不存在")
            continue

        audio_name, file_ext = os.path.splitext(os.path.basename(audio))
        show_info_label.config(text=f'正在执行中，请勿关闭程序。{audio}')
        # speaker id -> list of {'file', 'audio_name'} for every clip written
        speaker_audios = {}
        try:
            # Pre-process: decode to the WAV format passed to the model.
            audio_bytes, _ = (
                ffmpeg.input(audio, threads=0, hwaccel='cuda')
                .output("-", format="wav", acodec="pcm_s16le", ac=1, ar=16000)
                .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
            )
            res = model.generate(input=audio_bytes, batch_size_s=300, is_final=True, sentence_timestamp=True)
            rec_result = res[0]
            asr_result_text = rec_result['text']
            if asr_result_text == '':
                print("没有转写结果")
                continue

            sentences = _merge_sentences(rec_result["sentence_info"], max_chars)

            # Directory layout: <save_path>/<date>/<file name>/<spk>/
            date = datetime.now().strftime("%Y-%m-%d")
            spk_txt_path = os.path.join(output_root, date, audio_name)
            for index, stn in enumerate(sentences):
                spk = stn['spk']
                final_save_path = os.path.join(spk_txt_path, str(spk))
                os.makedirs(final_save_path, exist_ok=True)
                spk_txt_queue.put({
                    'spk_txt_file': os.path.join(spk_txt_path, f'spk{spk}.txt'),
                    'spk_txt': stn['text'],
                    'start': stn['start'],
                    'end': stn['end'],
                })
                clip_file = _cut_clip(audio, final_save_path, index, file_ext, stn['start'], stn['end'])
                # Only clips that were actually written are queued for merging.
                if clip_file is not None:
                    speaker_audios.setdefault(spk, []).append({'file': clip_file, 'audio_name': audio_name})

            ret = {"text": asr_result_text, "sentences": sentences}
            print(f'{audio} 切分完成')
            result_queue.put(f'{audio} 切分完成')
            show_info_label.config(text=f'{audio} 切分完成')
            print(f'转写结果：{ret}')
            # Hand the clips over to the merge worker.
            audio_concat_queue.put(speaker_audios)
        except Exception as e:
            # Top-level boundary for one file: report and move on to the next.
            print(f"转写异常：{e}")
234 | 
235 | 
def start_transcription_thread():
    """Start ``trans`` on a new thread and return immediately."""
    threading.Thread(target=trans).start()
240 | 
241 | 
# Button that starts the separation on a worker thread
btn_start = tk.Button(start_trans_frame, text="分离", command=start_transcription_thread)
btn_start.pack(side=tk.LEFT, padx=10, pady=2)

# Label showing the progress / result of the separation
show_info_label = tk.Label(show_frame, text="")
show_info_label.pack(side=tk.LEFT, padx=10, pady=2)
248 | 
249 | 
def show_info():
    """Show every message arriving on ``result_queue`` in the status label.

    Loops forever, so it is meant to run on a daemon thread. The loop ends
    quietly if the label can no longer be updated, for example after the
    window has been closed.
    """
    while True:
        res = result_queue.get()
        try:
            show_info_label.config(text=res)
        except (tk.TclError, RuntimeError):
            # The widget or the Tk main loop is gone; nothing left to update.
            return


# Daemon thread: it blocks on the queue indefinitely and must not keep the
# process alive once the window has been closed.
threading.Thread(target=show_info, daemon=True).start()
256 | 
257 | 
def write_txt():
    """Append queued transcript entries to their per-speaker text files.

    Each item taken from ``spk_txt_queue`` is a dict with the keys
    ``spk_txt_file``, ``spk_txt``, ``start`` and ``end``. The entry is
    appended as ``<start> --> <end>``, the text, and a blank line. Loops
    forever, so it is meant to run on a daemon thread. A failed write is
    reported and skipped so later entries are still written.
    """
    while True:
        item = spk_txt_queue.get()
        spk_txt_file = item['spk_txt_file']
        spk_txt = item['spk_txt']
        spk_start = item['start']
        spk_end = item['end']
        try:
            os.makedirs(os.path.dirname(spk_txt_file), exist_ok=True)
            with open(spk_txt_file, 'a', encoding='utf-8') as f:
                f.write(f"{spk_start} --> {spk_end}\n{spk_txt}\n\n")
        except OSError as e:
            print(f"写入文本失败：{spk_txt_file}，错误信息：{e}")


# Daemon thread: the loop never returns, so a non-daemon thread would keep
# the process running after the window is closed. Entries still queued at
# interpreter exit are not written.
threading.Thread(target=write_txt, daemon=True).start()
272 | 
273 | 
def audio_concat_worker():
    """Merge each speaker's clips into a single MP3, consuming ``audio_concat_queue`` forever.

    Every queue item maps a speaker id to a list of
    ``{'file': clip path, 'audio_name': source file name}`` dicts. For each
    speaker the clips that exist on disk are concatenated in list order and
    exported to ``<save_path>/<today>/<audio_name>/<spk>.mp3``.

    A failure while merging one speaker is reported and does not stop the
    worker, and ``task_done()`` is called for every item even on failure.
    """
    while True:
        speaker_audios_tmp = audio_concat_queue.get()
        try:
            for spk, audio_segments in speaker_audios_tmp.items():
                try:
                    # Skip clips that were never written (e.g. ffmpeg failed).
                    inputs = [seg['file'] for seg in audio_segments if os.path.isfile(seg['file'])]
                    if not inputs:
                        print(f"{spk} 没有可合并的音频片段")
                        continue
                    audio_name = audio_segments[0]['audio_name']
                    output_file = os.path.join(save_path.get(), datetime.now().strftime("%Y-%m-%d"), audio_name, f"{spk}.mp3")
                    os.makedirs(os.path.dirname(output_file), exist_ok=True)
                    concat_audio = AudioSegment.from_file(inputs[0])
                    for clip in inputs[1:]:
                        concat_audio = concat_audio + AudioSegment.from_file(clip)
                    concat_audio.export(output_file, format="mp3")
                    print(f"已将 {spk} 的音频合并到 {output_file}")
                except Exception as e:
                    # Boundary for one speaker: report and keep the worker alive.
                    print(f"合并 {spk} 的音频发生错误，错误信息：{e}")
        finally:
            audio_concat_queue.task_done()
289 | 
290 | 
# Background consumer of the audio-merge queue, started as a daemon thread
audio_concat_thread = threading.Thread(target=audio_concat_worker, daemon=True)
audio_concat_thread.start()
295 | 
296 | 
# Enter the Tk main loop only when run as a script. The comparison must be
# `==`: `in` is a substring test and is also true for names such as "main".
if __name__ == '__main__':
    print("项目源码：https://github.com/lukeewin/AudioSeparationGUI")
    root.mainloop()
300 | 


--------------------------------------------------------------------------------
/top/lukeewin/download_model.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python
 2 | # _*_ coding utf-8 _*_
 3 | # @Time: 2025/1/4 17:05
 4 | # @Author: Luke Ewin
 5 | # @Blog: https://blog.lukeewin.top
 6 | from modelscope import snapshot_download
 7 | snapshot_download('iic/speech_campplus_sv_zh-cn_16k-common')
 8 | snapshot_download('iic/speech_fsmn_vad_zh-cn-16k-common-pytorch')
 9 | snapshot_download('iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch')
10 | snapshot_download('iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
11 | 


--------------------------------------------------------------------------------
/top/lukeewin/startup.bat:
--------------------------------------------------------------------------------
rem Launch the GUI inside the "funasr" conda environment.
rem NOTE: the Anaconda path below is machine-specific; adjust it to your own install.
call "D:\Software\office\python\Anaconda\Scripts\activate.bat" "D:\Software\office\python\Anaconda"
call conda activate funasr
rem Switch to this script's own directory so app.py is found no matter
rem which working directory the script was started from.
cd /d "%~dp0"
python app.py
pause


--------------------------------------------------------------------------------