├── 01 从麦克风流式转录.py ├── 02 从文件流式转录.py ├── 03 桌面悬浮字幕.py ├── assets ├── icon.ico └── 桌面实时字幕显示效果.png ├── audio └── placeholder ├── readme.md ├── requirements.txt └── style.css /01 从麦克风流式转录.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import wave 4 | import socket 5 | from multiprocessing import Process, Queue 6 | from string import ascii_letters 7 | from copy import deepcopy 8 | 9 | import numpy as np 10 | import sounddevice as sd 11 | from rich.console import Console 12 | from funasr_onnx.paraformer_online_bin import Paraformer 13 | import colorama; colorama.init() 14 | console = Console() 15 | import signal 16 | 17 | # paraformer 的单位片段长 60ms,在 16000 采样率下,就是 960 个采样 18 | # 它的 chunk_size ,如果设为 [10, 20, 10] 19 | # 就表示左回看 10 个片段,总长度 20 片段,右回看 10 片段 20 | # 20 个片段,也就是 1.2s 21 | 22 | # 它的每一个流,是保存在一个字典中,即 param_dict 23 | # 每次解析,都会修改 param_dict 这个词典 24 | 25 | # 将识别到的文字从 udp 端口发送 26 | udp_port = 6009 27 | 28 | # 一行最多显示多少宽度(每个中文宽度为2,英文字母宽度为1) 29 | line_width = 50 30 | 31 | def recognize(queue_in: Queue, queue_out: Queue): 32 | 33 | # 创建一个 udp socket,用于实时发送文字 34 | sk = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 35 | 36 | model_dir = 'model' 37 | chunk_size = [10, 20, 10] # 左回看数,总片段数,右回看数。每片段长 60ms 38 | model = Paraformer(model_dir, batch_size=1, quantize=True, chunk_size=chunk_size, intra_op_num_threads=4) # only support batch_size = 1 39 | 40 | # 通知主进程,可以开始了 41 | queue_out.put(True) 42 | 43 | # 每攒够 5 个片段,就预测一下虚文字 44 | pre_num = 0; pre_expect = 5 45 | printed_num = 0 # 记录一行已输出多少个字 46 | chunks = [] 47 | param_dict = {'cache': dict()} 48 | 行缓冲 = '' 49 | 旧预测 = '' 50 | while instruction := queue_in.get() : 51 | match instruction['type']: 52 | case 'feed': 53 | # 吃下片段 54 | chunks.append(instruction['samples']) 55 | pre_num += 1 56 | 57 | # 显示虚文字 58 | if len(chunks) < chunk_size[1] and pre_num == pre_expect and queue_in.qsize() < 3: 59 | pre_num = 0 60 | data = np.concatenate(chunks) 61 | 虚字典 = deepcopy(param_dict) 62 | 虚字典['is_final'] = True 63 | rec_result = model(audio_in=data, param_dict=虚字典) 64 | if rec_result and rec_result[0]['preds'][0]: 65 | 预测 = rec_result[0]['preds'][0] 66 | if 预测 and 预测 != 旧预测: 67 | 旧预测 = 预测 68 | sk.sendto((行缓冲+预测).encode('utf-8'), ('127.0.0.1', udp_port)) # 网络发送 69 | print(f'\033[0K\033[32m{行缓冲}\033[33m{预测}\033[0m', # 控制台打印 70 | end=f'\033[0G', flush=True) 71 | elif pre_num == 5: pre_num = 0 72 | 73 | # 显示实文字 74 | if len(chunks) == chunk_size[1]: 75 | param_dict['is_final'] = False 76 | data = np.concatenate(chunks) 77 | rec_result = model(audio_in=data, param_dict=param_dict) 78 | if rec_result and rec_result[0]['preds'][0]: 79 | 文字 = rec_result[0]['preds'][0] # 得到文字 80 | if 文字 and 文字[-1] in ascii_letters: 文字 += ' ' # 英文后面加空格 81 | 行缓冲 += 文字 # 加入缓冲 82 | sk.sendto(行缓冲.encode('utf-8'), ('127.0.0.1', udp_port)) # 网络发送 83 | print(f'\033[0K\033[32m{行缓冲}\033[0m', end='\033[0G', flush=True) # 控制台打印 84 | printed_num += len(文字.encode('gbk')) # 统计数字 85 | if printed_num >= line_width: print(''); 行缓冲 = ''; printed_num=0 # 每到长度极限,就清空换行 86 | chunks.clear() 87 | 88 | case 'end': 89 | if not chunks: 90 | chunks.append(np.zeros(960, dtype=np.float32)) 91 | data = np.concatenate(chunks) 92 | param_dict['is_final'] = True 93 | rec_result = model(audio_in=data, param_dict=param_dict) 94 | if rec_result: print(rec_result[0]['preds'][0], end='', flush=True) 95 | chunks.clear() 96 | param_dict = {'cache': dict()} 97 | print('\n\n') 98 | 99 | 100 | 101 | def record_callback(indata: np.ndarray, 102 | frames: int, time_info, 103 | status: sd.CallbackFlags) -> None: 104 | 105 | # 转成单声道、16000采样率 106 | data = np.mean(indata.copy()[::3], axis=1) 107 | 108 | # 放入队列 109 | queue_in.put({'type':'feed', 'samples':data}) 110 | 111 | # 保存音频 112 | f.writeframes((data * (2**15-1)).astype(np.int16).tobytes()) 113 | 114 | 115 | 116 | def main(): 117 | 118 | def signal_handler(sig, frame): print("\n\033[31m收到中断信号 Ctrl+C,退出程序\033[0m"); sys.exit(0) 119 | signal.signal(signal.SIGINT, signal_handler) 120 | 121 | global queue_in, queue_out 122 | queue_in = Queue() 123 | queue_out = Queue() 124 | process = Process(target=recognize, args=[queue_in, queue_out], daemon=True) 125 | process.start() 126 | 127 | # 等待模型加载完 128 | print('正在加载语音模型');queue_out.get() 129 | print(f'模型加载完成\n\n') 130 | 131 | try: 132 | device = sd.query_devices(kind='input') 133 | channels = device['max_input_channels'] 134 | console.print(f'使用默认音频设备:[italic]{device["name"]}', end='\n\n') 135 | except UnicodeDecodeError: 136 | console.print("由于编码问题,暂时无法获得麦克风设备名字", end='\n\n', style='bright_red') 137 | except sd.PortAudioError: 138 | console.print("没有找到麦克风设备", end='\n\n', style='bright_red') 139 | input('按回车键退出'); sys.exit() 140 | 141 | # 将音频保存到 wav,以作检查用 142 | global f 143 | f = wave.open('audio/out.wav', 'w') 144 | f.setnchannels(1) 145 | f.setsampwidth(2) 146 | f.setframerate(16000) 147 | 148 | # 我们原生录制的是 48000 采样率的,便于以后保存高品质录音 149 | # 可后续处理为 16000 采样率 150 | stream = sd.InputStream( 151 | channels=1, 152 | dtype="float32", 153 | samplerate=48000, 154 | blocksize=int(3 * 960), # 0.06 seconds 155 | callback=record_callback 156 | ); stream.start() 157 | 158 | print('开始了') 159 | while True: 160 | input() 161 | queue_in.put({'type': 'end'}) 162 | 163 | if __name__ == '__main__': 164 | main() 165 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /02 从文件流式转录.py: -------------------------------------------------------------------------------- 1 | import soundfile 2 | from funasr_onnx.paraformer_online_bin import Paraformer 3 | from pathlib import Path 4 | import subprocess 5 | import wave 6 | import numpy as np 7 | import time 8 | 9 | # 先用 ffmpeg 转格式 10 | file_path = 'audio/out.wav' 11 | wav_path = 'audio/temp.wav' 12 | command = ['ffmpeg', '-y', '-i', file_path, '-ar', '16000', '-ac', '1', wav_path] 13 | subprocess.run(command, capture_output=True) 14 | 15 | # 载入模型 16 | model_dir = 'model' 17 | chunk_size = [20, 40, 20] # 左回看,片段,右回看,单位 60ms 18 | model = Paraformer(model_dir, batch_size=1, quantize=True, chunk_size=chunk_size, intra_op_num_threads=4) # only support batch_size = 1 19 | 20 | ##online asr 21 | print('开始识别了') 22 | print(f'chunk_size: {chunk_size}') 23 | speech, sample_rate = soundfile.read(wav_path) 24 | speech_length = speech.shape[0] 25 | sample_offset = 0 26 | step = chunk_size[1] * 960 27 | param_dict = {'cache': dict()} 28 | final_result = "" 29 | for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)): 30 | if sample_offset + step >= speech_length - 1: 31 | step = speech_length - sample_offset 32 | is_final = True 33 | else: 34 | is_final = False 35 | param_dict['is_final'] = is_final 36 | data = speech[sample_offset: sample_offset + step] 37 | data = data.astype(np.float32) 38 | rec_result = model(audio_in=data, param_dict=param_dict) 39 | if len(rec_result) > 0: 40 | final_result += rec_result[0]["preds"][0] 41 | if rec_result: 42 | print(rec_result[0]['preds'][0], end='', flush=True) 43 | print('') 44 | -------------------------------------------------------------------------------- /03 桌面悬浮字幕.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | from pathlib import Path 3 | from PyQt5.QtCore import Qt 4 | from PyQt5.QtGui import QColor, QFont, QIcon 5 | from PyQt5.QtWidgets import QApplication, QLabel, QMainWindow, QVBoxLayout, QWidget, QSystemTrayIcon, QMenu, QAction 6 | from PyQt5.QtNetwork import QUdpSocket 7 | from rich import inspect 8 | 9 | 10 | # 窗体属性参考:https://doc.qt.io/qt-6/qt.html#WindowType-enum 11 | # 控件属性参考:https://doc.qt.io/qt-6/qt.html#WidgetAttribute-enum 12 | # 样式表参考:https://doc.qt.io/qt-5/stylesheet-syntax.html 13 | # https://doc.qt.io/qt-5/stylesheet-reference.html 14 | 15 | # 通过 udp 端口接收文字,并更新显示 16 | udp_port = 6009 17 | 18 | class TransparentWindow(QMainWindow): 19 | def __init__(self): 20 | super().__init__() 21 | 22 | # 设置窗口属性 23 | self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint | Qt.SplashScreen) # 24 | self.setAttribute(Qt.WA_TranslucentBackground, True) 25 | self.setStyleSheet(self.get_style()) 26 | 27 | 28 | # 设置窗口大小和位置 29 | self.setGeometry(400, 400, 400, 100) 30 | 31 | # 创建主窗口的 central widget 32 | central_widget = QWidget(self) 33 | self.setCentralWidget(central_widget) 34 | 35 | # 创建垂直布局 36 | layout = QVBoxLayout(central_widget) 37 | 38 | # 创建标签 39 | self.label = QLabel(f'端口:{udp_port} 外观:style.css', self) 40 | 41 | # 将标签添加到布局中 42 | layout.addWidget(self.label) 43 | 44 | # 添加系统托盘 45 | self.tray_icon = QSystemTrayIcon(self) 46 | self.tray_icon.setIcon(QIcon("assets/icon.ico")) # 托盘图标路径 47 | self.tray_icon.setVisible(True) 48 | self.tray_icon.setToolTip("悬浮窗口") 49 | self.tray_icon.activated.connect(self.tray_trigger) 50 | 51 | # 绑定 udp 端口 52 | self.udp_socket = QUdpSocket(self) 53 | self.udp_socket.bind(udp_port) 54 | self.udp_socket.readyRead.connect(self.receive_data) 55 | 56 | # 创建右键菜单 57 | self.create_context_menu() 58 | 59 | def change_port(self): 60 | self.udp_socket.close() 61 | self.udp_socket.bind(udp_port) 62 | 63 | def receive_data(self): 64 | while self.udp_socket.hasPendingDatagrams(): 65 | size = self.udp_socket.pendingDatagramSize() 66 | data, host, port = self.udp_socket.readDatagram(size) 67 | 68 | # 将接收到的数据转换为字符串并更新标签内容 69 | try: 70 | message = data.decode("utf-8") 71 | if message: 72 | self.label.setText(f"{message}") 73 | except Exception as e: 74 | print(e) 75 | 76 | def create_context_menu(self): 77 | self.menu = QMenu(self) 78 | switch_transparency_action = QAction("更新外观", self) 79 | switch_transparency_action.triggered.connect(self.update_style) 80 | self.menu.addAction(switch_transparency_action) 81 | 82 | self.hide_show_action = QAction("隐藏", self) 83 | self.hide_show_action.triggered.connect(self.hide_show) 84 | self.menu.addAction(self.hide_show_action) 85 | 86 | self.lock_unlock_action = QAction("锁定", self) 87 | self.lock_unlock_action.triggered.connect(self.lock_unlock) 88 | self.menu.addAction(self.lock_unlock_action) 89 | 90 | quit_action = QAction("退出", self) 91 | quit_action.triggered.connect(self.quit_application) 92 | self.menu.addAction(quit_action) 93 | 94 | self.tray_icon.setContextMenu(self.menu) 95 | 96 | def update_style(self): 97 | self.setStyleSheet(self.get_style()) # 更新 style 98 | self.resize(self.label.sizeHint()) # 更新窗体大小 99 | self.update() 100 | 101 | def get_style(self): 102 | style = "QLabel { color: green; background-color: rgba(0, 0, 0, 0%); }" 103 | style_path = Path('style.css') 104 | if style_path.exists: 105 | with open (style_path, 'r', encoding='utf-8') as f: style = f.read() 106 | else: 107 | with open (style_path, 'w', encoding='utf-8') as f: f.write(style) 108 | return style 109 | 110 | def quit_application(self): 111 | self.tray_icon.hide() 112 | QApplication.quit() 113 | 114 | def tray_trigger(self, reason): 115 | if reason == QSystemTrayIcon.Trigger: 116 | # 单击系统托盘图标时显示或隐藏窗口 117 | self.hide_show() 118 | 119 | def hide_show(self): 120 | if self.isVisible(): 121 | self.hide_show_action.setText('显示') 122 | self.hide() 123 | else: 124 | self.hide_show_action.setText('隐藏') 125 | self.show() 126 | 127 | def lock_unlock(self, reason): 128 | lock_state = not (self.windowFlags() & Qt.WindowTransparentForInput) 129 | if lock_state: self.lock_unlock_action.setText('解锁') 130 | else: self.lock_unlock_action.setText('锁定') 131 | self.setWindowFlag(Qt.WindowTransparentForInput, lock_state) 132 | self.hide_show() 133 | 134 | def mousePressEvent(self, event): 135 | if event.button() == Qt.LeftButton: 136 | self.drag_position = event.globalPos() - self.frameGeometry().topLeft() 137 | event.accept() 138 | 139 | def mouseMoveEvent(self, event): 140 | if event.buttons() == Qt.LeftButton: 141 | self.move(event.globalPos() - self.drag_position) 142 | event.accept() 143 | 144 | if __name__ == "__main__": 145 | app = QApplication(sys.argv) 146 | window = TransparentWindow() 147 | window.show() 148 | 149 | sys.exit(app.exec_()) 150 | -------------------------------------------------------------------------------- /assets/icon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaujetZhao/FunASR-Online-Paraformer-Test/bc695e9069808e0efaabc7c60c1cd2f5e6fe8801/assets/icon.ico -------------------------------------------------------------------------------- /assets/桌面实时字幕显示效果.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaujetZhao/FunASR-Online-Paraformer-Test/bc695e9069808e0efaabc7c60c1cd2f5e6fe8801/assets/桌面实时字幕显示效果.png -------------------------------------------------------------------------------- /audio/placeholder: -------------------------------------------------------------------------------- 1 | placeholder -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | 用 FunASR-ONNX 加载 Paraformer 流式模型,实现的低延迟实时语音识别、桌面实时字幕。 2 | 3 | ## 实时语音识别 4 | 5 | 下载模型: 6 | 7 | ``` 8 | git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online-onnx.git model 9 | ``` 10 | 11 | 安装依赖(Python3.10+): 12 | 13 | ``` 14 | pip install -r requirements.txt 15 | ``` 16 | 17 | 使用:确保有麦克风,直接运行脚本 `01 从麦克风流式转录.py` 即可,在转录的同时,它也会用 UDP 向端口 6009 发送识别结果 18 | 19 | 若要更改端口,请修改源文件 20 | 21 | ## 桌面实时字幕 22 | 23 | 另外做了一个脚本 `03 桌面悬浮字幕.py` ,直接运行后,它会从 6009 端口接收 UDP 数据,实时更新在悬浮窗,以此来实现屏幕实时字幕 24 | 25 | 若要更改端口,请修改源文件 26 | 27 | 编辑 style.css 可以改变字幕的显示效果,包括字体大小、颜色、背景等 28 | 29 | 右键拖盘图标,可以: 30 | 31 | - 更新外观,在编辑 style.css 后使用 32 | - 隐藏、显示悬浮窗 33 | - 锁定、解锁悬浮窗;当解锁时,可以用鼠标拖动;当锁定时,无法拖动,鼠标事件会穿透 34 | - 退出 35 | 36 | ![桌面实时字幕显示效果](assets/桌面实时字幕显示效果.png) 37 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | sounddevice 3 | rich 4 | colorama 5 | funasr_onnx==0.2.4 6 | soundfile 7 | PyQt5 -------------------------------------------------------------------------------- /style.css: -------------------------------------------------------------------------------- 1 | QLabel { 2 | color: white; 3 | background-color: rgba(0, 0, 0, 65%); 4 | font-family: 楷体; 5 | font-size: 50px; 6 | 7 | border-width: 2px; 8 | border-radius: 10px; 9 | border-style: solid; 10 | border-color: yellowgreen; 11 | } 12 | 13 | --------------------------------------------------------------------------------