├── 01 从麦克风流式转录.py
├── 02 从文件流式转录.py
├── 03 桌面悬浮字幕.py
├── assets
    ├── icon.ico
    └── 桌面实时字幕显示效果.png
├── audio
    └── placeholder
├── readme.md
├── requirements.txt
└── style.css


/01 从麦克风流式转录.py:
--------------------------------------------------------------------------------
  1 | import sys 
  2 | import time
  3 | import wave
  4 | import socket
  5 | from multiprocessing import Process, Queue 
  6 | from string import ascii_letters
  7 | from copy import deepcopy
  8 | 
  9 | import numpy as np
 10 | import sounddevice as sd
 11 | from rich.console import Console
 12 | from funasr_onnx.paraformer_online_bin import Paraformer
 13 | import colorama; colorama.init()
 14 | console = Console()
 15 | import signal 
 16 | 
 17 | # Paraformer's unit segment is 60 ms long, i.e. 960 samples at a 16000 Hz sample rate.
 18 | # If its chunk_size is set to [10, 20, 10],
 19 | # that means 10 segments of left look-back, 20 segments in total, 10 segments of right look-back;
 20 | # 20 segments is 1.2 s.
 21 | 
 22 | # The state of each stream is kept in a dict, param_dict;
 23 | # every recognition call modifies that param_dict.
 24 | 
 25 | # Recognized text is sent out through this UDP port
 26 | udp_port = 6009
 27 | 
 28 | # Maximum display width of one line (a Chinese character counts as width 2, an English letter as width 1)
 29 | line_width = 50
 30 | 
 31 | def recognize(queue_in: Queue, queue_out: Queue):
 32 | 
 33 |     # 创建一个 udp socket，用于实时发送文字
 34 |     sk = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
 35 | 
 36 |     model_dir = 'model'
 37 |     chunk_size = [10, 20, 10] # 左回看数，总片段数，右回看数。每片段长 60ms
 38 |     model = Paraformer(model_dir, batch_size=1, quantize=True, chunk_size=chunk_size, intra_op_num_threads=4) # only support batch_size = 1
 39 | 
 40 |     # 通知主进程，可以开始了
 41 |     queue_out.put(True)
 42 | 
 43 |     # 每攒够 5 个片段，就预测一下虚文字
 44 |     pre_num = 0; pre_expect = 5
 45 |     printed_num = 0   # 记录一行已输出多少个字
 46 |     chunks = []
 47 |     param_dict = {'cache': dict()}
 48 |     行缓冲 = ''
 49 |     旧预测 = ''
 50 |     while instruction := queue_in.get() :
 51 |         match instruction['type']:
 52 |             case 'feed':
 53 |                 # 吃下片段
 54 |                 chunks.append(instruction['samples'])
 55 |                 pre_num += 1
 56 | 
 57 |                 # 显示虚文字
 58 |                 if len(chunks) < chunk_size[1] and pre_num == pre_expect and queue_in.qsize() < 3:
 59 |                     pre_num = 0
 60 |                     data = np.concatenate(chunks)
 61 |                     虚字典 = deepcopy(param_dict)
 62 |                     虚字典['is_final'] = True 
 63 |                     rec_result = model(audio_in=data, param_dict=虚字典)
 64 |                     if rec_result and rec_result[0]['preds'][0]:
 65 |                         预测 = rec_result[0]['preds'][0]
 66 |                         if 预测 and 预测 != 旧预测: 
 67 |                             旧预测 = 预测
 68 |                             sk.sendto((行缓冲+预测).encode('utf-8'), ('127.0.0.1', udp_port))  # 网络发送
 69 |                             print(f'\033[0K\033[32m{行缓冲}\033[33m{预测}\033[0m',             # 控制台打印
 70 |                                   end=f'\033[0G', flush=True)
 71 |                 elif pre_num == 5: pre_num = 0
 72 | 
 73 |                 # 显示实文字
 74 |                 if len(chunks) == chunk_size[1]:
 75 |                     param_dict['is_final'] = False
 76 |                     data = np.concatenate(chunks)
 77 |                     rec_result = model(audio_in=data, param_dict=param_dict)
 78 |                     if rec_result and rec_result[0]['preds'][0]:
 79 |                         文字 = rec_result[0]['preds'][0]                   # 得到文字
 80 |                         if 文字 and 文字[-1] in ascii_letters: 文字 += ' '  # 英文后面加空格
 81 |                         行缓冲 += 文字                                      # 加入缓冲
 82 |                         sk.sendto(行缓冲.encode('utf-8'), ('127.0.0.1', udp_port))           # 网络发送
 83 |                         print(f'\033[0K\033[32m{行缓冲}\033[0m', end='\033[0G', flush=True)  # 控制台打印
 84 |                         printed_num += len(文字.encode('gbk'))              # 统计数字
 85 |                         if printed_num >= line_width: print(''); 行缓冲 = ''; printed_num=0    # 每到长度极限，就清空换行
 86 |                     chunks.clear()
 87 | 
 88 |             case 'end': 
 89 |                 if not chunks:
 90 |                     chunks.append(np.zeros(960, dtype=np.float32))
 91 |                 data = np.concatenate(chunks)
 92 |                 param_dict['is_final'] = True
 93 |                 rec_result = model(audio_in=data, param_dict=param_dict)
 94 |                 if  rec_result: print(rec_result[0]['preds'][0], end='', flush=True)
 95 |                 chunks.clear()
 96 |                 param_dict = {'cache': dict()}
 97 |                 print('\n\n')
 98 |                 
 99 |         
100 | 
def record_callback(indata: np.ndarray, 
                    frames: int, time_info, 
                    status: sd.CallbackFlags) -> None:
    """Audio input callback: downsample, queue and archive one block.

    Keeps every third frame of ``indata`` and averages across channels, hands
    the result to the recognizer through the module-level ``queue_in``, and
    appends it as 16-bit PCM to the module-level wave writer ``f``.
    ``frames``, ``time_info`` and ``status`` are not used.
    """
    # Every third frame, averaged over the channel axis (mono, 1/3 of the input rate)
    mono = indata[::3].mean(axis=1)

    # Hand the segment to the recognizer
    queue_in.put(dict(type='feed', samples=mono))

    # Archive the same samples, scaled to the int16 range
    pcm = (mono * 32767).astype(np.int16)
    f.writeframes(pcm.tobytes())
113 | 
114 | 
115 |     
def main():
    """Start the recognizer process and stream microphone audio to it.

    Installs a SIGINT handler, launches ``recognize`` in a daemon process and
    waits for it to report that the model is loaded, then records from the
    default input device. Each press of Enter sends an ``'end'`` instruction
    that finalizes the current utterance. The recorded audio is also written
    to ``audio/out.wav``.
    """
    def signal_handler(sig, frame):
        print("\n\033[31m收到中断信号 Ctrl+C，退出程序\033[0m")
        sys.exit(0)
    signal.signal(signal.SIGINT, signal_handler)

    # queue_in and f are read by record_callback, hence module-level
    global queue_in, queue_out, f
    queue_in = Queue()
    queue_out = Queue()
    process = Process(target=recognize, args=[queue_in, queue_out], daemon=True)
    process.start()

    # Wait until the model has finished loading
    print('正在加载语音模型')
    queue_out.get()
    print(f'模型加载完成\n\n')

    try:
        device = sd.query_devices(kind='input')
        console.print(f'使用默认音频设备：[italic]{device["name"]}', end='\n\n')
    except UnicodeDecodeError:
        console.print("由于编码问题，暂时无法获得麦克风设备名字", end='\n\n', style='bright_red')
    except sd.PortAudioError:
        console.print("没有找到麦克风设备", end='\n\n', style='bright_red')
        input('按回车键退出'); sys.exit()

    # Save the audio to a wav file for later inspection. The context manager
    # guarantees close() runs (also on the SystemExit raised by the SIGINT
    # handler), which is what finalizes the WAV header.
    with wave.open('audio/out.wav', 'w') as f:
        f.setnchannels(1)
        f.setsampwidth(2)
        f.setframerate(16000)

        # Record natively at 48000 Hz; record_callback reduces it to 16000 Hz.
        # Entering the stream context starts it; leaving stops and closes it,
        # so the callback can no longer write once the wave file is closed.
        with sd.InputStream(
            channels=1,
            dtype="float32",
            samplerate=48000,
            blocksize=int(3 * 960),  # 0.06 seconds
            callback=record_callback
        ):
            print('开始了')
            while True:
                input()
                queue_in.put({'type': 'end'})
162 | 
163 | if __name__ == '__main__':  # run only when executed as a script, not when imported
164 |     main()
165 | 
166 | 
167 | 
168 | 


--------------------------------------------------------------------------------
/02 从文件流式转录.py:
--------------------------------------------------------------------------------
 1 | import soundfile
 2 | from funasr_onnx.paraformer_online_bin import Paraformer
 3 | from pathlib import Path
 4 | import subprocess
 5 | import wave
 6 | import numpy as np
 7 | import time
 8 | 
 9 | # 先用 ffmpeg 转格式
10 | file_path = 'audio/out.wav'
11 | wav_path = 'audio/temp.wav'
12 | command = ['ffmpeg', '-y', '-i', file_path, '-ar', '16000', '-ac', '1', wav_path]
13 | subprocess.run(command, capture_output=True)
14 | 
15 | # 载入模型
16 | model_dir = 'model'
17 | chunk_size = [20, 40, 20] # 左回看，片段，右回看，单位 60ms
18 | model = Paraformer(model_dir, batch_size=1, quantize=True, chunk_size=chunk_size, intra_op_num_threads=4) # only support batch_size = 1
19 | 
20 | ##online asr
21 | print('开始识别了')
22 | print(f'chunk_size: {chunk_size}')
23 | speech, sample_rate = soundfile.read(wav_path)
24 | speech_length = speech.shape[0]
25 | sample_offset = 0
26 | step = chunk_size[1] * 960
27 | param_dict = {'cache': dict()}
28 | final_result = ""
29 | for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
30 |     if sample_offset + step >= speech_length - 1:
31 |         step = speech_length - sample_offset
32 |         is_final = True
33 |     else:
34 |         is_final = False
35 |     param_dict['is_final'] = is_final
36 |     data = speech[sample_offset: sample_offset + step]
37 |     data = data.astype(np.float32)
38 |     rec_result = model(audio_in=data, param_dict=param_dict)
39 |     if len(rec_result) > 0:
40 |        final_result += rec_result[0]["preds"][0]
41 |     if rec_result:
42 |         print(rec_result[0]['preds'][0], end='', flush=True)
43 | print('')
44 | 


--------------------------------------------------------------------------------
/03 桌面悬浮字幕.py:
--------------------------------------------------------------------------------
  1 | import sys, os
  2 | from pathlib import Path
  3 | from PyQt5.QtCore import Qt
  4 | from PyQt5.QtGui import QColor, QFont, QIcon
  5 | from PyQt5.QtWidgets import QApplication, QLabel, QMainWindow, QVBoxLayout, QWidget, QSystemTrayIcon, QMenu, QAction
  6 | from PyQt5.QtNetwork import QUdpSocket
  7 | from rich import inspect
  8 | 
  9 | 
 10 | # Window flag reference: https://doc.qt.io/qt-6/qt.html#WindowType-enum
 11 | # Widget attribute reference: https://doc.qt.io/qt-6/qt.html#WidgetAttribute-enum
 12 | # Style sheet reference: https://doc.qt.io/qt-5/stylesheet-syntax.html
 13 | #            https://doc.qt.io/qt-5/stylesheet-reference.html
 14 | 
 15 | # Receive text on this UDP port and update the display with it
 16 | udp_port = 6009
 17 | 
 18 | class TransparentWindow(QMainWindow):
 19 |     def __init__(self):
 20 |         super().__init__()
 21 | 
 22 |         # 设置窗口属性
 23 |         self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint | Qt.SplashScreen) #   
 24 |         self.setAttribute(Qt.WA_TranslucentBackground, True)
 25 |         self.setStyleSheet(self.get_style())
 26 |         
 27 | 
 28 |         # 设置窗口大小和位置
 29 |         self.setGeometry(400, 400, 400, 100)
 30 | 
 31 |         # 创建主窗口的 central widget
 32 |         central_widget = QWidget(self)
 33 |         self.setCentralWidget(central_widget)
 34 | 
 35 |         # 创建垂直布局
 36 |         layout = QVBoxLayout(central_widget)
 37 | 
 38 |         # 创建标签
 39 |         self.label = QLabel(f'端口：{udp_port}    外观：style.css', self)
 40 | 
 41 |         # 将标签添加到布局中
 42 |         layout.addWidget(self.label)
 43 | 
 44 |         # 添加系统托盘
 45 |         self.tray_icon = QSystemTrayIcon(self)
 46 |         self.tray_icon.setIcon(QIcon("assets/icon.ico"))  # 托盘图标路径
 47 |         self.tray_icon.setVisible(True) 
 48 |         self.tray_icon.setToolTip("悬浮窗口")
 49 |         self.tray_icon.activated.connect(self.tray_trigger)
 50 | 
 51 |         # 绑定 udp 端口
 52 |         self.udp_socket = QUdpSocket(self)
 53 |         self.udp_socket.bind(udp_port)
 54 |         self.udp_socket.readyRead.connect(self.receive_data)
 55 | 
 56 |         # 创建右键菜单
 57 |         self.create_context_menu()
 58 | 
 59 |     def change_port(self):
 60 |         self.udp_socket.close()
 61 |         self.udp_socket.bind(udp_port) 
 62 | 
 63 |     def receive_data(self):
 64 |         while self.udp_socket.hasPendingDatagrams():
 65 |             size = self.udp_socket.pendingDatagramSize()
 66 |             data, host, port = self.udp_socket.readDatagram(size)
 67 | 
 68 |             # 将接收到的数据转换为字符串并更新标签内容
 69 |             try:
 70 |                 message = data.decode("utf-8")
 71 |                 if message:
 72 |                     self.label.setText(f"{message}")
 73 |             except Exception as e:
 74 |                 print(e)
 75 | 
 76 |     def create_context_menu(self):
 77 |         self.menu = QMenu(self)
 78 |         switch_transparency_action = QAction("更新外观", self)
 79 |         switch_transparency_action.triggered.connect(self.update_style)
 80 |         self.menu.addAction(switch_transparency_action)
 81 | 
 82 |         self.hide_show_action = QAction("隐藏", self)
 83 |         self.hide_show_action.triggered.connect(self.hide_show)
 84 |         self.menu.addAction(self.hide_show_action)
 85 | 
 86 |         self.lock_unlock_action = QAction("锁定", self)
 87 |         self.lock_unlock_action.triggered.connect(self.lock_unlock)
 88 |         self.menu.addAction(self.lock_unlock_action)
 89 | 
 90 |         quit_action = QAction("退出", self)
 91 |         quit_action.triggered.connect(self.quit_application)
 92 |         self.menu.addAction(quit_action)
 93 | 
 94 |         self.tray_icon.setContextMenu(self.menu)
 95 | 
 96 |     def update_style(self):
 97 |         self.setStyleSheet(self.get_style())    # 更新 style
 98 |         self.resize(self.label.sizeHint())      # 更新窗体大小
 99 |         self.update()
100 |     
101 |     def get_style(self):
102 |         style = "QLabel { color: green; background-color: rgba(0, 0, 0, 0%); }"
103 |         style_path = Path('style.css')
104 |         if style_path.exists:
105 |             with open (style_path, 'r', encoding='utf-8') as f: style = f.read()
106 |         else:
107 |             with open (style_path, 'w', encoding='utf-8') as f: f.write(style)
108 |         return style
109 | 
110 |     def quit_application(self):
111 |         self.tray_icon.hide()
112 |         QApplication.quit()
113 | 
114 |     def tray_trigger(self, reason):
115 |         if reason == QSystemTrayIcon.Trigger:
116 |             # 单击系统托盘图标时显示或隐藏窗口
117 |             self.hide_show()
118 |                 
119 |     def hide_show(self):
120 |         if self.isVisible():
121 |             self.hide_show_action.setText('显示')
122 |             self.hide()
123 |         else:
124 |             self.hide_show_action.setText('隐藏')
125 |             self.show()
126 | 
127 |     def lock_unlock(self, reason):
128 |         lock_state = not (self.windowFlags() & Qt.WindowTransparentForInput)
129 |         if lock_state: self.lock_unlock_action.setText('解锁')
130 |         else: self.lock_unlock_action.setText('锁定')
131 |         self.setWindowFlag(Qt.WindowTransparentForInput, lock_state)
132 |         self.hide_show()
133 |             
134 |     def mousePressEvent(self, event):
135 |         if event.button() == Qt.LeftButton:
136 |             self.drag_position = event.globalPos() - self.frameGeometry().topLeft()
137 |             event.accept()
138 | 
139 |     def mouseMoveEvent(self, event):
140 |         if event.buttons() == Qt.LeftButton:
141 |             self.move(event.globalPos() - self.drag_position)
142 |             event.accept()
143 | 
144 | if __name__ == "__main__":  # run only when executed as a script, not when imported
145 |     app = QApplication(sys.argv)
146 |     window = TransparentWindow()
147 |     window.show()
148 | 
149 |     sys.exit(app.exec_())  # exit with the value returned by app.exec_()
150 | 


--------------------------------------------------------------------------------
/assets/icon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaujetZhao/FunASR-Online-Paraformer-Test/bc695e9069808e0efaabc7c60c1cd2f5e6fe8801/assets/icon.ico


--------------------------------------------------------------------------------
/assets/桌面实时字幕显示效果.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaujetZhao/FunASR-Online-Paraformer-Test/bc695e9069808e0efaabc7c60c1cd2f5e6fe8801/assets/桌面实时字幕显示效果.png


--------------------------------------------------------------------------------
/audio/placeholder:
--------------------------------------------------------------------------------
1 | placeholder


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | 用 FunASR-ONNX 加载 Paraformer 流式模型，实现的低延迟实时语音识别、桌面实时字幕。
 2 | 
 3 | ## 实时语音识别
 4 | 
 5 | 下载模型：
 6 | 
 7 | ```
 8 | git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online-onnx.git model
 9 | ```
10 | 
11 | 安装依赖（Python3.10+）：
12 | 
13 | ```
14 | pip install -r requirements.txt
15 | ```
16 | 
17 | 使用：确保有麦克风，直接运行脚本 `01 从麦克风流式转录.py` 即可，在转录的同时，它也会用 UDP 向端口 6009 发送识别结果 
18 | 
19 | 若要更改端口，请修改源文件
20 | 
21 | ## 桌面实时字幕
22 | 
23 | 另外做了一个脚本 `03 桌面悬浮字幕.py` ，直接运行后，它会从 6009 端口接收 UDP 数据，实时更新在悬浮窗，以此来实现屏幕实时字幕
24 | 
25 | 若要更改端口，请修改源文件
26 | 
27 | 编辑 style.css 可以改变字幕的显示效果，包括字体大小、颜色、背景等
28 | 
29 | 右键托盘图标，可以：
30 | 
31 | - 更新外观，在编辑 style.css 后使用
32 | - 隐藏、显示悬浮窗
33 | - 锁定、解锁悬浮窗；当解锁时，可以用鼠标拖动；当锁定时，无法拖动，鼠标事件会穿透
34 | - 退出
35 | 
36 | ![桌面实时字幕显示效果](assets/桌面实时字幕显示效果.png)
37 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | sounddevice
3 | rich
4 | colorama
5 | funasr_onnx==0.2.4
6 | soundfile
7 | PyQt5


--------------------------------------------------------------------------------
/style.css:
--------------------------------------------------------------------------------
 1 | QLabel {
 2 |     color: white; 
 3 |     background-color: rgba(0, 0, 0, 65%); 
 4 |     font-family: 楷体; 
 5 |     font-size: 50px; 
 6 | 
 7 |     border-width: 2px;
 8 |     border-radius: 10px;
 9 |     border-style: solid;
10 |     border-color: yellowgreen;
11 |     }
12 | 
13 | 


--------------------------------------------------------------------------------