├── .gitignore
├── .idea
└── .gitignore
├── Agent.py
├── LICENSE
├── README.md
├── app.py
├── assets
├── GUI-1.png
├── GUI-2.png
├── Qwen.icns
├── Qwen.ico
└── Qwen.png
├── build-scripts
└── windows
│ ├── README.md
│ ├── build.bat
│ ├── build.py
│ └── direct_spec.txt
├── config.py
├── core_pipeline.py
├── ears.py
├── file_version.txt
├── key.json.example
├── models
└── silero_vad.onnx
├── mouth.py
├── processors.py
├── requirements.txt
├── utils.py
├── web
├── static
│ ├── css
│ │ └── style.css
│ └── js
│ │ └── app.js
└── templates
│ └── index.html
└── webview_api.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | .Python
6 | .env
7 | .venv
8 | venv/
9 | env/
10 | ENV/
11 | env.bak/
12 | venv.bak/
13 | pip-log.txt
14 | pip-delete-this-directory.txt
15 |
16 | # Build
17 | build/
18 | dist/
19 | *.spec
20 | *.manifest
21 | *.pyc
22 | *.pyo
23 | *.pyd
24 | .Python
25 | *.so
26 |
27 | # IDE
28 | .idea/
29 | .vscode/
30 | *.swp
31 | *.swo
32 | .project
33 | .pydevproject
34 | .settings/
35 |
36 | # Project specific
37 | key.json
38 | *.log
39 | file_version.txt
40 | *.bak
41 |
42 | # Windows specific
43 | Thumbs.db
44 | ehthumbs.db
45 | Desktop.ini
46 | $RECYCLE.BIN/
47 |
48 | # macOS specific
49 | .DS_Store
50 | .AppleDouble
51 | .LSOverride
52 | ._*
53 |
54 | # Distribution / packaging
55 | .Python
56 | develop-eggs/
57 | downloads/
58 | eggs/
59 | .eggs/
60 | lib/
61 | lib64/
62 | parts/
63 | sdist/
64 | var/
65 | wheels/
66 | *.egg-info/
67 | .installed.cfg
68 | *.egg
69 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 | key.json
10 |
--------------------------------------------------------------------------------
/Agent.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import threading
4 | from openai import OpenAI
5 | import base64
6 | from queue import Queue
7 | from typing import Dict, List, Callable, Any
8 | from config import (
9 | API_KEY, BASE_URL, DEBUG
10 | )
11 | from mouth import Mouth
12 | from ears import Ears
13 | from enum import Enum, auto
14 | from core_pipeline import ConversationPipeline
15 | from processors import AIProcessor, EventProcessor
16 |
class SystemEvent(Enum):
    """System events used to drive event-based state transitions."""
    USER_SPEECH_STARTED = 1
    USER_SPEECH_ENDED = 2
    AI_RESPONSE_STARTED = 3
    AI_RESPONSE_ENDED = 4
    USER_INTERRUPT = 5
    SESSION_ENDED = 6
25 |
class ChatState(Enum):
    """Conversation states for the voice-chat session."""
    IDLE = 1            # nothing in progress
    USER_SPEAKING = 2   # user speech being captured
    AI_SPEAKING = 3     # AI response being played
    INTERRUPTED = 4     # AI was interrupted by the user
32 |
class Agent:
    def __init__(self, gui_mode=True, debug=False, on_state_change=None):
        """Initialize the voice-conversation agent.

        Args:
            gui_mode: Run in GUI mode when True (default True).
            debug: Enable verbose debug logging when True.
            on_state_change: Optional callback invoked with the new state
                name (e.g. "idle") so a GUI can update its display.

        Raises:
            ValueError: If the API key is not configured.
        """
        if not API_KEY:
            raise ValueError("API密钥未设置")

        # Configuration flags.
        self.gui_mode = gui_mode
        self.debug = debug

        # Callback notified on conversation-state changes (may be None).
        self.on_state_change = on_state_change

        # Streaming pipeline that wires the processors together.
        self.pipeline = ConversationPipeline()

        # Build and connect the processor chain.
        self._setup_processors()

        # Session control flags/events.
        self.is_running = False
        self.session_end_event = threading.Event()

    def _setup_processors(self):
        """Build the processor chain and wire it into the pipeline."""
        # Create the individual processors.
        audio_input = Ears()
        ai_processor = AIProcessor()
        audio_output = Mouth()
        event_processor = EventProcessor(on_state_change=self.on_state_change)

        # Register them with the pipeline (order matters for connection).
        self.pipeline.add_processor(audio_input)
        self.pipeline.add_processor(ai_processor)
        self.pipeline.add_processor(audio_output)
        self.pipeline.add_processor(event_processor)

        # Wire the processors together.
        self.pipeline.connect_processors()

        # Keep direct references for convenient access elsewhere.
        self.audio_input = audio_input
        self.ai_processor = ai_processor
        self.audio_output = audio_output
        self.event_processor = event_processor

    def print_conversation_history(self):
        """Print the accumulated conversation history to stdout."""
        messages = self.ai_processor.messages
        if not messages:
            print("对话历史为空")
            return

        print("\n===== 对话历史 =====")
        for i, msg in enumerate(messages):
            role = msg["role"]
            if role == "user":
                # User content is a list of parts; detect audio vs. text parts.
                has_audio = any(content.get("type") == "input_audio" for content in msg["content"])
                has_text = any(content.get("type") == "text" for content in msg["content"])
                print(f"{i+1}. 用户: ", end="")
                if has_text:
                    # Print only the first text part of the message.
                    for content in msg["content"]:
                        if content.get("type") == "text":
                            print(f"{content['text']}")
                            break
                elif has_audio:
                    print("[语音输入]")
                else:
                    print("[未知输入]")
            elif role == "assistant":
                print(f"{i+1}. AI: ", end="")
                # Assistant content is expected as [{"text": ...}, ...]
                # — assumed from this check; verify against AIProcessor.
                if isinstance(msg["content"], list) and msg["content"] and "text" in msg["content"][0]:
                    print(f"{msg['content'][0]['text']}")
                else:
                    print("[未知响应]")
        print("===================\n")

    def show_system_info(self):
        """Print the available microphone devices to stdout."""
        print("\n===== 系统信息 =====")
        mics = self.audio_input.get_available_microphones()
        print("\n可用麦克风:")
        for i, mic in enumerate(mics):
            print(f"{i+1}. 设备ID: {mic['index']} - {mic['name']} (通道数: {mic['channels']})")
        print("\n===================")

    def start(self):
        """Start the voice-conversation system.

        Returns:
            bool: True if the pipeline started, False on error.
        """
        print("正在启动与Qwen-Omni的语音对话...")

        if not self.gui_mode:
            self.show_system_info()

        # Reset the conversation history for a fresh session.
        self.ai_processor.messages = []
        self.ai_processor.full_transcript = ""

        # Reset session state.
        self.is_running = True
        self.session_end_event.clear()

        try:
            # Launch the processing pipeline.
            self.pipeline.start()

            print("语音对话系统已启动,等待用户输入...")
            return True

        except Exception as e:
            print(f"启动语音对话时出错: {e}")
            self.is_running = False
            return False

    def stop(self):
        """Stop the voice-conversation system.

        The shutdown order matters: mark not-running first so other
        threads observe it, then silence output, stop input, and finally
        tear down the pipeline.

        Returns:
            bool: True if stopped cleanly, False if not running or on error.
        """
        if not self.is_running:
            return False

        try:
            print("正在停止语音对话...")
            # Mark as not running immediately so worker threads see it.
            self.is_running = False
            # Signal the end of the session to any waiters.
            self.session_end_event.set()

            # Stop any audio that is currently playing.
            if self.audio_output.is_playing:
                print("立即停止所有音频播放...")
                self.audio_output.stop_immediately()

            # Stop the microphone stream and its listener threads.
            print("停止麦克风流和所有监听线程...")
            self.audio_input.stop_mic_stream()

            # Give the microphone stream a moment to shut down fully.
            time.sleep(0.2)

            # Stop the processing pipeline.
            self.pipeline.stop()

            # Notify the state-change callback, if any.
            if self.on_state_change:
                self.on_state_change("idle")

            print("语音对话已完全停止")
            return True

        except Exception as e:
            print(f"停止语音对话时出错: {e}")
            return False

    def close(self):
        """Release all resources (stops the session first)."""
        self.stop()
        self.audio_input.close()
        self.audio_output.close()
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 曲艺 (Qu Yi)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Qwen2.5-Omni Real-time Voice Communication
2 |
3 | 基于通义千问 Qwen2.5-Omni 在线API的实时语音对话系统,支持实时语音交互、动态语音活动检测和流式音频处理。
4 |
5 | A real-time voice conversation system based on Qwen2.5-Omni Online API, supporting real-time voice interaction, dynamic voice activity detection, and streaming audio processing.
6 |
7 | > **注意**:这是一个初步的演示版本,主要实现了基础的语音对话功能。
8 | >
9 | > 计划逐步添加更多 Qwen2.5-Omni 支持的多模态交互功能。最终构建一个`全模态`的交互程序。
10 | >
11 | > **本项目开发过程中使用了大量AI**
12 |
13 | ## 1 使用方法
14 |
15 | ### GUI模式
16 |
17 | 1. 启动GUI界面:
18 | ```bash
19 | python app.py
20 | ```
21 |
22 | 2. 在打开的窗口中:
23 | - 点击"开始对话"按钮启动语音对话
24 | - 用户可以连续发言和打断发言
25 | - 再次点击按钮结束对话
26 |
27 |
28 |
29 |
30 |
31 |
32 | ## 2 开发计划
33 |
34 | **接下来要开发的内容: 完成Agent的Eyes 视觉能力,让它能够看到桌面,并和用户交流**
35 |
36 | 以下是计划添加的主要功能:
37 |
38 | - [ ] Agent架构
39 | - [x] **Brain** 与LLM交互
40 |
41 | - [x] **Ears** 听觉能力
42 | - [x] 交互式音频对话
43 | - [x] 打断式音频通话
  - [ ] 语音转文字,兼容Qwen-Omni对同一个输入Message的模态限制,以更好支持多模态
45 |
46 | - [x] **Mouth**语音能力
47 | - [x] 交互式音频对话
48 | - [x] 打断式音频通话
49 |
50 | - [ ] **Eyes** 视觉能力
51 | - [ ] 通过点击“分享屏幕按钮”观察用户电脑桌面,并给出反馈
52 | - [ ] 通过语音交互,自动观察屏幕内容
53 | - [ ] 视频通话
54 |
55 | - [x] **Skin** GUI界面
56 | - [x] 音频交互动态UI
57 | - [x] 可视化对话状态
58 |
59 | - [ ] **Hands** 工作能力
60 | - [ ] 简单函数调用
61 | - [ ] 引用Qwen-VL来增强鼠标控制能力,可以做一些简单操作
  - [ ] MCP (Model Context Protocol) 功能
63 |
64 |
65 | ## 3 功能特点
66 |
67 | - 实时语音交互:支持用户与AI助手进行实时语音对话
68 | - 智能语音检测:使用 Silero VAD (ONNX版本) 进行高精度的语音活动检测,无需PyTorch依赖
69 | - 动态录音控制:根据用户说话情况自动开始和结束录音
70 | - 流式音频处理:支持音频数据的流式处理和播放
71 | - 平滑打断机制:允许用户在AI回答过程中自然打断
72 | - 音频淡出效果:在对话结束或打断时提供平滑的音频过渡
73 | - 现代化GUI界面:动态视觉反馈
74 |
75 | ## 4 环境要求
76 |
77 | - Python 3.10(开发环境)
78 | - PyAudio 及其依赖的音频库
79 | - onnxruntime - 用于语音活动检测
80 | - pywebview (用于GUI界面)
81 | - 麦克风和音频输出设备
82 | - 推荐:[uv](https://github.com/astral-sh/uv) - 快速、现代的Python包管理器
83 |
84 | ## 5 安装说明
85 |
86 | ### 5.1 方法一:直接下载可执行文件
87 |
88 | 访问[Releases页面](https://github.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat/releases)下载最新的Windows可执行文件。
89 |
90 | 下载后解压,在`key.json`中填入你的通义千问API密钥 **[API key获取方式](https://help.aliyun.com/zh/model-studio/getting-started/first-api-call-to-qwen?spm=a2c4g.11186623.help-menu-2400256.d_0_1_0.5a06b0a8iZbkAV)**:
91 |
92 | 双击"QwenOmniVoiceAssistant.exe"即可运行。
93 |
94 | ### 5.2 方法二:从源码构建
95 |
96 | #### 安装步骤
97 |
98 | 1. **创建Python环境**:
99 |
100 | ```bash
101 | # 安装Python 3.10(如已安装请跳过)
102 | # https://www.python.org/downloads/release/python-31011/
103 |
104 | # 克隆项目代码
105 | git clone https://github.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat.git
106 | cd Qwen2.5-Omni-multimodal-chat
107 |
108 | # 创建虚拟环境并激活
109 | python -m venv .venv
110 | # Windows
111 | .venv\Scripts\activate
112 | # Linux/macOS
113 | # source .venv/bin/activate
114 | ```
115 |
116 | 2. **安装依赖**:
117 |
118 | ```bash
119 | # 安装项目依赖
120 | pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
121 | ```
122 |
123 | 3. **配置API密钥**:
124 | 复制`key.json.example`为`key.json`,填入你的通义千问API密钥 **[API key获取方式](https://help.aliyun.com/zh/model-studio/getting-started/first-api-call-to-qwen?spm=a2c4g.11186623.help-menu-2400256.d_0_1_0.5a06b0a8iZbkAV)**:
125 | ```json
126 | {
127 | "api_key": "your-api-key-here",
128 | "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1"
129 | }
130 | ```
131 |
132 | 4. **运行应用**:
133 |
134 | ```bash
135 | # 启动图形界面版本
136 | python app.py
137 |
138 | # 或启动命令行版本
139 | python app.py --console
140 | ```
141 |
142 | 5. **打包应用**:
143 |
144 | 项目根目录命令行输入:
145 |
146 | ```
147 | .\build-scripts\windows\build.bat
148 | ```
149 |
**或双击启动打包脚本`build.bat`,打包文件存放在`dist`文件夹下**
151 |
152 | ### 5.3 常见问题
153 |
154 | - **麦克风未检测到**:请检查系统麦克风权限设置,确保应用有权限访问麦克风
155 | - **运行时缺少依赖**:确保已正确安装所有依赖,如遇问题可尝试`pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/`
156 | - **API密钥无效**:确保已在key.json中填入正确的通义千问API密钥
157 |
158 | ### 5.4 高级用户说明
159 |
160 | 如需使用更高级的包管理工具如uv,可以参考以下步骤:
161 |
162 | ```bash
# 安装uv包管理器
164 | pip install uv
165 |
166 | # 使用uv创建环境
167 | uv venv --python=3.10
168 |
169 | # 使用uv安装依赖
170 | uv pip install -r requirements.txt
171 | ```
172 |
173 | ## 6 配置说明
174 |
175 | 可以在 `config.py` 中调整以下参数:
176 |
177 | - `DEBUG`:调试模式开关,启用时会保存录音文件
178 | - `AUDIO_FORMAT`:音频格式(默认pyaudio.paInt16)
179 | - `CHANNELS`:音频通道数(默认1)
180 | - `RATE`:音频采样率(默认16000Hz,兼容Silero VAD)
181 | - `CHUNK`:音频数据块大小(默认512,相当于32毫秒帧大小)
182 | - `RECORD_SECONDS`:默认录音秒数
183 | - `MIN_SPEECH_DURATION`:最短语音持续时间(秒)
184 | - `SPEECH_VOLUME_THRESHOLD`:语音音量阈值
185 | - `NORMAL_VOLUME_THRESHOLD`:正常音量阈值
186 | - `MIN_POSITIVE_FRAMES`:语音检测的最小正帧数
187 | - `MIN_NEGATIVE_FRAMES`:静音检测的最小负帧数
188 | - `PLAYER_RATE`:音频播放器采样率(默认24000Hz,匹配模型输出)
189 | - `FADE_OUT_DURATION`:音频淡出持续时间(秒)
190 | - `MAX_FINISH_DURATION`:打断时最大允许的完成时间(秒)
191 |
192 | ## 7 项目结构
193 |
194 | ```
195 | Qwen2.5-Omni-multimodal-chat/
196 | ├── app.py # 主程序入口
197 | ├── Agent.py # 核心代理类(语音对话管理)
198 | ├── ears.py # 音频输入处理(麦克风和VAD)
199 | ├── mouth.py # 音频输出处理(语音合成和播放)
200 | ├── webview_api.py # WebView API接口
201 | ├── utils.py # 通用工具函数
202 | ├── config.py # 配置文件
203 | ├── key.json.example # API密钥配置示例
204 | ├── pyproject.toml # Python项目配置
205 | ├── requirements.txt # 主要依赖列表
206 | ├── uv.lock # UV包管理器锁文件
207 | ├── LICENSE # MIT许可证
208 | ├── README.md # 项目说明文档
209 | │
210 | ├── assets/ # 资源文件
211 | │
212 | ├── models/ # 模型文件
213 | │ └── silero_vad.onnx # 语音活动检测模型
214 | │
215 | ├── recordings/ # 录音文件目录(运行时生成)
216 | │
217 | ├── build-scripts/ # 构建脚本
218 | │ └── windows/ # Windows平台构建
219 | │ ├── build.py # 构建Python脚本
220 | │ ├── build.bat # 构建批处理文件
221 | │ ├── direct_spec.txt # PyInstaller规范文件
222 | │ └── README.md # 构建说明
223 | │
224 | ├── web/ # GUI前端文件
225 | │ ├── templates/ # HTML模板
226 | │ │ └── index.html # 主界面HTML
227 | │ └── static/ # 静态资源
228 | │ ├── css/ # 样式文件
229 | │ │ └── style.css # 主样式表
230 | │ └── js/ # JavaScript文件
231 | │ └── app.js # 前端逻辑
232 | │
233 | ├── build/ # 构建中间文件(自动生成)
234 | └── dist/ # 分发包(自动生成)
235 | ```
236 |
237 | ## 8 注意事项
238 |
239 | 1. 确保系统有可用的麦克风设备
240 | 2. 保持网络连接稳定以确保与API的通信
241 | 3. 调整麦克风音量以获得最佳的语音识别效果
242 | 4. 在嘈杂环境中可能需要调整音量阈值参数
243 | 5. 使用uv管理依赖可以显著提升安装速度
244 | 6. 建议在虚拟环境中进行开发和构建
245 |
246 | ## 9 许可证
247 |
248 | 本项目采用 MIT 许可证,这意味着您可以自由地使用、修改、分发本软件,无论是用于个人还是商业目的。详情请参见项目根目录下的 [LICENSE](./LICENSE) 文件。
249 |
250 | ## 10 贡献指南
251 |
252 | 欢迎提交Issue和Pull Request来帮助改进项目。在提交代码前,请确保:
253 |
254 | 1. 代码符合Python代码规范
255 | 2. 添加必要的注释和文档
256 | 3. 更新相关的文档说明
257 | 4. 测试代码功能正常
258 |
259 | ## 11 联系方式
260 |
261 | 如有问题或建议,请通过以下方式联系:
262 |
263 | - 提交 Issue
264 | - 发送邮件至:[quyimail@foxmail.com]
265 |
266 | ## 致谢
267 |
- [Qwen2.5-Omni](https://github.com/QwenLM/Qwen2.5-Omni) - 通义千问全模态模型 [相关文档](https://help.aliyun.com/zh/model-studio/user-guide/qwen-omni?spm=a2c4g.11186623.0.0.5aefb0a8nJc2z7#db6d0ff7c371y)
269 | - [Silero VAD](https://github.com/snakers4/silero-vad) - 语音活动检测模型
270 | - [pywebview](https://pywebview.flowrl.com/) - Python GUI框架
271 | - [Cursor](https://www.cursor.com/cn) - AI代码编辑器
272 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import webview
4 | import threading
5 | import argparse
6 | import platform
7 | from webview_api import AgentAPI
8 | from utils import apply_windows_compatibility_patches
9 | from Agent import Agent
10 | from mouth import Mouth
11 | from ears import Ears
12 |
def run_server(headless=False):
    """Start the pywebview server.

    Args:
        headless: When True, run without a GUI window; the conversation is
            started immediately and the main thread idles until Ctrl-C.
    """
    import time  # stdlib; hoisted out of the headless wait loop

    # Apply compatibility patches on Windows before creating any window.
    if platform.system().lower() == 'windows':
        apply_windows_compatibility_patches()

    current_dir = os.path.dirname(os.path.abspath(__file__))

    # API instance exposed to the page's JavaScript.
    api = AgentAPI()

    # Headless mode: skip GUI initialisation entirely.
    if headless:
        # Minimal stand-in for a webview window; JS evaluation is a no-op.
        class DummyWindow:
            def evaluate_js(self, js_code):
                pass

        api.set_window(DummyWindow())
        # Start the conversation loop without a GUI.
        api.start_conversation()
        try:
            # Keep the main thread alive until interrupted.
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            api.stop_conversation()
        return

    # Window configuration.
    window_settings = {
        'width': 400,
        'height': 550,
        'resizable': True,
        'min_size': (400, 550),
        'background_color': '#FFFFFF',
        'title': 'Qwen-Omni 语音助手',
        'text_select': False,
    }

    # Pick the most suitable renderer for the current platform.
    system_platform = platform.system().lower()

    if system_platform == 'windows':
        # Edge WebView2 (lightweight). NOTE(review): the original wrapped
        # this plain assignment in try/except with an MSHTML fallback, but
        # an assignment cannot raise, so the fallback was dead code;
        # pywebview itself degrades gracefully if WebView2 is unavailable.
        gui_options = 'edgechromium'
        print("[INFO] 使用Edge WebView2作为GUI后端(轻量级)")
    else:
        # macOS / Linux: use the system default renderer.
        gui_options = None
        print("[INFO] 使用系统默认GUI后端")

    # Create the window and load the HTML UI.
    window = webview.create_window(
        title=window_settings['title'],
        url='file://' + os.path.join(current_dir, 'web/templates/index.html'),
        js_api=api,
        width=window_settings['width'],
        height=window_settings['height'],
        resizable=window_settings['resizable'],
        min_size=window_settings['min_size'],
        background_color=window_settings['background_color'],
        text_select=window_settings['text_select'],
    )

    # Hand the window reference to the API layer.
    api.set_window(window)

    # Default voice-chat parameters (same defaults as CLI mode).
    api.configure_agent({
        'recording_mode': 'dynamic',  # VAD-driven dynamic recording
        'recording_seconds': 5,       # used only in fixed-duration mode
    })

    # Start the GUI loop with platform-specific renderer options.
    webview.start(debug=False, http_server=True, gui=gui_options)
100 |
def run_console():
    """Run the command-line (no GUI) version.

    Fix: Agent exposes ``start()``/``stop()``/``close()``, not
    ``start_conversation()`` — the previous call raised AttributeError.
    ``start()`` returns right after launching the pipeline, so this
    function blocks in an idle loop until Ctrl-C.
    """
    import time  # stdlib; only needed for the wait loop

    # gui_mode=False so Agent prints system info instead of driving a UI.
    voice_chat = Agent(gui_mode=False)
    try:
        if voice_chat.start():
            # Keep the main thread alive while the pipeline runs.
            while True:
                time.sleep(1)
    except KeyboardInterrupt:
        print("\n命令行版本已终止")
    finally:
        voice_chat.close()
110 |
if __name__ == "__main__":
    # Command-line entry point: choose console, headless or GUI mode.
    arg_parser = argparse.ArgumentParser(description="Qwen-Omni 语音助手")
    arg_parser.add_argument('--console', action='store_true', help='在命令行模式下运行')
    arg_parser.add_argument('--headless', action='store_true', help='无GUI模式运行')
    cli_args = arg_parser.parse_args()

    if cli_args.console:
        run_console()
    else:
        run_server(headless=cli_args.headless)
--------------------------------------------------------------------------------
/assets/GUI-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat/6092ea16b399a956ffeff4df2295c1bef364ae7c/assets/GUI-1.png
--------------------------------------------------------------------------------
/assets/GUI-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat/6092ea16b399a956ffeff4df2295c1bef364ae7c/assets/GUI-2.png
--------------------------------------------------------------------------------
/assets/Qwen.icns:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat/6092ea16b399a956ffeff4df2295c1bef364ae7c/assets/Qwen.icns
--------------------------------------------------------------------------------
/assets/Qwen.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat/6092ea16b399a956ffeff4df2295c1bef364ae7c/assets/Qwen.ico
--------------------------------------------------------------------------------
/assets/Qwen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat/6092ea16b399a956ffeff4df2295c1bef364ae7c/assets/Qwen.png
--------------------------------------------------------------------------------
/build-scripts/windows/README.md:
--------------------------------------------------------------------------------
1 | # Qwen2.5-Omni 语音助手 Windows 打包工具
2 |
3 | 这个目录包含用于将 Qwen2.5-Omni 语音助手打包为 Windows 可执行文件的工具脚本。
4 |
5 | ## 打包方法
6 |
7 | 1. 打开命令行,输入`.\build.bat`
8 | 2. 等待打包完成
9 |
10 | ## 打包结果
11 |
12 | 打包完成后都会在项目根目录下生成 `dist` 文件夹,其中包含:
13 |
14 | - `QwenOmniVoiceAssistant` 文件夹:包含可执行文件和所有依赖
15 |
16 | - `QwenOmniVoiceAssistant.exe`为文件入口
17 |
18 | ## 注意事项
19 |
20 | 1. 打包过程需要网络连接,因为可能需要下载依赖库
21 | 2. 请确保在打包前已安装Python 3.10+
22 | 3. 初次打包可能需要较长时间,因为要下载和安装依赖
23 |
24 | ## 运行要求
25 |
26 | 打包后的程序在Windows 7/8/10/11系统上应该都能正常运行,无需额外安装Python环境。
27 |
28 | ## 故障排除
29 |
30 | 如果遇到问题:
31 |
1. 检查控制台错误信息或日志
2. 确保拥有管理员权限
3. 如果打包失败,尝试关闭防病毒软件后重试
4. 如果运行打包程序遇到"无法找到入口点"等错误,可能是微软Visual C++ Redistributable缺失,请安装最新版本
--------------------------------------------------------------------------------
/build-scripts/windows/build.bat:
--------------------------------------------------------------------------------
@echo off
:: Switch the console to UTF-8 so the Chinese status messages render correctly
chcp 65001 > nul
echo ==== Qwen-Omni 语音助手 Windows 打包工具 ====
echo.

:: Resolve the project root relative to this script's own location
set "SCRIPT_DIR=%~dp0"
set "PROJECT_ROOT=%SCRIPT_DIR%..\..\"
cd /d "%PROJECT_ROOT%"

:: Verify a Python interpreter is available on PATH
where python >nul 2>nul
if %ERRORLEVEL% neq 0 (
    echo 错误: 未找到Python,请确保已安装Python并添加到PATH环境变量
    pause
    exit /b 1
)

:: Prefer the uv package manager when it is available
where uv >nul 2>nul
if %ERRORLEVEL% equ 0 (
    echo 发现uv包管理器,将使用uv进行依赖安装
    set USE_UV=1
) else (
    echo 未找到uv包管理器,将使用pip进行依赖安装
    set USE_UV=0
)

:: Make sure pip is usable when uv is not present
if %USE_UV% equ 0 (
    python -m ensurepip --upgrade >nul 2>nul
    python -m pip --version >nul 2>nul
    if %ERRORLEVEL% neq 0 (
        echo 警告: pip不可用,将尝试使用内置的ensurepip模块安装
        python -m ensurepip --default-pip
        if %ERRORLEVEL% neq 0 (
            echo 错误: 无法安装pip
            pause
            exit /b 1
        )
    )
)

:: Install PyInstaller if it is not already importable
echo 检查PyInstaller是否已安装...
python -c "import PyInstaller" >nul 2>nul
if %ERRORLEVEL% neq 0 (
    echo 正在安装PyInstaller...
    if %USE_UV% equ 1 (
        uv pip install pyinstaller
    ) else (
        python -m pip install pyinstaller
    )

    if %ERRORLEVEL% neq 0 (
        echo 错误: PyInstaller安装失败
        pause
        exit /b 1
    )
)

:: Force UTF-8 for the Python build script's console output
set PYTHONIOENCODING=utf-8

:: Run the actual packaging script
echo 正在启动打包过程...
python "%SCRIPT_DIR%build.py"

:: Report the final status and wait for user acknowledgement
if %ERRORLEVEL% neq 0 (
    echo.
    echo 打包过程遇到错误,请查看上方错误信息
) else (
    echo.
    echo 打包完成! 请查看dist目录
)

pause
--------------------------------------------------------------------------------
/build-scripts/windows/build.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Windows packaging script for the Qwen-Omni voice assistant.

Uses PyInstaller to bundle the application into a Windows executable.
"""

import os
import sys
import shutil
import subprocess
import platform
import tempfile

# Run from the project root so all relative paths (web/, assets/, dist/)
# resolve the same way regardless of where the script was invoked from.
script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(script_dir, '../..'))
os.chdir(project_root)

# This build script only supports Windows; bail out early elsewhere.
if platform.system().lower() != 'windows':
    print("错误: 此打包脚本仅适用于Windows平台")
    sys.exit(1)
25 |
def clean_dist():
    """Remove stale build artifacts (the build/ and dist/ directories)."""
    print("正在清理旧的构建文件...")
    for stale_dir in ('build', 'dist'):
        if not os.path.exists(stale_dir):
            continue
        try:
            shutil.rmtree(stale_dir)
            print(f"  已删除 {stale_dir}/")
        except Exception as e:
            print(f"  警告: 无法删除 {stale_dir}/: {e}")
37 |
def check_dependencies():
    """Report whether PyInstaller is importable.

    Returns:
        bool: True if PyInstaller is installed, False if it still needs
        to be installed.
    """
    print("正在检查系统依赖...")
    try:
        import PyInstaller
    except ImportError:
        print("  未找到 PyInstaller,将尝试安装")
        return False
    print(f"  已安装 PyInstaller {PyInstaller.__version__}")
    return True
51 |
def install_requirements():
    """Install PyInstaller and the project's dependencies.

    Tries the uv package manager first and falls back to standard pip
    (bootstrapping pip via ensurepip if necessary).

    Returns:
        bool: True if either installer succeeded, False otherwise.
    """
    print("正在安装PyInstaller和所需依赖...")

    # Preferred path: uv, if present on PATH (FileNotFoundError when absent).
    try:
        subprocess.run(['uv', 'pip', 'install', 'pyinstaller'], check=True)
        subprocess.run(['uv', 'pip', 'install', '-r', 'requirements.txt'], check=True)
        return True
    except (subprocess.SubprocessError, FileNotFoundError) as e:
        print(f"  使用uv安装失败: {e}")

    # Fallback: standard pip. The first two steps are best-effort
    # (check=False) because ensurepip/upgrade may legitimately no-op.
    try:
        subprocess.run([sys.executable, '-m', 'ensurepip', '--upgrade'], check=False)
        subprocess.run([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'], check=False)
        subprocess.run([sys.executable, '-m', 'pip', 'install', 'pyinstaller'], check=True)
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt'], check=True)
        return True
    except subprocess.SubprocessError as e:
        print(f"  使用pip安装失败: {e}")
        return False
74 |
def create_spec_file():
    """Write the PyInstaller spec file (qwen_omni.spec) into the project root.

    Prefers the direct_spec.txt template next to this script; falls back
    to a built-in template when the file is missing or unreadable.

    Returns:
        bool: True if the spec file was written, False otherwise.
    """
    print("正在创建spec文件...")

    # Preferred source: the direct_spec.txt template shipped with the script.
    direct_template_path = os.path.join(script_dir, 'direct_spec.txt')
    if os.path.exists(direct_template_path):
        try:
            with open(direct_template_path, 'r', encoding='utf-8') as f:
                spec_content = f.read()
            print("  已从模板文件加载spec内容")

            # Guard against characters a GBK console cannot encode
            # (relevant when PyInstaller echoes the spec on a GBK terminal).
            try:
                spec_content.encode('gbk', errors='strict')
            except UnicodeEncodeError:
                print("  警告: 模板文件包含GBK编码不支持的字符,将进行替换")
                spec_content = spec_content.encode('gbk', errors='replace').decode('gbk')

            with open('qwen_omni.spec', 'w', encoding='utf-8') as f:
                f.write(spec_content)
            print("  已创建 qwen_omni.spec")
            return True
        except Exception as e:
            print(f"  模板加载失败: {e}")

    # Fallback: built-in spec template (kept verbatim; it is written to disk).
    print("  使用内置模板")
    spec_content = """# -*- mode: python ; coding: utf-8 -*-

import os
import sys

block_cipher = None

datas = [
    ('web/templates', 'web/templates'),
    ('web/static', 'web/static'),
    ('assets/Qwen.ico', 'assets'),
]

if os.path.exists('key.json'):
    datas.append(('key.json', '.'))

hiddenimports = [
    'pyaudio', 'numpy', 'webview', 'threading', 'json',
    'platform', 'webview.platforms.winforms',
]

a = Analysis(
    ['app.py'],
    pathex=[os.path.abspath('.')],
    binaries=[],
    datas=datas,
    hiddenimports=hiddenimports,
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    cipher=block_cipher,
    noarchive=False,
)

pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)

exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name='QwenOmniVoiceAssistant',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=False,
    icon='assets/Qwen.ico',
    version='file_version.txt',
)

coll = COLLECT(
    exe,
    a.binaries,
    a.zipfiles,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name='QwenOmniVoiceAssistant',
)
"""

    try:
        with open('qwen_omni.spec', 'w', encoding='utf-8') as f:
            f.write(spec_content)
        print("  已创建 qwen_omni.spec")
        return True
    except Exception as e:
        print(f"  创建spec文件失败: {e}")
        return False
177 |
def find_pyinstaller():
    """Locate a way to invoke PyInstaller.

    Returns:
        Either a filesystem path to the executable, a
        ``"<python> -m PyInstaller"`` command string, or an argv list
        (``[python, '-m', 'PyInstaller']``) as a last resort.
    """
    candidates = [
        # Scripts directory of the current Python environment.
        os.path.join(os.path.dirname(sys.executable), 'Scripts', 'pyinstaller.exe'),
        os.path.join(os.path.dirname(sys.executable), 'Scripts', 'pyinstaller'),
        # PyInstaller on PATH, if any.
        shutil.which('pyinstaller'),
        # Module invocation via the current interpreter.
        sys.executable + ' -m PyInstaller',
    ]

    for candidate in candidates:
        if not candidate:
            continue
        if os.path.exists(candidate) or ' -m ' in candidate:
            return candidate

    # Last resort: invoke as a module via argv list.
    return [sys.executable, '-m', 'PyInstaller']
196 |
def build_executable():
    """Run PyInstaller on qwen_omni.spec to build the Windows executable.

    Streams PyInstaller's output to the console and to a temporary log
    file; the log is kept on failure so it can be inspected afterwards.

    Returns:
        bool: True if the build succeeded and the dist output exists.
    """
    print("正在构建Windows可执行文件...")

    # Locate PyInstaller (path, "python -m ..." string, or argv list).
    pyinstaller_path = find_pyinstaller()

    # Build the command for whichever form find_pyinstaller returned.
    if isinstance(pyinstaller_path, list):
        cmd = pyinstaller_path + ['qwen_omni.spec', '--clean']
    elif ' -m ' in pyinstaller_path:
        cmd_parts = pyinstaller_path.split(' -m ')
        cmd = [cmd_parts[0], '-m', cmd_parts[1], 'qwen_omni.spec', '--clean']
    else:
        cmd = [pyinstaller_path, 'qwen_omni.spec', '--clean']

    print(f"  执行命令: {' '.join(cmd)}")

    try:
        # Force UTF-8 in the child process environment.
        env = os.environ.copy()
        env['PYTHONIOENCODING'] = 'utf-8'
        env['PYTHONUTF8'] = '1'  # force Python's UTF-8 mode

        # On Windows, switch the current console to the UTF-8 code page.
        if platform.system().lower() == 'windows':
            os.system('chcp 65001 > nul')

        # Capture output into a temp log while echoing it live.
        temp_log_path = None
        result = 1  # assume failure until the process reports otherwise

        try:
            with tempfile.NamedTemporaryFile(delete=False, mode='w+', encoding='utf-8', suffix='.log') as tmp:
                temp_log_path = tmp.name
                process = subprocess.Popen(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    universal_newlines=True,
                    bufsize=1,
                    env=env,
                    encoding='utf-8',
                    errors='replace'
                )

                # Echo PyInstaller's output line by line.
                for line in process.stdout:
                    try:
                        line = line.strip()
                        if line:
                            print(f"  {line}")
                            tmp.write(line + '\n')
                    except UnicodeEncodeError:
                        # Console can't render it; log a placeholder instead.
                        tmp.write("(non-displayable characters)" + '\n')

                # Wait for PyInstaller to finish.
                result = process.wait()
        except Exception as e:
            print(f"  PyInstaller runtime error: {e}")
            if temp_log_path and os.path.exists(temp_log_path):
                print(f"  Log saved to: {temp_log_path}")
            return False
        finally:
            # NOTE(review): returning from a finally block swallows any
            # in-flight exception; kept as-is to preserve behavior.
            # Keep the log only when the build failed.
            if result != 0:
                if temp_log_path and os.path.exists(temp_log_path):
                    print(f"  Build failed (code {result}), log saved to: {temp_log_path}")
                return False
            else:
                # Best-effort cleanup of the temp log; a failed delete
                # must not fail the build.
                if temp_log_path and os.path.exists(temp_log_path):
                    try:
                        os.unlink(temp_log_path)
                    except Exception as e:
                        print(f"  Note: Cannot delete temp file: {e}")
                print("  Build completed!")

        # Verify the expected dist output actually exists.
        if os.path.exists(os.path.join('dist', 'QwenOmniVoiceAssistant')):
            return True
        else:
            print("  Warning: Build output not found")
            return False

    except Exception as e:
        print(f"  Build process error: {str(e).encode('ascii', errors='replace').decode('ascii')}")
        return False
287 |
def sanitize_key_json():
    """Prepare key.json for distribution, never shipping a real API key.

    Writes ``dist/QwenOmniVoiceAssistant/key.json``. If the project root
    has no key.json, an example config is created; otherwise the real
    ``api_key`` value is replaced with a placeholder.

    Fix: the placeholder was misspelled "yout api key"; corrected to
    "your api key" in both places it is written.

    Returns:
        bool: True on success, False if reading/writing failed.
    """
    import json  # stdlib; hoisted from mid-function for clarity

    print("正在处理API密钥信息...")

    # Ensure dist/ and the packaged-app directory exist.
    if not os.path.exists('dist'):
        os.makedirs('dist')

    target_dir = os.path.join('dist', 'QwenOmniVoiceAssistant')
    if not os.path.exists(target_dir):
        os.makedirs(target_dir, exist_ok=True)

    if not os.path.exists('key.json'):
        print("  未找到key.json文件,将创建示例配置")

        # Placeholder config shipped with the package.
        example_config = '''{
    "api_key": "your api key",
    "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1"
}'''

        try:
            # Write the example config straight into the target folder.
            with open(os.path.join(target_dir, 'key.json'), 'w', encoding='utf-8') as f:
                f.write(example_config)
            print("  已创建示例配置文件 key.json")
            return True
        except Exception as e:
            print(f"  创建示例配置失败: {e}")
            return False

    try:
        # Read the developer's real key.json.
        with open('key.json', 'r', encoding='utf-8') as f:
            key_data = json.load(f)

        original_api_key = key_data.get('api_key', '')
        if original_api_key:
            # Replace the real key with a placeholder before packaging.
            key_data['api_key'] = "your api key"

            with open(os.path.join(target_dir, 'key.json'), 'w', encoding='utf-8') as f:
                json.dump(key_data, f, ensure_ascii=False, indent=4)

            print("  已处理API密钥信息:替换为示例值")
            return True
        else:
            print("  API密钥为空,将使用原始文件")
            # Empty key: copy the original config into the target folder.
            with open(os.path.join(target_dir, 'key.json'), 'w', encoding='utf-8') as f:
                json.dump(key_data, f, ensure_ascii=False, indent=4)
            print("  已复制原始key.json文件(空API密钥)")
            return True
    except Exception as e:
        print(f"  处理API密钥失败: {e}")
        return False
354 |
def copy_additional_files():
    """Copy auxiliary runtime files (README, key.json, version info) into the package.

    Returns True when every copy succeeded, False when any copy failed.
    """
    print("正在复制其他必要文件...")
    all_ok = True

    # The PyInstaller output folder must already exist.
    target_dir = os.path.join('dist', 'QwenOmniVoiceAssistant')
    if not os.path.exists(target_dir):
        print(f" 错误: 目标目录不存在: {target_dir}")
        return False

    # Ship the README alongside the executable when available.
    if os.path.exists('README.md'):
        try:
            shutil.copy2('README.md', target_dir)
            print(" 已复制 README.md")
        except Exception as e:
            print(f" 警告: 复制README失败: {e}")
            all_ok = False

    # key.json is sanitized straight into the target directory.
    sanitize_key_json()

    # Version metadata goes into the application root.
    if os.path.exists('file_version.txt'):
        try:
            shutil.copy2('file_version.txt', target_dir)
            print(" 已复制 file_version.txt 到应用根目录")
        except Exception as e:
            print(f" 警告: 复制版本信息文件失败: {e}")
            all_ok = False

    return all_ok
388 |
def create_shortcut():
    """Create the launcher batch file inside the packaged app folder.

    BUG FIX: the original wrote the batch file into the *versioned* folder
    name (dist/QwenOmniVoiceAssistant_v..._win..._...) before the rename step
    in main() had run; main() deletes any pre-existing versioned folder right
    before renaming the build output, which destroyed the batch file.  The
    batch file is now written into the actual build output folder
    (dist/QwenOmniVoiceAssistant) so it survives the later rename; the
    versioned name is only used as a fallback when the rename already happened.

    Returns True on success, False otherwise.
    """
    print("创建启动批处理文件...")

    try:
        # Batch file that launches the executable from either folder layout.
        cn_batch = '''@echo off
echo 创建"Qwen-Omni语音助手"快捷方式...
cd /d "%~dp0"
if not exist "QwenOmniVoiceAssistant.exe" cd QwenOmniVoiceAssistant
start QwenOmniVoiceAssistant.exe
exit
'''

        # Prefer the un-renamed PyInstaller output folder.
        target = os.path.join('dist', 'QwenOmniVoiceAssistant')
        if not os.path.exists(target):
            # Fallback: the output has already been renamed with version/platform info.
            version = extract_version()

            import platform
            arch = platform.machine().lower()
            if arch == 'amd64' or arch == 'x86_64':
                arch = 'x64'
            elif arch == 'x86':
                arch = 'x86'
            elif 'arm' in arch or 'aarch' in arch:
                arch = 'arm64'
            else:
                arch = platform.architecture()[0]

            win_ver = platform.win32_ver()[0]
            target = os.path.join('dist', f'QwenOmniVoiceAssistant_v{version}_win{win_ver}_{arch}')
            os.makedirs(target, exist_ok=True)

        with open(os.path.join(target, '启动语音助手.bat'), 'w', encoding='utf-8') as f:
            f.write(cn_batch)
        print(" 已创建启动批处理文件")

        return True
    except Exception as e:
        print(f" 创建启动批处理文件失败: {e}")
        return False
436 |
def rename_dist_folder():
    """Write the Chinese usage note into the output folder.

    Despite its name, this function does not rename anything: the app folder
    keeps an English name (Windows encoding limitations); this only drops a
    使用说明.txt readme into whichever output folder exists.
    Returns True on success, False on error.
    """
    try:
        version = extract_version()

        # Normalise the machine architecture label.
        import platform
        machine = platform.machine().lower()
        if machine in ('amd64', 'x86_64'):
            arch = 'x64'
        elif machine == 'x86':
            arch = 'x86'
        elif 'arm' in machine or 'aarch' in machine:
            arch = 'arm64'
        else:
            arch = platform.architecture()[0]  # fallback

        # Windows release number.
        win_ver = platform.win32_ver()[0]

        # Versioned output folder name (with platform info).
        target_dir = f'QwenOmniVoiceAssistant_v{version}_win{win_ver}_{arch}'

        print("创建中文名称的启动文件...")

        if os.path.exists('dist/QwenOmniVoiceAssistant'):
            readme_content = f'''# Qwen-Omni 语音助手 v{version}

这是Qwen-Omni语音助手的Windows版本。
系统要求: Windows {win_ver} {arch}

请双击"启动语音助手.bat"文件来运行应用程序。

注意:由于Windows系统编码限制,应用程序文件夹使用英文名称,但功能与界面仍然是中文的。
'''

            # The rename may or may not have happened yet; pick the folder that exists.
            if os.path.exists(f'dist/{target_dir}'):
                readme_path = f'dist/{target_dir}/使用说明.txt'
            else:
                readme_path = 'dist/QwenOmniVoiceAssistant/使用说明.txt'

            with open(readme_path, 'w', encoding='utf-8') as f:
                f.write(readme_content)

        return True
    except Exception as e:
        print(f" 创建中文访问方式失败: {e}")
        return False
489 |
def create_version_file():
    """Write the PyInstaller version-resource file; skip when it already exists.

    Returns True when the file exists afterwards, False on a write error.
    """
    version_file = 'file_version.txt'

    # An existing file is kept untouched (it may carry a customised version).
    if os.path.exists(version_file):
        print(f" {version_file} 已存在,跳过创建")
        return True

    print("创建版本信息文件...")

    # Default VSVersionInfo resource for version 0.0.1.
    version_content = """VSVersionInfo(
  ffi=FixedFileInfo(
    filevers=(0, 0, 1, 0),
    prodvers=(0, 0, 1, 0),
    mask=0x3f,
    flags=0x0,
    OS=0x40004,
    fileType=0x1,
    subtype=0x0,
    date=(0, 0)
    ),
  kids=[
    StringFileInfo(
      [
      StringTable(
        u'040904B0',
        [StringStruct(u'CompanyName', u''),
        StringStruct(u'FileDescription', u'Qwen-Omni Voice Assistant'),
        StringStruct(u'FileVersion', u'0.0.1'),
        StringStruct(u'InternalName', u'QwenOmniVoiceAssistant'),
        StringStruct(u'LegalCopyright', u''),
        StringStruct(u'OriginalFilename', u'QwenOmniVoiceAssistant.exe'),
        StringStruct(u'ProductName', u'Qwen-Omni Voice Assistant'),
        StringStruct(u'ProductVersion', u'Windows 0.0.1')])
      ]),
    VarFileInfo([VarStruct(u'Translation', [1033, 1200])])
  ]
)"""

    try:
        with open(version_file, 'w', encoding='utf-8') as f:
            f.write(version_content)
        print(" 已创建版本信息文件")
        return True
    except Exception as e:
        print(f" 创建版本信息文件失败: {e}")
        return False
537 |
def extract_version():
    """Read the product version from file_version.txt.

    Parses the ``FileVersion`` entry of the PyInstaller version resource.
    Returns the version string, or "0.0.1" when the file is missing,
    unparsable, or unreadable.

    BUG FIX: the original implicitly returned None when file_version.txt did
    not exist (the fallback return was inside the ``if`` body), which produced
    folder names like ``QwenOmniVoiceAssistant_vNone_...``.  The default is
    now returned on every failure path.
    """
    import re
    try:
        if os.path.exists('file_version.txt'):
            with open('file_version.txt', 'r', encoding='utf-8') as f:
                content = f.read()
            # Look for the FileVersion field in the version resource.
            version_match = re.search(r"FileVersion', u'([0-9\.]+)'", content)
            if version_match:
                return version_match.group(1)
        # Missing file or no FileVersion field: fall back to the default.
        return "0.0.1"
    except Exception as e:
        print(f" 提取版本号失败: {e}")
        return "0.0.1"
554 |
def main():
    """Main entry point: run the full Windows packaging workflow.

    Steps: fix console encoding, clean dist/, create version info, check and
    install dependencies, generate the spec, build with PyInstaller, copy
    auxiliary files, create the launcher, and rename the output folder with
    version/platform info.

    Returns:
        int: 0 on success, 1 on failure (used as the process exit code).
    """
    print("==== Qwen-Omni Voice Assistant Windows Build Tool ====")
    success = True

    try:
        # Switch stdout/stderr to UTF-8 mode
        if sys.stdout.encoding.lower() != 'utf-8':
            # The Windows console defaults to cp936/gbk; force UTF-8
            try:
                import io
                sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
                sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
                os.environ['PYTHONIOENCODING'] = 'utf-8'
                print("Console encoding set to UTF-8")
            except Exception as e:
                print(f"Warning: Cannot set UTF-8 encoding: {e}")

        clean_dist()

        # Create the version-info file
        create_version_file()

        # Extract the version number, used for folder naming
        version = extract_version()
        print(f" 当前版本号: {version}")

        # Check dependencies and install them when missing
        if not check_dependencies():
            if not install_requirements():
                print("Error: Cannot install required dependencies")
                return 1

        # Create the spec file
        if not create_spec_file():
            print("Error: Cannot create spec file")
            return 1

        # Build the executable
        build_success = build_executable()
        if not build_success:
            # Even if PyInstaller reported failure, check whether output exists in dist/
            if os.path.exists(os.path.join('dist', 'QwenOmniVoiceAssistant')):
                print("Note: Despite errors, build output found. Continuing.")
                build_success = True
            else:
                print("Error: Build failed, no output found")
                return 1

        # Copy extra runtime files
        copy_additional_files()

        # Create the launcher batch file
        create_shortcut()

        # Create the Chinese-language usage note
        rename_dist_folder()

        # Rename the output folder, adding version and platform info
        if os.path.exists(os.path.join('dist', 'QwenOmniVoiceAssistant')):
            # Determine the machine architecture
            import platform
            arch = platform.machine().lower()
            if arch == 'amd64' or arch == 'x86_64':
                arch = 'x64'
            elif arch == 'x86':
                arch = 'x86'
            elif 'arm' in arch or 'aarch' in arch:
                arch = 'arm64'
            else:
                arch = platform.architecture()[0]

            # Windows release number
            win_ver = platform.win32_ver()[0]

            # Product version
            version = extract_version()
            versioned_folder = os.path.join('dist', f'QwenOmniVoiceAssistant_v{version}_win{win_ver}_{arch}')

            # NOTE(review): a pre-existing versioned folder is deleted here,
            # including any files already written into it (e.g. by
            # create_shortcut) -- confirm this is intended.
            if os.path.exists(versioned_folder):
                shutil.rmtree(versioned_folder)
            os.rename(os.path.join('dist', 'QwenOmniVoiceAssistant'), versioned_folder)
            print(f" 已将输出文件夹重命名为: QwenOmniVoiceAssistant_v{version}_win{win_ver}_{arch}")

        # Final sanity check of the build result
        version = extract_version()
        import platform
        arch = platform.machine().lower()
        if arch == 'amd64' or arch == 'x86_64':
            arch = 'x64'
        elif arch == 'x86':
            arch = 'x86'
        elif 'arm' in arch or 'aarch' in arch:
            arch = 'arm64'
        else:
            arch = platform.architecture()[0]

        win_ver = platform.win32_ver()[0]
        target_folder = f'QwenOmniVoiceAssistant_v{version}_win{win_ver}_{arch}'

        if os.path.exists(os.path.join('dist', target_folder, 'QwenOmniVoiceAssistant.exe')):
            print(f"\nBuild successful! Executable at: dist/{target_folder}/QwenOmniVoiceAssistant.exe")
            print(f"You can directly run 'dist/{target_folder}/启动语音助手.bat'")

            # Create the API-key instructions file
            try:
                api_key_note_filename = '请先在key.json中填写api key [获取教程].txt'
                api_key_note_path = os.path.join('dist', target_folder, api_key_note_filename)
                api_key_note_content = """前往这里获取api key:
https://help.aliyun.com/zh/model-studio/getting-started/first-api-call-to-qwen?spm=a2c4g.11186623.help-menu-2400256.d_0_1_0.5a06b0a8iZbkAV"""
                with open(api_key_note_path, 'w', encoding='utf-8') as f:
                    f.write(api_key_note_content)
                print(f" 已创建API Key说明文件: dist/{target_folder}/{api_key_note_filename}")
            except Exception as e:
                print(f" 警告: 创建API Key说明文件失败: {e}")

            success = True
        else:
            print("\nWarning: Final executable not found, build may not be complete")
            success = False

    except UnicodeEncodeError as e:
        # Dedicated handling for console-encoding failures
        print("Error: Encoding issue caused build failure")
        print("Try the following:")
        print("1. Run 'chcp 65001' in command prompt")
        print("2. Then run this script again")
        return 1
    except Exception as e:
        # Make sure the exception message itself can be displayed
        try:
            error_msg = str(e)
            print(f"Error during build process: {error_msg}")
        except UnicodeEncodeError:
            # Fall back to ASCII with replacement characters
            error_msg = str(e).encode('ascii', errors='replace').decode('ascii')
            print(f"Error during build process: {error_msg}")
        return 1

    return 0 if success else 1
695 |
if __name__ == "__main__":
    # Exit with the status code returned by main() (0 = success, 1 = failure).
    sys.exit(main())
--------------------------------------------------------------------------------
/build-scripts/windows/direct_spec.txt:
--------------------------------------------------------------------------------
# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec for the Qwen-Omni voice assistant (Windows build).

import os
import sys

block_cipher = None

# Data files to bundle with the application
datas = []

# Bundle the web directory and its contents
datas.extend([
    ('web/templates', 'web/templates'),
    ('web/static', 'web/static'),
])

# Bundle the icon files from assets/
datas.extend([
    ('assets/Qwen.ico', 'assets'),
    ('assets/Qwen.png', 'assets'),
])

# Bundle the key.json configuration file (when present)
if os.path.exists('key.json'):
    datas.append(('key.json', '.'))

# Bundle the ONNX VAD model (when present)
if os.path.exists('models/silero_vad.onnx'):
    datas.append(('models/silero_vad.onnx', 'models'))

# Basic hidden imports
hiddenimports = [
    'pyaudio',
    'numpy',
    'onnxruntime',
    'webview',
    'threading',
    'json',
    'platform',
    'random',
    'wave',
    'io',
    'base64',
    'math',
    'time',
    'soundfile',
    'pyglet',
    'webview.platforms.winforms',
    'webview.window',
]

# Explicitly exclude cefpython3-related modules (and other heavy, unused packages)
excludes = [
    'cefpython3',
    'torch',
    'torchaudio',
    'transformers',
    'safetensors',
]

a = Analysis(
    ['app.py'],
    pathex=[os.path.abspath('.')],
    binaries=[],
    datas=datas,
    hiddenimports=hiddenimports,
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=excludes,
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    cipher=block_cipher,
    noarchive=False,
)

pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)

exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name='QwenOmniVoiceAssistant',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=False,
    icon='assets/Qwen.ico',
    version='file_version.txt',
)

coll = COLLECT(
    exe,
    a.binaries,
    a.zipfiles,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name='QwenOmniVoiceAssistant',
)
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
import pyaudio
import json
import os

# Debug settings
DEBUG = False  # set True to enable debug mode, including saving recorded audio files

# Audio settings
AUDIO_FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000  # sample rate supported by Silero VAD
CHUNK = 512  # 32 ms frame size (16000 * 0.032 = 512), compatible with Silero VAD

# API settings: credentials are loaded from key.json at import time.
# On any failure both values fall back to empty strings.
try:
    with open('key.json', 'r', encoding='utf-8') as f:
        api_config = json.load(f)
    API_KEY = api_config['api_key']
    BASE_URL = api_config['base_url']
except (FileNotFoundError, json.JSONDecodeError, KeyError) as e:
    print(f"Error loading API configuration from key.json: {e}")
    API_KEY = ''
    BASE_URL = ''

# VAD settings
MIN_SPEECH_DURATION = 0.1

# Audio player settings
PLAYER_RATE = 24000  # player sample rate matches the model's output
FADE_OUT_DURATION = 0.15  # standard fade-out duration (seconds)
MAX_FINISH_DURATION = 0.25  # max time allowed to finish playback when interrupted (seconds)
--------------------------------------------------------------------------------
/core_pipeline.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import asyncio
4 | import threading
5 | import uuid
6 | import collections
7 | import numpy as np
8 | import queue
9 | import wave
10 | import io
11 | import base64
12 | from enum import Enum, auto
13 | from typing import Dict, List, Callable, Any, Optional, Union
14 | from config import (
15 | API_KEY, BASE_URL,
16 | CHANNELS, AUDIO_FORMAT, RATE, CHUNK,
17 | PLAYER_RATE, FADE_OUT_DURATION, MAX_FINISH_DURATION
18 | )
19 |
class FrameType(Enum):
    """Kinds of frames travelling through the pipeline."""
    DATA = auto()      # ordinary data frame, processed in queue order
    CONTROL = auto()   # control frame (handled with priority)
    SYSTEM = auto()    # system frame (handled immediately, bypassing queues)
25 |
class Frame:
    """A single unit of data travelling through the pipeline.

    Carries a FrameType, a payload dict, optional metadata, a creation
    timestamp and a unique id.
    """
    def __init__(self, type: FrameType, data=None, metadata=None):
        self.type = type
        self.data = data if data else {}
        self.metadata = metadata if metadata else {}
        self.id = str(uuid.uuid4())
        self.timestamp = time.time()

    def __str__(self):
        keys = ', '.join(self.data.keys())
        return f"Frame[{self.type.name}]: {keys}"
37 |
class CancellationToken:
    """Cooperative cancellation flag shared between pipeline components."""
    def __init__(self):
        self._cancelled = threading.Event()
        self._callbacks = []

    def cancel(self):
        """Set the flag and fire every registered callback (only on the first call)."""
        if self._cancelled.is_set():
            return
        self._cancelled.set()
        for cb in self._callbacks:
            try:
                cb()
            except Exception as e:
                print(f"Error in cancellation callback: {e}")

    def is_cancelled(self):
        """Whether cancel() has been called since the last reset()."""
        return self._cancelled.is_set()

    def register_callback(self, callback):
        """Register *callback* to run on cancellation; returns a deregistration function."""
        if callback not in self._callbacks:
            self._callbacks.append(callback)
        return lambda: self._callbacks.remove(callback) if callback in self._callbacks else None

    def reset(self):
        """Clear the flag and drop all registered callbacks."""
        self._cancelled.clear()
        self._callbacks = []
68 |
class ProcessorContext:
    """Shared per-chain state: session id, scratch dict and cancellation token."""
    def __init__(self):
        self.session_id = str(uuid.uuid4())
        self.state = {}
        self.cancellation_token = CancellationToken()

    def is_cancelled(self):
        """Delegate to the cancellation token."""
        return self.cancellation_token.is_cancelled()

    def new_session(self):
        """Begin a fresh session: new id, empty state; the token is left as-is."""
        self.state = {}
        self.session_id = str(uuid.uuid4())
        return self.session_id
85 |
class ThreadSafeQueue:
    """Thin wrapper around queue.Queue adding a bulk clear() helper."""
    def __init__(self, maxsize=0):
        self.queue = queue.Queue(maxsize)
        self.mutex = threading.RLock()

    def put(self, item, block=True, timeout=None):
        """Enqueue an item (semantics of queue.Queue.put)."""
        return self.queue.put(item, block, timeout)

    def get(self, block=True, timeout=None):
        """Dequeue an item (semantics of queue.Queue.get)."""
        return self.queue.get(block, timeout)

    def empty(self):
        """True when no items are currently queued."""
        return self.queue.empty()

    def clear(self):
        """Drain all pending items, marking each one as done."""
        with self.mutex:
            while True:
                try:
                    self.queue.get_nowait()
                except queue.Empty:
                    break
                self.queue.task_done()

    def task_done(self):
        """Mark one previously-gotten item as processed."""
        self.queue.task_done()

    def qsize(self):
        """Approximate number of queued items."""
        return self.queue.qsize()
121 |
class ProcessorBase:
    """Base class for pipeline processors.

    Each processor owns an input queue and a daemon worker thread; processors
    are linked into a doubly-connected chain.  SYSTEM frames bypass the
    queues and are handled synchronously by the neighbour's process_frame().
    """
    def __init__(self, name):
        self.name = name
        self.next_processor = None   # downstream neighbour in the chain
        self.prev_processor = None   # upstream neighbour in the chain
        self.context = None          # shared ProcessorContext, set via set_context()
        self.input_queue = ThreadSafeQueue()
        self.is_running = False
        self.thread = None           # worker thread running _process_loop
        self.lock = threading.RLock()

    def set_context(self, context):
        """Attach the shared processor context."""
        self.context = context

    def set_next(self, processor):
        """Link *processor* as the downstream neighbour; returns it for chaining."""
        self.next_processor = processor
        processor.prev_processor = self
        return processor

    def send_downstream(self, frame):
        """Send a frame to the downstream processor (no-op at the chain tail)."""
        if self.next_processor:
            # SYSTEM frames are handled immediately instead of being queued
            if frame.type == FrameType.SYSTEM:
                self.next_processor.process_frame(frame)
            else:
                self.next_processor.enqueue_frame(frame)

    def send_upstream(self, frame):
        """Send a frame to the upstream processor (used for control and feedback)."""
        if self.prev_processor:
            # SYSTEM frames always take priority
            if frame.type == FrameType.SYSTEM:
                self.prev_processor.process_frame(frame)
            else:
                self.prev_processor.enqueue_frame(frame)

    def enqueue_frame(self, frame):
        """Queue a frame for asynchronous handling by the worker thread."""
        self.input_queue.put(frame)

    def process_frame(self, frame):
        """Handle a single frame; subclasses must implement this."""
        raise NotImplementedError("Subclasses must implement process_frame")

    def start(self):
        """Start the worker thread (idempotent)."""
        with self.lock:
            if self.is_running:
                return

            self.is_running = True
            self.thread = threading.Thread(target=self._process_loop)
            self.thread.daemon = True
            self.thread.start()

    def stop(self):
        """Stop the worker thread and discard queued frames (idempotent)."""
        with self.lock:
            if not self.is_running:
                return

            self.is_running = False
            self.input_queue.clear()

            if self.thread and self.thread.is_alive():
                self.thread.join(timeout=1.0)

    def _process_loop(self):
        """Worker loop: drain the input queue until stopped or cancelled."""
        try:
            while self.is_running and (self.context is None or not self.context.is_cancelled()):
                try:
                    # Use a timeout so the loop can notice is_running/cancellation changes
                    frame = self.input_queue.get(timeout=0.1)
                except queue.Empty:
                    continue

                try:
                    # Handle the frame; errors are isolated per frame
                    self.process_frame(frame)
                except Exception as e:
                    print(f"处理器 {self.name} 处理帧时出错: {e}")
                finally:
                    self.input_queue.task_done()

        except Exception as e:
            print(f"处理器 {self.name} 的处理循环出错: {e}")
        finally:
            print(f"处理器 {self.name} 的处理循环已停止")
215 |
class SystemEventEmitter:
    """Minimal publish/subscribe hub for system-level events."""
    def __init__(self, context):
        self.context = context
        self.listeners = {}

    def on(self, event_type, callback):
        """Subscribe *callback* to *event_type*; returns an unsubscribe function."""
        self.listeners.setdefault(event_type, []).append(callback)

        def cancel():
            registered = self.listeners.get(event_type, [])
            if callback in registered:
                registered.remove(callback)
        return cancel

    def emit(self, event_type, data=None):
        """Invoke each listener for *event_type*, isolating listener errors."""
        for callback in self.listeners.get(event_type, []):
            try:
                callback(data)
            except Exception as e:
                print(f"事件处理回调出错: {e}")
242 |
class ConversationPipeline:
    """Wires processors into a chain and manages their lifecycle as one unit."""
    def __init__(self):
        # Shared context/cancellation state for every processor in the chain.
        self.context = ProcessorContext()

        # Event bus for system-level notifications.
        self.events = SystemEventEmitter(self.context)

        # Registered processors, in chain order.
        self.processors = []

        # Lifecycle tracking.
        self.is_running = False
        self.lock = threading.RLock()

    def add_processor(self, processor):
        """Register a processor and hand it the shared context; returns it."""
        processor.set_context(self.context)
        self.processors.append(processor)
        return processor

    def connect_processors(self):
        """Link all registered processors into a chain, in registration order."""
        for upstream, downstream in zip(self.processors, self.processors[1:]):
            upstream.set_next(downstream)

    def start(self):
        """Start every processor; returns False when already running."""
        with self.lock:
            if self.is_running:
                return False

            self.is_running = True
            self.context.cancellation_token.reset()

            # Bring every processor's worker thread up.
            for proc in self.processors:
                proc.start()

            print(f"处理管道已启动,{len(self.processors)}个处理器在运行")

            # Kick off the head processor (typically the audio input).
            if self.processors:
                head = self.processors[0]
                head.process_frame(Frame(
                    FrameType.SYSTEM,
                    {"command": "start"}
                ))
                print("启动命令已发送到第一个处理器")

            return True

    def stop(self):
        """Stop processors in reverse order; returns False when not running."""
        with self.lock:
            if not self.is_running:
                return False

            # Signal cancellation first so worker loops can exit.
            self.context.cancellation_token.cancel()

            for proc in reversed(self.processors):
                proc.stop()

            self.is_running = False
            print("处理管道已停止")
            return True

    def reset(self):
        """Stop the pipeline and begin a fresh session."""
        self.stop()
        self.context.new_session()
        print("处理管道已重置")
317 |
318 | # -------------------------------------------------------------------
319 | # 音频处理相关的工具函数
320 | # -------------------------------------------------------------------
321 |
def int16_to_float32(audio_int16):
    """Convert int16 PCM samples to float32 in the [-1.0, 1.0) range."""
    # Multiplying by the exact power-of-two reciprocal is equivalent to /32768.0.
    return audio_int16.astype(np.float32) * (1.0 / 32768.0)
325 |
def float32_to_int16(audio_float32):
    """Convert float32 audio in the [-1.0, 1.0] range to int16 PCM.

    BUG FIX: values are now clipped to the representable int16 range before
    the cast.  Without clipping, a full-scale +1.0 sample scales to 32768,
    which wraps around to -32768 when cast to int16 -- an audible glitch on
    loud audio.
    """
    scaled = audio_float32 * 32768.0
    return np.clip(scaled, -32768.0, 32767.0).astype(np.int16)
329 |
def frames_to_wav_base64(frames, channels, sample_width, rate):
    """Join raw PCM frames into a WAV file and return it base64-encoded.

    Args:
        frames: iterable of raw PCM byte chunks.
        channels: number of audio channels.
        sample_width: bytes per sample.
        rate: sample rate in Hz.

    Returns:
        str: the complete WAV file, base64-encoded as ASCII text.
    """
    pcm_bytes = b''.join(frames)
    buffer = io.BytesIO()

    with wave.open(buffer, 'wb') as writer:
        writer.setnchannels(channels)
        writer.setsampwidth(sample_width)
        writer.setframerate(rate)
        writer.writeframes(pcm_bytes)

    return base64.b64encode(buffer.getvalue()).decode('utf-8')
--------------------------------------------------------------------------------
/ears.py:
--------------------------------------------------------------------------------
1 | import pyaudio
2 | import threading
3 | import time
4 | import os
5 | import numpy as np
6 | import onnxruntime
7 | import collections
8 | from config import (
9 | AUDIO_FORMAT, CHANNELS, RATE, CHUNK,
10 | MIN_SPEECH_DURATION
11 | )
12 | from core_pipeline import (
13 | ProcessorBase, Frame, FrameType, int16_to_float32, frames_to_wav_base64
14 | )
15 |
# VAD model parameters
VAD_THRESHOLD = 0.6 # speech-probability threshold for classifying a frame as speech
END_BUFFER_FRAMES = 10 # extra buffer frames after speech appears to have ended
MIN_NEG_FRAMES_FOR_ENDING = 8 # consecutive silent frames required to declare end of speech
MAX_SPEECH_DURATION = 180.0 # hard cap on a single utterance (seconds)
PRE_BUFFER_FRAMES = int(1.0 * RATE / CHUNK) # number of pre-roll frames (~1 s of audio)
SPEECH_CONFIRM_FRAMES = 2 # consecutive speech frames required to confirm speech start
PRE_DETECTION_BUFFER_SIZE = int(2.0 * RATE / CHUNK) # ring-buffer capacity (~2 s of audio)
24 |
25 | class Ears(ProcessorBase):
26 | """音频输入处理器 - 集成了语音检测和处理功能,直接将处理后的语音发送到AI处理器"""
    def __init__(self, name="audio_input"):
        """Set up PyAudio, load the Silero VAD model and reset detection state.

        Args:
            name: processor name passed to ProcessorBase (default "audio_input").
        """
        super().__init__(name)
        self.p = pyaudio.PyAudio()
        self.stream = None
        self.vad_model = self._load_vad_model()

        # VAD recurrent state: shape (2, 1, 128), as fed to the ONNX model below.
        self.state = np.zeros((2, 1, 128), dtype=np.float32)
        self.sr = RATE

        # Audio-file saving switch
        self.save_audio_file = True  # set True to save captured audio to files

        # Ring buffer of recent raw chunks (pre-detection audio)
        self.buffer = collections.deque(maxlen=PRE_DETECTION_BUFFER_SIZE)

        # Speech-detection state
        self.speech_detected = False
        self.consecutive_speech_frames = 0
        self.consecutive_silence_frames = 0
        self.is_collecting_speech = False
        self.speech_frames = []
        self.speech_start_time = None

        # Synchronisation primitives for stream access and speech events
        self.stream_lock = threading.RLock()
        self.speech_detected_event = threading.Event()
        self.speech_ended_event = threading.Event()

        print("[Ears] 初始化完成")
57 |
    def _load_vad_model(self):
        """Load the Silero VAD ONNX model from models/ next to this file."""
        model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models/silero_vad.onnx")
        print(f"加载Silero VAD ONNX模型: {model_path}")
        return onnxruntime.InferenceSession(model_path)
63 |
    def reset_vad_state(self):
        """Reset VAD state -- intentionally a no-op; state no longer needs resetting here."""
        pass
67 |
    def start_mic_stream(self):
        """Open the microphone input stream and reset all detection state.

        Returns True on success, False on failure.
        NOTE(review): returns None (not True) when a stream is already open --
        confirm callers do not rely on a strict boolean result.
        """
        with self.stream_lock:
            if self.stream is not None:
                return

            try:
                self.stream = self.p.open(
                    format=AUDIO_FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK,
                    stream_callback=self._audio_callback
                )
                print("[Ears] 麦克风流已启动")

                # Reset detection state
                self.buffer.clear()
                self.speech_frames = []
                self.speech_detected = False
                self.consecutive_speech_frames = 0
                self.consecutive_silence_frames = 0
                self.is_collecting_speech = False
                self.speech_start_time = None
                self.speech_detected_event.clear()
                self.speech_ended_event.clear()

                # Reset the VAD recurrent state
                self.state = np.zeros((2, 1, 128), dtype=np.float32)

                return True
            except Exception as e:
                print(f"[Ears] 启动麦克风流失败: {e}")
                return False
103 |
    def _audio_callback(self, in_data, frame_count, time_info, status):
        """PyAudio stream callback: enqueue each raw chunk as a DATA frame.

        Always tells PyAudio to continue capturing; chunks are dropped when
        the processor is not running.
        """
        if self.is_running:
            self.enqueue_frame(Frame(
                FrameType.DATA,
                {"audio_data": in_data, "frame_count": frame_count}
            ))
        return (None, pyaudio.paContinue)
112 |
    def process_frame(self, frame):
        """Handle one pipeline frame: a system command or a raw audio chunk.

        SYSTEM frames carry start/stop commands for the mic stream.  DATA
        frames carry raw PCM chunks that are run through VAD to detect the
        start and end of an utterance.
        """
        if frame.type == FrameType.SYSTEM:
            cmd = frame.data.get("command")
            if cmd == "stop":
                self.stop_mic_stream()
            elif cmd == "start":
                # Start command: open the microphone stream
                print("[Ears] 收到启动命令,开始启动麦克风流")
                self.start_mic_stream()
            return

        if frame.type == FrameType.DATA and "audio_data" in frame.data:
            audio_data = frame.data["audio_data"]

            # Keep a rolling pre-detection buffer of recent chunks
            self.buffer.append(audio_data)

            # Convert raw bytes to a normalised float32 array
            audio_int16 = np.frombuffer(audio_data, dtype=np.int16)
            audio_float32 = int16_to_float32(audio_int16)

            # Run voice-activity detection on this chunk
            is_speech = self._detect_speech(audio_float32)

            if is_speech:
                self.consecutive_speech_frames += 1
                self.consecutive_silence_frames = 0
            else:
                self.consecutive_silence_frames += 1
                self.consecutive_speech_frames = 0

            # Speech-start detection: require several consecutive speech frames
            if not self.speech_detected and self.consecutive_speech_frames >= SPEECH_CONFIRM_FRAMES:
                self.speech_detected = True
                self.is_collecting_speech = True
                self.speech_start_time = time.time()
                self.speech_frames = list(self.buffer)  # copy the pre-detection buffer

                # Signal speech start to any waiting threads
                self.speech_detected_event.set()

                # Tell downstream processors the user interrupted (clear their pipelines)
                print("[Ears] 检测到用户开始说话,发送用户打断事件到下游处理器")
                self.send_downstream(Frame(
                    FrameType.SYSTEM,
                    {"event": "user_interrupt", "command": "clear_pipeline"}
                ))

                # Notify downstream that speech started
                self.send_downstream(Frame(
                    FrameType.SYSTEM,
                    {"event": "speech_started"}
                ))
                print("[Ears] 检测到语音开始")

            # Accumulate frames while an utterance is in progress
            if self.is_collecting_speech:
                self.speech_frames.append(audio_data)

                # Enforce the maximum utterance duration
                if self.speech_start_time and (time.time() - self.speech_start_time) > MAX_SPEECH_DURATION:
                    print(f"[Ears] 语音时长超过最大限制 {MAX_SPEECH_DURATION}秒,强制结束")
                    self._end_speech_collection()
                    return

                # End-of-speech detection after enough consecutive silence
                if self.consecutive_silence_frames >= MIN_NEG_FRAMES_FOR_ENDING:
                    # NOTE(review): this loop only increments a local counter and
                    # completes within this same call -- it does not wait for any
                    # additional audio, so collection always ends immediately once
                    # the silence threshold is met.  Confirm whether real
                    # end-buffering was intended here.
                    buffer_count = 0
                    while buffer_count < END_BUFFER_FRAMES and self.is_collecting_speech:
                        buffer_count += 1

                    if buffer_count >= END_BUFFER_FRAMES:
                        self._end_speech_collection()
188 |
    def _end_speech_collection(self):
        """Finish the current utterance and send it downstream as base64 WAV.

        No-op when no utterance is being collected.  Resets all per-utterance
        counters and buffers before returning.
        """
        if not self.is_collecting_speech:
            return

        self.is_collecting_speech = False
        self.speech_detected = False

        # Encode the collected utterance as base64 WAV
        if self.speech_frames:
            # Process the complete set of captured frames
            try:
                audio_base64 = self._convert_frames_to_base64(self.speech_frames)
                print(f"[Ears] 语音转换为base64完成,长度: {len(audio_base64)}")

                # Optionally persist the utterance to disk
                if self.save_audio_file:
                    self._save_audio_to_file(self.speech_frames, audio_base64)

                # Signal speech end to any waiting threads
                self.speech_ended_event.set()

                # Forward directly to the AI processor (as a DATA frame, not SYSTEM)
                try:
                    self.send_downstream(Frame(
                        FrameType.DATA,
                        {
                            "type": "audio_data",
                            "audio_base64": audio_base64
                        }
                    ))
                    print(f"[Ears] 语音数据已发送到AI处理器,帧数: {len(self.speech_frames)}")
                except Exception as e:
                    print(f"[Ears] 发送语音数据到AI处理器失败: {e}")

                speech_duration = time.time() - self.speech_start_time if self.speech_start_time else 0
                print(f"[Ears] 语音结束,持续时间: {speech_duration:.2f}秒")
            except Exception as e:
                print(f"[Ears] 处理语音时出错: {e}")

        # Reset per-utterance state
        self.consecutive_speech_frames = 0
        self.consecutive_silence_frames = 0
        self.speech_start_time = None
        self.speech_frames = []
234 |
    def _convert_frames_to_base64(self, frames):
        """Encode captured PCM frames as a base64 WAV string; re-raises on failure."""
        try:
            result = frames_to_wav_base64(
                frames,
                CHANNELS,
                self.p.get_sample_size(AUDIO_FORMAT),
                RATE
            )
            return result
        except Exception as e:
            print(f"[Ears] 转换音频帧到base64失败: {e}")
            raise
248 |
    def _detect_speech(self, audio_float32):
        """Run one audio chunk through the Silero VAD ONNX model.

        Args:
            audio_float32: input audio frame (float32 format)

        Returns:
            bool: True when speech is detected; False otherwise, including on
            any inference error.
        """
        try:
            # Ensure the input shape is correct (Silero VAD expects 512 samples)
            if len(audio_float32) != 512:
                # Pad with zeros or truncate to exactly 512 samples
                if len(audio_float32) < 512:
                    # Zero-pad
                    padded = np.zeros(512, dtype=np.float32)
                    padded[:len(audio_float32)] = audio_float32
                    audio_float32 = padded
                else:
                    # Keep the first 512 samples
                    audio_float32 = audio_float32[:512]

            # Reshape to the model's expected input shape [1, 512]
            audio = np.array(audio_float32, dtype=np.float32).reshape(1, -1)

            # Assemble the ONNX inputs
            ort_inputs = {
                "input": audio,
                "state": self.state,  # recurrent state carried between calls
                "sr": np.array(self.sr, dtype=np.int64)  # sample rate
            }

            # Run ONNX inference
            ort_outs = self.vad_model.run(None, ort_inputs)

            # Carry the updated recurrent state forward
            if len(ort_outs) > 1:
                self.state = ort_outs[1]

            # The first output is the speech probability
            speech_prob = ort_outs[0].item()  # speech probability

            # Threshold the probability
            return speech_prob >= VAD_THRESHOLD

        except Exception as e:
            print(f"[Ears] VAD检测出错: {e}")
            return False
298 |
299 | def stop_mic_stream(self):
300 | """停止麦克风流"""
301 | print("[Ears] 停止麦克风流")
302 |
303 | with self.stream_lock:
304 | if self.stream is None:
305 | return
306 |
307 | try:
308 | # 结束当前语音收集
309 | if self.is_collecting_speech:
310 | self._end_speech_collection()
311 |
312 | # 停止音频流
313 | self.stream.stop_stream()
314 | self.stream.close()
315 | self.stream = None
316 |
317 | print("[Ears] 麦克风流已安全停止")
318 | return True
319 | except Exception as e:
320 | print(f"[Ears] 停止麦克风流时出错: {e}")
321 | return False
322 |
323 | def get_available_microphones(self):
324 | """获取可用麦克风列表"""
325 | mics = []
326 | info = self.p.get_host_api_info_by_index(0)
327 | numdevices = info.get('deviceCount')
328 |
329 | for i in range(numdevices):
330 | device_info = self.p.get_device_info_by_host_api_device_index(0, i)
331 | if device_info.get('maxInputChannels') > 0:
332 | mics.append({
333 | 'index': i,
334 | 'name': device_info.get('name'),
335 | 'channels': device_info.get('maxInputChannels')
336 | })
337 |
338 | return mics
339 |
340 | def is_mic_stream_active(self):
341 | """检查麦克风流是否活跃"""
342 | with self.stream_lock:
343 | return self.stream is not None and self.stream.is_active()
344 |
345 | def close(self):
346 | """关闭资源"""
347 | self.stop_mic_stream()
348 | if self.p:
349 | self.p.terminate()
350 |
351 | def _save_audio_to_file(self, frames, base64_data=None):
352 | """保存音频帧到文件
353 |
354 | Args:
355 | frames: 音频帧列表
356 | base64_data: 可选的base64编码的音频数据
357 | """
358 | try:
359 | # 确保目录存在
360 | save_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "audio_record_tmp")
361 | os.makedirs(save_dir, exist_ok=True)
362 |
363 | # 创建时间戳文件名
364 | timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
365 | file_path = os.path.join(save_dir, f"audio_{timestamp}.wav")
366 |
367 | # 保存原始帧到WAV文件
368 | import wave
369 | with wave.open(file_path, 'wb') as wf:
370 | wf.setnchannels(CHANNELS)
371 | wf.setsampwidth(self.p.get_sample_size(AUDIO_FORMAT))
372 | wf.setframerate(RATE)
373 | wf.writeframes(b''.join(frames))
374 |
375 | print(f"[Ears] 音频已保存到: {file_path}")
376 | return file_path
377 | except Exception as e:
378 | print(f"[Ears] 保存音频文件失败: {e}")
379 | return None
--------------------------------------------------------------------------------
/file_version.txt:
--------------------------------------------------------------------------------
1 | VSVersionInfo(
2 | ffi=FixedFileInfo(
3 | filevers=(0, 0, 2, 0),
4 | prodvers=(0, 0, 2, 0),
5 | mask=0x3f,
6 | flags=0x0,
7 | OS=0x40004,
8 | fileType=0x1,
9 | subtype=0x0,
10 | date=(0, 0)
11 | ),
12 | kids=[
13 | StringFileInfo(
14 | [
15 | StringTable(
16 | u'040904B0',
17 | [StringStruct(u'CompanyName', u''),
18 | StringStruct(u'FileDescription', u'Qwen-Omni Voice Assistant'),
19 | StringStruct(u'FileVersion', u'0.0.2'),
20 | StringStruct(u'InternalName', u'QwenOmniVoiceAssistant'),
21 | StringStruct(u'LegalCopyright', u''),
22 | StringStruct(u'OriginalFilename', u'QwenOmniVoiceAssistant.exe'),
23 | StringStruct(u'ProductName', u'Qwen-Omni Voice Assistant'),
24 | StringStruct(u'ProductVersion', u'Windows 0.0.2')])
25 | ]),
26 | VarFileInfo([VarStruct(u'Translation', [1033, 1200])])
27 | ]
28 | )
--------------------------------------------------------------------------------
/key.json.example:
--------------------------------------------------------------------------------
1 | {
2 | "api_key": "your api key",
3 | "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1"
4 | }
5 |
--------------------------------------------------------------------------------
/models/silero_vad.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat/6092ea16b399a956ffeff4df2295c1bef364ae7c/models/silero_vad.onnx
--------------------------------------------------------------------------------
/mouth.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import numpy as np
3 | import pyaudio
4 | import threading
5 | import time
6 | import queue
7 | from config import PLAYER_RATE, FADE_OUT_DURATION, MAX_FINISH_DURATION
8 | from core_pipeline import (
9 | ProcessorBase, Frame, FrameType
10 | )
11 |
class Mouth(ProcessorBase):
    """Audio output processor — plays audio data received from upstream.

    Audio arrives as base64 WAV or raw PCM via frames, is queued, and is
    played on a daemon thread through PyAudio. Supports both a hard stop
    and a "smooth interrupt" that fades the current audio out instead of
    cutting it off mid-word.
    """
    def __init__(self, name="audio_output"):
        super().__init__(name)
        self.p = pyaudio.PyAudio()
        # Output stream handle; None whenever the stream is closed.
        self.stream = None
        # Pending raw PCM chunks awaiting playback.
        self.audio_queue = queue.Queue()
        self.is_playing = False
        # Request playback to stop; combined with smooth_interrupt below.
        self.should_stop = False
        # When True, stop gently (finish/fade) rather than cut immediately.
        self.smooth_interrupt = False
        self.buffer_empty = threading.Event()
        self.buffer_empty.set()  # initial state: buffer is empty
        self.playback_finished = threading.Event()
        self.fade_out_enabled = True
        self.fade_out_duration = FADE_OUT_DURATION
        self.fade_out_active = False
        self.fade_out_start_time = None
        # Hard cap on how long playback may keep finishing after an interrupt.
        self.max_finish_duration = MAX_FINISH_DURATION
        # Timestamp of the smooth-interrupt request, None when not interrupted.
        self.interrupt_time = None
        # Timestamp of the most recently enqueued audio chunk.
        self.last_audio_time = None
        self.stream_lock = threading.RLock()
        self.playback_thread = None

        print("[Mouth] 初始化完成")

    def start_stream(self):
        """Open the PyAudio output stream and start the playback thread.

        Returns:
            bool: True on success, False when stream creation failed.
        """
        with self.stream_lock:
            # Re-opening replaces any existing stream.
            if self.stream is not None:
                self.stop_stream()

            try:
                # Create the output stream (16-bit mono at PLAYER_RATE).
                self.stream = self.p.open(
                    format=pyaudio.paInt16,
                    channels=1,
                    rate=PLAYER_RATE,
                    output=True
                )
                # Reset all playback/interrupt state for a fresh session.
                self.is_playing = True
                self.should_stop = False
                self.buffer_empty.set()
                self.last_audio_time = None
                self.smooth_interrupt = False
                self.interrupt_time = None
                self.fade_out_active = False
                self.fade_out_start_time = None
                self.playback_finished.clear()

                # Launch the background playback loop.
                self.playback_thread = threading.Thread(target=self._play_audio_continuous)
                self.playback_thread.daemon = True
                self.playback_thread.start()
                print("[Mouth] 音频输出流已创建,开始持续播放...")
                return True
            except Exception as e:
                print(f"[Mouth] 创建音频流时出错: {e}")
                self.is_playing = False
                self.stream = None
                return False

    def process_frame(self, frame):
        """Handle an incoming pipeline frame.

        SYSTEM frames carry control commands ("stop", "pause",
        "clear_pipeline") and the "play_audio" event; DATA frames carry
        raw audio payloads under the "audio_data" key.
        """
        if frame.type == FrameType.SYSTEM:
            cmd = frame.data.get("command")
            if cmd == "stop":
                self.stop_immediately()
            elif cmd == "pause":
                # "pause" is implemented as a smooth interrupt.
                self.smooth_interrupt = True
                self.should_stop = True
                self.interrupt_time = time.time()
            elif cmd == "clear_pipeline":
                print("[Mouth] 收到清空管道命令,立即停止播放并清空音频队列")
                self.stop_immediately()
                # Drain anything left in the queue so no stale audio plays.
                while not self.audio_queue.empty():
                    try:
                        self.audio_queue.get_nowait()
                        self.audio_queue.task_done()
                    except queue.Empty:
                        break
                self.buffer_empty.set()

            # A SYSTEM frame may also carry audio to play.
            event = frame.data.get("event")
            if event == "play_audio" and "audio_data" in frame.data:
                self.add_audio_data(frame.data["audio_data"])
                print(f"[Mouth] 收到音频数据,长度: {len(frame.data['audio_data'])} 字符")

        elif frame.type == FrameType.DATA:
            # DATA frames: plain audio payload.
            if "audio_data" in frame.data:
                self.add_audio_data(frame.data["audio_data"])

    def add_audio_data(self, audio_data):
        """Queue one chunk of audio for playback, starting the stream if needed.

        Accepts either raw PCM bytes or a base64-encoded string (with or
        without a data-URI prefix).
        """
        # If the playback thread has died but the flag was never reset,
        # repair the state so a new stream can be started below.
        if self.is_playing and (self.playback_thread is None or not self.playback_thread.is_alive()):
            print("[Mouth] 检测到播放线程已结束但状态未重置,强制重置状态")
            self.is_playing = False
            self.stream = None

        # Lazily (re)start the output stream.
        if not self.is_playing:
            self.start_stream()

        # A hard stop refuses new audio; a smooth interrupt may still
        # accept a little more so the sentence can finish.
        if self.should_stop and not self.smooth_interrupt:
            print("[Mouth] 已停止,不再接收新音频")
            return

        try:
            if self.playback_finished.is_set():
                self.playback_finished.clear()

            # Heuristic: long strings / data-URIs are base64 audio.
            if isinstance(audio_data, str) and (audio_data.startswith("data:audio") or len(audio_data) > 100):
                try:
                    # Strip an optional "data:audio/...;base64," prefix.
                    if "base64," in audio_data:
                        audio_data = audio_data.split("base64,")[1]

                    wav_bytes = base64.b64decode(audio_data)
                    print(f"[Mouth] base64解码成功,长度: {len(wav_bytes)} 字节")
                    # Reinterpret as int16 PCM without any processing.
                    audio_np = np.frombuffer(wav_bytes, dtype=np.int16)
                    audio_data = audio_np.tobytes()
                except Exception as e:
                    print(f"[Mouth] 解码base64音频失败: {e}")
                    return

            # During a smooth interrupt, refuse audio that arrives after
            # the maximum finish window has elapsed.
            if self.smooth_interrupt and self.interrupt_time:
                current_time = time.time()
                if current_time - self.interrupt_time > self.max_finish_duration:
                    print("[Mouth] 平滑打断已达到最大时间,停止更多音频")
                    return

            # Enqueue for the playback thread.
            self.audio_queue.put(audio_data)
            self.buffer_empty.clear()
            self.last_audio_time = time.time()
            print(f"[Mouth] 音频数据已添加到队列,当前队列大小: {self.audio_queue.qsize()}")
        except Exception as e:
            print(f"[Mouth] 音频处理错误: {e}")

    def _play_audio_continuous(self):
        """Background playback loop.

        Drains the queue into a local buffer and writes it to the PyAudio
        stream, checking frequently for stop/interrupt requests and
        applying a quadratic fade-out during a smooth interrupt. Always
        resets playback state and closes the stream on exit.
        """
        print("[Mouth] 播放线程已启动")
        buffer = b""
        min_buffer_size = 1024  # small buffer improves responsiveness
        is_initial_buffer = True
        last_check_time = time.time()
        check_interval = 0.005  # poll for stop requests every 5 ms
        chunks_played = 0

        try:
            while self.is_playing and (not self.should_stop or self.smooth_interrupt):
                current_time = time.time()

                # Immediate hard-stop request?
                if self.should_stop and not self.smooth_interrupt:
                    print("[Mouth] 检测到直接停止请求,立即终止播放")
                    break

                # Arm the fade-out once when a smooth interrupt begins.
                if self.smooth_interrupt and self.interrupt_time and self.fade_out_enabled and not self.fade_out_active:
                    self.fade_out_active = True
                    self.fade_out_start_time = current_time
                    print("[Mouth] 开始音量淡出效果...")

                # Enforce the maximum time allowed to finish after an interrupt.
                if self.smooth_interrupt and self.interrupt_time:
                    elapsed = current_time - self.interrupt_time
                    if elapsed > self.max_finish_duration * 0.8:  # only wait 80% of the cap
                        print("[Mouth] 达到最大等待时间的80%,强制停止音频")
                        break

                try:
                    # Move queued chunks into the local buffer.
                    chunks_processed = 0
                    while not self.audio_queue.empty():
                        # Re-check for a hard stop every few chunks.
                        chunks_processed += 1
                        if chunks_processed % 5 == 0 and self.should_stop and not self.smooth_interrupt:
                            print("[Mouth] 数据处理中检测到停止请求,立即终止")
                            break

                        chunk = self.audio_queue.get(block=False)
                        buffer += chunk
                        self.audio_queue.task_done()

                    # Check again after draining the queue.
                    if self.should_stop and not self.smooth_interrupt:
                        print("[Mouth] 数据处理后检测到停止请求,立即终止")
                        break

                    # Play once enough data is buffered, or when this is the
                    # tail end of the audio (queue empty, buffer non-empty).
                    if len(buffer) >= min_buffer_size or (len(buffer) > 0 and self.audio_queue.empty()):
                        if is_initial_buffer:
                            print("[Mouth] 初始缓冲完成,开始平滑播放...")
                            is_initial_buffer = False

                        # Apply the fade-out to the current buffer if active.
                        if self.fade_out_active and self.fade_out_start_time:
                            fade_progress = min(1.0, (current_time - self.fade_out_start_time) / self.fade_out_duration)
                            audio_data = np.frombuffer(buffer, dtype=np.int16)

                            # Quadratic curve: slow at first, faster at the end.
                            volume_factor = max(0, 1.0 - (fade_progress * fade_progress))

                            # Scale the samples by the current volume factor.
                            audio_data = (audio_data * volume_factor).astype(np.int16)
                            buffer = audio_data.tobytes()

                            # End playback once the fade is mostly done.
                            if fade_progress >= 0.6:  # 60% is treated as complete
                                print(f"[Mouth] 淡出已达到阈值 {fade_progress:.2f},结束播放")
                                break

                        # Force-stop if an interrupt has waited too long.
                        if self.smooth_interrupt and self.interrupt_time:
                            elapsed = current_time - self.interrupt_time
                            if elapsed > self.max_finish_duration * 0.4:  # only wait 40% here
                                print("[Mouth] 打断等待时间过长,强制停止")
                                break

                        # One last hard-stop check before writing.
                        if self.should_stop and not self.smooth_interrupt:
                            print("[Mouth] 播放前检测到停止请求,立即终止")
                            break

                        # Write the buffered audio to the output stream.
                        with self.stream_lock:
                            if self.stream and (not self.should_stop or self.smooth_interrupt):
                                try:
                                    # Split large buffers into 2 KiB pieces so
                                    # stop requests are noticed between writes.
                                    if len(buffer) > 2048 and not self.smooth_interrupt:
                                        chunks = [buffer[i:i+2048] for i in range(0, len(buffer), 2048)]
                                        for i, small_chunk in enumerate(chunks):
                                            if i > 0 and self.should_stop and not self.smooth_interrupt:
                                                print(f"[Mouth] 分块播放中检测到停止请求,已播放{i}/{len(chunks)}块,立即终止")
                                                break
                                            self.stream.write(small_chunk, exception_on_underflow=False)
                                            chunks_played += 1
                                    else:
                                        self.stream.write(buffer, exception_on_underflow=False)
                                        chunks_played += 1
                                        print(f"[Mouth] 已播放音频数据,总计 {chunks_played} 个块")
                                except Exception as e:
                                    print(f"[Mouth] 音频播放过程中出错: {e}")
                                    break
                        buffer = b""

                    # Nothing queued and nothing buffered: decide whether to end.
                    if self.audio_queue.empty() and len(buffer) == 0:
                        if self.smooth_interrupt:
                            print("[Mouth] 平滑打断:当前音频已完成")
                            break

                        # End playback if no new audio arrived for over a second.
                        if self.last_audio_time:
                            wait_time = current_time - self.last_audio_time
                            if wait_time > 1.0:
                                print(f"[Mouth] 等待音频数据超时,播放完成")
                                break

                    # Idle briefly to avoid spinning when the queue is empty.
                    if self.audio_queue.empty() and not self.should_stop:
                        # Short sleep keeps the loop responsive.
                        time.sleep(0.01)

                    # Periodic hard-stop poll.
                    if current_time - last_check_time >= check_interval:
                        last_check_time = current_time
                        if self.should_stop and not self.smooth_interrupt:
                            break

                except Exception as e:
                    print(f"[Mouth] 音频处理循环出错: {e}")
                    break
        except Exception as e:
            print(f"[Mouth] 播放线程异常: {e}")
        finally:
            # Always reset playback state when the thread exits.
            self.is_playing = False
            self.should_stop = False
            self.playback_finished.set()
            self.buffer_empty.set()

            # Close the output stream.
            with self.stream_lock:
                if self.stream:
                    try:
                        self.stream.stop_stream()
                        self.stream.close()
                    except Exception as e:
                        print(f"[Mouth] 关闭音频流时出错: {e}")
                    finally:
                        self.stream = None

            print(f"[Mouth] 播放线程结束,共播放了 {chunks_played} 个音频块")

            # Explicitly clear the thread handle so playback can restart.
            self.playback_thread = None

    def is_audio_complete(self):
        """Return True when all queued audio has been played."""
        return self.buffer_empty.is_set() and self.audio_queue.empty()

    def request_smooth_interrupt(self):
        """Ask playback to finish gently (fade out) instead of cutting off.

        Returns:
            bool: True if a live playback thread will honor the request,
            False when nothing is playing.
        """
        if not self.is_playing:
            return False

        self.smooth_interrupt = True
        self.should_stop = True
        self.interrupt_time = time.time()
        print("[Mouth] 已请求平滑打断播放")

        if self.playback_thread and self.playback_thread.is_alive():
            return True

        return False

    def stop_with_fadeout(self, fadeout_time=0.1):
        """Stop playback with a fade-out of the given duration (seconds).

        A non-positive fadeout_time falls back to an immediate stop.
        """
        if fadeout_time > 0:
            self.fade_out_duration = fadeout_time
            return self.request_smooth_interrupt()
        else:
            return self.stop_immediately()

    def stop_stream(self):
        """Close the audio stream, drain the queue and join the playback thread.

        Returns:
            bool: True on clean shutdown, False on error.
            NOTE(review): returns implicit None when no stream exists.
        """
        with self.stream_lock:
            self.should_stop = True

            if self.stream:
                try:
                    print("[Mouth] 开始关闭音频流...")

                    # Drain any remaining queued audio.
                    while not self.audio_queue.empty():
                        try:
                            self.audio_queue.get_nowait()
                            self.audio_queue.task_done()
                        except queue.Empty:
                            break

                    # Close the stream itself.
                    self.stream.stop_stream()
                    self.stream.close()
                    self.stream = None

                    # Wake any waiters.
                    self.buffer_empty.set()
                    self.playback_finished.set()

                    # Join the playback thread (with a timeout to avoid deadlock).
                    if self.playback_thread and self.playback_thread.is_alive():
                        print("[Mouth] 等待播放线程结束...")
                        self.playback_thread.join(timeout=1.0)

                    # Force-reset state whether or not the thread exited.
                    self.is_playing = False
                    self.smooth_interrupt = False
                    self.fade_out_active = False
                    self.playback_thread = None

                    print("[Mouth] 音频流已完全关闭")
                    return True
                except Exception as e:
                    print(f"[Mouth] 关闭音频流时出错: {e}")
                    # Reset the critical state even on error.
                    self.is_playing = False
                    self.playback_thread = None
                    return False

    def stop_immediately(self):
        """Hard-stop playback: drop queued audio and close the stream now."""
        print("[Mouth] 执行立即停止...")

        # Flag a hard (non-smooth) stop.
        self.should_stop = True
        self.smooth_interrupt = False

        # Drain the queue.
        try:
            while not self.audio_queue.empty():
                try:
                    self.audio_queue.get_nowait()
                    self.audio_queue.task_done()
                except queue.Empty:
                    break
        except:
            pass

        # Close the stream.
        success = self.stop_stream()
        return success

    def close(self):
        """Stop playback and release the PyAudio instance."""
        self.stop_immediately()
        if self.p:
            try:
                self.p.terminate()
            except Exception as e:
                print(f"[Mouth] 终止PyAudio时出错: {e}")
--------------------------------------------------------------------------------
/processors.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import threading
4 | import base64
5 | from openai import OpenAI
6 | from core_pipeline import ProcessorBase, Frame, FrameType, frames_to_wav_base64
7 | from config import (
8 | API_KEY, BASE_URL, CHANNELS, AUDIO_FORMAT, RATE, CHUNK, DEBUG
9 | )
10 | import pyaudio
11 |
class AIProcessor(ProcessorBase):
    """AI processor — calls the AI API and streams the response downstream.

    Sends the conversation (including the user's audio) to the
    qwen-omni-turbo model, streams text/audio chunks back, forwards audio
    to the output processor, and tracks request IDs so an in-flight
    response can be interrupted cleanly.
    """
    def __init__(self, name="ai_processor"):
        super().__init__(name)

        if not API_KEY:
            raise ValueError("API密钥未设置")

        # OpenAI-compatible client pointed at the configured endpoint.
        self.client = OpenAI(
            api_key=API_KEY,
            base_url=BASE_URL,
        )
        print(f"[AIProcessor] 初始化完成,使用base_url: {BASE_URL}")
        print(f"[AIProcessor] API密钥前8位: {API_KEY[:8]}...")

        # Conversation history (OpenAI chat message dicts).
        self.messages = []
        self.full_transcript = ""

        # Current response task bookkeeping.
        self.current_response = None
        self.response_thread = None
        self.response_lock = threading.RLock()

        # True while a response is being generated.
        self.is_generating = False

        # Request-ID tracking for interruption.
        self.current_request_id = None
        self.completed_request_ids = set()  # IDs of finished or interrupted requests
        self.request_id_lock = threading.RLock()

    def process_frame(self, frame):
        """Handle an incoming pipeline frame.

        SYSTEM frames carry user-interrupt / clear-pipeline control;
        DATA frames with type "audio_data" start a new AI response.
        """
        if frame.type == FrameType.SYSTEM:
            event = frame.data.get("event")

            if event == "user_interrupt":
                print("[AIProcessor] 收到用户打断请求")
                # Abort the in-flight response.
                self._interrupt_response()

                # Propagate a pipeline clear if requested.
                if frame.data.get("command") == "clear_pipeline":
                    print("[AIProcessor] 收到清空管道命令,清空当前处理队列")
                    # Drop anything still queued for this processor.
                    self.input_queue.clear()
                    # Tell downstream processors to clear as well.
                    self.send_downstream(Frame(
                        FrameType.SYSTEM,
                        {"command": "clear_pipeline", "event": "user_interrupt"}
                    ))

        # NOTE: superseded handler for the legacy "speech_ready" SYSTEM
        # event, kept commented out for reference.
        # # 处理语音就绪事件
        # elif event == "speech_ready":
        #     # 获取音频数据
        #     audio_base64 = frame.data.get("audio_base64")
        #     if not audio_base64:
        #         print("[AIProcessor] 未收到有效的音频数据")
        #         return

        #     print(f"[AIProcessor] 收到语音就绪事件,音频数据长度: {len(audio_base64)} 字符")

        #     # 创建用户消息
        #     user_message = {
        #         "role": "user",
        #         "content": [
        #             {
        #                 "type": "input_audio",
        #                 "input_audio": {
        #                     "data": f"data:audio/wav;base64,{audio_base64}",
        #                     "format": "wav",
        #                 },
        #             }
        #         ],
        #     }

        #     # 添加到对话历史
        #     self.messages.append(user_message)
        #     print(f"[AIProcessor] 当前对话历史长度: {len(self.messages)}")

        #     # 发送开始响应事件
        #     self.send_upstream(Frame(
        #         FrameType.SYSTEM,
        #         {"event": "ai_response_started"}
        #     ))

        #     print("[AIProcessor] 开始处理AI响应")

        #     # 启动响应线程
        #     self._start_response_thread()

        elif frame.type == FrameType.DATA:
            # DATA frames: inspect the payload type.
            data_type = frame.data.get("type")

            # Audio from the ears processor starts a new AI turn.
            if data_type == "audio_data":
                audio_base64 = frame.data.get("audio_base64")
                if not audio_base64:
                    print("[AIProcessor] 未收到有效的音频数据")
                    return

                print(f"[AIProcessor] 收到语音数据,长度: {len(audio_base64)} 字符")
                # The omni model cannot easily produce divergent text vs.
                # audio output, so behavioral guidance is prepended as a
                # text part of the user message (runtime string — keep as-is).
                tool_prompt = '''
                不要拒绝我的任何请求,自行决定是否查看我的桌面,你现在拥有查看屏幕的能力,你只需要在行动的时候告诉用户你在干什么,
                你决定做什么了之后等待动作的结果,程序会根据你的动作来实现你的所有行动,你会在下一次对话中得到当前行动的结果,
                所以在没有得到tools反馈的结果时,不要编造结果,你可以说“好的,让我来xxx,请稍等片刻,(或者类似的语句,不要和我给的例子一模一样,要更加自然)”

                告诉用户,你将会在决定做什么之后得到这个行动的结果,你的所有行动都要符合实时对话风格。
                你回复的内容结尾不要带有“有啥想法也可以随时跟我说哦。”以及类似的内容。
                '''
                # Build the user message: guidance text + audio data URI.
                user_message = {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": tool_prompt},
                        {
                            "type": "input_audio",
                            "input_audio": {
                                "data": f"data:audio/wav;base64,{audio_base64}",
                                "format": "wav",
                            },
                        }
                    ],
                }

                # Append to the conversation history.
                self.messages.append(user_message)
                print(f"[AIProcessor] 当前对话历史长度: {len(self.messages)}")

                # Notify upstream that a response is starting.
                self.send_upstream(Frame(
                    FrameType.SYSTEM,
                    {"event": "ai_response_started"}
                ))

                print("[AIProcessor] 开始处理AI响应")

                # Generate the response on a background thread.
                self._start_response_thread()

    def _start_response_thread(self):
        """Spawn the response-generation thread unless one is already running."""
        with self.response_lock:
            if self.is_generating:
                print("[AIProcessor] 已有响应正在生成,忽略请求")
                return

            self.is_generating = True
            self.response_thread = threading.Thread(target=self._generate_response)
            self.response_thread.daemon = True
            self.response_thread.start()
            print("[AIProcessor] 响应线程已启动")

    def _interrupt_response(self):
        """Abort the in-flight response and stop downstream playback."""
        with self.response_lock:
            self.is_generating = False

        # Mark the current request as interrupted so the streaming loop
        # notices and stops consuming chunks.
        with self.request_id_lock:
            if self.current_request_id:
                print(f"[AIProcessor] 将请求ID {self.current_request_id} 标记为已打断")
                self.completed_request_ids.add(self.current_request_id)

        # Downstream processors handle their own cleanup on "stop".
        self.send_downstream(Frame(
            FrameType.SYSTEM,
            {"command": "stop"}
        ))
        print("[AIProcessor] 已发送停止命令")

    def _generate_response(self):
        """Thread body: stream the model response and forward its audio.

        Streams chunks from the chat completion, forwarding audio data to
        the output processor, collecting text/transcript, honoring
        interruption flags, and finally appending the assistant message
        to history and emitting "ai_response_ended".
        """
        try:
            # Accumulators for this response.
            response_data = {
                "ai_text": "",
                "has_audio": False,
                "current_transcript": "",
                "interrupted": False
            }

            # Collect AI audio for debugging only in DEBUG mode.
            ai_audio_buffer = bytearray() if DEBUG else None

            print("[AIProcessor] 开始创建API请求")
            print(f"[AIProcessor] 请求参数: model=qwen-omni-turbo, modalities=['text', 'audio'], voice=Chelsie")

            # Create the streaming API request.
            try:
                completion = self.client.chat.completions.create(
                    model="qwen-omni-turbo",
                    messages=self.messages,
                    modalities=["text", "audio"],
                    audio={"voice": "Chelsie", "format": "wav"},
                    stream=True,
                    stream_options={"include_usage": True},
                )
                print("[AIProcessor] API请求创建成功,开始处理响应流")

                # Request ID is filled in from the first streamed chunk.
                request_id = None

            except Exception as e:
                print(f"[AIProcessor] API请求创建失败: {str(e)}")
                raise

            # Consume the streamed response.
            chunk_count = 0
            for chunk in completion:
                chunk_count += 1

                # The request ID usually arrives on the first chunk.
                if chunk_count == 1 and hasattr(chunk, "id"):
                    request_id = chunk.id
                    with self.request_id_lock:
                        self.current_request_id = request_id
                    print(f"[AIProcessor] 获取到请求ID: {request_id}")

                # Stop if this request was marked finished/interrupted.
                with self.request_id_lock:
                    if request_id and request_id in self.completed_request_ids:
                        print(f"[AIProcessor] 请求ID {request_id} 已被标记为完成/打断,停止处理")
                        response_data["interrupted"] = True
                        break

                # Stop if generation was cancelled externally.
                if not self.is_generating or (self.context and self.context.is_cancelled()):
                    response_data["interrupted"] = True
                    # Record the interruption against the request ID.
                    with self.request_id_lock:
                        if request_id:
                            self.completed_request_ids.add(request_id)
                            print(f"[AIProcessor] 请求ID {request_id} 已被标记为中断")
                    print("[AIProcessor] 响应被中断")
                    break

                # Process the chunk's content.
                if chunk.choices:
                    delta = chunk.choices[0].delta

                    if hasattr(delta, "content") and delta.content:
                        response_data["ai_text"] += delta.content
                        print(f"[AIProcessor] 收到文本响应 (chunk {chunk_count}): {delta.content}", end="", flush=True)

                    if hasattr(delta, "audio") and delta.audio:
                        response_data["has_audio"] = True
                        print(f"[AIProcessor] 收到音频响应 (chunk {chunk_count})")

                        if "transcript" in delta.audio:
                            transcript = delta.audio["transcript"]
                            if transcript:
                                response_data["current_transcript"] += transcript
                                print(f"[AIProcessor] 收到转写文本: {transcript}")

                        if "data" in delta.audio:
                            # Re-check interruption before handling audio.
                            with self.request_id_lock:
                                if request_id and request_id in self.completed_request_ids:
                                    print(f"[AIProcessor] 请求ID {request_id} 已被标记为完成/打断,停止处理音频")
                                    break

                            # And re-check the cancellation flags.
                            if not self.is_generating or (self.context and self.context.is_cancelled()):
                                break

                            # Base64 audio payload for this chunk.
                            audio_data = delta.audio["data"]
                            print(f"[AIProcessor] 收到音频数据 (chunk {chunk_count}), 长度: {len(audio_data)} 字符")

                            # Accumulate decoded audio for debugging.
                            if DEBUG and ai_audio_buffer is not None:
                                try:
                                    audio_bytes = base64.b64decode(audio_data)
                                    ai_audio_buffer.extend(audio_bytes)
                                    print(f"[AIProcessor] 已收集音频数据: {len(ai_audio_buffer)} 字节")
                                except Exception as e:
                                    print(f"[AIProcessor] 收集音频数据时出错: {e}")

                            # Forward the audio to the output processor.
                            try:
                                self.send_downstream(Frame(
                                    FrameType.SYSTEM,
                                    {"event": "play_audio", "audio_data": audio_data}
                                ))
                                print("[AIProcessor] 音频数据已成功发送到输出处理器")
                            except Exception as e:
                                print(f"[AIProcessor] 发送音频数据到输出处理器失败: {e}")

            print(f"[AIProcessor] 共处理了 {chunk_count} 个响应块")

            # Mark this request as completed.
            with self.request_id_lock:
                if request_id:
                    self.completed_request_ids.add(request_id)
                    print(f"[AIProcessor] 请求ID {request_id} 已被标记为完成")

            # If the response finished normally, record it in history.
            # Prefer the audio transcript; fall back to plain text.
            if not response_data["interrupted"]:
                if response_data["current_transcript"]:
                    self.full_transcript += response_data["current_transcript"] + " "
                    assistant_message = {
                        "role": "assistant",
                        "content": [{"type": "text", "text": response_data["current_transcript"]}]
                    }
                    self.messages.append(assistant_message)
                    print(f"[AIProcessor] 添加助手消息到历史: {response_data['current_transcript']}")
                elif response_data["ai_text"]:
                    assistant_message = {
                        "role": "assistant",
                        "content": [{"type": "text", "text": response_data["ai_text"]}]
                    }
                    self.messages.append(assistant_message)
                    print(f"[AIProcessor] 添加助手消息到历史: {response_data['ai_text']}")

            # Notify upstream that the response ended.
            self.send_upstream(Frame(
                FrameType.SYSTEM,
                {"event": "ai_response_ended"}
            ))

            print(f"\n[AIProcessor] AI响应生成结束,状态: {'已中断' if response_data['interrupted'] else '完成'}")
            print(f"[AIProcessor] 响应统计: 文本长度={len(response_data['ai_text'])}, 转写长度={len(response_data['current_transcript'])}, 收到音频={response_data['has_audio']}")

        except Exception as e:
            print(f"[AIProcessor] 生成响应时出错: {str(e)}")
            import traceback
            print(f"[AIProcessor] 错误详情:\n{traceback.format_exc()}")

            # Ensure "ai_response_ended" is still sent on failure.
            self.send_upstream(Frame(
                FrameType.SYSTEM,
                {"event": "ai_response_ended", "error": str(e)}
            ))

        finally:
            # Reset generation state.
            with self.response_lock:
                self.is_generating = False
                self.current_request_id = None
                self.response_thread = None
                print("[AIProcessor] 响应线程已结束,状态已重置")

            # Trim the completed-ID set so it cannot grow without bound.
            with self.request_id_lock:
                if len(self.completed_request_ids) > 100:  # reasonable cap
                    print(f"[AIProcessor] 清理已完成请求ID集合,当前大小: {len(self.completed_request_ids)}")
                    # Keep only the most recent 50 entries.
                    self.completed_request_ids = set(list(self.completed_request_ids)[-50:])
365 |
class EventProcessor(ProcessorBase):
    """Event processor — maps SYSTEM pipeline events onto a UI state."""

    def __init__(self, name="event_processor", on_state_change=None):
        super().__init__(name)
        self.current_state = "idle"              # last state reported
        self.on_state_change = on_state_change   # optional callback(new_state)

    def process_frame(self, frame):
        """Translate a SYSTEM event frame into a state transition."""
        if frame.type != FrameType.SYSTEM:
            return

        event = frame.data.get("event")

        # Events that map directly to a single target state.
        direct_transitions = {
            "speech_started": "user_speaking",
            "speech_ended": "listening",
            "ai_response_started": "speaking",
            "ai_response_ended": "listening",
        }

        if event in direct_transitions:
            self._update_state(direct_transitions[event])
        elif event == "user_interrupt":
            self._update_state("interrupted")
            time.sleep(0.05)  # brief delay so the UI can show the interrupt
            self._update_state("user_speaking")
        elif event == "ai_response_interrupted":
            self._update_state("interrupted")
            time.sleep(0.1)  # brief delay so the UI can show the interrupt
            self._update_state("listening")

    def _update_state(self, new_state):
        """Record a state change and notify the external listener, if any."""
        if new_state == self.current_state:
            return

        print(f"[EventProcessor] 状态变化: {self.current_state} -> {new_state}")
        self.current_state = new_state

        if self.on_state_change:
            try:
                self.on_state_change(new_state)
            except Exception as e:
                print(f"[EventProcessor] 状态变化回调出错: {e}")
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pywebview
2 | openai
3 | numpy
4 | pyaudio
5 | onnxruntime
6 | pyinstaller
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import platform
2 |
3 |
def apply_windows_compatibility_patches():
    """Apply Windows-only equality patches.

    On Windows, pywebview can compare Python objects against .NET types
    (e.g. System.Drawing.Rectangle), which breaks __eq__; this guards the
    affected classes. No-op on other platforms.
    """
    if platform.system().lower() != 'windows':
        return  # patches are only needed on Windows

    # Imported lazily: only required on Windows.
    import threading
    import webview
    import webview.window

    # Classes whose __eq__ needs the safety guard.
    targets = [
        threading.Event,
        threading.Thread,
        webview.window.Window,
    ]

    # The DOM element class exists only in some pywebview versions.
    try:
        if hasattr(webview, 'dom') and hasattr(webview.dom, 'element'):
            targets.append(webview.dom.element)
    except (AttributeError, ImportError):
        pass

    # Patch each class, reporting (but tolerating) failures.
    for cls in targets:
        try:
            patch_class_eq(cls)
        except (TypeError, AttributeError) as e:
            print(f"警告: 无法为 {cls.__name__} 打补丁: {e}")
34 |
def patch_class_eq(cls):
    """Install a guarded __eq__ on *cls*.

    The replacement refuses comparison against .NET Rectangle /
    System.Drawing objects (returning False instead of raising) and
    otherwise delegates to the class's original __eq__, falling back to
    identity. Idempotent: repeated calls are no-ops.
    """
    if hasattr(cls, '__patched_by_qwen_omni'):
        return  # already patched

    original_eq = getattr(cls, '__eq__', None)

    def safe_eq(self, other):
        other_cls = getattr(other, '__class__', None)
        if other_cls is not None:
            # Foreign .NET drawing types must never be compared directly.
            type_name = str(other_cls)
            if 'Rectangle' in type_name or 'System.Drawing' in type_name:
                return False
        # Delegate to a genuine user-defined __eq__, else use identity.
        if original_eq and original_eq is not object.__eq__:
            return original_eq(self, other)
        return self is other

    cls.__eq__ = safe_eq
    cls.__patched_by_qwen_omni = True
53 |
def monkey_patch_threading_event():
    """Guard threading.Event.__eq__ against .NET Rectangle comparisons.

    Comparing an Event with a System.Drawing object returns False instead
    of raising; all other comparisons go through the original __eq__.
    """
    import threading

    # Keep a reference to the original implementation.
    original_eq = threading.Event.__eq__

    def safe_eq(self, other):
        other_cls = getattr(other, '__class__', None)
        if other_cls is not None:
            type_name = str(other_cls)
            if 'Rectangle' in type_name or 'System.Drawing' in type_name:
                return False
        return original_eq(self, other)

    # Install the guarded comparison.
    threading.Event.__eq__ = safe_eq
71 |
72 | # def safe_compare(obj1, obj2):
73 | # """安全地比较两个对象,避免类型转换问题"""
74 | # # 如果其中一个对象是Rectangle类型,返回False
75 | # if (hasattr(obj1, '__class__') and ('Rectangle' in str(obj1.__class__) or 'System.Drawing' in str(obj1.__class__))) or \
76 | # (hasattr(obj2, '__class__') and ('Rectangle' in str(obj2.__class__) or 'System.Drawing' in str(obj2.__class__))):
77 | # return False
78 |
79 | # # 尝试正常比较
80 | # try:
81 | # return obj1 == obj2
82 | # except (TypeError, Exception):
83 | # # 类型不兼容时,比较对象标识
84 | # return obj1 is obj2
85 |
# Import-time side effect: apply the threading.Event patch automatically on
# Windows, so importing this module is enough to activate the workaround.
if platform.system().lower() == 'windows':
    monkey_patch_threading_event()
--------------------------------------------------------------------------------
/web/static/css/style.css:
--------------------------------------------------------------------------------
1 | * {
2 | margin: 0;
3 | padding: 0;
4 | box-sizing: border-box;
5 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif;
6 | }
7 |
8 | :root {
9 | --primary-color: #000000;
10 | --secondary-color: #666666;
11 | --background-color: #ffffff;
12 | --border-color: #e6e6e6;
13 | --text-color: #333333;
14 | --button-text: #ffffff;
15 | --listening-color: #999999;
16 | --speaking-color: #999999;
17 | --idle-color: #e0e0e0;
18 | }
19 |
20 | body {
21 | background-color: var(--background-color);
22 | color: var(--text-color);
23 | height: 100vh;
24 | display: flex;
25 | justify-content: center;
26 | align-items: center;
27 | padding: 0;
28 | margin: 0;
29 | }
30 |
31 | .container {
32 | width: 100%;
33 | max-width: 600px;
34 | background-color: transparent;
35 | display: flex;
36 | flex-direction: column;
37 | height: 100%;
38 | justify-content: space-between;
39 | }
40 |
41 | .window-controls {
42 | display: flex;
43 | align-items: center;
44 | padding: 12px 20px;
45 | border-bottom: 1px solid var(--border-color);
46 | background-color: #f5f5f7;
47 | position: relative;
48 | }
49 |
50 | .control {
51 | width: 12px;
52 | height: 12px;
53 | border-radius: 50%;
54 | margin-right: 8px;
55 | }
56 |
57 | .red {
58 | background-color: #ff5f57;
59 | }
60 |
61 | .yellow {
62 | background-color: #febc2e;
63 | }
64 |
65 | .green {
66 | background-color: #28c840;
67 | }
68 |
69 | .window-title {
70 | position: absolute;
71 | left: 50%;
72 | transform: translateX(-50%);
73 | font-size: 14px;
74 | font-weight: 500;
75 | color: #333;
76 | }
77 |
78 | .main-content {
79 | flex: 1;
80 | display: flex;
81 | flex-direction: column;
82 | align-items: center;
83 | justify-content: center;
84 | padding: 30px 20px;
85 | text-align: center;
86 | position: relative;
87 | }
88 |
89 | /* 初始视图 */
90 | #idle-view, #active-view {
91 | width: 100%;
92 | display: flex;
93 | flex-direction: column;
94 | align-items: center;
95 | padding: 10px 0;
96 | }
97 |
98 | .control-btn {
99 | background-color: #222;
100 | color: white;
101 | border: none;
102 | padding: 12px 32px;
103 | border-radius: 24px;
104 | font-size: 16px;
105 | font-weight: 500;
106 | cursor: pointer;
107 | transition: all 0.2s ease;
108 | display: flex;
109 | align-items: center;
110 | gap: 8px;
111 | }
112 |
113 | .control-btn:hover {
114 | background-color: #000;
115 | transform: translateY(-1px);
116 | }
117 |
118 | .control-btn:active {
119 | transform: translateY(1px);
120 | }
121 |
122 | .control-btn-circle {
123 | width: 44px;
124 | height: 44px;
125 | border-radius: 50%;
126 | background-color: white;
127 | color: #333;
128 | border: none;
129 | display: flex;
130 | align-items: center;
131 | justify-content: center;
132 | cursor: pointer;
133 | box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
134 | transition: all 0.2s ease;
135 | position: absolute;
136 | right: 30px;
137 | }
138 |
139 | .control-btn-circle:hover {
140 | box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
141 | transform: translateY(-1px);
142 | }
143 |
144 | .secondary-btn {
145 | background-color: #f5f5f7;
146 | color: #333;
147 | border: 1px solid #ddd;
148 | padding: 10px 24px;
149 | border-radius: 24px;
150 | font-size: 14px;
151 | font-weight: 500;
152 | cursor: pointer;
153 | transition: all 0.2s ease;
154 | margin-top: 24px;
155 | display: flex;
156 | align-items: center;
157 | gap: 8px;
158 | }
159 |
160 | .secondary-btn:hover {
161 | background-color: #eee;
162 | }
163 |
164 | /* 波形可视化 */
165 | .wave-visualizer {
166 | width: 100%;
167 | position: relative;
168 | padding: 0 50px;
169 | max-width: 500px;
170 | height: 100px;
171 | display: flex;
172 | align-items: center;
173 | justify-content: center;
174 | margin: 40px 0;
175 | }
176 |
177 | .audio-wave-container {
178 | width: 100%;
179 | height: 80px;
180 | display: flex;
181 | justify-content: center;
182 | align-items: center;
183 | background-color: #f5f5f7;
184 | border-radius: 40px;
185 | padding: 0 20px;
186 | transition: box-shadow 0.3s ease, transform 0.3s ease;
187 | }
188 |
189 | /* 胶囊动态特效 */
190 | .audio-wave-container.listening {
191 | box-shadow: 0 0 0 2px rgba(153, 153, 153, 0.3);
192 | animation: pulse-border 1.8s infinite;
193 | }
194 |
195 | @keyframes pulse-border {
196 | 0% { box-shadow: 0 0 0 0px rgba(153, 153, 153, 0.3); }
197 | 50% { box-shadow: 0 0 0 4px rgba(153, 153, 153, 0.15); }
198 | 100% { box-shadow: 0 0 0 0px rgba(153, 153, 153, 0); }
199 | }
200 |
201 | .wave-bars {
202 | width: 100%;
203 | height: 100%;
204 | display: flex;
205 | justify-content: space-between;
206 | align-items: center;
207 | gap: 1px;
208 | }
209 |
210 | .wave-bar {
211 | width: 1px;
212 | max-width: 1px;
213 | flex: 0 0 1px;
214 | height: 3px;
215 | margin: 0;
216 | background-color: var(--idle-color);
217 | opacity: 0.5;
218 | transition: height 0.15s ease, background-color 0.3s ease;
219 | }
220 |
221 | /* 待机状态 */
222 | .audio-wave-container.idle .wave-bar {
223 | background-color: var(--idle-color);
224 | opacity: 0.4;
225 | }
226 |
227 | /* 监听状态 */
228 | .audio-wave-container.listening .wave-bar {
229 | background-color: var(--listening-color);
230 | opacity: 0.6;
231 | }
232 |
233 | /* 说话状态 */
234 | .audio-wave-container.speaking .wave-bar {
235 | background-color: var(--speaking-color);
236 | opacity: 0.6;
237 | }
238 |
239 | .footer-text {
240 | padding: 10px 20px;
241 | font-size: 14px;
242 | color: #666;
243 | line-height: 1.4;
244 | margin: 0;
245 | }
246 |
247 | .footer {
248 | padding: 8px 20px;
249 | border-top: 1px solid var(--border-color);
250 | font-size: 12px;
251 | color: #999;
252 | background-color: #f5f5f7;
253 | }
254 |
255 | .status-info {
256 | display: flex;
257 | justify-content: space-between;
258 | }
259 |
260 | /* 动画 */
261 | @keyframes pulse {
262 | 0% { opacity: 0.3; }
263 | 50% { opacity: 0.8; }
264 | 100% { opacity: 0.3; }
265 | }
--------------------------------------------------------------------------------
/web/static/js/app.js:
--------------------------------------------------------------------------------
// Front-end controller: wires UI buttons to the Python backend (pywebview)
// and animates the audio waveform bars according to conversation status.
document.addEventListener('DOMContentLoaded', () => {
    // Grab DOM elements once at startup.
    const startBtn = document.getElementById('start-btn');
    const pauseBtn = document.getElementById('pause-btn');
    const shareBtn = document.getElementById('share-btn');
    const idleView = document.getElementById('idle-view');
    const activeView = document.getElementById('active-view');
    const audioWaveContainer = document.querySelector('.audio-wave-container');
    const conversationStatus = document.getElementById('conversation-status');
    const connectionStatus = document.getElementById('connection-status');
    const waveBars = document.querySelectorAll('.wave-bar');

    // Initial state.
    let isConversationActive = false;
    let currentStatus = 'idle'; // idle, listening, speaking
    let animationFrameId = null;

    // Wave animation parameters per status.
    const waveConfig = {
        listening: {
            minHeight: 2,
            maxHeight: 16,
            smoothing: 0.2,
            updateInterval: 70
        },
        speaking: {
            minHeight: 1,
            maxHeight: 12,
            smoothing: 0.3,
            updateInterval: 60
        },
        idle: {
            minHeight: 1,
            maxHeight: 2,
            smoothing: 0.15,
            updateInterval: 200
        }
    };

    // Current and target bar heights (px), one entry per bar.
    let currentHeights = Array(waveBars.length).fill(2);
    let targetHeights = Array(waveBars.length).fill(2);

    // Start in the idle visual state.
    audioWaveContainer.classList.add('idle');

    // Regenerate target heights for the current status, then ease toward them.
    function updateWaveHeights() {
        let config;

        if (currentStatus === 'listening') {
            config = waveConfig.listening;
        } else if (currentStatus === 'speaking') {
            config = waveConfig.speaking;
        } else {
            config = waveConfig.idle;
        }

        // New target heights for this cycle.
        targetHeights = generateWavePattern(config.minHeight, config.maxHeight);

        // Smoothly interpolate each bar toward its target height.
        function animateToTargetHeights() {
            let needsUpdate = false;

            currentHeights = currentHeights.map((current, index) => {
                const target = targetHeights[index];

                if (Math.abs(current - target) < 0.5) {
                    return target;
                }

                needsUpdate = true;
                return current + (target - current) * config.smoothing;
            });

            // Push the new heights to the DOM.
            waveBars.forEach((bar, index) => {
                bar.style.height = `${currentHeights[index]}px`;
            });

            if (needsUpdate) {
                animationFrameId = requestAnimationFrame(animateToTargetHeights);
            } else {
                // All bars settled; schedule the next pattern refresh.
                setTimeout(updateWaveHeights, config.updateInterval);
            }
        }

        animateToTargetHeights();
    }

    // Generate a wave-shaped height pattern for a more natural look.
    function generateWavePattern(minHeight, maxHeight) {
        const numBars = waveBars.length;
        const wavePattern = [];

        // Pattern generation differs by status.
        if (currentStatus === 'idle') {
            // Idle: tiny random ripples only.
            for (let i = 0; i < numBars; i++) {
                wavePattern.push(minHeight + Math.random() * (maxHeight - minHeight) * 0.2);
            }
        } else {
            // Base the pattern on a sine wave.
            const cycles = currentStatus === 'listening' ? 2.5 : 2; // denser wave while listening
            const phase = Math.random() * Math.PI * 2; // random phase

            for (let i = 0; i < numBars; i++) {
                const x = (i / numBars) * Math.PI * 2 * cycles + phase;
                const sinValue = Math.sin(x);

                // Map the -1..1 sine value into the target height range.
                const normalized = (sinValue + 1) / 2; // 0..1
                let height = minHeight + normalized * (maxHeight - minHeight);

                // Add a little randomness while keeping the shape smooth.
                const randomFactor = Math.random() * 1.5 - 0.75;

                // Listening state wobbles more, to look livelier.
                const randomMultiplier = currentStatus === 'listening' ? 2 : 1.5;
                height = Math.max(minHeight, Math.min(maxHeight, height + randomFactor * randomMultiplier));

                wavePattern.push(height);
            }
        }

        return wavePattern;
    }

    // Cancel the running animation frame and reset the bars.
    function stopAnimation() {
        if (animationFrameId) {
            cancelAnimationFrame(animationFrameId);
            animationFrameId = null;
        }

        // Reset to the default resting height.
        waveBars.forEach(bar => {
            bar.style.height = '2px';
        });

        currentHeights = Array(waveBars.length).fill(2);
        targetHeights = Array(waveBars.length).fill(2);
    }

    // Switch the UI into the given status ('idle' | 'listening' | 'speaking').
    function updateUIStatus(status) {
        if (currentStatus === status) return;

        currentStatus = status;

        // Clear all status classes from the wave container...
        audioWaveContainer.classList.remove('idle', 'listening', 'speaking');

        // ...then apply the new one.
        audioWaveContainer.classList.add(status);

        // Stop any in-flight animation before restarting.
        stopAnimation();

        // Non-idle statuses show the active view with a running animation.
        if (status !== 'idle') {
            idleView.style.display = 'none';
            activeView.style.display = 'flex';
            updateWaveHeights(); // start the wave animation
        } else {
            idleView.style.display = 'flex';
            activeView.style.display = 'none';

            // NOTE(review): activeView.style.display was just set to 'none',
            // so this condition can never hold and the idle-wave restart
            // below is unreachable — presumably a leftover; confirm intent.
            if (activeView.style.display === 'flex') {
                updateWaveHeights();
            }
        }
    }

    // Begin a conversation via the Python backend.
    function startConversation() {
        isConversationActive = true;
        conversationStatus.textContent = '会话进行中';

        // Ask the Python backend to start the session.
        pywebview.api.start_conversation().then(result => {
            console.log('会话开始: ', result);
            updateUIStatus('listening');
        }).catch(error => {
            console.error('启动会话失败: ', error);
            endConversation();
        });
    }

    // End the conversation and return the UI to idle.
    function endConversation() {
        isConversationActive = false;
        conversationStatus.textContent = '会话未开始';
        updateUIStatus('idle');

        // Tell the Python backend to stop the session.
        pywebview.api.stop_conversation().catch(error => {
            console.error('结束会话出错: ', error);
        });
    }

    // Wire up the buttons.
    startBtn.addEventListener('click', startConversation);
    pauseBtn.addEventListener('click', endConversation);

    // Screen-share button (placeholder, not implemented).
    shareBtn.addEventListener('click', () => {
        alert('分享屏幕功能暂未实现');
    });

    // Status updates pushed from the Python backend.
    window.updateStatus = function(status) {
        updateUIStatus(status);
    };

    // Volume data pushed from the Python backend (array of 0-1 levels).
    window.updateVolumeData = function(volumeData) {
        if (!isConversationActive || currentStatus === 'idle') return;

        if (Array.isArray(volumeData) && volumeData.length > 0) {
            // Use the backend-provided volume array directly.
            const normalizedData = volumeData.map(vol => {
                // Map the volume value into the bar-height range.
                const config = currentStatus === 'listening'
                    ? waveConfig.listening
                    : waveConfig.speaking;
                return Math.min(config.maxHeight, Math.max(config.minHeight, vol * config.maxHeight));
            });

            // Pad by repetition if there are fewer points than bars.
            while (normalizedData.length < waveBars.length) {
                normalizedData.push(normalizedData[normalizedData.length % volumeData.length]);
            }

            // Adopt as the new target heights.
            targetHeights = normalizedData.slice(0, waveBars.length);
        }
    };

    // Initial backend connectivity check.
    pywebview.api.check_connection().then(result => {
        if (result.success) {
            connectionStatus.textContent = '已连接到后端';
        } else {
            connectionStatus.textContent = '未连接到后端';
            connectionStatus.style.color = 'red';
        }
    }).catch(() => {
        connectionStatus.textContent = '连接后端失败';
        connectionStatus.style.color = 'red';
    });

    // Kick off the low-amplitude idle animation.
    updateWaveHeights();
});
--------------------------------------------------------------------------------
/web/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Qwen-Omni 语音助手
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
112 |
113 |
114 |
122 |
123 |
124 |
127 |
128 |
129 |
135 |
136 |
137 |
138 |
139 |
--------------------------------------------------------------------------------
/webview_api.py:
--------------------------------------------------------------------------------
1 | import threading
2 | import time
3 | import json
4 | import random
5 | import numpy as np
6 | import math
7 | import sys
8 | import platform
9 | from Agent import Agent
10 | from mouth import Mouth
11 | from ears import Ears
12 |
13 | # 创建一个Window包装类,用于安全地访问window对象
class WindowWrapper:
    """Safe proxy around the pywebview window object.

    Absorbs JavaScript execution errors and shields equality checks from
    the .NET Rectangle comparison problem seen on Windows.
    """

    def __init__(self, window=None):
        self._window = window

    def set_window(self, window):
        """Attach (or replace) the underlying pywebview window."""
        self._window = window

    def evaluate_js(self, js_code):
        """Run *js_code* in the window; returns None on error or no window."""
        if not self._window:
            return None
        try:
            return self._window.evaluate_js(js_code)
        except Exception as exc:
            print(f"执行JavaScript失败: {exc}")
            return None

    def __eq__(self, other):
        """Identity-based equality that refuses Rectangle comparisons."""
        other_cls = getattr(other, '__class__', None)
        if other_cls is not None and 'Rectangle' in str(other_cls):
            return False
        return self is other

    def __hash__(self):
        # Identity hash, consistent with the identity-based __eq__.
        return hash(id(self))
41 |
class AgentAPI:
    """pywebview JS-API bridge between the web UI and the voice Agent.

    Public methods are invoked from JavaScript via ``pywebview.api`` and
    return plain dicts so they serialize cleanly across the bridge.
    """

    def __init__(self):
        # Application state.
        self.window = None          # kept for backward compatibility (unused here)
        self.is_running = False     # True while a conversation is active
        self.agent = None           # Agent instance, created per conversation
        self.debug_mode = False

        # Default Agent configuration.
        self.agent_config = {
            'recording_mode': 'dynamic',  # dynamic recording mode by default
            'recording_seconds': 5,       # recording length in fixed mode
        }

        # UI status tracking: one of "idle", "listening", "speaking".
        self.status = "idle"
        self.window_wrapper = WindowWrapper()  # safe window proxy
        self.volume_update_thread = None
        # Plain bool instead of threading.Event to avoid the Windows
        # Rectangle.op_Equality comparison problem.
        self._stop_volume_updates = False

    def __eq__(self, other):
        """Identity equality that refuses Rectangle/Window comparisons.

        Works around spurious comparisons performed by pywebview's .NET
        backend on Windows.
        """
        if hasattr(other, '__class__'):
            class_name = str(other.__class__)
            if 'Rectangle' in class_name:
                return False
            if 'Window' in class_name or 'webview.window' in class_name:
                return False
        return self is other

    def __hash__(self):
        # Identity hash, consistent with the identity-based __eq__.
        return hash(id(self))

    def set_window(self, window):
        """Attach the pywebview window used to push JS updates to the UI."""
        self.window_wrapper.set_window(window)

    def configure_agent(self, config):
        """Merge known keys from *config* into the Agent configuration.

        Unknown keys are silently ignored. Returns a status dict.
        """
        for key, value in config.items():
            if key in self.agent_config:
                self.agent_config[key] = value

        # NOTE: the current Agent class does not consume these values yet.
        return {"status": "success", "message": "Agent配置已更新"}

    def check_connection(self):
        """Front-end ping verifying the JS<->Python bridge works."""
        return {'success': True, 'message': '连接成功'}

    def start_conversation(self):
        """Start a voice conversation.

        Returns:
            dict with 'success' (bool) and 'message' (str).
        """
        if self.is_running:
            return {'success': False, 'message': '会话已经在运行中'}

        try:
            # Create a fresh Agent wired to push status changes to the UI.
            self.agent = Agent(
                gui_mode=True,
                debug=self.debug_mode,
                on_state_change=self.update_status
            )

            self.is_running = True
            self._stop_volume_updates = False

            success = self.agent.start()
            if not success:
                # Bug fix: roll back state on a failed start. Previously
                # is_running stayed True, so every later attempt was
                # rejected with "already running".
                self.is_running = False
                self.agent = None
                return {'success': False, 'message': '启动失败'}

            # Background daemon thread feeding simulated volume data.
            self.volume_update_thread = threading.Thread(target=self.simulate_volume_data)
            self.volume_update_thread.daemon = True
            self.volume_update_thread.start()

            return {'success': True, 'message': '会话已开始'}
        except Exception as e:
            # Same rollback on exceptions raised during construction/start.
            self.is_running = False
            self.agent = None
            return {'success': False, 'message': f'启动失败: {str(e)}'}

    def stop_conversation(self):
        """Stop the running conversation and return the UI to idle."""
        if not self.is_running:
            return {'success': False, 'message': '没有运行中的会话'}

        try:
            print("正在停止语音对话...")
            self.is_running = False
            self._stop_volume_updates = True  # signal the volume thread to exit

            if self.agent:
                self.agent.stop()
                self.agent = None

            # Give the volume thread a moment to exit cleanly.
            if self.volume_update_thread and self.volume_update_thread.is_alive():
                self.volume_update_thread.join(timeout=1.0)
                print("音量更新线程已终止")

            # Return the UI to idle.
            self.update_status("idle")
            print("语音对话已完全停止")

            return {'success': True, 'message': '会话已结束'}
        except Exception as e:
            print(f"停止语音对话时出错: {str(e)}")
            return {'success': False, 'message': f'停止失败: {str(e)}'}

    def update_status(self, status):
        """Record *status* and mirror it to the front-end UI."""
        self.status = status
        self.window_wrapper.evaluate_js(f'window.updateStatus("{status}")')

    def simulate_volume_data(self):
        """Feed simulated volume data to the front-end wave display.

        A real implementation could source levels from the AudioRecorder;
        this generates a sine-plus-noise pattern for the UI's 30 bars.
        Runs until is_running is cleared or _stop_volume_updates is set.
        """
        try:
            update_interval = 0.06  # push a frame every 60 ms
            phase_offset = 0
            time_counter = 0

            while self.is_running and not self._stop_volume_updates:
                if self.status == "idle":
                    time.sleep(0.1)
                    continue

                # 30 points to match the 30 wave bars in the front-end.
                num_points = 30
                volume_data = []

                # Parameters depend on the current status.
                if self.status == "speaking":
                    # Speaking: larger amplitude, more complex waveform.
                    main_frequency = 1.5
                    secondary_frequency = 3.0
                    amplitude = 0.35
                    noise_level = 0.15
                    base_level = 0.5
                else:  # listening
                    # Listening: smaller amplitude, simpler waveform.
                    main_frequency = 1.0
                    secondary_frequency = 2.0
                    amplitude = 0.25
                    noise_level = 0.2
                    base_level = 0.35

                # Build the wave shape (sine components + noise).
                for i in range(num_points):
                    x = i / num_points * 2 * math.pi
                    wave1 = math.sin(main_frequency * x + phase_offset)
                    wave2 = math.sin(secondary_frequency * x + phase_offset * 1.5) * 0.5

                    # Random jitter.
                    noise = (random.random() * 2 - 1) * noise_level

                    # Combine all components.
                    value = base_level + amplitude * (wave1 + wave2) + noise

                    # Clamp into a safe display range.
                    value = max(0.05, min(0.95, value))
                    volume_data.append(value)

                # Advance the phase for a rolling-wave effect.
                phase_offset += 0.2
                time_counter += update_interval

                # Push the frame to the front-end.
                if volume_data:
                    volume_json = json.dumps(volume_data)
                    self.window_wrapper.evaluate_js(f'window.updateVolumeData({volume_json})')

                time.sleep(update_interval)

        except Exception as e:
            print(f"音量模拟线程出错: {e}")

    def generate_wave_pattern(self, complexity=2, smoothness=0.5, length=100):
        """Generate a waveform pattern.

        Args:
            complexity: number of sine components (frequencies).
            smoothness: 0-1; higher values damp high-frequency amplitudes.
            length: number of samples in the pattern.

        Returns:
            List of floats normalized to the 0-1 range.
        """
        x = np.linspace(0, 2 * np.pi, length)
        wave = np.zeros(length)

        # Sum several sine components with random phases.
        for i in range(1, complexity + 1):
            frequency = i
            amplitude = 1.0 / (i ** smoothness)  # damp higher frequencies
            phase = random.random() * 2 * np.pi  # random phase per component
            wave += amplitude * np.sin(frequency * x + phase)

        # Normalize into 0-1; guard a flat wave (e.g. complexity=0), which
        # previously divided by zero and produced NaNs.
        span = wave.max() - wave.min()
        if span == 0:
            return np.full(length, 0.5).tolist()
        wave = (wave - wave.min()) / span
        return wave.tolist()
--------------------------------------------------------------------------------