├── .gitignore
├── .idea
└── .gitignore
├── Agent.py
├── LICENSE
├── README.md
├── app.py
├── assets
├── GUI-1.png
├── GUI-2.png
├── Qwen.icns
├── Qwen.ico
└── Qwen.png
├── build-scripts
└── windows
│ ├── README.md
│ ├── build.bat
│ ├── build.py
│ └── direct_spec.txt
├── config.py
├── core_pipeline.py
├── ears.py
├── file_version.txt
├── key.json.example
├── models
└── silero_vad.onnx
├── mouth.py
├── processors.py
├── requirements.txt
├── utils.py
├── web
├── static
│ ├── css
│ │ └── style.css
│ └── js
│ │ └── app.js
└── templates
│ └── index.html
└── webview_api.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | .Python
6 | .env
7 | .venv
8 | venv/
9 | env/
10 | ENV/
11 | env.bak/
12 | venv.bak/
13 | pip-log.txt
14 | pip-delete-this-directory.txt
15 |
16 | # Build
17 | build/
18 | dist/
19 | *.spec
20 | *.manifest
21 | *.pyc
22 | *.pyo
23 | *.pyd
24 | .Python
25 | *.so
26 |
27 | # IDE
28 | .idea/
29 | .vscode/
30 | *.swp
31 | *.swo
32 | .project
33 | .pydevproject
34 | .settings/
35 |
36 | # Project specific
37 | key.json
38 | *.log
39 | file_version.txt
40 | *.bak
41 |
42 | # Windows specific
43 | Thumbs.db
44 | ehthumbs.db
45 | Desktop.ini
46 | $RECYCLE.BIN/
47 |
48 | # macOS specific
49 | .DS_Store
50 | .AppleDouble
51 | .LSOverride
52 | ._*
53 |
54 | # Distribution / packaging
55 | .Python
56 | develop-eggs/
57 | downloads/
58 | eggs/
59 | .eggs/
60 | lib/
61 | lib64/
62 | parts/
63 | sdist/
64 | var/
65 | wheels/
66 | *.egg-info/
67 | .installed.cfg
68 | *.egg
69 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 | key.json
10 |
--------------------------------------------------------------------------------
/Agent.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import threading
4 | from openai import OpenAI
5 | import base64
6 | from queue import Queue
7 | from typing import Dict, List, Callable, Any
8 | from config import (
9 | API_KEY, BASE_URL, DEBUG
10 | )
11 | from mouth import Mouth
12 | from ears import Ears
13 | from enum import Enum, auto
14 | from core_pipeline import ConversationPipeline
15 | from processors import AIProcessor, EventProcessor
16 |
class SystemEvent(Enum):
    """System events used to drive event-based state transitions."""
    USER_SPEECH_STARTED = 1
    USER_SPEECH_ENDED = 2
    AI_RESPONSE_STARTED = 3
    AI_RESPONSE_ENDED = 4
    USER_INTERRUPT = 5
    SESSION_ENDED = 6
25 |
class ChatState(Enum):
    """Conversation states for the voice-chat session."""
    IDLE = 1            # nothing in progress
    USER_SPEAKING = 2   # user speech being captured
    AI_SPEAKING = 3     # AI response being played
    INTERRUPTED = 4     # AI was interrupted by the user
32 |
class Agent:
    def __init__(self, gui_mode=True, debug=False, on_state_change=None):
        """Initialize the voice-conversation agent.

        Args:
            gui_mode: Run in GUI mode when True (default True).
            debug: Enable verbose debug logging when True.
            on_state_change: Optional callback invoked with the new state
                name (e.g. "idle") so a GUI can update its display.

        Raises:
            ValueError: If the API key is not configured.
        """
        if not API_KEY:
            raise ValueError("API密钥未设置")

        # Configuration flags.
        self.gui_mode = gui_mode
        self.debug = debug

        # Callback notified on conversation-state changes (may be None).
        self.on_state_change = on_state_change

        # Streaming pipeline that wires the processors together.
        self.pipeline = ConversationPipeline()

        # Build and connect the processor chain.
        self._setup_processors()

        # Session control flags/events.
        self.is_running = False
        self.session_end_event = threading.Event()

    def _setup_processors(self):
        """Build the processor chain and wire it into the pipeline."""
        # Create the individual processors.
        audio_input = Ears()
        ai_processor = AIProcessor()
        audio_output = Mouth()
        event_processor = EventProcessor(on_state_change=self.on_state_change)

        # Register them with the pipeline (order matters for connection).
        self.pipeline.add_processor(audio_input)
        self.pipeline.add_processor(ai_processor)
        self.pipeline.add_processor(audio_output)
        self.pipeline.add_processor(event_processor)

        # Wire the processors together.
        self.pipeline.connect_processors()

        # Keep direct references for convenient access elsewhere.
        self.audio_input = audio_input
        self.ai_processor = ai_processor
        self.audio_output = audio_output
        self.event_processor = event_processor

    def print_conversation_history(self):
        """Print the accumulated conversation history to stdout."""
        messages = self.ai_processor.messages
        if not messages:
            print("对话历史为空")
            return

        print("\n===== 对话历史 =====")
        for i, msg in enumerate(messages):
            role = msg["role"]
            if role == "user":
                # User content is a list of parts; detect audio vs. text parts.
                has_audio = any(content.get("type") == "input_audio" for content in msg["content"])
                has_text = any(content.get("type") == "text" for content in msg["content"])
                print(f"{i+1}. 用户: ", end="")
                if has_text:
                    # Print only the first text part of the message.
                    for content in msg["content"]:
                        if content.get("type") == "text":
                            print(f"{content['text']}")
                            break
                elif has_audio:
                    print("[语音输入]")
                else:
                    print("[未知输入]")
            elif role == "assistant":
                print(f"{i+1}. AI: ", end="")
                # Assistant content is expected as [{"text": ...}, ...]
                # — assumed from this check; verify against AIProcessor.
                if isinstance(msg["content"], list) and msg["content"] and "text" in msg["content"][0]:
                    print(f"{msg['content'][0]['text']}")
                else:
                    print("[未知响应]")
        print("===================\n")

    def show_system_info(self):
        """Print the available microphone devices to stdout."""
        print("\n===== 系统信息 =====")
        mics = self.audio_input.get_available_microphones()
        print("\n可用麦克风:")
        for i, mic in enumerate(mics):
            print(f"{i+1}. 设备ID: {mic['index']} - {mic['name']} (通道数: {mic['channels']})")
        print("\n===================")

    def start(self):
        """Start the voice-conversation system.

        Returns:
            bool: True if the pipeline started, False on error.
        """
        print("正在启动与Qwen-Omni的语音对话...")

        if not self.gui_mode:
            self.show_system_info()

        # Reset the conversation history for a fresh session.
        self.ai_processor.messages = []
        self.ai_processor.full_transcript = ""

        # Reset session state.
        self.is_running = True
        self.session_end_event.clear()

        try:
            # Launch the processing pipeline.
            self.pipeline.start()

            print("语音对话系统已启动,等待用户输入...")
            return True

        except Exception as e:
            print(f"启动语音对话时出错: {e}")
            self.is_running = False
            return False

    def stop(self):
        """Stop the voice-conversation system.

        The shutdown order matters: mark not-running first so other
        threads observe it, then silence output, stop input, and finally
        tear down the pipeline.

        Returns:
            bool: True if stopped cleanly, False if not running or on error.
        """
        if not self.is_running:
            return False

        try:
            print("正在停止语音对话...")
            # Mark as not running immediately so worker threads see it.
            self.is_running = False
            # Signal the end of the session to any waiters.
            self.session_end_event.set()

            # Stop any audio that is currently playing.
            if self.audio_output.is_playing:
                print("立即停止所有音频播放...")
                self.audio_output.stop_immediately()

            # Stop the microphone stream and its listener threads.
            print("停止麦克风流和所有监听线程...")
            self.audio_input.stop_mic_stream()

            # Give the microphone stream a moment to shut down fully.
            time.sleep(0.2)

            # Stop the processing pipeline.
            self.pipeline.stop()

            # Notify the state-change callback, if any.
            if self.on_state_change:
                self.on_state_change("idle")

            print("语音对话已完全停止")
            return True

        except Exception as e:
            print(f"停止语音对话时出错: {e}")
            return False

    def close(self):
        """Release all resources (stops the session first)."""
        self.stop()
        self.audio_input.close()
        self.audio_output.close()
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 曲艺 (Qu Yi)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Qwen2.5-Omni Real-time Voice Communication
2 |
3 | 基于通义千问 Qwen2.5-Omni 在线API的实时语音对话系统,支持实时语音交互、动态语音活动检测和流式音频处理。
4 |
5 | A real-time voice conversation system based on Qwen2.5-Omni Online API, supporting real-time voice interaction, dynamic voice activity detection, and streaming audio processing.
6 |
7 | > **注意**:这是一个初步的演示版本,主要实现了基础的语音对话功能。
8 | >
9 | > 计划逐步添加更多 Qwen2.5-Omni 支持的多模态交互功能。最终构建一个`全模态`的交互程序。
10 | >
11 | > **本项目开发过程中使用了大量AI**
12 |
13 | ## 1 使用方法
14 |
15 | ### GUI模式
16 |
17 | 1. 启动GUI界面:
18 | ```bash
19 | python app.py
20 | ```
21 |
22 | 2. 在打开的窗口中:
23 | - 点击"开始对话"按钮启动语音对话
24 | - 用户可以连续发言和打断发言
25 | - 再次点击按钮结束对话
26 |
27 |
28 |
29 |
30 |
31 |
32 | ## 2 开发计划
33 |
34 | **接下来要开发的内容: 完成Agent的Eyes 视觉能力,让它能够看到桌面,并和用户交流**
35 |
36 | 以下是计划添加的主要功能:
37 |
38 | - [ ] Agent架构
39 | - [x] **Brain** 与LLM交互
40 |
41 | - [x] **Ears** 听觉能力
42 | - [x] 交互式音频对话
43 | - [x] 打断式音频通话
  - [ ] 语音转文字,兼容Qwen-Omni对同一个输入Message的模态限制,以更好支持多模态
45 |
46 | - [x] **Mouth**语音能力
47 | - [x] 交互式音频对话
48 | - [x] 打断式音频通话
49 |
50 | - [ ] **Eyes** 视觉能力
51 | - [ ] 通过点击“分享屏幕按钮”观察用户电脑桌面,并给出反馈
52 | - [ ] 通过语音交互,自动观察屏幕内容
53 | - [ ] 视频通话
54 |
55 | - [x] **Skin** GUI界面
56 | - [x] 音频交互动态UI
57 | - [x] 可视化对话状态
58 |
59 | - [ ] **Hands** 工作能力
60 | - [ ] 简单函数调用
61 | - [ ] 引用Qwen-VL来增强鼠标控制能力,可以做一些简单操作
  - [ ] MCP (Model Context Protocol) 功能
63 |
64 |
65 | ## 3 功能特点
66 |
67 | - 实时语音交互:支持用户与AI助手进行实时语音对话
68 | - 智能语音检测:使用 Silero VAD (ONNX版本) 进行高精度的语音活动检测,无需PyTorch依赖
69 | - 动态录音控制:根据用户说话情况自动开始和结束录音
70 | - 流式音频处理:支持音频数据的流式处理和播放
71 | - 平滑打断机制:允许用户在AI回答过程中自然打断
72 | - 音频淡出效果:在对话结束或打断时提供平滑的音频过渡
73 | - 现代化GUI界面:动态视觉反馈
74 |
75 | ## 4 环境要求
76 |
77 | - Python 3.10(开发环境)
78 | - PyAudio 及其依赖的音频库
79 | - onnxruntime - 用于语音活动检测
80 | - pywebview (用于GUI界面)
81 | - 麦克风和音频输出设备
82 | - 推荐:[uv](https://github.com/astral-sh/uv) - 快速、现代的Python包管理器
83 |
84 | ## 5 安装说明
85 |
86 | ### 5.1 方法一:直接下载可执行文件
87 |
88 | 访问[Releases页面](https://github.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat/releases)下载最新的Windows可执行文件。
89 |
90 | 下载后解压,在`key.json`中填入你的通义千问API密钥 **[API key获取方式](https://help.aliyun.com/zh/model-studio/getting-started/first-api-call-to-qwen?spm=a2c4g.11186623.help-menu-2400256.d_0_1_0.5a06b0a8iZbkAV)**:
91 |
92 | 双击"QwenOmniVoiceAssistant.exe"即可运行。
93 |
94 | ### 5.2 方法二:从源码构建
95 |
96 | #### 安装步骤
97 |
98 | 1. **创建Python环境**:
99 |
100 | ```bash
101 | # 安装Python 3.10(如已安装请跳过)
102 | # https://www.python.org/downloads/release/python-31011/
103 |
104 | # 克隆项目代码
105 | git clone https://github.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat.git
106 | cd Qwen2.5-Omni-multimodal-chat
107 |
108 | # 创建虚拟环境并激活
109 | python -m venv .venv
110 | # Windows
111 | .venv\Scripts\activate
112 | # Linux/macOS
113 | # source .venv/bin/activate
114 | ```
115 |
116 | 2. **安装依赖**:
117 |
118 | ```bash
119 | # 安装项目依赖
120 | pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
121 | ```
122 |
123 | 3. **配置API密钥**:
124 | 复制`key.json.example`为`key.json`,填入你的通义千问API密钥 **[API key获取方式](https://help.aliyun.com/zh/model-studio/getting-started/first-api-call-to-qwen?spm=a2c4g.11186623.help-menu-2400256.d_0_1_0.5a06b0a8iZbkAV)**:
125 | ```json
126 | {
127 | "api_key": "your-api-key-here",
128 | "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1"
129 | }
130 | ```
131 |
132 | 4. **运行应用**:
133 |
134 | ```bash
135 | # 启动图形界面版本
136 | python app.py
137 |
138 | # 或启动命令行版本
139 | python app.py --console
140 | ```
141 |
142 | 5. **打包应用**:
143 |
144 | 项目根目录命令行输入:
145 |
146 | ```
147 | .\build-scripts\windows\build.bat
148 | ```
149 |
**或双击启动打包脚本`build.bat`,打包文件存放在`dist`文件夹下**
151 |
152 | ### 5.3 常见问题
153 |
154 | - **麦克风未检测到**:请检查系统麦克风权限设置,确保应用有权限访问麦克风
155 | - **运行时缺少依赖**:确保已正确安装所有依赖,如遇问题可尝试`pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/`
156 | - **API密钥无效**:确保已在key.json中填入正确的通义千问API密钥
157 |
158 | ### 5.4 高级用户说明
159 |
160 | 如需使用更高级的包管理工具如uv,可以参考以下步骤:
161 |
162 | ```bash
# 安装uv包管理器
164 | pip install uv
165 |
166 | # 使用uv创建环境
167 | uv venv --python=3.10
168 |
169 | # 使用uv安装依赖
170 | uv pip install -r requirements.txt
171 | ```
172 |
173 | ## 6 配置说明
174 |
175 | 可以在 `config.py` 中调整以下参数:
176 |
177 | - `DEBUG`:调试模式开关,启用时会保存录音文件
178 | - `AUDIO_FORMAT`:音频格式(默认pyaudio.paInt16)
179 | - `CHANNELS`:音频通道数(默认1)
180 | - `RATE`:音频采样率(默认16000Hz,兼容Silero VAD)
181 | - `CHUNK`:音频数据块大小(默认512,相当于32毫秒帧大小)
182 | - `RECORD_SECONDS`:默认录音秒数
183 | - `MIN_SPEECH_DURATION`:最短语音持续时间(秒)
184 | - `SPEECH_VOLUME_THRESHOLD`:语音音量阈值
185 | - `NORMAL_VOLUME_THRESHOLD`:正常音量阈值
186 | - `MIN_POSITIVE_FRAMES`:语音检测的最小正帧数
187 | - `MIN_NEGATIVE_FRAMES`:静音检测的最小负帧数
188 | - `PLAYER_RATE`:音频播放器采样率(默认24000Hz,匹配模型输出)
189 | - `FADE_OUT_DURATION`:音频淡出持续时间(秒)
190 | - `MAX_FINISH_DURATION`:打断时最大允许的完成时间(秒)
191 |
192 | ## 7 项目结构
193 |
194 | ```
195 | Qwen2.5-Omni-multimodal-chat/
196 | ├── app.py # 主程序入口
197 | ├── Agent.py # 核心代理类(语音对话管理)
198 | ├── ears.py # 音频输入处理(麦克风和VAD)
199 | ├── mouth.py # 音频输出处理(语音合成和播放)
200 | ├── webview_api.py # WebView API接口
201 | ├── utils.py # 通用工具函数
202 | ├── config.py # 配置文件
203 | ├── key.json.example # API密钥配置示例
204 | ├── pyproject.toml # Python项目配置
205 | ├── requirements.txt # 主要依赖列表
206 | ├── uv.lock # UV包管理器锁文件
207 | ├── LICENSE # MIT许可证
208 | ├── README.md # 项目说明文档
209 | │
210 | ├── assets/ # 资源文件
211 | │
212 | ├── models/ # 模型文件
213 | │ └── silero_vad.onnx # 语音活动检测模型
214 | │
215 | ├── recordings/ # 录音文件目录(运行时生成)
216 | │
217 | ├── build-scripts/ # 构建脚本
218 | │ └── windows/ # Windows平台构建
219 | │ ├── build.py # 构建Python脚本
220 | │ ├── build.bat # 构建批处理文件
221 | │ ├── direct_spec.txt # PyInstaller规范文件
222 | │ └── README.md # 构建说明
223 | │
224 | ├── web/ # GUI前端文件
225 | │ ├── templates/ # HTML模板
226 | │ │ └── index.html # 主界面HTML
227 | │ └── static/ # 静态资源
228 | │ ├── css/ # 样式文件
229 | │ │ └── style.css # 主样式表
230 | │ └── js/ # JavaScript文件
231 | │ └── app.js # 前端逻辑
232 | │
233 | ├── build/ # 构建中间文件(自动生成)
234 | └── dist/ # 分发包(自动生成)
235 | ```
236 |
237 | ## 8 注意事项
238 |
239 | 1. 确保系统有可用的麦克风设备
240 | 2. 保持网络连接稳定以确保与API的通信
241 | 3. 调整麦克风音量以获得最佳的语音识别效果
242 | 4. 在嘈杂环境中可能需要调整音量阈值参数
243 | 5. 使用uv管理依赖可以显著提升安装速度
244 | 6. 建议在虚拟环境中进行开发和构建
245 |
246 | ## 9 许可证
247 |
248 | 本项目采用 MIT 许可证,这意味着您可以自由地使用、修改、分发本软件,无论是用于个人还是商业目的。详情请参见项目根目录下的 [LICENSE](./LICENSE) 文件。
249 |
250 | ## 10 贡献指南
251 |
252 | 欢迎提交Issue和Pull Request来帮助改进项目。在提交代码前,请确保:
253 |
254 | 1. 代码符合Python代码规范
255 | 2. 添加必要的注释和文档
256 | 3. 更新相关的文档说明
257 | 4. 测试代码功能正常
258 |
259 | ## 11 联系方式
260 |
261 | 如有问题或建议,请通过以下方式联系:
262 |
263 | - 提交 Issue
264 | - 发送邮件至:[quyimail@foxmail.com]
265 |
266 | ## 致谢
267 |
- [Qwen2.5-Omni](https://github.com/QwenLM/Qwen2.5-Omni) - 通义千问全模态模型 [相关文档](https://help.aliyun.com/zh/model-studio/user-guide/qwen-omni?spm=a2c4g.11186623.0.0.5aefb0a8nJc2z7#db6d0ff7c371y)
269 | - [Silero VAD](https://github.com/snakers4/silero-vad) - 语音活动检测模型
270 | - [pywebview](https://pywebview.flowrl.com/) - Python GUI框架
271 | - [Cursor](https://www.cursor.com/cn) - AI代码编辑器
272 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import webview
4 | import threading
5 | import argparse
6 | import platform
7 | from webview_api import AgentAPI
8 | from utils import apply_windows_compatibility_patches
9 | from Agent import Agent
10 | from mouth import Mouth
11 | from ears import Ears
12 |
def run_server(headless=False):
    """Start the pywebview server.

    Args:
        headless: When True, run without a GUI window; the conversation is
            started immediately and the main thread idles until Ctrl-C.
    """
    import time  # stdlib; hoisted out of the headless wait loop

    # Apply compatibility patches on Windows before creating any window.
    if platform.system().lower() == 'windows':
        apply_windows_compatibility_patches()

    current_dir = os.path.dirname(os.path.abspath(__file__))

    # API instance exposed to the page's JavaScript.
    api = AgentAPI()

    # Headless mode: skip GUI initialisation entirely.
    if headless:
        # Minimal stand-in for a webview window; JS evaluation is a no-op.
        class DummyWindow:
            def evaluate_js(self, js_code):
                pass

        api.set_window(DummyWindow())
        # Start the conversation loop without a GUI.
        api.start_conversation()
        try:
            # Keep the main thread alive until interrupted.
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            api.stop_conversation()
        return

    # Window configuration.
    window_settings = {
        'width': 400,
        'height': 550,
        'resizable': True,
        'min_size': (400, 550),
        'background_color': '#FFFFFF',
        'title': 'Qwen-Omni 语音助手',
        'text_select': False,
    }

    # Pick the most suitable renderer for the current platform.
    system_platform = platform.system().lower()

    if system_platform == 'windows':
        # Edge WebView2 (lightweight). NOTE(review): the original wrapped
        # this plain assignment in try/except with an MSHTML fallback, but
        # an assignment cannot raise, so the fallback was dead code;
        # pywebview itself degrades gracefully if WebView2 is unavailable.
        gui_options = 'edgechromium'
        print("[INFO] 使用Edge WebView2作为GUI后端(轻量级)")
    else:
        # macOS / Linux: use the system default renderer.
        gui_options = None
        print("[INFO] 使用系统默认GUI后端")

    # Create the window and load the HTML UI.
    window = webview.create_window(
        title=window_settings['title'],
        url='file://' + os.path.join(current_dir, 'web/templates/index.html'),
        js_api=api,
        width=window_settings['width'],
        height=window_settings['height'],
        resizable=window_settings['resizable'],
        min_size=window_settings['min_size'],
        background_color=window_settings['background_color'],
        text_select=window_settings['text_select'],
    )

    # Hand the window reference to the API layer.
    api.set_window(window)

    # Default voice-chat parameters (same defaults as CLI mode).
    api.configure_agent({
        'recording_mode': 'dynamic',  # VAD-driven dynamic recording
        'recording_seconds': 5,       # used only in fixed-duration mode
    })

    # Start the GUI loop with platform-specific renderer options.
    webview.start(debug=False, http_server=True, gui=gui_options)
100 |
def run_console():
    """Run the command-line (no GUI) version.

    Fix: Agent exposes ``start()``/``stop()``/``close()``, not
    ``start_conversation()`` — the previous call raised AttributeError.
    ``start()`` returns right after launching the pipeline, so this
    function blocks in an idle loop until Ctrl-C.
    """
    import time  # stdlib; only needed for the wait loop

    # gui_mode=False so Agent prints system info instead of driving a UI.
    voice_chat = Agent(gui_mode=False)
    try:
        if voice_chat.start():
            # Keep the main thread alive while the pipeline runs.
            while True:
                time.sleep(1)
    except KeyboardInterrupt:
        print("\n命令行版本已终止")
    finally:
        voice_chat.close()
110 |
if __name__ == "__main__":
    # Command-line entry point: choose console, headless or GUI mode.
    arg_parser = argparse.ArgumentParser(description="Qwen-Omni 语音助手")
    arg_parser.add_argument('--console', action='store_true', help='在命令行模式下运行')
    arg_parser.add_argument('--headless', action='store_true', help='无GUI模式运行')
    cli_args = arg_parser.parse_args()

    if cli_args.console:
        run_console()
    else:
        run_server(headless=cli_args.headless)
--------------------------------------------------------------------------------
/assets/GUI-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat/6092ea16b399a956ffeff4df2295c1bef364ae7c/assets/GUI-1.png
--------------------------------------------------------------------------------
/assets/GUI-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat/6092ea16b399a956ffeff4df2295c1bef364ae7c/assets/GUI-2.png
--------------------------------------------------------------------------------
/assets/Qwen.icns:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat/6092ea16b399a956ffeff4df2295c1bef364ae7c/assets/Qwen.icns
--------------------------------------------------------------------------------
/assets/Qwen.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat/6092ea16b399a956ffeff4df2295c1bef364ae7c/assets/Qwen.ico
--------------------------------------------------------------------------------
/assets/Qwen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat/6092ea16b399a956ffeff4df2295c1bef364ae7c/assets/Qwen.png
--------------------------------------------------------------------------------
/build-scripts/windows/README.md:
--------------------------------------------------------------------------------
1 | # Qwen2.5-Omni 语音助手 Windows 打包工具
2 |
3 | 这个目录包含用于将 Qwen2.5-Omni 语音助手打包为 Windows 可执行文件的工具脚本。
4 |
5 | ## 打包方法
6 |
7 | 1. 打开命令行,输入`.\build.bat`
8 | 2. 等待打包完成
9 |
10 | ## 打包结果
11 |
12 | 打包完成后都会在项目根目录下生成 `dist` 文件夹,其中包含:
13 |
14 | - `QwenOmniVoiceAssistant` 文件夹:包含可执行文件和所有依赖
15 |
16 | - `QwenOmniVoiceAssistant.exe`为文件入口
17 |
18 | ## 注意事项
19 |
20 | 1. 打包过程需要网络连接,因为可能需要下载依赖库
21 | 2. 请确保在打包前已安装Python 3.10+
22 | 3. 初次打包可能需要较长时间,因为要下载和安装依赖
23 |
24 | ## 运行要求
25 |
26 | 打包后的程序在Windows 7/8/10/11系统上应该都能正常运行,无需额外安装Python环境。
27 |
28 | ## 故障排除
29 |
30 | 如果遇到问题:
31 |
1. 检查控制台错误信息或日志
2. 确保拥有管理员权限
3. 如果打包失败,尝试关闭防病毒软件后重试
4. 如果运行打包程序遇到"无法找到入口点"等错误,可能是微软Visual C++ Redistributable缺失,请安装最新版本
--------------------------------------------------------------------------------
/build-scripts/windows/build.bat:
--------------------------------------------------------------------------------
@echo off
:: Switch the console to UTF-8 so the Chinese status messages render correctly
chcp 65001 > nul
echo ==== Qwen-Omni 语音助手 Windows 打包工具 ====
echo.

:: Resolve the project root relative to this script's own location
set "SCRIPT_DIR=%~dp0"
set "PROJECT_ROOT=%SCRIPT_DIR%..\..\"
cd /d "%PROJECT_ROOT%"

:: Verify a Python interpreter is available on PATH
where python >nul 2>nul
if %ERRORLEVEL% neq 0 (
    echo 错误: 未找到Python,请确保已安装Python并添加到PATH环境变量
    pause
    exit /b 1
)

:: Prefer the uv package manager when it is available
where uv >nul 2>nul
if %ERRORLEVEL% equ 0 (
    echo 发现uv包管理器,将使用uv进行依赖安装
    set USE_UV=1
) else (
    echo 未找到uv包管理器,将使用pip进行依赖安装
    set USE_UV=0
)

:: Make sure pip is usable when uv is not present
if %USE_UV% equ 0 (
    python -m ensurepip --upgrade >nul 2>nul
    python -m pip --version >nul 2>nul
    if %ERRORLEVEL% neq 0 (
        echo 警告: pip不可用,将尝试使用内置的ensurepip模块安装
        python -m ensurepip --default-pip
        if %ERRORLEVEL% neq 0 (
            echo 错误: 无法安装pip
            pause
            exit /b 1
        )
    )
)

:: Install PyInstaller if it is not already importable
echo 检查PyInstaller是否已安装...
python -c "import PyInstaller" >nul 2>nul
if %ERRORLEVEL% neq 0 (
    echo 正在安装PyInstaller...
    if %USE_UV% equ 1 (
        uv pip install pyinstaller
    ) else (
        python -m pip install pyinstaller
    )

    if %ERRORLEVEL% neq 0 (
        echo 错误: PyInstaller安装失败
        pause
        exit /b 1
    )
)

:: Force UTF-8 for the Python build script's console output
set PYTHONIOENCODING=utf-8

:: Run the actual packaging script
echo 正在启动打包过程...
python "%SCRIPT_DIR%build.py"

:: Report the final status and wait for user acknowledgement
if %ERRORLEVEL% neq 0 (
    echo.
    echo 打包过程遇到错误,请查看上方错误信息
) else (
    echo.
    echo 打包完成! 请查看dist目录
)

pause
--------------------------------------------------------------------------------
/build-scripts/windows/build.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Windows packaging script for the Qwen-Omni voice assistant.

Uses PyInstaller to bundle the application into a Windows executable.
"""

import os
import sys
import shutil
import subprocess
import platform
import tempfile

# Run from the project root so all relative paths (web/, assets/, dist/)
# resolve the same way regardless of where the script was invoked from.
script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(script_dir, '../..'))
os.chdir(project_root)

# This build script only supports Windows; bail out early elsewhere.
if platform.system().lower() != 'windows':
    print("错误: 此打包脚本仅适用于Windows平台")
    sys.exit(1)
25 |
def clean_dist():
    """Remove stale build artifacts (the build/ and dist/ directories)."""
    print("正在清理旧的构建文件...")
    for stale_dir in ('build', 'dist'):
        if not os.path.exists(stale_dir):
            continue
        try:
            shutil.rmtree(stale_dir)
            print(f"  已删除 {stale_dir}/")
        except Exception as e:
            print(f"  警告: 无法删除 {stale_dir}/: {e}")
37 |
def check_dependencies():
    """Report whether PyInstaller is importable.

    Returns:
        bool: True if PyInstaller is installed, False if it still needs
        to be installed.
    """
    print("正在检查系统依赖...")
    try:
        import PyInstaller
    except ImportError:
        print("  未找到 PyInstaller,将尝试安装")
        return False
    print(f"  已安装 PyInstaller {PyInstaller.__version__}")
    return True
51 |
def install_requirements():
    """Install PyInstaller and the project's dependencies.

    Tries the uv package manager first and falls back to standard pip
    (bootstrapping pip via ensurepip if necessary).

    Returns:
        bool: True if either installer succeeded, False otherwise.
    """
    print("正在安装PyInstaller和所需依赖...")

    # Preferred path: uv, if present on PATH (FileNotFoundError when absent).
    try:
        subprocess.run(['uv', 'pip', 'install', 'pyinstaller'], check=True)
        subprocess.run(['uv', 'pip', 'install', '-r', 'requirements.txt'], check=True)
        return True
    except (subprocess.SubprocessError, FileNotFoundError) as e:
        print(f"  使用uv安装失败: {e}")

    # Fallback: standard pip. The first two steps are best-effort
    # (check=False) because ensurepip/upgrade may legitimately no-op.
    try:
        subprocess.run([sys.executable, '-m', 'ensurepip', '--upgrade'], check=False)
        subprocess.run([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'], check=False)
        subprocess.run([sys.executable, '-m', 'pip', 'install', 'pyinstaller'], check=True)
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt'], check=True)
        return True
    except subprocess.SubprocessError as e:
        print(f"  使用pip安装失败: {e}")
        return False
74 |
def create_spec_file():
    """Write the PyInstaller spec file (qwen_omni.spec) into the project root.

    Prefers the direct_spec.txt template next to this script; falls back
    to a built-in template when the file is missing or unreadable.

    Returns:
        bool: True if the spec file was written, False otherwise.
    """
    print("正在创建spec文件...")

    # Preferred source: the direct_spec.txt template shipped with the script.
    direct_template_path = os.path.join(script_dir, 'direct_spec.txt')
    if os.path.exists(direct_template_path):
        try:
            with open(direct_template_path, 'r', encoding='utf-8') as f:
                spec_content = f.read()
            print("  已从模板文件加载spec内容")

            # Guard against characters a GBK console cannot encode
            # (relevant when PyInstaller echoes the spec on a GBK terminal).
            try:
                spec_content.encode('gbk', errors='strict')
            except UnicodeEncodeError:
                print("  警告: 模板文件包含GBK编码不支持的字符,将进行替换")
                spec_content = spec_content.encode('gbk', errors='replace').decode('gbk')

            with open('qwen_omni.spec', 'w', encoding='utf-8') as f:
                f.write(spec_content)
            print("  已创建 qwen_omni.spec")
            return True
        except Exception as e:
            print(f"  模板加载失败: {e}")

    # Fallback: built-in spec template (kept verbatim; it is written to disk).
    print("  使用内置模板")
    spec_content = """# -*- mode: python ; coding: utf-8 -*-

import os
import sys

block_cipher = None

datas = [
    ('web/templates', 'web/templates'),
    ('web/static', 'web/static'),
    ('assets/Qwen.ico', 'assets'),
]

if os.path.exists('key.json'):
    datas.append(('key.json', '.'))

hiddenimports = [
    'pyaudio', 'numpy', 'webview', 'threading', 'json',
    'platform', 'webview.platforms.winforms',
]

a = Analysis(
    ['app.py'],
    pathex=[os.path.abspath('.')],
    binaries=[],
    datas=datas,
    hiddenimports=hiddenimports,
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    cipher=block_cipher,
    noarchive=False,
)

pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)

exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name='QwenOmniVoiceAssistant',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=False,
    icon='assets/Qwen.ico',
    version='file_version.txt',
)

coll = COLLECT(
    exe,
    a.binaries,
    a.zipfiles,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name='QwenOmniVoiceAssistant',
)
"""

    try:
        with open('qwen_omni.spec', 'w', encoding='utf-8') as f:
            f.write(spec_content)
        print("  已创建 qwen_omni.spec")
        return True
    except Exception as e:
        print(f"  创建spec文件失败: {e}")
        return False
177 |
def find_pyinstaller():
    """Locate a way to invoke PyInstaller.

    Returns:
        Either a filesystem path to the executable, a
        ``"<python> -m PyInstaller"`` command string, or an argv list
        (``[python, '-m', 'PyInstaller']``) as a last resort.
    """
    candidates = [
        # Scripts directory of the current Python environment.
        os.path.join(os.path.dirname(sys.executable), 'Scripts', 'pyinstaller.exe'),
        os.path.join(os.path.dirname(sys.executable), 'Scripts', 'pyinstaller'),
        # PyInstaller on PATH, if any.
        shutil.which('pyinstaller'),
        # Module invocation via the current interpreter.
        sys.executable + ' -m PyInstaller',
    ]

    for candidate in candidates:
        if not candidate:
            continue
        if os.path.exists(candidate) or ' -m ' in candidate:
            return candidate

    # Last resort: invoke as a module via argv list.
    return [sys.executable, '-m', 'PyInstaller']
196 |
def build_executable():
    """Run PyInstaller on qwen_omni.spec to build the Windows executable.

    Streams PyInstaller's output to the console and to a temporary log
    file; the log is kept on failure so it can be inspected afterwards.

    Returns:
        bool: True if the build succeeded and the dist output exists.
    """
    print("正在构建Windows可执行文件...")

    # Locate PyInstaller (path, "python -m ..." string, or argv list).
    pyinstaller_path = find_pyinstaller()

    # Build the command for whichever form find_pyinstaller returned.
    if isinstance(pyinstaller_path, list):
        cmd = pyinstaller_path + ['qwen_omni.spec', '--clean']
    elif ' -m ' in pyinstaller_path:
        cmd_parts = pyinstaller_path.split(' -m ')
        cmd = [cmd_parts[0], '-m', cmd_parts[1], 'qwen_omni.spec', '--clean']
    else:
        cmd = [pyinstaller_path, 'qwen_omni.spec', '--clean']

    print(f"  执行命令: {' '.join(cmd)}")

    try:
        # Force UTF-8 in the child process environment.
        env = os.environ.copy()
        env['PYTHONIOENCODING'] = 'utf-8'
        env['PYTHONUTF8'] = '1'  # force Python's UTF-8 mode

        # On Windows, switch the current console to the UTF-8 code page.
        if platform.system().lower() == 'windows':
            os.system('chcp 65001 > nul')

        # Capture output into a temp log while echoing it live.
        temp_log_path = None
        result = 1  # assume failure until the process reports otherwise

        try:
            with tempfile.NamedTemporaryFile(delete=False, mode='w+', encoding='utf-8', suffix='.log') as tmp:
                temp_log_path = tmp.name
                process = subprocess.Popen(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    universal_newlines=True,
                    bufsize=1,
                    env=env,
                    encoding='utf-8',
                    errors='replace'
                )

                # Echo PyInstaller's output line by line.
                for line in process.stdout:
                    try:
                        line = line.strip()
                        if line:
                            print(f"  {line}")
                            tmp.write(line + '\n')
                    except UnicodeEncodeError:
                        # Console can't render it; log a placeholder instead.
                        tmp.write("(non-displayable characters)" + '\n')

                # Wait for PyInstaller to finish.
                result = process.wait()
        except Exception as e:
            print(f"  PyInstaller runtime error: {e}")
            if temp_log_path and os.path.exists(temp_log_path):
                print(f"  Log saved to: {temp_log_path}")
            return False
        finally:
            # NOTE(review): returning from a finally block swallows any
            # in-flight exception; kept as-is to preserve behavior.
            # Keep the log only when the build failed.
            if result != 0:
                if temp_log_path and os.path.exists(temp_log_path):
                    print(f"  Build failed (code {result}), log saved to: {temp_log_path}")
                return False
            else:
                # Best-effort cleanup of the temp log; a failed delete
                # must not fail the build.
                if temp_log_path and os.path.exists(temp_log_path):
                    try:
                        os.unlink(temp_log_path)
                    except Exception as e:
                        print(f"  Note: Cannot delete temp file: {e}")
                print("  Build completed!")

        # Verify the expected dist output actually exists.
        if os.path.exists(os.path.join('dist', 'QwenOmniVoiceAssistant')):
            return True
        else:
            print("  Warning: Build output not found")
            return False

    except Exception as e:
        print(f"  Build process error: {str(e).encode('ascii', errors='replace').decode('ascii')}")
        return False
287 |
def sanitize_key_json():
    """Prepare key.json for distribution, never shipping a real API key.

    Writes ``dist/QwenOmniVoiceAssistant/key.json``. If the project root
    has no key.json, an example config is created; otherwise the real
    ``api_key`` value is replaced with a placeholder.

    Fix: the placeholder was misspelled "yout api key"; corrected to
    "your api key" in both places it is written.

    Returns:
        bool: True on success, False if reading/writing failed.
    """
    import json  # stdlib; hoisted from mid-function for clarity

    print("正在处理API密钥信息...")

    # Ensure dist/ and the packaged-app directory exist.
    if not os.path.exists('dist'):
        os.makedirs('dist')

    target_dir = os.path.join('dist', 'QwenOmniVoiceAssistant')
    if not os.path.exists(target_dir):
        os.makedirs(target_dir, exist_ok=True)

    if not os.path.exists('key.json'):
        print("  未找到key.json文件,将创建示例配置")

        # Placeholder config shipped with the package.
        example_config = '''{
    "api_key": "your api key",
    "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1"
}'''

        try:
            # Write the example config straight into the target folder.
            with open(os.path.join(target_dir, 'key.json'), 'w', encoding='utf-8') as f:
                f.write(example_config)
            print("  已创建示例配置文件 key.json")
            return True
        except Exception as e:
            print(f"  创建示例配置失败: {e}")
            return False

    try:
        # Read the developer's real key.json.
        with open('key.json', 'r', encoding='utf-8') as f:
            key_data = json.load(f)

        original_api_key = key_data.get('api_key', '')
        if original_api_key:
            # Replace the real key with a placeholder before packaging.
            key_data['api_key'] = "your api key"

            with open(os.path.join(target_dir, 'key.json'), 'w', encoding='utf-8') as f:
                json.dump(key_data, f, ensure_ascii=False, indent=4)

            print("  已处理API密钥信息:替换为示例值")
            return True
        else:
            print("  API密钥为空,将使用原始文件")
            # Empty key: copy the original config into the target folder.
            with open(os.path.join(target_dir, 'key.json'), 'w', encoding='utf-8') as f:
                json.dump(key_data, f, ensure_ascii=False, indent=4)
            print("  已复制原始key.json文件(空API密钥)")
            return True
    except Exception as e:
        print(f"  处理API密钥失败: {e}")
        return False
354 |
def copy_additional_files():
    """Copy auxiliary runtime files (README, key.json, version info) into the package.

    Returns True when every copy succeeded, False when any copy failed.
    """
    print("正在复制其他必要文件...")
    all_ok = True

    # The PyInstaller output folder must already exist.
    target_dir = os.path.join('dist', 'QwenOmniVoiceAssistant')
    if not os.path.exists(target_dir):
        print(f" 错误: 目标目录不存在: {target_dir}")
        return False

    # Ship the README alongside the executable when available.
    if os.path.exists('README.md'):
        try:
            shutil.copy2('README.md', target_dir)
            print(" 已复制 README.md")
        except Exception as e:
            print(f" 警告: 复制README失败: {e}")
            all_ok = False

    # key.json is sanitized straight into the target directory.
    sanitize_key_json()

    # Version metadata goes into the application root.
    if os.path.exists('file_version.txt'):
        try:
            shutil.copy2('file_version.txt', target_dir)
            print(" 已复制 file_version.txt 到应用根目录")
        except Exception as e:
            print(f" 警告: 复制版本信息文件失败: {e}")
            all_ok = False

    return all_ok
388 |
def create_shortcut():
    """Create the launcher batch file inside the packaged app folder.

    BUG FIX: the original wrote the batch file into the *versioned* folder
    name (dist/QwenOmniVoiceAssistant_v..._win..._...) before the rename step
    in main() had run; main() deletes any pre-existing versioned folder right
    before renaming the build output, which destroyed the batch file.  The
    batch file is now written into the actual build output folder
    (dist/QwenOmniVoiceAssistant) so it survives the later rename; the
    versioned name is only used as a fallback when the rename already happened.

    Returns True on success, False otherwise.
    """
    print("创建启动批处理文件...")

    try:
        # Batch file that launches the executable from either folder layout.
        cn_batch = '''@echo off
echo 创建"Qwen-Omni语音助手"快捷方式...
cd /d "%~dp0"
if not exist "QwenOmniVoiceAssistant.exe" cd QwenOmniVoiceAssistant
start QwenOmniVoiceAssistant.exe
exit
'''

        # Prefer the un-renamed PyInstaller output folder.
        target = os.path.join('dist', 'QwenOmniVoiceAssistant')
        if not os.path.exists(target):
            # Fallback: the output has already been renamed with version/platform info.
            version = extract_version()

            import platform
            arch = platform.machine().lower()
            if arch == 'amd64' or arch == 'x86_64':
                arch = 'x64'
            elif arch == 'x86':
                arch = 'x86'
            elif 'arm' in arch or 'aarch' in arch:
                arch = 'arm64'
            else:
                arch = platform.architecture()[0]

            win_ver = platform.win32_ver()[0]
            target = os.path.join('dist', f'QwenOmniVoiceAssistant_v{version}_win{win_ver}_{arch}')
            os.makedirs(target, exist_ok=True)

        with open(os.path.join(target, '启动语音助手.bat'), 'w', encoding='utf-8') as f:
            f.write(cn_batch)
        print(" 已创建启动批处理文件")

        return True
    except Exception as e:
        print(f" 创建启动批处理文件失败: {e}")
        return False
436 |
def rename_dist_folder():
    """Write the Chinese usage note into the output folder.

    Despite its name, this function does not rename anything: the app folder
    keeps an English name (Windows encoding limitations); this only drops a
    使用说明.txt readme into whichever output folder exists.
    Returns True on success, False on error.
    """
    try:
        version = extract_version()

        # Normalise the machine architecture label.
        import platform
        machine = platform.machine().lower()
        if machine in ('amd64', 'x86_64'):
            arch = 'x64'
        elif machine == 'x86':
            arch = 'x86'
        elif 'arm' in machine or 'aarch' in machine:
            arch = 'arm64'
        else:
            arch = platform.architecture()[0]  # fallback

        # Windows release number.
        win_ver = platform.win32_ver()[0]

        # Versioned output folder name (with platform info).
        target_dir = f'QwenOmniVoiceAssistant_v{version}_win{win_ver}_{arch}'

        print("创建中文名称的启动文件...")

        if os.path.exists('dist/QwenOmniVoiceAssistant'):
            readme_content = f'''# Qwen-Omni 语音助手 v{version}

这是Qwen-Omni语音助手的Windows版本。
系统要求: Windows {win_ver} {arch}

请双击"启动语音助手.bat"文件来运行应用程序。

注意:由于Windows系统编码限制,应用程序文件夹使用英文名称,但功能与界面仍然是中文的。
'''

            # The rename may or may not have happened yet; pick the folder that exists.
            if os.path.exists(f'dist/{target_dir}'):
                readme_path = f'dist/{target_dir}/使用说明.txt'
            else:
                readme_path = 'dist/QwenOmniVoiceAssistant/使用说明.txt'

            with open(readme_path, 'w', encoding='utf-8') as f:
                f.write(readme_content)

        return True
    except Exception as e:
        print(f" 创建中文访问方式失败: {e}")
        return False
489 |
def create_version_file():
    """Write the PyInstaller version-resource file; skip when it already exists.

    Returns True when the file exists afterwards, False on a write error.
    """
    version_file = 'file_version.txt'

    # An existing file is kept untouched (it may carry a customised version).
    if os.path.exists(version_file):
        print(f" {version_file} 已存在,跳过创建")
        return True

    print("创建版本信息文件...")

    # Default VSVersionInfo resource for version 0.0.1.
    version_content = """VSVersionInfo(
  ffi=FixedFileInfo(
    filevers=(0, 0, 1, 0),
    prodvers=(0, 0, 1, 0),
    mask=0x3f,
    flags=0x0,
    OS=0x40004,
    fileType=0x1,
    subtype=0x0,
    date=(0, 0)
    ),
  kids=[
    StringFileInfo(
      [
      StringTable(
        u'040904B0',
        [StringStruct(u'CompanyName', u''),
        StringStruct(u'FileDescription', u'Qwen-Omni Voice Assistant'),
        StringStruct(u'FileVersion', u'0.0.1'),
        StringStruct(u'InternalName', u'QwenOmniVoiceAssistant'),
        StringStruct(u'LegalCopyright', u''),
        StringStruct(u'OriginalFilename', u'QwenOmniVoiceAssistant.exe'),
        StringStruct(u'ProductName', u'Qwen-Omni Voice Assistant'),
        StringStruct(u'ProductVersion', u'Windows 0.0.1')])
      ]),
    VarFileInfo([VarStruct(u'Translation', [1033, 1200])])
  ]
)"""

    try:
        with open(version_file, 'w', encoding='utf-8') as f:
            f.write(version_content)
        print(" 已创建版本信息文件")
        return True
    except Exception as e:
        print(f" 创建版本信息文件失败: {e}")
        return False
537 |
def extract_version():
    """Read the product version from file_version.txt.

    Parses the ``FileVersion`` entry of the PyInstaller version resource.
    Returns the version string, or "0.0.1" when the file is missing,
    unparsable, or unreadable.

    BUG FIX: the original implicitly returned None when file_version.txt did
    not exist (the fallback return was inside the ``if`` body), which produced
    folder names like ``QwenOmniVoiceAssistant_vNone_...``.  The default is
    now returned on every failure path.
    """
    import re
    try:
        if os.path.exists('file_version.txt'):
            with open('file_version.txt', 'r', encoding='utf-8') as f:
                content = f.read()
            # Look for the FileVersion field in the version resource.
            version_match = re.search(r"FileVersion', u'([0-9\.]+)'", content)
            if version_match:
                return version_match.group(1)
        # Missing file or no FileVersion field: fall back to the default.
        return "0.0.1"
    except Exception as e:
        print(f" 提取版本号失败: {e}")
        return "0.0.1"
554 |
def main():
    """Main entry point: run the full Windows packaging workflow.

    Steps: fix console encoding, clean dist/, create version info, check and
    install dependencies, generate the spec, build with PyInstaller, copy
    auxiliary files, create the launcher, and rename the output folder with
    version/platform info.

    Returns:
        int: 0 on success, 1 on failure (used as the process exit code).
    """
    print("==== Qwen-Omni Voice Assistant Windows Build Tool ====")
    success = True

    try:
        # Switch stdout/stderr to UTF-8 mode
        if sys.stdout.encoding.lower() != 'utf-8':
            # The Windows console defaults to cp936/gbk; force UTF-8
            try:
                import io
                sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
                sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
                os.environ['PYTHONIOENCODING'] = 'utf-8'
                print("Console encoding set to UTF-8")
            except Exception as e:
                print(f"Warning: Cannot set UTF-8 encoding: {e}")

        clean_dist()

        # Create the version-info file
        create_version_file()

        # Extract the version number, used for folder naming
        version = extract_version()
        print(f" 当前版本号: {version}")

        # Check dependencies and install them when missing
        if not check_dependencies():
            if not install_requirements():
                print("Error: Cannot install required dependencies")
                return 1

        # Create the spec file
        if not create_spec_file():
            print("Error: Cannot create spec file")
            return 1

        # Build the executable
        build_success = build_executable()
        if not build_success:
            # Even if PyInstaller reported failure, check whether output exists in dist/
            if os.path.exists(os.path.join('dist', 'QwenOmniVoiceAssistant')):
                print("Note: Despite errors, build output found. Continuing.")
                build_success = True
            else:
                print("Error: Build failed, no output found")
                return 1

        # Copy extra runtime files
        copy_additional_files()

        # Create the launcher batch file
        create_shortcut()

        # Create the Chinese-language usage note
        rename_dist_folder()

        # Rename the output folder, adding version and platform info
        if os.path.exists(os.path.join('dist', 'QwenOmniVoiceAssistant')):
            # Determine the machine architecture
            import platform
            arch = platform.machine().lower()
            if arch == 'amd64' or arch == 'x86_64':
                arch = 'x64'
            elif arch == 'x86':
                arch = 'x86'
            elif 'arm' in arch or 'aarch' in arch:
                arch = 'arm64'
            else:
                arch = platform.architecture()[0]

            # Windows release number
            win_ver = platform.win32_ver()[0]

            # Product version
            version = extract_version()
            versioned_folder = os.path.join('dist', f'QwenOmniVoiceAssistant_v{version}_win{win_ver}_{arch}')

            # NOTE(review): a pre-existing versioned folder is deleted here,
            # including any files already written into it (e.g. by
            # create_shortcut) -- confirm this is intended.
            if os.path.exists(versioned_folder):
                shutil.rmtree(versioned_folder)
            os.rename(os.path.join('dist', 'QwenOmniVoiceAssistant'), versioned_folder)
            print(f" 已将输出文件夹重命名为: QwenOmniVoiceAssistant_v{version}_win{win_ver}_{arch}")

        # Final sanity check of the build result
        version = extract_version()
        import platform
        arch = platform.machine().lower()
        if arch == 'amd64' or arch == 'x86_64':
            arch = 'x64'
        elif arch == 'x86':
            arch = 'x86'
        elif 'arm' in arch or 'aarch' in arch:
            arch = 'arm64'
        else:
            arch = platform.architecture()[0]

        win_ver = platform.win32_ver()[0]
        target_folder = f'QwenOmniVoiceAssistant_v{version}_win{win_ver}_{arch}'

        if os.path.exists(os.path.join('dist', target_folder, 'QwenOmniVoiceAssistant.exe')):
            print(f"\nBuild successful! Executable at: dist/{target_folder}/QwenOmniVoiceAssistant.exe")
            print(f"You can directly run 'dist/{target_folder}/启动语音助手.bat'")

            # Create the API-key instructions file
            try:
                api_key_note_filename = '请先在key.json中填写api key [获取教程].txt'
                api_key_note_path = os.path.join('dist', target_folder, api_key_note_filename)
                api_key_note_content = """前往这里获取api key:
https://help.aliyun.com/zh/model-studio/getting-started/first-api-call-to-qwen?spm=a2c4g.11186623.help-menu-2400256.d_0_1_0.5a06b0a8iZbkAV"""
                with open(api_key_note_path, 'w', encoding='utf-8') as f:
                    f.write(api_key_note_content)
                print(f" 已创建API Key说明文件: dist/{target_folder}/{api_key_note_filename}")
            except Exception as e:
                print(f" 警告: 创建API Key说明文件失败: {e}")

            success = True
        else:
            print("\nWarning: Final executable not found, build may not be complete")
            success = False

    except UnicodeEncodeError as e:
        # Dedicated handling for console-encoding failures
        print("Error: Encoding issue caused build failure")
        print("Try the following:")
        print("1. Run 'chcp 65001' in command prompt")
        print("2. Then run this script again")
        return 1
    except Exception as e:
        # Make sure the exception message itself can be displayed
        try:
            error_msg = str(e)
            print(f"Error during build process: {error_msg}")
        except UnicodeEncodeError:
            # Fall back to ASCII with replacement characters
            error_msg = str(e).encode('ascii', errors='replace').decode('ascii')
            print(f"Error during build process: {error_msg}")
        return 1

    return 0 if success else 1
695 |
if __name__ == "__main__":
    # Exit with the status code returned by main() (0 = success, 1 = failure).
    sys.exit(main())
--------------------------------------------------------------------------------
/build-scripts/windows/direct_spec.txt:
--------------------------------------------------------------------------------
# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec for the Qwen-Omni voice assistant (Windows build).

import os
import sys

block_cipher = None

# Data files to bundle with the application
datas = []

# Bundle the web directory and its contents
datas.extend([
    ('web/templates', 'web/templates'),
    ('web/static', 'web/static'),
])

# Bundle the icon files from assets/
datas.extend([
    ('assets/Qwen.ico', 'assets'),
    ('assets/Qwen.png', 'assets'),
])

# Bundle the key.json configuration file (when present)
if os.path.exists('key.json'):
    datas.append(('key.json', '.'))

# Bundle the ONNX VAD model (when present)
if os.path.exists('models/silero_vad.onnx'):
    datas.append(('models/silero_vad.onnx', 'models'))

# Basic hidden imports
hiddenimports = [
    'pyaudio',
    'numpy',
    'onnxruntime',
    'webview',
    'threading',
    'json',
    'platform',
    'random',
    'wave',
    'io',
    'base64',
    'math',
    'time',
    'soundfile',
    'pyglet',
    'webview.platforms.winforms',
    'webview.window',
]

# Explicitly exclude cefpython3-related modules (and other heavy, unused packages)
excludes = [
    'cefpython3',
    'torch',
    'torchaudio',
    'transformers',
    'safetensors',
]

a = Analysis(
    ['app.py'],
    pathex=[os.path.abspath('.')],
    binaries=[],
    datas=datas,
    hiddenimports=hiddenimports,
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=excludes,
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    cipher=block_cipher,
    noarchive=False,
)

pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)

exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name='QwenOmniVoiceAssistant',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=False,
    icon='assets/Qwen.ico',
    version='file_version.txt',
)

coll = COLLECT(
    exe,
    a.binaries,
    a.zipfiles,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name='QwenOmniVoiceAssistant',
)
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
import pyaudio
import json
import os

# Debug settings
DEBUG = False  # set True to enable debug mode, including saving recorded audio files

# Audio settings
AUDIO_FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000  # sample rate supported by Silero VAD
CHUNK = 512  # 32 ms frame size (16000 * 0.032 = 512), compatible with Silero VAD

# API settings: credentials are loaded from key.json at import time.
# On any failure both values fall back to empty strings.
try:
    with open('key.json', 'r', encoding='utf-8') as f:
        api_config = json.load(f)
    API_KEY = api_config['api_key']
    BASE_URL = api_config['base_url']
except (FileNotFoundError, json.JSONDecodeError, KeyError) as e:
    print(f"Error loading API configuration from key.json: {e}")
    API_KEY = ''
    BASE_URL = ''

# VAD settings
MIN_SPEECH_DURATION = 0.1

# Audio player settings
PLAYER_RATE = 24000  # player sample rate matches the model's output
FADE_OUT_DURATION = 0.15  # standard fade-out duration (seconds)
MAX_FINISH_DURATION = 0.25  # max time allowed to finish playback when interrupted (seconds)
--------------------------------------------------------------------------------
/core_pipeline.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import asyncio
4 | import threading
5 | import uuid
6 | import collections
7 | import numpy as np
8 | import queue
9 | import wave
10 | import io
11 | import base64
12 | from enum import Enum, auto
13 | from typing import Dict, List, Callable, Any, Optional, Union
14 | from config import (
15 | API_KEY, BASE_URL,
16 | CHANNELS, AUDIO_FORMAT, RATE, CHUNK,
17 | PLAYER_RATE, FADE_OUT_DURATION, MAX_FINISH_DURATION
18 | )
19 |
class FrameType(Enum):
    """Kinds of frames travelling through the pipeline."""
    DATA = auto()      # ordinary data frame, processed in queue order
    CONTROL = auto()   # control frame (handled with priority)
    SYSTEM = auto()    # system frame (handled immediately, bypassing queues)
25 |
class Frame:
    """A single unit of data travelling through the pipeline.

    Carries a FrameType, a payload dict, optional metadata, a creation
    timestamp and a unique id.
    """
    def __init__(self, type: FrameType, data=None, metadata=None):
        self.type = type
        self.data = data if data else {}
        self.metadata = metadata if metadata else {}
        self.id = str(uuid.uuid4())
        self.timestamp = time.time()

    def __str__(self):
        keys = ', '.join(self.data.keys())
        return f"Frame[{self.type.name}]: {keys}"
37 |
class CancellationToken:
    """Cooperative cancellation flag shared between pipeline components."""
    def __init__(self):
        self._cancelled = threading.Event()
        self._callbacks = []

    def cancel(self):
        """Set the flag and fire every registered callback (only on the first call)."""
        if self._cancelled.is_set():
            return
        self._cancelled.set()
        for cb in self._callbacks:
            try:
                cb()
            except Exception as e:
                print(f"Error in cancellation callback: {e}")

    def is_cancelled(self):
        """Whether cancel() has been called since the last reset()."""
        return self._cancelled.is_set()

    def register_callback(self, callback):
        """Register *callback* to run on cancellation; returns a deregistration function."""
        if callback not in self._callbacks:
            self._callbacks.append(callback)
        return lambda: self._callbacks.remove(callback) if callback in self._callbacks else None

    def reset(self):
        """Clear the flag and drop all registered callbacks."""
        self._cancelled.clear()
        self._callbacks = []
68 |
class ProcessorContext:
    """Shared per-chain state: session id, scratch dict and cancellation token."""
    def __init__(self):
        self.session_id = str(uuid.uuid4())
        self.state = {}
        self.cancellation_token = CancellationToken()

    def is_cancelled(self):
        """Delegate to the cancellation token."""
        return self.cancellation_token.is_cancelled()

    def new_session(self):
        """Begin a fresh session: new id, empty state; the token is left as-is."""
        self.state = {}
        self.session_id = str(uuid.uuid4())
        return self.session_id
85 |
class ThreadSafeQueue:
    """Thin wrapper around queue.Queue adding a bulk clear() helper."""
    def __init__(self, maxsize=0):
        self.queue = queue.Queue(maxsize)
        self.mutex = threading.RLock()

    def put(self, item, block=True, timeout=None):
        """Enqueue an item (semantics of queue.Queue.put)."""
        return self.queue.put(item, block, timeout)

    def get(self, block=True, timeout=None):
        """Dequeue an item (semantics of queue.Queue.get)."""
        return self.queue.get(block, timeout)

    def empty(self):
        """True when no items are currently queued."""
        return self.queue.empty()

    def clear(self):
        """Drain all pending items, marking each one as done."""
        with self.mutex:
            while True:
                try:
                    self.queue.get_nowait()
                except queue.Empty:
                    break
                self.queue.task_done()

    def task_done(self):
        """Mark one previously-gotten item as processed."""
        self.queue.task_done()

    def qsize(self):
        """Approximate number of queued items."""
        return self.queue.qsize()
121 |
class ProcessorBase:
    """Base class for pipeline processors.

    Each processor owns an input queue and a daemon worker thread; processors
    are linked into a doubly-connected chain.  SYSTEM frames bypass the
    queues and are handled synchronously by the neighbour's process_frame().
    """
    def __init__(self, name):
        self.name = name
        self.next_processor = None   # downstream neighbour in the chain
        self.prev_processor = None   # upstream neighbour in the chain
        self.context = None          # shared ProcessorContext, set via set_context()
        self.input_queue = ThreadSafeQueue()
        self.is_running = False
        self.thread = None           # worker thread running _process_loop
        self.lock = threading.RLock()

    def set_context(self, context):
        """Attach the shared processor context."""
        self.context = context

    def set_next(self, processor):
        """Link *processor* as the downstream neighbour; returns it for chaining."""
        self.next_processor = processor
        processor.prev_processor = self
        return processor

    def send_downstream(self, frame):
        """Send a frame to the downstream processor (no-op at the chain tail)."""
        if self.next_processor:
            # SYSTEM frames are handled immediately instead of being queued
            if frame.type == FrameType.SYSTEM:
                self.next_processor.process_frame(frame)
            else:
                self.next_processor.enqueue_frame(frame)

    def send_upstream(self, frame):
        """Send a frame to the upstream processor (used for control and feedback)."""
        if self.prev_processor:
            # SYSTEM frames always take priority
            if frame.type == FrameType.SYSTEM:
                self.prev_processor.process_frame(frame)
            else:
                self.prev_processor.enqueue_frame(frame)

    def enqueue_frame(self, frame):
        """Queue a frame for asynchronous handling by the worker thread."""
        self.input_queue.put(frame)

    def process_frame(self, frame):
        """Handle a single frame; subclasses must implement this."""
        raise NotImplementedError("Subclasses must implement process_frame")

    def start(self):
        """Start the worker thread (idempotent)."""
        with self.lock:
            if self.is_running:
                return

            self.is_running = True
            self.thread = threading.Thread(target=self._process_loop)
            self.thread.daemon = True
            self.thread.start()

    def stop(self):
        """Stop the worker thread and discard queued frames (idempotent)."""
        with self.lock:
            if not self.is_running:
                return

            self.is_running = False
            self.input_queue.clear()

            if self.thread and self.thread.is_alive():
                self.thread.join(timeout=1.0)

    def _process_loop(self):
        """Worker loop: drain the input queue until stopped or cancelled."""
        try:
            while self.is_running and (self.context is None or not self.context.is_cancelled()):
                try:
                    # Use a timeout so the loop can notice is_running/cancellation changes
                    frame = self.input_queue.get(timeout=0.1)
                except queue.Empty:
                    continue

                try:
                    # Handle the frame; errors are isolated per frame
                    self.process_frame(frame)
                except Exception as e:
                    print(f"处理器 {self.name} 处理帧时出错: {e}")
                finally:
                    self.input_queue.task_done()

        except Exception as e:
            print(f"处理器 {self.name} 的处理循环出错: {e}")
        finally:
            print(f"处理器 {self.name} 的处理循环已停止")
215 |
class SystemEventEmitter:
    """Minimal publish/subscribe hub for system-level events."""
    def __init__(self, context):
        self.context = context
        self.listeners = {}

    def on(self, event_type, callback):
        """Subscribe *callback* to *event_type*; returns an unsubscribe function."""
        self.listeners.setdefault(event_type, []).append(callback)

        def cancel():
            registered = self.listeners.get(event_type, [])
            if callback in registered:
                registered.remove(callback)
        return cancel

    def emit(self, event_type, data=None):
        """Invoke each listener for *event_type*, isolating listener errors."""
        for callback in self.listeners.get(event_type, []):
            try:
                callback(data)
            except Exception as e:
                print(f"事件处理回调出错: {e}")
242 |
class ConversationPipeline:
    """Wires processors into a chain and manages their lifecycle as one unit."""
    def __init__(self):
        # Shared context/cancellation state for every processor in the chain.
        self.context = ProcessorContext()

        # Event bus for system-level notifications.
        self.events = SystemEventEmitter(self.context)

        # Registered processors, in chain order.
        self.processors = []

        # Lifecycle tracking.
        self.is_running = False
        self.lock = threading.RLock()

    def add_processor(self, processor):
        """Register a processor and hand it the shared context; returns it."""
        processor.set_context(self.context)
        self.processors.append(processor)
        return processor

    def connect_processors(self):
        """Link all registered processors into a chain, in registration order."""
        for upstream, downstream in zip(self.processors, self.processors[1:]):
            upstream.set_next(downstream)

    def start(self):
        """Start every processor; returns False when already running."""
        with self.lock:
            if self.is_running:
                return False

            self.is_running = True
            self.context.cancellation_token.reset()

            # Bring every processor's worker thread up.
            for proc in self.processors:
                proc.start()

            print(f"处理管道已启动,{len(self.processors)}个处理器在运行")

            # Kick off the head processor (typically the audio input).
            if self.processors:
                head = self.processors[0]
                head.process_frame(Frame(
                    FrameType.SYSTEM,
                    {"command": "start"}
                ))
                print("启动命令已发送到第一个处理器")

            return True

    def stop(self):
        """Stop processors in reverse order; returns False when not running."""
        with self.lock:
            if not self.is_running:
                return False

            # Signal cancellation first so worker loops can exit.
            self.context.cancellation_token.cancel()

            for proc in reversed(self.processors):
                proc.stop()

            self.is_running = False
            print("处理管道已停止")
            return True

    def reset(self):
        """Stop the pipeline and begin a fresh session."""
        self.stop()
        self.context.new_session()
        print("处理管道已重置")
317 |
318 | # -------------------------------------------------------------------
319 | # 音频处理相关的工具函数
320 | # -------------------------------------------------------------------
321 |
def int16_to_float32(audio_int16):
    """Convert int16 PCM samples to float32 in the [-1.0, 1.0) range."""
    # Multiplying by the exact power-of-two reciprocal is equivalent to /32768.0.
    return audio_int16.astype(np.float32) * (1.0 / 32768.0)
325 |
def float32_to_int16(audio_float32):
    """Convert float32 audio in the [-1.0, 1.0] range to int16 PCM.

    BUG FIX: values are now clipped to the representable int16 range before
    the cast.  Without clipping, a full-scale +1.0 sample scales to 32768,
    which wraps around to -32768 when cast to int16 -- an audible glitch on
    loud audio.
    """
    scaled = audio_float32 * 32768.0
    return np.clip(scaled, -32768.0, 32767.0).astype(np.int16)
329 |
def frames_to_wav_base64(frames, channels, sample_width, rate):
    """Join raw PCM frames into a WAV file and return it base64-encoded.

    Args:
        frames: iterable of raw PCM byte chunks.
        channels: number of audio channels.
        sample_width: bytes per sample.
        rate: sample rate in Hz.

    Returns:
        str: the complete WAV file, base64-encoded as ASCII text.
    """
    pcm_bytes = b''.join(frames)
    buffer = io.BytesIO()

    with wave.open(buffer, 'wb') as writer:
        writer.setnchannels(channels)
        writer.setsampwidth(sample_width)
        writer.setframerate(rate)
        writer.writeframes(pcm_bytes)

    return base64.b64encode(buffer.getvalue()).decode('utf-8')
--------------------------------------------------------------------------------
/ears.py:
--------------------------------------------------------------------------------
1 | import pyaudio
2 | import threading
3 | import time
4 | import os
5 | import numpy as np
6 | import onnxruntime
7 | import collections
8 | from config import (
9 | AUDIO_FORMAT, CHANNELS, RATE, CHUNK,
10 | MIN_SPEECH_DURATION
11 | )
12 | from core_pipeline import (
13 | ProcessorBase, Frame, FrameType, int16_to_float32, frames_to_wav_base64
14 | )
15 |
# VAD model parameters
VAD_THRESHOLD = 0.6 # speech-probability threshold for classifying a frame as speech
END_BUFFER_FRAMES = 10 # extra buffer frames after speech appears to have ended
MIN_NEG_FRAMES_FOR_ENDING = 8 # consecutive silent frames required to declare end of speech
MAX_SPEECH_DURATION = 180.0 # hard cap on a single utterance (seconds)
PRE_BUFFER_FRAMES = int(1.0 * RATE / CHUNK) # number of pre-roll frames (~1 s of audio)
SPEECH_CONFIRM_FRAMES = 2 # consecutive speech frames required to confirm speech start
PRE_DETECTION_BUFFER_SIZE = int(2.0 * RATE / CHUNK) # ring-buffer capacity (~2 s of audio)
24 |
25 | class Ears(ProcessorBase):
26 | """音频输入处理器 - 集成了语音检测和处理功能,直接将处理后的语音发送到AI处理器"""
    def __init__(self, name="audio_input"):
        """Set up PyAudio, load the Silero VAD model and reset detection state.

        Args:
            name: processor name passed to ProcessorBase (default "audio_input").
        """
        super().__init__(name)
        self.p = pyaudio.PyAudio()
        self.stream = None
        self.vad_model = self._load_vad_model()

        # VAD recurrent state: shape (2, 1, 128), as fed to the ONNX model below.
        self.state = np.zeros((2, 1, 128), dtype=np.float32)
        self.sr = RATE

        # Audio-file saving switch
        self.save_audio_file = True  # set True to save captured audio to files

        # Ring buffer of recent raw chunks (pre-detection audio)
        self.buffer = collections.deque(maxlen=PRE_DETECTION_BUFFER_SIZE)

        # Speech-detection state
        self.speech_detected = False
        self.consecutive_speech_frames = 0
        self.consecutive_silence_frames = 0
        self.is_collecting_speech = False
        self.speech_frames = []
        self.speech_start_time = None

        # Synchronisation primitives for stream access and speech events
        self.stream_lock = threading.RLock()
        self.speech_detected_event = threading.Event()
        self.speech_ended_event = threading.Event()

        print("[Ears] 初始化完成")
57 |
    def _load_vad_model(self):
        """Load the Silero VAD ONNX model from models/ next to this file."""
        model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models/silero_vad.onnx")
        print(f"加载Silero VAD ONNX模型: {model_path}")
        return onnxruntime.InferenceSession(model_path)
63 |
    def reset_vad_state(self):
        """Reset VAD state -- intentionally a no-op; state no longer needs resetting here."""
        pass
67 |
    def start_mic_stream(self):
        """Open the microphone input stream and reset all detection state.

        Returns True on success, False on failure.
        NOTE(review): returns None (not True) when a stream is already open --
        confirm callers do not rely on a strict boolean result.
        """
        with self.stream_lock:
            if self.stream is not None:
                return

            try:
                self.stream = self.p.open(
                    format=AUDIO_FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK,
                    stream_callback=self._audio_callback
                )
                print("[Ears] 麦克风流已启动")

                # Reset detection state
                self.buffer.clear()
                self.speech_frames = []
                self.speech_detected = False
                self.consecutive_speech_frames = 0
                self.consecutive_silence_frames = 0
                self.is_collecting_speech = False
                self.speech_start_time = None
                self.speech_detected_event.clear()
                self.speech_ended_event.clear()

                # Reset the VAD recurrent state
                self.state = np.zeros((2, 1, 128), dtype=np.float32)

                return True
            except Exception as e:
                print(f"[Ears] 启动麦克风流失败: {e}")
                return False
103 |
    def _audio_callback(self, in_data, frame_count, time_info, status):
        """PyAudio stream callback: enqueue each raw chunk as a DATA frame.

        Always tells PyAudio to continue capturing; chunks are dropped when
        the processor is not running.
        """
        if self.is_running:
            self.enqueue_frame(Frame(
                FrameType.DATA,
                {"audio_data": in_data, "frame_count": frame_count}
            ))
        return (None, pyaudio.paContinue)
112 |
    def process_frame(self, frame):
        """Handle one pipeline frame: a system command or a raw audio chunk.

        SYSTEM frames carry start/stop commands for the mic stream.  DATA
        frames carry raw PCM chunks that are run through VAD to detect the
        start and end of an utterance.
        """
        if frame.type == FrameType.SYSTEM:
            cmd = frame.data.get("command")
            if cmd == "stop":
                self.stop_mic_stream()
            elif cmd == "start":
                # Start command: open the microphone stream
                print("[Ears] 收到启动命令,开始启动麦克风流")
                self.start_mic_stream()
            return

        if frame.type == FrameType.DATA and "audio_data" in frame.data:
            audio_data = frame.data["audio_data"]

            # Keep a rolling pre-detection buffer of recent chunks
            self.buffer.append(audio_data)

            # Convert raw bytes to a normalised float32 array
            audio_int16 = np.frombuffer(audio_data, dtype=np.int16)
            audio_float32 = int16_to_float32(audio_int16)

            # Run voice-activity detection on this chunk
            is_speech = self._detect_speech(audio_float32)

            if is_speech:
                self.consecutive_speech_frames += 1
                self.consecutive_silence_frames = 0
            else:
                self.consecutive_silence_frames += 1
                self.consecutive_speech_frames = 0

            # Speech-start detection: require several consecutive speech frames
            if not self.speech_detected and self.consecutive_speech_frames >= SPEECH_CONFIRM_FRAMES:
                self.speech_detected = True
                self.is_collecting_speech = True
                self.speech_start_time = time.time()
                self.speech_frames = list(self.buffer)  # copy the pre-detection buffer

                # Signal speech start to any waiting threads
                self.speech_detected_event.set()

                # Tell downstream processors the user interrupted (clear their pipelines)
                print("[Ears] 检测到用户开始说话,发送用户打断事件到下游处理器")
                self.send_downstream(Frame(
                    FrameType.SYSTEM,
                    {"event": "user_interrupt", "command": "clear_pipeline"}
                ))

                # Notify downstream that speech started
                self.send_downstream(Frame(
                    FrameType.SYSTEM,
                    {"event": "speech_started"}
                ))
                print("[Ears] 检测到语音开始")

            # Accumulate frames while an utterance is in progress
            if self.is_collecting_speech:
                self.speech_frames.append(audio_data)

                # Enforce the maximum utterance duration
                if self.speech_start_time and (time.time() - self.speech_start_time) > MAX_SPEECH_DURATION:
                    print(f"[Ears] 语音时长超过最大限制 {MAX_SPEECH_DURATION}秒,强制结束")
                    self._end_speech_collection()
                    return

                # End-of-speech detection after enough consecutive silence
                if self.consecutive_silence_frames >= MIN_NEG_FRAMES_FOR_ENDING:
                    # NOTE(review): this loop only increments a local counter and
                    # completes within this same call -- it does not wait for any
                    # additional audio, so collection always ends immediately once
                    # the silence threshold is met.  Confirm whether real
                    # end-buffering was intended here.
                    buffer_count = 0
                    while buffer_count < END_BUFFER_FRAMES and self.is_collecting_speech:
                        buffer_count += 1

                    if buffer_count >= END_BUFFER_FRAMES:
                        self._end_speech_collection()
188 |
    def _end_speech_collection(self):
        """Finish the current utterance and send it downstream as base64 WAV.

        No-op when no utterance is being collected.  Resets all per-utterance
        counters and buffers before returning.
        """
        if not self.is_collecting_speech:
            return

        self.is_collecting_speech = False
        self.speech_detected = False

        # Encode the collected utterance as base64 WAV
        if self.speech_frames:
            # Process the complete set of captured frames
            try:
                audio_base64 = self._convert_frames_to_base64(self.speech_frames)
                print(f"[Ears] 语音转换为base64完成,长度: {len(audio_base64)}")

                # Optionally persist the utterance to disk
                if self.save_audio_file:
                    self._save_audio_to_file(self.speech_frames, audio_base64)

                # Signal speech end to any waiting threads
                self.speech_ended_event.set()

                # Forward directly to the AI processor (as a DATA frame, not SYSTEM)
                try:
                    self.send_downstream(Frame(
                        FrameType.DATA,
                        {
                            "type": "audio_data",
                            "audio_base64": audio_base64
                        }
                    ))
                    print(f"[Ears] 语音数据已发送到AI处理器,帧数: {len(self.speech_frames)}")
                except Exception as e:
                    print(f"[Ears] 发送语音数据到AI处理器失败: {e}")

                speech_duration = time.time() - self.speech_start_time if self.speech_start_time else 0
                print(f"[Ears] 语音结束,持续时间: {speech_duration:.2f}秒")
            except Exception as e:
                print(f"[Ears] 处理语音时出错: {e}")

        # Reset per-utterance state
        self.consecutive_speech_frames = 0
        self.consecutive_silence_frames = 0
        self.speech_start_time = None
        self.speech_frames = []
234 |
    def _convert_frames_to_base64(self, frames):
        """Encode captured PCM frames as a base64 WAV string; re-raises on failure."""
        try:
            result = frames_to_wav_base64(
                frames,
                CHANNELS,
                self.p.get_sample_size(AUDIO_FORMAT),
                RATE
            )
            return result
        except Exception as e:
            print(f"[Ears] 转换音频帧到base64失败: {e}")
            raise
248 |
    def _detect_speech(self, audio_float32):
        """Run one audio chunk through the Silero VAD ONNX model.

        Args:
            audio_float32: input audio frame (float32 format)

        Returns:
            bool: True when speech is detected; False otherwise, including on
            any inference error.
        """
        try:
            # Ensure the input shape is correct (Silero VAD expects 512 samples)
            if len(audio_float32) != 512:
                # Pad with zeros or truncate to exactly 512 samples
                if len(audio_float32) < 512:
                    # Zero-pad
                    padded = np.zeros(512, dtype=np.float32)
                    padded[:len(audio_float32)] = audio_float32
                    audio_float32 = padded
                else:
                    # Keep the first 512 samples
                    audio_float32 = audio_float32[:512]

            # Reshape to the model's expected input shape [1, 512]
            audio = np.array(audio_float32, dtype=np.float32).reshape(1, -1)

            # Assemble the ONNX inputs
            ort_inputs = {
                "input": audio,
                "state": self.state,  # recurrent state carried between calls
                "sr": np.array(self.sr, dtype=np.int64)  # sample rate
            }

            # Run ONNX inference
            ort_outs = self.vad_model.run(None, ort_inputs)

            # Carry the updated recurrent state forward
            if len(ort_outs) > 1:
                self.state = ort_outs[1]

            # The first output is the speech probability
            speech_prob = ort_outs[0].item()  # speech probability

            # Threshold the probability
            return speech_prob >= VAD_THRESHOLD

        except Exception as e:
            print(f"[Ears] VAD检测出错: {e}")
            return False
298 |
299 | def stop_mic_stream(self):
300 | """停止麦克风流"""
301 | print("[Ears] 停止麦克风流")
302 |
303 | with self.stream_lock:
304 | if self.stream is None:
305 | return
306 |
307 | try:
308 | # 结束当前语音收集
309 | if self.is_collecting_speech:
310 | self._end_speech_collection()
311 |
312 | # 停止音频流
313 | self.stream.stop_stream()
314 | self.stream.close()
315 | self.stream = None
316 |
317 | print("[Ears] 麦克风流已安全停止")
318 | return True
319 | except Exception as e:
320 | print(f"[Ears] 停止麦克风流时出错: {e}")
321 | return False
322 |
323 | def get_available_microphones(self):
324 | """获取可用麦克风列表"""
325 | mics = []
326 | info = self.p.get_host_api_info_by_index(0)
327 | numdevices = info.get('deviceCount')
328 |
329 | for i in range(numdevices):
330 | device_info = self.p.get_device_info_by_host_api_device_index(0, i)
331 | if device_info.get('maxInputChannels') > 0:
332 | mics.append({
333 | 'index': i,
334 | 'name': device_info.get('name'),
335 | 'channels': device_info.get('maxInputChannels')
336 | })
337 |
338 | return mics
339 |
340 | def is_mic_stream_active(self):
341 | """检查麦克风流是否活跃"""
342 | with self.stream_lock:
343 | return self.stream is not None and self.stream.is_active()
344 |
345 | def close(self):
346 | """关闭资源"""
347 | self.stop_mic_stream()
348 | if self.p:
349 | self.p.terminate()
350 |
351 | def _save_audio_to_file(self, frames, base64_data=None):
352 | """保存音频帧到文件
353 |
354 | Args:
355 | frames: 音频帧列表
356 | base64_data: 可选的base64编码的音频数据
357 | """
358 | try:
359 | # 确保目录存在
360 | save_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "audio_record_tmp")
361 | os.makedirs(save_dir, exist_ok=True)
362 |
363 | # 创建时间戳文件名
364 | timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
365 | file_path = os.path.join(save_dir, f"audio_{timestamp}.wav")
366 |
367 | # 保存原始帧到WAV文件
368 | import wave
369 | with wave.open(file_path, 'wb') as wf:
370 | wf.setnchannels(CHANNELS)
371 | wf.setsampwidth(self.p.get_sample_size(AUDIO_FORMAT))
372 | wf.setframerate(RATE)
373 | wf.writeframes(b''.join(frames))
374 |
375 | print(f"[Ears] 音频已保存到: {file_path}")
376 | return file_path
377 | except Exception as e:
378 | print(f"[Ears] 保存音频文件失败: {e}")
379 | return None
--------------------------------------------------------------------------------
/file_version.txt:
--------------------------------------------------------------------------------
1 | VSVersionInfo(
2 | ffi=FixedFileInfo(
3 | filevers=(0, 0, 2, 0),
4 | prodvers=(0, 0, 2, 0),
5 | mask=0x3f,
6 | flags=0x0,
7 | OS=0x40004,
8 | fileType=0x1,
9 | subtype=0x0,
10 | date=(0, 0)
11 | ),
12 | kids=[
13 | StringFileInfo(
14 | [
15 | StringTable(
16 | u'040904B0',
17 | [StringStruct(u'CompanyName', u''),
18 | StringStruct(u'FileDescription', u'Qwen-Omni Voice Assistant'),
19 | StringStruct(u'FileVersion', u'0.0.2'),
20 | StringStruct(u'InternalName', u'QwenOmniVoiceAssistant'),
21 | StringStruct(u'LegalCopyright', u''),
22 | StringStruct(u'OriginalFilename', u'QwenOmniVoiceAssistant.exe'),
23 | StringStruct(u'ProductName', u'Qwen-Omni Voice Assistant'),
24 | StringStruct(u'ProductVersion', u'Windows 0.0.2')])
25 | ]),
26 | VarFileInfo([VarStruct(u'Translation', [1033, 1200])])
27 | ]
28 | )
--------------------------------------------------------------------------------
/key.json.example:
--------------------------------------------------------------------------------
1 | {
2 | "api_key": "your api key",
3 | "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1"
4 | }
5 |
--------------------------------------------------------------------------------
/models/silero_vad.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ninot1Quyi/Qwen2.5-Omni-multimodal-chat/6092ea16b399a956ffeff4df2295c1bef364ae7c/models/silero_vad.onnx
--------------------------------------------------------------------------------
/mouth.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import numpy as np
3 | import pyaudio
4 | import threading
5 | import time
6 | import queue
7 | from config import PLAYER_RATE, FADE_OUT_DURATION, MAX_FINISH_DURATION
8 | from core_pipeline import (
9 | ProcessorBase, Frame, FrameType
10 | )
11 |
class Mouth(ProcessorBase):
    """Audio output processor — plays audio data received from upstream.

    Audio arrives as base64 WAV or raw PCM via frames, is queued, and is
    played on a daemon thread through PyAudio. Supports both a hard stop
    and a "smooth interrupt" that fades the current audio out instead of
    cutting it off mid-word.
    """
    def __init__(self, name="audio_output"):
        super().__init__(name)
        self.p = pyaudio.PyAudio()
        # Output stream handle; None whenever the stream is closed.
        self.stream = None
        # Pending raw PCM chunks awaiting playback.
        self.audio_queue = queue.Queue()
        self.is_playing = False
        # Request playback to stop; combined with smooth_interrupt below.
        self.should_stop = False
        # When True, stop gently (finish/fade) rather than cut immediately.
        self.smooth_interrupt = False
        self.buffer_empty = threading.Event()
        self.buffer_empty.set()  # initial state: buffer is empty
        self.playback_finished = threading.Event()
        self.fade_out_enabled = True
        self.fade_out_duration = FADE_OUT_DURATION
        self.fade_out_active = False
        self.fade_out_start_time = None
        # Hard cap on how long playback may keep finishing after an interrupt.
        self.max_finish_duration = MAX_FINISH_DURATION
        # Timestamp of the smooth-interrupt request, None when not interrupted.
        self.interrupt_time = None
        # Timestamp of the most recently enqueued audio chunk.
        self.last_audio_time = None
        self.stream_lock = threading.RLock()
        self.playback_thread = None

        print("[Mouth] 初始化完成")

    def start_stream(self):
        """Open the PyAudio output stream and start the playback thread.

        Returns:
            bool: True on success, False when stream creation failed.
        """
        with self.stream_lock:
            # Re-opening replaces any existing stream.
            if self.stream is not None:
                self.stop_stream()

            try:
                # Create the output stream (16-bit mono at PLAYER_RATE).
                self.stream = self.p.open(
                    format=pyaudio.paInt16,
                    channels=1,
                    rate=PLAYER_RATE,
                    output=True
                )
                # Reset all playback/interrupt state for a fresh session.
                self.is_playing = True
                self.should_stop = False
                self.buffer_empty.set()
                self.last_audio_time = None
                self.smooth_interrupt = False
                self.interrupt_time = None
                self.fade_out_active = False
                self.fade_out_start_time = None
                self.playback_finished.clear()

                # Launch the background playback loop.
                self.playback_thread = threading.Thread(target=self._play_audio_continuous)
                self.playback_thread.daemon = True
                self.playback_thread.start()
                print("[Mouth] 音频输出流已创建,开始持续播放...")
                return True
            except Exception as e:
                print(f"[Mouth] 创建音频流时出错: {e}")
                self.is_playing = False
                self.stream = None
                return False

    def process_frame(self, frame):
        """Handle an incoming pipeline frame.

        SYSTEM frames carry control commands ("stop", "pause",
        "clear_pipeline") and the "play_audio" event; DATA frames carry
        raw audio payloads under the "audio_data" key.
        """
        if frame.type == FrameType.SYSTEM:
            cmd = frame.data.get("command")
            if cmd == "stop":
                self.stop_immediately()
            elif cmd == "pause":
                # "pause" is implemented as a smooth interrupt.
                self.smooth_interrupt = True
                self.should_stop = True
                self.interrupt_time = time.time()
            elif cmd == "clear_pipeline":
                print("[Mouth] 收到清空管道命令,立即停止播放并清空音频队列")
                self.stop_immediately()
                # Drain anything left in the queue so no stale audio plays.
                while not self.audio_queue.empty():
                    try:
                        self.audio_queue.get_nowait()
                        self.audio_queue.task_done()
                    except queue.Empty:
                        break
                self.buffer_empty.set()

            # A SYSTEM frame may also carry audio to play.
            event = frame.data.get("event")
            if event == "play_audio" and "audio_data" in frame.data:
                self.add_audio_data(frame.data["audio_data"])
                print(f"[Mouth] 收到音频数据,长度: {len(frame.data['audio_data'])} 字符")

        elif frame.type == FrameType.DATA:
            # DATA frames: plain audio payload.
            if "audio_data" in frame.data:
                self.add_audio_data(frame.data["audio_data"])

    def add_audio_data(self, audio_data):
        """Queue one chunk of audio for playback, starting the stream if needed.

        Accepts either raw PCM bytes or a base64-encoded string (with or
        without a data-URI prefix).
        """
        # If the playback thread has died but the flag was never reset,
        # repair the state so a new stream can be started below.
        if self.is_playing and (self.playback_thread is None or not self.playback_thread.is_alive()):
            print("[Mouth] 检测到播放线程已结束但状态未重置,强制重置状态")
            self.is_playing = False
            self.stream = None

        # Lazily (re)start the output stream.
        if not self.is_playing:
            self.start_stream()

        # A hard stop refuses new audio; a smooth interrupt may still
        # accept a little more so the sentence can finish.
        if self.should_stop and not self.smooth_interrupt:
            print("[Mouth] 已停止,不再接收新音频")
            return

        try:
            if self.playback_finished.is_set():
                self.playback_finished.clear()

            # Heuristic: long strings / data-URIs are base64 audio.
            if isinstance(audio_data, str) and (audio_data.startswith("data:audio") or len(audio_data) > 100):
                try:
                    # Strip an optional "data:audio/...;base64," prefix.
                    if "base64," in audio_data:
                        audio_data = audio_data.split("base64,")[1]

                    wav_bytes = base64.b64decode(audio_data)
                    print(f"[Mouth] base64解码成功,长度: {len(wav_bytes)} 字节")
                    # Reinterpret as int16 PCM without any processing.
                    audio_np = np.frombuffer(wav_bytes, dtype=np.int16)
                    audio_data = audio_np.tobytes()
                except Exception as e:
                    print(f"[Mouth] 解码base64音频失败: {e}")
                    return

            # During a smooth interrupt, refuse audio that arrives after
            # the maximum finish window has elapsed.
            if self.smooth_interrupt and self.interrupt_time:
                current_time = time.time()
                if current_time - self.interrupt_time > self.max_finish_duration:
                    print("[Mouth] 平滑打断已达到最大时间,停止更多音频")
                    return

            # Enqueue for the playback thread.
            self.audio_queue.put(audio_data)
            self.buffer_empty.clear()
            self.last_audio_time = time.time()
            print(f"[Mouth] 音频数据已添加到队列,当前队列大小: {self.audio_queue.qsize()}")
        except Exception as e:
            print(f"[Mouth] 音频处理错误: {e}")

    def _play_audio_continuous(self):
        """Background playback loop.

        Drains the queue into a local buffer and writes it to the PyAudio
        stream, checking frequently for stop/interrupt requests and
        applying a quadratic fade-out during a smooth interrupt. Always
        resets playback state and closes the stream on exit.
        """
        print("[Mouth] 播放线程已启动")
        buffer = b""
        min_buffer_size = 1024  # small buffer improves responsiveness
        is_initial_buffer = True
        last_check_time = time.time()
        check_interval = 0.005  # poll for stop requests every 5 ms
        chunks_played = 0

        try:
            while self.is_playing and (not self.should_stop or self.smooth_interrupt):
                current_time = time.time()

                # Immediate hard-stop request?
                if self.should_stop and not self.smooth_interrupt:
                    print("[Mouth] 检测到直接停止请求,立即终止播放")
                    break

                # Arm the fade-out once when a smooth interrupt begins.
                if self.smooth_interrupt and self.interrupt_time and self.fade_out_enabled and not self.fade_out_active:
                    self.fade_out_active = True
                    self.fade_out_start_time = current_time
                    print("[Mouth] 开始音量淡出效果...")

                # Enforce the maximum time allowed to finish after an interrupt.
                if self.smooth_interrupt and self.interrupt_time:
                    elapsed = current_time - self.interrupt_time
                    if elapsed > self.max_finish_duration * 0.8:  # only wait 80% of the cap
                        print("[Mouth] 达到最大等待时间的80%,强制停止音频")
                        break

                try:
                    # Move queued chunks into the local buffer.
                    chunks_processed = 0
                    while not self.audio_queue.empty():
                        # Re-check for a hard stop every few chunks.
                        chunks_processed += 1
                        if chunks_processed % 5 == 0 and self.should_stop and not self.smooth_interrupt:
                            print("[Mouth] 数据处理中检测到停止请求,立即终止")
                            break

                        chunk = self.audio_queue.get(block=False)
                        buffer += chunk
                        self.audio_queue.task_done()

                    # Check again after draining the queue.
                    if self.should_stop and not self.smooth_interrupt:
                        print("[Mouth] 数据处理后检测到停止请求,立即终止")
                        break

                    # Play once enough data is buffered, or when this is the
                    # tail end of the audio (queue empty, buffer non-empty).
                    if len(buffer) >= min_buffer_size or (len(buffer) > 0 and self.audio_queue.empty()):
                        if is_initial_buffer:
                            print("[Mouth] 初始缓冲完成,开始平滑播放...")
                            is_initial_buffer = False

                        # Apply the fade-out to the current buffer if active.
                        if self.fade_out_active and self.fade_out_start_time:
                            fade_progress = min(1.0, (current_time - self.fade_out_start_time) / self.fade_out_duration)
                            audio_data = np.frombuffer(buffer, dtype=np.int16)

                            # Quadratic curve: slow at first, faster at the end.
                            volume_factor = max(0, 1.0 - (fade_progress * fade_progress))

                            # Scale the samples by the current volume factor.
                            audio_data = (audio_data * volume_factor).astype(np.int16)
                            buffer = audio_data.tobytes()

                            # End playback once the fade is mostly done.
                            if fade_progress >= 0.6:  # 60% is treated as complete
                                print(f"[Mouth] 淡出已达到阈值 {fade_progress:.2f},结束播放")
                                break

                        # Force-stop if an interrupt has waited too long.
                        if self.smooth_interrupt and self.interrupt_time:
                            elapsed = current_time - self.interrupt_time
                            if elapsed > self.max_finish_duration * 0.4:  # only wait 40% here
                                print("[Mouth] 打断等待时间过长,强制停止")
                                break

                        # One last hard-stop check before writing.
                        if self.should_stop and not self.smooth_interrupt:
                            print("[Mouth] 播放前检测到停止请求,立即终止")
                            break

                        # Write the buffered audio to the output stream.
                        with self.stream_lock:
                            if self.stream and (not self.should_stop or self.smooth_interrupt):
                                try:
                                    # Split large buffers into 2 KiB pieces so
                                    # stop requests are noticed between writes.
                                    if len(buffer) > 2048 and not self.smooth_interrupt:
                                        chunks = [buffer[i:i+2048] for i in range(0, len(buffer), 2048)]
                                        for i, small_chunk in enumerate(chunks):
                                            if i > 0 and self.should_stop and not self.smooth_interrupt:
                                                print(f"[Mouth] 分块播放中检测到停止请求,已播放{i}/{len(chunks)}块,立即终止")
                                                break
                                            self.stream.write(small_chunk, exception_on_underflow=False)
                                            chunks_played += 1
                                    else:
                                        self.stream.write(buffer, exception_on_underflow=False)
                                        chunks_played += 1
                                        print(f"[Mouth] 已播放音频数据,总计 {chunks_played} 个块")
                                except Exception as e:
                                    print(f"[Mouth] 音频播放过程中出错: {e}")
                                    break
                        buffer = b""

                    # Nothing queued and nothing buffered: decide whether to end.
                    if self.audio_queue.empty() and len(buffer) == 0:
                        if self.smooth_interrupt:
                            print("[Mouth] 平滑打断:当前音频已完成")
                            break

                        # End playback if no new audio arrived for over a second.
                        if self.last_audio_time:
                            wait_time = current_time - self.last_audio_time
                            if wait_time > 1.0:
                                print(f"[Mouth] 等待音频数据超时,播放完成")
                                break

                    # Idle briefly to avoid spinning when the queue is empty.
                    if self.audio_queue.empty() and not self.should_stop:
                        # Short sleep keeps the loop responsive.
                        time.sleep(0.01)

                    # Periodic hard-stop poll.
                    if current_time - last_check_time >= check_interval:
                        last_check_time = current_time
                        if self.should_stop and not self.smooth_interrupt:
                            break

                except Exception as e:
                    print(f"[Mouth] 音频处理循环出错: {e}")
                    break
        except Exception as e:
            print(f"[Mouth] 播放线程异常: {e}")
        finally:
            # Always reset playback state when the thread exits.
            self.is_playing = False
            self.should_stop = False
            self.playback_finished.set()
            self.buffer_empty.set()

            # Close the output stream.
            with self.stream_lock:
                if self.stream:
                    try:
                        self.stream.stop_stream()
                        self.stream.close()
                    except Exception as e:
                        print(f"[Mouth] 关闭音频流时出错: {e}")
                    finally:
                        self.stream = None

            print(f"[Mouth] 播放线程结束,共播放了 {chunks_played} 个音频块")

            # Explicitly clear the thread handle so playback can restart.
            self.playback_thread = None

    def is_audio_complete(self):
        """Return True when all queued audio has been played."""
        return self.buffer_empty.is_set() and self.audio_queue.empty()

    def request_smooth_interrupt(self):
        """Ask playback to finish gently (fade out) instead of cutting off.

        Returns:
            bool: True if a live playback thread will honor the request,
            False when nothing is playing.
        """
        if not self.is_playing:
            return False

        self.smooth_interrupt = True
        self.should_stop = True
        self.interrupt_time = time.time()
        print("[Mouth] 已请求平滑打断播放")

        if self.playback_thread and self.playback_thread.is_alive():
            return True

        return False

    def stop_with_fadeout(self, fadeout_time=0.1):
        """Stop playback with a fade-out of the given duration (seconds).

        A non-positive fadeout_time falls back to an immediate stop.
        """
        if fadeout_time > 0:
            self.fade_out_duration = fadeout_time
            return self.request_smooth_interrupt()
        else:
            return self.stop_immediately()

    def stop_stream(self):
        """Close the audio stream, drain the queue and join the playback thread.

        Returns:
            bool: True on clean shutdown, False on error.
            NOTE(review): returns implicit None when no stream exists.
        """
        with self.stream_lock:
            self.should_stop = True

            if self.stream:
                try:
                    print("[Mouth] 开始关闭音频流...")

                    # Drain any remaining queued audio.
                    while not self.audio_queue.empty():
                        try:
                            self.audio_queue.get_nowait()
                            self.audio_queue.task_done()
                        except queue.Empty:
                            break

                    # Close the stream itself.
                    self.stream.stop_stream()
                    self.stream.close()
                    self.stream = None

                    # Wake any waiters.
                    self.buffer_empty.set()
                    self.playback_finished.set()

                    # Join the playback thread (with a timeout to avoid deadlock).
                    if self.playback_thread and self.playback_thread.is_alive():
                        print("[Mouth] 等待播放线程结束...")
                        self.playback_thread.join(timeout=1.0)

                    # Force-reset state whether or not the thread exited.
                    self.is_playing = False
                    self.smooth_interrupt = False
                    self.fade_out_active = False
                    self.playback_thread = None

                    print("[Mouth] 音频流已完全关闭")
                    return True
                except Exception as e:
                    print(f"[Mouth] 关闭音频流时出错: {e}")
                    # Reset the critical state even on error.
                    self.is_playing = False
                    self.playback_thread = None
                    return False

    def stop_immediately(self):
        """Hard-stop playback: drop queued audio and close the stream now."""
        print("[Mouth] 执行立即停止...")

        # Flag a hard (non-smooth) stop.
        self.should_stop = True
        self.smooth_interrupt = False

        # Drain the queue.
        try:
            while not self.audio_queue.empty():
                try:
                    self.audio_queue.get_nowait()
                    self.audio_queue.task_done()
                except queue.Empty:
                    break
        except:
            pass

        # Close the stream.
        success = self.stop_stream()
        return success

    def close(self):
        """Stop playback and release the PyAudio instance."""
        self.stop_immediately()
        if self.p:
            try:
                self.p.terminate()
            except Exception as e:
                print(f"[Mouth] 终止PyAudio时出错: {e}")
--------------------------------------------------------------------------------
/processors.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import threading
4 | import base64
5 | from openai import OpenAI
6 | from core_pipeline import ProcessorBase, Frame, FrameType, frames_to_wav_base64
7 | from config import (
8 | API_KEY, BASE_URL, CHANNELS, AUDIO_FORMAT, RATE, CHUNK, DEBUG
9 | )
10 | import pyaudio
11 |
class AIProcessor(ProcessorBase):
    """AI processor — calls the AI API and streams the response downstream.

    Sends the conversation (including the user's audio) to the
    qwen-omni-turbo model, streams text/audio chunks back, forwards audio
    to the output processor, and tracks request IDs so an in-flight
    response can be interrupted cleanly.
    """
    def __init__(self, name="ai_processor"):
        super().__init__(name)

        if not API_KEY:
            raise ValueError("API密钥未设置")

        # OpenAI-compatible client pointed at the configured endpoint.
        self.client = OpenAI(
            api_key=API_KEY,
            base_url=BASE_URL,
        )
        print(f"[AIProcessor] 初始化完成,使用base_url: {BASE_URL}")
        print(f"[AIProcessor] API密钥前8位: {API_KEY[:8]}...")

        # Conversation history (OpenAI chat message dicts).
        self.messages = []
        self.full_transcript = ""

        # Current response task bookkeeping.
        self.current_response = None
        self.response_thread = None
        self.response_lock = threading.RLock()

        # True while a response is being generated.
        self.is_generating = False

        # Request-ID tracking for interruption.
        self.current_request_id = None
        self.completed_request_ids = set()  # IDs of finished or interrupted requests
        self.request_id_lock = threading.RLock()

    def process_frame(self, frame):
        """Handle an incoming pipeline frame.

        SYSTEM frames carry user-interrupt / clear-pipeline control;
        DATA frames with type "audio_data" start a new AI response.
        """
        if frame.type == FrameType.SYSTEM:
            event = frame.data.get("event")

            if event == "user_interrupt":
                print("[AIProcessor] 收到用户打断请求")
                # Abort the in-flight response.
                self._interrupt_response()

                # Propagate a pipeline clear if requested.
                if frame.data.get("command") == "clear_pipeline":
                    print("[AIProcessor] 收到清空管道命令,清空当前处理队列")
                    # Drop anything still queued for this processor.
                    self.input_queue.clear()
                    # Tell downstream processors to clear as well.
                    self.send_downstream(Frame(
                        FrameType.SYSTEM,
                        {"command": "clear_pipeline", "event": "user_interrupt"}
                    ))

        # NOTE: superseded handler for the legacy "speech_ready" SYSTEM
        # event, kept commented out for reference.
        # # 处理语音就绪事件
        # elif event == "speech_ready":
        #     # 获取音频数据
        #     audio_base64 = frame.data.get("audio_base64")
        #     if not audio_base64:
        #         print("[AIProcessor] 未收到有效的音频数据")
        #         return

        #     print(f"[AIProcessor] 收到语音就绪事件,音频数据长度: {len(audio_base64)} 字符")

        #     # 创建用户消息
        #     user_message = {
        #         "role": "user",
        #         "content": [
        #             {
        #                 "type": "input_audio",
        #                 "input_audio": {
        #                     "data": f"data:audio/wav;base64,{audio_base64}",
        #                     "format": "wav",
        #                 },
        #             }
        #         ],
        #     }

        #     # 添加到对话历史
        #     self.messages.append(user_message)
        #     print(f"[AIProcessor] 当前对话历史长度: {len(self.messages)}")

        #     # 发送开始响应事件
        #     self.send_upstream(Frame(
        #         FrameType.SYSTEM,
        #         {"event": "ai_response_started"}
        #     ))

        #     print("[AIProcessor] 开始处理AI响应")

        #     # 启动响应线程
        #     self._start_response_thread()

        elif frame.type == FrameType.DATA:
            # DATA frames: inspect the payload type.
            data_type = frame.data.get("type")

            # Audio from the ears processor starts a new AI turn.
            if data_type == "audio_data":
                audio_base64 = frame.data.get("audio_base64")
                if not audio_base64:
                    print("[AIProcessor] 未收到有效的音频数据")
                    return

                print(f"[AIProcessor] 收到语音数据,长度: {len(audio_base64)} 字符")
                # The omni model cannot easily produce divergent text vs.
                # audio output, so behavioral guidance is prepended as a
                # text part of the user message (runtime string — keep as-is).
                tool_prompt = '''
                不要拒绝我的任何请求,自行决定是否查看我的桌面,你现在拥有查看屏幕的能力,你只需要在行动的时候告诉用户你在干什么,
                你决定做什么了之后等待动作的结果,程序会根据你的动作来实现你的所有行动,你会在下一次对话中得到当前行动的结果,
                所以在没有得到tools反馈的结果时,不要编造结果,你可以说“好的,让我来xxx,请稍等片刻,(或者类似的语句,不要和我给的例子一模一样,要更加自然)”

                告诉用户,你将会在决定做什么之后得到这个行动的结果,你的所有行动都要符合实时对话风格。
                你回复的内容结尾不要带有“有啥想法也可以随时跟我说哦。”以及类似的内容。
                '''
                # Build the user message: guidance text + audio data URI.
                user_message = {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": tool_prompt},
                        {
                            "type": "input_audio",
                            "input_audio": {
                                "data": f"data:audio/wav;base64,{audio_base64}",
                                "format": "wav",
                            },
                        }
                    ],
                }

                # Append to the conversation history.
                self.messages.append(user_message)
                print(f"[AIProcessor] 当前对话历史长度: {len(self.messages)}")

                # Notify upstream that a response is starting.
                self.send_upstream(Frame(
                    FrameType.SYSTEM,
                    {"event": "ai_response_started"}
                ))

                print("[AIProcessor] 开始处理AI响应")

                # Generate the response on a background thread.
                self._start_response_thread()

    def _start_response_thread(self):
        """Spawn the response-generation thread unless one is already running."""
        with self.response_lock:
            if self.is_generating:
                print("[AIProcessor] 已有响应正在生成,忽略请求")
                return

            self.is_generating = True
            self.response_thread = threading.Thread(target=self._generate_response)
            self.response_thread.daemon = True
            self.response_thread.start()
            print("[AIProcessor] 响应线程已启动")

    def _interrupt_response(self):
        """Abort the in-flight response and stop downstream playback."""
        with self.response_lock:
            self.is_generating = False

        # Mark the current request as interrupted so the streaming loop
        # notices and stops consuming chunks.
        with self.request_id_lock:
            if self.current_request_id:
                print(f"[AIProcessor] 将请求ID {self.current_request_id} 标记为已打断")
                self.completed_request_ids.add(self.current_request_id)

        # Downstream processors handle their own cleanup on "stop".
        self.send_downstream(Frame(
            FrameType.SYSTEM,
            {"command": "stop"}
        ))
        print("[AIProcessor] 已发送停止命令")

    def _generate_response(self):
        """Thread body: stream the model response and forward its audio.

        Streams chunks from the chat completion, forwarding audio data to
        the output processor, collecting text/transcript, honoring
        interruption flags, and finally appending the assistant message
        to history and emitting "ai_response_ended".
        """
        try:
            # Accumulators for this response.
            response_data = {
                "ai_text": "",
                "has_audio": False,
                "current_transcript": "",
                "interrupted": False
            }

            # Collect AI audio for debugging only in DEBUG mode.
            ai_audio_buffer = bytearray() if DEBUG else None

            print("[AIProcessor] 开始创建API请求")
            print(f"[AIProcessor] 请求参数: model=qwen-omni-turbo, modalities=['text', 'audio'], voice=Chelsie")

            # Create the streaming API request.
            try:
                completion = self.client.chat.completions.create(
                    model="qwen-omni-turbo",
                    messages=self.messages,
                    modalities=["text", "audio"],
                    audio={"voice": "Chelsie", "format": "wav"},
                    stream=True,
                    stream_options={"include_usage": True},
                )
                print("[AIProcessor] API请求创建成功,开始处理响应流")

                # Request ID is filled in from the first streamed chunk.
                request_id = None

            except Exception as e:
                print(f"[AIProcessor] API请求创建失败: {str(e)}")
                raise

            # Consume the streamed response.
            chunk_count = 0
            for chunk in completion:
                chunk_count += 1

                # The request ID usually arrives on the first chunk.
                if chunk_count == 1 and hasattr(chunk, "id"):
                    request_id = chunk.id
                    with self.request_id_lock:
                        self.current_request_id = request_id
                    print(f"[AIProcessor] 获取到请求ID: {request_id}")

                # Stop if this request was marked finished/interrupted.
                with self.request_id_lock:
                    if request_id and request_id in self.completed_request_ids:
                        print(f"[AIProcessor] 请求ID {request_id} 已被标记为完成/打断,停止处理")
                        response_data["interrupted"] = True
                        break

                # Stop if generation was cancelled externally.
                if not self.is_generating or (self.context and self.context.is_cancelled()):
                    response_data["interrupted"] = True
                    # Record the interruption against the request ID.
                    with self.request_id_lock:
                        if request_id:
                            self.completed_request_ids.add(request_id)
                            print(f"[AIProcessor] 请求ID {request_id} 已被标记为中断")
                    print("[AIProcessor] 响应被中断")
                    break

                # Process the chunk's content.
                if chunk.choices:
                    delta = chunk.choices[0].delta

                    if hasattr(delta, "content") and delta.content:
                        response_data["ai_text"] += delta.content
                        print(f"[AIProcessor] 收到文本响应 (chunk {chunk_count}): {delta.content}", end="", flush=True)

                    if hasattr(delta, "audio") and delta.audio:
                        response_data["has_audio"] = True
                        print(f"[AIProcessor] 收到音频响应 (chunk {chunk_count})")

                        if "transcript" in delta.audio:
                            transcript = delta.audio["transcript"]
                            if transcript:
                                response_data["current_transcript"] += transcript
                                print(f"[AIProcessor] 收到转写文本: {transcript}")

                        if "data" in delta.audio:
                            # Re-check interruption before handling audio.
                            with self.request_id_lock:
                                if request_id and request_id in self.completed_request_ids:
                                    print(f"[AIProcessor] 请求ID {request_id} 已被标记为完成/打断,停止处理音频")
                                    break

                            # And re-check the cancellation flags.
                            if not self.is_generating or (self.context and self.context.is_cancelled()):
                                break

                            # Base64 audio payload for this chunk.
                            audio_data = delta.audio["data"]
                            print(f"[AIProcessor] 收到音频数据 (chunk {chunk_count}), 长度: {len(audio_data)} 字符")

                            # Accumulate decoded audio for debugging.
                            if DEBUG and ai_audio_buffer is not None:
                                try:
                                    audio_bytes = base64.b64decode(audio_data)
                                    ai_audio_buffer.extend(audio_bytes)
                                    print(f"[AIProcessor] 已收集音频数据: {len(ai_audio_buffer)} 字节")
                                except Exception as e:
                                    print(f"[AIProcessor] 收集音频数据时出错: {e}")

                            # Forward the audio to the output processor.
                            try:
                                self.send_downstream(Frame(
                                    FrameType.SYSTEM,
                                    {"event": "play_audio", "audio_data": audio_data}
                                ))
                                print("[AIProcessor] 音频数据已成功发送到输出处理器")
                            except Exception as e:
                                print(f"[AIProcessor] 发送音频数据到输出处理器失败: {e}")

            print(f"[AIProcessor] 共处理了 {chunk_count} 个响应块")

            # Mark this request as completed.
            with self.request_id_lock:
                if request_id:
                    self.completed_request_ids.add(request_id)
                    print(f"[AIProcessor] 请求ID {request_id} 已被标记为完成")

            # If the response finished normally, record it in history.
            # Prefer the audio transcript; fall back to plain text.
            if not response_data["interrupted"]:
                if response_data["current_transcript"]:
                    self.full_transcript += response_data["current_transcript"] + " "
                    assistant_message = {
                        "role": "assistant",
                        "content": [{"type": "text", "text": response_data["current_transcript"]}]
                    }
                    self.messages.append(assistant_message)
                    print(f"[AIProcessor] 添加助手消息到历史: {response_data['current_transcript']}")
                elif response_data["ai_text"]:
                    assistant_message = {
                        "role": "assistant",
                        "content": [{"type": "text", "text": response_data["ai_text"]}]
                    }
                    self.messages.append(assistant_message)
                    print(f"[AIProcessor] 添加助手消息到历史: {response_data['ai_text']}")

            # Notify upstream that the response ended.
            self.send_upstream(Frame(
                FrameType.SYSTEM,
                {"event": "ai_response_ended"}
            ))

            print(f"\n[AIProcessor] AI响应生成结束,状态: {'已中断' if response_data['interrupted'] else '完成'}")
            print(f"[AIProcessor] 响应统计: 文本长度={len(response_data['ai_text'])}, 转写长度={len(response_data['current_transcript'])}, 收到音频={response_data['has_audio']}")

        except Exception as e:
            print(f"[AIProcessor] 生成响应时出错: {str(e)}")
            import traceback
            print(f"[AIProcessor] 错误详情:\n{traceback.format_exc()}")

            # Ensure "ai_response_ended" is still sent on failure.
            self.send_upstream(Frame(
                FrameType.SYSTEM,
                {"event": "ai_response_ended", "error": str(e)}
            ))

        finally:
            # Reset generation state.
            with self.response_lock:
                self.is_generating = False
                self.current_request_id = None
                self.response_thread = None
                print("[AIProcessor] 响应线程已结束,状态已重置")

            # Trim the completed-ID set so it cannot grow without bound.
            with self.request_id_lock:
                if len(self.completed_request_ids) > 100:  # reasonable cap
                    print(f"[AIProcessor] 清理已完成请求ID集合,当前大小: {len(self.completed_request_ids)}")
                    # Keep only the most recent 50 entries.
                    self.completed_request_ids = set(list(self.completed_request_ids)[-50:])
365 |
class EventProcessor(ProcessorBase):
    """Event processor — maps SYSTEM pipeline events onto a UI state."""

    def __init__(self, name="event_processor", on_state_change=None):
        super().__init__(name)
        self.current_state = "idle"              # last state reported
        self.on_state_change = on_state_change   # optional callback(new_state)

    def process_frame(self, frame):
        """Translate a SYSTEM event frame into a state transition."""
        if frame.type != FrameType.SYSTEM:
            return

        event = frame.data.get("event")

        # Events that map directly to a single target state.
        direct_transitions = {
            "speech_started": "user_speaking",
            "speech_ended": "listening",
            "ai_response_started": "speaking",
            "ai_response_ended": "listening",
        }

        if event in direct_transitions:
            self._update_state(direct_transitions[event])
        elif event == "user_interrupt":
            self._update_state("interrupted")
            time.sleep(0.05)  # brief delay so the UI can show the interrupt
            self._update_state("user_speaking")
        elif event == "ai_response_interrupted":
            self._update_state("interrupted")
            time.sleep(0.1)  # brief delay so the UI can show the interrupt
            self._update_state("listening")

    def _update_state(self, new_state):
        """Record a state change and notify the external listener, if any."""
        if new_state == self.current_state:
            return

        print(f"[EventProcessor] 状态变化: {self.current_state} -> {new_state}")
        self.current_state = new_state

        if self.on_state_change:
            try:
                self.on_state_change(new_state)
            except Exception as e:
                print(f"[EventProcessor] 状态变化回调出错: {e}")
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pywebview
2 | openai
3 | numpy
4 | pyaudio
5 | onnxruntime
6 | pyinstaller
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import platform
2 |
3 |
def apply_windows_compatibility_patches():
    """Apply Windows-only equality patches.

    On Windows, pywebview can compare Python objects against .NET types
    (e.g. System.Drawing.Rectangle), which breaks __eq__; this guards the
    affected classes. No-op on other platforms.
    """
    if platform.system().lower() != 'windows':
        return  # patches are only needed on Windows

    # Imported lazily: only required on Windows.
    import threading
    import webview
    import webview.window

    # Classes whose __eq__ needs the safety guard.
    targets = [
        threading.Event,
        threading.Thread,
        webview.window.Window,
    ]

    # The DOM element class exists only in some pywebview versions.
    try:
        if hasattr(webview, 'dom') and hasattr(webview.dom, 'element'):
            targets.append(webview.dom.element)
    except (AttributeError, ImportError):
        pass

    # Patch each class, reporting (but tolerating) failures.
    for cls in targets:
        try:
            patch_class_eq(cls)
        except (TypeError, AttributeError) as e:
            print(f"警告: 无法为 {cls.__name__} 打补丁: {e}")
34 |
def patch_class_eq(cls):
    """Install a guarded __eq__ on *cls*.

    The replacement refuses comparison against .NET Rectangle /
    System.Drawing objects (returning False instead of raising) and
    otherwise delegates to the class's original __eq__, falling back to
    identity. Idempotent: repeated calls are no-ops.
    """
    if hasattr(cls, '__patched_by_qwen_omni'):
        return  # already patched

    original_eq = getattr(cls, '__eq__', None)

    def safe_eq(self, other):
        other_cls = getattr(other, '__class__', None)
        if other_cls is not None:
            # Foreign .NET drawing types must never be compared directly.
            type_name = str(other_cls)
            if 'Rectangle' in type_name or 'System.Drawing' in type_name:
                return False
        # Delegate to a genuine user-defined __eq__, else use identity.
        if original_eq and original_eq is not object.__eq__:
            return original_eq(self, other)
        return self is other

    cls.__eq__ = safe_eq
    cls.__patched_by_qwen_omni = True
53 |
def monkey_patch_threading_event():
    """Guard threading.Event.__eq__ against .NET Rectangle comparisons.

    Comparing an Event with a System.Drawing object returns False instead
    of raising; all other comparisons go through the original __eq__.
    """
    import threading

    # Keep a reference to the original implementation.
    original_eq = threading.Event.__eq__

    def safe_eq(self, other):
        other_cls = getattr(other, '__class__', None)
        if other_cls is not None:
            type_name = str(other_cls)
            if 'Rectangle' in type_name or 'System.Drawing' in type_name:
                return False
        return original_eq(self, other)

    # Install the guarded comparison.
    threading.Event.__eq__ = safe_eq
71 |
72 | # def safe_compare(obj1, obj2):
73 | # """安全地比较两个对象,避免类型转换问题"""
74 | # # 如果其中一个对象是Rectangle类型,返回False
75 | # if (hasattr(obj1, '__class__') and ('Rectangle' in str(obj1.__class__) or 'System.Drawing' in str(obj1.__class__))) or \
76 | # (hasattr(obj2, '__class__') and ('Rectangle' in str(obj2.__class__) or 'System.Drawing' in str(obj2.__class__))):
77 | # return False
78 |
79 | # # 尝试正常比较
80 | # try:
81 | # return obj1 == obj2
82 | # except (TypeError, Exception):
83 | # # 类型不兼容时,比较对象标识
84 | # return obj1 is obj2
85 |
# Import-time side effect: apply the threading.Event patch automatically on
# Windows, so importing this module is enough to activate the workaround.
if platform.system().lower() == 'windows':
    monkey_patch_threading_event()
--------------------------------------------------------------------------------
/web/static/css/style.css:
--------------------------------------------------------------------------------
1 | * {
2 | margin: 0;
3 | padding: 0;
4 | box-sizing: border-box;
5 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif;
6 | }
7 |
8 | :root {
9 | --primary-color: #000000;
10 | --secondary-color: #666666;
11 | --background-color: #ffffff;
12 | --border-color: #e6e6e6;
13 | --text-color: #333333;
14 | --button-text: #ffffff;
15 | --listening-color: #999999;
16 | --speaking-color: #999999;
17 | --idle-color: #e0e0e0;
18 | }
19 |
20 | body {
21 | background-color: var(--background-color);
22 | color: var(--text-color);
23 | height: 100vh;
24 | display: flex;
25 | justify-content: center;
26 | align-items: center;
27 | padding: 0;
28 | margin: 0;
29 | }
30 |
31 | .container {
32 | width: 100%;
33 | max-width: 600px;
34 | background-color: transparent;
35 | display: flex;
36 | flex-direction: column;
37 | height: 100%;
38 | justify-content: space-between;
39 | }
40 |
41 | .window-controls {
42 | display: flex;
43 | align-items: center;
44 | padding: 12px 20px;
45 | border-bottom: 1px solid var(--border-color);
46 | background-color: #f5f5f7;
47 | position: relative;
48 | }
49 |
50 | .control {
51 | width: 12px;
52 | height: 12px;
53 | border-radius: 50%;
54 | margin-right: 8px;
55 | }
56 |
57 | .red {
58 | background-color: #ff5f57;
59 | }
60 |
61 | .yellow {
62 | background-color: #febc2e;
63 | }
64 |
65 | .green {
66 | background-color: #28c840;
67 | }
68 |
69 | .window-title {
70 | position: absolute;
71 | left: 50%;
72 | transform: translateX(-50%);
73 | font-size: 14px;
74 | font-weight: 500;
75 | color: #333;
76 | }
77 |
78 | .main-content {
79 | flex: 1;
80 | display: flex;
81 | flex-direction: column;
82 | align-items: center;
83 | justify-content: center;
84 | padding: 30px 20px;
85 | text-align: center;
86 | position: relative;
87 | }
88 |
89 | /* 初始视图 */
90 | #idle-view, #active-view {
91 | width: 100%;
92 | display: flex;
93 | flex-direction: column;
94 | align-items: center;
95 | padding: 10px 0;
96 | }
97 |
98 | .control-btn {
99 | background-color: #222;
100 | color: white;
101 | border: none;
102 | padding: 12px 32px;
103 | border-radius: 24px;
104 | font-size: 16px;
105 | font-weight: 500;
106 | cursor: pointer;
107 | transition: all 0.2s ease;
108 | display: flex;
109 | align-items: center;
110 | gap: 8px;
111 | }
112 |
113 | .control-btn:hover {
114 | background-color: #000;
115 | transform: translateY(-1px);
116 | }
117 |
118 | .control-btn:active {
119 | transform: translateY(1px);
120 | }
121 |
122 | .control-btn-circle {
123 | width: 44px;
124 | height: 44px;
125 | border-radius: 50%;
126 | background-color: white;
127 | color: #333;
128 | border: none;
129 | display: flex;
130 | align-items: center;
131 | justify-content: center;
132 | cursor: pointer;
133 | box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
134 | transition: all 0.2s ease;
135 | position: absolute;
136 | right: 30px;
137 | }
138 |
139 | .control-btn-circle:hover {
140 | box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
141 | transform: translateY(-1px);
142 | }
143 |
144 | .secondary-btn {
145 | background-color: #f5f5f7;
146 | color: #333;
147 | border: 1px solid #ddd;
148 | padding: 10px 24px;
149 | border-radius: 24px;
150 | font-size: 14px;
151 | font-weight: 500;
152 | cursor: pointer;
153 | transition: all 0.2s ease;
154 | margin-top: 24px;
155 | display: flex;
156 | align-items: center;
157 | gap: 8px;
158 | }
159 |
160 | .secondary-btn:hover {
161 | background-color: #eee;
162 | }
163 |
164 | /* 波形可视化 */
165 | .wave-visualizer {
166 | width: 100%;
167 | position: relative;
168 | padding: 0 50px;
169 | max-width: 500px;
170 | height: 100px;
171 | display: flex;
172 | align-items: center;
173 | justify-content: center;
174 | margin: 40px 0;
175 | }
176 |
177 | .audio-wave-container {
178 | width: 100%;
179 | height: 80px;
180 | display: flex;
181 | justify-content: center;
182 | align-items: center;
183 | background-color: #f5f5f7;
184 | border-radius: 40px;
185 | padding: 0 20px;
186 | transition: box-shadow 0.3s ease, transform 0.3s ease;
187 | }
188 |
189 | /* 胶囊动态特效 */
190 | .audio-wave-container.listening {
191 | box-shadow: 0 0 0 2px rgba(153, 153, 153, 0.3);
192 | animation: pulse-border 1.8s infinite;
193 | }
194 |
195 | @keyframes pulse-border {
196 | 0% { box-shadow: 0 0 0 0px rgba(153, 153, 153, 0.3); }
197 | 50% { box-shadow: 0 0 0 4px rgba(153, 153, 153, 0.15); }
198 | 100% { box-shadow: 0 0 0 0px rgba(153, 153, 153, 0); }
199 | }
200 |
201 | .wave-bars {
202 | width: 100%;
203 | height: 100%;
204 | display: flex;
205 | justify-content: space-between;
206 | align-items: center;
207 | gap: 1px;
208 | }
209 |
210 | .wave-bar {
211 | width: 1px;
212 | max-width: 1px;
213 | flex: 0 0 1px;
214 | height: 3px;
215 | margin: 0;
216 | background-color: var(--idle-color);
217 | opacity: 0.5;
218 | transition: height 0.15s ease, background-color 0.3s ease;
219 | }
220 |
221 | /* 待机状态 */
222 | .audio-wave-container.idle .wave-bar {
223 | background-color: var(--idle-color);
224 | opacity: 0.4;
225 | }
226 |
227 | /* 监听状态 */
228 | .audio-wave-container.listening .wave-bar {
229 | background-color: var(--listening-color);
230 | opacity: 0.6;
231 | }
232 |
233 | /* 说话状态 */
234 | .audio-wave-container.speaking .wave-bar {
235 | background-color: var(--speaking-color);
236 | opacity: 0.6;
237 | }
238 |
239 | .footer-text {
240 | padding: 10px 20px;
241 | font-size: 14px;
242 | color: #666;
243 | line-height: 1.4;
244 | margin: 0;
245 | }
246 |
247 | .footer {
248 | padding: 8px 20px;
249 | border-top: 1px solid var(--border-color);
250 | font-size: 12px;
251 | color: #999;
252 | background-color: #f5f5f7;
253 | }
254 |
255 | .status-info {
256 | display: flex;
257 | justify-content: space-between;
258 | }
259 |
260 | /* 动画 */
261 | @keyframes pulse {
262 | 0% { opacity: 0.3; }
263 | 50% { opacity: 0.8; }
264 | 100% { opacity: 0.3; }
265 | }
--------------------------------------------------------------------------------
/web/static/js/app.js:
--------------------------------------------------------------------------------
// Front-end controller: wires UI buttons to the Python backend (pywebview)
// and animates the audio waveform bars according to conversation status.
document.addEventListener('DOMContentLoaded', () => {
    // Grab DOM elements once at startup.
    const startBtn = document.getElementById('start-btn');
    const pauseBtn = document.getElementById('pause-btn');
    const shareBtn = document.getElementById('share-btn');
    const idleView = document.getElementById('idle-view');
    const activeView = document.getElementById('active-view');
    const audioWaveContainer = document.querySelector('.audio-wave-container');
    const conversationStatus = document.getElementById('conversation-status');
    const connectionStatus = document.getElementById('connection-status');
    const waveBars = document.querySelectorAll('.wave-bar');

    // Initial state.
    let isConversationActive = false;
    let currentStatus = 'idle'; // idle, listening, speaking
    let animationFrameId = null;

    // Wave animation parameters per status.
    const waveConfig = {
        listening: {
            minHeight: 2,
            maxHeight: 16,
            smoothing: 0.2,
            updateInterval: 70
        },
        speaking: {
            minHeight: 1,
            maxHeight: 12,
            smoothing: 0.3,
            updateInterval: 60
        },
        idle: {
            minHeight: 1,
            maxHeight: 2,
            smoothing: 0.15,
            updateInterval: 200
        }
    };

    // Current and target bar heights (px), one entry per bar.
    let currentHeights = Array(waveBars.length).fill(2);
    let targetHeights = Array(waveBars.length).fill(2);

    // Start in the idle visual state.
    audioWaveContainer.classList.add('idle');

    // Regenerate target heights for the current status, then ease toward them.
    function updateWaveHeights() {
        let config;

        if (currentStatus === 'listening') {
            config = waveConfig.listening;
        } else if (currentStatus === 'speaking') {
            config = waveConfig.speaking;
        } else {
            config = waveConfig.idle;
        }

        // New target heights for this cycle.
        targetHeights = generateWavePattern(config.minHeight, config.maxHeight);

        // Smoothly interpolate each bar toward its target height.
        function animateToTargetHeights() {
            let needsUpdate = false;

            currentHeights = currentHeights.map((current, index) => {
                const target = targetHeights[index];

                if (Math.abs(current - target) < 0.5) {
                    return target;
                }

                needsUpdate = true;
                return current + (target - current) * config.smoothing;
            });

            // Push the new heights to the DOM.
            waveBars.forEach((bar, index) => {
                bar.style.height = `${currentHeights[index]}px`;
            });

            if (needsUpdate) {
                animationFrameId = requestAnimationFrame(animateToTargetHeights);
            } else {
                // All bars settled; schedule the next pattern refresh.
                setTimeout(updateWaveHeights, config.updateInterval);
            }
        }

        animateToTargetHeights();
    }

    // Generate a wave-shaped height pattern for a more natural look.
    function generateWavePattern(minHeight, maxHeight) {
        const numBars = waveBars.length;
        const wavePattern = [];

        // Pattern generation differs by status.
        if (currentStatus === 'idle') {
            // Idle: tiny random ripples only.
            for (let i = 0; i < numBars; i++) {
                wavePattern.push(minHeight + Math.random() * (maxHeight - minHeight) * 0.2);
            }
        } else {
            // Base the pattern on a sine wave.
            const cycles = currentStatus === 'listening' ? 2.5 : 2; // denser wave while listening
            const phase = Math.random() * Math.PI * 2; // random phase

            for (let i = 0; i < numBars; i++) {
                const x = (i / numBars) * Math.PI * 2 * cycles + phase;
                const sinValue = Math.sin(x);

                // Map the -1..1 sine value into the target height range.
                const normalized = (sinValue + 1) / 2; // 0..1
                let height = minHeight + normalized * (maxHeight - minHeight);

                // Add a little randomness while keeping the shape smooth.
                const randomFactor = Math.random() * 1.5 - 0.75;

                // Listening state wobbles more, to look livelier.
                const randomMultiplier = currentStatus === 'listening' ? 2 : 1.5;
                height = Math.max(minHeight, Math.min(maxHeight, height + randomFactor * randomMultiplier));

                wavePattern.push(height);
            }
        }

        return wavePattern;
    }

    // Cancel the running animation frame and reset the bars.
    function stopAnimation() {
        if (animationFrameId) {
            cancelAnimationFrame(animationFrameId);
            animationFrameId = null;
        }

        // Reset to the default resting height.
        waveBars.forEach(bar => {
            bar.style.height = '2px';
        });

        currentHeights = Array(waveBars.length).fill(2);
        targetHeights = Array(waveBars.length).fill(2);
    }

    // Switch the UI into the given status ('idle' | 'listening' | 'speaking').
    function updateUIStatus(status) {
        if (currentStatus === status) return;

        currentStatus = status;

        // Clear all status classes from the wave container...
        audioWaveContainer.classList.remove('idle', 'listening', 'speaking');

        // ...then apply the new one.
        audioWaveContainer.classList.add(status);

        // Stop any in-flight animation before restarting.
        stopAnimation();

        // Non-idle statuses show the active view with a running animation.
        if (status !== 'idle') {
            idleView.style.display = 'none';
            activeView.style.display = 'flex';
            updateWaveHeights(); // start the wave animation
        } else {
            idleView.style.display = 'flex';
            activeView.style.display = 'none';

            // NOTE(review): activeView.style.display was just set to 'none',
            // so this condition can never hold and the idle-wave restart
            // below is unreachable — presumably a leftover; confirm intent.
            if (activeView.style.display === 'flex') {
                updateWaveHeights();
            }
        }
    }

    // Begin a conversation via the Python backend.
    function startConversation() {
        isConversationActive = true;
        conversationStatus.textContent = '会话进行中';

        // Ask the Python backend to start the session.
        pywebview.api.start_conversation().then(result => {
            console.log('会话开始: ', result);
            updateUIStatus('listening');
        }).catch(error => {
            console.error('启动会话失败: ', error);
            endConversation();
        });
    }

    // End the conversation and return the UI to idle.
    function endConversation() {
        isConversationActive = false;
        conversationStatus.textContent = '会话未开始';
        updateUIStatus('idle');

        // Tell the Python backend to stop the session.
        pywebview.api.stop_conversation().catch(error => {
            console.error('结束会话出错: ', error);
        });
    }

    // Wire up the buttons.
    startBtn.addEventListener('click', startConversation);
    pauseBtn.addEventListener('click', endConversation);

    // Screen-share button (placeholder, not implemented).
    shareBtn.addEventListener('click', () => {
        alert('分享屏幕功能暂未实现');
    });

    // Status updates pushed from the Python backend.
    window.updateStatus = function(status) {
        updateUIStatus(status);
    };

    // Volume data pushed from the Python backend (array of 0-1 levels).
    window.updateVolumeData = function(volumeData) {
        if (!isConversationActive || currentStatus === 'idle') return;

        if (Array.isArray(volumeData) && volumeData.length > 0) {
            // Use the backend-provided volume array directly.
            const normalizedData = volumeData.map(vol => {
                // Map the volume value into the bar-height range.
                const config = currentStatus === 'listening'
                    ? waveConfig.listening
                    : waveConfig.speaking;
                return Math.min(config.maxHeight, Math.max(config.minHeight, vol * config.maxHeight));
            });

            // Pad by repetition if there are fewer points than bars.
            while (normalizedData.length < waveBars.length) {
                normalizedData.push(normalizedData[normalizedData.length % volumeData.length]);
            }

            // Adopt as the new target heights.
            targetHeights = normalizedData.slice(0, waveBars.length);
        }
    };

    // Initial backend connectivity check.
    pywebview.api.check_connection().then(result => {
        if (result.success) {
            connectionStatus.textContent = '已连接到后端';
        } else {
            connectionStatus.textContent = '未连接到后端';
            connectionStatus.style.color = 'red';
        }
    }).catch(() => {
        connectionStatus.textContent = '连接后端失败';
        connectionStatus.style.color = 'red';
    });

    // Kick off the low-amplitude idle animation.
    updateWaveHeights();
});
--------------------------------------------------------------------------------
/web/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Qwen-Omni 语音助手
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
112 |
113 |
114 |
122 |
123 |
124 |
127 |
128 |
129 |
135 |
136 |
137 |
138 |
139 |
--------------------------------------------------------------------------------
/webview_api.py:
--------------------------------------------------------------------------------
1 | import threading
2 | import time
3 | import json
4 | import random
5 | import numpy as np
6 | import math
7 | import sys
8 | import platform
9 | from Agent import Agent
10 | from mouth import Mouth
11 | from ears import Ears
12 |
13 | # 创建一个Window包装类,用于安全地访问window对象
class WindowWrapper:
    """Safe proxy around the pywebview window object.

    Absorbs JavaScript execution errors and shields equality checks from
    the .NET Rectangle comparison problem seen on Windows.
    """

    def __init__(self, window=None):
        self._window = window

    def set_window(self, window):
        """Attach (or replace) the underlying pywebview window."""
        self._window = window

    def evaluate_js(self, js_code):
        """Run *js_code* in the window; returns None on error or no window."""
        if not self._window:
            return None
        try:
            return self._window.evaluate_js(js_code)
        except Exception as exc:
            print(f"执行JavaScript失败: {exc}")
            return None

    def __eq__(self, other):
        """Identity-based equality that refuses Rectangle comparisons."""
        other_cls = getattr(other, '__class__', None)
        if other_cls is not None and 'Rectangle' in str(other_cls):
            return False
        return self is other

    def __hash__(self):
        # Identity hash, consistent with the identity-based __eq__.
        return hash(id(self))
41 |
class AgentAPI:
    """pywebview JS-API bridge between the web UI and the voice Agent.

    Public methods are invoked from JavaScript via ``pywebview.api`` and
    return plain dicts so they serialize cleanly across the bridge.
    """

    def __init__(self):
        # Application state.
        self.window = None          # kept for backward compatibility (unused here)
        self.is_running = False     # True while a conversation is active
        self.agent = None           # Agent instance, created per conversation
        self.debug_mode = False

        # Default Agent configuration.
        self.agent_config = {
            'recording_mode': 'dynamic',  # dynamic recording mode by default
            'recording_seconds': 5,       # recording length in fixed mode
        }

        # UI status tracking: one of "idle", "listening", "speaking".
        self.status = "idle"
        self.window_wrapper = WindowWrapper()  # safe window proxy
        self.volume_update_thread = None
        # Plain bool instead of threading.Event to avoid the Windows
        # Rectangle.op_Equality comparison problem.
        self._stop_volume_updates = False

    def __eq__(self, other):
        """Identity equality that refuses Rectangle/Window comparisons.

        Works around spurious comparisons performed by pywebview's .NET
        backend on Windows.
        """
        if hasattr(other, '__class__'):
            class_name = str(other.__class__)
            if 'Rectangle' in class_name:
                return False
            if 'Window' in class_name or 'webview.window' in class_name:
                return False
        return self is other

    def __hash__(self):
        # Identity hash, consistent with the identity-based __eq__.
        return hash(id(self))

    def set_window(self, window):
        """Attach the pywebview window used to push JS updates to the UI."""
        self.window_wrapper.set_window(window)

    def configure_agent(self, config):
        """Merge known keys from *config* into the Agent configuration.

        Unknown keys are silently ignored. Returns a status dict.
        """
        for key, value in config.items():
            if key in self.agent_config:
                self.agent_config[key] = value

        # NOTE: the current Agent class does not consume these values yet.
        return {"status": "success", "message": "Agent配置已更新"}

    def check_connection(self):
        """Front-end ping verifying the JS<->Python bridge works."""
        return {'success': True, 'message': '连接成功'}

    def start_conversation(self):
        """Start a voice conversation.

        Returns:
            dict with 'success' (bool) and 'message' (str).
        """
        if self.is_running:
            return {'success': False, 'message': '会话已经在运行中'}

        try:
            # Create a fresh Agent wired to push status changes to the UI.
            self.agent = Agent(
                gui_mode=True,
                debug=self.debug_mode,
                on_state_change=self.update_status
            )

            self.is_running = True
            self._stop_volume_updates = False

            success = self.agent.start()
            if not success:
                # Bug fix: roll back state on a failed start. Previously
                # is_running stayed True, so every later attempt was
                # rejected with "already running".
                self.is_running = False
                self.agent = None
                return {'success': False, 'message': '启动失败'}

            # Background daemon thread feeding simulated volume data.
            self.volume_update_thread = threading.Thread(target=self.simulate_volume_data)
            self.volume_update_thread.daemon = True
            self.volume_update_thread.start()

            return {'success': True, 'message': '会话已开始'}
        except Exception as e:
            # Same rollback on exceptions raised during construction/start.
            self.is_running = False
            self.agent = None
            return {'success': False, 'message': f'启动失败: {str(e)}'}

    def stop_conversation(self):
        """Stop the running conversation and return the UI to idle."""
        if not self.is_running:
            return {'success': False, 'message': '没有运行中的会话'}

        try:
            print("正在停止语音对话...")
            self.is_running = False
            self._stop_volume_updates = True  # signal the volume thread to exit

            if self.agent:
                self.agent.stop()
                self.agent = None

            # Give the volume thread a moment to exit cleanly.
            if self.volume_update_thread and self.volume_update_thread.is_alive():
                self.volume_update_thread.join(timeout=1.0)
                print("音量更新线程已终止")

            # Return the UI to idle.
            self.update_status("idle")
            print("语音对话已完全停止")

            return {'success': True, 'message': '会话已结束'}
        except Exception as e:
            print(f"停止语音对话时出错: {str(e)}")
            return {'success': False, 'message': f'停止失败: {str(e)}'}

    def update_status(self, status):
        """Record *status* and mirror it to the front-end UI."""
        self.status = status
        self.window_wrapper.evaluate_js(f'window.updateStatus("{status}")')

    def simulate_volume_data(self):
        """Feed simulated volume data to the front-end wave display.

        A real implementation could source levels from the AudioRecorder;
        this generates a sine-plus-noise pattern for the UI's 30 bars.
        Runs until is_running is cleared or _stop_volume_updates is set.
        """
        try:
            update_interval = 0.06  # push a frame every 60 ms
            phase_offset = 0
            time_counter = 0

            while self.is_running and not self._stop_volume_updates:
                if self.status == "idle":
                    time.sleep(0.1)
                    continue

                # 30 points to match the 30 wave bars in the front-end.
                num_points = 30
                volume_data = []

                # Parameters depend on the current status.
                if self.status == "speaking":
                    # Speaking: larger amplitude, more complex waveform.
                    main_frequency = 1.5
                    secondary_frequency = 3.0
                    amplitude = 0.35
                    noise_level = 0.15
                    base_level = 0.5
                else:  # listening
                    # Listening: smaller amplitude, simpler waveform.
                    main_frequency = 1.0
                    secondary_frequency = 2.0
                    amplitude = 0.25
                    noise_level = 0.2
                    base_level = 0.35

                # Build the wave shape (sine components + noise).
                for i in range(num_points):
                    x = i / num_points * 2 * math.pi
                    wave1 = math.sin(main_frequency * x + phase_offset)
                    wave2 = math.sin(secondary_frequency * x + phase_offset * 1.5) * 0.5

                    # Random jitter.
                    noise = (random.random() * 2 - 1) * noise_level

                    # Combine all components.
                    value = base_level + amplitude * (wave1 + wave2) + noise

                    # Clamp into a safe display range.
                    value = max(0.05, min(0.95, value))
                    volume_data.append(value)

                # Advance the phase for a rolling-wave effect.
                phase_offset += 0.2
                time_counter += update_interval

                # Push the frame to the front-end.
                if volume_data:
                    volume_json = json.dumps(volume_data)
                    self.window_wrapper.evaluate_js(f'window.updateVolumeData({volume_json})')

                time.sleep(update_interval)

        except Exception as e:
            print(f"音量模拟线程出错: {e}")

    def generate_wave_pattern(self, complexity=2, smoothness=0.5, length=100):
        """Generate a waveform pattern.

        Args:
            complexity: number of sine components (frequencies).
            smoothness: 0-1; higher values damp high-frequency amplitudes.
            length: number of samples in the pattern.

        Returns:
            List of floats normalized to the 0-1 range.
        """
        x = np.linspace(0, 2 * np.pi, length)
        wave = np.zeros(length)

        # Sum several sine components with random phases.
        for i in range(1, complexity + 1):
            frequency = i
            amplitude = 1.0 / (i ** smoothness)  # damp higher frequencies
            phase = random.random() * 2 * np.pi  # random phase per component
            wave += amplitude * np.sin(frequency * x + phase)

        # Normalize into 0-1; guard a flat wave (e.g. complexity=0), which
        # previously divided by zero and produced NaNs.
        span = wave.max() - wave.min()
        if span == 0:
            return np.full(length, 0.5).tolist()
        wave = (wave - wave.min()) / span
        return wave.tolist()
--------------------------------------------------------------------------------