├── src
│   ├── __init__.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   └── config_loader.cpython-310.pyc
│   │   └── config_loader.py
│   ├── transcriber.py
│   ├── llm_client.py
│   ├── voice_generator.py
│   └── audio_capture.py
├── logo.png
├── doc_pic
│   ├── GUI.png
│   ├── QR.png
│   ├── Use.jpg
│   └── logo.png
├── output
│   └── test_record.wav
├── requirements.txt
├── config.ini
├── main_cmd.py
├── README.md
├── main.py
└── README_en.md

/src/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SMACY2017/InterPilot/HEAD/logo.png
--------------------------------------------------------------------------------
/doc_pic/GUI.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SMACY2017/InterPilot/HEAD/doc_pic/GUI.png
--------------------------------------------------------------------------------
/doc_pic/QR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SMACY2017/InterPilot/HEAD/doc_pic/QR.png
--------------------------------------------------------------------------------
/doc_pic/Use.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SMACY2017/InterPilot/HEAD/doc_pic/Use.jpg
--------------------------------------------------------------------------------
/doc_pic/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SMACY2017/InterPilot/HEAD/doc_pic/logo.png
--------------------------------------------------------------------------------
/output/test_record.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SMACY2017/InterPilot/HEAD/output/test_record.wav
--------------------------------------------------------------------------------
/src/utils/__pycache__/config_loader.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SMACY2017/InterPilot/HEAD/src/utils/__pycache__/config_loader.cpython-310.pyc
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
PyQt5==5.15.11
PyQt5_sip==12.17.0
PyAudioWPatch==0.2.12.7
openai==1.68.2
openai-whisper==20240930
markdown2==2.5.3
pydub==0.25.1
sounddevice==0.5.1
soundfile==0.13.1
tiktoken==0.9.0
torch==2.6.0
tqdm==4.67.1
numpy==2.1.3
requests==2.32.3
--------------------------------------------------------------------------------
/config.ini:
--------------------------------------------------------------------------------
[DEFAULT]
API_URL = https://api.siliconflow.cn/v1
API_KEY = sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
MODEL = deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
SPEAKER_DEVICE_INDEX = -1
MIC_DEVICE_INDEX = 33
OUTPUT_DIR = output
WHISPER_MODEL_SIZE = base
DEFAULT_PROMPT = "请你作为一个熟悉人工智能知识的专业算法工程师帮助我。我正在参加一场面试,接下来你被输入的文字来自于面试官的语音转文字,请你全力理解并为我写好合适的回答:"

--------------------------------------------------------------------------------
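下面给出一个最小示意(非仓库自带文件),演示上面 config.ini 中几个关键项的典型读法:`SPEAKER_DEVICE_INDEX` 取 -1 时作为"自动选择默认 loopback 设备"的哨兵值(对应 src/audio_capture.py 中的处理),`DEFAULT_PROMPT` 会被拼接在转写文本之前(对应 main.py 中的处理)。

```python
# 示意:读取 config.ini,复现 -1 哨兵值与 DEFAULT_PROMPT 拼接的用法
# (逻辑取自 src/audio_capture.py 与 main.py,仅用于理解配置项含义)
from src.utils.config_loader import get_config

default = get_config()['DEFAULT']

device_index = default.getint('SPEAKER_DEVICE_INDEX')
device_index = None if device_index < 0 else device_index  # None 表示自动选默认 loopback 设备

prompt = f"{default['DEFAULT_PROMPT']}\n这里替换为转写得到的文本"
print(device_index, prompt[:40])
```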
"请你作为一个熟悉人工智能知识的专业算法工程师帮助我。我正在参加一场面试,接下来你被输入的文字来自于面试官的语音转文字,请你全力理解并为我写好合适的回答:" 10 | -------------------------------------------------------------------------------- /src/utils/config_loader.py: -------------------------------------------------------------------------------- 1 | # src/utils/config_loader.py 2 | import os 3 | import configparser 4 | 5 | def get_config(): 6 | # 获取当前文件的绝对路径 7 | current_dir = os.path.dirname(os.path.abspath(__file__)) 8 | # 向上回溯两级到项目根目录(workspace/) 9 | project_root = os.path.dirname(os.path.dirname(current_dir)) 10 | config_path = os.path.join(project_root, 'config.ini') 11 | 12 | config = configparser.ConfigParser() 13 | config.read(config_path) 14 | 15 | 16 | return config -------------------------------------------------------------------------------- /src/transcriber.py: -------------------------------------------------------------------------------- 1 | # 第一次运行会自动下载模型文件 2 | import whisper 3 | from whisper.utils import get_writer 4 | import os 5 | import configparser 6 | 7 | #获取当前文件的绝对路径,向上一级,用绝对路径找到config.ini并读取 8 | current_dir = os.path.dirname(os.path.abspath(__file__)) 9 | project_root = os.path.dirname(current_dir) 10 | config_path = os.path.join(project_root, 'config.ini') 11 | MYCONFIG = configparser.ConfigParser() 12 | MYCONFIG.read(config_path,encoding='utf-8') 13 | 14 | 15 | class SpeechTranscriber: 16 | def __init__(self, model_size=MYCONFIG['DEFAULT']['WHISPER_MODEL_SIZW']): 17 | self.model = whisper.load_model(model_size) 18 | 19 | def transcribe(self, audio_path): 20 | #判断这个音频文件大小是否小于1个字节 21 | if os.path.getsize(audio_path) < 1: 22 | return "音频文件大小为0" 23 | result = self.model.transcribe(audio_path) 24 | return result["text"] 25 | 26 | if __name__ == "__main__": 27 | transcriber = SpeechTranscriber() 28 | text = transcriber.transcribe("output/test_record.wav") 29 | print("转写结果:", text) 30 | -------------------------------------------------------------------------------- /main_cmd.py: -------------------------------------------------------------------------------- 1 | from src.audio_capture import LoopbackRecorder 2 | from src.transcriber import SpeechTranscriber 3 | from src.llm_client import LLMClient 4 | import time 5 | import os 6 | 7 | def update_response(new_text): 8 | #作为回调函数,更新response 9 | print(new_text, end="", flush=True,sep="") 10 | 11 | def main(): 12 | while True: 13 | # 1. 录音 14 | # 按ctrl+c开始 15 | client = LLMClient() 16 | input('按任意键开始录音...') 17 | # 开始录音 18 | print("正在启动录音...") 19 | recorder = LoopbackRecorder(device_index=33) 20 | recorder.start_recording("interview.wav") 21 | recorder.record(duration=5)# 录制5秒 22 | 23 | recorder.stop_recording() 24 | print("录音完成") 25 | 26 | # 2. 转写 27 | print("\n开始转写...") 28 | transcriber = SpeechTranscriber() 29 | text = transcriber.transcribe("interview.wav") 30 | print(f"\n转写内容: {text}") 31 | 32 | # 3. 
/main_cmd.py:
--------------------------------------------------------------------------------
from src.audio_capture import LoopbackRecorder
from src.transcriber import SpeechTranscriber
from src.llm_client import LLMClient
import time
import os

def update_response(new_text):
    # 作为回调函数,流式打印返回内容
    print(new_text, end="", flush=True, sep="")

def main():
    while True:
        # 1. 录音
        # 按回车开始
        client = LLMClient()
        input('按回车键开始录音...')
        # 开始录音
        print("正在启动录音...")
        recorder = LoopbackRecorder(device_index=33)
        recorder.start_recording("interview.wav")
        recorder.record(duration=5)  # 录制5秒

        recorder.stop_recording()
        print("录音完成")

        # 2. 转写
        print("\n开始转写...")
        transcriber = SpeechTranscriber()
        text = transcriber.transcribe("interview.wav")
        print(f"\n转写内容: {text}")

        # 3. LLM处理

        print("\n模型回复:")
        client.get_response(f"\n{text}", callback=update_response)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/src/llm_client.py:
--------------------------------------------------------------------------------
from openai import OpenAI

import os
import configparser

# 获取当前文件的绝对路径,向上一级,用绝对路径找到config.ini并读取
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
config_path = os.path.join(project_root, 'config.ini')
MYCONFIG = configparser.ConfigParser()
MYCONFIG.read(config_path, encoding='utf-8')

def update_response(new_text):
    # 作为回调函数,流式打印返回内容
    print(new_text, end="", flush=True, sep="")

class LLMClient:
    def __init__(self, api_url=MYCONFIG['DEFAULT']['API_URL'], api_key=MYCONFIG['DEFAULT']['API_KEY'], model=MYCONFIG['DEFAULT']['MODEL']):
        self.client = OpenAI(
            api_key=api_key,
            base_url=api_url
        )
        self.model = model

    def get_response(self, prompt, callback=None):
        model = self.model
        response = self.client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            stream=True
        )
        full_response = ""
        for chunk in response:
            if not chunk.choices:
                continue
            if chunk.choices[0].delta.content:
                content = chunk.choices[0].delta.content
                full_response += content
                if callback:
                    callback(content)
            # reasoning_content 是部分推理模型(如 DeepSeek-R1 系列)特有的字段,
            # 用 getattr 兜底,避免在不返回该字段的服务上抛 AttributeError
            reasoning = getattr(chunk.choices[0].delta, "reasoning_content", None)
            if reasoning:
                full_response += reasoning
                if callback:
                    callback(reasoning)
        return full_response



if __name__ == "__main__":
    # API 地址、密钥与模型名均从 config.ini 读取
    client = LLMClient()

    print(client.get_response("请你作为一个熟悉人工智能知识的专业算法工程师帮助我。我正在参加一场面试,接下来你被输入的文字来自于面试官的语音转文字,请你全力理解并为我写好合适的回答:听你刚刚的介绍,你在训练模型的过程中遇到过拟合了怎么办?", callback=update_response))
--------------------------------------------------------------------------------
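LLMClient 的构造参数可以覆盖 config.ini 中的默认值,因此同一客户端也能指向其他兼容 OpenAI 接口的服务(README 中亦有说明)。下面示意中的地址、密钥与模型名均为占位假设值:

```python
# 示意:通过构造参数指向其他 OpenAI 兼容服务(以下三个取值均为占位符)
from src.llm_client import LLMClient

client = LLMClient(
    api_url="https://example.com/v1",  # 假设的兼容服务地址
    api_key="sk-placeholder",          # 占位密钥
    model="vendor/model-name",         # 占位模型名
)
client.get_response("你好,请自我介绍。", callback=lambda t: print(t, end="", flush=True))
```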
/src/voice_generator.py:
--------------------------------------------------------------------------------
import edge_tts
import pyaudiowpatch as pyaudio
import subprocess
import threading
from dataclasses import dataclass
from typing import Optional, Callable

@dataclass
class TTSConfig:
    voice: str = "zh-TW-HsiaoYuNeural"
    sample_rate: int = 24000
    channels: int = 1
    format: int = pyaudio.paInt16

class VoiceGenerator:
    def __init__(self, config: Optional[TTSConfig] = None):
        self.config = config or TTSConfig()
        self.p = pyaudio.PyAudio()
        self._stop_event = threading.Event()
        self._current_stream = None

    def _create_ffmpeg_process(self):
        return subprocess.Popen(
            [
                "ffmpeg",
                "-hide_banner",
                "-loglevel", "error",
                "-i", "pipe:0",
                "-f", "s16le",
                "-acodec", "pcm_s16le",
                "-ac", str(self.config.channels),
                "-ar", str(self.config.sample_rate),
                "pipe:1"
            ],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE
        )

    def _audio_output_thread(self, ffmpeg_process):
        stream = self.p.open(
            format=self.config.format,
            channels=self.config.channels,
            rate=self.config.sample_rate,
            output=True
        )
        self._current_stream = stream

        try:
            while not self._stop_event.is_set():
                data = ffmpeg_process.stdout.read(1024)
                if not data:
                    break
                stream.write(data)
        finally:
            stream.stop_stream()
            stream.close()
            # 注意:此处不调用 self.p.terminate(),否则同一实例第二次 speak() 会失败

    def speak(self, text: str, callback: Optional[Callable] = None):
        """非阻塞式播放"""
        self._stop_event.clear()

        def _run():
            try:
                communicate = edge_tts.Communicate(text, voice=self.config.voice)
                ffmpeg_process = self._create_ffmpeg_process()

                audio_thread = threading.Thread(
                    target=self._audio_output_thread,
                    args=(ffmpeg_process,)
                )
                audio_thread.start()

                for chunk in communicate.stream_sync():
                    if chunk["type"] == "audio":
                        ffmpeg_process.stdin.write(chunk["data"])

                ffmpeg_process.stdin.close()
                audio_thread.join()
                ffmpeg_process.wait()

                if callback:
                    callback(True, None)
            except Exception as e:
                if callback:
                    callback(False, str(e))
            finally:
                self._current_stream = None

        threading.Thread(target=_run).start()

    def stop(self):
        """立即停止播放"""
        self._stop_event.set()
        if self._current_stream:
            self._current_stream.stop_stream()

if __name__ == "__main__":
    generator = VoiceGenerator()
    generator.speak("你好,欢迎使用Edge TTS。")
    input("按回车键停止播放...")
    generator.stop()
--------------------------------------------------------------------------------
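speak() 的回调约定是 callback(是否成功, 错误信息或 None),见上方 _run 中的两处调用。下面是利用该约定获知播放结果的最小示意:

```python
# 示意:通过 speak() 的 callback(ok, err) 约定获知播放结果
from src.voice_generator import VoiceGenerator

def on_done(ok, err):
    print("播放完成" if ok else f"播放失败: {err}")

gen = VoiceGenerator()
gen.speak("你好,这是一段测试语音。", callback=on_done)
input("播放中,按回车键可随时停止...\n")
gen.stop()
```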
/src/audio_capture.py:
--------------------------------------------------------------------------------
# audio_capture.py
import pyaudiowpatch as pyaudio
import wave
import os
import configparser
# 获取当前文件的绝对路径,向上一级,用绝对路径找到config.ini并读取
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
config_path = os.path.join(project_root, 'config.ini')
MYCONFIG = configparser.ConfigParser()
MYCONFIG.read(config_path, encoding='utf-8')

class LoopbackRecorder:
    def __init__(self, device_index=MYCONFIG['DEFAULT'].getint('SPEAKER_DEVICE_INDEX')):
        # 初始化
        self.p = None
        self.stream = None
        self.wave_file = None

        self.is_recording = False
        self.device_index = None if device_index < 0 else device_index
        self.device_info = self._get_device()

    def _cleanup(self):
        """严格遵循示例的资源释放顺序"""
        print("正在清理资源...")
        if self.is_recording:
            self.is_recording = False
        if self.stream:
            self.stream.close()
        if self.wave_file:
            self.wave_file.close()
        if self.p:
            self.p.terminate()

    def _get_device(self):
        """严格遵循官方示例的设备获取方式"""
        self.p = pyaudio.PyAudio()
        try:
            if self.device_index is None:
                self.device_info = self.p.get_default_wasapi_loopback()
            else:
                self.device_info = self.p.get_device_info_by_index(self.device_index)

            # 验证设备是否支持loopback
            if self.device_info["maxInputChannels"] < 1:
                raise ValueError("设备不支持loopback输入")

            return self.device_info
        except (OSError, LookupError) as e:
            raise RuntimeError(f"设备初始化失败: {str(e)}")

    def start_recording(self, filename="output.wav"):
        """完全按照官方示例的流初始化方式"""
        try:
            self._get_device()

            # 参数直接从设备信息获取
            self.rate = int(self.device_info["defaultSampleRate"])
            self.channels = self.device_info["maxInputChannels"]
            self.format = pyaudio.paInt16
            self.sample_size = self.p.get_sample_size(self.format)

            # 初始化WAV文件
            self.wave_file = wave.open(filename, 'wb')
            self.wave_file.setnchannels(self.channels)
            self.wave_file.setsampwidth(self.sample_size)
            self.wave_file.setframerate(self.rate)

            # 创建音频流(与示例完全一致)
            self.stream = self.p.open(
                format=self.format,
                channels=self.channels,
                rate=self.rate,
                input=True,
                input_device_index=self.device_info["index"],
                frames_per_buffer=1024
            )

            self.is_recording = True
            print(f"成功启动录音: {self.device_info['name']}")

        except Exception as e:
            self._cleanup()
            raise RuntimeError(f"启动录音失败: {e}")

    def record(self, duration=None):
        if not self.is_recording:
            raise RuntimeError("必须先调用start_recording()")
        try:
            if duration:  # 定时录音模式
                print(f"正在录制 {duration} 秒...")
                for _ in range(0, int(self.rate / 1024 * duration)):
                    if not self.is_recording:  # 检查是否收到停止信号
                        break
                    data = self.stream.read(1024)
                    self.wave_file.writeframes(data)
            else:  # 持续录音模式
                print("持续录音中...")
                while self.is_recording:
                    data = self.stream.read(1024)
                    self.wave_file.writeframes(data)
                self.stop_recording()
        finally:
            self._cleanup()
            print("录音已停止")

    def stop_recording(self):
        """严格遵循示例的资源释放顺序"""
        if self.is_recording:
            self.is_recording = False
        if self.stream:
            self.stream.close()
        if self.wave_file:
            self.wave_file.close()
        if self.p:
            self.p.terminate()
        print("录音已安全停止")

    @staticmethod
    def list_devices():
        """设备列表查询(直接使用官方推荐方式)"""
        with pyaudio.PyAudio() as p:
            # 打印默认输入设备信息
            print("\n=== 默认输入设备 ===")
            try:
                default = p.get_default_input_device_info()
                print(f"* 默认设备: [{default['index']}] {default['name']}")
            except Exception as e:
                print("! 未找到默认输入设备")

            # 打印默认输出设备信息
            print("\n=== 默认输出设备 ===")
            try:
                default = p.get_default_output_device_info()
                print(f"* 默认设备: [{default['index']}] {default['name']}")
            except Exception as e:
                print("! 未找到默认输出设备")

            print("\n=== 默认Loopback设备 ===")
            try:
                default = p.get_default_wasapi_loopback()
                print(f"* 默认设备: [{default['index']}] {default['name']}")
            except Exception as e:
                print("! 未找到默认loopback设备")

            print("\n所有含有InputChannel的设备:")
            for i in range(p.get_device_count()):
                dev = p.get_device_info_by_index(i)
                if dev["maxInputChannels"] > 0:
                    print(f"[{dev['index']}] {dev['name']} (输入通道: {dev['maxInputChannels']})")

            print("\n所有含有OutputChannel的设备:")
            for i in range(p.get_device_count()):
                dev = p.get_device_info_by_index(i)
                if dev["maxOutputChannels"] > 0:
                    print(f"[{dev['index']}] {dev['name']} (输出通道: {dev['maxOutputChannels']})")

            print("\n所有含有Loopback的设备:")
            for i in range(p.get_device_count()):
                dev = p.get_device_info_by_index(i)
                if dev["isLoopbackDevice"] > 0:
                    print(f"[{dev['index']}] {dev['name']} (Loopback: {dev['maxInputChannels']})")

if __name__ == "__main__":
    # 列出设备
    LoopbackRecorder.list_devices()

    try:
        recorder = LoopbackRecorder()  # 未传索引时使用配置中的默认设备
        recorder.start_recording("output/test_record.wav")
        recorder.record(duration=5)  # 录制5秒
        recorder.stop_recording()
    except Exception as e:
        print(f"录音失败: {str(e)}")
--------------------------------------------------------------------------------
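如果只想快速确认 config.ini 中 SPEAKER_DEVICE_INDEX 应填的索引,可以用下面的精简片段,只打印 loopback 设备(过滤逻辑与上面 list_devices() 的最后一段相同):

```python
# 示意:仅列出 loopback 设备,便于为 config.ini 选择 SPEAKER_DEVICE_INDEX
import pyaudiowpatch as pyaudio

with pyaudio.PyAudio() as p:
    for i in range(p.get_device_count()):
        dev = p.get_device_info_by_index(i)
        if dev.get("isLoopbackDevice"):
            print(f"[{dev['index']}] {dev['name']}")
```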
/README.md:
--------------------------------------------------------------------------------


# InterPilot

[English](README_en.md) | [中文](README.md)

[![Windows](https://img.shields.io/badge/Windows-Platform-blue?logo=windows)](https://www.microsoft.com/windows)
[![Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)](https://img.shields.io/badge/License-CC%20BY--NC%204.0-blue?logo=creativecommons)](https://creativecommons.org/licenses/by-nc/4.0/)
[![Python](https://img.shields.io/badge/Python-3.10-blue?logo=python)](https://www.python.org/)
[![PyQt5](https://img.shields.io/badge/PyQt5-5.15.4-blue?logo=qt)](https://pypi.org/project/PyQt5/)
[![FFmpeg](https://img.shields.io/badge/FFmpeg-4.4-blue?logo=ffmpeg)](https://www.ffmpeg.org/)
[![OpenAI](https://img.shields.io/badge/OpenAI-API-blue?logo=openai)](https://www.openai.com/)
[![SiliconFlow](https://img.shields.io/badge/SiliconFlow-API-blue?logo=siliconflow)](https://cloud.siliconflow.cn/i/TzKmtDJH)

本项目是一个基于 AI 的助手工具,能够从 Windows 的输入输出设备中捕获音频,将音频转为文字后,再调用 LLM(大语言模型)API 给出回答。项目主要包括录音、转写和 AI 回答三个模块,**旨在为个人的正当学习、工作、科研提供辅助支持。**

部分内测用户反映,本工具可能可以在面试、会议、学习等场景中提供一定的帮助,比如在在线会议软件中作为 AI 面试辅助工具:捕获面试官的音频并生成回答。但请注意:**本工具仅供学习交流使用,不得用于任何不正当用途**。

经测试,本工具**能够借助第三方工具隐藏界面,以防止被录屏软件、屏幕共享等功能录制到**,但工具本身不具备隐藏界面的功能。**是否使用第三方工具与作者无关,风险由用户自行承担。**


![InterPilot](doc_pic/logo.png)

如果对你有所帮助,可以通过[微信](doc_pic/QR.png)扫码打赏,感谢你的支持!
![赞助](doc_pic/QR.png)
## 目录

- [InterPilot](#interpilot)
  - [目录](#目录)
  - [灵感](#灵感)
  - [特性](#特性)
  - [项目结构](#项目结构)
  - [安装与依赖](#安装与依赖)
    - [系统依赖](#系统依赖)
    - [Python 依赖](#python-依赖)
  - [配置](#配置)
    - [具体配置说明](#具体配置说明)
      - [API](#api)
      - [录音设备索引](#录音设备索引)
  - [使用说明](#使用说明)
    - [单独测试模块](#单独测试模块)
    - [启动图形界面](#启动图形界面)
    - [注意事项](#注意事项)
    - [应对在线会议等软件的屏幕共享功能(如果你不想让别人看到本工具)](#应对在线会议等软件的屏幕共享功能如果你不想让别人看到本工具)
  - [待补充 / TODO](#待补充--todo)
  - [贡献](#贡献)
  - [免责声明 / Disclaimer](#免责声明--disclaimer)
  - [许可证](#许可证)

## 灵感

来源于 [YT-Chowww/InterviewCopilot](https://github.com/YT-Chowww/InterviewCopilot)


## 特性

- **音频捕获**
  使用 [LoopbackRecorder](src/audio_capture.py) 从系统录制音频(**支持 loopback 设备**),并保存为 WAV 文件。

- **语音转写**
  基于 [Whisper](https://github.com/openai/whisper) 模型在**本地进行音频转写**,支持多种模型规格(默认使用 `base` 模型)。

- **AI 辅助回答**
  通过调用 LLM API(配置在 `config.ini` 中)对转写后的文本进行分析,生成回答。支持**流式返回并实时更新界面**。

- **图形用户界面**
  基于 PyQt5 构建的简洁 GUI,支持录音、转写、发送文本至 LLM 等操作,并对 LLM 回复**支持 Markdown 渲染**。

![GUI](doc_pic/GUI.png)

## 项目结构

```
C:.
│  config.ini
│  logo.png
│  main.py
│  main_cmd.py
│  README.md
│  requirements.txt
│
├── output
└── src
    │  audio_capture.py
    │  llm_client.py
    │  transcriber.py
    │  __init__.py
    │
    └── utils
        │  config_loader.py
        │  __init__.py
```

- **config.ini**
  配置文件,包含 API 接口地址、API key、使用的模型、设备索引、默认提示词等参数。

- **logo.png**
  应用程序图标(用于 GUI 窗口)。

- **main.py / main_cmd.py**
  程序入口,负责启动图形界面和整体工作流程。

- **output/**
  存放录音文件。

- **requirements.txt**
  列出项目依赖的 Python 包(例如 PyQt5、markdown2、whisper、openai 等)。

- **src/**
  存放核心模块:
  - `audio_capture.py`:音频录制模块。
  - `transcriber.py`:语音转写模块。
  - `llm_client.py`:调用 LLM API 的客户端。
  - `utils/`:包含一些工具类和配置加载模块。

## 安装与依赖

### 系统依赖

- **FFmpeg**
  本项目依赖 [FFmpeg](https://www.gyan.dev/ffmpeg/) 进行部分音频处理,请确保已正确安装并配置环境变量。
  - **安装方法示例**:
    - Windows 用户:
      - 使用 [Scoop](https://scoop.sh/):
        ```bash
        scoop install ffmpeg
        ```
      - 或下载 Windows 预编译版本([下载链接](https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-full.7z)),将下载目录下的 `bin` 文件夹(例如 `C:\Users\USERNAME\scoop\apps\ffmpeg\7.1.1\bin`)添加到系统环境变量 `PATH` 中。
    - Mac 用户可使用 Homebrew 安装:
      ```bash
      brew install ffmpeg
      ```
- whisper 项目提到 `You may need rust installed as well`,所以可能需要安装 Rust(但不安装通常也没问题,建议先不装,如果 `transcriber.py` 不能正常运行再参考 [Whisper](https://github.com/openai/whisper))



### Python 依赖

建议使用 miniconda 或者 anaconda 创建虚拟环境(建议安装 `Python 3.10` 版本):

```bash
conda create -n interview python=3.10
conda activate interview
```

然后使用以下命令安装项目所需依赖:

```bash
pip install -r requirements.txt
```


## 配置

请根据实际情况修改根目录下的 `config.ini` 文件,其中包括:

- **API_URL**:LLM API 的地址。
- **API_KEY**:访问 API 的密钥。
- **MODEL**:调用的模型名称(例如 `deepseek-ai/DeepSeek-R1-Distill-Qwen-7B`,其他模型名称可以访问硅基流动([官网链接](https://cloud.siliconflow.cn/i/TzKmtDJH))的"模型广场"查看)。
- **SPEAKER_DEVICE_INDEX** 与 **MIC_DEVICE_INDEX**:录音设备的索引,视具体系统配置而定。建议阅读[录音设备索引](#录音设备索引)和[注意事项](#注意事项)部分。
- **OUTPUT_DIR**:存储录音文件的目录。
- **WHISPER_MODEL_SIZE**:[whisper](https://github.com/openai/whisper) 模型的大小,可选项为 `tiny`、`base`、`small`、`medium`、`large`、`turbo`。
- **DEFAULT_PROMPT**:**拼接**在发送给 LLM 的文本最前端的默认提示词,可根据使用场景调整,例如"你是一个XX方面的专家,你马上获取到的文本来自于XX,请你据此给出合理简洁的回答:"

### 具体配置说明

#### API
- 建议注册硅基流动([官网链接](https://cloud.siliconflow.cn/i/TzKmtDJH))获取 `API_KEY`,新用户受邀可获取 14 元额度(邀请码 `TzKmtDJH`),足够用一段时间了
- 官网左侧菜单栏 - API 密钥 - 新建 API 密钥,获取一段形如 `sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxx` 的长字符串,替换 `config.ini` 里的 `API_KEY` 即可
- **使用其他支持 OpenAI API 的服务也可以**,只需替换 `API_URL` 和 `API_KEY` 即可(还是建议使用 siliconflow,工具默认使用的 `deepseek-ai/DeepSeek-R1-Distill-Qwen-7B` 模型完全免费,白嫖万岁!)

#### 录音设备索引
- 默认 `SPEAKER_DEVICE_INDEX` 置为了 -1,这会自动寻找可用的默认 wasapi_loopback 设备,一般录制的就是你目前的扬声器(耳机)听到的声音;如果出现问题,建议手动运行 `audio_capture.py` 查看全部可用设备后,手动指定正确的设备。你也可以通过修改这个参数,使录制的是麦克风输入的声音。

```bash
python src/audio_capture.py
```

## 使用说明

### 单独测试模块

项目各核心模块(录音、转写、LLM 客户端)均包含简单的测试代码。你可以分别运行下列文件,检查各功能模块是否正常运行:

- `src/audio_capture.py` —— 用于实现音频录制功能(能够打印出系统中的音频设备列表)。
- `src/transcriber.py` —— 用于实现音频转写功能(首次运行会自动下载模型)。
- `src/llm_client.py` —— 用于实现 LLM 客户端功能(调用 LLM API 并返回回答)。


### 启动图形界面

运行 `main.py` 启动完整的面试助手 GUI:


```bash
python main.py
```

在 GUI 中你可以依次进行以下操作:

- **开始录音**:点击"开始录音"按钮,程序将自动生成唯一的录音文件名并开始录制音频。
- **结束录音**:点击"结束录音"按钮结束录音,录音文件保存在 `output` 目录中。
- **转写文字**:录音结束后(或手动点击),调用转写模块,将录音转为文字并显示在界面上。
- **发送给 LLM**:转写完成后,可以将文字发送至 LLM,生成 AI 回答,并在界面上显示支持 Markdown 格式的回复。
- **修改转写文字并发送给 LLM**

如果你想在终端中运行,可以使用 `main_cmd.py`:

```bash
python main_cmd.py
```

### 注意事项

- **录音设备**:根据设备不同,可能需要调整 `config.ini` 中的 `SPEAKER_DEVICE_INDEX` 和 `MIC_DEVICE_INDEX` 参数。默认设置下,录制的是扬声器(你听到)的声音,所以在没有声音播放的时候不会录到内容,必须播放一些音频或视频才能获取到音频;测试的时候可以放个视频。
- **环境变量**:确保 FFmpeg 已安装并已添加到环境变量 PATH 中,否则可能会影响音频处理。
- **测试验证**:建议先单独测试各模块,确认音频录制、转写和 LLM 回答均正常后再启动 GUI 整体运行。

### 应对在线会议等软件的屏幕共享功能(如果你不想让别人看到本工具)

使用 [shalzuth/WindowSharingHider](https://github.com/shalzuth/WindowSharingHider) 隐藏 UI 界面——太棒的工具了!又方便又好用!

任务栏中图标的隐藏:
- 直接使用 Windows 自带的任务栏隐藏功能,或者干脆把任务栏移到第二个显示器
- 使用一些隐藏工具(可以自己找一下)

使用 [turbotop](https://www.savardsoftware.com/turbotop/) 可以使得窗口始终置顶——也是很好用的工具

- **注意一下使用顺序,不然可能会出现问题**:
  - 先使用 turbotop 使得窗口置顶
  - 再使用 WindowSharingHider 隐藏 UI 界面
  - 如果不太行,就换一下顺序多试几下

![使用对比](doc_pic/Use.jpg)

## 待补充 / TODO

- [ ] 在 README 中增加详细的使用案例或截图(GUI 操作示例、终端输出示例等)。
- [ ] 增加 voice_generate 功能(TTS)——已经测试好,待集成
- [ ] 增加麦克风、扬声器音频共同识别功能
- [ ] 增加截图、上传 LLM 功能
- [ ] 任务栏中的图标隐藏功能

## 贡献

欢迎社区开发者提交 issue 或 pull request,一起完善这个工具。如果有任何建议或改进意见,请随时联系。


## 免责声明 / Disclaimer

本项目仅供技术学习与研究交流之用,严禁用于以下用途:
- 任何形式的求职面试作弊行为
- 侵犯他人隐私或商业秘密
- 违反当地法律法规的行为

使用者应对自身行为负全部法律责任,作者不承担任何因滥用本项目导致的直接或间接后果。使用即表示您已阅读并同意本声明。

## 许可证
本项目采用 [Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)](https://creativecommons.org/licenses/by-nc/4.0/) 许可证进行开源。
这意味着您可以自由地共享和修改本项目的内容,但**仅限于非商业用途**。


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import sys
import threading
import time
import ctypes

from PyQt5 import QtWidgets, QtCore, QtGui
from PyQt5.QtCore import QTimer
from PyQt5.QtWidgets import QMainWindow, QWidget, QGridLayout, QPushButton, QCheckBox, QTextBrowser, QLabel

import markdown2  # 请使用 pip install markdown2 安装此包

from src.audio_capture import LoopbackRecorder
from src.transcriber import SpeechTranscriber
from src.llm_client import LLMClient
import os
import configparser

# 获取当前文件的绝对路径,用绝对路径找到config.ini并读取(main.py 位于项目根目录)
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = current_dir
config_path = os.path.join(project_root, 'config.ini')
MYCONFIG = configparser.ConfigParser()
MYCONFIG.read(config_path, encoding='utf-8')


# RecorderThread 保持不变,全部录音操作在同一线程内进行
class RecorderThread(threading.Thread):
    def __init__(self, recorder, filename, duration=None):
        super().__init__()
        self.recorder = recorder
        self.filename = filename
        self.duration = duration

    def run(self):
        try:
            self.recorder.start_recording(self.filename)
            self.recorder.record(duration=self.duration)
        finally:
            self.recorder._cleanup()  # 在同一线程中释放资源

# 利用 Windows API 防止屏幕捕获
def prevent_screen_capture(winId):
    try:
        WDA_MONITOR = 1  # 仅在支持该API的 Windows 系统有效
        ctypes.windll.user32.SetWindowDisplayAffinity(int(winId), WDA_MONITOR)
    except Exception as e:
        print("屏幕保护设置失败:", e)

class InterviewAssistantGUI(QMainWindow):
    def __init__(self, config):
        super().__init__()
        self.setWindowTitle("InterPilot")
        # 设置图标
        self.setWindowIcon(QtGui.QIcon('logo.png'))
        self.setGeometry(100, 100, 1200, 900)
        # 创建中心控件与布局
        central = QWidget()
        self.setCentralWidget(central)
        layout = QGridLayout(central)

        # 状态变量与模块初始化
        self.recorder = None
        self.recording_thread = None
        self.current_filename = ""
        self.llm_full_text = ""
        self.llm_client_ask_cnt = 1
        self.default_prompt = MYCONFIG['DEFAULT']['DEFAULT_PROMPT']

        self.transcriber = SpeechTranscriber()
        self.llm_client = LLMClient()  # API 地址与密钥从 config.ini 读取

        # 按钮与控件
        self.start_btn = QPushButton("开始录音")
        self.start_btn.clicked.connect(self.start_recording)
        layout.addWidget(self.start_btn, 0, 0)

        self.stop_btn = QPushButton("结束录音")
        self.stop_btn.clicked.connect(self.stop_recording)
        self.stop_btn.setEnabled(False)
        layout.addWidget(self.stop_btn, 0, 1)

        self.transcribe_btn = QPushButton("转写文字")
        self.transcribe_btn.clicked.connect(self.transcribe_audio)
        self.transcribe_btn.setEnabled(False)
        layout.addWidget(self.transcribe_btn, 0, 2)

        self.send_llm_btn = QPushButton("发送给 LLM")
        self.send_llm_btn.clicked.connect(self.send_to_llm)
        self.send_llm_btn.setEnabled(False)
        layout.addWidget(self.send_llm_btn, 0, 3)

        self.auto_transcribe_chk = QCheckBox("结束录音后自动转文字")
        self.auto_transcribe_chk.setChecked(True)
        layout.addWidget(self.auto_transcribe_chk, 1, 0, 1, 2)

        self.auto_send_llm_chk = QCheckBox("转文字后自动发送给 LLM")
        self.auto_send_llm_chk.setChecked(True)
        layout.addWidget(self.auto_send_llm_chk, 1, 1, 1, 2)

        # 创建是否自动滚动scrollbar的勾选框
        self.auto_scroll_chk = QCheckBox("自动滚动")
        self.auto_scroll_chk.setChecked(True)
        # 放在第二行第三列
        layout.addWidget(self.auto_scroll_chk, 1, 2, 1, 2)

        # 转写文本显示区域
        self.transcription_browser = QTextBrowser()
        self.transcription_browser.setPlaceholderText("转写内容将显示在这里...")
        # 设置为可编辑
        self.transcription_browser.setReadOnly(False)
        # 设置为可拖拽
        self.transcription_browser.setAcceptDrops(True)
        layout.addWidget(self.transcription_browser, 2, 0, 1, 4)
        # 高度固定
        self.transcription_browser.setFixedHeight(150)  # 或者使用 setMinimumHeight(150)

        # LLM 回复区域:支持 Markdown 渲染
        self.llm_response_browser = QTextBrowser()
        self.llm_response_browser.setPlaceholderText("LLM回复将显示在这里(支持Markdown)...")
        # 设置为可拖拽
        self.llm_response_browser.setAcceptDrops(True)
        layout.addWidget(self.llm_response_browser, 3, 0, 1, 4)

        # 设置滑动条行为:滑动条在最下面时自动滚动到最新llm_response输出
        self.llm_response_browser.textChanged.connect(self.auto_scroll_llm_response)

        self.status_label = QLabel("就绪")
        layout.addWidget(self.status_label, 4, 0, 1, 4)

        # 在窗口显示后调用防屏幕捕获设置(仅 Windows 有效)
        QTimer.singleShot(100, self.apply_screen_capture_protection)

    def auto_scroll_llm_response(self):
        cursor = self.llm_response_browser.textCursor()
        cursor.movePosition(QtGui.QTextCursor.End)
        self.llm_response_browser.setTextCursor(cursor)
        self.llm_response_browser.ensureCursorVisible()

    def apply_screen_capture_protection(self):
        if sys.platform.startswith("win"):
            prevent_screen_capture(self.winId())

    def start_recording(self):
        try:
            # 为每次录音生成唯一文件名
            filename = f"interview_{int(time.time())}.wav"
            # 拼接OUTPUT_DIR和filename
            filename = os.path.join(MYCONFIG['DEFAULT']['OUTPUT_DIR'], filename)
            print(f"开始录音: {filename}")
            self.recorder = LoopbackRecorder(device_index=MYCONFIG['DEFAULT'].getint('SPEAKER_DEVICE_INDEX'))
            device_info = self.recorder.device_info
            self.recording_thread = RecorderThread(self.recorder, filename)
            self.recording_thread.start()
            self.current_filename = filename
            self.status_label.setText(f"录音中... 设备为:({device_info['index']})({device_info['name']})")
            self.start_btn.setEnabled(False)
            self.stop_btn.setEnabled(True)
        except Exception as e:
            print(f"启动录音失败: {e}")
            self.status_label.setText("录音启动失败")

    def stop_recording(self):
        try:
            self.recorder.is_recording = False
            self.start_btn.setEnabled(True)
            self.stop_btn.setEnabled(False)
            # 先等待录音线程写完并关闭文件,再检查文件大小
            self.recording_thread.join()
            # 如果保存的文件大小为0,说明录音失败
            if os.path.getsize(self.current_filename) < 1:
                self.status_label.setText("录音失败:文件大小为0,可能是没有音频输入或输出")
                return
            self.status_label.setText("录音已停止")

            self.transcribe_btn.setEnabled(True)
            if self.auto_transcribe_chk.isChecked():
                self.transcribe_audio()

        except Exception as e:
            print(f"停止录音失败: {e}")
            self.status_label.setText("停止录音失败")


    def transcribe_audio(self):
        self.transcription_browser.clear()
        self.status_label.setText("转写中...")
        try:
            text = self.transcriber.transcribe(self.current_filename)
            self.transcription_browser.setPlainText(text)
            self.status_label.setText("转写完成")
            self.send_llm_btn.setEnabled(True)
            self.transcribe_btn.setEnabled(False)
            if self.auto_send_llm_chk.isChecked():
                self.send_to_llm()
        except Exception as e:
            print(f"转写失败: {e}")
            self.transcription_browser.setPlainText(f"转写失败: {e}\n")

    def send_to_llm(self):
        self.status_label.setText("LLM thinking...")
        self.llm_response_browser.clear()
        transcription = self.transcription_browser.toPlainText().strip()
        # 先检查转写内容是否为空,再拼接DEFAULT_PROMPT(拼接后文本必然非空,检查会失效)
        if not transcription:
            self.llm_response_browser.setPlainText("转写文字为空,请先转写音频!\n")
            return
        # 把DEFAULT_PROMPT拼接到转写的文本前面
        transcription = f"{self.default_prompt}\n{transcription}"
        threading.Thread(target=self.llm_thread, args=(transcription,), daemon=True).start()

    def llm_thread(self, text):
        # 回调函数:累积流式返回的文本,并用 markdown2 转换为 HTML 更新界面
        def update_ui(new_text):
            self.llm_full_text += new_text
            html = markdown2.markdown(self.llm_full_text)
            # 使用 QueuedConnection 确保线程安全更新
            QtCore.QMetaObject.invokeMethod(
                self.llm_response_browser, "setHtml", QtCore.Qt.QueuedConnection, QtCore.Q_ARG(str, html)
            )

        try:
            self.llm_client.get_response(text, callback=update_ui)
        except Exception as e:
            QtCore.QMetaObject.invokeMethod(
                self.llm_response_browser, "append", QtCore.Qt.QueuedConnection, QtCore.Q_ARG(str, f"\nLLM调用失败: {e}")
            )
        # 状态栏更新同样通过 QueuedConnection 回到主线程执行,避免在工作线程中直接操作控件
        QtCore.QMetaObject.invokeMethod(
            self.status_label, "setText", QtCore.Qt.QueuedConnection, QtCore.Q_ARG(str, "LLM处理完成")
        )
        # 更新调用次数并在回复框中显示,并绘制一个分割线(用markdown显示)
        divider = f"\n\n**第 {self.llm_client_ask_cnt} 次调用完成**\n\n---\n"
        self.llm_full_text += divider
        html = markdown2.markdown(self.llm_full_text)
        QtCore.QMetaObject.invokeMethod(
            self.llm_response_browser, "setHtml", QtCore.Qt.QueuedConnection, QtCore.Q_ARG(str, html)
        )


        self.llm_client_ask_cnt += 1

if __name__ == "__main__":
    app = QtWidgets.QApplication(sys.argv)
    window = InterviewAssistantGUI(MYCONFIG)
    window.show()
    sys.exit(app.exec_())
--------------------------------------------------------------------------------
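main.py 中"流式回调 → markdown2 渲染 → QueuedConnection 跨线程刷新"是整条 UI 更新链路的核心。下面把该模式抽成一个独立的最小骨架,便于脱离 GUI 其他细节单独理解(函数与变量命名均为示意):

```python
# 示意:main.py 中流式刷新模式的最小骨架
import threading
from PyQt5 import QtCore
import markdown2

def stream_llm_to_browser(llm_client, prompt, browser):
    """后台线程流式取回 LLM 回复,经 markdown2 渲染后线程安全地刷新 QTextBrowser。"""
    chunks = []

    def on_chunk(new_text):
        chunks.append(new_text)
        html = markdown2.markdown("".join(chunks))
        # setHtml 是槽函数;QueuedConnection 保证它在主线程中执行
        QtCore.QMetaObject.invokeMethod(
            browser, "setHtml", QtCore.Qt.QueuedConnection, QtCore.Q_ARG(str, html)
        )

    threading.Thread(
        target=lambda: llm_client.get_response(prompt, callback=on_chunk),
        daemon=True,
    ).start()
```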
/README_en.md:
--------------------------------------------------------------------------------


# InterPilot

[English](README_en.md) | [中文](README.md)

[![Windows](https://img.shields.io/badge/Windows-Platform-blue?logo=windows)](https://www.microsoft.com/windows)
[![Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)](https://img.shields.io/badge/License-CC%20BY--NC%204.0-blue?logo=creativecommons)](https://creativecommons.org/licenses/by-nc/4.0/)
[![Python](https://img.shields.io/badge/Python-3.10-blue?logo=python)](https://www.python.org/)
[![PyQt5](https://img.shields.io/badge/PyQt5-5.15.4-blue?logo=qt)](https://pypi.org/project/PyQt5/)
[![FFmpeg](https://img.shields.io/badge/FFmpeg-4.4-blue?logo=ffmpeg)](https://www.ffmpeg.org/)
[![OpenAI](https://img.shields.io/badge/OpenAI-API-blue?logo=openai)](https://www.openai.com/)
[![SiliconFlow](https://img.shields.io/badge/SiliconFlow-API-blue?logo=siliconflow)](https://cloud.siliconflow.cn/i/TzKmtDJH)

InterPilot is an AI-based assistant tool that captures audio from Windows input and output devices, transcribes the audio into text, and then calls an LLM (Large Language Model) API to generate responses. The project comprises three main modules—recording, transcription, and AI response—**aiming to support legitimate personal study, work, and research.**

Some beta testers have reported that this tool may be helpful in scenarios such as interviews, meetings, and learning. For instance, it can serve as an AI interview assistant in online meeting software by capturing the interviewer's audio and generating responses. However, please note that **this tool is intended solely for learning and communication purposes and must not be used for any improper activities.**

Through testing, this tool can leverage third-party utilities to hide its interface so that it is not recorded by screen recording or screen sharing software. However, the tool itself does not possess interface hiding capabilities. **Whether you use third-party tools is not the author's responsibility; the risk is solely borne by the user.**

![InterPilot](doc_pic/logo.png)

## Table of Contents

- [InterPilot](#interpilot)
  - [Table of Contents](#table-of-contents)
  - [Inspiration](#inspiration)
  - [Features](#features)
  - [Project Structure](#project-structure)
  - [Installation \& Dependencies](#installation--dependencies)
    - [System Dependencies](#system-dependencies)
    - [Python Dependencies](#python-dependencies)
  - [Configuration](#configuration)
    - [Detailed Configuration Instructions](#detailed-configuration-instructions)
      - [API](#api)
      - [Recording Device Index](#recording-device-index)
  - [Usage Instructions](#usage-instructions)
    - [Testing Individual Modules](#testing-individual-modules)
    - [Launching the Graphical User Interface](#launching-the-graphical-user-interface)
    - [Notes](#notes)
    - [Handling Screen Sharing and UI Hiding (if you wish to keep the tool hidden during meetings)](#handling-screen-sharing-and-ui-hiding-if-you-wish-to-keep-the-tool-hidden-during-meetings)
  - [TODO](#todo)
  - [Contribution](#contribution)
  - [⚠️ Disclaimer](#️-disclaimer)
  - [License](#license)



## Inspiration

Inspired by [YT-Chowww/InterviewCopilot](https://github.com/YT-Chowww/InterviewCopilot)

## Features

- **Audio Capture**
  Uses [LoopbackRecorder](src/audio_capture.py) to record audio from the system (with **support for loopback devices**) and saves it as a WAV file.

- **Speech Transcription**
  Performs **local audio transcription** using the [Whisper](https://github.com/openai/whisper) model. It supports various model sizes (default is the `base` model).

- **AI-Assisted Response**
  Analyzes the transcribed text and generates responses by calling the LLM API (configured in `config.ini`). It supports **streaming responses with real-time UI updates**.

- **Graphical User Interface**
  A clean GUI built with PyQt5 that supports recording, transcription, sending text to the LLM, and renders LLM responses with **Markdown support**.

![GUI](doc_pic/GUI.png)

## Project Structure

```
C:.
│  config.ini
│  logo.png
│  main.py
│  main_cmd.py
│  README.md
│  requirements.txt
│
├── output
└── src
    │  audio_capture.py
    │  llm_client.py
    │  transcriber.py
    │  __init__.py
    │
    └── utils
        │  config_loader.py
        │  __init__.py
```

- **config.ini**
  Configuration file containing the API endpoint, API key, model to use, device indices, default prompt, etc.

- **logo.png**
  Application icon used in the GUI.

- **main.py / main_cmd.py**
  Entry points for the program, responsible for launching the GUI and the overall workflow.

- **output/**
  Directory for storing recorded audio files.

- **requirements.txt**
  Lists the Python package dependencies (such as PyQt5, markdown2, whisper, openai, etc.).

- **src/**
  Contains the core modules:
  - `audio_capture.py`: Audio recording module.
  - `transcriber.py`: Speech transcription module.
  - `llm_client.py`: Client for calling the LLM API.
  - `utils/`: Contains additional utility classes and configuration loader modules.

## Installation & Dependencies

### System Dependencies

- **FFmpeg**
  This project depends on [FFmpeg](https://www.gyan.dev/ffmpeg/) for some audio processing tasks. Please ensure FFmpeg is properly installed and added to your system's PATH.
  - **Example Installation Methods**:
    - **For Windows Users**:
      - Using [Scoop](https://scoop.sh/):
        ```bash
        scoop install ffmpeg
        ```
      - Or download the Windows precompiled version (see [Download Link](https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-full.7z)).
      - Add the `bin` folder from the downloaded directory (e.g., `C:\Users\USERNAME\scoop\apps\ffmpeg\7.1.1\bin`) to your system PATH.
    - **For macOS Users**:
      ```bash
      brew install ffmpeg
      ```
  - The Whisper project mentions that "You may need rust installed as well," so if you encounter issues with `transcriber.py`, consider installing Rust (though it usually works without it).

### Python Dependencies

It is recommended to create a virtual environment using Miniconda or Anaconda (suggested Python version: 3.10):

```bash
conda create -n interview python=3.10
conda activate interview
```

Then install the required Python packages:

```bash
pip install -r requirements.txt
```

## Configuration

Please modify the `config.ini` file in the root directory according to your setup, including:

- **API_URL**: The LLM API endpoint.
- **API_KEY**: Your API access key.
- **MODEL**: The model name to be used (e.g., `deepseek-ai/DeepSeek-R1-Distill-Qwen-7B`). Other model names can be viewed on the Siliconflow website (see [Official Link](https://cloud.siliconflow.cn/i/TzKmtDJH)).
- **SPEAKER_DEVICE_INDEX** and **MIC_DEVICE_INDEX**: The indices of the recording devices, depending on your system configuration. It is recommended to read the [Recording Device Index](#recording-device-index) and [Notes](#notes) sections.
- **OUTPUT_DIR**: Directory to store the recorded audio files.
- **WHISPER_MODEL_SIZE**: Size of the Whisper model. Options include `tiny`, `base`, `small`, `medium`, `large`, `turbo`.
- **DEFAULT_PROMPT**: The default prompt that is **prepended to the text sent to the LLM**; it can be adjusted to your scenario, for example: "You are an expert in XX, and the text you are about to receive comes from XX. Please provide a reasonable and concise answer based on this:"

### Detailed Configuration Instructions

#### API
- It is recommended to register on Siliconflow (see [Official Link](https://cloud.siliconflow.cn/i/TzKmtDJH)) to obtain an `API_KEY`. New users can get a free credit (invite code `TzKmtDJH`) which is sufficient for some time.
- On the website, go to the left sidebar -> API Keys -> Create a new API key. Replace the long string (e.g., `sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxx`) in `config.ini` with your new API key.
- **Other services supporting the OpenAI API can be used as well** by replacing `API_URL` and `API_KEY` (though Siliconflow is recommended because the tool uses the free `deepseek-ai/DeepSeek-R1-Distill-Qwen-7B` model).

#### Recording Device Index
- The default `SPEAKER_DEVICE_INDEX` is set to -1, which automatically finds an available default WASAPI loopback device (usually recording what is heard through your speakers or headphones). If issues occur, run `audio_capture.py` to list all available devices and manually specify the correct device. You may also adjust this parameter to record from the microphone instead.

```bash
python src/audio_capture.py
```

## Usage Instructions

### Testing Individual Modules

Each core module (recording, transcription, and LLM client) contains simple test code. You can run the following files individually to verify that each module works correctly:

- `src/audio_capture.py` — Implements audio recording (lists system audio devices).
- `src/transcriber.py` — Implements audio transcription (the model will be automatically downloaded on first run).
- `src/llm_client.py` — Implements the LLM client (calls the LLM API and returns responses).

### Launching the Graphical User Interface

Run `main.py` to launch the full InterPilot GUI:

```bash
python main.py
```

In the GUI, you can perform the following operations sequentially:

- **Start Recording**: Click the "Start Recording" button. The program will generate a unique filename and start recording audio.
- **Stop Recording**: Click the "Stop Recording" button to end the recording. The audio file is saved in the `output` directory.
- **Transcribe Audio**: After recording (or manually triggering), the transcription module converts the audio to text and displays it in the interface.
- **Send to LLM**: Once transcription is complete, the text can be sent to the LLM to generate an AI response, which will be displayed with Markdown support.
- **Modify Transcribed Text and Resend to LLM** if needed.

If you prefer running the tool in a command-line mode, you can use `main_cmd.py`:

```bash
python main_cmd.py
```

### Notes

- **Recording Devices**: Depending on your system, you may need to adjust `SPEAKER_DEVICE_INDEX` and `MIC_DEVICE_INDEX` in `config.ini`.
- **Environment Variables**: Ensure FFmpeg is installed and added to the PATH; otherwise, audio processing might be affected.
- **Testing**: It is recommended to test each module individually to confirm that audio recording, transcription, and LLM response work correctly before running the full GUI.

### Handling Screen Sharing and UI Hiding (if you wish to keep the tool hidden during meetings)

- Use [shalzuth/WindowSharingHider](https://github.com/shalzuth/WindowSharingHider) to hide the UI—an excellent tool that is both convenient and effective!
- **Taskbar Icon Hiding**:
  - You can use Windows' built-in taskbar icon hiding features, or simply move the taskbar to a secondary monitor.
  - Alternatively, you may find third-party hiding tools (feel free to search for one that suits your needs).
- Using [turbotop](https://www.savardsoftware.com/turbotop/) can keep the window always on top—another very useful tool.
- **Important**: The order of operations may affect the outcome:
  - First, use turbotop to set the window to always on top.
  - Then, use WindowSharingHider to hide the UI.
  - If the results are not satisfactory, try altering the order.

![Usage Comparison](doc_pic/Use.jpg)

## TODO

- [ ] Add more detailed usage examples or screenshots (GUI operation examples, terminal output, etc.) in the README.
- [ ] Integrate a voice generation feature (TTS) – already tested and pending integration.
- [ ] Add functionality for simultaneous recognition of both microphone and speaker audio.
- [ ] Add a feature to upload screenshots and send them to the LLM.
- [ ] Implement the taskbar icon hiding feature.

## Contribution

Contributions are welcome! Feel free to submit issues or pull requests to help improve the tool. If you have any suggestions or improvements, please contact us.


## ⚠️ Disclaimer

This project is intended solely for technical learning and research purposes. It must not be used for:
- Any form of interview cheating.
- Infringing on others' privacy or trade secrets.
- Any actions that violate local laws and regulations.

Users are solely responsible for any legal consequences resulting from misuse. By using this project, you acknowledge that you have read and agreed to this disclaimer.

## License

This project is licensed under the [Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)](https://creativecommons.org/licenses/by-nc/4.0/) license.
This means you are free to share and modify the project's contents **for non-commercial purposes only**.
--------------------------------------------------------------------------------