├── server ├── env.example ├── requirements.txt ├── pyproject.toml ├── .gitignore ├── kokoro_worker.py ├── marvis_worker.py ├── bot.py └── tts_mlx_isolated.py ├── client ├── src │ └── app │ │ ├── favicon.ico │ │ ├── page.tsx │ │ └── layout.tsx ├── next.config.ts ├── eslint.config.mjs ├── .gitignore ├── tsconfig.json └── package.json ├── assets └── debug-console-screenshot.png ├── .gitignore └── README.md /server/env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY="" 2 | DEEPGRAM_API_KEY="" 3 | RIME_API_KEY="" -------------------------------------------------------------------------------- /client/src/app/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwindla/macos-local-voice-agents/HEAD/client/src/app/favicon.ico -------------------------------------------------------------------------------- /assets/debug-console-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwindla/macos-local-voice-agents/HEAD/assets/debug-console-screenshot.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .env 3 | .venv 4 | .vscode 5 | .next 6 | node_modules 7 | venv 8 | __pycache__ 9 | .env.local 10 | .env.development.local 11 | .env.test.local 12 | .env.production.local -------------------------------------------------------------------------------- /server/requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv 2 | fastapi[all] 3 | uvicorn 4 | opencv-python 5 | mlx-lm 6 | mlx-audio 7 | pipecat-ai[openai,deepgram,rime,silero,mlx-whisper]>=0.0.80 8 | aiortc 9 | nltk 10 | 11 | 12 | -------------------------------------------------------------------------------- /client/next.config.ts: -------------------------------------------------------------------------------- 1 | import type { NextConfig } from "next"; 2 | 3 | const nextConfig: NextConfig = { 4 | async rewrites() { 5 | return [ 6 | { 7 | source: "/api/:path*", 8 | destination: "http://0.0.0.0:7860/api/:path*", 9 | }, 10 | ]; 11 | }, 12 | }; 13 | 14 | export default nextConfig; 15 | -------------------------------------------------------------------------------- /client/eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import { dirname } from "path"; 2 | import { fileURLToPath } from "url"; 3 | import { FlatCompat } from "@eslint/eslintrc"; 4 | 5 | const __filename = fileURLToPath(import.meta.url); 6 | const __dirname = dirname(__filename); 7 | 8 | const compat = new FlatCompat({ 9 | baseDirectory: __dirname, 10 | }); 11 | 12 | const eslintConfig = [ 13 | ...compat.extends("next/core-web-vitals", "next/typescript"), 14 | ]; 15 | 16 | export default eslintConfig; 17 | -------------------------------------------------------------------------------- /client/src/app/page.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import { 4 | ConsoleTemplate, 5 | FullScreenContainer, 6 | ThemeProvider, 7 | } from "@pipecat-ai/voice-ui-kit"; 8 | 9 | export default function Home() { 10 | return ( 11 | 12 | 13 | 20 | 21 | 22 | ); 23 | } -------------------------------------------------------------------------------- /client/.gitignore: 
-------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.* 7 | .yarn/* 8 | !.yarn/patches 9 | !.yarn/plugins 10 | !.yarn/releases 11 | !.yarn/versions 12 | 13 | # testing 14 | /coverage 15 | 16 | # next.js 17 | /.next/ 18 | /out/ 19 | 20 | # production 21 | /build 22 | 23 | # misc 24 | .DS_Store 25 | *.pem 26 | 27 | # debug 28 | npm-debug.log* 29 | yarn-debug.log* 30 | yarn-error.log* 31 | .pnpm-debug.log* 32 | 33 | # env files (can opt-in for committing if needed) 34 | .env* 35 | 36 | # vercel 37 | .vercel 38 | 39 | # typescript 40 | *.tsbuildinfo 41 | next-env.d.ts 42 | -------------------------------------------------------------------------------- /server/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "macos-local-voice-agents-server" 3 | version = "0.1.0" 4 | description = "Local voice agents server for macOS" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "pip", 9 | "python-dotenv", 10 | "fastapi[all]", 11 | "uvicorn", 12 | "opencv-python", 13 | "mlx-lm", 14 | "mlx-audio", 15 | "pipecat-ai[openai,deepgram,rime,silero,mlx-whisper]>=0.0.81", 16 | "pipecat-ai-small-webrtc-prebuilt", 17 | "aiortc", 18 | "nltk", 19 | "pyobjc-core", 20 | "pyobjc-framework-Quartz", 21 | "pyobjc-framework-Cocoa", 22 | "pyobjc-framework-ApplicationServices", 23 | ] 24 | -------------------------------------------------------------------------------- /client/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2017", 4 | "lib": ["dom", "dom.iterable", "esnext"], 5 | "allowJs": true, 6 | "skipLibCheck": true, 7 | "strict": true, 8 | "noEmit": true, 9 | "esModuleInterop": true, 10 | "module": "esnext", 11 | "moduleResolution": "bundler", 12 | "resolveJsonModule": true, 13 | "isolatedModules": true, 14 | "jsx": "preserve", 15 | "incremental": true, 16 | "plugins": [ 17 | { 18 | "name": "next" 19 | } 20 | ], 21 | "paths": { 22 | "@/*": ["./src/*"] 23 | } 24 | }, 25 | "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"], 26 | "exclude": ["node_modules"] 27 | } 28 | -------------------------------------------------------------------------------- /client/src/app/layout.tsx: -------------------------------------------------------------------------------- 1 | import type { Metadata } from "next"; 2 | import { Geist, Geist_Mono } from "next/font/google"; 3 | 4 | import "@pipecat-ai/voice-ui-kit/styles.css"; 5 | 6 | const geistSans = Geist({ 7 | variable: "--font-geist-sans", 8 | subsets: ["latin"], 9 | }); 10 | 11 | const geistMono = Geist_Mono({ 12 | variable: "--font-geist-mono", 13 | subsets: ["latin"], 14 | }); 15 | 16 | export const metadata: Metadata = { 17 | title: "Voice UI Kit - Console Template Example", 18 | }; 19 | 20 | export default function RootLayout({ 21 | children, 22 | }: Readonly<{ 23 | children: React.ReactNode; 24 | }>) { 25 | return ( 26 | 27 | 28 | {children} 29 | 30 | 31 | ); 32 | } -------------------------------------------------------------------------------- /server/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | env/ 3 | __pycache__/ 4 | *~ 5 | venv 6 | .venv 7 | /.idea 8 | #*# 9 | 10 | # Distribution / Packaging 11 | .Python 12 | build/ 13 | 
develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | .DS_Store 30 | .env 31 | fly.toml 32 | 33 | # Examples 34 | examples/telnyx-chatbot/templates/streams.xml 35 | examples/twilio-chatbot/templates/streams.xml 36 | examples/**/node_modules/ 37 | examples/**/.expo/ 38 | examples/**/dist/ 39 | examples/**/npm-debug.* 40 | examples/**/*.jks 41 | examples/**/*.p8 42 | examples/**/*.p12 43 | examples/**/*.key 44 | examples/**/*.mobileprovision 45 | examples/**/*.orig.* 46 | examples/**/web-build/ 47 | 48 | # macOS 49 | .DS_Store 50 | 51 | # Documentation 52 | docs/api/_build/ 53 | docs/api/api -------------------------------------------------------------------------------- /client/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "client", 3 | "version": "0.1.0", 4 | "private": true, 5 | "scripts": { 6 | "dev": "next dev", 7 | "build": "next build", 8 | "start": "next start", 9 | "lint": "next lint" 10 | }, 11 | "dependencies": { 12 | "@pipecat-ai/client-js": "^1.2.0", 13 | "@pipecat-ai/client-react": "^1.0.1", 14 | "@pipecat-ai/daily-transport": "^1.0.0", 15 | "@pipecat-ai/small-webrtc-transport": "^1.2.0", 16 | "@pipecat-ai/voice-ui-kit": "^0.2.0", 17 | "next": "15.4.3", 18 | "react": "19.1.0", 19 | "react-dom": "19.1.0" 20 | }, 21 | "devDependencies": { 22 | "@eslint/eslintrc": "^3", 23 | "@tailwindcss/postcss": "^4", 24 | "@types/node": "^20", 25 | "@types/react": "^19", 26 | "@types/react-dom": "^19", 27 | "eslint": "^9", 28 | "eslint-config-next": "15.4.3", 29 | "tailwindcss": "^4", 30 | "typescript": "^5" 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /server/kokoro_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Standalone Kokoro TTS worker process. 4 | 5 | This worker runs in complete isolation to avoid Metal threading conflicts. 6 | It communicates via JSON over stdin/stdout. 
7 | 8 | Usage: 9 | python kokoro_worker.py 10 | 11 | Commands: 12 | {"cmd": "init", "model": "mlx-community/Kokoro-82M-bf16", "voice": "af_heart"} 13 | {"cmd": "generate", "text": "Hello world"} 14 | """ 15 | 16 | import sys 17 | import json 18 | import base64 19 | import traceback 20 | import numpy as np 21 | 22 | # Add logging to worker 23 | import logging 24 | logging.basicConfig(level=logging.INFO, format='WORKER: %(message)s') 25 | 26 | try: 27 | import mlx.core as mx 28 | from mlx_audio.tts.utils import load_model 29 | MLX_AVAILABLE = True 30 | except ImportError: 31 | MLX_AVAILABLE = False 32 | 33 | 34 | class Worker: 35 | def __init__(self): 36 | self.model = None 37 | self.voice = None 38 | 39 | def initialize(self, model_name, voice): 40 | if not MLX_AVAILABLE: 41 | return {"error": "MLX not available"} 42 | try: 43 | self.model = load_model(model_name) 44 | self.voice = voice 45 | # Test generation to ensure everything works 46 | list(self.model.generate(text="test", voice=voice, speed=1.0)) 47 | return {"success": True} 48 | except Exception as e: 49 | return {"error": str(e)} 50 | 51 | def generate(self, text): 52 | try: 53 | if not self.model: 54 | return {"error": "Not initialized"} 55 | 56 | segments = [] 57 | for result in self.model.generate(text=text, voice=self.voice, speed=1.0): 58 | # Convert MLX array to numpy immediately 59 | audio_data = np.array(result.audio, copy=True) 60 | print(f"Generated segment shape: {audio_data.shape}, min: {audio_data.min():.4f}, max: {audio_data.max():.4f}", file=sys.stderr) 61 | segments.append(audio_data) 62 | 63 | if not segments: 64 | return {"error": "No audio"} 65 | 66 | # Concatenate all segments 67 | if len(segments) == 1: 68 | audio = segments[0] 69 | else: 70 | audio = np.concatenate(segments, axis=0) 71 | 72 | print(f"Final audio shape: {audio.shape}, min: {audio.min():.4f}, max: {audio.max():.4f}", file=sys.stderr) 73 | 74 | # Check if audio is silent 75 | if np.max(np.abs(audio)) < 1e-6: 76 | return {"error": "Generated audio is silent"} 77 | 78 | # Convert to 16-bit PCM 79 | audio_int16 = (audio * 32767).astype(np.int16) 80 | audio_b64 = base64.b64encode(audio_int16.tobytes()).decode() 81 | 82 | return {"success": True, "audio": audio_b64} 83 | except Exception as e: 84 | import traceback 85 | return {"error": f"{str(e)}\n{traceback.format_exc()}"} 86 | 87 | 88 | def main(): 89 | """Main worker loop - reads commands from stdin, writes responses to stdout.""" 90 | worker = Worker() 91 | 92 | for line in sys.stdin: 93 | try: 94 | req = json.loads(line.strip()) 95 | if req["cmd"] == "init": 96 | resp = worker.initialize(req["model"], req["voice"]) 97 | elif req["cmd"] == "generate": 98 | resp = worker.generate(req["text"]) 99 | else: 100 | resp = {"error": "Unknown command"} 101 | print(json.dumps(resp), flush=True) 102 | except Exception as e: 103 | print(json.dumps({"error": str(e)}), flush=True) 104 | 105 | 106 | if __name__ == "__main__": 107 | main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Local voice agents on macOS with Pipecat 2 | 3 | ![screenshot](assets/debug-console-screenshot.png) 4 | 5 | Pipecat is an open-source, vendor-neutral framework for building real-time voice (and video) AI applications. 6 | 7 | This repository contains an example of a voice agent running with all local models on macOS. 
On an M-series Mac, you can achieve voice-to-voice latency of <800 ms with relatively strong models. 8 | 9 | The [server/bot.py](server/bot.py) file uses these models: 10 | 11 | - Silero VAD 12 | - smart-turn v2 13 | - MLX Whisper 14 | - Gemma3n 4B 15 | - Kokoro TTS 16 | 17 | But you can swap any of them out for other models, or completely reconfigure the pipeline. It's easy to add tool calling, MCP server integrations, use parallel pipelines to do async inference alongside the voice conversations, add custom processing steps, configure interrupt handling to work differently, etc. 18 | 19 | The bot and web client here communicate using a low-latency, local, serverless WebRTC connection. For more information on serverless WebRTC, see the Pipecat [SmallWebRTCTransport docs](https://docs.pipecat.ai/server/services/transport/small-webrtc) and this [article](https://www.daily.co/blog/you-dont-need-a-webrtc-server-for-your-voice-agents/). You could switch over to a different Pipecat transport (for example, a WebSocket-based transport), but WebRTC is the best choice for realtime audio. 20 | 21 | For a deep dive into voice AI, including network transport, optimizing for latency, and notes on designing tool calling and complex workflows, see the [Voice AI & Voice Agents Illustrated Guide](https://voiceaiandvoiceagents.com/). 22 | 23 | # Models and dependencies 24 | 25 | Silero VAD and MLX Whisper run inside the Pipecat process. When the agent code starts, it will need to download model weights that aren't already cached, so first startup can take some time. 26 | 27 | The LLM service in this bot uses the OpenAI-compatible chat completion HTTP API. So you will need to run a local OpenAI-compatible LLM server. 28 | 29 | One easy, high-performance way to run a local LLM server on macOS is [LM Studio](https://lmstudio.ai/). From inside the LM Studio graphical interface, go to the "Developer" tab on the far left to start an HTTP server. 30 | 31 | # Run the voice agent 32 | 33 | The core voice agent code lives in a single file: [server/bot.py](server/bot.py). There's one custom service here that's not included in Pipecat core: we implemented a local MLX-Audio frame processor on top of the excellent [mlx-audio library](https://github.com/Blaizzy/mlx-audio). 34 | 35 | Note that the first time you start the bot it will take some time to initialize the local models. It can be 30 seconds or more before the bot is fully ready to go. Subsequent startups will be much faster. 36 | 37 | It's not a bad idea to run a quick `mlx-audio.generate` process from the command line before you run the bot the first time, so you're not waiting for a relatively big HuggingFace model download for the voice model. 38 | 39 | ```shell 40 | mlx-audio.generate --model "Marvis-AI/marvis-tts-250m-v0.1" --text "Hello, I'm Pipecat!" --output "output.wav" 41 | # or 42 | mlx-audio.generate --model "mlx-community/Kokoro-82M-bf16" --text "Hello, I'm Pipecat!" --output "output.wav" 43 | ``` 44 | 45 | ```shell 46 | cd server/ 47 | ``` 48 | 49 | If you're using uv 50 | 51 | ``` 52 | uv run bot.py 53 | ``` 54 | 55 | If you're using pip 56 | 57 | ``` 58 | python3.12 -m venv venv 59 | source venv/bin/activate 60 | 61 | pip install -r requirements.txt 62 | 63 | python bot.py 64 | ``` 65 | 66 | After you've run the bot once and have all the models cached, you can set the HF_HUB_OFFLINE environment variable to prevent the Hugging Face libraries from going to the network and checking for model updates. 
This makes the initial bot startup and first conversation turn a lot faster. 67 | 68 | ``` 69 | HF_HUB_OFFLINE=1 uv run bot.py 70 | ``` 71 | 72 | # Start the web client 73 | 74 | The web client is a React app. You can connect to your local macOS agent using any client that can negotiate a serverless WebRTC connection. The client in this repo is based on [voice-ui-kit](https://github.com/pipecat-ai/voice-ui-kit) and just uses that library's standard debug console template. 75 | 76 | ```shell 77 | cd client/ 78 | 79 | npm i 80 | 81 | npm run dev 82 | 83 | # Navigate to URL shown in terminal in your web browser 84 | ``` -------------------------------------------------------------------------------- /server/marvis_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Standalone Marvis TTS worker process. 4 | 5 | This worker runs in complete isolation to avoid Metal threading conflicts. 6 | It communicates via JSON over stdin/stdout. 7 | 8 | Usage: 9 | python marvis_worker.py 10 | 11 | Commands: 12 | {"cmd": "init", "model": "Marvis-AI/marvis-tts-250m-v0.1-MLX-fp16", "voice": null} 13 | {"cmd": "generate", "text": "Hello world"} 14 | """ 15 | 16 | import sys 17 | import json 18 | import base64 19 | import numpy as np 20 | 21 | # Add logging to worker 22 | import logging 23 | 24 | logging.basicConfig(level=logging.INFO, format="WORKER: %(message)s") 25 | 26 | try: 27 | import mlx.core as mx 28 | from mlx_audio.tts.utils import load_model 29 | 30 | MLX_AVAILABLE = True 31 | except ImportError: 32 | MLX_AVAILABLE = False 33 | 34 | 35 | def rms_norm(audio: np.ndarray, target_rms: float = 0.1, eps: float = 1e-8) -> np.ndarray: 36 | """RMS normalize audio while ensuring no clipping beyond [-1, 1]. 37 | 38 | Scales the audio toward a target RMS, but also constrains the scale so that 39 | peaks will not exceed 1.0. Finally clamps to [-1, 1] for safety. 40 | Only call this when you detect out-of-bounds samples. 
41 | """ 42 | if audio.size == 0: 43 | return audio 44 | a = audio.astype(np.float32, copy=False) 45 | rms = float(np.sqrt(np.mean(a * a)) + eps) 46 | peak = float(np.max(np.abs(a)) + eps) 47 | # Scale toward target RMS but never exceed peak limits 48 | scale_rms = target_rms / rms 49 | scale_peak = 1.0 / peak 50 | scale = min(scale_rms, scale_peak) 51 | if not np.isfinite(scale) or scale <= 0: 52 | return np.clip(a, -1.0, 1.0) 53 | return np.clip(a * scale, -1.0, 1.0) 54 | 55 | 56 | class Worker: 57 | def __init__(self): 58 | self.model = None 59 | self.voice = None 60 | 61 | def initialize(self, model_name, voice): 62 | if not MLX_AVAILABLE: 63 | return {"error": "MLX not available"} 64 | try: 65 | self.model = load_model(model_name) 66 | self.voice = None 67 | # Test generation to ensure everything works 68 | list(self.model.generate(text="test", voice=self.voice, speed=1.0)) 69 | return {"success": True} 70 | except Exception as e: 71 | return {"error": str(e)} 72 | 73 | def generate(self, text): 74 | try: 75 | if not self.model: 76 | return {"error": "Not initialized"} 77 | 78 | segments = [] 79 | for result in self.model.generate(text=text, voice=self.voice, speed=1.0): 80 | # Convert MLX array to numpy immediately 81 | audio_data = np.array(result.audio, copy=True) 82 | print( 83 | f"Generated segment shape: {audio_data.shape}, min: {audio_data.min():.4f}, max: {audio_data.max():.4f}", 84 | file=sys.stderr, 85 | ) 86 | segments.append(audio_data) 87 | 88 | if not segments: 89 | return {"error": "No audio"} 90 | 91 | # Concatenate all segments 92 | if len(segments) == 1: 93 | audio = segments[0] 94 | else: 95 | audio = np.concatenate(segments, axis=0) 96 | 97 | print( 98 | f"Final audio shape: {audio.shape}, min: {audio.min():.4f}, max: {audio.max():.4f}", 99 | file=sys.stderr, 100 | ) 101 | 102 | # If any samples are outside [-1, 1], apply RMS normalization 103 | max_abs = float(np.max(np.abs(audio))) 104 | if ( 105 | max_abs > 1.0 + 1e-6 106 | or float(audio.min()) < -1.0 - 1e-6 107 | or float(audio.max()) > 1.0 + 1e-6 108 | ): 109 | pre_rms = float(np.sqrt(np.mean(audio * audio))) 110 | audio = rms_norm(audio, target_rms=0.1) 111 | post_rms = float(np.sqrt(np.mean(audio * audio))) 112 | print( 113 | f"Applied RMS normalization. 
pre_rms: {pre_rms:.4f}, post_rms: {post_rms:.4f}, new_min: {audio.min():.4f}, new_max: {audio.max():.4f}", 114 | file=sys.stderr, 115 | ) 116 | 117 | # Check if audio is silent 118 | if np.max(np.abs(audio)) < 1e-6: 119 | return {"error": "Generated audio is silent"} 120 | 121 | # Convert to 16-bit PCM 122 | audio_int16 = (audio * 32767).astype(np.int16) 123 | audio_b64 = base64.b64encode(audio_int16.tobytes()).decode() 124 | 125 | return {"success": True, "audio": audio_b64} 126 | except Exception as e: 127 | import traceback 128 | 129 | return {"error": f"{str(e)}\n{traceback.format_exc()}"} 130 | 131 | 132 | def main(): 133 | """Main worker loop - reads commands from stdin, writes responses to stdout.""" 134 | worker = Worker() 135 | 136 | for line in sys.stdin: 137 | try: 138 | req = json.loads(line.strip()) 139 | if req["cmd"] == "init": 140 | resp = worker.initialize(req["model"], req["voice"]) 141 | elif req["cmd"] == "generate": 142 | resp = worker.generate(req["text"]) 143 | else: 144 | resp = {"error": "Unknown command"} 145 | print(json.dumps(resp), flush=True) 146 | except Exception as e: 147 | print(json.dumps({"error": str(e)}), flush=True) 148 | 149 | 150 | if __name__ == "__main__": 151 | main() 152 | -------------------------------------------------------------------------------- /server/bot.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import os 4 | import sys 5 | from contextlib import asynccontextmanager 6 | from typing import Dict 7 | 8 | # Add local pipecat to Python path 9 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), "pipecat", "src")) 10 | 11 | import uvicorn 12 | from dotenv import load_dotenv 13 | from fastapi import BackgroundTasks, FastAPI 14 | from loguru import logger 15 | 16 | from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams 17 | from pipecat.audio.turn.smart_turn.local_smart_turn_v2 import LocalSmartTurnAnalyzerV2 18 | from pipecat.audio.vad.silero import SileroVADAnalyzer 19 | from pipecat.audio.vad.vad_analyzer import VADParams 20 | from pipecat.pipeline.pipeline import Pipeline 21 | from pipecat.pipeline.runner import PipelineRunner 22 | from pipecat.pipeline.task import PipelineParams, PipelineTask 23 | from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext 24 | from pipecat.services.openai.llm import OpenAILLMService 25 | 26 | from pipecat.services.whisper.stt import WhisperSTTServiceMLX, MLXModel 27 | from pipecat.transports.base_transport import TransportParams 28 | from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor 29 | from pipecat.transports.network.small_webrtc import SmallWebRTCTransport 30 | from pipecat.transports.network.webrtc_connection import IceServer, SmallWebRTCConnection 31 | from pipecat.processors.aggregators.llm_response import LLMUserAggregatorParams 32 | 33 | from tts_mlx_isolated import TTSMLXIsolated 34 | 35 | load_dotenv(override=True) 36 | 37 | app = FastAPI() 38 | 39 | pcs_map: Dict[str, SmallWebRTCConnection] = {} 40 | 41 | ice_servers = [ 42 | IceServer( 43 | urls="stun:stun.l.google.com:19302", 44 | ) 45 | ] 46 | 47 | 48 | SYSTEM_INSTRUCTION = """ 49 | "You are Pipecat, a friendly, helpful chatbot. 50 | 51 | Your input is text transcribed in realtime from the user's voice. There may be transcription errors. Adjust your responses automatically to account for these errors. 
52 | 53 | Your output will be converted to audio so don't include special characters in your answers and do not use any markdown or special formatting. 54 | 55 | Respond to what the user said in a creative and helpful way. Keep your responses brief unless you are explicitly asked for long or detailed responses. Normally you should use one or two sentences at most. Keep each sentence short. Prefer simple sentences. Try not to use long sentences with multiple comma clauses. 56 | 57 | Start the conversation by saying, "Hello, I'm Pipecat!" Then stop and wait for the user. 58 | """ 59 | 60 | 61 | async def run_bot(webrtc_connection): 62 | transport = SmallWebRTCTransport( 63 | webrtc_connection=webrtc_connection, 64 | params=TransportParams( 65 | audio_in_enabled=True, 66 | audio_out_enabled=True, 67 | vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)), 68 | turn_analyzer=LocalSmartTurnAnalyzerV2( 69 | smart_turn_model_path="", # Download from HuggingFace 70 | params=SmartTurnParams(), 71 | ), 72 | ), 73 | ) 74 | 75 | stt = WhisperSTTServiceMLX(model=MLXModel.LARGE_V3_TURBO_Q4) 76 | 77 | tts = TTSMLXIsolated(model="mlx-community/Kokoro-82M-bf16", voice="af_heart", sample_rate=24000) 78 | # tts = TTSMLXIsolated(model="Marvis-AI/marvis-tts-250m-v0.1", voice=None) 79 | 80 | llm = OpenAILLMService( 81 | api_key="dummyKey", 82 | model="gemma-3n-e4b-it-text", # Small model. Uses ~4GB of RAM. 83 | # model="google/gemma-3-12b", # Medium-sized model. Uses ~8.5GB of RAM. 84 | # model="mlx-community/Qwen3-235B-A22B-Instruct-2507-3bit-DWQ", # Large model. Uses ~110GB of RAM! 85 | base_url="http://127.0.0.1:1234/v1", 86 | max_tokens=4096, 87 | ) 88 | 89 | context = OpenAILLMContext( 90 | [ 91 | { 92 | "role": "user", 93 | "content": SYSTEM_INSTRUCTION, 94 | } 95 | ], 96 | ) 97 | context_aggregator = llm.create_context_aggregator( 98 | context, 99 | # Whisper local service isn't streaming, so it delivers the full text all at 100 | # once, after the UserStoppedSpeaking frame. Set aggregation_timeout to a 101 | # de minimis value since we don't expect any transcript aggregation to be 102 | # necessary. 
103 | user_params=LLMUserAggregatorParams(aggregation_timeout=0.05), 104 | ) 105 | 106 | # 107 | # RTVI events for Pipecat client UI 108 | # 109 | rtvi = RTVIProcessor(config=RTVIConfig(config=[])) 110 | 111 | pipeline = Pipeline( 112 | [ 113 | transport.input(), 114 | stt, 115 | rtvi, 116 | context_aggregator.user(), 117 | llm, 118 | tts, 119 | transport.output(), 120 | context_aggregator.assistant(), 121 | ] 122 | ) 123 | 124 | task = PipelineTask( 125 | pipeline, 126 | params=PipelineParams( 127 | enable_metrics=True, 128 | enable_usage_metrics=True, 129 | ), 130 | observers=[RTVIObserver(rtvi)], 131 | ) 132 | 133 | @rtvi.event_handler("on_client_ready") 134 | async def on_client_ready(rtvi): 135 | await rtvi.set_bot_ready() 136 | # Kick off the conversation 137 | await task.queue_frames([context_aggregator.user().get_context_frame()]) 138 | 139 | @transport.event_handler("on_first_participant_joined") 140 | async def on_first_participant_joined(transport, participant): 141 | print(f"Participant joined: {participant}") 142 | await transport.capture_participant_transcription(participant["id"]) 143 | 144 | @transport.event_handler("on_participant_left") 145 | async def on_participant_left(transport, participant, reason): 146 | print(f"Participant left: {participant}") 147 | await task.cancel() 148 | 149 | runner = PipelineRunner(handle_sigint=False) 150 | 151 | await runner.run(task) 152 | 153 | 154 | @app.post("/api/offer") 155 | async def offer(request: dict, background_tasks: BackgroundTasks): 156 | pc_id = request.get("pc_id") 157 | 158 | if pc_id and pc_id in pcs_map: 159 | pipecat_connection = pcs_map[pc_id] 160 | logger.info(f"Reusing existing connection for pc_id: {pc_id}") 161 | await pipecat_connection.renegotiate( 162 | sdp=request["sdp"], 163 | type=request["type"], 164 | restart_pc=request.get("restart_pc", False), 165 | ) 166 | else: 167 | pipecat_connection = SmallWebRTCConnection(ice_servers) 168 | await pipecat_connection.initialize(sdp=request["sdp"], type=request["type"]) 169 | 170 | @pipecat_connection.event_handler("closed") 171 | async def handle_disconnected(webrtc_connection: SmallWebRTCConnection): 172 | logger.info(f"Discarding peer connection for pc_id: {webrtc_connection.pc_id}") 173 | pcs_map.pop(webrtc_connection.pc_id, None) 174 | 175 | # Run example function with SmallWebRTC transport arguments. 
176 | background_tasks.add_task(run_bot, pipecat_connection) 177 | 178 | answer = pipecat_connection.get_answer() 179 | # Updating the peer connection inside the map 180 | pcs_map[answer["pc_id"]] = pipecat_connection 181 | 182 | return answer 183 | 184 | 185 | @asynccontextmanager 186 | async def lifespan(app: FastAPI): 187 | yield # Run app 188 | coros = [pc.disconnect() for pc in pcs_map.values()] 189 | await asyncio.gather(*coros) 190 | pcs_map.clear() 191 | 192 | 193 | if __name__ == "__main__": 194 | parser = argparse.ArgumentParser(description="Pipecat Bot Runner") 195 | parser.add_argument( 196 | "--host", default="localhost", help="Host for HTTP server (default: localhost)" 197 | ) 198 | parser.add_argument( 199 | "--port", type=int, default=7860, help="Port for HTTP server (default: 7860)" 200 | ) 201 | args = parser.parse_args() 202 | 203 | uvicorn.run(app, host=args.host, port=args.port) 204 | -------------------------------------------------------------------------------- /server/tts_mlx_isolated.py: -------------------------------------------------------------------------------- 1 | # 2 | # Process-isolated Kokoro TTS service 3 | # Uses a separate process to avoid Metal threading conflicts on Apple Silicon 4 | # 5 | 6 | import asyncio 7 | import subprocess 8 | import json 9 | import base64 10 | import sys 11 | from typing import AsyncGenerator, Optional 12 | from pathlib import Path 13 | 14 | from loguru import logger 15 | 16 | from pipecat.frames.frames import ( 17 | ErrorFrame, 18 | Frame, 19 | TTSAudioRawFrame, 20 | TTSStartedFrame, 21 | TTSStoppedFrame, 22 | ) 23 | from pipecat.services.tts_service import TTSService 24 | from pipecat.utils.tracing.service_decorators import traced_tts 25 | 26 | 27 | class TTSMLXIsolated(TTSService): 28 | """Completely isolated Kokoro TTS using subprocess to avoid Metal issues.""" 29 | 30 | def __init__( 31 | self, 32 | *, 33 | model: str = "mlx-community/Kokoro-82M-bf16", 34 | voice: str = "af_heart", 35 | device: Optional[str] = None, 36 | sample_rate: int = 24000, 37 | **kwargs, 38 | ): 39 | """Initialize the isolated Kokoro TTS service.""" 40 | super().__init__(sample_rate=sample_rate, **kwargs) 41 | 42 | self._model_name = model 43 | self._voice = voice 44 | self._device = device 45 | 46 | self._process = None 47 | self._initialized = False 48 | 49 | # Get path to worker script 50 | self._worker_script = self._get_worker_script_path() 51 | 52 | self._settings = { 53 | "model": model, 54 | "voice": voice, 55 | "sample_rate": sample_rate, 56 | } 57 | 58 | def _get_worker_script_path(self) -> str: 59 | """Get the path to the standalone worker script.""" 60 | # Look for kokoro_worker.py in the same directory as this file 61 | current_dir = Path(__file__).parent 62 | if self._model_name.startswith("Marvis-AI"): 63 | worker_path = current_dir / "marvis_worker.py" 64 | else: 65 | worker_path = current_dir / "kokoro_worker.py" 66 | 67 | logger.info(f"Using worker script: {worker_path}") 68 | 69 | if not worker_path.exists(): 70 | raise FileNotFoundError( 71 | f"Worker script not found at {worker_path}. 
" 72 | "Make sure worker script is in the same directory as tts_mlx_isolated.py" 73 | ) 74 | 75 | return str(worker_path) 76 | 77 | def _start_worker(self): 78 | """Start the worker process.""" 79 | try: 80 | self._process = subprocess.Popen( 81 | [sys.executable, self._worker_script], 82 | stdin=subprocess.PIPE, 83 | stdout=subprocess.PIPE, 84 | # stderr=subprocess.PIPE, 85 | text=True, 86 | bufsize=0, 87 | ) 88 | logger.info(f"Started {self._model_name} worker process: {self._process.pid}") 89 | return True 90 | except Exception as e: 91 | logger.error(f"Failed to start worker: {e}") 92 | return False 93 | 94 | def _send_command(self, command: dict) -> dict: 95 | """Send command to worker and get response.""" 96 | try: 97 | if not self._process or self._process.poll() is not None: 98 | logger.debug("Starting worker process...") 99 | if not self._start_worker(): 100 | return {"error": "Failed to start worker"} 101 | 102 | # Send command 103 | cmd_json = json.dumps(command) + "\n" 104 | logger.debug(f"Sending command: {command}") 105 | self._process.stdin.write(cmd_json) 106 | self._process.stdin.flush() 107 | 108 | # Read response with timeout 109 | import select 110 | 111 | ready, _, _ = select.select([self._process.stdout], [], [], 10.0) # 10 second timeout 112 | 113 | if not ready: 114 | return {"error": "Worker response timeout"} 115 | 116 | response_line = self._process.stdout.readline() 117 | if not response_line: 118 | # Check if process died 119 | if self._process.poll() is not None: 120 | stderr_output = self._process.stderr.read() if self._process.stderr else "" 121 | return {"error": f"Worker process died. stderr: {stderr_output}"} 122 | return {"error": "No response from worker"} 123 | 124 | response_data = json.loads(response_line.strip()) 125 | # Don't log the full response if it contains audio data (too verbose) 126 | if "audio" in response_data: 127 | logger.debug( 128 | f"Worker response: success with {len(response_data.get('audio', ''))} chars of audio data" 129 | ) 130 | else: 131 | logger.debug(f"Worker response: {response_line.strip()}") 132 | return response_data 133 | 134 | except Exception as e: 135 | logger.error(f"Worker communication error: {e}") 136 | # Get stderr if available 137 | if self._process and self._process.stderr: 138 | try: 139 | stderr_output = self._process.stderr.read() 140 | logger.error(f"Worker stderr: {stderr_output}") 141 | except: 142 | pass 143 | return {"error": str(e)} 144 | 145 | async def _initialize_if_needed(self): 146 | """Initialize the worker if not already done.""" 147 | if self._initialized: 148 | return True 149 | 150 | loop = asyncio.get_event_loop() 151 | result = await loop.run_in_executor( 152 | None, 153 | self._send_command, 154 | {"cmd": "init", "model": self._model_name, "voice": self._voice}, 155 | ) 156 | 157 | if result.get("success"): 158 | self._initialized = True 159 | logger.info("Kokoro worker initialized") 160 | return True 161 | else: 162 | error_msg = result.get("error", "Unknown error") 163 | logger.error(f"Worker initialization failed: {error_msg}") 164 | 165 | # Also check if process died 166 | if self._process and self._process.poll() is not None: 167 | stderr_output = self._process.stderr.read() if self._process.stderr else "" 168 | logger.error(f"Worker process stderr: {stderr_output}") 169 | 170 | return False 171 | 172 | def can_generate_metrics(self) -> bool: 173 | return True 174 | 175 | @traced_tts 176 | async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: 177 | """Generate speech 
using isolated worker process.""" 178 | logger.debug(f"{self}: Generating TTS [{text}]") 179 | 180 | try: 181 | await self.start_ttfb_metrics() 182 | await self.start_tts_usage_metrics(text) 183 | 184 | yield TTSStartedFrame() 185 | 186 | # Initialize worker if needed 187 | if not await self._initialize_if_needed(): 188 | raise RuntimeError("Failed to initialize Kokoro worker") 189 | 190 | # Generate audio 191 | loop = asyncio.get_event_loop() 192 | result = await loop.run_in_executor( 193 | None, self._send_command, {"cmd": "generate", "text": text} 194 | ) 195 | 196 | if not result.get("success"): 197 | raise RuntimeError(f"Audio generation failed: {result.get('error')}") 198 | 199 | # Decode audio 200 | audio_b64 = result["audio"] 201 | audio_bytes = base64.b64decode(audio_b64) 202 | 203 | await self.stop_ttfb_metrics() 204 | 205 | # Stream audio 206 | CHUNK_SIZE = self.chunk_size 207 | for i in range(0, len(audio_bytes), CHUNK_SIZE): 208 | chunk = audio_bytes[i : i + CHUNK_SIZE] 209 | if len(chunk) > 0: 210 | yield TTSAudioRawFrame(chunk, self.sample_rate, 1) 211 | await asyncio.sleep(0.001) 212 | 213 | except Exception as e: 214 | logger.error(f"Error in run_tts: {e}") 215 | yield ErrorFrame(error=str(e)) 216 | finally: 217 | logger.debug(f"{self}: Finished TTS [{text}]") 218 | await self.stop_ttfb_metrics() 219 | yield TTSStoppedFrame() 220 | 221 | def _cleanup(self): 222 | """Clean up worker process.""" 223 | if self._process: 224 | try: 225 | self._process.terminate() 226 | self._process.wait(timeout=5) 227 | except: 228 | try: 229 | self._process.kill() 230 | except: 231 | pass 232 | self._process = None 233 | 234 | async def __aenter__(self): 235 | """Async context manager entry.""" 236 | await super().__aenter__() 237 | await self._initialize_if_needed() 238 | return self 239 | 240 | async def __aexit__(self, exc_type, exc_val, exc_tb): 241 | """Clean shutdown.""" 242 | self._cleanup() 243 | await super().__aexit__(exc_type, exc_val, exc_tb) 244 | --------------------------------------------------------------------------------