├── .env.example ├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── assets └── demo.gif ├── eslint.config.mjs ├── package-lock.json ├── package.json ├── pnpm-lock.yaml ├── src ├── config │ ├── constants.ts │ └── prompts.ts ├── handlers │ └── openai.handler.ts ├── servers │ ├── mcp.server.ts │ └── voice.server.ts ├── services │ ├── openai │ │ ├── context.service.ts │ │ ├── event.service.ts │ │ └── ws.service.ts │ ├── session-manager.service.ts │ └── twilio │ │ ├── call.service.ts │ │ ├── event.service.ts │ │ └── ws.service.ts ├── start-all.ts ├── types.ts └── utils │ ├── call-utils.ts │ └── execution-utils.ts └── tsconfig.json /.env.example: -------------------------------------------------------------------------------- 1 | # Server configuration 2 | PORT=3004 3 | 4 | # Twilio API credentials 5 | TWILIO_ACCOUNT_SID=your_twilio_account_sid 6 | TWILIO_AUTH_TOKEN=your_twilio_auth_token 7 | TWILIO_NUMBER=your_twilio_number 8 | # OpenAI API key 9 | OPENAI_API_KEY=your_openai_api_key 10 | OPENAI_WEBSOCKET_URL=wss://api.openai.com/v1/realtime?model=gpt-4o-mini-realtime-preview 11 | 12 | # Ngrok authentication token 13 | NGROK_AUTHTOKEN=your_ngrok_authtoken 14 | 15 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dependency directories 2 | node_modules/ 3 | jspm_packages/ 4 | 5 | # Build outputs 6 | dist/ 7 | build/ 8 | out/ 9 | *.tsbuildinfo 10 | 11 | # Environment variables 12 | .env 13 | .env.local 14 | .env.development.local 15 | .env.test.local 16 | .env.production.local 17 | 18 | # Logs 19 | logs 20 | *.log 21 | npm-debug.log* 22 | yarn-debug.log* 23 | yarn-error.log* 24 
| lerna-debug.log* 25 | 26 | # Coverage directory used by tools like istanbul 27 | coverage/ 28 | *.lcov 29 | 30 | # Cache directories 31 | .npm 32 | .eslintcache 33 | .stylelintcache 34 | .rpt2_cache/ 35 | .rts2_cache_cjs/ 36 | .rts2_cache_es/ 37 | .rts2_cache_umd/ 38 | 39 | # Runtime data 40 | pids 41 | *.pid 42 | *.seed 43 | *.pid.lock 44 | 45 | # IDE and editor folders 46 | .idea/ 47 | .vscode/ 48 | *.swp 49 | *.swo 50 | .DS_Store 51 | .directory 52 | .project 53 | .settings/ 54 | .classpath 55 | .c9/ 56 | *.launch 57 | .settings/ 58 | *.sublime-workspace 59 | 60 | 61 | # ngrok 62 | .ngrok/ 63 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 LukasK 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Voice Call MCP Server 2 | 3 | A Model Context Protocol (MCP) server that enables Claude and other AI assistants to initiate and manage voice calls using Twilio and OpenAI (GPT-4o Realtime model). 4 | 5 | Use this as a base to kick-start your AI-powered voice calling explorations, save time and develop additional functionality on top of it. 6 | 7 | ![Demo](./assets/demo.gif) 8 | 9 | 10 | ## Sequence Diagram 11 | 12 | ```mermaid 13 | sequenceDiagram 14 | participant AI as AI Assistant (e.g., Claude) 15 | participant MCP as MCP Server 16 | participant Twilio as Twilio 17 | participant Phone as Destination Phone 18 | participant OpenAI as OpenAI 19 | 20 | AI->>MCP: 1) Initiate outbound call request
(POST /calls) 21 | MCP->>Twilio: 2) Place outbound call via Twilio API 22 | Twilio->>Phone: 3) Ring the destination phone 23 | Twilio->>MCP: 4) Call status updates & audio callbacks (webhooks) 24 | MCP->>OpenAI: 5) Forward real-time audio to OpenAI's realtime model 25 | OpenAI->>MCP: 6) Return voice stream 26 | MCP->>Twilio: 7) Send voice stream 27 | Twilio->>Phone: 8) Forward voice stream 28 | Note over Phone: Two-way conversation continues
until the call ends 29 | ``` 30 | 31 | 32 | ## Features 33 | 34 | - Make outbound phone calls via Twilio 📞 35 | - Process call audio in real-time with GPT-4o Realtime model 🎙️ 36 | - Real-time language switching during calls 🌐 37 | - Pre-built prompts for common calling scenarios (like restaurant reservations) 🍽️ 38 | - Automatic public URL tunneling with ngrok 🔄 39 | - Secure handling of credentials 🔒 40 | 41 | ## Why MCP? 42 | 43 | The Model Context Protocol (MCP) bridges the gap between AI assistants and real-world actions. By implementing MCP, this server allows AI models like Claude to: 44 | 45 | 1. Initiate actual phone calls on behalf of users 46 | 2. Process and respond to real-time audio conversations 47 | 3. Execute complex tasks requiring voice communication 48 | 49 | This open-source implementation provides transparency and customizability, allowing developers to extend functionality while maintaining control over their data and privacy. 50 | 51 | ## Requirements 52 | 53 | - Node.js >= 22 54 | - If you need to update Node.js, we recommend using `nvm` (Node Version Manager): 55 | ```bash 56 | nvm install 22 57 | nvm use 22 58 | ``` 59 | - Twilio account with API credentials 60 | - OpenAI API key 61 | - Ngrok Authtoken 62 | 63 | ## Installation 64 | 65 | ### Manual Installation 66 | 67 | 1. Clone the repository 68 | ```bash 69 | git clone https://github.com/lukaskai/voice-call-mcp-server.git 70 | cd voice-call-mcp-server 71 | ``` 72 | 73 | 2. 
Install dependencies and build 74 | ```bash 75 | npm install 76 | npm run build 77 | ``` 78 | 79 | ## Configuration 80 | 81 | The server requires several environment variables: 82 | 83 | - `TWILIO_ACCOUNT_SID`: Your Twilio account SID 84 | - `TWILIO_AUTH_TOKEN`: Your Twilio auth token 85 | - `TWILIO_NUMBER`: Your Twilio number 86 | - `OPENAI_API_KEY`: Your OpenAI API key 87 | - `NGROK_AUTHTOKEN`: Your ngrok authtoken 88 | - `RECORD_CALLS`: Set to "true" to record calls (optional) 89 | 90 | ### Claude Desktop Configuration 91 | 92 | To use this server with Claude Desktop, add the following to your configuration file: 93 | 94 | **macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json` 95 | 96 | **Windows**: `%APPDATA%\Claude\claude_desktop_config.json` 97 | 98 | ```json 99 | { 100 | "mcpServers": { 101 | "voice-call": { 102 | "command": "node", 103 | "args": ["/path/to/your/mcp-new/dist/start-all.cjs"], 104 | "env": { 105 | "TWILIO_ACCOUNT_SID": "your_account_sid", 106 | "TWILIO_AUTH_TOKEN": "your_auth_token", 107 | "TWILIO_NUMBER": "your_e.164_format_number", 108 | "OPENAI_API_KEY": "your_openai_api_key", 109 | "NGROK_AUTHTOKEN": "your_ngrok_authtoken" 110 | } 111 | } 112 | } 113 | } 114 | ``` 115 | 116 | After that, restart Claude Desktop to reload the configuration. 117 | If connected, you should see Voice Call under the 🔨 menu. 118 | 119 | ## Example Interactions with Claude 120 | 121 | Here are some natural ways to interact with the server through Claude: 122 | 123 | 1. Simple call: 124 | ``` 125 | Can you call +1-123-456-7890 and let them know I'll be 15 minutes late for our meeting? 126 | ``` 127 | 128 | 2. Restaurant reservation: 129 | ``` 130 | Please call Delicious Restaurant at +1-123-456-7890 and make a reservation for 4 people tonight at 7:30 PM. Please speak in German. 131 | ``` 132 | 133 | 3. 
Appointment scheduling: 134 | ``` 135 | Please call Expert Dental NYC (+1-123-456-7899) and reschedule my Monday appointment to next Friday between 4–6pm. 136 | ``` 137 | 138 | ## Important Notes 139 | 140 | 1. **Phone Number Format**: All phone numbers must be in E.164 format (e.g., +11234567890) 141 | 2. **Rate Limits**: Be aware of your Twilio and OpenAI account's rate limits and pricing 142 | 3. **Voice Conversations**: The AI will handle natural conversations in real-time 143 | 4. **Call Duration**: Be mindful of call durations as they affect OpenAI API and Twilio costs 144 | 5. **Public Exposure**: Be aware that the ngrok tunnel exposes your server publicly for Twilio to reach it (though with a random URL and protected by a random secret) 145 | 146 | ## Troubleshooting 147 | 148 | Common error messages and solutions: 149 | 150 | 1. "Phone number must be in E.164 format" 151 | - Make sure the phone number starts with "+" and the country code 152 | 153 | 2. "Invalid credentials" 154 | - Double-check your TWILIO_ACCOUNT_SID and TWILIO_AUTH_TOKEN. You can copy them from the [Twilio Console](https://console.twilio.com) 155 | 156 | 3. "OpenAI API error" 157 | - Verify your OPENAI_API_KEY is correct and has sufficient credits 158 | 159 | 4. "Ngrok tunnel failed to start" 160 | - Ensure your NGROK_AUTHTOKEN is valid and not expired 161 | 162 | 5. "OpenAI Realtime does not detect the end of voice input, or is lagging." 163 | - Sometimes, there might be voice encoding issues between Twilio and the receiver's network operator. Try using a different receiver. 164 | 165 | ## Contributing 166 | 167 | Contributions are welcome! 
Here are some areas we're looking to improve: 168 | 169 | - Implement support for multiple AI models beyond the current implementation 170 | - Add database integration to store conversation history locally and make it accessible for AI context 171 | - Improve latency and response times to enhance call experiences 172 | - Enhance error handling and recovery mechanisms 173 | - Add more pre-built conversation templates for common scenarios 174 | - Implement improved call monitoring and analytics 175 | 176 | If you'd like to contribute, please open an issue to discuss your ideas before submitting a pull request. 177 | 178 | ## License 179 | 180 | This project is licensed under the MIT License - see the LICENSE file for details. 181 | 182 | ## Security 183 | 184 | Please do not include any sensitive information (like phone numbers or API credentials) in GitHub issues or pull requests. This server handles sensitive communications; deploy it responsibly and ensure all credentials are kept secure. 185 | 186 | 187 | ## Time For a New Mission? 188 | 189 | We’re hiring engineers to build at the frontier of voice AI — and bake it into a next-gen telco. 190 | 191 | Curious? Head to [careers.popcorn.space](https://careers.popcorn.space/apply) 🍿 ! 
192 | -------------------------------------------------------------------------------- /assets/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/popcornspace/voice-call-mcp-server/13e3bd99ed2d6859a18dba9c030564da90986894/assets/demo.gif -------------------------------------------------------------------------------- /eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import eslint from '@eslint/js'; 2 | import tseslint from 'typescript-eslint'; 3 | 4 | export default tseslint.config( 5 | eslint.configs.recommended, 6 | ...tseslint.configs.recommended, 7 | ...tseslint.configs.stylistic, 8 | { 9 | plugins: { 10 | }, 11 | rules: { 12 | '@typescript-eslint/no-explicit-any': 'off', 13 | '@typescript-eslint/prefer-for-of': 'off', 14 | 'no-trailing-spaces': 'error', // Disallow trailing spaces 15 | 'eol-last': ['error', 'always'], // Enforce newline at end of file 16 | 'indent': ['error', 4], // Enforce 4 spaces for indentation 17 | 'quotes': ['error', 'single'], // Enforce single quotes 18 | 'semi': ['error', 'always'], // Enforce semicolons 19 | }, 20 | } 21 | ); 22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "voice-call-mcp-server", 3 | "main": "dist/start-all.js", 4 | "type": "module", 5 | "scripts": { 6 | "start-all": "tsx src/start-all.ts", 7 | "start": "node dist/start-all.cjs", 8 | "build": "npm-run-all clean build:app", 9 | "build:app": "tsup src/start-all.ts", 10 | "clean": "rimraf dist" 11 | }, 12 | "dependencies": { 13 | "@modelcontextprotocol/sdk": "1.8.0", 14 | "@ngrok/ngrok": "^1.4.1", 15 | "axios": "^1.6.8", 16 | "body-parser": "^1.20.2", 17 | "colors": "^1.4.0", 18 | "cors": "^2.8.5", 19 | "dotenv": "^16.4.5", 20 | "eslint-plugin-simple-import-sort": "^12.1.1", 21 | "express": 
import { randomBytes } from 'node:crypto';

/**
 * OpenAI Realtime event types singled out for (debug) logging.
 */
export const LOG_EVENT_TYPES = [
    'error',
    'session.created',
    'response.audio.delta',
    'response.audio_transcript.done',
    'conversation.item.input_audio_transcription.completed',
];

/**
 * Secret generated once per process start and compared against the value
 * supplied on incoming call requests / stream connections.
 *
 * Drawn from the OS CSPRNG: `Math.random()` is predictable and its base-36
 * rendering can come out shorter than intended, which is not acceptable for
 * a value that guards a publicly reachable endpoint. The result is 32
 * lowercase hex characters, so it stays safe to embed in URLs.
 */
export const DYNAMIC_API_SECRET = randomBytes(16).toString('hex');

/** Gates the (currently commented-out) timing debug output. */
export const SHOW_TIMING_MATH = false;

/** Voice name passed to the OpenAI realtime session. */
export const VOICE = 'sage';

/**
 * Whether calls should be recorded.
 *
 * The README documents the variable as `RECORD_CALLS`; the legacy `RECORD`
 * name is still honoured so existing setups keep working.
 */
export const RECORD_CALLS = process.env.RECORD_CALLS === 'true' || process.env.RECORD === 'true';

/** Phrases that mark the end of a conversation. */
export const GOODBYE_PHRASES = ['bye', 'goodbye', 'have a nice day', 'see you', 'take care'];
/**
 * Builds the instruction text for an outbound call.
 *
 * The caller's own number is taken from `callState.fromNumber`; the optional
 * `callContext` (the task description) is appended as the final line and is
 * replaced by an empty string when missing or empty.
 *
 * NOTE(review): lines are joined with '\n' and no leading indentation —
 * confirm this matches the layout of the original template literal.
 *
 * @param callState State of the current call (only `fromNumber` is read)
 * @param callContext Free-form description of what the call should achieve
 * @returns The assembled prompt text
 */
export const generateOutboundCallContext = (callState: CallState, callContext?: string): string => {
    const taskDescription = callContext || '';

    const promptLines = [
        'Please refer to phone call transcripts.',
        'Stay concise and short.',
        `You are assistant (if asked, you phone number with country code is: ${callState.fromNumber}). You are making an outbound call.`,
        'Be friendly and speak in human short sentences. Start conversation with how are you. Do not speak in bullet points. Ask one question at a time, tell one sentence at a time.',
        'After successful task completion, say goodbye and end the conversation.',
        'You ARE NOT a receptionist, NOT an administrator, NOT a person making reservation.',
        'You do not provide any other info, which is not related to the goal. You can calling solely to achieve your tasks',
        'You are the customer making a request, not the restaurant staff.',
        'YOU ARE STRICTLY THE ONE MAKING THE REQUEST (and not the one receiving). YOU MUST ACHIEVE YOUR GOAL AS AN ASSITANT AND PERFORM TASK.',
        'Be focused solely on your task:',
        taskDescription,
    ];

    return promptLines.join('\n');
};
process.env.OPENAI_WEBSOCKET_URL || 'wss://api.openai.com/v1/realtime?model=gpt-4o-mini-realtime-preview', 38 | voice: VOICE, 39 | temperature: 0.6 40 | }; 41 | this.openAIService = new OpenAIWsService(openAIConfig); 42 | 43 | // Initialize event processors 44 | this.openAIEventProcessor = new OpenAIEventService( 45 | this.callState, 46 | () => this.endCall(), 47 | (payload) => this.twilioStream.sendAudio(payload), 48 | () => this.handleSpeechStartedEvent() 49 | ); 50 | 51 | this.twilioEventProcessor = new TwilioEventService( 52 | this.callState, 53 | this.twilioCallService, 54 | contextService, 55 | (payload) => this.openAIService.sendAudio(payload),// Log the first media event 56 | ); 57 | 58 | this.setupEventHandlers(); 59 | this.initializeOpenAI(); 60 | } 61 | 62 | private endCall(): void { 63 | if (this.callState.callSid) { 64 | this.twilioCallService.endCall(this.callState.callSid); 65 | } 66 | 67 | setTimeout(() => { 68 | this.closeWebSockets(); 69 | }, 5000); 70 | } 71 | 72 | private closeWebSockets(): void { 73 | this.twilioStream.close(); 74 | this.openAIService.close(); 75 | } 76 | 77 | private initializeOpenAI(): void { 78 | this.openAIService.initialize( 79 | (data) => this.openAIEventProcessor.processMessage(data), 80 | () => { 81 | setTimeout(() => this.openAIService.initializeSession(this.callState.callContext), 100); 82 | }, 83 | (error) => console.error('Error in the OpenAI WebSocket:', error) 84 | ); 85 | } 86 | 87 | private handleSpeechStartedEvent(): void { 88 | if (this.callState.markQueue.length === 0 || this.callState.responseStartTimestampTwilio === null || !this.callState.lastAssistantItemId) { 89 | return; 90 | } 91 | 92 | const elapsedTime = this.callState.latestMediaTimestamp - this.callState.responseStartTimestampTwilio; 93 | 94 | this.openAIService.truncateAssistantResponse(this.callState.lastAssistantItemId, elapsedTime); 95 | this.twilioStream.clearStream(); 96 | this.resetResponseState(); 97 | } 98 | 99 | private 
/**
 * Entry point for opening call sessions. All work is delegated to a
 * SessionManagerService built from the supplied Twilio client.
 */
export class CallSessionManager {
    private readonly sessionManager: SessionManagerService;

    /**
     * @param twilioClient Twilio client handed to the underlying session manager service
     */
    constructor(twilioClient: twilio.Twilio) {
        const service = new SessionManagerService(twilioClient);
        this.sessionManager = service;
    }

    /**
     * Opens a new call session on the given socket.
     * @param ws The WebSocket connection carrying the call
     * @param callType The type of call
     */
    public createSession(ws: WebSocket, callType: CallType): void {
        const { sessionManager } = this;
        sessionManager.createSession(ws, callType);
    }
}
provides tools for initiating phone calls via Twilio' 19 | }); 20 | 21 | this.registerTools(); 22 | this.registerResources(); 23 | this.registerPrompts(); 24 | } 25 | 26 | private registerTools(): void { 27 | this.server.tool( 28 | 'trigger-call', 29 | 'Trigger an outbound phone call via Twilio', 30 | { 31 | toNumber: z.string().describe('The phone number to call'), 32 | callContext: z.string().describe('Context for the call') 33 | }, 34 | async ({ toNumber, callContext }) => { 35 | try { 36 | const callSid = await this.twilioCallService.makeCall(this.twilioCallbackUrl, toNumber, callContext); 37 | 38 | return { 39 | content: [{ 40 | type: 'text', 41 | text: JSON.stringify({ 42 | status: 'success', 43 | message: 'Call triggered successfully', 44 | callSid: callSid 45 | }) 46 | }] 47 | }; 48 | } catch (error) { 49 | const errorMessage = error instanceof Error ? error.message : String(error); 50 | 51 | return { 52 | content: [{ 53 | type: 'text', 54 | text: JSON.stringify({ 55 | status: 'error', 56 | message: `Failed to trigger call: ${errorMessage}` 57 | }) 58 | }], 59 | isError: true 60 | }; 61 | } 62 | } 63 | ); 64 | } 65 | 66 | private registerResources(): void { 67 | this.server.resource( 68 | 'get-latest-call', 69 | new ResourceTemplate('call://transcriptions', { list: undefined }), 70 | async () => { 71 | // TODO: get call transcription 72 | return { 73 | contents: [{ 74 | text: JSON.stringify({ 75 | transcription: '{}', 76 | status: 'completed', 77 | }), 78 | uri: 'call://transcriptions/latest', 79 | mimeType: 'application/json' 80 | }] 81 | }; 82 | } 83 | ); 84 | } 85 | 86 | private registerPrompts(): void { 87 | this.server.prompt( 88 | 'make-restaurant-reservation', 89 | 'Create a prompt for making a restaurant reservation by phone', 90 | { 91 | restaurantNumber: z.string().describe('The phone number of the restaurant'), 92 | peopleNumber: z.string().describe('The number of people in the party'), 93 | date: z.string().describe('Date of the reservation'), 
/**
 * Express + WebSocket server for outbound calls.
 *
 * Routes:
 * - `POST /call/outgoing` answers with TwiML that connects the call to a
 *   media stream on this server.
 * - `WS /call/connection-outgoing/:secret` accepts that stream and opens a
 *   call session for it.
 *
 * Both routes reject requests that do not carry DYNAMIC_API_SECRET.
 */
export class VoiceServer {
    private app: express.Application & { ws: any };
    private port: number;
    private sessionManager: CallSessionManager;
    private callbackUrl: string;

    /**
     * @param callbackUrl Public base URL of this server (used to build the stream URL)
     * @param sessionManager Manager that creates a session per accepted stream
     */
    constructor(callbackUrl: string, sessionManager: CallSessionManager) {
        this.callbackUrl = callbackUrl;
        this.port = VoiceServer.resolvePort(process.env.PORT);
        this.app = ExpressWs(express()).app;
        this.sessionManager = sessionManager;
        this.configureMiddleware();
        this.setupRoutes();
    }

    /**
     * Parse the configured port, falling back to 3004 when the value is
     * missing, not a number, or outside the valid port range.
     */
    private static resolvePort(raw: string | undefined): number {
        const parsed = Number.parseInt(raw ?? '', 10);
        const isValidPort = Number.isInteger(parsed) && parsed >= 0 && parsed <= 65535;
        return isValidPort ? parsed : 3004;
    }

    /** Register JSON and form-encoded body parsers. */
    private configureMiddleware(): void {
        this.app.use(express.json());
        this.app.use(express.urlencoded({ extended: false }));
    }

    /** Wire the HTTP and WebSocket routes to their handlers. */
    private setupRoutes(): void {
        this.app.post('/call/outgoing', this.handleOutgoingCall.bind(this));
        this.app.ws('/call/connection-outgoing/:secret', this.handleOutgoingConnection.bind(this));
    }

    /**
     * Answer an outgoing-call request with TwiML that opens a media stream
     * back to this server, forwarding the caller/callee numbers and the call
     * context as stream parameters.
     *
     * Responds with 401 when the `apiSecret` query parameter does not match.
     */
    private async handleOutgoingCall(req: express.Request, res: Response): Promise<void> {
        const apiSecret = req.query.apiSecret?.toString();

        if (apiSecret !== DYNAMIC_API_SECRET) {
            res.status(401).json({ error: 'Unauthorized: Invalid or missing API secret' });
            return;
        }

        const fromNumber = req.body.From;
        const toNumber = req.body.To;
        const callContext = req.query.callContext?.toString();

        // http -> ws and https -> wss; the original only rewrote https.
        const streamBaseUrl = this.callbackUrl.replace(/^http/, 'ws');

        const twiml = new VoiceResponse();
        const connect = twiml.connect();

        const stream = connect.stream({
            // The secret was validated above; escape it so it can never break out of the path segment.
            url: `${streamBaseUrl}/call/connection-outgoing/${encodeURIComponent(DYNAMIC_API_SECRET)}`,
        });

        stream.parameter({ name: 'fromNumber', value: fromNumber });
        stream.parameter({ name: 'toNumber', value: toNumber });
        stream.parameter({ name: 'callContext', value: callContext });

        res.writeHead(200, { 'Content-Type': 'text/xml' });
        res.end(twiml.toString());
    }

    /**
     * Accept a media-stream WebSocket and start an outbound call session.
     * Closes the socket with code 1008 when the path secret does not match.
     */
    private handleOutgoingConnection(ws: WebSocket, req: express.Request): void {
        if (req.params.secret !== DYNAMIC_API_SECRET) {
            ws.close(1008, 'Unauthorized: Invalid or missing API secret');
            return;
        }

        this.sessionManager.createSession(ws, CallType.OUTBOUND);
    }

    /** Start listening on the configured port. */
    public start(): void {
        this.app.listen(this.port);
    }
}
toNumber: string): void { 7 | callState.fromNumber = fromNumber; 8 | callState.toNumber = toNumber; 9 | } 10 | 11 | public setupConversationContext(callState: CallState, callContext?: string): void { 12 | callState.initialMessage = 'Hello!'; 13 | callState.callContext = generateOutboundCallContext(callState, callContext); 14 | 15 | const systemMessage: ConversationMessage = { 16 | role: 'system', 17 | content: callState.callContext 18 | }; 19 | 20 | callState.conversationHistory = [systemMessage]; 21 | 22 | const initialMessage: ConversationMessage = { 23 | role: 'user', 24 | content: callState.initialMessage 25 | }; 26 | 27 | callState.conversationHistory.push(initialMessage); 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/services/openai/event.service.ts: -------------------------------------------------------------------------------- 1 | import { WebSocket } from 'ws'; 2 | import { CallState } from '../../types.js'; 3 | import { LOG_EVENT_TYPES, SHOW_TIMING_MATH } from '../../config/constants.js'; 4 | import { checkForGoodbye } from '../../utils/call-utils.js'; 5 | 6 | /** 7 | * Service for processing OpenAI events 8 | */ 9 | export class OpenAIEventService { 10 | private readonly callState: CallState; 11 | private readonly onEndCall: () => void; 12 | private readonly onSendAudioToTwilio: (payload: string) => void; 13 | private readonly onTruncateResponse: () => void; 14 | 15 | /** 16 | * Create a new OpenAI event processor 17 | * @param callState The state of the call 18 | * @param onEndCall Callback for ending the call 19 | * @param onSendAudioToTwilio Callback for sending audio to Twilio 20 | * @param onTruncateResponse Callback for truncating the response 21 | */ 22 | constructor( 23 | callState: CallState, 24 | onEndCall: () => void, 25 | onSendAudioToTwilio: (payload: string) => void, 26 | onTruncateResponse: () => void 27 | ) { 28 | this.callState = callState; 29 | this.onEndCall = onEndCall; 
30 | this.onSendAudioToTwilio = onSendAudioToTwilio; 31 | this.onTruncateResponse = onTruncateResponse; 32 | } 33 | 34 | /** 35 | * Process an OpenAI message 36 | * @param data The message data 37 | */ 38 | public processMessage(data: WebSocket.Data): void { 39 | try { 40 | const response = JSON.parse(data.toString()); 41 | 42 | if (LOG_EVENT_TYPES.includes(response.type)) { 43 | // console.log(`Received event: ${response.type}`, response); 44 | } 45 | 46 | this.processEvent(response); 47 | } catch (error) { 48 | console.error('Error processing OpenAI message:', error, 'Raw message:', data); 49 | } 50 | } 51 | 52 | /** 53 | * Process an OpenAI event 54 | * @param response The event data 55 | */ 56 | private processEvent(response: any): void { 57 | switch (response.type) { 58 | case 'conversation.item.input_audio_transcription.completed': 59 | this.handleTranscriptionCompleted(response.transcript); 60 | break; 61 | case 'response.audio_transcript.done': 62 | this.handleAudioTranscriptDone(response.transcript); 63 | break; 64 | case 'response.audio.delta': 65 | if (response.delta) { 66 | this.handleAudioDelta(response); 67 | } 68 | break; 69 | case 'input_audio_buffer.speech_started': 70 | this.onTruncateResponse(); 71 | break; 72 | } 73 | } 74 | 75 | /** 76 | * Handle a transcription completed event 77 | * @param transcription The transcription text 78 | */ 79 | private handleTranscriptionCompleted(transcription: string): void { 80 | if (!transcription) { 81 | return; 82 | } 83 | 84 | this.callState.conversationHistory.push({ 85 | role: 'user', 86 | content: transcription 87 | }); 88 | 89 | if (checkForGoodbye(transcription)) { 90 | this.onEndCall(); 91 | } 92 | } 93 | 94 | /** 95 | * Handle an audio transcript done event 96 | * @param transcript The transcript text 97 | */ 98 | private handleAudioTranscriptDone(transcript: string): void { 99 | if (!transcript) { 100 | return; 101 | } 102 | 103 | this.callState.conversationHistory.push({ 104 | role: 'assistant', 105 
/**
 * Handle an audio delta event: forward the audio chunk to Twilio and record
 * the bookkeeping stored on the call state for this response.
 * @param response The event data; `delta` is forwarded and `item_id` is remembered
 */
private handleAudioDelta(response: any): void {
    this.onSendAudioToTwilio(response.delta);

    // Compare against null/undefined explicitly. A start timestamp of 0 is a
    // legitimate value (no media received yet) and a truthiness test would let
    // every later delta of the same response overwrite it.
    if (this.callState.responseStartTimestampTwilio == null) {
        this.callState.responseStartTimestampTwilio = this.callState.latestMediaTimestamp;
        if (SHOW_TIMING_MATH) {
            // console.log(`Setting start timestamp for new response: ${this.callState.responseStartTimestampTwilio}ms`);
        }
    }

    // Remember which assistant item the audio belongs to
    if (response.item_id) {
        this.callState.lastAssistantItemId = response.item_id;
    }
}
/**
 * Close the WebSocket connection.
 *
 * Closes the socket when it is open or still connecting. Skipping a
 * connecting socket would let it finish the handshake later and stay open
 * with nobody left to close it. Does nothing when no socket was created or
 * it is already closing/closed.
 */
public close(): void {
    const socket = this.webSocket;
    if (!socket) {
        return;
    }

    // NOTE(review): closing during the handshake is expected to surface as an
    // 'error' event on the handler registered in initialize() - confirm that
    // handler tolerates it.
    const state = socket.readyState;
    if (state === WebSocket.OPEN || state === WebSocket.CONNECTING) {
        socket.close();
    }
}
/**
 * Manages multiple concurrent call sessions
 */
export class SessionManagerService {
    private readonly activeSessions: Map<string, OpenAICallHandler>;
    private readonly twilioClient: twilio.Twilio;
    private readonly contextService: OpenAIContextService;

    // Per-socket identifiers. Held weakly so a socket that is dropped without
    // a 'close' event does not keep its key alive.
    private readonly sessionKeys = new WeakMap<WebSocket, string>();
    private nextSessionId = 0;

    /**
     * Create a new session manager
     * @param twilioClient The Twilio client shared by all sessions
     */
    constructor(twilioClient: twilio.Twilio) {
        this.activeSessions = new Map();
        this.twilioClient = twilioClient;
        this.contextService = new OpenAIContextService();
    }

    /**
     * Creates a new call session and adds it to the active sessions
     * @param ws The WebSocket connection
     * @param callType The type of call
     */
    public createSession(ws: WebSocket, callType: CallType): void {
        const handler = new OpenAICallHandler(ws, callType, this.twilioClient, this.contextService);
        this.registerSessionCleanup(ws);
        this.addSession(ws, handler);
    }

    /**
     * Register cleanup for a session: the session is removed when the socket closes
     * @param ws The WebSocket connection
     */
    private registerSessionCleanup(ws: WebSocket): void {
        ws.on('close', () => {
            this.removeSession(ws);
        });
    }

    /**
     * Add a session to active sessions
     * @param ws The WebSocket connection
     * @param handler The OpenAI call handler
     */
    private addSession(ws: WebSocket, handler: OpenAICallHandler): void {
        this.activeSessions.set(this.getSessionKey(ws), handler);
    }

    /**
     * Removes a session from active sessions
     * @param ws The WebSocket connection
     */
    private removeSession(ws: WebSocket): void {
        const sessionKey = this.sessionKeys.get(ws);
        if (sessionKey === undefined) {
            // Socket was never registered; nothing to remove
            return;
        }

        this.activeSessions.delete(sessionKey);
        this.sessionKeys.delete(ws);
    }

    /**
     * Returns the unique key for a session, allocating one on first use.
     *
     * The key is derived from the socket's identity rather than `ws.url` or
     * `ws.toString()`: when those are empty / "[object Object]" every socket
     * maps to the same key and concurrent sessions overwrite each other.
     * @param ws The WebSocket connection
     * @returns A key that is unique per WebSocket object
     */
    private getSessionKey(ws: WebSocket): string {
        let sessionKey = this.sessionKeys.get(ws);
        if (sessionKey === undefined) {
            this.nextSessionId += 1;
            sessionKey = `session-${this.nextSessionId}`;
            this.sessionKeys.set(ws, sessionKey);
        }
        return sessionKey;
    }

    /**
     * Get the Twilio client
     * @returns The Twilio client
     */
    public getTwilioClient(): twilio.Twilio {
        return this.twilioClient;
    }

    /**
     * Get the context service
     * @returns The context service
     */
    public getContextService(): OpenAIContextService {
        return this.contextService;
    }
}
/**
 * Place an outbound call whose instructions are fetched from the callback URL.
 * @param twilioCallbackUrl Base URL Twilio requests `/call/outgoing` on
 * @param toNumber The number to call
 * @param callContext Optional context passed through as a query parameter
 * @returns The SID of the created call
 * @throws Rethrows whatever the Twilio client throws, after logging it
 */
public async makeCall(twilioCallbackUrl: string, toNumber: string, callContext = ''): Promise<string> {
    try {
        // Encode every dynamic value so characters such as '&', '=' or '#'
        // cannot corrupt the query string.
        const query = [
            `apiSecret=${encodeURIComponent(DYNAMIC_API_SECRET)}`,
            'callType=outgoing',
            `callContext=${encodeURIComponent(callContext)}`
        ].join('&');

        // Use the client injected through the constructor, like the other
        // methods of this service, instead of building a second one.
        const call = await this.twilioClient.calls.create({
            to: toNumber,
            from: process.env.TWILIO_NUMBER || '',
            url: `${twilioCallbackUrl}/call/outgoing?${query}`,
        });

        return call.sid;
    } catch (error) {
        console.error(`Error making call: ${error}`);
        throw error;
    }
}
/**
 * Service for processing Twilio events
 */
export class TwilioEventService {
    private readonly callState: CallState;
    private readonly twilioCallService: TwilioCallService;
    private readonly contextService: OpenAIContextService;
    private readonly onForwardAudioToOpenAI: (payload: string) => void;

    /**
     * Create a new Twilio event processor
     * @param callState The state of the call
     * @param twilioCallService The Twilio call service
     * @param contextService The context service
     * @param onForwardAudioToOpenAI Callback for forwarding audio to OpenAI
     */
    constructor(
        callState: CallState,
        twilioCallService: TwilioCallService,
        contextService: OpenAIContextService,
        onForwardAudioToOpenAI: (payload: string) => void,
    ) {
        this.callState = callState;
        this.twilioCallService = twilioCallService;
        this.contextService = contextService;
        this.onForwardAudioToOpenAI = onForwardAudioToOpenAI;
    }

    /**
     * Process a Twilio message. Never rejects: parse failures and handler
     * failures are logged separately so the log states which one happened.
     * @param message The raw message data
     */
    public async processMessage(message: Buffer | string): Promise<void> {
        let data: any;
        try {
            data = JSON.parse(message.toString());
        } catch (error) {
            console.error('Error parsing message:', error, 'Message:', message);
            return;
        }

        try {
            await this.processEvent(data);
        } catch (error) {
            console.error('Error processing Twilio event:', error, 'Message:', message);
        }
    }

    /**
     * Process a Twilio event by dispatching on its `event` field
     * @param data The event data
     */
    private async processEvent(data: any): Promise<void> {
        switch (data.event) {
        case 'media':
            await this.handleMediaEvent(data);
            break;
        case 'start':
            await this.handleStartEvent(data);
            break;
        case 'mark':
            this.handleMarkEvent();
            break;
        default:
            console.error('Received non-media event:', data.event);
            break;
        }
    }

    /**
     * Handle a Twilio media event: record its timestamp and forward the audio
     * @param data The event data
     */
    private async handleMediaEvent(data: any): Promise<void> {
        this.callState.latestMediaTimestamp = data.media.timestamp;
        if (SHOW_TIMING_MATH) {
            // console.log(`Received media message with timestamp: ${this.callState.latestMediaTimestamp}ms`);
        }

        await this.handleFirstMediaEventIfNeeded();
        this.onForwardAudioToOpenAI(data.media.payload);
    }

    /**
     * Handle the first media event if it hasn't been handled yet
     */
    private async handleFirstMediaEventIfNeeded(): Promise<void> {
        if (this.callState.hasSeenMedia) {
            return;
        }

        this.callState.hasSeenMedia = true;

        if (RECORD_CALLS && this.callState.callSid) {
            await this.startCallRecording();
        }
    }

    /**
     * Start recording the call
     */
    private async startCallRecording(): Promise<void> {
        await this.twilioCallService.startRecording(this.callState.callSid);
    }

    /**
     * Handle a Twilio start event: reset per-stream state and set up the call context
     * @param data The event data
     */
    private async handleStartEvent(data: any): Promise<void> {
        const start = data.start;
        // Tolerate a start event without custom parameters instead of throwing
        // a TypeError part-way through initialisation.
        const customParameters = start.customParameters || {};

        this.callState.streamSid = start.streamSid;
        this.callState.responseStartTimestampTwilio = null;
        this.callState.latestMediaTimestamp = 0;

        this.contextService.initializeCallState(this.callState, customParameters.fromNumber, customParameters.toNumber);
        this.contextService.setupConversationContext(this.callState, customParameters.callContext);
        this.callState.callSid = start.callSid;
    }

    /**
     * Handle a Twilio mark event by dropping the oldest queued mark, if any
     */
    private handleMarkEvent(): void {
        if (this.callState.markQueue.length > 0) {
            this.callState.markQueue.shift();
        }
    }
}
/**
 * Service for handling Twilio WebSocket streams
 */
export class TwilioWsService {
    private readonly webSocket: WebSocket;
    private readonly callState: CallState;

    /**
     * Create a new Twilio stream service
     * @param webSocket The Twilio WebSocket connection
     * @param callState The state of the call
     */
    constructor(webSocket: WebSocket, callState: CallState) {
        this.webSocket = webSocket;
        this.callState = callState;
    }

    /**
     * Close the WebSocket connection
     */
    public close(): void {
        if (this.webSocket.readyState === WebSocket.OPEN) {
            this.webSocket.close();
        }
    }

    /**
     * Send a mark event to Twilio and remember it in the mark queue.
     * The mark is queued only when it was actually written to the socket.
     */
    public sendMark(): void {
        const sent = this.sendStreamEvent('mark', { mark: { name: 'responsePart' } });
        if (sent) {
            this.callState.markQueue.push('responsePart');
        }
    }

    /**
     * Send audio data to Twilio
     * @param payload The audio payload to send
     */
    public sendAudio(payload: string): void {
        this.sendStreamEvent('media', { media: { payload } });
    }

    /**
     * Clear the Twilio stream
     */
    public clearStream(): void {
        this.sendStreamEvent('clear');
    }

    /**
     * Set up event handlers for the Twilio WebSocket
     * @param onMessage Callback for handling messages from Twilio
     * @param onClose Callback for when the connection is closed
     */
    public setupEventHandlers(
        onMessage: (message: Buffer | string) => void,
        onClose: () => void
    ): void {
        this.webSocket.on('message', onMessage);
        this.webSocket.on('close', onClose);
    }

    /**
     * Process a Twilio start event
     * @param data The start event data
     */
    public processStartEvent(data: any): void {
        this.callState.streamSid = data.start.streamSid;
        this.callState.responseStartTimestampTwilio = null;
        this.callState.latestMediaTimestamp = 0;
        this.callState.callSid = data.start.callSid;
    }

    /**
     * Process a Twilio mark event by dropping the oldest queued mark, if any
     */
    public processMarkEvent(): void {
        if (this.callState.markQueue.length > 0) {
            this.callState.markQueue.shift();
        }
    }

    /**
     * Process a Twilio media event
     * @param data The media event data
     */
    public processMediaEvent(data: any): void {
        this.callState.latestMediaTimestamp = data.media.timestamp;
        if (SHOW_TIMING_MATH) {
            // console.log(`Received media message with timestamp: ${this.callState.latestMediaTimestamp}ms`);
        }
    }

    /**
     * Serialize and send one stream event, stamped with the current stream SID.
     * Nothing is sent while no stream SID is known or the socket is not open.
     * @param eventName Value of the `event` field
     * @param extra Additional top-level fields of the event
     * @returns true when the event was written to the socket
     */
    private sendStreamEvent(eventName: string, extra: Record<string, unknown> = {}): boolean {
        const streamSid = this.callState.streamSid;
        if (!streamSid || this.webSocket.readyState !== WebSocket.OPEN) {
            return false;
        }

        this.webSocket.send(JSON.stringify({ event: eventName, streamSid, ...extra }));
        return true;
    }
}
/**
 * Sets up the port for the application.
 *
 * Reads `PORT` from the environment (default 3004), validates it and writes
 * the normalized value back to `process.env.PORT`.
 * @returns The port number to listen on
 * @throws Error when `PORT` is not an integer between 1 and 65535
 */
function setupPort(): number {
    const rawPort = process.env.PORT || '3004';
    // Explicit radix: never let the string's prefix choose the base
    const portNumber = Number.parseInt(rawPort, 10);

    // Fail with a clear message here rather than passing NaN or an
    // out-of-range value on to the code that listens on / forwards the port.
    if (!Number.isInteger(portNumber) || portNumber < 1 || portNumber > 65535) {
        throw new Error(`Invalid PORT value "${rawPort}": expected an integer between 1 and 65535`);
    }

    process.env.PORT = String(portNumber);
    return portNumber;
}
/**
 * Application entry point.
 *
 * Validates the environment, builds the Twilio-backed services, and - unless
 * the port is already taken, in which case a retry is scheduled - opens the
 * ngrok tunnel, starts the voice server and the MCP server, and installs the
 * shutdown handlers. Any startup error is logged and exits with code 1.
 */
async function main(): Promise<void> {
    try {
        validateEnvironmentVariables();
        const port = setupPort();

        const client = twilio(process.env.TWILIO_ACCOUNT_SID, process.env.TWILIO_AUTH_TOKEN);
        const sessions = new CallSessionManager(client);
        const callService = new TwilioCallService(client);

        // Port taken: hand over to the retry poller and stop here
        if (await isPortInUse(port)) {
            scheduleServerRetry(port);
            return;
        }

        // Public URL that tunnels to the local port
        const publicUrl = await setupNgrokTunnel(port);

        // Voice server first, then the MCP server
        new VoiceServer(publicUrl, sessions).start();
        await new VoiceCallMcpServer(callService, publicUrl).start();

        // Install shutdown handlers once everything is running
        setupShutdownHandlers();
    } catch (startupError) {
        console.error('Error starting services:', startupError);
        process.exit(1);
    }
}
/**
 * Tell whether a piece of text contains one of the configured goodbye phrases.
 * The text is lower-cased before matching; phrases are matched as substrings.
 * @param text The text to inspect
 * @returns true when any entry of GOODBYE_PHRASES occurs in the text
 */
export const checkForGoodbye = (text: string): boolean => {
    const normalized = text.toLowerCase();

    for (const phrase of GOODBYE_PHRASES) {
        if (normalized.includes(phrase)) {
            return true;
        }
    }

    return false;
};
/**
 * Check whether a TCP port is already bound by trying to listen on it.
 *
 * @param port The port number to probe
 * @returns true when listening fails with EADDRINUSE; false when the probe
 * could bind the port, or when listening failed for any other reason
 */
export async function isPortInUse(port: number): Promise<boolean> {
    return new Promise<boolean>((resolve) => {
        const probe = net.createServer();

        probe.once('error', (err: NodeJS.ErrnoException) => {
            // Only "address in use" counts as occupied
            resolve(err.code === 'EADDRINUSE');
        });

        probe.once('listening', () => {
            // Report "free" only after the probe has released the port, so a
            // caller that binds immediately cannot race the probe.
            probe.close(() => resolve(false));
        });

        probe.listen(port);
    });
}