├── .env.example
├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── assets
└── demo.gif
├── eslint.config.mjs
├── package-lock.json
├── package.json
├── pnpm-lock.yaml
├── src
├── config
│ ├── constants.ts
│ └── prompts.ts
├── handlers
│ └── openai.handler.ts
├── servers
│ ├── mcp.server.ts
│ └── voice.server.ts
├── services
│ ├── openai
│ │ ├── context.service.ts
│ │ ├── event.service.ts
│ │ └── ws.service.ts
│ ├── session-manager.service.ts
│ └── twilio
│ │ ├── call.service.ts
│ │ ├── event.service.ts
│ │ └── ws.service.ts
├── start-all.ts
├── types.ts
└── utils
│ ├── call-utils.ts
│ └── execution-utils.ts
└── tsconfig.json
/.env.example:
--------------------------------------------------------------------------------
1 | # Server configuration
2 | PORT=3004
3 |
4 | # Twilio API credentials
5 | TWILIO_ACCOUNT_SID=your_twilio_account_sid
6 | TWILIO_AUTH_TOKEN=your_twilio_auth_token
7 | TWILIO_NUMBER=your_twilio_number
8 | # OpenAI API key
9 | OPENAI_API_KEY=your_openai_api_key
10 | OPENAI_WEBSOCKET_URL=wss://api.openai.com/v1/realtime?model=gpt-4o-mini-realtime-preview
11 |
12 | # Ngrok authentication token
13 | NGROK_AUTHTOKEN=your_ngrok_authtoken
14 |
15 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Dependency directories
2 | node_modules/
3 | jspm_packages/
4 |
5 | # Build outputs
6 | dist/
7 | build/
8 | out/
9 | *.tsbuildinfo
10 |
11 | # Environment variables
12 | .env
13 | .env.local
14 | .env.development.local
15 | .env.test.local
16 | .env.production.local
17 |
18 | # Logs
19 | logs
20 | *.log
21 | npm-debug.log*
22 | yarn-debug.log*
23 | yarn-error.log*
24 | lerna-debug.log*
25 |
26 | # Coverage directory used by tools like istanbul
27 | coverage/
28 | *.lcov
29 |
30 | # Cache directories
31 | .npm
32 | .eslintcache
33 | .stylelintcache
34 | .rpt2_cache/
35 | .rts2_cache_cjs/
36 | .rts2_cache_es/
37 | .rts2_cache_umd/
38 |
39 | # Runtime data
40 | pids
41 | *.pid
42 | *.seed
43 | *.pid.lock
44 |
45 | # IDE and editor folders
46 | .idea/
47 | .vscode/
48 | *.swp
49 | *.swo
50 | .DS_Store
51 | .directory
52 | .project
53 | .settings/
54 | .classpath
55 | .c9/
56 | *.launch
57 | .settings/
58 | *.sublime-workspace
59 |
60 |
61 | # ngrok
62 | .ngrok/
63 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 LukasK
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Voice Call MCP Server
2 |
3 | A Model Context Protocol (MCP) server that enables Claude and other AI assistants to initiate and manage voice calls using Twilio and OpenAI (GPT-4o Realtime model).
4 |
5 | Use this as a base to kick-start your AI-powered voice calling explorations, save time and develop additional functionality on top of it.
6 |
7 | ![Demo](./assets/demo.gif)
8 |
9 |
10 | ## Sequence Diagram
11 |
12 | ```mermaid
13 | sequenceDiagram
14 | participant AI as AI Assistant (e.g., Claude)
15 | participant MCP as MCP Server
16 | participant Twilio as Twilio
17 | participant Phone as Destination Phone
18 | participant OpenAI as OpenAI
19 |
20 | AI->>MCP: 1) Initiate outbound call request<br/>(POST /calls)
21 | MCP->>Twilio: 2) Place outbound call via Twilio API
22 | Twilio->>Phone: 3) Ring the destination phone
23 | Twilio->>MCP: 4) Call status updates & audio callbacks (webhooks)
24 | MCP->>OpenAI: 5) Forward real-time audio to OpenAI's realtime model
25 | OpenAI->>MCP: 6) Return voice stream
26 | MCP->>Twilio: 7) Send voice stream
27 | Twilio->>Phone: 8) Forward voice stream
28 | Note over Phone: Two-way conversation continues<br/>until the call ends
29 | ```
30 |
31 |
32 | ## Features
33 |
34 | - Make outbound phone calls via Twilio 📞
35 | - Process call audio in real-time with GPT-4o Realtime model 🎙️
36 | - Real-time language switching during calls 🌐
37 | - Pre-built prompts for common calling scenarios (like restaurant reservations) 🍽️
38 | - Automatic public URL tunneling with ngrok 🔄
39 | - Secure handling of credentials 🔒
40 |
41 | ## Why MCP?
42 |
43 | The Model Context Protocol (MCP) bridges the gap between AI assistants and real-world actions. By implementing MCP, this server allows AI models like Claude to:
44 |
45 | 1. Initiate actual phone calls on behalf of users
46 | 2. Process and respond to real-time audio conversations
47 | 3. Execute complex tasks requiring voice communication
48 |
49 | This open-source implementation provides transparency and customizability, allowing developers to extend functionality while maintaining control over their data and privacy.
50 |
51 | ## Requirements
52 |
53 | - Node.js >= 22
54 | - If you need to update Node.js, we recommend using `nvm` (Node Version Manager):
55 | ```bash
56 | nvm install 22
57 | nvm use 22
58 | ```
59 | - Twilio account with API credentials
60 | - OpenAI API key
61 | - Ngrok Authtoken
62 |
63 | ## Installation
64 |
65 | ### Manual Installation
66 |
67 | 1. Clone the repository
68 | ```bash
69 | git clone https://github.com/lukaskai/voice-call-mcp-server.git
70 | cd voice-call-mcp-server
71 | ```
72 |
73 | 2. Install dependencies and build
74 | ```bash
75 | npm install
76 | npm run build
77 | ```
78 |
79 | ## Configuration
80 |
81 | The server requires several environment variables:
82 |
83 | - `TWILIO_ACCOUNT_SID`: Your Twilio account SID
84 | - `TWILIO_AUTH_TOKEN`: Your Twilio auth token
85 | - `TWILIO_NUMBER`: Your Twilio number
86 | - `OPENAI_API_KEY`: Your OpenAI API key
87 | - `NGROK_AUTHTOKEN`: Your ngrok authtoken
88 | - `RECORD_CALLS`: Set to "true" to record calls (optional)
89 |
90 | ### Claude Desktop Configuration
91 |
92 | To use this server with Claude Desktop, add the following to your configuration file:
93 |
94 | **macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`
95 |
96 | **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
97 |
98 | ```json
99 | {
100 | "mcpServers": {
101 | "voice-call": {
102 | "command": "node",
103 | "args": ["/path/to/your/voice-call-mcp-server/dist/start-all.cjs"],
104 | "env": {
105 | "TWILIO_ACCOUNT_SID": "your_account_sid",
106 | "TWILIO_AUTH_TOKEN": "your_auth_token",
107 | "TWILIO_NUMBER": "your_e.164_format_number",
108 | "OPENAI_API_KEY": "your_openai_api_key",
109 | "NGROK_AUTHTOKEN": "your_ngrok_authtoken"
110 | }
111 | }
112 | }
113 | }
114 | ```
115 |
116 | After that, restart Claude Desktop to reload the configuration.
117 | If connected, you should see Voice Call under the 🔨 menu.
118 |
119 | ## Example Interactions with Claude
120 |
121 | Here are some natural ways to interact with the server through Claude:
122 |
123 | 1. Simple call:
124 | ```
125 | Can you call +1-123-456-7890 and let them know I'll be 15 minutes late for our meeting?
126 | ```
127 |
128 | 2. Restaurant reservation:
129 | ```
130 | Please call Delicious Restaurant at +1-123-456-7890 and make a reservation for 4 people tonight at 7:30 PM. Please speak in German.
131 | ```
132 |
133 | 3. Appointment scheduling:
134 | ```
135 | Please call Expert Dental NYC (+1-123-456-7899) and reschedule my Monday appointment to next Friday between 4–6pm.
136 | ```
137 |
138 | ## Important Notes
139 |
140 | 1. **Phone Number Format**: All phone numbers must be in E.164 format (e.g., +11234567890)
141 | 2. **Rate Limits**: Be aware of your Twilio and OpenAI account's rate limits and pricing
142 | 3. **Voice Conversations**: The AI will handle natural conversations in real-time
143 | 4. **Call Duration**: Be mindful of call durations as they affect OpenAI API and Twilio costs
144 | 5. **Public Exposure**: Be aware that the ngrok tunnel exposes your server publicly for Twilio to reach it (though with a random URL and protected by a random secret)
145 |
146 | ## Troubleshooting
147 |
148 | Common error messages and solutions:
149 |
150 | 1. "Phone number must be in E.164 format"
151 | - Make sure the phone number starts with "+" and the country code
152 |
153 | 2. "Invalid credentials"
154 | - Double-check your TWILIO_ACCOUNT_SID and TWILIO_AUTH_TOKEN. You can copy them from the [Twilio Console](https://console.twilio.com)
155 |
156 | 3. "OpenAI API error"
157 | - Verify your OPENAI_API_KEY is correct and has sufficient credits
158 |
159 | 4. "Ngrok tunnel failed to start"
160 | - Ensure your NGROK_AUTHTOKEN is valid and not expired
161 |
162 | 5. "OpenAI Realtime does not detect the end of voice input, or is lagging."
163 | - Sometimes, there might be voice encoding issues between Twilio and the receiver's network operator. Try using a different receiver.
164 |
165 | ## Contributing
166 |
167 | Contributions are welcome! Here are some areas we're looking to improve:
168 |
169 | - Implement support for multiple AI models beyond the current implementation
170 | - Add database integration to store conversation history locally and make it accessible for AI context
171 | - Improve latency and response times to enhance call experiences
172 | - Enhance error handling and recovery mechanisms
173 | - Add more pre-built conversation templates for common scenarios
174 | - Implement improved call monitoring and analytics
175 |
176 | If you'd like to contribute, please open an issue to discuss your ideas before submitting a pull request.
177 |
178 | ## License
179 |
180 | This project is licensed under the MIT License - see the LICENSE file for details.
181 |
182 | ## Security
183 |
184 | Please do not include any sensitive information (like phone numbers or API credentials) in GitHub issues or pull requests. This server handles sensitive communications; deploy it responsibly and ensure all credentials are kept secure.
185 |
186 |
187 | ## Time For a New Mission?
188 |
189 | We’re hiring engineers to build at the frontier of voice AI — and bake it into a next-gen telco.
190 |
191 | Curious? Head to [careers.popcorn.space](https://careers.popcorn.space/apply) 🍿 !
192 |
--------------------------------------------------------------------------------
/assets/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/popcornspace/voice-call-mcp-server/13e3bd99ed2d6859a18dba9c030564da90986894/assets/demo.gif
--------------------------------------------------------------------------------
/eslint.config.mjs:
--------------------------------------------------------------------------------
1 | import eslint from '@eslint/js';
2 | import tseslint from 'typescript-eslint';
3 |
4 | export default tseslint.config(
5 | eslint.configs.recommended,
6 | ...tseslint.configs.recommended,
7 | ...tseslint.configs.stylistic,
8 | {
9 | plugins: {
10 | },
11 | rules: {
12 | '@typescript-eslint/no-explicit-any': 'off',
13 | '@typescript-eslint/prefer-for-of': 'off',
14 | 'no-trailing-spaces': 'error', // Disallow trailing spaces
15 | 'eol-last': ['error', 'always'], // Enforce newline at end of file
16 | 'indent': ['error', 4], // Enforce 4 spaces for indentation
17 | 'quotes': ['error', 'single'], // Enforce single quotes
18 | 'semi': ['error', 'always'], // Enforce semicolons
19 | },
20 | }
21 | );
22 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "voice-call-mcp-server",
3 | "main": "dist/start-all.cjs",
4 | "type": "module",
5 | "scripts": {
6 | "start-all": "tsx src/start-all.ts",
7 | "start": "node dist/start-all.cjs",
8 | "build": "npm-run-all clean build:app",
9 | "build:app": "tsup src/start-all.ts",
10 | "clean": "rimraf dist"
11 | },
12 | "dependencies": {
13 | "@modelcontextprotocol/sdk": "1.8.0",
14 | "@ngrok/ngrok": "^1.4.1",
15 | "axios": "^1.6.8",
16 | "body-parser": "^1.20.2",
17 | "colors": "^1.4.0",
18 | "cors": "^2.8.5",
19 | "dotenv": "^16.4.5",
20 | "eslint-plugin-simple-import-sort": "^12.1.1",
21 | "express": "^4.18.3",
22 | "express-ws": "^5.0.2",
23 | "form-data": "^4.0.0",
24 | "google-protobuf": "^3.21.4",
25 | "httpdispatcher": "^2.2.0",
26 | "ngrok": "5.0.0-beta.2",
27 | "node-fetch": "^2.7.0",
28 | "node-vad": "^1.1.4",
29 | "openai": "^4.85.1",
30 | "peerjs": "^1.5.4",
31 | "perf_hooks": "^0.0.1",
32 | "protobufjs": "^7.4.0",
33 | "twilio": "^5.0.1",
34 | "uuid": "^9.0.1",
35 | "websocket": "^1.0.28",
36 | "zod": "^3.22.4"
37 | },
38 | "devDependencies": {
39 | "@eslint/js": "^9.21.0",
40 | "@types/cors": "^2.8.17",
41 | "@types/express": "^4.17.21",
42 | "@types/express-ws": "^3.0.4",
43 | "@types/node": "^20.11.30",
44 | "@types/uuid": "^9.0.8",
45 | "@types/websocket": "^1.0.10",
46 | "@types/ws": "^8.5.10",
47 | "dotenv": "^16.4.5",
48 | "eslint": "^9.21.0",
49 | "globals": "^16.0.0",
50 | "npm-run-all": "^4.1.5",
51 | "rimraf": "^5.0.5",
52 | "tsup": "^8.0.2",
53 | "tsx": "^4.7.1",
54 | "typescript": "^5.4.2",
55 | "typescript-eslint": "^8.24.1"
56 | },
57 | "author": "Popcorn",
58 | "license": "MIT",
59 | "packageManager": "pnpm@10.7.0+sha512.6b865ad4b62a1d9842b61d674a393903b871d9244954f652b8842c2b553c72176b278f64c463e52d40fff8aba385c235c8c9ecf5cc7de4fd78b8bb6d49633ab6"
60 | }
61 |
--------------------------------------------------------------------------------
/src/config/constants.ts:
--------------------------------------------------------------------------------
/**
 * OpenAI Realtime event types that are of interest when logging is enabled.
 */
export const LOG_EVENT_TYPES = [
    'error',
    'session.created',
    'response.audio.delta',
    'response.audio_transcript.done',
    'conversation.item.input_audio_transcription.completed',
];

/**
 * Returns true when at least one of the named environment variables is set
 * to the exact string "true".
 * @param names Environment variable names to check, in order
 */
const isEnvFlagEnabled = (...names: string[]): boolean =>
    names.some((name) => process.env[name] === 'true');

/**
 * Secret generated once per process; the voice server compares it against the
 * value supplied on its HTTP and WebSocket routes.
 *
 * NOTE(review): Math.random() is not a cryptographically secure source. Since
 * this value guards publicly reachable endpoints, consider switching to
 * crypto.randomBytes / crypto.randomUUID.
 */
export const DYNAMIC_API_SECRET = Math.random().toString(36).substring(2, 15) + Math.random().toString(36).substring(2, 15);

/** Enables timing diagnostics in the OpenAI services. */
export const SHOW_TIMING_MATH = false;

/** Voice name passed to the OpenAI session configuration. */
export const VOICE = 'sage';

/**
 * Whether calls should be recorded.
 *
 * The documented variable is RECORD_CALLS; the legacy name RECORD is still
 * honored so existing deployments keep working.
 *
 * NOTE(review): this is evaluated at import time - confirm the environment
 * (e.g. dotenv) is loaded before this module is first imported.
 */
export const RECORD_CALLS = isEnvFlagEnabled('RECORD_CALLS', 'RECORD');

/** Phrases that, when found in a transcript, indicate the conversation is ending. */
export const GOODBYE_PHRASES = ['bye', 'goodbye', 'have a nice day', 'see you', 'take care'];
14 |
--------------------------------------------------------------------------------
/src/config/prompts.ts:
--------------------------------------------------------------------------------
1 | import { CallState } from '../types.js';
2 |
/**
 * Builds the instruction prompt for an outbound call.
 *
 * The prompt embeds the caller's own number (callState.fromNumber) and ends
 * with the task-specific context supplied by the user, if any.
 *
 * @param callState State of the call; only fromNumber is read here
 * @param callContext Optional task description appended at the end of the prompt
 * @returns The full prompt text
 */
export const generateOutboundCallContext = (callState: CallState, callContext?: string): string => {
    return `Please refer to phone call transcripts.
Stay concise and short.
You are assistant (if asked, your phone number with country code is: ${callState.fromNumber}). You are making an outbound call.
Be friendly and speak in human short sentences. Start conversation with how are you. Do not speak in bullet points. Ask one question at a time, tell one sentence at a time.
After successful task completion, say goodbye and end the conversation.
You ARE NOT a receptionist, NOT an administrator, NOT a person making reservation.
You do not provide any other info, which is not related to the goal. You are calling solely to achieve your tasks.
You are the customer making a request, not the restaurant staff.
YOU ARE STRICTLY THE ONE MAKING THE REQUEST (and not the one receiving). YOU MUST ACHIEVE YOUR GOAL AS AN ASSISTANT AND PERFORM TASK.
Be focused solely on your task:
${callContext ?? ''}`;
};
16 |
--------------------------------------------------------------------------------
/src/handlers/openai.handler.ts:
--------------------------------------------------------------------------------
1 | import { WebSocket } from 'ws';
2 | import twilio from 'twilio';
3 | import dotenv from 'dotenv';
4 | import { CallState, CallType, OpenAIConfig } from '../types.js';
5 | import { VOICE } from '../config/constants.js';
6 | import { OpenAIContextService } from '../services/openai/context.service.js';
7 | import { OpenAIWsService } from '../services/openai/ws.service.js';
8 | import { TwilioWsService } from '../services/twilio/ws.service.js';
9 | import { OpenAIEventService } from '../services/openai/event.service.js';
10 | import { TwilioEventService } from '../services/twilio/event.service.js';
11 | import { SessionManagerService } from '../services/session-manager.service.js';
12 | import { TwilioCallService } from '../services/twilio/call.service.js';
13 |
14 | dotenv.config();
15 |
/**
 * Bridges a single Twilio media stream with a single OpenAI WebSocket session.
 *
 * Construction wires everything together: it creates the call state, the
 * Twilio and OpenAI transport services, the two event processors, registers
 * the Twilio socket handlers and finally opens the OpenAI connection.
 */
export class OpenAICallHandler {
    private readonly callState: CallState;
    private readonly twilioStream: TwilioWsService;
    private readonly twilioCallService: TwilioCallService;
    private readonly openAIService: OpenAIWsService;
    private readonly openAIEventProcessor: OpenAIEventService;
    private readonly twilioEventProcessor: TwilioEventService;

    /**
     * @param ws WebSocket carrying the Twilio media stream
     * @param callType Type of call this handler serves
     * @param twilioClient Twilio REST client used for call control
     * @param contextService Service handed to the Twilio event processor
     */
    constructor(ws: WebSocket, callType: CallType, twilioClient: twilio.Twilio, contextService: OpenAIContextService) {
        this.callState = new CallState(callType);

        // Twilio side: media stream wrapper and REST call control.
        this.twilioStream = new TwilioWsService(ws, this.callState);
        this.twilioCallService = new TwilioCallService(twilioClient);

        // OpenAI side: configuration comes from the environment with defaults.
        const realtimeConfig: OpenAIConfig = {
            apiKey: process.env.OPENAI_API_KEY || '',
            websocketUrl: process.env.OPENAI_WEBSOCKET_URL || 'wss://api.openai.com/v1/realtime?model=gpt-4o-mini-realtime-preview',
            voice: VOICE,
            temperature: 0.6
        };
        this.openAIService = new OpenAIWsService(realtimeConfig);

        // OpenAI events: may end the call, push audio to Twilio, or interrupt playback.
        this.openAIEventProcessor = new OpenAIEventService(
            this.callState,
            () => this.endCall(),
            (audioPayload) => this.twilioStream.sendAudio(audioPayload),
            () => this.handleSpeechStartedEvent()
        );

        // Twilio events: incoming audio is forwarded to OpenAI.
        this.twilioEventProcessor = new TwilioEventService(
            this.callState,
            this.twilioCallService,
            contextService,
            (audioPayload) => this.openAIService.sendAudio(audioPayload),
        );

        this.setupEventHandlers();
        this.initializeOpenAI();
    }

    /**
     * Ends the call through Twilio when a call SID is known, then closes both
     * WebSockets after a 5 second delay.
     */
    private endCall(): void {
        const { callSid } = this.callState;
        if (callSid) {
            this.twilioCallService.endCall(callSid);
        }

        setTimeout(() => this.closeWebSockets(), 5000);
    }

    /**
     * Closes the Twilio stream first, then the OpenAI connection.
     */
    private closeWebSockets(): void {
        this.twilioStream.close();
        this.openAIService.close();
    }

    /**
     * Opens the OpenAI connection. Incoming messages go to the OpenAI event
     * processor; once the socket is open the session is initialized with the
     * call context after a 100 ms delay; errors are logged.
     */
    private initializeOpenAI(): void {
        const onMessage = (data: Parameters<OpenAIEventService['processMessage']>[0]) =>
            this.openAIEventProcessor.processMessage(data);

        const onOpen = (): void => {
            setTimeout(() => this.openAIService.initializeSession(this.callState.callContext), 100);
        };

        this.openAIService.initialize(
            onMessage,
            onOpen,
            (error) => console.error('Error in the OpenAI WebSocket:', error)
        );
    }

    /**
     * Invoked by the OpenAI event processor on a speech-started event.
     *
     * Does nothing unless marks are queued, a response start timestamp is
     * recorded and an assistant item id is known. Otherwise it truncates the
     * assistant response at the elapsed time, clears the Twilio stream and
     * resets the per-response state.
     */
    private handleSpeechStartedEvent(): void {
        const {
            markQueue,
            responseStartTimestampTwilio: startedAt,
            lastAssistantItemId: itemId,
            latestMediaTimestamp,
        } = this.callState;

        if (markQueue.length === 0) {
            return;
        }
        if (startedAt === null) {
            return;
        }
        if (!itemId) {
            return;
        }

        this.openAIService.truncateAssistantResponse(itemId, latestMediaTimestamp - startedAt);
        this.twilioStream.clearStream();
        this.resetResponseState();
    }

    /**
     * Clears the mark queue, the last assistant item id and the response
     * start timestamp.
     */
    private resetResponseState(): void {
        const state = this.callState;
        state.markQueue = [];
        state.lastAssistantItemId = null;
        state.responseStartTimestampTwilio = null;
    }

    /**
     * Registers the Twilio socket handlers: messages go to the Twilio event
     * processor, and the second callback closes the OpenAI connection.
     */
    private setupEventHandlers(): void {
        this.twilioStream.setupEventHandlers(
            async (incoming) => {
                return await this.twilioEventProcessor.processMessage(incoming);
            },
            async () => {
                this.openAIService.close();
            }
        );
    }
}
114 |
/**
 * Entry point used by the voice server to open call sessions.
 * All work is delegated to a SessionManagerService built from the Twilio client.
 */
export class CallSessionManager {
    private readonly sessionManager: SessionManagerService;

    /**
     * @param twilioClient Twilio client handed to the underlying session manager
     */
    constructor(twilioClient: twilio.Twilio) {
        const delegate = new SessionManagerService(twilioClient);
        this.sessionManager = delegate;
    }

    /**
     * Opens a session for the given socket by forwarding to the session manager.
     * @param ws WebSocket connection of the call
     * @param callType Type of the call
     */
    public createSession(ws: WebSocket, callType: CallType): void {
        const { sessionManager } = this;
        sessionManager.createSession(ws, callType);
    }
}
134 |
--------------------------------------------------------------------------------
/src/servers/mcp.server.ts:
--------------------------------------------------------------------------------
1 | import { McpServer, ResourceTemplate } from '@modelcontextprotocol/sdk/server/mcp.js';
2 | import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
3 | import { z } from 'zod';
4 | import { TwilioCallService } from '../services/twilio/call.service.js';
5 |
/**
 * MCP server exposing voice-call functionality over stdio.
 *
 * Registers one tool (trigger-call), one resource (get-latest-call, currently
 * a placeholder) and one prompt (make-restaurant-reservation).
 */
export class VoiceCallMcpServer {
    private readonly server: McpServer;
    private readonly twilioCallService: TwilioCallService;
    private readonly twilioCallbackUrl: string;

    /**
     * @param twilioCallService Service used to place calls
     * @param twilioCallbackUrl URL passed to makeCall as the callback base
     */
    constructor(twilioCallService: TwilioCallService, twilioCallbackUrl: string) {
        this.twilioCallbackUrl = twilioCallbackUrl;
        this.twilioCallService = twilioCallService;

        this.server = new McpServer({
            name: 'Voice Call MCP Server',
            version: '1.0.0',
            description: 'MCP server that provides tools for initiating phone calls via Twilio'
        });

        this.registerTools();
        this.registerResources();
        this.registerPrompts();
    }

    /**
     * Registers the trigger-call tool. On success the tool returns the call
     * SID as JSON text; on failure it returns a JSON error payload flagged
     * with isError instead of throwing.
     */
    private registerTools(): void {
        this.server.tool(
            'trigger-call',
            'Trigger an outbound phone call via Twilio',
            {
                toNumber: z.string().describe('The phone number to call'),
                callContext: z.string().describe('Context for the call')
            },
            async ({ toNumber, callContext }) => {
                try {
                    const callSid = await this.twilioCallService.makeCall(this.twilioCallbackUrl, toNumber, callContext);

                    return {
                        content: [{
                            type: 'text',
                            text: JSON.stringify({
                                status: 'success',
                                message: 'Call triggered successfully',
                                callSid: callSid
                            })
                        }]
                    };
                } catch (error) {
                    // Thrown values are not guaranteed to be Error instances.
                    const errorMessage = error instanceof Error ? error.message : String(error);

                    return {
                        content: [{
                            type: 'text',
                            text: JSON.stringify({
                                status: 'error',
                                message: `Failed to trigger call: ${errorMessage}`
                            })
                        }],
                        isError: true
                    };
                }
            }
        );
    }

    /**
     * Registers the get-latest-call resource. It currently returns a fixed
     * placeholder payload; retrieving the real transcription is still a TODO.
     */
    private registerResources(): void {
        this.server.resource(
            'get-latest-call',
            new ResourceTemplate('call://transcriptions', { list: undefined }),
            async () => {
                // TODO: get call transcription
                return {
                    contents: [{
                        text: JSON.stringify({
                            transcription: '{}',
                            status: 'completed',
                        }),
                        uri: 'call://transcriptions/latest',
                        mimeType: 'application/json'
                    }]
                };
            }
        );
    }

    /**
     * Registers the make-restaurant-reservation prompt, which renders a single
     * user message from the reservation details and the TWILIO_NUMBER
     * environment variable.
     */
    private registerPrompts(): void {
        this.server.prompt(
            'make-restaurant-reservation',
            'Create a prompt for making a restaurant reservation by phone',
            {
                restaurantNumber: z.string().describe('The phone number of the restaurant'),
                peopleNumber: z.string().describe('The number of people in the party'),
                date: z.string().describe('Date of the reservation'),
                time: z.string().describe('Preferred time for the reservation')
            },
            ({ restaurantNumber, peopleNumber, date, time }) => {
                return {
                    messages: [{
                        role: 'user',
                        content: {
                            type: 'text',
                            text: `You are calling a restaurant to book a table for ${peopleNumber} people on ${date} at ${time}. Call the restaurant at ${restaurantNumber} from ${process.env.TWILIO_NUMBER}.`
                        }
                    }]
                };
            }
        );
    }

    /**
     * Connects the MCP server to a stdio transport.
     * @returns A promise that settles once the connection attempt completes
     */
    public async start(): Promise<void> {
        const transport = new StdioServerTransport();
        await this.server.connect(transport);
    }
}
115 |
--------------------------------------------------------------------------------
/src/servers/voice.server.ts:
--------------------------------------------------------------------------------
1 | import dotenv from 'dotenv';
2 | import express, { Response } from 'express';
3 | import VoiceResponse from 'twilio/lib/twiml/VoiceResponse.js';
4 | import ExpressWs from 'express-ws';
5 | import { WebSocket } from 'ws';
6 | import { CallType } from '../types.js';
7 | import { DYNAMIC_API_SECRET } from '../config/constants.js';
8 | import { CallSessionManager } from '../handlers/openai.handler.js';
9 | dotenv.config();
10 |
/**
 * Express + WebSocket server that receives the callbacks for outbound calls.
 *
 * Two routes are exposed, both guarded by DYNAMIC_API_SECRET:
 * - POST /call/outgoing responds with TwiML connecting the call to a stream
 * - WS /call/connection-outgoing/:secret accepts that stream and opens a session
 */
export class VoiceServer {
    private app: express.Application & { ws: any };
    private port: number;
    private sessionManager: CallSessionManager;
    private callbackUrl: string;

    /**
     * @param callbackUrl Public base URL of this server; used to build the stream URL
     * @param sessionManager Manager that opens a session per accepted stream
     */
    constructor(callbackUrl: string, sessionManager: CallSessionManager) {
        this.callbackUrl = callbackUrl;
        // Explicit radix so PORT is always parsed as a decimal number.
        this.port = parseInt(process.env.PORT || '3004', 10);
        this.app = ExpressWs(express()).app;
        this.sessionManager = sessionManager;
        this.configureMiddleware();
        this.setupRoutes();
    }

    /**
     * Installs JSON and URL-encoded body parsers.
     */
    private configureMiddleware(): void {
        this.app.use(express.json());
        this.app.use(express.urlencoded({ extended: false }));
    }

    /**
     * Registers the HTTP and WebSocket routes.
     */
    private setupRoutes(): void {
        this.app.post('/call/outgoing', this.handleOutgoingCall.bind(this));
        this.app.ws('/call/connection-outgoing/:secret', this.handleOutgoingConnection.bind(this));
    }

    /**
     * Handles POST /call/outgoing.
     *
     * Rejects the request with 401 unless the apiSecret query parameter matches
     * DYNAMIC_API_SECRET. Otherwise it answers with TwiML that connects the call
     * to the WebSocket route and passes the numbers and call context along as
     * stream parameters.
     *
     * @param req Incoming request; reads query.apiSecret, query.callContext, body.From and body.To
     * @param res Response used to send either the 401 JSON error or the TwiML
     */
    private async handleOutgoingCall(req: express.Request, res: Response): Promise<void> {
        const apiSecret = req.query.apiSecret?.toString();

        if (apiSecret !== DYNAMIC_API_SECRET) {
            res.status(401).json({ error: 'Unauthorized: Invalid or missing API secret' });
            return;
        }

        const fromNumber = req.body.From;
        const toNumber = req.body.To;
        const callContext = req.query.callContext?.toString();

        const twiml = new VoiceResponse();
        const connect = twiml.connect();

        // The stream URL carries the secret so the WebSocket route can verify it.
        const stream = connect.stream({
            url: `${this.callbackUrl.replace('https://', 'wss://')}/call/connection-outgoing/${apiSecret}`,
        });

        stream.parameter({ name: 'fromNumber', value: fromNumber });
        stream.parameter({ name: 'toNumber', value: toNumber });
        stream.parameter({ name: 'callContext', value: callContext });

        res.writeHead(200, { 'Content-Type': 'text/xml' });
        res.end(twiml.toString());
    }

    /**
     * Handles the WebSocket route. Closes the socket with code 1008 when the
     * secret in the path does not match; otherwise opens an outbound session.
     *
     * @param ws Accepted WebSocket connection
     * @param req Upgrade request; reads params.secret
     */
    private handleOutgoingConnection(ws: WebSocket, req: express.Request): void {
        if (req.params.secret !== DYNAMIC_API_SECRET) {
            ws.close(1008, 'Unauthorized: Invalid or missing API secret');
            return;
        }

        this.sessionManager.createSession(ws, CallType.OUTBOUND);
    }

    /**
     * Starts listening on the configured port.
     */
    public start(): void {
        this.app.listen(this.port);
    }
}
76 |
--------------------------------------------------------------------------------
/src/services/openai/context.service.ts:
--------------------------------------------------------------------------------
1 | import { generateOutboundCallContext } from '../../config/prompts.js';
2 | import { CallState, ConversationMessage } from '../../types.js';
3 |
/**
 * Fills a CallState with the phone numbers and the opening conversation
 * (system prompt plus first user message) for an outbound call.
 */
export class OpenAIContextService {

    /**
     * Stores the caller and callee numbers on the call state.
     * @param callState State object to update
     * @param fromNumber Number the call is placed from
     * @param toNumber Number being called
     */
    public initializeCallState(callState: CallState, fromNumber: string, toNumber: string): void {
        Object.assign(callState, { fromNumber, toNumber });
    }

    /**
     * Sets the initial message and the generated call context on the state and
     * resets the conversation history to exactly two entries: the system
     * message with the context, followed by the initial user message.
     * @param callState State object to update
     * @param callContext Optional task description forwarded to the prompt generator
     */
    public setupConversationContext(callState: CallState, callContext?: string): void {
        // The initial message is assigned before the prompt is generated.
        callState.initialMessage = 'Hello!';

        const systemPrompt = generateOutboundCallContext(callState, callContext);
        callState.callContext = systemPrompt;

        const history: ConversationMessage[] = [
            { role: 'system', content: systemPrompt },
            { role: 'user', content: callState.initialMessage },
        ];
        callState.conversationHistory = history;
    }

}
31 |
--------------------------------------------------------------------------------
/src/services/openai/event.service.ts:
--------------------------------------------------------------------------------
1 | import { WebSocket } from 'ws';
2 | import { CallState } from '../../types.js';
3 | import { LOG_EVENT_TYPES, SHOW_TIMING_MATH } from '../../config/constants.js';
4 | import { checkForGoodbye } from '../../utils/call-utils.js';
5 |
6 | /**
7 | * Service for processing OpenAI events
8 | */
9 | export class OpenAIEventService {
10 | private readonly callState: CallState;
11 | private readonly onEndCall: () => void;
12 | private readonly onSendAudioToTwilio: (payload: string) => void;
13 | private readonly onTruncateResponse: () => void;
14 |
15 | /**
16 | * Create a new OpenAI event processor
17 | * @param callState The state of the call
18 | * @param onEndCall Callback for ending the call
19 | * @param onSendAudioToTwilio Callback for sending audio to Twilio
20 | * @param onTruncateResponse Callback for truncating the response
21 | */
22 | constructor(
23 | callState: CallState,
24 | onEndCall: () => void,
25 | onSendAudioToTwilio: (payload: string) => void,
26 | onTruncateResponse: () => void
27 | ) {
28 | this.callState = callState;
29 | this.onEndCall = onEndCall;
30 | this.onSendAudioToTwilio = onSendAudioToTwilio;
31 | this.onTruncateResponse = onTruncateResponse;
32 | }
33 |
34 | /**
35 | * Process an OpenAI message
36 | * @param data The message data
37 | */
38 | public processMessage(data: WebSocket.Data): void {
39 | try {
40 | const response = JSON.parse(data.toString());
41 |
42 | if (LOG_EVENT_TYPES.includes(response.type)) {
43 | // console.log(`Received event: ${response.type}`, response);
44 | }
45 |
46 | this.processEvent(response);
47 | } catch (error) {
48 | console.error('Error processing OpenAI message:', error, 'Raw message:', data);
49 | }
50 | }
51 |
52 | /**
53 | * Process an OpenAI event
54 | * @param response The event data
55 | */
56 | private processEvent(response: any): void {
57 | switch (response.type) {
58 | case 'conversation.item.input_audio_transcription.completed':
59 | this.handleTranscriptionCompleted(response.transcript);
60 | break;
61 | case 'response.audio_transcript.done':
62 | this.handleAudioTranscriptDone(response.transcript);
63 | break;
64 | case 'response.audio.delta':
65 | if (response.delta) {
66 | this.handleAudioDelta(response);
67 | }
68 | break;
69 | case 'input_audio_buffer.speech_started':
70 | this.onTruncateResponse();
71 | break;
72 | }
73 | }
74 |
/**
 * Record a completed caller transcription and end the call if it contains a goodbye phrase.
 * Empty transcriptions are ignored.
 * @param transcription The transcribed caller speech
 */
private handleTranscriptionCompleted(transcription: string): void {
    if (transcription) {
        const userTurn = { role: 'user' as const, content: transcription };
        this.callState.conversationHistory.push(userTurn);

        const saidGoodbye = checkForGoodbye(transcription);
        if (saidGoodbye) {
            this.onEndCall();
        }
    }
}
93 |
/**
 * Append a finished assistant transcript to the conversation history.
 * Empty transcripts are ignored.
 * @param transcript The transcript of the assistant's audio response
 */
private handleAudioTranscriptDone(transcript: string): void {
    if (transcript) {
        const assistantTurn = { role: 'assistant' as const, content: transcript };
        this.callState.conversationHistory.push(assistantTurn);
    }
}
108 |
/**
 * Forward an audio delta to Twilio and track where the current response started.
 *
 * The start timestamp is only captured while it is unset (null/undefined). A plain
 * truthiness check would treat a legitimate timestamp of 0 as "unset" and overwrite
 * the start mark on later deltas.
 * @param response The `response.audio.delta` event
 */
private handleAudioDelta(response: any): void {
    this.onSendAudioToTwilio(response.delta);

    if (this.callState.responseStartTimestampTwilio == null) {
        this.callState.responseStartTimestampTwilio = this.callState.latestMediaTimestamp;
        if (SHOW_TIMING_MATH) {
            // console.log(`Setting start timestamp for new response: ${this.callState.responseStartTimestampTwilio}ms`);
        }
    }

    // Remember which assistant item the audio belongs to.
    if (response.item_id) {
        this.callState.lastAssistantItemId = response.item_id;
    }
}
127 | }
128 |
--------------------------------------------------------------------------------
/src/services/openai/ws.service.ts:
--------------------------------------------------------------------------------
1 | import { WebSocket } from 'ws';
2 | import { OpenAIConfig } from '../../types.js';
3 | import { SHOW_TIMING_MATH } from '../../config/constants.js';
4 |
/**
 * Thin wrapper around the WebSocket connection to the OpenAI endpoint described by OpenAIConfig.
 * Every send operation is a silent no-op unless the socket is currently open.
 */
export class OpenAIWsService {
    private webSocket: WebSocket | null = null;
    private readonly config: OpenAIConfig;

    /**
     * @param config Endpoint URL, API key, voice and temperature to use
     */
    constructor(config: OpenAIConfig) {
        this.config = config;
    }

    /**
     * Open the WebSocket connection and attach the supplied listeners.
     * @param onMessage Called for every message received on the socket
     * @param onOpen Called once the socket is open
     * @param onError Called when the socket reports an error
     */
    public initialize(
        onMessage: (data: WebSocket.Data) => void,
        onOpen: () => void,
        onError: (error: Error) => void
    ): void {
        const { websocketUrl, apiKey } = this.config;
        const headers = {
            Authorization: `Bearer ${apiKey}`,
            'OpenAI-Beta': 'realtime=v1'
        };

        const socket = new WebSocket(websocketUrl, { headers });
        this.webSocket = socket;

        socket.on('open', onOpen);
        socket.on('message', onMessage);
        socket.on('error', onError);
    }

    /**
     * Send the `session.update` event that configures audio formats, voice and instructions.
     * @param callContext Text sent as the session instructions
     */
    public initializeSession(callContext: string): void {
        const socket = this.getOpenSocket();
        if (socket === null) {
            return;
        }

        const { voice, temperature } = this.config;
        const session = {
            turn_detection: { type: 'server_vad' },
            input_audio_format: 'g711_ulaw',
            output_audio_format: 'g711_ulaw',
            voice,
            instructions: callContext,
            modalities: ['text', 'audio'],
            temperature,
            input_audio_transcription: { model: 'whisper-1' }
        };

        socket.send(JSON.stringify({ type: 'session.update', session }));
    }

    /**
     * Close the connection if it is currently open.
     */
    public close(): void {
        this.getOpenSocket()?.close();
    }

    /**
     * Append an audio payload to OpenAI's input audio buffer.
     * @param audioPayload The audio payload to forward
     */
    public sendAudio(audioPayload: string): void {
        const socket = this.getOpenSocket();
        if (socket === null) {
            return;
        }

        socket.send(JSON.stringify({
            type: 'input_audio_buffer.append',
            audio: audioPayload
        }));
    }

    /**
     * Ask OpenAI to truncate an assistant item at the given audio position.
     * @param itemId The ID of the assistant item to truncate
     * @param elapsedTime Value sent as `audio_end_ms`
     */
    public truncateAssistantResponse(itemId: string, elapsedTime: number): void {
        const socket = this.getOpenSocket();
        if (socket === null) {
            return;
        }

        const serialized = JSON.stringify({
            type: 'conversation.item.truncate',
            item_id: itemId,
            content_index: 0,
            audio_end_ms: elapsedTime
        });

        if (SHOW_TIMING_MATH) {
            console.error('Sending truncation event:', serialized);
        }

        socket.send(serialized);
    }

    /**
     * @returns true when a socket exists and is in the OPEN state
     */
    public isConnected(): boolean {
        return this.getOpenSocket() !== null;
    }

    /**
     * @returns the underlying socket when it is open, otherwise null
     */
    private getOpenSocket(): WebSocket | null {
        const socket = this.webSocket;
        return socket !== null && socket.readyState === WebSocket.OPEN ? socket : null;
    }
}
128 |
--------------------------------------------------------------------------------
/src/services/session-manager.service.ts:
--------------------------------------------------------------------------------
1 | import { WebSocket } from 'ws';
2 | import twilio from 'twilio';
3 | import { CallType } from '../types.js';
4 | import { OpenAIContextService } from './openai/context.service.js';
5 | import { OpenAICallHandler } from '../handlers/openai.handler.js';
6 |
/**
 * Manages multiple concurrent call sessions.
 *
 * Sessions are keyed by the WebSocket instance itself. A string key derived from
 * `ws.url || ws.toString()` is not unique: whenever `url` is empty the fallback
 * `toString()` yields the same generic string for every socket, so concurrent
 * sessions would overwrite and delete each other's entries.
 */
export class SessionManagerService {
    private readonly activeSessions: Map<WebSocket, OpenAICallHandler>;
    private readonly twilioClient: twilio.Twilio;
    private readonly contextService: OpenAIContextService;

    /**
     * Create a new session manager
     * @param twilioClient The Twilio client shared by every call handler
     */
    constructor(twilioClient: twilio.Twilio) {
        this.activeSessions = new Map<WebSocket, OpenAICallHandler>();
        this.twilioClient = twilioClient;
        this.contextService = new OpenAIContextService();
    }

    /**
     * Creates a call handler for the connection and tracks it until the socket closes
     * @param ws The WebSocket connection
     * @param callType The type of call
     */
    public createSession(ws: WebSocket, callType: CallType): void {
        const handler = new OpenAICallHandler(ws, callType, this.twilioClient, this.contextService);
        this.registerSessionCleanup(ws);
        this.addSession(ws, handler);
    }

    /**
     * Drop the session as soon as its socket emits `close`
     * @param ws The WebSocket connection
     */
    private registerSessionCleanup(ws: WebSocket): void {
        ws.on('close', () => {
            this.removeSession(ws);
        });
    }

    /**
     * Add a session to active sessions
     * @param ws The WebSocket connection, used as the session key
     * @param handler The OpenAI call handler
     */
    private addSession(ws: WebSocket, handler: OpenAICallHandler): void {
        this.activeSessions.set(ws, handler);
    }

    /**
     * Removes a session from active sessions; a no-op when the socket is not tracked
     * @param ws The WebSocket connection
     */
    private removeSession(ws: WebSocket): void {
        this.activeSessions.delete(ws);
    }

    /**
     * Get the Twilio client
     * @returns The Twilio client
     */
    public getTwilioClient(): twilio.Twilio {
        return this.twilioClient;
    }

    /**
     * Get the context service
     * @returns The context service
     */
    public getContextService(): OpenAIContextService {
        return this.contextService;
    }
}
91 |
--------------------------------------------------------------------------------
/src/services/twilio/call.service.ts:
--------------------------------------------------------------------------------
1 | import twilio from 'twilio';
2 | import { DYNAMIC_API_SECRET, RECORD_CALLS } from '../../config/constants.js';
3 |
/**
 * Service for handling Twilio call operations.
 * All operations go through the Twilio client supplied at construction time.
 */
export class TwilioCallService {
    private readonly twilioClient: twilio.Twilio;

    /**
     * Create a new Twilio call service
     * @param twilioClient The Twilio client
     */
    constructor(twilioClient: twilio.Twilio) {
        this.twilioClient = twilioClient;
    }

    /**
     * Start recording a call.
     * Does nothing when recording is disabled (RECORD_CALLS) or no SID is given.
     * Failures are logged and not rethrown.
     * @param callSid The SID of the call to record
     */
    public async startRecording(callSid: string): Promise<void> {
        if (!RECORD_CALLS || !callSid) {
            return;
        }

        try {
            await this.twilioClient.calls(callSid)
                .recordings
                .create();
        } catch (error) {
            console.error(`Failed to start recording for call ${callSid}:`, error);
        }
    }

    /**
     * End a call by updating its status to `completed`.
     * Does nothing when no SID is given. Failures are logged and not rethrown.
     * @param callSid The SID of the call to end
     */
    public async endCall(callSid: string): Promise<void> {
        if (!callSid) {
            return;
        }

        try {
            await this.twilioClient.calls(callSid)
                .update({ status: 'completed' });
        } catch (error) {
            console.error(`Failed to end call ${callSid}:`, error);
        }
    }

    /**
     * Place an outgoing call whose webhook points at `<twilioCallbackUrl>/call/outgoing`.
     * @param twilioCallbackUrl Base URL that Twilio should call back
     * @param toNumber The number to call
     * @param callContext Context passed along to the webhook as a query parameter
     * @returns The SID of the created call
     * @throws Rethrows any error raised while creating the call, after logging it
     */
    public async makeCall(twilioCallbackUrl: string, toNumber: string, callContext = ''): Promise<string> {
        try {
            // Every dynamic query value is URL-encoded so reserved characters cannot break the URL.
            const query = [
                `apiSecret=${encodeURIComponent(DYNAMIC_API_SECRET)}`,
                'callType=outgoing',
                `callContext=${encodeURIComponent(callContext)}`
            ].join('&');

            // Use the injected client instead of building a second one from the environment.
            const call = await this.twilioClient.calls.create({
                to: toNumber,
                from: process.env.TWILIO_NUMBER || '',
                url: `${twilioCallbackUrl}/call/outgoing?${query}`,
            });

            return call.sid;
        } catch (error) {
            console.error(`Error making call: ${error}`);
            throw error;
        }
    }
}
73 |
--------------------------------------------------------------------------------
/src/services/twilio/event.service.ts:
--------------------------------------------------------------------------------
1 | import { CallState } from '../../types.js';
2 | import { OpenAIContextService } from '../openai/context.service.js';
3 | import { RECORD_CALLS, SHOW_TIMING_MATH } from '../../config/constants.js';
4 | import { TwilioCallService } from './call.service.js';
5 |
/**
 * Interprets the events received on a Twilio stream and updates the call state accordingly.
 */
export class TwilioEventService {
    private readonly callState: CallState;
    private readonly twilioCallService: TwilioCallService;
    private readonly contextService: OpenAIContextService;
    private readonly onForwardAudioToOpenAI: (payload: string) => void;

    /**
     * @param callState Call state that this processor reads and updates
     * @param twilioCallService Used to start the call recording
     * @param contextService Used to initialise the call state and conversation context on `start`
     * @param onForwardAudioToOpenAI Invoked with the payload of every `media` event
     */
    constructor(
        callState: CallState,
        twilioCallService: TwilioCallService,
        contextService: OpenAIContextService,
        onForwardAudioToOpenAI: (payload: string) => void,
    ) {
        this.callState = callState;
        this.twilioCallService = twilioCallService;
        this.contextService = contextService;
        this.onForwardAudioToOpenAI = onForwardAudioToOpenAI;
    }

    /**
     * Parse a raw Twilio message and handle the event it carries.
     * Any failure while parsing or handling is logged with the raw message and not rethrown.
     * @param message The raw message
     */
    public async processMessage(message: Buffer | string): Promise<void> {
        try {
            const event = JSON.parse(message.toString());
            await this.processEvent(event);
        } catch (err) {
            console.error('Error parsing message:', err, 'Message:', message);
        }
    }

    /**
     * Route a parsed Twilio event to its handler based on `data.event`.
     * @param data The parsed event
     */
    private async processEvent(data: any): Promise<void> {
        const eventName = data.event;

        if (eventName === 'media') {
            await this.handleMediaEvent(data);
        } else if (eventName === 'start') {
            await this.handleStartEvent(data);
        } else if (eventName === 'mark') {
            this.handleMarkEvent();
        } else {
            console.error('Received non-media event:', data.event);
        }
    }

    /**
     * Record the media timestamp, run the first-media hook and forward the audio payload.
     * @param data The `media` event
     */
    private async handleMediaEvent(data: any): Promise<void> {
        const { media } = data;

        this.callState.latestMediaTimestamp = media.timestamp;
        if (SHOW_TIMING_MATH) {
            // console.log(`Received media message with timestamp: ${this.callState.latestMediaTimestamp}ms`);
        }

        await this.handleFirstMediaEventIfNeeded();
        this.onForwardAudioToOpenAI(data.media.payload);
    }

    /**
     * On the first media event only: mark media as seen and, when recording is enabled
     * and the call SID is known, start the recording.
     */
    private async handleFirstMediaEventIfNeeded(): Promise<void> {
        const state = this.callState;
        if (state.hasSeenMedia) {
            return;
        }

        state.hasSeenMedia = true;

        const shouldRecord = RECORD_CALLS && state.callSid;
        if (shouldRecord) {
            await this.startCallRecording();
        }
    }

    /**
     * Start recording the current call.
     */
    private async startCallRecording(): Promise<void> {
        await this.twilioCallService.startRecording(this.callState.callSid);
    }

    /**
     * Reset the stream bookkeeping and initialise the call state and conversation
     * context from the custom parameters of the `start` event.
     * @param data The `start` event
     */
    private async handleStartEvent(data: any): Promise<void> {
        const { streamSid, callSid, customParameters } = data.start;

        this.callState.streamSid = streamSid;
        this.callState.responseStartTimestampTwilio = null;
        this.callState.latestMediaTimestamp = 0;

        this.contextService.initializeCallState(
            this.callState,
            customParameters.fromNumber,
            customParameters.toNumber
        );
        this.contextService.setupConversationContext(this.callState, customParameters.callContext);

        // The call SID is stored last, after the context has been set up.
        this.callState.callSid = callSid;
    }

    /**
     * Drop the oldest pending mark, if there is one.
     */
    private handleMarkEvent(): void {
        const queue = this.callState.markQueue;
        if (queue.length === 0) {
            return;
        }
        queue.shift();
    }
}
127 |
--------------------------------------------------------------------------------
/src/services/twilio/ws.service.ts:
--------------------------------------------------------------------------------
1 | import { WebSocket } from 'ws';
2 | import { CallState } from '../../types.js';
3 | import { SHOW_TIMING_MATH } from '../../config/constants.js';
4 |
/**
 * Sends events to, and attaches listeners on, the WebSocket of a Twilio stream.
 * Outgoing events are only sent once the stream SID is known.
 */
export class TwilioWsService {
    private readonly webSocket: WebSocket;
    private readonly callState: CallState;

    /**
     * @param webSocket The Twilio WebSocket connection
     * @param callState Call state that provides the stream SID and the mark queue
     */
    constructor(webSocket: WebSocket, callState: CallState) {
        this.webSocket = webSocket;
        this.callState = callState;
    }

    /**
     * Close the connection if it is currently open.
     */
    public close(): void {
        const isOpen = this.webSocket.readyState === WebSocket.OPEN;
        if (isOpen) {
            this.webSocket.close();
        }
    }

    /**
     * Send a `responsePart` mark event and remember it in the mark queue.
     */
    public sendMark(): void {
        const { streamSid } = this.callState;
        if (!streamSid) {
            return;
        }

        const markName = 'responsePart';
        this.sendJson({ event: 'mark', streamSid, mark: { name: markName } });
        this.callState.markQueue.push(markName);
    }

    /**
     * Send an audio payload as a `media` event.
     * @param payload The audio payload to send
     */
    public sendAudio(payload: string): void {
        const { streamSid } = this.callState;
        if (!streamSid) {
            return;
        }

        this.sendJson({ event: 'media', streamSid, media: { payload } });
    }

    /**
     * Send a `clear` event for the current stream.
     */
    public clearStream(): void {
        const { streamSid } = this.callState;
        if (!streamSid) {
            return;
        }

        this.sendJson({ event: 'clear', streamSid });
    }

    /**
     * Attach the message and close listeners to the socket.
     * @param onMessage Called for every message received from Twilio
     * @param onClose Called when the connection is closed
     */
    public setupEventHandlers(
        onMessage: (message: Buffer | string) => void,
        onClose: () => void
    ): void {
        const socket = this.webSocket;
        socket.on('message', onMessage);
        socket.on('close', onClose);
    }

    /**
     * Store the stream and call SIDs and reset the timing bookkeeping.
     * @param data The `start` event
     */
    public processStartEvent(data: any): void {
        const state = this.callState;
        state.streamSid = data.start.streamSid;
        state.responseStartTimestampTwilio = null;
        state.latestMediaTimestamp = 0;
        state.callSid = data.start.callSid;
    }

    /**
     * Drop the oldest pending mark, if there is one.
     */
    public processMarkEvent(): void {
        const queue = this.callState.markQueue;
        if (queue.length === 0) {
            return;
        }
        queue.shift();
    }

    /**
     * Record the timestamp of a `media` event.
     * @param data The `media` event
     */
    public processMediaEvent(data: any): void {
        this.callState.latestMediaTimestamp = data.media.timestamp;
        if (SHOW_TIMING_MATH) {
            // console.log(`Received media message with timestamp: ${this.callState.latestMediaTimestamp}ms`);
        }
    }

    /**
     * Serialise a message as JSON and send it on the socket.
     */
    private sendJson(message: Record<string, unknown>): void {
        this.webSocket.send(JSON.stringify(message));
    }
}
123 |
--------------------------------------------------------------------------------
/src/start-all.ts:
--------------------------------------------------------------------------------
1 | import dotenv from 'dotenv';
2 | import ngrok from '@ngrok/ngrok';
3 | import { isPortInUse } from './utils/execution-utils.js';
4 | import { VoiceCallMcpServer } from './servers/mcp.server.js';
5 | import { TwilioCallService } from './services/twilio/call.service.js';
6 | import { VoiceServer } from './servers/voice.server.js';
7 | import twilio from 'twilio';
8 | import { CallSessionManager } from './handlers/openai.handler.js';
9 |
10 | // Load environment variables
11 | dotenv.config();
12 |
// Environment variables that must be set before any service is started
const REQUIRED_ENV_VARS = [
    'TWILIO_ACCOUNT_SID',
    'TWILIO_AUTH_TOKEN',
    'OPENAI_API_KEY',
    'NGROK_AUTHTOKEN',
    'TWILIO_NUMBER'
] as const;

/**
 * Check that every variable in REQUIRED_ENV_VARS has a non-empty value.
 * Logs the first missing variable and terminates the process with exit code 1.
 * @returns true when nothing is missing
 */
function validateEnvironmentVariables(): boolean {
    const firstMissing = REQUIRED_ENV_VARS.find((name) => !process.env[name]);

    if (firstMissing !== undefined) {
        console.error(`Error: ${firstMissing} environment variable is required`);
        process.exit(1);
    }

    return true;
}
35 |
/**
 * Resolve the port the application listens on.
 *
 * Reads `PORT` from the environment, falling back to 3004, and writes the chosen
 * value back to `process.env.PORT`.
 * @returns The port as a number
 * @throws Error when the value does not parse to an integer between 1 and 65535
 */
function setupPort(): number {
    const PORT = process.env.PORT || '3004';
    process.env.PORT = PORT;

    // Always parse as base 10 and reject NaN / out-of-range values up front,
    // instead of handing an unusable port to the servers started later.
    const portNumber = Number.parseInt(PORT, 10);
    if (!Number.isInteger(portNumber) || portNumber < 1 || portNumber > 65535) {
        throw new Error(`Invalid PORT value: ${PORT}`);
    }

    return portNumber;
}
44 |
/**
 * Open an ngrok tunnel that forwards to the given local port.
 * The ngrok auth token is taken from the environment.
 * @param portNumber - The local port to forward
 * @returns The public URL reported by ngrok
 * @throws Error when ngrok does not report a URL
 */
async function setupNgrokTunnel(portNumber: number): Promise<string> {
    const tunnel = await ngrok.forward({
        addr: portNumber,
        authtoken_from_env: true
    });

    const publicUrl = tunnel.url();
    if (publicUrl) {
        return publicUrl;
    }

    throw new Error('Failed to obtain ngrok URL');
}
63 |
/**
 * Register a SIGINT handler that disconnects ngrok and then exits with code 0.
 * A failure to disconnect is logged and does not prevent the exit.
 */
function setupShutdownHandlers(): void {
    const shutdown = async (): Promise<void> => {
        try {
            await ngrok.disconnect();
        } catch (err) {
            console.error('Error killing ngrok:', err);
        }
        process.exit(0);
    };

    process.on('SIGINT', shutdown);
}
77 |
/**
 * Poll the port every 15 seconds and restart `main` once it becomes free.
 * @param portNumber - The port number to check
 */
function scheduleServerRetry(portNumber: number): void {
    console.error(`Port ${portNumber} is already in use. Server may already be running.`);
    console.error('Will retry in 15 seconds...');

    const RETRY_INTERVAL_MS = 15000;

    const retryInterval = setInterval(async () => {
        if (await isPortInUse(portNumber)) {
            console.error(`Port ${portNumber} is still in use. Will retry in 15 seconds...`);
            return;
        }

        // The port is free: stop polling before starting up again.
        clearInterval(retryInterval);
        main();
    }, RETRY_INTERVAL_MS);
}
99 |
100 |
/**
 * Start everything: validate the environment, create the Twilio-backed services,
 * open the ngrok tunnel, then start the voice server and the MCP server.
 * If the port is busy, a retry is scheduled instead. Any startup error is logged
 * and the process exits with code 1.
 */
async function main(): Promise<void> {
    try {
        validateEnvironmentVariables();
        const port = setupPort();

        const { TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN } = process.env;
        const twilioClient = twilio(TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN);

        const sessionManager = new CallSessionManager(twilioClient);
        const twilioCallService = new TwilioCallService(twilioClient);

        // Nothing can be started while another process holds the port.
        if (await isPortInUse(port)) {
            scheduleServerRetry(port);
            return;
        }

        // Establish ngrok connectivity
        const publicUrl = await setupNgrokTunnel(port);

        // Start the main HTTP server
        const voiceServer = new VoiceServer(publicUrl, sessionManager);
        voiceServer.start();

        const mcpServer = new VoiceCallMcpServer(twilioCallService, publicUrl);
        await mcpServer.start();

        // Set up graceful shutdown
        setupShutdownHandlers();
    } catch (err) {
        console.error('Error starting services:', err);
        process.exit(1);
    }
}

// Run startup as soon as the module is loaded
main();
138 |
--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------
// types.ts - Shared types and call state

/**
 * Kind of call handled by a session.
 */
export enum CallType {
    OUTBOUND = 'OUTBOUND',
}
5 |
/**
 * One entry of a call's conversation history.
 */
export interface ConversationMessage {
    /** Who produced the message */
    role: 'system' | 'user' | 'assistant';
    /** The message text */
    content: string;
    /** Optional name attached to the message */
    name?: string;
}
11 |
/**
 * Mutable per-call state shared between the services that handle one call.
 * Every field starts with the default shown here; only the call type can be
 * chosen at construction time.
 */
export class CallState {
    // --- Identification ---
    streamSid = '';
    callSid = '';

    // --- Call type and direction ---
    callType: CallType = CallType.OUTBOUND;

    // --- Phone numbers ---
    fromNumber = '';
    toNumber = '';

    // --- Context and conversation ---
    callContext = '';
    initialMessage = '';
    conversationHistory: ConversationMessage[] = [];

    // --- Speech state ---
    speaking = false;

    // --- Timing and processing state ---
    llmStart = 0;
    firstByte = true;
    sendFirstSentenceInputTime: number | null = null;

    // --- Media processing state ---
    latestMediaTimestamp = 0;
    responseStartTimestampTwilio: number | null = null;
    lastAssistantItemId: string | null = null;
    markQueue: string[] = [];
    hasSeenMedia = false;

    /**
     * @param callType The type of the call; defaults to an outbound call
     */
    constructor(callType: CallType = CallType.OUTBOUND) {
        this.callType = callType;
    }
}
48 |
/**
 * Settings for the WebSocket connection to OpenAI.
 */
export interface OpenAIConfig {
    /** API key sent as the bearer token when connecting */
    apiKey: string;
    /** URL the WebSocket connects to */
    websocketUrl: string;
    /** Voice sent in the session configuration */
    voice: string;
    /** Temperature sent in the session configuration */
    temperature: number;
}
58 |
/**
 * Settings for the Twilio client.
 */
export interface TwilioConfig {
    /** Twilio account SID */
    accountSid: string;
    /** Twilio auth token */
    authToken: string;
    /** Whether calls should be recorded */
    recordCalls: boolean;
}
67 |
--------------------------------------------------------------------------------
/src/utils/call-utils.ts:
--------------------------------------------------------------------------------
1 | import { WebSocket } from 'ws';
2 | import { GOODBYE_PHRASES } from '../config/constants.js';
3 |
/**
 * Tell whether the text contains any of the configured goodbye phrases.
 * The text is lower-cased before the phrases are looked up as substrings.
 */
export const checkForGoodbye = (text: string): boolean => {
    const normalized = text.toLowerCase();

    for (const phrase of GOODBYE_PHRASES) {
        if (normalized.includes(phrase)) {
            return true;
        }
    }

    return false;
};
8 |
/**
 * Close both sockets of a call after a 5 second delay.
 * A socket that is not open when the delay elapses is left untouched.
 */
export const endCall = (ws: WebSocket, openAiWs: WebSocket): void => {
    const closeIfOpen = (socket: WebSocket): void => {
        if (socket.readyState === WebSocket.OPEN) {
            socket.close();
        }
    };

    setTimeout(() => {
        closeIfOpen(ws);
        closeIfOpen(openAiWs);
    }, 5000);
};
19 |
--------------------------------------------------------------------------------
/src/utils/execution-utils.ts:
--------------------------------------------------------------------------------
1 | import net from 'net';
2 |
/**
 * Probe whether a TCP port is already taken by trying to listen on it.
 *
 * @param port The port to probe
 * @returns true when listening fails with EADDRINUSE; false when the probe
 * managed to listen, or failed with any other error
 */
export async function isPortInUse(port: number): Promise<boolean> {
    return new Promise<boolean>((resolve) => {
        const probe = net.createServer();

        probe.once('error', (err: NodeJS.ErrnoException) => {
            resolve(err.code === 'EADDRINUSE');
        });

        probe.once('listening', () => {
            // Report the port as free only once the probe has released it, so a
            // caller that binds the port right away cannot collide with the probe.
            probe.close(() => resolve(false));
        });

        probe.listen(port);
    });
}
20 |
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "es2020",
4 | "module": "NodeNext",
5 | "moduleResolution": "NodeNext",
6 | "declaration": true,
7 | "declarationMap": true,
8 | "sourceMap": true,
9 | "outDir": "./dist",
10 | "strict": true,
11 | "esModuleInterop": true,
12 | "forceConsistentCasingInFileNames": true,
13 | "resolveJsonModule": true,
14 | "isolatedModules": true,
15 | "skipLibCheck": true,
16 | "lib": ["es2020", "DOM"],
17 | "allowSyntheticDefaultImports": true
18 | },
19 | "include": ["src/**/*", "openai-realtime-handler.ts"],
20 | "exclude": ["node_modules", "dist"]
21 | }
--------------------------------------------------------------------------------