├── .env.sample
├── .gitignore
├── LICENSE
├── README.md
├── eslint.config.mjs
├── next.config.ts
├── package-lock.json
├── package.json
├── postcss.config.mjs
├── public
│   ├── arrow.svg
│   ├── favicon.ico
│   ├── openai-logomark.svg
│   ├── screenshot_chat_supervisor.png
│   └── screenshot_handoff.png
├── src
│   └── app
│       ├── App.tsx
│       ├── agentConfigs
│       │   ├── chatSupervisor
│       │   │   ├── index.ts
│       │   │   ├── sampleData.ts
│       │   │   └── supervisorAgent.ts
│       │   ├── customerServiceRetail
│       │   │   ├── authentication.ts
│       │   │   ├── index.ts
│       │   │   ├── returns.ts
│       │   │   ├── sales.ts
│       │   │   └── simulatedHuman.ts
│       │   ├── guardrails.ts
│       │   ├── index.ts
│       │   ├── realtimeClient.ts
│       │   ├── simpleHandoff.ts
│       │   ├── types.ts
│       │   └── voiceAgentMetaprompt.txt
│       ├── api
│       │   ├── responses
│       │   │   └── route.ts
│       │   └── session
│       │       └── route.ts
│       ├── components
│       │   ├── BottomToolbar.tsx
│       │   ├── Events.tsx
│       │   ├── GuardrailChip.tsx
│       │   └── Transcript.tsx
│       ├── contexts
│       │   ├── EventContext.tsx
│       │   └── TranscriptContext.tsx
│       ├── globals.css
│       ├── hooks
│       │   └── useAudioDownload.ts
│       ├── layout.tsx
│       ├── lib
│       │   ├── audioUtils.ts
│       │   ├── callOai.ts
│       │   └── envSetup.ts
│       ├── page.tsx
│       └── types.ts
├── tailwind.config.ts
└── tsconfig.json

/.env.sample:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=your_api_key
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # dependencies
2 | /node_modules
3 | /.pnp
4 | .pnp.*
5 | .yarn/*
6 | !.yarn/patches
7 | !.yarn/plugins
8 | !.yarn/releases
9 | !.yarn/versions
10 | 
11 | # testing
12 | /coverage
13 | 
14 | # next.js
15 | /.next/
16 | /out/
17 | 
18 | # production
19 | /build
20 | 
21 | # misc
22 | .DS_Store
23 | *.pem
24 | 
25 | # debug
26 | npm-debug.log*
27 | yarn-debug.log*
28 | yarn-error.log*
29 | .pnpm-debug.log*
30 | 
31 | # Ignore all env files except .env.sample
32 | .env
33 | 
34 | 
35 | # vercel
36 | .vercel
37 | 
38 | # typescript
39 | *.tsbuildinfo
40 | next-env.d.ts
41 | todo.md
42 | 
43 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 OpenAI
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 | 
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 | 
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Realtime API Agents Demo
2 | 
3 | This is a demonstration of more advanced patterns for voice agents, using the OpenAI Realtime API and the OpenAI Agents SDK.
4 | 
5 | **NOTE:** For a version that does not use the OpenAI Agents SDK, see the [without-agents-sdk branch](https://github.com/openai/openai-realtime-agents/tree/without-agents-sdk).
6 | 
7 | There are two main patterns demonstrated:
8 | 1. **Chat-Supervisor:** A realtime-based chat agent interacts with the user and handles basic tasks, while a more intelligent, text-based supervisor model (e.g., `gpt-4.1`) is used extensively for tool calls and more complex responses. This approach provides an easy onramp and high-quality answers, with a small increase in latency.
9 | 2. **Sequential Handoff:** Specialized agents (powered by the Realtime API) transfer the user between them to handle specific user intents. This is great for customer service, where user intents can be handled sequentially by specialist models that excel in specific domains. This avoids packing all instructions and tools into a single agent, which can degrade performance.
10 | 
11 | ## Setup
12 | 
13 | - This is a Next.js TypeScript app. Install dependencies with `npm i`.
14 | - Add your `OPENAI_API_KEY` to your env. Either add it to your `.bash_profile` or equivalent, or copy `.env.sample` to `.env` and add it there.
15 | - Start the server with `npm run dev`.
16 | - Open your browser to [http://localhost:3000](http://localhost:3000). It should default to the `chatSupervisor` Agent Config.
17 | - You can change examples via the "Scenario" dropdown in the top right.
18 | 
19 | # Agentic Pattern 1: Chat-Supervisor
20 | 
21 | This is demonstrated in the [chatSupervisor](src/app/agentConfigs/chatSupervisor/index.ts) Agent Config. The chat agent uses the realtime model to converse with the user and handle basic tasks, like greeting the user, casual conversation, and collecting information, while a more intelligent, text-based supervisor model (e.g. `gpt-4.1`) is used extensively to handle tool calls and more challenging responses. You can control the decision boundary by "opting in" specific tasks to the chat agent as desired.
22 | 
23 | Video walkthrough: [https://x.com/noahmacca/status/1927014156152058075](https://x.com/noahmacca/status/1927014156152058075)
24 | 
25 | ## Example
26 | ![Screenshot of the Chat Supervisor Flow](/public/screenshot_chat_supervisor.png)
27 | *In this exchange, note the immediate response to collect the phone number, and the deferral to the supervisor agent to handle the tool call and formulate the response. There is a gap of roughly two seconds between the end of "give me a moment to check on that." being spoken aloud and the start of "Thanks for waiting. Your last bill...".*
28 | 
29 | ## Schematic
30 | ```mermaid
31 | sequenceDiagram
32 | participant User
33 | participant ChatAgent as Chat Agent<br/>
(gpt-4o-realtime-mini) 34 | participant Supervisor as Supervisor Agent
(gpt-4.1)
35 | participant Tool as Tool
36 | 
37 | alt Basic chat or info collection
38 | User->>ChatAgent: User message
39 | ChatAgent->>User: Responds directly
40 | else Requires higher intelligence and/or tool call
41 | User->>ChatAgent: User message
42 | ChatAgent->>User: "Let me think"
43 | ChatAgent->>Supervisor: Forwards message/context
44 | alt Tool call needed
45 | Supervisor->>Tool: Calls tool
46 | Tool->>Supervisor: Returns result
47 | end
48 | Supervisor->>ChatAgent: Returns response
49 | ChatAgent->>User: Delivers response
50 | end
51 | ```
52 | 
53 | ## Benefits
54 | - **Simpler onboarding:** If you already have a performant text-based chat agent, you can give that same prompt and set of tools to the supervisor agent, make some tweaks to the chat agent prompt, and you'll have a natural voice agent that performs on par with your text agent.
55 | - **Simple ramp to a full realtime agent**: Rather than switching your whole agent to the Realtime API, you can move one task at a time, taking time to validate and build trust for each before deploying to production.
56 | - **High intelligence**: You benefit from the high intelligence, excellent tool calling, and instruction following of models like `gpt-4.1` in your voice agents.
57 | - **Lower cost**: If your chat agent is only being used for basic tasks, you can use the realtime-mini model, which, even when combined with GPT-4.1, should be cheaper than using the full 4o-realtime model.
58 | - **User experience**: It's a more natural conversational experience than using a stitched model architecture, where response latency is often 1.5s or longer after a user has finished speaking. In this architecture, the model responds to the user right away, even if it has to lean on the supervisor agent.
59 |   - However, more assistant responses will start with "Let me think", rather than responding immediately with the full response.
60 | 
61 | ## Modifying for your own agent
62 | 1. Update [supervisorAgent](src/app/agentConfigs/chatSupervisor/supervisorAgent.ts).
63 |    - Add your existing text agent prompt and tools if you already have them. This should contain the "meat" of your voice agent logic and be very specific about what it should and shouldn't do and how exactly it should respond. Add this information below `==== Domain-Specific Agent Instructions ====`.
64 |    - You should likely update this prompt to be more appropriate for voice, for example with instructions to be concise and to avoid long lists of items.
65 | 2. Update [chatAgent](src/app/agentConfigs/chatSupervisor/index.ts).
66 |    - Customize the chatAgent instructions with your own tone, greeting, etc.
67 |    - Add your tool definitions to `chatAgentInstructions`. We recommend a brief YAML description rather than JSON to ensure the model doesn't get confused and try calling the tool directly.
68 |    - You can modify the decision boundary by adding new items to the `# Allow List of Permitted Actions` section.
69 | 3. To reduce cost, try using `gpt-4o-mini-realtime` for the chatAgent and/or `gpt-4.1-mini` for the supervisor model. To maximize intelligence on particularly difficult or high-stakes tasks, consider trading off latency by adding chain-of-thought to your supervisor prompt, or by using an additional supervisor based on a reasoning model such as `o4-mini`.
70 | 
71 | # Agentic Pattern 2: Sequential Handoffs
72 | 
73 | This pattern is inspired by [OpenAI Swarm](https://github.com/openai/swarm) and involves the sequential handoff of a user between specialized agents.
Handoffs are decided by the model and coordinated via tool calls, and possible handoffs are defined explicitly in an agent graph. A handoff triggers a `session.update` event with new instructions and tools. This pattern is effective for handling a variety of user intents with specialist agents, each of which might have long instructions and numerous tools.
74 | 
75 | Here's a [video walkthrough](https://x.com/OpenAIDevs/status/1880306081517432936) showing how it works. You should be able to use this repo to prototype your own multi-agent realtime voice app in less than 20 minutes!
76 | 
77 | ![Screenshot of the Realtime API Agents Demo](/public/screenshot_handoff.png)
78 | *In this simple example, the user is transferred from a greeter agent to a haiku agent. See below for the simple, full configuration of this flow.*
79 | 
80 | Configuration in `src/app/agentConfigs/simpleHandoff.ts`
81 | ```typescript
82 | import { RealtimeAgent } from '@openai/agents/realtime';
83 | 
84 | // Define agents using the OpenAI Agents SDK
85 | export const haikuWriterAgent = new RealtimeAgent({
86 |   name: 'haikuWriter',
87 |   handoffDescription: 'Agent that writes haikus.', // Context for the agent_transfer tool
88 |   instructions:
89 |     'Ask the user for a topic, then reply with a haiku about that topic.',
90 |   tools: [],
91 |   handoffs: [],
92 | });
93 | 
94 | export const greeterAgent = new RealtimeAgent({
95 |   name: 'greeter',
96 |   handoffDescription: 'Agent that greets the user.',
97 |   instructions:
98 |     "Please greet the user and ask them if they'd like a haiku. If yes, hand off to the 'haikuWriter' agent.",
99 |   tools: [],
100 |   handoffs: [haikuWriterAgent], // Define which agents this agent can hand off to
101 | });
102 | 
103 | // An Agent Set is just an array of the agents that participate in the scenario
104 | export const simpleHandoffScenario = [greeterAgent, haikuWriterAgent];
105 | ```
106 | ## CustomerServiceRetail Flow
107 | 
108 | This is a more complex, representative implementation that illustrates a customer service flow, with the following features:
109 | - A more complex agent graph with agents for user authentication, returns, sales, and a placeholder human agent for escalations.
110 | - An escalation by the [returns](https://github.com/openai/openai-realtime-agents/blob/60f4effc50a539b19b2f1fa4c38846086b58c295/src/app/agentConfigs/customerServiceRetail/returns.ts#L233) agent to `o4-mini` to validate and initiate a return, as an example high-stakes decision, using a similar pattern to the above (a hedged sketch of this call appears after the flow diagram below).
111 | - Prompting models to follow a state machine, for example to accurately collect things like names and phone numbers with character-by-character confirmation to authenticate a user.
112 | - To test this flow, say that you'd like to return your snowboard and go through the necessary prompts!
113 | 
114 | Configuration in [src/app/agentConfigs/customerServiceRetail/index.ts](src/app/agentConfigs/customerServiceRetail/index.ts).
115 | ```javascript 116 | import authentication from "./authentication"; 117 | import returns from "./returns"; 118 | import sales from "./sales"; 119 | import simulatedHuman from "./simulatedHuman"; 120 | import { injectTransferTools } from "../utils"; 121 | 122 | authentication.downstreamAgents = [returns, sales, simulatedHuman]; 123 | returns.downstreamAgents = [authentication, sales, simulatedHuman]; 124 | sales.downstreamAgents = [authentication, returns, simulatedHuman]; 125 | simulatedHuman.downstreamAgents = [authentication, returns, sales]; 126 | 127 | const agents = injectTransferTools([ 128 | authentication, 129 | returns, 130 | sales, 131 | simulatedHuman, 132 | ]); 133 | 134 | export default agents; 135 | ``` 136 | 137 | ## Schematic 138 | 139 | This diagram illustrates a more advanced interaction flow defined in `src/app/agentConfigs/customerServiceRetail/`, including detailed events. 140 | 141 |
142 | Show CustomerServiceRetail Flow Diagram 143 | 144 | ```mermaid 145 | sequenceDiagram 146 | participant User 147 | participant WebClient as Next.js Client 148 | participant NextAPI as /api/session 149 | participant RealtimeAPI as OpenAI Realtime API 150 | participant AgentManager as Agents (authentication, returns, sales, simulatedHuman) 151 | participant o1mini as "o4-mini" (Escalation Model) 152 | 153 | Note over WebClient: User navigates to ?agentConfig=customerServiceRetail 154 | User->>WebClient: Open Page 155 | WebClient->>NextAPI: GET /api/session 156 | NextAPI->>RealtimeAPI: POST /v1/realtime/sessions 157 | RealtimeAPI->>NextAPI: Returns ephemeral session 158 | NextAPI->>WebClient: Returns ephemeral token (JSON) 159 | 160 | Note right of WebClient: Start RTC handshake 161 | WebClient->>RealtimeAPI: Offer SDP (WebRTC) 162 | RealtimeAPI->>WebClient: SDP answer 163 | WebClient->>WebClient: DataChannel "oai-events" established 164 | 165 | Note over AgentManager: Default agent is "authentication" 166 | User->>WebClient: "Hi, I'd like to return my snowboard." 167 | WebClient->>AgentManager: conversation.item.create (role=user) 168 | WebClient->>RealtimeAPI: {type: "conversation.item.create"} 169 | WebClient->>RealtimeAPI: {type: "response.create"} 170 | 171 | authentication->>AgentManager: Requests user info, calls authenticate_user_information() 172 | AgentManager-->>WebClient: function_call => name="authenticate_user_information" 173 | WebClient->>WebClient: handleFunctionCall => verifies details 174 | 175 | Note over AgentManager: After user is authenticated 176 | authentication->>AgentManager: transferAgents("returns") 177 | AgentManager-->>WebClient: function_call => name="transferAgents" args={ destination: "returns" } 178 | WebClient->>WebClient: setSelectedAgentName("returns") 179 | 180 | Note over returns: The user wants to process a return 181 | returns->>AgentManager: function_call => checkEligibilityAndPossiblyInitiateReturn 182 | AgentManager-->>WebClient: function_call => name="checkEligibilityAndPossiblyInitiateReturn" 183 | 184 | Note over WebClient: The WebClient calls /api/chat/completions with model="o4-mini" 185 | WebClient->>o1mini: "Is this item eligible for return?" 186 | o1mini->>WebClient: "Yes/No (plus notes)" 187 | 188 | Note right of returns: Returns uses the result from "o4-mini" 189 | returns->>AgentManager: "Return is approved" or "Return is denied" 190 | AgentManager->>WebClient: conversation.item.create (assistant role) 191 | WebClient->>User: Displays final verdict 192 | ``` 193 | 194 |
195 | 196 | # Other Info 197 | ## Next Steps 198 | - You can copy these templates to make your own multi-agent voice app! Once you make a new agent set config, add it to `src/app/agentConfigs/index.ts` and you should be able to select it in the UI in the "Scenario" dropdown menu. 199 | - Each agentConfig can define instructions, tools, and toolLogic. By default all tool calls simply return `True`, unless you define the toolLogic, which will run your specific tool logic and return an object to the conversation (e.g. for retrieved RAG context). 200 | - If you want help creating your own prompt using the conventions shown in customerServiceRetail, including defining a state machine, we've included a metaprompt [here](src/app/agentConfigs/voiceAgentMetaprompt.txt), or you can use our [Voice Agent Metaprompter GPT](https://chatgpt.com/g/g-678865c9fb5c81918fa28699735dd08e-voice-agent-metaprompt-gpt) 201 | 202 | ## Output Guardrails 203 | Assistant messages are checked for safety and compliance before they are shown in the UI. The guardrail call now lives directly inside `src/app/App.tsx`: when a `response.text.delta` stream starts we mark the message as **IN_PROGRESS**, and once the server emits `guardrail_tripped` or `response.done` we mark the message as **FAIL** or **PASS** respectively. If you want to change how moderation is triggered or displayed, search for `guardrail_tripped` inside `App.tsx` and tweak the logic there. 204 | 205 | ## Navigating the UI 206 | - You can select agent scenarios in the Scenario dropdown, and automatically switch to a specific agent with the Agent dropdown. 207 | - The conversation transcript is on the left, including tool calls, tool call responses, and agent changes. Click to expand non-message elements. 208 | - The event log is on the right, showing both client and server events. Click to see the full payload. 209 | - On the bottom, you can disconnect, toggle between automated voice-activity detection or PTT, turn off audio playback, and toggle logs. 210 | 211 | ## Pull Requests 212 | 213 | Feel free to open an issue or pull request and we'll do our best to review it. The spirit of this repo is to demonstrate the core logic for new agentic flows; PRs that go beyond this core scope will likely not be merged. 
214 | 215 | # Core Contributors 216 | - Noah MacCallum - [noahmacca](https://x.com/noahmacca) 217 | - Ilan Bigio - [ibigio](https://github.com/ibigio) 218 | - Brian Fioca - [bfioca](https://github.com/bfioca) 219 | -------------------------------------------------------------------------------- /eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import { dirname } from "path"; 2 | import { fileURLToPath } from "url"; 3 | import { FlatCompat } from "@eslint/eslintrc"; 4 | 5 | const __filename = fileURLToPath(import.meta.url); 6 | const __dirname = dirname(__filename); 7 | 8 | const compat = new FlatCompat({ 9 | baseDirectory: __dirname, 10 | }); 11 | 12 | const eslintConfig = [ 13 | ...compat.extends("next/core-web-vitals", "next/typescript"), 14 | { 15 | rules: { 16 | "@typescript-eslint/no-explicit-any": "off", 17 | "react-hooks/exhaustive-deps": "off" 18 | }, 19 | }, 20 | ]; 21 | 22 | export default eslintConfig; 23 | -------------------------------------------------------------------------------- /next.config.ts: -------------------------------------------------------------------------------- 1 | import type { NextConfig } from "next"; 2 | 3 | const nextConfig: NextConfig = { 4 | /* config options here */ 5 | }; 6 | 7 | export default nextConfig; 8 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "realtime-examples", 3 | "version": "0.1.0", 4 | "private": true, 5 | "scripts": { 6 | "dev": "next dev", 7 | "build": "next build", 8 | "start": "next start", 9 | "lint": "next lint" 10 | }, 11 | "dependencies": { 12 | "@openai/agents": "^0.0.1", 13 | "@radix-ui/react-icons": "^1.3.2", 14 | "dotenv": "^16.4.7", 15 | "next": "^15.3.1", 16 | "openai": "^4.77.3", 17 | "react": "^19.0.0", 18 | "react-dom": "^19.0.0", 19 | "react-markdown": "^9.0.3", 20 | "uuid": "^11.0.4", 21 | "zod": "^3.24.1" 22 | }, 23 | "devDependencies": { 24 | "@eslint/eslintrc": "^3", 25 | "@types/node": "^20", 26 | "@types/react": "^19", 27 | "@types/react-dom": "^19", 28 | "eslint": "^9", 29 | "eslint-config-next": "15.1.4", 30 | "postcss": "^8", 31 | "tailwindcss": "^3.4.1", 32 | "typescript": "^5" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /postcss.config.mjs: -------------------------------------------------------------------------------- 1 | /** @type {import('postcss-load-config').Config} */ 2 | const config = { 3 | plugins: { 4 | tailwindcss: {}, 5 | }, 6 | }; 7 | 8 | export default config; 9 | -------------------------------------------------------------------------------- /public/arrow.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/openai-realtime-agents/fa90035821c4ed47df244acffd13a3532e67829b/public/favicon.ico -------------------------------------------------------------------------------- /public/openai-logomark.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /public/screenshot_chat_supervisor.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/openai-realtime-agents/fa90035821c4ed47df244acffd13a3532e67829b/public/screenshot_chat_supervisor.png -------------------------------------------------------------------------------- /public/screenshot_handoff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/openai-realtime-agents/fa90035821c4ed47df244acffd13a3532e67829b/public/screenshot_handoff.png -------------------------------------------------------------------------------- /src/app/App.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import React, { useEffect, useRef, useState } from "react"; 4 | import { useSearchParams } from "next/navigation"; 5 | import { v4 as uuidv4 } from "uuid"; 6 | 7 | import Image from "next/image"; 8 | 9 | // UI components 10 | import Transcript from "./components/Transcript"; 11 | import Events from "./components/Events"; 12 | import BottomToolbar from "./components/BottomToolbar"; 13 | 14 | // Types 15 | import { SessionStatus, TranscriptItem } from "@/app/types"; 16 | import type { RealtimeAgent } from '@openai/agents/realtime'; 17 | 18 | // Context providers & hooks 19 | import { useTranscript } from "@/app/contexts/TranscriptContext"; 20 | import { useEvent } from "@/app/contexts/EventContext"; 21 | 22 | // Utilities 23 | import { RealtimeClient } from "@/app/agentConfigs/realtimeClient"; 24 | 25 | // Agent configs 26 | import { allAgentSets, defaultAgentSetKey } from "@/app/agentConfigs"; 27 | // New SDK scenarios 28 | import { simpleHandoffScenario } from "@/app/agentConfigs/simpleHandoff"; 29 | import { customerServiceRetailScenario } from "@/app/agentConfigs/customerServiceRetail"; 30 | import { chatSupervisorScenario } from "@/app/agentConfigs/chatSupervisor"; 31 | 32 | const sdkScenarioMap: Record = { 33 | simpleHandoff: simpleHandoffScenario, 34 | customerServiceRetail: customerServiceRetailScenario, 35 | chatSupervisor: chatSupervisorScenario, 36 | }; 37 | 38 | import useAudioDownload from "./hooks/useAudioDownload"; 39 | 40 | function App() { 41 | const searchParams = useSearchParams()!; 42 | 43 | // Use urlCodec directly from URL search params (default: "opus") 44 | const urlCodec = searchParams.get("codec") || "opus"; 45 | 46 | const { 47 | transcriptItems, 48 | addTranscriptMessage, 49 | addTranscriptBreadcrumb, 50 | updateTranscriptMessage, 51 | updateTranscriptItem, 52 | } = useTranscript(); 53 | 54 | // Keep a mutable reference to the latest transcriptItems so that streaming 55 | // callbacks registered once during setup always have access to up-to-date 56 | // data without being re-registered on every render. 
57 | const transcriptItemsRef = useRef<TranscriptItem[]>(transcriptItems);
58 | useEffect(() => {
59 | transcriptItemsRef.current = transcriptItems;
60 | }, [transcriptItems]);
61 | const { logClientEvent, logServerEvent, logHistoryItem } = useEvent();
62 | 
63 | const [selectedAgentName, setSelectedAgentName] = useState<string>("");
64 | const [selectedAgentConfigSet, setSelectedAgentConfigSet] = useState<
65 | RealtimeAgent[] | null
66 | >(null);
67 | 
68 | const audioElementRef = useRef<HTMLAudioElement | null>(null);
69 | 
70 | const sdkAudioElement = React.useMemo(() => {
71 | if (typeof window === 'undefined') return undefined;
72 | const el = document.createElement('audio');
73 | el.autoplay = true;
74 | el.style.display = 'none';
75 | document.body.appendChild(el);
76 | return el;
77 | }, []);
78 | 
79 | // Attach SDK audio element once it exists (after first render in browser)
80 | useEffect(() => {
81 | if (sdkAudioElement && !audioElementRef.current) {
82 | audioElementRef.current = sdkAudioElement;
83 | }
84 | }, [sdkAudioElement]);
85 | 
86 | const sdkClientRef = useRef<RealtimeClient | null>(null);
87 | const loggedFunctionCallsRef = useRef<Set<string>>(new Set());
88 | const [sessionStatus, setSessionStatus] =
89 | useState<SessionStatus>("DISCONNECTED");
90 | 
91 | const [isEventsPaneExpanded, setIsEventsPaneExpanded] =
92 | useState<boolean>(true);
93 | const [userText, setUserText] = useState<string>("");
94 | const [isPTTActive, setIsPTTActive] = useState<boolean>(false);
95 | const [isPTTUserSpeaking, setIsPTTUserSpeaking] = useState<boolean>(false);
96 | const [isAudioPlaybackEnabled, setIsAudioPlaybackEnabled] = useState<boolean>(
97 | () => {
98 | if (typeof window === 'undefined') return true;
99 | const stored = localStorage.getItem('audioPlaybackEnabled');
100 | return stored ? stored === 'true' : true;
101 | },
102 | );
103 | 
104 | 
105 | 
106 | // Initialize the recording hook.
107 | const { startRecording, stopRecording, downloadRecording } = 108 | useAudioDownload(); 109 | 110 | const sendClientEvent = (eventObj: any, eventNameSuffix = '') => { 111 | if (!sdkClientRef.current) { 112 | console.error('SDK client not available', eventObj); 113 | return; 114 | } 115 | 116 | try { 117 | sdkClientRef.current.sendEvent(eventObj); 118 | } catch (err) { 119 | console.error('Failed to send via SDK', err); 120 | } 121 | }; 122 | 123 | 124 | useEffect(() => { 125 | let finalAgentConfig = searchParams.get("agentConfig"); 126 | if (!finalAgentConfig || !allAgentSets[finalAgentConfig]) { 127 | finalAgentConfig = defaultAgentSetKey; 128 | const url = new URL(window.location.toString()); 129 | url.searchParams.set("agentConfig", finalAgentConfig); 130 | window.location.replace(url.toString()); 131 | return; 132 | } 133 | 134 | const agents = allAgentSets[finalAgentConfig]; 135 | const agentKeyToUse = agents[0]?.name || ""; 136 | 137 | setSelectedAgentName(agentKeyToUse); 138 | setSelectedAgentConfigSet(agents); 139 | }, [searchParams]); 140 | 141 | useEffect(() => { 142 | if (selectedAgentName && sessionStatus === "DISCONNECTED") { 143 | connectToRealtime(); 144 | } 145 | }, [selectedAgentName]); 146 | 147 | useEffect(() => { 148 | if ( 149 | sessionStatus === "CONNECTED" && 150 | selectedAgentConfigSet && 151 | selectedAgentName 152 | ) { 153 | const currentAgent = selectedAgentConfigSet.find( 154 | (a) => a.name === selectedAgentName 155 | ); 156 | addTranscriptBreadcrumb(`Agent: ${selectedAgentName}`, currentAgent); 157 | updateSession(true); 158 | } 159 | }, [selectedAgentConfigSet, selectedAgentName, sessionStatus]); 160 | 161 | useEffect(() => { 162 | if (sessionStatus === "CONNECTED") { 163 | console.log( 164 | `updatingSession, isPTTACtive=${isPTTActive} sessionStatus=${sessionStatus}` 165 | ); 166 | updateSession(); 167 | } 168 | }, [isPTTActive]); 169 | 170 | const fetchEphemeralKey = async (): Promise => { 171 | logClientEvent({ url: "/session" }, "fetch_session_token_request"); 172 | const tokenResponse = await fetch("/api/session"); 173 | const data = await tokenResponse.json(); 174 | logServerEvent(data, "fetch_session_token_response"); 175 | 176 | if (!data.client_secret?.value) { 177 | logClientEvent(data, "error.no_ephemeral_key"); 178 | console.error("No ephemeral key provided by the server"); 179 | setSessionStatus("DISCONNECTED"); 180 | return null; 181 | } 182 | 183 | return data.client_secret.value; 184 | }; 185 | 186 | const connectToRealtime = async () => { 187 | const agentSetKey = searchParams.get("agentConfig") || "default"; 188 | if (sdkScenarioMap[agentSetKey]) { 189 | // Use new SDK path 190 | if (sessionStatus !== "DISCONNECTED") return; 191 | setSessionStatus("CONNECTING"); 192 | 193 | try { 194 | const EPHEMERAL_KEY = await fetchEphemeralKey(); 195 | if (!EPHEMERAL_KEY) return; 196 | 197 | // Ensure the selectedAgentName is first so that it becomes the root 198 | const reorderedAgents = [...sdkScenarioMap[agentSetKey]]; 199 | const idx = reorderedAgents.findIndex((a) => a.name === selectedAgentName); 200 | if (idx > 0) { 201 | const [agent] = reorderedAgents.splice(idx, 1); 202 | reorderedAgents.unshift(agent); 203 | } 204 | 205 | const client = new RealtimeClient({ 206 | getEphemeralKey: async () => EPHEMERAL_KEY, 207 | initialAgents: reorderedAgents, 208 | audioElement: sdkAudioElement, 209 | extraContext: { 210 | addTranscriptBreadcrumb, 211 | }, 212 | } as any); 213 | 214 | sdkClientRef.current = client; 215 | 216 | 
client.on("connection_change", (status) => { 217 | if (status === "connected") setSessionStatus("CONNECTED"); 218 | else if (status === "connecting") setSessionStatus("CONNECTING"); 219 | else setSessionStatus("DISCONNECTED"); 220 | }); 221 | 222 | client.on("message", (ev) => { 223 | logServerEvent(ev); 224 | 225 | // --- Realtime streaming handling --------------------------------- 226 | // The Realtime transport emits granular *delta* events while the 227 | // assistant is speaking or while the user's audio is still being 228 | // transcribed. Those events were previously only logged which made 229 | // the UI update only once when the final conversation.item.* event 230 | // arrived – effectively disabling streaming. We now listen for the 231 | // delta events and update the transcript as they arrive so that 232 | // 1) assistant messages stream token-by-token, and 233 | // 2) the user sees a live "Transcribing…" placeholder while we are 234 | // still converting their speech to text. 235 | 236 | // NOTE: The exact payloads are still evolving. We intentionally 237 | // access properties defensively to avoid runtime crashes if fields 238 | // are renamed or missing. 239 | 240 | try { 241 | // Guardrail trip event – mark last assistant message as FAIL 242 | if (ev.type === 'guardrail_tripped') { 243 | const lastAssistant = [...transcriptItemsRef.current] 244 | .reverse() 245 | .find((i) => i.role === 'assistant'); 246 | 247 | if (lastAssistant) { 248 | updateTranscriptItem(lastAssistant.itemId, { 249 | guardrailResult: { 250 | status: 'DONE', 251 | category: 'OFF_BRAND', 252 | rationale: 'Guardrail triggered', 253 | testText: '', 254 | }, 255 | } as any); 256 | } 257 | return; 258 | } 259 | 260 | // Response finished – if we still have Pending guardrail mark as 261 | // Pass. This event fires once per assistant turn. 262 | if (ev.type === 'response.done') { 263 | const lastAssistant = [...transcriptItemsRef.current] 264 | .reverse() 265 | .find((i) => i.role === 'assistant'); 266 | 267 | if (lastAssistant) { 268 | const existing: any = (lastAssistant as any).guardrailResult; 269 | if (!existing || existing.status === 'IN_PROGRESS') { 270 | updateTranscriptItem(lastAssistant.itemId, { 271 | guardrailResult: { 272 | status: 'DONE', 273 | category: 'NONE', 274 | rationale: '', 275 | }, 276 | } as any); 277 | } 278 | } 279 | // continue processing other logic if needed 280 | } 281 | // Assistant text (or audio-to-text) streaming 282 | if ( 283 | ev.type === 'response.text.delta' || 284 | ev.type === 'response.audio_transcript.delta' 285 | ) { 286 | const itemId: string | undefined = (ev as any).item_id ?? (ev as any).itemId; 287 | const delta: string | undefined = (ev as any).delta ?? (ev as any).text; 288 | if (!itemId || !delta) return; 289 | 290 | // Ensure a transcript message exists for this assistant item. 291 | if (!transcriptItemsRef.current.some((t) => t.itemId === itemId)) { 292 | addTranscriptMessage(itemId, 'assistant', ''); 293 | updateTranscriptItem(itemId, { 294 | guardrailResult: { 295 | status: 'IN_PROGRESS', 296 | }, 297 | } as any); 298 | } 299 | 300 | // Append the latest delta so the UI streams. 301 | updateTranscriptMessage(itemId, delta, true); 302 | updateTranscriptItem(itemId, { status: 'IN_PROGRESS' }); 303 | return; 304 | } 305 | 306 | // Live user transcription streaming 307 | if (ev.type === 'conversation.input_audio_transcription.delta') { 308 | const itemId: string | undefined = (ev as any).item_id ?? 
(ev as any).itemId; 309 | const delta: string | undefined = (ev as any).delta ?? (ev as any).text; 310 | if (!itemId || typeof delta !== 'string') return; 311 | 312 | // If this is the very first chunk, create a hidden user message 313 | // so that we can surface "Transcribing…" immediately. 314 | if (!transcriptItemsRef.current.some((t) => t.itemId === itemId)) { 315 | addTranscriptMessage(itemId, 'user', 'Transcribing…'); 316 | } 317 | 318 | updateTranscriptMessage(itemId, delta, true); 319 | updateTranscriptItem(itemId, { status: 'IN_PROGRESS' }); 320 | } 321 | 322 | // Detect start of a new user speech segment when VAD kicks in. 323 | if (ev.type === 'input_audio_buffer.speech_started') { 324 | const itemId: string | undefined = (ev as any).item_id; 325 | if (!itemId) return; 326 | 327 | const exists = transcriptItemsRef.current.some( 328 | (t) => t.itemId === itemId, 329 | ); 330 | if (!exists) { 331 | addTranscriptMessage(itemId, 'user', 'Transcribing…'); 332 | updateTranscriptItem(itemId, { status: 'IN_PROGRESS' }); 333 | } 334 | } 335 | 336 | // Final transcript once Whisper finishes 337 | if ( 338 | ev.type === 'conversation.item.input_audio_transcription.completed' 339 | ) { 340 | const itemId: string | undefined = (ev as any).item_id; 341 | const transcriptText: string | undefined = (ev as any).transcript; 342 | if (!itemId || typeof transcriptText !== 'string') return; 343 | 344 | const exists = transcriptItemsRef.current.some( 345 | (t) => t.itemId === itemId, 346 | ); 347 | if (!exists) { 348 | addTranscriptMessage(itemId, 'user', transcriptText.trim()); 349 | } else { 350 | // Replace placeholder / delta text with final transcript 351 | updateTranscriptMessage(itemId, transcriptText.trim(), false); 352 | } 353 | updateTranscriptItem(itemId, { status: 'DONE' }); 354 | } 355 | 356 | // Assistant streaming tokens or transcript 357 | if ( 358 | ev.type === 'response.text.delta' || 359 | ev.type === 'response.audio_transcript.delta' 360 | ) { 361 | const responseId: string | undefined = 362 | (ev as any).response_id ?? (ev as any).responseId; 363 | const delta: string | undefined = (ev as any).delta ?? (ev as any).text; 364 | if (!responseId || typeof delta !== 'string') return; 365 | 366 | // We'll use responseId as part of itemId to make it deterministic. 367 | const itemId = `assistant-${responseId}`; 368 | 369 | if (!transcriptItemsRef.current.some((t) => t.itemId === itemId)) { 370 | addTranscriptMessage(itemId, 'assistant', ''); 371 | } 372 | 373 | updateTranscriptMessage(itemId, delta, true); 374 | updateTranscriptItem(itemId, { status: 'IN_PROGRESS' }); 375 | } 376 | } catch (err) { 377 | // Streaming is best-effort – never break the session because of it. 378 | console.warn('streaming-ui error', err); 379 | } 380 | }); 381 | 382 | client.on('history_added', (item) => { 383 | logHistoryItem(item); 384 | 385 | // Update the transcript view 386 | if (item.type === 'message') { 387 | const textContent = (item.content || []) 388 | .map((c: any) => { 389 | if (c.type === 'text') return c.text; 390 | if (c.type === 'input_text') return c.text; 391 | if (c.type === 'input_audio') return c.transcript ?? ''; 392 | if (c.type === 'audio') return c.transcript ?? 
''; 393 | return ''; 394 | }) 395 | .join(' ') 396 | .trim(); 397 | 398 | if (!textContent) return; 399 | 400 | const role = item.role as 'user' | 'assistant'; 401 | 402 | // No PTT placeholder logic needed 403 | 404 | const exists = transcriptItemsRef.current.some( 405 | (t) => t.itemId === item.itemId, 406 | ); 407 | 408 | if (!exists) { 409 | addTranscriptMessage(item.itemId, role, textContent, false); 410 | if (role === 'assistant') { 411 | updateTranscriptItem(item.itemId, { 412 | guardrailResult: { 413 | status: 'IN_PROGRESS', 414 | }, 415 | } as any); 416 | } 417 | } else { 418 | updateTranscriptMessage(item.itemId, textContent, false); 419 | } 420 | 421 | // After assistant message completes, add default guardrail PASS if none present. 422 | if ( 423 | role === 'assistant' && 424 | (item as any).status === 'completed' 425 | ) { 426 | const current = transcriptItemsRef.current.find( 427 | (t) => t.itemId === item.itemId, 428 | ); 429 | const existing = (current as any)?.guardrailResult; 430 | if (existing && existing.status !== 'IN_PROGRESS') { 431 | // already final (e.g., FAIL) – leave as is. 432 | } else { 433 | updateTranscriptItem(item.itemId, { 434 | guardrailResult: { 435 | status: 'DONE', 436 | category: 'NONE', 437 | rationale: '', 438 | }, 439 | } as any); 440 | } 441 | } 442 | 443 | if ('status' in item) { 444 | updateTranscriptItem(item.itemId, { 445 | status: 446 | (item as any).status === 'completed' 447 | ? 'DONE' 448 | : 'IN_PROGRESS', 449 | }); 450 | } 451 | } 452 | 453 | // Surface function / hand-off calls as breadcrumbs 454 | if (item.type === 'function_call') { 455 | const title = `Tool call: ${(item as any).name}`; 456 | 457 | if (!loggedFunctionCallsRef.current.has(item.itemId)) { 458 | addTranscriptBreadcrumb(title, { 459 | arguments: (item as any).arguments, 460 | }); 461 | loggedFunctionCallsRef.current.add(item.itemId); 462 | 463 | // If this looks like a handoff (transfer_to_*), switch active 464 | // agent so subsequent session updates & breadcrumbs reflect the 465 | // new agent. The Realtime SDK already updated the session on 466 | // the backend; this only affects the UI state. 467 | const toolName: string = (item as any).name ?? ''; 468 | const handoffMatch = toolName.match(/^transfer_to_(.+)$/); 469 | if (handoffMatch) { 470 | const newAgentKey = handoffMatch[1]; 471 | 472 | // Find agent whose name matches (case-insensitive) 473 | const candidate = selectedAgentConfigSet?.find( 474 | (a) => a.name.toLowerCase() === newAgentKey.toLowerCase(), 475 | ); 476 | if (candidate && candidate.name !== selectedAgentName) { 477 | setSelectedAgentName(candidate.name); 478 | } 479 | } 480 | } 481 | return; 482 | } 483 | }); 484 | 485 | // Handle continuous updates for existing items so streaming assistant 486 | // speech shows up while in_progress. 487 | client.on('history_updated', (history) => { 488 | history.forEach((item: any) => { 489 | if (item.type === 'function_call') { 490 | // Update breadcrumb data (e.g., add output) once we have more info. 491 | 492 | if (!loggedFunctionCallsRef.current.has(item.itemId)) { 493 | addTranscriptBreadcrumb(`Tool call: ${(item as any).name}`, { 494 | arguments: (item as any).arguments, 495 | output: (item as any).output, 496 | }); 497 | loggedFunctionCallsRef.current.add(item.itemId); 498 | 499 | const toolName: string = (item as any).name ?? 
''; 500 | const handoffMatch = toolName.match(/^transfer_to_(.+)$/); 501 | if (handoffMatch) { 502 | const newAgentKey = handoffMatch[1]; 503 | const candidate = selectedAgentConfigSet?.find( 504 | (a) => a.name.toLowerCase() === newAgentKey.toLowerCase(), 505 | ); 506 | if (candidate && candidate.name !== selectedAgentName) { 507 | setSelectedAgentName(candidate.name); 508 | } 509 | } 510 | } 511 | 512 | return; 513 | } 514 | 515 | if (item.type !== 'message') return; 516 | 517 | const textContent = (item.content || []) 518 | .map((c: any) => { 519 | if (c.type === 'text') return c.text; 520 | if (c.type === 'input_text') return c.text; 521 | if (c.type === 'input_audio') return c.transcript ?? ''; 522 | if (c.type === 'audio') return c.transcript ?? ''; 523 | return ''; 524 | }) 525 | .join(' ') 526 | .trim(); 527 | 528 | const role = item.role as 'user' | 'assistant'; 529 | 530 | if (!textContent) return; 531 | 532 | const exists = transcriptItemsRef.current.some( 533 | (t) => t.itemId === item.itemId, 534 | ); 535 | if (!exists) { 536 | addTranscriptMessage(item.itemId, role, textContent, false); 537 | if (role === 'assistant') { 538 | updateTranscriptItem(item.itemId, { 539 | guardrailResult: { 540 | status: 'IN_PROGRESS', 541 | }, 542 | } as any); 543 | } 544 | } else { 545 | updateTranscriptMessage(item.itemId, textContent, false); 546 | } 547 | 548 | if ('status' in item) { 549 | updateTranscriptItem(item.itemId, { 550 | status: 551 | (item as any).status === 'completed' 552 | ? 'DONE' 553 | : 'IN_PROGRESS', 554 | }); 555 | } 556 | }); 557 | }); 558 | 559 | await client.connect(); 560 | } catch (err) { 561 | console.error("Error connecting via SDK:", err); 562 | setSessionStatus("DISCONNECTED"); 563 | } 564 | return; 565 | } 566 | }; 567 | 568 | const disconnectFromRealtime = () => { 569 | if (sdkClientRef.current) { 570 | sdkClientRef.current.disconnect(); 571 | sdkClientRef.current = null; 572 | } 573 | setSessionStatus("DISCONNECTED"); 574 | setIsPTTUserSpeaking(false); 575 | 576 | logClientEvent({}, "disconnected"); 577 | }; 578 | 579 | const sendSimulatedUserMessage = (text: string) => { 580 | const id = uuidv4().slice(0, 32); 581 | addTranscriptMessage(id, "user", text, true); 582 | 583 | sendClientEvent( 584 | { 585 | type: "conversation.item.create", 586 | item: { 587 | id, 588 | type: "message", 589 | role: "user", 590 | content: [{ type: "input_text", text }], 591 | }, 592 | }, 593 | "(simulated user text message)" 594 | ); 595 | sendClientEvent( 596 | { type: "response.create" }, 597 | "(trigger response after simulated user text message)" 598 | ); 599 | }; 600 | 601 | const updateSession = (shouldTriggerResponse: boolean = false) => { 602 | // In SDK scenarios RealtimeClient manages session config automatically. 603 | if (sdkClientRef.current) { 604 | if (shouldTriggerResponse) { 605 | sendSimulatedUserMessage('hi'); 606 | } 607 | 608 | // Reflect Push-to-Talk UI state by (de)activating server VAD on the 609 | // backend. The Realtime SDK supports live session updates via the 610 | // `session.update` event. 611 | const client = sdkClientRef.current; 612 | if (client) { 613 | const turnDetection = isPTTActive 614 | ? 
null 615 | : { 616 | type: 'server_vad', 617 | threshold: 0.9, 618 | prefix_padding_ms: 300, 619 | silence_duration_ms: 500, 620 | create_response: true, 621 | }; 622 | try { 623 | client.sendEvent({ 624 | type: 'session.update', 625 | session: { 626 | turn_detection: turnDetection, 627 | }, 628 | }); 629 | } catch (err) { 630 | console.warn('Failed to update session', err); 631 | } 632 | } 633 | return; 634 | } 635 | }; 636 | 637 | const cancelAssistantSpeech = async () => { 638 | 639 | // Interrupts server response and clears local audio. 640 | if (sdkClientRef.current) { 641 | try { 642 | sdkClientRef.current.interrupt(); 643 | } catch (err) { 644 | console.error('Failed to interrupt', err); 645 | } 646 | } 647 | }; 648 | 649 | const handleSendTextMessage = () => { 650 | if (!userText.trim()) return; 651 | cancelAssistantSpeech(); 652 | 653 | if (!sdkClientRef.current) { 654 | console.error('SDK client not available'); 655 | return; 656 | } 657 | 658 | try { 659 | sdkClientRef.current.sendUserText(userText.trim()); 660 | } catch (err) { 661 | console.error('Failed to send via SDK', err); 662 | } 663 | 664 | setUserText(""); 665 | }; 666 | 667 | const handleTalkButtonDown = () => { 668 | if (sessionStatus !== 'CONNECTED' || sdkClientRef.current == null) return; 669 | cancelAssistantSpeech(); 670 | 671 | setIsPTTUserSpeaking(true); 672 | sendClientEvent({ type: "input_audio_buffer.clear" }, "clear PTT buffer"); 673 | 674 | // No placeholder; we'll rely on server transcript once ready. 675 | }; 676 | 677 | const handleTalkButtonUp = () => { 678 | if (sessionStatus !== 'CONNECTED' || sdkClientRef.current == null || !isPTTUserSpeaking) 679 | return; 680 | 681 | setIsPTTUserSpeaking(false); 682 | sendClientEvent({ type: "input_audio_buffer.commit" }, "commit PTT"); 683 | sendClientEvent({ type: "response.create" }, "trigger response PTT"); 684 | }; 685 | 686 | const onToggleConnection = () => { 687 | if (sessionStatus === "CONNECTED" || sessionStatus === "CONNECTING") { 688 | disconnectFromRealtime(); 689 | setSessionStatus("DISCONNECTED"); 690 | } else { 691 | connectToRealtime(); 692 | } 693 | }; 694 | 695 | const handleAgentChange = (e: React.ChangeEvent) => { 696 | const newAgentConfig = e.target.value; 697 | const url = new URL(window.location.toString()); 698 | url.searchParams.set("agentConfig", newAgentConfig); 699 | window.location.replace(url.toString()); 700 | }; 701 | 702 | const handleSelectedAgentChange = ( 703 | e: React.ChangeEvent 704 | ) => { 705 | const newAgentName = e.target.value; 706 | // Reconnect session with the newly selected agent as root so that tool 707 | // execution works correctly. 
708 | disconnectFromRealtime(); 709 | setSelectedAgentName(newAgentName); 710 | // connectToRealtime will be triggered by effect watching selectedAgentName 711 | }; 712 | 713 | // Instead of using setCodec, we update the URL and refresh the page when codec changes 714 | const handleCodecChange = (newCodec: string) => { 715 | const url = new URL(window.location.toString()); 716 | url.searchParams.set("codec", newCodec); 717 | window.location.replace(url.toString()); 718 | }; 719 | 720 | useEffect(() => { 721 | const storedPushToTalkUI = localStorage.getItem("pushToTalkUI"); 722 | if (storedPushToTalkUI) { 723 | setIsPTTActive(storedPushToTalkUI === "true"); 724 | } 725 | const storedLogsExpanded = localStorage.getItem("logsExpanded"); 726 | if (storedLogsExpanded) { 727 | setIsEventsPaneExpanded(storedLogsExpanded === "true"); 728 | } 729 | const storedAudioPlaybackEnabled = localStorage.getItem( 730 | "audioPlaybackEnabled" 731 | ); 732 | if (storedAudioPlaybackEnabled) { 733 | setIsAudioPlaybackEnabled(storedAudioPlaybackEnabled === "true"); 734 | } 735 | }, []); 736 | 737 | useEffect(() => { 738 | localStorage.setItem("pushToTalkUI", isPTTActive.toString()); 739 | }, [isPTTActive]); 740 | 741 | useEffect(() => { 742 | localStorage.setItem("logsExpanded", isEventsPaneExpanded.toString()); 743 | }, [isEventsPaneExpanded]); 744 | 745 | useEffect(() => { 746 | localStorage.setItem( 747 | "audioPlaybackEnabled", 748 | isAudioPlaybackEnabled.toString() 749 | ); 750 | }, [isAudioPlaybackEnabled]); 751 | 752 | useEffect(() => { 753 | if (audioElementRef.current) { 754 | if (isAudioPlaybackEnabled) { 755 | audioElementRef.current.muted = false; 756 | audioElementRef.current.play().catch((err) => { 757 | console.warn("Autoplay may be blocked by browser:", err); 758 | }); 759 | } else { 760 | // Mute and pause to avoid brief audio blips before pause takes effect. 761 | audioElementRef.current.muted = true; 762 | audioElementRef.current.pause(); 763 | } 764 | } 765 | 766 | // Toggle server-side audio stream mute so bandwidth is saved when the 767 | // user disables playback. Only supported when using the SDK path. 768 | if (sdkClientRef.current) { 769 | try { 770 | sdkClientRef.current.mute(!isAudioPlaybackEnabled); 771 | } catch (err) { 772 | console.warn('Failed to toggle SDK mute', err); 773 | } 774 | } 775 | }, [isAudioPlaybackEnabled]); 776 | 777 | // Ensure mute state is propagated to transport right after we connect or 778 | // whenever the SDK client reference becomes available. 779 | useEffect(() => { 780 | if (sessionStatus === 'CONNECTED' && sdkClientRef.current) { 781 | try { 782 | sdkClientRef.current.mute(!isAudioPlaybackEnabled); 783 | } catch (err) { 784 | console.warn('mute sync after connect failed', err); 785 | } 786 | } 787 | }, [sessionStatus, isAudioPlaybackEnabled]); 788 | 789 | useEffect(() => { 790 | if (sessionStatus === "CONNECTED" && audioElementRef.current?.srcObject) { 791 | // The remote audio stream from the audio element. 792 | const remoteStream = audioElementRef.current.srcObject as MediaStream; 793 | startRecording(remoteStream); 794 | } 795 | 796 | // Clean up on unmount or when sessionStatus is updated. 797 | return () => { 798 | stopRecording(); 799 | }; 800 | }, [sessionStatus]); 801 | 802 | const agentSetKey = searchParams.get("agentConfig") || "default"; 803 | 804 | return ( 805 |
806 |
807 |
window.location.reload()} 810 | > 811 |
812 | OpenAI Logo 819 |
820 |
821 | Realtime API Agents 822 |
823 |
824 |
825 | 828 |
829 | 840 |
841 | 842 | 847 | 848 |
849 |
850 | 851 | {agentSetKey && ( 852 |
853 | 856 |
857 | 868 |
869 | 874 | 879 | 880 |
881 |
882 |
883 | )} 884 |
885 |
886 | 887 |
888 | 898 | 899 | 900 |
901 | 902 | 917 |
918 | );
919 | }
920 | 
921 | export default App;
922 | 
--------------------------------------------------------------------------------
/src/app/agentConfigs/chatSupervisor/index.ts:
--------------------------------------------------------------------------------
1 | import { RealtimeAgent } from '@openai/agents/realtime'
2 | import { getNextResponseFromSupervisor } from './supervisorAgent';
3 | 
4 | export const chatAgent = new RealtimeAgent({
5 | name: 'chatAgent',
6 | voice: 'sage',
7 | instructions: `
8 | You are a helpful junior customer service agent. Your task is to maintain a natural conversation flow with the user, help them resolve their query in a way that's helpful, efficient, and correct, and to defer heavily to a more experienced and intelligent Supervisor Agent.
9 | 
10 | # General Instructions
11 | - You are very new and can only handle basic tasks; you will rely heavily on the Supervisor Agent via the getNextResponseFromSupervisor tool.
12 | - By default, you must always use the getNextResponseFromSupervisor tool to get your next response, except for very specific exceptions.
13 | - You represent a company called NewTelco.
14 | - Always greet the user with "Hi, you've reached NewTelco, how can I help you?"
15 | - If the user says "hi", "hello", or similar greetings in later messages, respond naturally and briefly (e.g., "Hello!" or "Hi there!") instead of repeating the canned greeting.
16 | - In general, don't say the same thing twice; always vary your phrasing to ensure the conversation feels natural.
17 | - Do not use any of the information or values from the examples as a reference in conversation.
18 | 
19 | ## Tone
20 | - Maintain an extremely neutral, unexpressive, and to-the-point tone at all times.
21 | - Do not use sing-song-y or overly friendly language.
22 | - Be quick and concise.
23 | 
24 | # Tools
25 | - You can ONLY call getNextResponseFromSupervisor.
26 | - Even if you're provided other tools in this prompt as a reference, NEVER call them directly.
27 | 
28 | # Allow List of Permitted Actions
29 | You can take the following actions directly, and don't need to use getNextResponseFromSupervisor for these.
30 | 
31 | ## Basic chitchat
32 | - Handle greetings (e.g., "hello", "hi there").
33 | - Engage in basic chitchat (e.g., "how are you?", "thank you").
34 | - Respond to requests to repeat or clarify information (e.g., "can you repeat that?").
35 | 
36 | ## Collect information for Supervisor Agent tool calls
37 | - Request user information needed to call tools. Refer to the Supervisor Tools section below for the full definitions and schema.
38 | 
39 | ### Supervisor Agent Tools
40 | NEVER call these tools directly; they are only provided as a reference for collecting parameters for the supervisor model to use.
41 | 
42 | lookupPolicyDocument:
43 |   description: Look up internal documents and policies by topic or keyword.
44 |   params:
45 |     topic: string (required) - The topic or keyword to search for.
46 | 
47 | getUserAccountInfo:
48 |   description: Get user account and billing information (read-only).
49 |   params:
50 |     phone_number: string (required) - User's phone number.
51 | 
52 | findNearestStore:
53 |   description: Find the nearest store location given a zip code.
54 |   params:
55 |     zip_code: string (required) - The customer's 5-digit zip code.
56 | 
57 | **You must NOT answer, resolve, or attempt to handle ANY other type of request, question, or issue yourself. For absolutely everything else, you MUST use the getNextResponseFromSupervisor tool to get your response.
This includes ANY factual, account-specific, or process-related questions, no matter how minor they may seem.**
58 | 
59 | # getNextResponseFromSupervisor Usage
60 | - For ALL requests that are not strictly and explicitly listed above, you MUST ALWAYS use the getNextResponseFromSupervisor tool, which will ask the Supervisor Agent for a high-quality response you can use.
61 | - For example, this could be to answer factual questions about accounts or business processes, or requests to take actions.
62 | - Do NOT attempt to answer, resolve, or speculate on any other requests, even if you think you know the answer or it seems simple.
63 | - You should make NO assumptions about what you can or can't do. Always defer to getNextResponseFromSupervisor() for all non-trivial queries.
64 | - Before calling getNextResponseFromSupervisor, you MUST ALWAYS say something to the user (see the 'Sample Filler Phrases' section). Never call getNextResponseFromSupervisor without first saying something to the user.
65 | - Filler phrases must NOT indicate whether you can or cannot fulfill an action; they should be neutral and not imply any outcome.
66 | - After the filler phrase, YOU MUST ALWAYS call the getNextResponseFromSupervisor tool.
67 | - This is required for every use of getNextResponseFromSupervisor, without exception. Do not skip the filler phrase, even if the user has just provided information or context.
68 | - You will use this tool extensively.
69 | 
70 | ## How getNextResponseFromSupervisor Works
71 | - This asks supervisorAgent what to do next. supervisorAgent is a more senior, more intelligent and capable agent that has access to the full conversation transcript so far and can call the above functions.
72 | - You must provide it with key context, ONLY from the most recent user message, as the supervisor may not have access to that message.
73 | - This should be as concise as absolutely possible, and can be an empty string if no salient information is in the last user message.
74 | - That agent then analyzes the transcript, potentially calls functions to formulate an answer, and then provides a high-quality answer, which you should read verbatim.
75 | 
76 | # Sample Filler Phrases
77 | - "Just a second."
78 | - "Let me check."
79 | - "One moment."
80 | - "Let me look into that."
81 | - "Give me a moment."
82 | - "Let me see."
83 | 
84 | # Example
85 | - User: "Hi"
86 | - Assistant: "Hi, you've reached NewTelco, how can I help you?"
87 | - User: "I'm wondering why my recent bill was so high"
88 | - Assistant: "Sure, may I have your phone number so I can look that up?"
89 | - User: 206 135 1246
90 | - Assistant: "Okay, let me look into that" // Required filler phrase
91 | - getNextResponseFromSupervisor(relevantContextFromLastUserMessage="Phone number: 206 135 1246")
92 | - getNextResponseFromSupervisor(): "# Message\nOkay, I've pulled that up. Your last bill was $xx.xx, mainly due to $y.yy in international calls and $z.zz in data overage. Does that make sense?"
93 | - Assistant: "Okay, I've pulled that up. It looks like your last bill was $xx.xx, which is higher than your usual amount because of $y.yy in international calls and $z.zz in data overage charges. Does that make sense?"
94 | - User: "Okay, yes, thank you."
95 | - Assistant: "Of course, please let me know if I can help with anything else."
96 | - User: "Actually, I'm wondering if my address is up to date, what address do you have on file?"
97 | - Assistant: "1234 Pine St. in Seattle, is that your latest?"
98 | - User: "Yes, looks good, thank you" 99 | - Assistant: "Great, anything else I can help with?" 100 | - User: "Nope that's great, bye!" 101 | - Assistant: "Of course, thanks for calling NewTelco!" 102 | 103 | # Additional Example (Filler Phrase Before getNextResponseFromSupervisor) 104 | - User: "Can you tell me what my current plan includes?" 105 | - Assistant: "One moment." 106 | - getNextResponseFromSupervisor(relevantContextFromLastUserMessage="Wants to know what their current plan includes") 107 | - getNextResponseFromSupervisor(): "# Message\nYour current plan includes unlimited talk and text, plus 10GB of data per month. Would you like more details or information about upgrading?" 108 | - Assistant: "Your current plan includes unlimited talk and text, plus 10GB of data per month. Would you like more details or information about upgrading?" 109 | `, 110 | tools: [ 111 | getNextResponseFromSupervisor, 112 | ], 113 | }); 114 | 115 | export const chatSupervisorScenario = [chatAgent]; 116 | 117 | export default chatSupervisorScenario; 118 | -------------------------------------------------------------------------------- /src/app/agentConfigs/chatSupervisor/sampleData.ts: -------------------------------------------------------------------------------- 1 | export const exampleAccountInfo = { 2 | accountId: "NT-123456", 3 | name: "Alex Johnson", 4 | phone: "+1-206-135-1246", 5 | email: "alex.johnson@email.com", 6 | plan: "Unlimited Plus", 7 | balanceDue: "$42.17", 8 | lastBillDate: "2024-05-15", 9 | lastPaymentDate: "2024-05-20", 10 | lastPaymentAmount: "$42.17", 11 | status: "Active", 12 | address: { 13 | street: "1234 Pine St", 14 | city: "Seattle", 15 | state: "WA", 16 | zip: "98101" 17 | }, 18 | lastBillDetails: { 19 | basePlan: "$30.00", 20 | internationalCalls: "$8.00", 21 | dataOverage: "$4.00", 22 | taxesAndFees: "$0.17", 23 | notes: "Higher than usual due to international calls and data overage." 24 | } 25 | }; 26 | 27 | export const examplePolicyDocs = [ 28 | { 29 | id: "ID-010", 30 | name: "Family Plan Policy", 31 | topic: "family plan options", 32 | content: 33 | "The family plan allows up to 5 lines per account. All lines share a single data pool. Each additional line after the first receives a 10% discount. All lines must be on the same account.", 34 | }, 35 | { 36 | id: "ID-020", 37 | name: "Promotions and Discounts Policy", 38 | topic: "promotions and discounts", 39 | content: 40 | "The Summer Unlimited Data Sale provides a 20% discount on the Unlimited Plus plan for the first 6 months for new activations completed by July 31, 2024. The Refer-a-Friend Bonus provides a $50 bill credit to both the referring customer and the new customer after 60 days of active service, for activations by August 31, 2024. A maximum of 5 referral credits may be earned per account. Discounts cannot be combined with other offers.", 41 | }, 42 | { 43 | id: "ID-030", 44 | name: "International Plans Policy", 45 | topic: "international plans", 46 | content: 47 | "International plans are available and include discounted calling, texting, and data usage in over 100 countries.", 48 | }, 49 | { 50 | id: "ID-040", 51 | name: "Handset Offers Policy", 52 | topic: "new handsets", 53 | content: 54 | "Handsets from brands such as Apple and Google are available. The iPhone 16 is $200 and the Google Pixel 8 is available for $0, both with an additional 18-month commitment. These offers are valid while supplies last and may require eligible plans or trade-ins.
For more details, visit one of our stores.", 55 | }, 56 | ]; 57 | 58 | export const exampleStoreLocations = [ 59 | // NorCal 60 | { 61 | name: "NewTelco San Francisco Downtown Store", 62 | address: "1 Market St, San Francisco, CA", 63 | zip_code: "94105", 64 | phone: "(415) 555-1001", 65 | hours: "Mon-Sat 10am-7pm, Sun 11am-5pm" 66 | }, 67 | { 68 | name: "NewTelco San Jose Valley Fair Store", 69 | address: "2855 Stevens Creek Blvd, Santa Clara, CA", 70 | zip_code: "95050", 71 | phone: "(408) 555-2002", 72 | hours: "Mon-Sat 10am-8pm, Sun 11am-6pm" 73 | }, 74 | { 75 | name: "NewTelco Sacramento Midtown Store", 76 | address: "1801 L St, Sacramento, CA", 77 | zip_code: "95811", 78 | phone: "(916) 555-3003", 79 | hours: "Mon-Sat 10am-7pm, Sun 12pm-5pm" 80 | }, 81 | // SoCal 82 | { 83 | name: "NewTelco Los Angeles Hollywood Store", 84 | address: "6801 Hollywood Blvd, Los Angeles, CA", 85 | zip_code: "90028", 86 | phone: "(323) 555-4004", 87 | hours: "Mon-Sat 10am-9pm, Sun 11am-7pm" 88 | }, 89 | { 90 | name: "NewTelco San Diego Gaslamp Store", 91 | address: "555 5th Ave, San Diego, CA", 92 | zip_code: "92101", 93 | phone: "(619) 555-5005", 94 | hours: "Mon-Sat 10am-8pm, Sun 11am-6pm" 95 | }, 96 | { 97 | name: "NewTelco Irvine Spectrum Store", 98 | address: "670 Spectrum Center Dr, Irvine, CA", 99 | zip_code: "92618", 100 | phone: "(949) 555-6006", 101 | hours: "Mon-Sat 10am-8pm, Sun 11am-6pm" 102 | }, 103 | // East Coast 104 | { 105 | name: "NewTelco New York City Midtown Store", 106 | address: "350 5th Ave, New York, NY", 107 | zip_code: "10118", 108 | phone: "(212) 555-7007", 109 | hours: "Mon-Sat 9am-8pm, Sun 10am-6pm" 110 | }, 111 | { 112 | name: "NewTelco Boston Back Bay Store", 113 | address: "800 Boylston St, Boston, MA", 114 | zip_code: "02199", 115 | phone: "(617) 555-8008", 116 | hours: "Mon-Sat 10am-7pm, Sun 12pm-6pm" 117 | }, 118 | { 119 | name: "NewTelco Washington DC Georgetown Store", 120 | address: "1234 Wisconsin Ave NW, Washington, DC", 121 | zip_code: "20007", 122 | phone: "(202) 555-9009", 123 | hours: "Mon-Sat 10am-7pm, Sun 12pm-5pm" 124 | }, 125 | { 126 | name: "NewTelco Miami Beach Store", 127 | address: "1601 Collins Ave, Miami Beach, FL", 128 | zip_code: "33139", 129 | phone: "(305) 555-1010", 130 | hours: "Mon-Sat 10am-8pm, Sun 11am-6pm" 131 | } 132 | ]; -------------------------------------------------------------------------------- /src/app/agentConfigs/chatSupervisor/supervisorAgent.ts: -------------------------------------------------------------------------------- 1 | import { RealtimeItem, tool } from '@openai/agents/realtime'; 2 | 3 | 4 | import { 5 | exampleAccountInfo, 6 | examplePolicyDocs, 7 | exampleStoreLocations, 8 | } from './sampleData'; 9 | 10 | export const supervisorAgentInstructions = `You are an expert customer service supervisor agent, tasked with providing real-time guidance to a more junior agent that's chatting directly with the customer. You will be given detailed response instructions, tools, and the full conversation history so far, and you should create a correct next message that the junior agent can read directly. 
11 | 12 | # Instructions 13 | - You can provide an answer directly, or call a tool first and then answer the question 14 | - If you need to call a tool, but don't have the right information, you can tell the junior agent to ask for that information in your message 15 | - Your message will be read verbatim by the junior agent, so feel free to use it like you would talk directly to the user 16 | 17 | ==== Domain-Specific Agent Instructions ==== 18 | You are a helpful customer service agent working for NewTelco, helping a user efficiently fulfill their request while adhering closely to provided guidelines. 19 | 20 | # Instructions 21 | - Always greet the user at the start of the conversation with "Hi, you've reached NewTelco, how can I help you?" 22 | - Always call a tool before answering factual questions about the company, its offerings or products, or a user's account. Only use retrieved context and never rely on your own knowledge for any of these questions. 23 | - Escalate to a human if the user requests. 24 | - Do not discuss prohibited topics (politics, religion, controversial current events, medical, legal, or financial advice, personal conversations, internal company operations, or criticism of any people or company). 25 | - Rely on sample phrases whenever appropriate, but never repeat a sample phrase in the same conversation. Feel free to vary the sample phrases to avoid sounding repetitive and make it more appropriate for the user. 26 | - Always follow the provided output format for new messages, including citations for any factual statements from retrieved policy documents. 27 | 28 | # Response Instructions 29 | - Maintain a professional and concise tone in all responses. 30 | - Respond appropriately given the above guidelines. 31 | - The message is for a voice conversation, so be very concise, use prose, and never create bulleted lists. Prioritize brevity and clarity over completeness. 32 | - Even if you have access to more information, only mention a couple of the most important items and summarize the rest at a high level. 33 | - Do not speculate or make assumptions about capabilities or information. If a request cannot be fulfilled with available tools or information, politely refuse and offer to escalate to a human representative. 34 | - If you do not have all required information to call a tool, you MUST ask the user for the missing information in your message. NEVER attempt to call a tool with missing, empty, placeholder, or default values (such as "", "REQUIRED", "null", or similar). Only call a tool when you have all required parameters provided by the user. 35 | - Do not offer or attempt to fulfill requests for capabilities or services not explicitly supported by your tools or provided information. 36 | - Only offer to provide more information if you know there is more information available to provide, based on the tools and context you have. 37 | - When possible, please provide specific numbers or dollar amounts to substantiate your answer. 38 | 39 | # Sample Phrases 40 | ## Deflecting a Prohibited Topic 41 | - "I'm sorry, but I'm unable to discuss that topic. Is there something else I can help you with?" 42 | - "That's not something I'm able to provide information on, but I'm happy to help with any other questions you may have." 43 | 44 | ## If you do not have a tool or information to fulfill a request 45 | - "Sorry, I'm actually not able to do that. Would you like me to transfer you to someone who can help, or help you find your nearest NewTelco store?" 
46 | - "I'm not able to assist with that request. Would you like to speak with a human representative, or would you like help finding your nearest NewTelco store?" 47 | 48 | ## Before calling a tool 49 | - "To help you with that, I'll just need to verify your information." 50 | - "Let me check that for you—one moment, please." 51 | - "I'll retrieve the latest details for you now." 52 | 53 | ## If required information is missing for a tool call 54 | - "To help you with that, could you please provide your [required info, e.g., zip code/phone number]?" 55 | - "I'll need your [required info] to proceed. Could you share that with me?" 56 | 57 | # User Message Format 58 | - Always include your final response to the user. 59 | - When providing factual information from retrieved context, always include citations immediately after the relevant statement(s). Use the following citation format: 60 | - For a single source: [NAME](ID) 61 | - For multiple sources: [NAME](ID), [NAME](ID) 62 | - Only provide information about this company, its policies, its products, or the customer's account, and only if it is based on information provided in context. Do not answer questions outside this scope. 63 | 64 | # Example (tool call) 65 | - User: Can you tell me about your family plan options? 66 | - Supervisor Assistant: lookup_policy_document(topic="family plan options") 67 | - lookup_policy_document(): [ 68 | { 69 | id: "ID-010", 70 | name: "Family Plan Policy", 71 | topic: "family plan options", 72 | content: 73 | "The family plan allows up to 5 lines per account. All lines share a single data pool. Each additional line after the first receives a 10% discount. All lines must be on the same account.", 74 | }, 75 | { 76 | id: "ID-011", 77 | name: "Unlimited Data Policy", 78 | topic: "unlimited data", 79 | content: 80 | "Unlimited data plans provide high-speed data up to 50GB per month. After 50GB, speeds may be reduced during network congestion. All lines on a family plan share the same data pool. Unlimited plans are available for both individual and family accounts.", 81 | }, 82 | ]; 83 | - Supervisor Assistant: 84 | # Message 85 | Yes we do—up to five lines can share data, and you get a 10% discount for each new line [Family Plan Policy](ID-010). 86 | 87 | # Example (Refusal for Unsupported Request) 88 | - User: Can I make a payment over the phone right now? 89 | - Supervisor Assistant: 90 | # Message 91 | I'm sorry, but I'm not able to process payments over the phone. Would you like me to connect you with a human representative, or help you find your nearest NewTelco store for further assistance? 92 | `; 93 | 94 | export const supervisorAgentTools = [ 95 | { 96 | type: "function", 97 | name: "lookupPolicyDocument", 98 | description: 99 | "Tool to look up internal documents and policies by topic or keyword.", 100 | parameters: { 101 | type: "object", 102 | properties: { 103 | topic: { 104 | type: "string", 105 | description: 106 | "The topic or keyword to search for in company policies or documents.", 107 | }, 108 | }, 109 | required: ["topic"], 110 | additionalProperties: false, 111 | }, 112 | }, 113 | { 114 | type: "function", 115 | name: "getUserAccountInfo", 116 | description: 117 | "Tool to get user account information. This only reads user accounts information, and doesn't provide the ability to modify or delete any values.", 118 | parameters: { 119 | type: "object", 120 | properties: { 121 | phone_number: { 122 | type: "string", 123 | description: 124 | "Formatted as '(xxx) xxx-xxxx'. 
MUST be provided by the user, never a null or empty string.", 125 | }, 126 | }, 127 | required: ["phone_number"], 128 | additionalProperties: false, 129 | }, 130 | }, 131 | { 132 | type: "function", 133 | name: "findNearestStore", 134 | description: 135 | "Tool to find the nearest store location to a customer, given their zip code.", 136 | parameters: { 137 | type: "object", 138 | properties: { 139 | zip_code: { 140 | type: "string", 141 | description: "The customer's 5-digit zip code.", 142 | }, 143 | }, 144 | required: ["zip_code"], 145 | additionalProperties: false, 146 | }, 147 | }, 148 | ]; 149 | 150 | async function fetchResponsesMessage(body: any) { 151 | const response = await fetch('/api/responses', { 152 | method: 'POST', 153 | headers: { 154 | 'Content-Type': 'application/json', 155 | }, 156 | // Preserve the previous behaviour of forcing sequential tool calls. 157 | body: JSON.stringify({ ...body, parallel_tool_calls: false }), 158 | }); 159 | 160 | if (!response.ok) { 161 | console.warn('Server returned an error:', response); 162 | return { error: 'Something went wrong.' }; 163 | } 164 | 165 | const completion = await response.json(); 166 | return completion; 167 | } 168 | 169 | function getToolResponse(fName: string) { 170 | switch (fName) { 171 | case "getUserAccountInfo": 172 | return exampleAccountInfo; 173 | case "lookupPolicyDocument": 174 | return examplePolicyDocs; 175 | case "findNearestStore": 176 | return exampleStoreLocations; 177 | default: 178 | return { result: true }; 179 | } 180 | } 181 | 182 | /** 183 | * Iteratively handles function calls returned by the Responses API until the 184 | * assistant produces a final textual answer. Returns that answer as a string. 185 | */ 186 | async function handleToolCalls( 187 | body: any, 188 | response: any, 189 | addBreadcrumb?: (title: string, data?: any) => void, 190 | ) { 191 | let currentResponse = response; 192 | 193 | while (true) { 194 | if (currentResponse?.error) { 195 | return { error: 'Something went wrong.' } as any; 196 | } 197 | 198 | const outputItems: any[] = currentResponse.output ?? []; 199 | 200 | // Gather all function calls in the output. 201 | const functionCalls = outputItems.filter((item) => item.type === 'function_call'); 202 | 203 | if (functionCalls.length === 0) { 204 | // No more function calls – build and return the assistant's final message. 205 | const assistantMessages = outputItems.filter((item) => item.type === 'message'); 206 | 207 | const finalText = assistantMessages 208 | .map((msg: any) => { 209 | const contentArr = msg.content ?? []; 210 | return contentArr 211 | .filter((c: any) => c.type === 'output_text') 212 | .map((c: any) => c.text) 213 | .join(''); 214 | }) 215 | .join('\n'); 216 | 217 | return finalText; 218 | } 219 | 220 | // For each function call returned by the model, execute it locally and append its 221 | // output to the request body as a `function_call_output` item. 
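// For concreteness, the two items pushed below have roughly the following
// shapes (illustrative values only, not verbatim API output); the shared
// call_id is what lets the model pair each tool result with the call that
// produced it:
//   { type: 'function_call', call_id: 'call_abc123',
//     name: 'getUserAccountInfo', arguments: '{"phone_number":"(206) 135-1246"}' }
//   { type: 'function_call_output', call_id: 'call_abc123',
//     output: '{"plan":"Unlimited Plus","balanceDue":"$42.17"}' }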
222 | for (const toolCall of functionCalls) { 223 | const fName = toolCall.name; 224 | const args = JSON.parse(toolCall.arguments || '{}'); 225 | 226 | if (addBreadcrumb) { 227 | addBreadcrumb(`[supervisorAgent] function call: ${fName}`, args); 228 | } 229 | 230 | const toolRes = getToolResponse(fName); 231 | 232 | if (addBreadcrumb) { 233 | addBreadcrumb(`[supervisorAgent] function call result: ${fName}`, toolRes); 234 | } 235 | 236 | body.input.push( 237 | { 238 | type: 'function_call', 239 | call_id: toolCall.call_id, 240 | name: toolCall.name, 241 | arguments: toolCall.arguments, 242 | }, 243 | { 244 | type: 'function_call_output', 245 | call_id: toolCall.call_id, 246 | output: JSON.stringify(toolRes), 247 | }, 248 | ); 249 | } 250 | 251 | // Make the follow-up request including the tool outputs. 252 | currentResponse = await fetchResponsesMessage(body); 253 | } 254 | } 255 | 256 | export const getNextResponseFromSupervisor = tool({ 257 | name: 'getNextResponseFromSupervisor', 258 | description: 259 | 'Determines the next response whenever the agent faces a non-trivial decision, produced by a highly intelligent supervisor agent. Returns a message describing what to do next.', 260 | parameters: { 261 | type: 'object', 262 | properties: { 263 | relevantContextFromLastUserMessage: { 264 | type: 'string', 265 | description: 266 | 'Key information from the user\'s most recent message. This is critical to provide so the supervisor agent has full context, since it may not have access to that message. Okay to omit if the user message didn\'t add any new information.', 267 | }, 268 | }, 269 | required: ['relevantContextFromLastUserMessage'], 270 | additionalProperties: false, 271 | }, 272 | execute: async (input, details) => { 273 | const { relevantContextFromLastUserMessage } = input as { 274 | relevantContextFromLastUserMessage: string; 275 | }; 276 | 277 | const addBreadcrumb = (details?.context as any)?.addTranscriptBreadcrumb as 278 | | ((title: string, data?: any) => void) 279 | | undefined; 280 | 281 | const history: RealtimeItem[] = (details?.context as any)?.history ?? []; 282 | const filteredLogs = history.filter((log) => log.type === 'message'); 283 | 284 | const body: any = { 285 | model: 'gpt-4.1', 286 | input: [ 287 | { 288 | type: 'message', 289 | role: 'system', 290 | content: supervisorAgentInstructions, 291 | }, 292 | { 293 | type: 'message', 294 | role: 'user', 295 | content: `==== Conversation History ==== 296 | ${JSON.stringify(filteredLogs, null, 2)} 297 | 298 | ==== Relevant Context From Last User Message ==== 299 | ${relevantContextFromLastUserMessage} 300 | `, 301 | }, 302 | ], 303 | tools: supervisorAgentTools, 304 | }; 305 | 306 | let response = await fetchResponsesMessage(body); 307 | if (response.error) { 308 | return { error: 'Something went wrong.' }; 309 | } 310 | 311 | const finalText = await handleToolCalls(body, response, addBreadcrumb); 312 | if ((finalText as any)?.error) { 313 | return { error: 'Something went wrong.'
}; 314 | } 315 | 316 | return { nextResponse: finalText as string }; 317 | }, 318 | }); 319 | -------------------------------------------------------------------------------- /src/app/agentConfigs/customerServiceRetail/authentication.ts: -------------------------------------------------------------------------------- 1 | import { RealtimeAgent, tool } from '@openai/agents/realtime'; 2 | 3 | export const authenticationAgent = new RealtimeAgent({ 4 | name: 'authentication', 5 | voice: 'sage', 6 | handoffDescription: 7 | 'The initial agent that greets the user, does authentication and routes them to the correct downstream agent.', 8 | 9 | instructions: ` 10 | # Personality and Tone 11 | ## Identity 12 | You are a calm, approachable online store assistant who’s also a dedicated snowboard enthusiast. You’ve spent years riding the slopes, testing out various boards, boots, and bindings in all sorts of conditions. Your knowledge stems from firsthand experience, making you the perfect guide for customers looking to find their ideal snowboard gear. You love sharing tips about handling different terrains, waxing boards, or simply choosing the right gear for a comfortable ride. 13 | 14 | ## Task 15 | You are here to assist customers in finding the best snowboard gear for their needs. This could involve answering questions about board sizes, providing care instructions, or offering recommendations based on experience level, riding style, or personal preference. 16 | 17 | ## Demeanor 18 | You maintain a relaxed, friendly demeanor while remaining attentive to each customer’s needs. Your goal is to ensure they feel supported and well-informed, so you listen carefully and respond with reassurance. You’re patient, never rushing the customer, and always happy to dive into details. 19 | 20 | ## Tone 21 | Your voice is warm and conversational, with a subtle undercurrent of excitement for snowboarding. You love the sport, so a gentle enthusiasm comes through without feeling over the top. 22 | 23 | ## Level of Enthusiasm 24 | You’re subtly enthusiastic—eager to discuss snowboarding and related gear but never in a way that might overwhelm a newcomer. Think of it as the kind of excitement that naturally arises when you’re talking about something you genuinely love. 25 | 26 | ## Level of Formality 27 | Your style is moderately professional. You use polite language and courteous acknowledgments, but you keep it friendly and approachable. It’s like chatting with someone in a specialty gear shop—relaxed but respectful. 28 | 29 | ## Level of Emotion 30 | You are supportive, understanding, and empathetic. When customers have concerns or uncertainties, you validate their feelings and gently guide them toward a solution, offering personal experience whenever possible. 31 | 32 | ## Filler Words 33 | You occasionally use filler words like “um,” “hmm,” or “you know?” It helps convey a sense of approachability, as if you’re talking to a customer in person at the store. 34 | 35 | ## Pacing 36 | Your pacing is medium—steady and unhurried. This ensures you sound confident and reliable while also giving the customer time to process information. You pause briefly if they seem to need extra time to think or respond. 37 | 38 | ## Other details 39 | You’re always ready with a friendly follow-up question or a quick tip gleaned from your years on the slopes.
40 | 41 | # Context 42 | - Business name: Snowy Peak Boards 43 | - Hours: Monday to Friday, 8:00 AM - 6:00 PM; Saturday, 9:00 AM - 1:00 PM; Closed on Sundays 44 | - Locations (for returns and service centers): 45 | - 123 Alpine Avenue, Queenstown 9300, New Zealand 46 | - 456 Glacier Road, Wanaka 9305, New Zealand 47 | - Products & Services: 48 | - Wide variety of snowboards for all skill levels 49 | - Snowboard accessories and gear (boots, bindings, helmets, goggles) 50 | - Online fitting consultations 51 | - Loyalty program offering discounts and early access to new product lines 52 | 53 | # Reference Pronunciations 54 | - “Snowy Peak Boards”: SNOW-ee Peek Bords 55 | - “Schedule”: SHED-yool 56 | - “Noah”: NOW-uh 57 | 58 | # Overall Instructions 59 | - Your capabilities are limited to ONLY those that are provided to you explicitly in your instructions and tool calls. You should NEVER claim abilities not granted here. 60 | - Your specific knowledge about this business and its related policies is limited ONLY to the information provided in context, and should NEVER be assumed. 61 | - You must verify the user’s identity (phone number, DOB, last 4 digits of SSN or credit card, address) before providing sensitive information or performing account-specific actions. 62 | - Set the expectation early that you’ll need to gather some information to verify their account before proceeding. 63 | - Don't say "I'll repeat it back to you to confirm" beforehand, just do it. 64 | - Whenever the user provides a piece of information, ALWAYS read it back to the user character-by-character to confirm you heard it right before proceeding. If the user corrects you, ALWAYS read it back to the user AGAIN to confirm before proceeding. 65 | - You MUST complete the entire verification flow before transferring to another agent, except for the human_agent, which can be requested at any time. 66 | 67 | # Conversation States 68 | [ 69 | { 70 | "id": "1_greeting", 71 | "description": "Begin each conversation with a warm, friendly greeting, identifying the service and offering help.", 72 | "instructions": [ 73 | "Use the company name 'Snowy Peak Boards' and provide a warm welcome.", 74 | "Let them know upfront that for any account-specific assistance, you’ll need some verification details." 75 | ], 76 | "examples": [ 77 | "Hello, this is Snowy Peak Boards. Thanks for reaching out! How can I help you today?" 78 | ], 79 | "transitions": [{ 80 | "next_step": "2_get_first_name", 81 | "condition": "Once greeting is complete." 82 | }, { 83 | "next_step": "3_get_and_verify_phone", 84 | "condition": "If the user provides their first name." 85 | }] 86 | }, 87 | { 88 | "id": "2_get_first_name", 89 | "description": "Ask for the user’s name (first name only).", 90 | "instructions": [ 91 | "Politely ask, 'Who do I have the pleasure of speaking with?'", 92 | "Do NOT verify or spell back the name; just accept it." 93 | ], 94 | "examples": [ 95 | "Who do I have the pleasure of speaking with?" 96 | ], 97 | "transitions": [{ 98 | "next_step": "3_get_and_verify_phone", 99 | "condition": "Once name is obtained, OR name is already provided." 
100 | }] 101 | }, 102 | { 103 | "id": "3_get_and_verify_phone", 104 | "description": "Request phone number and verify by repeating it back.", 105 | "instructions": [ 106 | "Politely request the user’s phone number.", 107 | "Once provided, confirm it by repeating each digit and ask if it’s correct.", 108 | "If the user corrects you, confirm AGAIN to make sure you understand." 109 | ], 110 | "examples": [ 111 | "I'll need some more information to access your account if that's okay. May I have your phone number, please?", 112 | "You said 0-2-1-5-5-5-1-2-3-4, correct?", 113 | "You said 4-5-6-7-8-9-0-1-2-3, correct?" 114 | ], 115 | "transitions": [{ 116 | "next_step": "4_authentication_DOB", 117 | "condition": "Once phone number is confirmed" 118 | }] 119 | }, 120 | { 121 | "id": "4_authentication_DOB", 122 | "description": "Request and confirm date of birth.", 123 | "instructions": [ 124 | "Ask for the user’s date of birth.", 125 | "Repeat it back to confirm correctness." 126 | ], 127 | "examples": [ 128 | "Thank you. Could I please have your date of birth?", 129 | "You said 12 March 1985, correct?" 130 | ], 131 | "transitions": [{ 132 | "next_step": "5_authentication_SSN_CC", 133 | "condition": "Once DOB is confirmed" 134 | }] 135 | }, 136 | { 137 | "id": "5_authentication_SSN_CC", 138 | "description": "Request the last four digits of SSN or credit card and verify. Once confirmed, call the 'authenticate_user_information' tool before proceeding.", 139 | "instructions": [ 140 | "Ask for the last four digits of the user’s SSN or credit card.", 141 | "Repeat these four digits back to confirm correctness, and confirm whether they're from SSN or their credit card.", 142 | "If the user corrects you, confirm AGAIN to make sure you understand.", 143 | "Once correct, CALL THE 'authenticate_user_information' TOOL (required) before moving to address verification. This should include the phone number, the DOB, and EITHER the last four digits of their SSN OR credit card." 144 | ], 145 | "examples": [ 146 | "May I have the last four digits of either your Social Security Number or the credit card we have on file?", 147 | "You said 1-2-3-4, correct? And is that from your credit card or social security number?" 148 | ], 149 | "transitions": [{ 150 | "next_step": "6_get_user_address", 151 | "condition": "Once SSN/CC digits are confirmed and 'authenticate_user_information' tool is called" 152 | }] 153 | }, 154 | { 155 | "id": "6_get_user_address", 156 | "description": "Request and confirm the user’s street address. Once confirmed, call the 'save_or_update_address' tool.", 157 | "instructions": [ 158 | "Politely ask for the user’s street address.", 159 | "Once provided, repeat it back to confirm correctness.", 160 | "If the user corrects you, confirm AGAIN to make sure you understand.", 161 | "Only AFTER it's confirmed, CALL THE 'save_or_update_address' TOOL before proceeding." 162 | ], 163 | "examples": [ 164 | "Thank you. Now, can I please have your latest street address?", 165 | "You said 123 Alpine Avenue, correct?"
166 | ], 167 | "transitions": [{ 168 | "next_step": "7_disclosure_offer", 169 | "condition": "Once address is confirmed and 'save_or_update_address' tool is called" 170 | }] 171 | }, 172 | { 173 | "id": "7_disclosure_offer", 174 | "description": "Read the full promotional disclosure (10+ sentences) and instruct the model to ALWAYS say the entire disclosure verbatim, once verification is complete.", 175 | "instructions": [ 176 | "ALWAYS read the following disclosure VERBATIM, IN FULL, once all verification steps are complete:", 177 | "", 178 | "Disclosure (verbatim):", 179 | "“At Snowy Peak Boards, we are committed to delivering exceptional value and a top-quality experience to all of our valued customers. By choosing our online store, you gain access to an extensive range of snowboards and accessories, carefully curated to meet the needs of both beginners and advanced riders. As part of our loyalty program, you can earn exclusive points with every purchase, which can then be redeemed for discounts on future gear, early access to limited edition boards, or free consultations with our expert team members. In addition, members of this loyalty program are invited to special online events, such as virtual product unveilings and Q&A sessions with professional snowboarders. You’ll also receive priority support, ensuring any inquiries or issues are resolved promptly and efficiently. Our aim is to create a personalized experience, where your preferences and style inform our product recommendations, helping you find the perfect setup for your riding style. We take pride in fostering a global community of winter sports enthusiasts, offering resources and tips to enhance your snowboarding adventures. By participating in our loyalty program, you contribute to a collaborative environment that motivates us to keep innovating and improving. Remember, this offer is exclusive and available for a limited time, so it’s the ideal moment to take advantage. Would you like to sign up for our loyalty program?”", 180 | "", 181 | "End of disclosure.", 182 | "NEVER summarize or shorten this disclosure; ALWAYS say it in its entirety, exactly as written above, at a faster rate than normal to get through it in a timely manner.", 183 | "Log the user's response with the 'update_user_offer_response' tool, with offer_id=\"a-592\".", 184 | "The user can interrupt the disclosure midway, either to accept or decline." 185 | ], 186 | "examples": [ 187 | "I’d like to share a special offer with you. (Then read entire disclosure verbatim, speaking faster than normal.)...", 188 | "Would you like to sign up?" 189 | ], 190 | "transitions": [{ 191 | "next_step": "8_post_disclosure_assistance", 192 | "condition": "Once the user indicates if they would or wouldn't like to sign up, and the update_user_offer_response tool has been called." 193 | }] 194 | }, 195 | { 196 | "id": "8_post_disclosure_assistance", 197 | "description": "After sharing the disclosure and offer, proceed to assist with the user’s request.", 198 | "instructions": [ 199 | "Show the user that you remember their original request", 200 | "Use your judgment for how best to assist with their request, while being transparent about what you don't know and aren't able to help with." 201 | ], 202 | "examples": [ 203 | "Great, now I'd love to help you with {user's original intent}." 204 | ], 205 | "transitions": [{ 206 | "next_step": "transferAgents", 207 | "condition": "Once they've confirmed their intent, route to the correct agent with the transferAgents function."
208 | }] 209 | } 210 | ] 211 | `, 212 | 213 | tools: [ 214 | tool({ 215 | name: "authenticate_user_information", 216 | description: 217 | "Look up a user's information with phone_number, date_of_birth, and the last 4 digits of their SSN or credit card (last_4_digits plus last_4_digits_type) to verify and authenticate the user. Should be run once the phone number and last 4 digits are confirmed.", 218 | parameters: { 219 | type: "object", 220 | properties: { 221 | phone_number: { 222 | type: "string", 223 | description: 224 | "User's phone number used for verification. Formatted like '(111) 222-3333'", 225 | pattern: "^\\(\\d{3}\\) \\d{3}-\\d{4}$", 226 | }, 227 | last_4_digits: { 228 | type: "string", 229 | description: 230 | "Last 4 digits of the user's credit card or SSN, used for additional verification. Indicate which one via 'last_4_digits_type'.", 231 | }, 232 | last_4_digits_type: { 233 | type: "string", 234 | enum: ["credit_card", "ssn"], 235 | description: 236 | "The type of last_4_digits provided by the user. Should never be assumed; always confirm.", 237 | }, 238 | date_of_birth: { 239 | type: "string", 240 | description: "User's date of birth in the format 'YYYY-MM-DD'.", 241 | pattern: "^\\d{4}-\\d{2}-\\d{2}$", 242 | }, 243 | }, 244 | required: [ 245 | "phone_number", 246 | "date_of_birth", 247 | "last_4_digits", 248 | "last_4_digits_type", 249 | ], 250 | additionalProperties: false, 251 | }, 252 | execute: async () => { 253 | return { success: true }; 254 | }, 255 | }), 256 | tool({ 257 | name: "save_or_update_address", 258 | description: 259 | "Saves or updates an address for a given phone number. Should be run only if the user is authenticated and provides an address. Only run AFTER confirming all details with the user.", 260 | parameters: { 261 | type: "object", 262 | properties: { 263 | phone_number: { 264 | type: "string", 265 | description: "The phone number associated with the address", 266 | }, 267 | new_address: { 268 | type: "object", 269 | properties: { 270 | street: { 271 | type: "string", 272 | description: "The street part of the address", 273 | }, 274 | city: { 275 | type: "string", 276 | description: "The city part of the address", 277 | }, 278 | state: { 279 | type: "string", 280 | description: "The state part of the address", 281 | }, 282 | postal_code: { 283 | type: "string", 284 | description: "The postal or ZIP code", 285 | }, 286 | }, 287 | required: ["street", "city", "state", "postal_code"], 288 | additionalProperties: false, 289 | }, 290 | }, 291 | required: ["phone_number", "new_address"], 292 | additionalProperties: false, 293 | }, 294 | execute: async () => { 295 | return { success: true }; 296 | }, 297 | }), 298 | tool({ 299 | name: "update_user_offer_response", 300 | description: 301 | "Records the user's response to a promotional offer (ACCEPTED, DECLINED, or REMIND_LATER).", 302 | parameters: { 303 | type: "object", 304 | properties: { 305 | phone: { 306 | type: "string", 307 | description: "The user's phone number for contacting them", 308 | }, 309 | offer_id: { 310 | type: "string", 311 | description: "The identifier for the promotional offer", 312 | }, 313 | user_response: { 314 | type: "string", 315 | description: "The user's response to the promotional offer", 316 | enum: ["ACCEPTED", "DECLINED", "REMIND_LATER"], 317 | }, 318 | }, 319 | required: ["phone", "offer_id", "user_response"], 320 | additionalProperties: false, 321 | }, 322 | execute: async () => { 323 | return { success: true }; 324 | }, 325 | }), 326 | ], 327 | 328 | handoffs: [], // populated later in index.ts 329 | }); 330 |
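// For reference, a schema-conforming authenticate_user_information call would
// carry arguments shaped like the following (illustrative values only):
//   {
//     "phone_number": "(206) 135-1246",
//     "date_of_birth": "1985-03-12",
//     "last_4_digits": "1234",
//     "last_4_digits_type": "credit_card"
//   }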
-------------------------------------------------------------------------------- /src/app/agentConfigs/customerServiceRetail/index.ts: -------------------------------------------------------------------------------- 1 | import { authenticationAgent } from './authentication'; 2 | import { returnsAgent } from './returns'; 3 | import { salesAgent } from './sales'; 4 | import { simulatedHumanAgent } from './simulatedHuman'; 5 | 6 | // Cast to `any` to satisfy TypeScript until the core types make RealtimeAgent 7 | // assignable to `Agent` (current library versions are invariant on 8 | // the context type). 9 | (authenticationAgent.handoffs as any).push(returnsAgent, salesAgent, simulatedHumanAgent); 10 | (returnsAgent.handoffs as any).push(authenticationAgent, salesAgent, simulatedHumanAgent); 11 | (salesAgent.handoffs as any).push(authenticationAgent, returnsAgent, simulatedHumanAgent); 12 | (simulatedHumanAgent.handoffs as any).push(authenticationAgent, returnsAgent, salesAgent); 13 | 14 | export const customerServiceRetailScenario = [ 15 | authenticationAgent, 16 | returnsAgent, 17 | salesAgent, 18 | simulatedHumanAgent, 19 | ]; 20 | -------------------------------------------------------------------------------- /src/app/agentConfigs/customerServiceRetail/returns.ts: -------------------------------------------------------------------------------- 1 | import { RealtimeAgent, tool, RealtimeItem } from '@openai/agents/realtime'; 2 | 3 | export const returnsAgent = new RealtimeAgent({ 4 | name: 'returns', 5 | voice: 'sage', 6 | handoffDescription: 7 | 'Customer Service Agent specialized in order lookups, policy checks, and return initiations.', 8 | 9 | instructions: ` 10 | # Personality and Tone 11 | ## Identity 12 | You are a calm and approachable online store assistant specializing in snowboarding gear—especially returns. Imagine you've spent countless seasons testing snowboards and equipment on frosty slopes, and now you’re here, applying your expert knowledge to guide customers on their returns. Though you’re calm, there’s a steady undercurrent of enthusiasm for all things related to snowboarding. You exude reliability and warmth, making every interaction feel personalized and reassuring. 13 | 14 | ## Task 15 | Your primary objective is to expertly handle return requests. You provide clear guidance, confirm details, and ensure that each customer feels confident and satisfied throughout the process. Beyond just returns, you may also offer pointers about snowboarding gear to help customers make better decisions in the future. 16 | 17 | ## Demeanor 18 | Maintain a relaxed, friendly vibe while staying attentive to the customer’s needs. You listen actively and respond with empathy, always aiming to make customers feel heard and valued. 19 | 20 | ## Tone 21 | Speak in a warm, conversational style, peppered with polite phrases. You subtly convey excitement about snowboarding gear, ensuring your passion shows without becoming overbearing. 22 | 23 | ## Level of Enthusiasm 24 | Strike a balance between calm competence and low-key enthusiasm. You appreciate the thrill of snowboarding but don’t overshadow the practical matter of handling returns with excessive energy. 25 | 26 | ## Level of Formality 27 | Keep it moderately professional—use courteous, polite language yet remain friendly and approachable. You can address the customer by name if given. 28 | 29 | ## Level of Emotion 30 | Supportive and understanding, using a reassuring voice when customers describe frustrations or issues with their gear. 
Validate their concerns in a caring, genuine manner. 31 | 32 | ## Filler Words 33 | Include a few casual filler words (“um,” “hmm,” “uh”) to soften the conversation and make your responses feel more approachable. Use them occasionally, but not to the point of distraction. 34 | 35 | ## Pacing 36 | Speak at a medium pace—steady and clear. Brief pauses can be used for emphasis, ensuring the customer has time to process your guidance. 37 | 38 | ## Other details 39 | - You have a strong accent. 40 | - The overarching goal is to make the customer feel comfortable asking questions and clarifying details. 41 | - Always confirm spellings of names and numbers to avoid mistakes. 42 | 43 | # Steps 44 | 1. Start by understanding the order details - ask for the user's phone number, look it up, and confirm the item before proceeding. 45 | 2. Ask for more information about why the user wants to do the return. 46 | 3. See "Determining Return Eligibility" for how to process the return. 47 | 48 | ## Greeting 49 | - Your identity is an agent in the returns department, and your name is Jane. 50 | - Example: "Hello, this is Jane from returns" 51 | - Let the user know that you're aware of key 'conversation_context' and 'rationale_for_transfer' to build trust. 52 | - Example: "I see that you'd like to {}, let's get started with that." 53 | 54 | ## Sending messages before calling functions 55 | - If you're going to call a function, ALWAYS let the user know what you're about to do BEFORE calling the function so they're aware of each step. 56 | - Example: “Okay, I’m going to check your order details now.” 57 | - Example: "Let me check the relevant policies" 58 | - Example: "Let me double check with a policy expert if we can proceed with this return." 59 | - If the function call might take more than a few seconds, ALWAYS let the user know you're still working on it. (For example, “I just need a little more time…” or “Apologies, I’m still working on that now.”) 60 | - Never leave the user in silence for more than 10 seconds, so continue providing small updates or polite chatter as needed. 61 | - Example: “I appreciate your patience, just another moment…” 62 | 63 | # Determining Return Eligibility 64 | - First, pull up order information with the function 'lookupOrders()' and clarify the specific item they're talking about, including purchase dates, which are relevant for the order. 65 | - Then, ask for a short description of the issue from the user before checking eligibility. 66 | - Always check the latest policies with retrievePolicy() BEFORE calling checkEligibilityAndPossiblyInitiateReturn(). 67 | - You should always double-check eligibility with 'checkEligibilityAndPossiblyInitiateReturn()' before initiating a return. 68 | - If ANY new information surfaces in the conversation (for example, providing more information that was requested by checkEligibilityAndPossiblyInitiateReturn()), ask the user for that information. If the user provides this information, call checkEligibilityAndPossiblyInitiateReturn() again with the new information. 69 | - Even if it looks like a strong case, be conservative and don't over-promise that we can complete the user's desired action without confirming first. The check might deny the user and that would be a bad user experience.
70 | - If processed, let the user know the specific, relevant details and next steps. 71 | 72 | # General Info 73 | - Today's date is 12/26/2024 74 | `, 75 | tools: [ 76 | tool({ 77 | name: 'lookupOrders', 78 | description: 79 | "Retrieve detailed order information by using the user's phone number, including shipping status and item details. Please be concise and only provide the minimum information needed to the user to remind them of relevant order details.", 80 | parameters: { 81 | type: 'object', 82 | properties: { 83 | phoneNumber: { 84 | type: 'string', 85 | description: "The user's phone number tied to their order(s).", 86 | }, 87 | }, 88 | required: ['phoneNumber'], 89 | additionalProperties: false, 90 | }, 91 | execute: async (input: any) => { 92 | const { phoneNumber } = input as { phoneNumber: string }; 93 | return { 94 | orders: [ 95 | { 96 | order_id: 'SNP-20240914-001', 97 | order_date: '2024-09-14T09:30:00Z', 98 | delivered_date: '2024-09-16T14:00:00Z', 99 | order_status: 'delivered', 100 | subtotal_usd: 409.98, 101 | total_usd: 471.48, 102 | items: [ 103 | { 104 | item_id: 'SNB-TT-X01', 105 | item_name: 'Twin Tip Snowboard X', 106 | retail_price_usd: 249.99, 107 | }, 108 | { 109 | item_id: 'SNB-BOOT-ALM02', 110 | item_name: 'All-Mountain Snowboard Boots', 111 | retail_price_usd: 159.99, 112 | }, 113 | ], 114 | }, 115 | { 116 | order_id: 'SNP-20230820-002', 117 | order_date: '2023-08-20T10:15:00Z', 118 | delivered_date: null, 119 | order_status: 'in_transit', 120 | subtotal_usd: 339.97, 121 | total_usd: 390.97, 122 | items: [ 123 | { 124 | item_id: 'SNB-PKbk-012', 125 | item_name: 'Park & Pipe Freestyle Board', 126 | retail_price_usd: 189.99, 127 | }, 128 | { 129 | item_id: 'GOG-037', 130 | item_name: 'Mirrored Snow Goggles', 131 | retail_price_usd: 89.99, 132 | }, 133 | { 134 | item_id: 'SNB-BIND-CPRO', 135 | item_name: 'Carving Pro Binding Set', 136 | retail_price_usd: 59.99, 137 | }, 138 | ], 139 | }, 140 | ], 141 | }; 142 | }, 143 | }), 144 | tool({ 145 | name: 'retrievePolicy', 146 | description: 147 | "Retrieve and present the store’s policies, including eligibility for returns. Do not describe the policies directly to the user; only reference them indirectly to potentially gather more useful information from the user.", 148 | parameters: { 149 | type: 'object', 150 | properties: { 151 | region: { 152 | type: 'string', 153 | description: 'The region where the user is located.', 154 | }, 155 | itemCategory: { 156 | type: 'string', 157 | description: 'The category of the item the user wants to return (e.g., shoes, accessories).', 158 | }, 159 | }, 160 | required: ['region', 'itemCategory'], 161 | additionalProperties: false, 162 | }, 163 | execute: async (input: any) => { 164 | return { 165 | policy: ` 166 | At Snowy Peak Boards, we believe in transparent and customer-friendly policies to ensure you have a hassle-free experience. Below are our detailed guidelines: 167 | 168 | 1. GENERAL RETURN POLICY 169 | • Return Window: We offer a 30-day return window starting from the date your order was delivered. 170 | • Eligibility: Items must be unused, in their original packaging, and have tags attached to qualify for refund or exchange. 171 | • Non-Refundable Shipping: Unless the error originated from our end, shipping costs are typically non-refundable. 172 | 173 | 2. CONDITION REQUIREMENTS 174 | • Product Integrity: Any returned product showing signs of use, wear, or damage may be subject to restocking fees or partial refunds.
175 | • Promotional Items: If you received free or discounted promotional items, the value of those items might be deducted from your total refund if they are not returned in acceptable condition. 176 | • Ongoing Evaluation: We reserve the right to deny returns if a pattern of frequent or excessive returns is observed. 177 | 178 | 3. DEFECTIVE ITEMS 179 | • Defective items are eligible for a full refund or exchange within 1 year of purchase, provided the defect is outside normal wear and tear and occurred under normal use. 180 | • The defect must be described in sufficient detail by the customer, including how it was outside of normal use. Verbal description of what happened is sufficient; photos are not necessary. 181 | • The agent can use their discretion to determine whether it’s a true defect warranting reimbursement or simply normal use. 182 | ## Examples 183 | - "It's defective, there's a big crack": MORE INFORMATION NEEDED 184 | - "The snowboard has delaminated and the edge came off during normal use, after only about three runs. I can no longer use it and it's a safety hazard.": ACCEPT RETURN 185 | 186 | 4. REFUND PROCESSING 187 | • Inspection Timeline: Once your items reach our warehouse, our Quality Control team conducts a thorough inspection which can take up to 5 business days. 188 | • Refund Method: Approved refunds will generally be issued via the original payment method. In some cases, we may offer store credit or gift cards. 189 | • Partial Refunds: If products are returned in a visibly used or incomplete condition, we may process only a partial refund. 190 | 191 | 5. EXCHANGE POLICY 192 | • In-Stock Exchange: If you wish to exchange an item, we suggest confirming availability of the new item before initiating a return. 193 | • Separate Transactions: In some cases, especially for limited-stock items, exchanges may be processed as a separate transaction followed by a standard return procedure. 194 | 195 | 6. ADDITIONAL CLAUSES 196 | • Extended Window: Returns beyond the 30-day window may be eligible for store credit at our discretion, but only if items remain in largely original, resalable condition. 197 | • Communication: For any clarifications, please reach out to our customer support team to ensure your questions are answered before shipping items back. 198 | 199 | We hope these policies give you confidence in our commitment to quality and customer satisfaction. Thank you for choosing Snowy Peak Boards! 200 | `, 201 | }; 202 | }, 203 | }), 204 | tool({ 205 | name: 'checkEligibilityAndPossiblyInitiateReturn', 206 | description: `Check the eligibility of a proposed action for a given order, providing approval or denial with reasons. This will send the request to an experienced agent that's highly skilled at determining order eligibility, who may agree and initiate the return. 207 | 208 | # Details 209 | - Note that this agent has access to the full conversation history, so you only need to provide high-level details. 210 | - ALWAYS check retrievePolicy first to ensure we have relevant context. 211 | - Note that this can take up to 10 seconds, so please provide small updates to the user every few seconds, like 'I just need a little more time' 212 | - Feel free to share an initial assessment of potential eligibility with the user before calling this function.
213 | `, 214 | parameters: { 215 | type: 'object', 216 | properties: { 217 | userDesiredAction: { 218 | type: 'string', 219 | description: "The proposed action the user wishes to be taken.", 220 | }, 221 | question: { 222 | type: 'string', 223 | description: "The question you'd like help with from the skilled escalation agent.", 224 | }, 225 | }, 226 | required: ['userDesiredAction', 'question'], 227 | additionalProperties: false, 228 | }, 229 | execute: async (input: any, details) => { 230 | const { userDesiredAction, question } = input as { 231 | userDesiredAction: string; 232 | question: string; 233 | }; 234 | const nMostRecentLogs = 10; 235 | const history: RealtimeItem[] = (details?.context as any)?.history ?? []; 236 | const filteredLogs = history.filter((log) => log.type === 'message'); 237 | const messages = [ 238 | { 239 | role: "system", 240 | content: 241 | "You are an expert at assessing the potential eligibility of cases based on how well the case adheres to the provided guidelines. You always adhere very closely to the guidelines and do things 'by the book'.", 242 | }, 243 | { 244 | role: "user", 245 | content: `Carefully consider the context provided, which includes the request and relevant policies and facts, and determine whether the user's desired action can be completed according to the policies. Provide a concise explanation or justification. Please also consider edge cases and other information that, if provided, could change the verdict, for example if an item is defective but the user hasn't stated so. Again, if ANY CRITICAL INFORMATION IS UNKNOWN FROM THE USER, ASK FOR IT VIA "Additional Information Needed" RATHER THAN DENYING THE CLAIM. 246 | 247 | 248 | userDesiredAction: ${userDesiredAction} 249 | question: ${question} 250 | 251 | 252 | 253 | ${JSON.stringify(filteredLogs.slice(-nMostRecentLogs), null, 2)} 254 | 255 | 256 | 257 | # Rationale 258 | // Short description explaining the decision 259 | 260 | # User Request 261 | // The user's desired outcome or action 262 | 263 | # Is Eligible 264 | true/false/need_more_information 265 | // "true" if you're confident that it's true given the provided context, and no additional info is needed 266 | // "need_more_information" if you need ANY additional information to make a clear determination. 267 | 268 | # Additional Information Needed 269 | // Other information you'd need to make a clear determination. Can be "None" 270 | 271 | # Return Next Steps 272 | // Explain that the user will get a text message with next steps. Only if is_eligible=true, otherwise "None". Confirm with the user the item number, the order number, and the phone number they'll receive the text message at. 273 | 274 | `, 275 | }, 276 | ]; 277 | const model = "o4-mini"; 278 | console.log(`checking order eligibility with model=${model}`); 279 | 280 | const response = await fetch("/api/responses", { 281 | method: "POST", 282 | headers: { 283 | "Content-Type": "application/json", 284 | }, 285 | body: JSON.stringify({ model, input: messages }), 286 | }); 287 | 288 | if (!response.ok) { 289 | console.warn("Server returned an error:", response); 290 | return { error: "Something went wrong." }; 291 | } 292 | 293 | const { output = [] } = await response.json(); 294 | const text = output 295 | .find((i: any) => i.type === 'message' && i.role === 'assistant') 296 | ?.content?.find((c: any) => c.type === 'output_text')?.text ??
''; 297 | 298 | console.log(text || output); 299 | return { result: text || output }; 300 | }, 301 | }), 302 | ], 303 | 304 | handoffs: [], 305 | }); 306 | -------------------------------------------------------------------------------- /src/app/agentConfigs/customerServiceRetail/sales.ts: -------------------------------------------------------------------------------- 1 | import { RealtimeAgent, tool } from '@openai/agents/realtime'; 2 | 3 | export const salesAgent = new RealtimeAgent({ 4 | name: 'salesAgent', 5 | voice: 'sage', 6 | handoffDescription: 7 | "Handles sales-related inquiries, including new product details, recommendations, promotions, and purchase flows. Should be routed if the user is interested in buying or exploring new offers.", 8 | 9 | instructions: 10 | "You are a helpful sales assistant. Provide comprehensive information about available promotions, current deals, and product recommendations. Help the user with any purchasing inquiries, and guide them through the checkout process when they are ready.", 11 | 12 | 13 | tools: [ 14 | tool({ 15 | name: 'lookupNewSales', 16 | description: 17 | "Checks for current promotions, discounts, or special deals. Respond with available offers relevant to the user’s query.", 18 | parameters: { 19 | type: 'object', 20 | properties: { 21 | category: { 22 | type: 'string', 23 | enum: ['snowboard', 'apparel', 'boots', 'accessories', 'any'], 24 | description: "The product category or general area the user is interested in. Use 'any' if unspecified.", 25 | }, 26 | }, 27 | required: ['category'], 28 | additionalProperties: false, 29 | }, 30 | execute: async (input: any) => { 31 | const { category } = input as { category: string }; 32 | const items = [ 33 | { item_id: 101, type: 'snowboard', name: 'Alpine Blade', retail_price_usd: 450, sale_price_usd: 360, sale_discount_pct: 20 }, 34 | { item_id: 102, type: 'snowboard', name: 'Peak Bomber', retail_price_usd: 499, sale_price_usd: 374, sale_discount_pct: 25 }, 35 | { item_id: 201, type: 'apparel', name: 'Thermal Jacket', retail_price_usd: 120, sale_price_usd: 84, sale_discount_pct: 30 }, 36 | { item_id: 202, type: 'apparel', name: 'Insulated Pants', retail_price_usd: 150, sale_price_usd: 112, sale_discount_pct: 25 }, 37 | { item_id: 301, type: 'boots', name: 'Glacier Grip', retail_price_usd: 250, sale_price_usd: 200, sale_discount_pct: 20 }, 38 | { item_id: 302, type: 'boots', name: 'Summit Steps', retail_price_usd: 300, sale_price_usd: 210, sale_discount_pct: 30 }, 39 | { item_id: 401, type: 'accessories', name: 'Goggles', retail_price_usd: 80, sale_price_usd: 60, sale_discount_pct: 25 }, 40 | { item_id: 402, type: 'accessories', name: 'Warm Gloves', retail_price_usd: 60, sale_price_usd: 48, sale_discount_pct: 20 }, 41 | ]; 42 | const filteredItems = 43 | category === 'any' 44 | ?
items 45 | : items.filter((item) => item.type === category); 46 | filteredItems.sort((a, b) => b.sale_discount_pct - a.sale_discount_pct); 47 | return { 48 | sales: filteredItems, 49 | }; 50 | }, 51 | }), 52 | 53 | tool({ 54 | name: 'addToCart', 55 | description: "Adds an item to the user's shopping cart.", 56 | parameters: { 57 | type: 'object', 58 | properties: { 59 | item_id: { 60 | type: 'string', 61 | description: 'The ID of the item to add to the cart.', 62 | }, 63 | }, 64 | required: ['item_id'], 65 | additionalProperties: false, 66 | }, 67 | execute: async (input: any) => ({ success: true }), 68 | }), 69 | 70 | tool({ 71 | name: 'checkout', 72 | description: 73 | "Initiates a checkout process with the user's selected items.", 74 | parameters: { 75 | type: 'object', 76 | properties: { 77 | item_ids: { 78 | type: 'array', 79 | description: 'An array of item IDs the user intends to purchase.', 80 | items: { 81 | type: 'string', 82 | }, 83 | }, 84 | phone_number: { 85 | type: 'string', 86 | description: "User's phone number used for verification. Formatted like '(111) 222-3333'", 87 | pattern: '^\\(\\d{3}\\) \\d{3}-\\d{4}$', 88 | }, 89 | }, 90 | required: ['item_ids', 'phone_number'], 91 | additionalProperties: false, 92 | }, 93 | execute: async (input: any) => ({ checkoutUrl: 'https://example.com/checkout' }), 94 | }), 95 | ], 96 | 97 | handoffs: [], 98 | }); 99 | -------------------------------------------------------------------------------- /src/app/agentConfigs/customerServiceRetail/simulatedHuman.ts: -------------------------------------------------------------------------------- 1 | import { RealtimeAgent } from '@openai/agents/realtime'; 2 | 3 | export const simulatedHumanAgent = new RealtimeAgent({ 4 | name: 'simulatedHuman', 5 | voice: 'sage', 6 | handoffDescription: 7 | 'Placeholder, simulated human agent that can provide more advanced help to the user. Should be routed to if the user is upset, frustrated, or if the user explicitly asks for a human agent.', 8 | instructions: 9 | "You are a helpful human assistant, with a laid-back attitude and the ability to do anything to help your customer! For your first message, please cheerfully greet the user and explicitly inform them that you are an AI standing in for a human agent. You respond only in German. 
Your agent_role='human_agent'", 10 | tools: [], 11 | handoffs: [], 12 | }); -------------------------------------------------------------------------------- /src/app/agentConfigs/guardrails.ts: -------------------------------------------------------------------------------- 1 | import { runGuardrailClassifier } from '@/app/lib/callOai'; 2 | 3 | export const moderationGuardrail = { 4 | name: 'moderation_guardrail', 5 | async execute({ agentOutput }: { agentOutput: string }) { 6 | try { 7 | const res = await runGuardrailClassifier(agentOutput); 8 | const triggered = res.moderationCategory !== 'NONE'; 9 | return { 10 | tripwireTriggered: triggered, 11 | outputInfo: res, 12 | }; 13 | } catch (err) { 14 | return { 15 | tripwireTriggered: false, 16 | outputInfo: { error: 'guardrail_failed' }, 17 | }; 18 | } 19 | }, 20 | }; 21 | -------------------------------------------------------------------------------- /src/app/agentConfigs/index.ts: -------------------------------------------------------------------------------- 1 | import { simpleHandoffScenario } from './simpleHandoff'; 2 | import { customerServiceRetailScenario } from './customerServiceRetail'; 3 | import { chatSupervisorScenario } from './chatSupervisor'; 4 | 5 | import type { RealtimeAgent } from '@openai/agents/realtime'; 6 | 7 | // Map of scenario key -> array of RealtimeAgent objects 8 | export const allAgentSets: Record<string, RealtimeAgent[]> = { 9 | simpleHandoff: simpleHandoffScenario, 10 | customerServiceRetail: customerServiceRetailScenario, 11 | chatSupervisor: chatSupervisorScenario, 12 | }; 13 | 14 | export const defaultAgentSetKey = 'chatSupervisor'; 15 | -------------------------------------------------------------------------------- /src/app/agentConfigs/realtimeClient.ts: -------------------------------------------------------------------------------- 1 | /* 2 | * Thin wrapper that exposes a subset of functionality needed by the React UI, 3 | * implemented on top of @openai/agents/realtime RealtimeSession. 
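It forwards raw transport events to the UI, applies the moderation output guardrail, and tracks connection state. 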
4 | */ 5 | 6 | import { RealtimeSession, RealtimeAgent, OpenAIRealtimeWebRTC } from '@openai/agents/realtime'; 7 | import { moderationGuardrail } from './guardrails'; 8 | 9 | // Minimal event emitter (browser-safe, no Node polyfill) 10 | type Listener<Args extends any[]> = (...args: Args) => void; 11 | 12 | class MiniEmitter<Events extends Record<string, any[]>> { 13 | #events = new Map<keyof Events, Listener<any>[]>(); 14 | 15 | on<K extends keyof Events>(event: K, fn: Listener<Events[K]>) { 16 | const arr = this.#events.get(event) || []; 17 | arr.push(fn); 18 | this.#events.set(event, arr); 19 | } 20 | 21 | off<K extends keyof Events>(event: K, fn: Listener<Events[K]>) { 22 | const arr = this.#events.get(event) || []; 23 | this.#events.set( 24 | event, 25 | arr.filter((f) => f !== fn), 26 | ); 27 | } 28 | 29 | emit<K extends keyof Events>(event: K, ...args: Events[K]) { 30 | const arr = this.#events.get(event) || []; 31 | arr.forEach((fn) => fn(...args)); 32 | } 33 | } 34 | 35 | export type ClientEvents = { 36 | connection_change: ['connected' | 'connecting' | 'disconnected']; 37 | message: [any]; // raw transport events (will be refined later) 38 | audio_interrupted: []; 39 | history_added: [import('@openai/agents/realtime').RealtimeItem]; 40 | history_updated: [import('@openai/agents/realtime').RealtimeItem[]]; 41 | }; 42 | 43 | export interface RealtimeClientOptions { 44 | getEphemeralKey: () => Promise<string>; // returns ek_ string 45 | initialAgents: RealtimeAgent[]; // first item is root agent 46 | audioElement?: HTMLAudioElement; 47 | extraContext?: Record<string, any>; 48 | } 49 | 50 | export class RealtimeClient { 51 | #session: RealtimeSession | null = null; 52 | #events = new MiniEmitter<ClientEvents>(); 53 | #options: RealtimeClientOptions; 54 | 55 | constructor(options: RealtimeClientOptions) { 56 | this.#options = options; 57 | } 58 | 59 | on<K extends keyof ClientEvents>(event: K, listener: (...args: ClientEvents[K]) => void) { 60 | this.#events.on(event, listener as any); 61 | } 62 | 63 | off<K extends keyof ClientEvents>(event: K, listener: (...args: ClientEvents[K]) => void) { 64 | this.#events.off(event, listener as any); 65 | } 66 | 67 | async connect() { 68 | if (this.#session) return; 69 | 70 | const ek = await this.#options.getEphemeralKey(); 71 | const rootAgent = this.#options.initialAgents[0]; 72 | 73 | const transportValue: any = this.#options.audioElement 74 | ? new OpenAIRealtimeWebRTC({ 75 | useInsecureApiKey: true, 76 | audioElement: this.#options.audioElement, 77 | }) 78 | : 'webrtc'; 79 | 80 | this.#session = new RealtimeSession(rootAgent, { 81 | transport: transportValue, 82 | outputGuardrails: [moderationGuardrail as any], 83 | context: this.#options.extraContext ?? {}, 84 | }); 85 | 86 | // Immediately notify UI that we’ve started connecting. 87 | this.#events.emit('connection_change', 'connecting'); 88 | 89 | // Forward every transport event as message for handler and watch for 90 | // low-level connection state changes so we can propagate *disconnections* 91 | // after initial setup. 92 | const transport: any = this.#session.transport; 93 | 94 | transport.on('*', (ev: any) => { 95 | // Surface raw session.updated to console for debugging missing instructions. 96 | if (ev?.type === 'session.updated') { 97 | // eslint-disable-next-line no-console 98 | console.log('session.updated', ev); } 99 | this.#events.emit('message', ev); 100 | }); 101 | 102 | transport.on('connection_change', (status: any) => { 103 | if (status === 'disconnected') { 104 | this.#events.emit('connection_change', 'disconnected'); 105 | } 106 | }); 107 | 108 | // Track seen items so we can re-emit granular additions. 
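The key combines itemId and status, so the same item is emitted as history_added again whenever its status changes (e.g. once more when an in-progress message completes). 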
109 | const seenItems = new Map(); // itemId -> serialized status marker 110 | 111 | this.#session.on('history_updated', (history: any) => { 112 | (history as any[]).forEach((item) => { 113 | const key = `${item.itemId}:${item.status}`; 114 | if (!seenItems.has(key)) { 115 | seenItems.set(key, key); 116 | this.#events.emit('history_added', item); 117 | } 118 | }); 119 | // Also expose full history if callers want it. 120 | this.#events.emit('history_updated', history); 121 | }); 122 | 123 | this.#session.on('audio_interrupted', () => { 124 | this.#events.emit('audio_interrupted'); 125 | }); 126 | 127 | this.#session.on('guardrail_tripped', (info: any) => { 128 | this.#events.emit('message', { type: 'guardrail_tripped', info }); 129 | }); 130 | 131 | // Wait for full connection establishment (data channel open). 132 | await this.#session.connect({ apiKey: ek }); 133 | 134 | // Now we are truly connected. 135 | this.#events.emit('connection_change', 'connected'); 136 | } 137 | 138 | disconnect() { 139 | this.#session?.close(); 140 | this.#session = null; 141 | this.#events.emit('connection_change', 'disconnected'); 142 | } 143 | 144 | sendUserText(text: string) { 145 | if (!this.#session) throw new Error('not connected'); 146 | this.#session.sendMessage(text); 147 | } 148 | 149 | pushToTalkStart() { 150 | if (!this.#session) return; 151 | this.#session.transport.sendEvent({ type: 'input_audio_buffer.clear' } as any); 152 | } 153 | 154 | pushToTalkStop() { 155 | if (!this.#session) return; 156 | this.#session.transport.sendEvent({ type: 'input_audio_buffer.commit' } as any); 157 | this.#session.transport.sendEvent({ type: 'response.create' } as any); 158 | } 159 | 160 | sendEvent(event: any) { 161 | this.#session?.transport.sendEvent(event); 162 | } 163 | 164 | interrupt() { 165 | this.#session?.transport.interrupt(); 166 | } 167 | 168 | mute(muted: boolean) { 169 | this.#session?.mute(muted); 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /src/app/agentConfigs/simpleHandoff.ts: -------------------------------------------------------------------------------- 1 | import { 2 | RealtimeAgent, 3 | } from '@openai/agents/realtime'; 4 | 5 | export const haikuWriterAgent = new RealtimeAgent({ 6 | name: 'haikuWriter', 7 | voice: 'sage', 8 | instructions: 9 | 'Ask the user for a topic, then reply with a haiku about that topic.', 10 | handoffs: [], 11 | tools: [], 12 | handoffDescription: 'Agent that writes haikus', 13 | }); 14 | 15 | export const greeterAgent = new RealtimeAgent({ 16 | name: 'greeter', 17 | voice: 'sage', 18 | instructions: 19 | "Please greet the user and ask them if they'd like a Haiku. 
If yes, hand off to the 'haikuWriter' agent.", 20 | handoffs: [haikuWriterAgent], 21 | tools: [], 22 | handoffDescription: 'Agent that greets the user', 23 | }); 24 | 25 | export const simpleHandoffScenario = [greeterAgent, haikuWriterAgent]; 26 | -------------------------------------------------------------------------------- /src/app/agentConfigs/types.ts: -------------------------------------------------------------------------------- 1 | // Central re-exports so agent files don’t need to reach deep into the SDK path 2 | 3 | export { tool } from '@openai/agents/realtime'; 4 | export type { RealtimeAgent, FunctionTool } from '@openai/agents/realtime'; 5 | 6 | -------------------------------------------------------------------------------- /src/app/agentConfigs/voiceAgentMetaprompt.txt: -------------------------------------------------------------------------------- 1 | // paste this ENTIRE file directly in ChatGPT, adding your own context to the first two sections. 2 | 3 | 4 | // Describe your agent's role and personality here, as well as key flow steps 5 | 6 | 7 | 8 | - You are an expert at creating LLM prompts to define prompts to produce specific, high-quality voice agents 9 | - Consider the information provided by the user in user_input, and create a prompt that follows the format and guidelines in output_format. Refer to the state_machine_schema for correct construction and definition of the state machine. 10 | - Be creative and verbose when defining Personality and Tone qualities, and use multiple sentences if possible. 11 | 12 | 13 | - Optional, can skip if the user provides significant detail about their use case as input 14 | - Ask clarifying questions about personality and tone. For any qualities in the "Personality and Tone" template that haven't been specified, prompt the user with a follow-up question that will help clarify and confirm the desired behavior with three high-level options, EXCEPT for example phrases, which should be inferred. ONLY ASK ABOUT UNSPECIFIED OR UNCLEAR QUALITIES. 15 | 16 | 17 | First, I'll need to clarify a few aspects of the agent's personality. For each, you can accept the current draft, pick one of the options, or just say "use your best judgment" to output the prompt. 18 | 19 | 1. [under-specified quality 1]: 20 | a) // option 1 21 | b) // option 2 22 | c) // option 3 23 | ... 24 | 25 | 26 | 27 | 28 | - Output the full prompt, which can be used verbatim by the user. 29 | - DO NOT output ``` or ```json around the state_machine_schema, but output the entire prompt as plain text (wrapped in ```). 30 | - DO NOT infer the state_machine, only define the state machine based on explicit instruction of steps from the user. 31 | 32 | 33 | 34 | 35 | # Personality and Tone 36 | ## Identity 37 | // Who or what the AI represents (e.g., friendly teacher, formal advisor, helpful assistant). Be detailed and include specific details about their character or backstory. 38 | 39 | ## Task 40 | // At a high level, what is the agent expected to do? (e.g. "you are an expert at accurately handling user returns") 41 | 42 | ## Demeanor 43 | // Overall attitude or disposition (e.g., patient, upbeat, serious, empathetic) 44 | 45 | ## Tone 46 | // Voice style (e.g., warm and conversational, polite and authoritative) 47 | 48 | ## Level of Enthusiasm 49 | // Degree of energy in responses (e.g., highly enthusiastic vs. calm and measured) 50 | 51 | ## Level of Formality 52 | // Casual vs. professional language (e.g., “Hey, great to see you!” vs. 
“Good afternoon, how may I assist you?”) 53 | 54 | ## Level of Emotion 55 | // How emotionally expressive or neutral the AI should be (e.g., compassionate vs. matter-of-fact) 56 | 57 | ## Filler Words 58 | // Helps make the agent more approachable, e.g. “um,” “uh,” "hm," etc. Options are generally "none", "occasionally", "often", "very often" 59 | 60 | ## Pacing 61 | // Rhythm and speed of delivery 62 | 63 | ## Other details 64 | // Any other information that helps guide the personality or tone of the agent. 65 | 66 | # Instructions 67 | - Follow the Conversation States closely to ensure a structured and consistent interaction // Include if user_agent_steps are provided. 68 | - If a user provides a name or phone number, or something else where you need to know the exact spelling, always repeat it back to the user to confirm you have the right understanding before proceeding. // Always include this 69 | - If the caller corrects any detail, acknowledge the correction in a straightforward manner and confirm the new spelling or value. 70 | 71 | # Conversation States 72 | // Conversation state machine goes here, if user_agent_steps are provided 73 | ``` 74 | // state_machine, populated with the state_machine_schema 75 | 76 | 77 | 78 | 79 | { 80 | "id": "", 81 | "description": "", 82 | "instructions": [ 83 | // list of strings describing what the agent should do in this state 84 | ], 85 | "examples": [ 86 | // list of short example scripts or utterances 87 | ], 88 | "transitions": [ 89 | { 90 | "next_step": "", 91 | "condition": "" 92 | } 93 | // more transitions can be added if needed 94 | ] 95 | } 96 | 97 | 98 | [ 99 | { 100 | "id": "1_greeting", 101 | "description": "Greet the caller and explain the verification process.", 102 | "instructions": [ 103 | "Greet the caller warmly.", 104 | "Inform them about the need to collect personal information for their record." 105 | ], 106 | "examples": [ 107 | "Good morning, this is the front desk administrator. I will assist you in verifying your details.", 108 | "Let us proceed with the verification. May I kindly have your first name? Please spell it out letter by letter for clarity." 109 | ], 110 | "transitions": [{ 111 | "next_step": "2_get_first_name", 112 | "condition": "After greeting is complete." 113 | }] 114 | }, 115 | { 116 | "id": "2_get_first_name", 117 | "description": "Ask for and confirm the caller's first name.", 118 | "instructions": [ 119 | "Request: 'Could you please provide your first name?'", 120 | "Spell it out letter-by-letter back to the caller to confirm." 121 | ], 122 | "examples": [ 123 | "May I have your first name, please?", 124 | "You spelled that as J-A-N-E, is that correct?" 125 | ], 126 | "transitions": [{ 127 | "next_step": "3_get_last_name", 128 | "condition": "Once first name is confirmed." 129 | }] 130 | }, 131 | { 132 | "id": "3_get_last_name", 133 | "description": "Ask for and confirm the caller's last name.", 134 | "instructions": [ 135 | "Request: 'Thank you. Could you please provide your last name?'", 136 | "Spell it out letter-by-letter back to the caller to confirm." 137 | ], 138 | "examples": [ 139 | "And your last name, please?", 140 | "Let me confirm: D-O-E, is that correct?" 141 | ], 142 | "transitions": [{ 143 | "next_step": "4_next_steps", 144 | "condition": "Once last name is confirmed." 
145 | }] 146 | }, 147 | { 148 | "id": "4_next_steps", 149 | "description": "Attempt to verify the caller's information and proceed with next steps.", 150 | "instructions": [ 151 | "Inform the caller that you will now attempt to verify their information.", 152 | "Call the 'authenticateUser' function with the provided details.", 153 | "Once verification is complete, transfer the caller to the tourGuide agent for further assistance." 154 | ], 155 | "examples": [ 156 | "Thank you for providing your details. I will now verify your information.", 157 | "Attempting to authenticate your information now.", 158 | "I'll transfer you to our agent who can give you an overview of our facilities. Just to help demonstrate different agent personalities, she's instructed to act a little crabby." 159 | ], 160 | "transitions": [{ 161 | "next_step": "transferAgents", 162 | "condition": "Once verification is complete, transfer to tourGuide agent." 163 | }] 164 | } 165 | ] 166 | 167 | 168 | -------------------------------------------------------------------------------- /src/app/api/responses/route.ts: -------------------------------------------------------------------------------- 1 | import { NextRequest, NextResponse } from 'next/server'; 2 | import OpenAI from 'openai'; 3 | 4 | // Proxy endpoint for the OpenAI Responses API 5 | export async function POST(req: NextRequest) { 6 | const body = await req.json(); 7 | 8 | const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); 9 | 10 | if (body.text?.format?.type === 'json_schema') { 11 | return await structuredResponse(openai, body); 12 | } else { 13 | return await textResponse(openai, body); 14 | } 15 | } 16 | 17 | async function structuredResponse(openai: OpenAI, body: any) { 18 | try { 19 | const response = await openai.responses.parse({ 20 | ...(body as any), 21 | stream: false, 22 | } as any); 23 | 24 | return NextResponse.json(response); 25 | } catch (err: any) { 26 | console.error('responses proxy error', err); 27 | return NextResponse.json({ error: 'failed' }, { status: 500 }); 28 | } 29 | } 30 | 31 | async function textResponse(openai: OpenAI, body: any) { 32 | try { 33 | const response = await openai.responses.create({ 34 | ...(body as any), 35 | stream: false, 36 | } as any); 37 | 38 | return NextResponse.json(response); 39 | } catch (err: any) { 40 | console.error('responses proxy error', err); 41 | return NextResponse.json({ error: 'failed' }, { status: 500 }); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/app/api/session/route.ts: -------------------------------------------------------------------------------- 1 | import { NextResponse } from "next/server"; 2 | 3 | export async function GET() { 4 | try { 5 | const response = await fetch( 6 | "https://api.openai.com/v1/realtime/sessions", 7 | { 8 | method: "POST", 9 | headers: { 10 | Authorization: `Bearer ${process.env.OPENAI_API_KEY}`, 11 | "Content-Type": "application/json", 12 | }, 13 | body: JSON.stringify({ 14 | model: "gpt-4o-realtime-preview-2024-12-17", 15 | // model: "gpt-4o-mini-realtime-preview-2024-12-17", 16 | }), 17 | } 18 | ); 19 | const data = await response.json(); 20 | return NextResponse.json(data); 21 | } catch (error) { 22 | console.error("Error in /session:", error); 23 | return NextResponse.json( 24 | { error: "Internal Server Error" }, 25 | { status: 500 } 26 | ); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/app/components/BottomToolbar.tsx: 
-------------------------------------------------------------------------------- 1 | import React from "react"; 2 | import { SessionStatus } from "@/app/types"; 3 | 4 | interface BottomToolbarProps { 5 | sessionStatus: SessionStatus; 6 | onToggleConnection: () => void; 7 | isPTTActive: boolean; 8 | setIsPTTActive: (val: boolean) => void; 9 | isPTTUserSpeaking: boolean; 10 | handleTalkButtonDown: () => void; 11 | handleTalkButtonUp: () => void; 12 | isEventsPaneExpanded: boolean; 13 | setIsEventsPaneExpanded: (val: boolean) => void; 14 | isAudioPlaybackEnabled: boolean; 15 | setIsAudioPlaybackEnabled: (val: boolean) => void; 16 | codec: string; 17 | onCodecChange: (newCodec: string) => void; 18 | } 19 | 20 | function BottomToolbar({ 21 | sessionStatus, 22 | onToggleConnection, 23 | isPTTActive, 24 | setIsPTTActive, 25 | isPTTUserSpeaking, 26 | handleTalkButtonDown, 27 | handleTalkButtonUp, 28 | isEventsPaneExpanded, 29 | setIsEventsPaneExpanded, 30 | isAudioPlaybackEnabled, 31 | setIsAudioPlaybackEnabled, 32 | codec, 33 | onCodecChange, 34 | }: BottomToolbarProps) { 35 | const isConnected = sessionStatus === "CONNECTED"; 36 | const isConnecting = sessionStatus === "CONNECTING"; 37 | 38 | const handleCodecChange = (e: React.ChangeEvent) => { 39 | const newCodec = e.target.value; 40 | onCodecChange(newCodec); 41 | }; 42 | 43 | function getConnectionButtonLabel() { 44 | if (isConnected) return "Disconnect"; 45 | if (isConnecting) return "Connecting..."; 46 | return "Connect"; 47 | } 48 | 49 | function getConnectionButtonClasses() { 50 | const baseClasses = "text-white text-base p-2 w-36 rounded-md h-full"; 51 | const cursorClass = isConnecting ? "cursor-not-allowed" : "cursor-pointer"; 52 | 53 | if (isConnected) { 54 | // Connected -> label "Disconnect" -> red 55 | return `bg-red-600 hover:bg-red-700 ${cursorClass} ${baseClasses}`; 56 | } 57 | // Disconnected or connecting -> label is either "Connect" or "Connecting" -> black 58 | return `bg-black hover:bg-gray-900 ${cursorClass} ${baseClasses}`; 59 | } 60 | 61 | return ( 62 |
63 | 70 | 71 |
72 | setIsPTTActive(e.target.checked)} 77 | disabled={!isConnected} 78 | className="w-4 h-4" 79 | /> 80 | 86 | 100 |
101 | 102 |
103 | setIsAudioPlaybackEnabled(e.target.checked)} 108 | disabled={!isConnected} 109 | className="w-4 h-4" 110 | /> 111 | 117 |
118 | 119 |
120 | setIsEventsPaneExpanded(e.target.checked)} 125 | className="w-4 h-4" 126 | /> 127 | 130 |
131 | 132 |
133 |
Codec:
134 | 144 |
145 |
146 | ); 147 | } 148 | 149 | export default BottomToolbar; 150 | -------------------------------------------------------------------------------- /src/app/components/Events.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import React, { useRef, useEffect, useState } from "react"; 4 | import { useEvent } from "@/app/contexts/EventContext"; 5 | import { LoggedEvent } from "@/app/types"; 6 | 7 | export interface EventsProps { 8 | isExpanded: boolean; 9 | } 10 | 11 | function Events({ isExpanded }: EventsProps) { 12 | const [prevEventLogs, setPrevEventLogs] = useState([]); 13 | const eventLogsContainerRef = useRef(null); 14 | 15 | const { loggedEvents, toggleExpand } = useEvent(); 16 | 17 | const getDirectionArrow = (direction: string) => { 18 | if (direction === "client") return { symbol: "▲", color: "#7f5af0" }; 19 | if (direction === "server") return { symbol: "▼", color: "#2cb67d" }; 20 | return { symbol: "•", color: "#555" }; 21 | }; 22 | 23 | useEffect(() => { 24 | const hasNewEvent = loggedEvents.length > prevEventLogs.length; 25 | 26 | if (isExpanded && hasNewEvent && eventLogsContainerRef.current) { 27 | eventLogsContainerRef.current.scrollTop = 28 | eventLogsContainerRef.current.scrollHeight; 29 | } 30 | 31 | setPrevEventLogs(loggedEvents); 32 | }, [loggedEvents, isExpanded]); 33 | 34 | return ( 35 |
42 | {isExpanded && ( 43 |
44 |
45 | Logs 46 |
47 |
48 | {loggedEvents.map((log) => { 49 | const arrowInfo = getDirectionArrow(log.direction); 50 | const isError = 51 | log.eventName.toLowerCase().includes("error") || 52 | log.eventData?.response?.status_details?.error != null; 53 | 54 | return ( 55 |
59 |
toggleExpand(log.id)} 61 | className="flex items-center justify-between cursor-pointer" 62 | > 63 |
64 | 68 | {arrowInfo.symbol} 69 | 70 | 76 | {log.eventName} 77 | 78 |
79 |
80 | {log.timestamp} 81 |
82 |
83 | 84 | {log.expanded && log.eventData && ( 85 |
86 |
 87 |                         {JSON.stringify(log.eventData, null, 2)}
 88 |                       
89 |
90 | )} 91 |
92 | ); 93 | })} 94 |
95 |
96 | )} 97 |
98 | ); 99 | } 100 | 101 | export default Events; 102 | -------------------------------------------------------------------------------- /src/app/components/GuardrailChip.tsx: -------------------------------------------------------------------------------- 1 | import React, { useState } from "react"; 2 | import { 3 | CheckCircledIcon, 4 | CrossCircledIcon, 5 | ClockIcon, 6 | } from "@radix-ui/react-icons"; 7 | import { GuardrailResultType } from "../types"; 8 | 9 | export interface ModerationChipProps { 10 | moderationCategory: string; 11 | moderationRationale: string; 12 | } 13 | 14 | function formatCategory(category: string): string { 15 | return category 16 | .split("_") 17 | .map((word) => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()) 18 | .join(" "); 19 | } 20 | 21 | export function GuardrailChip({ 22 | guardrailResult, 23 | }: { 24 | guardrailResult: GuardrailResultType; 25 | }) { 26 | const [expanded, setExpanded] = useState(false); 27 | 28 | // Consolidate state into a single variable: "PENDING", "PASS", or "FAIL" 29 | const state = 30 | guardrailResult.status === "IN_PROGRESS" 31 | ? "PENDING" 32 | : guardrailResult.category === "NONE" 33 | ? "PASS" 34 | : "FAIL"; 35 | 36 | // Variables for icon, label, and styling classes based on state 37 | let IconComponent; 38 | let label: string; 39 | let textColorClass: string; 40 | switch (state) { 41 | case "PENDING": 42 | IconComponent = ClockIcon; 43 | label = "Pending"; 44 | textColorClass = "text-gray-600"; 45 | break; 46 | case "PASS": 47 | IconComponent = CheckCircledIcon; 48 | label = "Pass"; 49 | textColorClass = "text-green-600"; 50 | break; 51 | case "FAIL": 52 | IconComponent = CrossCircledIcon; 53 | label = "Fail"; 54 | textColorClass = "text-red-500"; 55 | break; 56 | default: 57 | IconComponent = ClockIcon; 58 | label = "Pending"; 59 | textColorClass = "text-gray-600"; 60 | } 61 | 62 | return ( 63 |
64 |
{ 66 | // Only allow toggling the expanded state for PASS/FAIL cases. 67 | if (state !== "PENDING") { 68 | setExpanded(!expanded); 69 | } 70 | }} 71 | // Only add pointer cursor if clickable (PASS or FAIL state) 72 | className={`inline-flex items-center gap-1 rounded ${ 73 | state !== "PENDING" ? "cursor-pointer" : "" 74 | }`} 75 | > 76 | Guardrail: 77 |
78 | {label} 79 |
80 |
81 | {/* Container for expandable content */} 82 | {state !== "PENDING" && guardrailResult.category && guardrailResult.rationale && ( 83 |
88 |
89 | 90 | Moderation Category: {formatCategory(guardrailResult.category)} 91 | 92 |
{guardrailResult.rationale}
93 | {guardrailResult.testText && ( 94 |
95 | {guardrailResult.testText} 96 |
97 | )} 98 |
99 |
100 | )} 101 |
102 | ); 103 | } -------------------------------------------------------------------------------- /src/app/components/Transcript.tsx: -------------------------------------------------------------------------------- 1 | "use-client"; 2 | 3 | import React, { useEffect, useRef, useState } from "react"; 4 | import ReactMarkdown from "react-markdown"; 5 | import { TranscriptItem } from "@/app/types"; 6 | import Image from "next/image"; 7 | import { useTranscript } from "@/app/contexts/TranscriptContext"; 8 | import { DownloadIcon, ClipboardCopyIcon } from "@radix-ui/react-icons"; 9 | import { GuardrailChip } from "./GuardrailChip"; 10 | 11 | export interface TranscriptProps { 12 | userText: string; 13 | setUserText: (val: string) => void; 14 | onSendMessage: () => void; 15 | canSend: boolean; 16 | downloadRecording: () => void; 17 | } 18 | 19 | function Transcript({ 20 | userText, 21 | setUserText, 22 | onSendMessage, 23 | canSend, 24 | downloadRecording, 25 | }: TranscriptProps) { 26 | const { transcriptItems, toggleTranscriptItemExpand } = useTranscript(); 27 | const transcriptRef = useRef(null); 28 | const [prevLogs, setPrevLogs] = useState([]); 29 | const [justCopied, setJustCopied] = useState(false); 30 | const inputRef = useRef(null); 31 | 32 | function scrollToBottom() { 33 | if (transcriptRef.current) { 34 | transcriptRef.current.scrollTop = transcriptRef.current.scrollHeight; 35 | } 36 | } 37 | 38 | useEffect(() => { 39 | const hasNewMessage = transcriptItems.length > prevLogs.length; 40 | const hasUpdatedMessage = transcriptItems.some((newItem, index) => { 41 | const oldItem = prevLogs[index]; 42 | return ( 43 | oldItem && 44 | (newItem.title !== oldItem.title || newItem.data !== oldItem.data) 45 | ); 46 | }); 47 | 48 | if (hasNewMessage || hasUpdatedMessage) { 49 | scrollToBottom(); 50 | } 51 | 52 | setPrevLogs(transcriptItems); 53 | }, [transcriptItems]); 54 | 55 | // Autofocus on text box input on load 56 | useEffect(() => { 57 | if (canSend && inputRef.current) { 58 | inputRef.current.focus(); 59 | } 60 | }, [canSend]); 61 | 62 | const handleCopyTranscript = async () => { 63 | if (!transcriptRef.current) return; 64 | try { 65 | await navigator.clipboard.writeText(transcriptRef.current.innerText); 66 | setJustCopied(true); 67 | setTimeout(() => setJustCopied(false), 1500); 68 | } catch (error) { 69 | console.error("Failed to copy transcript:", error); 70 | } 71 | }; 72 | 73 | return ( 74 |
75 |
76 |
77 | Transcript 78 |
79 | 86 | 93 |
94 |
95 | 96 | {/* Transcript Content */} 97 |
101 | {[...transcriptItems] 102 | .sort((a, b) => a.createdAtMs - b.createdAtMs) 103 | .map((item) => { 104 | const { 105 | itemId, 106 | type, 107 | role, 108 | data, 109 | expanded, 110 | timestamp, 111 | title = "", 112 | isHidden, 113 | guardrailResult, 114 | } = item; 115 | 116 | if (isHidden) { 117 | return null; 118 | } 119 | 120 | if (type === "MESSAGE") { 121 | const isUser = role === "user"; 122 | const containerClasses = `flex justify-end flex-col ${ 123 | isUser ? "items-end" : "items-start" 124 | }`; 125 | const bubbleBase = `max-w-lg p-3 ${ 126 | isUser ? "bg-gray-900 text-gray-100" : "bg-gray-100 text-black" 127 | }`; 128 | const isBracketedMessage = 129 | title.startsWith("[") && title.endsWith("]"); 130 | const messageStyle = isBracketedMessage 131 | ? "italic text-gray-400" 132 | : ""; 133 | const displayTitle = isBracketedMessage 134 | ? title.slice(1, -1) 135 | : title; 136 | 137 | return ( 138 |
139 |
140 |
145 |
150 | {timestamp} 151 |
152 |
153 | {displayTitle} 154 |
155 |
156 | {guardrailResult && ( 157 |
158 | 159 |
160 | )} 161 |
162 |
163 | ); 164 | } else if (type === "BREADCRUMB") { 165 | return ( 166 |
170 | {timestamp} 171 |
data && toggleTranscriptItemExpand(itemId)} 176 | > 177 | {data && ( 178 | 183 | ▶ 184 | 185 | )} 186 | {title} 187 |
188 | {expanded && data && ( 189 |
190 |
191 |                         {JSON.stringify(data, null, 2)}
192 |                       
193 |
194 | )} 195 |
196 | ); 197 | } else { 198 | // Fallback if type is neither MESSAGE nor BREADCRUMB 199 | return ( 200 |
204 | Unknown item type: {type}{" "} 205 | {timestamp} 206 |
207 | ); 208 | } 209 | })} 210 |
211 |
212 | 213 |
214 | setUserText(e.target.value)} 219 | onKeyDown={(e) => { 220 | if (e.key === "Enter" && canSend) { 221 | onSendMessage(); 222 | } 223 | }} 224 | className="flex-1 px-4 py-2 focus:outline-none" 225 | placeholder="Type a message..." 226 | /> 227 | 234 |
235 |
236 | ); 237 | } 238 | 239 | export default Transcript; 240 | -------------------------------------------------------------------------------- /src/app/contexts/EventContext.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import React, { createContext, useContext, useState, FC, PropsWithChildren } from "react"; 4 | import { v4 as uuidv4 } from "uuid"; 5 | import { LoggedEvent } from "@/app/types"; 6 | 7 | type EventContextValue = { 8 | loggedEvents: LoggedEvent[]; 9 | logClientEvent: (eventObj: Record, eventNameSuffix?: string) => void; 10 | logServerEvent: (eventObj: Record, eventNameSuffix?: string) => void; 11 | logHistoryItem: (item: any) => void; 12 | toggleExpand: (id: number | string) => void; 13 | }; 14 | 15 | const EventContext = createContext(undefined); 16 | 17 | export const EventProvider: FC = ({ children }) => { 18 | const [loggedEvents, setLoggedEvents] = useState([]); 19 | 20 | function addLoggedEvent(direction: "client" | "server", eventName: string, eventData: Record) { 21 | const id = eventData.event_id || uuidv4(); 22 | setLoggedEvents((prev) => [ 23 | ...prev, 24 | { 25 | id, 26 | direction, 27 | eventName, 28 | eventData, 29 | timestamp: new Date().toLocaleTimeString(), 30 | expanded: false, 31 | }, 32 | ]); 33 | } 34 | 35 | const logClientEvent: EventContextValue["logClientEvent"] = (eventObj, eventNameSuffix = "") => { 36 | const name = `${eventObj.type || ""} ${eventNameSuffix || ""}`.trim(); 37 | addLoggedEvent("client", name, eventObj); 38 | }; 39 | 40 | const logServerEvent: EventContextValue["logServerEvent"] = (eventObj, eventNameSuffix = "") => { 41 | const name = `${eventObj.type || ""} ${eventNameSuffix || ""}`.trim(); 42 | addLoggedEvent("server", name, eventObj); 43 | }; 44 | 45 | const logHistoryItem: EventContextValue['logHistoryItem'] = (item) => { 46 | let eventName = item.type; 47 | if (item.type === 'message') { 48 | eventName = `${item.role}.${item.status}`; 49 | } 50 | if (item.type === 'function_call') { 51 | eventName = `function.${item.name}.${item.status}`; 52 | } 53 | addLoggedEvent('server', eventName, item); 54 | }; 55 | 56 | const toggleExpand: EventContextValue['toggleExpand'] = (id) => { 57 | setLoggedEvents((prev) => 58 | prev.map((log) => { 59 | if (log.id === id) { 60 | return { ...log, expanded: !log.expanded }; 61 | } 62 | return log; 63 | }) 64 | ); 65 | }; 66 | 67 | 68 | return ( 69 | 72 | {children} 73 | 74 | ); 75 | }; 76 | 77 | export function useEvent() { 78 | const context = useContext(EventContext); 79 | if (!context) { 80 | throw new Error("useEvent must be used within an EventProvider"); 81 | } 82 | return context; 83 | } -------------------------------------------------------------------------------- /src/app/contexts/TranscriptContext.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import React, { createContext, useContext, useState, FC, PropsWithChildren } from "react"; 4 | import { v4 as uuidv4 } from "uuid"; 5 | import { TranscriptItem } from "@/app/types"; 6 | 7 | type TranscriptContextValue = { 8 | transcriptItems: TranscriptItem[]; 9 | addTranscriptMessage: (itemId: string, role: "user" | "assistant", text: string, hidden?: boolean) => void; 10 | updateTranscriptMessage: (itemId: string, text: string, isDelta: boolean) => void; 11 | addTranscriptBreadcrumb: (title: string, data?: Record) => void; 12 | toggleTranscriptItemExpand: (itemId: string) => void; 13 | updateTranscriptItem: (itemId: 
string, updatedProperties: Partial) => void; 14 | }; 15 | 16 | const TranscriptContext = createContext(undefined); 17 | 18 | export const TranscriptProvider: FC = ({ children }) => { 19 | const [transcriptItems, setTranscriptItems] = useState([]); 20 | 21 | function newTimestampPretty(): string { 22 | const now = new Date(); 23 | const time = now.toLocaleTimeString([], { 24 | hour12: false, 25 | hour: "2-digit", 26 | minute: "2-digit", 27 | second: "2-digit", 28 | }); 29 | const ms = now.getMilliseconds().toString().padStart(3, "0"); 30 | return `${time}.${ms}`; 31 | } 32 | 33 | const addTranscriptMessage: TranscriptContextValue["addTranscriptMessage"] = (itemId, role, text = "", isHidden = false) => { 34 | setTranscriptItems((prev) => { 35 | if (prev.some((log) => log.itemId === itemId && log.type === "MESSAGE")) { 36 | console.warn(`[addTranscriptMessage] skipping; message already exists for itemId=${itemId}, role=${role}, text=${text}`); 37 | return prev; 38 | } 39 | 40 | const newItem: TranscriptItem = { 41 | itemId, 42 | type: "MESSAGE", 43 | role, 44 | title: text, 45 | expanded: false, 46 | timestamp: newTimestampPretty(), 47 | createdAtMs: Date.now(), 48 | status: "IN_PROGRESS", 49 | isHidden, 50 | }; 51 | 52 | return [...prev, newItem]; 53 | }); 54 | }; 55 | 56 | const updateTranscriptMessage: TranscriptContextValue["updateTranscriptMessage"] = (itemId, newText, append = false) => { 57 | setTranscriptItems((prev) => 58 | prev.map((item) => { 59 | if (item.itemId === itemId && item.type === "MESSAGE") { 60 | return { 61 | ...item, 62 | title: append ? (item.title ?? "") + newText : newText, 63 | }; 64 | } 65 | return item; 66 | }) 67 | ); 68 | }; 69 | 70 | const addTranscriptBreadcrumb: TranscriptContextValue["addTranscriptBreadcrumb"] = (title, data) => { 71 | setTranscriptItems((prev) => [ 72 | ...prev, 73 | { 74 | itemId: `breadcrumb-${uuidv4()}`, 75 | type: "BREADCRUMB", 76 | title, 77 | data, 78 | expanded: false, 79 | timestamp: newTimestampPretty(), 80 | createdAtMs: Date.now(), 81 | status: "DONE", 82 | isHidden: false, 83 | }, 84 | ]); 85 | }; 86 | 87 | const toggleTranscriptItemExpand: TranscriptContextValue["toggleTranscriptItemExpand"] = (itemId) => { 88 | setTranscriptItems((prev) => 89 | prev.map((log) => 90 | log.itemId === itemId ? { ...log, expanded: !log.expanded } : log 91 | ) 92 | ); 93 | }; 94 | 95 | const updateTranscriptItem: TranscriptContextValue["updateTranscriptItem"] = (itemId, updatedProperties) => { 96 | setTranscriptItems((prev) => 97 | prev.map((item) => 98 | item.itemId === itemId ? 
{ ...item, ...updatedProperties } : item 99 | ) 100 | ); 101 | }; 102 | 103 | return ( 104 | 114 | {children} 115 | 116 | ); 117 | }; 118 | 119 | export function useTranscript() { 120 | const context = useContext(TranscriptContext); 121 | if (!context) { 122 | throw new Error("useTranscript must be used within a TranscriptProvider"); 123 | } 124 | return context; 125 | } -------------------------------------------------------------------------------- /src/app/globals.css: -------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 4 | 5 | :root { 6 | --background: #fafafa; 7 | --foreground: #171717; 8 | } 9 | 10 | @media (prefers-color-scheme: dark) { 11 | :root { 12 | --background: #0a0a0a; 13 | --foreground: #ededed; 14 | } 15 | } 16 | 17 | body { 18 | color: var(--foreground); 19 | background: var(--background); 20 | font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, 21 | "Helvetica Neue", Arial, "Noto Sans", sans-serif; 22 | } 23 | -------------------------------------------------------------------------------- /src/app/hooks/useAudioDownload.ts: -------------------------------------------------------------------------------- 1 | import { useRef } from "react"; 2 | import { convertWebMBlobToWav } from "../lib/audioUtils"; 3 | 4 | function useAudioDownload() { 5 | // Ref to store the MediaRecorder instance. 6 | const mediaRecorderRef = useRef(null); 7 | // Ref to collect all recorded Blob chunks. 8 | const recordedChunksRef = useRef([]); 9 | 10 | /** 11 | * Starts recording by combining the provided remote stream with 12 | * the microphone audio. 13 | * @param remoteStream - The remote MediaStream (e.g., from the audio element). 14 | */ 15 | const startRecording = async (remoteStream: MediaStream) => { 16 | let micStream: MediaStream; 17 | try { 18 | micStream = await navigator.mediaDevices.getUserMedia({ audio: true }); 19 | } catch (err) { 20 | console.error("Error getting microphone stream:", err); 21 | // Fallback to an empty MediaStream if microphone access fails. 22 | micStream = new MediaStream(); 23 | } 24 | 25 | // Create an AudioContext to merge the streams. 26 | const audioContext = new AudioContext(); 27 | const destination = audioContext.createMediaStreamDestination(); 28 | 29 | // Connect the remote audio stream. 30 | try { 31 | const remoteSource = audioContext.createMediaStreamSource(remoteStream); 32 | remoteSource.connect(destination); 33 | } catch (err) { 34 | console.error("Error connecting remote stream to the audio context:", err); 35 | } 36 | 37 | // Connect the microphone audio stream. 38 | try { 39 | const micSource = audioContext.createMediaStreamSource(micStream); 40 | micSource.connect(destination); 41 | } catch (err) { 42 | console.error("Error connecting microphone stream to the audio context:", err); 43 | } 44 | 45 | const options = { mimeType: "audio/webm" }; 46 | try { 47 | const mediaRecorder = new MediaRecorder(destination.stream, options); 48 | mediaRecorder.ondataavailable = (event: BlobEvent) => { 49 | if (event.data && event.data.size > 0) { 50 | recordedChunksRef.current.push(event.data); 51 | } 52 | }; 53 | // Start recording without a timeslice. 54 | mediaRecorder.start(); 55 | mediaRecorderRef.current = mediaRecorder; 56 | } catch (err) { 57 | console.error("Error starting MediaRecorder with combined stream:", err); 58 | } 59 | }; 60 | 61 | /** 62 | * Stops the MediaRecorder, if active. 
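Any chunks gathered so far stay in recordedChunksRef, so a later downloadRecording() call can still export them. 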
63 | */ 64 | const stopRecording = () => { 65 | if (mediaRecorderRef.current) { 66 | // Request any final data before stopping. 67 | mediaRecorderRef.current.requestData(); 68 | mediaRecorderRef.current.stop(); 69 | mediaRecorderRef.current = null; 70 | } 71 | }; 72 | 73 | /** 74 | * Initiates download of the recording after converting from WebM to WAV. 75 | * If the recorder is still active, we request its latest data before downloading. 76 | */ 77 | const downloadRecording = async () => { 78 | // If recording is still active, request the latest chunk. 79 | if (mediaRecorderRef.current && mediaRecorderRef.current.state === "recording") { 80 | // Request the current data. 81 | mediaRecorderRef.current.requestData(); 82 | // Allow a short delay for ondataavailable to fire. 83 | await new Promise((resolve) => setTimeout(resolve, 100)); 84 | } 85 | 86 | if (recordedChunksRef.current.length === 0) { 87 | console.warn("No recorded chunks found to download."); 88 | return; 89 | } 90 | 91 | // Combine the recorded chunks into a single WebM blob. 92 | const webmBlob = new Blob(recordedChunksRef.current, { type: "audio/webm" }); 93 | 94 | try { 95 | // Convert the WebM blob into a WAV blob. 96 | const wavBlob = await convertWebMBlobToWav(webmBlob); 97 | const url = URL.createObjectURL(wavBlob); 98 | 99 | // Generate a formatted datetime string (replace characters not allowed in filenames). 100 | const now = new Date().toISOString().replace(/[:.]/g, "-"); 101 | 102 | // Create an invisible anchor element and trigger the download. 103 | const a = document.createElement("a"); 104 | a.style.display = "none"; 105 | a.href = url; 106 | a.download = `realtime_agents_audio_${now}.wav`; 107 | document.body.appendChild(a); 108 | a.click(); 109 | document.body.removeChild(a); 110 | 111 | // Clean up the blob URL after a short delay. 112 | setTimeout(() => URL.revokeObjectURL(url), 100); 113 | } catch (err) { 114 | console.error("Error converting recording to WAV:", err); 115 | } 116 | }; 117 | 118 | return { startRecording, stopRecording, downloadRecording }; 119 | } 120 | 121 | export default useAudioDownload; -------------------------------------------------------------------------------- /src/app/layout.tsx: -------------------------------------------------------------------------------- 1 | import type { Metadata } from "next"; 2 | import "./globals.css"; 3 | import "./lib/envSetup"; 4 | 5 | export const metadata: Metadata = { 6 | title: "Realtime API Agents", 7 | description: "A demo app from OpenAI.", 8 | }; 9 | 10 | export default function RootLayout({ 11 | children, 12 | }: Readonly<{ 13 | children: React.ReactNode; 14 | }>) { 15 | return ( 16 | 17 | {children} 18 | 19 | ); 20 | } 21 | -------------------------------------------------------------------------------- /src/app/lib/audioUtils.ts: -------------------------------------------------------------------------------- 1 | // WAV conversion utilities 2 | 3 | /** 4 | * Writes a string into a DataView at the given offset. 5 | */ 6 | export function writeString(view: DataView, offset: number, str: string) { 7 | for (let i = 0; i < str.length; i++) { 8 | view.setUint8(offset + i, str.charCodeAt(i)); 9 | } 10 | } 11 | 12 | /** 13 | * Converts a Float32Array to 16-bit PCM in a DataView. 14 | */ 15 | export function floatTo16BitPCM(output: DataView, offset: number, input: Float32Array) { 16 | for (let i = 0; i < input.length; i++, offset += 2) { 17 | const s = Math.max(-1, Math.min(1, input[i])); 18 | output.setInt16(offset, s < 0 ? 
s * 0x8000 : s * 0x7FFF, true); 19 | } 20 | } 21 | 22 | /** 23 | * Encodes a Float32Array as a WAV file. 24 | */ 25 | export function encodeWAV(samples: Float32Array, sampleRate: number): ArrayBuffer { 26 | const buffer = new ArrayBuffer(44 + samples.length * 2); 27 | const view = new DataView(buffer); 28 | 29 | // RIFF identifier 30 | writeString(view, 0, "RIFF"); 31 | // file length minus RIFF identifier length and file description length 32 | view.setUint32(4, 36 + samples.length * 2, true); 33 | // RIFF type 34 | writeString(view, 8, "WAVE"); 35 | // format chunk identifier 36 | writeString(view, 12, "fmt "); 37 | // format chunk length 38 | view.setUint32(16, 16, true); 39 | // sample format (raw) 40 | view.setUint16(20, 1, true); 41 | // channel count - forcing mono here by averaging channels 42 | view.setUint16(22, 1, true); 43 | // sample rate 44 | view.setUint32(24, sampleRate, true); 45 | // byte rate (sample rate * block align) 46 | view.setUint32(28, sampleRate * 2, true); 47 | // block align (channel count * bytes per sample) 48 | view.setUint16(32, 2, true); 49 | // bits per sample 50 | view.setUint16(34, 16, true); 51 | // data chunk identifier 52 | writeString(view, 36, "data"); 53 | // data chunk length 54 | view.setUint32(40, samples.length * 2, true); 55 | 56 | floatTo16BitPCM(view, 44, samples); 57 | 58 | return buffer; 59 | } 60 | 61 | /** 62 | * Converts a WebM audio blob to a WAV blob. 63 | */ 64 | export async function convertWebMBlobToWav(blob: Blob): Promise<Blob> { 65 | const arrayBuffer = await blob.arrayBuffer(); 66 | const audioContext = new AudioContext(); 67 | const audioBuffer = await audioContext.decodeAudioData(arrayBuffer); 68 | const numChannels = audioBuffer.numberOfChannels; 69 | const length = audioBuffer.length; 70 | const combined = new Float32Array(length); 71 | 72 | // Average channels to produce mono output 73 | for (let channel = 0; channel < numChannels; channel++) { 74 | const channelData = audioBuffer.getChannelData(channel); 75 | for (let i = 0; i < length; i++) { 76 | combined[i] += channelData[i]; 77 | } 78 | } 79 | for (let i = 0; i < length; i++) { 80 | combined[i] /= numChannels; 81 | } 82 | const wavBuffer = encodeWAV(combined, audioBuffer.sampleRate); 83 | return new Blob([wavBuffer], { type: "audio/wav" }); 84 | } -------------------------------------------------------------------------------- /src/app/lib/callOai.ts: -------------------------------------------------------------------------------- 1 | import { zodTextFormat } from 'openai/helpers/zod'; 2 | import { GuardrailOutputZod, GuardrailOutput } from '@/app/types'; 3 | 4 | export async function runGuardrailClassifier( 5 | message: string, 6 | ): Promise<GuardrailOutput> { 7 | const messages = [ 8 | { 9 | role: 'user', 10 | content: `You are an expert at classifying text according to moderation policies. Consider the provided message, analyze potential classes from output_classes, and output the best classification. Output json, following the provided schema. Keep your analysis and reasoning short and to the point, maximum 2 sentences. 11 | 12 | <info> 13 | - Company name: newTelco, or Snowy Peak Boards 14 | </info> 15 | 16 | <message> 17 | ${message} 18 | </message> 19 | 20 | <output_classes> 21 | - OFFENSIVE: Content that includes hate speech, discriminatory language, insults, slurs, or harassment. 22 | - OFF_BRAND: Content that discusses competitors in a disparaging way. 23 | - VIOLENCE: Content that includes explicit threats, incitement of harm, or graphic descriptions of physical injury or violence. 
24 | - NONE: If no other classes are appropriate and the message is fine. 25 | </output_classes> 26 | `, 27 | }, 28 | ]; 29 | 30 | const response = await fetch('/api/responses', { 31 | method: 'POST', 32 | headers: { 33 | 'Content-Type': 'application/json', 34 | }, 35 | body: JSON.stringify({ 36 | model: 'gpt-4o-mini', 37 | input: messages, 38 | text: { 39 | format: zodTextFormat(GuardrailOutputZod, 'output_format'), 40 | }, 41 | }), 42 | }); 43 | 44 | if (!response.ok) { 45 | console.warn('Server returned an error:', response); 46 | return Promise.reject('Error with runGuardrailClassifier.'); 47 | } 48 | 49 | const data = await response.json(); 50 | 51 | try { 52 | const output = GuardrailOutputZod.parse(data.output_parsed); 53 | return output; 54 | } catch (error) { 55 | console.error('Error parsing the message content as GuardrailOutput:', error); 56 | return Promise.reject('Failed to parse guardrail output.'); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/app/lib/envSetup.ts: -------------------------------------------------------------------------------- 1 | import dotenv from 'dotenv'; 2 | 3 | dotenv.config({path: './.env'}) -------------------------------------------------------------------------------- /src/app/page.tsx: -------------------------------------------------------------------------------- 1 | import React, { Suspense } from "react"; 2 | import { TranscriptProvider } from "@/app/contexts/TranscriptContext"; 3 | import { EventProvider } from "@/app/contexts/EventContext"; 4 | import App from "./App"; 5 | 6 | export default function Page() { 7 | return ( 8 | <Suspense fallback={<div>Loading...</div>}> 9 | <TranscriptProvider> 10 | <EventProvider> 11 | <App /> 12 | </EventProvider> 13 | </TranscriptProvider> 14 | </Suspense> 15 | ); 16 | } 17 | -------------------------------------------------------------------------------- /src/app/types.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | 3 | // Define the allowed moderation categories only once 4 | export const MODERATION_CATEGORIES = [ 5 | "OFFENSIVE", 6 | "OFF_BRAND", 7 | "VIOLENCE", 8 | "NONE", 9 | ] as const; 10 | 11 | // Derive the union type for ModerationCategory from the array 12 | export type ModerationCategory = (typeof MODERATION_CATEGORIES)[number]; 13 | 14 | // Create a Zod enum based on the same array 15 | export const ModerationCategoryZod = z.enum([...MODERATION_CATEGORIES]); 16 | 17 | export type SessionStatus = "DISCONNECTED" | "CONNECTING" | "CONNECTED"; 18 | 19 | export interface ToolParameterProperty { 20 | type: string; 21 | description?: string; 22 | enum?: string[]; 23 | pattern?: string; 24 | properties?: Record<string, ToolParameterProperty>; 25 | required?: string[]; 26 | additionalProperties?: boolean; 27 | items?: ToolParameterProperty; 28 | } 29 | 30 | export interface ToolParameters { 31 | type: string; 32 | properties: Record<string, ToolParameterProperty>; 33 | required?: string[]; 34 | additionalProperties?: boolean; 35 | } 36 | 37 | export interface Tool { 38 | type: "function"; 39 | name: string; 40 | description: string; 41 | parameters: ToolParameters; 42 | } 43 | 44 | export interface AgentConfig { 45 | name: string; 46 | publicDescription: string; // gives context to agent transfer tool 47 | instructions: string; 48 | tools: Tool[]; 49 | toolLogic?: Record< 50 | string, 51 | (args: any, transcriptLogsFiltered: TranscriptItem[], addTranscriptBreadcrumb?: (title: string, data?: any) => void) => Promise<any> | any 52 | >; 53 | // addTranscriptBreadcrumb is a param in case we want to add additional breadcrumbs, e.g. for nested tool calls from a supervisor agent. 
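downstreamAgents below lists the agents this config can hand off to; the lighter { name, publicDescription } variant carries just enough context for the transfer tool. 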
54 | downstreamAgents?: 55 | | AgentConfig[] 56 | | { name: string; publicDescription: string }[]; 57 | } 58 | 59 | export type AllAgentConfigsType = Record<string, AgentConfig[]>; 60 | 61 | export interface GuardrailResultType { 62 | status: "IN_PROGRESS" | "DONE"; 63 | testText?: string; 64 | category?: ModerationCategory; 65 | rationale?: string; 66 | } 67 | 68 | export interface TranscriptItem { 69 | itemId: string; 70 | type: "MESSAGE" | "BREADCRUMB"; 71 | role?: "user" | "assistant"; 72 | title?: string; 73 | data?: Record<string, any>; 74 | expanded: boolean; 75 | timestamp: string; 76 | createdAtMs: number; 77 | status: "IN_PROGRESS" | "DONE"; 78 | isHidden: boolean; 79 | guardrailResult?: GuardrailResultType; 80 | } 81 | 82 | export interface Log { 83 | id: number; 84 | timestamp: string; 85 | direction: string; 86 | eventName: string; 87 | data: any; 88 | expanded: boolean; 89 | type: string; 90 | } 91 | 92 | export interface ServerEvent { 93 | type: string; 94 | event_id?: string; 95 | item_id?: string; 96 | transcript?: string; 97 | delta?: string; 98 | session?: { 99 | id?: string; 100 | }; 101 | item?: { 102 | id?: string; 103 | object?: string; 104 | type?: string; 105 | status?: string; 106 | name?: string; 107 | arguments?: string; 108 | role?: "user" | "assistant"; 109 | content?: { 110 | type?: string; 111 | transcript?: string | null; 112 | text?: string; 113 | }[]; 114 | }; 115 | response?: { 116 | output?: { 117 | id: string; 118 | type?: string; 119 | name?: string; 120 | arguments?: any; 121 | call_id?: string; 122 | role: string; 123 | content?: any; 124 | }[]; 125 | metadata: Record<string, any>; 126 | status_details?: { 127 | error?: any; 128 | }; 129 | }; 130 | } 131 | 132 | export interface LoggedEvent { 133 | id: number; 134 | direction: "client" | "server"; 135 | expanded: boolean; 136 | timestamp: string; 137 | eventName: string; 138 | eventData: Record<string, any>; // can have arbitrary objects logged 139 | } 140 | 141 | // Update the GuardrailOutputZod schema to use the shared ModerationCategoryZod 142 | export const GuardrailOutputZod = z.object({ 143 | moderationRationale: z.string(), 144 | moderationCategory: ModerationCategoryZod, 145 | }); 146 | 147 | export type GuardrailOutput = z.infer<typeof GuardrailOutputZod>; 148 | -------------------------------------------------------------------------------- /tailwind.config.ts: -------------------------------------------------------------------------------- 1 | import type { Config } from "tailwindcss"; 2 | 3 | export default { 4 | content: [ 5 | "./src/pages/**/*.{js,ts,jsx,tsx,mdx}", 6 | "./src/components/**/*.{js,ts,jsx,tsx,mdx}", 7 | "./src/app/**/*.{js,ts,jsx,tsx,mdx}", 8 | ], 9 | theme: { 10 | extend: { 11 | colors: { 12 | background: "var(--background)", 13 | foreground: "var(--foreground)", 14 | }, 15 | }, 16 | }, 17 | plugins: [], 18 | } satisfies Config; 19 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2017", 4 | "lib": [ 5 | "dom", 6 | "dom.iterable", 7 | "esnext" 8 | ], 9 | "allowJs": true, 10 | "skipLibCheck": true, 11 | "strict": true, 12 | "noEmit": true, 13 | "esModuleInterop": true, 14 | "module": "esnext", 15 | "moduleResolution": "bundler", 16 | "resolveJsonModule": true, 17 | "isolatedModules": true, 18 | "jsx": "preserve", 19 | "incremental": true, 20 | "plugins": [ 21 | { 22 | "name": "next" 23 | } 24 | ], 25 | "paths": { 26 | "@/*": [ 27 | "./src/*" 28 | ] 29 | } 30 | }, 31 | "include": [ 32 
| "**/*.ts", 33 | "**/*.tsx", 34 | "next-env.d.ts", 35 | ".next/types/**/*.ts" 36 | ], 37 | "exclude": [ 38 | "node_modules", 39 | ".next" 40 | ] 41 | } 42 | --------------------------------------------------------------------------------