├── .editorconfig ├── .env.example ├── .eslintrc.json ├── .github ├── funding.yml └── workflows │ └── main.yml ├── .gitignore ├── .npmrc ├── .prettierignore ├── .prettierrc ├── examples ├── node │ ├── audio.ts │ ├── basic.ts │ ├── convo.ts │ ├── mic.d.ts │ ├── package.json │ └── relay-server.ts └── openai-realtime-console │ ├── .eslintrc.json │ ├── .gitignore │ ├── .prettierrc │ ├── LICENSE │ ├── README.md │ ├── package.json │ ├── public │ ├── index.html │ ├── openai-logomark.svg │ └── robots.txt │ ├── readme │ └── realtime-console-demo.png │ ├── src │ ├── App.scss │ ├── App.tsx │ ├── components │ │ ├── Map.scss │ │ ├── Map.tsx │ │ ├── button │ │ │ ├── Button.scss │ │ │ └── Button.tsx │ │ └── toggle │ │ │ ├── Toggle.scss │ │ │ └── Toggle.tsx │ ├── index.css │ ├── index.tsx │ ├── lib │ │ └── wavtools │ │ │ ├── index.js │ │ │ └── lib │ │ │ ├── analysis │ │ │ ├── audio_analysis.js │ │ │ └── constants.js │ │ │ ├── wav_packer.js │ │ │ ├── wav_recorder.js │ │ │ ├── wav_stream_player.js │ │ │ └── worklets │ │ │ ├── audio_processor.js │ │ │ └── stream_processor.js │ ├── logo.svg │ ├── pages │ │ ├── ConsolePage.scss │ │ └── ConsolePage.tsx │ ├── react-app-env.d.ts │ ├── reportWebVitals.ts │ ├── setupTests.ts │ └── utils │ │ ├── conversation_config.js │ │ └── wav_renderer.ts │ └── tsconfig.json ├── fixtures └── toronto.mp3 ├── license ├── package.json ├── pnpm-lock.yaml ├── pnpm-workspace.yaml ├── readme.md ├── src ├── api.ts ├── client.test.ts ├── client.ts ├── conversation.ts ├── event-handler.ts ├── events.ts ├── index.ts ├── node │ ├── index.ts │ └── relay-server.ts ├── reset.d.ts ├── types.ts └── utils.ts ├── tsconfig.json └── tsup.config.ts /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | tab_width = 2 7 | end_of_line = lf 8 | charset = utf-8 9 | trim_trailing_whitespace = true 10 | insert_final_newline = true 11 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "extends": ["@fisch0920/eslint-config/node"], 4 | "rules": { 5 | "unicorn/consistent-function-scoping": "off" 6 | }, 7 | "ignorePatterns": ["examples/openai-realtime-console"] 8 | } 9 | -------------------------------------------------------------------------------- /.github/funding.yml: -------------------------------------------------------------------------------- 1 | github: [transitive-bullshit] 2 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | name: Test Node.js ${{ matrix.node-version }} 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: true 11 | matrix: 12 | node-version: 13 | - 18 14 | - 20 15 | - 21 16 | - 22 17 | 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v4 21 | 22 | - name: Install pnpm 23 | uses: pnpm/action-setup@v3 24 | id: pnpm-install 25 | with: 26 | version: 9.12.2 27 | run_install: false 28 | 29 | - name: Install Node.js 30 | uses: actions/setup-node@v4 31 | with: 32 | node-version: ${{ 
matrix.node-version }} 33 | cache: 'pnpm' 34 | 35 | - name: Install libasound2-dev (for optional "speaker" dev dep) 36 | run: sudo apt-get install -y libasound2-dev 37 | 38 | - name: Install dependencies 39 | run: pnpm install --frozen-lockfile --strict-peer-dependencies 40 | 41 | - name: Run test 42 | run: pnpm test 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # next.js 12 | .next/ 13 | 14 | # production 15 | build/ 16 | dist/ 17 | 18 | # misc 19 | .DS_Store 20 | *.pem 21 | 22 | # debug 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | .pnpm-debug.log* 27 | 28 | # local env files 29 | .env*.local 30 | 31 | # turbo 32 | .turbo 33 | 34 | # vercel 35 | .vercel 36 | 37 | # typescript 38 | *.tsbuildinfo 39 | next-env.d.ts 40 | 41 | .env 42 | 43 | old/ 44 | out/ 45 | -------------------------------------------------------------------------------- /.npmrc: -------------------------------------------------------------------------------- 1 | enable-pre-post-scripts=true 2 | package-manager-strict=false 3 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | examples/openai-realtime-console 2 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "singleQuote": true, 3 | "jsxSingleQuote": true, 4 | "semi": false, 5 | "useTabs": false, 6 | "tabWidth": 2, 7 | "bracketSpacing": true, 8 | "bracketSameLine": false, 9 | "arrowParens": "always", 10 | "trailingComma": "none" 11 | } 12 | -------------------------------------------------------------------------------- /examples/node/audio.ts: -------------------------------------------------------------------------------- 1 | import 'dotenv/config' 2 | 3 | import fs from 'node:fs/promises' 4 | 5 | import decodeAudio from 'audio-decode' 6 | import { arrayBufferToBase64, RealtimeClient } from 'openai-realtime-api' 7 | 8 | /** 9 | * Simple Node.js demo using the `RealtimeClient` which sends a short audio 10 | * message and waits for a complete response. 11 | */ 12 | async function main() { 13 | const audioFile = await fs.readFile('./fixtures/toronto.mp3') 14 | const audioBuffer = await decodeAudio(audioFile) 15 | const channelData = audioBuffer.getChannelData(0) // only accepts mono 16 | const audio = arrayBufferToBase64(channelData) 17 | 18 | const client = new RealtimeClient({ 19 | debug: false, 20 | sessionConfig: { 21 | instructions: 22 | 'Please follow the instructions of any query you receive.\n' + 23 | 'Be concise in your responses. 
Speak quickly and answer shortly.', 24 | turn_detection: null 25 | } 26 | }) 27 | 28 | await client.connect() 29 | await client.waitForSessionCreated() 30 | 31 | console.log('Sending toronto.mp3 audio message...') 32 | client.sendUserMessageContent([{ type: 'input_audio', audio }]) 33 | 34 | const event = await client.realtime.waitForNext('response.done') 35 | console.log(JSON.stringify(event, null, 2)) 36 | 37 | client.disconnect() 38 | } 39 | 40 | await main() 41 | -------------------------------------------------------------------------------- /examples/node/basic.ts: -------------------------------------------------------------------------------- 1 | import 'dotenv/config' 2 | 3 | import { RealtimeClient } from 'openai-realtime-api' 4 | 5 | /** 6 | * Simple Node.js demo using the `RealtimeClient` which sends a text message and 7 | * waits for a complete response. 8 | */ 9 | async function main() { 10 | const client = new RealtimeClient({ 11 | debug: false, 12 | sessionConfig: { 13 | instructions: 14 | 'Please follow the instructions of any query you receive.\n' + 15 | 'Be concise in your responses. Speak quickly and answer shortly.', 16 | turn_detection: null 17 | } 18 | }) 19 | 20 | await client.connect() 21 | await client.waitForSessionCreated() 22 | 23 | const text = 'How are you?' 24 | console.log(text) 25 | client.sendUserMessageContent([{ type: 'input_text', text }]) 26 | 27 | const event = await client.realtime.waitForNext('response.done') 28 | console.log(JSON.stringify(event, null, 2)) 29 | 30 | client.disconnect() 31 | } 32 | 33 | await main() 34 | -------------------------------------------------------------------------------- /examples/node/convo.ts: -------------------------------------------------------------------------------- 1 | import 'dotenv/config' 2 | 3 | import { Readable } from 'node:stream' 4 | 5 | import microphone from 'mic' 6 | import { RealtimeClient } from 'openai-realtime-api' 7 | import Speaker from 'speaker' 8 | 9 | /** 10 | * Simple Node.js demo using the `RealtimeClient` with a microphone and speaker 11 | * to simulate a full, back & forth conversation from the terminal. 12 | */ 13 | async function main() { 14 | const client = new RealtimeClient({ 15 | debug: false, 16 | sessionConfig: { 17 | instructions: 18 | 'Please follow the instructions of any query you receive.\n' + 19 | 'Be concise in your responses. Speak quickly and answer shortly.', 20 | turn_detection: null 21 | } 22 | }) 23 | 24 | await client.connect() 25 | await client.waitForSessionCreated() 26 | 27 | let mic: microphone.Mic | undefined 28 | let speaker: Speaker | undefined 29 | startAudioStream() 30 | 31 | client.on('conversation.item.completed', ({ item }) => { 32 | const { formatted: _, ...rest } = item 33 | console.log('Conversation item completed:', rest) 34 | 35 | if ( 36 | item.type === 'message' && 37 | item.role === 'assistant' && 38 | item.formatted && 39 | item.formatted.audio 40 | ) { 41 | console.log(`Playing audio response... 
"${item.formatted.transcript}"`) 42 | playAudio(item.formatted.audio) 43 | } 44 | }) 45 | 46 | function startAudioStream() { 47 | try { 48 | mic = microphone({ 49 | rate: '24000', 50 | channels: '1', 51 | debug: false, 52 | exitOnSilence: 6, 53 | fileType: 'raw', 54 | encoding: 'signed-integer' 55 | }) 56 | 57 | const micInputStream = mic!.getAudioStream() 58 | 59 | micInputStream.on('error', (error: any) => { 60 | console.error('Microphone error:', error) 61 | }) 62 | 63 | mic!.start() 64 | console.log('Microphone started streaming.') 65 | 66 | let audioBuffer = Buffer.alloc(0) 67 | const chunkSize = 4800 // 0.2 seconds of audio at 24kHz 68 | 69 | micInputStream.on('data', (data: Buffer) => { 70 | audioBuffer = Buffer.concat([audioBuffer, data]) 71 | 72 | while (audioBuffer.length >= chunkSize) { 73 | const chunk = audioBuffer.subarray(0, chunkSize) 74 | audioBuffer = audioBuffer.subarray(chunkSize) 75 | 76 | const int16Array = new Int16Array( 77 | chunk.buffer, 78 | chunk.byteOffset, 79 | chunk.length / 2 80 | ) 81 | 82 | try { 83 | client.appendInputAudio(int16Array) 84 | } catch (err) { 85 | console.error('Error sending audio data:', err) 86 | } 87 | } 88 | }) 89 | 90 | micInputStream.on('silence', () => { 91 | console.log('Silence detected, creating response...') 92 | try { 93 | client.createResponse() 94 | } catch (err) { 95 | console.error('Error creating response:', err) 96 | } 97 | }) 98 | } catch (err) { 99 | console.error('Error starting audio stream:', err) 100 | } 101 | } 102 | 103 | function playAudio(audioData: Int16Array) { 104 | try { 105 | if (!speaker) { 106 | speaker = new Speaker({ 107 | channels: 1, 108 | bitDepth: 16, 109 | sampleRate: client.conversation.frequency 110 | }) 111 | } 112 | 113 | const origSpeaker = speaker 114 | 115 | const buffer = Buffer.from(audioData.buffer) 116 | const readableStream = new Readable({ 117 | read() { 118 | if (speaker !== origSpeaker) return 119 | this.push(buffer) 120 | this.push(null) 121 | } 122 | }) 123 | 124 | // Pipe the audio stream to the speaker 125 | readableStream.pipe(speaker) 126 | console.log( 127 | 'Audio sent to speaker for playback. 
Buffer length:', 128 | buffer.length 129 | ) 130 | 131 | speaker.on('close', () => { 132 | speaker = undefined 133 | }) 134 | } catch (err) { 135 | console.error('Error playing audio:', err) 136 | } 137 | } 138 | } 139 | 140 | await main() 141 | -------------------------------------------------------------------------------- /examples/node/mic.d.ts: -------------------------------------------------------------------------------- 1 | declare module 'mic' { 2 | import type { Transform } from 'node:stream' 3 | 4 | export function mic(options: Options): Mic 5 | 6 | export interface Mic { 7 | start(): void 8 | stop(): void 9 | pause(): void 10 | resume(): void 11 | getAudioStream(): Transform 12 | } 13 | 14 | export interface Options { 15 | endian?: 'big' | 'little' 16 | bitwidth?: number | string 17 | encoding?: 'signed-integer' | 'unsigned-integer' 18 | rate?: number | string 19 | channels?: number | string 20 | device?: string 21 | exitOnSilence?: number | string 22 | debug?: boolean | string 23 | fileType?: string 24 | } 25 | 26 | export = mic 27 | } 28 | -------------------------------------------------------------------------------- /examples/node/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "openai-realtime-api-examples-node", 3 | "version": "0.1.0", 4 | "private": true, 5 | "author": "Travis Fischer ", 6 | "license": "MIT", 7 | "repository": { 8 | "type": "git", 9 | "url": "git+https://github.com/transitive-bullshit/openai-realtime-api.git" 10 | }, 11 | "type": "module", 12 | "dependencies": { 13 | "openai-realtime-api": "workspace:*", 14 | "audio-decode": "^2.2.2", 15 | "dotenv": "^16.4.5" 16 | }, 17 | "optionalDependencies": { 18 | "mic": "^2.1.2", 19 | "speaker": "^0.5.5" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /examples/node/relay-server.ts: -------------------------------------------------------------------------------- 1 | import 'dotenv/config' 2 | 3 | import { RealtimeClient } from 'openai-realtime-api' 4 | import { RealtimeRelay } from 'openai-realtime-api/node' 5 | 6 | /** 7 | * Simple Node.js demo showing how to run the relay server. 8 | */ 9 | async function main() { 10 | const client = new RealtimeClient({ 11 | debug: false, 12 | relay: true, 13 | sessionConfig: { 14 | instructions: 15 | 'Please follow the instructions of any query you receive.\n' + 16 | 'Be concise in your responses. Speak quickly and answer shortly.', 17 | turn_detection: null 18 | } 19 | }) 20 | 21 | const relay = new RealtimeRelay({ client }) 22 | relay.listen() 23 | } 24 | 25 | await main() 26 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "parserOptions": { 4 | "sourceType": "module" 5 | }, 6 | "env": { 7 | "es2022": true 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # production 12 | /build 13 | 14 | # packaging 15 | *.zip 16 | *.tar.gz 17 | *.tar 18 | *.tgz 19 | *.bla 20 | 21 | # misc 22 | .DS_Store 23 | .env 24 | .env.local 25 | .env.development.local 26 | .env.test.local 27 | .env.production.local 28 | 29 | npm-debug.log* 30 | yarn-debug.log* 31 | yarn-error.log* 32 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "tabWidth": 2, 3 | "useTabs": false, 4 | "singleQuote": true 5 | } 6 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/README.md: -------------------------------------------------------------------------------- 1 | > [!IMPORTANT] 2 | > This example has been imported from https://github.com/openai/openai-realtime-console ([at commit 6ea4dba](https://github.com/openai/openai-realtime-console/tree/6ea4dba795fee868c60ea9e8e7eba7469974b3e9)). The only change has been to replace `@openai/realtime-api-beta` with `openai-realtime-api` and to fix a few types. 3 | 4 | # OpenAI Realtime Console 5 | 6 | The OpenAI Realtime Console is intended as an inspector and interactive API reference 7 | for the OpenAI Realtime API. It comes packaged with two utility libraries, 8 | [openai/openai-realtime-api-beta](https://github.com/openai/openai-realtime-api-beta) 9 | that acts as a **Reference Client** (for browser and Node.js) and 10 | [`/src/lib/wavtools`](./src/lib/wavtools) which allows for simple audio 11 | management in the browser. 12 | 13 | 14 | 15 | # Starting the console 16 | 17 | This is a React project created using `create-react-app` that is bundled via Webpack. 18 | Install it by extracting the contents of this package and using; 19 | 20 | ```shell 21 | $ npm i 22 | ``` 23 | 24 | Start your server with: 25 | 26 | ```shell 27 | $ npm start 28 | ``` 29 | 30 | It should be available via `localhost:3000`. 
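Since this copy of the console is vendored inside the `openai-realtime-api` pnpm workspace (see `pnpm-workspace.yaml` at the repo root), you can also install and run it from the monorepo rather than as a standalone package; a minimal sketch, assuming you are working from a checkout of this repository:

```shell
# from the repository root: installs every workspace package, including this example
$ pnpm install

# then start the console from its package directory
$ cd examples/openai-realtime-console
$ pnpm start
```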
31 | 32 | # Table of contents 33 | 34 | - [OpenAI Realtime Console](#openai-realtime-console) 35 | - [Starting the console](#starting-the-console) 36 | - [Table of contents](#table-of-contents) 37 | - [Using the console](#using-the-console) 38 | - [Using a relay server](#using-a-relay-server) 39 | - [Realtime API reference client](#realtime-api-reference-client) 40 | - [Sending streaming audio](#sending-streaming-audio) 41 | - [Adding and using tools](#adding-and-using-tools) 42 | - [Interrupting the model](#interrupting-the-model) 43 | - [Reference client events](#reference-client-events) 44 | - [Wavtools](#wavtools) 45 | - [WavRecorder Quickstart](#wavrecorder-quickstart) 46 | - [WavStreamPlayer Quickstart](#wavstreamplayer-quickstart) 47 | - [Acknowledgements and contact](#acknowledgements-and-contact) 48 | 49 | # Using the console 50 | 51 | The console requires an OpenAI API key (**user key** or **project key**) that has access to the 52 | Realtime API. You'll be prompted on startup to enter it. It will be saved via `localStorage` and can be 53 | changed at any time from the UI. 54 | 55 | To start a session you'll need to **connect**. This will require microphone access. 56 | You can then choose between **manual** (Push-to-talk) and **vad** (Voice Activity Detection) 57 | conversation modes, and switch between them at any time. 58 | 59 | There are two functions enabled; 60 | 61 | - `get_weather`: Ask for the weather anywhere and the model will do its best to pinpoint the 62 | location, show it on a map, and get the weather for that location. Note that it doesn't 63 | have location access, and coordinates are "guessed" from the model's training data so 64 | accuracy might not be perfect. 65 | - `set_memory`: You can ask the model to remember information for you, and it will store it in 66 | a JSON blob on the left. 67 | 68 | You can freely interrupt the model at any time in push-to-talk or VAD mode. 69 | 70 | ## Using a relay server 71 | 72 | If you would like to build a more robust implementation and play around with the reference 73 | client using your own server, we have included a Node.js [Relay Server](/relay-server/index.js). 74 | 75 | ```shell 76 | $ npm run relay 77 | ``` 78 | 79 | It will start automatically on `localhost:8081`. 80 | 81 | **You will need to create a `.env` file** with the following configuration: 82 | 83 | ```conf 84 | OPENAI_API_KEY=YOUR_API_KEY 85 | REACT_APP_LOCAL_RELAY_SERVER_URL=http://localhost:8081 86 | ``` 87 | 88 | You will need to restart both your React app and relay server for the `.env.` changes 89 | to take effect. The local server URL is loaded via [`ConsolePage.tsx`](/src/pages/ConsolePage.tsx). 90 | To stop using the relay server at any time, simply delete the environment 91 | variable or set it to empty string. 
92 | 93 | ```javascript 94 | /** 95 | * Running a local relay server will allow you to hide your API key 96 | * and run custom logic on the server 97 | * 98 | * Set the local relay server address to: 99 | * REACT_APP_LOCAL_RELAY_SERVER_URL=http://localhost:8081 100 | * 101 | * This will also require you to set OPENAI_API_KEY= in a `.env` file 102 | * You can run it with `npm run relay`, in parallel with `npm start` 103 | */ 104 | const LOCAL_RELAY_SERVER_URL: string = 105 | process.env.REACT_APP_LOCAL_RELAY_SERVER_URL || ''; 106 | ``` 107 | 108 | This server is **only a simple message relay**, but it can be extended to: 109 | 110 | - Hide API credentials if you would like to ship an app to play with online 111 | - Handle certain calls you would like to keep secret (e.g. `instructions`) on 112 | the server directly 113 | - Restrict what types of events the client can receive and send 114 | 115 | You will have to implement these features yourself. 116 | 117 | # Realtime API reference client 118 | 119 | The latest reference client and documentation are available on GitHub at 120 | [openai/openai-realtime-api-beta](https://github.com/openai/openai-realtime-api-beta). 121 | 122 | You can use this client yourself in any React (front-end) or Node.js project. 123 | For full documentation, refer to the GitHub repository, but you can use the 124 | guide here as a primer to get started. 125 | 126 | ```javascript 127 | import { RealtimeClient } from '/src/lib/realtime-api-beta/index.js'; 128 | 129 | const client = new RealtimeClient({ apiKey: process.env.OPENAI_API_KEY }); 130 | 131 | // Can set parameters ahead of connecting 132 | client.updateSession({ instructions: 'You are a great, upbeat friend.' }); 133 | client.updateSession({ voice: 'alloy' }); 134 | client.updateSession({ turn_detection: 'server_vad' }); 135 | client.updateSession({ input_audio_transcription: { model: 'whisper-1' } }); 136 | 137 | // Set up event handling 138 | client.on('conversation.updated', ({ item, delta }) => { 139 | const items = client.conversation.getItems(); // can use this to render all items 140 | /* includes all changes to conversations, delta may be populated */ 141 | }); 142 | 143 | // Connect to Realtime API 144 | await client.connect(); 145 | 146 | // Send an item and triggers a generation 147 | client.sendUserMessageContent([{ type: 'text', text: `How are you?` }]); 148 | ``` 149 | 150 | ## Sending streaming audio 151 | 152 | To send streaming audio, use the `.appendInputAudio()` method. If you're in `turn_detection: 'disabled'` mode, 153 | then you need to use `.generate()` to tell the model to respond. 154 | 155 | ```javascript 156 | // Send user audio, must be Int16Array or ArrayBuffer 157 | // Default audio format is pcm16 with sample rate of 24,000 Hz 158 | // This populates 1s of noise in 0.1s chunks 159 | for (let i = 0; i < 10; i++) { 160 | const data = new Int16Array(2400); 161 | for (let n = 0; n < 2400; n++) { 162 | const value = Math.floor((Math.random() * 2 - 1) * 0x8000); 163 | data[n] = value; 164 | } 165 | client.appendInputAudio(data); 166 | } 167 | // Pending audio is committed and model is asked to generate 168 | client.createResponse(); 169 | ``` 170 | 171 | ## Adding and using tools 172 | 173 | Working with tools is easy. Just call `.addTool()` and set a callback as the second parameter. 174 | The callback will be executed with the parameters for the tool, and the result will be automatically 175 | sent back to the model. 
176 | 177 | ```javascript 178 | // We can add tools as well, with callbacks specified 179 | client.addTool( 180 | { 181 | name: 'get_weather', 182 | description: 183 | 'Retrieves the weather for a given lat, lng coordinate pair. Specify a label for the location.', 184 | parameters: { 185 | type: 'object', 186 | properties: { 187 | lat: { 188 | type: 'number', 189 | description: 'Latitude', 190 | }, 191 | lng: { 192 | type: 'number', 193 | description: 'Longitude', 194 | }, 195 | location: { 196 | type: 'string', 197 | description: 'Name of the location', 198 | }, 199 | }, 200 | required: ['lat', 'lng', 'location'], 201 | }, 202 | }, 203 | async ({ lat, lng, location }) => { 204 | const result = await fetch( 205 | `https://api.open-meteo.com/v1/forecast?latitude=${lat}&longitude=${lng}&current=temperature_2m,wind_speed_10m`, 206 | ); 207 | const json = await result.json(); 208 | return json; 209 | }, 210 | ); 211 | ``` 212 | 213 | ## Interrupting the model 214 | 215 | You may want to manually interrupt the model, especially in `turn_detection: 'disabled'` mode. 216 | To do this, we can use: 217 | 218 | ```javascript 219 | // id is the id of the item currently being generated 220 | // sampleCount is the number of audio samples that have been heard by the listener 221 | client.cancelResponse(id, sampleCount); 222 | ``` 223 | 224 | This method will cause the model to immediately cease generation, but also truncate the 225 | item being played by removing all audio after `sampleCount` and clearing the text 226 | response. By using this method you can interrupt the model and prevent it from "remembering" 227 | anything it has generated that is ahead of where the user's state is. 228 | 229 | ## Reference client events 230 | 231 | There are five main client events for application control flow in `RealtimeClient`. 232 | Note that this is only an overview of using the client; the full Realtime API 233 | event specification is considerably larger. If you need more control, check out the GitHub repository: 234 | [openai/openai-realtime-api-beta](https://github.com/openai/openai-realtime-api-beta). 235 | 236 | ```javascript 237 | // errors like connection failures 238 | client.on('error', (event) => { 239 | // do thing 240 | }); 241 | 242 | // in VAD mode, the user starts speaking 243 | // we can use this to stop audio playback of a previous response if necessary 244 | client.on('conversation.interrupted', () => { 245 | /* do something */ 246 | }); 247 | 248 | // includes all changes to conversations 249 | // delta may be populated 250 | client.on('conversation.updated', ({ item, delta }) => { 251 | // get all items, e.g. 
if you need to update a chat window 252 | const items = client.conversation.getItems(); 253 | switch (item.type) { 254 | case 'message': 255 | // system, user, or assistant message (item.role) 256 | break; 257 | case 'function_call': 258 | // always a function call from the model 259 | break; 260 | case 'function_call_output': 261 | // always a response from the user / application 262 | break; 263 | } 264 | if (delta) { 265 | // Only one of the following will be populated for any given event 266 | // delta.audio = Int16Array, audio added 267 | // delta.transcript = string, transcript added 268 | // delta.arguments = string, function arguments added 269 | } 270 | }); 271 | 272 | // only triggered after item added to conversation 273 | client.on('conversation.item.appended', ({ item }) => { 274 | /* item status can be 'in_progress' or 'completed' */ 275 | }); 276 | 277 | // only triggered after item completed in conversation 278 | // will always be triggered after conversation.item.appended 279 | client.on('conversation.item.completed', ({ item }) => { 280 | /* item status will always be 'completed' */ 281 | }); 282 | ``` 283 | 284 | # Wavtools 285 | 286 | Wavtools contains easy management of PCM16 audio streams in the browser, both 287 | recording and playing. 288 | 289 | ## WavRecorder Quickstart 290 | 291 | ```javascript 292 | import { WavRecorder } from '/src/lib/wavtools/index.js'; 293 | 294 | const wavRecorder = new WavRecorder({ sampleRate: 24000 }); 295 | wavRecorder.getStatus(); // "ended" 296 | 297 | // request permissions, connect microphone 298 | await wavRecorder.begin(); 299 | wavRecorder.getStatus(); // "paused" 300 | 301 | // Start recording 302 | // This callback will be triggered in chunks of 8192 samples by default 303 | // { mono, raw } are Int16Array (PCM16) mono & full channel data 304 | await wavRecorder.record((data) => { 305 | const { mono, raw } = data; 306 | }); 307 | wavRecorder.getStatus(); // "recording" 308 | 309 | // Stop recording 310 | await wavRecorder.pause(); 311 | wavRecorder.getStatus(); // "paused" 312 | 313 | // outputs "audio/wav" audio file 314 | const audio = await wavRecorder.save(); 315 | 316 | // clears current audio buffer and starts recording 317 | await wavRecorder.clear(); 318 | await wavRecorder.record(); 319 | 320 | // get data for visualization 321 | const frequencyData = wavRecorder.getFrequencies(); 322 | 323 | // Stop recording, disconnects microphone, output file 324 | await wavRecorder.pause(); 325 | const finalAudio = await wavRecorder.end(); 326 | 327 | // Listen for device change; e.g. 
if somebody disconnects a microphone 328 | // deviceList is array of MediaDeviceInfo[] + `default` property 329 | wavRecorder.listenForDeviceChange((deviceList) => {}); 330 | ``` 331 | 332 | ## WavStreamPlayer Quickstart 333 | 334 | ```javascript 335 | import { WavStreamPlayer } from '/src/lib/wavtools/index.js'; 336 | 337 | const wavStreamPlayer = new WavStreamPlayer({ sampleRate: 24000 }); 338 | 339 | // Connect to audio output 340 | await wavStreamPlayer.connect(); 341 | 342 | // Create 1s of empty PCM16 audio 343 | const audio = new Int16Array(24000); 344 | // Queue 3s of audio, will start playing immediately 345 | wavStreamPlayer.add16BitPCM(audio, 'my-track'); 346 | wavStreamPlayer.add16BitPCM(audio, 'my-track'); 347 | wavStreamPlayer.add16BitPCM(audio, 'my-track'); 348 | 349 | // get data for visualization 350 | const frequencyData = wavStreamPlayer.getFrequencies(); 351 | 352 | // Interrupt the audio (halt playback) at any time 353 | // To restart, need to call .add16BitPCM() again 354 | const trackOffset = await wavStreamPlayer.interrupt(); 355 | trackOffset.trackId; // "my-track" 356 | trackOffset.offset; // sample number 357 | trackOffset.currentTime; // time in track 358 | ``` 359 | 360 | # Acknowledgements and contact 361 | 362 | Thanks for checking out the Realtime Console. We hope you have fun with the Realtime API. 363 | Special thanks to the whole Realtime API team for making this possible. Please feel free 364 | to reach out, ask questions, or give feedback by creating an issue on the repository. 365 | You can also reach out and let us know what you think directly! 366 | 367 | - OpenAI Developers / [@OpenAIDevs](https://x.com/OpenAIDevs) 368 | - Jordan Sitkin / API / [@dustmason](https://x.com/dustmason) 369 | - Mark Hudnall / API / [@landakram](https://x.com/landakram) 370 | - Peter Bakkum / API / [@pbbakkum](https://x.com/pbbakkum) 371 | - Atty Eleti / API / [@athyuttamre](https://x.com/athyuttamre) 372 | - Jason Clark / API / [@onebitToo](https://x.com/onebitToo) 373 | - Karolis Kosas / Design / [@karoliskosas](https://x.com/karoliskosas) 374 | - Keith Horwood / API + DX / [@keithwhor](https://x.com/keithwhor) 375 | - Romain Huet / DX / [@romainhuet](https://x.com/romainhuet) 376 | - Katia Gil Guzman / DX / [@kagigz](https://x.com/kagigz) 377 | - Ilan Bigio / DX / [@ilanbigio](https://x.com/ilanbigio) 378 | - Kevin Whinnery / DX / [@kevinwhinnery](https://x.com/kevinwhinnery) 379 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "openai-realtime-console", 3 | "version": "0.0.0", 4 | "type": "module", 5 | "private": true, 6 | "dependencies": { 7 | "@openai/realtime-api-beta": "github:openai/openai-realtime-api-beta", 8 | "@testing-library/jest-dom": "^5.17.0", 9 | "@testing-library/react": "^13.4.0", 10 | "@testing-library/user-event": "^13.5.0", 11 | "@types/jest": "^27.5.2", 12 | "@types/leaflet": "^1.9.12", 13 | "@types/node": "^16.18.108", 14 | "@types/react": "^18.3.5", 15 | "@types/react-dom": "^18.3.0", 16 | "dotenv": "^16.4.5", 17 | "leaflet": "^1.9.4", 18 | "openai-realtime-api": "workspace:*", 19 | "react": "^18.3.1", 20 | "react-dom": "^18.3.1", 21 | "react-feather": "^2.0.10", 22 | "react-leaflet": "^4.2.1", 23 | "react-scripts": "^5.0.1", 24 | "sass": "^1.78.0", 25 | "save": "^2.9.0", 26 | "typescript": "^4.9.5", 27 | "web-vitals": "^2.1.4", 28 | "ws": "^8.18.0" 29 | }, 30 | 
"scripts": { 31 | "start": "react-scripts start", 32 | "build": "react-scripts build", 33 | "test": "react-scripts test", 34 | "eject": "react-scripts eject", 35 | "zip": "zip -r realtime-api-console.zip . -x 'node_modules' 'node_modules/*' 'node_modules/**' '.git' '.git/*' '.git/**' '.DS_Store' '*/.DS_Store' 'package-lock.json' '*.zip' '*.tar.gz' '*.tar' '.env'", 36 | "relay": "nodemon ./relay-server/index.js" 37 | }, 38 | "eslintConfig": { 39 | "root": true, 40 | "extends": [ 41 | "react-app", 42 | "react-app/jest" 43 | ] 44 | }, 45 | "browserslist": { 46 | "production": [ 47 | ">0.2%", 48 | "not dead", 49 | "not op_mini all" 50 | ], 51 | "development": [ 52 | "last 1 chrome version", 53 | "last 1 firefox version", 54 | "last 1 safari version" 55 | ] 56 | }, 57 | "devDependencies": { 58 | "@babel/plugin-proposal-private-property-in-object": "^7.21.11", 59 | "nodemon": "^3.1.7" 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | realtime console 8 | 9 | 13 | 14 | 20 | 25 | 26 | 27 | 28 |
29 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/public/openai-logomark.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: 4 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/readme/realtime-console-demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/transitive-bullshit/openai-realtime-api/89d37b5f461fbcb0300241360749abe85ca45d01/examples/openai-realtime-console/readme/realtime-console-demo.png -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/App.scss: -------------------------------------------------------------------------------- 1 | [data-component='App'] { 2 | height: 100%; 3 | width: 100%; 4 | position: relative; 5 | } 6 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/App.tsx: -------------------------------------------------------------------------------- 1 | import { ConsolePage } from './pages/ConsolePage'; 2 | import './App.scss'; 3 | 4 | function App() { 5 | return ( 6 |
<div data-component="App"> 7 | <ConsolePage /> 8 | </div>
9 | ); 10 | } 11 | 12 | export default App; 13 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/components/Map.scss: -------------------------------------------------------------------------------- 1 | [data-component='Map'] { 2 | position: absolute; 3 | width: 100%; 4 | height: 100%; 5 | .leaflet-container { 6 | height: 100%; 7 | width: 100%; 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/components/Map.tsx: -------------------------------------------------------------------------------- 1 | import { MapContainer, TileLayer, Marker, Popup, useMap } from 'react-leaflet'; 2 | import { LatLngTuple } from 'leaflet'; 3 | import './Map.scss'; 4 | 5 | function ChangeView({ center, zoom }: { center: LatLngTuple; zoom: number }) { 6 | const map = useMap(); 7 | map.setView(center, zoom); 8 | return null; 9 | } 10 | 11 | export function Map({ 12 | center, 13 | location = 'My Location', 14 | }: { 15 | center: LatLngTuple; 16 | location?: string; 17 | }) { 18 | return ( 19 |
20 | 27 | 28 | 29 | 30 | {location} 31 | 32 | 33 |
34 | ); 35 | } 36 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/components/button/Button.scss: -------------------------------------------------------------------------------- 1 | [data-component='Button'] { 2 | display: flex; 3 | align-items: center; 4 | gap: 8px; 5 | font-family: 'Roboto Mono', monospace; 6 | font-size: 12px; 7 | font-optical-sizing: auto; 8 | font-weight: 400; 9 | font-style: normal; 10 | border: none; 11 | background-color: #ececf1; 12 | color: #101010; 13 | border-radius: 1000px; 14 | padding: 8px 24px; 15 | min-height: 42px; 16 | transition: transform 0.1s ease-in-out, background-color 0.1s ease-in-out; 17 | outline: none; 18 | 19 | &.button-style-action { 20 | background-color: #101010; 21 | color: #ececf1; 22 | &:hover:not([disabled]) { 23 | background-color: #404040; 24 | } 25 | } 26 | 27 | &.button-style-alert { 28 | background-color: #f00; 29 | color: #ececf1; 30 | &:hover:not([disabled]) { 31 | background-color: #f00; 32 | } 33 | } 34 | 35 | &.button-style-flush { 36 | background-color: rgba(255, 255, 255, 0); 37 | } 38 | 39 | &[disabled] { 40 | color: #999; 41 | } 42 | 43 | &:not([disabled]) { 44 | cursor: pointer; 45 | } 46 | 47 | &:hover:not([disabled]) { 48 | background-color: #d8d8d8; 49 | } 50 | 51 | &:active:not([disabled]) { 52 | transform: translateY(1px); 53 | } 54 | 55 | .icon { 56 | display: flex; 57 | &.icon-start { 58 | margin-left: -8px; 59 | } 60 | &.icon-end { 61 | margin-right: -8px; 62 | } 63 | svg { 64 | width: 16px; 65 | height: 16px; 66 | } 67 | } 68 | 69 | &.icon-red .icon { 70 | color: #cc0000; 71 | } 72 | &.icon-green .icon { 73 | color: #009900; 74 | } 75 | &.icon-grey .icon { 76 | color: #909090; 77 | } 78 | &.icon-fill { 79 | svg { 80 | fill: currentColor; 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/components/button/Button.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import './Button.scss'; 3 | 4 | import { Icon } from 'react-feather'; 5 | 6 | interface ButtonProps extends React.ButtonHTMLAttributes { 7 | label?: string; 8 | icon?: Icon; 9 | iconPosition?: 'start' | 'end'; 10 | iconColor?: 'red' | 'green' | 'grey'; 11 | iconFill?: boolean; 12 | buttonStyle?: 'regular' | 'action' | 'alert' | 'flush'; 13 | } 14 | 15 | export function Button({ 16 | label = 'Okay', 17 | icon = void 0, 18 | iconPosition = 'start', 19 | iconColor = void 0, 20 | iconFill = false, 21 | buttonStyle = 'regular', 22 | ...rest 23 | }: ButtonProps) { 24 | const StartIcon = iconPosition === 'start' ? icon : null; 25 | const EndIcon = iconPosition === 'end' ? 
icon : null; 26 | const classList = []; 27 | if (iconColor) { 28 | classList.push(`icon-${iconColor}`); 29 | } 30 | if (iconFill) { 31 | classList.push(`icon-fill`); 32 | } 33 | classList.push(`button-style-${buttonStyle}`); 34 | 35 | return ( 36 | 49 | ); 50 | } 51 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/components/toggle/Toggle.scss: -------------------------------------------------------------------------------- 1 | [data-component='Toggle'] { 2 | position: relative; 3 | display: flex; 4 | align-items: center; 5 | gap: 8px; 6 | cursor: pointer; 7 | overflow: hidden; 8 | 9 | background-color: #ececf1; 10 | color: #101010; 11 | height: 40px; 12 | border-radius: 1000px; 13 | 14 | &:hover { 15 | background-color: #d8d8d8; 16 | } 17 | 18 | div.label { 19 | position: relative; 20 | color: #666; 21 | transition: color 0.1s ease-in-out; 22 | padding: 0px 16px; 23 | z-index: 2; 24 | user-select: none; 25 | } 26 | 27 | div.label.right { 28 | margin-left: -8px; 29 | } 30 | 31 | .toggle-background { 32 | background-color: #101010; 33 | position: absolute; 34 | top: 0px; 35 | left: 0px; 36 | width: auto; 37 | bottom: 0px; 38 | z-index: 1; 39 | border-radius: 1000px; 40 | transition: left 0.1s ease-in-out, width 0.1s ease-in-out; 41 | } 42 | 43 | &[data-enabled='true'] { 44 | div.label.right { 45 | color: #fff; 46 | } 47 | } 48 | 49 | &[data-enabled='false'] { 50 | div.label.left { 51 | color: #fff; 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/components/toggle/Toggle.tsx: -------------------------------------------------------------------------------- 1 | import { useState, useEffect, useRef } from 'react'; 2 | 3 | import './Toggle.scss'; 4 | 5 | export function Toggle({ 6 | defaultValue = false, 7 | values, 8 | labels, 9 | onChange = () => {}, 10 | }: { 11 | defaultValue?: string | boolean; 12 | values?: string[]; 13 | labels?: string[]; 14 | onChange?: (isEnabled: boolean, value: string) => void; 15 | }) { 16 | if (typeof defaultValue === 'string') { 17 | defaultValue = !!Math.max(0, (values || []).indexOf(defaultValue)); 18 | } 19 | 20 | const leftRef = useRef(null); 21 | const rightRef = useRef(null); 22 | const bgRef = useRef(null); 23 | const [value, setValue] = useState(defaultValue); 24 | 25 | const toggleValue = () => { 26 | const v = !value; 27 | const index = +v; 28 | setValue(v); 29 | onChange(v, (values || [])[index]); 30 | }; 31 | 32 | useEffect(() => { 33 | const leftEl = leftRef.current; 34 | const rightEl = rightRef.current; 35 | const bgEl = bgRef.current; 36 | if (leftEl && rightEl && bgEl) { 37 | if (value) { 38 | bgEl.style.left = rightEl.offsetLeft + 'px'; 39 | bgEl.style.width = rightEl.offsetWidth + 'px'; 40 | } else { 41 | bgEl.style.left = ''; 42 | bgEl.style.width = leftEl.offsetWidth + 'px'; 43 | } 44 | } 45 | }, [value]); 46 | 47 | return ( 48 |
<div 49 | data-component="Toggle" 50 | onClick={toggleValue} 51 | data-enabled={value.toString()} 52 | > 53 | {labels && ( 54 | <div className="label left" ref={leftRef}>
55 | {labels[0]} 56 | </div>
57 | )} 58 | {labels && ( 59 | <div className="label right" ref={rightRef}>
60 | {labels[1]} 61 | </div>
62 | )} 63 | <div className="toggle-background" ref={bgRef}></div>
64 | </div>
65 | ); 66 | } 67 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/index.css: -------------------------------------------------------------------------------- 1 | html, 2 | body { 3 | padding: 0px; 4 | margin: 0px; 5 | position: relative; 6 | width: 100%; 7 | height: 100%; 8 | font-family: 'Assistant', sans-serif; 9 | font-optical-sizing: auto; 10 | font-weight: 400; 11 | font-style: normal; 12 | color: #18181b; 13 | -webkit-font-smoothing: antialiased; 14 | -moz-osx-font-smoothing: grayscale; 15 | } 16 | 17 | #root { 18 | position: relative; 19 | width: 100%; 20 | height: 100%; 21 | } 22 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/index.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom/client'; 3 | import './index.css'; 4 | import App from './App'; 5 | import reportWebVitals from './reportWebVitals'; 6 | 7 | const root = ReactDOM.createRoot( 8 | document.getElementById('root') as HTMLElement 9 | ); 10 | root.render( 11 | 12 | 13 | 14 | ); 15 | 16 | // If you want to start measuring performance in your app, pass a function 17 | // to log results (for example: reportWebVitals(console.log)) 18 | // or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals 19 | reportWebVitals(); 20 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/lib/wavtools/index.js: -------------------------------------------------------------------------------- 1 | import { WavPacker } from './lib/wav_packer.js'; 2 | import { AudioAnalysis } from './lib/analysis/audio_analysis.js'; 3 | import { WavStreamPlayer } from './lib/wav_stream_player.js'; 4 | import { WavRecorder } from './lib/wav_recorder.js'; 5 | 6 | export { AudioAnalysis, WavPacker, WavStreamPlayer, WavRecorder }; 7 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/lib/wavtools/lib/analysis/audio_analysis.js: -------------------------------------------------------------------------------- 1 | import { 2 | noteFrequencies, 3 | noteFrequencyLabels, 4 | voiceFrequencies, 5 | voiceFrequencyLabels, 6 | } from './constants.js'; 7 | 8 | /** 9 | * Output of AudioAnalysis for the frequency domain of the audio 10 | * @typedef {Object} AudioAnalysisOutputType 11 | * @property {Float32Array} values Amplitude of this frequency between {0, 1} inclusive 12 | * @property {number[]} frequencies Raw frequency bucket values 13 | * @property {string[]} labels Labels for the frequency bucket values 14 | */ 15 | 16 | /** 17 | * Analyzes audio for visual output 18 | * @class 19 | */ 20 | export class AudioAnalysis { 21 | /** 22 | * Retrieves frequency domain data from an AnalyserNode adjusted to a decibel range 23 | * returns human-readable formatting and labels 24 | * @param {AnalyserNode} analyser 25 | * @param {number} sampleRate 26 | * @param {Float32Array} [fftResult] 27 | * @param {"frequency"|"music"|"voice"} [analysisType] 28 | * @param {number} [minDecibels] default -100 29 | * @param {number} [maxDecibels] default -30 30 | * @returns {AudioAnalysisOutputType} 31 | */ 32 | static getFrequencies( 33 | analyser, 34 | sampleRate, 35 | fftResult, 36 | analysisType = 'frequency', 37 | minDecibels = -100, 38 | maxDecibels = -30, 39 | ) { 40 | if (!fftResult) { 41 | fftResult = 
new Float32Array(analyser.frequencyBinCount); 42 | analyser.getFloatFrequencyData(fftResult); 43 | } 44 | const nyquistFrequency = sampleRate / 2; 45 | const frequencyStep = (1 / fftResult.length) * nyquistFrequency; 46 | let outputValues; 47 | let frequencies; 48 | let labels; 49 | if (analysisType === 'music' || analysisType === 'voice') { 50 | const useFrequencies = 51 | analysisType === 'voice' ? voiceFrequencies : noteFrequencies; 52 | const aggregateOutput = Array(useFrequencies.length).fill(minDecibels); 53 | for (let i = 0; i < fftResult.length; i++) { 54 | const frequency = i * frequencyStep; 55 | const amplitude = fftResult[i]; 56 | for (let n = useFrequencies.length - 1; n >= 0; n--) { 57 | if (frequency > useFrequencies[n]) { 58 | aggregateOutput[n] = Math.max(aggregateOutput[n], amplitude); 59 | break; 60 | } 61 | } 62 | } 63 | outputValues = aggregateOutput; 64 | frequencies = 65 | analysisType === 'voice' ? voiceFrequencies : noteFrequencies; 66 | labels = 67 | analysisType === 'voice' ? voiceFrequencyLabels : noteFrequencyLabels; 68 | } else { 69 | outputValues = Array.from(fftResult); 70 | frequencies = outputValues.map((_, i) => frequencyStep * i); 71 | labels = frequencies.map((f) => `${f.toFixed(2)} Hz`); 72 | } 73 | // We normalize to {0, 1} 74 | const normalizedOutput = outputValues.map((v) => { 75 | return Math.max( 76 | 0, 77 | Math.min((v - minDecibels) / (maxDecibels - minDecibels), 1), 78 | ); 79 | }); 80 | const values = new Float32Array(normalizedOutput); 81 | return { 82 | values, 83 | frequencies, 84 | labels, 85 | }; 86 | } 87 | 88 | /** 89 | * Creates a new AudioAnalysis instance for an HTMLAudioElement 90 | * @param {HTMLAudioElement} audioElement 91 | * @param {AudioBuffer|null} [audioBuffer] If provided, will cache all frequency domain data from the buffer 92 | * @returns {AudioAnalysis} 93 | */ 94 | constructor(audioElement, audioBuffer = null) { 95 | this.fftResults = []; 96 | if (audioBuffer) { 97 | /** 98 | * Modified from 99 | * https://stackoverflow.com/questions/75063715/using-the-web-audio-api-to-analyze-a-song-without-playing 100 | * 101 | * We do this to populate FFT values for the audio if provided an `audioBuffer` 102 | * The reason to do this is that Safari fails when using `createMediaElementSource` 103 | * This has a non-zero RAM cost so we only opt-in to run it on Safari, Chrome is better 104 | */ 105 | const { length, sampleRate } = audioBuffer; 106 | const offlineAudioContext = new OfflineAudioContext({ 107 | length, 108 | sampleRate, 109 | }); 110 | const source = offlineAudioContext.createBufferSource(); 111 | source.buffer = audioBuffer; 112 | const analyser = offlineAudioContext.createAnalyser(); 113 | analyser.fftSize = 8192; 114 | analyser.smoothingTimeConstant = 0.1; 115 | source.connect(analyser); 116 | // limit is :: 128 / sampleRate; 117 | // but we just want 60fps - cuts ~1s from 6MB to 1MB of RAM 118 | const renderQuantumInSeconds = 1 / 60; 119 | const durationInSeconds = length / sampleRate; 120 | const analyze = (index) => { 121 | const suspendTime = renderQuantumInSeconds * index; 122 | if (suspendTime < durationInSeconds) { 123 | offlineAudioContext.suspend(suspendTime).then(() => { 124 | const fftResult = new Float32Array(analyser.frequencyBinCount); 125 | analyser.getFloatFrequencyData(fftResult); 126 | this.fftResults.push(fftResult); 127 | analyze(index + 1); 128 | }); 129 | } 130 | if (index === 1) { 131 | offlineAudioContext.startRendering(); 132 | } else { 133 | offlineAudioContext.resume(); 134 | } 135 | }; 136 | 
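// `source.start(0)` begins offline playback at t=0 and `analyze(1)` registers the first
// suspend point (index === 1 triggers `startRendering()`); each resolved suspension above
// captures one FFT frame into `this.fftResults` and schedules the next, ~60 frames per
// second of audio, until the buffer's full duration has been rendered.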
source.start(0); 137 | analyze(1); 138 | this.audio = audioElement; 139 | this.context = offlineAudioContext; 140 | this.analyser = analyser; 141 | this.sampleRate = sampleRate; 142 | this.audioBuffer = audioBuffer; 143 | } else { 144 | const audioContext = new AudioContext(); 145 | const track = audioContext.createMediaElementSource(audioElement); 146 | const analyser = audioContext.createAnalyser(); 147 | analyser.fftSize = 8192; 148 | analyser.smoothingTimeConstant = 0.1; 149 | track.connect(analyser); 150 | analyser.connect(audioContext.destination); 151 | this.audio = audioElement; 152 | this.context = audioContext; 153 | this.analyser = analyser; 154 | this.sampleRate = this.context.sampleRate; 155 | this.audioBuffer = null; 156 | } 157 | } 158 | 159 | /** 160 | * Gets the current frequency domain data from the playing audio track 161 | * @param {"frequency"|"music"|"voice"} [analysisType] 162 | * @param {number} [minDecibels] default -100 163 | * @param {number} [maxDecibels] default -30 164 | * @returns {AudioAnalysisOutputType} 165 | */ 166 | getFrequencies( 167 | analysisType = 'frequency', 168 | minDecibels = -100, 169 | maxDecibels = -30, 170 | ) { 171 | let fftResult = null; 172 | if (this.audioBuffer && this.fftResults.length) { 173 | const pct = this.audio.currentTime / this.audio.duration; 174 | const index = Math.min( 175 | (pct * this.fftResults.length) | 0, 176 | this.fftResults.length - 1, 177 | ); 178 | fftResult = this.fftResults[index]; 179 | } 180 | return AudioAnalysis.getFrequencies( 181 | this.analyser, 182 | this.sampleRate, 183 | fftResult, 184 | analysisType, 185 | minDecibels, 186 | maxDecibels, 187 | ); 188 | } 189 | 190 | /** 191 | * Resume the internal AudioContext if it was suspended due to the lack of 192 | * user interaction when the AudioAnalysis was instantiated. 193 | * @returns {Promise} 194 | */ 195 | async resumeIfSuspended() { 196 | if (this.context.state === 'suspended') { 197 | await this.context.resume(); 198 | } 199 | return true; 200 | } 201 | } 202 | 203 | globalThis.AudioAnalysis = AudioAnalysis; 204 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/lib/wavtools/lib/analysis/constants.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Constants for help with visualization 3 | * Helps map frequency ranges from Fast Fourier Transform 4 | * to human-interpretable ranges, notably music ranges and 5 | * human vocal ranges. 
6 | */ 7 | 8 | // Eighth octave frequencies 9 | const octave8Frequencies = [ 10 | 4186.01, 4434.92, 4698.63, 4978.03, 5274.04, 5587.65, 5919.91, 6271.93, 11 | 6644.88, 7040.0, 7458.62, 7902.13, 12 | ]; 13 | 14 | // Labels for each of the above frequencies 15 | const octave8FrequencyLabels = [ 16 | 'C', 17 | 'C#', 18 | 'D', 19 | 'D#', 20 | 'E', 21 | 'F', 22 | 'F#', 23 | 'G', 24 | 'G#', 25 | 'A', 26 | 'A#', 27 | 'B', 28 | ]; 29 | 30 | /** 31 | * All note frequencies from 1st to 8th octave 32 | * in format "A#8" (A#, 8th octave) 33 | */ 34 | export const noteFrequencies = []; 35 | export const noteFrequencyLabels = []; 36 | for (let i = 1; i <= 8; i++) { 37 | for (let f = 0; f < octave8Frequencies.length; f++) { 38 | const freq = octave8Frequencies[f]; 39 | noteFrequencies.push(freq / Math.pow(2, 8 - i)); 40 | noteFrequencyLabels.push(octave8FrequencyLabels[f] + i); 41 | } 42 | } 43 | 44 | /** 45 | * Subset of the note frequencies between 32 and 2000 Hz 46 | * 6 octave range: C1 to B6 47 | */ 48 | const voiceFrequencyRange = [32.0, 2000.0]; 49 | export const voiceFrequencies = noteFrequencies.filter((_, i) => { 50 | return ( 51 | noteFrequencies[i] > voiceFrequencyRange[0] && 52 | noteFrequencies[i] < voiceFrequencyRange[1] 53 | ); 54 | }); 55 | export const voiceFrequencyLabels = noteFrequencyLabels.filter((_, i) => { 56 | return ( 57 | noteFrequencies[i] > voiceFrequencyRange[0] && 58 | noteFrequencies[i] < voiceFrequencyRange[1] 59 | ); 60 | }); 61 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/lib/wavtools/lib/wav_packer.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Raw wav audio file contents 3 | * @typedef {Object} WavPackerAudioType 4 | * @property {Blob} blob 5 | * @property {string} url 6 | * @property {number} channelCount 7 | * @property {number} sampleRate 8 | * @property {number} duration 9 | */ 10 | 11 | /** 12 | * Utility class for assembling PCM16 "audio/wav" data 13 | * @class 14 | */ 15 | export class WavPacker { 16 | /** 17 | * Converts Float32Array of amplitude data to ArrayBuffer in Int16Array format 18 | * @param {Float32Array} float32Array 19 | * @returns {ArrayBuffer} 20 | */ 21 | static floatTo16BitPCM(float32Array) { 22 | const buffer = new ArrayBuffer(float32Array.length * 2); 23 | const view = new DataView(buffer); 24 | let offset = 0; 25 | for (let i = 0; i < float32Array.length; i++, offset += 2) { 26 | let s = Math.max(-1, Math.min(1, float32Array[i])); 27 | view.setInt16(offset, s < 0 ? 
s * 0x8000 : s * 0x7fff, true); 28 | } 29 | return buffer; 30 | } 31 | 32 | /** 33 | * Concatenates two ArrayBuffers 34 | * @param {ArrayBuffer} leftBuffer 35 | * @param {ArrayBuffer} rightBuffer 36 | * @returns {ArrayBuffer} 37 | */ 38 | static mergeBuffers(leftBuffer, rightBuffer) { 39 | const tmpArray = new Uint8Array( 40 | leftBuffer.byteLength + rightBuffer.byteLength 41 | ); 42 | tmpArray.set(new Uint8Array(leftBuffer), 0); 43 | tmpArray.set(new Uint8Array(rightBuffer), leftBuffer.byteLength); 44 | return tmpArray.buffer; 45 | } 46 | 47 | /** 48 | * Packs data into an Int16 format 49 | * @private 50 | * @param {number} size 0 = 1x Int16, 1 = 2x Int16 51 | * @param {number} arg value to pack 52 | * @returns 53 | */ 54 | _packData(size, arg) { 55 | return [ 56 | new Uint8Array([arg, arg >> 8]), 57 | new Uint8Array([arg, arg >> 8, arg >> 16, arg >> 24]), 58 | ][size]; 59 | } 60 | 61 | /** 62 | * Packs audio into "audio/wav" Blob 63 | * @param {number} sampleRate 64 | * @param {{bitsPerSample: number, channels: Array, data: Int16Array}} audio 65 | * @returns {WavPackerAudioType} 66 | */ 67 | pack(sampleRate, audio) { 68 | if (!audio?.bitsPerSample) { 69 | throw new Error(`Missing "bitsPerSample"`); 70 | } else if (!audio?.channels) { 71 | throw new Error(`Missing "channels"`); 72 | } else if (!audio?.data) { 73 | throw new Error(`Missing "data"`); 74 | } 75 | const { bitsPerSample, channels, data } = audio; 76 | const output = [ 77 | // Header 78 | 'RIFF', 79 | this._packData( 80 | 1, 81 | 4 + (8 + 24) /* chunk 1 length */ + (8 + 8) /* chunk 2 length */ 82 | ), // Length 83 | 'WAVE', 84 | // chunk 1 85 | 'fmt ', // Sub-chunk identifier 86 | this._packData(1, 16), // Chunk length 87 | this._packData(0, 1), // Audio format (1 is linear quantization) 88 | this._packData(0, channels.length), 89 | this._packData(1, sampleRate), 90 | this._packData(1, (sampleRate * channels.length * bitsPerSample) / 8), // Byte rate 91 | this._packData(0, (channels.length * bitsPerSample) / 8), 92 | this._packData(0, bitsPerSample), 93 | // chunk 2 94 | 'data', // Sub-chunk identifier 95 | this._packData( 96 | 1, 97 | (channels[0].length * channels.length * bitsPerSample) / 8 98 | ), // Chunk length 99 | data, 100 | ]; 101 | const blob = new Blob(output, { type: 'audio/mpeg' }); 102 | const url = URL.createObjectURL(blob); 103 | return { 104 | blob, 105 | url, 106 | channelCount: channels.length, 107 | sampleRate, 108 | duration: data.byteLength / (channels.length * sampleRate * 2), 109 | }; 110 | } 111 | } 112 | 113 | globalThis.WavPacker = WavPacker; 114 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/lib/wavtools/lib/wav_recorder.js: -------------------------------------------------------------------------------- 1 | import { AudioProcessorSrc } from './worklets/audio_processor.js'; 2 | import { AudioAnalysis } from './analysis/audio_analysis.js'; 3 | import { WavPacker } from './wav_packer.js'; 4 | 5 | /** 6 | * Decodes audio into a wav file 7 | * @typedef {Object} DecodedAudioType 8 | * @property {Blob} blob 9 | * @property {string} url 10 | * @property {Float32Array} values 11 | * @property {AudioBuffer} audioBuffer 12 | */ 13 | 14 | /** 15 | * Records live stream of user audio as PCM16 "audio/wav" data 16 | * @class 17 | */ 18 | export class WavRecorder { 19 | /** 20 | * Create a new WavRecorder instance 21 | * @param {{sampleRate?: number, outputToSpeakers?: boolean, debug?: boolean}} [options] 22 | * @returns {WavRecorder} 23 | 
*/ 24 | constructor({ 25 | sampleRate = 44100, 26 | outputToSpeakers = false, 27 | debug = false, 28 | } = {}) { 29 | // Script source 30 | this.scriptSrc = AudioProcessorSrc; 31 | // Config 32 | this.sampleRate = sampleRate; 33 | this.outputToSpeakers = outputToSpeakers; 34 | this.debug = !!debug; 35 | this._deviceChangeCallback = null; 36 | this._devices = []; 37 | // State variables 38 | this.stream = null; 39 | this.processor = null; 40 | this.source = null; 41 | this.node = null; 42 | this.recording = false; 43 | // Event handling with AudioWorklet 44 | this._lastEventId = 0; 45 | this.eventReceipts = {}; 46 | this.eventTimeout = 5000; 47 | // Process chunks of audio 48 | this._chunkProcessor = () => {}; 49 | this._chunkProcessorSize = void 0; 50 | this._chunkProcessorBuffer = { 51 | raw: new ArrayBuffer(0), 52 | mono: new ArrayBuffer(0), 53 | }; 54 | } 55 | 56 | /** 57 | * Decodes audio data from multiple formats to a Blob, url, Float32Array and AudioBuffer 58 | * @param {Blob|Float32Array|Int16Array|ArrayBuffer|number[]} audioData 59 | * @param {number} sampleRate 60 | * @param {number} fromSampleRate 61 | * @returns {Promise} 62 | */ 63 | static async decode(audioData, sampleRate = 44100, fromSampleRate = -1) { 64 | const context = new AudioContext({ sampleRate }); 65 | let arrayBuffer; 66 | let blob; 67 | if (audioData instanceof Blob) { 68 | if (fromSampleRate !== -1) { 69 | throw new Error( 70 | `Can not specify "fromSampleRate" when reading from Blob`, 71 | ); 72 | } 73 | blob = audioData; 74 | arrayBuffer = await blob.arrayBuffer(); 75 | } else if (audioData instanceof ArrayBuffer) { 76 | if (fromSampleRate !== -1) { 77 | throw new Error( 78 | `Can not specify "fromSampleRate" when reading from ArrayBuffer`, 79 | ); 80 | } 81 | arrayBuffer = audioData; 82 | blob = new Blob([arrayBuffer], { type: 'audio/wav' }); 83 | } else { 84 | let float32Array; 85 | let data; 86 | if (audioData instanceof Int16Array) { 87 | data = audioData; 88 | float32Array = new Float32Array(audioData.length); 89 | for (let i = 0; i < audioData.length; i++) { 90 | float32Array[i] = audioData[i] / 0x8000; 91 | } 92 | } else if (audioData instanceof Float32Array) { 93 | float32Array = audioData; 94 | } else if (audioData instanceof Array) { 95 | float32Array = new Float32Array(audioData); 96 | } else { 97 | throw new Error( 98 | `"audioData" must be one of: Blob, Float32Arrray, Int16Array, ArrayBuffer, Array`, 99 | ); 100 | } 101 | if (fromSampleRate === -1) { 102 | throw new Error( 103 | `Must specify "fromSampleRate" when reading from Float32Array, In16Array or Array`, 104 | ); 105 | } else if (fromSampleRate < 3000) { 106 | throw new Error(`Minimum "fromSampleRate" is 3000 (3kHz)`); 107 | } 108 | if (!data) { 109 | data = WavPacker.floatTo16BitPCM(float32Array); 110 | } 111 | const audio = { 112 | bitsPerSample: 16, 113 | channels: [float32Array], 114 | data, 115 | }; 116 | const packer = new WavPacker(); 117 | const result = packer.pack(fromSampleRate, audio); 118 | blob = result.blob; 119 | arrayBuffer = await blob.arrayBuffer(); 120 | } 121 | const audioBuffer = await context.decodeAudioData(arrayBuffer); 122 | const values = audioBuffer.getChannelData(0); 123 | const url = URL.createObjectURL(blob); 124 | return { 125 | blob, 126 | url, 127 | values, 128 | audioBuffer, 129 | }; 130 | } 131 | 132 | /** 133 | * Logs data in debug mode 134 | * @param {...any} arguments 135 | * @returns {true} 136 | */ 137 | log() { 138 | if (this.debug) { 139 | this.log(...arguments); 140 | } 141 | return true; 142 | } 
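  // Illustrative usage sketch: a typical capture flow with this class, where
  // `send()` stands in for a hypothetical consumer of mono PCM16 chunks.
  //
  //   const recorder = new WavRecorder({ sampleRate: 24000 });
  //   await recorder.begin();                               // request mic + load worklet
  //   await recorder.record((chunk) => send(chunk.mono));   // stream PCM16 chunks
  //   await recorder.pause();                               // stop storing, keep stream open
  //   const wav = await recorder.save();                    // { blob, url, sampleRate, duration, ... }
  //   await recorder.end();                                 // stop tracks and tear down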
143 | 144 | /** 145 | * Retrieves the current sampleRate for the recorder 146 | * @returns {number} 147 | */ 148 | getSampleRate() { 149 | return this.sampleRate; 150 | } 151 | 152 | /** 153 | * Retrieves the current status of the recording 154 | * @returns {"ended"|"paused"|"recording"} 155 | */ 156 | getStatus() { 157 | if (!this.processor) { 158 | return 'ended'; 159 | } else if (!this.recording) { 160 | return 'paused'; 161 | } else { 162 | return 'recording'; 163 | } 164 | } 165 | 166 | /** 167 | * Sends an event to the AudioWorklet 168 | * @private 169 | * @param {string} name 170 | * @param {{[key: string]: any}} data 171 | * @param {AudioWorkletNode} [_processor] 172 | * @returns {Promise<{[key: string]: any}>} 173 | */ 174 | async _event(name, data = {}, _processor = null) { 175 | _processor = _processor || this.processor; 176 | if (!_processor) { 177 | throw new Error('Can not send events without recording first'); 178 | } 179 | const message = { 180 | event: name, 181 | id: this._lastEventId++, 182 | data, 183 | }; 184 | _processor.port.postMessage(message); 185 | const t0 = new Date().valueOf(); 186 | while (!this.eventReceipts[message.id]) { 187 | if (new Date().valueOf() - t0 > this.eventTimeout) { 188 | throw new Error(`Timeout waiting for "${name}" event`); 189 | } 190 | await new Promise((res) => setTimeout(() => res(true), 1)); 191 | } 192 | const payload = this.eventReceipts[message.id]; 193 | delete this.eventReceipts[message.id]; 194 | return payload; 195 | } 196 | 197 | /** 198 | * Sets device change callback, remove if callback provided is `null` 199 | * @param {(Array): void|null} callback 200 | * @returns {true} 201 | */ 202 | listenForDeviceChange(callback) { 203 | if (callback === null && this._deviceChangeCallback) { 204 | navigator.mediaDevices.removeEventListener( 205 | 'devicechange', 206 | this._deviceChangeCallback, 207 | ); 208 | this._deviceChangeCallback = null; 209 | } else if (callback !== null) { 210 | // Basically a debounce; we only want this called once when devices change 211 | // And we only want the most recent callback() to be executed 212 | // if a few are operating at the same time 213 | let lastId = 0; 214 | let lastDevices = []; 215 | const serializeDevices = (devices) => 216 | devices 217 | .map((d) => d.deviceId) 218 | .sort() 219 | .join(','); 220 | const cb = async () => { 221 | let id = ++lastId; 222 | const devices = await this.listDevices(); 223 | if (id === lastId) { 224 | if (serializeDevices(lastDevices) !== serializeDevices(devices)) { 225 | lastDevices = devices; 226 | callback(devices.slice()); 227 | } 228 | } 229 | }; 230 | navigator.mediaDevices.addEventListener('devicechange', cb); 231 | cb(); 232 | this._deviceChangeCallback = cb; 233 | } 234 | return true; 235 | } 236 | 237 | /** 238 | * Manually request permission to use the microphone 239 | * @returns {Promise} 240 | */ 241 | async requestPermission() { 242 | const permissionStatus = await navigator.permissions.query({ 243 | name: 'microphone', 244 | }); 245 | if (permissionStatus.state === 'denied') { 246 | window.alert('You must grant microphone access to use this feature.'); 247 | } else if (permissionStatus.state === 'prompt') { 248 | try { 249 | const stream = await navigator.mediaDevices.getUserMedia({ 250 | audio: true, 251 | }); 252 | const tracks = stream.getTracks(); 253 | tracks.forEach((track) => track.stop()); 254 | } catch (e) { 255 | window.alert('You must grant microphone access to use this feature.'); 256 | } 257 | } 258 | return true; 259 | } 260 | 261 
| /** 262 | * List all eligible devices for recording, will request permission to use microphone 263 | * @returns {Promise>} 264 | */ 265 | async listDevices() { 266 | if ( 267 | !navigator.mediaDevices || 268 | !('enumerateDevices' in navigator.mediaDevices) 269 | ) { 270 | throw new Error('Could not request user devices'); 271 | } 272 | await this.requestPermission(); 273 | const devices = await navigator.mediaDevices.enumerateDevices(); 274 | const audioDevices = devices.filter( 275 | (device) => device.kind === 'audioinput', 276 | ); 277 | const defaultDeviceIndex = audioDevices.findIndex( 278 | (device) => device.deviceId === 'default', 279 | ); 280 | const deviceList = []; 281 | if (defaultDeviceIndex !== -1) { 282 | let defaultDevice = audioDevices.splice(defaultDeviceIndex, 1)[0]; 283 | let existingIndex = audioDevices.findIndex( 284 | (device) => device.groupId === defaultDevice.groupId, 285 | ); 286 | if (existingIndex !== -1) { 287 | defaultDevice = audioDevices.splice(existingIndex, 1)[0]; 288 | } 289 | defaultDevice.default = true; 290 | deviceList.push(defaultDevice); 291 | } 292 | return deviceList.concat(audioDevices); 293 | } 294 | 295 | /** 296 | * Begins a recording session and requests microphone permissions if not already granted 297 | * Microphone recording indicator will appear on browser tab but status will be "paused" 298 | * @param {string} [deviceId] if no device provided, default device will be used 299 | * @returns {Promise} 300 | */ 301 | async begin(deviceId) { 302 | if (this.processor) { 303 | throw new Error( 304 | `Already connected: please call .end() to start a new session`, 305 | ); 306 | } 307 | 308 | if ( 309 | !navigator.mediaDevices || 310 | !('getUserMedia' in navigator.mediaDevices) 311 | ) { 312 | throw new Error('Could not request user media'); 313 | } 314 | try { 315 | const config = { audio: true }; 316 | if (deviceId) { 317 | config.audio = { deviceId: { exact: deviceId } }; 318 | } 319 | this.stream = await navigator.mediaDevices.getUserMedia(config); 320 | } catch (err) { 321 | throw new Error('Could not start media stream'); 322 | } 323 | 324 | const context = new AudioContext({ sampleRate: this.sampleRate }); 325 | const source = context.createMediaStreamSource(this.stream); 326 | // Load and execute the module script. 
327 | try { 328 | await context.audioWorklet.addModule(this.scriptSrc); 329 | } catch (e) { 330 | console.error(e); 331 | throw new Error(`Could not add audioWorklet module: ${this.scriptSrc}`); 332 | } 333 | const processor = new AudioWorkletNode(context, 'audio_processor'); 334 | processor.port.onmessage = (e) => { 335 | const { event, id, data } = e.data; 336 | if (event === 'receipt') { 337 | this.eventReceipts[id] = data; 338 | } else if (event === 'chunk') { 339 | if (this._chunkProcessorSize) { 340 | const buffer = this._chunkProcessorBuffer; 341 | this._chunkProcessorBuffer = { 342 | raw: WavPacker.mergeBuffers(buffer.raw, data.raw), 343 | mono: WavPacker.mergeBuffers(buffer.mono, data.mono), 344 | }; 345 | if ( 346 | this._chunkProcessorBuffer.mono.byteLength >= 347 | this._chunkProcessorSize 348 | ) { 349 | this._chunkProcessor(this._chunkProcessorBuffer); 350 | this._chunkProcessorBuffer = { 351 | raw: new ArrayBuffer(0), 352 | mono: new ArrayBuffer(0), 353 | }; 354 | } 355 | } else { 356 | this._chunkProcessor(data); 357 | } 358 | } 359 | }; 360 | 361 | const node = source.connect(processor); 362 | const analyser = context.createAnalyser(); 363 | analyser.fftSize = 8192; 364 | analyser.smoothingTimeConstant = 0.1; 365 | node.connect(analyser); 366 | if (this.outputToSpeakers) { 367 | // eslint-disable-next-line no-console 368 | console.warn( 369 | 'Warning: Output to speakers may affect sound quality,\n' + 370 | 'especially due to system audio feedback preventative measures.\n' + 371 | 'use only for debugging', 372 | ); 373 | analyser.connect(context.destination); 374 | } 375 | 376 | this.source = source; 377 | this.node = node; 378 | this.analyser = analyser; 379 | this.processor = processor; 380 | return true; 381 | } 382 | 383 | /** 384 | * Gets the current frequency domain data from the recording track 385 | * @param {"frequency"|"music"|"voice"} [analysisType] 386 | * @param {number} [minDecibels] default -100 387 | * @param {number} [maxDecibels] default -30 388 | * @returns {import('./analysis/audio_analysis.js').AudioAnalysisOutputType} 389 | */ 390 | getFrequencies( 391 | analysisType = 'frequency', 392 | minDecibels = -100, 393 | maxDecibels = -30, 394 | ) { 395 | if (!this.processor) { 396 | throw new Error('Session ended: please call .begin() first'); 397 | } 398 | return AudioAnalysis.getFrequencies( 399 | this.analyser, 400 | this.sampleRate, 401 | null, 402 | analysisType, 403 | minDecibels, 404 | maxDecibels, 405 | ); 406 | } 407 | 408 | /** 409 | * Pauses the recording 410 | * Keeps microphone stream open but halts storage of audio 411 | * @returns {Promise} 412 | */ 413 | async pause() { 414 | if (!this.processor) { 415 | throw new Error('Session ended: please call .begin() first'); 416 | } else if (!this.recording) { 417 | throw new Error('Already paused: please call .record() first'); 418 | } 419 | if (this._chunkProcessorBuffer.raw.byteLength) { 420 | this._chunkProcessor(this._chunkProcessorBuffer); 421 | } 422 | this.log('Pausing ...'); 423 | await this._event('stop'); 424 | this.recording = false; 425 | return true; 426 | } 427 | 428 | /** 429 | * Start recording stream and storing to memory from the connected audio source 430 | * @param {(data: { mono: Int16Array; raw: Int16Array }) => any} [chunkProcessor] 431 | * @param {number} [chunkSize] chunkProcessor will not be triggered until this size threshold met in mono audio 432 | * @returns {Promise} 433 | */ 434 | async record(chunkProcessor = () => {}, chunkSize = 8192) { 435 | if (!this.processor) { 
436 | throw new Error('Session ended: please call .begin() first'); 437 | } else if (this.recording) { 438 | throw new Error('Already recording: please call .pause() first'); 439 | } else if (typeof chunkProcessor !== 'function') { 440 | throw new Error(`chunkProcessor must be a function`); 441 | } 442 | this._chunkProcessor = chunkProcessor; 443 | this._chunkProcessorSize = chunkSize; 444 | this._chunkProcessorBuffer = { 445 | raw: new ArrayBuffer(0), 446 | mono: new ArrayBuffer(0), 447 | }; 448 | this.log('Recording ...'); 449 | await this._event('start'); 450 | this.recording = true; 451 | return true; 452 | } 453 | 454 | /** 455 | * Clears the audio buffer, empties stored recording 456 | * @returns {Promise} 457 | */ 458 | async clear() { 459 | if (!this.processor) { 460 | throw new Error('Session ended: please call .begin() first'); 461 | } 462 | await this._event('clear'); 463 | return true; 464 | } 465 | 466 | /** 467 | * Reads the current audio stream data 468 | * @returns {Promise<{meanValues: Float32Array, channels: Array}>} 469 | */ 470 | async read() { 471 | if (!this.processor) { 472 | throw new Error('Session ended: please call .begin() first'); 473 | } 474 | this.log('Reading ...'); 475 | const result = await this._event('read'); 476 | return result; 477 | } 478 | 479 | /** 480 | * Saves the current audio stream to a file 481 | * @param {boolean} [force] Force saving while still recording 482 | * @returns {Promise} 483 | */ 484 | async save(force = false) { 485 | if (!this.processor) { 486 | throw new Error('Session ended: please call .begin() first'); 487 | } 488 | if (!force && this.recording) { 489 | throw new Error( 490 | 'Currently recording: please call .pause() first, or call .save(true) to force', 491 | ); 492 | } 493 | this.log('Exporting ...'); 494 | const exportData = await this._event('export'); 495 | const packer = new WavPacker(); 496 | const result = packer.pack(this.sampleRate, exportData.audio); 497 | return result; 498 | } 499 | 500 | /** 501 | * Ends the current recording session and saves the result 502 | * @returns {Promise} 503 | */ 504 | async end() { 505 | if (!this.processor) { 506 | throw new Error('Session ended: please call .begin() first'); 507 | } 508 | 509 | const _processor = this.processor; 510 | 511 | this.log('Stopping ...'); 512 | await this._event('stop'); 513 | this.recording = false; 514 | const tracks = this.stream.getTracks(); 515 | tracks.forEach((track) => track.stop()); 516 | 517 | this.log('Exporting ...'); 518 | const exportData = await this._event('export', {}, _processor); 519 | 520 | this.processor.disconnect(); 521 | this.source.disconnect(); 522 | this.node.disconnect(); 523 | this.analyser.disconnect(); 524 | this.stream = null; 525 | this.processor = null; 526 | this.source = null; 527 | this.node = null; 528 | 529 | const packer = new WavPacker(); 530 | const result = packer.pack(this.sampleRate, exportData.audio); 531 | return result; 532 | } 533 | 534 | /** 535 | * Performs a full cleanup of WavRecorder instance 536 | * Stops actively listening via microphone and removes existing listeners 537 | * @returns {Promise} 538 | */ 539 | async quit() { 540 | this.listenForDeviceChange(null); 541 | if (this.processor) { 542 | await this.end(); 543 | } 544 | return true; 545 | } 546 | } 547 | 548 | globalThis.WavRecorder = WavRecorder; 549 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/lib/wavtools/lib/wav_stream_player.js: 
-------------------------------------------------------------------------------- 1 | import { StreamProcessorSrc } from './worklets/stream_processor.js'; 2 | import { AudioAnalysis } from './analysis/audio_analysis.js'; 3 | 4 | /** 5 | * Plays audio streams received in raw PCM16 chunks from the browser 6 | * @class 7 | */ 8 | export class WavStreamPlayer { 9 | /** 10 | * Creates a new WavStreamPlayer instance 11 | * @param {{sampleRate?: number}} options 12 | * @returns {WavStreamPlayer} 13 | */ 14 | constructor({ sampleRate = 44100 } = {}) { 15 | this.scriptSrc = StreamProcessorSrc; 16 | this.sampleRate = sampleRate; 17 | this.context = null; 18 | this.stream = null; 19 | this.analyser = null; 20 | this.trackSampleOffsets = {}; 21 | this.interruptedTrackIds = {}; 22 | } 23 | 24 | /** 25 | * Connects the audio context and enables output to speakers 26 | * @returns {Promise} 27 | */ 28 | async connect() { 29 | this.context = new AudioContext({ sampleRate: this.sampleRate }); 30 | if (this.context.state === 'suspended') { 31 | await this.context.resume(); 32 | } 33 | try { 34 | await this.context.audioWorklet.addModule(this.scriptSrc); 35 | } catch (e) { 36 | console.error(e); 37 | throw new Error(`Could not add audioWorklet module: ${this.scriptSrc}`); 38 | } 39 | const analyser = this.context.createAnalyser(); 40 | analyser.fftSize = 8192; 41 | analyser.smoothingTimeConstant = 0.1; 42 | this.analyser = analyser; 43 | return true; 44 | } 45 | 46 | /** 47 | * Gets the current frequency domain data from the playing track 48 | * @param {"frequency"|"music"|"voice"} [analysisType] 49 | * @param {number} [minDecibels] default -100 50 | * @param {number} [maxDecibels] default -30 51 | * @returns {import('./analysis/audio_analysis.js').AudioAnalysisOutputType} 52 | */ 53 | getFrequencies( 54 | analysisType = 'frequency', 55 | minDecibels = -100, 56 | maxDecibels = -30 57 | ) { 58 | if (!this.analyser) { 59 | throw new Error('Not connected, please call .connect() first'); 60 | } 61 | return AudioAnalysis.getFrequencies( 62 | this.analyser, 63 | this.sampleRate, 64 | null, 65 | analysisType, 66 | minDecibels, 67 | maxDecibels 68 | ); 69 | } 70 | 71 | /** 72 | * Starts audio streaming 73 | * @private 74 | * @returns {Promise} 75 | */ 76 | _start() { 77 | const streamNode = new AudioWorkletNode(this.context, 'stream_processor'); 78 | streamNode.connect(this.context.destination); 79 | streamNode.port.onmessage = (e) => { 80 | const { event } = e.data; 81 | if (event === 'stop') { 82 | streamNode.disconnect(); 83 | this.stream = null; 84 | } else if (event === 'offset') { 85 | const { requestId, trackId, offset } = e.data; 86 | const currentTime = offset / this.sampleRate; 87 | this.trackSampleOffsets[requestId] = { trackId, offset, currentTime }; 88 | } 89 | }; 90 | this.analyser.disconnect(); 91 | streamNode.connect(this.analyser); 92 | this.stream = streamNode; 93 | return true; 94 | } 95 | 96 | /** 97 | * Adds 16BitPCM data to the currently playing audio stream 98 | * You can add chunks beyond the current play point and they will be queued for play 99 | * @param {ArrayBuffer|Int16Array} arrayBuffer 100 | * @param {string} [trackId] 101 | * @returns {Int16Array} 102 | */ 103 | add16BitPCM(arrayBuffer, trackId = 'default') { 104 | if (typeof trackId !== 'string') { 105 | throw new Error(`trackId must be a string`); 106 | } else if (this.interruptedTrackIds[trackId]) { 107 | return; 108 | } 109 | if (!this.stream) { 110 | this._start(); 111 | } 112 | let buffer; 113 | if (arrayBuffer instanceof 
Int16Array) { 114 | buffer = arrayBuffer; 115 | } else if (arrayBuffer instanceof ArrayBuffer) { 116 | buffer = new Int16Array(arrayBuffer); 117 | } else { 118 | throw new Error(`argument must be Int16Array or ArrayBuffer`); 119 | } 120 | this.stream.port.postMessage({ event: 'write', buffer, trackId }); 121 | return buffer; 122 | } 123 | 124 | /** 125 | * Gets the offset (sample count) of the currently playing stream 126 | * @param {boolean} [interrupt] 127 | * @returns {{trackId: string|null, offset: number, currentTime: number}} 128 | */ 129 | async getTrackSampleOffset(interrupt = false) { 130 | if (!this.stream) { 131 | return null; 132 | } 133 | const requestId = crypto.randomUUID(); 134 | this.stream.port.postMessage({ 135 | event: interrupt ? 'interrupt' : 'offset', 136 | requestId, 137 | }); 138 | let trackSampleOffset; 139 | while (!trackSampleOffset) { 140 | trackSampleOffset = this.trackSampleOffsets[requestId]; 141 | await new Promise((r) => setTimeout(() => r(), 1)); 142 | } 143 | const { trackId } = trackSampleOffset; 144 | if (interrupt && trackId) { 145 | this.interruptedTrackIds[trackId] = true; 146 | } 147 | return trackSampleOffset; 148 | } 149 | 150 | /** 151 | * Strips the current stream and returns the sample offset of the audio 152 | * @param {boolean} [interrupt] 153 | * @returns {{trackId: string|null, offset: number, currentTime: number}} 154 | */ 155 | async interrupt() { 156 | return this.getTrackSampleOffset(true); 157 | } 158 | } 159 | 160 | globalThis.WavStreamPlayer = WavStreamPlayer; 161 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/lib/wavtools/lib/worklets/audio_processor.js: -------------------------------------------------------------------------------- 1 | const AudioProcessorWorklet = ` 2 | class AudioProcessor extends AudioWorkletProcessor { 3 | 4 | constructor() { 5 | super(); 6 | this.port.onmessage = this.receive.bind(this); 7 | this.initialize(); 8 | } 9 | 10 | initialize() { 11 | this.foundAudio = false; 12 | this.recording = false; 13 | this.chunks = []; 14 | } 15 | 16 | /** 17 | * Concatenates sampled chunks into channels 18 | * Format is chunk[Left[], Right[]] 19 | */ 20 | readChannelData(chunks, channel = -1, maxChannels = 9) { 21 | let channelLimit; 22 | if (channel !== -1) { 23 | if (chunks[0] && chunks[0].length - 1 < channel) { 24 | throw new Error( 25 | \`Channel \${channel} out of range: max \${chunks[0].length}\` 26 | ); 27 | } 28 | channelLimit = channel + 1; 29 | } else { 30 | channel = 0; 31 | channelLimit = Math.min(chunks[0] ? chunks[0].length : 1, maxChannels); 32 | } 33 | const channels = []; 34 | for (let n = channel; n < channelLimit; n++) { 35 | const length = chunks.reduce((sum, chunk) => { 36 | return sum + chunk[n].length; 37 | }, 0); 38 | const buffers = chunks.map((chunk) => chunk[n]); 39 | const result = new Float32Array(length); 40 | let offset = 0; 41 | for (let i = 0; i < buffers.length; i++) { 42 | result.set(buffers[i], offset); 43 | offset += buffers[i].length; 44 | } 45 | channels[n] = result; 46 | } 47 | return channels; 48 | } 49 | 50 | /** 51 | * Combines parallel audio data into correct format, 52 | * channels[Left[], Right[]] to float32Array[LRLRLRLR...] 
53 | */ 54 | formatAudioData(channels) { 55 | if (channels.length === 1) { 56 | // Simple case is only one channel 57 | const float32Array = channels[0].slice(); 58 | const meanValues = channels[0].slice(); 59 | return { float32Array, meanValues }; 60 | } else { 61 | const float32Array = new Float32Array( 62 | channels[0].length * channels.length 63 | ); 64 | const meanValues = new Float32Array(channels[0].length); 65 | for (let i = 0; i < channels[0].length; i++) { 66 | const offset = i * channels.length; 67 | let meanValue = 0; 68 | for (let n = 0; n < channels.length; n++) { 69 | float32Array[offset + n] = channels[n][i]; 70 | meanValue += channels[n][i]; 71 | } 72 | meanValues[i] = meanValue / channels.length; 73 | } 74 | return { float32Array, meanValues }; 75 | } 76 | } 77 | 78 | /** 79 | * Converts 32-bit float data to 16-bit integers 80 | */ 81 | floatTo16BitPCM(float32Array) { 82 | const buffer = new ArrayBuffer(float32Array.length * 2); 83 | const view = new DataView(buffer); 84 | let offset = 0; 85 | for (let i = 0; i < float32Array.length; i++, offset += 2) { 86 | let s = Math.max(-1, Math.min(1, float32Array[i])); 87 | view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true); 88 | } 89 | return buffer; 90 | } 91 | 92 | /** 93 | * Retrieves the most recent amplitude values from the audio stream 94 | * @param {number} channel 95 | */ 96 | getValues(channel = -1) { 97 | const channels = this.readChannelData(this.chunks, channel); 98 | const { meanValues } = this.formatAudioData(channels); 99 | return { meanValues, channels }; 100 | } 101 | 102 | /** 103 | * Exports chunks as an audio/wav file 104 | */ 105 | export() { 106 | const channels = this.readChannelData(this.chunks); 107 | const { float32Array, meanValues } = this.formatAudioData(channels); 108 | const audioData = this.floatTo16BitPCM(float32Array); 109 | return { 110 | meanValues: meanValues, 111 | audio: { 112 | bitsPerSample: 16, 113 | channels: channels, 114 | data: audioData, 115 | }, 116 | }; 117 | } 118 | 119 | receive(e) { 120 | const { event, id } = e.data; 121 | let receiptData = {}; 122 | switch (event) { 123 | case 'start': 124 | this.recording = true; 125 | break; 126 | case 'stop': 127 | this.recording = false; 128 | break; 129 | case 'clear': 130 | this.initialize(); 131 | break; 132 | case 'export': 133 | receiptData = this.export(); 134 | break; 135 | case 'read': 136 | receiptData = this.getValues(); 137 | break; 138 | default: 139 | break; 140 | } 141 | // Always send back receipt 142 | this.port.postMessage({ event: 'receipt', id, data: receiptData }); 143 | } 144 | 145 | sendChunk(chunk) { 146 | const channels = this.readChannelData([chunk]); 147 | const { float32Array, meanValues } = this.formatAudioData(channels); 148 | const rawAudioData = this.floatTo16BitPCM(float32Array); 149 | const monoAudioData = this.floatTo16BitPCM(meanValues); 150 | this.port.postMessage({ 151 | event: 'chunk', 152 | data: { 153 | mono: monoAudioData, 154 | raw: rawAudioData, 155 | }, 156 | }); 157 | } 158 | 159 | process(inputList, outputList, parameters) { 160 | // Copy input to output (e.g. 
speakers) 161 | // Note that this creates choppy sounds with Mac products 162 | const sourceLimit = Math.min(inputList.length, outputList.length); 163 | for (let inputNum = 0; inputNum < sourceLimit; inputNum++) { 164 | const input = inputList[inputNum]; 165 | const output = outputList[inputNum]; 166 | const channelCount = Math.min(input.length, output.length); 167 | for (let channelNum = 0; channelNum < channelCount; channelNum++) { 168 | input[channelNum].forEach((sample, i) => { 169 | output[channelNum][i] = sample; 170 | }); 171 | } 172 | } 173 | const inputs = inputList[0]; 174 | // There's latency at the beginning of a stream before recording starts 175 | // Make sure we actually receive audio data before we start storing chunks 176 | let sliceIndex = 0; 177 | if (!this.foundAudio) { 178 | for (const channel of inputs) { 179 | sliceIndex = 0; // reset for each channel 180 | if (this.foundAudio) { 181 | break; 182 | } 183 | if (channel) { 184 | for (const value of channel) { 185 | if (value !== 0) { 186 | // find only one non-zero entry in any channel 187 | this.foundAudio = true; 188 | break; 189 | } else { 190 | sliceIndex++; 191 | } 192 | } 193 | } 194 | } 195 | } 196 | if (inputs && inputs[0] && this.foundAudio && this.recording) { 197 | // We need to copy the TypedArray, because the \`process\` 198 | // internals will reuse the same buffer to hold each input 199 | const chunk = inputs.map((input) => input.slice(sliceIndex)); 200 | this.chunks.push(chunk); 201 | this.sendChunk(chunk); 202 | } 203 | return true; 204 | } 205 | } 206 | 207 | registerProcessor('audio_processor', AudioProcessor); 208 | `; 209 | 210 | const script = new Blob([AudioProcessorWorklet], { 211 | type: 'application/javascript', 212 | }); 213 | const src = URL.createObjectURL(script); 214 | export const AudioProcessorSrc = src; 215 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/lib/wavtools/lib/worklets/stream_processor.js: -------------------------------------------------------------------------------- 1 | export const StreamProcessorWorklet = ` 2 | class StreamProcessor extends AudioWorkletProcessor { 3 | constructor() { 4 | super(); 5 | this.hasStarted = false; 6 | this.hasInterrupted = false; 7 | this.outputBuffers = []; 8 | this.bufferLength = 128; 9 | this.write = { buffer: new Float32Array(this.bufferLength), trackId: null }; 10 | this.writeOffset = 0; 11 | this.trackSampleOffsets = {}; 12 | this.port.onmessage = (event) => { 13 | if (event.data) { 14 | const payload = event.data; 15 | if (payload.event === 'write') { 16 | const int16Array = payload.buffer; 17 | const float32Array = new Float32Array(int16Array.length); 18 | for (let i = 0; i < int16Array.length; i++) { 19 | float32Array[i] = int16Array[i] / 0x8000; // Convert Int16 to Float32 20 | } 21 | this.writeData(float32Array, payload.trackId); 22 | } else if ( 23 | payload.event === 'offset' || 24 | payload.event === 'interrupt' 25 | ) { 26 | const requestId = payload.requestId; 27 | const trackId = this.write.trackId; 28 | const offset = this.trackSampleOffsets[trackId] || 0; 29 | this.port.postMessage({ 30 | event: 'offset', 31 | requestId, 32 | trackId, 33 | offset, 34 | }); 35 | if (payload.event === 'interrupt') { 36 | this.hasInterrupted = true; 37 | } 38 | } else { 39 | throw new Error(\`Unhandled event "\${payload.event}"\`); 40 | } 41 | } 42 | }; 43 | } 44 | 45 | writeData(float32Array, trackId = null) { 46 | let { buffer } = this.write; 47 | let offset = 
this.writeOffset; 48 | for (let i = 0; i < float32Array.length; i++) { 49 | buffer[offset++] = float32Array[i]; 50 | if (offset >= buffer.length) { 51 | this.outputBuffers.push(this.write); 52 | this.write = { buffer: new Float32Array(this.bufferLength), trackId }; 53 | buffer = this.write.buffer; 54 | offset = 0; 55 | } 56 | } 57 | this.writeOffset = offset; 58 | return true; 59 | } 60 | 61 | process(inputs, outputs, parameters) { 62 | const output = outputs[0]; 63 | const outputChannelData = output[0]; 64 | const outputBuffers = this.outputBuffers; 65 | if (this.hasInterrupted) { 66 | this.port.postMessage({ event: 'stop' }); 67 | return false; 68 | } else if (outputBuffers.length) { 69 | this.hasStarted = true; 70 | const { buffer, trackId } = outputBuffers.shift(); 71 | for (let i = 0; i < outputChannelData.length; i++) { 72 | outputChannelData[i] = buffer[i] || 0; 73 | } 74 | if (trackId) { 75 | this.trackSampleOffsets[trackId] = 76 | this.trackSampleOffsets[trackId] || 0; 77 | this.trackSampleOffsets[trackId] += buffer.length; 78 | } 79 | return true; 80 | } else if (this.hasStarted) { 81 | this.port.postMessage({ event: 'stop' }); 82 | return false; 83 | } else { 84 | return true; 85 | } 86 | } 87 | } 88 | 89 | registerProcessor('stream_processor', StreamProcessor); 90 | `; 91 | 92 | const script = new Blob([StreamProcessorWorklet], { 93 | type: 'application/javascript', 94 | }); 95 | const src = URL.createObjectURL(script); 96 | export const StreamProcessorSrc = src; 97 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/logo.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/pages/ConsolePage.scss: -------------------------------------------------------------------------------- 1 | [data-component='ConsolePage'] { 2 | font-family: 'Roboto Mono', monospace; 3 | font-weight: 400; 4 | font-style: normal; 5 | font-size: 12px; 6 | height: 100%; 7 | display: flex; 8 | flex-direction: column; 9 | overflow: hidden; 10 | margin: 0px 8px; 11 | & > div { 12 | flex-shrink: 0; 13 | } 14 | 15 | .spacer { 16 | flex-grow: 1; 17 | } 18 | 19 | .content-top { 20 | display: flex; 21 | align-items: center; 22 | padding: 8px 16px; 23 | min-height: 40px; 24 | .content-title { 25 | flex-grow: 1; 26 | display: flex; 27 | align-items: center; 28 | gap: 12px; 29 | img { 30 | width: 24px; 31 | height: 24px; 32 | } 33 | } 34 | } 35 | 36 | .content-main { 37 | flex-grow: 1; 38 | flex-shrink: 1 !important; 39 | margin: 0px 16px; 40 | display: flex; 41 | overflow: hidden; 42 | margin-bottom: 24px; 43 | .content-block { 44 | position: relative; 45 | display: flex; 46 | flex-direction: column; 47 | max-height: 100%; 48 | width: 100%; 49 | .content-block-title { 50 | flex-shrink: 0; 51 | padding-top: 16px; 52 | padding-bottom: 4px; 53 | position: relative; 54 | } 55 | .content-block-body { 56 | color: #6e6e7f; 57 | position: relative; 58 | flex-grow: 1; 59 | padding: 8px 0px; 60 | padding-top: 4px; 61 | line-height: 1.2em; 62 | overflow: auto; 63 | &.full { 64 | padding: 0px; 65 | } 66 | } 67 | } 68 | .content-right { 69 | width: 300px; 70 | flex-shrink: 0; 71 | display: flex; 72 | flex-direction: column; 73 | margin-left: 24px; 74 | gap: 24px; 75 | & > div { 76 | border-radius: 16px; 77 | flex-grow: 1; 78 | flex-shrink: 0; 79 | overflow: hidden; 80 | position: relative; 81 | 
.content-block-title { 82 | position: absolute; 83 | display: flex; 84 | align-items: center; 85 | justify-content: center; 86 | line-height: 2em; 87 | top: 16px; 88 | left: 16px; 89 | padding: 4px 16px; 90 | background-color: #fff; 91 | border-radius: 1000px; 92 | min-height: 32px; 93 | z-index: 9999; 94 | text-align: center; 95 | white-space: pre; 96 | &.bottom { 97 | top: auto; 98 | bottom: 16px; 99 | right: 16px; 100 | } 101 | } 102 | } 103 | & > div.kv { 104 | height: 250px; 105 | max-height: 250px; 106 | white-space: pre; 107 | background-color: #ececf1; 108 | .content-block-body { 109 | padding: 16px; 110 | margin-top: 56px; 111 | } 112 | } 113 | } 114 | .content-logs { 115 | flex-grow: 1; 116 | display: flex; 117 | flex-direction: column; 118 | overflow: hidden; 119 | & > div { 120 | flex-grow: 1; 121 | } 122 | & > .content-actions { 123 | flex-grow: 0; 124 | flex-shrink: 0; 125 | display: flex; 126 | align-items: center; 127 | justify-content: center; 128 | gap: 16px; 129 | } 130 | & > div.events { 131 | overflow: hidden; 132 | } 133 | .events { 134 | border-top: 1px solid #e7e7e7; 135 | } 136 | .conversation { 137 | display: flex; 138 | flex-shrink: 0; 139 | width: 100%; 140 | overflow: hidden; 141 | height: 200px; 142 | min-height: 0; 143 | max-height: 200px; 144 | border-top: 1px solid #e7e7e7; 145 | } 146 | } 147 | } 148 | 149 | .conversation-item { 150 | position: relative; 151 | display: flex; 152 | gap: 16px; 153 | margin-bottom: 16px; 154 | &:not(:hover) .close { 155 | display: none; 156 | } 157 | .close { 158 | position: absolute; 159 | top: 0px; 160 | right: -20px; 161 | background: #aaa; 162 | color: #fff; 163 | display: flex; 164 | border-radius: 16px; 165 | padding: 2px; 166 | cursor: pointer; 167 | &:hover { 168 | background: #696969; 169 | } 170 | svg { 171 | stroke-width: 3; 172 | width: 12px; 173 | height: 12px; 174 | } 175 | } 176 | .speaker { 177 | position: relative; 178 | text-align: left; 179 | gap: 16px; 180 | width: 80px; 181 | flex-shrink: 0; 182 | margin-right: 16px; 183 | &.user { 184 | color: #0099ff; 185 | } 186 | &.assistant { 187 | color: #009900; 188 | } 189 | } 190 | .speaker-content { 191 | color: #18181b; 192 | overflow: hidden; 193 | word-wrap: break-word; 194 | } 195 | } 196 | 197 | .event { 198 | border-radius: 3px; 199 | white-space: pre; 200 | display: flex; 201 | padding: 0px; 202 | gap: 16px; 203 | .event-timestamp { 204 | text-align: left; 205 | gap: 8px; 206 | padding: 4px 0px; 207 | width: 80px; 208 | flex-shrink: 0; 209 | margin-right: 16px; 210 | } 211 | .event-details { 212 | display: flex; 213 | flex-direction: column; 214 | color: #18181b; 215 | gap: 8px; 216 | .event-summary { 217 | padding: 4px 8px; 218 | margin: 0px -8px; 219 | &:hover { 220 | border-radius: 8px; 221 | background-color: #f0f0f0; 222 | } 223 | cursor: pointer; 224 | display: flex; 225 | gap: 8px; 226 | align-items: center; 227 | .event-source { 228 | flex-shrink: 0; 229 | display: flex; 230 | align-items: center; 231 | gap: 8px; 232 | &.client { 233 | color: #0099ff; 234 | } 235 | &.server { 236 | color: #009900; 237 | } 238 | &.error { 239 | color: #990000; 240 | } 241 | svg { 242 | stroke-width: 3; 243 | width: 12px; 244 | height: 12px; 245 | } 246 | } 247 | } 248 | } 249 | } 250 | 251 | .visualization { 252 | position: absolute; 253 | display: flex; 254 | bottom: 4px; 255 | right: 8px; 256 | padding: 4px; 257 | border-radius: 16px; 258 | z-index: 10; 259 | gap: 2px; 260 | .visualization-entry { 261 | position: relative; 262 | display: flex; 263 | align-items: 
center; 264 | height: 40px; 265 | width: 100px; 266 | gap: 4px; 267 | &.client { 268 | color: #0099ff; 269 | } 270 | &.server { 271 | color: #009900; 272 | } 273 | canvas { 274 | width: 100%; 275 | height: 100%; 276 | color: currentColor; 277 | } 278 | } 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/react-app-env.d.ts: -------------------------------------------------------------------------------- 1 | /// 2 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/reportWebVitals.ts: -------------------------------------------------------------------------------- 1 | import { ReportHandler } from 'web-vitals'; 2 | 3 | const reportWebVitals = (onPerfEntry?: ReportHandler) => { 4 | if (onPerfEntry && onPerfEntry instanceof Function) { 5 | import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => { 6 | getCLS(onPerfEntry); 7 | getFID(onPerfEntry); 8 | getFCP(onPerfEntry); 9 | getLCP(onPerfEntry); 10 | getTTFB(onPerfEntry); 11 | }); 12 | } 13 | }; 14 | 15 | export default reportWebVitals; 16 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/setupTests.ts: -------------------------------------------------------------------------------- 1 | // jest-dom adds custom jest matchers for asserting on DOM nodes. 2 | // allows you to do things like: 3 | // expect(element).toHaveTextContent(/react/i) 4 | // learn more: https://github.com/testing-library/jest-dom 5 | import '@testing-library/jest-dom'; 6 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/utils/conversation_config.js: -------------------------------------------------------------------------------- 1 | export const instructions = `System settings: 2 | Tool use: enabled. 3 | 4 | Instructions: 5 | - You are an artificial intelligence agent responsible for helping test realtime voice capabilities 6 | - Please make sure to respond with a helpful voice via audio 7 | - Be kind, helpful, and curteous 8 | - It is okay to ask the user questions 9 | - Use tools and functions you have available liberally, it is part of the training apparatus 10 | - Be open to exploration and conversation 11 | - Remember: this is just for fun and testing! 12 | 13 | Personality: 14 | - Be upbeat and genuine 15 | - Try speaking quickly as if excited 16 | `; 17 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/src/utils/wav_renderer.ts: -------------------------------------------------------------------------------- 1 | const dataMap = new WeakMap(); 2 | 3 | /** 4 | * Normalizes a Float32Array to Array(m): We use this to draw amplitudes on a graph 5 | * If we're rendering the same audio data, then we'll often be using 6 | * the same (data, m, downsamplePeaks) triplets so we give option to memoize 7 | */ 8 | const normalizeArray = ( 9 | data: Float32Array, 10 | m: number, 11 | downsamplePeaks: boolean = false, 12 | memoize: boolean = false 13 | ) => { 14 | let cache, mKey, dKey; 15 | if (memoize) { 16 | mKey = m.toString(); 17 | dKey = downsamplePeaks.toString(); 18 | cache = dataMap.has(data) ? 
dataMap.get(data) : {}; 19 | dataMap.set(data, cache); 20 | cache[mKey] = cache[mKey] || {}; 21 | if (cache[mKey][dKey]) { 22 | return cache[mKey][dKey]; 23 | } 24 | } 25 | const n = data.length; 26 | const result = new Array(m); 27 | if (m <= n) { 28 | // Downsampling 29 | result.fill(0); 30 | const count = new Array(m).fill(0); 31 | for (let i = 0; i < n; i++) { 32 | const index = Math.floor(i * (m / n)); 33 | if (downsamplePeaks) { 34 | // take highest result in the set 35 | result[index] = Math.max(result[index], Math.abs(data[i])); 36 | } else { 37 | result[index] += Math.abs(data[i]); 38 | } 39 | count[index]++; 40 | } 41 | if (!downsamplePeaks) { 42 | for (let i = 0; i < result.length; i++) { 43 | result[i] = result[i] / count[i]; 44 | } 45 | } 46 | } else { 47 | for (let i = 0; i < m; i++) { 48 | const index = (i * (n - 1)) / (m - 1); 49 | const low = Math.floor(index); 50 | const high = Math.ceil(index); 51 | const t = index - low; 52 | if (high >= n) { 53 | result[i] = data[n - 1]; 54 | } else { 55 | result[i] = data[low] * (1 - t) + data[high] * t; 56 | } 57 | } 58 | } 59 | if (memoize) { 60 | cache[mKey as string][dKey as string] = result; 61 | } 62 | return result; 63 | }; 64 | 65 | export const WavRenderer = { 66 | /** 67 | * Renders a point-in-time snapshot of an audio sample, usually frequency values 68 | * @param canvas 69 | * @param ctx 70 | * @param data 71 | * @param color 72 | * @param pointCount number of bars to render 73 | * @param barWidth width of bars in px 74 | * @param barSpacing spacing between bars in px 75 | * @param center vertically center the bars 76 | */ 77 | drawBars: ( 78 | canvas: HTMLCanvasElement, 79 | ctx: CanvasRenderingContext2D, 80 | data: Float32Array, 81 | color: string, 82 | pointCount: number = 0, 83 | barWidth: number = 0, 84 | barSpacing: number = 0, 85 | center: boolean = false 86 | ) => { 87 | pointCount = Math.floor( 88 | Math.min( 89 | pointCount, 90 | (canvas.width - barSpacing) / (Math.max(barWidth, 1) + barSpacing) 91 | ) 92 | ); 93 | if (!pointCount) { 94 | pointCount = Math.floor( 95 | (canvas.width - barSpacing) / (Math.max(barWidth, 1) + barSpacing) 96 | ); 97 | } 98 | if (!barWidth) { 99 | barWidth = (canvas.width - barSpacing) / pointCount - barSpacing; 100 | } 101 | const points = normalizeArray(data, pointCount, true); 102 | for (let i = 0; i < pointCount; i++) { 103 | const amplitude = Math.abs(points[i]); 104 | const height = Math.max(1, amplitude * canvas.height); 105 | const x = barSpacing + i * (barWidth + barSpacing); 106 | const y = center ? 
(canvas.height - height) / 2 : canvas.height - height; 107 | ctx.fillStyle = color; 108 | ctx.fillRect(x, y, barWidth, height); 109 | } 110 | }, 111 | }; 112 | -------------------------------------------------------------------------------- /examples/openai-realtime-console/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "lib": ["dom", "dom.iterable", "esnext", "ES2020"], 5 | "allowJs": true, 6 | "skipLibCheck": true, 7 | "esModuleInterop": true, 8 | "allowSyntheticDefaultImports": true, 9 | "strict": true, 10 | "forceConsistentCasingInFileNames": true, 11 | "noFallthroughCasesInSwitch": true, 12 | "module": "esnext", 13 | "moduleResolution": "node", 14 | "resolveJsonModule": true, 15 | "isolatedModules": true, 16 | "noEmit": true, 17 | "jsx": "react-jsx" 18 | }, 19 | "include": ["src", "src/lib"] 20 | } 21 | -------------------------------------------------------------------------------- /fixtures/toronto.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/transitive-bullshit/openai-realtime-api/89d37b5f461fbcb0300241360749abe85ca45d01/fixtures/toronto.mp3 -------------------------------------------------------------------------------- /license: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Travis Fischer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "openai-realtime-api", 3 | "version": "1.0.7", 4 | "description": "TypeScript client for OpenAI's realtime voice API.", 5 | "author": "Travis Fischer ", 6 | "license": "MIT", 7 | "repository": { 8 | "type": "git", 9 | "url": "git+https://github.com/transitive-bullshit/openai-realtime-api.git" 10 | }, 11 | "packageManager": "pnpm@9.12.2", 12 | "engines": { 13 | "node": ">=18" 14 | }, 15 | "type": "module", 16 | "main": "./dist/index.js", 17 | "source": "./src/index.ts", 18 | "types": "./dist/index.d.ts", 19 | "sideEffects": false, 20 | "exports": { 21 | ".": { 22 | "types": "./dist/index.d.ts", 23 | "default": "./dist/index.js" 24 | }, 25 | "./node": { 26 | "types": "./dist/node/index.d.ts", 27 | "default": "./dist/node/index.js" 28 | } 29 | }, 30 | "files": [ 31 | "dist" 32 | ], 33 | "scripts": { 34 | "build": "tsup", 35 | "dev": "tsup --watch", 36 | "pretest": "run-s build", 37 | "test": "run-s test:*", 38 | "test:format": "prettier --check \"**/*.{js,ts,tsx}\"", 39 | "test:lint": "eslint .", 40 | "test:typecheck": "tsc --noEmit", 41 | "test-unit": "vitest run", 42 | "preinstall": "npx only-allow pnpm" 43 | }, 44 | "dependencies": { 45 | "nanoid": "^5.0.8", 46 | "ws": "^8.18.0" 47 | }, 48 | "devDependencies": { 49 | "@fisch0920/eslint-config": "^1.4.0", 50 | "@total-typescript/ts-reset": "^0.6.1", 51 | "@types/node": "^22.8.6", 52 | "@types/ws": "^8.5.12", 53 | "audio-decode": "^2.2.2", 54 | "dotenv": "^16.4.5", 55 | "eslint": "^8.57.1", 56 | "npm-run-all2": "^7.0.1", 57 | "only-allow": "^1.2.1", 58 | "prettier": "^3.3.3", 59 | "tsup": "^8.3.5", 60 | "typescript": "^5.6.3", 61 | "vitest": "2.1.4" 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /pnpm-workspace.yaml: -------------------------------------------------------------------------------- 1 | packages: 2 | - 'examples/*' 3 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # openai-realtime-api 2 | 3 | > TypeScript client for OpenAI's realtime voice API. 4 | 5 |

6 | Build Status 7 | NPM 8 | MIT License 9 | Prettier Code Formatting 10 |

11 | 12 | - [Features](#features) 13 | - [Install](#install) 14 | - [Usage](#usage) 15 | - [Server Usage](#server-usage) 16 | - [Browser Usage](#browser-usage) 17 | - [Relay Server](#relay-server) 18 | - [Examples](#examples) 19 | - [Node.js Basic](#nodejs-basic) 20 | - [Node.js Audio](#nodejs-audio) 21 | - [Node.js Conversation](#nodejs-conversation) 22 | - [OpenAI Realtime Console](#openai-realtime-console) 23 | - [TODO](#todo) 24 | - [License](#license) 25 | 26 | ## Features 27 | 28 | - **Strongly typed** TS fork of [openai/openai-realtime-api-beta](https://github.com/openai/openai-realtime-api-beta) 29 | - [All events](./src/events.ts) and handlers are 100% typed 30 | - **Drop-in replacement for OpenAI's JS version** 31 | - Fixes many small bugs and inconsistencies 32 | - ([#3](https://github.com/openai/openai-realtime-api-beta/issues/3), [#11](https://github.com/openai/openai-realtime-api-beta/pull/11), [#12](https://github.com/openai/openai-realtime-api-beta/pull/12), [#14](https://github.com/openai/openai-realtime-api-beta/issues/14), [#17](https://github.com/openai/openai-realtime-api-beta/pull/17), [#29](https://github.com/openai/openai-realtime-api-beta/pull/29), [#34](https://github.com/openai/openai-realtime-api-beta/pull/34), [#35](https://github.com/openai/openai-realtime-api-beta/pull/35), [#37](https://github.com/openai/openai-realtime-api-beta/pull/37), [#43](https://github.com/openai/openai-realtime-api-beta/pull/43), [#44](https://github.com/openai/openai-realtime-api-beta/pull/44), and likely others) 33 | - Published to NPM 34 | - Supports Node.js, browser, deno, bun, CF workers, etc 35 | - Includes Node.js CLI examples for easy local testing 36 | - Includes a simple relay server 37 | - Includes the [OpenAI Realtime Console demo](#openai-realtime-console) using this package 🔥 38 | 39 | ## Install 40 | 41 | ```sh 42 | npm install openai-realtime-api 43 | ``` 44 | 45 | This package is [ESM-only](https://gist.github.com/sindresorhus/a39789f98801d908bbc7ff3ecc99d99c). It requires `Node.js >= 18`, a browser environment, or an equivalent JS runtime (Deno, Bun, CF workers, etc). 46 | 47 | ## Usage 48 | 49 | > [!IMPORTANT] 50 | > All usage and events are 100% compatible with the [OpenAI JS version](https://github.com/openai/openai-realtime-api-beta). The main difference aside from bug fixes is that **all events are fully-typed**. 51 | 52 | ```ts 53 | import { RealtimeClient } from 'openai-realtime-api' 54 | 55 | // Create a new client; all params are optional; apiKey defaults to the 56 | // `OPENAI_API_KEY` environment variable (when using Node.js). 57 | const client = new RealtimeClient({ 58 | sessionConfig: { 59 | instructions: 'You are a great, upbeat friend.', 60 | voice: 'alloy' 61 | } 62 | }) 63 | 64 | // Can change session config ahead of connecting. 65 | client.updateSession({ 66 | turn_detection: null, 67 | input_audio_transcription: { model: 'whisper-1' } 68 | }) 69 | 70 | // Example of custom event handling 71 | client.on('conversation.updated', (event) => { 72 | // All events are fully-typed based on the event name. 73 | // In this case, `event` will have the type `RealtimeCustomEvents.ConversationUpdatedEvent` 74 | const { item, delta } = event 75 | 76 | // Access the full list of conversation items. 77 | const items = client.conversation.getItems() 78 | }) 79 | 80 | // Connect to the Realtime API. 81 | await client.connect() 82 | 83 | // Send a text message and trigger a response generation. 
84 | client.sendUserMessageContent([{ type: 'input_text', text: 'How are you?' }]) 85 | 86 | // Wait for a completed response from the model. 87 | // (`event` will be of type `RealtimeServerEvents.ResponseDoneEvent`) 88 | const event = await client.realtime.waitForNext('response.done') 89 | ``` 90 | 91 | See [examples](#examples) for more complete demos. 92 | 93 | See also the official [OpenAI Realtime API Guide](https://platform.openai.com/docs/guides/realtime) and [API Reference](https://platform.openai.com/docs/api-reference/realtime). 94 | 95 | For more info on usage, tools, and custom events, see [OpenAI's readme](https://github.com/openai/openai-realtime-api-beta). Note that this package is 100% compatible with OpenAI's beta package in terms of both official and unofficial events. The only difference is that all events are typed. 96 | 97 | ### Server Usage 98 | 99 | `RealtimeClient` takes in an optional `apiKey` which defaults to `process.env.OPENAI_API_KEY`. 100 | 101 | ### Browser Usage 102 | 103 | `RealtimeClient` takes in an optional `url` which can be pointed at a relay server. 104 | 105 | ```ts 106 | import { RealtimeClient } from 'openai-realtime-api' 107 | 108 | // Create a browser client which points to a relay server. 109 | const client = new RealtimeClient({ url: RELAY_SERVER_URL }) 110 | ``` 111 | 112 | Alternatively, you can use `apiKey` with `RealtimeClient` in the browser, but you also have to pass `dangerouslyAllowAPIKeyInBrowser: true`. 113 | 114 | ```ts 115 | import { RealtimeClient } from 'openai-realtime-api' 116 | 117 | // Create a browser client which connects directly to the OpenAI realtime API 118 | // with an unsafe, client-side API key. 119 | const client = new RealtimeClient({ 120 | apiKey: process.env.OPENAI_API_KEY, 121 | dangerouslyAllowAPIKeyInBrowser: true 122 | }) 123 | ``` 124 | 125 | > [!CAUTION] 126 | > We strongly recommend against including your API key in any client (mobile or browser). It can be useful for local testing, but for production, you should be using a relay server. 127 | 128 | ### Relay Server 129 | 130 | ```ts 131 | import { RealtimeClient } from 'openai-realtime-api' 132 | import { RealtimeRelay } from 'openai-realtime-api/node' 133 | 134 | // Setting `relay: true` disables tool calls and directly modifying the session, 135 | // since that will be the responsibility of the upstream client. 136 | const client = new RealtimeClient({ relay: true }) 137 | const relay = new RealtimeRelay({ client }) 138 | 139 | relay.listen(8081) 140 | ``` 141 | 142 | Note that `RealtimeRelay` uses a different import path because it contains Node.js-specific code. 143 | 144 | A full example is included in [examples/node/relay-server.ts](./examples/node/relay-server.ts). 145 | 146 | ## Examples 147 | 148 | To run the included examples (requires `Node.js >= 18`): 149 | 150 | 1. Clone this repo 151 | 2. Run `pnpm install` 152 | 3. Setup `.env` with your `OPENAI_API_KEY` 153 | 154 | You can set `debug: true` in the `RealtimeClient` constructor of these examples to print out the full event log. 155 | 156 | ### Node.js Basic 157 | 158 | Simple Node.js demo using the `RealtimeClient` which sends a text message and waits for a complete response. 159 | 160 | - [examples/node/basic.ts](./examples/node/basic.ts) 161 | - Run `npx tsx examples/node/basic.ts` 162 | 163 | ### Node.js Audio 164 | 165 | Simple Node.js demo using the `RealtimeClient` which sends a short audio message and waits for a complete response. 
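The core of this demo reduces to a sketch like the following, where `audio` is assumed to already be a base64-encoded string of mono PCM16 audio sent as an `input_audio` content part:

```ts
import { RealtimeClient } from 'openai-realtime-api'

const client = new RealtimeClient()
await client.connect()

// Send the audio as a user message; this also triggers a response generation.
client.sendUserMessageContent([{ type: 'input_audio', audio }])

// Wait for a completed response from the model.
const event = await client.realtime.waitForNext('response.done')
```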
166 | 167 | - [examples/node/audio.ts](./examples/node/audio.ts) 168 | - Run `npx tsx examples/node/audio.ts` 169 | 170 | ### Node.js Conversation 171 | 172 | Simple Node.js demo using the `RealtimeClient` with a microphone and speaker to simulate a full, back & forth conversation from the terminal. 173 | 174 | - [examples/node/convo.ts](./examples/node/convo.ts) 175 | - This demo uses the [mic](https://github.com/ashishbajaj99/mic) and [speaker](https://github.com/TooTallNate/node-speaker) npm packages 176 | - `mic` requires [sox](https://sourceforge.net/projects/sox/); on macOS, you can run `brew install sox` 177 | - `npx tsx examples/node/convo.ts` 178 | 179 | ### OpenAI Realtime Console 180 | 181 | This example has been imported from https://github.com/openai/openai-realtime-console ([at commit 6ea4dba](https://github.com/openai/openai-realtime-console/tree/6ea4dba795fee868c60ea9e8e7eba7469974b3e9)). The only change has been to replace `@openai/realtime-api-beta` with `openai-realtime-api` and to fix a few types. 182 | 183 | 184 | 185 | To run the realtime console example: 186 | 187 | ```sh 188 | pnpm install 189 | cd examples/openai-realtime-console 190 | pnpm start 191 | ``` 192 | 193 | ## TODO 194 | 195 | - add an example using tools 196 | - add an example next.js app 197 | - improve readme docs 198 | 199 | ## License 200 | 201 | MIT © [Travis Fischer](https://x.com/transitive_bs) 202 | 203 | If you found this project interesting, [consider following me on Twitter](https://x.com/transitive_bs). 204 | -------------------------------------------------------------------------------- /src/api.ts: -------------------------------------------------------------------------------- 1 | import type { ClientRequest } from 'node:http' 2 | 3 | import type { WebSocket as WS } from 'ws' 4 | 5 | import type { 6 | Event, 7 | RealtimeClientEvents, 8 | RealtimeServerEvents 9 | } from './events' 10 | import { RealtimeEventHandler } from './event-handler' 11 | import { 12 | generateId, 13 | getEnv, 14 | hasNativeWebSocket, 15 | isBrowser, 16 | trimDebugEvent 17 | } from './utils' 18 | 19 | /** 20 | * The RealtimeAPI class handles low-level communication with the OpenAI 21 | * Realtime API via WebSockets. 22 | */ 23 | export class RealtimeAPI extends RealtimeEventHandler< 24 | | RealtimeClientEvents.EventType 25 | | RealtimeServerEvents.EventType 26 | | 'close' 27 | | `client.${RealtimeClientEvents.EventType}` 28 | | `server.${RealtimeServerEvents.EventType}` 29 | | 'client.*' 30 | | 'server.*', 31 | Event, 32 | RealtimeClientEvents.EventMap & 33 | RealtimeServerEvents.EventMap & 34 | RealtimeClientEvents.PrefixedEventMap & 35 | RealtimeServerEvents.PrefixedEventMap & { 36 | 'client.*': RealtimeClientEvents.ClientEvent 37 | } & { 38 | 'server.*': RealtimeServerEvents.ServerEvent 39 | } & { 40 | close: { type: 'close'; error: boolean } 41 | } 42 | > { 43 | readonly model: string 44 | readonly url: string 45 | readonly apiKey?: string 46 | readonly debug: boolean 47 | ws?: WebSocket | WS 48 | 49 | /** 50 | * Creates a new RealtimeAPI instance. 
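 * Typically accessed via `client.realtime` on a `RealtimeClient` rather than constructed directly.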
51 | */ 52 | constructor({ 53 | model = 'gpt-4o-realtime-preview-2024-10-01', 54 | url = 'wss://api.openai.com/v1/realtime', 55 | apiKey = getEnv('OPENAI_API_KEY'), 56 | dangerouslyAllowAPIKeyInBrowser, 57 | debug 58 | }: { 59 | model?: string 60 | url?: string 61 | apiKey?: string 62 | dangerouslyAllowAPIKeyInBrowser?: boolean 63 | debug?: boolean 64 | } = {}) { 65 | super() 66 | 67 | this.model = model 68 | this.url = url 69 | this.apiKey = apiKey 70 | this.debug = !!debug 71 | 72 | if (isBrowser && this.apiKey) { 73 | if (!dangerouslyAllowAPIKeyInBrowser) { 74 | throw new Error( 75 | 'Unable to provide API key in the browser without "dangerouslyAllowAPIKeyInBrowser" set to true' 76 | ) 77 | } 78 | } 79 | } 80 | 81 | /** 82 | * Whether or not the WebSocket is connected. 83 | */ 84 | get isConnected(): boolean { 85 | return !!this.ws 86 | } 87 | 88 | /** 89 | * Connects to Realtime API WebSocket Server. 90 | */ 91 | async connect() { 92 | if (this.isConnected) { 93 | return 94 | } 95 | 96 | if (!this.apiKey && !isBrowser) { 97 | console.warn(`No apiKey provided for connection to "${this.url}"`) 98 | } 99 | 100 | const url = new URL(this.url) 101 | url.searchParams.set('model', this.model) 102 | 103 | if (hasNativeWebSocket()) { 104 | if (isBrowser && this.apiKey) { 105 | console.warn( 106 | 'Warning: Connecting using API key in the browser, this is not recommended' 107 | ) 108 | } 109 | 110 | const ws = new WebSocket( 111 | url.toString(), 112 | [ 113 | 'realtime', 114 | this.apiKey ? `openai-insecure-api-key.${this.apiKey}` : undefined, 115 | 'openai-beta.realtime-v1' 116 | ].filter(Boolean) 117 | ) 118 | 119 | ws.addEventListener('message', (event) => { 120 | const message: any = JSON.parse(event.data) 121 | this.receive(message.type, message) 122 | }) 123 | 124 | return new Promise((resolve, reject) => { 125 | const connectionErrorHandler = () => { 126 | this.disconnect(ws) 127 | reject(new Error(`Could not connect to "${this.url}"`)) 128 | } 129 | 130 | ws.addEventListener('error', connectionErrorHandler) 131 | ws.addEventListener('open', () => { 132 | this._log(`Connected to "${this.url}"`) 133 | 134 | ws.removeEventListener('error', connectionErrorHandler) 135 | ws.addEventListener('error', () => { 136 | this.disconnect(ws) 137 | this._log(`Error, disconnected from "${this.url}"`) 138 | this.dispatch('close', { type: 'close', error: true }) 139 | }) 140 | 141 | ws.addEventListener('close', () => { 142 | this.disconnect(ws) 143 | this._log(`Disconnected from "${this.url}"`) 144 | this.dispatch('close', { type: 'close', error: false }) 145 | }) 146 | 147 | this.ws = ws 148 | resolve(true) 149 | }) 150 | }) 151 | } else { 152 | // Node.js 153 | const wsModule = await import('ws') 154 | const ws: WS = new wsModule.WebSocket(url.toString(), [], { 155 | // Add auth headers 156 | finishRequest: (request: ClientRequest) => { 157 | request.setHeader('OpenAI-Beta', 'realtime=v1') 158 | 159 | if (this.apiKey) { 160 | request.setHeader('Authorization', `Bearer ${this.apiKey}`) 161 | 162 | // Needed for Azure OpenAI 163 | request.setHeader('api-key', this.apiKey) 164 | } 165 | 166 | request.end() 167 | } 168 | // TODO: this `any` is a workaround for `@types/ws` being out-of-date. 
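        // (`finishRequest` above is what injects the `OpenAI-Beta` and auth
        // headers into the WebSocket upgrade request.)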
169 | } as any) 170 | 171 | ws.on('message', (data) => { 172 | const message: any = JSON.parse(data.toString()) 173 | this.receive(message.type, message) 174 | }) 175 | 176 | return new Promise((resolve, reject) => { 177 | const connectionErrorHandler = () => { 178 | this.disconnect(ws) 179 | reject(new Error(`Could not connect to "${this.url}"`)) 180 | } 181 | 182 | ws.on('error', connectionErrorHandler) 183 | ws.on('open', () => { 184 | this._log(`Connected to "${this.url}"`) 185 | 186 | ws.removeListener('error', connectionErrorHandler) 187 | ws.on('error', () => { 188 | this._log(`Error, disconnected from "${this.url}"`) 189 | this.disconnect(ws) 190 | this.dispatch('close', { type: 'close', error: true }) 191 | }) 192 | 193 | ws.on('close', () => { 194 | this.disconnect(ws) 195 | this._log(`Disconnected from "${this.url}"`) 196 | this.dispatch('close', { type: 'close', error: false }) 197 | }) 198 | 199 | this.ws = ws 200 | resolve() 201 | }) 202 | }) 203 | } 204 | } 205 | 206 | /** 207 | * Disconnects from the Realtime API server. 208 | */ 209 | disconnect(ws?: WebSocket | WS) { 210 | if (this.ws && (!ws || this.ws === ws)) { 211 | this.ws?.close() 212 | this.ws = undefined 213 | } 214 | } 215 | 216 | /** 217 | * Receives an event from WebSocket and dispatches related events. 218 | */ 219 | receive< 220 | E extends RealtimeServerEvents.EventType, 221 | D extends 222 | RealtimeServerEvents.ServerEvent = RealtimeServerEvents.EventMap[E] extends RealtimeServerEvents.ServerEvent 223 | ? RealtimeServerEvents.EventMap[E] 224 | : RealtimeServerEvents.ServerEvent 225 | >(eventName: E, event: D) { 226 | this._log('received:', eventName, event) 227 | this.dispatch(eventName, event) 228 | this.dispatch(`server.${eventName}`, event) 229 | this.dispatch('server.*', event) 230 | } 231 | 232 | /** 233 | * Sends an event to the underlying WebSocket and dispatches related events. 234 | */ 235 | send< 236 | E extends RealtimeClientEvents.EventType, 237 | D extends 238 | RealtimeClientEvents.ClientEvent = RealtimeClientEvents.EventMap[E] extends RealtimeClientEvents.ClientEvent 239 | ? RealtimeClientEvents.EventMap[E] 240 | : RealtimeClientEvents.ClientEvent 241 | >(eventName: E, data: Omit = {} as any) { 242 | if (!this.isConnected) { 243 | throw new Error(`RealtimeAPI is not connected`) 244 | } 245 | data = data || {} 246 | if (typeof data !== 'object') { 247 | throw new TypeError(`data must be an object`) 248 | } 249 | 250 | const event = { 251 | event_id: generateId('evt_'), 252 | type: eventName, 253 | ...data 254 | } 255 | this.dispatch(eventName, event) 256 | this.dispatch(`client.${eventName}`, event) 257 | this.dispatch('client.*', event) 258 | this._log('sent:', eventName, event) 259 | this.ws!.send(JSON.stringify(event)) 260 | } 261 | 262 | /** 263 | * Writes WebSocket logs to the console if `debug` is enabled. 
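   * Object arguments are trimmed via `trimDebugEvent` and pretty-printed as JSON.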
264 | */ 265 | protected _log(...args: any[]) { 266 | const date = new Date().toISOString() 267 | const logs = [`[Websocket/${date}]`].concat(args).map((arg) => { 268 | if (typeof arg === 'object' && arg !== null) { 269 | return JSON.stringify(trimDebugEvent(arg), null, 2) 270 | } else { 271 | return arg 272 | } 273 | }) 274 | 275 | if (this.debug) { 276 | console.log(...logs) 277 | } 278 | } 279 | } 280 | -------------------------------------------------------------------------------- /src/client.test.ts: -------------------------------------------------------------------------------- 1 | import 'dotenv/config' 2 | 3 | import fs from 'node:fs/promises' 4 | 5 | import decodeAudio from 'audio-decode' 6 | import { expect, expectTypeOf, test } from 'vitest' 7 | 8 | import type { Event, RealtimeServerEvents } from './events' 9 | import { RealtimeClient } from './client' 10 | import { arrayBufferToBase64, trimDebugEvent } from './utils' 11 | 12 | const fixtures = ['./fixtures/toronto.mp3'] 13 | const fixtureData = await Promise.all( 14 | fixtures.map(async (filePath) => { 15 | const audioFile = await fs.readFile(filePath) 16 | const audioBuffer = await decodeAudio(audioFile) 17 | const channelData = audioBuffer.getChannelData(0) // only accepts mono 18 | const base64 = arrayBufferToBase64(channelData) 19 | return { filePath, base64 } 20 | }) 21 | ) 22 | 23 | test( 24 | 'e2e', 25 | { 26 | timeout: 60_000 27 | }, 28 | async () => { 29 | const events: Event[] = [] 30 | const client = new RealtimeClient({ 31 | debug: true, 32 | sessionConfig: { 33 | instructions: 34 | 'Please follow the instructions of any query you receive.\n' + 35 | 'Be concise in your responses. Speak quickly and answer shortly.', 36 | turn_detection: null 37 | } 38 | }) 39 | 40 | client.on('realtime.event', (event) => { 41 | events.push(trimDebugEvent(event.event)) 42 | }) 43 | 44 | expect(client.isConnected).toBe(false) 45 | await client.connect() 46 | expect(client.isConnected).toBe(true) 47 | 48 | await client.waitForSessionCreated() 49 | 50 | const sample = fixtureData[0]!.base64 51 | client.sendUserMessageContent([{ type: 'input_audio', audio: sample }]) 52 | 53 | const item = await client.waitForNextItem() 54 | console.log(item) 55 | expect(item.type).toBe('message') 56 | expect(item.role).toBe('user') 57 | expect(item.status).toBe('completed') 58 | expect(item.content).toHaveLength(1) 59 | expect(item.content[0]!.type).toBe('input_audio') 60 | 61 | // Wait for the full response to complete from the server 62 | const event = await client.realtime.waitForNext('response.done') 63 | expectTypeOf(event).toEqualTypeOf() 64 | console.log(event) 65 | 66 | client.disconnect() 67 | expect(client.isConnected).toBe(false) 68 | 69 | expect(event).toBeDefined() 70 | expect(event.type).toBe('response.done') 71 | expect(event.response).toBeDefined() 72 | expect(event.response.status).toBe('completed') 73 | expect(event.response.output).toBeDefined() 74 | expect(event.response.output).toHaveLength(1) 75 | expect(event.response.output[0]!.type).toBe('message') 76 | expect(event.response.output[0]!.role).toBe('assistant') 77 | expect(event.response.output[0]!.status).toBe('completed') 78 | expect(event.response.output[0]!.content).toBeDefined() 79 | expect(event.response.output[0]!.content).toHaveLength(1) 80 | expect(event.response.output[0]!.content[0]!.type).toBe('audio') 81 | expect(event.response.output[0]!.content[0]!.transcript).toMatch(/toronto/i) 82 | expect(event.response.usage).toBeDefined() 83 | 84 | expect( 85 | 
events.filter((e) => e.type === 'response.audio_transcript.delta').length 86 | ).toBeGreaterThanOrEqual(1) 87 | 88 | expect( 89 | events.filter((e) => e.type === 'response.audio.delta').length 90 | ).toBeGreaterThanOrEqual(1) 91 | 92 | expect(events.filter((e) => e.type === 'response.audio.done')).toHaveLength( 93 | 1 94 | ) 95 | 96 | expect( 97 | events.filter((e) => e.type === 'response.audio_transcript.done') 98 | ).toHaveLength(1) 99 | 100 | expect( 101 | events.filter((e) => e.type === 'response.content_part.done') 102 | ).toHaveLength(1) 103 | 104 | expect( 105 | events.filter((e) => e.type === 'response.output_item.done') 106 | ).toHaveLength(1) 107 | 108 | console.log(JSON.stringify(events, null, 2)) 109 | } 110 | ) 111 | -------------------------------------------------------------------------------- /src/client.ts: -------------------------------------------------------------------------------- 1 | import type { 2 | Event, 3 | RealtimeClientEvents, 4 | RealtimeCustomEvents, 5 | RealtimeServerEvents 6 | } from './events' 7 | import type { 8 | EventHandlerResult, 9 | FormattedTool, 10 | Realtime, 11 | ToolHandler 12 | } from './types' 13 | import { RealtimeAPI } from './api' 14 | import { RealtimeConversation } from './conversation' 15 | import { RealtimeEventHandler } from './event-handler' 16 | import { arrayBufferToBase64, assert, mergeInt16Arrays, sleep } from './utils' 17 | 18 | /** 19 | * The RealtimeClient class is the main interface for interacting with the 20 | * OpenAI Realtime API. It handles connection, configuration, conversation 21 | * updates, and server event handling. 22 | */ 23 | export class RealtimeClient extends RealtimeEventHandler< 24 | | RealtimeClientEvents.EventType 25 | | RealtimeServerEvents.EventType 26 | | RealtimeCustomEvents.EventType, 27 | Event, 28 | RealtimeClientEvents.EventMap & 29 | RealtimeServerEvents.EventMap & 30 | RealtimeCustomEvents.EventMap 31 | > { 32 | readonly defaultSessionConfig: Realtime.SessionConfig 33 | sessionConfig: Realtime.SessionConfig 34 | 35 | readonly relay: boolean 36 | 37 | realtime: RealtimeAPI 38 | conversation: RealtimeConversation 39 | 40 | inputAudioBuffer: Int16Array 41 | sessionCreated: boolean 42 | tools: Record< 43 | string, 44 | { 45 | definition: Realtime.ToolDefinition 46 | handler: ToolHandler 47 | } 48 | > 49 | 50 | constructor({ 51 | sessionConfig, 52 | relay = false, 53 | ...apiParams 54 | }: { 55 | sessionConfig?: Partial> 56 | apiKey?: string 57 | model?: string 58 | url?: string 59 | dangerouslyAllowAPIKeyInBrowser?: boolean 60 | debug?: boolean 61 | /** 62 | * Relay mode disables tool use, since it will be the responsibility of the 63 | * upstream client to handle tool calls. 
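     * In relay mode the client also skips sending `session.update` events
     * (see `updateSession`).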
64 | */ 65 | relay?: boolean 66 | } = {}) { 67 | super() 68 | 69 | this.defaultSessionConfig = { 70 | modalities: ['text', 'audio'], 71 | voice: 'alloy', 72 | input_audio_format: 'pcm16', 73 | output_audio_format: 'pcm16', 74 | input_audio_transcription: { 75 | model: 'whisper-1' 76 | }, 77 | turn_detection: null, 78 | // turn_detection: { 79 | // type: 'server_vad', 80 | // threshold: 0.5, 81 | // prefix_padding_ms: 300, 82 | // silence_duration_ms: 500 83 | // }, 84 | tools: [], 85 | tool_choice: 'auto', 86 | temperature: 0.8, 87 | max_response_output_tokens: 4096, 88 | ...sessionConfig 89 | } 90 | this.sessionConfig = {} 91 | this.sessionCreated = false 92 | this.tools = {} 93 | this.inputAudioBuffer = new Int16Array(0) 94 | this.relay = !!relay 95 | 96 | this.realtime = new RealtimeAPI(apiParams) 97 | this.conversation = new RealtimeConversation({ debug: apiParams.debug }) 98 | 99 | this._resetConfig() 100 | this._addAPIEventHandlers() 101 | } 102 | 103 | /** 104 | * Resets sessionConfig and conversation to defaults. 105 | */ 106 | protected _resetConfig() { 107 | this.sessionCreated = false 108 | this.tools = {} 109 | this.sessionConfig = structuredClone(this.defaultSessionConfig) 110 | this.inputAudioBuffer = new Int16Array(0) 111 | } 112 | 113 | /** 114 | * Sets up event handlers for a fully-functional application control flow. 115 | */ 116 | protected _addAPIEventHandlers() { 117 | // Event Logging handlers 118 | this.realtime.on('client.*', (event: any) => { 119 | this.dispatch('realtime.event', { 120 | type: 'realtime.event', 121 | time: new Date().toISOString(), 122 | source: 'client', 123 | event 124 | }) 125 | }) 126 | 127 | this.realtime.on('server.*', (event: RealtimeServerEvents.ServerEvent) => { 128 | this.dispatch('realtime.event', { 129 | type: 'realtime.event', 130 | time: new Date().toISOString(), 131 | source: 'server', 132 | event 133 | }) 134 | }) 135 | 136 | // Handles session created event 137 | this.realtime.on('server.session.created', () => { 138 | this.sessionCreated = true 139 | }) 140 | 141 | // Setup for application control flow 142 | const handler = (event: any, ...args: any[]): EventHandlerResult => { 143 | if (!this.isConnected) return {} 144 | return this.conversation.processEvent(event, ...args) 145 | } 146 | 147 | const handlerWithDispatch = (event: any, ...args: any[]) => { 148 | const res = handler(event, ...args) 149 | 150 | if (res.item) { 151 | // FIXME: This is only here because `item.input_audio_transcription.completed` 152 | // can fire before `item.created`, resulting in empty item. This happens in 153 | // VAD mode with empty audio. 154 | this.dispatch('conversation.updated', { 155 | type: 'conversation.updated', 156 | ...res 157 | }) 158 | } 159 | 160 | return res 161 | } 162 | 163 | const callTool = async (tool: FormattedTool) => { 164 | // In relay mode, we don't attempt to call tools. That is the 165 | // responsibility of the upstream client. 
166 | if (this.isRelay) return 167 | 168 | try { 169 | const jsonArguments = JSON.parse(tool.arguments) 170 | const toolConfig = this.tools[tool.name] 171 | if (!toolConfig) { 172 | console.warn(`Tool "${tool.name}" not found`) 173 | return 174 | } 175 | 176 | const result = await Promise.resolve(toolConfig.handler(jsonArguments)) 177 | this.realtime.send('conversation.item.create', { 178 | item: { 179 | type: 'function_call_output', 180 | call_id: tool.call_id, 181 | output: JSON.stringify(result) 182 | } 183 | }) 184 | } catch (err: any) { 185 | console.warn(`Error calling tool "${tool.name}":`, err.message) 186 | 187 | this.realtime.send('conversation.item.create', { 188 | item: { 189 | type: 'function_call_output', 190 | call_id: tool.call_id, 191 | output: JSON.stringify({ error: err.message }) 192 | } 193 | }) 194 | } 195 | 196 | this.createResponse() 197 | } 198 | 199 | // Handlers to update internal conversation state 200 | this.realtime.on('server.response.created', handler) 201 | this.realtime.on('server.response.output_item.added', handler) 202 | this.realtime.on('server.response.content_part.added', handler) 203 | this.realtime.on( 204 | 'server.input_audio_buffer.speech_started', 205 | (event: RealtimeServerEvents.InputAudioBufferSpeechStartedEvent) => { 206 | handler(event) 207 | this.dispatch('conversation.interrupted', event) 208 | } 209 | ) 210 | this.realtime.on( 211 | 'server.input_audio_buffer.speech_stopped', 212 | (event: RealtimeServerEvents.InputAudioBufferSpeechStoppedEvent) => { 213 | handler(event, this.inputAudioBuffer) 214 | } 215 | ) 216 | 217 | // Handlers to update application state 218 | this.realtime.on( 219 | 'server.conversation.item.created', 220 | (event: RealtimeServerEvents.ConversationItemCreatedEvent) => { 221 | const res = handlerWithDispatch(event) 222 | if (!res.item) return 223 | 224 | this.dispatch('conversation.item.appended', { 225 | type: 'conversation.item.appended', 226 | ...res 227 | }) 228 | 229 | if (res.item.status === 'completed') { 230 | this.dispatch('conversation.item.completed', { 231 | type: 'conversation.item.completed', 232 | ...res 233 | }) 234 | } 235 | } 236 | ) 237 | this.realtime.on('server.conversation.item.truncated', handlerWithDispatch) 238 | this.realtime.on('server.conversation.item.deleted', handlerWithDispatch) 239 | this.realtime.on( 240 | 'server.conversation.item.input_audio_transcription.completed', 241 | handlerWithDispatch 242 | ) 243 | this.realtime.on( 244 | 'server.response.audio_transcript.delta', 245 | handlerWithDispatch 246 | ) 247 | this.realtime.on('server.response.audio.delta', handlerWithDispatch) 248 | this.realtime.on('server.response.text.delta', handlerWithDispatch) 249 | this.realtime.on( 250 | 'server.response.function_call_arguments.delta', 251 | handlerWithDispatch 252 | ) 253 | this.realtime.on( 254 | 'server.response.output_item.done', 255 | async (event: RealtimeServerEvents.ResponseOutputItemDoneEvent) => { 256 | const res = handlerWithDispatch(event) 257 | if (!res.item?.formatted) return 258 | 259 | if (res.item.status === 'completed') { 260 | this.dispatch('conversation.item.completed', { 261 | type: 'conversation.item.completed', 262 | ...res 263 | }) 264 | } 265 | 266 | if (res.item.formatted.tool) { 267 | callTool(res.item.formatted.tool) 268 | } 269 | } 270 | ) 271 | } 272 | 273 | /** 274 | * Whether the realtime socket is connected. 
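   * Delegates to the underlying `RealtimeAPI` connection state.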
275 | */ 276 | get isConnected(): boolean { 277 | return this.realtime.isConnected 278 | } 279 | 280 | /** 281 | * Whether the client is in relay mode. When in relay mode, the client will 282 | * not attempt to invoke tools. 283 | */ 284 | get isRelay(): boolean { 285 | return this.relay 286 | } 287 | 288 | /** 289 | * Resets the client instance entirely: disconnects and clears configs. 290 | */ 291 | reset() { 292 | this.disconnect() 293 | this.clearEventHandlers() 294 | this.realtime.clearEventHandlers() 295 | this._resetConfig() 296 | this._addAPIEventHandlers() 297 | } 298 | 299 | /** 300 | * Connects to the Realtime WebSocket API and updates the session config. 301 | */ 302 | async connect() { 303 | if (this.isConnected) { 304 | return 305 | } 306 | 307 | await this.realtime.connect() 308 | this.updateSession() 309 | } 310 | 311 | /** 312 | * Waits for a session.created event to be executed before proceeding. 313 | */ 314 | async waitForSessionCreated() { 315 | assert(this.isConnected, 'Not connected, use .connect() first') 316 | 317 | while (!this.sessionCreated) { 318 | await sleep(1) 319 | } 320 | } 321 | 322 | /** 323 | * Disconnects from the Realtime API and clears the conversation history. 324 | */ 325 | disconnect() { 326 | this.sessionCreated = false 327 | this.realtime.disconnect() 328 | this.conversation.clear() 329 | } 330 | 331 | /** 332 | * Gets the active turn detection mode. 333 | */ 334 | getTurnDetectionType(): 'server_vad' | undefined { 335 | return this.sessionConfig.turn_detection?.type 336 | } 337 | 338 | /** 339 | * Adds a tool to the session. 340 | */ 341 | addTool(definition: Realtime.PartialToolDefinition, handler: ToolHandler) { 342 | assert(!this.isRelay, 'Unable to add tools in relay mode') 343 | assert(definition?.name, 'Missing tool name in definition') 344 | const { name } = definition 345 | 346 | assert( 347 | typeof handler === 'function', 348 | `Tool "${name}" handler must be a function` 349 | ) 350 | 351 | this.tools[name] = { 352 | definition: { 353 | type: 'function', 354 | ...definition 355 | }, 356 | handler 357 | } 358 | this.updateSession() 359 | } 360 | 361 | /** 362 | * Removes a tool from the session. 363 | */ 364 | removeTool(name: string) { 365 | assert(!this.isRelay, 'Unable to add tools in relay mode') 366 | assert( 367 | this.tools[name], 368 | `Tool "${name}" does not exist, can not be removed.` 369 | ) 370 | delete this.tools[name] 371 | this.updateSession() 372 | } 373 | 374 | /** 375 | * Deletes an item. 376 | */ 377 | deleteItem(id: string) { 378 | this.realtime.send('conversation.item.delete', { item_id: id }) 379 | } 380 | 381 | /** 382 | * Updates session configuration. 383 | * 384 | * If the client is not yet connected, the session will be updated upon connection. 385 | */ 386 | updateSession(sessionConfig: Realtime.SessionConfig = {}) { 387 | const tools = Object.values(this.tools).map(({ definition }) => definition) 388 | 389 | this.sessionConfig = { 390 | ...this.sessionConfig, 391 | ...sessionConfig, 392 | tools 393 | } 394 | 395 | if (this.isConnected && !this.isRelay) { 396 | this.realtime.send('session.update', { 397 | session: structuredClone(this.sessionConfig) 398 | }) 399 | } 400 | } 401 | 402 | /** 403 | * Sends user message content and generates a response. 
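   * Accepts `input_text` and `input_audio` content parts and then calls
   * `createResponse()` to ask the model to reply, e.g.
   * `client.sendUserMessageContent([{ type: 'input_text', text: 'How are you?' }])`.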
404 | */ 405 | sendUserMessageContent( 406 | content: Array< 407 | Realtime.InputTextContentPart | Realtime.InputAudioContentPart 408 | > 409 | ) { 410 | assert(!this.isRelay, 'Unable to send messages directly in relay mode') 411 | 412 | if (content.length) { 413 | this.realtime.send('conversation.item.create', { 414 | item: { 415 | type: 'message', 416 | role: 'user', 417 | content 418 | } 419 | }) 420 | } 421 | 422 | this.createResponse() 423 | } 424 | 425 | /** 426 | * Appends user audio to the existing audio buffer. 427 | */ 428 | appendInputAudio(arrayBuffer: Int16Array | ArrayBuffer) { 429 | assert(!this.isRelay, 'Unable to append input audio directly in relay mode') 430 | 431 | if (arrayBuffer.byteLength > 0) { 432 | this.realtime.send('input_audio_buffer.append', { 433 | audio: arrayBufferToBase64(arrayBuffer) 434 | }) 435 | 436 | this.inputAudioBuffer = mergeInt16Arrays( 437 | this.inputAudioBuffer, 438 | arrayBuffer 439 | ) 440 | } 441 | } 442 | 443 | /** 444 | * Forces the model to generate a response. 445 | */ 446 | createResponse() { 447 | assert(!this.isRelay, 'Unable to create a response directly in relay mode') 448 | 449 | if (!this.getTurnDetectionType() && this.inputAudioBuffer.byteLength > 0) { 450 | this.realtime.send('input_audio_buffer.commit') 451 | this.conversation.queueInputAudio(this.inputAudioBuffer) 452 | this.inputAudioBuffer = new Int16Array(0) 453 | } 454 | 455 | this.realtime.send('response.create') 456 | } 457 | 458 | /** 459 | * Cancels the ongoing server generation and truncates ongoing generation, if 460 | * applicable. 461 | * 462 | * If no id provided, will simply call `cancel_generation` command. 463 | */ 464 | cancelResponse( 465 | /** The ID of the item to cancel. */ 466 | id?: string, 467 | /** The number of samples to truncate past for the ongoing generation. */ 468 | sampleCount = 0 469 | ): Realtime.AssistantItem | undefined { 470 | assert(!this.isRelay, 'Unable to cancel a response directly in relay mode') 471 | 472 | if (!id) { 473 | this.realtime.send('response.cancel') 474 | return 475 | } 476 | 477 | const item = this.conversation.getItem(id) 478 | assert(item, `Could not find item "${id}"`) 479 | assert( 480 | item.type === 'message', 481 | `Can only cancelResponse messages with type "message"` 482 | ) 483 | assert( 484 | item.role === 'assistant', 485 | `Can only cancelResponse messages with role "assistant"` 486 | ) 487 | 488 | this.realtime.send('response.cancel') 489 | const audioIndex = item.content.findIndex((c) => c.type === 'audio') 490 | assert(audioIndex >= 0, `Could not find audio on item ${id} to cancel`) 491 | 492 | this.realtime.send('conversation.item.truncate', { 493 | item_id: id, 494 | content_index: audioIndex, 495 | audio_end_ms: Math.floor( 496 | (sampleCount / this.conversation.defaultFrequency) * 1000 497 | ) 498 | }) 499 | 500 | return item 501 | } 502 | 503 | /** 504 | * Utility for waiting for the next `conversation.item.appended` event to be 505 | * triggered by the server. 506 | */ 507 | async waitForNextItem(): Promise { 508 | const event = await this.waitForNext('conversation.item.appended') 509 | return event.item 510 | } 511 | 512 | /** 513 | * Utility for waiting for the next `conversation.item.completed` event to be 514 | * triggered by the server. 
515 | */ 516 | async waitForNextCompletedItem(): Promise { 517 | const event = await this.waitForNext('conversation.item.completed') 518 | return event.item 519 | } 520 | } 521 | -------------------------------------------------------------------------------- /src/conversation.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/naming-convention */ 2 | import type { RealtimeServerEvents } from './events' 3 | import type { EventHandlerResult, FormattedItem, Realtime } from './types' 4 | import { assert, base64ToArrayBuffer, mergeInt16Arrays } from './utils' 5 | 6 | /** 7 | * RealtimeConversation holds conversation history and performs event 8 | * validation for RealtimeAPI. 9 | */ 10 | export class RealtimeConversation { 11 | readonly defaultFrequency = 24_000 // 24,000 Hz 12 | 13 | readonly frequency: number 14 | readonly debug: boolean 15 | 16 | itemLookup: Record = {} 17 | items: FormattedItem[] = [] 18 | responseLookup: Record = {} 19 | responses: Realtime.Response[] = [] 20 | queuedSpeechItems: Record< 21 | string, 22 | { audio_start_ms: number; audio_end_ms?: number; audio?: Int16Array } 23 | > = {} 24 | queuedTranscriptItems: Record = {} 25 | queuedInputAudio?: Int16Array 26 | 27 | constructor({ 28 | frequency = this.defaultFrequency, 29 | debug = false 30 | }: { 31 | frequency?: number 32 | debug?: boolean 33 | } = {}) { 34 | // Default to 24,000 Hz if not provided 35 | if (frequency === undefined) { 36 | frequency = this.defaultFrequency 37 | } 38 | assert(frequency > 0, `Invalid frequency: ${frequency}`) 39 | 40 | this.frequency = frequency 41 | this.debug = debug 42 | 43 | this.clear() 44 | } 45 | 46 | /** 47 | * Clears the conversation history and resets to defaults. 48 | */ 49 | clear() { 50 | this.itemLookup = {} 51 | this.items = [] 52 | this.responseLookup = {} 53 | this.responses = [] 54 | this.queuedSpeechItems = {} 55 | this.queuedTranscriptItems = {} 56 | this.queuedInputAudio = undefined 57 | } 58 | 59 | /** 60 | * Queue input audio for manual speech event. 61 | */ 62 | queueInputAudio(inputAudio: Int16Array) { 63 | this.queuedInputAudio = inputAudio 64 | } 65 | 66 | /** 67 | * Process an event from the WebSocket server and compose items. 68 | */ 69 | processEvent( 70 | event: RealtimeServerEvents.ServerEvent, 71 | ...args: any[] 72 | ): EventHandlerResult { 73 | assert(event.event_id, `Missing "event_id" on event`) 74 | assert(event.type, `Missing "type" on event`) 75 | 76 | const eventProcessor = this.EventProcessors[event.type] 77 | assert(eventProcessor, `Missing event processor for "${event.type}"`) 78 | 79 | try { 80 | return eventProcessor.call(this, event as any, ...args) 81 | } catch (err: any) { 82 | if (this.debug) { 83 | console.error( 84 | `Error processing event "${event.type}":`, 85 | err.message, 86 | event 87 | ) 88 | } 89 | 90 | return {} 91 | } 92 | } 93 | 94 | /** 95 | * Retrieves an item by ID. 96 | */ 97 | getItem(id: string): FormattedItem | undefined { 98 | return this.itemLookup[id] 99 | } 100 | 101 | /** 102 | * Retrieves all items in the conversation. 103 | */ 104 | getItems(): FormattedItem[] { 105 | return this.items.slice() 106 | } 107 | 108 | /** Event handlers. 
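Maps each server event type to a processor that updates the local conversation state and returns the affected item and any delta.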
*/ 109 | EventProcessors: Partial<{ 110 | [K in keyof RealtimeServerEvents.EventMap]: ( 111 | event: RealtimeServerEvents.EventMap[K], 112 | ...args: any[] 113 | ) => EventHandlerResult 114 | }> = { 115 | 'conversation.item.created': (event) => { 116 | const { item } = event 117 | const newItem: FormattedItem = { 118 | ...structuredClone(item), 119 | formatted: { 120 | audio: new Int16Array(0), 121 | text: '', 122 | transcript: '' 123 | } 124 | } 125 | 126 | if (!this.itemLookup[newItem.id]) { 127 | this.itemLookup[newItem.id] = newItem 128 | this.items.push(newItem) 129 | } 130 | 131 | // If we have a speech item, can populate audio 132 | if (this.queuedSpeechItems[newItem.id]?.audio) { 133 | newItem.formatted.audio = this.queuedSpeechItems[newItem.id]!.audio! 134 | delete this.queuedSpeechItems[newItem.id] // free up some memory 135 | } 136 | 137 | // Populate formatted text if it comes out on creation 138 | if (newItem.content) { 139 | const textContent = newItem.content.filter( 140 | (c) => c.type === 'text' || c.type === 'input_text' 141 | ) as Array 142 | 143 | for (const content of textContent) { 144 | newItem.formatted.text += content.text 145 | } 146 | } 147 | 148 | // If we have a transcript item, can pre-populate transcript 149 | if (this.queuedTranscriptItems[newItem.id]) { 150 | newItem.formatted.transcript = 151 | this.queuedTranscriptItems[newItem.id]!.transcript 152 | delete this.queuedTranscriptItems[newItem.id] 153 | } 154 | 155 | if (newItem.type === 'message') { 156 | if (newItem.role === 'user') { 157 | newItem.status = 'completed' 158 | if (this.queuedInputAudio) { 159 | newItem.formatted.audio = this.queuedInputAudio 160 | this.queuedInputAudio = undefined 161 | } 162 | } else { 163 | newItem.status = 'in_progress' 164 | } 165 | } else if (newItem.type === 'function_call') { 166 | newItem.formatted.tool = { 167 | type: 'function', 168 | name: newItem.name, 169 | call_id: newItem.call_id, 170 | arguments: '' 171 | } 172 | 173 | newItem.status = 'in_progress' 174 | } else if (newItem.type === 'function_call_output') { 175 | newItem.status = 'completed' 176 | newItem.formatted.output = newItem.output 177 | } 178 | 179 | return { item: newItem } 180 | }, 181 | 182 | 'conversation.item.truncated': (event) => { 183 | const { item_id, audio_end_ms } = event 184 | const item = this.itemLookup[item_id] 185 | if (!item) { 186 | throw new Error(`item.truncated: Item "${item_id}" not found`) 187 | } 188 | 189 | const endIndex = Math.floor((audio_end_ms * this.frequency) / 1000) 190 | item.formatted.transcript = '' 191 | item.formatted.audio = item.formatted.audio!.slice(0, endIndex) 192 | 193 | return { item } 194 | }, 195 | 196 | 'conversation.item.deleted': (event) => { 197 | const { item_id } = event 198 | const item = this.itemLookup[item_id] 199 | if (!item) { 200 | throw new Error(`item.deleted: Item "${item_id}" not found`) 201 | } 202 | 203 | delete this.itemLookup[item.id] 204 | const index = this.items.indexOf(item) 205 | 206 | if (index >= 0) { 207 | this.items.splice(index, 1) 208 | } 209 | 210 | return { item } 211 | }, 212 | 213 | 'conversation.item.input_audio_transcription.completed': (event) => { 214 | const { item_id, content_index, transcript } = event 215 | const item = this.itemLookup[item_id] 216 | 217 | // We use a single space to represent an empty transcript for .formatted values 218 | // Otherwise it looks like no transcript provided 219 | const formattedTranscript = transcript || ' ' 220 | 221 | if (!item) { 222 | // We can receive transcripts in VAD 
mode before item.created 223 | // This happens specifically when audio is empty 224 | this.queuedTranscriptItems[item_id] = { 225 | transcript: formattedTranscript 226 | } 227 | 228 | return {} 229 | } else { 230 | if (item.content[content_index]) { 231 | ;( 232 | item.content[content_index] as Realtime.AudioContentPart 233 | ).transcript = transcript 234 | } 235 | item.formatted.transcript = formattedTranscript 236 | return { item, delta: { transcript } } 237 | } 238 | }, 239 | 240 | 'input_audio_buffer.speech_started': (event) => { 241 | const { item_id, audio_start_ms } = event 242 | const item = this.itemLookup[item_id] 243 | this.queuedSpeechItems[item_id] = { audio_start_ms } 244 | return { item } 245 | }, 246 | 247 | 'input_audio_buffer.speech_stopped': ( 248 | event, 249 | inputAudioBuffer: Int16Array 250 | ) => { 251 | const { item_id, audio_end_ms } = event 252 | const item = this.itemLookup[item_id] 253 | 254 | if (!this.queuedSpeechItems[item_id]) { 255 | this.queuedSpeechItems[item_id] = { audio_start_ms: audio_end_ms } 256 | } 257 | 258 | const speech = this.queuedSpeechItems[item_id] 259 | assert(speech, `Speech item not found for "${item_id}"`) 260 | speech.audio_end_ms = audio_end_ms 261 | 262 | if (inputAudioBuffer) { 263 | const startIndex = Math.floor( 264 | (speech.audio_start_ms * this.frequency) / 1000 265 | ) 266 | const endIndex = Math.floor( 267 | (speech.audio_end_ms * this.frequency) / 1000 268 | ) 269 | 270 | speech.audio = inputAudioBuffer.slice(startIndex, endIndex) 271 | } 272 | 273 | return { item } 274 | }, 275 | 276 | 'response.created': (event) => { 277 | const { response } = event 278 | 279 | if (!this.responseLookup[response.id]) { 280 | this.responseLookup[response.id] = response 281 | this.responses.push(response) 282 | } 283 | 284 | return { response } 285 | }, 286 | 287 | 'response.output_item.added': (event) => { 288 | const { response_id, item } = event 289 | const response = this.responseLookup[response_id] 290 | 291 | if (!response) { 292 | throw new Error( 293 | `response.output_item.added: Response "${response_id}" not found` 294 | ) 295 | } 296 | 297 | response.output.push(item) 298 | return { item, response } 299 | }, 300 | 301 | 'response.output_item.done': (event) => { 302 | const { item } = event 303 | if (!item) { 304 | throw new Error(`response.output_item.done: Missing "item"`) 305 | } 306 | 307 | const foundItem = this.itemLookup[item.id] 308 | if (!foundItem) { 309 | throw new Error( 310 | `response.output_item.done: Item "${item.id}" not found` 311 | ) 312 | } 313 | 314 | foundItem.status = item.status 315 | return { item: foundItem } 316 | }, 317 | 318 | 'response.content_part.added': (event) => { 319 | const { item_id, part } = event 320 | const item = this.itemLookup[item_id] 321 | if (!item) { 322 | throw new Error( 323 | `response.content_part.added: Item "${item_id}" not found` 324 | ) 325 | } 326 | 327 | item.content.push(part as any) 328 | return { item } 329 | }, 330 | 331 | 'response.audio_transcript.delta': (event) => { 332 | const { item_id, content_index, delta } = event 333 | const item = this.itemLookup[item_id] 334 | if (!item) { 335 | throw new Error( 336 | `response.audio_transcript.delta: Item "${item_id}" not found` 337 | ) 338 | } 339 | 340 | ;(item.content[content_index] as Realtime.AudioContentPart).transcript += 341 | delta 342 | item.formatted.transcript += delta 343 | 344 | return { item, delta: { transcript: delta } } 345 | }, 346 | 347 | 'response.audio.delta': (event) => { 348 | const { item_id, 
content_index: _, delta } = event 349 | const item = this.itemLookup[item_id] 350 | if (!item) { 351 | throw new Error(`response.audio.delta: Item "${item_id}" not found`) 352 | } 353 | 354 | // This never gets renderered; we care about the formatted data instead. 355 | // (item.content[content_index] as Realtime.AudioContentPart)!.audio += delta; 356 | 357 | const arrayBuffer = base64ToArrayBuffer(delta) 358 | const appendValues = new Int16Array(arrayBuffer) 359 | item.formatted.audio = mergeInt16Arrays( 360 | item.formatted.audio, 361 | appendValues 362 | ) 363 | 364 | return { item, delta: { audio: appendValues } } 365 | }, 366 | 367 | 'response.text.delta': (event) => { 368 | const { item_id, content_index, delta } = event 369 | const item = this.itemLookup[item_id] 370 | if (!item) { 371 | throw new Error(`response.text.delta: Item "${item_id}" not found`) 372 | } 373 | 374 | ;(item.content[content_index] as Realtime.TextContentPart).text += delta 375 | item.formatted.text += delta 376 | 377 | return { item, delta: { text: delta } } 378 | }, 379 | 380 | 'response.function_call_arguments.delta': (event) => { 381 | const { item_id, delta } = event 382 | const item = this.itemLookup[item_id] 383 | if (!item) { 384 | throw new Error( 385 | `response.function_call_arguments.delta: Item "${item_id}" not found` 386 | ) 387 | } 388 | 389 | ;(item as Realtime.FunctionCallItem).arguments += delta 390 | item.formatted.tool!.arguments += delta 391 | 392 | return { item, delta: { arguments: delta } } 393 | } 394 | } 395 | } 396 | -------------------------------------------------------------------------------- /src/event-handler.ts: -------------------------------------------------------------------------------- 1 | import type { Event } from './events' 2 | import type { MaybePromise } from './types' 3 | 4 | export type EventHandlerCallback = ( 5 | event: EventData 6 | ) => MaybePromise 7 | 8 | /** 9 | * Basic event handler. 10 | */ 11 | export class RealtimeEventHandler< 12 | EventType extends string = string, 13 | EventData extends Event = Event, 14 | EventMap extends Record = Record 15 | > { 16 | eventHandlers: Record[]> = 17 | {} as Record[]> 18 | 19 | /** 20 | * Clears all event handlers. 21 | */ 22 | clearEventHandlers() { 23 | this.eventHandlers = {} as Record< 24 | EventType, 25 | EventHandlerCallback[] 26 | > 27 | } 28 | 29 | /** 30 | * Adds a listener for a specific event. 31 | */ 32 | on< 33 | E extends EventType, 34 | D extends EventData = EventMap[E] extends EventData 35 | ? EventMap[E] 36 | : EventData 37 | >(eventName: E, callback: EventHandlerCallback) { 38 | this.eventHandlers[eventName] = this.eventHandlers[eventName] || [] 39 | this.eventHandlers[eventName].push( 40 | callback as EventHandlerCallback 41 | ) 42 | } 43 | 44 | /** 45 | * Adds a listener for a single occurrence of an event. 46 | */ 47 | once< 48 | E extends EventType, 49 | D extends EventData = EventMap[E] extends EventData 50 | ? EventMap[E] 51 | : EventData 52 | >(eventName: E, callback: EventHandlerCallback) { 53 | const onceCallback = (event: D) => { 54 | this.off(eventName, onceCallback) 55 | return callback(event) 56 | } 57 | this.on(eventName, onceCallback) 58 | } 59 | 60 | /** 61 | * Removes a listener for an event. 62 | * Calling without a callback will remove all listeners for the event. 63 | */ 64 | off< 65 | E extends EventType, 66 | D extends EventData = EventMap[E] extends EventData 67 | ? 
EventMap[E] 68 | : EventData 69 | >(eventName: E, callback?: EventHandlerCallback) { 70 | const handlers = this.eventHandlers[eventName] || [] 71 | if (callback) { 72 | const index = handlers.indexOf( 73 | callback as EventHandlerCallback 74 | ) 75 | if (index < 0) { 76 | throw new Error( 77 | `Could not turn off specified event listener for "${eventName}": not found as a listener` 78 | ) 79 | } 80 | 81 | handlers.splice(index, 1) 82 | } else { 83 | delete this.eventHandlers[eventName] 84 | } 85 | } 86 | 87 | /** 88 | * Waits for next event of a specific type and returns the payload. 89 | */ 90 | async waitForNext< 91 | E extends EventType, 92 | D extends EventData = EventMap[E] extends EventData 93 | ? EventMap[E] 94 | : EventData 95 | >(eventName: E, { timeoutMs }: { timeoutMs?: number } = {}): Promise { 96 | return new Promise((resolve, reject) => { 97 | this.once(eventName, resolve as any) 98 | 99 | if (timeoutMs !== undefined) { 100 | setTimeout( 101 | () => reject(new Error(`Timeout waiting for "${eventName}"`)), 102 | timeoutMs 103 | ) 104 | } 105 | }) 106 | } 107 | 108 | /** 109 | * Executes all events handlers in the order they were added. 110 | */ 111 | dispatch< 112 | E extends EventType, 113 | D extends EventData = EventMap[E] extends EventData 114 | ? EventMap[E] 115 | : EventData 116 | >(eventName: E, event: D) { 117 | const handlers = this.eventHandlers[eventName] || [] 118 | for (const handler of handlers) { 119 | handler(event) 120 | } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/events.ts: -------------------------------------------------------------------------------- 1 | import type { EventHandlerResult, FormattedItem, Realtime } from './types' 2 | 3 | export interface Event { 4 | /** The event type. */ 5 | type: string 6 | } 7 | 8 | export type RealtimeEvent = RealtimeCustomEvents.CustomEvent & { 9 | type: 'realtime.event' 10 | source: 'server' | 'client' 11 | time: string 12 | event: Event 13 | } & ( 14 | | { 15 | source: 'server' 16 | event: RealtimeServerEvents.EventMap[RealtimeServerEvents.EventType] 17 | } 18 | | { 19 | source: 'client' 20 | event: RealtimeClientEvents.EventMap[RealtimeClientEvents.EventType] 21 | } 22 | ) 23 | 24 | // See https://platform.openai.com/docs/guides/realtime/events 25 | export namespace RealtimeClientEvents { 26 | /** Event types sent by the client. */ 27 | export type EventType = 28 | | 'session.update' 29 | | 'input_audio_buffer.append' 30 | | 'input_audio_buffer.commit' 31 | | 'input_audio_buffer.clear' 32 | | 'conversation.item.create' 33 | | 'conversation.item.truncate' 34 | | 'conversation.item.delete' 35 | | 'response.create' 36 | | 'response.cancel' 37 | 38 | export type EventMap = { 39 | 'session.update': SessionUpdateEvent 40 | 'input_audio_buffer.append': InputAudioBufferAppendEvent 41 | 'input_audio_buffer.commit': InputAudioBufferCommitEvent 42 | 'input_audio_buffer.clear': InputAudioBufferClearEvent 43 | 'conversation.item.create': ConversationItemCreateEvent 44 | 'conversation.item.truncate': ConversationItemTruncateEvent 45 | 'conversation.item.delete': ConversationItemDeleteEvent 46 | 'response.create': ResponseCreateEvent 47 | 'response.cancel': ResponseCancelEvent 48 | } 49 | 50 | // Same as EventMap but every key is prefixed by 'client.' 51 | export type PrefixedEventMap = { 52 | [K in keyof EventMap as `client.${Extract}`]: EventMap[K] 53 | } 54 | 55 | export interface ClientEvent extends Event { 56 | /** The event type. 
*/ 57 | type: EventType 58 | 59 | /** Optional client-generated ID used to identify this event. */ 60 | event_id?: string 61 | } 62 | 63 | /** Send this event to update the session’s default configuration. */ 64 | export interface SessionUpdateEvent extends ClientEvent { 65 | type: 'session.update' 66 | 67 | /** Session configuration to update. */ 68 | session: Realtime.SessionConfig 69 | } 70 | 71 | /** Send this event to append audio bytes to the input audio buffer. */ 72 | export interface InputAudioBufferAppendEvent extends ClientEvent { 73 | type: 'input_audio_buffer.append' 74 | 75 | /** Base64-encoded audio bytes. */ 76 | audio: string 77 | } 78 | 79 | /** Send this event to commit audio bytes to a user message. */ 80 | export interface InputAudioBufferCommitEvent extends ClientEvent { 81 | type: 'input_audio_buffer.commit' 82 | } 83 | 84 | /** Send this event to clear the audio bytes in the buffer. */ 85 | export interface InputAudioBufferClearEvent extends ClientEvent { 86 | type: 'input_audio_buffer.clear' 87 | } 88 | 89 | /** Send this event when adding an item to the conversation. */ 90 | export interface ConversationItemCreateEvent extends ClientEvent { 91 | type: 'conversation.item.create' 92 | 93 | /** The ID of the preceding item after which the new item will be inserted. */ 94 | previous_item_id?: string 95 | 96 | /** The item to add to the conversation. */ 97 | item?: Realtime.ClientItem 98 | } 99 | 100 | /** 101 | * Send this event when you want to truncate a previous assistant message’s audio. 102 | */ 103 | export interface ConversationItemTruncateEvent extends ClientEvent { 104 | type: 'conversation.item.truncate' 105 | 106 | /** The ID of the assistant message item to truncate. */ 107 | item_id: string 108 | 109 | /** The index of the content part to truncate. */ 110 | content_index: number 111 | 112 | /** Inclusive duration up to which audio is truncated, in milliseconds. */ 113 | audio_end_ms: number 114 | } 115 | 116 | /** 117 | * Send this event when you want to remove any item from the conversation history. 118 | */ 119 | export interface ConversationItemDeleteEvent extends ClientEvent { 120 | type: 'conversation.item.delete' 121 | 122 | /** The ID of the item to delete. */ 123 | item_id: string 124 | } 125 | 126 | /** Send this event to trigger a response generation. */ 127 | export interface ResponseCreateEvent extends ClientEvent { 128 | type: 'response.create' 129 | 130 | /** Configuration for the response. */ 131 | response: Realtime.ResponseConfig 132 | } 133 | 134 | /** Send this event to cancel an in-progress response. */ 135 | export interface ResponseCancelEvent extends ClientEvent { 136 | type: 'response.cancel' 137 | } 138 | } 139 | 140 | // See // See https://platform.openai.com/docs/guides/realtime/events 141 | export namespace RealtimeServerEvents { 142 | /** Event types sent by the server. 
*/ 143 | export type EventType = 144 | | 'error' 145 | | 'session.created' 146 | | 'session.updated' 147 | | 'conversation.created' 148 | | 'conversation.item.created' 149 | | 'conversation.item.input_audio_transcription.completed' 150 | | 'conversation.item.input_audio_transcription.failed' 151 | | 'conversation.item.truncated' 152 | | 'conversation.item.deleted' 153 | | 'input_audio_buffer.committed' 154 | | 'input_audio_buffer.cleared' 155 | | 'input_audio_buffer.speech_started' 156 | | 'input_audio_buffer.speech_stopped' 157 | | 'response.created' 158 | | 'response.done' 159 | | 'response.output_item.added' 160 | | 'response.output_item.done' 161 | | 'response.content_part.added' 162 | | 'response.content_part.done' 163 | | 'response.text.delta' 164 | | 'response.text.done' 165 | | 'response.audio_transcript.delta' 166 | | 'response.audio_transcript.done' 167 | | 'response.audio.delta' 168 | | 'response.audio.done' 169 | | 'response.function_call_arguments.delta' 170 | | 'response.function_call_arguments.done' 171 | | 'rate_limits.updated' 172 | 173 | export type EventMap = { 174 | error: ErrorEvent 175 | 'session.created': SessionCreatedEvent 176 | 'session.updated': SessionUpdatedEvent 177 | 'conversation.created': ConversationCreatedEvent 178 | 'conversation.item.created': ConversationItemCreatedEvent 179 | 'conversation.item.input_audio_transcription.completed': ConversationItemInputAudioTranscriptionCompletedEvent 180 | 'conversation.item.input_audio_transcription.failed': ConversationItemInputAudioTranscriptionFailedEvent 181 | 'conversation.item.truncated': ConversationItemTruncatedEvent 182 | 'conversation.item.deleted': ConversationItemDeletedEvent 183 | 'input_audio_buffer.committed': InputAudioBufferCommittedEvent 184 | 'input_audio_buffer.cleared': InputAudioBufferClearedEvent 185 | 'input_audio_buffer.speech_started': InputAudioBufferSpeechStartedEvent 186 | 'input_audio_buffer.speech_stopped': InputAudioBufferSpeechStoppedEvent 187 | 'response.created': ResponseCreatedEvent 188 | 'response.done': ResponseDoneEvent 189 | 'response.output_item.added': ResponseOutputItemAddedEvent 190 | 'response.output_item.done': ResponseOutputItemDoneEvent 191 | 'response.content_part.added': ResponseContentPartItemAddedEvent 192 | 'response.content_part.done': ResponseContentPartItemDoneEvent 193 | 'response.text.delta': ResponseTextDeltaEvent 194 | 'response.text.done': ResponseTextDoneEvent 195 | 'response.audio_transcript.delta': ResponseAudioTranscriptDeltaEvent 196 | 'response.audio_transcript.done': ResponseAudioTranscriptDoneEvent 197 | 'response.audio.delta': ResponseAudioDeltaEvent 198 | 'response.audio.done': ResponseAudioDoneEvent 199 | 'response.function_call_arguments.delta': ResponseFunctionCallArgumentsDeltaEvent 200 | 'response.function_call_arguments.done': ResponseFunctionCallArgumentsDoneEvent 201 | 'rate_limits.updated': RateLimitsUpdatedEvent 202 | } 203 | 204 | // Same as EventMap but every key is prefixed by 'server.' 205 | export type PrefixedEventMap = { 206 | [K in keyof EventMap as `server.${Extract}`]: EventMap[K] 207 | } 208 | 209 | export interface ServerEvent extends Event { 210 | /** The event type. */ 211 | type: EventType 212 | 213 | /** The unique ID of the server event. */ 214 | event_id: string 215 | } 216 | 217 | /** Returned when an error occurs. */ 218 | export interface ErrorEvent extends ServerEvent { 219 | type: 'error' 220 | 221 | /** Details of the error. 
*/ 222 | error: Realtime.Error 223 | } 224 | 225 | /** 226 | * Returned when a session is created. Emitted automatically when a new 227 | * connection is established. 228 | */ 229 | export interface SessionCreatedEvent extends ServerEvent { 230 | type: 'session.created' 231 | 232 | /** The session resource. */ 233 | session: Realtime.Session 234 | } 235 | 236 | /** 237 | * Returned when a session is updated. 238 | */ 239 | export interface SessionUpdatedEvent extends ServerEvent { 240 | type: 'session.updated' 241 | 242 | /** The updated session resource. */ 243 | session: Realtime.Session 244 | } 245 | 246 | /** 247 | * Returned when a conversation is created. Emitted right after session 248 | * creation. 249 | */ 250 | export interface ConversationCreatedEvent extends ServerEvent { 251 | type: 'conversation.created' 252 | 253 | /** The conversation resource. */ 254 | conversation: Realtime.Conversation 255 | } 256 | 257 | /** 258 | * Returned when a conversation item is created. 259 | */ 260 | export interface ConversationItemCreatedEvent extends ServerEvent { 261 | type: 'conversation.item.created' 262 | 263 | /** The ID of the preceding item. */ 264 | previous_item_id?: string 265 | 266 | /** The item that was created. */ 267 | item: Realtime.Item 268 | } 269 | 270 | /** 271 | * Returned when input audio transcription is enabled and a transcription succeeds. 272 | */ 273 | export interface ConversationItemInputAudioTranscriptionCompletedEvent 274 | extends ServerEvent { 275 | type: 'conversation.item.input_audio_transcription.completed' 276 | 277 | /** The ID of the user message item. */ 278 | item_id: string 279 | 280 | /** The index of the content part containing the audio. */ 281 | content_index: number 282 | 283 | /** The transcribed text. */ 284 | transcript: string 285 | } 286 | 287 | /** 288 | * Returned when input audio transcription is configured, and a transcription 289 | * request for a user message failed. 290 | */ 291 | export interface ConversationItemInputAudioTranscriptionFailedEvent 292 | extends ServerEvent { 293 | type: 'conversation.item.input_audio_transcription.failed' 294 | 295 | /** The ID of the user message item. */ 296 | item_id: string 297 | 298 | /** The index of the content part containing the audio. */ 299 | content_index: number 300 | 301 | /** Details of the transcription error. */ 302 | error: Realtime.Error 303 | } 304 | 305 | /** 306 | * Returned when an earlier assistant audio message item is truncated by the client. 307 | */ 308 | export interface ConversationItemTruncatedEvent extends ServerEvent { 309 | type: 'conversation.item.truncated' 310 | 311 | /** The ID of the assistant message item that was truncated. */ 312 | item_id: string 313 | 314 | /** The index of the content part thtat was truncated. */ 315 | content_index: number 316 | 317 | /** The duration up to which the audio was truncated, in milliseconds. */ 318 | audio_end_ms: number 319 | } 320 | 321 | /** 322 | * Returned when an item in the conversation is deleted. 323 | */ 324 | export interface ConversationItemDeletedEvent extends ServerEvent { 325 | type: 'conversation.item.deleted' 326 | 327 | /** The ID of the item that was deleted. */ 328 | item_id: string 329 | } 330 | 331 | /** 332 | * Returned when an input audio buffer is committed, either by the client or 333 | * automatically in server VAD mode. 
334 | */ 335 | export interface InputAudioBufferCommittedEvent extends ServerEvent { 336 | type: 'input_audio_buffer.committed' 337 | 338 | /** The ID of the preceding item after which the new item will be inserted. */ 339 | previous_item_id?: string 340 | 341 | /** The ID of the user message item that will be created. */ 342 | item_id: string 343 | } 344 | 345 | /** 346 | * Returned when the input audio buffer is cleared by the client. 347 | */ 348 | export interface InputAudioBufferClearedEvent extends ServerEvent { 349 | type: 'input_audio_buffer.cleared' 350 | } 351 | 352 | /** 353 | * Returned in server turn detection mode when speech is detected. 354 | */ 355 | export interface InputAudioBufferSpeechStartedEvent extends ServerEvent { 356 | type: 'input_audio_buffer.speech_started' 357 | 358 | /** The ID of the user message item that will be created when speech stops. */ 359 | item_id: string 360 | 361 | /** Milliseconds since the session started when speech was detected. */ 362 | audio_start_ms: number 363 | } 364 | 365 | /** 366 | * Returned in server turn detection mode when speech stops. 367 | */ 368 | export interface InputAudioBufferSpeechStoppedEvent extends ServerEvent { 369 | type: 'input_audio_buffer.speech_stopped' 370 | 371 | /** The ID of the user message item that will be created. */ 372 | item_id: string 373 | 374 | /** Milliseconds since the session started when speech stopped. */ 375 | audio_end_ms: number 376 | } 377 | 378 | /** 379 | * Returned when a new Response is created. The first event of response 380 | * creation, where the response is in an initial state of "in_progress". 381 | */ 382 | export interface ResponseCreatedEvent extends ServerEvent { 383 | type: 'response.created' 384 | 385 | /** The response resource. */ 386 | response: Realtime.Response 387 | } 388 | 389 | /** 390 | * Returned when a Response is done streaming. Always emitted, no matter the 391 | * final state. 392 | */ 393 | export interface ResponseDoneEvent extends ServerEvent { 394 | type: 'response.done' 395 | 396 | /** The response resource. */ 397 | response: Realtime.Response 398 | } 399 | 400 | /** 401 | * Returned when a new Item is created during response generation. 402 | */ 403 | export interface ResponseOutputItemAddedEvent extends ServerEvent { 404 | type: 'response.output_item.added' 405 | 406 | /** The ID of the response. */ 407 | response_id: string 408 | 409 | /** The index of the output item in the response. */ 410 | output_index: string 411 | 412 | /** The item that was added. */ 413 | item: Realtime.Item 414 | } 415 | 416 | /** 417 | * Returned when an Item is done streaming. Also emitted when a Response is 418 | * interrupted, incomplete, or cancelled. 419 | */ 420 | export interface ResponseOutputItemDoneEvent extends ServerEvent { 421 | type: 'response.output_item.done' 422 | 423 | /** The ID of the response. */ 424 | response_id: string 425 | 426 | /** The index of the output item in the response. */ 427 | output_index: string 428 | 429 | /** The item that was added. */ 430 | item: Realtime.Item 431 | } 432 | 433 | /** 434 | * Returned when a new content part is added to an assistant message item 435 | * during response generation. 436 | */ 437 | export interface ResponseContentPartItemAddedEvent extends ServerEvent { 438 | type: 'response.content_part.added' 439 | 440 | /** The ID of the response. */ 441 | response_id: string 442 | 443 | /** The ID of the item. */ 444 | item_id: string 445 | 446 | /** The index of the output item in the response. 
*/ 447 | output_index: string 448 | 449 | /** The index of the content part in the item's content array. */ 450 | content_index: number 451 | 452 | /** The content part. */ 453 | part: Realtime.ContentPart 454 | } 455 | 456 | /** 457 | * Returned when a content part is done streaming in an assistant message item. 458 | * Also emitted when a Response is interrupted, incomplete, or cancelled. 459 | */ 460 | export interface ResponseContentPartItemDoneEvent extends ServerEvent { 461 | type: 'response.content_part.done' 462 | 463 | /** The ID of the response. */ 464 | response_id: string 465 | 466 | /** The ID of the item. */ 467 | item_id: string 468 | 469 | /** The index of the output item in the response. */ 470 | output_index: string 471 | 472 | /** The index of the content part in the item's content array. */ 473 | content_index: number 474 | 475 | /** The content part. */ 476 | part: Realtime.ContentPart 477 | } 478 | 479 | /** 480 | * Returned when the text value of a "text" content part is updated. 481 | */ 482 | export interface ResponseTextDeltaEvent extends ServerEvent { 483 | type: 'response.text.delta' 484 | 485 | /** The ID of the response. */ 486 | response_id: string 487 | 488 | /** The ID of the item. */ 489 | item_id: string 490 | 491 | /** The index of the output item in the response. */ 492 | output_index: string 493 | 494 | /** The index of the content part in the item's content array. */ 495 | content_index: number 496 | 497 | /** The text delta. */ 498 | delta: string 499 | } 500 | 501 | /** 502 | * Returned when the text value of a "text" content part is done streaming. 503 | * Also emitted when a Response is interrupted, incomplete, or cancelled. 504 | */ 505 | export interface ResponseTextDoneEvent extends ServerEvent { 506 | type: 'response.text.done' 507 | 508 | /** The ID of the response. */ 509 | response_id: string 510 | 511 | /** The ID of the item. */ 512 | item_id: string 513 | 514 | /** The index of the output item in the response. */ 515 | output_index: string 516 | 517 | /** The index of the content part in the item's content array. */ 518 | content_index: number 519 | 520 | /** The final text content. */ 521 | text: string 522 | } 523 | 524 | /** 525 | * Returned when the model-generated transcription of audio output is updated. 526 | */ 527 | export interface ResponseAudioTranscriptDeltaEvent extends ServerEvent { 528 | type: 'response.audio_transcript.delta' 529 | 530 | /** The ID of the response. */ 531 | response_id: string 532 | 533 | /** The ID of the item. */ 534 | item_id: string 535 | 536 | /** The index of the output item in the response. */ 537 | output_index: string 538 | 539 | /** The index of the content part in the item's content array. */ 540 | content_index: number 541 | 542 | /** The transcript delta. */ 543 | delta: string 544 | } 545 | 546 | /** 547 | * Returned when the model-generated transcription of audio output is done 548 | * streaming. Also emitted when a Response is interrupted, incomplete, or 549 | * cancelled. 550 | */ 551 | export interface ResponseAudioTranscriptDoneEvent extends ServerEvent { 552 | type: 'response.audio_transcript.done' 553 | 554 | /** The ID of the response. */ 555 | response_id: string 556 | 557 | /** The ID of the item. */ 558 | item_id: string 559 | 560 | /** The index of the output item in the response. */ 561 | output_index: string 562 | 563 | /** The index of the content part in the item's content array. */ 564 | content_index: number 565 | 566 | /** The final transcript. 
*/ 567 | transcript: string 568 | } 569 | 570 | /** 571 | * Returned when the model-generated audio is updated. 572 | */ 573 | export interface ResponseAudioDeltaEvent extends ServerEvent { 574 | type: 'response.audio.delta' 575 | 576 | /** The ID of the response. */ 577 | response_id: string 578 | 579 | /** The ID of the item. */ 580 | item_id: string 581 | 582 | /** The index of the output item in the response. */ 583 | output_index: string 584 | 585 | /** The index of the content part in the item's content array. */ 586 | content_index: number 587 | 588 | /** Base64-encoded audio data delta. */ 589 | delta: string 590 | } 591 | 592 | /** 593 | * Returned when the model-generated audio is done. Also emitted when a 594 | * Response is interrupted, incomplete, or cancelled. 595 | */ 596 | export interface ResponseAudioDoneEvent extends ServerEvent { 597 | type: 'response.audio.done' 598 | 599 | /** The ID of the response. */ 600 | response_id: string 601 | 602 | /** The ID of the item. */ 603 | item_id: string 604 | 605 | /** The index of the output item in the response. */ 606 | output_index: string 607 | 608 | /** The index of the content part in the item's content array. */ 609 | content_index: number 610 | } 611 | 612 | /** 613 | * Returned when the model-generated function call arguments are updated. 614 | */ 615 | export interface ResponseFunctionCallArgumentsDeltaEvent extends ServerEvent { 616 | type: 'response.function_call_arguments.delta' 617 | 618 | /** The ID of the response. */ 619 | response_id: string 620 | 621 | /** The ID of the item. */ 622 | item_id: string 623 | 624 | /** The index of the output item in the response. */ 625 | output_index: string 626 | 627 | /** The index of the content part in the item's content array. */ 628 | content_index: number 629 | 630 | /** The ID of the function call. */ 631 | call_id: string 632 | 633 | /** The arguments delta as a JSON string. */ 634 | delta: string 635 | } 636 | 637 | /** 638 | * Returned when the model-generated function call arguments are done streaming. 639 | * Also emitted when a Response is interrupted, incomplete, or cancelled. 640 | */ 641 | export interface ResponseFunctionCallArgumentsDoneEvent extends ServerEvent { 642 | type: 'response.function_call_arguments.done' 643 | 644 | /** The ID of the response. */ 645 | response_id: string 646 | 647 | /** The ID of the item. */ 648 | item_id: string 649 | 650 | /** The index of the output item in the response. */ 651 | output_index: string 652 | 653 | /** The index of the content part in the item's content array. */ 654 | content_index: number 655 | 656 | /** The ID of the function call. */ 657 | call_id: string 658 | 659 | /** The final arguments as a JSON string. */ 660 | arguments: string 661 | } 662 | 663 | /** 664 | * Emitted after every `response.done` event to indicate the updated rate 665 | * limits. 666 | */ 667 | export interface RateLimitsUpdatedEvent extends ServerEvent { 668 | type: 'rate_limits.updated' 669 | 670 | /** Array of rate limit information. */ 671 | rate_limits: Realtime.RateLimit[] 672 | } 673 | } 674 | 675 | export namespace RealtimeCustomEvents { 676 | /** Custom event types that are not part of the official realtime API. 
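   *
   * These events are emitted locally by the `RealtimeClient` on top of the raw
   * server events. A rough usage sketch (illustrative only; it assumes the
   * client's `.on()` event-emitter API and that `OPENAI_API_KEY` is set in the
   * environment):
   *
   * ```ts
   * import { RealtimeClient } from 'openai-realtime-api'
   *
   * const client = new RealtimeClient()
   *
   * // Fired whenever an item in the conversation gains new content.
   * client.on('conversation.updated', ({ item, delta }) => {
   *   console.log(item.id, item.formatted.transcript, delta)
   * })
   *
   * // Fired once an item has finished streaming.
   * client.on('conversation.item.completed', ({ item }) => {
   *   console.log('completed:', item.id)
   * })
   *
   * await client.connect()
   * ```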
 */
677 |   export type EventType =
678 |     | 'conversation.item.appended'
679 |     | 'conversation.item.completed'
680 |     | 'conversation.updated'
681 |     | 'conversation.interrupted'
682 |     | 'realtime.event'
683 | 
684 |   export type EventMap = {
685 |     'conversation.item.appended': ConversationItemAppendedEvent
686 |     'conversation.item.completed': ConversationItemCompletedEvent
687 |     'conversation.updated': ConversationUpdatedEvent
688 |     'conversation.interrupted': ConversationInterruptedEvent
689 |     'realtime.event':
690 |       | CustomServerEvent
691 |       | CustomClientEvent
692 |   }
693 | 
694 |   export interface CustomEvent extends Event {
695 |     /** The custom event type. */
696 |     type: EventType
697 |   }
698 | 
699 |   export type CustomServerEvent<T extends RealtimeServerEvents.EventType = RealtimeServerEvents.EventType> =
700 |     RealtimeEvent & {
701 |       type: 'realtime.event'
702 |       source: 'server'
703 |       time: string
704 |       event: RealtimeServerEvents.EventMap[T]
705 |     }
706 | 
707 |   export type CustomClientEvent<T extends RealtimeClientEvents.EventType = RealtimeClientEvents.EventType> =
708 |     RealtimeEvent & {
709 |       type: 'realtime.event'
710 |       source: 'client'
711 |       time: string
712 |       event: RealtimeClientEvents.EventMap[T]
713 |     }
714 | 
715 |   export interface ConversationItemAppendedEvent
716 |     extends CustomEvent,
717 |       Omit<EventHandlerResult, 'item'> {
718 |     type: 'conversation.item.appended'
719 |     item: FormattedItem
720 |   }
721 | 
722 |   export interface ConversationItemCompletedEvent
723 |     extends CustomEvent,
724 |       Omit<EventHandlerResult, 'item'> {
725 |     type: 'conversation.item.completed'
726 |     item: FormattedItem
727 |   }
728 | 
729 |   export interface ConversationUpdatedEvent
730 |     extends CustomEvent,
731 |       Omit<EventHandlerResult, 'item'> {
732 |     type: 'conversation.updated'
733 |     item: FormattedItem
734 |   }
735 | 
736 |   export interface ConversationInterruptedEvent
737 |     extends CustomEvent,
738 |       Omit<EventHandlerResult, 'item'> {
739 |     type: 'conversation.interrupted'
740 |   }
741 | }
742 | 
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
1 | export * from './api'
2 | export * from './client'
3 | export * from './conversation'
4 | export * from './event-handler'
5 | export type * from './events'
6 | export type * from './types'
7 | export * from './utils'
8 | 
--------------------------------------------------------------------------------
/src/node/index.ts:
--------------------------------------------------------------------------------
1 | export * from './relay-server'
2 | 
--------------------------------------------------------------------------------
/src/node/relay-server.ts:
--------------------------------------------------------------------------------
1 | import type { IncomingMessage } from 'node:http'
2 | 
3 | import { type WebSocket, WebSocketServer } from 'ws'
4 | 
5 | import type { RealtimeClient } from '../client'
6 | import type { RealtimeClientEvents } from '../events'
7 | import { assert, getEnv } from '../utils'
8 | 
9 | /**
10 |  * Simple Node.js relay server for the OpenAI Realtime API.
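 *
 * The relay is intended to sit between browser clients and OpenAI so that the
 * `OPENAI_API_KEY` only ever lives on the server: browsers connect to the
 * relay's WebSocket URL and events are forwarded in both directions. A
 * browser-side sketch (illustrative only; it assumes the client accepts a
 * custom `url` option and that the relay below is listening on port 8081):
 *
 * ```ts
 * import { RealtimeClient } from 'openai-realtime-api'
 *
 * // No apiKey here; the relay holds the real credentials.
 * const client = new RealtimeClient({ url: 'ws://localhost:8081' })
 * await client.connect()
 * ```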
11 | * 12 | * @example 13 | * 14 | * ```ts 15 | * import { RealtimeClient } from 'openai-realtime-api' 16 | * import { RealtimeRelay } from 'openai-realtime-api/node' 17 | * 18 | * const client = new RealtimeClient({ relay: true }) 19 | * const relay = new RealtimeRelay({ client }) 20 | * relay.listen(8081) 21 | * ``` 22 | */ 23 | export class RealtimeRelay { 24 | readonly client: RealtimeClient 25 | wss?: WebSocketServer 26 | 27 | constructor({ client }: { client: RealtimeClient }) { 28 | assert( 29 | client.relay, 30 | 'RealtimeRelay client must have the "relay" option set' 31 | ) 32 | assert( 33 | client.realtime.apiKey, 34 | 'RealtimeRelay client must have an API key set' 35 | ) 36 | 37 | this.client = client 38 | } 39 | 40 | /** 41 | * Creates a `WebSocketServer` and begins listening for connections. 42 | * 43 | * @param port Port to listen on; defaults to the PORT environment variable or 8081. 44 | */ 45 | listen(port?: number) { 46 | assert(!this.wss, 'RealtimeRelay is already listening') 47 | 48 | if (!port) { 49 | port = Number.parseInt(getEnv('PORT') ?? '8081') 50 | assert(!Number.isNaN(port), `Invalid port: ${port}`) 51 | } 52 | 53 | this.wss = new WebSocketServer({ port }) 54 | this.wss.on('connection', this._connectionHandler.bind(this)) 55 | 56 | this._info(`Listening on ws://localhost:${port}`) 57 | } 58 | 59 | /** 60 | * Closes the WebSocket server. 61 | */ 62 | close() { 63 | this.wss?.close() 64 | this.wss = undefined 65 | } 66 | 67 | protected async _connectionHandler(ws: WebSocket, req: IncomingMessage) { 68 | if (!req.url) { 69 | this._error('No URL provided, closing connection.') 70 | ws.close() 71 | return 72 | } 73 | 74 | const url = new URL(req.url, `http://${req.headers.host}`) 75 | const pathname = url.pathname 76 | 77 | if (pathname !== '/') { 78 | this._error(`Invalid pathname: "${pathname}"`) 79 | ws.close() 80 | return 81 | } 82 | 83 | // Relay: OpenAI server events -> browser 84 | this.client.realtime.on('server.*', (event) => { 85 | this._debug(`Relaying "${event.type}" to client`) 86 | ws.send(JSON.stringify(event)) 87 | }) 88 | this.client.realtime.on('close', () => ws.close()) 89 | 90 | // Relay: browser events -> OpenAI server 91 | // We need to queue data waiting for the OpenAI connection 92 | const messageQueue: string[] = [] 93 | const messageHandler = (data: string) => { 94 | try { 95 | const event = JSON.parse(data) as RealtimeClientEvents.ClientEvent 96 | this._debug(`Relaying "${event.type}" to server`) 97 | this.client.realtime.send(event.type, event) 98 | } catch (err: any) { 99 | this._error(`Error parsing event from client: ${data}`, err.message) 100 | } 101 | } 102 | 103 | ws.on('message', (data) => { 104 | if (!this.client.isConnected) { 105 | messageQueue.push(data.toString()) 106 | } else { 107 | messageHandler(data.toString()) 108 | } 109 | }) 110 | ws.on('close', () => this.client.disconnect()) 111 | 112 | // Connect to OpenAI Realtime API 113 | try { 114 | this._info('Connecting to server...', this.client.realtime.url) 115 | await this.client.connect() 116 | } catch (err: any) { 117 | this._error('Error connecting to server', err.message) 118 | ws.close() 119 | return 120 | } 121 | 122 | this._info('Connected to server successfully', this.client.realtime.url) 123 | while (messageQueue.length) { 124 | messageHandler(messageQueue.shift()!) 
125 |     }
126 |   }
127 | 
128 |   protected _info(...args: any[]) {
129 |     console.log('[RealtimeRelay]', ...args)
130 |   }
131 | 
132 |   protected _debug(...args: any[]) {
133 |     if (this.client.realtime.debug) {
134 |       console.log('[RealtimeRelay]', ...args)
135 |     }
136 |   }
137 | 
138 |   protected _error(...args: any[]) {
139 |     console.error('[RealtimeRelay]', ...args)
140 |   }
141 | }
142 | 
--------------------------------------------------------------------------------
/src/reset.d.ts:
--------------------------------------------------------------------------------
1 | import '@total-typescript/ts-reset'
2 | 
--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------
1 | export namespace Realtime {
2 |   export type AudioFormat = 'pcm16' | 'g711_ulaw' | 'g711_alaw'
3 |   export type AudioTranscriptionModel = 'whisper-1' | (string & {})
4 | 
5 |   export type ItemRole = 'user' | 'assistant' | 'system'
6 |   export type ItemType = 'message' | 'function_call' | 'function_call_output'
7 |   export type ItemStatus = 'in_progress' | 'completed' | 'incomplete'
8 |   export type ContentPartType = 'input_text' | 'input_audio' | 'text' | 'audio'
9 | 
10 |   export type Voice =
11 |     | 'alloy'
12 |     | 'ash'
13 |     | 'ballad'
14 |     | 'coral'
15 |     | 'echo'
16 |     | 'sage'
17 |     | 'shimmer'
18 |     | 'verse'
19 |     | (string & {})
20 | 
21 |   export type ToolChoice =
22 |     | 'auto'
23 |     | 'none'
24 |     | 'required'
25 |     | { type: 'function'; name: string }
26 | 
27 |   export type ObjectType =
28 |     | 'realtime.item'
29 |     | 'realtime.response'
30 |     | 'realtime.session'
31 |     | 'realtime.conversation'
32 | 
33 |   export type ResponseStatus =
34 |     | 'in_progress'
35 |     | 'completed'
36 |     | 'incomplete'
37 |     | 'cancelled'
38 |     | 'failed'
39 | 
40 |   export interface BaseObject {
41 |     /** The unique ID of the object. */
42 |     id?: string
43 | 
44 |     /** Discriminator for the type of this object. */
45 |     object?: ObjectType
46 |   }
47 | 
48 |   export interface AudioTranscription {
49 |     model: AudioTranscriptionModel
50 |   }
51 | 
52 |   export interface TurnDetection {
53 |     type: 'server_vad'
54 | 
55 |     /** 0.0 to 1.0 */
56 |     threshold?: number
57 | 
58 |     /** How much audio to include in the audio stream before the speech starts. */
59 |     prefix_padding_ms?: number
60 | 
61 |     /** How long to wait to mark the speech as stopped. */
62 |     silence_duration_ms?: number
63 |   }
64 | 
65 |   export interface ToolDefinition {
66 |     type: 'function'
67 |     name: string
68 |     description: string
69 |     parameters: { [key: string]: any }
70 |   }
71 | 
72 |   export type PartialToolDefinition = Omit<ToolDefinition, 'type'> & {
73 |     type?: 'function'
74 |   }
75 | 
76 |   export interface SessionConfig {
77 |     /** The default system instructions prepended to model calls. */
78 |     instructions?: string
79 | 
80 |     /**
81 |      * The set of modalities the model can respond with. To disable audio, set
82 |      * this to ["text"].
83 |      */
84 |     modalities?: string[]
85 | 
86 |     /**
87 |      * The voice the model uses to respond - one of alloy, echo, or shimmer.
88 |      *
89 |      * Cannot be changed once the model has responded with audio at least once.
90 |      */
91 |     voice?: Voice
92 | 
93 |     /** The format of input audio. */
94 |     input_audio_format?: AudioFormat
95 | 
96 |     /** The format of output audio. */
97 |     output_audio_format?: AudioFormat
98 | 
99 |     /** Configuration for input audio transcription. Can be set to null to turn off. */
100 |     input_audio_transcription?: AudioTranscription | null
101 | 
102 |     /** Configuration for turn detection. Can be set to null to turn off.
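     *
     * For example, server-side voice activity detection with the commonly
     * documented defaults would look roughly like this (values are
     * illustrative, not prescribed by this library):
     *
     * ```ts
     * turn_detection: { type: 'server_vad', threshold: 0.5, prefix_padding_ms: 300, silence_duration_ms: 500 }
     * ```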
     */
103 |     turn_detection?: TurnDetection | null
104 | 
105 |     /** Tools (functions) available to the model. */
106 |     tools?: ToolDefinition[]
107 | 
108 |     /** How the model chooses tools. */
109 |     tool_choice?: ToolChoice
110 | 
111 |     /** Sampling temperature for the model. */
112 |     temperature?: number
113 | 
114 |     /**
115 |      * Maximum number of output tokens for a single assistant response, inclusive
116 |      * of tool calls. Provide an integer between 1 and 4096 to limit output
117 |      * tokens, or "inf" for the maximum available tokens for a given model.
118 |      *
119 |      * Defaults to "inf".
120 |      */
121 |     max_response_output_tokens?: number | 'inf'
122 |   }
123 | 
124 |   export interface Session extends BaseObject, SessionConfig {
125 |     /** The unique ID of the session. */
126 |     id: string
127 | 
128 |     /** Type of object. */
129 |     object: 'realtime.session'
130 |   }
131 | 
132 |   export interface BaseContentPart {
133 |     /** The type of the content. */
134 |     type: ContentPartType
135 | 
136 |     /** Text content for "text" and "input_text" content parts. */
137 |     text?: string
138 | 
139 |     /** Base64-encoded audio data. */
140 |     audio?: string
141 | 
142 |     /** Optional text transcript. */
143 |     transcript?: string | null
144 |   }
145 | 
146 |   export interface InputTextContentPart extends BaseContentPart {
147 |     type: 'input_text'
148 |     text: string
149 |   }
150 | 
151 |   export interface InputAudioContentPart extends BaseContentPart {
152 |     type: 'input_audio'
153 |     /** Base64-encoded audio data. */
154 |     audio?: string
155 |     transcript?: string | null
156 |   }
157 | 
158 |   export interface TextContentPart extends BaseContentPart {
159 |     type: 'text'
160 |     text: string
161 |   }
162 | 
163 |   export interface AudioContentPart extends BaseContentPart {
164 |     type: 'audio'
165 |     /** Base64-encoded audio data. */
166 |     audio?: string
167 |     transcript?: string | null
168 |   }
169 | 
170 |   export type ContentPart =
171 |     | InputTextContentPart
172 |     | InputAudioContentPart
173 |     | TextContentPart
174 |     | AudioContentPart
175 | 
176 |   export interface BaseItem extends BaseObject {
177 |     /** The unique ID of the item. */
178 |     id: string
179 | 
180 |     /** Type of object. */
181 |     object?: 'realtime.item'
182 | 
183 |     /** The type of the item. */
184 |     type: ItemType
185 | 
186 |     /** The status of the item. */
187 |     status: ItemStatus
188 | 
189 |     /** The role of the message sender. */
190 |     role: ItemRole
191 | 
192 |     /** The content of the item. */
193 |     content: ContentPart[]
194 |   }
195 | 
196 |   export interface SystemItem {
197 |     role: 'system'
198 |     type: 'message'
199 |     content: InputTextContentPart[]
200 |   }
201 | 
202 |   export interface UserItem {
203 |     role: 'user'
204 |     type: 'message'
205 |     content: Array<InputTextContentPart | InputAudioContentPart>
206 |   }
207 | 
208 |   export interface AssistantItem {
209 |     role: 'assistant'
210 |     type: 'message'
211 |     content: Array<TextContentPart | AudioContentPart>
212 |   }
213 | 
214 |   export interface FunctionCallItem {
215 |     type: 'function_call'
216 | 
217 |     /** The ID of the function call. */
218 |     call_id: string
219 | 
220 |     /** The name of the function being called. */
221 |     name: string
222 | 
223 |     /** The arguments of the function call. */
224 |     arguments: string
225 |   }
226 | 
227 |   export interface FunctionCallOutputItem {
228 |     type: 'function_call_output'
229 | 
230 |     /** The ID of the function call. */
231 |     call_id: string
232 | 
233 |     /** The output of the function call.
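     *
     * Typically a JSON-encoded string produced by your tool handler, e.g.
     * (illustrative values only): `'{"location":"Toronto","temperature_c":21}'`.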
*/ 234 | output: string 235 | } 236 | 237 | export type Item = BaseItem & 238 | ( 239 | | SystemItem 240 | | UserItem 241 | | AssistantItem 242 | | FunctionCallItem 243 | | FunctionCallOutputItem 244 | ) 245 | 246 | export type ClientItem = 247 | | SystemItem 248 | | UserItem 249 | | AssistantItem 250 | | FunctionCallItem 251 | | FunctionCallOutputItem 252 | 253 | export interface Usage { 254 | total_tokens: number 255 | input_tokens: number 256 | output_tokens: number 257 | } 258 | 259 | export interface ResponseConfig { 260 | /** Instructions for the model. */ 261 | instructions?: string 262 | 263 | /** 264 | * The modalities for the response. To disable audio, set this to ["text"]. 265 | */ 266 | modalities?: string[] 267 | 268 | /** 269 | * The voice the model uses to respond - one of alloy, echo, or shimmer. 270 | */ 271 | voice?: Voice 272 | 273 | /** The format of output audio. */ 274 | output_audio_format?: AudioFormat 275 | 276 | /** Tools (functions) available to the model. */ 277 | tools?: ToolDefinition[] 278 | 279 | /** How the model chooses tools. */ 280 | tool_choice?: ToolChoice 281 | 282 | /** Sampling temperature for the model. */ 283 | temperature?: number 284 | 285 | /** 286 | * Maximum number of output tokens for a single assistant response, inclusive 287 | * of tool calls. Provide an integer between 1 and 4096 to limit output 288 | * tokens, or "inf" for the maximum available tokens for a given model. 289 | * Defaults to "inf". 290 | */ 291 | max_output_tokens?: number | 'inf' 292 | } 293 | 294 | export interface Response extends BaseObject, ResponseConfig { 295 | /** The unique ID of the response. */ 296 | id: string 297 | 298 | /** Type of object. */ 299 | object: 'realtime.response' 300 | 301 | /** Status of the response. */ 302 | status: ResponseStatus 303 | 304 | /** Additional details about the status. */ 305 | status_details?: 306 | | { 307 | type: 'incomplete' 308 | reason: 'interruption' | 'max_output_tokens' | 'content_filter' 309 | } 310 | | { 311 | type: 'failed' 312 | error?: Error | null 313 | } 314 | | null 315 | 316 | /** The list of output items generated by the response. */ 317 | output: Item[] 318 | 319 | /** Usage statistics for the response. */ 320 | usage?: Usage 321 | } 322 | 323 | export interface Error { 324 | /** The type of error. */ 325 | type: string 326 | 327 | /** Error code, if any. */ 328 | code?: string 329 | 330 | /** A human-readable error message. */ 331 | message: string 332 | 333 | /** Parameter related to the error, if any. */ 334 | param?: string | null 335 | 336 | /** Unique ID of the event, if any. */ 337 | event_id?: string 338 | } 339 | 340 | export interface Conversation extends BaseObject { 341 | /** The unique ID of the conversation. */ 342 | id: string 343 | 344 | /** Type of object. */ 345 | object: 'realtime.conversation' 346 | } 347 | 348 | export interface RateLimit { 349 | name: 'requests' | 'tokens' | (string & {}) 350 | limit: number 351 | remaining: number 352 | reset_seconds: number 353 | } 354 | } 355 | 356 | // NOTE: all types outside of the Realtime namespace are local to this project 357 | // and not part of the official API. 
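
// For illustration only: a minimal `Realtime.SessionConfig` showing how the
// pieces defined above fit together. Every value below is a made-up
// placeholder rather than a recommended default.
const exampleSessionConfig: Realtime.SessionConfig = {
  instructions: 'You are a friendly, concise voice assistant.',
  modalities: ['text', 'audio'],
  voice: 'alloy',
  input_audio_format: 'pcm16',
  output_audio_format: 'pcm16',
  input_audio_transcription: { model: 'whisper-1' },
  turn_detection: { type: 'server_vad', threshold: 0.5, silence_duration_ms: 500 },
  tools: [
    {
      type: 'function',
      name: 'get_weather',
      description: 'Look up the current weather for a city.',
      parameters: {
        type: 'object',
        properties: { city: { type: 'string' } },
        required: ['city']
      }
    }
  ],
  tool_choice: 'auto',
  temperature: 0.8,
  max_response_output_tokens: 'inf'
}
void exampleSessionConfig // referenced only so the illustration is not flagged as unused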
358 | 
359 | export type MaybePromise<T> = T | Promise<T>
360 | 
361 | export interface FormattedTool {
362 |   type: 'function'
363 |   name: string
364 |   call_id: string
365 |   arguments: string
366 | }
367 | 
368 | export interface FormattedProperty {
369 |   audio: Int16Array
370 |   text: string
371 |   transcript: string
372 |   tool?: FormattedTool
373 |   output?: string
374 |   file?: any
375 | }
376 | 
377 | /** Local item used strictly for convenience and not part of the API. */
378 | export type FormattedItem = Realtime.Item & {
379 |   formatted: FormattedProperty
380 | }
381 | 
382 | /** Local item used strictly for convenience and not part of the API. */
383 | export type MaybeFormattedItem = Realtime.Item & {
384 |   formatted?: FormattedProperty
385 | }
386 | 
387 | export interface EventHandlerResult {
388 |   item?: MaybeFormattedItem
389 |   delta?: {
390 |     transcript?: string
391 |     audio?: Int16Array
392 |     text?: string
393 |     arguments?: string
394 |   }
395 |   response?: Realtime.Response
396 | }
397 | 
398 | export type ToolHandler = (params: any) => MaybePromise<any>
399 | 
--------------------------------------------------------------------------------
/src/utils.ts:
--------------------------------------------------------------------------------
1 | import { customAlphabet } from 'nanoid'
2 | 
3 | export const isBrowser = !!(globalThis as any).document
4 | 
5 | export function hasNativeWebSocket(): boolean {
6 |   return !!globalThis.WebSocket
7 | }
8 | 
9 | export function getEnv(name: string): string | undefined {
10 |   try {
11 |     return typeof process !== 'undefined'
12 |       ? // eslint-disable-next-line no-process-env
13 |         process.env?.[name]
14 |       : undefined
15 |   } catch {
16 |     return undefined
17 |   }
18 | }
19 | 
20 | export function assert(
21 |   value: unknown,
22 |   message?: string | Error
23 | ): asserts value {
24 |   if (value) {
25 |     return
26 |   }
27 | 
28 |   if (!message) {
29 |     throw new Error('Assertion failed')
30 |   }
31 | 
32 |   throw typeof message === 'string' ? new Error(message) : message
33 | }
34 | 
35 | /**
36 |  * Converts Float32Array of amplitude data to ArrayBuffer in Int16Array format.
37 |  */
38 | export function floatTo16BitPCM(float32Array: Float32Array): ArrayBuffer {
39 |   const buffer = new ArrayBuffer(float32Array.length * 2)
40 |   const view = new DataView(buffer)
41 |   let offset = 0
42 | 
43 |   for (let i = 0; i < float32Array.length; i++, offset += 2) {
44 |     const s = Math.max(-1, Math.min(1, float32Array[i]!))
45 |     view.setInt16(offset, s < 0 ? s * 0x80_00 : s * 0x7f_ff, true)
46 |   }
47 | 
48 |   return buffer
49 | }
50 | 
51 | /**
52 |  * Converts a base64 string to an ArrayBuffer.
53 |  */
54 | export function base64ToArrayBuffer(base64: string): ArrayBuffer {
55 |   const binaryString = atob(base64)
56 |   const len = binaryString.length
57 |   const bytes = new Uint8Array(len)
58 | 
59 |   for (let i = 0; i < len; i++) {
60 |     // eslint-disable-next-line unicorn/prefer-code-point
61 |     bytes[i] = binaryString.charCodeAt(i)
62 |   }
63 | 
64 |   return bytes.buffer
65 | }
66 | 
67 | /**
68 |  * Converts an ArrayBuffer, Int16Array or Float32Array to a base64 string.
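 *
 * Commonly used to turn raw PCM samples into the base64 audio payload that the
 * Realtime API expects. A small sketch; the sample values below are
 * placeholders:
 *
 * ```ts
 * // Mono Float32 samples in [-1, 1]; they are re-encoded as 16-bit PCM first.
 * const samples = new Float32Array([0, 0.25, -0.25, 0.5])
 * const base64Audio = arrayBufferToBase64(samples)
 * ```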
69 |  */
70 | export function arrayBufferToBase64(
71 |   arrayBuffer: ArrayBuffer | Int16Array | Float32Array
72 | ): string {
73 |   if (arrayBuffer instanceof Float32Array) {
74 |     arrayBuffer = floatTo16BitPCM(arrayBuffer)
75 |   } else if (arrayBuffer instanceof Int16Array) {
76 |     arrayBuffer = arrayBuffer.buffer
77 |   }
78 | 
79 |   const bytes = new Uint8Array(arrayBuffer)
80 |   const chunkSize = 0x80_00 // 32KB chunk size
81 |   let binary = ''
82 | 
83 |   for (let i = 0; i < bytes.length; i += chunkSize) {
84 |     const chunk = bytes.subarray(i, i + chunkSize)
85 |     binary += String.fromCharCode.apply(null, chunk as any)
86 |   }
87 | 
88 |   return btoa(binary)
89 | }
90 | 
91 | /**
92 |  * Merge two Int16Arrays from Int16Arrays or ArrayBuffers.
93 |  */
94 | export function mergeInt16Arrays(
95 |   left: ArrayBuffer | Int16Array,
96 |   right: ArrayBuffer | Int16Array
97 | ): Int16Array {
98 |   if (left instanceof ArrayBuffer) {
99 |     left = new Int16Array(left)
100 |   }
101 | 
102 |   if (right instanceof ArrayBuffer) {
103 |     right = new Int16Array(right)
104 |   }
105 | 
106 |   if (!(left instanceof Int16Array) || !(right instanceof Int16Array)) {
107 |     throw new TypeError(`Both items must be Int16Array`)
108 |   }
109 | 
110 |   const newValues = new Int16Array(left.length + right.length)
111 |   for (const [i, element] of left.entries()) {
112 |     newValues[i] = element
113 |   }
114 | 
115 |   for (const [j, element] of right.entries()) {
116 |     newValues[left.length + j] = element
117 |   }
118 | 
119 |   return newValues
120 | }
121 | 
122 | // base58; non-repeating chars
123 | const alphabet = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
124 | const generateIdImpl = customAlphabet(alphabet, 21)
125 | 
126 | /**
127 |  * Generates an id to send with events and messages.
128 |  */
129 | export function generateId(prefix: string, size = 21): string {
130 |   const id = generateIdImpl(size)
131 |   return `${prefix}${id}`
132 | }
133 | 
134 | export const sleep = (t: number) =>
135 |   new Promise<void>((r) => setTimeout(() => r(), t))
136 | 
137 | /**
138 |  * Trims an event's content for debugging purposes to make logs easier to read.
139 |  */
140 | export function trimDebugEvent(
141 |   event?: any,
142 |   {
143 |     maxLimit = 200
144 |   }: {
145 |     maxLimit?: number
146 |   } = {}
147 | ): any {
148 |   if (!event) return event
149 | 
150 |   const e = structuredClone(event)
151 | 
152 |   if (e.item?.content?.find((c: any) => c.audio)) {
153 |     e.item.content = e.item.content.map(({ audio, ...c }: any) => {
154 |       if (audio) {
155 |         return {
156 |           ...c,
157 |           audio: ''
158 |         }
159 |       } else {
160 |         return c
161 |       }
162 |     })
163 |   }
164 | 
165 |   if (e.audio) {
166 |     e.audio = '