├── .github
│   ├── build.sh
│   └── workflows
│       ├── build.yml
│       └── dependencies.yml
├── .gitignore
├── .spi.yml
├── LICENSE
├── Package.swift
├── README.md
└── src
    ├── Connectors
    │   ├── WebRTCConnector.swift
    │   └── WebSocketConnector.swift
    ├── Conversation.swift
    ├── Extensions
    │   ├── AVAudioPCMBuffer+fromData.swift
    │   ├── Collection+safe.swift
    │   ├── Continuation+error.swift
    │   └── String+random.swift
    ├── Models
    │   ├── ClientEvent.swift
    │   ├── Item.swift
    │   ├── Response.swift
    │   ├── ServerError.swift
    │   ├── ServerEvent.swift
    │   └── Session.swift
    ├── OpenAIRealtime.swift
    ├── Protocols
    │   └── Connector.swift
    └── Support
        ├── UnsafeInteriorMutable.swift
        └── UnsafeMutableArray.swift
/.github/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd -P)" 6 | PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" 7 | 8 | PROJECT_BUILD_DIR="${PROJECT_BUILD_DIR:-"${PROJECT_ROOT}/build"}" 9 | XCODEBUILD_BUILD_DIR="$PROJECT_BUILD_DIR/xcodebuild" 10 | XCODEBUILD_DERIVED_DATA_PATH="$XCODEBUILD_BUILD_DIR/DerivedData" 11 | 12 | build_framework() { 13 | local sdk="$1" 14 | local destination="$2" 15 | local scheme="$3" 16 | 17 | local XCODEBUILD_ARCHIVE_PATH="./$scheme-$sdk.xcarchive" 18 | 19 | rm -rf "$XCODEBUILD_ARCHIVE_PATH" 20 | 21 | xcodebuild archive \ 22 | -scheme "$scheme" \ 23 | -archivePath "$XCODEBUILD_ARCHIVE_PATH" \ 24 | -derivedDataPath "$XCODEBUILD_DERIVED_DATA_PATH" \ 25 | -sdk "$sdk" \ 26 | -destination "$destination" \ 27 | BUILD_LIBRARY_FOR_DISTRIBUTION=YES \ 28 | INSTALL_PATH='Library/Frameworks' \ 29 | OTHER_SWIFT_FLAGS=-no-verify-emitted-module-interface \ 30 | LD_GENERATE_MAP_FILE=YES 31 | 32 | FRAMEWORK_MODULES_PATH="$XCODEBUILD_ARCHIVE_PATH/Products/Library/Frameworks/$scheme.framework/Modules" 33 | mkdir -p "$FRAMEWORK_MODULES_PATH" 34 | cp -r \ 35 | "$XCODEBUILD_DERIVED_DATA_PATH/Build/Intermediates.noindex/ArchiveIntermediates/$scheme/BuildProductsPath/Release-$sdk/$scheme.swiftmodule" \ 36 | "$FRAMEWORK_MODULES_PATH/$scheme.swiftmodule" 37 | # Delete private swiftinterface 38 | rm -f "$FRAMEWORK_MODULES_PATH/$scheme.swiftmodule/*.private.swiftinterface" 39 | mkdir -p "$scheme-$sdk.xcarchive/LinkMaps" 40 | find "$XCODEBUILD_DERIVED_DATA_PATH" -name "$scheme-LinkMap-*.txt" -exec cp {} "./$scheme-$sdk.xcarchive/LinkMaps/" \; 41 | } 42 | 43 | # Update the Package.swift to build the library as dynamic instead of static 44 | sed -i '' 's/type: \.static/type: .dynamic/g' Package.swift 45 | 46 | build_framework "iphoneos" "generic/platform=iOS" "OpenAIRealtime" 47 | build_framework "iphonesimulator" "generic/platform=iOS Simulator" "OpenAIRealtime" 48 | 49 | echo "Builds completed successfully."
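# A possible extension (sketch only, assuming the standard Xcode SDK and destination names):
# further platform slices could be built the same way before the xcframework is assembled below,
# by adding calls such as:
#   build_framework "macosx"     "generic/platform=macOS" "OpenAIRealtime"
#   build_framework "appletvos"  "generic/platform=tvOS"  "OpenAIRealtime"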
50 | 51 | rm -rf "OpenAIRealtime.xcframework" 52 | xcodebuild -create-xcframework -framework OpenAIRealtime-iphonesimulator.xcarchive/Products/Library/Frameworks/OpenAIRealtime.framework -framework OpenAIRealtime-iphoneos.xcarchive/Products/Library/Frameworks/OpenAIRealtime.framework -output OpenAIRealtime.xcframework 53 | 54 | cp -r OpenAIRealtime-iphoneos.xcarchive/dSYMs OpenAIRealtime.xcframework/ios-arm64 55 | cp -r OpenAIRealtime-iphonesimulator.xcarchive/dSYMs OpenAIRealtime.xcframework/ios-arm64_x86_64-simulator 56 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | push: 4 | branches: [main] 5 | 6 | jobs: 7 | build: 8 | name: Swift 6.0 9 | runs-on: macos-latest 10 | steps: 11 | - uses: actions/checkout@v4 12 | 13 | - uses: maxim-lobanov/setup-xcode@v1 14 | with: 15 | xcode-version: latest-stable 16 | 17 | - name: Build 18 | run: ./.github/build.sh 19 | 20 | - run: zip -r ./OpenAIRealtime.xcframework.zip ./OpenAIRealtime.xcframework 21 | 22 | - name: Upload artifact to Emerge 23 | uses: EmergeTools/emerge-upload-action@v1.1.0 24 | with: 25 | build_type: release 26 | artifact_path: ./OpenAIRealtime.xcframework.zip 27 | emerge_api_key: ${{ secrets.EMERGE_API_KEY }} 28 | -------------------------------------------------------------------------------- /.github/workflows/dependencies.yml: -------------------------------------------------------------------------------- 1 | name: Swift Dependency Submission 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | permissions: 8 | contents: write 9 | 10 | jobs: 11 | swift-action-detection: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout Repository 15 | uses: actions/checkout@v4 16 | 17 | - name: Install Swift 18 | uses: vapor/swiftly-action@v0.1 19 | with: 20 | toolchain: latest 21 | 22 | - name: Submit Dependencies 23 | uses: vapor-community/swift-dependency-submission@v0.1 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .netrc 2 | /.build 3 | .DS_Store 4 | /Packages 5 | *.xcarchive 6 | xcuserdata/ 7 | DerivedData/ 8 | *.xcframework 9 | Package.resolved 10 | .swiftpm/configuration/registries.json 11 | .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata 12 | -------------------------------------------------------------------------------- /.spi.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | builder: 3 | configs: 4 | - documentation_targets: [OpenAIRealtime] 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Miguel Piedrafita 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 6.0 2 | 3 | import PackageDescription 4 | 5 | let package = Package( 6 | name: "OpenAIRealtime", 7 | platforms: [ 8 | .iOS(.v17), 9 | .tvOS(.v17), 10 | .macOS(.v14), 11 | .watchOS(.v10), 12 | .visionOS(.v1), 13 | .macCatalyst(.v17), 14 | ], 15 | products: [ 16 | .library(name: "OpenAIRealtime", type: .static, targets: ["OpenAIRealtime"]), 17 | ], 18 | dependencies: [ 19 | .package(url: "https://github.com/stasel/WebRTC.git", branch: "latest"), 20 | ], 21 | targets: [ 22 | .target(name: "OpenAIRealtime", dependencies: ["WebRTC"], path: "./src"), 23 | ] 24 | ) 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A modern Swift SDK for OpenAI's Realtime API 2 | 3 | [![Install Size](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fwww.emergetools.com%2Fapi%2Fv2%2Fpublic_new_build%3FexampleId%3Dswift-realtime-openai.OpenAIRealtime%26platform%3Dios%26badgeOption%3Dmax_install_size_only%26buildType%3Drelease&query=$.badgeMetadata&label=OpenAI&logo=apple)](https://www.emergetools.com/app/example/ios/swift-realtime-openai.OpenAIRealtime/release) 4 | [![Swift Version](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fm1guelpf%2Fswift-realtime-openai%2Fbadge%3Ftype%3Dswift-versions&color=brightgreen)](https://swiftpackageindex.com/m1guelpf/swift-realtime-openai) 5 | [![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://raw.githubusercontent.com/m1guelpf/swift-realtime-openai/main/LICENSE) 6 | 7 | This library provides a simple interface for implementing multi-modal conversations using OpenAI's new Realtime API. 8 | 9 | It can automatically handle recording the user's microphone and playing back the assistant's response, and also gives you a transparent layer over the API for advanced use cases. 10 | 11 | ## Installation 12 | 13 | ### Swift Package Manager 14 | 15 | The Swift Package Manager allows developers to easily integrate packages into their Xcode projects and packages, and is fully integrated into the Swift compiler. 16 | 17 | ### SPM Through Xcode Project 18 | 19 | - File > Swift Packages > Add Package Dependency 20 | - Add https://github.com/m1guelpf/swift-realtime-openai.git 21 | - Select "Branch" and enter "main" 22 | 23 | ### SPM Through Xcode Package 24 | 25 | Once you have your Swift package set up, add the Git link within the dependencies value of your Package.swift file.
26 | 27 | ```swift 28 | dependencies: [ 29 | .package(url: "https://github.com/m1guelpf/swift-realtime-openai.git", .branch("main")) 30 | ] 31 | ``` 32 | 33 | ## Getting started 🚀 34 | 35 | You can build an iMessage-like app with built-in AI chat in less than 60 lines of code (UI included!): 36 | 37 | ```swift 38 | import SwiftUI 39 | import OpenAIRealtime 40 | 41 | struct ContentView: View { 42 | @State private var newMessage: String = "" 43 | @State private var conversation = Conversation(authToken: OPENAI_KEY) 44 | 45 | var messages: [Item.Message] { 46 | conversation.entries.compactMap { switch $0 { 47 | case let .message(message): return message 48 | default: return nil 49 | } } 50 | } 51 | 52 | var body: some View { 53 | VStack(spacing: 0) { 54 | ScrollView { 55 | VStack(spacing: 12) { 56 | ForEach(messages, id: \.id) { message in 57 | MessageBubble(message: message) 58 | } 59 | } 60 | .padding() 61 | } 62 | 63 | HStack(spacing: 12) { 64 | HStack { 65 | TextField("Chat", text: $newMessage, onCommit: { sendMessage() }) 66 | .frame(height: 40) 67 | .submitLabel(.send) 68 | 69 | if newMessage != "" { 70 | Button(action: sendMessage) { 71 | Image(systemName: "arrow.up.circle.fill") 72 | .resizable() 73 | .aspectRatio(contentMode: .fill) 74 | .frame(width: 28, height: 28) 75 | .foregroundStyle(.white, .blue) 76 | } 77 | } 78 | } 79 | .padding(.leading) 80 | .padding(.trailing, 6) 81 | .overlay(RoundedRectangle(cornerRadius: 20).stroke(.quaternary, lineWidth: 1)) 82 | } 83 | .padding() 84 | } 85 | .navigationTitle("Chat") 86 | .navigationBarTitleDisplayMode(.inline) 87 | .onAppear { try! conversation.startHandlingVoice() } 88 | } 89 | 90 | func sendMessage() { 91 | guard newMessage != "" else { return } 92 | 93 | Task { 94 | try await conversation.send(from: .user, text: newMessage) 95 | newMessage = "" 96 | } 97 | } 98 | } 99 | ``` 100 | 101 | Or, if you just want a simple app that lets the user talk and the AI respond: 102 | 103 | ```swift 104 | import SwiftUI 105 | import OpenAIRealtime 106 | 107 | struct ContentView: View { 108 | @State private var conversation = Conversation(authToken: OPENAI_KEY) 109 | 110 | var body: some View { 111 | Text("Say something!") 112 | .onAppear { try! conversation.startListening() } 113 | } 114 | } 115 | ``` 116 | 117 | ## Features 118 | 119 | - [x] A simple interface for directly interacting with the API 120 | - [x] Wrap the API in an interface that manages the conversation for you 121 | - [x] Optionally handle recording the user's mic and sending it to the API 122 | - [x] Optionally handle playing model responses as they stream in 123 | - [x] Allow interrupting the model 124 | - [ ] WebRTC support 125 | 126 | ## Architecture 127 | 128 | ### `Conversation` 129 | 130 | The `Conversation` class provides a high-level interface for managing a conversation with the model. It wraps the `RealtimeAPI` class and handles the details of sending and receiving messages, as well as managing the conversation history. It can optionally also handle recording the user's mic and sending it to the API, as well as playing model responses as they stream in. 131 | 132 | #### Reading messages 133 | 134 | You can access the messages in the conversation through the `messages` property. Note that this won't include function calls and its responses, only the messages between the user and the model. To access the full conversation history, use the `entries` property. 
For example: 135 | 136 | ```swift 137 | ScrollView { 138 | ScrollViewReader { scrollView in 139 | VStack(spacing: 12) { 140 | ForEach(conversation.messages, id: \.id) { message in 141 | MessageBubble(message: message).id(message.id) 142 | } 143 | } 144 | .onReceive(conversation.messages.publisher) { _ in 145 | withAnimation { scrollView.scrollTo(conversation.messages.last?.id, anchor: .center) } 146 | } 147 | } 148 | } 149 | ``` 150 | 151 | #### Customizing the session 152 | 153 | You can customize the current session using the `setSession(_: Session)` or `updateSession(withChanges: (inout Session) -> Void)` methods. Note that they require that a session has already been established, so it's recommended you call them from a `whenConnected(_: @Sendable () async throws -> Void)` callback or await `waitForConnection()` first. For example: 154 | 155 | ```swift 156 | try await conversation.whenConnected { 157 | try await conversation.updateSession { session in 158 | // update system prompt 159 | session.instructions = "You are a helpful assistant." 160 | 161 | // enable transcription of users' voice messages 162 | session.inputAudioTranscription = Session.InputAudioTranscription() 163 | 164 | // ... 165 | } 166 | } 167 | ``` 168 | 169 | #### Handling voice conversations 170 | 171 | The `Conversation` class can automatically handle 2-way voice conversations. Calling `startListening()` will start listening to the user's voice and sending it to the model, and playing back the model's responses. Calling `stopListening()` will stop listening, but continue playing back responses. 172 | 173 | If you just want to play model responses, call `startHandlingVoice()`. To stop both listening and playing back responses, call `stopHandlingVoice()`. 174 | 175 | #### Manually sending messages 176 | 177 | To send a text message, call the `send(from: Item.ItemRole, text: String, response: Response.Config? = nil)` method, providing the role of the sender (`.user`, `.assistant`, or `.system`) and the contents of the message. You can optionally also provide a `Response.Config` object to customize the response, such as enabling or disabling function calls. 178 | 179 | To manually send an audio message (or part of one), call the `send(audioDelta: Data, commit: Bool = false)` method with a valid audio chunk. If `commit` is `true`, the model will consider the message finished and begin responding to it. Otherwise, it might wait for more audio depending on your `Session.turnDetection` settings. 180 | 181 | #### Manually sending events 182 | 183 | To manually send an event to the API, use the `send(event: RealtimeAPI.ClientEvent)` method. Note that this bypasses some of the logic in the `Conversation` class such as handling interrupts, so you should prefer to use other methods whenever possible. 184 | 185 | ### `RealtimeAPI` 186 | 187 | To interact with the API directly, create a new instance of `RealtimeAPI`, providing one of the available connectors.
There are helper methods that let you create an instance from an apiKey or a `URLRequest`, like so: 188 | 189 | ```swift 190 | let api = RealtimeAPI.webSocket(authToken: YOUR_OPENAI_API_KEY, model: String = "gpt-4o-realtime-preview") // or RealtimeAPI.webSocket(connectingTo: URLRequest) 191 | let api = RealtimeAPI.webRTC(authToken: YOUR_OPENAI_API_KEY, model: String = "gpt-4o-realtime-preview") // or RealtimeAPI.webRTC(connectingTo: URLRequest) 192 | ``` 193 | 194 | You can listen for new events through the `events` property, like so: 195 | 196 | ```swift 197 | for try await event in api.events { 198 | switch event { 199 | case let .sessionCreated(event): 200 | print(event.session.id) 201 | } 202 | } 203 | ``` 204 | 205 | To send an event to the API, call the `send` method with a `ClientEvent` instance: 206 | 207 | ```swift 208 | try await api.send(event: .updateSession(session)) 209 | try await api.send(event: .appendInputAudioBuffer(encoding: audioData)) 210 | try await api.send(event: .createResponse()) 211 | ``` 212 | 213 | ## License 214 | 215 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 216 | -------------------------------------------------------------------------------- /src/Connectors/WebRTCConnector.swift: -------------------------------------------------------------------------------- 1 | @preconcurrency import WebRTC 2 | import Foundation 3 | #if canImport(FoundationNetworking) 4 | import FoundationNetworking 5 | #endif 6 | 7 | public final class WebRTCConnector: NSObject, Connector, Sendable { 8 | enum WebRTCError: Error { 9 | case failedToCreateDataChannel 10 | case failedToCreatePeerConnection 11 | case badServerResponse 12 | } 13 | 14 | @MainActor public private(set) var onDisconnect: (@Sendable () -> Void)? 
= nil 15 | public let events: AsyncThrowingStream 16 | 17 | private let connection: RTCPeerConnection 18 | private let dataChannel: RTCDataChannel 19 | 20 | private let stream: AsyncThrowingStream.Continuation 21 | 22 | private static let factory: RTCPeerConnectionFactory = { 23 | RTCInitializeSSL() 24 | 25 | return RTCPeerConnectionFactory() 26 | }() 27 | 28 | private let encoder: JSONEncoder = { 29 | let encoder = JSONEncoder() 30 | encoder.keyEncodingStrategy = .convertToSnakeCase 31 | return encoder 32 | }() 33 | 34 | private let decoder: JSONDecoder = { 35 | let decoder = JSONDecoder() 36 | decoder.keyDecodingStrategy = .convertFromSnakeCase 37 | return decoder 38 | }() 39 | 40 | public required init(connectingTo request: URLRequest) async throws { 41 | guard let connection = WebRTCConnector.factory.peerConnection(with: .init(), constraints: .init(mandatoryConstraints: nil, optionalConstraints: nil), delegate: nil) else { 42 | throw WebRTCError.failedToCreatePeerConnection 43 | } 44 | self.connection = connection 45 | 46 | let audioTrackSource = WebRTCConnector.factory.audioSource(with: nil) 47 | let audioTrack = WebRTCConnector.factory.audioTrack(with: audioTrackSource, trackId: "audio0") 48 | let mediaStream = WebRTCConnector.factory.mediaStream(withStreamId: "stream0") 49 | mediaStream.addAudioTrack(audioTrack) 50 | self.connection.add(audioTrack, streamIds: ["stream0"]) 51 | 52 | guard let dataChannel = self.connection.dataChannel(forLabel: "oai-events", configuration: RTCDataChannelConfiguration()) else { 53 | throw WebRTCError.failedToCreateDataChannel 54 | } 55 | self.dataChannel = dataChannel 56 | 57 | (events, stream) = AsyncThrowingStream.makeStream(of: ServerEvent.self) 58 | 59 | super.init() 60 | 61 | connection.delegate = self 62 | dataChannel.delegate = self 63 | 64 | var request = request 65 | 66 | let offer = try await self.connection.offer(for: RTCMediaConstraints(mandatoryConstraints: nil, optionalConstraints: [ 67 | "OfferToReceiveAudio": "true", 68 | "googEchoCancellation": "true", 69 | "googAutoGainControl": "true", 70 | "googNoiseSuppression": "true", 71 | "googHighpassFilter": "true", 72 | ])) 73 | try await self.connection.setLocalDescription(offer) 74 | 75 | request.httpBody = offer.sdp.data(using: .utf8) 76 | 77 | let (data, res) = try await URLSession.shared.data(for: request) 78 | guard let res = res as? HTTPURLResponse, res.statusCode == 201, let sdp = String(data: data, encoding: .utf8) else { 79 | throw WebRTCError.badServerResponse 80 | } 81 | 82 | try await self.connection.setRemoteDescription(RTCSessionDescription(type: .answer, sdp: sdp)) 83 | } 84 | 85 | deinit { 86 | connection.close() 87 | stream.finish() 88 | onDisconnect?() 89 | } 90 | 91 | public func send(event: ClientEvent) async throws { 92 | try dataChannel.sendData(RTCDataBuffer(data: encoder.encode(event), isBinary: false)) 93 | } 94 | 95 | @MainActor public func onDisconnect(_ action: (@Sendable () -> Void)?) 
{ 96 | onDisconnect = action 97 | } 98 | } 99 | 100 | extension WebRTCConnector: RTCPeerConnectionDelegate { 101 | public func peerConnection(_: RTCPeerConnection, didChange _: RTCSignalingState) { 102 | print("Connection state changed to \(connection.signalingState)") 103 | } 104 | 105 | public func peerConnection(_: RTCPeerConnection, didAdd _: RTCMediaStream) { 106 | print("Media stream added.") 107 | } 108 | 109 | public func peerConnection(_: RTCPeerConnection, didRemove _: RTCMediaStream) { 110 | print("Media stream removed.") 111 | } 112 | 113 | public func peerConnectionShouldNegotiate(_: RTCPeerConnection) { 114 | print("Negotiating connection.") 115 | } 116 | 117 | public func peerConnection(_: RTCPeerConnection, didChange _: RTCIceConnectionState) { 118 | print("ICE connection state changed to \(connection.iceConnectionState)") 119 | } 120 | 121 | public func peerConnection(_: RTCPeerConnection, didChange _: RTCIceGatheringState) { 122 | print("ICE gathering state changed to \(connection.iceGatheringState)") 123 | } 124 | 125 | public func peerConnection(_: RTCPeerConnection, didGenerate _: RTCIceCandidate) { 126 | print("ICE candidate generated.") 127 | } 128 | 129 | public func peerConnection(_: RTCPeerConnection, didRemove _: [RTCIceCandidate]) { 130 | print("ICE candidate removed.") 131 | } 132 | 133 | public func peerConnection(_: RTCPeerConnection, didOpen _: RTCDataChannel) { 134 | print("Data channel opened.") 135 | } 136 | } 137 | 138 | extension WebRTCConnector: RTCDataChannelDelegate { 139 | public func dataChannel(_: RTCDataChannel, didReceiveMessageWith buffer: RTCDataBuffer) { 140 | stream.yield(with: Result { try self.decoder.decode(ServerEvent.self, from: buffer.data) }) 141 | } 142 | 143 | public func dataChannelDidChangeState(_ dataChannel: RTCDataChannel) { 144 | print("Data channel changed to \(dataChannel.readyState)") 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /src/Connectors/WebSocketConnector.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | #if canImport(FoundationNetworking) 3 | import FoundationNetworking 4 | #endif 5 | 6 | public final class WebSocketConnector: Connector, Sendable { 7 | @MainActor public private(set) var onDisconnect: (@Sendable () -> Void)? 
= nil 8 | public let events: AsyncThrowingStream<ServerEvent, Error> 9 | 10 | private let task: Task<Void, Never> 11 | private let webSocket: URLSessionWebSocketTask 12 | private let stream: AsyncThrowingStream<ServerEvent, Error>.Continuation 13 | 14 | private let encoder: JSONEncoder = { 15 | let encoder = JSONEncoder() 16 | encoder.keyEncodingStrategy = .convertToSnakeCase 17 | return encoder 18 | }() 19 | 20 | public init(connectingTo request: URLRequest) { 21 | let (events, stream) = AsyncThrowingStream.makeStream(of: ServerEvent.self) 22 | 23 | let webSocket = URLSession.shared.webSocketTask(with: request) 24 | webSocket.resume() 25 | 26 | task = Task.detached { [webSocket, stream] in 27 | var isActive = true 28 | 29 | let decoder = JSONDecoder() 30 | decoder.keyDecodingStrategy = .convertFromSnakeCase 31 | 32 | while isActive, webSocket.closeCode == .invalid, !Task.isCancelled { 33 | guard webSocket.closeCode == .invalid else { 34 | stream.finish() 35 | isActive = false 36 | break 37 | } 38 | 39 | do { 40 | let message = try await webSocket.receive() 41 | 42 | guard case let .string(text) = message, let data = text.data(using: .utf8) else { 43 | stream.yield(error: RealtimeAPIError.invalidMessage) 44 | continue 45 | } 46 | 47 | try stream.yield(decoder.decode(ServerEvent.self, from: data)) 48 | } catch { 49 | stream.yield(error: error) 50 | isActive = false 51 | } 52 | } 53 | 54 | webSocket.cancel(with: .goingAway, reason: nil) 55 | } 56 | 57 | self.events = events 58 | self.stream = stream 59 | self.webSocket = webSocket 60 | } 61 | 62 | deinit { 63 | webSocket.cancel(with: .goingAway, reason: nil) 64 | task.cancel() 65 | stream.finish() 66 | onDisconnect?() 67 | } 68 | 69 | public func send(event: ClientEvent) async throws { 70 | let message = try URLSessionWebSocketTask.Message.string(String(data: encoder.encode(event), encoding: .utf8)!) 71 | try await webSocket.send(message) 72 | } 73 | 74 | @MainActor public func onDisconnect(_ action: (@Sendable () -> Void)?) { 75 | onDisconnect = action 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/Conversation.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | @preconcurrency import AVFoundation 3 | 4 | public enum ConversationError: Error { 5 | case sessionNotFound 6 | case converterInitializationFailed 7 | } 8 | 9 | @Observable 10 | public final class Conversation: @unchecked Sendable { 11 | private let client: RealtimeAPI 12 | @MainActor private var isInterrupting: Bool = false 13 | private let errorStream: AsyncStream<ServerError>.Continuation 14 | 15 | private var task: Task<Void, Error>! 16 | private let audioEngine = AVAudioEngine() 17 | private let playerNode = AVAudioPlayerNode() 18 | private let queuedSamples = UnsafeMutableArray<String>() 19 | private let apiConverter = UnsafeInteriorMutable<AVAudioConverter>() 20 | private let userConverter = UnsafeInteriorMutable<AVAudioConverter>() 21 | private let desiredFormat = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 24000, channels: 1, interleaved: false)! 22 | 23 | /// A stream of errors that occur during the conversation. 24 | public let errors: AsyncStream<ServerError> 25 | 26 | /// The unique ID of the conversation. 27 | @MainActor public private(set) var id: String? 28 | 29 | /// The current session for this conversation. 30 | @MainActor public private(set) var session: Session? 31 | 32 | /// A list of items in the conversation. 33 | @MainActor public private(set) var entries: [Item] = [] 34 | 35 | /// Whether the conversation is currently connected to the server.
36 | @MainActor public private(set) var connected: Bool = false 37 | 38 | /// Whether the conversation is currently listening to the user's microphone. 39 | @MainActor public private(set) var isListening: Bool = false 40 | 41 | /// Whether this conversation is currently handling voice input and output. 42 | @MainActor public private(set) var handlingVoice: Bool = false 43 | 44 | /// Whether the user is currently speaking. 45 | /// This only works when using the server's voice detection. 46 | @MainActor public private(set) var isUserSpeaking: Bool = false 47 | 48 | /// Whether the model is currently speaking. 49 | @MainActor public private(set) var isPlaying: Bool = false 50 | 51 | /// A list of messages in the conversation. 52 | /// Note that this doesn't include function call events. To get a complete list, use `entries`. 53 | @MainActor public var messages: [Item.Message] { 54 | entries.compactMap { switch $0 { 55 | case let .message(message): return message 56 | default: return nil 57 | } } 58 | } 59 | 60 | private init(client: RealtimeAPI) { 61 | self.client = client 62 | (errors, errorStream) = AsyncStream.makeStream(of: ServerError.self) 63 | 64 | let events = client.events 65 | task = Task.detached { [weak self] in 66 | for try await event in events { 67 | guard !Task.isCancelled else { break } 68 | 69 | await self?.handleEvent(event) 70 | } 71 | 72 | await MainActor.run { [weak self] in 73 | self?.connected = false 74 | } 75 | } 76 | 77 | Task { @MainActor in 78 | client.onDisconnect = { [weak self] in 79 | guard let self else { return } 80 | 81 | Task { @MainActor in 82 | self.connected = false 83 | } 84 | } 85 | 86 | _keepIsPlayingPropertyUpdated() 87 | } 88 | } 89 | 90 | deinit { 91 | task.cancel() 92 | errorStream.finish() 93 | 94 | Task { [playerNode, audioEngine] in 95 | Self.cleanUpAudio(playerNode: playerNode, audioEngine: audioEngine) 96 | } 97 | } 98 | 99 | /// Create a new conversation providing an API token and, optionally, a model. 100 | public convenience init(authToken token: String, model: String = "gpt-4o-realtime-preview") { 101 | self.init(client: RealtimeAPI.webSocket(authToken: token, model: model)) 102 | } 103 | 104 | /// Create a new conversation that connects using a custom `URLRequest`. 105 | public convenience init(connectingTo request: URLRequest) { 106 | self.init(client: RealtimeAPI.webSocket(connectingTo: request)) 107 | } 108 | 109 | /// Wait for the connection to be established 110 | @MainActor public func waitForConnection() async { 111 | while true { 112 | if connected { 113 | return 114 | } 115 | 116 | try? await Task.sleep(for: .milliseconds(500)) 117 | } 118 | } 119 | 120 | /// Execute a block of code when the connection is established 121 | @MainActor public func whenConnected(_ callback: @Sendable () async throws(E) -> Void) async throws(E) { 122 | await waitForConnection() 123 | try await callback() 124 | } 125 | 126 | /// Make changes to the current session 127 | /// Note that this will fail if the session hasn't started yet. Use `whenConnected` to ensure the session is ready. 
128 | public func updateSession(withChanges callback: (inout Session) -> Void) async throws { 129 | guard var session = await session else { 130 | throw ConversationError.sessionNotFound 131 | } 132 | 133 | callback(&session) 134 | 135 | try await setSession(session) 136 | } 137 | 138 | /// Set the configuration of the current session 139 | public func setSession(_ session: Session) async throws { 140 | // update endpoint errors if we include the session id 141 | var session = session 142 | session.id = nil 143 | 144 | try await client.send(event: .updateSession(session)) 145 | } 146 | 147 | /// Send a client event to the server. 148 | /// > Warning: This function is intended for advanced use cases. Use the other functions to send messages and audio data. 149 | public func send(event: ClientEvent) async throws { 150 | try await client.send(event: event) 151 | } 152 | 153 | /// Manually append audio bytes to the conversation. 154 | /// Commit the audio to trigger a model response when server turn detection is disabled. 155 | /// > Note: The `Conversation` class can automatically handle listening to the user's mic and playing back model responses. 156 | /// > To get started, call the `startListening` function. 157 | public func send(audioDelta audio: Data, commit: Bool = false) async throws { 158 | try await send(event: .appendInputAudioBuffer(encoding: audio)) 159 | if commit { try await send(event: .commitInputAudioBuffer()) } 160 | } 161 | 162 | /// Send a text message and wait for a response. 163 | /// Optionally, you can provide a response configuration to customize the model's behavior. 164 | /// > Note: Calling this function will automatically call `interruptSpeech` if the model is currently speaking. 165 | public func send(from role: Item.ItemRole, text: String, response: Response.Config? = nil) async throws { 166 | if await handlingVoice { await interruptSpeech() } 167 | 168 | try await send(event: .createConversationItem(Item(message: Item.Message(id: String(randomLength: 32), from: role, content: [.input_text(text)])))) 169 | try await send(event: .createResponse(response)) 170 | } 171 | 172 | /// Send the response of a function call. 173 | public func send(result output: Item.FunctionCallOutput) async throws { 174 | try await send(event: .createConversationItem(Item(with: output))) 175 | } 176 | } 177 | 178 | /// Listening/Speaking public API 179 | public extension Conversation { 180 | /// Start listening to the user's microphone and sending audio data to the model. 181 | /// This will automatically call `startHandlingVoice` if it hasn't been called yet. 182 | /// > Warning: Make sure to handle the case where the user denies microphone access. 183 | @MainActor func startListening() throws { 184 | guard !isListening else { return } 185 | if !handlingVoice { try startHandlingVoice() } 186 | 187 | Task.detached { [audioEngine] in 188 | audioEngine.inputNode.installTap(onBus: 0, bufferSize: 4096, format: audioEngine.inputNode.outputFormat(forBus: 0)) { [weak self] buffer, _ in 189 | self?.processAudioBufferFromUser(buffer: buffer) 190 | } 191 | } 192 | 193 | isListening = true 194 | } 195 | 196 | /// Stop listening to the user's microphone. 197 | /// This won't stop playing back model responses. To fully stop handling voice conversations, call `stopHandlingVoice`. 
198 | @MainActor func stopListening() { 199 | guard isListening else { return } 200 | 201 | audioEngine.inputNode.removeTap(onBus: 0) 202 | isListening = false 203 | } 204 | 205 | /// Handle the playback of audio responses from the model. 206 | @MainActor func startHandlingVoice() throws { 207 | guard !handlingVoice else { return } 208 | 209 | guard let converter = AVAudioConverter(from: audioEngine.inputNode.outputFormat(forBus: 0), to: desiredFormat) else { 210 | throw ConversationError.converterInitializationFailed 211 | } 212 | userConverter.set(converter) 213 | 214 | audioEngine.attach(playerNode) 215 | audioEngine.connect(playerNode, to: audioEngine.mainMixerNode, format: converter.inputFormat) 216 | 217 | #if os(iOS) 218 | try audioEngine.inputNode.setVoiceProcessingEnabled(true) 219 | #endif 220 | 221 | audioEngine.prepare() 222 | do { 223 | try audioEngine.start() 224 | 225 | #if os(iOS) 226 | let audioSession = AVAudioSession.sharedInstance() 227 | try audioSession.setCategory(.playAndRecord, mode: .voiceChat, options: [.defaultToSpeaker, .allowBluetooth]) 228 | try audioSession.setActive(true) 229 | #endif 230 | 231 | handlingVoice = true 232 | } catch { 233 | print("Failed to enable audio engine: \(error)") 234 | 235 | audioEngine.disconnectNodeInput(playerNode) 236 | audioEngine.disconnectNodeOutput(playerNode) 237 | 238 | throw error 239 | } 240 | } 241 | 242 | /// Interrupt the model's response if it's currently playing. 243 | /// This lets the model know that the user didn't hear the full response. 244 | @MainActor func interruptSpeech() { 245 | guard !isInterrupting else { return } 246 | isInterrupting = true 247 | 248 | if isPlaying, 249 | let nodeTime = playerNode.lastRenderTime, 250 | let playerTime = playerNode.playerTime(forNodeTime: nodeTime), 251 | let itemID = queuedSamples.first 252 | { 253 | let audioTimeInMiliseconds = Int((Double(playerTime.sampleTime) / playerTime.sampleRate) * 1000) 254 | 255 | Task { [client] in 256 | do { 257 | try await client.send(event: .truncateConversationItem(forItem: itemID, atAudioMs: audioTimeInMiliseconds)) 258 | } catch { 259 | print("Failed to send automatic truncation event: \(error)") 260 | } 261 | } 262 | } 263 | 264 | playerNode.stop() 265 | queuedSamples.clear() 266 | isInterrupting = false 267 | } 268 | 269 | @MainActor func stopHandlingVoice() { 270 | guard handlingVoice else { return } 271 | 272 | Self.cleanUpAudio(playerNode: playerNode, audioEngine: audioEngine) 273 | 274 | isListening = false 275 | handlingVoice = false 276 | } 277 | 278 | /// Stop playing audio responses from the model and listening to the user's microphone. 279 | static func cleanUpAudio(playerNode: AVAudioPlayerNode, audioEngine: AVAudioEngine) { 280 | // If attachedNodes does not contain the playerNode then `startHandlingVoice` was never called 281 | guard audioEngine.attachedNodes.contains(playerNode) else { return } 282 | 283 | audioEngine.inputNode.removeTap(onBus: 0) 284 | audioEngine.stop() 285 | audioEngine.disconnectNodeInput(playerNode) 286 | audioEngine.disconnectNodeOutput(playerNode) 287 | 288 | #if os(iOS) 289 | try? 
AVAudioSession.sharedInstance().setActive(false) 290 | #elseif os(macOS) 291 | if audioEngine.isRunning { 292 | audioEngine.stop() 293 | audioEngine.reset() 294 | } 295 | #endif 296 | } 297 | } 298 | 299 | /// Event handling private API 300 | private extension Conversation { 301 | @MainActor func handleEvent(_ event: ServerEvent) { 302 | switch event { 303 | case let .error(event): 304 | errorStream.yield(event.error) 305 | case let .sessionCreated(event): 306 | connected = true 307 | session = event.session 308 | case let .sessionUpdated(event): 309 | session = event.session 310 | case let .conversationCreated(event): 311 | id = event.conversation.id 312 | case let .conversationItemCreated(event): 313 | entries.append(event.item) 314 | case let .conversationItemDeleted(event): 315 | entries.removeAll { $0.id == event.itemId } 316 | case let .conversationItemInputAudioTranscriptionCompleted(event): 317 | updateEvent(id: event.itemId) { message in 318 | guard case let .input_audio(audio) = message.content[event.contentIndex] else { return } 319 | 320 | message.content[event.contentIndex] = .input_audio(.init(audio: audio.audio, transcript: event.transcript)) 321 | } 322 | case let .conversationItemInputAudioTranscriptionFailed(event): 323 | errorStream.yield(event.error) 324 | case let .responseContentPartAdded(event): 325 | updateEvent(id: event.itemId) { message in 326 | message.content.insert(.init(from: event.part), at: event.contentIndex) 327 | } 328 | case let .responseContentPartDone(event): 329 | updateEvent(id: event.itemId) { message in 330 | message.content[event.contentIndex] = .init(from: event.part) 331 | } 332 | case let .responseTextDelta(event): 333 | updateEvent(id: event.itemId) { message in 334 | guard case let .text(text) = message.content[event.contentIndex] else { return } 335 | 336 | message.content[event.contentIndex] = .text(text + event.delta) 337 | } 338 | case let .responseTextDone(event): 339 | updateEvent(id: event.itemId) { message in 340 | message.content[event.contentIndex] = .text(event.text) 341 | } 342 | case let .responseAudioTranscriptDelta(event): 343 | updateEvent(id: event.itemId) { message in 344 | guard case let .audio(audio) = message.content[event.contentIndex] else { return } 345 | 346 | message.content[event.contentIndex] = .audio(.init(audio: audio.audio, transcript: (audio.transcript ?? 
"") + event.delta)) 347 | } 348 | case let .responseAudioTranscriptDone(event): 349 | updateEvent(id: event.itemId) { message in 350 | guard case let .audio(audio) = message.content[event.contentIndex] else { return } 351 | 352 | message.content[event.contentIndex] = .audio(.init(audio: audio.audio, transcript: event.transcript)) 353 | } 354 | case let .responseAudioDelta(event): 355 | updateEvent(id: event.itemId) { message in 356 | guard case let .audio(audio) = message.content[event.contentIndex] else { return } 357 | 358 | if handlingVoice { queueAudioSample(event) } 359 | message.content[event.contentIndex] = .audio(.init(audio: audio.audio + event.delta, transcript: audio.transcript)) 360 | } 361 | case let .responseFunctionCallArgumentsDelta(event): 362 | updateEvent(id: event.itemId) { functionCall in 363 | functionCall.arguments.append(event.delta) 364 | } 365 | case let .responseFunctionCallArgumentsDone(event): 366 | updateEvent(id: event.itemId) { functionCall in 367 | functionCall.arguments = event.arguments 368 | } 369 | case .inputAudioBufferSpeechStarted: 370 | isUserSpeaking = true 371 | if handlingVoice { interruptSpeech() } 372 | case .inputAudioBufferSpeechStopped: 373 | isUserSpeaking = false 374 | case let .responseOutputItemDone(event): 375 | updateEvent(id: event.item.id) { message in 376 | guard case let .message(newMessage) = event.item else { return } 377 | 378 | message = newMessage 379 | } 380 | default: 381 | return 382 | } 383 | } 384 | 385 | @MainActor 386 | func updateEvent(id: String, modifying closure: (inout Item.Message) -> Void) { 387 | guard let index = entries.firstIndex(where: { $0.id == id }), case var .message(message) = entries[index] else { 388 | return 389 | } 390 | 391 | closure(&message) 392 | 393 | entries[index] = .message(message) 394 | } 395 | 396 | @MainActor 397 | func updateEvent(id: String, modifying closure: (inout Item.FunctionCall) -> Void) { 398 | guard let index = entries.firstIndex(where: { $0.id == id }), case var .functionCall(functionCall) = entries[index] else { 399 | return 400 | } 401 | 402 | closure(&functionCall) 403 | 404 | entries[index] = .functionCall(functionCall) 405 | } 406 | } 407 | 408 | /// Audio processing private API 409 | private extension Conversation { 410 | private func queueAudioSample(_ event: ServerEvent.ResponseAudioDeltaEvent) { 411 | guard let buffer = AVAudioPCMBuffer.fromData(event.delta, format: desiredFormat) else { 412 | print("Failed to create audio buffer.") 413 | return 414 | } 415 | 416 | guard let converter = apiConverter.lazy({ AVAudioConverter(from: buffer.format, to: playerNode.outputFormat(forBus: 0)) }) else { 417 | print("Failed to create audio converter.") 418 | return 419 | } 420 | 421 | let outputFrameCapacity = AVAudioFrameCount(ceil(converter.outputFormat.sampleRate / buffer.format.sampleRate) * Double(buffer.frameLength)) 422 | 423 | guard let sample = convertBuffer(buffer: buffer, using: converter, capacity: outputFrameCapacity) else { 424 | print("Failed to convert buffer.") 425 | return 426 | } 427 | 428 | queuedSamples.push(event.itemId) 429 | 430 | playerNode.scheduleBuffer(sample, at: nil, completionCallbackType: .dataPlayedBack) { [weak self] _ in 431 | guard let self else { return } 432 | 433 | self.queuedSamples.popFirst() 434 | if self.queuedSamples.isEmpty { 435 | Task { @MainActor in 436 | playerNode.pause() 437 | } 438 | } 439 | } 440 | 441 | playerNode.play() 442 | } 443 | 444 | private func processAudioBufferFromUser(buffer: AVAudioPCMBuffer) { 445 | let ratio = 
desiredFormat.sampleRate / buffer.format.sampleRate 446 | 447 | guard let convertedBuffer = convertBuffer(buffer: buffer, using: userConverter.get()!, capacity: AVAudioFrameCount(Double(buffer.frameLength) * ratio)) else { 448 | print("Buffer conversion failed.") 449 | return 450 | } 451 | 452 | guard let sampleBytes = convertedBuffer.audioBufferList.pointee.mBuffers.mData else { return } 453 | let audioData = Data(bytes: sampleBytes, count: Int(convertedBuffer.audioBufferList.pointee.mBuffers.mDataByteSize)) 454 | 455 | Task { 456 | try await send(audioDelta: audioData) 457 | } 458 | } 459 | 460 | private func convertBuffer(buffer: AVAudioPCMBuffer, using converter: AVAudioConverter, capacity: AVAudioFrameCount) -> AVAudioPCMBuffer? { 461 | if buffer.format == converter.outputFormat { 462 | return buffer 463 | } 464 | 465 | guard let convertedBuffer = AVAudioPCMBuffer(pcmFormat: converter.outputFormat, frameCapacity: capacity) else { 466 | print("Failed to create converted audio buffer.") 467 | return nil 468 | } 469 | 470 | var error: NSError? 471 | var allSamplesReceived = false 472 | 473 | let status = converter.convert(to: convertedBuffer, error: &error) { _, outStatus in 474 | if allSamplesReceived { 475 | outStatus.pointee = .noDataNow 476 | return nil 477 | } 478 | 479 | allSamplesReceived = true 480 | outStatus.pointee = .haveData 481 | return buffer 482 | } 483 | 484 | if status == .error { 485 | if let error = error { 486 | print("Error during conversion: \(error.localizedDescription)") 487 | } 488 | return nil 489 | } 490 | 491 | return convertedBuffer 492 | } 493 | } 494 | 495 | // Other private methods 496 | extension Conversation { 497 | /// This hack is required because relying on `queuedSamples.isEmpty` directly crashes the app. 498 | /// This is because updating the `queuedSamples` array on a background thread will trigger a re-render of any views that depend on it on that thread. 499 | /// So, instead, we observe the property and update `isPlaying` on the main actor. 500 | private func _keepIsPlayingPropertyUpdated() { 501 | withObservationTracking { _ = queuedSamples.isEmpty } onChange: { [weak self] in 502 | Task { @MainActor in 503 | guard let self else { return } 504 | 505 | self.isPlaying = self.queuedSamples.isEmpty 506 | } 507 | 508 | self?._keepIsPlayingPropertyUpdated() 509 | } 510 | } 511 | } 512 | -------------------------------------------------------------------------------- /src/Extensions/AVAudioPCMBuffer+fromData.swift: -------------------------------------------------------------------------------- 1 | import AVFoundation 2 | 3 | extension AVAudioPCMBuffer { 4 | static func fromData(_ data: Data, format: AVAudioFormat) -> AVAudioPCMBuffer? 
{ 5 | let frameCount = UInt32(data.count) / format.streamDescription.pointee.mBytesPerFrame 6 | 7 | guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else { 8 | print("Error: Failed to create AVAudioPCMBuffer") 9 | return nil 10 | } 11 | 12 | buffer.frameLength = frameCount 13 | let audioBuffer = buffer.audioBufferList.pointee.mBuffers 14 | 15 | data.withUnsafeBytes { bufferPointer in 16 | guard let address = bufferPointer.baseAddress else { 17 | print("Error: Failed to get base address of data") 18 | return 19 | } 20 | 21 | audioBuffer.mData?.copyMemory(from: address, byteCount: Int(audioBuffer.mDataByteSize)) 22 | } 23 | 24 | return buffer 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/Extensions/Collection+safe.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | extension Collection { 4 | // Returns the element at the specified index if it is within bounds, otherwise nil. 5 | subscript(safe index: Index) -> Element? { 6 | indices.contains(index) ? self[index] : nil 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /src/Extensions/Continuation+error.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | extension AsyncThrowingStream.Continuation where Failure == any Error { 4 | func yield(error: Failure) { 5 | yield(with: Result.failure(error)) 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /src/Extensions/String+random.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | extension String { 4 | init(randomLength length: Int) { 5 | let letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" 6 | self = String((0.. Self { 111 | .updateSession(SessionUpdateEvent(eventId: id, session: session)) 112 | } 113 | 114 | static func appendInputAudioBuffer(id: String? = nil, encoding audio: Data) -> Self { 115 | .appendInputAudioBuffer(InputAudioBufferAppendEvent(eventId: id, audio: audio.base64EncodedString())) 116 | } 117 | 118 | static func commitInputAudioBuffer(id: String? = nil) -> Self { 119 | .commitInputAudioBuffer(InputAudioBufferCommitEvent(eventId: id)) 120 | } 121 | 122 | static func clearInputAudioBuffer(id: String? = nil) -> Self { 123 | .clearInputAudioBuffer(InputAudioBufferClearEvent(eventId: id)) 124 | } 125 | 126 | static func createConversationItem(id: String? = nil, previous previousID: String? = nil, _ item: Item) -> Self { 127 | .createConversationItem(ConversationItemCreateEvent(eventId: id, previousItemId: previousID, item: item)) 128 | } 129 | 130 | static func truncateConversationItem(id eventId: String? = nil, forItem itemId: String, at index: Int = 0, atAudioMs audioMs: Int) -> Self { 131 | .truncateConversationItem(ConversationItemTruncateEvent(eventId: eventId, itemId: itemId, contentIndex: index, audioEndMs: audioMs)) 132 | } 133 | 134 | static func deleteConversationItem(id eventId: String? = nil, for id: String? = nil, at index: Int, atAudio audioIndex: Int) -> Self { 135 | .deleteConversationItem(ConversationItemDeleteEvent(eventId: eventId, itemId: id, contentIndex: index, audioEndMs: audioIndex)) 136 | } 137 | 138 | static func createResponse(id: String? = nil, _ response: Response.Config? 
= nil) -> Self { 139 | .createResponse(ResponseCreateEvent(eventId: id, response: response)) 140 | } 141 | 142 | static func cancelResponse(id: String? = nil) -> Self { 143 | .cancelResponse(ResponseCancelEvent(eventId: id)) 144 | } 145 | } 146 | 147 | extension ClientEvent: Encodable { 148 | private enum CodingKeys: String, CodingKey { 149 | case type 150 | } 151 | 152 | public func encode(to encoder: Encoder) throws { 153 | switch self { 154 | case let .updateSession(event): 155 | try event.encode(to: encoder) 156 | case let .appendInputAudioBuffer(event): 157 | try event.encode(to: encoder) 158 | case let .commitInputAudioBuffer(event): 159 | try event.encode(to: encoder) 160 | case let .clearInputAudioBuffer(event): 161 | try event.encode(to: encoder) 162 | case let .createConversationItem(event): 163 | try event.encode(to: encoder) 164 | case let .truncateConversationItem(event): 165 | try event.encode(to: encoder) 166 | case let .deleteConversationItem(event): 167 | try event.encode(to: encoder) 168 | case let .createResponse(event): 169 | try event.encode(to: encoder) 170 | case let .cancelResponse(event): 171 | try event.encode(to: encoder) 172 | } 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /src/Models/Item.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | public enum Item: Identifiable, Equatable, Sendable { 4 | public enum ItemStatus: String, Codable, Sendable { 5 | case completed 6 | case in_progress 7 | case incomplete 8 | } 9 | 10 | public enum ItemRole: String, Codable, Sendable { 11 | case user 12 | case system 13 | case assistant 14 | } 15 | 16 | public struct Audio: Equatable, Sendable { 17 | /// Base64-encoded audio bytes. 18 | public var audio: Data 19 | /// The transcript of the audio. 20 | public var transcript: String? 21 | 22 | public init(audio: Data = Data(), transcript: String? = nil) { 23 | self.audio = audio 24 | self.transcript = transcript 25 | } 26 | } 27 | 28 | public enum ContentPart: Equatable, Sendable { 29 | case text(String) 30 | case audio(Audio) 31 | } 32 | 33 | public struct Message: Identifiable, Codable, Equatable, Sendable { 34 | public enum Content: Equatable, Sendable { 35 | case text(String) 36 | case audio(Audio) 37 | case input_text(String) 38 | case input_audio(Audio) 39 | 40 | public var text: String? { 41 | switch self { 42 | case let .text(text): 43 | return text 44 | case let .input_text(text): 45 | return text 46 | case let .input_audio(audio): 47 | return audio.transcript 48 | case let .audio(audio): 49 | return audio.transcript 50 | } 51 | } 52 | } 53 | 54 | /// The unique ID of the item. 55 | public var id: String 56 | /// The type of the item 57 | private var type: String = "message" 58 | /// The status of the item 59 | public var status: ItemStatus 60 | /// The role associated with the item 61 | public var role: ItemRole 62 | /// The content of the message. 63 | public var content: [Content] 64 | 65 | public init(id: String, from role: ItemRole, content: [Content]) { 66 | self.id = id 67 | self.role = role 68 | status = .completed 69 | self.content = content 70 | } 71 | } 72 | 73 | public struct FunctionCall: Identifiable, Codable, Equatable, Sendable { 74 | /// The unique ID of the item. 
75 | public var id: String 76 | /// The type of the item 77 | private var type: String = "function_call" 78 | /// The status of the item 79 | public var status: ItemStatus 80 | /// The ID of the function call 81 | public var callId: String 82 | /// The name of the function being called 83 | public var name: String 84 | /// The arguments of the function call 85 | public var arguments: String 86 | } 87 | 88 | public struct FunctionCallOutput: Identifiable, Codable, Equatable, Sendable { 89 | /// The unique ID of the item. 90 | public var id: String 91 | /// The type of the item 92 | private var type: String = "function_call_output" 93 | /// The ID of the function call 94 | public var callId: String 95 | /// The output of the function call 96 | public var output: String 97 | 98 | public init(id: String, callId: String, output: String) { 99 | self.id = id 100 | self.callId = callId 101 | self.output = output 102 | } 103 | } 104 | 105 | case message(Message) 106 | case functionCall(FunctionCall) 107 | case functionCallOutput(FunctionCallOutput) 108 | 109 | public var id: String { 110 | switch self { 111 | case let .message(message): 112 | return message.id 113 | case let .functionCall(functionCall): 114 | return functionCall.id 115 | case let .functionCallOutput(functionCallOutput): 116 | return functionCallOutput.id 117 | } 118 | } 119 | 120 | public init(message: Message) { 121 | self = .message(message) 122 | } 123 | 124 | public init(calling functionCall: FunctionCall) { 125 | self = .functionCall(functionCall) 126 | } 127 | 128 | public init(with functionCallOutput: FunctionCallOutput) { 129 | self = .functionCallOutput(functionCallOutput) 130 | } 131 | } 132 | 133 | // MARK: Helpers 134 | 135 | public extension Item.Message.Content { 136 | init(from part: Item.ContentPart) { 137 | switch part { 138 | case let .audio(audio): 139 | self = .audio(audio) 140 | case let .text(text): 141 | self = .text(text) 142 | } 143 | } 144 | } 145 | 146 | // MARK: Codable implementations 147 | 148 | extension Item: Codable { 149 | private enum CodingKeys: String, CodingKey { 150 | case type 151 | } 152 | 153 | public init(from decoder: any Decoder) throws { 154 | let container = try decoder.container(keyedBy: CodingKeys.self) 155 | let type = try container.decode(String.self, forKey: .type) 156 | 157 | switch type { 158 | case "message": 159 | self = try .message(Message(from: decoder)) 160 | case "function_call": 161 | self = try .functionCall(FunctionCall(from: decoder)) 162 | case "function_call_output": 163 | self = try .functionCallOutput(FunctionCallOutput(from: decoder)) 164 | default: 165 | throw DecodingError.dataCorruptedError(forKey: .type, in: container, debugDescription: "Unknown item type: \(type)") 166 | } 167 | } 168 | 169 | public func encode(to encoder: Encoder) throws { 170 | switch self { 171 | case let .message(message): 172 | try message.encode(to: encoder) 173 | case let .functionCall(functionCall): 174 | try functionCall.encode(to: encoder) 175 | case let .functionCallOutput(functionCallOutput): 176 | try functionCallOutput.encode(to: encoder) 177 | } 178 | } 179 | } 180 | 181 | extension Item.Audio: Decodable { 182 | private enum CodingKeys: String, CodingKey { 183 | case audio 184 | case transcript 185 | } 186 | 187 | public init(from decoder: any Decoder) throws { 188 | let container = try decoder.container(keyedBy: CodingKeys.self) 189 | transcript = try container.decodeIfPresent(String.self, forKey: .transcript) 190 | let encodedAudio = try 
container.decodeIfPresent(String.self, forKey: .audio) 191 | 192 | if let encodedAudio { 193 | guard let decodedAudio = Data(base64Encoded: encodedAudio) else { 194 | throw DecodingError.dataCorruptedError(forKey: .audio, in: container, debugDescription: "Invalid base64-encoded audio data.") 195 | } 196 | audio = decodedAudio 197 | } else { 198 | audio = Data() 199 | } 200 | } 201 | } 202 | 203 | extension Item.ContentPart: Decodable { 204 | private enum CodingKeys: String, CodingKey { 205 | case type 206 | case text 207 | case audio 208 | case transcript 209 | } 210 | 211 | private struct Text: Codable { 212 | let text: String 213 | 214 | enum CodingKeys: CodingKey { 215 | case text 216 | } 217 | } 218 | 219 | public init(from decoder: any Decoder) throws { 220 | let container = try decoder.container(keyedBy: CodingKeys.self) 221 | let type = try container.decode(String.self, forKey: .type) 222 | 223 | switch type { 224 | case "text": 225 | let container = try decoder.container(keyedBy: Text.CodingKeys.self) 226 | self = try .text(container.decode(String.self, forKey: .text)) 227 | case "audio": 228 | self = try .audio(Item.Audio(from: decoder)) 229 | default: 230 | throw DecodingError.dataCorruptedError(forKey: .type, in: container, debugDescription: "Unknown content type: \(type)") 231 | } 232 | } 233 | } 234 | 235 | extension Item.Message.Content: Codable { 236 | private enum CodingKeys: String, CodingKey { 237 | case type 238 | case text 239 | case audio 240 | case transcript 241 | } 242 | 243 | private struct Text: Codable { 244 | let text: String 245 | 246 | enum CodingKeys: CodingKey { 247 | case text 248 | } 249 | } 250 | 251 | public init(from decoder: any Decoder) throws { 252 | let container = try decoder.container(keyedBy: CodingKeys.self) 253 | let type = try container.decode(String.self, forKey: .type) 254 | 255 | switch type { 256 | case "text": 257 | let container = try decoder.container(keyedBy: Text.CodingKeys.self) 258 | self = try .text(container.decode(String.self, forKey: .text)) 259 | case "input_text": 260 | let container = try decoder.container(keyedBy: Text.CodingKeys.self) 261 | self = try .input_text(container.decode(String.self, forKey: .text)) 262 | case "audio": 263 | self = try .audio(Item.Audio(from: decoder)) 264 | case "input_audio": 265 | self = try .input_audio(Item.Audio(from: decoder)) 266 | default: 267 | throw DecodingError.dataCorruptedError(forKey: .type, in: container, debugDescription: "Unknown content type: \(type)") 268 | } 269 | } 270 | 271 | public func encode(to encoder: Encoder) throws { 272 | var container = encoder.container(keyedBy: CodingKeys.self) 273 | 274 | switch self { 275 | case let .text(text): 276 | try container.encode(text, forKey: .text) 277 | try container.encode("text", forKey: .type) 278 | case let .input_text(text): 279 | try container.encode(text, forKey: .text) 280 | try container.encode("input_text", forKey: .type) 281 | case let .audio(audio): 282 | try container.encode("audio", forKey: .type) 283 | try container.encode(audio.transcript, forKey: .transcript) 284 | try container.encode(audio.audio.base64EncodedString(), forKey: .audio) 285 | case let .input_audio(audio): 286 | try container.encode("input_audio", forKey: .type) 287 | try container.encode(audio.transcript, forKey: .transcript) 288 | try container.encode(audio.audio.base64EncodedString(), forKey: .audio) 289 | } 290 | } 291 | } 292 | -------------------------------------------------------------------------------- /src/Models/Response.swift: 
-------------------------------------------------------------------------------- 1 | public struct Response: Identifiable, Codable, Equatable, Sendable { 2 | public struct Config: Codable, Equatable, Sendable { 3 | public enum Conversation: String, Codable, Equatable, Sendable { 4 | /// The contents of the response will be added to the default conversation. 5 | case auto 6 | /// An out-of-band response which will not add items to default conversation. 7 | case none 8 | } 9 | 10 | /// The modalities for the response. 11 | public let modalities: [Session.Modality] 12 | /// Instructions for the model. 13 | public let instructions: String 14 | /// The voice the model uses to respond. 15 | public let voice: Session.Voice 16 | /// The format of output audio. 17 | public let outputAudioFormat: Session.AudioFormat 18 | /// Tools (functions) available to the model. 19 | public let tools: [Session.Tool] 20 | /// How the model chooses tools. 21 | public let toolChoice: Session.ToolChoice 22 | /// Sampling temperature. 23 | public let temperature: Double 24 | /// Maximum number of output tokens. 25 | public let maxResponseOutputTokens: Int? 26 | /// Controls which conversation the response is added to. 27 | public let conversation: Conversation? 28 | /// Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format. Keys can be a maximum of 64 characters long and values can be a maximum of 512 characters long. 29 | public let metadata: [String: String]? 30 | /// Input items to include in the prompt for the model. Creates a new context for this response, without including the default conversation. Can include references to items from the default conversation. 31 | public let input: [Item]? 32 | 33 | public init(modalities: [Session.Modality] = [.text, .audio], instructions: String, voice: Session.Voice = .alloy, outputAudioFormat: Session.AudioFormat = .pcm16, tools: [Session.Tool] = [], toolChoice: Session.ToolChoice = .auto, temperature: Double = 1, maxResponseOutputTokens: Int? = nil, conversation: Conversation? = .auto, metadata: [String: String]? = nil, input: [Item]? 
= nil) { 34 | self.input = input 35 | self.voice = voice 36 | self.tools = tools 37 | self.metadata = metadata 38 | self.toolChoice = toolChoice 39 | self.modalities = modalities 40 | self.temperature = temperature 41 | self.instructions = instructions 42 | self.conversation = conversation 43 | self.outputAudioFormat = outputAudioFormat 44 | self.maxResponseOutputTokens = maxResponseOutputTokens 45 | } 46 | } 47 | 48 | public enum Status: String, Codable, Equatable, Sendable { 49 | case failed 50 | case completed 51 | case cancelled 52 | case incomplete 53 | case inProgress = "in_progress" 54 | } 55 | 56 | public struct Usage: Codable, Equatable, Sendable { 57 | public let totalTokens: Int 58 | public let inputTokens: Int 59 | public let outputTokens: Int 60 | public let inputTokenDetails: InputTokenDetails 61 | public let outputTokenDetails: OutputTokenDetails 62 | 63 | public struct InputTokenDetails: Codable, Equatable, Sendable { 64 | public let textTokens: Int 65 | public let audioTokens: Int 66 | public let cachedTokens: Int 67 | public let cachedTokensDetails: CachedTokensDetails 68 | 69 | public struct CachedTokensDetails: Codable, Equatable, Sendable { 70 | public let textTokens: Int 71 | public let audioTokens: Int 72 | } 73 | } 74 | 75 | public struct OutputTokenDetails: Codable, Equatable, Sendable { 76 | public let textTokens: Int 77 | public let audioTokens: Int 78 | } 79 | } 80 | 81 | /// The unique ID of the response. 82 | public let id: String 83 | /// The status of the response. 84 | public let status: Status 85 | /// The list of output items generated by the response. 86 | public let output: [Item] 87 | /// Usage statistics for the response. 88 | public let usage: Usage? 89 | /// Developer-provided string key-value pairs associated with this response. 90 | public let metadata: [String: String]? 91 | } 92 | -------------------------------------------------------------------------------- /src/Models/ServerError.swift: -------------------------------------------------------------------------------- 1 | public struct ServerError: Codable, Equatable, Sendable { 2 | /// The type of error (e.g., "invalid_request_error", "server_error"). 3 | public let type: String 4 | /// Error code, if any. 5 | public let code: String? 6 | /// A human-readable error message. 7 | public let message: String 8 | /// Parameter related to the error, if any. 9 | public let param: String? 10 | /// The eventId of the client event that caused the error, if applicable. 11 | public let eventId: String? 12 | } 13 | -------------------------------------------------------------------------------- /src/Models/ServerEvent.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | public enum ServerEvent: Sendable { 4 | public struct ErrorEvent: Decodable, Sendable { 5 | /// The unique ID of the server event. 6 | public let eventId: String 7 | /// Details of the error. 8 | public let error: ServerError 9 | } 10 | 11 | public struct SessionEvent: Decodable, Sendable { 12 | /// The unique ID of the server event. 13 | public let eventId: String 14 | /// The session resource. 15 | public let session: Session 16 | } 17 | 18 | public struct ConversationCreatedEvent: Decodable, Sendable { 19 | public struct Conversation: Codable, Sendable { 20 | /// The unique ID of the conversation. 21 | public let id: String 22 | } 23 | 24 | /// The unique ID of the server event. 25 | public let eventId: String 26 | /// The conversation resource. 
27 | public let conversation: Conversation 28 | } 29 | 30 | public struct InputAudioBufferCommittedEvent: Decodable, Sendable { 31 | /// The unique ID of the server event. 32 | public let eventId: String 33 | /// The ID of the preceding item after which the new item will be inserted. 34 | public let previousItemId: String? 35 | /// The ID of the user message item that will be created. 36 | public let itemId: String 37 | } 38 | 39 | public struct InputAudioBufferClearedEvent: Decodable, Sendable { 40 | /// The unique ID of the server event. 41 | public let eventId: String 42 | } 43 | 44 | public struct InputAudioBufferSpeechStartedEvent: Decodable, Sendable { 45 | /// The unique ID of the server event. 46 | public let eventId: String 47 | /// Milliseconds since the session started when speech was detected. 48 | public let audioStartMs: Int 49 | /// The ID of the user message item that will be created when speech stops. 50 | public let itemId: String 51 | } 52 | 53 | public struct InputAudioBufferSpeechStoppedEvent: Decodable, Sendable { 54 | /// The unique ID of the server event. 55 | public let eventId: String 56 | /// Milliseconds since the session started when speech stopped. 57 | public let audioEndMs: Int 58 | /// The ID of the user message item that will be created. 59 | public let itemId: String 60 | } 61 | 62 | public struct ConversationItemCreatedEvent: Decodable, Sendable { 63 | /// The unique ID of the server event. 64 | public let eventId: String 65 | /// The ID of the preceding item. 66 | public let previousItemId: String? 67 | /// The item that was created. 68 | public let item: Item 69 | } 70 | 71 | public struct ConversationItemInputAudioTranscriptionCompletedEvent: Decodable, Sendable { 72 | /// The unique ID of the server event. 73 | public let eventId: String 74 | /// The ID of the user message item. 75 | public let itemId: String 76 | /// The index of the content part containing the audio. 77 | public let contentIndex: Int 78 | /// The transcribed text. 79 | public let transcript: String 80 | } 81 | 82 | public struct ConversationItemInputAudioTranscriptionDeltaEvent: Decodable, Sendable { 83 | /// The unique ID of the server event. 84 | public let eventId: String 85 | /// The ID of the user message item. 86 | public let itemId: String 87 | /// The index of the content part containing the audio. 88 | public let contentIndex: Int 89 | /// The transcribed delta text. 90 | public let delta: String 91 | } 92 | 93 | public struct ConversationItemInputAudioTranscriptionFailedEvent: Decodable, Sendable { 94 | /// The unique ID of the server event. 95 | public let eventId: String 96 | /// The ID of the user message item. 97 | public let itemId: String 98 | /// The index of the content part containing the audio. 99 | public let contentIndex: Int 100 | /// Details of the transcription error. 101 | public let error: ServerError 102 | } 103 | 104 | public struct ConversationItemTruncatedEvent: Decodable, Sendable { 105 | /// The unique ID of the server event. 106 | public let eventId: String 107 | /// The ID of the assistant message item that was truncated. 108 | public let itemId: String 109 | /// The index of the content part that was truncated. 110 | public let contentIndex: Int 111 | /// The duration up to which the audio was truncated, in milliseconds. 112 | public let audioEndMs: Int 113 | } 114 | 115 | public struct ConversationItemDeletedEvent: Decodable, Sendable { 116 | /// The unique ID of the server event. 
117 | public let eventId: String 118 | /// The ID of the item that was deleted. 119 | public let itemId: String 120 | } 121 | 122 | public struct OutputAudioBufferStartedEvent: Decodable, Sendable { 123 | /// The unique ID of the server event. 124 | public let eventId: String 125 | /// The ID of the response. 126 | public let responseId: String 127 | } 128 | 129 | public struct OutputAudioBufferStoppedEvent: Decodable, Sendable { 130 | /// The unique ID of the server event. 131 | public let eventId: String 132 | /// The ID of the response. 133 | public let responseId: String 134 | } 135 | 136 | public struct ResponseEvent: Decodable, Sendable { 137 | /// The unique ID of the server event. 138 | public let eventId: String 139 | /// The response resource. 140 | public let response: Response 141 | } 142 | 143 | public struct ResponseOutputItemAddedEvent: Decodable, Sendable { 144 | /// The unique ID of the server event. 145 | public let eventId: String 146 | /// The ID of the response to which the item belongs. 147 | public let responseId: String 148 | /// The index of the output item in the response. 149 | public let outputIndex: Int 150 | /// The item that was added. 151 | public let item: Item 152 | } 153 | 154 | public struct ResponseOutputItemDoneEvent: Decodable, Sendable { 155 | /// The unique ID of the server event. 156 | public let eventId: String 157 | /// The ID of the response to which the item belongs. 158 | public let responseId: String 159 | /// The index of the output item in the response. 160 | public let outputIndex: Int 161 | /// The completed item. 162 | public let item: Item 163 | } 164 | 165 | public struct ResponseContentPartAddedEvent: Decodable, Sendable { 166 | /// The unique ID of the server event. 167 | public let eventId: String 168 | /// The ID of the response. 169 | public let responseId: String 170 | /// The ID of the item to which the content part was added. 171 | public let itemId: String 172 | /// The index of the output item in the response. 173 | public let outputIndex: Int 174 | /// The index of the content part in the item's content array. 175 | public let contentIndex: Int 176 | /// The content part that was added. 177 | public let part: Item.ContentPart 178 | } 179 | 180 | public struct ResponseContentPartDoneEvent: Decodable, Sendable { 181 | /// The unique ID of the server event. 182 | public let eventId: String 183 | /// The ID of the response. 184 | public let responseId: String 185 | /// The ID of the item. 186 | public let itemId: String 187 | /// The index of the output item in the response. 188 | public let outputIndex: Int 189 | /// The index of the content part in the item's content array. 190 | public let contentIndex: Int 191 | /// The content part that is done. 192 | public let part: Item.ContentPart 193 | } 194 | 195 | public struct ResponseTextDeltaEvent: Decodable, Sendable { 196 | /// The unique ID of the server event. 197 | public let eventId: String 198 | /// The ID of the response. 199 | public let responseId: String 200 | /// The ID of the item. 201 | public let itemId: String 202 | /// The index of the output item in the response. 203 | public let outputIndex: Int 204 | /// The index of the content part in the item's content array. 205 | public let contentIndex: Int 206 | /// The text delta. 207 | public let delta: String 208 | } 209 | 210 | public struct ResponseTextDoneEvent: Decodable, Sendable { 211 | /// The unique ID of the server event. 212 | public let eventId: String 213 | /// The ID of the response. 
214 | public let responseId: String 215 | /// The ID of the item. 216 | public let itemId: String 217 | /// The index of the output item in the response. 218 | public let outputIndex: Int 219 | /// The index of the content part in the item's content array. 220 | public let contentIndex: Int 221 | /// The final text content. 222 | public let text: String 223 | } 224 | 225 | public struct ResponseAudioTranscriptDeltaEvent: Decodable, Sendable { 226 | /// The unique ID of the server event. 227 | public let eventId: String 228 | /// The ID of the response. 229 | public let responseId: String 230 | /// The ID of the item. 231 | public let itemId: String 232 | /// The index of the output item in the response. 233 | public let outputIndex: Int 234 | /// The index of the content part in the item's content array. 235 | public let contentIndex: Int 236 | /// The transcript delta. 237 | public let delta: String 238 | } 239 | 240 | public struct ResponseAudioTranscriptDoneEvent: Decodable, Sendable { 241 | /// The unique ID of the server event. 242 | public let eventId: String 243 | /// The ID of the response. 244 | public let responseId: String 245 | /// The ID of the item. 246 | public let itemId: String 247 | /// The index of the output item in the response. 248 | public let outputIndex: Int 249 | /// The index of the content part in the item's content array. 250 | public let contentIndex: Int 251 | /// The final transcript of the audio. 252 | public let transcript: String 253 | } 254 | 255 | public struct ResponseAudioDeltaEvent: Sendable { 256 | /// The unique ID of the server event. 257 | public let eventId: String 258 | /// The ID of the response. 259 | public let responseId: String 260 | /// The ID of the item. 261 | public let itemId: String 262 | /// The index of the output item in the response. 263 | public let outputIndex: Int 264 | /// The index of the content part in the item's content array. 265 | public let contentIndex: Int 266 | /// Base64-encoded audio data delta. 267 | public let delta: Data 268 | } 269 | 270 | public struct ResponseAudioDoneEvent: Decodable, Sendable { 271 | /// The unique ID of the server event. 272 | public let eventId: String 273 | /// The ID of the response. 274 | public let responseId: String 275 | /// The ID of the item. 276 | public let itemId: String 277 | /// The index of the output item in the response. 278 | public let outputIndex: Int 279 | /// The index of the content part in the item's content array. 280 | public let contentIndex: Int 281 | } 282 | 283 | public struct ResponseFunctionCallArgumentsDeltaEvent: Decodable, Sendable { 284 | /// The unique ID of the server event. 285 | public let eventId: String 286 | /// The ID of the response. 287 | public let responseId: String 288 | /// The ID of the function call item. 289 | public let itemId: String 290 | /// The index of the output item in the response. 291 | public let outputIndex: Int 292 | /// The ID of the function call. 293 | public let callId: String 294 | /// The arguments delta as a JSON string. 295 | public let delta: String 296 | } 297 | 298 | public struct ResponseFunctionCallArgumentsDoneEvent: Decodable, Sendable { 299 | /// The unique ID of the server event. 300 | public let eventId: String 301 | /// The ID of the response. 302 | public let responseId: String 303 | /// The ID of the function call item. 304 | public let itemId: String 305 | /// The index of the output item in the response. 306 | public let outputIndex: Int 307 | /// The ID of the function call. 
308 | public let callId: String 309 | /// The final arguments as a JSON string. 310 | public let arguments: String 311 | } 312 | 313 | public struct RateLimitsUpdatedEvent: Decodable, Sendable { 314 | public struct RateLimit: Codable, Sendable { 315 | /// The name of the rate limit 316 | public let name: String 317 | /// The maximum allowed value for the rate limit. 318 | public let limit: Int 319 | /// The remaining value before the limit is reached. 320 | public let remaining: Int 321 | /// Seconds until the rate limit resets. 322 | public let resetSeconds: Double 323 | } 324 | 325 | /// The unique ID of the server event. 326 | public let eventId: String 327 | /// List of rate limit information. 328 | public let rateLimits: [RateLimit] 329 | } 330 | 331 | /// Returned when an error occurs. 332 | case error(ErrorEvent) 333 | /// Returned when a session is created. Emitted automatically when a new connection is established. 334 | case sessionCreated(SessionEvent) 335 | /// Returned when a session is updated. 336 | case sessionUpdated(SessionEvent) 337 | /// Returned when a conversation is created. Emitted right after session creation. 338 | case conversationCreated(ConversationCreatedEvent) 339 | /// Returned when an input audio buffer is committed, either by the client or automatically in server VAD mode. 340 | case inputAudioBufferCommitted(InputAudioBufferCommittedEvent) 341 | /// Returned when the input audio buffer is cleared by the client. 342 | case inputAudioBufferCleared(InputAudioBufferClearedEvent) 343 | /// Returned in server turn detection mode when speech is detected. 344 | case inputAudioBufferSpeechStarted(InputAudioBufferSpeechStartedEvent) 345 | /// Returned in server turn detection mode when speech stops. 346 | case inputAudioBufferSpeechStopped(InputAudioBufferSpeechStoppedEvent) 347 | /// Returned when a conversation item is created. 348 | case conversationItemCreated(ConversationItemCreatedEvent) 349 | /// Returned when input audio transcription is enabled and a transcription succeeds. 350 | case conversationItemInputAudioTranscriptionCompleted(ConversationItemInputAudioTranscriptionCompletedEvent) 351 | /// Returned when input audio transcription is enabled and a transcription receives delta. 352 | case conversationItemInputAudioTranscriptionDelta(ConversationItemInputAudioTranscriptionDeltaEvent) 353 | /// Returned when input audio transcription is configured, and a transcription request for a user message failed. 354 | case conversationItemInputAudioTranscriptionFailed(ConversationItemInputAudioTranscriptionFailedEvent) 355 | /// Returned when an earlier assistant audio message item is truncated by the client. 356 | case conversationItemTruncated(ConversationItemTruncatedEvent) 357 | /// Returned when an item in the conversation is deleted. 358 | case conversationItemDeleted(ConversationItemDeletedEvent) 359 | /// Returned when the output audio buffer is started. 360 | case outputAudioBufferStarted(OutputAudioBufferStartedEvent) 361 | /// Returned when the output audio buffer is stopped. 362 | case outputAudioBufferStopped(OutputAudioBufferStoppedEvent) 363 | /// Returned when a new Response is created. The first event of response creation, where the response is in an initial state of "in_progress". 364 | case responseCreated(ResponseEvent) 365 | /// Returned when a Response is done streaming. Always emitted, no matter the final state. 366 | case responseDone(ResponseEvent) 367 | /// Returned when a new Item is created during response generation. 
368 | case responseOutputItemAdded(ResponseOutputItemAddedEvent) 369 | /// Returned when an Item is done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. 370 | case responseOutputItemDone(ResponseOutputItemDoneEvent) 371 | /// Returned when a new content part is added to an assistant message item during response generation. 372 | case responseContentPartAdded(ResponseContentPartAddedEvent) 373 | /// Returned when a content part is done streaming in an assistant message item. Also emitted when a Response is interrupted, incomplete, or cancelled. 374 | case responseContentPartDone(ResponseContentPartDoneEvent) 375 | /// Returned when the text value of a "text" content part is updated. 376 | case responseTextDelta(ResponseTextDeltaEvent) 377 | /// Returned when the text value of a "text" content part is done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. 378 | case responseTextDone(ResponseTextDoneEvent) 379 | /// Returned when the model-generated transcription of audio output is updated. 380 | case responseAudioTranscriptDelta(ResponseAudioTranscriptDeltaEvent) 381 | /// Returned when the model-generated transcription of audio output is done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. 382 | case responseAudioTranscriptDone(ResponseAudioTranscriptDoneEvent) 383 | /// Returned when the model-generated audio is updated. 384 | case responseAudioDelta(ResponseAudioDeltaEvent) 385 | /// Returned when the model-generated audio is done. Also emitted when a Response is interrupted, incomplete, or cancelled. 386 | case responseAudioDone(ResponseAudioDoneEvent) 387 | /// Returned when the model-generated function call arguments are updated. 388 | case responseFunctionCallArgumentsDelta(ResponseFunctionCallArgumentsDeltaEvent) 389 | /// Returned when the model-generated function call arguments are done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. 390 | case responseFunctionCallArgumentsDone(ResponseFunctionCallArgumentsDoneEvent) 391 | /// Emitted after every "response.done" event to indicate the updated rate limits. 
392 | case rateLimitsUpdated(RateLimitsUpdatedEvent) 393 | } 394 | 395 | extension ServerEvent: Identifiable { 396 | public var id: String { 397 | switch self { 398 | case let .error(event): 399 | return event.eventId 400 | case let .sessionCreated(event): 401 | return event.eventId 402 | case let .sessionUpdated(event): 403 | return event.eventId 404 | case let .conversationCreated(event): 405 | return event.eventId 406 | case let .inputAudioBufferCommitted(event): 407 | return event.eventId 408 | case let .inputAudioBufferCleared(event): 409 | return event.eventId 410 | case let .inputAudioBufferSpeechStarted(event): 411 | return event.eventId 412 | case let .inputAudioBufferSpeechStopped(event): 413 | return event.eventId 414 | case let .conversationItemCreated(event): 415 | return event.eventId 416 | case let .conversationItemInputAudioTranscriptionCompleted(event): 417 | return event.eventId 418 | case let .conversationItemInputAudioTranscriptionDelta(event): 419 | return event.eventId 420 | case let .conversationItemInputAudioTranscriptionFailed(event): 421 | return event.eventId 422 | case let .conversationItemTruncated(event): 423 | return event.eventId 424 | case let .conversationItemDeleted(event): 425 | return event.eventId 426 | case let .outputAudioBufferStarted(event): 427 | return event.eventId 428 | case let .outputAudioBufferStopped(event): 429 | return event.eventId 430 | case let .responseCreated(event): 431 | return event.eventId 432 | case let .responseDone(event): 433 | return event.eventId 434 | case let .responseOutputItemAdded(event): 435 | return event.eventId 436 | case let .responseOutputItemDone(event): 437 | return event.eventId 438 | case let .responseContentPartAdded(event): 439 | return event.eventId 440 | case let .responseContentPartDone(event): 441 | return event.eventId 442 | case let .responseTextDelta(event): 443 | return event.eventId 444 | case let .responseTextDone(event): 445 | return event.eventId 446 | case let .responseAudioTranscriptDelta(event): 447 | return event.eventId 448 | case let .responseAudioTranscriptDone(event): 449 | return event.eventId 450 | case let .responseAudioDelta(event): 451 | return event.eventId 452 | case let .responseAudioDone(event): 453 | return event.eventId 454 | case let .responseFunctionCallArgumentsDelta(event): 455 | return event.eventId 456 | case let .responseFunctionCallArgumentsDone(event): 457 | return event.eventId 458 | case let .rateLimitsUpdated(event): 459 | return event.eventId 460 | } 461 | } 462 | } 463 | 464 | extension ServerEvent: Decodable { 465 | private enum CodingKeys: String, CodingKey { 466 | case type 467 | } 468 | 469 | public init(from decoder: any Decoder) throws { 470 | let container = try decoder.container(keyedBy: CodingKeys.self) 471 | let eventType = try container.decode(String.self, forKey: .type) 472 | 473 | switch eventType { 474 | case "error": 475 | self = try .error(ErrorEvent(from: decoder)) 476 | case "session.created": 477 | self = try .sessionCreated(SessionEvent(from: decoder)) 478 | case "session.updated": 479 | self = try .sessionUpdated(SessionEvent(from: decoder)) 480 | case "conversation.created": 481 | self = try .conversationCreated(ConversationCreatedEvent(from: decoder)) 482 | case "input_audio_buffer.committed": 483 | self = try .inputAudioBufferCommitted(InputAudioBufferCommittedEvent(from: decoder)) 484 | case "input_audio_buffer.cleared": 485 | self = try .inputAudioBufferCleared(InputAudioBufferClearedEvent(from: decoder)) 486 | case 
"input_audio_buffer.speech_started": 487 | self = try .inputAudioBufferSpeechStarted(InputAudioBufferSpeechStartedEvent(from: decoder)) 488 | case "input_audio_buffer.speech_stopped": 489 | self = try .inputAudioBufferSpeechStopped(InputAudioBufferSpeechStoppedEvent(from: decoder)) 490 | case "conversation.item.created": 491 | self = try .conversationItemCreated(ConversationItemCreatedEvent(from: decoder)) 492 | case "conversation.item.input_audio_transcription.completed": 493 | self = try .conversationItemInputAudioTranscriptionCompleted(ConversationItemInputAudioTranscriptionCompletedEvent(from: decoder)) 494 | case "conversation.item.input_audio_transcription.delta": 495 | self = try .conversationItemInputAudioTranscriptionDelta(ConversationItemInputAudioTranscriptionDeltaEvent(from: decoder)) 496 | case "conversation.item.input_audio_transcription.failed": 497 | self = try .conversationItemInputAudioTranscriptionFailed(ConversationItemInputAudioTranscriptionFailedEvent(from: decoder)) 498 | case "conversation.item.truncated": 499 | self = try .conversationItemTruncated(ConversationItemTruncatedEvent(from: decoder)) 500 | case "conversation.item.deleted": 501 | self = try .conversationItemDeleted(ConversationItemDeletedEvent(from: decoder)) 502 | case "output_audio_buffer.started": 503 | self = try .outputAudioBufferStarted(OutputAudioBufferStartedEvent(from: decoder)) 504 | case "output_audio_buffer.stopped": 505 | self = try .outputAudioBufferStopped(OutputAudioBufferStoppedEvent(from: decoder)) 506 | case "response.created": 507 | self = try .responseCreated(ResponseEvent(from: decoder)) 508 | case "response.done": 509 | self = try .responseDone(ResponseEvent(from: decoder)) 510 | case "response.output_item.added": 511 | self = try .responseOutputItemAdded(ResponseOutputItemAddedEvent(from: decoder)) 512 | case "response.output_item.done": 513 | self = try .responseOutputItemDone(ResponseOutputItemDoneEvent(from: decoder)) 514 | case "response.content_part.added": 515 | self = try .responseContentPartAdded(ResponseContentPartAddedEvent(from: decoder)) 516 | case "response.content_part.done": 517 | self = try .responseContentPartDone(ResponseContentPartDoneEvent(from: decoder)) 518 | case "response.text.delta": 519 | self = try .responseTextDelta(ResponseTextDeltaEvent(from: decoder)) 520 | case "response.text.done": 521 | self = try .responseTextDone(ResponseTextDoneEvent(from: decoder)) 522 | case "response.audio_transcript.delta": 523 | self = try .responseAudioTranscriptDelta(ResponseAudioTranscriptDeltaEvent(from: decoder)) 524 | case "response.audio_transcript.done": 525 | self = try .responseAudioTranscriptDone(ResponseAudioTranscriptDoneEvent(from: decoder)) 526 | case "response.audio.delta": 527 | self = try .responseAudioDelta(ResponseAudioDeltaEvent(from: decoder)) 528 | case "response.audio.done": 529 | self = try .responseAudioDone(ResponseAudioDoneEvent(from: decoder)) 530 | case "response.function_call_arguments.delta": 531 | self = try .responseFunctionCallArgumentsDelta(ResponseFunctionCallArgumentsDeltaEvent(from: decoder)) 532 | case "response.function_call_arguments.done": 533 | self = try .responseFunctionCallArgumentsDone(ResponseFunctionCallArgumentsDoneEvent(from: decoder)) 534 | case "rate_limits.updated": 535 | self = try .rateLimitsUpdated(RateLimitsUpdatedEvent(from: decoder)) 536 | default: 537 | throw DecodingError.dataCorruptedError(forKey: .type, in: container, debugDescription: "Unknown event type: \(eventType)") 538 | } 539 | } 540 | } 541 | 542 | 
extension ServerEvent.ResponseAudioDeltaEvent: Decodable { 543 | private enum CodingKeys: CodingKey { 544 | case eventId 545 | case responseId 546 | case itemId 547 | case outputIndex 548 | case contentIndex 549 | case delta 550 | } 551 | 552 | public init(from decoder: any Decoder) throws { 553 | let container = try decoder.container(keyedBy: CodingKeys.self) 554 | 555 | itemId = try container.decode(String.self, forKey: .itemId) 556 | eventId = try container.decode(String.self, forKey: .eventId) 557 | outputIndex = try container.decode(Int.self, forKey: .outputIndex) 558 | responseId = try container.decode(String.self, forKey: .responseId) 559 | contentIndex = try container.decode(Int.self, forKey: .contentIndex) 560 | 561 | guard let decodedDelta = try Data(base64Encoded: container.decode(String.self, forKey: .delta)) else { 562 | throw DecodingError.dataCorruptedError(forKey: .delta, in: container, debugDescription: "Invalid base64-encoded audio data.") 563 | } 564 | delta = decodedDelta 565 | } 566 | } 567 | -------------------------------------------------------------------------------- /src/Models/Session.swift: -------------------------------------------------------------------------------- 1 | public struct Session: Codable, Equatable, Sendable { 2 | public enum Modality: String, Codable, Sendable { 3 | case text 4 | case audio 5 | } 6 | 7 | public enum Voice: String, Codable, Sendable { 8 | case alloy 9 | case echo 10 | case shimmer 11 | case ash 12 | case ballad 13 | case coral 14 | case sage 15 | case verse 16 | case fable 17 | case onyx 18 | case nova 19 | } 20 | 21 | public enum AudioFormat: String, Codable, Sendable { 22 | case pcm16 23 | case g711_ulaw 24 | case g711_alaw 25 | } 26 | 27 | public struct InputAudioTranscription: Codable, Equatable, Sendable { 28 | public enum TranscriptionModel: String, CaseIterable, Codable, Sendable { 29 | case whisper = "whisper-1" 30 | case gpt4o = "gpt-4o-transcribe" 31 | case gpt4oMini = "gpt-4o-mini-transcribe" 32 | } 33 | 34 | /// The model to use for transcription 35 | public var model: TranscriptionModel 36 | /// The language of the input audio. Supplying the input language in ISO-639-1 (e.g. `en`) format will improve accuracy and latency. 37 | public var language: String? 38 | /// An optional text to guide the model's style or continue a previous audio segment. 39 | /// 40 | /// For `whisper`, the [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). For `gpt4o` models, the prompt is a free text string, for example "expect words related to technology". 41 | public var prompt: String? 42 | 43 | public init(model: TranscriptionModel = .whisper) { 44 | self.model = model 45 | } 46 | } 47 | 48 | public struct InputAudioNoiseReduction: Codable, Equatable, Sendable { 49 | /// Type of noise reduction. 50 | public enum NoiseReductionType: String, CaseIterable, Codable, Sendable { 51 | /// For close-talking microphones such as headphones 52 | case nearField = "near_field" 53 | /// For far-field microphones such as laptop or conference room microphones 54 | case farField = "far_field" 55 | } 56 | 57 | /// Type of noise reduction. 58 | public var type: NoiseReductionType? 59 | 60 | public init(type: NoiseReductionType? 
= nil) { 61 | self.type = type 62 | } 63 | } 64 | 65 | public struct TurnDetection: Codable, Equatable, Sendable { 66 | public enum TurnDetectionType: String, Codable, Sendable { 67 | case serverVad = "server_vad" 68 | case semanticVad = "semantic_vad" 69 | case none 70 | } 71 | 72 | public enum TurnDetectionEagerness: String, Codable, Sendable { 73 | case low 74 | case high 75 | case auto 76 | case medium 77 | } 78 | 79 | /// The type of turn detection. 80 | public var type: TurnDetectionType 81 | /// Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0). 82 | public var threshold: Double? 83 | /// Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs. 84 | public var interruptResponse: Bool? 85 | /// Used only for `server_vad` mode. Amount of audio to include before speech starts (in milliseconds). 86 | public var prefixPaddingMs: Int? 87 | /// Used only for `server_vad` mode. Duration of silence to detect speech stop (in milliseconds). 88 | public var silenceDurationMs: Int? 89 | /// Whether or not to automatically generate a response when VAD is enabled. 90 | public var createResponse: Bool 91 | /// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` will wait longer for the user to continue speaking, `high` will respond more quickly. `auto` is the default and is equivalent to `medium`. 92 | public var eagerness: TurnDetectionEagerness? 93 | 94 | public init(type: TurnDetectionType = .serverVad, threshold: Double? = nil, interruptResponse: Bool? = nil, prefixPaddingMs: Int? = nil, silenceDurationMs: Int? = nil, createResponse: Bool = true, eagerness: TurnDetectionEagerness? = nil) { 95 | self.type = type 96 | self.eagerness = eagerness 97 | self.threshold = threshold 98 | self.createResponse = createResponse 99 | self.prefixPaddingMs = prefixPaddingMs 100 | self.silenceDurationMs = silenceDurationMs 101 | self.interruptResponse = interruptResponse 102 | } 103 | 104 | public static func serverVad(threshold: Double? = nil, interruptResponse: Bool? = nil, prefixPaddingMs: Int? = nil, silenceDurationMs: Int? = nil) -> TurnDetection { 105 | .init(type: .serverVad, threshold: threshold, interruptResponse: interruptResponse, prefixPaddingMs: prefixPaddingMs, silenceDurationMs: silenceDurationMs) 106 | } 107 | 108 | public static func semanticVad(eagerness: TurnDetectionEagerness = .auto) -> TurnDetection { 109 | .init(type: .semanticVad, eagerness: eagerness) 110 | } 111 | } 112 | 113 | public struct Tool: Codable, Equatable, Sendable { 114 | public struct FunctionParameters: Codable, Equatable, Sendable { 115 | public var type: JSONType 116 | public var properties: [String: Property]? 117 | public var required: [String]? 118 | public var pattern: String? 119 | public var const: String? 120 | public var `enum`: [String]? 121 | public var multipleOf: Int? 122 | public var minimum: Int? 123 | public var maximum: Int? 124 | 125 | public init( 126 | type: JSONType, 127 | properties: [String: Property]? = nil, 128 | required: [String]? = nil, 129 | pattern: String? = nil, 130 | const: String? = nil, 131 | enum: [String]? = nil, 132 | multipleOf: Int? = nil, 133 | minimum: Int? = nil, 134 | maximum: Int? 
= nil 135 | ) { 136 | self.type = type 137 | self.properties = properties 138 | self.required = required 139 | self.pattern = pattern 140 | self.const = const 141 | self.enum = `enum` 142 | self.multipleOf = multipleOf 143 | self.minimum = minimum 144 | self.maximum = maximum 145 | } 146 | 147 | public struct Property: Codable, Equatable, Sendable { 148 | public var type: JSONType 149 | public var description: String? 150 | public var format: String? 151 | public var items: Items? 152 | public var required: [String]? 153 | public var pattern: String? 154 | public var const: String? 155 | public var `enum`: [String]? 156 | public var multipleOf: Int? 157 | public var minimum: Double? 158 | public var maximum: Double? 159 | public var minItems: Int? 160 | public var maxItems: Int? 161 | public var uniqueItems: Bool? 162 | 163 | public init( 164 | type: JSONType, 165 | description: String? = nil, 166 | format: String? = nil, 167 | items: Self.Items? = nil, 168 | required: [String]? = nil, 169 | pattern: String? = nil, 170 | const: String? = nil, 171 | enum: [String]? = nil, 172 | multipleOf: Int? = nil, 173 | minimum: Double? = nil, 174 | maximum: Double? = nil, 175 | minItems: Int? = nil, 176 | maxItems: Int? = nil, 177 | uniqueItems: Bool? = nil 178 | ) { 179 | self.type = type 180 | self.description = description 181 | self.format = format 182 | self.items = items 183 | self.required = required 184 | self.pattern = pattern 185 | self.const = const 186 | self.enum = `enum` 187 | self.multipleOf = multipleOf 188 | self.minimum = minimum 189 | self.maximum = maximum 190 | self.minItems = minItems 191 | self.maxItems = maxItems 192 | self.uniqueItems = uniqueItems 193 | } 194 | 195 | public struct Items: Codable, Equatable, Sendable { 196 | public var type: JSONType 197 | public var properties: [String: Property]? 198 | public var pattern: String? 199 | public var const: String? 200 | public var `enum`: [String]? 201 | public var multipleOf: Int? 202 | public var minimum: Double? 203 | public var maximum: Double? 204 | public var minItems: Int? 205 | public var maxItems: Int? 206 | public var uniqueItems: Bool? 207 | 208 | public init( 209 | type: JSONType, 210 | properties: [String: Property]? = nil, 211 | pattern: String? = nil, 212 | const: String? = nil, 213 | enum: [String]? = nil, 214 | multipleOf: Int? = nil, 215 | minimum: Double? = nil, 216 | maximum: Double? = nil, 217 | minItems: Int? = nil, 218 | maxItems: Int? = nil, 219 | uniqueItems: Bool? = nil 220 | ) { 221 | self.type = type 222 | self.properties = properties 223 | self.pattern = pattern 224 | self.const = const 225 | self.enum = `enum` 226 | self.multipleOf = multipleOf 227 | self.minimum = minimum 228 | self.maximum = maximum 229 | self.minItems = minItems 230 | self.maxItems = maxItems 231 | self.uniqueItems = uniqueItems 232 | } 233 | } 234 | } 235 | 236 | public enum JSONType: String, Codable, Sendable { 237 | case integer 238 | case string 239 | case boolean 240 | case array 241 | case object 242 | case number 243 | case null 244 | } 245 | } 246 | 247 | /// The type of the tool. 248 | public var type: String = "function" 249 | /// The name of the function. 250 | public var name: String 251 | /// The description of the function. 252 | public var description: String 253 | /// Parameters of the function in JSON Schema. 
254 | public var parameters: FunctionParameters 255 | 256 | public init(type: String = "function", name: String, description: String, parameters: FunctionParameters) { 257 | self.type = type 258 | self.name = name 259 | self.description = description 260 | self.parameters = parameters 261 | } 262 | } 263 | 264 | public enum ToolChoice: Equatable, Sendable { 265 | case auto 266 | case none 267 | case required 268 | case function(String) 269 | 270 | public init(function name: String) { 271 | self = .function(name) 272 | } 273 | } 274 | 275 | /// The unique ID of the session. 276 | public var id: String? 277 | /// The default model used for this session. 278 | public var model: String 279 | /// The set of modalities the model can respond with. 280 | public var modalities: [Modality] 281 | /// The default system instructions. 282 | public var instructions: String 283 | /// The voice the model uses to respond. 284 | public var voice: Voice 285 | /// The format of input audio. 286 | public var inputAudioFormat: AudioFormat 287 | /// The format of output audio. 288 | public var outputAudioFormat: AudioFormat 289 | /// Configuration for input audio transcription. 290 | public var inputAudioTranscription: InputAudioTranscription? 291 | /// Configuration for input audio noise reduction. 292 | public var inputAudioNoiseReduction: InputAudioNoiseReduction? 293 | /// Configuration for turn detection. 294 | public var turnDetection: TurnDetection? 295 | /// Tools (functions) available to the model. 296 | public var tools: [Tool] 297 | /// How the model chooses tools. 298 | public var toolChoice: ToolChoice 299 | /// Sampling temperature. 300 | public var temperature: Double 301 | /// Maximum number of output tokens. 302 | public var maxOutputTokens: Int? 303 | 304 | public init( 305 | id: String? = nil, 306 | model: String, 307 | tools: [Tool] = [], 308 | instructions: String, 309 | voice: Voice = .alloy, 310 | temperature: Double = 1, 311 | maxOutputTokens: Int? = nil, 312 | toolChoice: ToolChoice = .auto, 313 | turnDetection: TurnDetection? = nil, 314 | inputAudioFormat: AudioFormat = .pcm16, 315 | outputAudioFormat: AudioFormat = .pcm16, 316 | modalities: [Modality] = [.text, .audio], 317 | inputAudioTranscription: InputAudioTranscription? = nil 318 | ) { 319 | self.id = id 320 | self.model = model 321 | self.tools = tools 322 | self.voice = voice 323 | self.toolChoice = toolChoice 324 | self.modalities = modalities 325 | self.temperature = temperature 326 | self.instructions = instructions 327 | self.turnDetection = turnDetection 328 | self.maxOutputTokens = maxOutputTokens 329 | self.inputAudioFormat = inputAudioFormat 330 | self.outputAudioFormat = outputAudioFormat 331 | self.inputAudioTranscription = inputAudioTranscription 332 | } 333 | } 334 | 335 | extension Session.ToolChoice: Codable { 336 | private enum FunctionCall: Codable { 337 | case type 338 | case function 339 | 340 | enum CodingKeys: CodingKey { 341 | case type 342 | case function 343 | } 344 | } 345 | 346 | public init(from decoder: any Decoder) throws { 347 | let container = try decoder.singleValueContainer() 348 | 349 | if let stringValue = try? 
container.decode(String.self) { 350 | switch stringValue { 351 | case "none": 352 | self = .none 353 | case "auto": 354 | self = .auto 355 | case "required": 356 | self = .required 357 | default: 358 | throw DecodingError.dataCorruptedError(in: container, debugDescription: "Invalid value for enum.") 359 | } 360 | } else { 361 | let container = try decoder.container(keyedBy: FunctionCall.CodingKeys.self) 362 | let functionContainer = try container.decode([String: String].self, forKey: .function) 363 | 364 | guard let name = functionContainer["name"] else { 365 | throw DecodingError.dataCorruptedError(forKey: .function, in: container, debugDescription: "Missing function name.") 366 | } 367 | 368 | self = .function(name) 369 | } 370 | } 371 | 372 | public func encode(to encoder: Encoder) throws { 373 | switch self { 374 | case .none: 375 | var container = encoder.singleValueContainer() 376 | try container.encode("none") 377 | case .auto: 378 | var container = encoder.singleValueContainer() 379 | try container.encode("auto") 380 | case .required: 381 | var container = encoder.singleValueContainer() 382 | try container.encode("required") 383 | case let .function(name): 384 | var container = encoder.container(keyedBy: FunctionCall.CodingKeys.self) 385 | try container.encode("function", forKey: .type) 386 | try container.encode(["name": name], forKey: .function) 387 | } 388 | } 389 | } 390 | -------------------------------------------------------------------------------- /src/OpenAIRealtime.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | #if canImport(FoundationNetworking) 3 | import FoundationNetworking 4 | #endif 5 | 6 | enum RealtimeAPIError: Error { 7 | case invalidMessage 8 | } 9 | 10 | public final class RealtimeAPI: NSObject, Sendable { 11 | @MainActor public var onDisconnect: (@Sendable () -> Void)? { 12 | get { connector.onDisconnect } 13 | set { connector.onDisconnect(newValue) } 14 | } 15 | 16 | public var events: AsyncThrowingStream<ServerEvent, Error> { 17 | connector.events 18 | } 19 | 20 | let connector: any Connector 21 | 22 | /// Connect to the OpenAI Realtime API using the given connector instance. 23 | public init(connector: any Connector) { 24 | self.connector = connector 25 | 26 | super.init() 27 | } 28 | 29 | public func send(event: ClientEvent) async throws { 30 | try await connector.send(event: event) 31 | } 32 | } 33 | 34 | /// Helper methods for connecting to the OpenAI Realtime API. 35 | extension RealtimeAPI { 36 | /// Connect to the OpenAI WebSocket Realtime API with the given request. 37 | static func webSocket(connectingTo request: URLRequest) -> RealtimeAPI { 38 | RealtimeAPI(connector: WebSocketConnector(connectingTo: request)) 39 | } 40 | 41 | /// Connect to the OpenAI WebSocket Realtime API with the given authentication token and model. 42 | static func webSocket(authToken: String, model: String = "gpt-4o-realtime-preview") -> RealtimeAPI { 43 | var request = URLRequest(url: URL(string: "wss://api.openai.com/v1/realtime")!.appending(queryItems: [ 44 | URLQueryItem(name: "model", value: model), 45 | ])) 46 | request.addValue("realtime=v1", forHTTPHeaderField: "OpenAI-Beta") 47 | request.addValue("Bearer \(authToken)", forHTTPHeaderField: "Authorization") 48 | 49 | return webSocket(connectingTo: request) 50 | } 51 | 52 | /// Connect to the OpenAI WebRTC Realtime API with the given request.
53 | static func webRTC(connectingTo request: URLRequest) async throws -> RealtimeAPI { 54 | try RealtimeAPI(connector: await WebRTCConnector(connectingTo: request)) 55 | } 56 | 57 | /// Connect to the OpenAI WebRTC Realtime API with the given authentication token and model. 58 | static func webRTC(authToken: String, model: String = "gpt-4o-realtime-preview") async throws -> RealtimeAPI { 59 | var request = URLRequest(url: URL(string: "wss://api.openai.com/v1/realtime")!.appending(queryItems: [ 60 | URLQueryItem(name: "model", value: model), 61 | ])) 62 | 63 | request.addValue("realtime=v1", forHTTPHeaderField: "OpenAI-Beta") 64 | request.addValue("Bearer \(authToken)", forHTTPHeaderField: "Authorization") 65 | return try await webRTC(connectingTo: request) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/Protocols/Connector.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | #if canImport(FoundationNetworking) 3 | import FoundationNetworking 4 | #endif 5 | 6 | public protocol Connector: Sendable { 7 | var events: AsyncThrowingStream<ServerEvent, Error> { get } 8 | @MainActor var onDisconnect: (@Sendable () -> Void)? { get } 9 | 10 | init(connectingTo request: URLRequest) async throws 11 | 12 | func send(event: ClientEvent) async throws 13 | 14 | @MainActor func onDisconnect(_ action: (@Sendable () -> Void)?) 15 | } 16 | -------------------------------------------------------------------------------- /src/Support/UnsafeInteriorMutable.swift: -------------------------------------------------------------------------------- 1 | final class UnsafeInteriorMutable<T: Sendable>: @unchecked Sendable { 2 | private var value: T? 3 | 4 | func set(_ value: T) { 5 | self.value = value 6 | } 7 | 8 | func get() -> T? { 9 | return value 10 | } 11 | 12 | func lazy(_ closure: () -> T?) -> T? { 13 | if case let .some(wrapped) = value { 14 | return wrapped 15 | } 16 | 17 | if let newValue = closure() { 18 | value = newValue 19 | return newValue 20 | } 21 | 22 | return nil 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/Support/UnsafeMutableArray.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | /// An unsafe mutable array that can be accessed from multiple threads. 4 | /// > Warning: Exposing any observable property externally (such as by having a computed property use `isEmpty`) will lead to very hard-to-debug crashes. 5 | /// > 6 | /// > If you really need to, manually observe the property using `withObservationTracking` and write changes in the main actor. 7 | @Observable final class UnsafeMutableArray<T: Sendable>: @unchecked Sendable { 8 | private var array = [T]() 9 | 10 | public var isEmpty: Bool { 11 | array.isEmpty 12 | } 13 | 14 | var first: T? { 15 | array.first 16 | } 17 | 18 | func push(_ value: T) { 19 | array.append(value) 20 | } 21 | 22 | @discardableResult 23 | func popFirst() -> T? { 24 | array.isEmpty ? nil : array.removeFirst() // return nil instead of trapping when the array is empty 25 | } 26 | 27 | func clear() { 28 | array.removeAll() 29 | } 30 | } 31 | --------------------------------------------------------------------------------
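Usage sketch (not part of the package): connecting and consuming server events. This is a minimal, hypothetical example; it assumes the `webSocket(authToken:model:)` helper above is reachable from the caller (in this dump it carries no explicit access modifier, so it is internal to the module) and that a real API key replaces the placeholder. The event cases and fields used here are the ones declared in src/Models/ServerEvent.swift.

import Foundation

func listen(authToken: String) async throws {
	let api = RealtimeAPI.webSocket(authToken: authToken)

	// `events` is the AsyncThrowingStream<ServerEvent, Error> surfaced by the connector.
	for try await event in api.events {
		switch event {
		case let .sessionCreated(event):
			print("Session started: \(event.session.id ?? "unknown")")
		case let .responseTextDelta(delta):
			print("Text delta for item \(delta.itemId): \(delta.delta)")
		case let .responseAudioDelta(delta):
			// `delta.delta` has already been base64-decoded into raw audio bytes.
			print("Received \(delta.delta.count) bytes of audio")
		case let .error(event):
			print("Server error: \(event.error.message)")
		default:
			break
		}
	}
}

Outbound traffic goes through `api.send(event:)` with a `ClientEvent`; that type lives in src/Models/ClientEvent.swift and is not shown in this section, so it is omitted from the sketch.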
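A second sketch, showing how the nested `Session.Tool` / `FunctionParameters` / `Property` types from src/Models/Session.swift compose into a JSON-Schema-style function definition, and how a tool is attached to a `Response.Config`. The tool name, description, and property names are made up for illustration; only the public initializers shown above are used.

let weatherTool = Session.Tool(
	name: "get_weather",
	description: "Look up the current weather for a city.",
	parameters: .init(
		type: .object,
		properties: [
			"city": .init(type: .string, description: "City name, e.g. \"Paris\""),
			"unit": .init(type: .string, `enum`: ["celsius", "fahrenheit"]),
		],
		required: ["city"]
	)
)

// The same tool list can be passed to a Session or to a single out-of-band response:
let config = Response.Config(
	instructions: "You are a helpful weather assistant.",
	tools: [weatherTool],
	toolChoice: .auto
)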