├── .github
│   ├── build.sh
│   └── workflows
│       ├── build.yml
│       └── dependencies.yml
├── .gitignore
├── .spi.yml
├── LICENSE
├── Package.swift
├── README.md
└── src
    ├── Connectors
    │   ├── WebRTCConnector.swift
    │   └── WebSocketConnector.swift
    ├── Conversation.swift
    ├── Extensions
    │   ├── AVAudioPCMBuffer+fromData.swift
    │   ├── Collection+safe.swift
    │   ├── Continuation+error.swift
    │   └── String+random.swift
    ├── Models
    │   ├── ClientEvent.swift
    │   ├── Item.swift
    │   ├── Response.swift
    │   ├── ServerError.swift
    │   ├── ServerEvent.swift
    │   └── Session.swift
    ├── OpenAIRealtime.swift
    ├── Protocols
    │   └── Connector.swift
    └── Support
        ├── UnsafeInteriorMutable.swift
        └── UnsafeMutableArray.swift
/.github/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd -P)" 6 | PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" 7 | 8 | PROJECT_BUILD_DIR="${PROJECT_BUILD_DIR:-"${PROJECT_ROOT}/build"}" 9 | XCODEBUILD_BUILD_DIR="$PROJECT_BUILD_DIR/xcodebuild" 10 | XCODEBUILD_DERIVED_DATA_PATH="$XCODEBUILD_BUILD_DIR/DerivedData" 11 | 12 | build_framework() { 13 | local sdk="$1" 14 | local destination="$2" 15 | local scheme="$3" 16 | 17 | local XCODEBUILD_ARCHIVE_PATH="./$scheme-$sdk.xcarchive" 18 | 19 | rm -rf "$XCODEBUILD_ARCHIVE_PATH" 20 | 21 | xcodebuild archive \ 22 | -scheme "$scheme" \ 23 | -archivePath "$XCODEBUILD_ARCHIVE_PATH" \ 24 | -derivedDataPath "$XCODEBUILD_DERIVED_DATA_PATH" \ 25 | -sdk "$sdk" \ 26 | -destination "$destination" \ 27 | BUILD_LIBRARY_FOR_DISTRIBUTION=YES \ 28 | INSTALL_PATH='Library/Frameworks' \ 29 | OTHER_SWIFT_FLAGS=-no-verify-emitted-module-interface \ 30 | LD_GENERATE_MAP_FILE=YES 31 | 32 | FRAMEWORK_MODULES_PATH="$XCODEBUILD_ARCHIVE_PATH/Products/Library/Frameworks/$scheme.framework/Modules" 33 | mkdir -p "$FRAMEWORK_MODULES_PATH" 34 | cp -r \ 35 | "$XCODEBUILD_DERIVED_DATA_PATH/Build/Intermediates.noindex/ArchiveIntermediates/$scheme/BuildProductsPath/Release-$sdk/$scheme.swiftmodule" \ 36 | "$FRAMEWORK_MODULES_PATH/$scheme.swiftmodule" 37 | # Delete private swiftinterface 38 | rm -f "$FRAMEWORK_MODULES_PATH/$scheme.swiftmodule/*.private.swiftinterface" 39 | mkdir -p "$scheme-$sdk.xcarchive/LinkMaps" 40 | find "$XCODEBUILD_DERIVED_DATA_PATH" -name "$scheme-LinkMap-*.txt" -exec cp {} "./$scheme-$sdk.xcarchive/LinkMaps/" \; 41 | } 42 | 43 | # Update the Package.swift to build the library as dynamic instead of static 44 | sed -i '' 's/type: \.static/type: .dynamic/g' Package.swift 45 | 46 | build_framework "iphoneos" "generic/platform=iOS" "OpenAIRealtime" 47 | build_framework "iphonesimulator" "generic/platform=iOS Simulator" "OpenAIRealtime" 48 | 49 | echo "Builds completed successfully."
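# A possible extension (sketch only, assuming the standard Xcode SDK and destination names):
# further platform slices could be built the same way before the xcframework is assembled below,
# by adding calls such as:
#   build_framework "macosx"     "generic/platform=macOS" "OpenAIRealtime"
#   build_framework "appletvos"  "generic/platform=tvOS"  "OpenAIRealtime"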
50 | 51 | rm -rf "OpenAIRealtime.xcframework" 52 | xcodebuild -create-xcframework -framework OpenAIRealtime-iphonesimulator.xcarchive/Products/Library/Frameworks/OpenAIRealtime.framework -framework OpenAIRealtime-iphoneos.xcarchive/Products/Library/Frameworks/OpenAIRealtime.framework -output OpenAIRealtime.xcframework 53 | 54 | cp -r OpenAIRealtime-iphoneos.xcarchive/dSYMs OpenAIRealtime.xcframework/ios-arm64 55 | cp -r OpenAIRealtime-iphonesimulator.xcarchive/dSYMs OpenAIRealtime.xcframework/ios-arm64_x86_64-simulator 56 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | push: 4 | branches: [main] 5 | 6 | jobs: 7 | build: 8 | name: Swift 6.0 9 | runs-on: macos-latest 10 | steps: 11 | - uses: actions/checkout@v4 12 | 13 | - uses: maxim-lobanov/setup-xcode@v1 14 | with: 15 | xcode-version: latest-stable 16 | 17 | - name: Build 18 | run: ./.github/build.sh 19 | 20 | - run: zip -r ./OpenAIRealtime.xcframework.zip ./OpenAIRealtime.xcframework 21 | 22 | - name: Upload artifact to Emerge 23 | uses: EmergeTools/emerge-upload-action@v1.1.0 24 | with: 25 | build_type: release 26 | artifact_path: ./OpenAIRealtime.xcframework.zip 27 | emerge_api_key: ${{ secrets.EMERGE_API_KEY }} 28 | -------------------------------------------------------------------------------- /.github/workflows/dependencies.yml: -------------------------------------------------------------------------------- 1 | name: Swift Dependency Submission 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | permissions: 8 | contents: write 9 | 10 | jobs: 11 | swift-action-detection: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout Repository 15 | uses: actions/checkout@v4 16 | 17 | - name: Install Swift 18 | uses: vapor/swiftly-action@v0.1 19 | with: 20 | toolchain: latest 21 | 22 | - name: Submit Dependencies 23 | uses: vapor-community/swift-dependency-submission@v0.1 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .netrc 2 | /.build 3 | .DS_Store 4 | /Packages 5 | *.xcarchive 6 | xcuserdata/ 7 | DerivedData/ 8 | *.xcframework 9 | Package.resolved 10 | .swiftpm/configuration/registries.json 11 | .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata 12 | -------------------------------------------------------------------------------- /.spi.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | builder: 3 | configs: 4 | - documentation_targets: [OpenAIRealtime] 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Miguel Piedrafita 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 6.0 2 | 3 | import PackageDescription 4 | 5 | let package = Package( 6 | name: "OpenAIRealtime", 7 | platforms: [ 8 | .iOS(.v17), 9 | .tvOS(.v17), 10 | .macOS(.v14), 11 | .watchOS(.v10), 12 | .visionOS(.v1), 13 | .macCatalyst(.v17), 14 | ], 15 | products: [ 16 | .library(name: "OpenAIRealtime", type: .static, targets: ["OpenAIRealtime"]), 17 | ], 18 | dependencies: [ 19 | .package(url: "https://github.com/stasel/WebRTC.git", branch: "latest"), 20 | ], 21 | targets: [ 22 | .target(name: "OpenAIRealtime", dependencies: ["WebRTC"], path: "./src"), 23 | ] 24 | ) 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A modern Swift SDK for OpenAI's Realtime API 2 | 3 | [![Install Size](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fwww.emergetools.com%2Fapi%2Fv2%2Fpublic_new_build%3FexampleId%3Dswift-realtime-openai.OpenAIRealtime%26platform%3Dios%26badgeOption%3Dmax_install_size_only%26buildType%3Drelease&query=$.badgeMetadata&label=OpenAI&logo=apple)](https://www.emergetools.com/app/example/ios/swift-realtime-openai.OpenAIRealtime/release) 4 | [![Swift Version](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fm1guelpf%2Fswift-realtime-openai%2Fbadge%3Ftype%3Dswift-versions&color=brightgreen)](https://swiftpackageindex.com/m1guelpf/swift-realtime-openai) 5 | [![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://raw.githubusercontent.com/m1guelpf/swift-realtime-openai/main/LICENSE) 6 | 7 | This library provides a simple interface for implementing multi-modal conversations using OpenAI's new Realtime API. 8 | 9 | It can automatically handle recording the user's microphone and playing back the assistant's response, and also gives you a transparent layer over the API for advanced use cases. 10 | 11 | ## Installation 12 | 13 | ### Swift Package Manager 14 | 15 | The Swift Package Manager allows developers to easily integrate packages into their Xcode projects and packages, and is fully integrated into the Swift compiler. 16 | 17 | ### SPM Through Xcode Project 18 | 19 | - File > Swift Packages > Add Package Dependency 20 | - Add https://github.com/m1guelpf/swift-realtime-openai.git 21 | - Select "Branch" and enter "main" 22 | 23 | ### SPM Through Xcode Package 24 | 25 | Once you have your Swift package set up, add the Git link within the dependencies value of your Package.swift file.
26 | 27 | ```swift 28 | dependencies: [ 29 | .package(url: "https://github.com/m1guelpf/swift-realtime-openai.git", .branch("main")) 30 | ] 31 | ``` 32 | 33 | ## Getting started 🚀 34 | 35 | You can build an iMessage-like app with built-in AI chat in less than 60 lines of code (UI included!): 36 | 37 | ```swift 38 | import SwiftUI 39 | import OpenAIRealtime 40 | 41 | struct ContentView: View { 42 | @State private var newMessage: String = "" 43 | @State private var conversation = Conversation(authToken: OPENAI_KEY) 44 | 45 | var messages: [Item.Message] { 46 | conversation.entries.compactMap { switch $0 { 47 | case let .message(message): return message 48 | default: return nil 49 | } } 50 | } 51 | 52 | var body: some View { 53 | VStack(spacing: 0) { 54 | ScrollView { 55 | VStack(spacing: 12) { 56 | ForEach(messages, id: \.id) { message in 57 | MessageBubble(message: message) 58 | } 59 | } 60 | .padding() 61 | } 62 | 63 | HStack(spacing: 12) { 64 | HStack { 65 | TextField("Chat", text: $newMessage, onCommit: { sendMessage() }) 66 | .frame(height: 40) 67 | .submitLabel(.send) 68 | 69 | if newMessage != "" { 70 | Button(action: sendMessage) { 71 | Image(systemName: "arrow.up.circle.fill") 72 | .resizable() 73 | .aspectRatio(contentMode: .fill) 74 | .frame(width: 28, height: 28) 75 | .foregroundStyle(.white, .blue) 76 | } 77 | } 78 | } 79 | .padding(.leading) 80 | .padding(.trailing, 6) 81 | .overlay(RoundedRectangle(cornerRadius: 20).stroke(.quaternary, lineWidth: 1)) 82 | } 83 | .padding() 84 | } 85 | .navigationTitle("Chat") 86 | .navigationBarTitleDisplayMode(.inline) 87 | .onAppear { try! conversation.startHandlingVoice() } 88 | } 89 | 90 | func sendMessage() { 91 | guard newMessage != "" else { return } 92 | 93 | Task { 94 | try await conversation.send(from: .user, text: newMessage) 95 | newMessage = "" 96 | } 97 | } 98 | } 99 | ``` 100 | 101 | Or, if you just want a simple app that lets the user talk and the AI respond: 102 | 103 | ```swift 104 | import SwiftUI 105 | import OpenAIRealtime 106 | 107 | struct ContentView: View { 108 | @State private var conversation = Conversation(authToken: OPENAI_KEY) 109 | 110 | var body: some View { 111 | Text("Say something!") 112 | .onAppear { try! conversation.startListening() } 113 | } 114 | } 115 | ``` 116 | 117 | ## Features 118 | 119 | - [x] A simple interface for directly interacting with the API 120 | - [x] Wrap the API in an interface that manages the conversation for you 121 | - [x] Optionally handle recording the user's mic and sending it to the API 122 | - [x] Optionally handle playing model responses as they stream in 123 | - [x] Allow interrupting the model 124 | - [ ] WebRTC support 125 | 126 | ## Architecture 127 | 128 | ### `Conversation` 129 | 130 | The `Conversation` class provides a high-level interface for managing a conversation with the model. It wraps the `RealtimeAPI` class and handles the details of sending and receiving messages, as well as managing the conversation history. It can optionally also handle recording the user's mic and sending it to the API, as well as playing model responses as they stream in. 131 | 132 | #### Reading messages 133 | 134 | You can access the messages in the conversation through the `messages` property. Note that this won't include function calls and its responses, only the messages between the user and the model. To access the full conversation history, use the `entries` property. 
For example: 135 | 136 | ```swift 137 | ScrollView { 138 | ScrollViewReader { scrollView in 139 | VStack(spacing: 12) { 140 | ForEach(conversation.messages, id: \.id) { message in 141 | MessageBubble(message: message).id(message.id) 142 | } 143 | } 144 | .onReceive(conversation.messages.publisher) { _ in 145 | withAnimation { scrollView.scrollTo(conversation.messages.last?.id, anchor: .center) } 146 | } 147 | } 148 | } 149 | ``` 150 | 151 | #### Customizing the session 152 | 153 | You can customize the current session using the `setSession(_: Session)` or `updateSession(withChanges: (inout Session) -> Void)` methods. Note that they require that a session has already been established, so it's recommended you call them from a `whenConnected(_: @Sendable () async throws -> Void)` callback or await `waitForConnection()` first. For example: 154 | 155 | ```swift 156 | try await conversation.whenConnected { 157 | try await conversation.updateSession { session in 158 | // update system prompt 159 | session.instructions = "You are a helpful assistant." 160 | 161 | // enable transcription of users' voice messages 162 | session.inputAudioTranscription = Session.InputAudioTranscription() 163 | 164 | // ... 165 | } 166 | } 167 | ``` 168 | 169 | #### Handling voice conversations 170 | 171 | The `Conversation` class can automatically handle 2-way voice conversations. Calling `startListening()` will start listening to the user's voice and sending it to the model, and playing back the model's responses. Calling `stopListening()` will stop listening, but continue playing back responses. 172 | 173 | If you just want to play model responses, call `startHandlingVoice()`. To stop both listening and playing back responses, call `stopHandlingVoice()`. 174 | 175 | #### Manually sending messages 176 | 177 | To send a text message, call the `send(from: Item.ItemRole, text: String, response: Response.Config? = nil)` method, providing the role of the sender (`.user`, `.assistant`, or `.system`) and the contents of the message. You can optionally also provide a `Response.Config` object to customize the response, such as enabling or disabling function calls. 178 | 179 | To manually send an audio message (or part of one), call the `send(audioDelta: Data, commit: Bool = false)` method with a valid audio chunk. If `commit` is `true`, the model will consider the message finished and begin responding to it. Otherwise, it might wait for more audio depending on your `Session.turnDetection` settings. 180 | 181 | #### Manually sending events 182 | 183 | To manually send an event to the API, use the `send(event: RealtimeAPI.ClientEvent)` method. Note that this bypasses some of the logic in the `Conversation` class such as handling interrupts, so you should prefer to use other methods whenever possible. 184 | 185 | ### `RealtimeAPI` 186 | 187 | To interact with the API directly, create a new instance of `RealtimeAPI`, providing one of the available connectors.
There are helper methods that let you create an instance from an apiKey or a `URLRequest`, like so: 188 | 189 | ```swift 190 | let api = RealtimeAPI.webSocket(authToken: YOUR_OPENAI_API_KEY, model: String = "gpt-4o-realtime-preview") // or RealtimeAPI.webSocket(connectingTo: URLRequest) 191 | let api = RealtimeAPI.webRTC(authToken: YOUR_OPENAI_API_KEY, model: String = "gpt-4o-realtime-preview") // or RealtimeAPI.webRTC(connectingTo: URLRequest) 192 | ``` 193 | 194 | You can listen for new events through the `events` property, like so: 195 | 196 | ```swift 197 | for try await event in api.events { 198 | switch event { 199 | case let .sessionCreated(event): 200 | print(event.session.id) 201 | } 202 | } 203 | ``` 204 | 205 | To send an event to the API, call the `send` method with a `ClientEvent` instance: 206 | 207 | ```swift 208 | try await api.send(event: .updateSession(session)) 209 | try await api.send(event: .appendInputAudioBuffer(encoding: audioData)) 210 | try await api.send(event: .createResponse()) 211 | ``` 212 | 213 | ## License 214 | 215 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 216 | -------------------------------------------------------------------------------- /src/Connectors/WebRTCConnector.swift: -------------------------------------------------------------------------------- 1 | @preconcurrency import WebRTC 2 | import Foundation 3 | #if canImport(FoundationNetworking) 4 | import FoundationNetworking 5 | #endif 6 | 7 | public final class WebRTCConnector: NSObject, Connector, Sendable { 8 | enum WebRTCError: Error { 9 | case failedToCreateDataChannel 10 | case failedToCreatePeerConnection 11 | case badServerResponse 12 | } 13 | 14 | @MainActor public private(set) var onDisconnect: (@Sendable () -> Void)? 
= nil 15 | public let events: AsyncThrowingStream 16 | 17 | private let connection: RTCPeerConnection 18 | private let dataChannel: RTCDataChannel 19 | 20 | private let stream: AsyncThrowingStream.Continuation 21 | 22 | private static let factory: RTCPeerConnectionFactory = { 23 | RTCInitializeSSL() 24 | 25 | return RTCPeerConnectionFactory() 26 | }() 27 | 28 | private let encoder: JSONEncoder = { 29 | let encoder = JSONEncoder() 30 | encoder.keyEncodingStrategy = .convertToSnakeCase 31 | return encoder 32 | }() 33 | 34 | private let decoder: JSONDecoder = { 35 | let decoder = JSONDecoder() 36 | decoder.keyDecodingStrategy = .convertFromSnakeCase 37 | return decoder 38 | }() 39 | 40 | public required init(connectingTo request: URLRequest) async throws { 41 | guard let connection = WebRTCConnector.factory.peerConnection(with: .init(), constraints: .init(mandatoryConstraints: nil, optionalConstraints: nil), delegate: nil) else { 42 | throw WebRTCError.failedToCreatePeerConnection 43 | } 44 | self.connection = connection 45 | 46 | let audioTrackSource = WebRTCConnector.factory.audioSource(with: nil) 47 | let audioTrack = WebRTCConnector.factory.audioTrack(with: audioTrackSource, trackId: "audio0") 48 | let mediaStream = WebRTCConnector.factory.mediaStream(withStreamId: "stream0") 49 | mediaStream.addAudioTrack(audioTrack) 50 | self.connection.add(audioTrack, streamIds: ["stream0"]) 51 | 52 | guard let dataChannel = self.connection.dataChannel(forLabel: "oai-events", configuration: RTCDataChannelConfiguration()) else { 53 | throw WebRTCError.failedToCreateDataChannel 54 | } 55 | self.dataChannel = dataChannel 56 | 57 | (events, stream) = AsyncThrowingStream.makeStream(of: ServerEvent.self) 58 | 59 | super.init() 60 | 61 | connection.delegate = self 62 | dataChannel.delegate = self 63 | 64 | var request = request 65 | 66 | let offer = try await self.connection.offer(for: RTCMediaConstraints(mandatoryConstraints: nil, optionalConstraints: [ 67 | "OfferToReceiveAudio": "true", 68 | "googEchoCancellation": "true", 69 | "googAutoGainControl": "true", 70 | "googNoiseSuppression": "true", 71 | "googHighpassFilter": "true", 72 | ])) 73 | try await self.connection.setLocalDescription(offer) 74 | 75 | request.httpBody = offer.sdp.data(using: .utf8) 76 | 77 | let (data, res) = try await URLSession.shared.data(for: request) 78 | guard let res = res as? HTTPURLResponse, res.statusCode == 201, let sdp = String(data: data, encoding: .utf8) else { 79 | throw WebRTCError.badServerResponse 80 | } 81 | 82 | try await self.connection.setRemoteDescription(RTCSessionDescription(type: .answer, sdp: sdp)) 83 | } 84 | 85 | deinit { 86 | connection.close() 87 | stream.finish() 88 | onDisconnect?() 89 | } 90 | 91 | public func send(event: ClientEvent) async throws { 92 | try dataChannel.sendData(RTCDataBuffer(data: encoder.encode(event), isBinary: false)) 93 | } 94 | 95 | @MainActor public func onDisconnect(_ action: (@Sendable () -> Void)?) 
{ 96 | onDisconnect = action 97 | } 98 | } 99 | 100 | extension WebRTCConnector: RTCPeerConnectionDelegate { 101 | public func peerConnection(_: RTCPeerConnection, didChange _: RTCSignalingState) { 102 | print("Connection state changed to \(connection.signalingState)") 103 | } 104 | 105 | public func peerConnection(_: RTCPeerConnection, didAdd _: RTCMediaStream) { 106 | print("Media stream added.") 107 | } 108 | 109 | public func peerConnection(_: RTCPeerConnection, didRemove _: RTCMediaStream) { 110 | print("Media stream removed.") 111 | } 112 | 113 | public func peerConnectionShouldNegotiate(_: RTCPeerConnection) { 114 | print("Negotiating connection.") 115 | } 116 | 117 | public func peerConnection(_: RTCPeerConnection, didChange _: RTCIceConnectionState) { 118 | print("ICE connection state changed to \(connection.iceConnectionState)") 119 | } 120 | 121 | public func peerConnection(_: RTCPeerConnection, didChange _: RTCIceGatheringState) { 122 | print("ICE gathering state changed to \(connection.iceGatheringState)") 123 | } 124 | 125 | public func peerConnection(_: RTCPeerConnection, didGenerate _: RTCIceCandidate) { 126 | print("ICE candidate generated.") 127 | } 128 | 129 | public func peerConnection(_: RTCPeerConnection, didRemove _: [RTCIceCandidate]) { 130 | print("ICE candidate removed.") 131 | } 132 | 133 | public func peerConnection(_: RTCPeerConnection, didOpen _: RTCDataChannel) { 134 | print("Data channel opened.") 135 | } 136 | } 137 | 138 | extension WebRTCConnector: RTCDataChannelDelegate { 139 | public func dataChannel(_: RTCDataChannel, didReceiveMessageWith buffer: RTCDataBuffer) { 140 | stream.yield(with: Result { try self.decoder.decode(ServerEvent.self, from: buffer.data) }) 141 | } 142 | 143 | public func dataChannelDidChangeState(_ dataChannel: RTCDataChannel) { 144 | print("Data channel changed to \(dataChannel.readyState)") 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /src/Connectors/WebSocketConnector.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | #if canImport(FoundationNetworking) 3 | import FoundationNetworking 4 | #endif 5 | 6 | public final class WebSocketConnector: Connector, Sendable { 7 | @MainActor public private(set) var onDisconnect: (@Sendable () -> Void)? 
= nil 8 | public let events: AsyncThrowingStream<ServerEvent, Error> 9 | 10 | private let task: Task<Void, Never> 11 | private let webSocket: URLSessionWebSocketTask 12 | private let stream: AsyncThrowingStream<ServerEvent, Error>.Continuation 13 | 14 | private let encoder: JSONEncoder = { 15 | let encoder = JSONEncoder() 16 | encoder.keyEncodingStrategy = .convertToSnakeCase 17 | return encoder 18 | }() 19 | 20 | public init(connectingTo request: URLRequest) { 21 | let (events, stream) = AsyncThrowingStream.makeStream(of: ServerEvent.self) 22 | 23 | let webSocket = URLSession.shared.webSocketTask(with: request) 24 | webSocket.resume() 25 | 26 | task = Task.detached { [webSocket, stream] in 27 | var isActive = true 28 | 29 | let decoder = JSONDecoder() 30 | decoder.keyDecodingStrategy = .convertFromSnakeCase 31 | 32 | while isActive, webSocket.closeCode == .invalid, !Task.isCancelled { 33 | guard webSocket.closeCode == .invalid else { 34 | stream.finish() 35 | isActive = false 36 | break 37 | } 38 | 39 | do { 40 | let message = try await webSocket.receive() 41 | 42 | guard case let .string(text) = message, let data = text.data(using: .utf8) else { 43 | stream.yield(error: RealtimeAPIError.invalidMessage) 44 | continue 45 | } 46 | 47 | try stream.yield(decoder.decode(ServerEvent.self, from: data)) 48 | } catch { 49 | stream.yield(error: error) 50 | isActive = false 51 | } 52 | } 53 | 54 | webSocket.cancel(with: .goingAway, reason: nil) 55 | } 56 | 57 | self.events = events 58 | self.stream = stream 59 | self.webSocket = webSocket 60 | } 61 | 62 | deinit { 63 | webSocket.cancel(with: .goingAway, reason: nil) 64 | task.cancel() 65 | stream.finish() 66 | onDisconnect?() 67 | } 68 | 69 | public func send(event: ClientEvent) async throws { 70 | let message = try URLSessionWebSocketTask.Message.string(String(data: encoder.encode(event), encoding: .utf8)!) 71 | try await webSocket.send(message) 72 | } 73 | 74 | @MainActor public func onDisconnect(_ action: (@Sendable () -> Void)?) { 75 | onDisconnect = action 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/Conversation.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | @preconcurrency import AVFoundation 3 | 4 | public enum ConversationError: Error { 5 | case sessionNotFound 6 | case converterInitializationFailed 7 | } 8 | 9 | @Observable 10 | public final class Conversation: @unchecked Sendable { 11 | private let client: RealtimeAPI 12 | @MainActor private var isInterrupting: Bool = false 13 | private let errorStream: AsyncStream<ServerError>.Continuation 14 | 15 | private var task: Task<Void, Error>! 16 | private let audioEngine = AVAudioEngine() 17 | private let playerNode = AVAudioPlayerNode() 18 | private let queuedSamples = UnsafeMutableArray<String>() 19 | private let apiConverter = UnsafeInteriorMutable<AVAudioConverter>() 20 | private let userConverter = UnsafeInteriorMutable<AVAudioConverter>() 21 | private let desiredFormat = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 24000, channels: 1, interleaved: false)! 22 | 23 | /// A stream of errors that occur during the conversation. 24 | public let errors: AsyncStream<ServerError> 25 | 26 | /// The unique ID of the conversation. 27 | @MainActor public private(set) var id: String? 28 | 29 | /// The current session for this conversation. 30 | @MainActor public private(set) var session: Session? 31 | 32 | /// A list of items in the conversation. 33 | @MainActor public private(set) var entries: [Item] = [] 34 | 35 | /// Whether the conversation is currently connected to the server.
36 | @MainActor public private(set) var connected: Bool = false 37 | 38 | /// Whether the conversation is currently listening to the user's microphone. 39 | @MainActor public private(set) var isListening: Bool = false 40 | 41 | /// Whether this conversation is currently handling voice input and output. 42 | @MainActor public private(set) var handlingVoice: Bool = false 43 | 44 | /// Whether the user is currently speaking. 45 | /// This only works when using the server's voice detection. 46 | @MainActor public private(set) var isUserSpeaking: Bool = false 47 | 48 | /// Whether the model is currently speaking. 49 | @MainActor public private(set) var isPlaying: Bool = false 50 | 51 | /// A list of messages in the conversation. 52 | /// Note that this doesn't include function call events. To get a complete list, use `entries`. 53 | @MainActor public var messages: [Item.Message] { 54 | entries.compactMap { switch $0 { 55 | case let .message(message): return message 56 | default: return nil 57 | } } 58 | } 59 | 60 | private init(client: RealtimeAPI) { 61 | self.client = client 62 | (errors, errorStream) = AsyncStream.makeStream(of: ServerError.self) 63 | 64 | let events = client.events 65 | task = Task.detached { [weak self] in 66 | for try await event in events { 67 | guard !Task.isCancelled else { break } 68 | 69 | await self?.handleEvent(event) 70 | } 71 | 72 | await MainActor.run { [weak self] in 73 | self?.connected = false 74 | } 75 | } 76 | 77 | Task { @MainActor in 78 | client.onDisconnect = { [weak self] in 79 | guard let self else { return } 80 | 81 | Task { @MainActor in 82 | self.connected = false 83 | } 84 | } 85 | 86 | _keepIsPlayingPropertyUpdated() 87 | } 88 | } 89 | 90 | deinit { 91 | task.cancel() 92 | errorStream.finish() 93 | 94 | Task { [playerNode, audioEngine] in 95 | Self.cleanUpAudio(playerNode: playerNode, audioEngine: audioEngine) 96 | } 97 | } 98 | 99 | /// Create a new conversation providing an API token and, optionally, a model. 100 | public convenience init(authToken token: String, model: String = "gpt-4o-realtime-preview") { 101 | self.init(client: RealtimeAPI.webSocket(authToken: token, model: model)) 102 | } 103 | 104 | /// Create a new conversation that connects using a custom `URLRequest`. 105 | public convenience init(connectingTo request: URLRequest) { 106 | self.init(client: RealtimeAPI.webSocket(connectingTo: request)) 107 | } 108 | 109 | /// Wait for the connection to be established 110 | @MainActor public func waitForConnection() async { 111 | while true { 112 | if connected { 113 | return 114 | } 115 | 116 | try? await Task.sleep(for: .milliseconds(500)) 117 | } 118 | } 119 | 120 | /// Execute a block of code when the connection is established 121 | @MainActor public func whenConnected(_ callback: @Sendable () async throws(E) -> Void) async throws(E) { 122 | await waitForConnection() 123 | try await callback() 124 | } 125 | 126 | /// Make changes to the current session 127 | /// Note that this will fail if the session hasn't started yet. Use `whenConnected` to ensure the session is ready. 
128 | public func updateSession(withChanges callback: (inout Session) -> Void) async throws { 129 | guard var session = await session else { 130 | throw ConversationError.sessionNotFound 131 | } 132 | 133 | callback(&session) 134 | 135 | try await setSession(session) 136 | } 137 | 138 | /// Set the configuration of the current session 139 | public func setSession(_ session: Session) async throws { 140 | // update endpoint errors if we include the session id 141 | var session = session 142 | session.id = nil 143 | 144 | try await client.send(event: .updateSession(session)) 145 | } 146 | 147 | /// Send a client event to the server. 148 | /// > Warning: This function is intended for advanced use cases. Use the other functions to send messages and audio data. 149 | public func send(event: ClientEvent) async throws { 150 | try await client.send(event: event) 151 | } 152 | 153 | /// Manually append audio bytes to the conversation. 154 | /// Commit the audio to trigger a model response when server turn detection is disabled. 155 | /// > Note: The `Conversation` class can automatically handle listening to the user's mic and playing back model responses. 156 | /// > To get started, call the `startListening` function. 157 | public func send(audioDelta audio: Data, commit: Bool = false) async throws { 158 | try await send(event: .appendInputAudioBuffer(encoding: audio)) 159 | if commit { try await send(event: .commitInputAudioBuffer()) } 160 | } 161 | 162 | /// Send a text message and wait for a response. 163 | /// Optionally, you can provide a response configuration to customize the model's behavior. 164 | /// > Note: Calling this function will automatically call `interruptSpeech` if the model is currently speaking. 165 | public func send(from role: Item.ItemRole, text: String, response: Response.Config? = nil) async throws { 166 | if await handlingVoice { await interruptSpeech() } 167 | 168 | try await send(event: .createConversationItem(Item(message: Item.Message(id: String(randomLength: 32), from: role, content: [.input_text(text)])))) 169 | try await send(event: .createResponse(response)) 170 | } 171 | 172 | /// Send the response of a function call. 173 | public func send(result output: Item.FunctionCallOutput) async throws { 174 | try await send(event: .createConversationItem(Item(with: output))) 175 | } 176 | } 177 | 178 | /// Listening/Speaking public API 179 | public extension Conversation { 180 | /// Start listening to the user's microphone and sending audio data to the model. 181 | /// This will automatically call `startHandlingVoice` if it hasn't been called yet. 182 | /// > Warning: Make sure to handle the case where the user denies microphone access. 183 | @MainActor func startListening() throws { 184 | guard !isListening else { return } 185 | if !handlingVoice { try startHandlingVoice() } 186 | 187 | Task.detached { [audioEngine] in 188 | audioEngine.inputNode.installTap(onBus: 0, bufferSize: 4096, format: audioEngine.inputNode.outputFormat(forBus: 0)) { [weak self] buffer, _ in 189 | self?.processAudioBufferFromUser(buffer: buffer) 190 | } 191 | } 192 | 193 | isListening = true 194 | } 195 | 196 | /// Stop listening to the user's microphone. 197 | /// This won't stop playing back model responses. To fully stop handling voice conversations, call `stopHandlingVoice`. 
198 | @MainActor func stopListening() { 199 | guard isListening else { return } 200 | 201 | audioEngine.inputNode.removeTap(onBus: 0) 202 | isListening = false 203 | } 204 | 205 | /// Handle the playback of audio responses from the model. 206 | @MainActor func startHandlingVoice() throws { 207 | guard !handlingVoice else { return } 208 | 209 | guard let converter = AVAudioConverter(from: audioEngine.inputNode.outputFormat(forBus: 0), to: desiredFormat) else { 210 | throw ConversationError.converterInitializationFailed 211 | } 212 | userConverter.set(converter) 213 | 214 | audioEngine.attach(playerNode) 215 | audioEngine.connect(playerNode, to: audioEngine.mainMixerNode, format: converter.inputFormat) 216 | 217 | #if os(iOS) 218 | try audioEngine.inputNode.setVoiceProcessingEnabled(true) 219 | #endif 220 | 221 | audioEngine.prepare() 222 | do { 223 | try audioEngine.start() 224 | 225 | #if os(iOS) 226 | let audioSession = AVAudioSession.sharedInstance() 227 | try audioSession.setCategory(.playAndRecord, mode: .voiceChat, options: [.defaultToSpeaker, .allowBluetooth]) 228 | try audioSession.setActive(true) 229 | #endif 230 | 231 | handlingVoice = true 232 | } catch { 233 | print("Failed to enable audio engine: \(error)") 234 | 235 | audioEngine.disconnectNodeInput(playerNode) 236 | audioEngine.disconnectNodeOutput(playerNode) 237 | 238 | throw error 239 | } 240 | } 241 | 242 | /// Interrupt the model's response if it's currently playing. 243 | /// This lets the model know that the user didn't hear the full response. 244 | @MainActor func interruptSpeech() { 245 | guard !isInterrupting else { return } 246 | isInterrupting = true 247 | 248 | if isPlaying, 249 | let nodeTime = playerNode.lastRenderTime, 250 | let playerTime = playerNode.playerTime(forNodeTime: nodeTime), 251 | let itemID = queuedSamples.first 252 | { 253 | let audioTimeInMiliseconds = Int((Double(playerTime.sampleTime) / playerTime.sampleRate) * 1000) 254 | 255 | Task { [client] in 256 | do { 257 | try await client.send(event: .truncateConversationItem(forItem: itemID, atAudioMs: audioTimeInMiliseconds)) 258 | } catch { 259 | print("Failed to send automatic truncation event: \(error)") 260 | } 261 | } 262 | } 263 | 264 | playerNode.stop() 265 | queuedSamples.clear() 266 | isInterrupting = false 267 | } 268 | 269 | @MainActor func stopHandlingVoice() { 270 | guard handlingVoice else { return } 271 | 272 | Self.cleanUpAudio(playerNode: playerNode, audioEngine: audioEngine) 273 | 274 | isListening = false 275 | handlingVoice = false 276 | } 277 | 278 | /// Stop playing audio responses from the model and listening to the user's microphone. 279 | static func cleanUpAudio(playerNode: AVAudioPlayerNode, audioEngine: AVAudioEngine) { 280 | // If attachedNodes does not contain the playerNode then `startHandlingVoice` was never called 281 | guard audioEngine.attachedNodes.contains(playerNode) else { return } 282 | 283 | audioEngine.inputNode.removeTap(onBus: 0) 284 | audioEngine.stop() 285 | audioEngine.disconnectNodeInput(playerNode) 286 | audioEngine.disconnectNodeOutput(playerNode) 287 | 288 | #if os(iOS) 289 | try? 
AVAudioSession.sharedInstance().setActive(false) 290 | #elseif os(macOS) 291 | if audioEngine.isRunning { 292 | audioEngine.stop() 293 | audioEngine.reset() 294 | } 295 | #endif 296 | } 297 | } 298 | 299 | /// Event handling private API 300 | private extension Conversation { 301 | @MainActor func handleEvent(_ event: ServerEvent) { 302 | switch event { 303 | case let .error(event): 304 | errorStream.yield(event.error) 305 | case let .sessionCreated(event): 306 | connected = true 307 | session = event.session 308 | case let .sessionUpdated(event): 309 | session = event.session 310 | case let .conversationCreated(event): 311 | id = event.conversation.id 312 | case let .conversationItemCreated(event): 313 | entries.append(event.item) 314 | case let .conversationItemDeleted(event): 315 | entries.removeAll { $0.id == event.itemId } 316 | case let .conversationItemInputAudioTranscriptionCompleted(event): 317 | updateEvent(id: event.itemId) { message in 318 | guard case let .input_audio(audio) = message.content[event.contentIndex] else { return } 319 | 320 | message.content[event.contentIndex] = .input_audio(.init(audio: audio.audio, transcript: event.transcript)) 321 | } 322 | case let .conversationItemInputAudioTranscriptionFailed(event): 323 | errorStream.yield(event.error) 324 | case let .responseContentPartAdded(event): 325 | updateEvent(id: event.itemId) { message in 326 | message.content.insert(.init(from: event.part), at: event.contentIndex) 327 | } 328 | case let .responseContentPartDone(event): 329 | updateEvent(id: event.itemId) { message in 330 | message.content[event.contentIndex] = .init(from: event.part) 331 | } 332 | case let .responseTextDelta(event): 333 | updateEvent(id: event.itemId) { message in 334 | guard case let .text(text) = message.content[event.contentIndex] else { return } 335 | 336 | message.content[event.contentIndex] = .text(text + event.delta) 337 | } 338 | case let .responseTextDone(event): 339 | updateEvent(id: event.itemId) { message in 340 | message.content[event.contentIndex] = .text(event.text) 341 | } 342 | case let .responseAudioTranscriptDelta(event): 343 | updateEvent(id: event.itemId) { message in 344 | guard case let .audio(audio) = message.content[event.contentIndex] else { return } 345 | 346 | message.content[event.contentIndex] = .audio(.init(audio: audio.audio, transcript: (audio.transcript ?? 
"") + event.delta)) 347 | } 348 | case let .responseAudioTranscriptDone(event): 349 | updateEvent(id: event.itemId) { message in 350 | guard case let .audio(audio) = message.content[event.contentIndex] else { return } 351 | 352 | message.content[event.contentIndex] = .audio(.init(audio: audio.audio, transcript: event.transcript)) 353 | } 354 | case let .responseAudioDelta(event): 355 | updateEvent(id: event.itemId) { message in 356 | guard case let .audio(audio) = message.content[event.contentIndex] else { return } 357 | 358 | if handlingVoice { queueAudioSample(event) } 359 | message.content[event.contentIndex] = .audio(.init(audio: audio.audio + event.delta, transcript: audio.transcript)) 360 | } 361 | case let .responseFunctionCallArgumentsDelta(event): 362 | updateEvent(id: event.itemId) { functionCall in 363 | functionCall.arguments.append(event.delta) 364 | } 365 | case let .responseFunctionCallArgumentsDone(event): 366 | updateEvent(id: event.itemId) { functionCall in 367 | functionCall.arguments = event.arguments 368 | } 369 | case .inputAudioBufferSpeechStarted: 370 | isUserSpeaking = true 371 | if handlingVoice { interruptSpeech() } 372 | case .inputAudioBufferSpeechStopped: 373 | isUserSpeaking = false 374 | case let .responseOutputItemDone(event): 375 | updateEvent(id: event.item.id) { message in 376 | guard case let .message(newMessage) = event.item else { return } 377 | 378 | message = newMessage 379 | } 380 | default: 381 | return 382 | } 383 | } 384 | 385 | @MainActor 386 | func updateEvent(id: String, modifying closure: (inout Item.Message) -> Void) { 387 | guard let index = entries.firstIndex(where: { $0.id == id }), case var .message(message) = entries[index] else { 388 | return 389 | } 390 | 391 | closure(&message) 392 | 393 | entries[index] = .message(message) 394 | } 395 | 396 | @MainActor 397 | func updateEvent(id: String, modifying closure: (inout Item.FunctionCall) -> Void) { 398 | guard let index = entries.firstIndex(where: { $0.id == id }), case var .functionCall(functionCall) = entries[index] else { 399 | return 400 | } 401 | 402 | closure(&functionCall) 403 | 404 | entries[index] = .functionCall(functionCall) 405 | } 406 | } 407 | 408 | /// Audio processing private API 409 | private extension Conversation { 410 | private func queueAudioSample(_ event: ServerEvent.ResponseAudioDeltaEvent) { 411 | guard let buffer = AVAudioPCMBuffer.fromData(event.delta, format: desiredFormat) else { 412 | print("Failed to create audio buffer.") 413 | return 414 | } 415 | 416 | guard let converter = apiConverter.lazy({ AVAudioConverter(from: buffer.format, to: playerNode.outputFormat(forBus: 0)) }) else { 417 | print("Failed to create audio converter.") 418 | return 419 | } 420 | 421 | let outputFrameCapacity = AVAudioFrameCount(ceil(converter.outputFormat.sampleRate / buffer.format.sampleRate) * Double(buffer.frameLength)) 422 | 423 | guard let sample = convertBuffer(buffer: buffer, using: converter, capacity: outputFrameCapacity) else { 424 | print("Failed to convert buffer.") 425 | return 426 | } 427 | 428 | queuedSamples.push(event.itemId) 429 | 430 | playerNode.scheduleBuffer(sample, at: nil, completionCallbackType: .dataPlayedBack) { [weak self] _ in 431 | guard let self else { return } 432 | 433 | self.queuedSamples.popFirst() 434 | if self.queuedSamples.isEmpty { 435 | Task { @MainActor in 436 | playerNode.pause() 437 | } 438 | } 439 | } 440 | 441 | playerNode.play() 442 | } 443 | 444 | private func processAudioBufferFromUser(buffer: AVAudioPCMBuffer) { 445 | let ratio = 
desiredFormat.sampleRate / buffer.format.sampleRate 446 | 447 | guard let convertedBuffer = convertBuffer(buffer: buffer, using: userConverter.get()!, capacity: AVAudioFrameCount(Double(buffer.frameLength) * ratio)) else { 448 | print("Buffer conversion failed.") 449 | return 450 | } 451 | 452 | guard let sampleBytes = convertedBuffer.audioBufferList.pointee.mBuffers.mData else { return } 453 | let audioData = Data(bytes: sampleBytes, count: Int(convertedBuffer.audioBufferList.pointee.mBuffers.mDataByteSize)) 454 | 455 | Task { 456 | try await send(audioDelta: audioData) 457 | } 458 | } 459 | 460 | private func convertBuffer(buffer: AVAudioPCMBuffer, using converter: AVAudioConverter, capacity: AVAudioFrameCount) -> AVAudioPCMBuffer? { 461 | if buffer.format == converter.outputFormat { 462 | return buffer 463 | } 464 | 465 | guard let convertedBuffer = AVAudioPCMBuffer(pcmFormat: converter.outputFormat, frameCapacity: capacity) else { 466 | print("Failed to create converted audio buffer.") 467 | return nil 468 | } 469 | 470 | var error: NSError? 471 | var allSamplesReceived = false 472 | 473 | let status = converter.convert(to: convertedBuffer, error: &error) { _, outStatus in 474 | if allSamplesReceived { 475 | outStatus.pointee = .noDataNow 476 | return nil 477 | } 478 | 479 | allSamplesReceived = true 480 | outStatus.pointee = .haveData 481 | return buffer 482 | } 483 | 484 | if status == .error { 485 | if let error = error { 486 | print("Error during conversion: \(error.localizedDescription)") 487 | } 488 | return nil 489 | } 490 | 491 | return convertedBuffer 492 | } 493 | } 494 | 495 | // Other private methods 496 | extension Conversation { 497 | /// This hack is required because relying on `queuedSamples.isEmpty` directly crashes the app. 498 | /// This is because updating the `queuedSamples` array on a background thread will trigger a re-render of any views that depend on it on that thread. 499 | /// So, instead, we observe the property and update `isPlaying` on the main actor. 500 | private func _keepIsPlayingPropertyUpdated() { 501 | withObservationTracking { _ = queuedSamples.isEmpty } onChange: { [weak self] in 502 | Task { @MainActor in 503 | guard let self else { return } 504 | 505 | self.isPlaying = self.queuedSamples.isEmpty 506 | } 507 | 508 | self?._keepIsPlayingPropertyUpdated() 509 | } 510 | } 511 | } 512 | -------------------------------------------------------------------------------- /src/Extensions/AVAudioPCMBuffer+fromData.swift: -------------------------------------------------------------------------------- 1 | import AVFoundation 2 | 3 | extension AVAudioPCMBuffer { 4 | static func fromData(_ data: Data, format: AVAudioFormat) -> AVAudioPCMBuffer? 
{ 5 | let frameCount = UInt32(data.count) / format.streamDescription.pointee.mBytesPerFrame 6 | 7 | guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else { 8 | print("Error: Failed to create AVAudioPCMBuffer") 9 | return nil 10 | } 11 | 12 | buffer.frameLength = frameCount 13 | let audioBuffer = buffer.audioBufferList.pointee.mBuffers 14 | 15 | data.withUnsafeBytes { bufferPointer in 16 | guard let address = bufferPointer.baseAddress else { 17 | print("Error: Failed to get base address of data") 18 | return 19 | } 20 | 21 | audioBuffer.mData?.copyMemory(from: address, byteCount: Int(audioBuffer.mDataByteSize)) 22 | } 23 | 24 | return buffer 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/Extensions/Collection+safe.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | extension Collection { 4 | // Returns the element at the specified index if it is within bounds, otherwise nil. 5 | subscript(safe index: Index) -> Element? { 6 | indices.contains(index) ? self[index] : nil 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /src/Extensions/Continuation+error.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | extension AsyncThrowingStream.Continuation where Failure == any Error { 4 | func yield(error: Failure) { 5 | yield(with: Result.failure(error)) 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /src/Extensions/String+random.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | extension String { 4 | init(randomLength length: Int) { 5 | let letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" 6 | self = String((0.. Self { 111 | .updateSession(SessionUpdateEvent(eventId: id, session: session)) 112 | } 113 | 114 | static func appendInputAudioBuffer(id: String? = nil, encoding audio: Data) -> Self { 115 | .appendInputAudioBuffer(InputAudioBufferAppendEvent(eventId: id, audio: audio.base64EncodedString())) 116 | } 117 | 118 | static func commitInputAudioBuffer(id: String? = nil) -> Self { 119 | .commitInputAudioBuffer(InputAudioBufferCommitEvent(eventId: id)) 120 | } 121 | 122 | static func clearInputAudioBuffer(id: String? = nil) -> Self { 123 | .clearInputAudioBuffer(InputAudioBufferClearEvent(eventId: id)) 124 | } 125 | 126 | static func createConversationItem(id: String? = nil, previous previousID: String? = nil, _ item: Item) -> Self { 127 | .createConversationItem(ConversationItemCreateEvent(eventId: id, previousItemId: previousID, item: item)) 128 | } 129 | 130 | static func truncateConversationItem(id eventId: String? = nil, forItem itemId: String, at index: Int = 0, atAudioMs audioMs: Int) -> Self { 131 | .truncateConversationItem(ConversationItemTruncateEvent(eventId: eventId, itemId: itemId, contentIndex: index, audioEndMs: audioMs)) 132 | } 133 | 134 | static func deleteConversationItem(id eventId: String? = nil, for id: String? = nil, at index: Int, atAudio audioIndex: Int) -> Self { 135 | .deleteConversationItem(ConversationItemDeleteEvent(eventId: eventId, itemId: id, contentIndex: index, audioEndMs: audioIndex)) 136 | } 137 | 138 | static func createResponse(id: String? = nil, _ response: Response.Config? 
= nil) -> Self { 139 | .createResponse(ResponseCreateEvent(eventId: id, response: response)) 140 | } 141 | 142 | static func cancelResponse(id: String? = nil) -> Self { 143 | .cancelResponse(ResponseCancelEvent(eventId: id)) 144 | } 145 | } 146 | 147 | extension ClientEvent: Encodable { 148 | private enum CodingKeys: String, CodingKey { 149 | case type 150 | } 151 | 152 | public func encode(to encoder: Encoder) throws { 153 | switch self { 154 | case let .updateSession(event): 155 | try event.encode(to: encoder) 156 | case let .appendInputAudioBuffer(event): 157 | try event.encode(to: encoder) 158 | case let .commitInputAudioBuffer(event): 159 | try event.encode(to: encoder) 160 | case let .clearInputAudioBuffer(event): 161 | try event.encode(to: encoder) 162 | case let .createConversationItem(event): 163 | try event.encode(to: encoder) 164 | case let .truncateConversationItem(event): 165 | try event.encode(to: encoder) 166 | case let .deleteConversationItem(event): 167 | try event.encode(to: encoder) 168 | case let .createResponse(event): 169 | try event.encode(to: encoder) 170 | case let .cancelResponse(event): 171 | try event.encode(to: encoder) 172 | } 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /src/Models/Item.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | public enum Item: Identifiable, Equatable, Sendable { 4 | public enum ItemStatus: String, Codable, Sendable { 5 | case completed 6 | case in_progress 7 | case incomplete 8 | } 9 | 10 | public enum ItemRole: String, Codable, Sendable { 11 | case user 12 | case system 13 | case assistant 14 | } 15 | 16 | public struct Audio: Equatable, Sendable { 17 | /// Base64-encoded audio bytes. 18 | public var audio: Data 19 | /// The transcript of the audio. 20 | public var transcript: String? 21 | 22 | public init(audio: Data = Data(), transcript: String? = nil) { 23 | self.audio = audio 24 | self.transcript = transcript 25 | } 26 | } 27 | 28 | public enum ContentPart: Equatable, Sendable { 29 | case text(String) 30 | case audio(Audio) 31 | } 32 | 33 | public struct Message: Identifiable, Codable, Equatable, Sendable { 34 | public enum Content: Equatable, Sendable { 35 | case text(String) 36 | case audio(Audio) 37 | case input_text(String) 38 | case input_audio(Audio) 39 | 40 | public var text: String? { 41 | switch self { 42 | case let .text(text): 43 | return text 44 | case let .input_text(text): 45 | return text 46 | case let .input_audio(audio): 47 | return audio.transcript 48 | case let .audio(audio): 49 | return audio.transcript 50 | } 51 | } 52 | } 53 | 54 | /// The unique ID of the item. 55 | public var id: String 56 | /// The type of the item 57 | private var type: String = "message" 58 | /// The status of the item 59 | public var status: ItemStatus 60 | /// The role associated with the item 61 | public var role: ItemRole 62 | /// The content of the message. 63 | public var content: [Content] 64 | 65 | public init(id: String, from role: ItemRole, content: [Content]) { 66 | self.id = id 67 | self.role = role 68 | status = .completed 69 | self.content = content 70 | } 71 | } 72 | 73 | public struct FunctionCall: Identifiable, Codable, Equatable, Sendable { 74 | /// The unique ID of the item. 
75 | public var id: String 76 | /// The type of the item 77 | private var type: String = "function_call" 78 | /// The status of the item 79 | public var status: ItemStatus 80 | /// The ID of the function call 81 | public var callId: String 82 | /// The name of the function being called 83 | public var name: String 84 | /// The arguments of the function call 85 | public var arguments: String 86 | } 87 | 88 | public struct FunctionCallOutput: Identifiable, Codable, Equatable, Sendable { 89 | /// The unique ID of the item. 90 | public var id: String 91 | /// The type of the item 92 | private var type: String = "function_call_output" 93 | /// The ID of the function call 94 | public var callId: String 95 | /// The output of the function call 96 | public var output: String 97 | 98 | public init(id: String, callId: String, output: String) { 99 | self.id = id 100 | self.callId = callId 101 | self.output = output 102 | } 103 | } 104 | 105 | case message(Message) 106 | case functionCall(FunctionCall) 107 | case functionCallOutput(FunctionCallOutput) 108 | 109 | public var id: String { 110 | switch self { 111 | case let .message(message): 112 | return message.id 113 | case let .functionCall(functionCall): 114 | return functionCall.id 115 | case let .functionCallOutput(functionCallOutput): 116 | return functionCallOutput.id 117 | } 118 | } 119 | 120 | public init(message: Message) { 121 | self = .message(message) 122 | } 123 | 124 | public init(calling functionCall: FunctionCall) { 125 | self = .functionCall(functionCall) 126 | } 127 | 128 | public init(with functionCallOutput: FunctionCallOutput) { 129 | self = .functionCallOutput(functionCallOutput) 130 | } 131 | } 132 | 133 | // MARK: Helpers 134 | 135 | public extension Item.Message.Content { 136 | init(from part: Item.ContentPart) { 137 | switch part { 138 | case let .audio(audio): 139 | self = .audio(audio) 140 | case let .text(text): 141 | self = .text(text) 142 | } 143 | } 144 | } 145 | 146 | // MARK: Codable implementations 147 | 148 | extension Item: Codable { 149 | private enum CodingKeys: String, CodingKey { 150 | case type 151 | } 152 | 153 | public init(from decoder: any Decoder) throws { 154 | let container = try decoder.container(keyedBy: CodingKeys.self) 155 | let type = try container.decode(String.self, forKey: .type) 156 | 157 | switch type { 158 | case "message": 159 | self = try .message(Message(from: decoder)) 160 | case "function_call": 161 | self = try .functionCall(FunctionCall(from: decoder)) 162 | case "function_call_output": 163 | self = try .functionCallOutput(FunctionCallOutput(from: decoder)) 164 | default: 165 | throw DecodingError.dataCorruptedError(forKey: .type, in: container, debugDescription: "Unknown item type: \(type)") 166 | } 167 | } 168 | 169 | public func encode(to encoder: Encoder) throws { 170 | switch self { 171 | case let .message(message): 172 | try message.encode(to: encoder) 173 | case let .functionCall(functionCall): 174 | try functionCall.encode(to: encoder) 175 | case let .functionCallOutput(functionCallOutput): 176 | try functionCallOutput.encode(to: encoder) 177 | } 178 | } 179 | } 180 | 181 | extension Item.Audio: Decodable { 182 | private enum CodingKeys: String, CodingKey { 183 | case audio 184 | case transcript 185 | } 186 | 187 | public init(from decoder: any Decoder) throws { 188 | let container = try decoder.container(keyedBy: CodingKeys.self) 189 | transcript = try container.decodeIfPresent(String.self, forKey: .transcript) 190 | let encodedAudio = try 
container.decodeIfPresent(String.self, forKey: .audio) 191 | 192 | if let encodedAudio { 193 | guard let decodedAudio = Data(base64Encoded: encodedAudio) else { 194 | throw DecodingError.dataCorruptedError(forKey: .audio, in: container, debugDescription: "Invalid base64-encoded audio data.") 195 | } 196 | audio = decodedAudio 197 | } else { 198 | audio = Data() 199 | } 200 | } 201 | } 202 | 203 | extension Item.ContentPart: Decodable { 204 | private enum CodingKeys: String, CodingKey { 205 | case type 206 | case text 207 | case audio 208 | case transcript 209 | } 210 | 211 | private struct Text: Codable { 212 | let text: String 213 | 214 | enum CodingKeys: CodingKey { 215 | case text 216 | } 217 | } 218 | 219 | public init(from decoder: any Decoder) throws { 220 | let container = try decoder.container(keyedBy: CodingKeys.self) 221 | let type = try container.decode(String.self, forKey: .type) 222 | 223 | switch type { 224 | case "text": 225 | let container = try decoder.container(keyedBy: Text.CodingKeys.self) 226 | self = try .text(container.decode(String.self, forKey: .text)) 227 | case "audio": 228 | self = try .audio(Item.Audio(from: decoder)) 229 | default: 230 | throw DecodingError.dataCorruptedError(forKey: .type, in: container, debugDescription: "Unknown content type: \(type)") 231 | } 232 | } 233 | } 234 | 235 | extension Item.Message.Content: Codable { 236 | private enum CodingKeys: String, CodingKey { 237 | case type 238 | case text 239 | case audio 240 | case transcript 241 | } 242 | 243 | private struct Text: Codable { 244 | let text: String 245 | 246 | enum CodingKeys: CodingKey { 247 | case text 248 | } 249 | } 250 | 251 | public init(from decoder: any Decoder) throws { 252 | let container = try decoder.container(keyedBy: CodingKeys.self) 253 | let type = try container.decode(String.self, forKey: .type) 254 | 255 | switch type { 256 | case "text": 257 | let container = try decoder.container(keyedBy: Text.CodingKeys.self) 258 | self = try .text(container.decode(String.self, forKey: .text)) 259 | case "input_text": 260 | let container = try decoder.container(keyedBy: Text.CodingKeys.self) 261 | self = try .input_text(container.decode(String.self, forKey: .text)) 262 | case "audio": 263 | self = try .audio(Item.Audio(from: decoder)) 264 | case "input_audio": 265 | self = try .input_audio(Item.Audio(from: decoder)) 266 | default: 267 | throw DecodingError.dataCorruptedError(forKey: .type, in: container, debugDescription: "Unknown content type: \(type)") 268 | } 269 | } 270 | 271 | public func encode(to encoder: Encoder) throws { 272 | var container = encoder.container(keyedBy: CodingKeys.self) 273 | 274 | switch self { 275 | case let .text(text): 276 | try container.encode(text, forKey: .text) 277 | try container.encode("text", forKey: .type) 278 | case let .input_text(text): 279 | try container.encode(text, forKey: .text) 280 | try container.encode("input_text", forKey: .type) 281 | case let .audio(audio): 282 | try container.encode("audio", forKey: .type) 283 | try container.encode(audio.transcript, forKey: .transcript) 284 | try container.encode(audio.audio.base64EncodedString(), forKey: .audio) 285 | case let .input_audio(audio): 286 | try container.encode("input_audio", forKey: .type) 287 | try container.encode(audio.transcript, forKey: .transcript) 288 | try container.encode(audio.audio.base64EncodedString(), forKey: .audio) 289 | } 290 | } 291 | } 292 | -------------------------------------------------------------------------------- /src/Models/Response.swift: 
-------------------------------------------------------------------------------- 1 | public struct Response: Identifiable, Codable, Equatable, Sendable { 2 | public struct Config: Codable, Equatable, Sendable { 3 | public enum Conversation: String, Codable, Equatable, Sendable { 4 | /// The contents of the response will be added to the default conversation. 5 | case auto 6 | /// An out-of-band response which will not add items to default conversation. 7 | case none 8 | } 9 | 10 | /// The modalities for the response. 11 | public let modalities: [Session.Modality] 12 | /// Instructions for the model. 13 | public let instructions: String 14 | /// The voice the model uses to respond. 15 | public let voice: Session.Voice 16 | /// The format of output audio. 17 | public let outputAudioFormat: Session.AudioFormat 18 | /// Tools (functions) available to the model. 19 | public let tools: [Session.Tool] 20 | /// How the model chooses tools. 21 | public let toolChoice: Session.ToolChoice 22 | /// Sampling temperature. 23 | public let temperature: Double 24 | /// Maximum number of output tokens. 25 | public let maxResponseOutputTokens: Int? 26 | /// Controls which conversation the response is added to. 27 | public let conversation: Conversation? 28 | /// Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format. Keys can be a maximum of 64 characters long and values can be a maximum of 512 characters long. 29 | public let metadata: [String: String]? 30 | /// Input items to include in the prompt for the model. Creates a new context for this response, without including the default conversation. Can include references to items from the default conversation. 31 | public let input: [Item]? 32 | 33 | public init(modalities: [Session.Modality] = [.text, .audio], instructions: String, voice: Session.Voice = .alloy, outputAudioFormat: Session.AudioFormat = .pcm16, tools: [Session.Tool] = [], toolChoice: Session.ToolChoice = .auto, temperature: Double = 1, maxResponseOutputTokens: Int? = nil, conversation: Conversation? = .auto, metadata: [String: String]? = nil, input: [Item]? 
= nil) { 34 | self.input = input 35 | self.voice = voice 36 | self.tools = tools 37 | self.metadata = metadata 38 | self.toolChoice = toolChoice 39 | self.modalities = modalities 40 | self.temperature = temperature 41 | self.instructions = instructions 42 | self.conversation = conversation 43 | self.outputAudioFormat = outputAudioFormat 44 | self.maxResponseOutputTokens = maxResponseOutputTokens 45 | } 46 | } 47 | 48 | public enum Status: String, Codable, Equatable, Sendable { 49 | case failed 50 | case completed 51 | case cancelled 52 | case incomplete 53 | case inProgress = "in_progress" 54 | } 55 | 56 | public struct Usage: Codable, Equatable, Sendable { 57 | public let totalTokens: Int 58 | public let inputTokens: Int 59 | public let outputTokens: Int 60 | public let inputTokenDetails: InputTokenDetails 61 | public let outputTokenDetails: OutputTokenDetails 62 | 63 | public struct InputTokenDetails: Codable, Equatable, Sendable { 64 | public let textTokens: Int 65 | public let audioTokens: Int 66 | public let cachedTokens: Int 67 | public let cachedTokensDetails: CachedTokensDetails 68 | 69 | public struct CachedTokensDetails: Codable, Equatable, Sendable { 70 | public let textTokens: Int 71 | public let audioTokens: Int 72 | } 73 | } 74 | 75 | public struct OutputTokenDetails: Codable, Equatable, Sendable { 76 | public let textTokens: Int 77 | public let audioTokens: Int 78 | } 79 | } 80 | 81 | /// The unique ID of the response. 82 | public let id: String 83 | /// The status of the response. 84 | public let status: Status 85 | /// The list of output items generated by the response. 86 | public let output: [Item] 87 | /// Usage statistics for the response. 88 | public let usage: Usage? 89 | /// Developer-provided string key-value pairs associated with this response. 90 | public let metadata: [String: String]? 91 | } 92 | -------------------------------------------------------------------------------- /src/Models/ServerError.swift: -------------------------------------------------------------------------------- 1 | public struct ServerError: Codable, Equatable, Sendable { 2 | /// The type of error (e.g., "invalid_request_error", "server_error"). 3 | public let type: String 4 | /// Error code, if any. 5 | public let code: String? 6 | /// A human-readable error message. 7 | public let message: String 8 | /// Parameter related to the error, if any. 9 | public let param: String? 10 | /// The eventId of the client event that caused the error, if applicable. 11 | public let eventId: String? 12 | } 13 | -------------------------------------------------------------------------------- /src/Models/ServerEvent.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | public enum ServerEvent: Sendable { 4 | public struct ErrorEvent: Decodable, Sendable { 5 | /// The unique ID of the server event. 6 | public let eventId: String 7 | /// Details of the error. 8 | public let error: ServerError 9 | } 10 | 11 | public struct SessionEvent: Decodable, Sendable { 12 | /// The unique ID of the server event. 13 | public let eventId: String 14 | /// The session resource. 15 | public let session: Session 16 | } 17 | 18 | public struct ConversationCreatedEvent: Decodable, Sendable { 19 | public struct Conversation: Codable, Sendable { 20 | /// The unique ID of the conversation. 21 | public let id: String 22 | } 23 | 24 | /// The unique ID of the server event. 25 | public let eventId: String 26 | /// The conversation resource. 
27 | public let conversation: Conversation 28 | } 29 | 30 | public struct InputAudioBufferCommittedEvent: Decodable, Sendable { 31 | /// The unique ID of the server event. 32 | public let eventId: String 33 | /// The ID of the preceding item after which the new item will be inserted. 34 | public let previousItemId: String? 35 | /// The ID of the user message item that will be created. 36 | public let itemId: String 37 | } 38 | 39 | public struct InputAudioBufferClearedEvent: Decodable, Sendable { 40 | /// The unique ID of the server event. 41 | public let eventId: String 42 | } 43 | 44 | public struct InputAudioBufferSpeechStartedEvent: Decodable, Sendable { 45 | /// The unique ID of the server event. 46 | public let eventId: String 47 | /// Milliseconds since the session started when speech was detected. 48 | public let audioStartMs: Int 49 | /// The ID of the user message item that will be created when speech stops. 50 | public let itemId: String 51 | } 52 | 53 | public struct InputAudioBufferSpeechStoppedEvent: Decodable, Sendable { 54 | /// The unique ID of the server event. 55 | public let eventId: String 56 | /// Milliseconds since the session started when speech stopped. 57 | public let audioEndMs: Int 58 | /// The ID of the user message item that will be created. 59 | public let itemId: String 60 | } 61 | 62 | public struct ConversationItemCreatedEvent: Decodable, Sendable { 63 | /// The unique ID of the server event. 64 | public let eventId: String 65 | /// The ID of the preceding item. 66 | public let previousItemId: String? 67 | /// The item that was created. 68 | public let item: Item 69 | } 70 | 71 | public struct ConversationItemInputAudioTranscriptionCompletedEvent: Decodable, Sendable { 72 | /// The unique ID of the server event. 73 | public let eventId: String 74 | /// The ID of the user message item. 75 | public let itemId: String 76 | /// The index of the content part containing the audio. 77 | public let contentIndex: Int 78 | /// The transcribed text. 79 | public let transcript: String 80 | } 81 | 82 | public struct ConversationItemInputAudioTranscriptionDeltaEvent: Decodable, Sendable { 83 | /// The unique ID of the server event. 84 | public let eventId: String 85 | /// The ID of the user message item. 86 | public let itemId: String 87 | /// The index of the content part containing the audio. 88 | public let contentIndex: Int 89 | /// The transcribed delta text. 90 | public let delta: String 91 | } 92 | 93 | public struct ConversationItemInputAudioTranscriptionFailedEvent: Decodable, Sendable { 94 | /// The unique ID of the server event. 95 | public let eventId: String 96 | /// The ID of the user message item. 97 | public let itemId: String 98 | /// The index of the content part containing the audio. 99 | public let contentIndex: Int 100 | /// Details of the transcription error. 101 | public let error: ServerError 102 | } 103 | 104 | public struct ConversationItemTruncatedEvent: Decodable, Sendable { 105 | /// The unique ID of the server event. 106 | public let eventId: String 107 | /// The ID of the assistant message item that was truncated. 108 | public let itemId: String 109 | /// The index of the content part that was truncated. 110 | public let contentIndex: Int 111 | /// The duration up to which the audio was truncated, in milliseconds. 112 | public let audioEndMs: Int 113 | } 114 | 115 | public struct ConversationItemDeletedEvent: Decodable, Sendable { 116 | /// The unique ID of the server event. 
117 | public let eventId: String 118 | /// The ID of the item that was deleted. 119 | public let itemId: String 120 | } 121 | 122 | public struct OutputAudioBufferStartedEvent: Decodable, Sendable { 123 | /// The unique ID of the server event. 124 | public let eventId: String 125 | /// The ID of the response. 126 | public let responseId: String 127 | } 128 | 129 | public struct OutputAudioBufferStoppedEvent: Decodable, Sendable { 130 | /// The unique ID of the server event. 131 | public let eventId: String 132 | /// The ID of the response. 133 | public let responseId: String 134 | } 135 | 136 | public struct ResponseEvent: Decodable, Sendable { 137 | /// The unique ID of the server event. 138 | public let eventId: String 139 | /// The response resource. 140 | public let response: Response 141 | } 142 | 143 | public struct ResponseOutputItemAddedEvent: Decodable, Sendable { 144 | /// The unique ID of the server event. 145 | public let eventId: String 146 | /// The ID of the response to which the item belongs. 147 | public let responseId: String 148 | /// The index of the output item in the response. 149 | public let outputIndex: Int 150 | /// The item that was added. 151 | public let item: Item 152 | } 153 | 154 | public struct ResponseOutputItemDoneEvent: Decodable, Sendable { 155 | /// The unique ID of the server event. 156 | public let eventId: String 157 | /// The ID of the response to which the item belongs. 158 | public let responseId: String 159 | /// The index of the output item in the response. 160 | public let outputIndex: Int 161 | /// The completed item. 162 | public let item: Item 163 | } 164 | 165 | public struct ResponseContentPartAddedEvent: Decodable, Sendable { 166 | /// The unique ID of the server event. 167 | public let eventId: String 168 | /// The ID of the response. 169 | public let responseId: String 170 | /// The ID of the item to which the content part was added. 171 | public let itemId: String 172 | /// The index of the output item in the response. 173 | public let outputIndex: Int 174 | /// The index of the content part in the item's content array. 175 | public let contentIndex: Int 176 | /// The content part that was added. 177 | public let part: Item.ContentPart 178 | } 179 | 180 | public struct ResponseContentPartDoneEvent: Decodable, Sendable { 181 | /// The unique ID of the server event. 182 | public let eventId: String 183 | /// The ID of the response. 184 | public let responseId: String 185 | /// The ID of the item. 186 | public let itemId: String 187 | /// The index of the output item in the response. 188 | public let outputIndex: Int 189 | /// The index of the content part in the item's content array. 190 | public let contentIndex: Int 191 | /// The content part that is done. 192 | public let part: Item.ContentPart 193 | } 194 | 195 | public struct ResponseTextDeltaEvent: Decodable, Sendable { 196 | /// The unique ID of the server event. 197 | public let eventId: String 198 | /// The ID of the response. 199 | public let responseId: String 200 | /// The ID of the item. 201 | public let itemId: String 202 | /// The index of the output item in the response. 203 | public let outputIndex: Int 204 | /// The index of the content part in the item's content array. 205 | public let contentIndex: Int 206 | /// The text delta. 207 | public let delta: String 208 | } 209 | 210 | public struct ResponseTextDoneEvent: Decodable, Sendable { 211 | /// The unique ID of the server event. 212 | public let eventId: String 213 | /// The ID of the response. 
214 | public let responseId: String 215 | /// The ID of the item. 216 | public let itemId: String 217 | /// The index of the output item in the response. 218 | public let outputIndex: Int 219 | /// The index of the content part in the item's content array. 220 | public let contentIndex: Int 221 | /// The final text content. 222 | public let text: String 223 | } 224 | 225 | public struct ResponseAudioTranscriptDeltaEvent: Decodable, Sendable { 226 | /// The unique ID of the server event. 227 | public let eventId: String 228 | /// The ID of the response. 229 | public let responseId: String 230 | /// The ID of the item. 231 | public let itemId: String 232 | /// The index of the output item in the response. 233 | public let outputIndex: Int 234 | /// The index of the content part in the item's content array. 235 | public let contentIndex: Int 236 | /// The transcript delta. 237 | public let delta: String 238 | } 239 | 240 | public struct ResponseAudioTranscriptDoneEvent: Decodable, Sendable { 241 | /// The unique ID of the server event. 242 | public let eventId: String 243 | /// The ID of the response. 244 | public let responseId: String 245 | /// The ID of the item. 246 | public let itemId: String 247 | /// The index of the output item in the response. 248 | public let outputIndex: Int 249 | /// The index of the content part in the item's content array. 250 | public let contentIndex: Int 251 | /// The final transcript of the audio. 252 | public let transcript: String 253 | } 254 | 255 | public struct ResponseAudioDeltaEvent: Sendable { 256 | /// The unique ID of the server event. 257 | public let eventId: String 258 | /// The ID of the response. 259 | public let responseId: String 260 | /// The ID of the item. 261 | public let itemId: String 262 | /// The index of the output item in the response. 263 | public let outputIndex: Int 264 | /// The index of the content part in the item's content array. 265 | public let contentIndex: Int 266 | /// Base64-encoded audio data delta. 267 | public let delta: Data 268 | } 269 | 270 | public struct ResponseAudioDoneEvent: Decodable, Sendable { 271 | /// The unique ID of the server event. 272 | public let eventId: String 273 | /// The ID of the response. 274 | public let responseId: String 275 | /// The ID of the item. 276 | public let itemId: String 277 | /// The index of the output item in the response. 278 | public let outputIndex: Int 279 | /// The index of the content part in the item's content array. 280 | public let contentIndex: Int 281 | } 282 | 283 | public struct ResponseFunctionCallArgumentsDeltaEvent: Decodable, Sendable { 284 | /// The unique ID of the server event. 285 | public let eventId: String 286 | /// The ID of the response. 287 | public let responseId: String 288 | /// The ID of the function call item. 289 | public let itemId: String 290 | /// The index of the output item in the response. 291 | public let outputIndex: Int 292 | /// The ID of the function call. 293 | public let callId: String 294 | /// The arguments delta as a JSON string. 295 | public let delta: String 296 | } 297 | 298 | public struct ResponseFunctionCallArgumentsDoneEvent: Decodable, Sendable { 299 | /// The unique ID of the server event. 300 | public let eventId: String 301 | /// The ID of the response. 302 | public let responseId: String 303 | /// The ID of the function call item. 304 | public let itemId: String 305 | /// The index of the output item in the response. 306 | public let outputIndex: Int 307 | /// The ID of the function call. 
308 | public let callId: String 309 | /// The final arguments as a JSON string. 310 | public let arguments: String 311 | } 312 | 313 | public struct RateLimitsUpdatedEvent: Decodable, Sendable { 314 | public struct RateLimit: Codable, Sendable { 315 | /// The name of the rate limit 316 | public let name: String 317 | /// The maximum allowed value for the rate limit. 318 | public let limit: Int 319 | /// The remaining value before the limit is reached. 320 | public let remaining: Int 321 | /// Seconds until the rate limit resets. 322 | public let resetSeconds: Double 323 | } 324 | 325 | /// The unique ID of the server event. 326 | public let eventId: String 327 | /// List of rate limit information. 328 | public let rateLimits: [RateLimit] 329 | } 330 | 331 | /// Returned when an error occurs. 332 | case error(ErrorEvent) 333 | /// Returned when a session is created. Emitted automatically when a new connection is established. 334 | case sessionCreated(SessionEvent) 335 | /// Returned when a session is updated. 336 | case sessionUpdated(SessionEvent) 337 | /// Returned when a conversation is created. Emitted right after session creation. 338 | case conversationCreated(ConversationCreatedEvent) 339 | /// Returned when an input audio buffer is committed, either by the client or automatically in server VAD mode. 340 | case inputAudioBufferCommitted(InputAudioBufferCommittedEvent) 341 | /// Returned when the input audio buffer is cleared by the client. 342 | case inputAudioBufferCleared(InputAudioBufferClearedEvent) 343 | /// Returned in server turn detection mode when speech is detected. 344 | case inputAudioBufferSpeechStarted(InputAudioBufferSpeechStartedEvent) 345 | /// Returned in server turn detection mode when speech stops. 346 | case inputAudioBufferSpeechStopped(InputAudioBufferSpeechStoppedEvent) 347 | /// Returned when a conversation item is created. 348 | case conversationItemCreated(ConversationItemCreatedEvent) 349 | /// Returned when input audio transcription is enabled and a transcription succeeds. 350 | case conversationItemInputAudioTranscriptionCompleted(ConversationItemInputAudioTranscriptionCompletedEvent) 351 | /// Returned when input audio transcription is enabled and a transcription receives delta. 352 | case conversationItemInputAudioTranscriptionDelta(ConversationItemInputAudioTranscriptionDeltaEvent) 353 | /// Returned when input audio transcription is configured, and a transcription request for a user message failed. 354 | case conversationItemInputAudioTranscriptionFailed(ConversationItemInputAudioTranscriptionFailedEvent) 355 | /// Returned when an earlier assistant audio message item is truncated by the client. 356 | case conversationItemTruncated(ConversationItemTruncatedEvent) 357 | /// Returned when an item in the conversation is deleted. 358 | case conversationItemDeleted(ConversationItemDeletedEvent) 359 | /// Returned when the output audio buffer is started. 360 | case outputAudioBufferStarted(OutputAudioBufferStartedEvent) 361 | /// Returned when the output audio buffer is stopped. 362 | case outputAudioBufferStopped(OutputAudioBufferStoppedEvent) 363 | /// Returned when a new Response is created. The first event of response creation, where the response is in an initial state of "in_progress". 364 | case responseCreated(ResponseEvent) 365 | /// Returned when a Response is done streaming. Always emitted, no matter the final state. 366 | case responseDone(ResponseEvent) 367 | /// Returned when a new Item is created during response generation. 
368 | case responseOutputItemAdded(ResponseOutputItemAddedEvent) 369 | /// Returned when an Item is done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. 370 | case responseOutputItemDone(ResponseOutputItemDoneEvent) 371 | /// Returned when a new content part is added to an assistant message item during response generation. 372 | case responseContentPartAdded(ResponseContentPartAddedEvent) 373 | /// Returned when a content part is done streaming in an assistant message item. Also emitted when a Response is interrupted, incomplete, or cancelled. 374 | case responseContentPartDone(ResponseContentPartDoneEvent) 375 | /// Returned when the text value of a "text" content part is updated. 376 | case responseTextDelta(ResponseTextDeltaEvent) 377 | /// Returned when the text value of a "text" content part is done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. 378 | case responseTextDone(ResponseTextDoneEvent) 379 | /// Returned when the model-generated transcription of audio output is updated. 380 | case responseAudioTranscriptDelta(ResponseAudioTranscriptDeltaEvent) 381 | /// Returned when the model-generated transcription of audio output is done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. 382 | case responseAudioTranscriptDone(ResponseAudioTranscriptDoneEvent) 383 | /// Returned when the model-generated audio is updated. 384 | case responseAudioDelta(ResponseAudioDeltaEvent) 385 | /// Returned when the model-generated audio is done. Also emitted when a Response is interrupted, incomplete, or cancelled. 386 | case responseAudioDone(ResponseAudioDoneEvent) 387 | /// Returned when the model-generated function call arguments are updated. 388 | case responseFunctionCallArgumentsDelta(ResponseFunctionCallArgumentsDeltaEvent) 389 | /// Returned when the model-generated function call arguments are done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. 390 | case responseFunctionCallArgumentsDone(ResponseFunctionCallArgumentsDoneEvent) 391 | /// Emitted after every "response.done" event to indicate the updated rate limits. 
392 | case rateLimitsUpdated(RateLimitsUpdatedEvent) 393 | } 394 | 395 | extension ServerEvent: Identifiable { 396 | public var id: String { 397 | switch self { 398 | case let .error(event): 399 | return event.eventId 400 | case let .sessionCreated(event): 401 | return event.eventId 402 | case let .sessionUpdated(event): 403 | return event.eventId 404 | case let .conversationCreated(event): 405 | return event.eventId 406 | case let .inputAudioBufferCommitted(event): 407 | return event.eventId 408 | case let .inputAudioBufferCleared(event): 409 | return event.eventId 410 | case let .inputAudioBufferSpeechStarted(event): 411 | return event.eventId 412 | case let .inputAudioBufferSpeechStopped(event): 413 | return event.eventId 414 | case let .conversationItemCreated(event): 415 | return event.eventId 416 | case let .conversationItemInputAudioTranscriptionCompleted(event): 417 | return event.eventId 418 | case let .conversationItemInputAudioTranscriptionDelta(event): 419 | return event.eventId 420 | case let .conversationItemInputAudioTranscriptionFailed(event): 421 | return event.eventId 422 | case let .conversationItemTruncated(event): 423 | return event.eventId 424 | case let .conversationItemDeleted(event): 425 | return event.eventId 426 | case let .outputAudioBufferStarted(event): 427 | return event.eventId 428 | case let .outputAudioBufferStopped(event): 429 | return event.eventId 430 | case let .responseCreated(event): 431 | return event.eventId 432 | case let .responseDone(event): 433 | return event.eventId 434 | case let .responseOutputItemAdded(event): 435 | return event.eventId 436 | case let .responseOutputItemDone(event): 437 | return event.eventId 438 | case let .responseContentPartAdded(event): 439 | return event.eventId 440 | case let .responseContentPartDone(event): 441 | return event.eventId 442 | case let .responseTextDelta(event): 443 | return event.eventId 444 | case let .responseTextDone(event): 445 | return event.eventId 446 | case let .responseAudioTranscriptDelta(event): 447 | return event.eventId 448 | case let .responseAudioTranscriptDone(event): 449 | return event.eventId 450 | case let .responseAudioDelta(event): 451 | return event.eventId 452 | case let .responseAudioDone(event): 453 | return event.eventId 454 | case let .responseFunctionCallArgumentsDelta(event): 455 | return event.eventId 456 | case let .responseFunctionCallArgumentsDone(event): 457 | return event.eventId 458 | case let .rateLimitsUpdated(event): 459 | return event.eventId 460 | } 461 | } 462 | } 463 | 464 | extension ServerEvent: Decodable { 465 | private enum CodingKeys: String, CodingKey { 466 | case type 467 | } 468 | 469 | public init(from decoder: any Decoder) throws { 470 | let container = try decoder.container(keyedBy: CodingKeys.self) 471 | let eventType = try container.decode(String.self, forKey: .type) 472 | 473 | switch eventType { 474 | case "error": 475 | self = try .error(ErrorEvent(from: decoder)) 476 | case "session.created": 477 | self = try .sessionCreated(SessionEvent(from: decoder)) 478 | case "session.updated": 479 | self = try .sessionUpdated(SessionEvent(from: decoder)) 480 | case "conversation.created": 481 | self = try .conversationCreated(ConversationCreatedEvent(from: decoder)) 482 | case "input_audio_buffer.committed": 483 | self = try .inputAudioBufferCommitted(InputAudioBufferCommittedEvent(from: decoder)) 484 | case "input_audio_buffer.cleared": 485 | self = try .inputAudioBufferCleared(InputAudioBufferClearedEvent(from: decoder)) 486 | case 
"input_audio_buffer.speech_started": 487 | self = try .inputAudioBufferSpeechStarted(InputAudioBufferSpeechStartedEvent(from: decoder)) 488 | case "input_audio_buffer.speech_stopped": 489 | self = try .inputAudioBufferSpeechStopped(InputAudioBufferSpeechStoppedEvent(from: decoder)) 490 | case "conversation.item.created": 491 | self = try .conversationItemCreated(ConversationItemCreatedEvent(from: decoder)) 492 | case "conversation.item.input_audio_transcription.completed": 493 | self = try .conversationItemInputAudioTranscriptionCompleted(ConversationItemInputAudioTranscriptionCompletedEvent(from: decoder)) 494 | case "conversation.item.input_audio_transcription.delta": 495 | self = try .conversationItemInputAudioTranscriptionDelta(ConversationItemInputAudioTranscriptionDeltaEvent(from: decoder)) 496 | case "conversation.item.input_audio_transcription.failed": 497 | self = try .conversationItemInputAudioTranscriptionFailed(ConversationItemInputAudioTranscriptionFailedEvent(from: decoder)) 498 | case "conversation.item.truncated": 499 | self = try .conversationItemTruncated(ConversationItemTruncatedEvent(from: decoder)) 500 | case "conversation.item.deleted": 501 | self = try .conversationItemDeleted(ConversationItemDeletedEvent(from: decoder)) 502 | case "output_audio_buffer.started": 503 | self = try .outputAudioBufferStarted(OutputAudioBufferStartedEvent(from: decoder)) 504 | case "output_audio_buffer.stopped": 505 | self = try .outputAudioBufferStopped(OutputAudioBufferStoppedEvent(from: decoder)) 506 | case "response.created": 507 | self = try .responseCreated(ResponseEvent(from: decoder)) 508 | case "response.done": 509 | self = try .responseDone(ResponseEvent(from: decoder)) 510 | case "response.output_item.added": 511 | self = try .responseOutputItemAdded(ResponseOutputItemAddedEvent(from: decoder)) 512 | case "response.output_item.done": 513 | self = try .responseOutputItemDone(ResponseOutputItemDoneEvent(from: decoder)) 514 | case "response.content_part.added": 515 | self = try .responseContentPartAdded(ResponseContentPartAddedEvent(from: decoder)) 516 | case "response.content_part.done": 517 | self = try .responseContentPartDone(ResponseContentPartDoneEvent(from: decoder)) 518 | case "response.text.delta": 519 | self = try .responseTextDelta(ResponseTextDeltaEvent(from: decoder)) 520 | case "response.text.done": 521 | self = try .responseTextDone(ResponseTextDoneEvent(from: decoder)) 522 | case "response.audio_transcript.delta": 523 | self = try .responseAudioTranscriptDelta(ResponseAudioTranscriptDeltaEvent(from: decoder)) 524 | case "response.audio_transcript.done": 525 | self = try .responseAudioTranscriptDone(ResponseAudioTranscriptDoneEvent(from: decoder)) 526 | case "response.audio.delta": 527 | self = try .responseAudioDelta(ResponseAudioDeltaEvent(from: decoder)) 528 | case "response.audio.done": 529 | self = try .responseAudioDone(ResponseAudioDoneEvent(from: decoder)) 530 | case "response.function_call_arguments.delta": 531 | self = try .responseFunctionCallArgumentsDelta(ResponseFunctionCallArgumentsDeltaEvent(from: decoder)) 532 | case "response.function_call_arguments.done": 533 | self = try .responseFunctionCallArgumentsDone(ResponseFunctionCallArgumentsDoneEvent(from: decoder)) 534 | case "rate_limits.updated": 535 | self = try .rateLimitsUpdated(RateLimitsUpdatedEvent(from: decoder)) 536 | default: 537 | throw DecodingError.dataCorruptedError(forKey: .type, in: container, debugDescription: "Unknown event type: \(eventType)") 538 | } 539 | } 540 | } 541 | 542 | 
extension ServerEvent.ResponseAudioDeltaEvent: Decodable { 543 | private enum CodingKeys: CodingKey { 544 | case eventId 545 | case responseId 546 | case itemId 547 | case outputIndex 548 | case contentIndex 549 | case delta 550 | } 551 | 552 | public init(from decoder: any Decoder) throws { 553 | let container = try decoder.container(keyedBy: CodingKeys.self) 554 | 555 | itemId = try container.decode(String.self, forKey: .itemId) 556 | eventId = try container.decode(String.self, forKey: .eventId) 557 | outputIndex = try container.decode(Int.self, forKey: .outputIndex) 558 | responseId = try container.decode(String.self, forKey: .responseId) 559 | contentIndex = try container.decode(Int.self, forKey: .contentIndex) 560 | 561 | guard let decodedDelta = try Data(base64Encoded: container.decode(String.self, forKey: .delta)) else { 562 | throw DecodingError.dataCorruptedError(forKey: .delta, in: container, debugDescription: "Invalid base64-encoded audio data.") 563 | } 564 | delta = decodedDelta 565 | } 566 | } 567 | -------------------------------------------------------------------------------- /src/Models/Session.swift: -------------------------------------------------------------------------------- 1 | public struct Session: Codable, Equatable, Sendable { 2 | public enum Modality: String, Codable, Sendable { 3 | case text 4 | case audio 5 | } 6 | 7 | public enum Voice: String, Codable, Sendable { 8 | case alloy 9 | case echo 10 | case shimmer 11 | case ash 12 | case ballad 13 | case coral 14 | case sage 15 | case verse 16 | case fable 17 | case onyx 18 | case nova 19 | } 20 | 21 | public enum AudioFormat: String, Codable, Sendable { 22 | case pcm16 23 | case g711_ulaw 24 | case g711_alaw 25 | } 26 | 27 | public struct InputAudioTranscription: Codable, Equatable, Sendable { 28 | public enum TranscriptionModel: String, CaseIterable, Codable, Sendable { 29 | case whisper = "whisper-1" 30 | case gpt4o = "gpt-4o-transcribe" 31 | case gpt4oMini = "gpt-4o-mini-transcribe" 32 | } 33 | 34 | /// The model to use for transcription 35 | public var model: TranscriptionModel 36 | /// The language of the input audio. Supplying the input language in ISO-639-1 (e.g. `en`) format will improve accuracy and latency. 37 | public var language: String? 38 | /// An optional text to guide the model's style or continue a previous audio segment. 39 | /// 40 | /// For `whisper`, the [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). For `gpt4o` models, the prompt is a free text string, for example "expect words related to technology". 41 | public var prompt: String? 42 | 43 | public init(model: TranscriptionModel = .whisper) { 44 | self.model = model 45 | } 46 | } 47 | 48 | public struct InputAudioNoiseReduction: Codable, Equatable, Sendable { 49 | /// Type of noise reduction. 50 | public enum NoiseReductionType: String, CaseIterable, Codable, Sendable { 51 | /// For close-talking microphones such as headphones 52 | case nearField = "near_field" 53 | /// For far-field microphones such as laptop or conference room microphones 54 | case farField = "far_field" 55 | } 56 | 57 | /// Type of noise reduction. 58 | public var type: NoiseReductionType? 59 | 60 | public init(type: NoiseReductionType? 
= nil) { 61 | self.type = type 62 | } 63 | } 64 | 65 | public struct TurnDetection: Codable, Equatable, Sendable { 66 | public enum TurnDetectionType: String, Codable, Sendable { 67 | case serverVad = "server_vad" 68 | case semanticVad = "semantic_vad" 69 | case none 70 | } 71 | 72 | public enum TurnDetectionEagerness: String, Codable, Sendable { 73 | case low 74 | case high 75 | case auto 76 | case medium 77 | } 78 | 79 | /// The type of turn detection. 80 | public var type: TurnDetectionType 81 | /// Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0). 82 | public var threshold: Double? 83 | /// Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs. 84 | public var interruptResponse: Bool? 85 | /// Used only for `server_vad` mode. Amount of audio to include before speech starts (in milliseconds). 86 | public var prefixPaddingMs: Int? 87 | /// Used only for `server_vad` mode. Duration of silence to detect speech stop (in milliseconds). 88 | public var silenceDurationMs: Int? 89 | /// Whether or not to automatically generate a response when VAD is enabled. 90 | public var createResponse: Bool 91 | /// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` will wait longer for the user to continue speaking, `high` will respond more quickly. `auto` is the default and is equivalent to `medium`. 92 | public var eagerness: TurnDetectionEagerness? 93 | 94 | public init(type: TurnDetectionType = .serverVad, threshold: Double? = nil, interruptResponse: Bool? = nil, prefixPaddingMs: Int? = nil, silenceDurationMs: Int? = nil, createResponse: Bool = true, eagerness: TurnDetectionEagerness? = nil) { 95 | self.type = type 96 | self.eagerness = eagerness 97 | self.threshold = threshold 98 | self.createResponse = createResponse 99 | self.prefixPaddingMs = prefixPaddingMs 100 | self.silenceDurationMs = silenceDurationMs 101 | self.interruptResponse = interruptResponse 102 | } 103 | 104 | public static func serverVad(threshold: Double? = nil, interruptResponse: Bool? = nil, prefixPaddingMs: Int? = nil, silenceDurationMs: Int? = nil) -> TurnDetection { 105 | .init(type: .serverVad, threshold: threshold, interruptResponse: interruptResponse, prefixPaddingMs: prefixPaddingMs, silenceDurationMs: silenceDurationMs) 106 | } 107 | 108 | public static func semanticVad(eagerness: TurnDetectionEagerness = .auto) -> TurnDetection { 109 | .init(type: .semanticVad, eagerness: eagerness) 110 | } 111 | } 112 | 113 | public struct Tool: Codable, Equatable, Sendable { 114 | public struct FunctionParameters: Codable, Equatable, Sendable { 115 | public var type: JSONType 116 | public var properties: [String: Property]? 117 | public var required: [String]? 118 | public var pattern: String? 119 | public var const: String? 120 | public var `enum`: [String]? 121 | public var multipleOf: Int? 122 | public var minimum: Int? 123 | public var maximum: Int? 124 | 125 | public init( 126 | type: JSONType, 127 | properties: [String: Property]? = nil, 128 | required: [String]? = nil, 129 | pattern: String? = nil, 130 | const: String? = nil, 131 | enum: [String]? = nil, 132 | multipleOf: Int? = nil, 133 | minimum: Int? = nil, 134 | maximum: Int? 
= nil 135 | ) { 136 | self.type = type 137 | self.properties = properties 138 | self.required = required 139 | self.pattern = pattern 140 | self.const = const 141 | self.enum = `enum` 142 | self.multipleOf = multipleOf 143 | self.minimum = minimum 144 | self.maximum = maximum 145 | } 146 | 147 | public struct Property: Codable, Equatable, Sendable { 148 | public var type: JSONType 149 | public var description: String? 150 | public var format: String? 151 | public var items: Items? 152 | public var required: [String]? 153 | public var pattern: String? 154 | public var const: String? 155 | public var `enum`: [String]? 156 | public var multipleOf: Int? 157 | public var minimum: Double? 158 | public var maximum: Double? 159 | public var minItems: Int? 160 | public var maxItems: Int? 161 | public var uniqueItems: Bool? 162 | 163 | public init( 164 | type: JSONType, 165 | description: String? = nil, 166 | format: String? = nil, 167 | items: Self.Items? = nil, 168 | required: [String]? = nil, 169 | pattern: String? = nil, 170 | const: String? = nil, 171 | enum: [String]? = nil, 172 | multipleOf: Int? = nil, 173 | minimum: Double? = nil, 174 | maximum: Double? = nil, 175 | minItems: Int? = nil, 176 | maxItems: Int? = nil, 177 | uniqueItems: Bool? = nil 178 | ) { 179 | self.type = type 180 | self.description = description 181 | self.format = format 182 | self.items = items 183 | self.required = required 184 | self.pattern = pattern 185 | self.const = const 186 | self.enum = `enum` 187 | self.multipleOf = multipleOf 188 | self.minimum = minimum 189 | self.maximum = maximum 190 | self.minItems = minItems 191 | self.maxItems = maxItems 192 | self.uniqueItems = uniqueItems 193 | } 194 | 195 | public struct Items: Codable, Equatable, Sendable { 196 | public var type: JSONType 197 | public var properties: [String: Property]? 198 | public var pattern: String? 199 | public var const: String? 200 | public var `enum`: [String]? 201 | public var multipleOf: Int? 202 | public var minimum: Double? 203 | public var maximum: Double? 204 | public var minItems: Int? 205 | public var maxItems: Int? 206 | public var uniqueItems: Bool? 207 | 208 | public init( 209 | type: JSONType, 210 | properties: [String: Property]? = nil, 211 | pattern: String? = nil, 212 | const: String? = nil, 213 | enum: [String]? = nil, 214 | multipleOf: Int? = nil, 215 | minimum: Double? = nil, 216 | maximum: Double? = nil, 217 | minItems: Int? = nil, 218 | maxItems: Int? = nil, 219 | uniqueItems: Bool? = nil 220 | ) { 221 | self.type = type 222 | self.properties = properties 223 | self.pattern = pattern 224 | self.const = const 225 | self.enum = `enum` 226 | self.multipleOf = multipleOf 227 | self.minimum = minimum 228 | self.maximum = maximum 229 | self.minItems = minItems 230 | self.maxItems = maxItems 231 | self.uniqueItems = uniqueItems 232 | } 233 | } 234 | } 235 | 236 | public enum JSONType: String, Codable, Sendable { 237 | case integer 238 | case string 239 | case boolean 240 | case array 241 | case object 242 | case number 243 | case null 244 | } 245 | } 246 | 247 | /// The type of the tool. 248 | public var type: String = "function" 249 | /// The name of the function. 250 | public var name: String 251 | /// The description of the function. 252 | public var description: String 253 | /// Parameters of the function in JSON Schema. 
254 | public var parameters: FunctionParameters 255 | 256 | public init(type: String = "function", name: String, description: String, parameters: FunctionParameters) { 257 | self.type = type 258 | self.name = name 259 | self.description = description 260 | self.parameters = parameters 261 | } 262 | } 263 | 264 | public enum ToolChoice: Equatable, Sendable { 265 | case auto 266 | case none 267 | case required 268 | case function(String) 269 | 270 | public init(function name: String) { 271 | self = .function(name) 272 | } 273 | } 274 | 275 | /// The unique ID of the session. 276 | public var id: String? 277 | /// The default model used for this session. 278 | public var model: String 279 | /// The set of modalities the model can respond with. 280 | public var modalities: [Modality] 281 | /// The default system instructions. 282 | public var instructions: String 283 | /// The voice the model uses to respond. 284 | public var voice: Voice 285 | /// The format of input audio. 286 | public var inputAudioFormat: AudioFormat 287 | /// The format of output audio. 288 | public var outputAudioFormat: AudioFormat 289 | /// Configuration for input audio transcription. 290 | public var inputAudioTranscription: InputAudioTranscription? 291 | /// Configuration for input audio noise reduction. 292 | public var inputAudioNoiseReduction: InputAudioNoiseReduction? 293 | /// Configuration for turn detection. 294 | public var turnDetection: TurnDetection? 295 | /// Tools (functions) available to the model. 296 | public var tools: [Tool] 297 | /// How the model chooses tools. 298 | public var toolChoice: ToolChoice 299 | /// Sampling temperature. 300 | public var temperature: Double 301 | /// Maximum number of output tokens. 302 | public var maxOutputTokens: Int? 303 | 304 | public init( 305 | id: String? = nil, 306 | model: String, 307 | tools: [Tool] = [], 308 | instructions: String, 309 | voice: Voice = .alloy, 310 | temperature: Double = 1, 311 | maxOutputTokens: Int? = nil, 312 | toolChoice: ToolChoice = .auto, 313 | turnDetection: TurnDetection? = nil, 314 | inputAudioFormat: AudioFormat = .pcm16, 315 | outputAudioFormat: AudioFormat = .pcm16, 316 | modalities: [Modality] = [.text, .audio], 317 | inputAudioTranscription: InputAudioTranscription? = nil 318 | ) { 319 | self.id = id 320 | self.model = model 321 | self.tools = tools 322 | self.voice = voice 323 | self.toolChoice = toolChoice 324 | self.modalities = modalities 325 | self.temperature = temperature 326 | self.instructions = instructions 327 | self.turnDetection = turnDetection 328 | self.maxOutputTokens = maxOutputTokens 329 | self.inputAudioFormat = inputAudioFormat 330 | self.outputAudioFormat = outputAudioFormat 331 | self.inputAudioTranscription = inputAudioTranscription 332 | } 333 | } 334 | 335 | extension Session.ToolChoice: Codable { 336 | private enum FunctionCall: Codable { 337 | case type 338 | case function 339 | 340 | enum CodingKeys: CodingKey { 341 | case type 342 | case function 343 | } 344 | } 345 | 346 | public init(from decoder: any Decoder) throws { 347 | let container = try decoder.singleValueContainer() 348 | 349 | if let stringValue = try? 
container.decode(String.self) { 350 | switch stringValue { 351 | case "none": 352 | self = .none 353 | case "auto": 354 | self = .auto 355 | case "required": 356 | self = .required 357 | default: 358 | throw DecodingError.dataCorruptedError(in: container, debugDescription: "Invalid value for enum.") 359 | } 360 | } else { 361 | let container = try decoder.container(keyedBy: FunctionCall.CodingKeys.self) 362 | let functionContainer = try container.decode([String: String].self, forKey: .function) 363 | 364 | guard let name = functionContainer["name"] else { 365 | throw DecodingError.dataCorruptedError(forKey: .function, in: container, debugDescription: "Missing function name.") 366 | } 367 | 368 | self = .function(name) 369 | } 370 | } 371 | 372 | public func encode(to encoder: Encoder) throws { 373 | switch self { 374 | case .none: 375 | var container = encoder.singleValueContainer() 376 | try container.encode("none") 377 | case .auto: 378 | var container = encoder.singleValueContainer() 379 | try container.encode("auto") 380 | case .required: 381 | var container = encoder.singleValueContainer() 382 | try container.encode("required") 383 | case let .function(name): 384 | var container = encoder.container(keyedBy: FunctionCall.CodingKeys.self) 385 | try container.encode("function", forKey: .type) 386 | try container.encode(["name": name], forKey: .function) 387 | } 388 | } 389 | } 390 | -------------------------------------------------------------------------------- /src/OpenAIRealtime.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | #if canImport(FoundationNetworking) 3 | import FoundationNetworking 4 | #endif 5 | 6 | enum RealtimeAPIError: Error { 7 | case invalidMessage 8 | } 9 | 10 | public final class RealtimeAPI: NSObject, Sendable { 11 | @MainActor public var onDisconnect: (@Sendable () -> Void)? { 12 | get { connector.onDisconnect } 13 | set { connector.onDisconnect(newValue) } 14 | } 15 | 16 | public var events: AsyncThrowingStream<ServerEvent, Error> { 17 | connector.events 18 | } 19 | 20 | let connector: any Connector 21 | 22 | /// Connect to the OpenAI Realtime API using the given connector instance. 23 | public init(connector: any Connector) { 24 | self.connector = connector 25 | 26 | super.init() 27 | } 28 | 29 | public func send(event: ClientEvent) async throws { 30 | try await connector.send(event: event) 31 | } 32 | } 33 | 34 | /// Helper methods for connecting to the OpenAI Realtime API. 35 | extension RealtimeAPI { 36 | /// Connect to the OpenAI WebSocket Realtime API with the given request. 37 | static func webSocket(connectingTo request: URLRequest) -> RealtimeAPI { 38 | RealtimeAPI(connector: WebSocketConnector(connectingTo: request)) 39 | } 40 | 41 | /// Connect to the OpenAI WebSocket Realtime API with the given authentication token and model. 42 | static func webSocket(authToken: String, model: String = "gpt-4o-realtime-preview") -> RealtimeAPI { 43 | var request = URLRequest(url: URL(string: "wss://api.openai.com/v1/realtime")!.appending(queryItems: [ 44 | URLQueryItem(name: "model", value: model), 45 | ])) 46 | request.addValue("realtime=v1", forHTTPHeaderField: "OpenAI-Beta") 47 | request.addValue("Bearer \(authToken)", forHTTPHeaderField: "Authorization") 48 | 49 | return webSocket(connectingTo: request) 50 | } 51 | 52 | /// Connect to the OpenAI WebRTC Realtime API with the given request.
53 | static func webRTC(connectingTo request: URLRequest) async throws -> RealtimeAPI { 54 | try RealtimeAPI(connector: await WebRTCConnector(connectingTo: request)) 55 | } 56 | 57 | /// Connect to the OpenAI WebRTC Realtime API with the given authentication token and model. 58 | static func webRTC(authToken: String, model: String = "gpt-4o-realtime-preview") async throws -> RealtimeAPI { 59 | var request = URLRequest(url: URL(string: "wss://api.openai.com/v1/realtime")!.appending(queryItems: [ 60 | URLQueryItem(name: "model", value: model), 61 | ])) 62 | 63 | request.addValue("realtime=v1", forHTTPHeaderField: "OpenAI-Beta") 64 | request.addValue("Bearer \(authToken)", forHTTPHeaderField: "Authorization") 65 | return try await webRTC(connectingTo: request) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/Protocols/Connector.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | #if canImport(FoundationNetworking) 3 | import FoundationNetworking 4 | #endif 5 | 6 | public protocol Connector: Sendable { 7 | var events: AsyncThrowingStream<ServerEvent, Error> { get } 8 | @MainActor var onDisconnect: (@Sendable () -> Void)? { get } 9 | 10 | init(connectingTo request: URLRequest) async throws 11 | 12 | func send(event: ClientEvent) async throws 13 | 14 | @MainActor func onDisconnect(_ action: (@Sendable () -> Void)?) 15 | } 16 | -------------------------------------------------------------------------------- /src/Support/UnsafeInteriorMutable.swift: -------------------------------------------------------------------------------- 1 | final class UnsafeInteriorMutable<T: Sendable>: @unchecked Sendable { 2 | private var value: T? 3 | 4 | func set(_ value: T) { 5 | self.value = value 6 | } 7 | 8 | func get() -> T? { 9 | return value 10 | } 11 | 12 | func lazy(_ closure: () -> T?) -> T? { 13 | if case let .some(wrapped) = value { 14 | return wrapped 15 | } 16 | 17 | if let newValue = closure() { 18 | value = newValue 19 | return newValue 20 | } 21 | 22 | return nil 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/Support/UnsafeMutableArray.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | /// An unsafe mutable array that can be accessed from multiple threads. 4 | /// > Warning: Exposing any observable property externally (such as by having a computed property use `isEmpty`) will lead to very hard-to-debug crashes. 5 | /// > 6 | /// > If you really need to, manually observe the property using `withObservationTracking` and write changes in the main actor. 7 | @Observable final class UnsafeMutableArray<T: Sendable>: @unchecked Sendable { 8 | private var array = [T]() 9 | 10 | public var isEmpty: Bool { 11 | array.isEmpty 12 | } 13 | 14 | var first: T? { 15 | array.first 16 | } 17 | 18 | func push(_ value: T) { 19 | array.append(value) 20 | } 21 | 22 | @discardableResult 23 | func popFirst() -> T? { 24 | array.isEmpty ? nil : array.removeFirst() // return nil instead of trapping when the array is empty 25 | } 26 | 27 | func clear() { 28 | array.removeAll() 29 | } 30 | } 31 | --------------------------------------------------------------------------------
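Usage sketch (not part of the package): connecting and consuming server events. This is a minimal, hypothetical example; it assumes the `webSocket(authToken:model:)` helper above is reachable from the caller (in this dump it carries no explicit access modifier, so it is internal to the module) and that a real API key replaces the placeholder. The event cases and fields used here are the ones declared in src/Models/ServerEvent.swift.

import Foundation

func listen(authToken: String) async throws {
	let api = RealtimeAPI.webSocket(authToken: authToken)

	// `events` is the AsyncThrowingStream<ServerEvent, Error> surfaced by the connector.
	for try await event in api.events {
		switch event {
		case let .sessionCreated(event):
			print("Session started: \(event.session.id ?? "unknown")")
		case let .responseTextDelta(delta):
			print("Text delta for item \(delta.itemId): \(delta.delta)")
		case let .responseAudioDelta(delta):
			// `delta.delta` has already been base64-decoded into raw audio bytes.
			print("Received \(delta.delta.count) bytes of audio")
		case let .error(event):
			print("Server error: \(event.error.message)")
		default:
			break
		}
	}
}

Outbound traffic goes through `api.send(event:)` with a `ClientEvent`; that type lives in src/Models/ClientEvent.swift and is not shown in this section, so it is omitted from the sketch.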
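A second sketch, showing how the nested `Session.Tool` / `FunctionParameters` / `Property` types from src/Models/Session.swift compose into a JSON-Schema-style function definition, and how a tool is attached to a `Response.Config`. The tool name, description, and property names are made up for illustration; only the public initializers shown above are used.

let weatherTool = Session.Tool(
	name: "get_weather",
	description: "Look up the current weather for a city.",
	parameters: .init(
		type: .object,
		properties: [
			"city": .init(type: .string, description: "City name, e.g. \"Paris\""),
			"unit": .init(type: .string, `enum`: ["celsius", "fahrenheit"]),
		],
		required: ["city"]
	)
)

// The same tool list can be passed to a Session or to a single out-of-band response:
let config = Response.Config(
	instructions: "You are a helpful weather assistant.",
	tools: [weatherTool],
	toolChoice: .auto
)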