├── LICENSE ├── README.md ├── VoiceModeWebRTCSwift.xcodeproj ├── project.pbxproj ├── project.xcworkspace │ ├── contents.xcworkspacedata │ └── xcshareddata │ │ └── swiftpm │ │ └── Package.resolved └── xcuserdata │ └── pallav.xcuserdatad │ └── xcschemes │ └── xcschememanagement.plist └── VoiceModeWebRTCSwift ├── Assets.xcassets ├── AccentColor.colorset │ └── Contents.json ├── AppIcon.appiconset │ ├── AppIcon1024x1024.png │ └── Contents.json └── Contents.json ├── ContentView.swift ├── Preview Content └── Preview Assets.xcassets │ └── Contents.json ├── VoiceModeWebRTCSwiftApp.swift └── WebRTCManager.swift /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Pallav Agarwal 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenAI Swift Realtime API with WebRTC 2 | ![Swift](https://img.shields.io/badge/Swift-5-orange?logo=swift) 3 | ![iOS](https://img.shields.io/badge/iOS-16%2B-blue?logo=apple) 4 | 5 | ### Overview 6 | 7 | This Xcode project demos OpenAI's [**Realtime API with WebRTC**](https://platform.openai.com/docs/guides/realtime-webrtc) (Advanced Voice Mode). It's an iOS application built with SwiftUI, AVFoundation, and the [WebRTC](https://github.com/stasel/WebRTC) package. It supports full AVM capabilities including interrupting the audio, sending text events manually, and controlling options such as the system message, realtime audio model, and voice. 8 | 9 | https://github.com/user-attachments/assets/0e731764-569a-4f35-976e-972ef16cb699 10 | 11 | > This video demos the iOS application running on macOS 12 | 13 | --- 14 | 15 | ## Requirements 16 | - iOS 16.0 or later 17 | - OpenAI API Key 18 | 19 | --- 20 | 21 | ## Installation 22 | 23 | 1. **Clone the Repository**: 24 | ```bash 25 | git clone https://github.com/PallavAg/VoiceModeWebRTCSwift.git 26 | ``` 27 | 28 | 2. **Setup API Key**: 29 | - Replace the placeholder `API_KEY` in the code with your OpenAI API key: 30 | ```swift 31 | let API_KEY = "your_openai_api_key" 32 | ``` 33 | - Alternatively, you can specify the OpenAI API Key in the app itself 34 | 35 | 3. **Run the App**: 36 | - Go to the **Signing & Capabilities** section to first specify your account. 37 | - Build and run the app on your iOS device, macOS device, or simulator. 38 | 39 | --- 40 | 41 | ## Usage 42 | 43 | 1. **Start Connection**: 44 | - Launch the app and enter your API key in **Settings** if not specified already. 45 | - Select your preferred AI model and voice, then press 'Start Connection' to begin the conversation. 46 | 47 | 2. 
**Interact**: 48 | - Use the text input field or speak into the microphone to interact with the Realtime API. 49 | 50 | --- 51 | 52 | ## Key Components 53 | 54 | - **`ContentView`**: 55 | - The primary UI that orchestrates conversation, input, and connection controls. 56 | - **`WebRTCManager`**: 57 | - Handles WebRTC connection setup, data channel communication, and audio processing. 58 | - **`OptionsView`**: 59 | - Allows customization of API keys, models, and voice settings. 60 | 61 | --- 62 | 63 | ## Troubleshooting 64 | 65 | - **Microphone Permission**: 66 | - Ensure the app has microphone access in iOS settings. 67 | - **Connection Issues**: 68 | - Check API key validity and server accessibility. 69 | 70 | --- 71 | 72 | ## License 73 | 74 | This project is licensed under the MIT License. See the `LICENSE` file for details. 75 | -------------------------------------------------------------------------------- /VoiceModeWebRTCSwift.xcodeproj/project.pbxproj: -------------------------------------------------------------------------------- 1 | // !$*UTF8*$! 
2 | { 3 | archiveVersion = 1; 4 | classes = { 5 | }; 6 | objectVersion = 77; 7 | objects = { 8 | 9 | /* Begin PBXBuildFile section */ 10 | 8CDC395F2D29034E008A13BF /* WebRTC in Frameworks */ = {isa = PBXBuildFile; productRef = 8CDC395E2D29034E008A13BF /* WebRTC */; }; 11 | /* End PBXBuildFile section */ 12 | 13 | /* Begin PBXFileReference section */ 14 | 8CDC39492D29030B008A13BF /* VoiceModeWebRTCSwift.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = VoiceModeWebRTCSwift.app; sourceTree = BUILT_PRODUCTS_DIR; }; 15 | /* End PBXFileReference section */ 16 | 17 | /* Begin PBXFileSystemSynchronizedRootGroup section */ 18 | 8CDC394B2D29030B008A13BF /* VoiceModeWebRTCSwift */ = { 19 | isa = PBXFileSystemSynchronizedRootGroup; 20 | path = VoiceModeWebRTCSwift; 21 | sourceTree = ""; 22 | }; 23 | /* End PBXFileSystemSynchronizedRootGroup section */ 24 | 25 | /* Begin PBXFrameworksBuildPhase section */ 26 | 8CDC39462D29030B008A13BF /* Frameworks */ = { 27 | isa = PBXFrameworksBuildPhase; 28 | buildActionMask = 2147483647; 29 | files = ( 30 | 8CDC395F2D29034E008A13BF /* WebRTC in Frameworks */, 31 | ); 32 | runOnlyForDeploymentPostprocessing = 0; 33 | }; 34 | /* End PBXFrameworksBuildPhase section */ 35 | 36 | /* Begin PBXGroup section */ 37 | 8CDC39402D29030B008A13BF = { 38 | isa = PBXGroup; 39 | children = ( 40 | 8CDC394B2D29030B008A13BF /* VoiceModeWebRTCSwift */, 41 | 8CDC394A2D29030B008A13BF /* Products */, 42 | ); 43 | sourceTree = ""; 44 | }; 45 | 8CDC394A2D29030B008A13BF /* Products */ = { 46 | isa = PBXGroup; 47 | children = ( 48 | 8CDC39492D29030B008A13BF /* VoiceModeWebRTCSwift.app */, 49 | ); 50 | name = Products; 51 | sourceTree = ""; 52 | }; 53 | /* End PBXGroup section */ 54 | 55 | /* Begin PBXNativeTarget section */ 56 | 8CDC39482D29030B008A13BF /* VoiceModeWebRTCSwift */ = { 57 | isa = PBXNativeTarget; 58 | buildConfigurationList = 8CDC39572D29030C008A13BF /* Build configuration list for PBXNativeTarget 
"VoiceModeWebRTCSwift" */; 59 | buildPhases = ( 60 | 8CDC39452D29030B008A13BF /* Sources */, 61 | 8CDC39462D29030B008A13BF /* Frameworks */, 62 | 8CDC39472D29030B008A13BF /* Resources */, 63 | ); 64 | buildRules = ( 65 | ); 66 | dependencies = ( 67 | ); 68 | fileSystemSynchronizedGroups = ( 69 | 8CDC394B2D29030B008A13BF /* VoiceModeWebRTCSwift */, 70 | ); 71 | name = VoiceModeWebRTCSwift; 72 | packageProductDependencies = ( 73 | 8CDC395E2D29034E008A13BF /* WebRTC */, 74 | ); 75 | productName = VoiceModeWebRTCSwift; 76 | productReference = 8CDC39492D29030B008A13BF /* VoiceModeWebRTCSwift.app */; 77 | productType = "com.apple.product-type.application"; 78 | }; 79 | /* End PBXNativeTarget section */ 80 | 81 | /* Begin PBXProject section */ 82 | 8CDC39412D29030B008A13BF /* Project object */ = { 83 | isa = PBXProject; 84 | attributes = { 85 | BuildIndependentTargetsInParallel = 1; 86 | LastSwiftUpdateCheck = 1620; 87 | LastUpgradeCheck = 1620; 88 | TargetAttributes = { 89 | 8CDC39482D29030B008A13BF = { 90 | CreatedOnToolsVersion = 16.2; 91 | }; 92 | }; 93 | }; 94 | buildConfigurationList = 8CDC39442D29030B008A13BF /* Build configuration list for PBXProject "VoiceModeWebRTCSwift" */; 95 | developmentRegion = en; 96 | hasScannedForEncodings = 0; 97 | knownRegions = ( 98 | en, 99 | Base, 100 | ); 101 | mainGroup = 8CDC39402D29030B008A13BF; 102 | minimizedProjectReferenceProxies = 1; 103 | packageReferences = ( 104 | 8CDC395D2D29034E008A13BF /* XCRemoteSwiftPackageReference "WebRTC" */, 105 | ); 106 | preferredProjectObjectVersion = 77; 107 | productRefGroup = 8CDC394A2D29030B008A13BF /* Products */; 108 | projectDirPath = ""; 109 | projectRoot = ""; 110 | targets = ( 111 | 8CDC39482D29030B008A13BF /* VoiceModeWebRTCSwift */, 112 | ); 113 | }; 114 | /* End PBXProject section */ 115 | 116 | /* Begin PBXResourcesBuildPhase section */ 117 | 8CDC39472D29030B008A13BF /* Resources */ = { 118 | isa = PBXResourcesBuildPhase; 119 | buildActionMask = 2147483647; 120 | files = ( 121 | 
); 122 | runOnlyForDeploymentPostprocessing = 0; 123 | }; 124 | /* End PBXResourcesBuildPhase section */ 125 | 126 | /* Begin PBXSourcesBuildPhase section */ 127 | 8CDC39452D29030B008A13BF /* Sources */ = { 128 | isa = PBXSourcesBuildPhase; 129 | buildActionMask = 2147483647; 130 | files = ( 131 | ); 132 | runOnlyForDeploymentPostprocessing = 0; 133 | }; 134 | /* End PBXSourcesBuildPhase section */ 135 | 136 | /* Begin XCBuildConfiguration section */ 137 | 8CDC39552D29030C008A13BF /* Debug */ = { 138 | isa = XCBuildConfiguration; 139 | buildSettings = { 140 | ALWAYS_SEARCH_USER_PATHS = NO; 141 | ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; 142 | CLANG_ANALYZER_NONNULL = YES; 143 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; 144 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; 145 | CLANG_ENABLE_MODULES = YES; 146 | CLANG_ENABLE_OBJC_ARC = YES; 147 | CLANG_ENABLE_OBJC_WEAK = YES; 148 | CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; 149 | CLANG_WARN_BOOL_CONVERSION = YES; 150 | CLANG_WARN_COMMA = YES; 151 | CLANG_WARN_CONSTANT_CONVERSION = YES; 152 | CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; 153 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 154 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES; 155 | CLANG_WARN_EMPTY_BODY = YES; 156 | CLANG_WARN_ENUM_CONVERSION = YES; 157 | CLANG_WARN_INFINITE_RECURSION = YES; 158 | CLANG_WARN_INT_CONVERSION = YES; 159 | CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; 160 | CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; 161 | CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; 162 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 163 | CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; 164 | CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; 165 | CLANG_WARN_STRICT_PROTOTYPES = YES; 166 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 167 | CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; 168 | CLANG_WARN_UNREACHABLE_CODE = YES; 169 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 170 | COPY_PHASE_STRIP = NO; 171 | DEBUG_INFORMATION_FORMAT = dwarf; 
172 | ENABLE_STRICT_OBJC_MSGSEND = YES; 173 | ENABLE_TESTABILITY = YES; 174 | ENABLE_USER_SCRIPT_SANDBOXING = YES; 175 | GCC_C_LANGUAGE_STANDARD = gnu17; 176 | GCC_DYNAMIC_NO_PIC = NO; 177 | GCC_NO_COMMON_BLOCKS = YES; 178 | GCC_OPTIMIZATION_LEVEL = 0; 179 | GCC_PREPROCESSOR_DEFINITIONS = ( 180 | "DEBUG=1", 181 | "$(inherited)", 182 | ); 183 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 184 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 185 | GCC_WARN_UNDECLARED_SELECTOR = YES; 186 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 187 | GCC_WARN_UNUSED_FUNCTION = YES; 188 | GCC_WARN_UNUSED_VARIABLE = YES; 189 | IPHONEOS_DEPLOYMENT_TARGET = 18.2; 190 | LOCALIZATION_PREFERS_STRING_CATALOGS = YES; 191 | MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; 192 | MTL_FAST_MATH = YES; 193 | ONLY_ACTIVE_ARCH = YES; 194 | SDKROOT = iphoneos; 195 | SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; 196 | SWIFT_OPTIMIZATION_LEVEL = "-Onone"; 197 | }; 198 | name = Debug; 199 | }; 200 | 8CDC39562D29030C008A13BF /* Release */ = { 201 | isa = XCBuildConfiguration; 202 | buildSettings = { 203 | ALWAYS_SEARCH_USER_PATHS = NO; 204 | ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; 205 | CLANG_ANALYZER_NONNULL = YES; 206 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; 207 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; 208 | CLANG_ENABLE_MODULES = YES; 209 | CLANG_ENABLE_OBJC_ARC = YES; 210 | CLANG_ENABLE_OBJC_WEAK = YES; 211 | CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; 212 | CLANG_WARN_BOOL_CONVERSION = YES; 213 | CLANG_WARN_COMMA = YES; 214 | CLANG_WARN_CONSTANT_CONVERSION = YES; 215 | CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; 216 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 217 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES; 218 | CLANG_WARN_EMPTY_BODY = YES; 219 | CLANG_WARN_ENUM_CONVERSION = YES; 220 | CLANG_WARN_INFINITE_RECURSION = YES; 221 | CLANG_WARN_INT_CONVERSION = YES; 222 | CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; 223 | 
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; 224 | CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; 225 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 226 | CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; 227 | CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; 228 | CLANG_WARN_STRICT_PROTOTYPES = YES; 229 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 230 | CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; 231 | CLANG_WARN_UNREACHABLE_CODE = YES; 232 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 233 | COPY_PHASE_STRIP = NO; 234 | DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; 235 | ENABLE_NS_ASSERTIONS = NO; 236 | ENABLE_STRICT_OBJC_MSGSEND = YES; 237 | ENABLE_USER_SCRIPT_SANDBOXING = YES; 238 | GCC_C_LANGUAGE_STANDARD = gnu17; 239 | GCC_NO_COMMON_BLOCKS = YES; 240 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 241 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 242 | GCC_WARN_UNDECLARED_SELECTOR = YES; 243 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 244 | GCC_WARN_UNUSED_FUNCTION = YES; 245 | GCC_WARN_UNUSED_VARIABLE = YES; 246 | IPHONEOS_DEPLOYMENT_TARGET = 18.2; 247 | LOCALIZATION_PREFERS_STRING_CATALOGS = YES; 248 | MTL_ENABLE_DEBUG_INFO = NO; 249 | MTL_FAST_MATH = YES; 250 | SDKROOT = iphoneos; 251 | SWIFT_COMPILATION_MODE = wholemodule; 252 | VALIDATE_PRODUCT = YES; 253 | }; 254 | name = Release; 255 | }; 256 | 8CDC39582D29030C008A13BF /* Debug */ = { 257 | isa = XCBuildConfiguration; 258 | buildSettings = { 259 | ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; 260 | ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; 261 | CODE_SIGN_STYLE = Automatic; 262 | CURRENT_PROJECT_VERSION = 1; 263 | DEVELOPMENT_ASSET_PATHS = "\"VoiceModeWebRTCSwift/Preview Content\""; 264 | DEVELOPMENT_TEAM = ""; 265 | ENABLE_PREVIEWS = YES; 266 | GENERATE_INFOPLIST_FILE = YES; 267 | INFOPLIST_KEY_CFBundleDisplayName = AVM; 268 | INFOPLIST_KEY_NSMicrophoneUsageDescription = "Microphone access required for Advanced Voice Mode"; 269 | INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; 270 | 
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; 271 | INFOPLIST_KEY_UILaunchScreen_Generation = YES; 272 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 273 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 274 | IPHONEOS_DEPLOYMENT_TARGET = 16.0; 275 | LD_RUNPATH_SEARCH_PATHS = ( 276 | "$(inherited)", 277 | "@executable_path/Frameworks", 278 | ); 279 | MARKETING_VERSION = 1.0; 280 | PRODUCT_BUNDLE_IDENTIFIER = sampleApp.VoiceModeWebRTCSwift; 281 | PRODUCT_NAME = "$(TARGET_NAME)"; 282 | SWIFT_EMIT_LOC_STRINGS = YES; 283 | SWIFT_VERSION = 5.0; 284 | TARGETED_DEVICE_FAMILY = "1,2"; 285 | }; 286 | name = Debug; 287 | }; 288 | 8CDC39592D29030C008A13BF /* Release */ = { 289 | isa = XCBuildConfiguration; 290 | buildSettings = { 291 | ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; 292 | ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; 293 | CODE_SIGN_STYLE = Automatic; 294 | CURRENT_PROJECT_VERSION = 1; 295 | DEVELOPMENT_ASSET_PATHS = "\"VoiceModeWebRTCSwift/Preview Content\""; 296 | DEVELOPMENT_TEAM = ""; 297 | ENABLE_PREVIEWS = YES; 298 | GENERATE_INFOPLIST_FILE = YES; 299 | INFOPLIST_KEY_CFBundleDisplayName = AVM; 300 | INFOPLIST_KEY_NSMicrophoneUsageDescription = "Microphone access required for Advanced Voice Mode"; 301 | INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; 302 | INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; 303 | INFOPLIST_KEY_UILaunchScreen_Generation = YES; 304 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 305 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = 
"UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 306 | IPHONEOS_DEPLOYMENT_TARGET = 16.0; 307 | LD_RUNPATH_SEARCH_PATHS = ( 308 | "$(inherited)", 309 | "@executable_path/Frameworks", 310 | ); 311 | MARKETING_VERSION = 1.0; 312 | PRODUCT_BUNDLE_IDENTIFIER = sampleApp.VoiceModeWebRTCSwift; 313 | PRODUCT_NAME = "$(TARGET_NAME)"; 314 | SWIFT_EMIT_LOC_STRINGS = YES; 315 | SWIFT_VERSION = 5.0; 316 | TARGETED_DEVICE_FAMILY = "1,2"; 317 | }; 318 | name = Release; 319 | }; 320 | /* End XCBuildConfiguration section */ 321 | 322 | /* Begin XCConfigurationList section */ 323 | 8CDC39442D29030B008A13BF /* Build configuration list for PBXProject "VoiceModeWebRTCSwift" */ = { 324 | isa = XCConfigurationList; 325 | buildConfigurations = ( 326 | 8CDC39552D29030C008A13BF /* Debug */, 327 | 8CDC39562D29030C008A13BF /* Release */, 328 | ); 329 | defaultConfigurationIsVisible = 0; 330 | defaultConfigurationName = Release; 331 | }; 332 | 8CDC39572D29030C008A13BF /* Build configuration list for PBXNativeTarget "VoiceModeWebRTCSwift" */ = { 333 | isa = XCConfigurationList; 334 | buildConfigurations = ( 335 | 8CDC39582D29030C008A13BF /* Debug */, 336 | 8CDC39592D29030C008A13BF /* Release */, 337 | ); 338 | defaultConfigurationIsVisible = 0; 339 | defaultConfigurationName = Release; 340 | }; 341 | /* End XCConfigurationList section */ 342 | 343 | /* Begin XCRemoteSwiftPackageReference section */ 344 | 8CDC395D2D29034E008A13BF /* XCRemoteSwiftPackageReference "WebRTC" */ = { 345 | isa = XCRemoteSwiftPackageReference; 346 | repositoryURL = "https://github.com/stasel/WebRTC.git"; 347 | requirement = { 348 | kind = upToNextMajorVersion; 349 | minimumVersion = 130.0.0; 350 | }; 351 | }; 352 | /* End XCRemoteSwiftPackageReference section */ 353 | 354 | /* Begin XCSwiftPackageProductDependency section */ 355 | 8CDC395E2D29034E008A13BF /* WebRTC */ = { 356 | isa = XCSwiftPackageProductDependency; 357 | package = 8CDC395D2D29034E008A13BF /* 
XCRemoteSwiftPackageReference "WebRTC" */; 358 | productName = WebRTC; 359 | }; 360 | /* End XCSwiftPackageProductDependency section */ 361 | }; 362 | rootObject = 8CDC39412D29030B008A13BF /* Project object */; 363 | } 364 | -------------------------------------------------------------------------------- /VoiceModeWebRTCSwift.xcodeproj/project.xcworkspace/contents.xcworkspacedata: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /VoiceModeWebRTCSwift.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "originHash" : "7ca243407a3b132834ec3b117a3aea85a8f743151ef868a872c8bbf8082921f6", 3 | "pins" : [ 4 | { 5 | "identity" : "webrtc", 6 | "kind" : "remoteSourceControl", 7 | "location" : "https://github.com/stasel/WebRTC.git", 8 | "state" : { 9 | "revision" : "1048f8396529c10e259f8240d0c2cd607a13defd", 10 | "version" : "130.0.0" 11 | } 12 | } 13 | ], 14 | "version" : 3 15 | } 16 | -------------------------------------------------------------------------------- /VoiceModeWebRTCSwift.xcodeproj/xcuserdata/pallav.xcuserdatad/xcschemes/xcschememanagement.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | SchemeUserState 6 | 7 | VoiceModeWebRTCSwift.xcscheme_^#shared#^_ 8 | 9 | orderHint 10 | 0 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /VoiceModeWebRTCSwift/Assets.xcassets/AccentColor.colorset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "colors" : [ 3 | { 4 | "idiom" : "universal" 5 | } 6 | ], 7 | "info" : { 8 | "author" : "xcode", 9 | "version" : 1 10 | } 11 | } 12 | -------------------------------------------------------------------------------- 
/VoiceModeWebRTCSwift/Assets.xcassets/AppIcon.appiconset/AppIcon1024x1024.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PallavAg/VoiceModeWebRTCSwift/2b76b6ad3f4e4f0540377e3fe1c7e8c98e9242a5/VoiceModeWebRTCSwift/Assets.xcassets/AppIcon.appiconset/AppIcon1024x1024.png -------------------------------------------------------------------------------- /VoiceModeWebRTCSwift/Assets.xcassets/AppIcon.appiconset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "images" : [ 3 | { 4 | "filename" : "AppIcon1024x1024.png", 5 | "idiom" : "universal", 6 | "platform" : "ios", 7 | "size" : "1024x1024" 8 | }, 9 | { 10 | "appearances" : [ 11 | { 12 | "appearance" : "luminosity", 13 | "value" : "dark" 14 | } 15 | ], 16 | "idiom" : "universal", 17 | "platform" : "ios", 18 | "size" : "1024x1024" 19 | }, 20 | { 21 | "appearances" : [ 22 | { 23 | "appearance" : "luminosity", 24 | "value" : "tinted" 25 | } 26 | ], 27 | "idiom" : "universal", 28 | "platform" : "ios", 29 | "size" : "1024x1024" 30 | } 31 | ], 32 | "info" : { 33 | "author" : "xcode", 34 | "version" : 1 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /VoiceModeWebRTCSwift/Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /VoiceModeWebRTCSwift/ContentView.swift: -------------------------------------------------------------------------------- 1 | import SwiftUI 2 | import AVFoundation 3 | 4 | let API_KEY = "" 5 | 6 | struct ContentView: View { 7 | @StateObject private var webrtcManager = WebRTCManager() 8 | 9 | @State private var showOptionsSheet = false 10 | @FocusState private var isTextFieldFocused: Bool 11 | 12 | // AppStorage 
properties 13 | @AppStorage("apiKey") private var apiKey = API_KEY 14 | @AppStorage("systemMessage") private var systemMessage = "You are a helpful, witty, and friendly AI. Act like a human. Your voice and personality should be warm and engaging, with a lively and playful tone. Talk quickly." 15 | @AppStorage("selectedModel") private var selectedModel = "gpt-4o-mini-realtime-preview-2024-12-17" 16 | @AppStorage("selectedVoice") private var selectedVoice = "alloy" 17 | 18 | // Constants 19 | private let modelOptions = [ 20 | "gpt-4o-mini-realtime-preview-2024-12-17", 21 | "gpt-4o-realtime-preview-2024-12-17" 22 | ] 23 | private let voiceOptions = ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"] 24 | 25 | var body: some View { 26 | VStack(spacing: 12) { 27 | HeaderView() 28 | ConnectionControls() 29 | Divider().padding(.vertical, 6) 30 | 31 | ConversationView() 32 | 33 | MessageInputView() 34 | } 35 | .onAppear(perform: requestMicrophonePermission) 36 | .sheet(isPresented: $showOptionsSheet) { 37 | OptionsView( 38 | apiKey: $apiKey, 39 | systemMessage: $systemMessage, 40 | selectedModel: $selectedModel, 41 | selectedVoice: $selectedVoice, 42 | modelOptions: modelOptions, 43 | voiceOptions: voiceOptions 44 | ) 45 | } 46 | } 47 | 48 | private func requestMicrophonePermission() { 49 | AVAudioSession.sharedInstance().requestRecordPermission { granted in 50 | print("Microphone permission granted: \(granted)") 51 | } 52 | if apiKey.isEmpty { 53 | showOptionsSheet = true 54 | } 55 | } 56 | 57 | @ViewBuilder 58 | private func HeaderView() -> some View { 59 | VStack(spacing: 2) { 60 | Text("Advanced Voice Mode") 61 | .font(.system(size: 24, weight: .bold)) 62 | .padding(.top, 12) 63 | .lineLimit(1) 64 | .minimumScaleFactor(0.5) 65 | Text("In Swift with WebRTC") 66 | .font(.system(size: 15, weight: .light)) 67 | .padding(.bottom, 10) 68 | } 69 | } 70 | 71 | @ViewBuilder 72 | private func ConnectionControls() -> some View { 73 | HStack { 74 | // 
Connection status indicator 75 | Circle() 76 | .frame(width: 12, height: 12) 77 | .foregroundColor(webrtcManager.connectionStatus.color) 78 | Text(webrtcManager.connectionStatus.description) 79 | .foregroundColor(webrtcManager.connectionStatus.color) 80 | .contentTransition(.numericText()) 81 | .animation(.easeInOut(duration: 0.3), value: webrtcManager.connectionStatus) 82 | .onChange(of: webrtcManager.connectionStatus) { _ in 83 | switch webrtcManager.connectionStatus { 84 | case .connecting: 85 | UIImpactFeedbackGenerator(style: .soft).impactOccurred() 86 | case .connected: 87 | UIImpactFeedbackGenerator(style: .medium).impactOccurred() 88 | case .disconnected: 89 | webrtcManager.eventTypeStr = "" 90 | } 91 | } 92 | 93 | Spacer() 94 | 95 | // Connection Button 96 | if webrtcManager.connectionStatus == .connected { 97 | Button("Stop Connection") { 98 | UIImpactFeedbackGenerator(style: .light).impactOccurred() 99 | webrtcManager.stopConnection() 100 | } 101 | .buttonStyle(.borderedProminent) 102 | } else { 103 | Button("Start Connection") { 104 | UIImpactFeedbackGenerator(style: .soft).impactOccurred() 105 | webrtcManager.connectionStatus = .connecting 106 | webrtcManager.startConnection( 107 | apiKey: apiKey, 108 | modelName: selectedModel, 109 | systemMessage: systemMessage, 110 | voice: selectedVoice 111 | ) 112 | } 113 | .buttonStyle(.borderedProminent) 114 | .disabled(webrtcManager.connectionStatus == .connecting) 115 | Button { 116 | showOptionsSheet.toggle() 117 | } label: { 118 | Image(systemName: "gearshape") 119 | } 120 | .padding(.leading, 10) 121 | } 122 | } 123 | .padding(.horizontal) 124 | } 125 | 126 | // MARK: - Conversation View 127 | @ViewBuilder 128 | private func ConversationView() -> some View { 129 | VStack(alignment: .leading, spacing: 0) { 130 | HStack { 131 | Text("Conversation") 132 | .font(.headline) 133 | Spacer() 134 | Text(webrtcManager.eventTypeStr) 135 | .font(.system(size: 12, weight: .medium, design: .monospaced)) 136 | 
.lineLimit(1) 137 | .minimumScaleFactor(0.5) 138 | .padding(.leading, 16) 139 | } 140 | .padding(.horizontal) 141 | ScrollView { 142 | VStack(alignment: .leading, spacing: 0) { 143 | ForEach(webrtcManager.conversation) { msg in 144 | MessageRow(msg: msg) 145 | } 146 | } 147 | .padding() 148 | } 149 | } 150 | } 151 | 152 | // MARK: - Message Row 153 | @ViewBuilder 154 | private func MessageRow(msg: ConversationItem) -> some View { 155 | HStack(alignment: .top, spacing: 8) { 156 | Image(systemName: msg.roleSymbol) 157 | .foregroundColor(msg.roleColor) 158 | .padding(.top, 4) 159 | Text(msg.text.trimmingCharacters(in: .whitespacesAndNewlines)) 160 | .frame(maxWidth: .infinity, alignment: .leading) 161 | .contentTransition(.numericText()) 162 | .animation(.easeInOut(duration: 0.1), value: msg.text) 163 | } 164 | .contextMenu { 165 | Button("Copy") { 166 | UIPasteboard.general.string = msg.text 167 | } 168 | } 169 | .padding(.bottom, msg.role == "assistant" ? 24 : 8) 170 | } 171 | 172 | // MARK: - Message Input 173 | @ViewBuilder 174 | private func MessageInputView() -> some View { 175 | HStack { 176 | TextField("Insert message...", text: $webrtcManager.outgoingMessage, axis: .vertical) 177 | .textFieldStyle(.roundedBorder) 178 | .focused($isTextFieldFocused) 179 | Button("Send") { 180 | webrtcManager.sendMessage() 181 | isTextFieldFocused = false 182 | } 183 | .disabled(webrtcManager.connectionStatus != .connected) 184 | .buttonStyle(.bordered) 185 | } 186 | .padding([.horizontal, .bottom]) 187 | } 188 | } 189 | 190 | struct OptionsView: View { 191 | @Binding var apiKey: String 192 | @Binding var systemMessage: String 193 | @Binding var selectedModel: String 194 | @Binding var selectedVoice: String 195 | 196 | let modelOptions: [String] 197 | let voiceOptions: [String] 198 | 199 | @Environment(\.presentationMode) var presentationMode 200 | 201 | var body: some View { 202 | NavigationView { 203 | Form { 204 | Section(header: Text("API Key")) { 205 | TextField("Enter API 
Key", text: $apiKey) 206 | .autocapitalization(.none) 207 | } 208 | Section(header: Text("System Message")) { 209 | TextEditor(text: $systemMessage) 210 | .frame(minHeight: 100) 211 | .cornerRadius(5) 212 | } 213 | Section(header: Text("Model")) { 214 | Picker("Model", selection: $selectedModel) { 215 | ForEach(modelOptions, id: \.self) { 216 | Text($0) 217 | } 218 | } 219 | .pickerStyle(.menu) 220 | } 221 | Section(header: Text("Voice")) { 222 | Picker("Voice", selection: $selectedVoice) { 223 | ForEach(voiceOptions, id: \.self) { 224 | Text($0.capitalized) 225 | } 226 | } 227 | .pickerStyle(.menu) 228 | } 229 | } 230 | .navigationTitle("Options") 231 | .toolbar { 232 | ToolbarItem(placement: .cancellationAction) { 233 | Button("Close") { 234 | presentationMode.wrappedValue.dismiss() 235 | } 236 | } 237 | } 238 | } 239 | } 240 | } 241 | 242 | // MARK: - Models and Enums 243 | 244 | struct ConversationItem: Identifiable { 245 | let id: String // item_id from the JSON 246 | let role: String // "user" / "assistant" 247 | var text: String // transcript 248 | 249 | var roleSymbol: String { 250 | role.lowercased() == "user" ? "person.fill" : "sparkles" 251 | } 252 | 253 | var roleColor: Color { 254 | role.lowercased() == "user" ? 
.blue : .purple 255 | } 256 | } 257 | 258 | enum ConnectionStatus: String { 259 | case connected 260 | case connecting 261 | case disconnected 262 | 263 | var color: Color { 264 | switch self { 265 | case .connected: 266 | return .green 267 | case .connecting: 268 | return .yellow 269 | case .disconnected: 270 | return .red 271 | } 272 | } 273 | 274 | var description: String { 275 | switch self { 276 | case .connected: 277 | return "Connected" 278 | case .connecting: 279 | return "Connecting" 280 | case .disconnected: 281 | return "Not Connected" 282 | } 283 | } 284 | } 285 | 286 | // MARK: - Preview 287 | 288 | struct ContentView_Previews: PreviewProvider { 289 | static var previews: some View { 290 | ContentView() 291 | } 292 | } 293 | -------------------------------------------------------------------------------- /VoiceModeWebRTCSwift/Preview Content/Preview Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /VoiceModeWebRTCSwift/VoiceModeWebRTCSwiftApp.swift: -------------------------------------------------------------------------------- 1 | // 2 | // VoiceModeWebRTCSwiftApp.swift 3 | // VoiceModeWebRTCSwift 4 | // 5 | // Created by Pallav Agarwal on 1/3/25. 
6 | // 7 | 8 | import SwiftUI 9 | 10 | @main 11 | struct VoiceModeWebRTCSwiftApp: App { 12 | var body: some Scene { 13 | WindowGroup { 14 | ContentView() 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /VoiceModeWebRTCSwift/WebRTCManager.swift: -------------------------------------------------------------------------------- 1 | import WebRTC 2 | 3 | // MARK: - WebRTCManager 4 | class WebRTCManager: NSObject, ObservableObject { 5 | // UI State 6 | @Published var connectionStatus: ConnectionStatus = .disconnected 7 | @Published var eventTypeStr: String = "" 8 | 9 | // Basic conversation text 10 | @Published var conversation: [ConversationItem] = [] 11 | @Published var outgoingMessage: String = "" 12 | 13 | // We’ll store items by item_id for easy updates 14 | private var conversationMap: [String : ConversationItem] = [:] 15 | 16 | // Model & session config 17 | private var modelName: String = "gpt-4o-mini-realtime-preview-2024-12-17" 18 | private var systemInstructions: String = "" 19 | private var voice: String = "alloy" 20 | 21 | // WebRTC references 22 | private var peerConnection: RTCPeerConnection? 23 | private var dataChannel: RTCDataChannel? 24 | private var audioTrack: RTCAudioTrack? 25 | 26 | // MARK: - Public Methods 27 | 28 | /// Start a WebRTC connection using a standard API key for local testing. 
/// Tears down any previous conversation, builds a fresh peer connection and
/// local audio track, then performs the SDP offer/answer exchange with the
/// OpenAI Realtime API. Progress is published through `connectionStatus`.
/// - Parameters:
///   - apiKey: OpenAI API key, sent as a Bearer token during the SDP exchange.
///   - modelName: Realtime model identifier appended to the API URL.
///   - systemMessage: System instructions later sent via `session.update`.
///   - voice: Voice name later sent via `session.update`.
func startConnection(
    apiKey: String,
    modelName: String,
    systemMessage: String,
    voice: String
) {
    conversation.removeAll()
    conversationMap.removeAll()

    // Store updated config for sendSessionUpdate() / fetchRemoteSDP().
    self.modelName = modelName
    self.systemInstructions = systemMessage
    self.voice = voice

    // Show the intermediate "Connecting" state in the UI (previously the
    // status jumped straight from disconnected to connected).
    connectionStatus = .connecting

    // One factory must produce both the peer connection and the local audio
    // track; objects from different factories must not be mixed.
    let factory = RTCPeerConnectionFactory()
    setupPeerConnection(factory: factory)
    setupLocalAudio(factory: factory)
    configureAudioSession()

    guard let peerConnection = peerConnection else {
        connectionStatus = .disconnected
        return
    }

    // Data channel that carries the Realtime API JSON events.
    let config = RTCDataChannelConfiguration()
    if let channel = peerConnection.dataChannel(forLabel: "oai-events", configuration: config) {
        dataChannel = channel
        dataChannel?.delegate = self
    }

    // Create an SDP offer.
    let constraints = RTCMediaConstraints(
        mandatoryConstraints: ["levelControl": "true"],
        optionalConstraints: nil
    )
    peerConnection.offer(for: constraints) { [weak self] sdp, error in
        guard let self, let sdp, error == nil else {
            print("Failed to create offer: \(String(describing: error))")
            // Don't leave the UI stuck on "Connecting"; @Published mutations
            // belong on the main thread.
            DispatchQueue.main.async { self?.connectionStatus = .disconnected }
            return
        }
        // Set local description
        peerConnection.setLocalDescription(sdp) { [weak self] error in
            guard let self, error == nil else {
                print("Failed to set local description: \(String(describing: error))")
                DispatchQueue.main.async { self?.connectionStatus = .disconnected }
                return
            }

            Task {
                do {
                    guard let localSdp = peerConnection.localDescription?.sdp else {
                        return
                    }
                    // Post SDP offer to Realtime; the response body is the answer SDP.
                    let answerSdp = try await self.fetchRemoteSDP(apiKey: apiKey, localSdp: localSdp)

                    // Set remote description (answer)
                    let answer = RTCSessionDescription(type: .answer, sdp: answerSdp)
                    peerConnection.setRemoteDescription(answer) { error in
                        DispatchQueue.main.async {
                            if let error {
                                print("Failed to set remote description: \(error)")
                                self.connectionStatus = .disconnected
                            } else {
                                self.connectionStatus = .connected
                            }
                        }
                    }
                } catch {
                    print("Error fetching remote SDP: \(error)")
                    DispatchQueue.main.async { self.connectionStatus = .disconnected }
                }
            }
        }
    }
}

/// Closes the peer connection and clears all WebRTC references.
func stopConnection() {
    peerConnection?.close()
    peerConnection = nil
    dataChannel = nil
    audioTrack = nil
    connectionStatus = .disconnected
}

/// Sends the current `outgoingMessage` as a "conversation.item.create"
/// event, clears the draft, and asks the model to respond. No-op when the
/// data channel is missing or the message is only whitespace.
func sendMessage() {
    guard let dc = dataChannel,
          !outgoingMessage.trimmingCharacters(in: .whitespaces).isEmpty else {
        return
    }

    let realtimeEvent: [String: Any] = [
        "type": "conversation.item.create",
        "item": [
            "type": "message",
            "role": "user",
            "content": [
                [
                    "type": "input_text",
                    "text": outgoingMessage
                ]
            ]
        ]
    ]
    if let jsonData = try? JSONSerialization.data(withJSONObject: realtimeEvent) {
        let buffer = RTCDataBuffer(data: jsonData, isBinary: false)
        dc.sendData(buffer)
        self.outgoingMessage = ""
        createResponse()
    }
}

/// Sends a "response.create" event, prompting the model to generate output.
func createResponse() {
    guard let dc = dataChannel else { return }

    let realtimeEvent: [String: Any] = ["type": "response.create"]
    if let jsonData = try? JSONSerialization.data(withJSONObject: realtimeEvent) {
        let buffer = RTCDataBuffer(data: jsonData, isBinary: false)
        dc.sendData(buffer)
    }
}

/// Called automatically when the data channel opens (or manually). Pushes
/// the current instructions, voice, audio formats, transcription model, and
/// server-VAD turn detection to the session via "session.update".
func sendSessionUpdate() {
    guard let dc = dataChannel, dc.readyState == .open else {
        print("Data channel is not open. Cannot send session.update.")
        return
    }

    let sessionUpdate: [String: Any] = [
        "type": "session.update",
        "session": [
            "modalities": ["text", "audio"], // Enable both text and audio
            "instructions": systemInstructions,
            "voice": voice,
            "input_audio_format": "pcm16",
            "output_audio_format": "pcm16",
            "input_audio_transcription": [
                "model": "whisper-1"
            ],
            "turn_detection": [
                "type": "server_vad",
                "threshold": 0.5,
                "prefix_padding_ms": 300,
                "silence_duration_ms": 500,
                "create_response": true
            ],
            "max_response_output_tokens": "inf"
        ]
    ]

    do {
        let jsonData = try JSONSerialization.data(withJSONObject: sessionUpdate)
        let buffer = RTCDataBuffer(data: jsonData, isBinary: false)
        dc.sendData(buffer)
        print("session.update event sent.")
    } catch {
        print("Failed to serialize session.update JSON: \(error)")
    }
}

// MARK: - Private Methods

/// Creates the peer connection from the shared `factory` (the same one that
/// later creates the audio track).
private func setupPeerConnection(factory: RTCPeerConnectionFactory) {
    let config = RTCConfiguration()
    // If needed, configure ICE servers here
    let constraints = RTCMediaConstraints(mandatoryConstraints: nil, optionalConstraints: nil)
    peerConnection = factory.peerConnection(with: config, constraints: constraints, delegate: self)
}

/// Puts the shared AVAudioSession into play-and-record / voice-chat mode so
/// both the mic and speaker are active.
private func configureAudioSession() {
    do {
        let audioSession = AVAudioSession.sharedInstance()
        try audioSession.setCategory(.playAndRecord, options: [.defaultToSpeaker, .allowBluetooth])
        try audioSession.setMode(.videoChat)
        try audioSession.setActive(true, options: .notifyOthersOnDeactivation)
    } catch {
        print("Failed to configure AVAudioSession: \(error)")
    }
}

/// Creates the microphone track (with echo-cancellation / noise-suppression
/// constraints) and attaches it to the peer connection. Reuses the factory
/// that produced the connection — the previous code built a second
/// RTCPeerConnectionFactory here, and mixing objects from different
/// factories is unsupported.
private func setupLocalAudio(factory: RTCPeerConnectionFactory) {
    guard let peerConnection = peerConnection else { return }

    let constraints = RTCMediaConstraints(
        mandatoryConstraints: [
            "googEchoCancellation": "true",
            "googAutoGainControl": "true",
            "googNoiseSuppression": "true",
            "googHighpassFilter": "true"
        ],
        optionalConstraints: nil
    )

    let audioSource = factory.audioSource(with: constraints)
    let localAudioTrack = factory.audioTrack(with: audioSource, trackId: "local_audio")
    peerConnection.add(localAudioTrack, streamIds: ["local_stream"])
    audioTrack = localAudioTrack
}

/// Posts our SDP offer to the Realtime API, returns the answer SDP.
/// - Parameters:
///   - apiKey: Bearer token for the Authorization header.
///   - localSdp: The local SDP offer text.
/// - Returns: The remote answer SDP text.
/// - Throws: `URLError.badURL`, networking errors, or an `NSError` for a
///   non-2xx response / undecodable body.
private func fetchRemoteSDP(apiKey: String, localSdp: String) async throws -> String {
    let baseUrl = "https://api.openai.com/v1/realtime"
    guard let url = URL(string: "\(baseUrl)?model=\(modelName)") else {
        throw URLError(.badURL)
    }

    var request = URLRequest(url: url)
    request.httpMethod = "POST"
    request.setValue("application/sdp", forHTTPHeaderField: "Content-Type")
    request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization")
    request.httpBody = localSdp.data(using: .utf8)

    let (data, response) = try await URLSession.shared.data(for: request)
    guard let httpResponse = response as? HTTPURLResponse,
          (200...299).contains(httpResponse.statusCode) else {
        let code = (response as? HTTPURLResponse)?.statusCode ?? -1
        throw NSError(domain: "WebRTCManager.fetchRemoteSDP",
                      code: code,
                      userInfo: [NSLocalizedDescriptionKey: "Invalid server response"])
    }

    guard let answerSdp = String(data: data, encoding: .utf8) else {
        throw NSError(domain: "WebRTCManager.fetchRemoteSDP",
                      code: -1,
                      userInfo: [NSLocalizedDescriptionKey: "Unable to decode SDP"])
    }

    return answerSdp
}

/// Applies `update` to the stored item with `itemId`, keeping
/// `conversationMap` and the published `conversation` array in sync.
/// No-op when the item is unknown.
private func updateItem(_ itemId: String, _ update: (inout ConversationItem) -> Void) {
    guard var convItem = conversationMap[itemId] else { return }
    update(&convItem)
    conversationMap[itemId] = convItem
    if let idx = conversation.firstIndex(where: { $0.id == itemId }) {
        conversation[idx].text = convItem.text
    }
}

/// Decodes one Realtime event from the data channel and updates the
/// published conversation state. Expected to run on the main thread (the
/// data-channel delegate dispatches here).
private func handleIncomingJSON(_ jsonString: String) {
    print("Received JSON:\n\(jsonString)\n")

    guard let data = jsonString.data(using: .utf8),
          let rawEvent = try? JSONSerialization.jsonObject(with: data),
          let eventDict = rawEvent as? [String: Any],
          let eventType = eventDict["type"] as? String else {
        return
    }

    eventTypeStr = eventType

    switch eventType {
    case "conversation.item.created":
        if let item = eventDict["item"] as? [String: Any],
           let itemId = item["id"] as? String,
           let role = item["role"] as? String
        {
            // If item contains "content", extract the text
            let text = (item["content"] as? [[String: Any]])?.first?["text"] as? String ?? ""

            let newItem = ConversationItem(id: itemId, role: role, text: text)
            conversationMap[itemId] = newItem
            // Only user/assistant turns are shown in the transcript list.
            if role == "assistant" || role == "user" {
                conversation.append(newItem)
            }
        }

    case "response.audio_transcript.delta":
        // Partial transcript for the assistant's message: append the delta.
        if let itemId = eventDict["item_id"] as? String,
           let delta = eventDict["delta"] as? String
        {
            updateItem(itemId) { $0.text += delta }
        }

    case "response.audio_transcript.done":
        // Final transcript for the assistant's message: replace the text.
        if let itemId = eventDict["item_id"] as? String,
           let transcript = eventDict["transcript"] as? String
        {
            updateItem(itemId) { $0.text = transcript }
        }

    case "conversation.item.input_audio_transcription.completed":
        // Final transcript for the user's audio input: replace the text.
        if let itemId = eventDict["item_id"] as? String,
           let transcript = eventDict["transcript"] as? String
        {
            updateItem(itemId) { $0.text = transcript }
        }

    default:
        break
    }
}
}

// MARK: - RTCPeerConnectionDelegate
extension WebRTCManager: RTCPeerConnectionDelegate {
    func peerConnection(_ peerConnection: RTCPeerConnection, didChange stateChanged: RTCSignalingState) {}
    func peerConnection(_ peerConnection: RTCPeerConnection, didAdd stream: RTCMediaStream) {}
    func peerConnection(_ peerConnection: RTCPeerConnection, didRemove stream: RTCMediaStream) {}
    func peerConnectionShouldNegotiate(_ peerConnection: RTCPeerConnection) {}

    func peerConnection(_ peerConnection: RTCPeerConnection, didChange newState: RTCIceConnectionState) {
        print("ICE Connection State changed to: \(newState)")
    }

    func peerConnection(_ peerConnection: RTCPeerConnection, didChange newState: RTCIceGatheringState) {}
    func peerConnection(_ peerConnection: RTCPeerConnection, didGenerate candidate: RTCIceCandidate) {}
    func peerConnection(_ peerConnection: RTCPeerConnection, didRemove candidates: [RTCIceCandidate]) {}

    func peerConnection(_ peerConnection: RTCPeerConnection, didOpen dataChannel: RTCDataChannel) {
        // If the server creates the data channel on its side, handle it here.
        // NOTE(review): the channel is not stored in `self.dataChannel`; if
        // the remote side ever originates the channel, sends would still use
        // the locally created one — confirm against the Realtime API flow.
        dataChannel.delegate = self
    }
}

// MARK: - RTCDataChannelDelegate
extension WebRTCManager: RTCDataChannelDelegate {
    func dataChannelDidChangeState(_ dataChannel: RTCDataChannel) {
        print("Data channel state changed: \(dataChannel.readyState)")
        // Auto-send session.update after channel is open
        if dataChannel.readyState == .open {
            sendSessionUpdate()
        }
    }

    func dataChannel(_ dataChannel: RTCDataChannel,
                     didReceiveMessageWith buffer: RTCDataBuffer) {
        guard let message = String(data: buffer.data, encoding: .utf8) else {
            return
        }
        // Hop to the main thread before touching @Published state.
        DispatchQueue.main.async {
            self.handleIncomingJSON(message)
        }
    }
}
--------------------------------------------------------------------------------