├── Examples ├── WhisperAX │ ├── WhisperAXTests │ │ ├── WhisperKitTests │ │ └── WhisperAXTests.swift │ ├── Debug.xcconfig │ ├── WhisperAXWatchApp │ │ ├── Assets.xcassets │ │ │ ├── Contents.json │ │ │ ├── AppIcon.appiconset │ │ │ │ ├── appstore.png │ │ │ │ └── Contents.json │ │ │ └── AccentColor.colorset │ │ │ │ └── Contents.json │ │ ├── Preview Content │ │ │ └── Preview Assets.xcassets │ │ │ │ └── Contents.json │ │ └── WhisperAXWatchApp.swift │ ├── WhisperAX │ │ ├── Resources │ │ │ ├── Assets.xcassets │ │ │ │ ├── Contents.json │ │ │ │ └── AppIcon.appiconset │ │ │ │ │ ├── 100.png │ │ │ │ │ ├── 102.png │ │ │ │ │ ├── 108.png │ │ │ │ │ ├── 114.png │ │ │ │ │ ├── 120.png │ │ │ │ │ ├── 128.png │ │ │ │ │ ├── 136.png │ │ │ │ │ ├── 152.png │ │ │ │ │ ├── 16.png │ │ │ │ │ ├── 167.png │ │ │ │ │ ├── 172.png │ │ │ │ │ ├── 180.png │ │ │ │ │ ├── 192.png │ │ │ │ │ ├── 196.png │ │ │ │ │ ├── 216.png │ │ │ │ │ ├── 234.png │ │ │ │ │ ├── 256.png │ │ │ │ │ ├── 258.png │ │ │ │ │ ├── 32.png │ │ │ │ │ ├── 40.png │ │ │ │ │ ├── 44.png │ │ │ │ │ ├── 48.png │ │ │ │ │ ├── 512.png │ │ │ │ │ ├── 55.png │ │ │ │ │ ├── 58.png │ │ │ │ │ ├── 60.png │ │ │ │ │ ├── 64.png │ │ │ │ │ ├── 66.png │ │ │ │ │ ├── 76.png │ │ │ │ │ ├── 80.png │ │ │ │ │ ├── 87.png │ │ │ │ │ ├── 88.png │ │ │ │ │ ├── 92.png │ │ │ │ │ ├── 1024.png │ │ │ │ │ ├── 120 1.png │ │ │ │ │ ├── 128 1.png │ │ │ │ │ ├── 58 1.png │ │ │ │ │ ├── 60 1.png │ │ │ │ │ ├── 64 1.png │ │ │ │ │ ├── 80 1.png │ │ │ │ │ ├── 87 1.png │ │ │ │ │ ├── 1024 1.png │ │ │ │ │ ├── 1024 2.png │ │ │ │ │ └── Contents.json │ │ │ ├── Info.plist │ │ │ └── WhisperAX.entitlements │ │ ├── Preview Content │ │ │ └── Preview Assets.xcassets │ │ │ │ └── Contents.json │ │ ├── Info.plist │ │ └── WhisperAXApp.swift │ ├── WhisperAX.xcodeproj │ │ ├── project.xcworkspace │ │ │ ├── contents.xcworkspacedata │ │ │ └── xcshareddata │ │ │ │ ├── IDEWorkspaceChecks.plist │ │ │ │ └── swiftpm │ │ │ │ └── Package.resolved │ │ └── xcshareddata │ │ │ └── xcschemes │ │ │ └── WhisperAX.xcscheme │ ├── WhisperAXUITests │ │ ├── WhisperAXUITestsLaunchTests.swift │ │ └── WhisperAXUITests.swift │ ├── WhisperAXWatchAppUITests │ │ ├── WhisperAX_Watch_AppUITestsLaunchTests.swift │ │ └── WhisperAX_Watch_AppUITests.swift │ └── WhisperAXWatchAppTests │ │ └── WhisperAX_Watch_AppTests.swift └── ServeCLIClient │ ├── Python │ ├── pyproject.toml │ ├── README.md │ ├── test_translate.py │ └── test_transcribe.py │ ├── Swift │ ├── updateClient.sh │ ├── Package.swift │ ├── README.md │ └── Package.resolved │ └── Curl │ ├── README.md │ ├── translate.sh │ ├── transcribe.sh │ └── test.sh ├── .spi.yml ├── Tests └── WhisperKitTests │ ├── Resources │ ├── jfk.wav │ ├── ted_60.m4a │ ├── jfk_441khz.m4a │ ├── 8_Channel_ID.m4a │ ├── es_test_clip.wav │ ├── ja_test_clip.wav │ ├── config-v02.json │ └── config-v03.json │ ├── UnitTestsPlan.xctestplan │ └── Evaluate │ ├── WERUtils.swift │ └── DistanceCalculation.swift ├── scripts ├── specs │ └── openapi-generator-config.yaml └── pyproject.toml ├── .github └── workflows │ ├── homebrew-update.yml │ ├── release-tests.yml │ ├── expo-update.yml │ ├── development-tests.yml │ └── unit-tests.yml ├── Sources ├── WhisperKitCLI │ ├── Server │ │ ├── ServeCLIArguments.swift │ │ └── ServeCLI.swift │ ├── CLIUtils.swift │ ├── WhisperKitCLI.swift │ ├── TranscribeCLIUtils.swift │ └── TranscribeCLIArguments.swift └── WhisperKit │ ├── Utilities │ ├── WhisperError.swift │ ├── TextUtilities.swift │ ├── Concurrency.swift │ ├── Logging.swift │ ├── ResultWriter.swift │ └── Extensions+Internal.swift │ └── Core │ ├── Audio │ ├── 
EnergyVAD.swift │ ├── AudioChunker.swift │ └── VoiceActivityDetector.swift │ ├── FeatureExtractor.swift │ └── AudioEncoder.swift ├── .swiftpm ├── configuration │ └── Package.resolved └── xcode │ └── xcshareddata │ └── xcschemes │ └── whisperkit-Package.xcscheme ├── LICENSE ├── fastlane └── README.md ├── Package.resolved ├── .gitignore ├── Package.swift ├── CONTRIBUTING.md ├── BENCHMARKS.md └── Makefile /Examples/WhisperAX/WhisperAXTests/WhisperKitTests: -------------------------------------------------------------------------------- 1 | ../../../Tests/WhisperKitTests -------------------------------------------------------------------------------- /.spi.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | builder: 3 | configs: 4 | - documentation_targets: [WhisperKit] -------------------------------------------------------------------------------- /Examples/WhisperAX/Debug.xcconfig: -------------------------------------------------------------------------------- 1 | // Run `make setup` to add your team here 2 | DEVELOPMENT_TEAM= 3 | -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/jfk.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Tests/WhisperKitTests/Resources/jfk.wav -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/ted_60.m4a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Tests/WhisperKitTests/Resources/ted_60.m4a -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/jfk_441khz.m4a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Tests/WhisperKitTests/Resources/jfk_441khz.m4a -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/8_Channel_ID.m4a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Tests/WhisperKitTests/Resources/8_Channel_ID.m4a -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/es_test_clip.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Tests/WhisperKitTests/Resources/es_test_clip.wav -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/ja_test_clip.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Tests/WhisperKitTests/Resources/ja_test_clip.wav -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Preview Content/Preview Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXWatchApp/Preview Content/Preview Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/100.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/102.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/102.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/108.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/108.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/114.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/114.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/136.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/136.png -------------------------------------------------------------------------------- 
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/152.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/16.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/167.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/167.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/172.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/172.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/180.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/180.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/192.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/196.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/196.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/216.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/216.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/234.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/234.png -------------------------------------------------------------------------------- 
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/256.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/258.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/258.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/32.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/40.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/44.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/48.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/512.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/55.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58.png -------------------------------------------------------------------------------- 
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/66.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/66.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/76.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/88.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/88.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/92.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/92.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024.png -------------------------------------------------------------------------------- 
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 2.png -------------------------------------------------------------------------------- 
/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AppIcon.appiconset/appstore.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AppIcon.appiconset/appstore.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/contents.xcworkspacedata:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <Workspace
3 |    version = "1.0">
4 |    <FileRef
5 |       location = "self:">
6 |    </FileRef>
7 | </Workspace>
8 | 
--------------------------------------------------------------------------------
/scripts/specs/openapi-generator-config.yaml:
--------------------------------------------------------------------------------
1 | generate:
2 |   - types
3 |   - server
4 | 
5 | accessModifier: internal
6 | namingStrategy: idiomatic
7 | 
8 | filter:
9 |   paths:
10 |     - /audio/transcriptions
11 |     - /audio/translations
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Info.plist:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict/>
5 | </plist>
6 | 
--------------------------------------------------------------------------------
/scripts/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "whisperkit-scripts"
3 | version = "0.1.0"
4 | description = "Scripts for WhisperKit development"
5 | requires-python = ">=3.8"
6 | dependencies = [
7 |     "requests",
8 |     "ruamel.yaml",
9 | ]
10 | 
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AccentColor.colorset/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 |   "colors" : [
3 |     {
4 |       "idiom" : "universal"
5 |     }
6 |   ],
7 |   "info" : {
8 |     "author" : "xcode",
9 |     "version" : 1
10 |   }
11 | }
12 | 
--------------------------------------------------------------------------------
/Examples/ServeCLIClient/Python/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "whisperkit-python-client"
3 | version = "0.1.0"
4 | description = "Python client for WhisperKit local server"
5 | requires-python = ">=3.8"
6 | dependencies = [
7 |     "openai>=1.0.0",
8 |     "requests>=2.25.0",
9 |     "python-dotenv>=0.19.0",
10 | ]
11 | 
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 | 	<key>IDEDidComputeMac32BitWarning</key>
6 | 	<true/>
7 | </dict>
8 | </plist>
9 | 
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AppIcon.appiconset/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 |   "images" : [
3 |     {
4 |       "filename" : "appstore.png",
5 |       "idiom" : "universal",
6 |       "platform" : "watchos",
7 |       "size" : "1024x1024"
8 |     }
9 |   ],
10 |   "info" : {
11 |     "author" : "xcode",
12 |     "version" : 1
13 |   }
14 | }
15 | 
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/WhisperAXWatchApp.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 | 
4 | import SwiftUI
5 | 
6 | @main
7 | struct WhisperAXWatchApp: App {
8 |     var body: some Scene {
9 |         WindowGroup {
10 |             WhisperAXWatchView()
11 |         }
12 |     }
13 | }
14 | 
--------------------------------------------------------------------------------
/.github/workflows/homebrew-update.yml:
--------------------------------------------------------------------------------
1 | name: Bump Homebrew Formula
2 | 
3 | on:
4 |   push:
5 |     tags: 'v*'
6 | 
7 | jobs:
8 |   homebrew:
9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - uses: mislav/bump-homebrew-formula-action@v3
12 |         with:
13 |           formula-name: whisperkit-cli
14 |         env:
15 |           COMMITTER_TOKEN: ${{ secrets.COMMITTER_TOKEN }}
16 | 
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Info.plist:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 | 	<key>NSPrivacyAccessedAPITypes</key>
6 | 	<array>
7 | 		<dict><key>NSPrivacyAccessedAPIType</key>
8 | 			<string>NSPrivacyAccessedAPICategoryUserDefaults</string>
9 | 		</dict>
10 | 	</array>
11 | </dict>
12 | </plist>
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/WhisperAXApp.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 | 
4 | import SwiftUI
5 | 
6 | @main
7 | struct WhisperAXApp: App {
8 |     var body: some Scene {
9 |         WindowGroup {
10 |             ContentView()
11 |                 #if os(macOS)
12 |                 .frame(minWidth: 1000, minHeight: 700)
13 |                 #endif
14 |         }
15 |     }
16 | }
17 | 
--------------------------------------------------------------------------------
/Sources/WhisperKitCLI/Server/ServeCLIArguments.swift:
--------------------------------------------------------------------------------
1 | // Copyright © 2025 Argmax, Inc. All rights reserved.
2 | // For licensing see accompanying LICENSE.md file.
3 | 
4 | import ArgumentParser
5 | 
6 | struct ServeCLIArguments: ParsableArguments {
7 |     @OptionGroup
8 |     var transcribe: TranscribeCLIArguments
9 | 
10 |     @Option(name: .long, help: "Port to run the server on")
11 |     var port: Int = 50060
12 | 
13 |     @Option(name: .long, help: "Host to bind the server to")
14 |     var host: String = "localhost"
15 | }
16 | 
--------------------------------------------------------------------------------
/Examples/ServeCLIClient/Swift/updateClient.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Update WhisperKit Swift Client from OpenAPI spec
4 | # This script regenerates the client code when the server spec changes
5 | 
6 | set -e
7 | 
8 | echo "Updating WhisperKit Swift Client..."
9 | 
10 | # Generate client code
11 | echo "Generating client code..."
12 | swift run swift-openapi-generator generate \
13 |     ../../../scripts/specs/localserver_openapi.yaml \
14 |     --output-directory Sources/WhisperKitSwiftClient/Generated \
15 |     --access-modifier public \
16 |     --mode client \
17 |     --mode types
18 | 
19 | echo "Client code updated successfully!"
20 | echo "Files generated in Sources/Generated/" 21 | -------------------------------------------------------------------------------- /.swiftpm/configuration/Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "pins" : [ 3 | { 4 | "identity" : "swift-argument-parser", 5 | "kind" : "remoteSourceControl", 6 | "location" : "https://github.com/apple/swift-argument-parser.git", 7 | "state" : { 8 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41", 9 | "version" : "1.3.0" 10 | } 11 | }, 12 | { 13 | "identity" : "swift-transformers", 14 | "kind" : "remoteSourceControl", 15 | "location" : "https://github.com/huggingface/swift-transformers.git", 16 | "state" : { 17 | "revision" : "74b94211bdc741694ed7e700a1104c72e5ba68fe", 18 | "version" : "0.1.7" 19 | } 20 | } 21 | ], 22 | "version" : 2 23 | } 24 | -------------------------------------------------------------------------------- /Sources/WhisperKitCLI/CLIUtils.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import ArgumentParser 5 | import CoreML 6 | import Foundation 7 | 8 | enum ComputeUnits: String, ExpressibleByArgument, CaseIterable { 9 | case all, cpuAndGPU, cpuOnly, cpuAndNeuralEngine, random 10 | var asMLComputeUnits: MLComputeUnits { 11 | switch self { 12 | case .all: return .all 13 | case .cpuAndGPU: return .cpuAndGPU 14 | case .cpuOnly: return .cpuOnly 15 | case .cpuAndNeuralEngine: return .cpuAndNeuralEngine 16 | case .random: return Bool.random() ? .cpuAndGPU : .cpuAndNeuralEngine 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/WhisperAX.entitlements: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | com.apple.developer.kernel.increased-memory-limit 6 | 7 | com.apple.security.app-sandbox 8 | 9 | com.apple.security.device.audio-input 10 | 11 | com.apple.security.files.downloads.read-only 12 | 13 | com.apple.security.files.user-selected.read-write 14 | 15 | com.apple.security.network.client 16 | 17 | com.apple.security.network.server 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /Sources/WhisperKitCLI/WhisperKitCLI.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 
3 | 4 | import ArgumentParser 5 | import Foundation 6 | 7 | let VERSION: String = "development" 8 | 9 | var subcommands: [ParsableCommand.Type] { 10 | #if BUILD_SERVER_CLI 11 | [TranscribeCLI.self, ServeCLI.self] 12 | #else 13 | [TranscribeCLI.self] 14 | #endif 15 | } 16 | 17 | @main 18 | struct WhisperKitCLI: AsyncParsableCommand { 19 | static let configuration = CommandConfiguration( 20 | commandName: "whisperkit-cli", 21 | abstract: "WhisperKit CLI", 22 | discussion: "Swift native speech recognition with Whisper for Apple Silicon", 23 | version: VERSION, 24 | subcommands: subcommands 25 | ) 26 | } 27 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "originHash" : "831ad63194a5262b2549d58e383a520f9cbbc80b4a75660fbbcc56d65edfdab4", 3 | "pins" : [ 4 | { 5 | "identity" : "swift-argument-parser", 6 | "kind" : "remoteSourceControl", 7 | "location" : "https://github.com/apple/swift-argument-parser.git", 8 | "state" : { 9 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41", 10 | "version" : "1.3.0" 11 | } 12 | }, 13 | { 14 | "identity" : "swift-transformers", 15 | "kind" : "remoteSourceControl", 16 | "location" : "https://github.com/huggingface/swift-transformers.git", 17 | "state" : { 18 | "revision" : "fc6543263e4caed9bf6107466d625cfae9357f08", 19 | "version" : "0.1.8" 20 | } 21 | } 22 | ], 23 | "version" : 3 24 | } 25 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXUITests/WhisperAXUITestsLaunchTests.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import XCTest 5 | 6 | final class WhisperAXUITestsLaunchTests: XCTestCase { 7 | override class var runsForEachTargetApplicationUIConfiguration: Bool { 8 | true 9 | } 10 | 11 | override func setUpWithError() throws { 12 | continueAfterFailure = false 13 | } 14 | 15 | func testLaunch() throws { 16 | let app = XCUIApplication() 17 | app.launch() 18 | 19 | // Insert steps here to perform after app launch but before taking a screenshot, 20 | // such as logging into a test account or navigating somewhere in the app 21 | 22 | let attachment = XCTAttachment(screenshot: app.screenshot()) 23 | attachment.name = "Launch Screen" 24 | attachment.lifetime = .keepAlways 25 | add(attachment) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXWatchAppUITests/WhisperAX_Watch_AppUITestsLaunchTests.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 
3 | 4 | import XCTest 5 | 6 | final class WhisperAX_Watch_AppUITestsLaunchTests: XCTestCase { 7 | override class var runsForEachTargetApplicationUIConfiguration: Bool { 8 | true 9 | } 10 | 11 | override func setUpWithError() throws { 12 | continueAfterFailure = false 13 | } 14 | 15 | func testLaunch() throws { 16 | let app = XCUIApplication() 17 | app.launch() 18 | 19 | // Insert steps here to perform after app launch but before taking a screenshot, 20 | // such as logging into a test account or navigating somewhere in the app 21 | 22 | let attachment = XCTAttachment(screenshot: app.screenshot()) 23 | attachment.name = "Launch Screen" 24 | attachment.lifetime = .keepAlways 25 | add(attachment) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 argmax, inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Examples/ServeCLIClient/Swift/Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 5.9 2 | import PackageDescription 3 | 4 | let package = Package( 5 | name: "WhisperKitSwiftClient", 6 | platforms: [ 7 | .macOS(.v13) 8 | ], 9 | products: [ 10 | .executable(name: "whisperkit-client", targets: ["WhisperKitSwiftClient"]), 11 | ], 12 | dependencies: [ 13 | .package(url: "https://github.com/apple/swift-argument-parser", from: "1.2.0"), 14 | .package(url: "https://github.com/apple/swift-openapi-runtime", from: "1.0.0"), 15 | .package(url: "https://github.com/apple/swift-openapi-urlsession", from: "1.0.0"), 16 | .package(url: "https://github.com/apple/swift-http-types", from: "1.0.0"), 17 | .package(url: "https://github.com/apple/swift-openapi-generator", from: "1.0.0"), 18 | ], 19 | targets: [ 20 | .executableTarget( 21 | name: "WhisperKitSwiftClient", 22 | dependencies: [ 23 | .product(name: "ArgumentParser", package: "swift-argument-parser"), 24 | .product(name: "OpenAPIRuntime", package: "swift-openapi-runtime"), 25 | .product(name: "OpenAPIURLSession", package: "swift-openapi-urlsession"), 26 | .product(name: "HTTPTypes", package: "swift-http-types"), 27 | ], 28 | path: "Sources/WhisperKitSwiftClient" 29 | ) 30 | ] 31 | ) 32 | -------------------------------------------------------------------------------- /fastlane/README.md: -------------------------------------------------------------------------------- 1 | fastlane documentation 2 | ---- 3 | 4 | # Installation 5 | 6 | Make sure you have the latest version of the Xcode command line tools installed: 7 | 8 | ```sh 9 | xcode-select --install 10 | ``` 11 | 12 | For _fastlane_ installation instructions, see [Installing _fastlane_](https://docs.fastlane.tools/#installing-fastlane) 13 | 14 | # Available Actions 15 | 16 | ## iOS 17 | 18 | ### ios list_devices 19 | 20 | ```sh 21 | [bundle exec] fastlane ios list_devices 22 | ``` 23 | 24 | List all connected devices 25 | 26 | ### ios benchmark 27 | 28 | ```sh 29 | [bundle exec] fastlane ios benchmark 30 | ``` 31 | 32 | Benchmark devices with options 33 | 34 | ### ios extract_results 35 | 36 | ```sh 37 | [bundle exec] fastlane ios extract_results 38 | ``` 39 | 40 | Extract benchmark results 41 | 42 | ### ios upload_results 43 | 44 | ```sh 45 | [bundle exec] fastlane ios upload_results 46 | ``` 47 | 48 | Upload benchmark results 49 | 50 | ---- 51 | 52 | This README.md is auto-generated and will be re-generated every time [_fastlane_](https://fastlane.tools) is run. 53 | 54 | More information about _fastlane_ can be found on [fastlane.tools](https://fastlane.tools). 55 | 56 | The documentation of _fastlane_ can be found on [docs.fastlane.tools](https://docs.fastlane.tools). 
57 | -------------------------------------------------------------------------------- /Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "pins" : [ 3 | { 4 | "identity" : "swift-argument-parser", 5 | "kind" : "remoteSourceControl", 6 | "location" : "https://github.com/apple/swift-argument-parser.git", 7 | "state" : { 8 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41", 9 | "version" : "1.3.0" 10 | } 11 | }, 12 | { 13 | "identity" : "swift-collections", 14 | "kind" : "remoteSourceControl", 15 | "location" : "https://github.com/apple/swift-collections.git", 16 | "state" : { 17 | "revision" : "7b847a3b7008b2dc2f47ca3110d8c782fb2e5c7e", 18 | "version" : "1.3.0" 19 | } 20 | }, 21 | { 22 | "identity" : "swift-jinja", 23 | "kind" : "remoteSourceControl", 24 | "location" : "https://github.com/huggingface/swift-jinja.git", 25 | "state" : { 26 | "revision" : "38b7beeec5d968accd19a8a70c1882cc89979d1c", 27 | "version" : "2.1.1" 28 | } 29 | }, 30 | { 31 | "identity" : "swift-transformers", 32 | "kind" : "remoteSourceControl", 33 | "location" : "https://github.com/huggingface/swift-transformers.git", 34 | "state" : { 35 | "revision" : "d363e83a77bafe144808a3d01556139fe67cd8bc", 36 | "version" : "1.1.2" 37 | } 38 | } 39 | ], 40 | "version" : 2 41 | } 42 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXTests/WhisperAXTests.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import XCTest 5 | 6 | final class WhisperAXTests: XCTestCase { 7 | override func setUpWithError() throws { 8 | // Put setup code here. This method is called before the invocation of each test method in the class. 9 | } 10 | 11 | override func tearDownWithError() throws { 12 | // Put teardown code here. This method is called after the invocation of each test method in the class. 13 | } 14 | 15 | func testExample() throws { 16 | // This is an example of a functional test case. 17 | // Use XCTAssert and related functions to verify your tests produce the correct results. 18 | // Any test you write for XCTest can be annotated as throws and async. 19 | // Mark your test throws to produce an unexpected failure when your test encounters an uncaught error. 20 | // Mark your test async to allow awaiting for asynchronous code to complete. Check the results with assertions afterwards. 21 | } 22 | 23 | func testPerformanceExample() throws { 24 | // This is an example of a performance test case. 25 | measure { 26 | // Put the code you want to measure the time of here. 
27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /.github/workflows/release-tests.yml: -------------------------------------------------------------------------------- 1 | name: Release Tests 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | workflow_dispatch: 7 | 8 | jobs: 9 | build-and-test-all-platforms: 10 | name: "Build and Test All Platforms" 11 | strategy: 12 | matrix: 13 | include: 14 | - os: macos-14 15 | ios-version: "17.5" 16 | ios-device: "iPhone 15" 17 | watchos-version: "10.2" 18 | visionos-version: "1.0" 19 | xcode-version: "16.1" 20 | - os: macos-15 21 | ios-version: "18.5" 22 | ios-device: "iPhone 16" 23 | watchos-version: "11.5" 24 | visionos-version: "2.5" 25 | xcode-version: "16.4" 26 | - os: macos-26 27 | ios-version: "26.0.1" 28 | ios-device: "iPhone 17" 29 | watchos-version: "26.0" 30 | visionos-version: "26.0" 31 | macos-runner: "macos-26" 32 | xcode-version: "26.0" 33 | uses: ./.github/workflows/unit-tests.yml 34 | with: 35 | macos-runner: ${{ matrix.os }} 36 | ios-version: ${{ matrix.ios-version }} 37 | ios-device: ${{ matrix.ios-device }} 38 | watchos-version: ${{ matrix.watchos-version }} 39 | visionos-version: ${{ matrix.visionos-version }} 40 | xcode-version: ${{ matrix.xcode-version }} 41 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXWatchAppTests/WhisperAX_Watch_AppTests.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | @testable import WhisperAX_Watch_App 5 | import XCTest 6 | 7 | final class WhisperAX_Watch_AppTests: XCTestCase { 8 | override func setUpWithError() throws { 9 | // Put setup code here. This method is called before the invocation of each test method in the class. 10 | } 11 | 12 | override func tearDownWithError() throws { 13 | // Put teardown code here. This method is called after the invocation of each test method in the class. 14 | } 15 | 16 | func testExample() throws { 17 | // This is an example of a functional test case. 18 | // Use XCTAssert and related functions to verify your tests produce the correct results. 19 | // Any test you write for XCTest can be annotated as throws and async. 20 | // Mark your test throws to produce an unexpected failure when your test encounters an uncaught error. 21 | // Tests marked async will run the test method on an arbitrary thread managed by the Swift runtime. 22 | } 23 | 24 | func testPerformanceExample() throws { 25 | // This is an example of a performance test case. 26 | self.measure { 27 | // Put the code you want to measure the time of here. 
28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | /.build 3 | /Packages 4 | .vscode/ 5 | xcuserdata/ 6 | DerivedData/ 7 | .swiftpm/configuration/registries.json 8 | .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata 9 | .swiftpm/xcode/xcshareddata/ 10 | **/*.xcscheme 11 | .netrc 12 | .env 13 | 14 | # Core ML Model Files 15 | Models 16 | **/*.mlpackage 17 | **/*.mlmodel 18 | **/*.mlmodelc 19 | **/*.zip 20 | **/*.tar.gz 21 | 22 | # Audio files (add manually if needed) 23 | **/*.wav 24 | **/*.mp3 25 | **/*.m4a 26 | **/*.flac 27 | 28 | # Swift Client build artifacts 29 | Examples/ServeCLIClient/Swift/.build 30 | Examples/ServeCLIClient/Swift/.swiftpm 31 | 32 | ## Xcode 33 | # Build generated 34 | build/ 35 | DerivedData/ 36 | 37 | # Various settings 38 | *.pbxuser 39 | !default.pbxuser 40 | *.mode1v3 41 | !default.mode1v3 42 | *.mode2v3 43 | !default.mode2v3 44 | *.perspectivev3 45 | !default.perspectivev3 46 | xcuserdata/ 47 | 48 | # Other 49 | *.moved-aside 50 | *.xccheckout 51 | *.xcscmblueprint 52 | 53 | # Obj-C/Swift specific 54 | *.hmap 55 | *.ipa 56 | *.dSYM.zip 57 | *.dSYM 58 | 59 | # fastlane 60 | fastlane/report.xml 61 | fastlane/Preview.html 62 | fastlane/screenshots 63 | fastlane/test_output 64 | fastlane/benchmark_data 65 | fastlane/upload_folder 66 | 67 | ### Xcode Patch ### 68 | **/*.xcconfig 69 | *.xcodeproj/* 70 | !*.xcodeproj/project.pbxproj 71 | !*.xcodeproj/xcshareddata/ 72 | !*.xcworkspace/contents.xcworkspacedata 73 | /*.gcno 74 | 75 | # Swift build artifacts 76 | *.d 77 | *.o 78 | *.swiftdeps 79 | *.swiftmodule 80 | *.swiftdoc 81 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXUITests/WhisperAXUITests.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import XCTest 5 | 6 | final class WhisperAXUITests: XCTestCase { 7 | override func setUpWithError() throws { 8 | // Put setup code here. This method is called before the invocation of each test method in the class. 9 | 10 | // In UI tests it is usually best to stop immediately when a failure occurs. 11 | continueAfterFailure = false 12 | 13 | // In UI tests it’s important to set the initial state - such as interface orientation - required for your tests before they run. The setUp method is a good place to do this. 14 | } 15 | 16 | override func tearDownWithError() throws { 17 | // Put teardown code here. This method is called after the invocation of each test method in the class. 18 | } 19 | 20 | func testExample() throws { 21 | // UI tests must launch the application that they test. 22 | let app = XCUIApplication() 23 | app.launch() 24 | 25 | // Use XCTAssert and related functions to verify your tests produce the correct results. 26 | } 27 | 28 | func testLaunchPerformance() throws { 29 | if #available(macOS 10.15, iOS 13.0, tvOS 13.0, watchOS 7.0, *) { 30 | // This measures how long it takes to launch your application. 
31 | measure(metrics: [XCTApplicationLaunchMetric()]) { 32 | XCUIApplication().launch() 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXWatchAppUITests/WhisperAX_Watch_AppUITests.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import XCTest 5 | 6 | final class WhisperAX_Watch_AppUITests: XCTestCase { 7 | override func setUpWithError() throws { 8 | // Put setup code here. This method is called before the invocation of each test method in the class. 9 | 10 | // In UI tests it is usually best to stop immediately when a failure occurs. 11 | continueAfterFailure = false 12 | 13 | // In UI tests it’s important to set the initial state - such as interface orientation - required for your tests before they run. The setUp method is a good place to do this. 14 | } 15 | 16 | override func tearDownWithError() throws { 17 | // Put teardown code here. This method is called after the invocation of each test method in the class. 18 | } 19 | 20 | func testExample() throws { 21 | // UI tests must launch the application that they test. 22 | let app = XCUIApplication() 23 | app.launch() 24 | 25 | // Use XCTAssert and related functions to verify your tests produce the correct results. 26 | } 27 | 28 | func testLaunchPerformance() throws { 29 | if #available(macOS 10.15, iOS 13.0, tvOS 13.0, watchOS 7.0, *) { 30 | // This measures how long it takes to launch your application. 31 | measure(metrics: [XCTApplicationLaunchMetric()]) { 32 | XCUIApplication().launch() 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /Tests/WhisperKitTests/UnitTestsPlan.xctestplan: -------------------------------------------------------------------------------- 1 | { 2 | "configurations" : [ 3 | { 4 | "id" : "427C492D-C19B-4C99-A90A-F4AEF0EC4B54", 5 | "name" : "Configuration 1", 6 | "options" : { 7 | 8 | } 9 | } 10 | ], 11 | "defaultOptions" : { 12 | "maximumTestRepetitions" : 3, 13 | "testRepetitionMode" : "retryOnFailure", 14 | "testTimeoutsEnabled" : true 15 | }, 16 | "testTargets" : [ 17 | { 18 | "skippedTests" : [ 19 | "FunctionalTests", 20 | "FunctionalTests\/testAsyncImplementation()", 21 | "FunctionalTests\/testBaseImplementation()", 22 | "FunctionalTests\/testBatchTranscribeAudioArrays()", 23 | "FunctionalTests\/testBatchTranscribeAudioPaths()", 24 | "FunctionalTests\/testBatchTranscribeAudioPathsWithErrors()", 25 | "FunctionalTests\/testInitLarge()", 26 | "FunctionalTests\/testModelSearchPathLarge()", 27 | "FunctionalTests\/testRealTimeFactorLarge()", 28 | "FunctionalTests\/testRealTimeFactorTiny()", 29 | "RegressionTests", 30 | "RegressionTests\/testHirschberg()", 31 | "RegressionTests\/testInMemoryAndDiskUsage()", 32 | "RegressionTests\/testLargeWER()", 33 | "RegressionTests\/testLevenshtein()", 34 | "RegressionTests\/testModelPerformance()", 35 | "RegressionTests\/testModelPerformanceWithDebugConfig()", 36 | "RegressionTests\/testNormalizer()" 37 | ], 38 | "target" : { 39 | "containerPath" : "container:", 40 | "identifier" : "WhisperKitTests", 41 | "name" : "WhisperKitTests" 42 | } 43 | } 44 | ], 45 | "version" : 1 46 | } 47 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Utilities/WhisperError.swift: 
-------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Foundation 5 | 6 | @frozen 7 | public enum WhisperError: Error, LocalizedError, Equatable { 8 | case tokenizerUnavailable(String = "Tokenizer is unavailable") 9 | case modelsUnavailable(String = "Models are unavailable") 10 | case prefillFailed(String = "Prefill failed") 11 | case audioProcessingFailed(String = "Audio processing failed") 12 | case decodingLogitsFailed(String = "Unable to decode logits from the model output") 13 | case segmentingFailed(String = "Creating segments failed") 14 | case loadAudioFailed(String = "Load audio failed") 15 | case prepareDecoderInputsFailed(String = "Prepare decoder inputs failed") 16 | case transcriptionFailed(String = "Transcription failed") 17 | case decodingFailed(String = "Decoding failed") 18 | case microphoneUnavailable(String = "No available microphone to record or stream") 19 | case initializationError(String = "Error initializing WhisperKit") 20 | 21 | public var errorDescription: String? { 22 | switch self { 23 | case let .tokenizerUnavailable(message), 24 | let .modelsUnavailable(message), 25 | let .prefillFailed(message), 26 | let .audioProcessingFailed(message), 27 | let .decodingLogitsFailed(message), 28 | let .segmentingFailed(message), 29 | let .loadAudioFailed(message), 30 | let .prepareDecoderInputsFailed(message), 31 | let .transcriptionFailed(message), 32 | let .decodingFailed(message), 33 | let .microphoneUnavailable(message), 34 | let .initializationError(message): 35 | Logging.error(message) 36 | return message 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /Examples/ServeCLIClient/Curl/README.md: -------------------------------------------------------------------------------- 1 | # WhisperKit CurlClient 2 | 3 | A simple, lightweight client for the WhisperKit Local Server using shell scripts and curl. 4 | 5 | ## Quick Start 6 | 7 | 1. **Make scripts executable:** 8 | ```bash 9 | chmod +x *.sh 10 | ``` 11 | 12 | 2. **Start the WhisperKit server:** 13 | ```bash 14 | whisperkit-cli serve --model tiny 15 | ``` 16 | 17 | 3. **Use the scripts:** 18 | ```bash 19 | # Transcribe audio 20 | ./transcribe.sh audio.wav 21 | 22 | # Translate audio to English 23 | ./translate.sh audio.wav --language es 24 | 25 | # Run test suite 26 | ./test.sh 27 | ``` 28 | 29 | ## Scripts 30 | 31 | ### `transcribe.sh` 32 | Transcribes audio files to text. 33 | 34 | **Basic usage:** 35 | ```bash 36 | ./transcribe.sh audio.wav 37 | ./transcribe.sh audio.wav --language en --timestamp-granularities word,segment 38 | ./transcribe.sh audio.wav --stream true --logprobs 39 | ``` 40 | 41 | ### `translate.sh` 42 | Translates audio files to English. 43 | 44 | **Basic usage:** 45 | ```bash 46 | ./translate.sh audio.wav 47 | ./translate.sh audio.wav --language es 48 | ./translate.sh audio.wav --stream true --logprobs 49 | ``` 50 | 51 | ### `test.sh` 52 | Runs comprehensive tests on sample files. 
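## Direct API Access

To see what the wrapper scripts send under the hood, you can call the server's endpoints with plain curl. The request below is a minimal sketch that assumes the OpenAI-style multipart form fields the scripts wrap; adjust the fields using the options documented below.

```bash
# Transcription request against the default local server URL
curl http://localhost:50060/v1/audio/transcriptions \
  -F "file=@audio.wav" \
  -F "language=en" \
  -F "response_format=verbose_json"
```

Use the `/v1/audio/translations` path instead to translate audio to English.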
53 | 54 | ## Options 55 | 56 | - `-h, --help` - Show help 57 | - `-s, --server <url>` - Server URL (default: http://localhost:50060/v1) 58 | - `-l, --language <lang>` - Source language (e.g., en, es, ja) 59 | - `-f, --response-format <format>` - Response format: json, verbose_json 60 | - `--timestamp-granularities <granularities>` - Timestamp granularities: word,segment 61 | - `--stream <true|false>` - Enable streaming (default: false) 62 | - `--logprobs` - Include logprobs in response (default: false) 63 | - `--temperature <temp>` - Sampling temperature 0.0-1.0 (default: 0.0) 64 | - `--verbose` - Show verbose curl output 65 | 66 | ## Prerequisites 67 | 68 | - `curl` (usually pre-installed) 69 | - `bash` shell 70 | - WhisperKit Local Server running 71 | -------------------------------------------------------------------------------- /.github/workflows/expo-update.yml: -------------------------------------------------------------------------------- 1 | # Tested on macOS with: 2 | # act -s COMMITTER_TOKEN="$(gh auth token)" release --container-architecture linux/amd64 -P ubuntu-latest=catthehacker/ubuntu:act-latest -e <(echo '{ "release": { "tag_name": "v0.0.0" }}') 3 | name: Update whisper-kit-expo 4 | 5 | on: 6 | release: 7 | types: [released] 8 | 9 | jobs: 10 | update-whisperkit: 11 | runs-on: ubuntu-latest 12 | env: 13 | TAG: ${{ github.event.release.tag_name }} 14 | BRANCH_NAME: update-whisperkit-${{ github.event.release.tag_name }} 15 | GH_TOKEN: ${{ secrets.COMMITTER_TOKEN }} 16 | steps: 17 | - name: Checkout whisper-kit-expo 18 | uses: actions/checkout@v4 19 | with: 20 | repository: seb-sep/whisper-kit-expo 21 | token: ${{ secrets.COMMITTER_TOKEN }} 22 | ref: main 23 | 24 | - name: Setup Node 25 | uses: actions/setup-node@v4 26 | with: 27 | node-version: '20.x' 28 | 29 | - name: New branch 30 | run: | 31 | git checkout -b $BRANCH_NAME 32 | echo ${{ github.event.release }} 33 | echo "Release tag is $TAG" 34 | 35 | - name: Update package.json version 36 | run: | 37 | PACKAGE_PATH="package.json" 38 | if [ ! -f "$PACKAGE_PATH" ]; then 39 | echo "Could not find package.json at path: $PACKAGE_PATH." 40 | exit 1 41 | fi 42 | RELEASE_TAG=${TAG#v} 43 | jq --arg newver "$RELEASE_TAG" '.whisperKit.version = $newver' "$PACKAGE_PATH" > tmp.$$.json && mv tmp.$$.json "$PACKAGE_PATH" 44 | cat "$PACKAGE_PATH" 45 | 46 | - name: Commit changes 47 | run: | 48 | git config --global user.email "164233781+argmaxincbot@users.noreply.github.com" 49 | git config --global user.name "argmaxincbot" 50 | git add ./package.json 51 | git commit -m "Update WhisperKit to $TAG" 52 | git push origin $BRANCH_NAME 53 | - name: PR with changes 54 | env: 55 | GH_TOKEN: ${{ secrets.COMMITTER_TOKEN }} 56 | run: | 57 | gh pr create --title "Update WhisperKit to $TAG" --body "Update WhisperKit to $TAG" --base main --head $BRANCH_NAME 58 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/Audio/EnergyVAD.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
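// Illustrative usage sketch: running energy VAD over a mono 16 kHz buffer with
// this file's defaults. `waveform: [Float]` is an assumed input (e.g. samples
// produced by an audio loader); it is not defined in this file.
//
//     let vad = EnergyVAD(frameLength: 0.1, frameOverlap: 0.0, energyThreshold: 0.02)
//     let activity: [Bool] = vad.voiceActivity(in: waveform) // one flag per 0.1 s frame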
3 | 4 | import Foundation 5 | 6 | /// Voice activity detection based on energy threshold 7 | public final class EnergyVAD: VoiceActivityDetector { 8 | public let energyThreshold: Float 9 | 10 | /// Initialize a new EnergyVAD instance 11 | /// - Parameters: 12 | /// - sampleRate: Audio sample rate 13 | /// - frameLength: Frame length in seconds 14 | /// - frameOverlap: Frame overlap in seconds; this appends `frameOverlap` seconds of audio to each `frameLength` window, which helps catch speech that starts exactly at a chunk boundary 15 | /// - energyThreshold: Minimum energy threshold for a frame to count as voice activity 16 | public convenience init( 17 | sampleRate: Int = WhisperKit.sampleRate, 18 | frameLength: Float = 0.1, 19 | frameOverlap: Float = 0.0, 20 | energyThreshold: Float = 0.02 21 | ) { 22 | self.init( 23 | sampleRate: sampleRate, 24 | // Compute frame length and overlap in number of samples 25 | frameLengthSamples: Int(frameLength * Float(sampleRate)), 26 | frameOverlapSamples: Int(frameOverlap * Float(sampleRate)), 27 | energyThreshold: energyThreshold 28 | ) 29 | } 30 | 31 | public required init( 32 | sampleRate: Int = 16000, 33 | frameLengthSamples: Int, 34 | frameOverlapSamples: Int = 0, 35 | energyThreshold: Float = 0.02 36 | ) { 37 | self.energyThreshold = energyThreshold 38 | super.init(sampleRate: sampleRate, frameLengthSamples: frameLengthSamples, frameOverlapSamples: frameOverlapSamples) 39 | } 40 | 41 | public override func voiceActivity(in waveform: [Float]) -> [Bool] { 42 | let chunkRatio = Double(waveform.count) / Double(frameLengthSamples) 43 | 44 | // Round up if uneven; the final chunk will not be a full `frameLengthSamples` long 45 | let count = Int(chunkRatio.rounded(.up)) 46 | 47 | let chunkedVoiceActivity = AudioProcessor.calculateVoiceActivityInChunks( 48 | of: waveform, 49 | chunkCount: count, 50 | frameLengthSamples: frameLengthSamples, 51 | frameOverlapSamples: frameOverlapSamples, 52 | energyThreshold: energyThreshold 53 | ) 54 | 55 | return chunkedVoiceActivity 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /.github/workflows/development-tests.yml: -------------------------------------------------------------------------------- 1 | name: Development Tests 2 | 3 | on: 4 | pull_request: 5 | pull_request_review: 6 | types: [submitted] 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build-and-test: 11 | name: "Build and Test" 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}-build-and-test 14 | cancel-in-progress: true 15 | uses: ./.github/workflows/unit-tests.yml 16 | with: 17 | ios-version: "26.0.1" 18 | ios-device: "iPhone 17" 19 | watchos-version: "26.0" 20 | visionos-version: "26.0" 21 | macos-runner: "macos-26" 22 | xcode-version: "26.0" 23 | 24 | check-approvals: 25 | runs-on: ubuntu-latest 26 | outputs: 27 | reviews: ${{ steps.reviews.outputs.state }} 28 | permissions: 29 | pull-requests: read 30 | contents: read 31 | steps: 32 | - uses: actions/checkout@v4 33 | - name: Check Approvals 34 | id: reviews 35 | env: 36 | GH_TOKEN: ${{ github.token }} 37 | pr: ${{ github.event.pull_request.number }} 38 | run: | 39 | echo "Checking PR approval for: $pr" 40 | state=$(gh pr view $pr --json reviewDecision --jq '.reviewDecision') 41 | echo "Review decision state: $state" 42 | echo "state=$state" >> "$GITHUB_OUTPUT" 43 | 44 | pre-merge-tests: 45 | name: "Pre-merge Tests" 46 | needs: [check-approvals] 47 | if: needs.check-approvals.outputs.reviews == 'APPROVED' || github.event_name == 'workflow_dispatch'
48 | concurrency: 49 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}-${{ matrix.os }} 50 | cancel-in-progress: true 51 | strategy: 52 | matrix: 53 | include: 54 | - os: macos-14 55 | ios-version: "17.2" 56 | ios-device: "iPhone 15" 57 | watchos-version: "10.2" 58 | visionos-version: "1.0" 59 | xcode-version: "16.1" 60 | - os: macos-15 61 | ios-version: "18.5" 62 | ios-device: "iPhone 16" 63 | watchos-version: "11.5" 64 | visionos-version: "2.5" 65 | xcode-version: "16.4" 66 | uses: ./.github/workflows/unit-tests.yml 67 | with: 68 | macos-runner: ${{ matrix.os }} 69 | ios-version: ${{ matrix.ios-version }} 70 | ios-device: ${{ matrix.ios-device }} 71 | watchos-version: ${{ matrix.watchos-version }} 72 | visionos-version: ${{ matrix.visionos-version }} 73 | xcode-version: ${{ matrix.xcode-version }} 74 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/FeatureExtractor.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Accelerate 5 | import AVFoundation 6 | import CoreGraphics 7 | import CoreML 8 | import Foundation 9 | 10 | public protocol FeatureExtractorOutputType {} 11 | extension MLMultiArray: FeatureExtractorOutputType {} 12 | 13 | public protocol FeatureExtracting { 14 | var melCount: Int? { get } 15 | var windowSamples: Int? { get } 16 | func logMelSpectrogram(fromAudio inputAudio: any AudioProcessorOutputType) async throws -> (any FeatureExtractorOutputType)? 17 | } 18 | 19 | open class FeatureExtractor: FeatureExtracting, WhisperMLModel { 20 | public var model: MLModel? 21 | 22 | public init() {} 23 | 24 | public var melCount: Int? { 25 | guard let inputDescription = model?.modelDescription.outputDescriptionsByName["melspectrogram_features"] else { return nil } 26 | guard inputDescription.type == .multiArray else { return nil } 27 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil } 28 | let shape = shapeConstraint.shape.map { $0.intValue } 29 | return shape[1] 30 | } 31 | 32 | public var windowSamples: Int? { 33 | guard let inputDescription = model?.modelDescription.inputDescriptionsByName["audio"] else { return nil } 34 | guard inputDescription.type == .multiArray else { return nil } 35 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil } 36 | let shape = shapeConstraint.shape.map { $0.intValue } 37 | return shape[0] // The audio input is a 1D array 38 | } 39 | 40 | open func logMelSpectrogram(fromAudio inputAudio: any AudioProcessorOutputType) async throws -> (any FeatureExtractorOutputType)? { 41 | guard let audioArray = inputAudio as? 
MLMultiArray else { 42 | throw WhisperError.audioProcessingFailed("FeatureExtractor input must be MLMultiArray") 43 | } 44 | guard let model else { 45 | throw WhisperError.modelsUnavailable() 46 | } 47 | try Task.checkCancellation() 48 | 49 | let interval = Logging.beginSignpost("ExtractAudioFeatures", signposter: Logging.FeatureExtractor.signposter) 50 | defer { Logging.endSignpost("ExtractAudioFeatures", interval: interval, signposter: Logging.FeatureExtractor.signposter) } 51 | 52 | let modelInputs = MelSpectrogramInput(audio: audioArray) 53 | let outputFeatures = try await model.asyncPrediction(from: modelInputs, options: MLPredictionOptions()) 54 | let output = MelSpectrogramOutput(features: outputFeatures) 55 | return output.melspectrogramFeatures 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Utilities/TextUtilities.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Foundation 5 | 6 | /// A utility struct providing text compression and analysis functionality 7 | public struct TextUtilities { 8 | 9 | private init() {} 10 | 11 | /// Calculates the compression ratio of an array of text tokens using zlib compression 12 | /// - Parameter textTokens: Array of integer tokens to compress 13 | /// - Returns: The compression ratio (original size / compressed size). Returns infinity if compression fails 14 | public static func compressionRatio(of textTokens: [Int]) -> Float { 15 | // Convert the integer array to a byte array (Data) 16 | let dataBuffer = textTokens.compactMap { Int32($0) } 17 | let data = dataBuffer.withUnsafeBufferPointer { Data(buffer: $0) } 18 | 19 | // Compress the data using NSData compression 20 | do { 21 | let compressedData = try (data as NSData).compressed(using: .zlib) 22 | // Calculate and return the compression ratio 23 | return Float(data.count) / Float(compressedData.length) 24 | } catch { 25 | Logging.debug("Compression error: \(error.localizedDescription)") 26 | return Float.infinity 27 | } 28 | } 29 | 30 | /// Calculates the compression ratio of a text string using zlib compression 31 | /// - Parameter text: The text string to compress 32 | /// - Returns: The compression ratio (original size / compressed size). Returns infinity if text is empty or compression fails 33 | public static func compressionRatio(of text: String) -> Float { 34 | if text.isEmpty { 35 | return Float.infinity // TODO: throw to caller instead of return infinity 36 | } 37 | 38 | // Encode the string as UTF-8 39 | guard let data = text.data(using: .utf8) else { 40 | Logging.debug("String encoding error") 41 | return Float.infinity 42 | } 43 | 44 | // Compress the data using NSData compression 45 | do { 46 | let compressedData = try (data as NSData).compressed(using: .zlib) 47 | // Calculate and return the compression ratio 48 | return Float(data.count) / Float(compressedData.length) 49 | } catch { 50 | Logging.debug("Compression error: \(error.localizedDescription)") 51 | return Float.infinity 52 | } 53 | } 54 | } 55 | 56 | @available(*, deprecated, message: "Subject to removal in a future version. Use `TextUtilities.compressionRatio(of:)` instead.") 57 | public func compressionRatio(of array: [Int]) -> Float { 58 | return TextUtilities.compressionRatio(of: array) 59 | } 60 | 61 | @available(*, deprecated, message: "Subject to removal in a future version. 
Use `TextUtilities.compressionRatio(of:)` instead.") 62 | public func compressionRatio(of text: String) -> Float { 63 | return TextUtilities.compressionRatio(of: text) 64 | } 65 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Utilities/Concurrency.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Foundation 5 | import os.lock 6 | 7 | /// An actor that provides thread-safe early stopping functionality using UUIDs as keys 8 | public actor EarlyStopActor { 9 | private var shouldStop = [UUID: Bool]() 10 | 11 | public init() {} 12 | 13 | /// Sets the stop flag for a given UUID 14 | /// - Parameters: 15 | /// - value: The boolean value to set 16 | /// - uuid: The UUID key 17 | public func set(_ value: Bool, for uuid: UUID) { 18 | shouldStop[uuid] = value 19 | } 20 | 21 | /// Gets the stop flag for a given UUID 22 | /// - Parameter uuid: The UUID key 23 | /// - Returns: The current stop flag value, or false if not set 24 | public func get(for uuid: UUID) -> Bool { 25 | return shouldStop[uuid] ?? false 26 | } 27 | 28 | /// Removes and returns the stop flag for a given UUID 29 | /// - Parameter uuid: The UUID key 30 | /// - Returns: The removed stop flag value, if it existed 31 | public func remove(for uuid: UUID) -> Bool? { 32 | return shouldStop.removeValue(forKey: uuid) 33 | } 34 | } 35 | 36 | /// Serializes access to a value with an `os_unfair_lock` so mutation stays 37 | /// thread-safe. The wrapper is used by `TranscriptionResult`, which is marked 38 | /// `@unchecked Sendable`; guarding each property with this lock helps keep the 39 | /// result instance safe when shared across concurrent contexts. 40 | @propertyWrapper 41 | public struct TranscriptionPropertyLock<Value: Codable & Sendable>: Sendable, Codable { 42 | private let lock: UnfairLock 43 | private var value: Value 44 | 45 | public init(wrappedValue: Value) { 46 | self.lock = UnfairLock() 47 | self.value = wrappedValue 48 | } 49 | public init(from decoder: Swift.Decoder) throws { 50 | self.lock = UnfairLock() 51 | self.value = try Value(from: decoder) 52 | } 53 | 54 | public func encode(to encoder: Encoder) throws { 55 | try lock.withLock { 56 | try value.encode(to: encoder) 57 | } 58 | 59 | } 60 | 61 | public var wrappedValue: Value { 62 | get { 63 | lock.withLock { 64 | return value 65 | } 66 | } 67 | set { 68 | lock.withLock { 69 | value = newValue 70 | } 71 | } 72 | } 73 | } 74 | 75 | /// Thin wrapper around `os_unfair_lock` that exposes a Swift-friendly 76 | /// `withLock` helper. This lock is non-reentrant and optimized for low 77 | /// contention, matching the semantics of Darwin's unfair lock. 78 | @usableFromInline 79 | final class UnfairLock: @unchecked Sendable { 80 | @usableFromInline 81 | var lock = os_unfair_lock() 82 | 83 | @inlinable 84 | func withLock<T>(_ body: () throws -> T) rethrows -> T { 85 | os_unfair_lock_lock(&lock) 86 | defer { os_unfair_lock_unlock(&lock) } 87 | return try body() 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/AudioEncoder.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
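// Illustrative usage sketch: with `model` populated via the WhisperMLModel loading
// helpers, mel features from the FeatureExtractor can be encoded as below.
// `melFeatures: MLMultiArray` is an assumed input, not defined in this file.
//
//     let audioEncoder = AudioEncoder()
//     // ... load audioEncoder.model ...
//     let embeds = try await audioEncoder.encodeFeatures(melFeatures)
//     let embedSize = audioEncoder.embedSize // nil until a model is loaded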
3 | 4 | import CoreML 5 | 6 | public protocol AudioEncoderOutputType {} 7 | extension MLMultiArray: AudioEncoderOutputType {} 8 | 9 | /// AudioEncoding protocol defines the requirements for an audio encoding implementation. 10 | public protocol AudioEncoding { 11 | /// The size of the embedding produced by the encoder. 12 | var embedSize: Int? { get } 13 | 14 | /// Encodes the given audio features asynchronously. 15 | /// - Parameter features: The audio features to be encoded. 16 | /// - Returns: An optional tensor containing the encoded features. 17 | func encodeFeatures(_ features: any FeatureExtractorOutputType) async throws -> (any AudioEncoderOutputType)? 18 | } 19 | 20 | /// Backwards-compatible AudioEncoder implementation 21 | public class AudioEncoder: AudioEncoding, WhisperMLModel { 22 | public var model: MLModel? 23 | 24 | public var embedSize: Int? { 25 | guard let inputDescription = model?.modelDescription.outputDescriptionsByName["encoder_output_embeds"] else { return nil } 26 | guard inputDescription.type == .multiArray else { return nil } 27 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil } 28 | let shape = shapeConstraint.shape.map { $0.intValue } 29 | return shape[1] 30 | } 31 | 32 | public var sequenceLength: Int? { 33 | guard let inputDescription = model?.modelDescription.outputDescriptionsByName["encoder_output_embeds"] else { return nil } 34 | guard inputDescription.type == .multiArray else { return nil } 35 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil } 36 | let shape = shapeConstraint.shape.map { $0.intValue } 37 | return shape[3] 38 | } 39 | 40 | public init() {} 41 | 42 | public func encodeFeatures(_ features: any FeatureExtractorOutputType) async throws -> (any AudioEncoderOutputType)? { 43 | guard let features = features as? MLMultiArray else { 44 | throw WhisperError.audioProcessingFailed("AudioEncoder input must be MLMultiArray") 45 | } 46 | 47 | return try await encodeFeatures(features) 48 | } 49 | 50 | public func encodeFeatures(_ features: MLMultiArray) async throws -> MLMultiArray? { 51 | guard let model else { 52 | throw WhisperError.modelsUnavailable() 53 | } 54 | try Task.checkCancellation() 55 | 56 | let interval = Logging.beginSignpost("EncodeAudio", signposter: Logging.AudioEncoding.signposter) 57 | defer { Logging.endSignpost("EncodeAudio", interval: interval, signposter: Logging.AudioEncoding.signposter) } 58 | 59 | let modelInputs = AudioEncoderInput(melspectrogram_features: features) 60 | let outputFeatures = try await model.asyncPrediction(from: modelInputs, options: MLPredictionOptions()) 61 | let output = AudioEncoderOutput(features: outputFeatures) 62 | return output.encoder_output_embeds 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 5.9 2 | // The swift-tools-version declares the minimum version of Swift required to build this package. 
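// Illustrative note: the server-related dependencies, targets, and the CLI `serve`
// command below are gated on the BUILD_ALL environment variable (see isServerEnabled()
// at the bottom of this manifest), so a sketch of a server-enabled build would be
// `BUILD_ALL=1 swift build`.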
3 | 4 | import PackageDescription 5 | import Foundation 6 | 7 | let package = Package( 8 | name: "whisperkit", 9 | platforms: [ 10 | .iOS(.v16), 11 | .macOS(.v13), 12 | .watchOS(.v10), 13 | .visionOS(.v1) 14 | ], 15 | products: [ 16 | .library( 17 | name: "WhisperKit", 18 | targets: ["WhisperKit"] 19 | ), 20 | .executable( 21 | name: "whisperkit-cli", 22 | targets: ["WhisperKitCLI"] 23 | ) 24 | ], 25 | dependencies: [ 26 | .package(url: "https://github.com/huggingface/swift-transformers.git", .upToNextMinor(from: "1.1.2")), 27 | .package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.3.0"), 28 | ] + (isServerEnabled() ? [ 29 | .package(url: "https://github.com/vapor/vapor.git", from: "4.115.1"), 30 | .package(url: "https://github.com/apple/swift-openapi-generator", from: "1.10.2"), 31 | .package(url: "https://github.com/apple/swift-openapi-runtime", from: "1.8.2"), 32 | .package(url: "https://github.com/swift-server/swift-openapi-vapor", from: "1.0.1"), 33 | 34 | ] : []), 35 | targets: [ 36 | .target( 37 | name: "WhisperKit", 38 | dependencies: [ 39 | .product(name: "Hub", package: "swift-transformers"), 40 | .product(name: "Tokenizers", package: "swift-transformers"), 41 | ] 42 | ), 43 | .testTarget( 44 | name: "WhisperKitTests", 45 | dependencies: [ 46 | "WhisperKit", 47 | .product(name: "Hub", package: "swift-transformers"), 48 | .product(name: "Tokenizers", package: "swift-transformers"), 49 | ], 50 | path: "Tests", 51 | resources: [ 52 | .process("WhisperKitTests/Resources"), 53 | ] 54 | ), 55 | .executableTarget( 56 | name: "WhisperKitCLI", 57 | dependencies: [ 58 | "WhisperKit", 59 | .product(name: "ArgumentParser", package: "swift-argument-parser"), 60 | ] + (isServerEnabled() ? [ 61 | .product(name: "Vapor", package: "vapor"), 62 | .product(name: "OpenAPIRuntime", package: "swift-openapi-runtime"), 63 | .product(name: "OpenAPIVapor", package: "swift-openapi-vapor"), 64 | ] : []), 65 | path: "Sources/WhisperKitCLI", 66 | exclude: (isServerEnabled() ? [] : ["Server"]), 67 | swiftSettings: (isServerEnabled() ? [.define("BUILD_SERVER_CLI")] : []) 68 | ) 69 | ], 70 | swiftLanguageVersions: [.v5] 71 | ) 72 | 73 | func isServerEnabled() -> Bool { 74 | if let enabledValue = Context.environment["BUILD_ALL"] { 75 | return enabledValue.lowercased() == "true" || enabledValue == "1" 76 | } 77 | 78 | // Default disabled, change to true temporarily for local development 79 | return false 80 | } 81 | -------------------------------------------------------------------------------- /Examples/ServeCLIClient/Swift/README.md: -------------------------------------------------------------------------------- 1 | # WhisperKit Swift Client 2 | 3 | A simple Swift client for the WhisperKit local server. 4 | 5 | ## Quick Start 6 | 7 | 1. **Start the WhisperKit server** (in another terminal): 8 | ```bash 9 | whisperkit-cli serve 10 | ``` 11 | 12 | 2. **Build the client**: 13 | ```bash 14 | swift build 15 | ``` 16 | 17 | 3. 
**Run commands**: 18 | ```bash 19 | # Transcribe an audio file 20 | swift run whisperkit-client transcribe audio.wav 21 | 22 | # Translate an audio file to English 23 | swift run whisperkit-client translate audio.wav 24 | 25 | # Test with sample files 26 | swift run whisperkit-client test 27 | ``` 28 | 29 | ## Available Commands 30 | 31 | - `transcribe <audio-file>` - Transcribe audio to text 32 | - `translate <audio-file>` - Translate audio to English 33 | - `test` - Test transcription and translation on sample files 34 | 35 | ## Options 36 | 37 | - `--language, -l` - Source language for transcription (default: auto-detect) 38 | - `--model, -m` - Model to use (default: tiny) 39 | - `--response-format` - Response format: json, verbose_json (default: verbose_json) 40 | - `--timestamp-granularities` - Comma-separated: word,segment (default: segment) 41 | - `--stream` - Enable streaming output 42 | - `--server-url, -s` - Server URL (default: http://localhost:50060/v1) 43 | 44 | ## Examples 45 | 46 | ```bash 47 | # Transcribe in Spanish 48 | swift run whisperkit-client transcribe -l es audio.wav 49 | 50 | # Transcribe with word-level timestamps 51 | swift run whisperkit-client transcribe --timestamp-granularities "word,segment" audio.wav 52 | 53 | # Translate from Spanish to English 54 | swift run whisperkit-client translate -l es audio.wav 55 | 56 | # Use custom server 57 | swift run whisperkit-client transcribe -s http://192.168.1.100:50060 audio.wav 58 | 59 | # Stream transcription 60 | swift run whisperkit-client transcribe --stream audio.wav 61 | ``` 62 | 63 | ## Project Structure 64 | 65 | ``` 66 | Sources/ 67 | ├── CLI.swift # All CLI commands and client logic 68 | └── Generated/ # Auto-generated OpenAPI client code 69 | ├── Client.swift 70 | └── Types.swift 71 | ``` 72 | 73 | ## Current Limitations 74 | 75 | - **Response Format**: The `--response-format` parameter is not fully working due to OpenAPI schema discrimination issues. The client always receives basic JSON responses instead of verbose JSON with segments and word timestamps. 76 | - **Word Timestamps**: Word-level timestamps are not displayed due to the response format issue above. 77 | - **Basic Functionality**: Basic transcription and translation work correctly. 78 | 79 | > **Note**: This is a known issue with the Swift OpenAPI generator's handling of `oneOf` schemas with discriminators. The server correctly sends verbose JSON responses, but the Swift client cannot properly parse them. Consider using the Python client or CurlClient for full functionality. 80 | 81 | ## Updating Generated Code 82 | 83 | When the server spec changes, regenerate the client code: 84 | 85 | ```bash 86 | ./updateClient.sh 87 | ``` 88 | 89 | This will update the files in `Sources/Generated/` from `scripts/specs/localserver_openapi.yaml`. 90 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to WhisperKit 2 | 3 | ## Overview 4 | 5 | We welcome and encourage contributions to WhisperKit! Whether you're fixing bugs, improving documentation, or adding new features from the roadmap, your help is appreciated. This guide will help you get started with contributing to WhisperKit. 6 | 7 | ## Getting Started 8 | 9 | 1. **Fork the Repository**: Start by [forking](https://github.com/argmaxinc/WhisperKit/fork) the WhisperKit repository on GitHub to your personal account. 10 | 11 | 2.
**Clone Your Fork**: Clone your fork to your local machine to start making changes. 12 | 13 | ```bash 14 | git clone https://github.com/[your-username]/whisperkit.git 15 | cd whisperkit 16 | ``` 17 | 18 | ## Setting Up Your Development Environment 19 | 20 | 1. **Install Dependencies**: Use the provided `Makefile` to set up your environment. Run `make setup` to install necessary dependencies. 21 | 22 | ```bash 23 | make setup 24 | ``` 25 | 26 | 2. **Download Models**: Run `make download-model` to download the required models to run and test locally. 27 | 28 | ```bash 29 | make download-model MODEL=tiny 30 | ``` 31 | 32 | ## Making Changes 33 | 34 | 1. **Make Your Changes**: Create a branch for your work (e.g. `git checkout -b my-branch`), then implement your changes, add new features, or fix bugs. Ensure you adhere to the existing coding style. If you're adding new features, make sure to update or add any documentation or tests as needed. 35 | 36 | 2. **Build and Test**: You can use the `Makefile` to build and test your changes. Run `make build` to build WhisperKit and `make test` to run tests. 37 | 38 | ```bash 39 | make build 40 | make test 41 | ``` 42 | 43 | You can also run and test directly from Xcode. We've provided an example app that contains various use cases; just open the `Examples/WhisperAX/WhisperAX.xcodeproj` file in Xcode and run the app. 44 | 45 | ## Submitting Your Changes 46 | 47 | 1. **Commit Your Changes**: Once you're satisfied with your changes, commit them with a clear and concise commit message. 48 | 49 | ```bash 50 | git commit -am "Add a new feature" 51 | ``` 52 | 53 | 2. **Push to Your Fork**: Push your changes to your fork on GitHub. 54 | 55 | ```bash 56 | git push origin my-branch 57 | ``` 58 | 59 | 3. **Create a Pull Request**: Go to the WhisperKit repository on GitHub and create a new pull request from your fork. Ensure your pull request has a clear title and description. 60 | 61 | 4. **Code Review**: Wait for the maintainers to review your pull request. Be responsive to feedback and make any necessary changes. 62 | 63 | ## Guidelines 64 | 65 | - **Code Style**: Follow the existing code style in the project. 66 | - **Commit Messages**: Write meaningful commit messages that clearly describe the changes. 67 | - **Documentation**: Update documentation if you're adding new features or making changes that affect how users interact with WhisperKit. 68 | - **Tests**: Add or update tests for new features or bug fixes. 69 | 70 | ## Final Steps 71 | 72 | After your pull request has been reviewed and approved, a maintainer will merge it into the main branch. Congratulations, you've successfully contributed to WhisperKit! 73 | 74 | Thank you for making WhisperKit better for everyone!
❤️‍🔥 75 | -------------------------------------------------------------------------------- /Examples/ServeCLIClient/Swift/Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "pins" : [ 3 | { 4 | "identity" : "openapikit", 5 | "kind" : "remoteSourceControl", 6 | "location" : "https://github.com/mattpolzin/OpenAPIKit", 7 | "state" : { 8 | "revision" : "e0ecdf050c4bebc0104ed2505ec6fa1f6afb7555", 9 | "version" : "3.7.0" 10 | } 11 | }, 12 | { 13 | "identity" : "swift-algorithms", 14 | "kind" : "remoteSourceControl", 15 | "location" : "https://github.com/apple/swift-algorithms", 16 | "state" : { 17 | "revision" : "87e50f483c54e6efd60e885f7f5aa946cee68023", 18 | "version" : "1.2.1" 19 | } 20 | }, 21 | { 22 | "identity" : "swift-argument-parser", 23 | "kind" : "remoteSourceControl", 24 | "location" : "https://github.com/apple/swift-argument-parser", 25 | "state" : { 26 | "revision" : "309a47b2b1d9b5e991f36961c983ecec72275be3", 27 | "version" : "1.6.1" 28 | } 29 | }, 30 | { 31 | "identity" : "swift-collections", 32 | "kind" : "remoteSourceControl", 33 | "location" : "https://github.com/apple/swift-collections", 34 | "state" : { 35 | "revision" : "8c0c0a8b49e080e54e5e328cc552821ff07cd341", 36 | "version" : "1.2.1" 37 | } 38 | }, 39 | { 40 | "identity" : "swift-http-types", 41 | "kind" : "remoteSourceControl", 42 | "location" : "https://github.com/apple/swift-http-types", 43 | "state" : { 44 | "revision" : "a0a57e949a8903563aba4615869310c0ebf14c03", 45 | "version" : "1.4.0" 46 | } 47 | }, 48 | { 49 | "identity" : "swift-numerics", 50 | "kind" : "remoteSourceControl", 51 | "location" : "https://github.com/apple/swift-numerics.git", 52 | "state" : { 53 | "revision" : "e0ec0f5f3af6f3e4d5e7a19d2af26b481acb6ba8", 54 | "version" : "1.0.3" 55 | } 56 | }, 57 | { 58 | "identity" : "swift-openapi-generator", 59 | "kind" : "remoteSourceControl", 60 | "location" : "https://github.com/apple/swift-openapi-generator", 61 | "state" : { 62 | "revision" : "bb9a13596af11db9bb83389295d91cd335810fe8", 63 | "version" : "1.10.2" 64 | } 65 | }, 66 | { 67 | "identity" : "swift-openapi-runtime", 68 | "kind" : "remoteSourceControl", 69 | "location" : "https://github.com/apple/swift-openapi-runtime", 70 | "state" : { 71 | "revision" : "8f33cc5dfe81169fb167da73584b9c72c3e8bc23", 72 | "version" : "1.8.2" 73 | } 74 | }, 75 | { 76 | "identity" : "swift-openapi-urlsession", 77 | "kind" : "remoteSourceControl", 78 | "location" : "https://github.com/apple/swift-openapi-urlsession", 79 | "state" : { 80 | "revision" : "6fac6f7c428d5feea2639b5f5c8b06ddfb79434b", 81 | "version" : "1.1.0" 82 | } 83 | }, 84 | { 85 | "identity" : "yams", 86 | "kind" : "remoteSourceControl", 87 | "location" : "https://github.com/jpsim/Yams", 88 | "state" : { 89 | "revision" : "d41ba4e7164c0838c6d48351f7575f7f762151fe", 90 | "version" : "6.1.0" 91 | } 92 | } 93 | ], 94 | "version" : 2 95 | } 96 | -------------------------------------------------------------------------------- /Examples/ServeCLIClient/Python/README.md: -------------------------------------------------------------------------------- 1 | # WhisperKit Python Client 2 | 3 | A simple Python client for the WhisperKit local server using OpenAI's SDK. 4 | 5 | ## Quick Start 6 | 7 | 1. **Start the WhisperKit server** (in another terminal): 8 | ```bash 9 | whisperkit-cli serve 10 | ``` 11 | 12 | 2. **Install dependencies**: 13 | ```bash 14 | uv sync 15 | ``` 16 | 17 | 3. 
**Run commands**: 18 | ```bash 19 | # Transcribe an audio file 20 | python whisperkit_client.py transcribe audio.wav 21 | 22 | # Translate an audio file to English 23 | python whisperkit_client.py translate audio.wav 24 | 25 | # Test with sample files 26 | python whisperkit_client.py test 27 | ``` 28 | 29 | ## Available Commands 30 | 31 | - `transcribe <audio-file>` - Transcribe audio to text 32 | - `translate <audio-file>` - Translate audio to English 33 | - `test` - Test transcription and translation on sample files 34 | 35 | ## Options 36 | 37 | - `--server, -s` - Server URL (default: http://localhost:50060) 38 | - `--model, -m` - Model to use (default: tiny) 39 | - `--language, -l` - Source language for transcription (default: auto-detect) 40 | - `--response-format` - Response format: json, verbose_json (default: verbose_json) 41 | - `--timestamp-granularities` - Comma-separated: word,segment (default: segment) 42 | - `--stream` - Enable streaming output 43 | - `--debug` - Show raw JSON response for debugging 44 | 45 | 46 | ## Examples 47 | 48 | ```bash 49 | # Transcribe in Spanish 50 | python whisperkit_client.py transcribe -l es audio.wav 51 | 52 | # Translate to English (auto-detects source language) 53 | python whisperkit_client.py translate audio.wav 54 | 55 | # Use custom server and model 56 | python whisperkit_client.py -s http://192.168.1.100:50060 -m large transcribe audio.wav 57 | 58 | # Transcribe with word-level timestamps 59 | python whisperkit_client.py transcribe --timestamp-granularities "word,segment" audio.wav 60 | 61 | # Stream transcription 62 | python whisperkit_client.py transcribe --stream audio.wav 63 | 64 | # Debug mode to see raw JSON 65 | python whisperkit_client.py transcribe --debug audio.wav 66 | 67 | # Test with sample files 68 | python whisperkit_client.py test 69 | ``` 70 | 71 | ## Project Structure 72 | 73 | ``` 74 | Examples/ServeCLIClient/Python/ 75 | ├── whisperkit_client.py # Main CLI script with all functionality 76 | ├── test_transcribe.py # Test script for transcription 77 | ├── test_translate.py # Test script for translation 78 | ├── requirements.txt # Python dependencies 79 | ├── uv.lock # Locked dependency versions 80 | └── README.md # This file 81 | ``` 82 | 83 | ## Dependencies 84 | 85 | - `openai` - OpenAI Python SDK for API communication 86 | - `uv` - Fast Python package manager 87 | 88 | ## Testing 89 | 90 | The client automatically finds test audio files from `Tests/WhisperKitTests/Resources/` in the main project directory. 91 | 92 | ```bash 93 | # Run tests on sample files 94 | python whisperkit_client.py test 95 | 96 | # Or run individual test scripts 97 | python test_transcribe.py 98 | python test_translate.py 99 | ``` 100 | 101 | ## Alternative Clients 102 | 103 | For lightweight testing without Python dependencies, see the [CurlClient](../Curl/README.md) which provides shell script implementations using curl. 104 | -------------------------------------------------------------------------------- /Sources/WhisperKitCLI/TranscribeCLIUtils.swift: -------------------------------------------------------------------------------- 1 | // Copyright © 2025 Argmax, Inc. All rights reserved. 2 | // For licensing see accompanying LICENSE.md file.
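// Illustrative usage sketch: both helpers below are driven by parsed CLI arguments,
// e.g. from within a command's run() method. `arguments: TranscribeCLIArguments` is
// assumed to be already parsed.
//
//     let config = TranscribeCLIUtils.createWhisperKitConfig(from: arguments)
//     let whisperKit = try await WhisperKit(config)
//     let options = TranscribeCLIUtils.createDecodingOptions(from: arguments, task: .transcribe)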
3 | 4 | import Foundation 5 | import CoreML 6 | @preconcurrency import WhisperKit 7 | 8 | internal class TranscribeCLIUtils { 9 | 10 | /// Creates WhisperKit configuration from CLI arguments 11 | static func createWhisperKitConfig(from arguments: TranscribeCLIArguments) -> WhisperKitConfig { 12 | var audioEncoderComputeUnits = arguments.audioEncoderComputeUnits.asMLComputeUnits 13 | let textDecoderComputeUnits = arguments.textDecoderComputeUnits.asMLComputeUnits 14 | 15 | // Use gpu for audio encoder on macOS below 14 16 | if audioEncoderComputeUnits == .cpuAndNeuralEngine { 17 | if #unavailable(macOS 14.0) { 18 | audioEncoderComputeUnits = .cpuAndGPU 19 | } 20 | } 21 | 22 | let computeOptions = ModelComputeOptions( 23 | audioEncoderCompute: audioEncoderComputeUnits, 24 | textDecoderCompute: textDecoderComputeUnits 25 | ) 26 | 27 | let downloadTokenizerFolder: URL? = arguments.downloadTokenizerPath.map { URL(filePath: $0) } 28 | let downloadModelFolder: URL? = arguments.downloadModelPath.map { URL(filePath: $0) } 29 | let modelName: String? = arguments.model.map { arguments.modelPrefix + "*" + $0 } 30 | 31 | return WhisperKitConfig( 32 | model: modelName, 33 | downloadBase: downloadModelFolder, 34 | modelFolder: arguments.modelPath, 35 | tokenizerFolder: downloadTokenizerFolder, 36 | computeOptions: computeOptions, 37 | verbose: arguments.verbose, 38 | logLevel: arguments.verbose ? .debug : .info, 39 | prewarm: false, 40 | load: true, 41 | useBackgroundDownloadSession: false 42 | ) 43 | } 44 | 45 | /// Creates DecodingOptions from CLI arguments and task 46 | static func createDecodingOptions(from arguments: TranscribeCLIArguments, task: DecodingTask) -> DecodingOptions { 47 | let options = DecodingOptions( 48 | verbose: arguments.verbose, 49 | task: task, 50 | language: arguments.language, 51 | temperature: arguments.temperature, 52 | temperatureIncrementOnFallback: arguments.temperatureIncrementOnFallback, 53 | temperatureFallbackCount: arguments.temperatureFallbackCount, 54 | topK: arguments.bestOf, 55 | usePrefillPrompt: arguments.usePrefillPrompt || arguments.language != nil || task == .translate, 56 | usePrefillCache: arguments.usePrefillCache, 57 | skipSpecialTokens: arguments.skipSpecialTokens, 58 | withoutTimestamps: arguments.withoutTimestamps, 59 | wordTimestamps: arguments.wordTimestamps, 60 | clipTimestamps: arguments.clipTimestamps, 61 | supressTokens: arguments.supressTokens, 62 | compressionRatioThreshold: arguments.compressionRatioThreshold ?? 2.4, 63 | logProbThreshold: arguments.logprobThreshold ?? -1.0, 64 | firstTokenLogProbThreshold: arguments.firstTokenLogProbThreshold, 65 | noSpeechThreshold: arguments.noSpeechThreshold ?? 0.6, 66 | concurrentWorkerCount: arguments.concurrentWorkerCount, 67 | chunkingStrategy: ChunkingStrategy(rawValue: arguments.chunkingStrategy) 68 | ) 69 | 70 | return options 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /Sources/WhisperKitCLI/Server/ServeCLI.swift: -------------------------------------------------------------------------------- 1 | // Copyright © 2025 Argmax, Inc. All rights reserved. 2 | // For licensing see accompanying LICENSE.md file. 
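// Illustrative usage sketch: the `serve` subcommand is only compiled in when the
// package is built with the server targets enabled (see Package.swift). Once
// running, the routes registered below can be exercised with, for example
// (assuming the default port 50060 used by the example clients' READMEs):
//
//     $ whisperkit-cli serve --model tiny
//     $ curl http://localhost:50060/health   # -> {"status":"ok"}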
3 | 4 | import ArgumentParser 5 | import CoreML 6 | import Foundation 7 | @preconcurrency import WhisperKit 8 | import Vapor 9 | import OpenAPIRuntime 10 | import OpenAPIVapor 11 | import AVFoundation 12 | 13 | struct ServeCLI: AsyncParsableCommand { 14 | static let configuration = CommandConfiguration( 15 | commandName: "serve", 16 | abstract: "Start a local server for WhisperKit transcription" 17 | ) 18 | 19 | @OptionGroup 20 | var cliArguments: ServeCLIArguments 21 | 22 | mutating func run() async throws { 23 | try await serve() 24 | } 25 | 26 | public func configure(_ app: Application) async throws { 27 | let transport = VaporTransport(routesBuilder: app) 28 | 29 | var transcribeArguments = cliArguments.transcribe 30 | transcribeArguments.skipSpecialTokens = true // always skip special tokens for server responses 31 | if let modelPath = cliArguments.transcribe.modelPath { 32 | app.logger.notice("Loading model from path: \(modelPath)") 33 | } else if let model = cliArguments.transcribe.model { 34 | app.logger.notice("Loading model: \(model)") 35 | } else { 36 | let defaultModel = WhisperKit.recommendedModels().default 37 | app.logger.notice("Loading default model: \(defaultModel)") 38 | transcribeArguments.model = defaultModel 39 | transcribeArguments.modelPrefix = "" 40 | } 41 | 42 | let config = TranscribeCLIUtils.createWhisperKitConfig(from: transcribeArguments) 43 | let whisperKit = try await WhisperKit(config) 44 | let handler = OpenAIHandler(whisperKit: whisperKit, logger: app.logger, transcribeArguments: transcribeArguments) 45 | try handler.registerHandlers(on: transport, serverURL: URL(string: "/v1")!) 46 | 47 | // Register base routes after OpenAPI routes to ensure they take precedence 48 | app.get("") { req async throws -> EndpointInfo in 49 | return EndpointInfo( 50 | status: "ok", 51 | service: "WhisperKit Local Server", 52 | endpoints: [ 53 | Endpoint(method: "POST", path: "/v1/audio/transcriptions", description: "Transcribe audio to text"), 54 | Endpoint(method: "POST", path: "/v1/audio/translations", description: "Translate audio to English"), 55 | Endpoint(method: "GET", path: "/health", description: "Health check endpoint") 56 | ] 57 | ) 58 | } 59 | 60 | app.get("health") { req async throws -> [String: String] in 61 | return ["status": "ok"] 62 | } 63 | } 64 | 65 | private func serve() async throws { 66 | var env = try Environment.detect() 67 | try LoggingSystem.bootstrap(from: &env) 68 | let app = try await Application.make() 69 | app.logger.logLevel = cliArguments.transcribe.verbose ? .debug : .info 70 | app.logger.notice("Starting WhisperKit Server...") 71 | app.environment.arguments = [""] // override arguments, handled by swift-argument-parser 72 | 73 | // Configure server to bind to specified host and port 74 | app.http.server.configuration.hostname = cliArguments.host 75 | app.http.server.configuration.port = cliArguments.port 76 | app.logger.notice("Server will bind to \(cliArguments.host):\(cliArguments.port)") 77 | 78 | do { 79 | try await configure(app) 80 | try await app.execute() 81 | } catch { 82 | app.logger.report(error: error) 83 | try? 
await app.asyncShutdown() 84 | throw error 85 | } 86 | try await app.asyncShutdown() 87 | } 88 | } 89 | 90 | // Response structs for the base endpoint 91 | fileprivate struct Endpoint: Content { 92 | let method: String 93 | let path: String 94 | let description: String 95 | } 96 | 97 | fileprivate struct EndpointInfo: Content { 98 | let status: String 99 | let service: String 100 | let endpoints: [Endpoint] 101 | } 102 | -------------------------------------------------------------------------------- /.github/workflows/unit-tests.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | ios-version: 7 | required: true 8 | type: string 9 | ios-device: 10 | required: true 11 | type: string 12 | watchos-version: 13 | required: true 14 | type: string 15 | visionos-version: 16 | required: true 17 | type: string 18 | macos-runner: 19 | required: true 20 | type: string 21 | xcode-version: 22 | required: false 23 | type: string 24 | 25 | jobs: 26 | unit-tests: 27 | name: "${{ matrix.run-config['name'] }} on ${{ inputs.macos-runner }}" 28 | runs-on: ${{ inputs.macos-runner }} 29 | strategy: 30 | matrix: 31 | run-config: 32 | - { 33 | name: "macOS", 34 | condition: true, 35 | clean-destination: "generic/platform=macOS", 36 | test-destination: "platform=macOS,arch=arm64", 37 | } 38 | - { 39 | name: "iOS", 40 | condition: true, 41 | clean-destination: "generic/platform=iOS", 42 | test-destination: "platform=iOS Simulator,OS=${{ inputs.ios-version }},name=${{ inputs.ios-device }}", 43 | } 44 | - { 45 | name: "watchOS", 46 | condition: "${{ inputs.macos-runner == 'macos-26' }}", 47 | clean-destination: "generic/platform=watchOS", 48 | test-destination: "platform=watchOS Simulator,OS=${{ inputs.watchos-version }},name=Apple Watch Ultra 3 (49mm)", 49 | } 50 | - { 51 | name: "visionOS", 52 | condition: "${{ inputs.macos-runner == 'macos-26' }}", 53 | clean-destination: "generic/platform=visionOS", 54 | test-destination: "platform=visionOS Simulator,OS=${{ inputs.visionos-version }},name=Apple Vision Pro", 55 | } 56 | timeout-minutes: ${{ matrix.run-config['name'] == 'visionOS' && 60 || 30 }} 57 | steps: 58 | - uses: actions/checkout@v4 59 | - uses: maxim-lobanov/setup-xcode@v1 60 | with: 61 | xcode-version: ${{ inputs.xcode-version || '26.0' }} 62 | - name: Setup environment 63 | run: make setup 64 | - name: Setup Cache 65 | id: model-cache 66 | uses: actions/cache@v4 67 | with: 68 | path: Models 69 | key: ${{ runner.os }}-models 70 | - name: Download Models 71 | if: steps.model-cache.outputs.cache-hit != 'true' 72 | run: make download-model MODEL=tiny 73 | - name: Install and discover destinations 74 | if: ${{ matrix.run-config['condition'] == true }} 75 | run: | 76 | echo "Simulators on runner:" 77 | xcrun simctl list 78 | if [[ "${{ matrix.run-config['name'] }}" == "visionOS" ]]; then 79 | xcodebuild -downloadPlatform ${{ matrix.run-config['name'] }} 80 | fi 81 | echo "Runtimes for testing:" 82 | xcrun simctl list runtimes 83 | echo "Destinations for testing:" 84 | xcodebuild test-without-building -testPlan UnitTestsPlan -scheme whisperkit-Package -showdestinations 85 | - name: Boot Simulator and Wait 86 | if: ${{ matrix.run-config['condition'] == true && matrix.run-config['name'] != 'macOS' && inputs.macos-runner == 'macos-26' }} 87 | # Slower runners require some time to fully boot the simulator 88 | # Parse the simulator name from the destination string, boot it, and wait 89 | run: | 90 |
simulator_name=$(echo '${{ matrix.run-config['test-destination'] }}' | sed -n 's/.*name=\([^,]*\).*/\1/p') 91 | xcrun simctl boot "$simulator_name" || true 92 | sleep 15 93 | xcrun simctl list devices 94 | - name: Build and Test - ${{ matrix.run-config['name'] }} 95 | if: ${{ matrix.run-config['condition'] == true }} 96 | run: | 97 | set -o pipefail 98 | xcodebuild clean build-for-testing -scheme whisperkit-Package -destination '${{ matrix.run-config['clean-destination'] }}' | xcpretty 99 | xcodebuild test -testPlan UnitTestsPlan -scheme whisperkit-Package -destination '${{ matrix.run-config['test-destination'] }}' 100 | - name: Upload Test Results 101 | if: failure() 102 | uses: actions/upload-artifact@v4 103 | with: 104 | name: test-results-${{ matrix.run-config['name']}}-on-${{ inputs.macos-runner }} 105 | path: | 106 | ~/Library/Developer/Xcode/DerivedData/**/Logs/Test/*.xcresult 107 | retention-days: 5 -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX.xcodeproj/xcshareddata/xcschemes/WhisperAX.xcscheme: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /Sources/WhisperKitCLI/TranscribeCLIArguments.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import ArgumentParser 5 | 6 | struct TranscribeCLIArguments: ParsableArguments { 7 | @Option(help: "Paths to audio files") 8 | var audioPath = [String]() 9 | 10 | @Option(help: "Path to a folder containing audio files") 11 | var audioFolder: String? 12 | 13 | @Option(help: "Path of model files") 14 | var modelPath: String? 15 | 16 | @Option(help: "Model to download if no modelPath is provided") 17 | var model: String? 18 | 19 | @Option(help: "Text to add in front of the model name to distinguish between different types of the same variant (values: \"openai\", \"distil\")") 20 | var modelPrefix: String = "openai" 21 | 22 | @Option(help: "Path to save the downloaded model") 23 | var downloadModelPath: String? 24 | 25 | @Option(help: "Path to save the downloaded tokenizer files") 26 | var downloadTokenizerPath: String? 27 | 28 | @Option(help: "Compute units for audio encoder model with {all,cpuOnly,cpuAndGPU,cpuAndNeuralEngine,random}") 29 | var audioEncoderComputeUnits: ComputeUnits = .cpuAndNeuralEngine 30 | 31 | @Option(help: "Compute units for text decoder model with {all,cpuOnly,cpuAndGPU,cpuAndNeuralEngine,random}") 32 | var textDecoderComputeUnits: ComputeUnits = .cpuAndNeuralEngine 33 | 34 | @Flag(help: "Verbose mode") 35 | var verbose: Bool = false 36 | 37 | @Option(help: "Task to perform (transcribe or translate)") 38 | var task: String = "transcribe" 39 | 40 | @Option(help: "Language spoken in the audio") 41 | var language: String?
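// Illustrative note: swift-argument-parser derives kebab-case flags from these
// property names (audioPath -> --audio-path, bestOf -> --best-of), so a sketch of
// an invocation combining the sampling options below would be:
//
//     whisperkit-cli transcribe --audio-path audio.wav --temperature 0.2 --best-of 5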
42 | 43 | @Option(help: "Temperature to use for sampling") 44 | var temperature: Float = 0 45 | 46 | @Option(help: "Temperature to increase on fallbacks during decoding") 47 | var temperatureIncrementOnFallback: Float = 0.2 48 | 49 | @Option(help: "Number of times to increase temperature when falling back during decoding") 50 | var temperatureFallbackCount: Int = 5 51 | 52 | @Option(help: "Number of candidates when sampling with non-zero temperature") 53 | var bestOf: Int = 5 54 | 55 | @Flag(help: "Force initial prompt tokens based on language, task, and timestamp options") 56 | var usePrefillPrompt: Bool = false 57 | 58 | @Flag(help: "Use decoder prefill data for faster initial decoding") 59 | var usePrefillCache: Bool = false 60 | 61 | @Flag(help: "Skip special tokens in the output") 62 | var skipSpecialTokens: Bool = false 63 | 64 | @Flag(help: "Force no timestamps when decoding") 65 | var withoutTimestamps: Bool = false 66 | 67 | @Flag(help: "Add timestamps for each word in the output") 68 | var wordTimestamps: Bool = false 69 | 70 | @Option(help: "Force prefix text when decoding") 71 | var prefix: String? 72 | 73 | @Option(help: "Condition on this text when decoding") 74 | var prompt: String? 75 | 76 | @Option(parsing: .upToNextOption, help: "List of timestamps (in seconds) of start and end values to transcribe as seperate clips in single audio file (example: --clip-timestamps 0 10.2 34.5 60.0)") 77 | var clipTimestamps: [Float] = [] 78 | 79 | @Option(parsing: .upToNextOption, help: "List of tokens to supress in the output (example: --supress-tokens 1 2 3)") 80 | var supressTokens: [Int] = [] 81 | 82 | @Option(help: "Gzip compression ratio threshold for decoding failure") 83 | var compressionRatioThreshold: Float? 84 | 85 | @Option(help: "Average log probability threshold for decoding failure") 86 | var logprobThreshold: Float? 87 | 88 | @Option(help: "Log probability threshold for first token decoding failure") 89 | var firstTokenLogProbThreshold: Float? 90 | 91 | @Option(help: "Probability threshold to consider a segment as silence") 92 | var noSpeechThreshold: Float? 93 | 94 | @Flag(help: "Output a report of the results") 95 | var report: Bool = false 96 | 97 | @Option(help: "Directory to save the report") 98 | var reportPath: String = "." 99 | 100 | @Flag(help: "Process audio directly from the microphone") 101 | var stream: Bool = false 102 | 103 | @Flag(help: "Simulate streaming transcription using the input audio file") 104 | var streamSimulated: Bool = false 105 | 106 | @Option(help: "Maximum concurrent inference, might be helpful when processing more than 1 audio file at the same time. 0 means unlimited. Default: 4") 107 | var concurrentWorkerCount: Int = 4 108 | 109 | @Option(help: "Chunking strategy for audio processing, `none` means no chunking, `vad` means using voice activity detection. 
Default: `vad`") 110 | var chunkingStrategy: String = "vad" 111 | } 112 | -------------------------------------------------------------------------------- /.swiftpm/xcode/xcshareddata/xcschemes/whisperkit-Package.xcscheme: -------------------------------------------------------------------------------- 1 | 2 | 5 | 8 | 9 | 15 | 21 | 22 | 23 | 29 | 35 | 36 | 37 | 43 | 49 | 50 | 51 | 52 | 53 | 58 | 59 | 62 | 63 | 64 | 65 | 67 | 73 | 74 | 75 | 76 | 77 | 87 | 89 | 95 | 96 | 97 | 98 | 104 | 106 | 112 | 113 | 114 | 115 | 117 | 118 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /Examples/ServeCLIClient/Curl/translate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright © 2025 Argmax, Inc. All rights reserved. 4 | # For licensing see accompanying LICENSE.md file. 5 | 6 | # WhisperKit CurlClient - Translate Audio 7 | # Usage: ./translate.sh [options] 8 | 9 | set -e 10 | 11 | # Default values 12 | SERVER_URL="http://localhost:50060/v1" 13 | MODEL="tiny" 14 | LANGUAGE="" 15 | PROMPT="" 16 | RESPONSE_FORMAT="verbose_json" 17 | TEMPERATURE="0.0" 18 | VERBOSE="false" 19 | 20 | # Colors for output 21 | RED='\033[0;31m' 22 | GREEN='\033[0;32m' 23 | YELLOW='\033[1;33m' 24 | BLUE='\033[0;34m' 25 | NC='\033[0m' # No Color 26 | 27 | # Help function 28 | show_help() { 29 | echo "Usage: $0 [options]" 30 | echo "" 31 | echo "Arguments:" 32 | echo " audio-file Path to audio file (wav, mp3, m4a, flac, etc.)" 33 | echo "" 34 | echo "Options:" 35 | echo " -h, --help Show this help message" 36 | echo " -s, --server Server URL (default: http://localhost:50060/v1)" 37 | echo " -m, --model Model to use (default: tiny)" 38 | echo " -l, --language Source language code (e.g., es, ja, fr)" 39 | echo " -p, --prompt Text to guide translation (should be in English)" 40 | echo " -f, --response-format Response format: json, verbose_json (default: verbose_json)" 41 | echo " -t, --temperature Sampling temperature 0.0-1.0 (default: 0.0)" 42 | echo " --verbose Show verbose curl output" 43 | echo "" 44 | echo "Examples:" 45 | echo " $0 audio.wav" 46 | echo " $0 audio.wav --language es --response-format json" 47 | echo " $0 audio.wav --language ja --prompt \"This is a formal conversation\"" 48 | echo "" 49 | } 50 | 51 | # Parse command line arguments 52 | AUDIO_FILE="" 53 | while [[ $# -gt 0 ]]; do 54 | case $1 in 55 | -h|--help) 56 | show_help 57 | exit 0 58 | ;; 59 | -s|--server) 60 | SERVER_URL="$2" 61 | shift 2 62 | ;; 63 | -m|--model) 64 | MODEL="$2" 65 | shift 2 66 | ;; 67 | -l|--language) 68 | LANGUAGE="$2" 69 | shift 2 70 | ;; 71 | -p|--prompt) 72 | PROMPT="$2" 73 | shift 2 74 | ;; 75 | -f|--response-format) 76 | RESPONSE_FORMAT="$2" 77 | shift 2 78 | ;; 79 | -t|--temperature) 80 | TEMPERATURE="$2" 81 | shift 2 82 | ;; 83 | 84 | --verbose) 85 | VERBOSE="true" 86 | shift 87 | ;; 88 | -*) 89 | echo -e "${RED}Error: Unknown option $1${NC}" 90 | show_help 91 | exit 1 92 | ;; 93 | *) 94 | if [[ -z "$AUDIO_FILE" ]]; then 95 | AUDIO_FILE="$1" 96 | else 97 | echo -e "${RED}Error: Multiple audio files specified${NC}" 98 | exit 1 99 | fi 100 | shift 101 | ;; 102 | esac 103 | done 104 | 105 | # Check if audio file is provided 106 | if [[ -z "$AUDIO_FILE" ]]; then 107 | echo -e "${RED}Error: Audio file is required${NC}" 108 | show_help 109 | exit 1 110 | fi 111 | 112 | # Check if audio file exists 113 | if [[ ! 
-f "$AUDIO_FILE" ]]; then 114 | echo -e "${RED}Error: Audio file '$AUDIO_FILE' not found${NC}" 115 | exit 1 116 | fi 117 | 118 | # Build curl command 119 | CURL_CMD="curl -X POST \"$SERVER_URL/audio/translations\"" 120 | CURL_CMD="$CURL_CMD -H \"Content-Type: multipart/form-data\"" 121 | CURL_CMD="$CURL_CMD -F \"file=@$AUDIO_FILE\"" 122 | CURL_CMD="$CURL_CMD -F \"model=$MODEL\"" 123 | 124 | if [[ -n "$LANGUAGE" ]]; then 125 | CURL_CMD="$CURL_CMD -F \"language=$LANGUAGE\"" 126 | fi 127 | 128 | if [[ -n "$PROMPT" ]]; then 129 | CURL_CMD="$CURL_CMD -F \"prompt=$PROMPT\"" 130 | fi 131 | 132 | CURL_CMD="$CURL_CMD -F \"response_format=$RESPONSE_FORMAT\"" 133 | CURL_CMD="$CURL_CMD -F \"temperature=$TEMPERATURE\"" 134 | 135 | # Add output flags based on verbose setting 136 | if [[ "$VERBOSE" == "true" ]]; then 137 | CURL_CMD="$CURL_CMD -v" 138 | else 139 | CURL_CMD="$CURL_CMD -s -S" 140 | fi 141 | 142 | echo -e "${BLUE}🚀 Starting translation...${NC}" 143 | echo -e "${YELLOW}📁 Audio file:${NC} $AUDIO_FILE" 144 | echo -e "${YELLOW}🌐 Server:${NC} $SERVER_URL" 145 | echo -e "${YELLOW}🤖 Model:${NC} $MODEL" 146 | echo -e "${YELLOW}📝 Response format:${NC} $RESPONSE_FORMAT" 147 | echo -e "${YELLOW}🌡️ Temperature:${NC} $TEMPERATURE" 148 | if [[ -n "$LANGUAGE" ]]; then 149 | echo -e "${YELLOW}🌍 Source language:${NC} $LANGUAGE" 150 | fi 151 | if [[ -n "$PROMPT" ]]; then 152 | echo -e "${YELLOW}💡 Prompt:${NC} $PROMPT" 153 | fi 154 | echo "" 155 | 156 | # Execute curl command 157 | echo -e "${BLUE}📡 Sending request...${NC}" 158 | echo "" 159 | eval $CURL_CMD 160 | 161 | echo "" 162 | echo -e "${GREEN}✅ Translation complete!${NC}" 163 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Utilities/Logging.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import OSLog 5 | 6 | open class Logging { 7 | public static let shared = Logging() 8 | public var logLevel: LogLevel = .none 9 | 10 | public typealias LoggingCallback = (_ message: String) -> Void 11 | public var loggingCallback: LoggingCallback? 12 | 13 | private let logger = OSLog(subsystem: Bundle.main.bundleIdentifier ?? 
"com.argmax.whisperkit", category: "WhisperKit") 14 | 15 | @frozen 16 | public enum LogLevel: Int { 17 | case debug = 1 18 | case info = 2 19 | case error = 3 20 | case none = 4 21 | 22 | func shouldLog(level: LogLevel) -> Bool { 23 | return self.rawValue <= level.rawValue 24 | } 25 | } 26 | 27 | private init() {} 28 | 29 | public func log(_ items: Any..., separator: String = " ", terminator: String = "\n", type: OSLogType) { 30 | let message = items.map { "\($0)" }.joined(separator: separator) 31 | if let logger = loggingCallback { 32 | logger(message) 33 | } else { 34 | os_log("%{public}@", log: logger, type: type, message) 35 | } 36 | } 37 | 38 | public static func debug(_ items: Any..., separator: String = " ", terminator: String = "\n") { 39 | if shared.logLevel.shouldLog(level: .debug) { 40 | shared.log(items, separator: separator, terminator: terminator, type: .debug) 41 | } 42 | } 43 | 44 | public static func info(_ items: Any..., separator: String = " ", terminator: String = "\n") { 45 | if shared.logLevel.shouldLog(level: .info) { 46 | shared.log(items, separator: separator, terminator: terminator, type: .info) 47 | } 48 | } 49 | 50 | public static func error(_ items: Any..., separator: String = " ", terminator: String = "\n") { 51 | if shared.logLevel.shouldLog(level: .error) { 52 | shared.log(items, separator: separator, terminator: terminator, type: .error) 53 | } 54 | } 55 | } 56 | 57 | public extension Logging { 58 | static func logCurrentMemoryUsage(_ message: String) { 59 | let memoryUsage = getMemoryUsage() 60 | Logging.debug("\(message) - Memory usage: \(memoryUsage) MB") 61 | } 62 | 63 | static func getMemoryUsage() -> UInt64 { 64 | var info = mach_task_basic_info() 65 | var count = mach_msg_type_number_t(MemoryLayout.size) / 4 66 | 67 | let kerr: kern_return_t = withUnsafeMutablePointer(to: &info) { 68 | $0.withMemoryRebound(to: integer_t.self, capacity: 1) { 69 | task_info(mach_task_self_, task_flavor_t(MACH_TASK_BASIC_INFO), $0, &count) 70 | } 71 | } 72 | 73 | guard kerr == KERN_SUCCESS else { 74 | return 0 // If the call fails, return 0 75 | } 76 | 77 | return info.resident_size / 1024 / 1024 // Convert to MB 78 | } 79 | } 80 | 81 | @available(*, deprecated, message: "Subject to removal in a future version. Use `Logging.logCurrentMemoryUsage(_:)` instead.") 82 | public func logCurrentMemoryUsage(_ message: String) { 83 | Logging.logCurrentMemoryUsage(message) 84 | } 85 | 86 | @available(*, deprecated, message: "Subject to removal in a future version. 
Use `Logging.getMemoryUsage()` instead.") 87 | public func getMemoryUsage() -> UInt64 { 88 | return Logging.getMemoryUsage() 89 | } 90 | 91 | extension Logging { 92 | enum AudioEncoding { 93 | static let logger = Logger( 94 | subsystem: Constants.Logging.subsystem, 95 | category: "AudioEncoding" 96 | ) 97 | static let signposter = OSSignposter(logger: logger) 98 | } 99 | 100 | enum FeatureExtractor { 101 | static let logger = Logger( 102 | subsystem: Constants.Logging.subsystem, 103 | category: "FeatureExtractor" 104 | ) 105 | static let signposter = OSSignposter(logger: logger) 106 | } 107 | 108 | enum TranscribeTask { 109 | static let logger = Logger( 110 | subsystem: Constants.Logging.subsystem, 111 | category: "TranscribeTask" 112 | ) 113 | static let signposter = OSSignposter(logger: logger) 114 | } 115 | 116 | static func beginSignpost( 117 | _ intervalName: StaticString, 118 | signposter: OSSignposter 119 | ) -> OSSignpostIntervalState { 120 | let signpostId = signposter.makeSignpostID() 121 | return signposter.beginInterval(intervalName, id: signpostId) 122 | } 123 | 124 | static func endSignpost( 125 | _ intervalName: StaticString, 126 | interval: OSSignpostIntervalState, 127 | signposter: OSSignposter 128 | ) { 129 | signposter.endInterval(intervalName, interval) 130 | } 131 | 132 | static func formatTimestamp(_ timestamp: Float) -> String { 133 | return String(format: "%.2f", timestamp) 134 | } 135 | 136 | static func formatTimeWithPercentage(_ time: Double, _ runs: Double, _ fullPipelineDuration: Double) -> String { 137 | let percentage = (time * 1000 / fullPipelineDuration) * 100 // Convert to percentage 138 | let runTime = runs > 0 ? time * 1000 / Double(runs) : 0 139 | let formattedString = String(format: "%8.2f ms / %6.0f runs (%8.2f ms/run) %5.2f%%", time * 1000, runs, runTime, percentage) 140 | return formattedString 141 | } 142 | } 143 | 144 | -------------------------------------------------------------------------------- /Examples/ServeCLIClient/Curl/transcribe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright © 2025 Argmax, Inc. All rights reserved. 4 | # For licensing see accompanying LICENSE.md file. 
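# Quick-start sketch (an illustrative example, not part of the original script;
# it assumes the WhisperKit server is already running on the default port,
# e.g. started with `whisperkit-cli serve --model tiny`):
#
#   ./transcribe.sh jfk.wav --model tiny --language en --stream true
#
# Each flag below maps onto a multipart form field sent to the server's
# /v1/audio/transcriptions endpoint.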
5 | 6 | # WhisperKit Local Server Transcription Client 7 | # Usage: ./transcribe.sh <audio-file> [options] 8 | 9 | set -e 10 | 11 | # Default values 12 | AUDIO_FILE="" 13 | MODEL="tiny" 14 | LANGUAGE="" 15 | PROMPT="" 16 | RESPONSE_FORMAT="verbose_json" 17 | TIMESTAMP_GRANULARITIES="segment" 18 | TEMPERATURE="0.0" 19 | STREAM="false" 20 | VERBOSE="false" 21 | LOGPROBS="false" 22 | 23 | # Parse command line arguments 24 | while [[ $# -gt 0 ]]; do 25 | case $1 in 26 | --model) 27 | MODEL="$2" 28 | shift 2 29 | ;; 30 | --language) 31 | LANGUAGE="$2" 32 | shift 2 33 | ;; 34 | --prompt) 35 | PROMPT="$2" 36 | shift 2 37 | ;; 38 | --response-format) 39 | RESPONSE_FORMAT="$2" 40 | shift 2 41 | ;; 42 | --timestamp-granularities) 43 | TIMESTAMP_GRANULARITIES="$2" 44 | shift 2 45 | ;; 46 | --temperature) 47 | TEMPERATURE="$2" 48 | shift 2 49 | ;; 50 | --stream) 51 | STREAM="$2" 52 | shift 2 53 | ;; 54 | --logprobs) 55 | LOGPROBS="true" 56 | shift 57 | ;; 58 | --verbose) 59 | VERBOSE="true" 60 | shift 61 | ;; 62 | -h|--help) 63 | echo "Usage: $0 <audio-file> [options]" 64 | echo "" 65 | echo "Options:" 66 | echo " --model Model to use (default: tiny)" 67 | echo " --language Language code (e.g., en, es, fr)" 68 | echo " --prompt Prompt text for transcription" 69 | echo " --response-format Response format: json, verbose_json (default: verbose_json)" 70 | echo " --timestamp-granularities Comma-separated list: word,segment (default: segment)" 71 | echo " --temperature Temperature for sampling (default: 0.0)" 72 | echo " --stream Enable streaming (default: false)" 73 | echo " --logprobs Include logprobs in transcription (default: false)" 74 | echo " --verbose Show verbose output" 75 | echo " -h, --help Show this help message" 76 | echo "" 77 | echo "Examples:" 78 | echo " $0 audio.wav" 79 | echo " $0 audio.wav --model base --language en" 80 | echo " $0 audio.wav --timestamp-granularities word,segment --stream true" 81 | exit 0 82 | ;; 83 | *) 84 | if [[ -z "$AUDIO_FILE" ]]; then 85 | AUDIO_FILE="$1" 86 | else 87 | echo "Error: Unknown option $1" 88 | exit 1 89 | fi 90 | shift 91 | ;; 92 | esac 93 | done 94 | 95 | # Check if audio file is provided 96 | if [[ -z "$AUDIO_FILE" ]]; then 97 | echo "Error: Audio file is required" 98 | echo "Usage: $0 <audio-file> [options]" 99 | exit 1 100 | fi 101 | 102 | # Check if audio file exists 103 | if [[ ! 
-f "$AUDIO_FILE" ]]; then 104 | echo "Error: Audio file '$AUDIO_FILE' not found" 105 | exit 1 106 | fi 107 | 108 | # Build curl command 109 | CURL_CMD="curl -s -S" 110 | 111 | # Add verbose flag if requested 112 | if [[ "$VERBOSE" == "true" ]]; then 113 | CURL_CMD="$CURL_CMD -v" 114 | fi 115 | 116 | CURL_CMD="$CURL_CMD -X POST http://localhost:50060/v1/audio/transcriptions" 117 | 118 | # Build multipart form data 119 | CURL_CMD="$CURL_CMD -F file=@\"$AUDIO_FILE\"" 120 | CURL_CMD="$CURL_CMD -F model=\"$MODEL\"" 121 | CURL_CMD="$CURL_CMD -F response_format=\"$RESPONSE_FORMAT\"" 122 | CURL_CMD="$CURL_CMD -F timestamp_granularities[]=\"$TIMESTAMP_GRANULARITIES\"" 123 | CURL_CMD="$CURL_CMD -F temperature=\"$TEMPERATURE\"" 124 | CURL_CMD="$CURL_CMD -F stream=\"$STREAM\"" 125 | # Add logprobs if specified 126 | if [ "$LOGPROBS" = "true" ]; then 127 | CURL_CMD="$CURL_CMD -F \"include[]=logprobs\"" 128 | fi 129 | 130 | if [[ -n "$LANGUAGE" ]]; then 131 | CURL_CMD="$CURL_CMD -F language=\"$LANGUAGE\"" 132 | fi 133 | 134 | if [[ -n "$PROMPT" ]]; then 135 | CURL_CMD="$CURL_CMD -F prompt=\"$PROMPT\"" 136 | fi 137 | 138 | echo "🔄 Transcribing: $AUDIO_FILE" 139 | echo "📋 Options: model=$MODEL, format=$RESPONSE_FORMAT, granularities=$TIMESTAMP_GRANULARITIES, stream=$STREAM" 140 | echo "" 141 | 142 | # Execute curl command 143 | if [[ "$STREAM" == "true" ]]; then 144 | # For streaming, process line by line with timestamps 145 | echo "📡 Starting streaming transcription..." 146 | echo "⏰ Timestamps show when each piece of data arrives:" 147 | echo "" 148 | 149 | # Use a function to add timestamps to each line 150 | timestamp_stream() { 151 | while IFS= read -r line; do 152 | if [[ -n "$line" ]]; then 153 | timestamp=$(date '+%H:%M:%S.%3N') 154 | echo "[$timestamp] $line" 155 | fi 156 | done 157 | } 158 | 159 | eval "$CURL_CMD" | timestamp_stream 160 | else 161 | # For non-streaming, just execute normally 162 | eval "$CURL_CMD" 163 | fi 164 | 165 | echo "" 166 | echo "✅ Transcription complete" 167 | -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/config-v02.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "whisperkit-coreml", 3 | "version": "0.2", 4 | "device_support": [ 5 | { 6 | "identifiers": ["iPhone11", "iPhone12", "Watch7", "Watch8"], 7 | "models": { 8 | "default": "openai_whisper-tiny", 9 | "supported": [ 10 | "openai_whisper-tiny", 11 | "openai_whisper-tiny.en", 12 | "openai_whisper-base", 13 | "openai_whisper-base.en" 14 | ] 15 | } 16 | }, 17 | { 18 | "identifiers": ["iPhone13", "iPad13,18", "iPad13,1"], 19 | "models": { 20 | "default": "openai_whisper-base", 21 | "supported": [ 22 | "openai_whisper-tiny", 23 | "openai_whisper-tiny.en", 24 | "openai_whisper-base", 25 | "openai_whisper-base.en", 26 | "openai_whisper-small", 27 | "openai_whisper-small.en" 28 | ] 29 | } 30 | }, 31 | { 32 | "identifiers": [ 33 | "iPhone14", 34 | "iPhone15", 35 | "iPhone16", 36 | "iPhone17", 37 | "iPad14,1", 38 | "iPad14,2" 39 | ], 40 | "models": { 41 | "default": "openai_whisper-base", 42 | "supported": [ 43 | "openai_whisper-tiny", 44 | "openai_whisper-tiny.en", 45 | "openai_whisper-base", 46 | "openai_whisper-base.en", 47 | "openai_whisper-small", 48 | "openai_whisper-small.en", 49 | "openai_whisper-large-v2_949MB", 50 | "openai_whisper-large-v2_turbo_955MB", 51 | "openai_whisper-large-v3_947MB", 52 | "openai_whisper-large-v3_turbo_954MB", 53 | "distil-whisper_distil-large-v3_594MB", 54 | 
"distil-whisper_distil-large-v3_turbo_600MB", 55 | "openai_whisper-large-v3-v20240930_626MB", 56 | "openai_whisper-large-v3-v20240930_turbo_632MB" 57 | ] 58 | } 59 | }, 60 | { 61 | "identifiers": [ 62 | "Mac13", 63 | "iMac21", 64 | "MacBookAir10,1", 65 | "MacBookPro17", 66 | "MacBookPro18", 67 | "Macmini9", 68 | "iPad13,16", 69 | "iPad13,4", 70 | "iPad13,8" 71 | ], 72 | "models": { 73 | "default": "openai_whisper-large-v3-v20240930", 74 | "supported": [ 75 | "openai_whisper-tiny", 76 | "openai_whisper-tiny.en", 77 | "openai_whisper-base", 78 | "openai_whisper-base.en", 79 | "openai_whisper-small", 80 | "openai_whisper-small.en", 81 | "openai_whisper-large-v2", 82 | "openai_whisper-large-v2_949MB", 83 | "openai_whisper-large-v3", 84 | "openai_whisper-large-v3_947MB", 85 | "distil-whisper_distil-large-v3", 86 | "distil-whisper_distil-large-v3_594MB", 87 | "openai_whisper-large-v3-v20240930", 88 | "openai_whisper-large-v3-v20240930_626MB" 89 | ] 90 | } 91 | }, 92 | { 93 | "identifiers": [ 94 | "Mac14", 95 | "Mac15", 96 | "Mac16", 97 | "iPad14,3", 98 | "iPad14,4", 99 | "iPad14,5", 100 | "iPad14,6", 101 | "iPad14,8", 102 | "iPad14,9", 103 | "iPad14,10", 104 | "iPad14,11", 105 | "iPad16" 106 | ], 107 | "models": { 108 | "default": "openai_whisper-large-v3-v20240930", 109 | "supported": [ 110 | "openai_whisper-tiny", 111 | "openai_whisper-tiny.en", 112 | "openai_whisper-base", 113 | "openai_whisper-base.en", 114 | "openai_whisper-small", 115 | "openai_whisper-small.en", 116 | "openai_whisper-large-v2", 117 | "openai_whisper-large-v2_949MB", 118 | "openai_whisper-large-v2_turbo", 119 | "openai_whisper-large-v2_turbo_955MB", 120 | "openai_whisper-large-v3", 121 | "openai_whisper-large-v3_947MB", 122 | "openai_whisper-large-v3_turbo", 123 | "openai_whisper-large-v3_turbo_954MB", 124 | "distil-whisper_distil-large-v3", 125 | "distil-whisper_distil-large-v3_594MB", 126 | "distil-whisper_distil-large-v3_turbo", 127 | "distil-whisper_distil-large-v3_turbo_600MB", 128 | "openai_whisper-large-v3-v20240930", 129 | "openai_whisper-large-v3-v20240930_turbo", 130 | "openai_whisper-large-v3-v20240930_626MB", 131 | "openai_whisper-large-v3-v20240930_turbo_632MB" 132 | ] 133 | } 134 | } 135 | ] 136 | } 137 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/Audio/AudioChunker.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Accelerate 5 | import AVFoundation 6 | import Foundation 7 | 8 | /// Responsible for chunking audio into smaller pieces 9 | public protocol AudioChunking { 10 | func chunkAll(audioArray: [Float], maxChunkLength: Int, decodeOptions: DecodingOptions?) 
async throws -> [AudioChunk] 11 | } 12 | 13 | public extension AudioChunking { 14 | func updateSeekOffsetsForResults( 15 | chunkedResults: [Result<[TranscriptionResult], Swift.Error>], 16 | audioChunks: [AudioChunk] 17 | ) -> [TranscriptionResult] { 18 | var updatedTranscriptionResults = [TranscriptionResult]() 19 | for (index, chunkedResult) in chunkedResults.enumerated() { 20 | switch chunkedResult { 21 | case let .success(results): 22 | let seekTime = Float(audioChunks[index].seekOffsetIndex) / Float(WhisperKit.sampleRate) 23 | for result in results { 24 | var updatedSegments = [TranscriptionSegment]() 25 | for segment in result.segments { 26 | let updatedSegment = TranscriptionUtilities.updateSegmentTimings(segment: segment, seekTime: seekTime) 27 | updatedSegments.append(updatedSegment) 28 | } 29 | var updatedResult = result 30 | updatedResult.seekTime = seekTime 31 | updatedResult.segments = updatedSegments 32 | updatedTranscriptionResults.append(updatedResult) 33 | } 34 | case let .failure(error): 35 | Logging.debug("Error transcribing chunk \(index): \(error)") 36 | } 37 | } 38 | return updatedTranscriptionResults 39 | } 40 | } 41 | 42 | /// A audio chunker that splits audio into smaller pieces based on voice activity detection 43 | open class VADAudioChunker: AudioChunking { 44 | /// prevent hallucinations at the end of the clip by stopping up to 1.0s early 45 | private let windowPadding: Int 46 | private let vad: VoiceActivityDetector 47 | 48 | public init(windowPadding: Int = 16000, vad: VoiceActivityDetector? = nil) { 49 | self.windowPadding = windowPadding 50 | self.vad = vad ?? EnergyVAD() 51 | } 52 | 53 | private func splitOnMiddleOfLongestSilence(audioArray: [Float], startIndex: Int, endIndex: Int) -> Int { 54 | // NOTE: we want to check just the 2nd part for the silence to attempt to get closest to a max length chunk 55 | let audioMidIndex = startIndex + (endIndex - startIndex) / 2 56 | let vadAudioSlice = Array(audioArray[audioMidIndex.. [AudioChunk] { 67 | // If the audio array length is less than or equal to maxLength, return it as a single chunk 68 | if audioArray.count <= maxChunkLength { 69 | return [AudioChunk(seekOffsetIndex: 0, audioSamples: audioArray)] 70 | } 71 | 72 | // First create chunks from seek clips 73 | let options = decodeOptions ?? 
DecodingOptions() 74 | let seekClips = options.prepareSeekClips(contentFrames: audioArray.count) 75 | 76 | var chunkedAudio = [AudioChunk]() 77 | for (seekClipStart, seekClipEnd) in seekClips { 78 | // Loop through the current clip until we reach the end 79 | // Typically this will be the full audio file, unless seek points are explicitly provided 80 | var startIndex = seekClipStart 81 | while startIndex < seekClipEnd - windowPadding { 82 | guard startIndex >= 0 && startIndex < audioArray.count else { 83 | throw WhisperError.audioProcessingFailed("startIndex is outside the buffer size") 84 | } 85 | 86 | // Make sure we still need chunking for this seek clip, otherwise use the original seek clip end 87 | var endIndex = seekClipEnd 88 | if startIndex + maxChunkLength < endIndex { 89 | // Adjust the end index based on VAD 90 | endIndex = splitOnMiddleOfLongestSilence( 91 | audioArray: audioArray, 92 | startIndex: startIndex, 93 | endIndex: min(audioArray.count, startIndex + maxChunkLength) 94 | ) 95 | } 96 | 97 | guard endIndex > startIndex else { 98 | break 99 | } 100 | Logging.debug("Found chunk from \(Logging.formatTimestamp(Float(startIndex) / Float(WhisperKit.sampleRate))) to \(Logging.formatTimestamp(Float(endIndex) / Float(WhisperKit.sampleRate)))") 101 | let audioSlice = AudioChunk(seekOffsetIndex: startIndex, audioSamples: Array(audioArray[startIndex.. [!IMPORTANT] 41 | > An active developer account is required to run the tests on physical devices. 42 | 43 | Before running tests, all external devices need to be connected and paired to your Mac, as well as registered with your developer account. Ensure the devices are in Developer Mode. If nothing appears after connecting the devices via cable, press `Command + Shift + 2` to open the list of devices and track their progress. 44 | 45 | ## Datasets 46 | 47 | The datasets for the test suite can be set in a global array called `datasets` in the file [`Tests/WhisperKitTests/RegressionTests.swift`](Tests/WhisperKitTests/RegressionTests.swift). It is prefilled with the datasets that are currently available. 48 | 49 | ## Models 50 | 51 | The models for the test suite can be set in the [`Fastfile`](fastlane/Fastfile). Simply find `BENCHMARK_CONFIGS` and modify the `models` array under the benchmark you want to run. 52 | 53 | ## Makefile and Fastlane 54 | 55 | The tests are run using [Fastlane](fastlane/Fastfile), which is controlled by a [Makefile](Makefile). The Makefile contains the following commands: 56 | 57 | ### List Connected Devices 58 | 59 | Before running the tests it might be a good idea to list the connected devices to resolve any connection issues. Simply run: 60 | 61 | ```sh 62 | make list-devices 63 | ``` 64 | 65 | The output will be a list with entries that look something like this: 66 | 67 | ```ruby 68 | { 69 | :name=>"My Mac", 70 | :type=>"Apple M2 Pro", 71 | :platform=>"macOS", 72 | :os_version=>"15.0.1", 73 | :product=>"Mac14,12", 74 | :id=>"XXXXXXXX-1234-5678-9012-XXXXXXXXXXXX", 75 | :state=>"connected" 76 | } 77 | ``` 78 | 79 | Verify that the devices are connected and the state is `connected`. 80 | 81 | ### Running Benchmarks 82 | 83 | After completing the above steps, you can run the tests. Note that there are two different test configurations: one named `full` and the other named `debug`. 
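The `debug` configuration runs a small test matrix, making it useful as a quick smoke test before committing to a lengthy `full` run.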
To check for potential errors, run the `debug` tests: 84 | 85 | ```sh 86 | make benchmark-devices DEBUG=true 87 | ``` 88 | 89 | Otherwise run the `full` tests: 90 | 91 | ```sh 92 | make benchmark-devices 93 | ``` 94 | 95 | Optionally, for both tests, you can specify the list of devices for the tests using the `DEVICES` option: 96 | 97 | ```sh 98 | make benchmark-devices DEVICES="iPhone 15 Pro Max,My Mac" 99 | ``` 100 | 101 | The `DEVICES` option is a comma-separated list of device names. The device names can be found by running `make list-devices` and using the value for the `:name` key. 102 | 103 | ### Results 104 | 105 | After the tests are run, the generated results can be found under `fastlane/benchmark_data`, including the `.xcresult` file with logs and attachments for each device. There will also be a folder called `fastlane/upload_folder/benchmark_data` that contains only the JSON results from `fastlane/benchmark_data`, which can be used for further analysis. 106 | 107 | We will periodically run these tests on a range of devices and upload the results to the [argmaxinc/whisperkit-evals-dataset](https://huggingface.co/datasets/argmaxinc/whisperkit-evals-dataset), which will propagate to the [WhisperKit Benchmarks](https://huggingface.co/spaces/argmaxinc/whisperkit-benchmarks) space and be available for comparison. 108 | 109 | 110 | # Troubleshooting 111 | 112 | 113 | If you encounter issues while running the tests, here are a few things to try: 114 | 115 | 1. Open the project in Xcode and run the tests directly from there. 116 | 1. To do this, open the example app (from command line type: `xed Examples/WhisperAX`) and run the test named `RegressionTests/testModelPerformanceWithDebugConfig` from the test navigator. 117 | 2. If the tests run successfully, you can rule out any issues with the device or the models. 118 | 3. If they don't run successfully, Xcode will provide more detailed error messages. 119 | 2. Try specifying a single device to run the tests on. This can be done by running `make list-devices` and then running the tests with the `DEVICES` option set to the name of the device you want to test on. For example, `make benchmark-devices DEVICES="My Mac"`. This will also enable you to see the logs for that specific device. 120 | 3. If you are still encountering issues, please reach out to us on the [Discord](https://discord.gg/G5F5GZGecC) or create an [issue](https://github.com/argmaxinc/WhisperKit/issues) on GitHub. 121 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Utilities/ResultWriter.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Foundation 5 | 6 | public protocol ResultWriting { 7 | var outputDir: String { get } 8 | func write(result: TranscriptionResult, to file: String, options: [String: Any]?) 
-> Result<String, Swift.Error> 9 | func formatTime(seconds: Float, alwaysIncludeHours: Bool, decimalMarker: String) -> String 10 | } 11 | 12 | public extension ResultWriting { 13 | /// Format a time value as a string 14 | func formatTime(seconds: Float, alwaysIncludeHours: Bool, decimalMarker: String) -> String { 15 | let hrs = Int(seconds / 3600) 16 | let mins = Int((seconds.truncatingRemainder(dividingBy: 3600)) / 60) 17 | let secs = Int(seconds.truncatingRemainder(dividingBy: 60)) 18 | let msec = Int((seconds - floor(seconds)) * 1000) 19 | 20 | if alwaysIncludeHours || hrs > 0 { 21 | return String(format: "%02d:%02d:%02d\(decimalMarker)%03d", hrs, mins, secs, msec) 22 | } else { 23 | return String(format: "%02d:%02d\(decimalMarker)%03d", mins, secs, msec) 24 | } 25 | } 26 | 27 | func formatSegment(index: Int, start: Float, end: Float, text: String) -> String { 28 | let startFormatted = formatTime(seconds: Float(start), alwaysIncludeHours: true, decimalMarker: ",") 29 | let endFormatted = formatTime(seconds: Float(end), alwaysIncludeHours: true, decimalMarker: ",") 30 | return "\(index)\n\(startFormatted) --> \(endFormatted)\n\(text)\n\n" 31 | } 32 | 33 | func formatTiming(start: Float, end: Float, text: String) -> String { 34 | let startFormatted = formatTime(seconds: Float(start), alwaysIncludeHours: false, decimalMarker: ".") 35 | let endFormatted = formatTime(seconds: Float(end), alwaysIncludeHours: false, decimalMarker: ".") 36 | return "\(startFormatted) --> \(endFormatted)\n\(text)\n\n" 37 | } 38 | } 39 | 40 | open class WriteJSON: ResultWriting { 41 | public let outputDir: String 42 | 43 | public init(outputDir: String) { 44 | self.outputDir = outputDir 45 | } 46 | 47 | /// Write a transcription result to a JSON file 48 | /// - Parameters: 49 | /// - result: Completed transcription result 50 | /// - file: Name of the file to write, without the extension 51 | /// - options: Not used 52 | /// - Returns: The URL of the written file, or an error if the write failed 53 | public func write(result: TranscriptionResult, to file: String, options: [String: Any]? = nil) -> Result<String, Swift.Error> { 54 | let reportPathURL = URL(fileURLWithPath: outputDir) 55 | let reportURL = reportPathURL.appendingPathComponent("\(file).json") 56 | let jsonEncoder = JSONEncoder() 57 | jsonEncoder.outputFormatting = .prettyPrinted 58 | do { 59 | let reportJson = try jsonEncoder.encode(result) 60 | try reportJson.write(to: reportURL) 61 | } catch { 62 | return .failure(error) 63 | } 64 | 65 | return .success(reportURL.absoluteString) 66 | } 67 | } 68 | 69 | open class WriteSRT: ResultWriting { 70 | public let outputDir: String 71 | 72 | public init(outputDir: String) { 73 | self.outputDir = outputDir 74 | } 75 | 76 | public func write(result: TranscriptionResult, to file: String, options: [String: Any]? 
= nil) -> Result<String, Swift.Error> { 77 | let outputPathURL = URL(fileURLWithPath: outputDir) 78 | let outputFileURL = outputPathURL.appendingPathComponent("\(file).srt") 79 | 80 | do { 81 | var srtContent = "" 82 | var index = 1 83 | for segment in result.segments { 84 | if let wordTimings = segment.words, !wordTimings.isEmpty { 85 | for wordTiming in wordTimings { 86 | srtContent += formatSegment(index: index, start: wordTiming.start, end: wordTiming.end, text: wordTiming.word) 87 | index += 1 88 | } 89 | } else { 90 | // Use segment timing if word timings are not available 91 | srtContent += formatSegment(index: index, start: segment.start, end: segment.end, text: segment.text) 92 | index += 1 93 | } 94 | } 95 | 96 | try srtContent.write(to: outputFileURL, atomically: true, encoding: .utf8) 97 | return .success(outputFileURL.absoluteString) 98 | } catch { 99 | return .failure(error) 100 | } 101 | } 102 | } 103 | 104 | open class WriteVTT: ResultWriting { 105 | public let outputDir: String 106 | 107 | public init(outputDir: String) { 108 | self.outputDir = outputDir 109 | } 110 | 111 | public func write(result: TranscriptionResult, to file: String, options: [String: Any]? = nil) -> Result<String, Swift.Error> { 112 | let outputPathURL = URL(fileURLWithPath: outputDir) 113 | let outputFileURL = outputPathURL.appendingPathComponent("\(file).vtt") 114 | 115 | do { 116 | var vttContent = "WEBVTT\n\n" 117 | for segment in result.segments { 118 | if let wordTimings = segment.words, !wordTimings.isEmpty { 119 | for wordTiming in wordTimings { 120 | vttContent += formatTiming(start: wordTiming.start, end: wordTiming.end, text: wordTiming.word) 121 | } 122 | } else { 123 | // Use segment timing if word timings are not available 124 | vttContent += formatTiming(start: segment.start, end: segment.end, text: segment.text) 125 | } 126 | } 127 | 128 | try vttContent.write(to: outputFileURL, atomically: true, encoding: .utf8) 129 | return .success(outputFileURL.absoluteString) 130 | } catch { 131 | return .failure(error) 132 | } 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /Examples/ServeCLIClient/Python/test_translate.py: -------------------------------------------------------------------------------- 1 | # Copyright © 2025 Argmax, Inc. All rights reserved. 2 | # For licensing see accompanying LICENSE.md file. 3 | 4 | """ 5 | Test translation with audio files from Tests/WhisperKitTests/Resources/ 6 | 7 | This script tests translation functionality using the actual test audio files 8 | from the WhisperKit test suite. 
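Example (hypothetical invocation; assumes the WhisperKit server is already
running, e.g. started with `whisperkit-cli serve --model tiny`):

    python test_translate.py --file es_test_clip.wav --prompt "Formal speech"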
9 | """ 10 | 11 | import os 12 | import sys 13 | import argparse 14 | from pathlib import Path 15 | from openai import OpenAI 16 | 17 | 18 | def get_test_audio_files(): 19 | """ 20 | Get list of available test audio files from Tests/WhisperKitTests/Resources/ 21 | 22 | Returns: 23 | List of audio file paths 24 | """ 25 | # Path to test resources relative to project root 26 | resources_dir = Path(__file__).parent.parent.parent.parent / "Tests" / "WhisperKitTests" / "Resources" 27 | 28 | if not resources_dir.exists(): 29 | print(f"Error: Test resources directory not found: {resources_dir}") 30 | return [] 31 | 32 | audio_extensions = {'.wav', '.m4a', '.mp3', '.flac', '.aac'} 33 | 34 | audio_files = [] 35 | for file_path in resources_dir.iterdir(): 36 | if file_path.is_file() and file_path.suffix.lower() in audio_extensions: 37 | audio_files.append(file_path) 38 | 39 | return sorted(audio_files) 40 | 41 | 42 | def translate_test_file(client, audio_file_path, prompt=None): 43 | """ 44 | Translate a test audio file using the local WhisperKit server. 45 | 46 | Args: 47 | client: OpenAI client instance 48 | audio_file_path: Path to the audio file 49 | prompt: Optional prompt to guide translation 50 | 51 | Returns: 52 | Translation result or None if failed 53 | """ 54 | try: 55 | print(f"Translating: {audio_file_path.name}") 56 | 57 | with open(audio_file_path, "rb") as audio_file: 58 | response = client.audio.translations.create( 59 | model="tiny", 60 | file=audio_file, 61 | prompt=prompt, 62 | response_format="verbose_json" 63 | ) 64 | return response 65 | except Exception as e: 66 | print(f"Error translating {audio_file_path.name}: {e}") 67 | return None 68 | 69 | 70 | def main(): 71 | parser = argparse.ArgumentParser( 72 | description="Test translation with WhisperKit test audio files" 73 | ) 74 | parser.add_argument( 75 | "--prompt", 76 | help="Optional prompt to guide translation" 77 | ) 78 | parser.add_argument( 79 | "--server-url", 80 | default="http://localhost:50060/v1", 81 | help="WhisperKit server URL (default: http://localhost:50060/v1)" 82 | ) 83 | parser.add_argument( 84 | "--file", 85 | help="Specific test file to translate (e.g., 'es_test_clip.wav')" 86 | ) 87 | parser.add_argument( 88 | "--target-language", 89 | default="en", 90 | help="Target language for translation (default: 'en')" 91 | ) 92 | 93 | args = parser.parse_args() 94 | 95 | # Get available test audio files 96 | test_files = get_test_audio_files() 97 | 98 | if not test_files: 99 | print("No test audio files found!") 100 | sys.exit(1) 101 | 102 | print("Available test audio files:") 103 | for i, file_path in enumerate(test_files, 1): 104 | print(f" {i}. 
{file_path.name}") 105 | 106 | # Initialize OpenAI client with local server 107 | client = OpenAI( 108 | base_url=args.server_url, 109 | api_key="dummy-key" 110 | ) 111 | 112 | print(f"\nConnecting to WhisperKit server at: {args.server_url}") 113 | print(f"Target language: {args.target_language}") 114 | 115 | if args.prompt: 116 | print(f"Prompt: {args.prompt}") 117 | 118 | if args.file: 119 | target_file = None 120 | for file_path in test_files: 121 | if file_path.name == args.file: 122 | target_file = file_path 123 | break 124 | 125 | if not target_file: 126 | print(f"Error: Test file '{args.file}' not found") 127 | print("Available files:", [f.name for f in test_files]) 128 | sys.exit(1) 129 | 130 | files_to_process = [target_file] 131 | else: 132 | # Process all files 133 | files_to_process = test_files 134 | 135 | print(f"\nProcessing {len(files_to_process)} file(s)...") 136 | 137 | # Process each file 138 | for i, audio_file in enumerate(files_to_process, 1): 139 | print(f"\n{'='*50}") 140 | print(f"File {i}/{len(files_to_process)}: {audio_file.name}") 141 | print(f"{'='*50}") 142 | 143 | result = translate_test_file( 144 | client, 145 | audio_file, 146 | prompt=args.prompt 147 | ) 148 | 149 | if result: 150 | print(f"\n✓ Translation successful!") 151 | print(f"Translated text: {result.text}") 152 | 153 | if hasattr(result, 'segments') and result.segments: 154 | print(f"\nSegments ({len(result.segments)}):") 155 | for segment in result.segments: 156 | print(f" [{segment.start:.2f}s - {segment.end:.2f}s] {segment.text}") 157 | 158 | if hasattr(result, 'language') and result.language: 159 | print(f"\nSource language: {result.language}") 160 | 161 | # File size info 162 | file_size = audio_file.stat().st_size / 1024 # KB 163 | print(f"\nFile size: {file_size:.1f} KB") 164 | 165 | else: 166 | print(f"✗ Translation failed for {audio_file.name}") 167 | 168 | print(f"\n{'='*50}") 169 | print("Test translation complete!") 170 | print(f"Processed {len(files_to_process)} file(s)") 171 | print(f"All audio translated to: {args.target_language}") 172 | 173 | 174 | if __name__ == "__main__": 175 | main() 176 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: setup setup-huggingface-cli setup-model-repo download-models download-model build build-cli test \ 2 | clean-package-caches list-devices benchmark-connected-devices benchmark-device benchmark-devices \ 3 | extract-xcresult build-local-server generate-server generate-server-spec generate-server-code 4 | 5 | PIP_COMMAND := pip3 6 | PYTHON_COMMAND := python3 7 | 8 | # Define model repository and directories 9 | MODEL_REPO := argmaxinc/whisperkit-coreml 10 | MODEL_REPO_DIR := ./Models/whisperkit-coreml 11 | BASE_COMPILED_DIR := ./Models 12 | 13 | GIT_HASH := $(shell git rev-parse --short HEAD) 14 | 15 | setup: 16 | @echo "Setting up environment..." 17 | @which $(PIP_COMMAND) 18 | @which $(PYTHON_COMMAND) 19 | @echo "Checking for Homebrew..." 20 | @which brew > /dev/null || (echo "Error: Homebrew is not installed. Install it from https://brew.sh and try again" && exit 1) 21 | @echo "Homebrew is installed." 22 | @echo "Checking for huggingface-cli..." 23 | @which huggingface-cli > /dev/null || (echo "Installing huggingface-cli..." && brew install huggingface-cli) 24 | @echo "huggingface-cli is installed." 25 | @echo "Checking for git-lfs..." 26 | @which git-lfs > /dev/null || (echo "Installing git-lfs..." 
&& brew install git-lfs) 27 | @echo "git-lfs is installed." 28 | @echo "Checking for trash..." 29 | @which trash > /dev/null || (echo "Installing trash..." && brew install trash) 30 | @echo "trash is installed." 31 | @echo "Checking for fastlane" 32 | @which fastlane > /dev/null || (echo "Installing fastlane..." && brew install fastlane) 33 | @echo "fastlane is installed." 34 | @$(MAKE) generate-whisperax-xcconfig 35 | @echo "Done 🚀" 36 | 37 | 38 | generate-whisperax-xcconfig: 39 | @echo "Updating DEVELOPMENT_TEAM in Examples/WhisperAX/Debug.xcconfig..." 40 | @TEAM_ID=$$(defaults read com.apple.dt.Xcode IDEProvisioningTeams | plutil -convert json -r -o - -- - | jq -r 'to_entries[0].value | sort_by(.teamType == "Individual") | .[0].teamID' 2>/dev/null); \ 41 | if [ -z "$$TEAM_ID" ]; then \ 42 | echo "Error: No Development Team ID found. Please log into Xcode with your Apple ID and select a team."; \ 43 | else \ 44 | echo "DEVELOPMENT_TEAM=$$TEAM_ID" > Examples/WhisperAX/Debug.xcconfig; \ 45 | echo "DEVELOPMENT_TEAM has been updated in Examples/WhisperAX/Debug.xcconfig with your Development Team ID: $$TEAM_ID"; \ 46 | fi 47 | 48 | 49 | setup-huggingface-cli: 50 | @if huggingface-cli whoami; then \ 51 | echo "Already logged in to Hugging Face."; \ 52 | else \ 53 | echo "Not logged in to Hugging Face."; \ 54 | if [ -z "$$HF_TOKEN" ]; then \ 55 | echo "Environment variable HF_TOKEN is not set. Running normal login."; \ 56 | huggingface-cli login; \ 57 | else \ 58 | echo "Using HF_TOKEN from environment variable."; \ 59 | huggingface-cli login --token $$HF_TOKEN; \ 60 | fi; \ 61 | fi 62 | 63 | 64 | setup-model-repo: 65 | @echo "Setting up repository..." 66 | @mkdir -p $(BASE_COMPILED_DIR) 67 | @if [ -d "$(MODEL_REPO_DIR)/.git" ]; then \ 68 | echo "Repository exists, resetting..."; \ 69 | export GIT_LFS_SKIP_SMUDGE=1; \ 70 | cd $(MODEL_REPO_DIR) && git fetch --all && git reset --hard origin/main && git clean -fdx; \ 71 | else \ 72 | echo "Repository not found, initializing..."; \ 73 | export GIT_LFS_SKIP_SMUDGE=1; \ 74 | git clone https://huggingface.co/$(MODEL_REPO) $(MODEL_REPO_DIR); \ 75 | fi 76 | 77 | 78 | # Download all models 79 | download-models: setup-model-repo 80 | @echo "Downloading all models..." 81 | @cd $(MODEL_REPO_DIR) && \ 82 | git lfs pull 83 | 84 | 85 | # Download a specific model 86 | download-model: 87 | @if [ -z "$(MODEL)" ]; then \ 88 | echo "Error: MODEL is not set. Usage: make download-model MODEL=base"; \ 89 | exit 1; \ 90 | fi 91 | @echo "Downloading model $(MODEL)..." 92 | @$(MAKE) setup-model-repo 93 | @echo "Fetching model $(MODEL)..." 94 | @cd $(MODEL_REPO_DIR) && \ 95 | git lfs pull --include="openai_whisper-$(MODEL)/*" 96 | 97 | build: 98 | @echo "Building WhisperKit..." 99 | @swift build -v 100 | 101 | 102 | build-cli: 103 | @echo "Building WhisperKit CLI..." 104 | @swift build -c release --product whisperkit-cli 105 | 106 | test: 107 | @echo "Running tests..." 
108 | @swift test -v 109 | 110 | 111 | list-devices: 112 | fastlane ios list_devices 113 | 114 | 115 | # Usage: 116 | # make benchmark-devices # Benchmark all connected devices 117 | # make benchmark-devices DEBUG=true # Benchmark all connected devices with small test matrix 118 | # make benchmark-devices DEVICES="iPhone 15 Pro Max,My Mac" # Benchmark specific device names from `make list-devices` 119 | DEVICES ?= 120 | DEBUG ?= false 121 | benchmark-devices: generate-whisperax-xcconfig 122 | @if [ -n "$(DEVICES)" ]; then \ 123 | echo "Benchmarking specific devices: $(DEVICES)"; \ 124 | fastlane benchmark devices:"$(DEVICES)" debug:$(DEBUG); \ 125 | else \ 126 | echo "Benchmarking all connected devices"; \ 127 | fastlane benchmark debug:$(DEBUG); \ 128 | fi 129 | 130 | upload-benchmark-results: 131 | @echo "Uploading benchmark results..." 132 | @fastlane upload_results 133 | 134 | clean-package-caches: 135 | @trash ~/Library/Developer/Xcode/DerivedData/WhisperKit* || true 136 | @swift package purge-cache 137 | @swift package reset 138 | 139 | build-local-server: 140 | @echo "Building WhisperKit CLI with server support..." 141 | @BUILD_ALL=1 swift build -c release --product whisperkit-cli 142 | 143 | generate-server: 144 | @echo "Generating server OpenAPI spec and code..." 145 | @cd scripts && uv run python3 generate_local_server_openapi.py --latest 146 | @echo "" 147 | @echo "==========================================" 148 | @echo "Generating server code from OpenAPI spec..." 149 | @echo "==========================================" 150 | @BUILD_ALL=1 swift run swift-openapi-generator generate scripts/specs/localserver_openapi.yaml \ 151 | --output-directory Sources/WhisperKitCLI/Server/GeneratedSources \ 152 | --mode types \ 153 | --mode server 154 | @echo "" 155 | @echo "==========================================" 156 | @echo "Server generation complete!" 157 | @echo "==========================================" 158 | @echo "Run 'BUILD_ALL=1 swift run whisperkit-cli serve' to start the server" 159 | -------------------------------------------------------------------------------- /Examples/ServeCLIClient/Python/test_transcribe.py: -------------------------------------------------------------------------------- 1 | # Copyright © 2025 Argmax, Inc. All rights reserved. 2 | # For licensing see accompanying LICENSE.md file. 3 | 4 | """ 5 | Test transcription with audio files from Tests/WhisperKitTests/Resources/ 6 | 7 | This script tests transcription functionality using the actual test audio files 8 | from the WhisperKit test suite. 
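Example (hypothetical invocation; assumes the server is reachable at the
default URL used below):

    python test_transcribe.py --file jfk.wav --language en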
9 | """ 10 | 11 | import os 12 | import sys 13 | import argparse 14 | from pathlib import Path 15 | from openai import OpenAI 16 | 17 | 18 | def get_test_audio_files(): 19 | """ 20 | Get list of available test audio files from Tests/WhisperKitTests/Resources/ 21 | 22 | Returns: 23 | List of audio file paths 24 | """ 25 | # Path to test resources relative to project root 26 | resources_dir = Path(__file__).parent.parent.parent.parent / "Tests" / "WhisperKitTests" / "Resources" 27 | 28 | if not resources_dir.exists(): 29 | print(f"Error: Test resources directory not found: {resources_dir}") 30 | return [] 31 | 32 | # Audio file extensions to look for 33 | audio_extensions = {'.wav', '.m4a', '.mp3', '.flac', '.aac'} 34 | 35 | audio_files = [] 36 | for file_path in resources_dir.iterdir(): 37 | if file_path.is_file() and file_path.suffix.lower() in audio_extensions: 38 | audio_files.append(file_path) 39 | 40 | return sorted(audio_files) 41 | 42 | 43 | def transcribe_test_file(client, audio_file_path, language=None, prompt=None): 44 | """ 45 | Transcribe a test audio file using the local WhisperKit server. 46 | 47 | Args: 48 | client: OpenAI client instance 49 | audio_file_path: Path to the audio file 50 | language: Optional language code 51 | prompt: Optional prompt to guide transcription 52 | 53 | Returns: 54 | Transcription result or None if failed 55 | """ 56 | try: 57 | print(f"Transcribing: {audio_file_path.name}") 58 | 59 | with open(audio_file_path, "rb") as audio_file: 60 | response = client.audio.transcriptions.create( 61 | model="tiny", 62 | file=audio_file, 63 | language=language, 64 | prompt=prompt, 65 | response_format="verbose_json" 66 | ) 67 | return response 68 | except Exception as e: 69 | print(f"Error transcribing {audio_file_path.name}: {e}") 70 | return None 71 | 72 | 73 | def main(): 74 | parser = argparse.ArgumentParser( 75 | description="Test transcription with WhisperKit test audio files" 76 | ) 77 | parser.add_argument( 78 | "--language", 79 | help="Language code (e.g., 'en', 'es', 'ja')" 80 | ) 81 | parser.add_argument( 82 | "--prompt", 83 | help="Optional prompt to guide transcription" 84 | ) 85 | parser.add_argument( 86 | "--server-url", 87 | default="http://localhost:50060/v1", 88 | help="WhisperKit server URL (default: http://localhost:50060/v1)" 89 | ) 90 | parser.add_argument( 91 | "--file", 92 | help="Specific test file to transcribe (e.g., 'jfk.wav')" 93 | ) 94 | 95 | args = parser.parse_args() 96 | 97 | # Get available test audio files 98 | test_files = get_test_audio_files() 99 | 100 | if not test_files: 101 | print("No test audio files found!") 102 | sys.exit(1) 103 | 104 | print("Available test audio files:") 105 | for i, file_path in enumerate(test_files, 1): 106 | print(f" {i}. 
{file_path.name}") 107 | 108 | # Initialize OpenAI client with local server 109 | client = OpenAI( 110 | base_url=args.server_url, 111 | api_key="dummy-key" 112 | ) 113 | 114 | print(f"\nConnecting to WhisperKit server at: {args.server_url}") 115 | 116 | if args.language: 117 | print(f"Language: {args.language}") 118 | if args.prompt: 119 | print(f"Prompt: {args.prompt}") 120 | 121 | # Determine which files to process 122 | if args.file: 123 | # Process specific file 124 | target_file = None 125 | for file_path in test_files: 126 | if file_path.name == args.file: 127 | target_file = file_path 128 | break 129 | 130 | if not target_file: 131 | print(f"Error: Test file '{args.file}' not found") 132 | print("Available files:", [f.name for f in test_files]) 133 | sys.exit(1) 134 | 135 | files_to_process = [target_file] 136 | else: 137 | # Process all files 138 | files_to_process = test_files 139 | 140 | print(f"\nProcessing {len(files_to_process)} file(s)...") 141 | 142 | # Process each file 143 | for i, audio_file in enumerate(files_to_process, 1): 144 | print(f"\n{'='*50}") 145 | print(f"File {i}/{len(files_to_process)}: {audio_file.name}") 146 | print(f"{'='*50}") 147 | 148 | result = transcribe_test_file( 149 | client, 150 | audio_file, 151 | language=args.language, 152 | prompt=args.prompt 153 | ) 154 | 155 | if result: 156 | print(f"\n✓ Transcription successful!") 157 | print(f"Text: {result.text}") 158 | 159 | if hasattr(result, 'segments') and result.segments: 160 | print(f"\nSegments ({len(result.segments)}):") 161 | for segment in result.segments: 162 | print(f" [{segment.start:.2f}s - {segment.end:.2f}s] {segment.text}") 163 | 164 | if hasattr(result, 'language') and result.language: 165 | print(f"\nDetected Language: {result.language}") 166 | 167 | # File size info 168 | file_size = audio_file.stat().st_size / 1024 # KB 169 | print(f"\nFile size: {file_size:.1f} KB") 170 | 171 | else: 172 | print(f"✗ Transcription failed for {audio_file.name}") 173 | 174 | print(f"\n{'='*50}") 175 | print("Test transcription complete!") 176 | print(f"Processed {len(files_to_process)} file(s)") 177 | 178 | 179 | if __name__ == "__main__": 180 | main() 181 | -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Evaluate/WERUtils.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Foundation 5 | 6 | /// Return the operations needed to transform s1 into s2 using Wagner-Fischer algo. 
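/// These operations feed the word error rate computed in `processWords` below,
/// WER = (S + D + I) / (H + S + D), where H, S, D, and I count hits,
/// substitutions, deletions, and insertions respectively.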
7 | /// "i" = insertion, "d" = deletion, "r" = replacement 8 | enum EditOp: UInt8 { 9 | case blank 10 | case replace 11 | case delete 12 | case insert 13 | } 14 | 15 | enum WERUtils { 16 | static func wordsToChars(reference: [[String]], hypothesis: [[String]]) -> ([String], [String]) { 17 | // tokenize each word into an integer 18 | let vocabulary = Set((reference + hypothesis).flatMap { $0 }) 19 | let word2char = Dictionary(uniqueKeysWithValues: vocabulary.enumerated().map { index, value in 20 | (value, index) 21 | }) 22 | 23 | let referenceCharsEfficient = reference.map { sentence in 24 | String(sentence.lazy.compactMap { word in 25 | if let charCode = word2char[word], let unicodeScalar = UnicodeScalar(charCode) { 26 | return Character(unicodeScalar) 27 | } 28 | return nil 29 | }) 30 | } 31 | 32 | let hypothesisCharsEfficient = hypothesis.map { sentence in 33 | String(sentence.lazy.compactMap { word in 34 | if let charCode = word2char[word], let unicodeScalar = UnicodeScalar(charCode) { 35 | return Character(unicodeScalar) 36 | } 37 | return nil 38 | }) 39 | } 40 | 41 | return (referenceCharsEfficient, hypothesisCharsEfficient) 42 | } 43 | 44 | static func processWords(reference: [String], hypothesis: [String]) -> (Double, [[String?]]) { 45 | var refTransformed = NormalizationUtils.removeMultipleSpaces(sentences: reference) 46 | refTransformed = NormalizationUtils.strip(sentences: refTransformed) 47 | let refTransformedReduced = NormalizationUtils.reduceToListOfListOfWordsWithSpaces(sentences: refTransformed) 48 | 49 | var hypTransformed = NormalizationUtils.removeMultipleSpaces(sentences: hypothesis) 50 | hypTransformed = NormalizationUtils.strip(sentences: hypTransformed) 51 | let hypTransformedReduced = NormalizationUtils.reduceToListOfListOfWordsWithSpaces(sentences: hypTransformed) 52 | 53 | let (refAsChars, hypAsChars) = WERUtils.wordsToChars(reference: refTransformedReduced, hypothesis: hypTransformedReduced) 54 | 55 | let refArrays = refAsChars.map { Array($0.unicodeScalars) } 56 | let hypArrays = hypAsChars.map { Array($0.unicodeScalars) } 57 | 58 | var (numHits, numSubstitutions, numDeletions, numInsertions) = (0, 0, 0, 0) 59 | var (numRfWords, numHypWords) = (0, 0) 60 | var diffResult: [[String?]] = [] 61 | 62 | for (referenceSentence, hypothesisSentence) in zip(refArrays, hypArrays) { 63 | let editOps = levenshtein(referenceSentence, hypothesisSentence) 64 | 65 | // count the number of edits of each type 66 | var substitutions = 0 67 | var deletions = 0 68 | var insertions = 0 69 | 70 | var referenceIndex = 0 71 | var hypothesisIndex = 0 72 | for op in editOps { 73 | switch op { 74 | case .replace: 75 | diffResult.append([String(refTransformedReduced[0][referenceIndex]), "-"]) 76 | diffResult.append([String(hypTransformedReduced[0][hypothesisIndex]), "+"]) 77 | substitutions += 1 78 | referenceIndex += 1 79 | hypothesisIndex += 1 80 | case .delete: 81 | diffResult.append([String(refTransformedReduced[0][referenceIndex]), "-"]) 82 | deletions += 1 83 | referenceIndex += 1 84 | case .insert: 85 | diffResult.append([String(hypTransformedReduced[0][hypothesisIndex]), "+"]) 86 | insertions += 1 87 | hypothesisIndex += 1 88 | case .blank: 89 | diffResult.append([String(refTransformedReduced[0][referenceIndex]), nil]) 90 | referenceIndex += 1 91 | hypothesisIndex += 1 92 | } 93 | } 94 | 95 | let hits: Int = referenceSentence.count - (substitutions + deletions) 96 | 97 | numHits += hits 98 | numSubstitutions += substitutions 99 | numDeletions += deletions 100 | numInsertions += 
insertions 101 | numRfWords += referenceSentence.count 102 | numHypWords += hypothesisSentence.count 103 | } 104 | 105 | let wer = Double(numSubstitutions + numDeletions + numInsertions) / Double(numHits + numSubstitutions + numDeletions) 106 | 107 | return (wer, diffResult) 108 | } 109 | 110 | static func evaluate(originalTranscript: String, generatedTranscript: String, normalizeOriginal: Bool = true) -> (wer: Double, diff: [[String?]]) { 111 | let normalizer = EnglishTextNormalizer() 112 | let reference = normalizeOriginal ? normalizer.normalize(text: originalTranscript) : originalTranscript 113 | let hypothesis = normalizer.normalize(text: generatedTranscript) 114 | 115 | let (wer, diff) = WERUtils.processWords( 116 | reference: [reference], 117 | hypothesis: [hypothesis] 118 | ) 119 | return (wer, diff) 120 | } 121 | 122 | static func processDiff(originalTranscript: String, generatedTranscript: String) -> [[String?]] { 123 | let (_, diff) = evaluate(originalTranscript: originalTranscript, generatedTranscript: generatedTranscript) 124 | return diff 125 | } 126 | 127 | static func diffString(from diff: [[String?]]) -> String { 128 | return diff.compactMap { entry -> String? in 129 | guard let word = entry[0], word != " " else { return nil } 130 | if let changeType = entry[1] { 131 | return "\(changeType)\(word)" 132 | } 133 | return word 134 | }.joined(separator: " ") 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Evaluate/DistanceCalculation.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Foundation 5 | 6 | /// Compute the last row of the edit distance dynamic programming matrix 7 | /// between s1 and s2. 8 | func computeLastRow(_ s1Chars: [Unicode.Scalar], _ s2Chars: [Unicode.Scalar]) -> [Int] { 9 | var prevRow = Array(0...s2Chars.endIndex) 10 | 11 | for i in 1...s1Chars.endIndex { 12 | var currentRow = [Int](repeating: 0, count: s2Chars.endIndex + 1) 13 | currentRow[0] = i 14 | 15 | for j in 1...s2Chars.endIndex { 16 | let cost = s1Chars[i - 1] == s2Chars[j - 1] ? 0 : 1 17 | currentRow[j] = min( 18 | prevRow[j] + 1, // Deletion 19 | currentRow[j - 1] + 1, // Insertion 20 | prevRow[j - 1] + cost // Substitution 21 | ) 22 | } 23 | prevRow = currentRow 24 | } 25 | 26 | return prevRow 27 | } 28 | 29 | func needlemanWunsch(_ xArray: [Unicode.Scalar], _ yArray: [Unicode.Scalar]) -> [EditOp] { 30 | let m = xArray.count 31 | let n = yArray.count 32 | 33 | var dp = [[Int]](repeating: [Int](repeating: 0, count: n + 1), count: m + 1) 34 | for i in 1...m { 35 | dp[i][0] = i 36 | } 37 | for j in 1...n { 38 | dp[0][j] = j 39 | } 40 | 41 | for i in 1...m { 42 | for j in 1...n { 43 | let cost = xArray[i - 1] == yArray[j - 1] ? 
0 : 1 44 | dp[i][j] = min( 45 | dp[i - 1][j] + 1, // Deletion 46 | dp[i][j - 1] + 1, // Insertion 47 | dp[i - 1][j - 1] + cost // Substitution 48 | ) 49 | } 50 | } 51 | 52 | var i = m 53 | var j = n 54 | var ops = [EditOp]() 55 | 56 | while i > 0, j > 0 { 57 | if dp[i][j] == dp[i - 1][j - 1], xArray[i - 1] == yArray[j - 1] { 58 | // Match operation is omitted 59 | i -= 1 60 | j -= 1 61 | } else if dp[i][j] == dp[i - 1][j - 1] + 1 { 62 | ops.append(EditOp.replace) // Substitution 63 | i -= 1 64 | j -= 1 65 | } else if dp[i][j] == dp[i][j - 1] + 1 { 66 | ops.append(EditOp.insert) // Insertion 67 | j -= 1 68 | } else { 69 | ops.append(EditOp.delete) // Deletion 70 | i -= 1 71 | } 72 | } 73 | 74 | while i > 0 { 75 | ops.append(EditOp.delete) 76 | i -= 1 77 | } 78 | while j > 0 { 79 | ops.append(EditOp.insert) 80 | j -= 1 81 | } 82 | 83 | return ops.reversed() 84 | } 85 | 86 | func hirschberg(_ reference: [Unicode.Scalar], _ s2: [Unicode.Scalar]) -> [EditOp] { 87 | func hirschbergRec(_ x: [Unicode.Scalar], _ y: [Unicode.Scalar]) -> [EditOp] { 88 | let m = x.endIndex 89 | let n = y.endIndex 90 | 91 | if m == 0 { 92 | let result = y.map { _ in EditOp.insert } 93 | return result 94 | } 95 | if n == 0 { 96 | let result = x.map { _ in EditOp.delete } 97 | return result 98 | } 99 | if m == 1 || n == 1 { 100 | let result = needlemanWunsch(x, y) 101 | return result 102 | } 103 | 104 | let i = m / 2 105 | let xPrefix = Array(x[x.startIndex.. [EditOp] { 136 | let n = sourceText.count 137 | let m = targetText.count 138 | let maxD = n + m 139 | let vSize = 2 * maxD + 1 140 | var v = [Int](repeating: 0, count: vSize) 141 | var trace = [[Int]]() 142 | 143 | let offset = maxD 144 | 145 | for d in 0...maxD { 146 | let vSnapshot = v 147 | for k in stride(from: -d, through: d, by: 2) { 148 | let kIndex = k + offset 149 | var x: Int 150 | if k == -d || (k != d && v[kIndex - 1] < v[kIndex + 1]) { 151 | x = v[kIndex + 1] 152 | } else { 153 | x = v[kIndex - 1] + 1 154 | } 155 | var y = x - k 156 | while x < n, y < m, sourceText[x] == targetText[y] { 157 | x += 1 158 | y += 1 159 | } 160 | v[kIndex] = x 161 | if x >= n, y >= m { 162 | trace.append(vSnapshot) 163 | return backtrack(trace: trace, sourceText: sourceText, targetText: targetText) 164 | } 165 | } 166 | trace.append(vSnapshot) 167 | } 168 | return [] 169 | } 170 | 171 | func backtrack(trace: [[Int]], sourceText: [Unicode.Scalar], targetText: [Unicode.Scalar]) -> [EditOp] { 172 | var editOps = [EditOp]() 173 | let n = sourceText.count 174 | let m = targetText.count 175 | let offset = trace[0].count / 2 176 | var x = n 177 | var y = m 178 | 179 | for d in stride(from: trace.count - 1, through: 0, by: -1) { 180 | let v = trace[d] 181 | let k = x - y 182 | let kIndex = k + offset 183 | 184 | var prevK: Int 185 | if k == -d || (k != d && v[kIndex - 1] < v[kIndex + 1]) { 186 | prevK = k + 1 187 | } else { 188 | prevK = k - 1 189 | } 190 | let prevX = v[prevK + offset] 191 | let prevY = prevX - prevK 192 | 193 | while x > prevX, y > prevY { 194 | // Match or Replace 195 | if sourceText[x - 1] == targetText[y - 1] { 196 | editOps.append(.blank) 197 | } else { 198 | editOps.append(.replace) 199 | } 200 | x -= 1 201 | y -= 1 202 | } 203 | 204 | if d > 0 { 205 | if x == prevX { 206 | // Insertion 207 | editOps.append(.insert) 208 | y -= 1 209 | } else { 210 | // Deletion 211 | editOps.append(.delete) 212 | x -= 1 213 | } 214 | } 215 | } 216 | 217 | return editOps.reversed() 218 | } 219 | -------------------------------------------------------------------------------- 
/Examples/ServeCLIClient/Curl/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright © 2025 Argmax, Inc. All rights reserved. 4 | # For licensing see accompanying LICENSE.md file. 5 | 6 | # WhisperKit CurlClient - Test Script 7 | # This script demonstrates various features of the CurlClient 8 | 9 | set -e 10 | 11 | # Colors for output 12 | RED='\033[0;31m' 13 | GREEN='\033[0;32m' 14 | YELLOW='\033[1;33m' 15 | BLUE='\033[0;34m' 16 | NC='\033[0m' # No Color 17 | 18 | # Test audio files (adjust paths as needed) 19 | TEST_FILES=( 20 | "../../../Tests/WhisperKitTests/Resources/jfk.wav" 21 | "../../../Tests/WhisperKitTests/Resources/es_test_clip.wav" 22 | "../../../Tests/WhisperKitTests/Resources/ja_test_clip.wav" 23 | ) 24 | 25 | # Server URL 26 | SERVER_URL="http://localhost:50060" 27 | 28 | echo -e "${BLUE}🧪 WhisperKit CurlClient Test Suite${NC}" 29 | echo -e "${YELLOW}Testing against server:${NC} $SERVER_URL" 30 | echo "" 31 | 32 | # Check if server is running 33 | echo -e "${BLUE}🔍 Checking server status...${NC}" 34 | if curl -s "$SERVER_URL" > /dev/null 2>&1; then 35 | echo -e "${GREEN}✅ Server is running${NC}" 36 | else 37 | echo -e "${RED}❌ Server is not running at $SERVER_URL${NC}" 38 | echo -e "${YELLOW}Please start the server first:${NC}" 39 | echo " whisperkit-cli serve --model tiny" 40 | exit 1 41 | fi 42 | 43 | echo "" 44 | 45 | test_logprobs() { 46 | echo "🧪 Testing transcription with logprobs..." 47 | 48 | # Find test audio files - use absolute path 49 | local test_file="" 50 | if [ -f "../../../Tests/WhisperKitTests/Resources/jfk.wav" ]; then 51 | test_file="$(cd ../../../Tests/WhisperKitTests/Resources && pwd)/jfk.wav" 52 | elif [ -f "../../../Tests/WhisperKitTests/Resources/es_test_clip.wav" ]; then 53 | test_file="$(cd ../../../Tests/WhisperKitTests/Resources && pwd)/es_test_clip.wav" 54 | elif [ -f "../../../Tests/WhisperKitTests/Resources/ja_test_clip.wav" ]; then 55 | test_file="$(cd ../../../Tests/WhisperKitTests/Resources && pwd)/ja_test_clip.wav" 56 | fi 57 | 58 | if [ -z "$test_file" ]; then 59 | echo "❌ No test audio files found" 60 | return 1 61 | fi 62 | 63 | echo "📁 Using test file: $(basename "$test_file")" 64 | echo "🔍 Full path: $test_file" 65 | 66 | # Test with logprobs enabled 67 | echo "🔍 Testing with file: $test_file" 68 | echo "🔍 Server URL: $SERVER_URL" 69 | 70 | local response=$(curl -s -X POST "$SERVER_URL/v1/audio/transcriptions" \ 71 | -H "Content-Type: multipart/form-data" \ 72 | -F "file=@$test_file" \ 73 | -F "model=tiny" \ 74 | -F "response_format=json" \ 75 | -F "include[]=logprobs") 76 | 77 | # Debug: Show response length and first part 78 | echo "🔍 Response length: ${#response}" 79 | echo "🔍 Response preview: ${response:0:200}..." 
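    # The checks below assume a response shaped roughly like the following
    # (inferred from the jq queries further down, not a guaranteed schema):
    #   {"text": "...", "logprobs": [{"token": "...", "logprob": -0.12}, ...]}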
80 | 81 | if echo "$response" | grep -q "logprobs"; then 82 | echo "✅ Logprobs received in response" 83 | 84 | # Extract and display logprobs info 85 | local logprobs_count=$(echo "$response" | jq -r '.logprobs | length' 2>/dev/null || echo "0") 86 | echo "📊 Found $logprobs_count logprob entries" 87 | 88 | # Show first few logprobs 89 | if [ "$logprobs_count" -gt 0 ]; then 90 | echo "🔍 First few logprobs:" 91 | echo "$response" | jq -r '.logprobs[0:3][] | " Token: \(.token) - Logprob: \(.logprob)"' 2>/dev/null || echo " Could not parse logprobs" 92 | fi 93 | 94 | return 0 95 | else 96 | echo "❌ No logprobs in response" 97 | echo "Available keys: $(echo "$response" | jq -r 'keys | join(", ")' 2>/dev/null || echo "Could not parse response")" 98 | return 1 99 | fi 100 | } 101 | 102 | # Test 1: Basic transcription 103 | echo -e "${BLUE}📝 Test 1: Basic Transcription (verbose_json)${NC}" 104 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[0]}" 105 | echo "" 106 | ./transcribe.sh "${TEST_FILES[0]}" --response-format verbose_json 107 | echo "" 108 | 109 | # Test 2: Basic transcription with JSON format 110 | echo -e "${BLUE}📝 Test 2: Basic Transcription (json)${NC}" 111 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[0]}" 112 | echo "" 113 | ./transcribe.sh "${TEST_FILES[0]}" --response-format json 114 | echo "" 115 | 116 | # Test 3: Transcription with word timestamps 117 | echo -e "${BLUE}📝 Test 3: Transcription with Word Timestamps${NC}" 118 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[0]}" 119 | echo "" 120 | ./transcribe.sh "${TEST_FILES[0]}" --timestamp-granularities "word,segment" 121 | echo "" 122 | 123 | # Test 4: Spanish transcription 124 | echo -e "${BLUE}📝 Test 4: Spanish Transcription${NC}" 125 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[1]}" 126 | echo -e "${YELLOW}Language:${NC} es" 127 | echo "" 128 | ./transcribe.sh "${TEST_FILES[1]}" --language es 129 | echo "" 130 | 131 | # Test 5: Japanese transcription 132 | echo -e "${BLUE}📝 Test 5: Japanese Transcription${NC}" 133 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[2]}" 134 | echo -e "${YELLOW}Language:${NC} ja" 135 | echo "" 136 | ./transcribe.sh "${TEST_FILES[2]}" --language ja 137 | echo "" 138 | 139 | # Test 6: Translation (Spanish to English) 140 | echo -e "${BLUE}🌐 Test 6: Translation (Spanish to English)${NC}" 141 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[1]}" 142 | echo -e "${YELLOW}Source Language:${NC} es" 143 | echo "" 144 | ./translate.sh "${TEST_FILES[1]}" --language es 145 | echo "" 146 | 147 | # Test 7: Translation (Japanese to English) 148 | echo -e "${BLUE}🌐 Test 7: Translation (Japanese to English)${NC}" 149 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[2]}" 150 | echo -e "${YELLOW}Source Language:${NC} ja" 151 | echo "" 152 | ./translate.sh "${TEST_FILES[2]}" --language ja 153 | echo "" 154 | 155 | # Test 7.5: Translation with basic JSON format 156 | echo -e "${BLUE}🌐 Test 7.5: Translation with JSON Format${NC}" 157 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[1]}" 158 | echo -e "${YELLOW}Source Language:${NC} es" 159 | echo "" 160 | ./translate.sh "${TEST_FILES[1]}" --language es --response-format json 161 | echo "" 162 | 163 | # Test 8: Streaming transcription 164 | echo -e "${BLUE}📡 Test 8: Streaming Transcription${NC}" 165 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[0]}" 166 | echo "" 167 | ./transcribe.sh "${TEST_FILES[0]}" --stream true 168 | echo "" 169 | 170 | # Test 8.5: Translation with prompt 171 | echo -e "${BLUE}📝 Test 8.5: Translation with Prompt${NC}" 172 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[1]}" 173 | 
echo -e "${YELLOW}Source Language:${NC} es" 174 | echo "" 175 | ./translate.sh "${TEST_FILES[1]}" --language es --prompt "This is a formal conversation" 176 | echo "" 177 | 178 | # Test 9: Logprobs functionality 179 | echo -e "${BLUE}🧪 Test 9: Logprobs Functionality${NC}" 180 | if test_logprobs; then 181 | echo -e "${GREEN}✅ Logprobs test passed${NC}" 182 | else 183 | echo -e "${RED}❌ Logprobs test failed${NC}" 184 | fi 185 | echo "" 186 | 187 | # Test 10: Translation with different temperature 188 | echo -e "${BLUE}🌡️ Test 10: Translation with Temperature${NC}" 189 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[1]}" 190 | echo -e "${YELLOW}Source Language:${NC} es" 191 | echo "" 192 | ./translate.sh "${TEST_FILES[1]}" --language es --temperature 0.2 193 | echo "" 194 | 195 | echo -e "${GREEN}🎉 All tests completed!${NC}" 196 | echo "" 197 | echo -e "${BLUE}📚 For more examples, see:${NC}" 198 | echo " ./transcribe.sh --help" 199 | echo " ./translate.sh --help" 200 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Utilities/Extensions+Internal.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import AVFoundation 5 | import CoreML 6 | 7 | extension MLMultiArray { 8 | /// All values will be stored in the last dimension of the MLMultiArray (default is dims=1) 9 | static func from(_ array: [Int], dims: Int = 1) throws -> MLMultiArray { 10 | var shape = Array(repeating: 1, count: dims) 11 | shape[shape.count - 1] = array.count 12 | /// Examples: 13 | /// dims=1 : [arr.count] 14 | /// dims=2 : [1, arr.count] 15 | /// 16 | let output = try MLMultiArray(shape: shape as [NSNumber], dataType: .int32) 17 | let pointer = UnsafeMutablePointer(OpaquePointer(output.dataPointer)) 18 | for (i, item) in array.enumerated() { 19 | pointer[i] = Int32(item) 20 | } 21 | return output 22 | } 23 | } 24 | 25 | extension Array { 26 | func batched(into size: Int) -> [[Element]] { 27 | return stride(from: 0, to: count, by: size).map { 28 | Array(self[$0.. { 34 | /// Convenience method to convert the `Result` object into an array of optional arrays of `TranscriptionResult`. 35 | /// - Returns: An array of optional arrays containing `TranscriptionResult`. 36 | func toOptionalArrays() -> [[TranscriptionResult]?] { 37 | return self.map { try? $0.get() } 38 | } 39 | } 40 | 41 | extension Array where Element: Hashable { 42 | /// Returns an array with duplicates removed, preserving the original order. 
43 | var orderedSet: [Element] { 44 | var seen = Set<Element>() 45 | return self.filter { element in 46 | if seen.contains(element) { 47 | return false 48 | } else { 49 | seen.insert(element) 50 | return true 51 | } 52 | } 53 | } 54 | } 55 | 56 | extension String { 57 | /// Reference: https://github.com/huggingface/swift-transformers/blob/94610577e4af9bbc267060af1e25e977604dd796/Sources/Tokenizers/Decoder.swift#L267-L275 58 | func trimmingFromEnd(character: Character = " ", upto: Int) -> String { 59 | var result = self 60 | var trimmed = 0 61 | while trimmed < upto && result.last == character { 62 | result.removeLast() 63 | trimmed += 1 64 | } 65 | return result 66 | } 67 | } 68 | 69 | extension [String] { 70 | /// Reference: https://github.com/huggingface/swift-transformers/blob/94610577e4af9bbc267060af1e25e977604dd796/Sources/Hub/HubApi.swift#L983-L987 71 | func matching(glob: String) -> [String] { 72 | filter { fnmatch(glob, $0, 0) == 0 } 73 | } 74 | } 75 | 76 | extension AVAudioPCMBuffer { 77 | /// Converts the buffer to a float array 78 | func asFloatArray() throws -> [Float] { 79 | guard let data = floatChannelData?.pointee else { 80 | throw WhisperError.audioProcessingFailed("Error converting audio, missing floatChannelData") 81 | } 82 | return Array(UnsafeBufferPointer(start: data, count: Int(frameLength))) 83 | } 84 | 85 | /// Appends the contents of another buffer to the current buffer 86 | func appendContents(of buffer: AVAudioPCMBuffer) -> Bool { 87 | return appendContents(of: buffer, startingFrame: 0, frameCount: buffer.frameLength) 88 | } 89 | 90 | /// Appends a specific range of frames from another buffer to the current buffer 91 | func appendContents(of buffer: AVAudioPCMBuffer, startingFrame: AVAudioFramePosition, frameCount: AVAudioFrameCount) -> Bool { 92 | guard format == buffer.format else { 93 | Logging.debug("Format mismatch") 94 | return false 95 | } 96 | 97 | guard startingFrame + AVAudioFramePosition(frameCount) <= AVAudioFramePosition(buffer.frameLength) else { 98 | Logging.error("Insufficient audio in buffer") 99 | return false 100 | } 101 | 102 | guard let destination = floatChannelData, let source = buffer.floatChannelData else { 103 | Logging.error("Failed to access float channel data") 104 | return false 105 | } 106 | 107 | var calculatedFrameCount = frameCount 108 | if frameLength + frameCount > frameCapacity { 109 | Logging.debug("Insufficient space in buffer, reducing frame count to fit") 110 | calculatedFrameCount = frameCapacity - frameLength 111 | } 112 | 113 | let calculatedStride = stride 114 | let destinationPointer = destination.pointee.advanced(by: calculatedStride * Int(frameLength)) 115 | let sourcePointer = source.pointee.advanced(by: calculatedStride * Int(startingFrame)) 116 | 117 | memcpy(destinationPointer, sourcePointer, Int(calculatedFrameCount) * calculatedStride * MemoryLayout<Float>.size) 118 | 119 | frameLength += calculatedFrameCount 120 | return true 121 | } 122 | 123 | /// Convenience initializer to concatenate multiple buffers into one 124 | convenience init?(concatenating buffers: [AVAudioPCMBuffer]) { 125 | guard !buffers.isEmpty else { 126 | Logging.debug("Buffers array should not be empty") 127 | return nil 128 | } 129 | 130 | let totalFrames = buffers.reduce(0) { $0 + $1.frameLength } 131 | 132 | guard let firstBuffer = buffers.first else { 133 | Logging.debug("Failed to get the first buffer") 134 | return nil 135 | } 136 | 137 | self.init(pcmFormat: firstBuffer.format, frameCapacity: totalFrames) 138 | 139 | for buffer in buffers { 140 | if
!appendContents(of: buffer) { 141 | Logging.debug("Failed to append buffer") 142 | return nil 143 | } 144 | } 145 | } 146 | 147 | /// Computed property to determine the stride for float channel data 148 | private var stride: Int { 149 | return Int(format.streamDescription.pointee.mBytesPerFrame) / MemoryLayout<Float>.size 150 | } 151 | } 152 | 153 | // MARK: - WhisperKit Components 154 | 155 | extension AudioProcessing { 156 | static func getDownloadsDirectory() -> URL { 157 | let paths = FileManager.default.urls(for: .downloadsDirectory, in: .userDomainMask) 158 | return paths[0] 159 | } 160 | 161 | static func saveBuffer(_ buffer: AVAudioPCMBuffer, to url: URL) throws { 162 | // create folder 163 | let folderURL = url.deletingLastPathComponent() 164 | if !FileManager.default.fileExists(atPath: folderURL.path) { 165 | try FileManager.default.createDirectory(at: folderURL, withIntermediateDirectories: true, attributes: nil) 166 | } 167 | let audioFile = try AVAudioFile(forWriting: url, settings: buffer.format.settings) 168 | try audioFile.write(from: buffer) 169 | } 170 | } 171 | 172 | extension DecodingOptions { 173 | func prepareSeekClips(contentFrames: Int) -> [(start: Int, end: Int)] { 174 | var seekPoints: [Int] = clipTimestamps.map { Int(round($0 * Float(WhisperKit.sampleRate))) } 175 | if seekPoints.count == 0 { 176 | seekPoints.append(0) 177 | } 178 | 179 | if seekPoints.count % 2 == 1 { 180 | seekPoints.append(contentFrames) 181 | } 182 | 183 | var seekClips: [(start: Int, end: Int)] = [] 184 | for i in stride(from: 0, to: seekPoints.count, by: 2) { 185 | let start = seekPoints[i] 186 | let end = i + 1 < seekPoints.count ? seekPoints[i + 1] : contentFrames 187 | seekClips.append((start, end)) 188 | } 189 | 190 | return seekClips 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "images" : [ 3 | { 4 | "filename" : "40.png", 5 | "idiom" : "universal", 6 | "platform" : "ios", 7 | "scale" : "2x", 8 | "size" : "20x20" 9 | }, 10 | { 11 | "filename" : "60.png", 12 | "idiom" : "universal", 13 | "platform" : "ios", 14 | "scale" : "3x", 15 | "size" : "20x20" 16 | }, 17 | { 18 | "filename" : "58 1.png", 19 | "idiom" : "universal", 20 | "platform" : "ios", 21 | "scale" : "2x", 22 | "size" : "29x29" 23 | }, 24 | { 25 | "filename" : "87 1.png", 26 | "idiom" : "universal", 27 | "platform" : "ios", 28 | "scale" : "3x", 29 | "size" : "29x29" 30 | }, 31 | { 32 | "filename" : "76.png", 33 | "idiom" : "universal", 34 | "platform" : "ios", 35 | "scale" : "2x", 36 | "size" : "38x38" 37 | }, 38 | { 39 | "filename" : "114.png", 40 | "idiom" : "universal", 41 | "platform" : "ios", 42 | "scale" : "3x", 43 | "size" : "38x38" 44 | }, 45 | { 46 | "filename" : "80 1.png", 47 | "idiom" : "universal", 48 | "platform" : "ios", 49 | "scale" : "2x", 50 | "size" : "40x40" 51 | }, 52 | { 53 | "filename" : "120.png", 54 | "idiom" : "universal", 55 | "platform" : "ios", 56 | "scale" : "3x", 57 | "size" : "40x40" 58 | }, 59 | { 60 | "filename" : "120 1.png", 61 | "idiom" : "universal", 62 | "platform" : "ios", 63 | "scale" : "2x", 64 | "size" : "60x60" 65 | }, 66 | { 67 | "filename" : "180.png", 68 | "idiom" : "universal", 69 | "platform" : "ios", 70 | "scale" : "3x", 71 | "size" : "60x60" 72 | }, 73 | { 74 | "filename" : "128 1.png", 75 | "idiom" : "universal", 76 | "platform" : "ios", 77 |
"scale" : "2x", 78 | "size" : "64x64" 79 | }, 80 | { 81 | "filename" : "192.png", 82 | "idiom" : "universal", 83 | "platform" : "ios", 84 | "scale" : "3x", 85 | "size" : "64x64" 86 | }, 87 | { 88 | "filename" : "136.png", 89 | "idiom" : "universal", 90 | "platform" : "ios", 91 | "scale" : "2x", 92 | "size" : "68x68" 93 | }, 94 | { 95 | "filename" : "152.png", 96 | "idiom" : "universal", 97 | "platform" : "ios", 98 | "scale" : "2x", 99 | "size" : "76x76" 100 | }, 101 | { 102 | "filename" : "167.png", 103 | "idiom" : "universal", 104 | "platform" : "ios", 105 | "scale" : "2x", 106 | "size" : "83.5x83.5" 107 | }, 108 | { 109 | "filename" : "1024 1.png", 110 | "idiom" : "universal", 111 | "platform" : "ios", 112 | "size" : "1024x1024" 113 | }, 114 | { 115 | "filename" : "16.png", 116 | "idiom" : "mac", 117 | "scale" : "1x", 118 | "size" : "16x16" 119 | }, 120 | { 121 | "filename" : "32.png", 122 | "idiom" : "mac", 123 | "scale" : "2x", 124 | "size" : "16x16" 125 | }, 126 | { 127 | "filename" : "32.png", 128 | "idiom" : "mac", 129 | "scale" : "1x", 130 | "size" : "32x32" 131 | }, 132 | { 133 | "filename" : "64.png", 134 | "idiom" : "mac", 135 | "scale" : "2x", 136 | "size" : "32x32" 137 | }, 138 | { 139 | "filename" : "128.png", 140 | "idiom" : "mac", 141 | "scale" : "1x", 142 | "size" : "128x128" 143 | }, 144 | { 145 | "filename" : "256.png", 146 | "idiom" : "mac", 147 | "scale" : "2x", 148 | "size" : "128x128" 149 | }, 150 | { 151 | "filename" : "256.png", 152 | "idiom" : "mac", 153 | "scale" : "1x", 154 | "size" : "256x256" 155 | }, 156 | { 157 | "filename" : "512.png", 158 | "idiom" : "mac", 159 | "scale" : "2x", 160 | "size" : "256x256" 161 | }, 162 | { 163 | "filename" : "512.png", 164 | "idiom" : "mac", 165 | "scale" : "1x", 166 | "size" : "512x512" 167 | }, 168 | { 169 | "filename" : "1024.png", 170 | "idiom" : "mac", 171 | "scale" : "2x", 172 | "size" : "512x512" 173 | }, 174 | { 175 | "filename" : "44.png", 176 | "idiom" : "universal", 177 | "platform" : "watchos", 178 | "scale" : "2x", 179 | "size" : "22x22" 180 | }, 181 | { 182 | "filename" : "48.png", 183 | "idiom" : "universal", 184 | "platform" : "watchos", 185 | "scale" : "2x", 186 | "size" : "24x24" 187 | }, 188 | { 189 | "filename" : "55.png", 190 | "idiom" : "universal", 191 | "platform" : "watchos", 192 | "scale" : "2x", 193 | "size" : "27.5x27.5" 194 | }, 195 | { 196 | "filename" : "58.png", 197 | "idiom" : "universal", 198 | "platform" : "watchos", 199 | "scale" : "2x", 200 | "size" : "29x29" 201 | }, 202 | { 203 | "filename" : "60 1.png", 204 | "idiom" : "universal", 205 | "platform" : "watchos", 206 | "scale" : "2x", 207 | "size" : "30x30" 208 | }, 209 | { 210 | "filename" : "64 1.png", 211 | "idiom" : "universal", 212 | "platform" : "watchos", 213 | "scale" : "2x", 214 | "size" : "32x32" 215 | }, 216 | { 217 | "filename" : "66.png", 218 | "idiom" : "universal", 219 | "platform" : "watchos", 220 | "scale" : "2x", 221 | "size" : "33x33" 222 | }, 223 | { 224 | "filename" : "80.png", 225 | "idiom" : "universal", 226 | "platform" : "watchos", 227 | "scale" : "2x", 228 | "size" : "40x40" 229 | }, 230 | { 231 | "filename" : "87.png", 232 | "idiom" : "universal", 233 | "platform" : "watchos", 234 | "scale" : "2x", 235 | "size" : "43.5x43.5" 236 | }, 237 | { 238 | "filename" : "88.png", 239 | "idiom" : "universal", 240 | "platform" : "watchos", 241 | "scale" : "2x", 242 | "size" : "44x44" 243 | }, 244 | { 245 | "filename" : "92.png", 246 | "idiom" : "universal", 247 | "platform" : "watchos", 248 | "scale" : "2x", 249 | "size" : 
"46x46" 250 | }, 251 | { 252 | "filename" : "100.png", 253 | "idiom" : "universal", 254 | "platform" : "watchos", 255 | "scale" : "2x", 256 | "size" : "50x50" 257 | }, 258 | { 259 | "filename" : "102.png", 260 | "idiom" : "universal", 261 | "platform" : "watchos", 262 | "scale" : "2x", 263 | "size" : "51x51" 264 | }, 265 | { 266 | "filename" : "108.png", 267 | "idiom" : "universal", 268 | "platform" : "watchos", 269 | "scale" : "2x", 270 | "size" : "54x54" 271 | }, 272 | { 273 | "filename" : "172.png", 274 | "idiom" : "universal", 275 | "platform" : "watchos", 276 | "scale" : "2x", 277 | "size" : "86x86" 278 | }, 279 | { 280 | "filename" : "196.png", 281 | "idiom" : "universal", 282 | "platform" : "watchos", 283 | "scale" : "2x", 284 | "size" : "98x98" 285 | }, 286 | { 287 | "filename" : "216.png", 288 | "idiom" : "universal", 289 | "platform" : "watchos", 290 | "scale" : "2x", 291 | "size" : "108x108" 292 | }, 293 | { 294 | "filename" : "234.png", 295 | "idiom" : "universal", 296 | "platform" : "watchos", 297 | "scale" : "2x", 298 | "size" : "117x117" 299 | }, 300 | { 301 | "filename" : "258.png", 302 | "idiom" : "universal", 303 | "platform" : "watchos", 304 | "scale" : "2x", 305 | "size" : "129x129" 306 | }, 307 | { 308 | "filename" : "1024 2.png", 309 | "idiom" : "universal", 310 | "platform" : "watchos", 311 | "size" : "1024x1024" 312 | } 313 | ], 314 | "info" : { 315 | "author" : "xcode", 316 | "version" : 1 317 | } 318 | } 319 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/Audio/VoiceActivityDetector.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Foundation 5 | 6 | /// A base class for Voice Activity Detection (VAD), used to identify and separate segments of audio that contain human speech from those that do not. 7 | /// Subclasses must implement the `voiceActivity(in:)` method to provide specific voice activity detection functionality. 8 | open class VoiceActivityDetector { 9 | /// The sample rate of the audio signal, in samples per second. 10 | public let sampleRate: Int 11 | 12 | /// The length of each frame in samples. 13 | public let frameLengthSamples: Int 14 | 15 | /// The number of samples overlapping between consecutive frames. 16 | public let frameOverlapSamples: Int 17 | 18 | /// Initializes a new `VoiceActivityDetector` instance with the specified parameters. 19 | /// - Parameters: 20 | /// - sampleRate: The sample rate of the audio signal in samples per second. Defaults to 16000. 21 | /// - frameLengthSamples: The length of each frame in samples. 22 | /// - frameOverlapSamples: The number of samples overlapping between consecutive frames. Defaults to 0. 23 | /// - Note: Subclasses should override the `voiceActivity(in:)` method to provide specific VAD functionality. 24 | public init( 25 | sampleRate: Int = 16000, 26 | frameLengthSamples: Int, 27 | frameOverlapSamples: Int = 0 28 | ) { 29 | self.sampleRate = sampleRate 30 | self.frameLengthSamples = frameLengthSamples 31 | self.frameOverlapSamples = frameOverlapSamples 32 | } 33 | 34 | /// Analyzes the provided audio waveform to determine which segments contain voice activity. 35 | /// - Parameter waveform: An array of `Float` values representing the audio waveform. 36 | /// - Returns: An array of `Bool` values where `true` indicates the presence of voice activity and `false` indicates silence. 
37 | open func voiceActivity(in waveform: [Float]) -> [Bool] { 38 | fatalError("`voiceActivity` must be implemented by subclass") 39 | } 40 | 41 | /// Analyzes the provided audio waveform to determine which segments contain voice activity. 42 | /// - Parameter waveform: An array of `Float` values representing the audio waveform. 43 | /// - Returns: An array of `Bool` values where `true` indicates the presence of voice activity and `false` indicates silence. 44 | /// - Throws: An error if voice activity detection fails. 45 | open func voiceActivityAsync(in waveform: [Float]) async throws -> [Bool] { 46 | return voiceActivity(in: waveform) 47 | } 48 | 49 | /// Calculates and returns a list of active audio chunks, each represented by a start and end index. 50 | /// - Parameter waveform: An array of `Float` values representing the audio waveform. 51 | /// - Returns: An array of tuples where each tuple contains the start and end indices of an active audio chunk. 52 | public func calculateActiveChunks(in waveform: [Float]) -> [(startIndex: Int, endIndex: Int)] { 53 | let vad: [Bool] = voiceActivity(in: waveform) 54 | var result = [(startIndex: Int, endIndex: Int)]() 55 | 56 | // Temporary variables to hold the start of the current non-silent segment 57 | var currentStartIndex: Int? 58 | 59 | for (index, vadChunk) in vad.enumerated() { 60 | if vadChunk { 61 | let chunkStart = index * frameLengthSamples 62 | let chunkEnd = min(chunkStart + frameLengthSamples, waveform.count) 63 | 64 | if currentStartIndex != nil { 65 | // If we already have a starting point, just update the end point in the last added segment 66 | result[result.count - 1].endIndex = chunkEnd 67 | } else { 68 | // If there is no current start, this is a new segment 69 | currentStartIndex = chunkStart 70 | result.append((startIndex: chunkStart, endIndex: chunkEnd)) 71 | } 72 | } else { 73 | // Reset currentStartIndex when encountering a silent chunk 74 | currentStartIndex = nil 75 | } 76 | } 77 | 78 | return result 79 | } 80 | 81 | /// Converts a voice activity index to the corresponding audio sample index. 82 | /// - Parameter index: The voice activity index to convert. 83 | /// - Returns: The corresponding audio sample index. 84 | public func voiceActivityIndexToAudioSampleIndex(_ index: Int) -> Int { 85 | return index * frameLengthSamples 86 | } 87 | 88 | public func voiceActivityIndexToSeconds(_ index: Int) -> Float { 89 | return Float(voiceActivityIndexToAudioSampleIndex(index)) / Float(sampleRate) 90 | } 91 | 92 | /// Identifies the longest continuous period of silence within the provided voice activity detection results. 93 | /// - Parameter vadResult: An array of `Bool` values representing voice activity detection results. 94 | /// - Returns: A tuple containing the start and end indices of the longest silence period, or `nil` if no silence is found. 95 | public func findLongestSilence(in vadResult: [Bool]) -> (startIndex: Int, endIndex: Int)? { 96 | var longestStartIndex: Int? 97 | var longestEndIndex: Int? 
98 | var longestCount = 0 99 | var index = 0 100 | while index < vadResult.count { 101 | let value = vadResult[index] 102 | if value { 103 | // found non-silence, skip 104 | index += 1 105 | } else { 106 | // found beginning of silence, find the end 107 | var endIndex = index 108 | while endIndex < vadResult.count, !vadResult[endIndex] { 109 | endIndex += 1 110 | } 111 | let count = endIndex - index 112 | if count > longestCount { 113 | longestCount = count 114 | longestStartIndex = index 115 | longestEndIndex = endIndex 116 | } 117 | index = endIndex 118 | } 119 | } 120 | if let longestStartIndex, let longestEndIndex { 121 | return (startIndex: longestStartIndex, endIndex: longestEndIndex) 122 | } else { 123 | return nil 124 | } 125 | } 126 | 127 | // MARK: - Utility 128 | 129 | func voiceActivityClipTimestamps(in waveform: [Float]) -> [Float] { 130 | let nonSilentChunks = calculateActiveChunks(in: waveform) 131 | var clipTimestamps = [Float]() 132 | 133 | for chunk in nonSilentChunks { 134 | let startTimestamp = Float(chunk.startIndex) / Float(sampleRate) 135 | let endTimestamp = Float(chunk.endIndex) / Float(sampleRate) 136 | 137 | clipTimestamps.append(contentsOf: [startTimestamp, endTimestamp]) 138 | } 139 | 140 | return clipTimestamps 141 | } 142 | 143 | func calculateNonSilentSeekClips(in waveform: [Float]) -> [(start: Int, end: Int)] { 144 | let clipTimestamps = voiceActivityClipTimestamps(in: waveform) 145 | let options = DecodingOptions(clipTimestamps: clipTimestamps) 146 | let seekClips = options.prepareSeekClips(contentFrames: waveform.count) 147 | return seekClips 148 | } 149 | 150 | func calculateSeekTimestamps(in waveform: [Float]) -> [(startTime: Float, endTime: Float)] { 151 | let nonSilentChunks = calculateActiveChunks(in: waveform) 152 | var seekTimestamps = [(startTime: Float, endTime: Float)]() 153 | 154 | for chunk in nonSilentChunks { 155 | let startTimestamp = Float(chunk.startIndex) / Float(sampleRate) 156 | let endTimestamp = Float(chunk.endIndex) / Float(sampleRate) 157 | 158 | seekTimestamps.append(contentsOf: [(startTime: startTimestamp, endTime: endTimestamp)]) 159 | } 160 | 161 | return seekTimestamps 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/config-v03.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "whisperkit-coreml", 3 | "version": "0.3", 4 | "device_support": [ 5 | { 6 | "chips": "A12, A13, S9, S10", 7 | "identifiers": [ 8 | "iPhone11", 9 | "iPhone12", 10 | "Watch7", 11 | "Watch8" 12 | ], 13 | "models": { 14 | "default": "openai_whisper-tiny", 15 | "supported": [ 16 | "openai_whisper-tiny", 17 | "openai_whisper-tiny.en", 18 | "openai_whisper-base", 19 | "openai_whisper-base.en" 20 | ] 21 | } 22 | }, 23 | { 24 | "chips": "A14", 25 | "identifiers": [ 26 | "iPhone13", 27 | "iPad13,1", 28 | "iPad13,2", 29 | "iPad13,18", 30 | "iPad13,19" 31 | ], 32 | "models": { 33 | "default": "openai_whisper-base", 34 | "supported": [ 35 | "openai_whisper-tiny", 36 | "openai_whisper-tiny.en", 37 | "openai_whisper-base", 38 | "openai_whisper-base.en", 39 | "openai_whisper-small", 40 | "openai_whisper-small.en" 41 | ] 42 | } 43 | }, 44 | { 45 | "chips": "A15, A16, A17 Pro, A18", 46 | "identifiers": [ 47 | "iPhone14", 48 | "iPhone15", 49 | "iPhone16", 50 | "iPhone17", 51 | "iPad14,1", 52 | "iPad14,2", 53 | "iPad15,7", 54 | "iPad15,8", 55 | "iPad16,1", 56 | "iPad16,2" 57 | ], 58 | "models": { 59 | "default": 
"openai_whisper-base", 60 | "supported": [ 61 | "openai_whisper-tiny", 62 | "openai_whisper-tiny.en", 63 | "openai_whisper-base", 64 | "openai_whisper-base.en", 65 | "openai_whisper-small", 66 | "openai_whisper-small.en", 67 | "openai_whisper-large-v2_949MB", 68 | "openai_whisper-large-v2_turbo_955MB", 69 | "openai_whisper-large-v3_947MB", 70 | "openai_whisper-large-v3_turbo_954MB", 71 | "distil-whisper_distil-large-v3_594MB", 72 | "distil-whisper_distil-large-v3_turbo_600MB", 73 | "openai_whisper-large-v3-v20240930_626MB", 74 | "openai_whisper-large-v3-v20240930_turbo_632MB" 75 | ] 76 | } 77 | }, 78 | { 79 | "chips": "M1", 80 | "identifiers": [ 81 | "MacBookPro17,1", 82 | "MacBookPro18,1", 83 | "MacBookPro18,2", 84 | "MacBookPro18,3", 85 | "MacBookPro18,4", 86 | "MacBookAir10,1", 87 | "Macmini9,1", 88 | "iMac21,1", 89 | "iMac21,2", 90 | "Mac13", 91 | "iPad13,4", 92 | "iPad13,5", 93 | "iPad13,6", 94 | "iPad13,7", 95 | "iPad13,8", 96 | "iPad13,9", 97 | "iPad13,10", 98 | "iPad13,11", 99 | "iPad13,16", 100 | "iPad13,17" 101 | ], 102 | "models": { 103 | "default": "openai_whisper-large-v3-v20240930_626MB", 104 | "supported": [ 105 | "openai_whisper-tiny", 106 | "openai_whisper-tiny.en", 107 | "openai_whisper-base", 108 | "openai_whisper-base.en", 109 | "openai_whisper-small", 110 | "openai_whisper-small.en", 111 | "openai_whisper-large-v2", 112 | "openai_whisper-large-v2_949MB", 113 | "openai_whisper-large-v3", 114 | "openai_whisper-large-v3_947MB", 115 | "distil-whisper_distil-large-v3", 116 | "distil-whisper_distil-large-v3_594MB", 117 | "openai_whisper-large-v3-v20240930_626MB" 118 | ] 119 | } 120 | }, 121 | { 122 | "chips": "M2, M3, M4", 123 | "identifiers": [ 124 | "Mac14", 125 | "Mac15", 126 | "Mac16", 127 | "iPad14,3", 128 | "iPad14,4", 129 | "iPad14,5", 130 | "iPad14,6", 131 | "iPad14,8", 132 | "iPad14,9", 133 | "iPad14,10", 134 | "iPad14,11", 135 | "iPad15", 136 | "iPad16" 137 | ], 138 | "models": { 139 | "default": "openai_whisper-large-v3-v20240930", 140 | "supported": [ 141 | "openai_whisper-tiny", 142 | "openai_whisper-tiny.en", 143 | "openai_whisper-base", 144 | "openai_whisper-base.en", 145 | "openai_whisper-small", 146 | "openai_whisper-small.en", 147 | "openai_whisper-large-v2", 148 | "openai_whisper-large-v2_949MB", 149 | "openai_whisper-large-v2_turbo", 150 | "openai_whisper-large-v2_turbo_955MB", 151 | "openai_whisper-large-v3", 152 | "openai_whisper-large-v3_947MB", 153 | "openai_whisper-large-v3_turbo", 154 | "openai_whisper-large-v3_turbo_954MB", 155 | "distil-whisper_distil-large-v3", 156 | "distil-whisper_distil-large-v3_594MB", 157 | "distil-whisper_distil-large-v3_turbo", 158 | "distil-whisper_distil-large-v3_turbo_600MB", 159 | "openai_whisper-large-v3-v20240930", 160 | "openai_whisper-large-v3-v20240930_turbo", 161 | "openai_whisper-large-v3-v20240930_626MB", 162 | "openai_whisper-large-v3-v20240930_turbo_632MB" 163 | ] 164 | } 165 | } 166 | ], 167 | "model_checksums": { 168 | "distil-whisper_distil-large-v3": "9cd8271143b919402ae776c30b479565", 169 | "distil-whisper_distil-large-v3_594MB": "ca532f45ddbf8a3d241132cc5cf41639", 170 | "distil-whisper_distil-large-v3_turbo": "b8638452c6568dfe33a33bfcc2bc6aca", 171 | "distil-whisper_distil-large-v3_turbo_600MB": "81746b4b1afbbb01a8ae9ea452460d88", 172 | "openai_whisper-base.en": "fbcfd586f15e2952251b1d3257f18471", 173 | "openai_whisper-base": "36e60501ad0f01c1a5719e83a1f63f20", 174 | "openai_whisper-large-v2": "21b86c07318aeeef54598f15b7903979", 175 | "openai_whisper-large-v2_949MB": "71bad4e1566749d1060eda42308d9fb4", 
176 | "openai_whisper-large-v2_turbo": "7734959b6550e7b5c2d732bf2b7acd23", 177 | "openai_whisper-large-v2_turbo_955MB": "cb6411862a48ec75325572081f01e5b5", 178 | "openai_whisper-large-v3-v20240930": "17ebd78ff7edfa59001b554e9cc4c021", 179 | "openai_whisper-large-v3-v20240930_547MB": "c945dad68449ac3c78ecb2d561ac189d", 180 | "openai_whisper-large-v3-v20240930_626MB": "578fe5a07f4eb7e4187c920bca571aa5", 181 | "openai_whisper-large-v3-v20240930_turbo": "dfbf09ab741af1d5400ddbd07bb37dad", 182 | "openai_whisper-large-v3-v20240930_turbo_632MB": "33954440dbd785ca1828afe25514f5a5", 183 | "openai_whisper-large-v3": "a6f24dc72785722e9cea89e227856dfe", 184 | "openai_whisper-large-v3_947MB": "ef6b0e9622a046ce2361b4c72307877f", 185 | "openai_whisper-large-v3_turbo": "c550fbdea70c5784d322c0a427f8b5cd", 186 | "openai_whisper-large-v3_turbo_954MB": "e639c4bb98d905064ef5dd38757dd9d1", 187 | "openai_whisper-small.en": "38efe6a00706bbdb995795c67a836e5e", 188 | "openai_whisper-small": "f1d21adb950bc9be5d5343bcdeccd23b", 189 | "openai_whisper-tiny.en": "e1183fd55448923b1ce43a2da67aa21f", 190 | "openai_whisper-tiny": "7147518a3d68ddbea0691e04cfffa4ff" 191 | } 192 | } 193 | --------------------------------------------------------------------------------