├── Examples
│   ├── WhisperAX
│   │   ├── WhisperAXTests
│   │   │   ├── WhisperKitTests
│   │   │   └── WhisperAXTests.swift
│   │   ├── Debug.xcconfig
│   │   ├── WhisperAXWatchApp
│   │   │   ├── Assets.xcassets
│   │   │   │   ├── Contents.json
│   │   │   │   ├── AppIcon.appiconset
│   │   │   │   │   ├── appstore.png
│   │   │   │   │   └── Contents.json
│   │   │   │   └── AccentColor.colorset
│   │   │   │       └── Contents.json
│   │   │   ├── Preview Content
│   │   │   │   └── Preview Assets.xcassets
│   │   │   │       └── Contents.json
│   │   │   └── WhisperAXWatchApp.swift
│   │   ├── WhisperAX
│   │   │   ├── Resources
│   │   │   │   ├── Assets.xcassets
│   │   │   │   │   ├── Contents.json
│   │   │   │   │   └── AppIcon.appiconset
│   │   │   │   │       ├── 100.png
│   │   │   │   │       ├── 102.png
│   │   │   │   │       ├── 108.png
│   │   │   │   │       ├── 114.png
│   │   │   │   │       ├── 120.png
│   │   │   │   │       ├── 128.png
│   │   │   │   │       ├── 136.png
│   │   │   │   │       ├── 152.png
│   │   │   │   │       ├── 16.png
│   │   │   │   │       ├── 167.png
│   │   │   │   │       ├── 172.png
│   │   │   │   │       ├── 180.png
│   │   │   │   │       ├── 192.png
│   │   │   │   │       ├── 196.png
│   │   │   │   │       ├── 216.png
│   │   │   │   │       ├── 234.png
│   │   │   │   │       ├── 256.png
│   │   │   │   │       ├── 258.png
│   │   │   │   │       ├── 32.png
│   │   │   │   │       ├── 40.png
│   │   │   │   │       ├── 44.png
│   │   │   │   │       ├── 48.png
│   │   │   │   │       ├── 512.png
│   │   │   │   │       ├── 55.png
│   │   │   │   │       ├── 58.png
│   │   │   │   │       ├── 60.png
│   │   │   │   │       ├── 64.png
│   │   │   │   │       ├── 66.png
│   │   │   │   │       ├── 76.png
│   │   │   │   │       ├── 80.png
│   │   │   │   │       ├── 87.png
│   │   │   │   │       ├── 88.png
│   │   │   │   │       ├── 92.png
│   │   │   │   │       ├── 1024.png
│   │   │   │   │       ├── 120 1.png
│   │   │   │   │       ├── 128 1.png
│   │   │   │   │       ├── 58 1.png
│   │   │   │   │       ├── 60 1.png
│   │   │   │   │       ├── 64 1.png
│   │   │   │   │       ├── 80 1.png
│   │   │   │   │       ├── 87 1.png
│   │   │   │   │       ├── 1024 1.png
│   │   │   │   │       ├── 1024 2.png
│   │   │   │   │       └── Contents.json
│   │   │   │   ├── Info.plist
│   │   │   │   └── WhisperAX.entitlements
│   │   │   ├── Preview Content
│   │   │   │   └── Preview Assets.xcassets
│   │   │   │       └── Contents.json
│   │   │   ├── Info.plist
│   │   │   └── WhisperAXApp.swift
│   │   ├── WhisperAX.xcodeproj
│   │   │   ├── project.xcworkspace
│   │   │   │   ├── contents.xcworkspacedata
│   │   │   │   └── xcshareddata
│   │   │   │       ├── IDEWorkspaceChecks.plist
│   │   │   │       └── swiftpm
│   │   │   │           └── Package.resolved
│   │   │   └── xcshareddata
│   │   │       └── xcschemes
│   │   │           └── WhisperAX.xcscheme
│   │   ├── WhisperAXUITests
│   │   │   ├── WhisperAXUITestsLaunchTests.swift
│   │   │   └── WhisperAXUITests.swift
│   │   ├── WhisperAXWatchAppUITests
│   │   │   ├── WhisperAX_Watch_AppUITestsLaunchTests.swift
│   │   │   └── WhisperAX_Watch_AppUITests.swift
│   │   └── WhisperAXWatchAppTests
│   │       └── WhisperAX_Watch_AppTests.swift
│   └── ServeCLIClient
│       ├── Python
│       │   ├── pyproject.toml
│       │   ├── README.md
│       │   ├── test_translate.py
│       │   └── test_transcribe.py
│       ├── Swift
│       │   ├── updateClient.sh
│       │   ├── Package.swift
│       │   ├── README.md
│       │   └── Package.resolved
│       └── Curl
│           ├── README.md
│           ├── translate.sh
│           ├── transcribe.sh
│           └── test.sh
├── .spi.yml
├── Tests
│   └── WhisperKitTests
│       ├── Resources
│       │   ├── jfk.wav
│       │   ├── ted_60.m4a
│       │   ├── jfk_441khz.m4a
│       │   ├── 8_Channel_ID.m4a
│       │   ├── es_test_clip.wav
│       │   ├── ja_test_clip.wav
│       │   ├── config-v02.json
│       │   └── config-v03.json
│       ├── UnitTestsPlan.xctestplan
│       └── Evaluate
│           ├── WERUtils.swift
│           └── DistanceCalculation.swift
├── scripts
│   ├── specs
│   │   └── openapi-generator-config.yaml
│   └── pyproject.toml
├── .github
│   └── workflows
│       ├── homebrew-update.yml
│       ├── release-tests.yml
│       ├── expo-update.yml
│       ├── development-tests.yml
│       └── unit-tests.yml
├── Sources
│   ├── WhisperKitCLI
│   │   ├── Server
│   │   │   ├── ServeCLIArguments.swift
│   │   │   └── ServeCLI.swift
│   │   ├── CLIUtils.swift
│   │   ├── WhisperKitCLI.swift
│   │   ├── TranscribeCLIUtils.swift
│   │   └── TranscribeCLIArguments.swift
│   └── WhisperKit
│       ├── Utilities
│       │   ├── WhisperError.swift
│       │   ├── TextUtilities.swift
│       │   ├── Concurrency.swift
│       │   ├── Logging.swift
│       │   ├── ResultWriter.swift
│       │   └── Extensions+Internal.swift
│       └── Core
│           ├── Audio
│           │   ├── EnergyVAD.swift
│           │   ├── AudioChunker.swift
│           │   └── VoiceActivityDetector.swift
│           ├── FeatureExtractor.swift
│           └── AudioEncoder.swift
├── .swiftpm
│   ├── configuration
│   │   └── Package.resolved
│   └── xcode
│       └── xcshareddata
│           └── xcschemes
│               └── whisperkit-Package.xcscheme
├── LICENSE
├── fastlane
│   └── README.md
├── Package.resolved
├── .gitignore
├── Package.swift
├── CONTRIBUTING.md
├── BENCHMARKS.md
└── Makefile
/Examples/WhisperAX/WhisperAXTests/WhisperKitTests:
--------------------------------------------------------------------------------
1 | ../../../Tests/WhisperKitTests
--------------------------------------------------------------------------------
/.spi.yml:
--------------------------------------------------------------------------------
1 | version: 1
2 | builder:
3 | configs:
4 | - documentation_targets: [WhisperKit]
--------------------------------------------------------------------------------
/Examples/WhisperAX/Debug.xcconfig:
--------------------------------------------------------------------------------
1 | // Run `make setup` to add your team here
2 | DEVELOPMENT_TEAM=
3 |
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/jfk.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Tests/WhisperKitTests/Resources/jfk.wav
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/ted_60.m4a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Tests/WhisperKitTests/Resources/ted_60.m4a
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/jfk_441khz.m4a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Tests/WhisperKitTests/Resources/jfk_441khz.m4a
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "info" : {
3 | "author" : "xcode",
4 | "version" : 1
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/8_Channel_ID.m4a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Tests/WhisperKitTests/Resources/8_Channel_ID.m4a
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/es_test_clip.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Tests/WhisperKitTests/Resources/es_test_clip.wav
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/ja_test_clip.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Tests/WhisperKitTests/Resources/ja_test_clip.wav
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "info" : {
3 | "author" : "xcode",
4 | "version" : 1
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Preview Content/Preview Assets.xcassets/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "info" : {
3 | "author" : "xcode",
4 | "version" : 1
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/Preview Content/Preview Assets.xcassets/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "info" : {
3 | "author" : "xcode",
4 | "version" : 1
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/100.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/102.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/102.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/108.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/108.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/114.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/114.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/136.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/136.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/152.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/152.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/16.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/167.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/167.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/172.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/172.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/180.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/180.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/192.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/192.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/196.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/196.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/216.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/216.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/234.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/234.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/256.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/258.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/258.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/32.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/40.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/40.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/44.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/44.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/48.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/512.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/55.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/55.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/66.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/66.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/76.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/76.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/88.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/88.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/92.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/92.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 2.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AppIcon.appiconset/appstore.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/HEAD/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AppIcon.appiconset/appstore.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/contents.xcworkspacedata:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <Workspace
3 |    version = "1.0">
4 |    <FileRef
5 |       location = "self:">
6 |    </FileRef>
7 | </Workspace>
8 |
--------------------------------------------------------------------------------
/scripts/specs/openapi-generator-config.yaml:
--------------------------------------------------------------------------------
1 | generate:
2 | - types
3 | - server
4 |
5 | accessModifier: internal
6 | namingStrategy: idiomatic
7 |
8 | filter:
9 | paths:
10 | - /audio/transcriptions
11 | - /audio/translations
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Info.plist:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict/>
5 | </plist>
6 |
--------------------------------------------------------------------------------
/scripts/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "whisperkit-scripts"
3 | version = "0.1.0"
4 | description = "Scripts for WhisperKit development"
5 | requires-python = ">=3.8"
6 | dependencies = [
7 | "requests",
8 | "ruamel.yaml",
9 | ]
10 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AccentColor.colorset/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "colors" : [
3 | {
4 | "idiom" : "universal"
5 | }
6 | ],
7 | "info" : {
8 | "author" : "xcode",
9 | "version" : 1
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/Examples/ServeCLIClient/Python/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "whisperkit-python-client"
3 | version = "0.1.0"
4 | description = "Python client for WhisperKit local server"
5 | requires-python = ">=3.8"
6 | dependencies = [
7 | "openai>=1.0.0",
8 | "requests>=2.25.0",
9 | "python-dotenv>=0.19.0",
10 | "argparse",
11 | ]
12 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 | 	<key>IDEDidComputeMac32BitWarning</key>
6 | 	<true/>
7 | </dict>
8 | </plist>
9 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AppIcon.appiconset/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "images" : [
3 | {
4 | "filename" : "appstore.png",
5 | "idiom" : "universal",
6 | "platform" : "watchos",
7 | "size" : "1024x1024"
8 | }
9 | ],
10 | "info" : {
11 | "author" : "xcode",
12 | "version" : 1
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/WhisperAXWatchApp.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import SwiftUI
5 |
6 | @main
7 | struct WhisperAXWatchApp: App {
8 | var body: some Scene {
9 | WindowGroup {
10 | WhisperAXWatchView()
11 | }
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/.github/workflows/homebrew-update.yml:
--------------------------------------------------------------------------------
1 | name: Bump Homebrew Formula
2 |
3 | on:
4 | push:
5 | tags: 'v*'
6 |
7 | jobs:
8 | homebrew:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: mislav/bump-homebrew-formula-action@v3
12 | with:
13 | formula-name: whisperkit-cli
14 | env:
15 | COMMITTER_TOKEN: ${{ secrets.COMMITTER_TOKEN }}
16 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Info.plist:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 | 	<key>NSPrivacyAccessedAPITypes</key>
6 | 	<array>
7 | 		<dict>
8 | 			<key>NSPrivacyAccessedAPIType</key>
9 | 			<string>NSPrivacyAccessedAPICategoryUserDefaults</string>
10 | 		</dict>
11 | 	</array>
12 | </dict>
13 | </plist>
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/WhisperAXApp.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import SwiftUI
5 |
6 | @main
7 | struct WhisperAXApp: App {
8 | var body: some Scene {
9 | WindowGroup {
10 | ContentView()
11 | #if os(macOS)
12 | .frame(minWidth: 1000, minHeight: 700)
13 | #endif
14 | }
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Sources/WhisperKitCLI/Server/ServeCLIArguments.swift:
--------------------------------------------------------------------------------
1 | // Copyright © 2025 Argmax, Inc. All rights reserved.
2 | // For licensing see accompanying LICENSE.md file.
3 |
4 | import ArgumentParser
5 |
6 | struct ServeCLIArguments: ParsableArguments {
7 | @OptionGroup
8 | var transcribe: TranscribeCLIArguments
9 |
10 | @Option(name: .long, help: "Port to run the server on")
11 | var port: Int = 50060
12 |
13 | @Option(name: .long, help: "Host to bind the server to")
14 | var host: String = "localhost"
15 | }
16 |
--------------------------------------------------------------------------------
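
The `ServeCLIArguments` group above nests the shared transcription flags alongside the server-specific `--port` and `--host` options with their defaults. A minimal, hedged sketch of how ArgumentParser materializes such a type, assuming the nested `TranscribeCLIArguments` has no required options:

```swift
import ArgumentParser

// Sketch only: ParsableArguments types can be parsed directly from an
// argument array; unspecified options fall back to the declared defaults
// (port 50060, host "localhost" above).
let args = try ServeCLIArguments.parse(["--port", "8080", "--host", "0.0.0.0"])
print(args.port, args.host) // 8080 0.0.0.0
```
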
/Examples/ServeCLIClient/Swift/updateClient.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Update WhisperKit Swift Client from OpenAPI spec
4 | # This script regenerates the client code when the server spec changes
5 |
6 | set -e
7 |
8 | echo "Updating WhisperKit Swift Client..."
9 |
10 | # Generate client code
11 | echo "Generating client code..."
12 | swift run swift-openapi-generator generate \
13 | ../../../scripts/specs/localserver_openapi.yaml \
14 | --output-directory Sources/WhisperKitSwiftClient/Generated \
15 | --access-modifier public \
16 | --mode client \
17 | --mode types
18 |
19 | echo "Client code updated successfully!"
20 | echo "Files generated in Sources/Generated/"
21 |
--------------------------------------------------------------------------------
/.swiftpm/configuration/Package.resolved:
--------------------------------------------------------------------------------
1 | {
2 | "pins" : [
3 | {
4 | "identity" : "swift-argument-parser",
5 | "kind" : "remoteSourceControl",
6 | "location" : "https://github.com/apple/swift-argument-parser.git",
7 | "state" : {
8 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41",
9 | "version" : "1.3.0"
10 | }
11 | },
12 | {
13 | "identity" : "swift-transformers",
14 | "kind" : "remoteSourceControl",
15 | "location" : "https://github.com/huggingface/swift-transformers.git",
16 | "state" : {
17 | "revision" : "74b94211bdc741694ed7e700a1104c72e5ba68fe",
18 | "version" : "0.1.7"
19 | }
20 | }
21 | ],
22 | "version" : 2
23 | }
24 |
--------------------------------------------------------------------------------
/Sources/WhisperKitCLI/CLIUtils.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import ArgumentParser
5 | import CoreML
6 | import Foundation
7 |
8 | enum ComputeUnits: String, ExpressibleByArgument, CaseIterable {
9 | case all, cpuAndGPU, cpuOnly, cpuAndNeuralEngine, random
10 | var asMLComputeUnits: MLComputeUnits {
11 | switch self {
12 | case .all: return .all
13 | case .cpuAndGPU: return .cpuAndGPU
14 | case .cpuOnly: return .cpuOnly
15 | case .cpuAndNeuralEngine: return .cpuAndNeuralEngine
16 | case .random: return Bool.random() ? .cpuAndGPU : .cpuAndNeuralEngine
17 | }
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
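
The `ComputeUnits` enum above bridges CLI strings to Core ML's `MLComputeUnits`, with `random` flipping between GPU and Neural Engine (useful for exercising both accelerators). An illustrative sketch of where such a value typically lands; the model URL is a hypothetical placeholder:

```swift
import CoreML

// Sketch only: route the parsed compute units into a Core ML model
// configuration before loading a model.
let config = MLModelConfiguration()
config.computeUnits = ComputeUnits.cpuAndNeuralEngine.asMLComputeUnits
// let model = try MLModel(contentsOf: modelURL, configuration: config) // modelURL is hypothetical
```
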
/Examples/WhisperAX/WhisperAX/Resources/WhisperAX.entitlements:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 | 	<key>com.apple.developer.kernel.increased-memory-limit</key>
6 | 	<true/>
7 | 	<key>com.apple.security.app-sandbox</key>
8 | 	<true/>
9 | 	<key>com.apple.security.device.audio-input</key>
10 | 	<true/>
11 | 	<key>com.apple.security.files.downloads.read-only</key>
12 | 	<true/>
13 | 	<key>com.apple.security.files.user-selected.read-write</key>
14 | 	<true/>
15 | 	<key>com.apple.security.network.client</key>
16 | 	<true/>
17 | 	<key>com.apple.security.network.server</key>
18 | 	<true/>
19 | </dict>
20 | </plist>
21 |
--------------------------------------------------------------------------------
/Sources/WhisperKitCLI/WhisperKitCLI.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import ArgumentParser
5 | import Foundation
6 |
7 | let VERSION: String = "development"
8 |
9 | var subcommands: [ParsableCommand.Type] {
10 | #if BUILD_SERVER_CLI
11 | [TranscribeCLI.self, ServeCLI.self]
12 | #else
13 | [TranscribeCLI.self]
14 | #endif
15 | }
16 |
17 | @main
18 | struct WhisperKitCLI: AsyncParsableCommand {
19 | static let configuration = CommandConfiguration(
20 | commandName: "whisperkit-cli",
21 | abstract: "WhisperKit CLI",
22 | discussion: "Swift native speech recognition with Whisper for Apple Silicon",
23 | version: VERSION,
24 | subcommands: subcommands
25 | )
26 | }
27 |
--------------------------------------------------------------------------------
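
The `serve` subcommand above is compiled in only when the `BUILD_SERVER_CLI` compilation condition is set. As a hedged sketch (not this package's actual manifest; names are placeholders), such a condition is typically enabled through `swiftSettings` in Package.swift:

```swift
// swift-tools-version: 5.9
// Illustrative manifest only; package and target names are hypothetical.
import PackageDescription

let package = Package(
    name: "Example",
    targets: [
        .executableTarget(
            name: "ExampleCLI",
            // Makes `#if BUILD_SERVER_CLI` blocks compile in this target.
            swiftSettings: [.define("BUILD_SERVER_CLI")]
        )
    ]
)
```
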
/Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved:
--------------------------------------------------------------------------------
1 | {
2 | "originHash" : "831ad63194a5262b2549d58e383a520f9cbbc80b4a75660fbbcc56d65edfdab4",
3 | "pins" : [
4 | {
5 | "identity" : "swift-argument-parser",
6 | "kind" : "remoteSourceControl",
7 | "location" : "https://github.com/apple/swift-argument-parser.git",
8 | "state" : {
9 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41",
10 | "version" : "1.3.0"
11 | }
12 | },
13 | {
14 | "identity" : "swift-transformers",
15 | "kind" : "remoteSourceControl",
16 | "location" : "https://github.com/huggingface/swift-transformers.git",
17 | "state" : {
18 | "revision" : "fc6543263e4caed9bf6107466d625cfae9357f08",
19 | "version" : "0.1.8"
20 | }
21 | }
22 | ],
23 | "version" : 3
24 | }
25 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXUITests/WhisperAXUITestsLaunchTests.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import XCTest
5 |
6 | final class WhisperAXUITestsLaunchTests: XCTestCase {
7 | override class var runsForEachTargetApplicationUIConfiguration: Bool {
8 | true
9 | }
10 |
11 | override func setUpWithError() throws {
12 | continueAfterFailure = false
13 | }
14 |
15 | func testLaunch() throws {
16 | let app = XCUIApplication()
17 | app.launch()
18 |
19 | // Insert steps here to perform after app launch but before taking a screenshot,
20 | // such as logging into a test account or navigating somewhere in the app
21 |
22 | let attachment = XCTAttachment(screenshot: app.screenshot())
23 | attachment.name = "Launch Screen"
24 | attachment.lifetime = .keepAlways
25 | add(attachment)
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchAppUITests/WhisperAX_Watch_AppUITestsLaunchTests.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import XCTest
5 |
6 | final class WhisperAX_Watch_AppUITestsLaunchTests: XCTestCase {
7 | override class var runsForEachTargetApplicationUIConfiguration: Bool {
8 | true
9 | }
10 |
11 | override func setUpWithError() throws {
12 | continueAfterFailure = false
13 | }
14 |
15 | func testLaunch() throws {
16 | let app = XCUIApplication()
17 | app.launch()
18 |
19 | // Insert steps here to perform after app launch but before taking a screenshot,
20 | // such as logging into a test account or navigating somewhere in the app
21 |
22 | let attachment = XCTAttachment(screenshot: app.screenshot())
23 | attachment.name = "Launch Screen"
24 | attachment.lifetime = .keepAlways
25 | add(attachment)
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 argmax, inc.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Examples/ServeCLIClient/Swift/Package.swift:
--------------------------------------------------------------------------------
1 | // swift-tools-version: 5.9
2 | import PackageDescription
3 |
4 | let package = Package(
5 | name: "WhisperKitSwiftClient",
6 | platforms: [
7 | .macOS(.v13)
8 | ],
9 | products: [
10 | .executable(name: "whisperkit-client", targets: ["WhisperKitSwiftClient"]),
11 | ],
12 | dependencies: [
13 | .package(url: "https://github.com/apple/swift-argument-parser", from: "1.2.0"),
14 | .package(url: "https://github.com/apple/swift-openapi-runtime", from: "1.0.0"),
15 | .package(url: "https://github.com/apple/swift-openapi-urlsession", from: "1.0.0"),
16 | .package(url: "https://github.com/apple/swift-http-types", from: "1.0.0"),
17 | .package(url: "https://github.com/apple/swift-openapi-generator", from: "1.0.0"),
18 | ],
19 | targets: [
20 | .executableTarget(
21 | name: "WhisperKitSwiftClient",
22 | dependencies: [
23 | .product(name: "ArgumentParser", package: "swift-argument-parser"),
24 | .product(name: "OpenAPIRuntime", package: "swift-openapi-runtime"),
25 | .product(name: "OpenAPIURLSession", package: "swift-openapi-urlsession"),
26 | .product(name: "HTTPTypes", package: "swift-http-types"),
27 | ],
28 | path: "Sources/WhisperKitSwiftClient"
29 | )
30 | ]
31 | )
32 |
--------------------------------------------------------------------------------
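
Given the OpenAPI runtime and URLSession transport dependencies above, a generated client is typically constructed against the local server as follows. This is a sketch: `Client` is the type emitted by swift-openapi-generator for this package's spec, and the URL assumes the server's default port from `ServeCLIArguments`:

```swift
import Foundation
import OpenAPIRuntime
import OpenAPIURLSession

// Sketch only: instantiate the generated client over URLSession.
let client = Client(
    serverURL: URL(string: "http://localhost:50060/v1")!,
    transport: URLSessionTransport()
)
```
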
/fastlane/README.md:
--------------------------------------------------------------------------------
1 | fastlane documentation
2 | ----
3 |
4 | # Installation
5 |
6 | Make sure you have the latest version of the Xcode command line tools installed:
7 |
8 | ```sh
9 | xcode-select --install
10 | ```
11 |
12 | For _fastlane_ installation instructions, see [Installing _fastlane_](https://docs.fastlane.tools/#installing-fastlane)
13 |
14 | # Available Actions
15 |
16 | ## iOS
17 |
18 | ### ios list_devices
19 |
20 | ```sh
21 | [bundle exec] fastlane ios list_devices
22 | ```
23 |
24 | List all connected devices
25 |
26 | ### ios benchmark
27 |
28 | ```sh
29 | [bundle exec] fastlane ios benchmark
30 | ```
31 |
32 | Benchmark devices with options
33 |
34 | ### ios extract_results
35 |
36 | ```sh
37 | [bundle exec] fastlane ios extract_results
38 | ```
39 |
40 | Extract benchmark results
41 |
42 | ### ios upload_results
43 |
44 | ```sh
45 | [bundle exec] fastlane ios upload_results
46 | ```
47 |
48 | Upload benchmark results
49 |
50 | ----
51 |
52 | This README.md is auto-generated and will be re-generated every time [_fastlane_](https://fastlane.tools) is run.
53 |
54 | More information about _fastlane_ can be found on [fastlane.tools](https://fastlane.tools).
55 |
56 | The documentation of _fastlane_ can be found on [docs.fastlane.tools](https://docs.fastlane.tools).
57 |
--------------------------------------------------------------------------------
/Package.resolved:
--------------------------------------------------------------------------------
1 | {
2 | "pins" : [
3 | {
4 | "identity" : "swift-argument-parser",
5 | "kind" : "remoteSourceControl",
6 | "location" : "https://github.com/apple/swift-argument-parser.git",
7 | "state" : {
8 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41",
9 | "version" : "1.3.0"
10 | }
11 | },
12 | {
13 | "identity" : "swift-collections",
14 | "kind" : "remoteSourceControl",
15 | "location" : "https://github.com/apple/swift-collections.git",
16 | "state" : {
17 | "revision" : "7b847a3b7008b2dc2f47ca3110d8c782fb2e5c7e",
18 | "version" : "1.3.0"
19 | }
20 | },
21 | {
22 | "identity" : "swift-jinja",
23 | "kind" : "remoteSourceControl",
24 | "location" : "https://github.com/huggingface/swift-jinja.git",
25 | "state" : {
26 | "revision" : "38b7beeec5d968accd19a8a70c1882cc89979d1c",
27 | "version" : "2.1.1"
28 | }
29 | },
30 | {
31 | "identity" : "swift-transformers",
32 | "kind" : "remoteSourceControl",
33 | "location" : "https://github.com/huggingface/swift-transformers.git",
34 | "state" : {
35 | "revision" : "d363e83a77bafe144808a3d01556139fe67cd8bc",
36 | "version" : "1.1.2"
37 | }
38 | }
39 | ],
40 | "version" : 2
41 | }
42 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXTests/WhisperAXTests.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import XCTest
5 |
6 | final class WhisperAXTests: XCTestCase {
7 | override func setUpWithError() throws {
8 | // Put setup code here. This method is called before the invocation of each test method in the class.
9 | }
10 |
11 | override func tearDownWithError() throws {
12 | // Put teardown code here. This method is called after the invocation of each test method in the class.
13 | }
14 |
15 | func testExample() throws {
16 | // This is an example of a functional test case.
17 | // Use XCTAssert and related functions to verify your tests produce the correct results.
18 | // Any test you write for XCTest can be annotated as throws and async.
19 | // Mark your test throws to produce an unexpected failure when your test encounters an uncaught error.
20 | // Mark your test async to allow awaiting for asynchronous code to complete. Check the results with assertions afterwards.
21 | }
22 |
23 | func testPerformanceExample() throws {
24 | // This is an example of a performance test case.
25 | measure {
26 | // Put the code you want to measure the time of here.
27 | }
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/.github/workflows/release-tests.yml:
--------------------------------------------------------------------------------
1 | name: Release Tests
2 |
3 | on:
4 | push:
5 | branches: ["main"]
6 | workflow_dispatch:
7 |
8 | jobs:
9 | build-and-test-all-platforms:
10 | name: "Build and Test All Platforms"
11 | strategy:
12 | matrix:
13 | include:
14 | - os: macos-14
15 | ios-version: "17.5"
16 | ios-device: "iPhone 15"
17 | watchos-version: "10.2"
18 | visionos-version: "1.0"
19 | xcode-version: "16.1"
20 | - os: macos-15
21 | ios-version: "18.5"
22 | ios-device: "iPhone 16"
23 | watchos-version: "11.5"
24 | visionos-version: "2.5"
25 | xcode-version: "16.4"
26 | - os: macos-26
27 | ios-version: "26.0.1"
28 | ios-device: "iPhone 17"
29 | watchos-version: "26.0"
30 | visionos-version: "26.0"
31 | macos-runner: "macos-26"
32 | xcode-version: "26.0"
33 | uses: ./.github/workflows/unit-tests.yml
34 | with:
35 | macos-runner: ${{ matrix.os }}
36 | ios-version: ${{ matrix.ios-version }}
37 | ios-device: ${{ matrix.ios-device }}
38 | watchos-version: ${{ matrix.watchos-version }}
39 | visionos-version: ${{ matrix.visionos-version }}
40 | xcode-version: ${{ matrix.xcode-version }}
41 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchAppTests/WhisperAX_Watch_AppTests.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | @testable import WhisperAX_Watch_App
5 | import XCTest
6 |
7 | final class WhisperAX_Watch_AppTests: XCTestCase {
8 | override func setUpWithError() throws {
9 | // Put setup code here. This method is called before the invocation of each test method in the class.
10 | }
11 |
12 | override func tearDownWithError() throws {
13 | // Put teardown code here. This method is called after the invocation of each test method in the class.
14 | }
15 |
16 | func testExample() throws {
17 | // This is an example of a functional test case.
18 | // Use XCTAssert and related functions to verify your tests produce the correct results.
19 | // Any test you write for XCTest can be annotated as throws and async.
20 | // Mark your test throws to produce an unexpected failure when your test encounters an uncaught error.
21 | // Tests marked async will run the test method on an arbitrary thread managed by the Swift runtime.
22 | }
23 |
24 | func testPerformanceExample() throws {
25 | // This is an example of a performance test case.
26 | self.measure {
27 | // Put the code you want to measure the time of here.
28 | }
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | /.build
3 | /Packages
4 | .vscode/
5 | xcuserdata/
6 | DerivedData/
7 | .swiftpm/configuration/registries.json
8 | .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
9 | .swiftpm/xcode/xcshareddata/
10 | **/*.xcscheme
11 | .netrc
12 | .env
13 |
14 | # Core ML Model Files
15 | Models
16 | **/*.mlpackage
17 | **/*.mlmodel
18 | **/*.mlmodelc
19 | **/*.zip
20 | **/*.tar.gz
21 |
22 | # Audio files (add manually if needed)
23 | **/*.wav
24 | **/*.mp3
25 | **/*.m4a
26 | **/*.flac
27 |
28 | # Swift Client build artifacts
29 | Examples/ServeCLIClient/Swift/.build
30 | Examples/ServeCLIClient/Swift/.swiftpm
31 |
32 | ## Xcode
33 | # Build generated
34 | build/
35 | DerivedData/
36 |
37 | # Various settings
38 | *.pbxuser
39 | !default.pbxuser
40 | *.mode1v3
41 | !default.mode1v3
42 | *.mode2v3
43 | !default.mode2v3
44 | *.perspectivev3
45 | !default.perspectivev3
46 | xcuserdata/
47 |
48 | # Other
49 | *.moved-aside
50 | *.xccheckout
51 | *.xcscmblueprint
52 |
53 | # Obj-C/Swift specific
54 | *.hmap
55 | *.ipa
56 | *.dSYM.zip
57 | *.dSYM
58 |
59 | # fastlane
60 | fastlane/report.xml
61 | fastlane/Preview.html
62 | fastlane/screenshots
63 | fastlane/test_output
64 | fastlane/benchmark_data
65 | fastlane/upload_folder
66 |
67 | ### Xcode Patch ###
68 | **/*.xcconfig
69 | *.xcodeproj/*
70 | !*.xcodeproj/project.pbxproj
71 | !*.xcodeproj/xcshareddata/
72 | !*.xcworkspace/contents.xcworkspacedata
73 | /*.gcno
74 |
75 | # Swift build artifacts
76 | *.d
77 | *.o
78 | *.swiftdeps
79 | *.swiftmodule
80 | *.swiftdoc
81 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXUITests/WhisperAXUITests.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import XCTest
5 |
6 | final class WhisperAXUITests: XCTestCase {
7 | override func setUpWithError() throws {
8 | // Put setup code here. This method is called before the invocation of each test method in the class.
9 |
10 | // In UI tests it is usually best to stop immediately when a failure occurs.
11 | continueAfterFailure = false
12 |
13 | // In UI tests it’s important to set the initial state - such as interface orientation - required for your tests before they run. The setUp method is a good place to do this.
14 | }
15 |
16 | override func tearDownWithError() throws {
17 | // Put teardown code here. This method is called after the invocation of each test method in the class.
18 | }
19 |
20 | func testExample() throws {
21 | // UI tests must launch the application that they test.
22 | let app = XCUIApplication()
23 | app.launch()
24 |
25 | // Use XCTAssert and related functions to verify your tests produce the correct results.
26 | }
27 |
28 | func testLaunchPerformance() throws {
29 | if #available(macOS 10.15, iOS 13.0, tvOS 13.0, watchOS 7.0, *) {
30 | // This measures how long it takes to launch your application.
31 | measure(metrics: [XCTApplicationLaunchMetric()]) {
32 | XCUIApplication().launch()
33 | }
34 | }
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchAppUITests/WhisperAX_Watch_AppUITests.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import XCTest
5 |
6 | final class WhisperAX_Watch_AppUITests: XCTestCase {
7 | override func setUpWithError() throws {
8 | // Put setup code here. This method is called before the invocation of each test method in the class.
9 |
10 | // In UI tests it is usually best to stop immediately when a failure occurs.
11 | continueAfterFailure = false
12 |
13 | // In UI tests it’s important to set the initial state - such as interface orientation - required for your tests before they run. The setUp method is a good place to do this.
14 | }
15 |
16 | override func tearDownWithError() throws {
17 | // Put teardown code here. This method is called after the invocation of each test method in the class.
18 | }
19 |
20 | func testExample() throws {
21 | // UI tests must launch the application that they test.
22 | let app = XCUIApplication()
23 | app.launch()
24 |
25 | // Use XCTAssert and related functions to verify your tests produce the correct results.
26 | }
27 |
28 | func testLaunchPerformance() throws {
29 | if #available(macOS 10.15, iOS 13.0, tvOS 13.0, watchOS 7.0, *) {
30 | // This measures how long it takes to launch your application.
31 | measure(metrics: [XCTApplicationLaunchMetric()]) {
32 | XCUIApplication().launch()
33 | }
34 | }
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/UnitTestsPlan.xctestplan:
--------------------------------------------------------------------------------
1 | {
2 | "configurations" : [
3 | {
4 | "id" : "427C492D-C19B-4C99-A90A-F4AEF0EC4B54",
5 | "name" : "Configuration 1",
6 | "options" : {
7 |
8 | }
9 | }
10 | ],
11 | "defaultOptions" : {
12 | "maximumTestRepetitions" : 3,
13 | "testRepetitionMode" : "retryOnFailure",
14 | "testTimeoutsEnabled" : true
15 | },
16 | "testTargets" : [
17 | {
18 | "skippedTests" : [
19 | "FunctionalTests",
20 | "FunctionalTests\/testAsyncImplementation()",
21 | "FunctionalTests\/testBaseImplementation()",
22 | "FunctionalTests\/testBatchTranscribeAudioArrays()",
23 | "FunctionalTests\/testBatchTranscribeAudioPaths()",
24 | "FunctionalTests\/testBatchTranscribeAudioPathsWithErrors()",
25 | "FunctionalTests\/testInitLarge()",
26 | "FunctionalTests\/testModelSearchPathLarge()",
27 | "FunctionalTests\/testRealTimeFactorLarge()",
28 | "FunctionalTests\/testRealTimeFactorTiny()",
29 | "RegressionTests",
30 | "RegressionTests\/testHirschberg()",
31 | "RegressionTests\/testInMemoryAndDiskUsage()",
32 | "RegressionTests\/testLargeWER()",
33 | "RegressionTests\/testLevenshtein()",
34 | "RegressionTests\/testModelPerformance()",
35 | "RegressionTests\/testModelPerformanceWithDebugConfig()",
36 | "RegressionTests\/testNormalizer()"
37 | ],
38 | "target" : {
39 | "containerPath" : "container:",
40 | "identifier" : "WhisperKitTests",
41 | "name" : "WhisperKitTests"
42 | }
43 | }
44 | ],
45 | "version" : 1
46 | }
47 |
--------------------------------------------------------------------------------
/Sources/WhisperKit/Utilities/WhisperError.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 |
6 | @frozen
7 | public enum WhisperError: Error, LocalizedError, Equatable {
8 | case tokenizerUnavailable(String = "Tokenizer is unavailable")
9 | case modelsUnavailable(String = "Models are unavailable")
10 | case prefillFailed(String = "Prefill failed")
11 | case audioProcessingFailed(String = "Audio processing failed")
12 | case decodingLogitsFailed(String = "Unable to decode logits from the model output")
13 | case segmentingFailed(String = "Creating segments failed")
14 | case loadAudioFailed(String = "Load audio failed")
15 | case prepareDecoderInputsFailed(String = "Prepare decoder inputs failed")
16 | case transcriptionFailed(String = "Transcription failed")
17 | case decodingFailed(String = "Decoding failed")
18 | case microphoneUnavailable(String = "No available microphone to record or stream")
19 | case initializationError(String = "Error initializing WhisperKit")
20 |
21 | public var errorDescription: String? {
22 | switch self {
23 | case let .tokenizerUnavailable(message),
24 | let .modelsUnavailable(message),
25 | let .prefillFailed(message),
26 | let .audioProcessingFailed(message),
27 | let .decodingLogitsFailed(message),
28 | let .segmentingFailed(message),
29 | let .loadAudioFailed(message),
30 | let .prepareDecoderInputsFailed(message),
31 | let .transcriptionFailed(message),
32 | let .decodingFailed(message),
33 | let .microphoneUnavailable(message),
34 | let .initializationError(message):
35 | Logging.error(message)
36 | return message
37 | }
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
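
Because every `WhisperError` case carries a message with a sensible default, call sites can throw with or without custom text and surface it uniformly through `LocalizedError`. A minimal usage sketch based on the enum above:

```swift
// Sketch only: throw and pattern-match the WhisperError enum defined above.
do {
    throw WhisperError.loadAudioFailed("File not found: audio.wav")
} catch let error as WhisperError {
    // errorDescription returns (and logs) the associated message.
    print(error.errorDescription ?? "unknown error")
}
```
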
/Examples/ServeCLIClient/Curl/README.md:
--------------------------------------------------------------------------------
1 | # WhisperKit CurlClient
2 |
3 | A simple, lightweight client for the WhisperKit Local Server using shell scripts and curl.
4 |
5 | ## Quick Start
6 |
7 | 1. **Make scripts executable:**
8 | ```bash
9 | chmod +x *.sh
10 | ```
11 |
12 | 2. **Start the WhisperKit server:**
13 | ```bash
14 | whisperkit-cli serve --model tiny
15 | ```
16 |
17 | 3. **Use the scripts:**
18 | ```bash
19 | # Transcribe audio
20 | ./transcribe.sh audio.wav
21 |
22 | # Translate audio to English
23 | ./translate.sh audio.wav --language es
24 |
25 | # Run test suite
26 | ./test.sh
27 | ```
28 |
29 | ## Scripts
30 |
31 | ### `transcribe.sh`
32 | Transcribes audio files to text.
33 |
34 | **Basic usage:**
35 | ```bash
36 | ./transcribe.sh audio.wav
37 | ./transcribe.sh audio.wav --language en --timestamp-granularities word,segment
38 | ./transcribe.sh audio.wav --stream true --logprobs
39 | ```
40 |
41 | ### `translate.sh`
42 | Translates audio files to English.
43 |
44 | **Basic usage:**
45 | ```bash
46 | ./translate.sh audio.wav
47 | ./translate.sh audio.wav --language es
48 | ./translate.sh audio.wav --stream true --logprobs
49 | ```
50 |
51 | ### `test.sh`
52 | Runs comprehensive tests on sample files.
53 |
54 | ## Options
55 |
56 | - `-h, --help` - Show help
57 | - `-s, --server <url>` - Server URL (default: http://localhost:50060/v1)
58 | - `-l, --language <code>` - Source language (e.g., en, es, ja)
59 | - `-f, --response-format <format>` - Response format: json, verbose_json
60 | - `--timestamp-granularities <granularities>` - Timestamp granularities: word,segment
61 | - `--stream <true|false>` - Enable streaming (default: false)
62 | - `--logprobs` - Include logprobs in response (default: false)
63 | - `--temperature <value>` - Sampling temperature 0.0-1.0 (default: 0.0)
64 | - `--verbose` - Show verbose curl output
65 |
66 | ## Prerequisites
67 |
68 | - `curl` (usually pre-installed)
69 | - `bash` shell
70 | - WhisperKit Local Server running
71 |
--------------------------------------------------------------------------------
/.github/workflows/expo-update.yml:
--------------------------------------------------------------------------------
1 | # Tested on macOS with:
2 | # act -s COMMITTER_TOKEN="$(gh auth token)" release --container-architecture linux/amd64 -P ubuntu-latest=catthehacker/ubuntu:act-latest -e <(echo '{ "release": { "tag_name": "v0.0.0" }}')
3 | name: Update whisper-kit-expo
4 |
5 | on:
6 | release:
7 | types: [released]
8 |
9 | jobs:
10 | update-whisperkit:
11 | runs-on: ubuntu-latest
12 | env:
13 | TAG: ${{ github.event.release.tag_name }}
14 | BRANCH_NAME: update-whisperkit-${{ github.event.release.tag_name }}
15 | GH_TOKEN: ${{ secrets.COMMITTER_TOKEN }}
16 | steps:
17 | - name: Checkout whisper-kit-expo
18 | uses: actions/checkout@v4
19 | with:
20 | repository: seb-sep/whisper-kit-expo
21 | token: ${{ secrets.COMMITTER_TOKEN }}
22 | ref: main
23 |
24 | - name: Setup Node
25 | uses: actions/setup-node@v4
26 | with:
27 | node-version: '20.x'
28 |
29 | - name: New branch
30 | run: |
31 | git checkout -b $BRANCH_NAME
32 | echo ${{ github.event.release }}
33 | echo "Release tag is $TAG"
34 |
35 | - name: Update package.json version
36 | run: |
37 | PACKAGE_PATH="package.json"
38 | if [ ! -f "$PACKAGE_PATH" ]; then
39 | echo "Could not find package.json at path: $PACKAGE_PATH."
40 | exit 1
41 | fi
42 | RELEASE_TAG=${TAG#v}
43 | jq --arg newver "$RELEASE_TAG" '.whisperKit.version = $newver' "$PACKAGE_PATH" > tmp.$$.json && mv tmp.$$.json "$PACKAGE_PATH"
44 | cat "$PACKAGE_PATH"
45 |
46 | - name: Commit changes
47 | run: |
48 | git config --global user.email "164233781+argmaxincbot@users.noreply.github.com"
49 | git config --global user.name "argmaxincbot"
50 | git add ./package.json
51 | git commit -m "Update WhisperKit to $TAG"
52 | git push origin $BRANCH_NAME
53 | - name: PR with changes
54 | env:
55 | GH_TOKEN: ${{ secrets.COMMITTER_TOKEN }}
56 | run: |
57 | gh pr create --title "Update WhisperKit to $TAG" --body "Update WhisperKit to $TAG" --base main --head $BRANCH_NAME
58 |
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/Audio/EnergyVAD.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 |
6 | /// Voice activity detection based on energy threshold
7 | public final class EnergyVAD: VoiceActivityDetector {
8 | public let energyThreshold: Float
9 |
10 | /// Initialize a new EnergyVAD instance
11 | /// - Parameters:
12 | /// - sampleRate: Audio sample rate
13 | /// - frameLength: Frame length in seconds
14 | /// - frameOverlap: Frame overlap in seconds; this includes an extra `frameOverlap` seconds of audio in each `frameLength` window, which helps catch speech that starts exactly at a chunk boundary
15 | /// - energyThreshold: Minimum energy threshold
16 | public convenience init(
17 | sampleRate: Int = WhisperKit.sampleRate,
18 | frameLength: Float = 0.1,
19 | frameOverlap: Float = 0.0,
20 | energyThreshold: Float = 0.02
21 | ) {
22 | self.init(
23 | sampleRate: sampleRate,
24 | // Compute frame length and overlap in number of samples
25 | frameLengthSamples: Int(frameLength * Float(sampleRate)),
26 | frameOverlapSamples: Int(frameOverlap * Float(sampleRate)),
27 | energyThreshold: energyThreshold
28 | )
29 | }
30 |
31 | public required init(
32 | sampleRate: Int = 16000,
33 | frameLengthSamples: Int,
34 | frameOverlapSamples: Int = 0,
35 | energyThreshold: Float = 0.02
36 | ) {
37 | self.energyThreshold = energyThreshold
38 | super.init(sampleRate: sampleRate, frameLengthSamples: frameLengthSamples, frameOverlapSamples: frameOverlapSamples)
39 | }
40 |
41 | public override func voiceActivity(in waveform: [Float]) -> [Bool] {
42 | let chunkRatio = Double(waveform.count) / Double(frameLengthSamples)
43 |
44 | // Round up if uneven, the final chunk will not be a full `frameLengthSamples` long
45 | let count = Int(chunkRatio.rounded(.up))
46 |
47 | let chunkedVoiceActivity = AudioProcessor.calculateVoiceActivityInChunks(
48 | of: waveform,
49 | chunkCount: count,
50 | frameLengthSamples: frameLengthSamples,
51 | frameOverlapSamples: frameOverlapSamples,
52 | energyThreshold: energyThreshold
53 | )
54 |
55 | return chunkedVoiceActivity
56 | }
57 | }
58 |
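A short usage sketch, assuming a 16 kHz mono waveform: each `Bool` in the result covers one `frameLength` window, so with the defaults one flag corresponds to 0.1 s of audio.

```swift
import Foundation
import WhisperKit

// 1 s of silence followed by 1 s of a 440 Hz tone at 16 kHz
let sampleRate = 16000
let silence = [Float](repeating: 0, count: sampleRate)
let tone = (0..<sampleRate).map { Float(sin(2.0 * Double.pi * 440.0 * Double($0) / Double(sampleRate))) }

let vad = EnergyVAD() // defaults: 0.1 s frames, no overlap, threshold 0.02
let activity = vad.voiceActivity(in: silence + tone)
// Expect roughly the first 10 flags to be false (silence) and the next 10 true (tone)
print(activity)
```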
--------------------------------------------------------------------------------
/.github/workflows/development-tests.yml:
--------------------------------------------------------------------------------
1 | name: Development Tests
2 |
3 | on:
4 | pull_request:
5 | pull_request_review:
6 | types: [submitted]
7 | workflow_dispatch:
8 |
9 | jobs:
10 | build-and-test:
11 | name: "Build and Test"
12 | concurrency:
13 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}-build-and-test
14 | cancel-in-progress: true
15 | uses: ./.github/workflows/unit-tests.yml
16 | with:
17 | ios-version: "26.0.1"
18 | ios-device: "iPhone 17"
19 | watchos-version: "26.0"
20 | visionos-version: "26.0"
21 | macos-runner: "macos-26"
22 | xcode-version: "26.0"
23 |
24 | check-approvals:
25 | runs-on: ubuntu-latest
26 | outputs:
27 | reviews: ${{ steps.reviews.outputs.state }}
28 | permissions:
29 | pull-requests: read
30 | contents: read
31 | steps:
32 | - uses: actions/checkout@v4
33 | - name: Check Approvals
34 | id: reviews
35 | env:
36 | GH_TOKEN: ${{ github.token }}
37 | pr: ${{ github.event.pull_request.number }}
38 | run: |
39 | echo "Checking PR approval for: $pr"
40 | state=$(gh pr view $pr --json reviewDecision --jq '.reviewDecision')
41 | echo "Review decision state: $state"
42 | echo "state=$state" >> "$GITHUB_OUTPUT"
43 |
44 | pre-merge-tests:
45 | name: "Pre-merge Tests"
46 | needs: [check-approvals]
47 | if: needs.check-approvals.outputs.reviews == 'APPROVED' || github.event_name == 'workflow_dispatch'
48 | concurrency:
49 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}-${{ matrix.os }}
50 | cancel-in-progress: true
51 | strategy:
52 | matrix:
53 | include:
54 | - os: macos-14
55 | ios-version: "17.2"
56 | ios-device: "iPhone 15"
57 | watchos-version: "10.2"
58 | visionos-version: "1.0"
59 | xcode-version: "16.1"
60 | - os: macos-15
61 | ios-version: "18.5"
62 | ios-device: "iPhone 16"
63 | watchos-version: "11.5"
64 | visionos-version: "2.5"
65 | xcode-version: "16.4"
66 | uses: ./.github/workflows/unit-tests.yml
67 | with:
68 | macos-runner: ${{ matrix.os }}
69 | ios-version: ${{ matrix.ios-version }}
70 | ios-device: ${{ matrix.ios-device }}
71 | watchos-version: ${{ matrix.watchos-version }}
72 | visionos-version: ${{ matrix.visionos-version }}
73 | xcode-version: ${{ matrix.xcode-version }}
74 |
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/FeatureExtractor.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Accelerate
5 | import AVFoundation
6 | import CoreGraphics
7 | import CoreML
8 | import Foundation
9 |
10 | public protocol FeatureExtractorOutputType {}
11 | extension MLMultiArray: FeatureExtractorOutputType {}
12 |
13 | public protocol FeatureExtracting {
14 | var melCount: Int? { get }
15 | var windowSamples: Int? { get }
16 | func logMelSpectrogram(fromAudio inputAudio: any AudioProcessorOutputType) async throws -> (any FeatureExtractorOutputType)?
17 | }
18 |
19 | open class FeatureExtractor: FeatureExtracting, WhisperMLModel {
20 | public var model: MLModel?
21 |
22 | public init() {}
23 |
24 | public var melCount: Int? {
25 | guard let inputDescription = model?.modelDescription.outputDescriptionsByName["melspectrogram_features"] else { return nil }
26 | guard inputDescription.type == .multiArray else { return nil }
27 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil }
28 | let shape = shapeConstraint.shape.map { $0.intValue }
29 | return shape[1]
30 | }
31 |
32 | public var windowSamples: Int? {
33 | guard let inputDescription = model?.modelDescription.inputDescriptionsByName["audio"] else { return nil }
34 | guard inputDescription.type == .multiArray else { return nil }
35 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil }
36 | let shape = shapeConstraint.shape.map { $0.intValue }
37 | return shape[0] // The audio input is a 1D array
38 | }
39 |
40 | open func logMelSpectrogram(fromAudio inputAudio: any AudioProcessorOutputType) async throws -> (any FeatureExtractorOutputType)? {
41 | guard let audioArray = inputAudio as? MLMultiArray else {
42 | throw WhisperError.audioProcessingFailed("FeatureExtractor input must be MLMultiArray")
43 | }
44 | guard let model else {
45 | throw WhisperError.modelsUnavailable()
46 | }
47 | try Task.checkCancellation()
48 |
49 | let interval = Logging.beginSignpost("ExtractAudioFeatures", signposter: Logging.FeatureExtractor.signposter)
50 | defer { Logging.endSignpost("ExtractAudioFeatures", interval: interval, signposter: Logging.FeatureExtractor.signposter) }
51 |
52 | let modelInputs = MelSpectrogramInput(audio: audioArray)
53 | let outputFeatures = try await model.asyncPrediction(from: modelInputs, options: MLPredictionOptions())
54 | let output = MelSpectrogramOutput(features: outputFeatures)
55 | return output.melspectrogramFeatures
56 | }
57 | }
58 |
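Because `FeatureExtracting` is a protocol, the mel stage can be swapped out, for example with a stub in tests. A minimal sketch (the type below is hypothetical, and the output shape is an assumption based on the usual 80-bin, 3000-frame Whisper mel input):

```swift
import CoreML
import WhisperKit

final class StubFeatureExtractor: FeatureExtracting {
    let melCount: Int? = 80            // 80 mel bins
    let windowSamples: Int? = 480_000  // 30 s at 16 kHz

    func logMelSpectrogram(fromAudio inputAudio: any AudioProcessorOutputType) async throws -> (any FeatureExtractorOutputType)? {
        // Return an MLMultiArray of the advertised shape (contents left uninitialized)
        return try MLMultiArray(shape: [1, 80, 1, 3000], dataType: .float16)
    }
}
```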
--------------------------------------------------------------------------------
/Sources/WhisperKit/Utilities/TextUtilities.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 |
6 | /// A utility struct providing text compression and analysis functionality
7 | public struct TextUtilities {
8 |
9 | private init() {}
10 |
11 | /// Calculates the compression ratio of an array of text tokens using zlib compression
12 | /// - Parameter textTokens: Array of integer tokens to compress
13 | /// - Returns: The compression ratio (original size / compressed size). Returns infinity if compression fails
14 | public static func compressionRatio(of textTokens: [Int]) -> Float {
15 | // Convert the integer array to a byte array (Data)
16 | let dataBuffer = textTokens.compactMap { Int32($0) }
17 | let data = dataBuffer.withUnsafeBufferPointer { Data(buffer: $0) }
18 |
19 | // Compress the data using NSData compression
20 | do {
21 | let compressedData = try (data as NSData).compressed(using: .zlib)
22 | // Calculate and return the compression ratio
23 | return Float(data.count) / Float(compressedData.length)
24 | } catch {
25 | Logging.debug("Compression error: \(error.localizedDescription)")
26 | return Float.infinity
27 | }
28 | }
29 |
30 | /// Calculates the compression ratio of a text string using zlib compression
31 | /// - Parameter text: The text string to compress
32 | /// - Returns: The compression ratio (original size / compressed size). Returns infinity if text is empty or compression fails
33 | public static func compressionRatio(of text: String) -> Float {
34 | if text.isEmpty {
35 | return Float.infinity // TODO: throw to caller instead of returning infinity
36 | }
37 |
38 | // Encode the string as UTF-8
39 | guard let data = text.data(using: .utf8) else {
40 | Logging.debug("String encoding error")
41 | return Float.infinity
42 | }
43 |
44 | // Compress the data using NSData compression
45 | do {
46 | let compressedData = try (data as NSData).compressed(using: .zlib)
47 | // Calculate and return the compression ratio
48 | return Float(data.count) / Float(compressedData.length)
49 | } catch {
50 | Logging.debug("Compression error: \(error.localizedDescription)")
51 | return Float.infinity
52 | }
53 | }
54 | }
55 |
56 | @available(*, deprecated, message: "Subject to removal in a future version. Use `TextUtilities.compressionRatio(of:)` instead.")
57 | public func compressionRatio(of array: [Int]) -> Float {
58 | return TextUtilities.compressionRatio(of: array)
59 | }
60 |
61 | @available(*, deprecated, message: "Subject to removal in a future version. Use `TextUtilities.compressionRatio(of:)` instead.")
62 | public func compressionRatio(of text: String) -> Float {
63 | return TextUtilities.compressionRatio(of: text)
64 | }
65 |
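The ratio grows with repetitiveness, which is why it serves as a decoding failure signal: highly repetitive output compresses well. A quick sketch of the expected behavior:

```swift
import WhisperKit

let repetitive = Array(repeating: "ha ", count: 100).joined()
let varied = "The quick brown fox jumps over the lazy dog."

print(TextUtilities.compressionRatio(of: repetitive)) // large, e.g. well above the 2.4 default threshold
print(TextUtilities.compressionRatio(of: varied))     // near 1 for short, varied text
print(TextUtilities.compressionRatio(of: ""))         // inf (empty input)
```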
--------------------------------------------------------------------------------
/Sources/WhisperKit/Utilities/Concurrency.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 | import os.lock
6 |
7 | /// An actor that provides thread-safe early stopping functionality using UUIDs as keys
8 | public actor EarlyStopActor {
9 | private var shouldStop = [UUID: Bool]()
10 |
11 | public init() {}
12 |
13 | /// Sets the stop flag for a given UUID
14 | /// - Parameters:
15 | /// - value: The boolean value to set
16 | /// - uuid: The UUID key
17 | public func set(_ value: Bool, for uuid: UUID) {
18 | shouldStop[uuid] = value
19 | }
20 |
21 | /// Gets the stop flag for a given UUID
22 | /// - Parameter uuid: The UUID key
23 | /// - Returns: The current stop flag value, or false if not set
24 | public func get(for uuid: UUID) -> Bool {
25 | return shouldStop[uuid] ?? false
26 | }
27 |
28 | /// Removes and returns the stop flag for a given UUID
29 | /// - Parameter uuid: The UUID key
30 | /// - Returns: The removed stop flag value, if it existed
31 | public func remove(for uuid: UUID) -> Bool? {
32 | return shouldStop.removeValue(forKey: uuid)
33 | }
34 | }
35 |
36 | /// Serializes access to a value with an `os_unfair_lock` so mutation stays
37 | /// thread-safe. The wrapper is used by `TranscriptionResult`, which is marked
38 | /// `@unchecked Sendable`; guarding each property with this lock helps keep the
39 | /// result instance safe when shared across concurrent contexts.
40 | @propertyWrapper
41 | public struct TranscriptionPropertyLock<Value: Codable & Sendable>: Sendable, Codable {
42 | private let lock: UnfairLock
43 | private var value: Value
44 |
45 | public init(wrappedValue: Value) {
46 | self.lock = UnfairLock()
47 | self.value = wrappedValue
48 | }
49 | public init(from decoder: Swift.Decoder) throws {
50 | self.lock = UnfairLock()
51 | self.value = try Value(from: decoder)
52 | }
53 |
54 | public func encode(to encoder: Encoder) throws {
55 | try lock.withLock {
56 | try value.encode(to: encoder)
57 | }
58 |
59 | }
60 |
61 | public var wrappedValue: Value {
62 | get {
63 | lock.withLock {
64 | return value
65 | }
66 | }
67 | set {
68 | lock.withLock {
69 | value = newValue
70 | }
71 | }
72 | }
73 | }
74 |
75 | /// Thin wrapper around `os_unfair_lock` that exposes a Swift-friendly
76 | /// `withLock` helper. This lock is non-reentrant and optimized for low
77 | /// contention; it must be unlocked from the same thread that locked it.
78 | @usableFromInline
79 | final class UnfairLock: @unchecked Sendable {
80 | @usableFromInline
81 | var lock = os_unfair_lock()
82 |
83 | @inlinable
84 | func withLock<T>(_ body: () throws -> T) rethrows -> T {
85 | os_unfair_lock_lock(&lock)
86 | defer { os_unfair_lock_unlock(&lock) }
87 | return try body()
88 | }
89 | }
90 |
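A sketch of the intended `EarlyStopActor` flow: a worker polls its UUID between steps while another task flips the flag (the timings below are illustrative).

```swift
import Foundation
import WhisperKit

let stopper = EarlyStopActor()
let jobID = UUID()

// Worker: poll the flag between (simulated) decoding steps
let worker = Task { () -> Int in
    var steps = 0
    while !(await stopper.get(for: jobID)) {
        steps += 1
        try await Task.sleep(nanoseconds: 10_000_000) // one fake decode step
    }
    _ = await stopper.remove(for: jobID) // clean up the entry when done
    return steps
}

// Controller: request an early stop after ~100 ms
Task {
    try? await Task.sleep(nanoseconds: 100_000_000)
    await stopper.set(true, for: jobID)
}

let steps = try await worker.value
print("Stopped after \(steps) steps")
```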
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/AudioEncoder.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import CoreML
5 |
6 | public protocol AudioEncoderOutputType {}
7 | extension MLMultiArray: AudioEncoderOutputType {}
8 |
9 | /// AudioEncoding protocol defines the requirements for an audio encoding implementation.
10 | public protocol AudioEncoding {
11 | /// The size of the embedding produced by the encoder.
12 | var embedSize: Int? { get }
13 |
14 | /// Encodes the given audio features asynchronously.
15 | /// - Parameter features: The audio features to be encoded.
16 | /// - Returns: An optional tensor containing the encoded features.
17 | func encodeFeatures(_ features: any FeatureExtractorOutputType) async throws -> (any AudioEncoderOutputType)?
18 | }
19 |
20 | /// Backwards-compatible AudioEncoder implementation
21 | public class AudioEncoder: AudioEncoding, WhisperMLModel {
22 | public var model: MLModel?
23 |
24 | public var embedSize: Int? {
25 | guard let inputDescription = model?.modelDescription.outputDescriptionsByName["encoder_output_embeds"] else { return nil }
26 | guard inputDescription.type == .multiArray else { return nil }
27 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil }
28 | let shape = shapeConstraint.shape.map { $0.intValue }
29 | return shape[1]
30 | }
31 |
32 | public var sequenceLength: Int? {
33 | guard let inputDescription = model?.modelDescription.outputDescriptionsByName["encoder_output_embeds"] else { return nil }
34 | guard inputDescription.type == .multiArray else { return nil }
35 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil }
36 | let shape = shapeConstraint.shape.map { $0.intValue }
37 | return shape[3]
38 | }
39 |
40 | public init() {}
41 |
42 | public func encodeFeatures(_ features: any FeatureExtractorOutputType) async throws -> (any AudioEncoderOutputType)? {
43 | guard let features = features as? MLMultiArray else {
44 | throw WhisperError.audioProcessingFailed("AudioEncoder input must be MLMultiArray")
45 | }
46 |
47 | return try await encodeFeatures(features)
48 | }
49 |
50 | public func encodeFeatures(_ features: MLMultiArray) async throws -> MLMultiArray? {
51 | guard let model else {
52 | throw WhisperError.modelsUnavailable()
53 | }
54 | try Task.checkCancellation()
55 |
56 | let interval = Logging.beginSignpost("EncodeAudio", signposter: Logging.AudioEncoding.signposter)
57 | defer { Logging.endSignpost("EncodeAudio", interval: interval, signposter: Logging.AudioEncoding.signposter) }
58 |
59 | let modelInputs = AudioEncoderInput(melspectrogram_features: features)
60 | let outputFeatures = try await model.asyncPrediction(from: modelInputs, options: MLPredictionOptions())
61 | let output = AudioEncoderOutput(features: outputFeatures)
62 | return output.encoder_output_embeds
63 | }
64 | }
65 |
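Since `AudioEncoding` is a protocol, implementations compose. A hypothetical pass-through wrapper (not part of WhisperKit) that times whichever encoder it wraps:

```swift
import Foundation
import WhisperKit

struct TimedEncoder: AudioEncoding {
    let base: any AudioEncoding
    var embedSize: Int? { base.embedSize }

    func encodeFeatures(_ features: any FeatureExtractorOutputType) async throws -> (any AudioEncoderOutputType)? {
        let start = Date()
        defer { print("encodeFeatures took \(Date().timeIntervalSince(start)) s") }
        return try await base.encodeFeatures(features)
    }
}
```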
--------------------------------------------------------------------------------
/Package.swift:
--------------------------------------------------------------------------------
1 | // swift-tools-version: 5.9
2 | // The swift-tools-version declares the minimum version of Swift required to build this package.
3 |
4 | import PackageDescription
5 | import Foundation
6 |
7 | let package = Package(
8 | name: "whisperkit",
9 | platforms: [
10 | .iOS(.v16),
11 | .macOS(.v13),
12 | .watchOS(.v10),
13 | .visionOS(.v1)
14 | ],
15 | products: [
16 | .library(
17 | name: "WhisperKit",
18 | targets: ["WhisperKit"]
19 | ),
20 | .executable(
21 | name: "whisperkit-cli",
22 | targets: ["WhisperKitCLI"]
23 | )
24 | ],
25 | dependencies: [
26 | .package(url: "https://github.com/huggingface/swift-transformers.git", .upToNextMinor(from: "1.1.2")),
27 | .package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.3.0"),
28 | ] + (isServerEnabled() ? [
29 | .package(url: "https://github.com/vapor/vapor.git", from: "4.115.1"),
30 | .package(url: "https://github.com/apple/swift-openapi-generator", from: "1.10.2"),
31 | .package(url: "https://github.com/apple/swift-openapi-runtime", from: "1.8.2"),
32 | .package(url: "https://github.com/swift-server/swift-openapi-vapor", from: "1.0.1"),
33 |
34 | ] : []),
35 | targets: [
36 | .target(
37 | name: "WhisperKit",
38 | dependencies: [
39 | .product(name: "Hub", package: "swift-transformers"),
40 | .product(name: "Tokenizers", package: "swift-transformers"),
41 | ]
42 | ),
43 | .testTarget(
44 | name: "WhisperKitTests",
45 | dependencies: [
46 | "WhisperKit",
47 | .product(name: "Hub", package: "swift-transformers"),
48 | .product(name: "Tokenizers", package: "swift-transformers"),
49 | ],
50 | path: "Tests",
51 | resources: [
52 | .process("WhisperKitTests/Resources"),
53 | ]
54 | ),
55 | .executableTarget(
56 | name: "WhisperKitCLI",
57 | dependencies: [
58 | "WhisperKit",
59 | .product(name: "ArgumentParser", package: "swift-argument-parser"),
60 | ] + (isServerEnabled() ? [
61 | .product(name: "Vapor", package: "vapor"),
62 | .product(name: "OpenAPIRuntime", package: "swift-openapi-runtime"),
63 | .product(name: "OpenAPIVapor", package: "swift-openapi-vapor"),
64 | ] : []),
65 | path: "Sources/WhisperKitCLI",
66 | exclude: (isServerEnabled() ? [] : ["Server"]),
67 | swiftSettings: (isServerEnabled() ? [.define("BUILD_SERVER_CLI")] : [])
68 | )
69 | ],
70 | swiftLanguageVersions: [.v5]
71 | )
72 |
73 | func isServerEnabled() -> Bool {
74 | if let enabledValue = Context.environment["BUILD_ALL"] {
75 | return enabledValue.lowercased() == "true" || enabledValue == "1"
76 | }
77 |
78 | // Default disabled, change to true temporarily for local development
79 | return false
80 | }
81 |
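Note that `isServerEnabled()` reads `BUILD_ALL` when the manifest is evaluated, so running `BUILD_ALL=1 swift build` (or `BUILD_ALL=true`) pulls in the Vapor and OpenAPI dependencies, compiles the `Server` sources, and sets the `BUILD_SERVER_CLI` flag; without it, only the lighter transcribe-only CLI is built.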
--------------------------------------------------------------------------------
/Examples/ServeCLIClient/Swift/README.md:
--------------------------------------------------------------------------------
1 | # WhisperKit Swift Client
2 |
3 | A simple Swift client for the WhisperKit local server.
4 |
5 | ## Quick Start
6 |
7 | 1. **Start the WhisperKit server** (in another terminal):
8 | ```bash
9 | whisperkit-cli serve
10 | ```
11 |
12 | 2. **Build the client**:
13 | ```bash
14 | swift build
15 | ```
16 |
17 | 3. **Run commands**:
18 | ```bash
19 | # Transcribe an audio file
20 | swift run whisperkit-client transcribe audio.wav
21 |
22 | # Translate an audio file to English
23 | swift run whisperkit-client translate audio.wav
24 |
25 | # Test with sample files
26 | swift run whisperkit-client test
27 | ```
28 |
29 | ## Available Commands
30 |
31 | - `transcribe <file>` - Transcribe audio to text
32 | - `translate <file>` - Translate audio to English
33 | - `test` - Test transcription and translation on sample files
34 |
35 | ## Options
36 |
37 | - `--language, -l` - Source language for transcription (default: auto-detect)
38 | - `--model, -m` - Model to use (default: tiny)
39 | - `--response-format` - Response format: json, verbose_json (default: verbose_json)
40 | - `--timestamp-granularities` - Comma-separated: word,segment (default: segment)
41 | - `--stream` - Enable streaming output
42 | - `--server-url, -s` - Server URL (default: http://localhost:50060/v1)
43 |
44 | ## Examples
45 |
46 | ```bash
47 | # Transcribe in Spanish
48 | swift run whisperkit-client transcribe -l es audio.wav
49 |
50 | # Transcribe with word-level timestamps
51 | swift run whisperkit-client transcribe --timestamp-granularities "word,segment" audio.wav
52 |
53 | # Translate from Spanish to English
54 | swift run whisperkit-client translate -l es audio.wav
55 |
56 | # Use custom server
57 | swift run whisperkit-client transcribe -s http://192.168.1.100:50060 audio.wav
58 |
59 | # Stream transcription
60 | swift run whisperkit-client transcribe --stream audio.wav
61 | ```
62 |
63 | ## Project Structure
64 |
65 | ```
66 | Sources/
67 | ├── CLI.swift # All CLI commands and client logic
68 | └── Generated/ # Auto-generated OpenAPI client code
69 | ├── Client.swift
70 | └── Types.swift
71 | ```
72 |
73 | ## Current Limitations
74 |
75 | - **Response Format**: The `--response-format` parameter is not fully working due to OpenAPI schema discrimination issues. The client always receives basic JSON responses instead of verbose JSON with segments and word timestamps.
76 | - **Word Timestamps**: Word-level timestamps are not displayed due to the response format issue above.
77 | - **Basic Functionality**: Basic transcription and translation work correctly.
78 |
79 | > **Note**: This is a known issue with the Swift OpenAPI generator's handling of `oneOf` schemas with discriminators. The server correctly sends verbose JSON responses, but the Swift client cannot properly parse them. Consider using the Python client or CurlClient for full functionality.
80 |
81 | ## Updating Generated Code
82 |
83 | When the server spec changes, regenerate the client code:
84 |
85 | ```bash
86 | ./updateClient.sh
87 | ```
88 |
89 | This will update the files in `Sources/Generated/` from `scripts/specs/localserver_openapi.yaml`.
90 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to WhisperKit
2 |
3 | ## Overview
4 |
5 | We welcome and encourage contributions to WhisperKit! Whether you're fixing bugs, improving documentation, or adding new features from the roadmap, your help is appreciated. This guide will help you get started with contributing to WhisperKit.
6 |
7 | ## Getting Started
8 |
9 | 1. **Fork the Repository**: Start by [forking](https://github.com/argmaxinc/WhisperKit/fork) the WhisperKit repository on GitHub to your personal account.
10 |
11 | 2. **Clone Your Fork**: Clone your fork to your local machine to start making changes.
12 |
13 | ```bash
14 | git clone https://github.com/[your-username]/whisperkit.git
15 | cd whisperkit
16 | ```
17 |
18 | ## Setting Up Your Development Environment
19 |
20 | 1. **Install Dependencies**: Use the provided `Makefile` to set up your environment. Run `make setup` to install necessary dependencies.
21 |
22 | ```bash
23 | make setup
24 | ```
25 |
26 | 2. **Download Models**: Run `make download-model` to download the models needed to run and test locally.
27 |
28 | ```bash
29 | make download-model MODEL=tiny
30 | ```
31 |
32 | ## Making Changes
33 |
34 | 1. **Make Your Changes**: Implement your changes, add new features, or fix bugs. Ensure you adhere to the existing coding style. If you're adding new features, make sure to update or add any documentation or tests as needed.
35 |
36 | 2. **Build and Test**: You can use the `Makefile` to build and test your changes. Run `make build` to build WhisperKit and `make test` to run tests.
37 |
38 | ```bash
39 | make build
40 | make test
41 | ```
42 |
43 | You can also run and test directly from Xcode. We've provided an example app that contains various use cases, just open the `Examples/WhisperAX/WhisperAX.xcodeproj` file in Xcode and run the app.
44 |
45 | ## Submitting Your Changes
46 |
47 | 1. **Commit Your Changes**: Once you're satisfied with your changes, commit them with a clear and concise commit message.
48 |
49 | ```bash
50 | git commit -am "Add a new feature"
51 | ```
52 |
53 | 2. **Push to Your Fork**: Push your changes to your fork on GitHub.
54 |
55 | ```bash
56 | git push origin my-branch
57 | ```
58 |
59 | 3. **Create a Pull Request**: Go to the WhisperKit repository on GitHub and create a new pull request from your fork. Ensure your pull request has a clear title and description.
60 |
61 | 4. **Code Review**: Wait for the maintainers to review your pull request. Be responsive to feedback and make any necessary changes.
62 |
63 | ## Guidelines
64 |
65 | - **Code Style**: Follow the existing code style in the project.
66 | - **Commit Messages**: Write meaningful commit messages that clearly describe the changes.
67 | - **Documentation**: Update documentation if you're adding new features or making changes that affect how users interact with WhisperKit.
68 | - **Tests**: Add or update tests for new features or bug fixes.
69 |
70 | ## Final Steps
71 |
72 | After your pull request has been reviewed and approved, a maintainer will merge it into the main branch. Congratulations, you've successfully contributed to WhisperKit!
73 |
74 | Thank you for making WhisperKit better for everyone! ❤️🔥
75 |
--------------------------------------------------------------------------------
/Examples/ServeCLIClient/Swift/Package.resolved:
--------------------------------------------------------------------------------
1 | {
2 | "pins" : [
3 | {
4 | "identity" : "openapikit",
5 | "kind" : "remoteSourceControl",
6 | "location" : "https://github.com/mattpolzin/OpenAPIKit",
7 | "state" : {
8 | "revision" : "e0ecdf050c4bebc0104ed2505ec6fa1f6afb7555",
9 | "version" : "3.7.0"
10 | }
11 | },
12 | {
13 | "identity" : "swift-algorithms",
14 | "kind" : "remoteSourceControl",
15 | "location" : "https://github.com/apple/swift-algorithms",
16 | "state" : {
17 | "revision" : "87e50f483c54e6efd60e885f7f5aa946cee68023",
18 | "version" : "1.2.1"
19 | }
20 | },
21 | {
22 | "identity" : "swift-argument-parser",
23 | "kind" : "remoteSourceControl",
24 | "location" : "https://github.com/apple/swift-argument-parser",
25 | "state" : {
26 | "revision" : "309a47b2b1d9b5e991f36961c983ecec72275be3",
27 | "version" : "1.6.1"
28 | }
29 | },
30 | {
31 | "identity" : "swift-collections",
32 | "kind" : "remoteSourceControl",
33 | "location" : "https://github.com/apple/swift-collections",
34 | "state" : {
35 | "revision" : "8c0c0a8b49e080e54e5e328cc552821ff07cd341",
36 | "version" : "1.2.1"
37 | }
38 | },
39 | {
40 | "identity" : "swift-http-types",
41 | "kind" : "remoteSourceControl",
42 | "location" : "https://github.com/apple/swift-http-types",
43 | "state" : {
44 | "revision" : "a0a57e949a8903563aba4615869310c0ebf14c03",
45 | "version" : "1.4.0"
46 | }
47 | },
48 | {
49 | "identity" : "swift-numerics",
50 | "kind" : "remoteSourceControl",
51 | "location" : "https://github.com/apple/swift-numerics.git",
52 | "state" : {
53 | "revision" : "e0ec0f5f3af6f3e4d5e7a19d2af26b481acb6ba8",
54 | "version" : "1.0.3"
55 | }
56 | },
57 | {
58 | "identity" : "swift-openapi-generator",
59 | "kind" : "remoteSourceControl",
60 | "location" : "https://github.com/apple/swift-openapi-generator",
61 | "state" : {
62 | "revision" : "bb9a13596af11db9bb83389295d91cd335810fe8",
63 | "version" : "1.10.2"
64 | }
65 | },
66 | {
67 | "identity" : "swift-openapi-runtime",
68 | "kind" : "remoteSourceControl",
69 | "location" : "https://github.com/apple/swift-openapi-runtime",
70 | "state" : {
71 | "revision" : "8f33cc5dfe81169fb167da73584b9c72c3e8bc23",
72 | "version" : "1.8.2"
73 | }
74 | },
75 | {
76 | "identity" : "swift-openapi-urlsession",
77 | "kind" : "remoteSourceControl",
78 | "location" : "https://github.com/apple/swift-openapi-urlsession",
79 | "state" : {
80 | "revision" : "6fac6f7c428d5feea2639b5f5c8b06ddfb79434b",
81 | "version" : "1.1.0"
82 | }
83 | },
84 | {
85 | "identity" : "yams",
86 | "kind" : "remoteSourceControl",
87 | "location" : "https://github.com/jpsim/Yams",
88 | "state" : {
89 | "revision" : "d41ba4e7164c0838c6d48351f7575f7f762151fe",
90 | "version" : "6.1.0"
91 | }
92 | }
93 | ],
94 | "version" : 2
95 | }
96 |
--------------------------------------------------------------------------------
/Examples/ServeCLIClient/Python/README.md:
--------------------------------------------------------------------------------
1 | # WhisperKit Python Client
2 |
3 | A simple Python client for the WhisperKit local server using OpenAI's SDK.
4 |
5 | ## Quick Start
6 |
7 | 1. **Start the WhisperKit server** (in another terminal):
8 | ```bash
9 | whisperkit-cli serve
10 | ```
11 |
12 | 2. **Install dependencies**:
13 | ```bash
14 | uv sync
15 | ```
16 |
17 | 3. **Run commands**:
18 | ```bash
19 | # Transcribe an audio file
20 | python whisperkit_client.py transcribe audio.wav
21 |
22 | # Translate an audio file to English
23 | python whisperkit_client.py translate audio.wav
24 |
25 | # Test with sample files
26 | python whisperkit_client.py test
27 | ```
28 |
29 | ## Available Commands
30 |
31 | - `transcribe <file>` - Transcribe audio to text
32 | - `translate <file>` - Translate audio to English
33 | - `test` - Test transcription and translation on sample files
34 |
35 | ## Options
36 |
37 | - `--server, -s` - Server URL (default: http://localhost:50060)
38 | - `--model, -m` - Model to use (default: tiny)
39 | - `--language, -l` - Source language for transcription (default: auto-detect)
40 | - `--response-format` - Response format: json, verbose_json (default: verbose_json)
41 | - `--timestamp-granularities` - Comma-separated: word,segment (default: segment)
42 | - `--stream` - Enable streaming output
43 | - `--debug` - Show raw JSON response for debugging
44 |
45 |
46 | ## Examples
47 |
48 | ```bash
49 | # Transcribe in Spanish
50 | python whisperkit_client.py transcribe -l es audio.wav
51 |
52 | # Translate to English (auto-detects source language)
53 | python whisperkit_client.py translate audio.wav
54 |
55 | # Use custom server and model
56 | python whisperkit_client.py -s http://192.168.1.100:50060 -m large transcribe audio.wav
57 |
58 | # Transcribe with word-level timestamps
59 | python whisperkit_client.py transcribe --timestamp-granularities "word,segment" audio.wav
60 |
61 | # Stream transcription
62 | python whisperkit_client.py transcribe --stream audio.wav
63 |
64 | # Debug mode to see raw JSON
65 | python whisperkit_client.py transcribe --debug audio.wav
66 |
67 | # Test with sample files
68 | python whisperkit_client.py test
69 | ```
70 |
71 | ## Project Structure
72 |
73 | ```
74 | Examples/ServeCLIClient/Python/
75 | ├── whisperkit_client.py # Main CLI script with all functionality
76 | ├── test_transcribe.py # Test script for transcription
77 | ├── test_translate.py # Test script for translation
78 | ├── pyproject.toml # Python project configuration and dependencies
79 | ├── uv.lock # Locked dependency versions
80 | └── README.md # This file
81 | ```
82 |
83 | ## Dependencies
84 |
85 | - `openai` - OpenAI Python SDK for API communication
86 | - `uv` - Fast Python package manager
87 |
88 | ## Testing
89 |
90 | The client automatically finds test audio files from `Tests/WhisperKitTests/Resources/` in the main project directory.
91 |
92 | ```bash
93 | # Run tests on sample files
94 | python whisperkit_client.py test
95 |
96 | # Or run individual test scripts
97 | python test_transcribe.py
98 | python test_translate.py
99 | ```
100 |
101 | ## Alternative Clients
102 |
103 | For lightweight testing without Python dependencies, see the [CurlClient](../Curl/README.md) which provides shell script implementations using curl.
104 |
--------------------------------------------------------------------------------
/Sources/WhisperKitCLI/TranscribeCLIUtils.swift:
--------------------------------------------------------------------------------
1 | // Copyright © 2025 Argmax, Inc. All rights reserved.
2 | // For licensing see accompanying LICENSE.md file.
3 |
4 | import Foundation
5 | import CoreML
6 | @preconcurrency import WhisperKit
7 |
8 | internal class TranscribeCLIUtils {
9 |
10 | /// Creates WhisperKit configuration from CLI arguments
11 | static func createWhisperKitConfig(from arguments: TranscribeCLIArguments) -> WhisperKitConfig {
12 | var audioEncoderComputeUnits = arguments.audioEncoderComputeUnits.asMLComputeUnits
13 | let textDecoderComputeUnits = arguments.textDecoderComputeUnits.asMLComputeUnits
14 |
15 | // Use gpu for audio encoder on macOS below 14
16 | if audioEncoderComputeUnits == .cpuAndNeuralEngine {
17 | if #unavailable(macOS 14.0) {
18 | audioEncoderComputeUnits = .cpuAndGPU
19 | }
20 | }
21 |
22 | let computeOptions = ModelComputeOptions(
23 | audioEncoderCompute: audioEncoderComputeUnits,
24 | textDecoderCompute: textDecoderComputeUnits
25 | )
26 |
27 | let downloadTokenizerFolder: URL? = arguments.downloadTokenizerPath.map { URL(filePath: $0) }
28 | let downloadModelFolder: URL? = arguments.downloadModelPath.map { URL(filePath: $0) }
29 | let modelName: String? = arguments.model.map { arguments.modelPrefix + "*" + $0 }
30 |
31 | return WhisperKitConfig(
32 | model: modelName,
33 | downloadBase: downloadModelFolder,
34 | modelFolder: arguments.modelPath,
35 | tokenizerFolder: downloadTokenizerFolder,
36 | computeOptions: computeOptions,
37 | verbose: arguments.verbose,
38 | logLevel: arguments.verbose ? .debug : .info,
39 | prewarm: false,
40 | load: true,
41 | useBackgroundDownloadSession: false
42 | )
43 | }
44 |
45 | /// Creates DecodingOptions from CLI arguments and task
46 | static func createDecodingOptions(from arguments: TranscribeCLIArguments, task: DecodingTask) -> DecodingOptions {
47 | let options = DecodingOptions(
48 | verbose: arguments.verbose,
49 | task: task,
50 | language: arguments.language,
51 | temperature: arguments.temperature,
52 | temperatureIncrementOnFallback: arguments.temperatureIncrementOnFallback,
53 | temperatureFallbackCount: arguments.temperatureFallbackCount,
54 | topK: arguments.bestOf,
55 | usePrefillPrompt: arguments.usePrefillPrompt || arguments.language != nil || task == .translate,
56 | usePrefillCache: arguments.usePrefillCache,
57 | skipSpecialTokens: arguments.skipSpecialTokens,
58 | withoutTimestamps: arguments.withoutTimestamps,
59 | wordTimestamps: arguments.wordTimestamps,
60 | clipTimestamps: arguments.clipTimestamps,
61 | supressTokens: arguments.supressTokens,
62 | compressionRatioThreshold: arguments.compressionRatioThreshold ?? 2.4,
63 | logProbThreshold: arguments.logprobThreshold ?? -1.0,
64 | firstTokenLogProbThreshold: arguments.firstTokenLogProbThreshold,
65 | noSpeechThreshold: arguments.noSpeechThreshold ?? 0.6,
66 | concurrentWorkerCount: arguments.concurrentWorkerCount,
67 | chunkingStrategy: ChunkingStrategy(rawValue: arguments.chunkingStrategy)
68 | )
69 |
70 | return options
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/Sources/WhisperKitCLI/Server/ServeCLI.swift:
--------------------------------------------------------------------------------
1 | // Copyright © 2025 Argmax, Inc. All rights reserved.
2 | // For licensing see accompanying LICENSE.md file.
3 |
4 | import ArgumentParser
5 | import CoreML
6 | import Foundation
7 | @preconcurrency import WhisperKit
8 | import Vapor
9 | import OpenAPIRuntime
10 | import OpenAPIVapor
11 | import AVFoundation
12 |
13 | struct ServeCLI: AsyncParsableCommand {
14 | static let configuration = CommandConfiguration(
15 | commandName: "serve",
16 | abstract: "Start a local server for WhisperKit transcription"
17 | )
18 |
19 | @OptionGroup
20 | var cliArguments: ServeCLIArguments
21 |
22 | mutating func run() async throws {
23 | try await serve()
24 | }
25 |
26 | public func configure(_ app: Application) async throws {
27 | let transport = VaporTransport(routesBuilder: app)
28 |
29 | var transcribeArguments = cliArguments.transcribe
30 | transcribeArguments.skipSpecialTokens = true // always skip special tokens for server responses
31 | if let modelPath = cliArguments.transcribe.modelPath {
32 | app.logger.notice("Loading model from path: \(modelPath)")
33 | } else if let model = cliArguments.transcribe.model {
34 | app.logger.notice("Loading model: \(model)")
35 | } else {
36 | let defaultModel = WhisperKit.recommendedModels().default
37 | app.logger.notice("Loading default model: \(defaultModel)")
38 | transcribeArguments.model = defaultModel
39 | transcribeArguments.modelPrefix = ""
40 | }
41 |
42 | let config = TranscribeCLIUtils.createWhisperKitConfig(from: transcribeArguments)
43 | let whisperKit = try await WhisperKit(config)
44 | let handler = OpenAIHandler(whisperKit: whisperKit, logger: app.logger, transcribeArguments: transcribeArguments)
45 | try handler.registerHandlers(on: transport, serverURL: URL(string: "/v1")!)
46 |
47 | // Register base routes after the OpenAPI routes so the OpenAPI routes take precedence
48 | app.get("") { req async throws -> EndpointInfo in
49 | return EndpointInfo(
50 | status: "ok",
51 | service: "WhisperKit Local Server",
52 | endpoints: [
53 | Endpoint(method: "POST", path: "/v1/audio/transcriptions", description: "Transcribe audio to text"),
54 | Endpoint(method: "POST", path: "/v1/audio/translations", description: "Translate audio to English"),
55 | Endpoint(method: "GET", path: "/health", description: "Health check endpoint")
56 | ]
57 | )
58 | }
59 |
60 | app.get("health") { req async throws -> [String: String] in
61 | return ["status": "ok"]
62 | }
63 | }
64 |
65 | private func serve() async throws {
66 | var env = try Environment.detect()
67 | try LoggingSystem.bootstrap(from: &env)
68 | let app = try await Application.make()
69 | app.logger.logLevel = cliArguments.transcribe.verbose ? .debug : .info
70 | app.logger.notice("Starting WhisperKit Server...")
71 | app.environment.arguments = [""] // override arguments, handled by swift-argument-parser
72 |
73 | // Configure server to bind to specified host and port
74 | app.http.server.configuration.hostname = cliArguments.host
75 | app.http.server.configuration.port = cliArguments.port
76 | app.logger.notice("Server will bind to \(cliArguments.host):\(cliArguments.port)")
77 |
78 | do {
79 | try await configure(app)
80 | try await app.execute()
81 | } catch {
82 | app.logger.report(error: error)
83 | try? await app.asyncShutdown()
84 | throw error
85 | }
86 | try await app.asyncShutdown()
87 | }
88 | }
89 |
90 | // Response structs for the base endpoint
91 | fileprivate struct Endpoint: Content {
92 | let method: String
93 | let path: String
94 | let description: String
95 | }
96 |
97 | fileprivate struct EndpointInfo: Content {
98 | let status: String
99 | let service: String
100 | let endpoints: [Endpoint]
101 | }
102 |
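Once the server is up, any HTTP client can hit the routes registered above. A minimal Swift sketch, assuming the default host and port, that probes the `/health` route:

```swift
import Foundation

// Probe a locally running `whisperkit-cli serve` instance
let url = URL(string: "http://localhost:50060/health")!
let (data, _) = try await URLSession.shared.data(from: url)
print(String(decoding: data, as: UTF8.self)) // {"status":"ok"}
```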
--------------------------------------------------------------------------------
/.github/workflows/unit-tests.yml:
--------------------------------------------------------------------------------
1 | name: Unit Tests
2 |
3 | on:
4 | workflow_call:
5 | inputs:
6 | ios-version:
7 | required: true
8 | type: string
9 | ios-device:
10 | required: true
11 | type: string
12 | watchos-version:
13 | required: true
14 | type: string
15 | visionos-version:
16 | required: true
17 | type: string
18 | macos-runner:
19 | required: true
20 | type: string
21 | xcode-version:
22 | required: false
23 | type: string
24 |
25 | jobs:
26 | unit-tests:
27 | name: "${{ matrix.run-config['name'] }} on ${{ inputs.macos-runner }}"
28 | runs-on: ${{ inputs.macos-runner }}
29 | strategy:
30 | matrix:
31 | run-config:
32 | - {
33 | name: "macOS",
34 | condition: true,
35 | clean-destination: "generic/platform=macOS",
36 | test-destination: "platform=macOS,arch=arm64",
37 | }
38 | - {
39 | name: "iOS",
40 | condition: true,
41 | clean-destination: "generic/platform=iOS",
42 | test-destination: "platform=iOS Simulator,OS=${{ inputs.ios-version }},name=${{ inputs.ios-device }}",
43 | }
44 | - {
45 | name: "watchOS",
46 | condition: "${{ inputs.macos-runner == 'macos-26' }}",
47 | clean-destination: "generic/platform=watchOS",
48 | test-destination: "platform=watchOS Simulator,OS=${{ inputs.watchos-version }},name=Apple Watch Ultra 3 (49mm)",
49 | }
50 | - {
51 | name: "visionOS",
52 | condition: "${{ inputs.macos-runner == 'macos-26' }}",
53 | clean-destination: "generic/platform=visionOS",
54 | test-destination: "platform=visionOS Simulator,OS=${{ inputs.visionos-version }},name=Apple Vision Pro",
55 | }
56 | timeout-minutes: ${{ matrix.run-config['name'] == 'visionOS' && 60 || 30 }}
57 | steps:
58 | - uses: actions/checkout@v4
59 | - uses: maxim-lobanov/setup-xcode@v1
60 | with:
61 | xcode-version: ${{ inputs.xcode-version || '26.0' }}
62 | - name: Setup environment
63 | run: make setup
64 | - name: Setup Cache
65 | id: model-cache
66 | uses: actions/cache@v4
67 | with:
68 | path: Models
69 | key: ${{ runner.os }}-models
70 | - name: Download Models
71 | if: steps.model-cache.outputs.cache-hit != 'true'
72 | run: make download-model MODEL=tiny
73 | - name: Install and discover destinations
74 | if: ${{ matrix.run-config['condition'] == true }}
75 | run: |
76 | echo "Simulators on runner:"
77 | xcrun simctl list
78 | if [[ "${{ matrix.run-config['name'] }}" == "visionOS" ]]; then
79 | xcodebuild -downloadPlatform ${{ matrix.run-config['name'] }}
80 | fi
81 | echo "Runtimes for testing:"
82 | xcrun simctl list runtimes
83 | echo "Destinations for testing:"
84 | xcodebuild test-without-building -testPlan UnitTestsPlan -scheme whisperkit-Package -showdestinations
85 | - name: Boot Simulator and Wait
86 | if: ${{ matrix.run-config['condition'] == true && matrix.run-config['name'] != 'macOS' && inputs.macos-runner == 'macos-26' }}
87 | # Slower runners require some time to fully boot the simulator
88 | # Parse the simulator name from the destination string, boot it, and wait
89 | run: |
90 | simulator_name=$(echo '${{ matrix.run-config['test-destination'] }}' | sed -n 's/.*name=\([^,]*\).*/\1/p')
91 | xcrun simctl boot "$simulator_name" || true
92 | sleep 15
93 | xcrun simctl list devices
94 | - name: Build and Test - ${{ matrix.run-config['name'] }}
95 | if: ${{ matrix.run-config['condition'] == true }}
96 | run: |
97 | set -o pipefail
98 | xcodebuild clean build-for-testing -scheme whisperkit-Package -destination '${{ matrix.run-config['clean-destination'] }}' | xcpretty
99 | xcodebuild test -testPlan UnitTestsPlan -scheme whisperkit-Package -destination '${{ matrix.run-config['test-destination'] }}'
100 | - name: Upload Test Results
101 | if: failure()
102 | uses: actions/upload-artifact@v4
103 | with:
104 | name: test-results-${{ matrix.run-config['name']}}-on-${{ inputs.macos-runner }}
105 | path: |
106 | ~/Library/Developer/Xcode/DerivedData/**/Logs/Test/*.xcresult
107 | retention-days: 5
108 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX.xcodeproj/xcshareddata/xcschemes/WhisperAX.xcscheme:
--------------------------------------------------------------------------------
(Xcode scheme XML omitted; the markup was stripped during capture)
--------------------------------------------------------------------------------
/Sources/WhisperKitCLI/TranscribeCLIArguments.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import ArgumentParser
5 |
6 | struct TranscribeCLIArguments: ParsableArguments {
7 | @Option(help: "Paths to audio files")
8 | var audioPath = [String]()
9 |
10 | @Option(help: "Path to a folder containing audio files")
11 | var audioFolder: String?
12 |
13 | @Option(help: "Path of model files")
14 | var modelPath: String?
15 |
16 | @Option(help: "Model to download if no modelPath is provided")
17 | var model: String?
18 |
19 | @Option(help: "Text to add in front of the model name to specify between different types of the same variant (values: \"openai\", \"distil\")")
20 | var modelPrefix: String = "openai"
21 |
22 | @Option(help: "Path to save the downloaded model")
23 | var downloadModelPath: String?
24 |
25 | @Option(help: "Path to save the downloaded tokenizer files")
26 | var downloadTokenizerPath: String?
27 |
28 | @Option(help: "Compute units for audio encoder model with {all,cpuOnly,cpuAndGPU,cpuAndNeuralEngine,random}")
29 | var audioEncoderComputeUnits: ComputeUnits = .cpuAndNeuralEngine
30 |
31 | @Option(help: "Compute units for text decoder model with {all,cpuOnly,cpuAndGPU,cpuAndNeuralEngine,random}")
32 | var textDecoderComputeUnits: ComputeUnits = .cpuAndNeuralEngine
33 |
34 | @Flag(help: "Verbose mode")
35 | var verbose: Bool = false
36 |
37 | @Option(help: "Task to perform (transcribe or translate)")
38 | var task: String = "transcribe"
39 |
40 | @Option(help: "Language spoken in the audio")
41 | var language: String?
42 |
43 | @Option(help: "Temperature to use for sampling")
44 | var temperature: Float = 0
45 |
46 | @Option(help: "Temperature to increase on fallbacks during decoding")
47 | var temperatureIncrementOnFallback: Float = 0.2
48 |
49 | @Option(help: "Number of times to increase temperature when falling back during decoding")
50 | var temperatureFallbackCount: Int = 5
51 |
52 | @Option(help: "Number of candidates when sampling with non-zero temperature")
53 | var bestOf: Int = 5
54 |
55 | @Flag(help: "Force initial prompt tokens based on language, task, and timestamp options")
56 | var usePrefillPrompt: Bool = false
57 |
58 | @Flag(help: "Use decoder prefill data for faster initial decoding")
59 | var usePrefillCache: Bool = false
60 |
61 | @Flag(help: "Skip special tokens in the output")
62 | var skipSpecialTokens: Bool = false
63 |
64 | @Flag(help: "Force no timestamps when decoding")
65 | var withoutTimestamps: Bool = false
66 |
67 | @Flag(help: "Add timestamps for each word in the output")
68 | var wordTimestamps: Bool = false
69 |
70 | @Option(help: "Force prefix text when decoding")
71 | var prefix: String?
72 |
73 | @Option(help: "Condition on this text when decoding")
74 | var prompt: String?
75 |
76 | @Option(parsing: .upToNextOption, help: "List of timestamps (in seconds) of start and end values to transcribe as separate clips in a single audio file (example: --clip-timestamps 0 10.2 34.5 60.0)")
77 | var clipTimestamps: [Float] = []
78 |
79 | @Option(parsing: .upToNextOption, help: "List of tokens to suppress in the output (example: --supress-tokens 1 2 3)")
80 | var supressTokens: [Int] = []
81 |
82 | @Option(help: "Gzip compression ratio threshold for decoding failure")
83 | var compressionRatioThreshold: Float?
84 |
85 | @Option(help: "Average log probability threshold for decoding failure")
86 | var logprobThreshold: Float?
87 |
88 | @Option(help: "Log probability threshold for first token decoding failure")
89 | var firstTokenLogProbThreshold: Float?
90 |
91 | @Option(help: "Probability threshold to consider a segment as silence")
92 | var noSpeechThreshold: Float?
93 |
94 | @Flag(help: "Output a report of the results")
95 | var report: Bool = false
96 |
97 | @Option(help: "Directory to save the report")
98 | var reportPath: String = "."
99 |
100 | @Flag(help: "Process audio directly from the microphone")
101 | var stream: Bool = false
102 |
103 | @Flag(help: "Simulate streaming transcription using the input audio file")
104 | var streamSimulated: Bool = false
105 |
106 | @Option(help: "Maximum concurrent inference, might be helpful when processing more than 1 audio file at the same time. 0 means unlimited. Default: 4")
107 | var concurrentWorkerCount: Int = 4
108 |
109 | @Option(help: "Chunking strategy for audio processing, `none` means no chunking, `vad` means using voice activity detection. Default: `vad`")
110 | var chunkingStrategy: String = "vad"
111 | }
112 |
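For reference, flags map one-to-one onto the properties above, so a typical (illustrative) invocation might look like `swift run whisperkit-cli transcribe --audio-path audio.wav --model tiny --language en --word-timestamps`.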
--------------------------------------------------------------------------------
/.swiftpm/xcode/xcshareddata/xcschemes/whisperkit-Package.xcscheme:
--------------------------------------------------------------------------------
(Xcode scheme XML omitted; the markup was stripped during capture)
--------------------------------------------------------------------------------
/Examples/ServeCLIClient/Curl/translate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright © 2025 Argmax, Inc. All rights reserved.
4 | # For licensing see accompanying LICENSE.md file.
5 |
6 | # WhisperKit CurlClient - Translate Audio
7 | # Usage: ./translate.sh <audio-file> [options]
8 |
9 | set -e
10 |
11 | # Default values
12 | SERVER_URL="http://localhost:50060/v1"
13 | MODEL="tiny"
14 | LANGUAGE=""
15 | PROMPT=""
16 | RESPONSE_FORMAT="verbose_json"
17 | TEMPERATURE="0.0"
18 | VERBOSE="false"
19 |
20 | # Colors for output
21 | RED='\033[0;31m'
22 | GREEN='\033[0;32m'
23 | YELLOW='\033[1;33m'
24 | BLUE='\033[0;34m'
25 | NC='\033[0m' # No Color
26 |
27 | # Help function
28 | show_help() {
29 | echo "Usage: $0 [options]"
30 | echo ""
31 | echo "Arguments:"
32 | echo " audio-file Path to audio file (wav, mp3, m4a, flac, etc.)"
33 | echo ""
34 | echo "Options:"
35 | echo " -h, --help Show this help message"
36 | echo " -s, --server Server URL (default: http://localhost:50060/v1)"
37 | echo " -m, --model Model to use (default: tiny)"
38 | echo " -l, --language Source language code (e.g., es, ja, fr)"
39 | echo " -p, --prompt Text to guide translation (should be in English)"
40 | echo " -f, --response-format Response format: json, verbose_json (default: verbose_json)"
41 | echo " -t, --temperature Sampling temperature 0.0-1.0 (default: 0.0)"
42 | echo " --verbose Show verbose curl output"
43 | echo ""
44 | echo "Examples:"
45 | echo " $0 audio.wav"
46 | echo " $0 audio.wav --language es --response-format json"
47 | echo " $0 audio.wav --language ja --prompt \"This is a formal conversation\""
48 | echo ""
49 | }
50 |
51 | # Parse command line arguments
52 | AUDIO_FILE=""
53 | while [[ $# -gt 0 ]]; do
54 | case $1 in
55 | -h|--help)
56 | show_help
57 | exit 0
58 | ;;
59 | -s|--server)
60 | SERVER_URL="$2"
61 | shift 2
62 | ;;
63 | -m|--model)
64 | MODEL="$2"
65 | shift 2
66 | ;;
67 | -l|--language)
68 | LANGUAGE="$2"
69 | shift 2
70 | ;;
71 | -p|--prompt)
72 | PROMPT="$2"
73 | shift 2
74 | ;;
75 | -f|--response-format)
76 | RESPONSE_FORMAT="$2"
77 | shift 2
78 | ;;
79 | -t|--temperature)
80 | TEMPERATURE="$2"
81 | shift 2
82 | ;;
83 |
84 | --verbose)
85 | VERBOSE="true"
86 | shift
87 | ;;
88 | -*)
89 | echo -e "${RED}Error: Unknown option $1${NC}"
90 | show_help
91 | exit 1
92 | ;;
93 | *)
94 | if [[ -z "$AUDIO_FILE" ]]; then
95 | AUDIO_FILE="$1"
96 | else
97 | echo -e "${RED}Error: Multiple audio files specified${NC}"
98 | exit 1
99 | fi
100 | shift
101 | ;;
102 | esac
103 | done
104 |
105 | # Check if audio file is provided
106 | if [[ -z "$AUDIO_FILE" ]]; then
107 | echo -e "${RED}Error: Audio file is required${NC}"
108 | show_help
109 | exit 1
110 | fi
111 |
112 | # Check if audio file exists
113 | if [[ ! -f "$AUDIO_FILE" ]]; then
114 | echo -e "${RED}Error: Audio file '$AUDIO_FILE' not found${NC}"
115 | exit 1
116 | fi
117 |
118 | # Build curl command
119 | CURL_CMD="curl -X POST \"$SERVER_URL/audio/translations\""
120 | CURL_CMD="$CURL_CMD -H \"Content-Type: multipart/form-data\""
121 | CURL_CMD="$CURL_CMD -F \"file=@$AUDIO_FILE\""
122 | CURL_CMD="$CURL_CMD -F \"model=$MODEL\""
123 |
124 | if [[ -n "$LANGUAGE" ]]; then
125 | CURL_CMD="$CURL_CMD -F \"language=$LANGUAGE\""
126 | fi
127 |
128 | if [[ -n "$PROMPT" ]]; then
129 | CURL_CMD="$CURL_CMD -F \"prompt=$PROMPT\""
130 | fi
131 |
132 | CURL_CMD="$CURL_CMD -F \"response_format=$RESPONSE_FORMAT\""
133 | CURL_CMD="$CURL_CMD -F \"temperature=$TEMPERATURE\""
134 |
135 | # Add output flags based on verbose setting
136 | if [[ "$VERBOSE" == "true" ]]; then
137 | CURL_CMD="$CURL_CMD -v"
138 | else
139 | CURL_CMD="$CURL_CMD -s -S"
140 | fi
141 |
142 | echo -e "${BLUE}🚀 Starting translation...${NC}"
143 | echo -e "${YELLOW}📁 Audio file:${NC} $AUDIO_FILE"
144 | echo -e "${YELLOW}🌐 Server:${NC} $SERVER_URL"
145 | echo -e "${YELLOW}🤖 Model:${NC} $MODEL"
146 | echo -e "${YELLOW}📝 Response format:${NC} $RESPONSE_FORMAT"
147 | echo -e "${YELLOW}🌡️ Temperature:${NC} $TEMPERATURE"
148 | if [[ -n "$LANGUAGE" ]]; then
149 | echo -e "${YELLOW}🌍 Source language:${NC} $LANGUAGE"
150 | fi
151 | if [[ -n "$PROMPT" ]]; then
152 | echo -e "${YELLOW}💡 Prompt:${NC} $PROMPT"
153 | fi
154 | echo ""
155 |
156 | # Execute curl command
157 | echo -e "${BLUE}📡 Sending request...${NC}"
158 | echo ""
159 | eval $CURL_CMD
160 |
161 | echo ""
162 | echo -e "${GREEN}✅ Translation complete!${NC}"
163 |
--------------------------------------------------------------------------------
/Sources/WhisperKit/Utilities/Logging.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import OSLog
5 |
6 | open class Logging {
7 | public static let shared = Logging()
8 | public var logLevel: LogLevel = .none
9 |
10 | public typealias LoggingCallback = (_ message: String) -> Void
11 | public var loggingCallback: LoggingCallback?
12 |
13 | private let logger = OSLog(subsystem: Bundle.main.bundleIdentifier ?? "com.argmax.whisperkit", category: "WhisperKit")
14 |
15 | @frozen
16 | public enum LogLevel: Int {
17 | case debug = 1
18 | case info = 2
19 | case error = 3
20 | case none = 4
21 |
22 | func shouldLog(level: LogLevel) -> Bool {
23 | return self.rawValue <= level.rawValue
24 | }
25 | }
26 |
27 | private init() {}
28 |
29 | public func log(_ items: Any..., separator: String = " ", terminator: String = "\n", type: OSLogType) {
30 | let message = items.map { "\($0)" }.joined(separator: separator)
31 | if let logger = loggingCallback {
32 | logger(message)
33 | } else {
34 | os_log("%{public}@", log: logger, type: type, message)
35 | }
36 | }
37 |
38 | public static func debug(_ items: Any..., separator: String = " ", terminator: String = "\n") {
39 | if shared.logLevel.shouldLog(level: .debug) {
40 | shared.log(items, separator: separator, terminator: terminator, type: .debug)
41 | }
42 | }
43 |
44 | public static func info(_ items: Any..., separator: String = " ", terminator: String = "\n") {
45 | if shared.logLevel.shouldLog(level: .info) {
46 | shared.log(items, separator: separator, terminator: terminator, type: .info)
47 | }
48 | }
49 |
50 | public static func error(_ items: Any..., separator: String = " ", terminator: String = "\n") {
51 | if shared.logLevel.shouldLog(level: .error) {
52 | shared.log(items, separator: separator, terminator: terminator, type: .error)
53 | }
54 | }
55 | }
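// Example usage (illustrative):
//   Logging.shared.logLevel = .debug
//   Logging.shared.loggingCallback = { message in print("WhisperKit:", message) }
//   Logging.debug("Model loaded")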
56 |
57 | public extension Logging {
58 | static func logCurrentMemoryUsage(_ message: String) {
59 | let memoryUsage = getMemoryUsage()
60 | Logging.debug("\(message) - Memory usage: \(memoryUsage) MB")
61 | }
62 |
63 | static func getMemoryUsage() -> UInt64 {
64 | var info = mach_task_basic_info()
65 | var count = mach_msg_type_number_t(MemoryLayout<mach_task_basic_info>.size) / 4
66 |
67 | let kerr: kern_return_t = withUnsafeMutablePointer(to: &info) {
68 | $0.withMemoryRebound(to: integer_t.self, capacity: 1) {
69 | task_info(mach_task_self_, task_flavor_t(MACH_TASK_BASIC_INFO), $0, &count)
70 | }
71 | }
72 |
73 | guard kerr == KERN_SUCCESS else {
74 | return 0 // If the call fails, return 0
75 | }
76 |
77 | return info.resident_size / 1024 / 1024 // Convert to MB
78 | }
79 | }
80 |
81 | @available(*, deprecated, message: "Subject to removal in a future version. Use `Logging.logCurrentMemoryUsage(_:)` instead.")
82 | public func logCurrentMemoryUsage(_ message: String) {
83 | Logging.logCurrentMemoryUsage(message)
84 | }
85 |
86 | @available(*, deprecated, message: "Subject to removal in a future version. Use `Logging.getMemoryUsage()` instead.")
87 | public func getMemoryUsage() -> UInt64 {
88 | return Logging.getMemoryUsage()
89 | }
90 |
91 | extension Logging {
92 | enum AudioEncoding {
93 | static let logger = Logger(
94 | subsystem: Constants.Logging.subsystem,
95 | category: "AudioEncoding"
96 | )
97 | static let signposter = OSSignposter(logger: logger)
98 | }
99 |
100 | enum FeatureExtractor {
101 | static let logger = Logger(
102 | subsystem: Constants.Logging.subsystem,
103 | category: "FeatureExtractor"
104 | )
105 | static let signposter = OSSignposter(logger: logger)
106 | }
107 |
108 | enum TranscribeTask {
109 | static let logger = Logger(
110 | subsystem: Constants.Logging.subsystem,
111 | category: "TranscribeTask"
112 | )
113 | static let signposter = OSSignposter(logger: logger)
114 | }
115 |
116 | static func beginSignpost(
117 | _ intervalName: StaticString,
118 | signposter: OSSignposter
119 | ) -> OSSignpostIntervalState {
120 | let signpostId = signposter.makeSignpostID()
121 | return signposter.beginInterval(intervalName, id: signpostId)
122 | }
123 |
124 | static func endSignpost(
125 | _ intervalName: StaticString,
126 | interval: OSSignpostIntervalState,
127 | signposter: OSSignposter
128 | ) {
129 | signposter.endInterval(intervalName, interval)
130 | }
131 |
132 | static func formatTimestamp(_ timestamp: Float) -> String {
133 | return String(format: "%.2f", timestamp)
134 | }
135 |
136 | static func formatTimeWithPercentage(_ time: Double, _ runs: Double, _ fullPipelineDuration: Double) -> String {
137 | let percentage = (time * 1000 / fullPipelineDuration) * 100 // Convert to percentage
138 | let runTime = runs > 0 ? time * 1000 / Double(runs) : 0
139 | let formattedString = String(format: "%8.2f ms / %6.0f runs (%8.2f ms/run) %5.2f%%", time * 1000, runs, runTime, percentage)
140 | return formattedString
141 | }
142 | }
143 |
144 |
--------------------------------------------------------------------------------
/Examples/ServeCLIClient/Curl/transcribe.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright © 2025 Argmax, Inc. All rights reserved.
4 | # For licensing see accompanying LICENSE.md file.
5 |
6 | # WhisperKit Local Server Transcription Client
7 | # Usage: ./transcribe.sh [options]
8 |
9 | set -e
10 |
11 | # Default values
12 | AUDIO_FILE=""
13 | MODEL="tiny"
14 | LANGUAGE=""
15 | PROMPT=""
16 | RESPONSE_FORMAT="verbose_json"
17 | TIMESTAMP_GRANULARITIES="segment"
18 | TEMPERATURE="0.0"
19 | STREAM="false"
20 | VERBOSE="false"
21 | LOGPROBS="false"
22 |
23 | # Parse command line arguments
24 | while [[ $# -gt 0 ]]; do
25 | case $1 in
26 | --model)
27 | MODEL="$2"
28 | shift 2
29 | ;;
30 | --language)
31 | LANGUAGE="$2"
32 | shift 2
33 | ;;
34 | --prompt)
35 | PROMPT="$2"
36 | shift 2
37 | ;;
38 | --response-format)
39 | RESPONSE_FORMAT="$2"
40 | shift 2
41 | ;;
42 | --timestamp-granularities)
43 | TIMESTAMP_GRANULARITIES="$2"
44 | shift 2
45 | ;;
46 | --temperature)
47 | TEMPERATURE="$2"
48 | shift 2
49 | ;;
50 | --stream)
51 | STREAM="$2"
52 | shift 2
53 | ;;
54 | --logprobs)
55 | LOGPROBS="true"
56 | shift
57 | ;;
58 | --verbose)
59 | VERBOSE="true"
60 | shift
61 | ;;
62 | -h|--help)
63 | echo "Usage: $0 [options]"
64 | echo ""
65 | echo "Options:"
66 | echo " --model Model to use (default: tiny)"
67 | echo " --language Language code (e.g., en, es, fr)"
68 | echo " --prompt Prompt text for transcription"
69 | echo " --response-format Response format: json, verbose_json (default: verbose_json)"
70 | echo " --timestamp-granularities Comma-separated list: word,segment (default: segment)"
71 | echo " --temperature Temperature for sampling (default: 0.0)"
72 | echo " --stream Enable streaming (default: false)"
73 | echo " --logprobs Include logprobs in transcription (default: false)"
74 | echo " --verbose Show verbose output"
75 | echo " -h, --help Show this help message"
76 | echo ""
77 | echo "Examples:"
78 | echo " $0 audio.wav"
79 | echo " $0 audio.wav --model base --language en"
80 | echo " $0 audio.wav --timestamp-granularities word,segment --stream true"
81 | exit 0
82 | ;;
83 | *)
84 | if [[ -z "$AUDIO_FILE" ]]; then
85 | AUDIO_FILE="$1"
86 | else
87 | echo "Error: Unknown option $1"
88 | exit 1
89 | fi
90 | shift
91 | ;;
92 | esac
93 | done
94 |
95 | # Check if audio file is provided
96 | if [[ -z "$AUDIO_FILE" ]]; then
97 | echo "Error: Audio file is required"
98 | echo "Usage: $0 [options]"
99 | exit 1
100 | fi
101 |
102 | # Check if audio file exists
103 | if [[ ! -f "$AUDIO_FILE" ]]; then
104 | echo "Error: Audio file '$AUDIO_FILE' not found"
105 | exit 1
106 | fi
107 |
108 | # Build curl command
109 | CURL_CMD="curl -s -S"
110 |
111 | # Add verbose flag if requested
112 | if [[ "$VERBOSE" == "true" ]]; then
113 | CURL_CMD="$CURL_CMD -v"
114 | fi
115 |
116 | CURL_CMD="$CURL_CMD -X POST http://localhost:50060/v1/audio/transcriptions"
117 |
118 | # Build multipart form data
119 | CURL_CMD="$CURL_CMD -F file=@\"$AUDIO_FILE\""
120 | CURL_CMD="$CURL_CMD -F model=\"$MODEL\""
121 | CURL_CMD="$CURL_CMD -F response_format=\"$RESPONSE_FORMAT\""
122 | CURL_CMD="$CURL_CMD -F timestamp_granularities[]=\"$TIMESTAMP_GRANULARITIES\""
123 | CURL_CMD="$CURL_CMD -F temperature=\"$TEMPERATURE\""
124 | CURL_CMD="$CURL_CMD -F stream=\"$STREAM\""
125 | # Add logprobs if specified
126 | if [ "$LOGPROBS" = "true" ]; then
127 | CURL_CMD="$CURL_CMD -F \"include[]=logprobs\""
128 | fi
129 |
130 | if [[ -n "$LANGUAGE" ]]; then
131 | CURL_CMD="$CURL_CMD -F language=\"$LANGUAGE\""
132 | fi
133 |
134 | if [[ -n "$PROMPT" ]]; then
135 | CURL_CMD="$CURL_CMD -F prompt=\"$PROMPT\""
136 | fi
137 |
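# For reference, with the default settings the assembled command looks like this (illustrative):
#   curl -s -S -X POST http://localhost:50060/v1/audio/transcriptions \
#     -F file=@"audio.wav" -F model="tiny" -F response_format="verbose_json" \
#     -F timestamp_granularities[]="segment" -F temperature="0.0" -F stream="false"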
138 | echo "🔄 Transcribing: $AUDIO_FILE"
139 | echo "📋 Options: model=$MODEL, format=$RESPONSE_FORMAT, granularities=$TIMESTAMP_GRANULARITIES, stream=$STREAM"
140 | echo ""
141 |
142 | # Execute curl command
143 | if [[ "$STREAM" == "true" ]]; then
144 | # For streaming, process line by line with timestamps
145 | echo "📡 Starting streaming transcription..."
146 | echo "⏰ Timestamps show when each piece of data arrives:"
147 | echo ""
148 |
149 | # Use a function to add timestamps to each line (note: the %3N millisecond field requires GNU date; stock macOS date prints it literally)
150 | timestamp_stream() {
151 | while IFS= read -r line; do
152 | if [[ -n "$line" ]]; then
153 | timestamp=$(date '+%H:%M:%S.%3N')
154 | echo "[$timestamp] $line"
155 | fi
156 | done
157 | }
158 |
159 | eval "$CURL_CMD" | timestamp_stream
160 | else
161 | # For non-streaming, just execute normally
162 | eval "$CURL_CMD"
163 | fi
164 |
165 | echo ""
166 | echo "✅ Transcription complete"
167 |
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/config-v02.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "whisperkit-coreml",
3 | "version": "0.2",
4 | "device_support": [
5 | {
6 | "identifiers": ["iPhone11", "iPhone12", "Watch7", "Watch8"],
7 | "models": {
8 | "default": "openai_whisper-tiny",
9 | "supported": [
10 | "openai_whisper-tiny",
11 | "openai_whisper-tiny.en",
12 | "openai_whisper-base",
13 | "openai_whisper-base.en"
14 | ]
15 | }
16 | },
17 | {
18 | "identifiers": ["iPhone13", "iPad13,18", "iPad13,1"],
19 | "models": {
20 | "default": "openai_whisper-base",
21 | "supported": [
22 | "openai_whisper-tiny",
23 | "openai_whisper-tiny.en",
24 | "openai_whisper-base",
25 | "openai_whisper-base.en",
26 | "openai_whisper-small",
27 | "openai_whisper-small.en"
28 | ]
29 | }
30 | },
31 | {
32 | "identifiers": [
33 | "iPhone14",
34 | "iPhone15",
35 | "iPhone16",
36 | "iPhone17",
37 | "iPad14,1",
38 | "iPad14,2"
39 | ],
40 | "models": {
41 | "default": "openai_whisper-base",
42 | "supported": [
43 | "openai_whisper-tiny",
44 | "openai_whisper-tiny.en",
45 | "openai_whisper-base",
46 | "openai_whisper-base.en",
47 | "openai_whisper-small",
48 | "openai_whisper-small.en",
49 | "openai_whisper-large-v2_949MB",
50 | "openai_whisper-large-v2_turbo_955MB",
51 | "openai_whisper-large-v3_947MB",
52 | "openai_whisper-large-v3_turbo_954MB",
53 | "distil-whisper_distil-large-v3_594MB",
54 | "distil-whisper_distil-large-v3_turbo_600MB",
55 | "openai_whisper-large-v3-v20240930_626MB",
56 | "openai_whisper-large-v3-v20240930_turbo_632MB"
57 | ]
58 | }
59 | },
60 | {
61 | "identifiers": [
62 | "Mac13",
63 | "iMac21",
64 | "MacBookAir10,1",
65 | "MacBookPro17",
66 | "MacBookPro18",
67 | "Macmini9",
68 | "iPad13,16",
69 | "iPad13,4",
70 | "iPad13,8"
71 | ],
72 | "models": {
73 | "default": "openai_whisper-large-v3-v20240930",
74 | "supported": [
75 | "openai_whisper-tiny",
76 | "openai_whisper-tiny.en",
77 | "openai_whisper-base",
78 | "openai_whisper-base.en",
79 | "openai_whisper-small",
80 | "openai_whisper-small.en",
81 | "openai_whisper-large-v2",
82 | "openai_whisper-large-v2_949MB",
83 | "openai_whisper-large-v3",
84 | "openai_whisper-large-v3_947MB",
85 | "distil-whisper_distil-large-v3",
86 | "distil-whisper_distil-large-v3_594MB",
87 | "openai_whisper-large-v3-v20240930",
88 | "openai_whisper-large-v3-v20240930_626MB"
89 | ]
90 | }
91 | },
92 | {
93 | "identifiers": [
94 | "Mac14",
95 | "Mac15",
96 | "Mac16",
97 | "iPad14,3",
98 | "iPad14,4",
99 | "iPad14,5",
100 | "iPad14,6",
101 | "iPad14,8",
102 | "iPad14,9",
103 | "iPad14,10",
104 | "iPad14,11",
105 | "iPad16"
106 | ],
107 | "models": {
108 | "default": "openai_whisper-large-v3-v20240930",
109 | "supported": [
110 | "openai_whisper-tiny",
111 | "openai_whisper-tiny.en",
112 | "openai_whisper-base",
113 | "openai_whisper-base.en",
114 | "openai_whisper-small",
115 | "openai_whisper-small.en",
116 | "openai_whisper-large-v2",
117 | "openai_whisper-large-v2_949MB",
118 | "openai_whisper-large-v2_turbo",
119 | "openai_whisper-large-v2_turbo_955MB",
120 | "openai_whisper-large-v3",
121 | "openai_whisper-large-v3_947MB",
122 | "openai_whisper-large-v3_turbo",
123 | "openai_whisper-large-v3_turbo_954MB",
124 | "distil-whisper_distil-large-v3",
125 | "distil-whisper_distil-large-v3_594MB",
126 | "distil-whisper_distil-large-v3_turbo",
127 | "distil-whisper_distil-large-v3_turbo_600MB",
128 | "openai_whisper-large-v3-v20240930",
129 | "openai_whisper-large-v3-v20240930_turbo",
130 | "openai_whisper-large-v3-v20240930_626MB",
131 | "openai_whisper-large-v3-v20240930_turbo_632MB"
132 | ]
133 | }
134 | }
135 | ]
136 | }
137 |
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/Audio/AudioChunker.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Accelerate
5 | import AVFoundation
6 | import Foundation
7 |
8 | /// Responsible for chunking audio into smaller pieces
9 | public protocol AudioChunking {
10 | func chunkAll(audioArray: [Float], maxChunkLength: Int, decodeOptions: DecodingOptions?) async throws -> [AudioChunk]
11 | }
12 |
13 | public extension AudioChunking {
14 | func updateSeekOffsetsForResults(
15 | chunkedResults: [Result<[TranscriptionResult], Swift.Error>],
16 | audioChunks: [AudioChunk]
17 | ) -> [TranscriptionResult] {
18 | var updatedTranscriptionResults = [TranscriptionResult]()
19 | for (index, chunkedResult) in chunkedResults.enumerated() {
20 | switch chunkedResult {
21 | case let .success(results):
22 | let seekTime = Float(audioChunks[index].seekOffsetIndex) / Float(WhisperKit.sampleRate)
23 | for result in results {
24 | var updatedSegments = [TranscriptionSegment]()
25 | for segment in result.segments {
26 | let updatedSegment = TranscriptionUtilities.updateSegmentTimings(segment: segment, seekTime: seekTime)
27 | updatedSegments.append(updatedSegment)
28 | }
29 | var updatedResult = result
30 | updatedResult.seekTime = seekTime
31 | updatedResult.segments = updatedSegments
32 | updatedTranscriptionResults.append(updatedResult)
33 | }
34 | case let .failure(error):
35 | Logging.debug("Error transcribing chunk \(index): \(error)")
36 | }
37 | }
38 | return updatedTranscriptionResults
39 | }
40 | }
41 |
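// Example usage (illustrative): chunk into 30s windows at WhisperKit.sampleRate (16 kHz)
//   let chunker = VADAudioChunker()
//   let chunks = try await chunker.chunkAll(audioArray: samples, maxChunkLength: 480_000, decodeOptions: nil)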
42 | /// An audio chunker that splits audio into smaller pieces based on voice activity detection
43 | open class VADAudioChunker: AudioChunking {
44 | /// prevent hallucinations at the end of the clip by stopping up to 1.0s early
45 | private let windowPadding: Int
46 | private let vad: VoiceActivityDetector
47 |
48 | public init(windowPadding: Int = 16000, vad: VoiceActivityDetector? = nil) {
49 | self.windowPadding = windowPadding
50 | self.vad = vad ?? EnergyVAD()
51 | }
52 |
53 | private func splitOnMiddleOfLongestSilence(audioArray: [Float], startIndex: Int, endIndex: Int) -> Int {
54 | // NOTE: we want to check just the 2nd part for the silence to attempt to get closest to a max length chunk
55 | let audioMidIndex = startIndex + (endIndex - startIndex) / 2
56 | let vadAudioSlice = Array(audioArray[audioMidIndex..<endIndex])
57 | let voiceActivity = vad.voiceActivity(in: vadAudioSlice)
58 | if let silence = vad.findLongestSilence(in: voiceActivity) {
59 | // if silence is detected, split the chunk at the middle of the longest silence
60 | let silenceMidIndex = silence.startIndex + (silence.endIndex - silence.startIndex) / 2
61 | return audioMidIndex + vad.voiceActivityIndexToAudioSampleIndex(silenceMidIndex)
62 | }
63 | // if no silence is found, fall back to splitting at the max chunk length
64 | return endIndex
65 | }
66 | public func chunkAll(audioArray: [Float], maxChunkLength: Int, decodeOptions: DecodingOptions?) async throws -> [AudioChunk] {
67 | // If the audio array length is less than or equal to maxLength, return it as a single chunk
68 | if audioArray.count <= maxChunkLength {
69 | return [AudioChunk(seekOffsetIndex: 0, audioSamples: audioArray)]
70 | }
71 |
72 | // First create chunks from seek clips
73 | let options = decodeOptions ?? DecodingOptions()
74 | let seekClips = options.prepareSeekClips(contentFrames: audioArray.count)
75 |
76 | var chunkedAudio = [AudioChunk]()
77 | for (seekClipStart, seekClipEnd) in seekClips {
78 | // Loop through the current clip until we reach the end
79 | // Typically this will be the full audio file, unless seek points are explicitly provided
80 | var startIndex = seekClipStart
81 | while startIndex < seekClipEnd - windowPadding {
82 | guard startIndex >= 0 && startIndex < audioArray.count else {
83 | throw WhisperError.audioProcessingFailed("startIndex is outside the buffer size")
84 | }
85 |
86 | // Make sure we still need chunking for this seek clip, otherwise use the original seek clip end
87 | var endIndex = seekClipEnd
88 | if startIndex + maxChunkLength < endIndex {
89 | // Adjust the end index based on VAD
90 | endIndex = splitOnMiddleOfLongestSilence(
91 | audioArray: audioArray,
92 | startIndex: startIndex,
93 | endIndex: min(audioArray.count, startIndex + maxChunkLength)
94 | )
95 | }
96 |
97 | guard endIndex > startIndex else {
98 | break
99 | }
100 | Logging.debug("Found chunk from \(Logging.formatTimestamp(Float(startIndex) / Float(WhisperKit.sampleRate))) to \(Logging.formatTimestamp(Float(endIndex) / Float(WhisperKit.sampleRate)))")
101 | let audioSlice = AudioChunk(seekOffsetIndex: startIndex, audioSamples: Array(audioArray[startIndex..<endIndex]))
102 | chunkedAudio.append(audioSlice)
103 | startIndex = endIndex
104 | }
105 | }
106 | return chunkedAudio
107 | }
108 | }
109 |
--------------------------------------------------------------------------------
/BENCHMARKS.md:
--------------------------------------------------------------------------------
40 | > [!IMPORTANT]
41 | > An active developer account is required to run the tests on physical devices.
42 |
43 | Before running tests, all external devices need to be connected and paired to your Mac, as well as registered with your developer account. Ensure the devices are in Developer Mode. If nothing appears after connecting the devices via cable, press `Command + Shift + 2` to open the list of devices and track their progress.
44 |
45 | ## Datasets
46 |
47 | The datasets for the test suite can be set in a global array called `datasets` in the file [`Tests/WhisperKitTests/RegressionTests.swift`](Tests/WhisperKitTests/RegressionTests.swift). It is prefilled with the datasets that are currently available.
48 |
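For illustration, a sketch of what that array might look like (a hypothetical example; the dataset identifiers here are placeholders, and the real list is in `RegressionTests.swift`):

```swift
// Hypothetical sketch; the actual identifiers are prefilled in RegressionTests.swift
let datasets = [
    "librispeech-10mins",
    "earnings22-10mins",
]
```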
49 | ## Models
50 |
51 | The models for the test suite can be set in the [`Fastfile`](fastlane/Fastfile). Simply find `BENCHMARK_CONFIGS` and modify the `models` array under the benchmark you want to run.
52 |
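For illustration, a `BENCHMARK_CONFIGS` entry might be shaped roughly like this (a hypothetical sketch; the keys and model lists are placeholders, so check the actual `Fastfile` for the real structure):

```ruby
# Hypothetical sketch; see fastlane/Fastfile for the real BENCHMARK_CONFIGS
BENCHMARK_CONFIGS = {
  debug: {
    test_identifier: "RegressionTests/testModelPerformanceWithDebugConfig",
    models: ["tiny"]
  },
  full: {
    test_identifier: "RegressionTests/testModelPerformance",
    models: ["tiny", "base", "small", "large-v3"]
  }
}
```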
53 | ## Makefile and Fastlane
54 |
55 | The tests are run using [Fastlane](fastlane/Fastfile), which is controlled by a [Makefile](Makefile). The Makefile contains the following commands:
56 |
57 | ### List Connected Devices
58 |
59 | Before running the tests it might be a good idea to list the connected devices to resolve any connection issues. Simply run:
60 |
61 | ```sh
62 | make list-devices
63 | ```
64 |
65 | The output will be a list with entries that look something like this:
66 |
67 | ```ruby
68 | {
69 | :name=>"My Mac",
70 | :type=>"Apple M2 Pro",
71 | :platform=>"macOS",
72 | :os_version=>"15.0.1",
73 | :product=>"Mac14,12",
74 | :id=>"XXXXXXXX-1234-5678-9012-XXXXXXXXXXXX",
75 | :state=>"connected"
76 | }
77 | ```
78 |
79 | Verify that the devices are connected and the state is `connected`.
80 |
81 | ### Running Benchmarks
82 |
83 | After completing the above steps, you can run the tests. Note that there are two different test configurations: one named `full` and the other named `debug`. To check for potential errors, run the `debug` tests:
84 |
85 | ```sh
86 | make benchmark-devices DEBUG=true
87 | ```
88 |
89 | Otherwise run the `full` tests:
90 |
91 | ```sh
92 | make benchmark-devices
93 | ```
94 |
95 | Optionally, for both tests, you can specify the list of devices for the tests using the `DEVICES` option:
96 |
97 | ```sh
98 | make benchmark-devices DEVICES="iPhone 15 Pro Max,My Mac"
99 | ```
100 |
101 | The `DEVICES` option is a comma-separated list of device names. The device names can be found by running `make list-devices` and using the value for the `:name` key.
102 |
103 | ### Results
104 |
105 | After the tests are run, the generated results can be found under `fastlane/benchmark_data`, including the .xcresult file with logs and attachments for each device. There will also be a folder called `fastlane/upload_folder/benchmark_data` that contains only the JSON results from `fastlane/benchmark_data`, which can be used for further analysis.
106 |
107 | We will periodically run these tests on a range of devices and upload the results to the [argmaxinc/whisperkit-evals-dataset](https://huggingface.co/datasets/argmaxinc/whisperkit-evals-dataset), which will propagate to the [WhisperKit Benchmarks](https://huggingface.co/spaces/argmaxinc/whisperkit-benchmarks) space and be available for comparison.
108 |
109 |
110 | # Troubleshooting
111 |
112 |
113 | If you encounter issues while running the tests, here are a few things to try:
114 |
115 | 1. Open the project in Xcode and run the tests directly from there.
116 | 1. To do this, open the example app (from command line type: `xed Examples/WhisperAX`) and run the test named `RegressionTests/testModelPerformanceWithDebugConfig` from the test navigator.
117 | 2. If the tests run successfully, you can rule out any issues with the device or the models.
118 | 3. If they don't run successfully, Xcode will provide more detailed error messages.
119 | 2. Try specifying a single device to run the tests on. This can be done by running `make list-devices` and then running the tests with the `DEVICES` option set to the name of the device you want to test on. For example, `make benchmark-devices DEVICES="My Mac"`. This will also enable you to see the logs for that specific device.
120 | 3. If you are still encountering issues, please reach out to us on the [Discord](https://discord.gg/G5F5GZGecC) or create an [issue](https://github.com/argmaxinc/WhisperKit/issues) on GitHub.
121 |
--------------------------------------------------------------------------------
/Sources/WhisperKit/Utilities/ResultWriter.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 |
6 | public protocol ResultWriting {
7 | var outputDir: String { get }
8 | func write(result: TranscriptionResult, to file: String, options: [String: Any]?) -> Result<String, Error>
9 | func formatTime(seconds: Float, alwaysIncludeHours: Bool, decimalMarker: String) -> String
10 | }
11 |
12 | public extension ResultWriting {
13 | /// Format a time value as a string
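/// e.g. formatTime(seconds: 3661.5, alwaysIncludeHours: true, decimalMarker: ",") == "01:01:01,500"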
14 | func formatTime(seconds: Float, alwaysIncludeHours: Bool, decimalMarker: String) -> String {
15 | let hrs = Int(seconds / 3600)
16 | let mins = Int((seconds.truncatingRemainder(dividingBy: 3600)) / 60)
17 | let secs = Int(seconds.truncatingRemainder(dividingBy: 60))
18 | let msec = Int((seconds - floor(seconds)) * 1000)
19 |
20 | if alwaysIncludeHours || hrs > 0 {
21 | return String(format: "%02d:%02d:%02d\(decimalMarker)%03d", hrs, mins, secs, msec)
22 | } else {
23 | return String(format: "%02d:%02d\(decimalMarker)%03d", mins, secs, msec)
24 | }
25 | }
26 |
27 | func formatSegment(index: Int, start: Float, end: Float, text: String) -> String {
28 | let startFormatted = formatTime(seconds: Float(start), alwaysIncludeHours: true, decimalMarker: ",")
29 | let endFormatted = formatTime(seconds: Float(end), alwaysIncludeHours: true, decimalMarker: ",")
30 | return "\(index)\n\(startFormatted) --> \(endFormatted)\n\(text)\n\n"
31 | }
32 |
33 | func formatTiming(start: Float, end: Float, text: String) -> String {
34 | let startFormatted = formatTime(seconds: Float(start), alwaysIncludeHours: false, decimalMarker: ".")
35 | let endFormatted = formatTime(seconds: Float(end), alwaysIncludeHours: false, decimalMarker: ".")
36 | return "\(startFormatted) --> \(endFormatted)\n\(text)\n\n"
37 | }
38 | }
39 |
40 | open class WriteJSON: ResultWriting {
41 | public let outputDir: String
42 |
43 | public init(outputDir: String) {
44 | self.outputDir = outputDir
45 | }
46 |
47 | /// Write a transcription result to a JSON file
48 | /// - Parameters:
49 | /// - result: Completed transcription result
50 | /// - file: Name of the file to write, without the extension
51 | /// - options: Not used
52 | /// - Returns: The URL of the written file, or an error if the write failed
53 | public func write(result: TranscriptionResult, to file: String, options: [String: Any]? = nil) -> Result<String, Error> {
54 | let reportPathURL = URL(fileURLWithPath: outputDir)
55 | let reportURL = reportPathURL.appendingPathComponent("\(file).json")
56 | let jsonEncoder = JSONEncoder()
57 | jsonEncoder.outputFormatting = .prettyPrinted
58 | do {
59 | let reportJson = try jsonEncoder.encode(result)
60 | try reportJson.write(to: reportURL)
61 | } catch {
62 | return .failure(error)
63 | }
64 |
65 | return .success(reportURL.absoluteString)
66 | }
67 | }
68 |
69 | open class WriteSRT: ResultWriting {
70 | public let outputDir: String
71 |
72 | public init(outputDir: String) {
73 | self.outputDir = outputDir
74 | }
75 |
76 | public func write(result: TranscriptionResult, to file: String, options: [String: Any]? = nil) -> Result<String, Error> {
77 | let outputPathURL = URL(fileURLWithPath: outputDir)
78 | let outputFileURL = outputPathURL.appendingPathComponent("\(file).srt")
79 |
80 | do {
81 | var srtContent = ""
82 | var index = 1
83 | for segment in result.segments {
84 | if let wordTimings = segment.words, !wordTimings.isEmpty {
85 | for wordTiming in wordTimings {
86 | srtContent += formatSegment(index: index, start: wordTiming.start, end: wordTiming.end, text: wordTiming.word)
87 | index += 1
88 | }
89 | } else {
90 | // Use segment timing if word timings are not available
91 | srtContent += formatSegment(index: index, start: segment.start, end: segment.end, text: segment.text)
92 | index += 1
93 | }
94 | }
95 |
96 | try srtContent.write(to: outputFileURL, atomically: true, encoding: .utf8)
97 | return .success(outputFileURL.absoluteString)
98 | } catch {
99 | return .failure(error)
100 | }
101 | }
102 | }
103 |
104 | open class WriteVTT: ResultWriting {
105 | public let outputDir: String
106 |
107 | public init(outputDir: String) {
108 | self.outputDir = outputDir
109 | }
110 |
111 | public func write(result: TranscriptionResult, to file: String, options: [String: Any]? = nil) -> Result<String, Error> {
112 | let outputPathURL = URL(fileURLWithPath: outputDir)
113 | let outputFileURL = outputPathURL.appendingPathComponent("\(file).vtt")
114 |
115 | do {
116 | var vttContent = "WEBVTT\n\n"
117 | for segment in result.segments {
118 | if let wordTimings = segment.words, !wordTimings.isEmpty {
119 | for wordTiming in wordTimings {
120 | vttContent += formatTiming(start: wordTiming.start, end: wordTiming.end, text: wordTiming.word)
121 | }
122 | } else {
123 | // Use segment timing if word timings are not available
124 | vttContent += formatTiming(start: segment.start, end: segment.end, text: segment.text)
125 | }
126 | }
127 |
128 | try vttContent.write(to: outputFileURL, atomically: true, encoding: .utf8)
129 | return .success(outputFileURL.absoluteString)
130 | } catch {
131 | return .failure(error)
132 | }
133 | }
134 | }
135 |
--------------------------------------------------------------------------------
/Examples/ServeCLIClient/Python/test_translate.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2025 Argmax, Inc. All rights reserved.
2 | # For licensing see accompanying LICENSE.md file.
3 |
4 | """
5 | Test translation with audio files from Tests/WhisperKitTests/Resources/
6 |
7 | This script tests translation functionality using the actual test audio files
8 | from the WhisperKit test suite.
9 | """
10 |
11 | import os
12 | import sys
13 | import argparse
14 | from pathlib import Path
15 | from openai import OpenAI
16 |
17 |
18 | def get_test_audio_files():
19 | """
20 | Get list of available test audio files from Tests/WhisperKitTests/Resources/
21 |
22 | Returns:
23 | List of audio file paths
24 | """
25 | # Path to test resources relative to project root
26 | resources_dir = Path(__file__).parent.parent.parent.parent / "Tests" / "WhisperKitTests" / "Resources"
27 |
28 | if not resources_dir.exists():
29 | print(f"Error: Test resources directory not found: {resources_dir}")
30 | return []
31 |
32 | audio_extensions = {'.wav', '.m4a', '.mp3', '.flac', '.aac'}
33 |
34 | audio_files = []
35 | for file_path in resources_dir.iterdir():
36 | if file_path.is_file() and file_path.suffix.lower() in audio_extensions:
37 | audio_files.append(file_path)
38 |
39 | return sorted(audio_files)
40 |
41 |
42 | def translate_test_file(client, audio_file_path, prompt=None):
43 | """
44 | Translate a test audio file using the local WhisperKit server.
45 |
46 | Args:
47 | client: OpenAI client instance
48 | audio_file_path: Path to the audio file
49 | prompt: Optional prompt to guide translation
50 |
51 | Returns:
52 | Translation result or None if failed
53 | """
54 | try:
55 | print(f"Translating: {audio_file_path.name}")
56 |
57 | with open(audio_file_path, "rb") as audio_file:
58 | response = client.audio.translations.create(
59 | model="tiny",
60 | file=audio_file,
61 | prompt=prompt,
62 | response_format="verbose_json"
63 | )
64 | return response
65 | except Exception as e:
66 | print(f"Error translating {audio_file_path.name}: {e}")
67 | return None
68 |
69 |
70 | def main():
71 | parser = argparse.ArgumentParser(
72 | description="Test translation with WhisperKit test audio files"
73 | )
74 | parser.add_argument(
75 | "--prompt",
76 | help="Optional prompt to guide translation"
77 | )
78 | parser.add_argument(
79 | "--server-url",
80 | default="http://localhost:50060/v1",
81 | help="WhisperKit server URL (default: http://localhost:50060/v1)"
82 | )
83 | parser.add_argument(
84 | "--file",
85 | help="Specific test file to translate (e.g., 'es_test_clip.wav')"
86 | )
87 | parser.add_argument(
88 | "--target-language",
89 | default="en",
90 | help="Target language for translation (default: 'en')"
91 | )
92 |
93 | args = parser.parse_args()
94 |
95 | # Get available test audio files
96 | test_files = get_test_audio_files()
97 |
98 | if not test_files:
99 | print("No test audio files found!")
100 | sys.exit(1)
101 |
102 | print("Available test audio files:")
103 | for i, file_path in enumerate(test_files, 1):
104 | print(f" {i}. {file_path.name}")
105 |
106 | # Initialize OpenAI client with local server
107 | client = OpenAI(
108 | base_url=args.server_url,
109 | api_key="dummy-key"
110 | )
111 |
112 | print(f"\nConnecting to WhisperKit server at: {args.server_url}")
113 | print(f"Target language: {args.target_language}")
114 |
115 | if args.prompt:
116 | print(f"Prompt: {args.prompt}")
117 |
118 | if args.file:
119 | target_file = None
120 | for file_path in test_files:
121 | if file_path.name == args.file:
122 | target_file = file_path
123 | break
124 |
125 | if not target_file:
126 | print(f"Error: Test file '{args.file}' not found")
127 | print("Available files:", [f.name for f in test_files])
128 | sys.exit(1)
129 |
130 | files_to_process = [target_file]
131 | else:
132 | # Process all files
133 | files_to_process = test_files
134 |
135 | print(f"\nProcessing {len(files_to_process)} file(s)...")
136 |
137 | # Process each file
138 | for i, audio_file in enumerate(files_to_process, 1):
139 | print(f"\n{'='*50}")
140 | print(f"File {i}/{len(files_to_process)}: {audio_file.name}")
141 | print(f"{'='*50}")
142 |
143 | result = translate_test_file(
144 | client,
145 | audio_file,
146 | prompt=args.prompt
147 | )
148 |
149 | if result:
150 | print(f"\n✓ Translation successful!")
151 | print(f"Translated text: {result.text}")
152 |
153 | if hasattr(result, 'segments') and result.segments:
154 | print(f"\nSegments ({len(result.segments)}):")
155 | for segment in result.segments:
156 | print(f" [{segment.start:.2f}s - {segment.end:.2f}s] {segment.text}")
157 |
158 | if hasattr(result, 'language') and result.language:
159 | print(f"\nSource language: {result.language}")
160 |
161 | # File size info
162 | file_size = audio_file.stat().st_size / 1024 # KB
163 | print(f"\nFile size: {file_size:.1f} KB")
164 |
165 | else:
166 | print(f"✗ Translation failed for {audio_file.name}")
167 |
168 | print(f"\n{'='*50}")
169 | print("Test translation complete!")
170 | print(f"Processed {len(files_to_process)} file(s)")
171 | print(f"All audio translated to: {args.target_language}")
172 |
173 |
174 | if __name__ == "__main__":
175 | main()
176 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: setup setup-huggingface-cli setup-model-repo download-models download-model build build-cli test \
2 | clean-package-caches list-devices benchmark-connected-devices benchmark-device benchmark-devices \
3 | extract-xcresult build-local-server generate-server generate-server-spec generate-server-code
4 |
5 | PIP_COMMAND := pip3
6 | PYTHON_COMMAND := python3
7 |
8 | # Define model repository and directories
9 | MODEL_REPO := argmaxinc/whisperkit-coreml
10 | MODEL_REPO_DIR := ./Models/whisperkit-coreml
11 | BASE_COMPILED_DIR := ./Models
12 |
13 | GIT_HASH := $(shell git rev-parse --short HEAD)
14 |
15 | setup:
16 | @echo "Setting up environment..."
17 | @which $(PIP_COMMAND)
18 | @which $(PYTHON_COMMAND)
19 | @echo "Checking for Homebrew..."
20 | @which brew > /dev/null || (echo "Error: Homebrew is not installed. Install it from https://brew.sh and try again" && exit 1)
21 | @echo "Homebrew is installed."
22 | @echo "Checking for huggingface-cli..."
23 | @which huggingface-cli > /dev/null || (echo "Installing huggingface-cli..." && brew install huggingface-cli)
24 | @echo "huggingface-cli is installed."
25 | @echo "Checking for git-lfs..."
26 | @which git-lfs > /dev/null || (echo "Installing git-lfs..." && brew install git-lfs)
27 | @echo "git-lfs is installed."
28 | @echo "Checking for trash..."
29 | @which trash > /dev/null || (echo "Installing trash..." && brew install trash)
30 | @echo "trash is installed."
31 | @echo "Checking for fastlane"
32 | @which fastlane > /dev/null || (echo "Installing fastlane..." && brew install fastlane)
33 | @echo "fastlane is installed."
34 | @$(MAKE) generate-whisperax-xcconfig
35 | @echo "Done 🚀"
36 |
37 |
38 | generate-whisperax-xcconfig:
39 | @echo "Updating DEVELOPMENT_TEAM in Examples/WhisperAX/Debug.xcconfig..."
40 | @TEAM_ID=$$(defaults read com.apple.dt.Xcode IDEProvisioningTeams | plutil -convert json -r -o - -- - | jq -r 'to_entries[0].value | sort_by(.teamType == "Individual") | .[0].teamID' 2>/dev/null); \
41 | if [ -z "$$TEAM_ID" ]; then \
42 | echo "Error: No Development Team ID found. Please log into Xcode with your Apple ID and select a team."; \
43 | else \
44 | echo "DEVELOPMENT_TEAM=$$TEAM_ID" > Examples/WhisperAX/Debug.xcconfig; \
45 | echo "DEVELOPMENT_TEAM has been updated in Examples/WhisperAX/Debug.xcconfig with your Development Team ID: $$TEAM_ID"; \
46 | fi
47 |
48 |
49 | setup-huggingface-cli:
50 | @if huggingface-cli whoami; then \
51 | echo "Already logged in to Hugging Face."; \
52 | else \
53 | echo "Not logged in to Hugging Face."; \
54 | if [ -z "$$HF_TOKEN" ]; then \
55 | echo "Environment variable HF_TOKEN is not set. Running normal login."; \
56 | huggingface-cli login; \
57 | else \
58 | echo "Using HF_TOKEN from environment variable."; \
59 | huggingface-cli login --token $$HF_TOKEN; \
60 | fi; \
61 | fi
62 |
63 |
64 | setup-model-repo:
65 | @echo "Setting up repository..."
66 | @mkdir -p $(BASE_COMPILED_DIR)
67 | @if [ -d "$(MODEL_REPO_DIR)/.git" ]; then \
68 | echo "Repository exists, resetting..."; \
69 | export GIT_LFS_SKIP_SMUDGE=1; \
70 | cd $(MODEL_REPO_DIR) && git fetch --all && git reset --hard origin/main && git clean -fdx; \
71 | else \
72 | echo "Repository not found, initializing..."; \
73 | export GIT_LFS_SKIP_SMUDGE=1; \
74 | git clone https://huggingface.co/$(MODEL_REPO) $(MODEL_REPO_DIR); \
75 | fi
76 |
77 |
78 | # Download all models
79 | download-models: setup-model-repo
80 | @echo "Downloading all models..."
81 | @cd $(MODEL_REPO_DIR) && \
82 | git lfs pull
83 |
84 |
85 | # Download a specific model
86 | download-model:
87 | @if [ -z "$(MODEL)" ]; then \
88 | echo "Error: MODEL is not set. Usage: make download-model MODEL=base"; \
89 | exit 1; \
90 | fi
91 | @echo "Downloading model $(MODEL)..."
92 | @$(MAKE) setup-model-repo
93 | @echo "Fetching model $(MODEL)..."
94 | @cd $(MODEL_REPO_DIR) && \
95 | git lfs pull --include="openai_whisper-$(MODEL)/*"
96 |
97 | build:
98 | @echo "Building WhisperKit..."
99 | @swift build -v
100 |
101 |
102 | build-cli:
103 | @echo "Building WhisperKit CLI..."
104 | @swift build -c release --product whisperkit-cli
105 |
106 | test:
107 | @echo "Running tests..."
108 | @swift test -v
109 |
110 |
111 | list-devices:
112 | fastlane ios list_devices
113 |
114 |
115 | # Usage:
116 | # make benchmark-devices # Benchmark all connected devices
117 | # make benchmark-devices DEBUG=true # Benchmark all connected devices with small test matrix
118 | # make benchmark-devices DEVICES="iPhone 15 Pro Max,My Mac" # Benchmark specific device names from `make list-devices`
119 | DEVICES ?=
120 | DEBUG ?= false
121 | benchmark-devices: generate-whisperax-xcconfig
122 | @if [ -n "$(DEVICES)" ]; then \
123 | echo "Benchmarking specific devices: $(DEVICES)"; \
124 | fastlane benchmark devices:"$(DEVICES)" debug:$(DEBUG); \
125 | else \
126 | echo "Benchmarking all connected devices"; \
127 | fastlane benchmark debug:$(DEBUG); \
128 | fi
129 |
130 | upload-benchmark-results:
131 | @echo "Uploading benchmark results..."
132 | @fastlane upload_results
133 |
134 | clean-package-caches:
135 | @trash ~/Library/Developer/Xcode/DerivedData/WhisperKit* || true
136 | @swift package purge-cache
137 | @swift package reset
138 |
139 | build-local-server:
140 | @echo "Building WhisperKit CLI with server support..."
141 | @BUILD_ALL=1 swift build -c release --product whisperkit-cli
142 |
143 | generate-server:
144 | @echo "Generating server OpenAPI spec and code..."
145 | @cd scripts && uv run python3 generate_local_server_openapi.py --latest
146 | @echo ""
147 | @echo "=========================================="
148 | @echo "Generating server code from OpenAPI spec..."
149 | @echo "=========================================="
150 | @BUILD_ALL=1 swift run swift-openapi-generator generate scripts/specs/localserver_openapi.yaml \
151 | --output-directory Sources/WhisperKitCLI/Server/GeneratedSources \
152 | --mode types \
153 | --mode server
154 | @echo ""
155 | @echo "=========================================="
156 | @echo "Server generation complete!"
157 | @echo "=========================================="
158 | @echo "Run 'BUILD_ALL=1 swift run whisperkit-cli serve' to start the server"
159 |
--------------------------------------------------------------------------------
/Examples/ServeCLIClient/Python/test_transcribe.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2025 Argmax, Inc. All rights reserved.
2 | # For licensing see accompanying LICENSE.md file.
3 |
4 | """
5 | Test transcription with audio files from Tests/WhisperKitTests/Resources/
6 |
7 | This script tests transcription functionality using the actual test audio files
8 | from the WhisperKit test suite.
9 | """
10 |
11 | import os
12 | import sys
13 | import argparse
14 | from pathlib import Path
15 | from openai import OpenAI
16 |
17 |
18 | def get_test_audio_files():
19 | """
20 | Get list of available test audio files from Tests/WhisperKitTests/Resources/
21 |
22 | Returns:
23 | List of audio file paths
24 | """
25 | # Path to test resources relative to project root
26 | resources_dir = Path(__file__).parent.parent.parent.parent / "Tests" / "WhisperKitTests" / "Resources"
27 |
28 | if not resources_dir.exists():
29 | print(f"Error: Test resources directory not found: {resources_dir}")
30 | return []
31 |
32 | # Audio file extensions to look for
33 | audio_extensions = {'.wav', '.m4a', '.mp3', '.flac', '.aac'}
34 |
35 | audio_files = []
36 | for file_path in resources_dir.iterdir():
37 | if file_path.is_file() and file_path.suffix.lower() in audio_extensions:
38 | audio_files.append(file_path)
39 |
40 | return sorted(audio_files)
41 |
42 |
43 | def transcribe_test_file(client, audio_file_path, language=None, prompt=None):
44 | """
45 | Transcribe a test audio file using the local WhisperKit server.
46 |
47 | Args:
48 | client: OpenAI client instance
49 | audio_file_path: Path to the audio file
50 | language: Optional language code
51 | prompt: Optional prompt to guide transcription
52 |
53 | Returns:
54 | Transcription result or None if failed
55 | """
56 | try:
57 | print(f"Transcribing: {audio_file_path.name}")
58 |
59 | with open(audio_file_path, "rb") as audio_file:
60 | response = client.audio.transcriptions.create(
61 | model="tiny",
62 | file=audio_file,
63 | language=language,
64 | prompt=prompt,
65 | response_format="verbose_json"
66 | )
67 | return response
68 | except Exception as e:
69 | print(f"Error transcribing {audio_file_path.name}: {e}")
70 | return None
71 |
72 |
73 | def main():
74 | parser = argparse.ArgumentParser(
75 | description="Test transcription with WhisperKit test audio files"
76 | )
77 | parser.add_argument(
78 | "--language",
79 | help="Language code (e.g., 'en', 'es', 'ja')"
80 | )
81 | parser.add_argument(
82 | "--prompt",
83 | help="Optional prompt to guide transcription"
84 | )
85 | parser.add_argument(
86 | "--server-url",
87 | default="http://localhost:50060/v1",
88 | help="WhisperKit server URL (default: http://localhost:50060/v1)"
89 | )
90 | parser.add_argument(
91 | "--file",
92 | help="Specific test file to transcribe (e.g., 'jfk.wav')"
93 | )
94 |
95 | args = parser.parse_args()
96 |
97 | # Get available test audio files
98 | test_files = get_test_audio_files()
99 |
100 | if not test_files:
101 | print("No test audio files found!")
102 | sys.exit(1)
103 |
104 | print("Available test audio files:")
105 | for i, file_path in enumerate(test_files, 1):
106 | print(f" {i}. {file_path.name}")
107 |
108 | # Initialize OpenAI client with local server
109 | client = OpenAI(
110 | base_url=args.server_url,
111 | api_key="dummy-key"
112 | )
113 |
114 | print(f"\nConnecting to WhisperKit server at: {args.server_url}")
115 |
116 | if args.language:
117 | print(f"Language: {args.language}")
118 | if args.prompt:
119 | print(f"Prompt: {args.prompt}")
120 |
121 | # Determine which files to process
122 | if args.file:
123 | # Process specific file
124 | target_file = None
125 | for file_path in test_files:
126 | if file_path.name == args.file:
127 | target_file = file_path
128 | break
129 |
130 | if not target_file:
131 | print(f"Error: Test file '{args.file}' not found")
132 | print("Available files:", [f.name for f in test_files])
133 | sys.exit(1)
134 |
135 | files_to_process = [target_file]
136 | else:
137 | # Process all files
138 | files_to_process = test_files
139 |
140 | print(f"\nProcessing {len(files_to_process)} file(s)...")
141 |
142 | # Process each file
143 | for i, audio_file in enumerate(files_to_process, 1):
144 | print(f"\n{'='*50}")
145 | print(f"File {i}/{len(files_to_process)}: {audio_file.name}")
146 | print(f"{'='*50}")
147 |
148 | result = transcribe_test_file(
149 | client,
150 | audio_file,
151 | language=args.language,
152 | prompt=args.prompt
153 | )
154 |
155 | if result:
156 | print(f"\n✓ Transcription successful!")
157 | print(f"Text: {result.text}")
158 |
159 | if hasattr(result, 'segments') and result.segments:
160 | print(f"\nSegments ({len(result.segments)}):")
161 | for segment in result.segments:
162 | print(f" [{segment.start:.2f}s - {segment.end:.2f}s] {segment.text}")
163 |
164 | if hasattr(result, 'language') and result.language:
165 | print(f"\nDetected Language: {result.language}")
166 |
167 | # File size info
168 | file_size = audio_file.stat().st_size / 1024 # KB
169 | print(f"\nFile size: {file_size:.1f} KB")
170 |
171 | else:
172 | print(f"✗ Transcription failed for {audio_file.name}")
173 |
174 | print(f"\n{'='*50}")
175 | print("Test transcription complete!")
176 | print(f"Processed {len(files_to_process)} file(s)")
177 |
178 |
179 | if __name__ == "__main__":
180 | main()
181 |
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Evaluate/WERUtils.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 |
6 | /// Edit operations used to describe how one token sequence is transformed into another:
7 | /// insert, delete, replace, or blank (a match that needs no edit).
8 | enum EditOp: UInt8 {
9 | case blank
10 | case replace
11 | case delete
12 | case insert
13 | }
14 |
15 | enum WERUtils {
16 | static func wordsToChars(reference: [[String]], hypothesis: [[String]]) -> ([String], [String]) {
17 | // tokenize each word into an integer
18 | let vocabulary = Set((reference + hypothesis).flatMap { $0 })
19 | let word2char = Dictionary(uniqueKeysWithValues: vocabulary.enumerated().map { index, value in
20 | (value, index)
21 | })
22 |
23 | let referenceCharsEfficient = reference.map { sentence in
24 | String(sentence.lazy.compactMap { word in
25 | if let charCode = word2char[word], let unicodeScalar = UnicodeScalar(charCode) {
26 | return Character(unicodeScalar)
27 | }
28 | return nil
29 | })
30 | }
31 |
32 | let hypothesisCharsEfficient = hypothesis.map { sentence in
33 | String(sentence.lazy.compactMap { word in
34 | if let charCode = word2char[word], let unicodeScalar = UnicodeScalar(charCode) {
35 | return Character(unicodeScalar)
36 | }
37 | return nil
38 | })
39 | }
40 |
41 | return (referenceCharsEfficient, hypothesisCharsEfficient)
42 | }
43 |
44 | static func processWords(reference: [String], hypothesis: [String]) -> (Double, [[String?]]) {
45 | var refTransformed = NormalizationUtils.removeMultipleSpaces(sentences: reference)
46 | refTransformed = NormalizationUtils.strip(sentences: refTransformed)
47 | let refTransformedReduced = NormalizationUtils.reduceToListOfListOfWordsWithSpaces(sentences: refTransformed)
48 |
49 | var hypTransformed = NormalizationUtils.removeMultipleSpaces(sentences: hypothesis)
50 | hypTransformed = NormalizationUtils.strip(sentences: hypTransformed)
51 | let hypTransformedReduced = NormalizationUtils.reduceToListOfListOfWordsWithSpaces(sentences: hypTransformed)
52 |
53 | let (refAsChars, hypAsChars) = WERUtils.wordsToChars(reference: refTransformedReduced, hypothesis: hypTransformedReduced)
54 |
55 | let refArrays = refAsChars.map { Array($0.unicodeScalars) }
56 | let hypArrays = hypAsChars.map { Array($0.unicodeScalars) }
57 |
58 | var (numHits, numSubstitutions, numDeletions, numInsertions) = (0, 0, 0, 0)
59 | var (numRfWords, numHypWords) = (0, 0)
60 | var diffResult: [[String?]] = []
61 |
62 | for (referenceSentence, hypothesisSentence) in zip(refArrays, hypArrays) {
63 | let editOps = levenshtein(referenceSentence, hypothesisSentence)
64 |
65 | // count the number of edits of each type
66 | var substitutions = 0
67 | var deletions = 0
68 | var insertions = 0
69 |
70 | var referenceIndex = 0
71 | var hypothesisIndex = 0
72 | for op in editOps {
73 | switch op {
74 | case .replace:
75 | diffResult.append([String(refTransformedReduced[0][referenceIndex]), "-"])
76 | diffResult.append([String(hypTransformedReduced[0][hypothesisIndex]), "+"])
77 | substitutions += 1
78 | referenceIndex += 1
79 | hypothesisIndex += 1
80 | case .delete:
81 | diffResult.append([String(refTransformedReduced[0][referenceIndex]), "-"])
82 | deletions += 1
83 | referenceIndex += 1
84 | case .insert:
85 | diffResult.append([String(hypTransformedReduced[0][hypothesisIndex]), "+"])
86 | insertions += 1
87 | hypothesisIndex += 1
88 | case .blank:
89 | diffResult.append([String(refTransformedReduced[0][referenceIndex]), nil])
90 | referenceIndex += 1
91 | hypothesisIndex += 1
92 | }
93 | }
94 |
95 | let hits: Int = referenceSentence.count - (substitutions + deletions)
96 |
97 | numHits += hits
98 | numSubstitutions += substitutions
99 | numDeletions += deletions
100 | numInsertions += insertions
101 | numRfWords += referenceSentence.count
102 | numHypWords += hypothesisSentence.count
103 | }
104 |
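// WER = (S + D + I) / (H + S + D); the denominator equals the number of reference words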
105 | let wer = Double(numSubstitutions + numDeletions + numInsertions) / Double(numHits + numSubstitutions + numDeletions)
106 |
107 | return (wer, diffResult)
108 | }
109 |
110 | static func evaluate(originalTranscript: String, generatedTranscript: String, normalizeOriginal: Bool = true) -> (wer: Double, diff: [[String?]]) {
111 | let normalizer = EnglishTextNormalizer()
112 | let reference = normalizeOriginal ? normalizer.normalize(text: originalTranscript) : originalTranscript
113 | let hypothesis = normalizer.normalize(text: generatedTranscript)
114 |
115 | let (wer, diff) = WERUtils.processWords(
116 | reference: [reference],
117 | hypothesis: [hypothesis]
118 | )
119 | return (wer, diff)
120 | }
121 |
122 | static func processDiff(originalTranscript: String, generatedTranscript: String) -> [[String?]] {
123 | let (_, diff) = evaluate(originalTranscript: originalTranscript, generatedTranscript: generatedTranscript)
124 | return diff
125 | }
126 |
127 | static func diffString(from diff: [[String?]]) -> String {
128 | return diff.compactMap { entry -> String? in
129 | guard let word = entry[0], word != " " else { return nil }
130 | if let changeType = entry[1] {
131 | return "\(changeType)\(word)"
132 | }
133 | return word
134 | }.joined(separator: " ")
135 | }
136 | }
137 |
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Evaluate/DistanceCalculation.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 |
6 | /// Compute the last row of the edit distance dynamic programming matrix
7 | /// between s1 and s2.
8 | func computeLastRow(_ s1Chars: [Unicode.Scalar], _ s2Chars: [Unicode.Scalar]) -> [Int] {
9 | var prevRow = Array(0...s2Chars.endIndex)
10 |
11 | for i in 1...s1Chars.endIndex {
12 | var currentRow = [Int](repeating: 0, count: s2Chars.endIndex + 1)
13 | currentRow[0] = i
14 |
15 | for j in 1...s2Chars.endIndex {
16 | let cost = s1Chars[i - 1] == s2Chars[j - 1] ? 0 : 1
17 | currentRow[j] = min(
18 | prevRow[j] + 1, // Deletion
19 | currentRow[j - 1] + 1, // Insertion
20 | prevRow[j - 1] + cost // Substitution
21 | )
22 | }
23 | prevRow = currentRow
24 | }
25 |
26 | return prevRow
27 | }
28 |
29 | func needlemanWunsch(_ xArray: [Unicode.Scalar], _ yArray: [Unicode.Scalar]) -> [EditOp] {
30 | let m = xArray.count
31 | let n = yArray.count
32 |
33 | var dp = [[Int]](repeating: [Int](repeating: 0, count: n + 1), count: m + 1)
34 | for i in 1...m {
35 | dp[i][0] = i
36 | }
37 | for j in 1...n {
38 | dp[0][j] = j
39 | }
40 |
41 | for i in 1...m {
42 | for j in 1...n {
43 | let cost = xArray[i - 1] == yArray[j - 1] ? 0 : 1
44 | dp[i][j] = min(
45 | dp[i - 1][j] + 1, // Deletion
46 | dp[i][j - 1] + 1, // Insertion
47 | dp[i - 1][j - 1] + cost // Substitution
48 | )
49 | }
50 | }
51 |
52 | var i = m
53 | var j = n
54 | var ops = [EditOp]()
55 |
56 | while i > 0, j > 0 {
57 | if dp[i][j] == dp[i - 1][j - 1], xArray[i - 1] == yArray[j - 1] {
58 | // Match operation is omitted
59 | i -= 1
60 | j -= 1
61 | } else if dp[i][j] == dp[i - 1][j - 1] + 1 {
62 | ops.append(EditOp.replace) // Substitution
63 | i -= 1
64 | j -= 1
65 | } else if dp[i][j] == dp[i][j - 1] + 1 {
66 | ops.append(EditOp.insert) // Insertion
67 | j -= 1
68 | } else {
69 | ops.append(EditOp.delete) // Deletion
70 | i -= 1
71 | }
72 | }
73 |
74 | while i > 0 {
75 | ops.append(EditOp.delete)
76 | i -= 1
77 | }
78 | while j > 0 {
79 | ops.append(EditOp.insert)
80 | j -= 1
81 | }
82 |
83 | return ops.reversed()
84 | }
85 |
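/// Hirschberg's algorithm: computes the same edit script as needlemanWunsch but in
/// linear space, recursively splitting the first sequence at its midpoint and using
/// computeLastRow on both halves to locate the optimal split of the second sequence.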
86 | func hirschberg(_ reference: [Unicode.Scalar], _ s2: [Unicode.Scalar]) -> [EditOp] {
87 | func hirschbergRec(_ x: [Unicode.Scalar], _ y: [Unicode.Scalar]) -> [EditOp] {
88 | let m = x.endIndex
89 | let n = y.endIndex
90 |
91 | if m == 0 {
92 | let result = y.map { _ in EditOp.insert }
93 | return result
94 | }
95 | if n == 0 {
96 | let result = x.map { _ in EditOp.delete }
97 | return result
98 | }
99 | if m == 1 || n == 1 {
100 | let result = needlemanWunsch(x, y)
101 | return result
102 | }
103 |
104 | let i = m / 2
105 | let xPrefix = Array(x[x.startIndex..<i])
106 | let xSuffix = Array(x[i..<m])
107 |
108 | // Score the two halves: forward DP on the prefix, reverse DP on the suffix
109 | let scoreL = computeLastRow(xPrefix, y)
110 | let scoreR = computeLastRow(Array(xSuffix.reversed()), Array(y.reversed()))
111 |
112 | // Choose the split point of y that minimizes the total edit cost
113 | var k = 0
114 | var minCost = Int.max
115 | for j in 0...n {
116 | let cost = scoreL[j] + scoreR[n - j]
117 | if cost < minCost {
118 | minCost = cost
119 | k = j
120 | }
121 | }
122 |
123 | let yPrefix = Array(y[y.startIndex..<k])
124 | let ySuffix = Array(y[k..<n])
125 |
126 | return hirschbergRec(xPrefix, yPrefix) + hirschbergRec(xSuffix, ySuffix)
127 | }
128 |
129 | return hirschbergRec(reference, s2)
130 | }
131 |
132 | /// Myers' O(ND) diff algorithm: computes the edit operations that transform
133 | /// sourceText into targetText, recording a trace of the search for backtracking.
134 | /// Used by WERUtils to align reference and hypothesis words.
135 | func levenshtein(_ sourceText: [Unicode.Scalar], _ targetText: [Unicode.Scalar]) -> [EditOp] {
136 | let n = sourceText.count
137 | let m = targetText.count
138 | let maxD = n + m
139 | let vSize = 2 * maxD + 1
140 | var v = [Int](repeating: 0, count: vSize)
141 | var trace = [[Int]]()
142 |
143 | let offset = maxD
144 |
145 | for d in 0...maxD {
146 | let vSnapshot = v
147 | for k in stride(from: -d, through: d, by: 2) {
148 | let kIndex = k + offset
149 | var x: Int
150 | if k == -d || (k != d && v[kIndex - 1] < v[kIndex + 1]) {
151 | x = v[kIndex + 1]
152 | } else {
153 | x = v[kIndex - 1] + 1
154 | }
155 | var y = x - k
156 | while x < n, y < m, sourceText[x] == targetText[y] {
157 | x += 1
158 | y += 1
159 | }
160 | v[kIndex] = x
161 | if x >= n, y >= m {
162 | trace.append(vSnapshot)
163 | return backtrack(trace: trace, sourceText: sourceText, targetText: targetText)
164 | }
165 | }
166 | trace.append(vSnapshot)
167 | }
168 | return []
169 | }
170 |
171 | func backtrack(trace: [[Int]], sourceText: [Unicode.Scalar], targetText: [Unicode.Scalar]) -> [EditOp] {
172 | var editOps = [EditOp]()
173 | let n = sourceText.count
174 | let m = targetText.count
175 | let offset = trace[0].count / 2
176 | var x = n
177 | var y = m
178 |
179 | for d in stride(from: trace.count - 1, through: 0, by: -1) {
180 | let v = trace[d]
181 | let k = x - y
182 | let kIndex = k + offset
183 |
184 | var prevK: Int
185 | if k == -d || (k != d && v[kIndex - 1] < v[kIndex + 1]) {
186 | prevK = k + 1
187 | } else {
188 | prevK = k - 1
189 | }
190 | let prevX = v[prevK + offset]
191 | let prevY = prevX - prevK
192 |
193 | while x > prevX, y > prevY {
194 | // Match or Replace
195 | if sourceText[x - 1] == targetText[y - 1] {
196 | editOps.append(.blank)
197 | } else {
198 | editOps.append(.replace)
199 | }
200 | x -= 1
201 | y -= 1
202 | }
203 |
204 | if d > 0 {
205 | if x == prevX {
206 | // Insertion
207 | editOps.append(.insert)
208 | y -= 1
209 | } else {
210 | // Deletion
211 | editOps.append(.delete)
212 | x -= 1
213 | }
214 | }
215 | }
216 |
217 | return editOps.reversed()
218 | }
219 |
--------------------------------------------------------------------------------
/Examples/ServeCLIClient/Curl/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright © 2025 Argmax, Inc. All rights reserved.
4 | # For licensing see accompanying LICENSE.md file.
5 |
6 | # WhisperKit CurlClient - Test Script
7 | # This script demonstrates various features of the CurlClient
8 |
9 | set -e
10 |
11 | # Colors for output
12 | RED='\033[0;31m'
13 | GREEN='\033[0;32m'
14 | YELLOW='\033[1;33m'
15 | BLUE='\033[0;34m'
16 | NC='\033[0m' # No Color
17 |
18 | # Test audio files (adjust paths as needed)
19 | TEST_FILES=(
20 | "../../../Tests/WhisperKitTests/Resources/jfk.wav"
21 | "../../../Tests/WhisperKitTests/Resources/es_test_clip.wav"
22 | "../../../Tests/WhisperKitTests/Resources/ja_test_clip.wav"
23 | )
24 |
25 | # Server URL
26 | SERVER_URL="http://localhost:50060"
27 |
28 | echo -e "${BLUE}🧪 WhisperKit CurlClient Test Suite${NC}"
29 | echo -e "${YELLOW}Testing against server:${NC} $SERVER_URL"
30 | echo ""
31 |
32 | # Check if server is running
33 | echo -e "${BLUE}🔍 Checking server status...${NC}"
34 | if curl -s "$SERVER_URL" > /dev/null 2>&1; then
35 | echo -e "${GREEN}✅ Server is running${NC}"
36 | else
37 | echo -e "${RED}❌ Server is not running at $SERVER_URL${NC}"
38 | echo -e "${YELLOW}Please start the server first:${NC}"
39 | echo " whisperkit-cli serve --model tiny"
40 | exit 1
41 | fi
42 |
43 | echo ""
44 |
45 | test_logprobs() {
46 | echo "🧪 Testing transcription with logprobs..."
47 |
48 | # Find test audio files - use absolute path
49 | local test_file=""
50 | if [ -f "../../../Tests/WhisperKitTests/Resources/jfk.wav" ]; then
51 | test_file="$(cd ../../../Tests/WhisperKitTests/Resources && pwd)/jfk.wav"
52 | elif [ -f "../../../Tests/WhisperKitTests/Resources/es_test_clip.wav" ]; then
53 | test_file="$(cd ../../../Tests/WhisperKitTests/Resources && pwd)/es_test_clip.wav"
54 | elif [ -f "../../../Tests/WhisperKitTests/Resources/ja_test_clip.wav" ]; then
55 | test_file="$(cd ../../../Tests/WhisperKitTests/Resources && pwd)/ja_test_clip.wav"
56 | fi
57 |
58 | if [ -z "$test_file" ]; then
59 | echo "❌ No test audio files found"
60 | return 1
61 | fi
62 |
63 | echo "📁 Using test file: $(basename "$test_file")"
64 | echo "🔍 Full path: $test_file"
65 |
66 | # Test with logprobs enabled
67 | echo "🔍 Testing with file: $test_file"
68 | echo "🔍 Server URL: $SERVER_URL"
69 |
70 | local response=$(curl -s -X POST "$SERVER_URL/v1/audio/transcriptions" \
71 | -H "Content-Type: multipart/form-data" \
72 | -F "file=@$test_file" \
73 | -F "model=tiny" \
74 | -F "response_format=json" \
75 | -F "include[]=logprobs")
76 |
77 | # Debug: Show response length and first part
78 | echo "🔍 Response length: ${#response}"
79 | echo "🔍 Response preview: ${response:0:200}..."
80 |
81 | if echo "$response" | grep -q "logprobs"; then
82 | echo "✅ Logprobs received in response"
83 |
84 | # Extract and display logprobs info
85 | local logprobs_count=$(echo "$response" | jq -r '.logprobs | length' 2>/dev/null || echo "0")
86 | echo "📊 Found $logprobs_count logprob entries"
87 |
88 | # Show first few logprobs
89 | if [ "$logprobs_count" -gt 0 ]; then
90 | echo "🔍 First few logprobs:"
91 | echo "$response" | jq -r '.logprobs[0:3][] | " Token: \(.token) - Logprob: \(.logprob)"' 2>/dev/null || echo " Could not parse logprobs"
92 | fi
93 |
94 | return 0
95 | else
96 | echo "❌ No logprobs in response"
97 | echo "Available keys: $(echo "$response" | jq -r 'keys | join(", ")' 2>/dev/null || echo "Could not parse response")"
98 | return 1
99 | fi
100 | }
101 |
102 | # Test 1: Basic transcription
103 | echo -e "${BLUE}📝 Test 1: Basic Transcription (verbose_json)${NC}"
104 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[0]}"
105 | echo ""
106 | ./transcribe.sh "${TEST_FILES[0]}" --response-format verbose_json
107 | echo ""
108 |
109 | # Test 2: Basic transcription with JSON format
110 | echo -e "${BLUE}📝 Test 2: Basic Transcription (json)${NC}"
111 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[0]}"
112 | echo ""
113 | ./transcribe.sh "${TEST_FILES[0]}" --response-format json
114 | echo ""
115 |
116 | # Test 3: Transcription with word timestamps
117 | echo -e "${BLUE}📝 Test 3: Transcription with Word Timestamps${NC}"
118 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[0]}"
119 | echo ""
120 | ./transcribe.sh "${TEST_FILES[0]}" --timestamp-granularities "word,segment"
121 | echo ""
122 |
123 | # Test 4: Spanish transcription
124 | echo -e "${BLUE}📝 Test 4: Spanish Transcription${NC}"
125 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[1]}"
126 | echo -e "${YELLOW}Language:${NC} es"
127 | echo ""
128 | ./transcribe.sh "${TEST_FILES[1]}" --language es
129 | echo ""
130 |
131 | # Test 5: Japanese transcription
132 | echo -e "${BLUE}📝 Test 5: Japanese Transcription${NC}"
133 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[2]}"
134 | echo -e "${YELLOW}Language:${NC} ja"
135 | echo ""
136 | ./transcribe.sh "${TEST_FILES[2]}" --language ja
137 | echo ""
138 |
139 | # Test 6: Translation (Spanish to English)
140 | echo -e "${BLUE}🌐 Test 6: Translation (Spanish to English)${NC}"
141 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[1]}"
142 | echo -e "${YELLOW}Source Language:${NC} es"
143 | echo ""
144 | ./translate.sh "${TEST_FILES[1]}" --language es
145 | echo ""
146 |
147 | # Test 7: Translation (Japanese to English)
148 | echo -e "${BLUE}🌐 Test 7: Translation (Japanese to English)${NC}"
149 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[2]}"
150 | echo -e "${YELLOW}Source Language:${NC} ja"
151 | echo ""
152 | ./translate.sh "${TEST_FILES[2]}" --language ja
153 | echo ""
154 |
155 | # Test 7.5: Translation with basic JSON format
156 | echo -e "${BLUE}🌐 Test 7.5: Translation with JSON Format${NC}"
157 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[1]}"
158 | echo -e "${YELLOW}Source Language:${NC} es"
159 | echo ""
160 | ./translate.sh "${TEST_FILES[1]}" --language es --response-format json
161 | echo ""
162 |
163 | # Test 8: Streaming transcription
164 | echo -e "${BLUE}📡 Test 8: Streaming Transcription${NC}"
165 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[0]}"
166 | echo ""
167 | ./transcribe.sh "${TEST_FILES[0]}" --stream true
168 | echo ""
169 |
170 | # Test 8.5: Translation with prompt
171 | echo -e "${BLUE}📝 Test 8.5: Translation with Prompt${NC}"
172 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[1]}"
173 | echo -e "${YELLOW}Source Language:${NC} es"
174 | echo ""
175 | ./translate.sh "${TEST_FILES[1]}" --language es --prompt "This is a formal conversation"
176 | echo ""
177 |
178 | # Test 9: Logprobs functionality
179 | echo -e "${BLUE}🧪 Test 9: Logprobs Functionality${NC}"
180 | if test_logprobs; then
181 | echo -e "${GREEN}✅ Logprobs test passed${NC}"
182 | else
183 | echo -e "${RED}❌ Logprobs test failed${NC}"
184 | fi
185 | echo ""
186 |
187 | # Test 10: Translation with different temperature
188 | echo -e "${BLUE}🌡️ Test 10: Translation with Temperature${NC}"
189 | echo -e "${YELLOW}File:${NC} ${TEST_FILES[1]}"
190 | echo -e "${YELLOW}Source Language:${NC} es"
191 | echo ""
192 | ./translate.sh "${TEST_FILES[1]}" --language es --temperature 0.2
193 | echo ""
194 |
195 | echo -e "${GREEN}🎉 All tests completed!${NC}"
196 | echo ""
197 | echo -e "${BLUE}📚 For more examples, see:${NC}"
198 | echo " ./transcribe.sh --help"
199 | echo " ./translate.sh --help"
200 |
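For reference, the request the script sends with curl can be reproduced from Swift. A minimal sketch assuming only the server URL, route, and form fields shown above; the multipart assembly below is illustrative and independent of the repo's Swift client:

import Foundation

// Posts an audio file to the serve endpoint used by test.sh and returns the raw JSON body.
func transcribe(fileURL: URL) async throws -> Data {
    let boundary = "Boundary-\(UUID().uuidString)"
    var request = URLRequest(url: URL(string: "http://localhost:50060/v1/audio/transcriptions")!)
    request.httpMethod = "POST"
    request.setValue("multipart/form-data; boundary=\(boundary)", forHTTPHeaderField: "Content-Type")

    var body = Data()
    func appendField(_ name: String, _ value: String) {
        body.append("--\(boundary)\r\nContent-Disposition: form-data; name=\"\(name)\"\r\n\r\n\(value)\r\n".data(using: .utf8)!)
    }
    appendField("model", "tiny")
    appendField("response_format", "json")
    appendField("include[]", "logprobs")
    body.append("--\(boundary)\r\nContent-Disposition: form-data; name=\"file\"; filename=\"\(fileURL.lastPathComponent)\"\r\nContent-Type: audio/wav\r\n\r\n".data(using: .utf8)!)
    body.append(try Data(contentsOf: fileURL))
    body.append("\r\n--\(boundary)--\r\n".data(using: .utf8)!)
    request.httpBody = body

    // Requires macOS 12 / iOS 15 or later for the async URLSession API.
    let (data, _) = try await URLSession.shared.data(for: request)
    return data
}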
--------------------------------------------------------------------------------
/Sources/WhisperKit/Utilities/Extensions+Internal.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import AVFoundation
5 | import CoreML
6 |
7 | extension MLMultiArray {
8 |     /// All values are stored in the last dimension of the MLMultiArray (default is dims=1)
9 | static func from(_ array: [Int], dims: Int = 1) throws -> MLMultiArray {
10 | var shape = Array(repeating: 1, count: dims)
11 | shape[shape.count - 1] = array.count
12 | /// Examples:
13 | /// dims=1 : [arr.count]
14 | /// dims=2 : [1, arr.count]
15 | ///
16 | let output = try MLMultiArray(shape: shape as [NSNumber], dataType: .int32)
17 |         let pointer = UnsafeMutablePointer<Int32>(OpaquePointer(output.dataPointer))
18 | for (i, item) in array.enumerated() {
19 | pointer[i] = Int32(item)
20 | }
21 | return output
22 | }
23 | }
24 |
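// Usage sketch (the token IDs are illustrative): a [1, 3] Int32 tensor for a decoder input.
// let tokenTensor = try MLMultiArray.from([50258, 50259, 50359], dims: 2)
// // tokenTensor.shape == [1, 3]; values are laid out along the last dimension.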
25 | extension Array {
26 | func batched(into size: Int) -> [[Element]] {
27 | return stride(from: 0, to: count, by: size).map {
28 |             Array(self[$0..<Swift.min($0 + size, count)])
29 |         }
30 |     }
31 | }
32 | 
33 | extension Array where Element == Result<[TranscriptionResult], Swift.Error> {
34 | /// Convenience method to convert the `Result` object into an array of optional arrays of `TranscriptionResult`.
35 | /// - Returns: An array of optional arrays containing `TranscriptionResult`.
36 | func toOptionalArrays() -> [[TranscriptionResult]?] {
37 | return self.map { try? $0.get() }
38 | }
39 | }
40 |
41 | extension Array where Element: Hashable {
42 | /// Returns an array with duplicates removed, preserving the original order.
43 | var orderedSet: [Element] {
44 |         var seen = Set<Element>()
45 | return self.filter { element in
46 | if seen.contains(element) {
47 | return false
48 | } else {
49 | seen.insert(element)
50 | return true
51 | }
52 | }
53 | }
54 | }
55 |
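// Usage sketch:
// [1, 2, 3, 4, 5].batched(into: 2)   // [[1, 2], [3, 4], [5]]
// [1, 2, 1, 3, 2].orderedSet         // [1, 2, 3]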
56 | extension String {
57 | /// Reference: https://github.com/huggingface/swift-transformers/blob/94610577e4af9bbc267060af1e25e977604dd796/Sources/Tokenizers/Decoder.swift#L267-L275
58 | func trimmingFromEnd(character: Character = " ", upto: Int) -> String {
59 | var result = self
60 | var trimmed = 0
61 | while trimmed < upto && result.last == character {
62 | result.removeLast()
63 | trimmed += 1
64 | }
65 | return result
66 | }
67 | }
68 |
69 | extension [String] {
70 | /// Reference: https://github.com/huggingface/swift-transformers/blob/94610577e4af9bbc267060af1e25e977604dd796/Sources/Hub/HubApi.swift#L983-L987
71 | func matching(glob: String) -> [String] {
72 | filter { fnmatch(glob, $0, 0) == 0 }
73 | }
74 | }
75 |
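// Usage sketch:
// "word   ".trimmingFromEnd(upto: 2)                          // "word " (at most 2 trailing spaces removed)
// ["model.mlmodelc", "config.json"].matching(glob: "*.json")  // ["config.json"]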
76 | extension AVAudioPCMBuffer {
77 | /// Converts the buffer to a float array
78 | func asFloatArray() throws -> [Float] {
79 | guard let data = floatChannelData?.pointee else {
80 | throw WhisperError.audioProcessingFailed("Error converting audio, missing floatChannelData")
81 | }
82 | return Array(UnsafeBufferPointer(start: data, count: Int(frameLength)))
83 | }
84 |
85 | /// Appends the contents of another buffer to the current buffer
86 | func appendContents(of buffer: AVAudioPCMBuffer) -> Bool {
87 | return appendContents(of: buffer, startingFrame: 0, frameCount: buffer.frameLength)
88 | }
89 |
90 | /// Appends a specific range of frames from another buffer to the current buffer
91 | func appendContents(of buffer: AVAudioPCMBuffer, startingFrame: AVAudioFramePosition, frameCount: AVAudioFrameCount) -> Bool {
92 | guard format == buffer.format else {
93 | Logging.debug("Format mismatch")
94 | return false
95 | }
96 |
97 | guard startingFrame + AVAudioFramePosition(frameCount) <= AVAudioFramePosition(buffer.frameLength) else {
98 | Logging.error("Insufficient audio in buffer")
99 | return false
100 | }
101 |
102 | guard let destination = floatChannelData, let source = buffer.floatChannelData else {
103 | Logging.error("Failed to access float channel data")
104 | return false
105 | }
106 |
107 | var calculatedFrameCount = frameCount
108 | if frameLength + frameCount > frameCapacity {
109 | Logging.debug("Insufficient space in buffer, reducing frame count to fit")
110 | calculatedFrameCount = frameCapacity - frameLength
111 | }
112 |
113 | let calculatedStride = stride
114 | let destinationPointer = destination.pointee.advanced(by: calculatedStride * Int(frameLength))
115 | let sourcePointer = source.pointee.advanced(by: calculatedStride * Int(startingFrame))
116 |
117 |         memcpy(destinationPointer, sourcePointer, Int(calculatedFrameCount) * calculatedStride * MemoryLayout<Float>.size)
118 |
119 | frameLength += calculatedFrameCount
120 | return true
121 | }
122 |
123 | /// Convenience initializer to concatenate multiple buffers into one
124 | convenience init?(concatenating buffers: [AVAudioPCMBuffer]) {
125 | guard !buffers.isEmpty else {
126 | Logging.debug("Buffers array should not be empty")
127 | return nil
128 | }
129 |
130 | let totalFrames = buffers.reduce(0) { $0 + $1.frameLength }
131 |
132 | guard let firstBuffer = buffers.first else {
133 | Logging.debug("Failed to get the first buffer")
134 | return nil
135 | }
136 |
137 | self.init(pcmFormat: firstBuffer.format, frameCapacity: totalFrames)
138 |
139 | for buffer in buffers {
140 | if !appendContents(of: buffer) {
141 | Logging.debug("Failed to append buffer")
142 | return nil
143 | }
144 | }
145 | }
146 |
147 | /// Computed property to determine the stride for float channel data
148 | private var stride: Int {
149 |         return Int(format.streamDescription.pointee.mBytesPerFrame) / MemoryLayout<Float>.size
150 | }
151 | }
152 |
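// Usage sketch (assumes both buffers share the same format):
// if let merged = AVAudioPCMBuffer(concatenating: [bufferA, bufferB]) {
//     let samples = try merged.asFloatArray() // samples of bufferA followed by bufferB
// }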
153 | // MARK: - WhisperKit Components
154 |
155 | extension AudioProcessing {
156 | static func getDownloadsDirectory() -> URL {
157 | let paths = FileManager.default.urls(for: .downloadsDirectory, in: .userDomainMask)
158 | return paths[0]
159 | }
160 |
161 | static func saveBuffer(_ buffer: AVAudioPCMBuffer, to url: URL) throws {
162 | // create folder
163 | let folderURL = url.deletingLastPathComponent()
164 | if !FileManager.default.fileExists(atPath: folderURL.path) {
165 | try FileManager.default.createDirectory(at: folderURL, withIntermediateDirectories: true, attributes: nil)
166 | }
167 | let audioFile = try AVAudioFile(forWriting: url, settings: buffer.format.settings)
168 | try audioFile.write(from: buffer)
169 | }
170 | }
171 |
172 | extension DecodingOptions {
173 | func prepareSeekClips(contentFrames: Int) -> [(start: Int, end: Int)] {
174 | var seekPoints: [Int] = clipTimestamps.map { Int(round($0 * Float(WhisperKit.sampleRate))) }
175 | if seekPoints.count == 0 {
176 | seekPoints.append(0)
177 | }
178 |
179 | if seekPoints.count % 2 == 1 {
180 | seekPoints.append(contentFrames)
181 | }
182 |
183 | var seekClips: [(start: Int, end: Int)] = []
184 | for i in stride(from: 0, to: seekPoints.count, by: 2) {
185 | let start = seekPoints[i]
186 | let end = i + 1 < seekPoints.count ? seekPoints[i + 1] : contentFrames
187 | seekClips.append((start, end))
188 | }
189 |
190 | return seekClips
191 | }
192 | }
193 |
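A worked example of `prepareSeekClips` with illustrative values (16 kHz audio, 10 seconds of content); note that an odd number of clip timestamps is closed out at `contentFrames`:

let options = DecodingOptions(clipTimestamps: [1.0, 3.0, 5.0])
let clips = options.prepareSeekClips(contentFrames: 160_000)
// seekPoints become [16_000, 48_000, 80_000], padded to [16_000, 48_000, 80_000, 160_000]
// clips == [(start: 16_000, end: 48_000), (start: 80_000, end: 160_000)]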
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "images" : [
3 | {
4 | "filename" : "40.png",
5 | "idiom" : "universal",
6 | "platform" : "ios",
7 | "scale" : "2x",
8 | "size" : "20x20"
9 | },
10 | {
11 | "filename" : "60.png",
12 | "idiom" : "universal",
13 | "platform" : "ios",
14 | "scale" : "3x",
15 | "size" : "20x20"
16 | },
17 | {
18 | "filename" : "58 1.png",
19 | "idiom" : "universal",
20 | "platform" : "ios",
21 | "scale" : "2x",
22 | "size" : "29x29"
23 | },
24 | {
25 | "filename" : "87 1.png",
26 | "idiom" : "universal",
27 | "platform" : "ios",
28 | "scale" : "3x",
29 | "size" : "29x29"
30 | },
31 | {
32 | "filename" : "76.png",
33 | "idiom" : "universal",
34 | "platform" : "ios",
35 | "scale" : "2x",
36 | "size" : "38x38"
37 | },
38 | {
39 | "filename" : "114.png",
40 | "idiom" : "universal",
41 | "platform" : "ios",
42 | "scale" : "3x",
43 | "size" : "38x38"
44 | },
45 | {
46 | "filename" : "80 1.png",
47 | "idiom" : "universal",
48 | "platform" : "ios",
49 | "scale" : "2x",
50 | "size" : "40x40"
51 | },
52 | {
53 | "filename" : "120.png",
54 | "idiom" : "universal",
55 | "platform" : "ios",
56 | "scale" : "3x",
57 | "size" : "40x40"
58 | },
59 | {
60 | "filename" : "120 1.png",
61 | "idiom" : "universal",
62 | "platform" : "ios",
63 | "scale" : "2x",
64 | "size" : "60x60"
65 | },
66 | {
67 | "filename" : "180.png",
68 | "idiom" : "universal",
69 | "platform" : "ios",
70 | "scale" : "3x",
71 | "size" : "60x60"
72 | },
73 | {
74 | "filename" : "128 1.png",
75 | "idiom" : "universal",
76 | "platform" : "ios",
77 | "scale" : "2x",
78 | "size" : "64x64"
79 | },
80 | {
81 | "filename" : "192.png",
82 | "idiom" : "universal",
83 | "platform" : "ios",
84 | "scale" : "3x",
85 | "size" : "64x64"
86 | },
87 | {
88 | "filename" : "136.png",
89 | "idiom" : "universal",
90 | "platform" : "ios",
91 | "scale" : "2x",
92 | "size" : "68x68"
93 | },
94 | {
95 | "filename" : "152.png",
96 | "idiom" : "universal",
97 | "platform" : "ios",
98 | "scale" : "2x",
99 | "size" : "76x76"
100 | },
101 | {
102 | "filename" : "167.png",
103 | "idiom" : "universal",
104 | "platform" : "ios",
105 | "scale" : "2x",
106 | "size" : "83.5x83.5"
107 | },
108 | {
109 | "filename" : "1024 1.png",
110 | "idiom" : "universal",
111 | "platform" : "ios",
112 | "size" : "1024x1024"
113 | },
114 | {
115 | "filename" : "16.png",
116 | "idiom" : "mac",
117 | "scale" : "1x",
118 | "size" : "16x16"
119 | },
120 | {
121 | "filename" : "32.png",
122 | "idiom" : "mac",
123 | "scale" : "2x",
124 | "size" : "16x16"
125 | },
126 | {
127 | "filename" : "32.png",
128 | "idiom" : "mac",
129 | "scale" : "1x",
130 | "size" : "32x32"
131 | },
132 | {
133 | "filename" : "64.png",
134 | "idiom" : "mac",
135 | "scale" : "2x",
136 | "size" : "32x32"
137 | },
138 | {
139 | "filename" : "128.png",
140 | "idiom" : "mac",
141 | "scale" : "1x",
142 | "size" : "128x128"
143 | },
144 | {
145 | "filename" : "256.png",
146 | "idiom" : "mac",
147 | "scale" : "2x",
148 | "size" : "128x128"
149 | },
150 | {
151 | "filename" : "256.png",
152 | "idiom" : "mac",
153 | "scale" : "1x",
154 | "size" : "256x256"
155 | },
156 | {
157 | "filename" : "512.png",
158 | "idiom" : "mac",
159 | "scale" : "2x",
160 | "size" : "256x256"
161 | },
162 | {
163 | "filename" : "512.png",
164 | "idiom" : "mac",
165 | "scale" : "1x",
166 | "size" : "512x512"
167 | },
168 | {
169 | "filename" : "1024.png",
170 | "idiom" : "mac",
171 | "scale" : "2x",
172 | "size" : "512x512"
173 | },
174 | {
175 | "filename" : "44.png",
176 | "idiom" : "universal",
177 | "platform" : "watchos",
178 | "scale" : "2x",
179 | "size" : "22x22"
180 | },
181 | {
182 | "filename" : "48.png",
183 | "idiom" : "universal",
184 | "platform" : "watchos",
185 | "scale" : "2x",
186 | "size" : "24x24"
187 | },
188 | {
189 | "filename" : "55.png",
190 | "idiom" : "universal",
191 | "platform" : "watchos",
192 | "scale" : "2x",
193 | "size" : "27.5x27.5"
194 | },
195 | {
196 | "filename" : "58.png",
197 | "idiom" : "universal",
198 | "platform" : "watchos",
199 | "scale" : "2x",
200 | "size" : "29x29"
201 | },
202 | {
203 | "filename" : "60 1.png",
204 | "idiom" : "universal",
205 | "platform" : "watchos",
206 | "scale" : "2x",
207 | "size" : "30x30"
208 | },
209 | {
210 | "filename" : "64 1.png",
211 | "idiom" : "universal",
212 | "platform" : "watchos",
213 | "scale" : "2x",
214 | "size" : "32x32"
215 | },
216 | {
217 | "filename" : "66.png",
218 | "idiom" : "universal",
219 | "platform" : "watchos",
220 | "scale" : "2x",
221 | "size" : "33x33"
222 | },
223 | {
224 | "filename" : "80.png",
225 | "idiom" : "universal",
226 | "platform" : "watchos",
227 | "scale" : "2x",
228 | "size" : "40x40"
229 | },
230 | {
231 | "filename" : "87.png",
232 | "idiom" : "universal",
233 | "platform" : "watchos",
234 | "scale" : "2x",
235 | "size" : "43.5x43.5"
236 | },
237 | {
238 | "filename" : "88.png",
239 | "idiom" : "universal",
240 | "platform" : "watchos",
241 | "scale" : "2x",
242 | "size" : "44x44"
243 | },
244 | {
245 | "filename" : "92.png",
246 | "idiom" : "universal",
247 | "platform" : "watchos",
248 | "scale" : "2x",
249 | "size" : "46x46"
250 | },
251 | {
252 | "filename" : "100.png",
253 | "idiom" : "universal",
254 | "platform" : "watchos",
255 | "scale" : "2x",
256 | "size" : "50x50"
257 | },
258 | {
259 | "filename" : "102.png",
260 | "idiom" : "universal",
261 | "platform" : "watchos",
262 | "scale" : "2x",
263 | "size" : "51x51"
264 | },
265 | {
266 | "filename" : "108.png",
267 | "idiom" : "universal",
268 | "platform" : "watchos",
269 | "scale" : "2x",
270 | "size" : "54x54"
271 | },
272 | {
273 | "filename" : "172.png",
274 | "idiom" : "universal",
275 | "platform" : "watchos",
276 | "scale" : "2x",
277 | "size" : "86x86"
278 | },
279 | {
280 | "filename" : "196.png",
281 | "idiom" : "universal",
282 | "platform" : "watchos",
283 | "scale" : "2x",
284 | "size" : "98x98"
285 | },
286 | {
287 | "filename" : "216.png",
288 | "idiom" : "universal",
289 | "platform" : "watchos",
290 | "scale" : "2x",
291 | "size" : "108x108"
292 | },
293 | {
294 | "filename" : "234.png",
295 | "idiom" : "universal",
296 | "platform" : "watchos",
297 | "scale" : "2x",
298 | "size" : "117x117"
299 | },
300 | {
301 | "filename" : "258.png",
302 | "idiom" : "universal",
303 | "platform" : "watchos",
304 | "scale" : "2x",
305 | "size" : "129x129"
306 | },
307 | {
308 | "filename" : "1024 2.png",
309 | "idiom" : "universal",
310 | "platform" : "watchos",
311 | "size" : "1024x1024"
312 | }
313 | ],
314 | "info" : {
315 | "author" : "xcode",
316 | "version" : 1
317 | }
318 | }
319 |
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/Audio/VoiceActivityDetector.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 |
6 | /// A base class for Voice Activity Detection (VAD), used to identify and separate segments of audio that contain human speech from those that do not.
7 | /// Subclasses must implement the `voiceActivity(in:)` method to provide specific voice activity detection functionality.
8 | open class VoiceActivityDetector {
9 | /// The sample rate of the audio signal, in samples per second.
10 | public let sampleRate: Int
11 |
12 | /// The length of each frame in samples.
13 | public let frameLengthSamples: Int
14 |
15 | /// The number of samples overlapping between consecutive frames.
16 | public let frameOverlapSamples: Int
17 |
18 | /// Initializes a new `VoiceActivityDetector` instance with the specified parameters.
19 | /// - Parameters:
20 | /// - sampleRate: The sample rate of the audio signal in samples per second. Defaults to 16000.
21 | /// - frameLengthSamples: The length of each frame in samples.
22 | /// - frameOverlapSamples: The number of samples overlapping between consecutive frames. Defaults to 0.
23 | /// - Note: Subclasses should override the `voiceActivity(in:)` method to provide specific VAD functionality.
24 | public init(
25 | sampleRate: Int = 16000,
26 | frameLengthSamples: Int,
27 | frameOverlapSamples: Int = 0
28 | ) {
29 | self.sampleRate = sampleRate
30 | self.frameLengthSamples = frameLengthSamples
31 | self.frameOverlapSamples = frameOverlapSamples
32 | }
33 |
34 | /// Analyzes the provided audio waveform to determine which segments contain voice activity.
35 | /// - Parameter waveform: An array of `Float` values representing the audio waveform.
36 | /// - Returns: An array of `Bool` values where `true` indicates the presence of voice activity and `false` indicates silence.
37 | open func voiceActivity(in waveform: [Float]) -> [Bool] {
38 | fatalError("`voiceActivity` must be implemented by subclass")
39 | }
40 |
41 | /// Analyzes the provided audio waveform to determine which segments contain voice activity.
42 | /// - Parameter waveform: An array of `Float` values representing the audio waveform.
43 | /// - Returns: An array of `Bool` values where `true` indicates the presence of voice activity and `false` indicates silence.
44 | /// - Throws: An error if voice activity detection fails.
45 | open func voiceActivityAsync(in waveform: [Float]) async throws -> [Bool] {
46 | return voiceActivity(in: waveform)
47 | }
48 |
49 | /// Calculates and returns a list of active audio chunks, each represented by a start and end index.
50 | /// - Parameter waveform: An array of `Float` values representing the audio waveform.
51 | /// - Returns: An array of tuples where each tuple contains the start and end indices of an active audio chunk.
52 | public func calculateActiveChunks(in waveform: [Float]) -> [(startIndex: Int, endIndex: Int)] {
53 | let vad: [Bool] = voiceActivity(in: waveform)
54 | var result = [(startIndex: Int, endIndex: Int)]()
55 |
56 |         // Temporary variable to hold the start of the current non-silent segment
57 | var currentStartIndex: Int?
58 |
59 | for (index, vadChunk) in vad.enumerated() {
60 | if vadChunk {
61 | let chunkStart = index * frameLengthSamples
62 | let chunkEnd = min(chunkStart + frameLengthSamples, waveform.count)
63 |
64 | if currentStartIndex != nil {
65 | // If we already have a starting point, just update the end point in the last added segment
66 | result[result.count - 1].endIndex = chunkEnd
67 | } else {
68 | // If there is no current start, this is a new segment
69 | currentStartIndex = chunkStart
70 | result.append((startIndex: chunkStart, endIndex: chunkEnd))
71 | }
72 | } else {
73 | // Reset currentStartIndex when encountering a silent chunk
74 | currentStartIndex = nil
75 | }
76 | }
77 |
78 | return result
79 | }
80 |
81 | /// Converts a voice activity index to the corresponding audio sample index.
82 | /// - Parameter index: The voice activity index to convert.
83 | /// - Returns: The corresponding audio sample index.
84 | public func voiceActivityIndexToAudioSampleIndex(_ index: Int) -> Int {
85 | return index * frameLengthSamples
86 | }
87 |
88 | public func voiceActivityIndexToSeconds(_ index: Int) -> Float {
89 | return Float(voiceActivityIndexToAudioSampleIndex(index)) / Float(sampleRate)
90 | }
91 |
92 | /// Identifies the longest continuous period of silence within the provided voice activity detection results.
93 | /// - Parameter vadResult: An array of `Bool` values representing voice activity detection results.
94 | /// - Returns: A tuple containing the start and end indices of the longest silence period, or `nil` if no silence is found.
95 | public func findLongestSilence(in vadResult: [Bool]) -> (startIndex: Int, endIndex: Int)? {
96 | var longestStartIndex: Int?
97 | var longestEndIndex: Int?
98 | var longestCount = 0
99 | var index = 0
100 | while index < vadResult.count {
101 | let value = vadResult[index]
102 | if value {
103 | // found non-silence, skip
104 | index += 1
105 | } else {
106 | // found beginning of silence, find the end
107 | var endIndex = index
108 | while endIndex < vadResult.count, !vadResult[endIndex] {
109 | endIndex += 1
110 | }
111 | let count = endIndex - index
112 | if count > longestCount {
113 | longestCount = count
114 | longestStartIndex = index
115 | longestEndIndex = endIndex
116 | }
117 | index = endIndex
118 | }
119 | }
120 | if let longestStartIndex, let longestEndIndex {
121 | return (startIndex: longestStartIndex, endIndex: longestEndIndex)
122 | } else {
123 | return nil
124 | }
125 | }
126 |
127 | // MARK: - Utility
128 |
129 | func voiceActivityClipTimestamps(in waveform: [Float]) -> [Float] {
130 | let nonSilentChunks = calculateActiveChunks(in: waveform)
131 | var clipTimestamps = [Float]()
132 |
133 | for chunk in nonSilentChunks {
134 | let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
135 | let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
136 |
137 | clipTimestamps.append(contentsOf: [startTimestamp, endTimestamp])
138 | }
139 |
140 | return clipTimestamps
141 | }
142 |
143 | func calculateNonSilentSeekClips(in waveform: [Float]) -> [(start: Int, end: Int)] {
144 | let clipTimestamps = voiceActivityClipTimestamps(in: waveform)
145 | let options = DecodingOptions(clipTimestamps: clipTimestamps)
146 | let seekClips = options.prepareSeekClips(contentFrames: waveform.count)
147 | return seekClips
148 | }
149 |
150 | func calculateSeekTimestamps(in waveform: [Float]) -> [(startTime: Float, endTime: Float)] {
151 | let nonSilentChunks = calculateActiveChunks(in: waveform)
152 | var seekTimestamps = [(startTime: Float, endTime: Float)]()
153 |
154 | for chunk in nonSilentChunks {
155 | let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
156 | let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
157 |
158 |             seekTimestamps.append((startTime: startTimestamp, endTime: endTimestamp))
159 | }
160 |
161 | return seekTimestamps
162 | }
163 | }
164 |
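A minimal subclass sketch to illustrate the override point; the threshold and frame length are illustrative only, and the repo's EnergyVAD is the production implementation:

import Foundation

// A toy detector: a frame is voiced when its mean absolute amplitude exceeds a fixed threshold.
final class AmplitudeVAD: VoiceActivityDetector {
    let threshold: Float

    init(threshold: Float = 0.02) {
        self.threshold = threshold
        super.init(sampleRate: 16000, frameLengthSamples: 1600) // 100 ms frames at 16 kHz
    }

    override func voiceActivity(in waveform: [Float]) -> [Bool] {
        return stride(from: 0, to: waveform.count, by: frameLengthSamples).map { start in
            let frame = waveform[start..<min(start + frameLengthSamples, waveform.count)]
            let meanAmplitude = frame.reduce(0) { $0 + abs($1) } / Float(frame.count)
            return meanAmplitude > threshold
        }
    }
}

// let vad = AmplitudeVAD()
// let chunks = vad.calculateActiveChunks(in: samples) // [(startIndex, endIndex)] of voiced audio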
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/config-v03.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "whisperkit-coreml",
3 | "version": "0.3",
4 | "device_support": [
5 | {
6 | "chips": "A12, A13, S9, S10",
7 | "identifiers": [
8 | "iPhone11",
9 | "iPhone12",
10 | "Watch7",
11 | "Watch8"
12 | ],
13 | "models": {
14 | "default": "openai_whisper-tiny",
15 | "supported": [
16 | "openai_whisper-tiny",
17 | "openai_whisper-tiny.en",
18 | "openai_whisper-base",
19 | "openai_whisper-base.en"
20 | ]
21 | }
22 | },
23 | {
24 | "chips": "A14",
25 | "identifiers": [
26 | "iPhone13",
27 | "iPad13,1",
28 | "iPad13,2",
29 | "iPad13,18",
30 | "iPad13,19"
31 | ],
32 | "models": {
33 | "default": "openai_whisper-base",
34 | "supported": [
35 | "openai_whisper-tiny",
36 | "openai_whisper-tiny.en",
37 | "openai_whisper-base",
38 | "openai_whisper-base.en",
39 | "openai_whisper-small",
40 | "openai_whisper-small.en"
41 | ]
42 | }
43 | },
44 | {
45 | "chips": "A15, A16, A17 Pro, A18",
46 | "identifiers": [
47 | "iPhone14",
48 | "iPhone15",
49 | "iPhone16",
50 | "iPhone17",
51 | "iPad14,1",
52 | "iPad14,2",
53 | "iPad15,7",
54 | "iPad15,8",
55 | "iPad16,1",
56 | "iPad16,2"
57 | ],
58 | "models": {
59 | "default": "openai_whisper-base",
60 | "supported": [
61 | "openai_whisper-tiny",
62 | "openai_whisper-tiny.en",
63 | "openai_whisper-base",
64 | "openai_whisper-base.en",
65 | "openai_whisper-small",
66 | "openai_whisper-small.en",
67 | "openai_whisper-large-v2_949MB",
68 | "openai_whisper-large-v2_turbo_955MB",
69 | "openai_whisper-large-v3_947MB",
70 | "openai_whisper-large-v3_turbo_954MB",
71 | "distil-whisper_distil-large-v3_594MB",
72 | "distil-whisper_distil-large-v3_turbo_600MB",
73 | "openai_whisper-large-v3-v20240930_626MB",
74 | "openai_whisper-large-v3-v20240930_turbo_632MB"
75 | ]
76 | }
77 | },
78 | {
79 | "chips": "M1",
80 | "identifiers": [
81 | "MacBookPro17,1",
82 | "MacBookPro18,1",
83 | "MacBookPro18,2",
84 | "MacBookPro18,3",
85 | "MacBookPro18,4",
86 | "MacBookAir10,1",
87 | "Macmini9,1",
88 | "iMac21,1",
89 | "iMac21,2",
90 | "Mac13",
91 | "iPad13,4",
92 | "iPad13,5",
93 | "iPad13,6",
94 | "iPad13,7",
95 | "iPad13,8",
96 | "iPad13,9",
97 | "iPad13,10",
98 | "iPad13,11",
99 | "iPad13,16",
100 | "iPad13,17"
101 | ],
102 | "models": {
103 | "default": "openai_whisper-large-v3-v20240930_626MB",
104 | "supported": [
105 | "openai_whisper-tiny",
106 | "openai_whisper-tiny.en",
107 | "openai_whisper-base",
108 | "openai_whisper-base.en",
109 | "openai_whisper-small",
110 | "openai_whisper-small.en",
111 | "openai_whisper-large-v2",
112 | "openai_whisper-large-v2_949MB",
113 | "openai_whisper-large-v3",
114 | "openai_whisper-large-v3_947MB",
115 | "distil-whisper_distil-large-v3",
116 | "distil-whisper_distil-large-v3_594MB",
117 | "openai_whisper-large-v3-v20240930_626MB"
118 | ]
119 | }
120 | },
121 | {
122 | "chips": "M2, M3, M4",
123 | "identifiers": [
124 | "Mac14",
125 | "Mac15",
126 | "Mac16",
127 | "iPad14,3",
128 | "iPad14,4",
129 | "iPad14,5",
130 | "iPad14,6",
131 | "iPad14,8",
132 | "iPad14,9",
133 | "iPad14,10",
134 | "iPad14,11",
135 | "iPad15",
136 | "iPad16"
137 | ],
138 | "models": {
139 | "default": "openai_whisper-large-v3-v20240930",
140 | "supported": [
141 | "openai_whisper-tiny",
142 | "openai_whisper-tiny.en",
143 | "openai_whisper-base",
144 | "openai_whisper-base.en",
145 | "openai_whisper-small",
146 | "openai_whisper-small.en",
147 | "openai_whisper-large-v2",
148 | "openai_whisper-large-v2_949MB",
149 | "openai_whisper-large-v2_turbo",
150 | "openai_whisper-large-v2_turbo_955MB",
151 | "openai_whisper-large-v3",
152 | "openai_whisper-large-v3_947MB",
153 | "openai_whisper-large-v3_turbo",
154 | "openai_whisper-large-v3_turbo_954MB",
155 | "distil-whisper_distil-large-v3",
156 | "distil-whisper_distil-large-v3_594MB",
157 | "distil-whisper_distil-large-v3_turbo",
158 | "distil-whisper_distil-large-v3_turbo_600MB",
159 | "openai_whisper-large-v3-v20240930",
160 | "openai_whisper-large-v3-v20240930_turbo",
161 | "openai_whisper-large-v3-v20240930_626MB",
162 | "openai_whisper-large-v3-v20240930_turbo_632MB"
163 | ]
164 | }
165 | }
166 | ],
167 | "model_checksums": {
168 | "distil-whisper_distil-large-v3": "9cd8271143b919402ae776c30b479565",
169 | "distil-whisper_distil-large-v3_594MB": "ca532f45ddbf8a3d241132cc5cf41639",
170 | "distil-whisper_distil-large-v3_turbo": "b8638452c6568dfe33a33bfcc2bc6aca",
171 | "distil-whisper_distil-large-v3_turbo_600MB": "81746b4b1afbbb01a8ae9ea452460d88",
172 | "openai_whisper-base.en": "fbcfd586f15e2952251b1d3257f18471",
173 | "openai_whisper-base": "36e60501ad0f01c1a5719e83a1f63f20",
174 | "openai_whisper-large-v2": "21b86c07318aeeef54598f15b7903979",
175 | "openai_whisper-large-v2_949MB": "71bad4e1566749d1060eda42308d9fb4",
176 | "openai_whisper-large-v2_turbo": "7734959b6550e7b5c2d732bf2b7acd23",
177 | "openai_whisper-large-v2_turbo_955MB": "cb6411862a48ec75325572081f01e5b5",
178 | "openai_whisper-large-v3-v20240930": "17ebd78ff7edfa59001b554e9cc4c021",
179 | "openai_whisper-large-v3-v20240930_547MB": "c945dad68449ac3c78ecb2d561ac189d",
180 | "openai_whisper-large-v3-v20240930_626MB": "578fe5a07f4eb7e4187c920bca571aa5",
181 | "openai_whisper-large-v3-v20240930_turbo": "dfbf09ab741af1d5400ddbd07bb37dad",
182 | "openai_whisper-large-v3-v20240930_turbo_632MB": "33954440dbd785ca1828afe25514f5a5",
183 | "openai_whisper-large-v3": "a6f24dc72785722e9cea89e227856dfe",
184 | "openai_whisper-large-v3_947MB": "ef6b0e9622a046ce2361b4c72307877f",
185 | "openai_whisper-large-v3_turbo": "c550fbdea70c5784d322c0a427f8b5cd",
186 | "openai_whisper-large-v3_turbo_954MB": "e639c4bb98d905064ef5dd38757dd9d1",
187 | "openai_whisper-small.en": "38efe6a00706bbdb995795c67a836e5e",
188 | "openai_whisper-small": "f1d21adb950bc9be5d5343bcdeccd23b",
189 | "openai_whisper-tiny.en": "e1183fd55448923b1ce43a2da67aa21f",
190 | "openai_whisper-tiny": "7147518a3d68ddbea0691e04cfffa4ff"
191 | }
192 | }
193 |
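For illustration, this file can be decoded with a small set of Codable types; the struct and property names below are hypothetical, and only the JSON keys come from the file itself:

import Foundation

struct SupportConfig: Decodable {
    struct DeviceSupport: Decodable {
        struct Models: Decodable {
            let `default`: String
            let supported: [String]
        }
        let chips: String
        let identifiers: [String]
        let models: Models
    }

    let name: String
    let version: String
    let deviceSupport: [DeviceSupport]
    let modelChecksums: [String: String]

    enum CodingKeys: String, CodingKey {
        case name, version
        case deviceSupport = "device_support"
        case modelChecksums = "model_checksums"
    }
}

// let config = try JSONDecoder().decode(SupportConfig.self, from: jsonData)
// let defaultModel = config.deviceSupport.first { $0.identifiers.contains("Mac14") }?.models.default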
--------------------------------------------------------------------------------