├── .github
└── workflows
│ ├── development-tests.yml
│ ├── expo-update.yml
│ ├── homebrew-update.yml
│ ├── pre-release-tests.yml
│ └── unit-tests.yml
├── .gitignore
├── .spi.yml
├── .swiftpm
└── configuration
│ └── Package.resolved
├── BENCHMARKS.md
├── CONTRIBUTING.md
├── Examples
└── WhisperAX
│ ├── Debug.xcconfig
│ ├── WhisperAX.xcodeproj
│ ├── project.pbxproj
│ ├── project.xcworkspace
│ │ ├── contents.xcworkspacedata
│ │ └── xcshareddata
│ │ │ ├── IDEWorkspaceChecks.plist
│ │ │ └── swiftpm
│ │ │ └── Package.resolved
│ └── xcshareddata
│ │ └── xcschemes
│ │ └── WhisperAX.xcscheme
│ ├── WhisperAX
│ ├── Info.plist
│ ├── Preview Content
│ │ └── Preview Assets.xcassets
│ │ │ └── Contents.json
│ ├── Resources
│ │ ├── Assets.xcassets
│ │ │ ├── AppIcon.appiconset
│ │ │ │ ├── 100.png
│ │ │ │ ├── 102.png
│ │ │ │ ├── 1024 1.png
│ │ │ │ ├── 1024 2.png
│ │ │ │ ├── 1024.png
│ │ │ │ ├── 108.png
│ │ │ │ ├── 114.png
│ │ │ │ ├── 120 1.png
│ │ │ │ ├── 120.png
│ │ │ │ ├── 128 1.png
│ │ │ │ ├── 128.png
│ │ │ │ ├── 136.png
│ │ │ │ ├── 152.png
│ │ │ │ ├── 16.png
│ │ │ │ ├── 167.png
│ │ │ │ ├── 172.png
│ │ │ │ ├── 180.png
│ │ │ │ ├── 192.png
│ │ │ │ ├── 196.png
│ │ │ │ ├── 216.png
│ │ │ │ ├── 234.png
│ │ │ │ ├── 256.png
│ │ │ │ ├── 258.png
│ │ │ │ ├── 32.png
│ │ │ │ ├── 40.png
│ │ │ │ ├── 44.png
│ │ │ │ ├── 48.png
│ │ │ │ ├── 512.png
│ │ │ │ ├── 55.png
│ │ │ │ ├── 58 1.png
│ │ │ │ ├── 58.png
│ │ │ │ ├── 60 1.png
│ │ │ │ ├── 60.png
│ │ │ │ ├── 64 1.png
│ │ │ │ ├── 64.png
│ │ │ │ ├── 66.png
│ │ │ │ ├── 76.png
│ │ │ │ ├── 80 1.png
│ │ │ │ ├── 80.png
│ │ │ │ ├── 87 1.png
│ │ │ │ ├── 87.png
│ │ │ │ ├── 88.png
│ │ │ │ ├── 92.png
│ │ │ │ └── Contents.json
│ │ │ └── Contents.json
│ │ ├── Info.plist
│ │ └── WhisperAX.entitlements
│ ├── Views
│ │ └── ContentView.swift
│ └── WhisperAXApp.swift
│ ├── WhisperAXTests
│ ├── WhisperAXTests.swift
│ └── WhisperKitTests
│ ├── WhisperAXUITests
│ ├── WhisperAXUITests.swift
│ └── WhisperAXUITestsLaunchTests.swift
│ ├── WhisperAXWatchApp
│ ├── Assets.xcassets
│ │ ├── AccentColor.colorset
│ │ │ └── Contents.json
│ │ ├── AppIcon.appiconset
│ │ │ ├── Contents.json
│ │ │ └── appstore.png
│ │ └── Contents.json
│ ├── Preview Content
│ │ └── Preview Assets.xcassets
│ │ │ └── Contents.json
│ ├── WhisperAXExampleView.swift
│ └── WhisperAXWatchApp.swift
│ ├── WhisperAXWatchAppTests
│ └── WhisperAX_Watch_AppTests.swift
│ └── WhisperAXWatchAppUITests
│ ├── WhisperAX_Watch_AppUITests.swift
│ └── WhisperAX_Watch_AppUITestsLaunchTests.swift
├── LICENSE
├── Makefile
├── Package.resolved
├── Package.swift
├── README.md
├── Sources
├── WhisperKit
│ └── Core
│ │ ├── Audio
│ │ ├── AudioChunker.swift
│ │ ├── AudioProcessor.swift
│ │ ├── AudioStreamTranscriber.swift
│ │ ├── EnergyVAD.swift
│ │ └── VoiceActivityDetector.swift
│ │ ├── AudioEncoder.swift
│ │ ├── Configurations.swift
│ │ ├── FeatureExtractor.swift
│ │ ├── Models.swift
│ │ ├── ResultWriter.swift
│ │ ├── Text
│ │ ├── LogitsFilter.swift
│ │ ├── SegmentSeeker.swift
│ │ └── TokenSampler.swift
│ │ ├── TextDecoder.swift
│ │ ├── TranscribeTask.swift
│ │ ├── Utils
│ │ ├── Concurrency.swift
│ │ └── Utils.swift
│ │ └── WhisperKit.swift
└── WhisperKitCLI
│ ├── CLIArguments.swift
│ ├── CLIUtils.swift
│ ├── TranscribeCLI.swift
│ └── WhisperKitCLI.swift
├── Tests
└── WhisperKitTests
│ ├── Evaluate
│ ├── DistanceCalculation.swift
│ ├── NormalizeEn.swift
│ ├── SpellingMapping.swift
│ └── WERUtils.swift
│ ├── FunctionalTests.swift
│ ├── RegressionTestUtils.swift
│ ├── RegressionTests.swift
│ ├── Resources
│ ├── 8_Channel_ID.m4a
│ ├── config-v02.json
│ ├── config-v03.json
│ ├── es_test_clip.wav
│ ├── ja_test_clip.wav
│ ├── jfk.wav
│ ├── jfk_441khz.m4a
│ └── ted_60.m4a
│ ├── TestUtils.swift
│ └── UnitTests.swift
└── fastlane
├── Fastfile
└── README.md
/.github/workflows/development-tests.yml:
--------------------------------------------------------------------------------
1 | name: Development Tests
2 |
3 | on:
4 | pull_request:
5 | pull_request_review:
6 | types: [submitted]
7 | workflow_dispatch:
8 |
9 | concurrency:
10 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
11 | cancel-in-progress: true
12 |
13 | jobs:
14 | build-and-test:
15 | name: "Build and Test"
16 | uses: ./.github/workflows/unit-tests.yml
17 | with:
18 | ios-version: "18.2"
19 | ios-device: "iPhone 16"
20 | macos-runner: "macos-15"
21 |
22 | check-approvals:
23 | runs-on: ubuntu-latest
24 | outputs:
25 | reviews: ${{ steps.reviews.outputs.state }}
26 | permissions:
27 | pull-requests: read
28 | contents: read
29 | steps:
30 | - uses: actions/checkout@v4
31 | - name: Check Approvals
32 | id: reviews
33 | env:
34 | GH_TOKEN: ${{ github.token }}
35 | pr: ${{ github.event.pull_request.number }}
36 | run: |
37 | echo "Checking PR approval for: $pr"
38 | state=$(gh pr view $pr --json reviewDecision --jq '.reviewDecision')
39 | echo "Review decision state: $state"
40 | echo "state=$state" >> "$GITHUB_OUTPUT"
41 |
42 | pre-merge-tests:
43 | name: "Pre-merge Tests"
44 | needs: [check-approvals]
45 | if: needs.check-approvals.outputs.reviews == 'APPROVED' || github.event_name == 'workflow_dispatch'
46 | strategy:
47 | matrix:
48 | include:
49 | - os: macos-13-xlarge
50 | ios-version: "17.2"
51 | ios-device: "iPhone 14"
52 | xcode-version: "15.2"
53 | - os: macos-14
54 | ios-version: "17.2"
55 | ios-device: "iPhone 15"
56 | xcode-version: "15.2"
57 | uses: ./.github/workflows/unit-tests.yml
58 | with:
59 | macos-runner: ${{ matrix.os }}
60 | ios-version: ${{ matrix.ios-version }}
61 | ios-device: ${{ matrix.ios-device }}
62 | xcode-version: ${{ matrix.xcode-version }}
63 |
--------------------------------------------------------------------------------
/.github/workflows/expo-update.yml:
--------------------------------------------------------------------------------
1 | # Tested on macOS with:
2 | # act -s COMMITTER_TOKEN="$(gh auth token)" release --container-architecture linux/amd64 -P ubuntu-latest=catthehacker/ubuntu:act-latest -e <(echo '{ "release": { "tag_name": "v0.0.0" }}')
3 | name: Update whisper-kit-expo
4 |
5 | on:
6 | release:
7 | types: [released]
8 |
9 | jobs:
10 | update-whisperkit:
11 | runs-on: ubuntu-latest
12 | env:
13 | TAG: ${{ github.event.release.tag_name }}
14 | BRANCH_NAME: update-whisperkit-${{ github.event.release.tag_name }}
15 | GH_TOKEN: ${{ secrets.COMMITTER_TOKEN }}
16 | steps:
17 | - name: Checkout whisper-kit-expo
18 | uses: actions/checkout@v4
19 | with:
20 | repository: seb-sep/whisper-kit-expo
21 | token: ${{ secrets.COMMITTER_TOKEN }}
22 | ref: main
23 |
24 | - name: Setup Node
25 | uses: actions/setup-node@v4
26 | with:
27 | node-version: '20.x'
28 |
29 | - name: New branch
30 | run: |
31 | git checkout -b $BRANCH_NAME
32 | echo ${{ github.event.release }}
33 | echo "Release tag is $TAG"
34 |
35 | - name: Update package.json version
36 | run: |
37 | PACKAGE_PATH="package.json"
38 | if [ ! -f "$PACKAGE_PATH" ]; then
39 | echo "Could not find package.json at path: $PACKAGE_PATH."
40 | exit 1
41 | fi
42 | RELEASE_TAG=${TAG#v}
43 | jq --arg newver "$RELEASE_TAG" '.whisperKit.version = $newver' "$PACKAGE_PATH" > tmp.$$.json && mv tmp.$$.json "$PACKAGE_PATH"
44 | cat "$PACKAGE_PATH"
45 |
46 | - name: Commit changes
47 | run: |
48 | git config --global user.email "164233781+argmaxincbot@users.noreply.github.com"
49 | git config --global user.name "argmaxincbot"
50 | git add ./package.json
51 | git commit -m "Update WhisperKit to $TAG"
52 | git push origin $BRANCH_NAME
53 | - name: PR with changes
54 | env:
55 | GH_TOKEN: ${{ secrets.COMMITTER_TOKEN }}
56 | run: |
57 | gh pr create --title "Update WhisperKit to $TAG" --body "Update WhisperKit to $TAG" --base main --head $BRANCH_NAME
58 |
--------------------------------------------------------------------------------
/.github/workflows/homebrew-update.yml:
--------------------------------------------------------------------------------
1 | name: Bump Homebrew Formula
2 |
3 | on:
4 | push:
5 | tags: 'v*'
6 |
7 | jobs:
8 | homebrew:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: mislav/bump-homebrew-formula-action@v3
12 | with:
13 | formula-name: whisperkit-cli
14 | env:
15 | COMMITTER_TOKEN: ${{ secrets.COMMITTER_TOKEN }}
16 |
--------------------------------------------------------------------------------
/.github/workflows/pre-release-tests.yml:
--------------------------------------------------------------------------------
1 | name: Pre-Release Tests
2 |
3 | on:
4 | push:
5 | branches: ["main"]
6 | workflow_dispatch:
7 |
8 | jobs:
9 | build-and-test-all-platforms:
10 | name: "Build and Test All Platforms"
11 | strategy:
12 | matrix:
13 | include:
14 | - os: macos-13-xlarge
15 | ios-version: "17.2" # TODO: Download older simulators for macOS 13
16 | ios-device: "iPhone 14"
17 | xcode-version: "15.2"
18 | - os: macos-14
19 | ios-version: "17.2"
20 | ios-device: "iPhone 15"
21 | xcode-version: "15.2"
22 | - os: macos-15
23 | ios-version: "18.2" # Latest available version
24 | ios-device: "iPhone 16"
25 | xcode-version: "latest-stable"
26 | uses: ./.github/workflows/unit-tests.yml
27 | with:
28 | macos-runner: ${{ matrix.os }}
29 | ios-version: ${{ matrix.ios-version }}
30 | ios-device: ${{ matrix.ios-device }}
31 | xcode-version: ${{ matrix.xcode-version }}
--------------------------------------------------------------------------------
/.github/workflows/unit-tests.yml:
--------------------------------------------------------------------------------
1 | name: Unit Tests
2 |
3 | on:
4 | workflow_call:
5 | inputs:
6 | ios-version:
7 | required: true
8 | type: string
9 | ios-device:
10 | required: true
11 | type: string
12 | macos-runner:
13 | required: true
14 | type: string
15 | xcode-version:
16 | required: false
17 | type: string
18 |
19 | jobs:
20 | unit-tests:
21 | name: "${{ matrix.run-config['name'] }} on ${{ inputs.macos-runner }}"
22 | runs-on: ${{ inputs.macos-runner }}
23 | strategy:
24 | matrix:
25 | run-config:
26 | - {
27 | name: "macOS",
28 | condition: true,
29 | clean-destination: "generic/platform=macOS",
30 | test-destination: "platform=macOS,arch=arm64",
31 | }
32 | - {
33 | name: "iOS",
34 | condition: true,
35 | clean-destination: "generic/platform=iOS",
36 | test-destination: "platform=iOS Simulator,OS=${{ inputs.ios-version }},name=${{ inputs.ios-device }}",
37 | }
38 | - {
39 | name: "watchOS",
40 | condition: "${{ inputs.macos-runner == 'macos-15' }}",
41 | clean-destination: "generic/platform=watchOS",
42 | test-destination: "platform=watchOS Simulator,name=Apple Watch Ultra 2 (49mm)",
43 | }
44 | - {
45 | name: "visionOS",
46 | condition: "${{ inputs.macos-runner == 'macos-15' }}",
47 | clean-destination: "generic/platform=visionOS",
48 | test-destination: "platform=visionOS Simulator,name=Apple Vision Pro",
49 | }
50 | timeout-minutes: 30
51 | steps:
52 | - uses: actions/checkout@v4
53 | - uses: maxim-lobanov/setup-xcode@v1
54 | with:
55 | xcode-version: ${{ inputs.xcode-version || 'latest-stable' }}
56 | - name: Setup environment
57 | run: make setup
58 | - name: Setup Cache
59 | id: model-cache
60 | uses: actions/cache@v4
61 | with:
62 | path: Models
63 | key: ${{ runner.os }}-models
64 | - name: Download Models
65 | if: steps.model-cache.outputs.cache-hit != 'true'
66 | run: make download-model MODEL=tiny
67 | - name: Install and discover destinations
68 | if: ${{ matrix.run-config['condition'] == true }}
69 | run: |
70 | if [[ "${{ matrix.run-config['name'] }}" != "macOS" ]]; then
71 | xcodebuild -downloadPlatform ${{ matrix.run-config['name'] }}
72 | fi
73 | echo "Runtimes for testing:"
74 | xcrun simctl list runtimes
75 | echo "Destinations for testing:"
76 | xcodebuild test-without-building -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -showdestinations
77 | - name: Boot Simulator and Wait
78 | if: ${{ matrix.run-config['condition'] == true }} && ${{ matrix.run-config['name'] != 'macOS' }} && ${{ inputs.macos-runner == 'macos-15' }}
79 | # Slower runners require some time to fully boot the simulator
80 | # Parse the simulator name from the destination string, boot it, and wait
81 | run: |
82 | simulator_name=$(echo '${{ matrix.run-config['test-destination'] }}' | sed -n 's/.*name=\([^,]*\).*/\1/p')
83 | xcrun simctl boot "$simulator_name" || true
84 | sleep 15
85 | xcrun simctl list devices
86 | - name: Build and Test - ${{ matrix.run-config['name'] }}
87 | if: ${{ matrix.run-config['condition'] == true }}
88 | run: |
89 | set -o pipefail
90 | xcodebuild clean build-for-testing -scheme whisperkit-Package -destination '${{ matrix.run-config['clean-destination'] }}' | xcpretty
91 | xcodebuild test -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination '${{ matrix.run-config['test-destination'] }}'
92 | - name: Upload Test Results
93 | if: failure()
94 | uses: actions/upload-artifact@v4
95 | with:
96 | name: test-results-${{ matrix.run-config['name']}}-on-${{ inputs.macos-runner }}
97 | path: |
98 | ~/Library/Developer/Xcode/DerivedData/**/Logs/Test/*.xcresult
99 | retention-days: 5
100 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | /.build
3 | /Packages
4 | .vscode/
5 | xcuserdata/
6 | DerivedData/
7 | .swiftpm/configuration/registries.json
8 | .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
9 | .swiftpm/xcode/xcshareddata/
10 | **/*.xcscheme
11 | .netrc
12 | .env
13 |
14 | # Core ML Model Files
15 | Models
16 | **/*.mlpackage
17 | **/*.mlmodel
18 | **/*.mlmodelc
19 | **/*.zip
20 | **/*.tar.gz
21 |
22 | # Audio files (add manually if needed)
23 | **/*.wav
24 | **/*.mp3
25 | **/*.m4a
26 | **/*.flac
27 |
28 | ## Xcode
29 | # Build generated
30 | build/
31 | DerivedData/
32 |
33 | # Various settings
34 | *.pbxuser
35 | !default.pbxuser
36 | *.mode1v3
37 | !default.mode1v3
38 | *.mode2v3
39 | !default.mode2v3
40 | *.perspectivev3
41 | !default.perspectivev3
42 | xcuserdata/
43 |
44 | # Other
45 | *.moved-aside
46 | *.xccheckout
47 | *.xcscmblueprint
48 |
49 | # Obj-C/Swift specific
50 | *.hmap
51 | *.ipa
52 | *.dSYM.zip
53 | *.dSYM
54 |
55 | # fastlane
56 | fastlane/report.xml
57 | fastlane/Preview.html
58 | fastlane/screenshots
59 | fastlane/test_output
60 | fastlane/benchmark_data
61 | fastlane/upload_folder
62 |
63 | ### Xcode Patch ###
64 | **/*.xcconfig
65 | *.xcodeproj/*
66 | !*.xcodeproj/project.pbxproj
67 | !*.xcodeproj/xcshareddata/
68 | !*.xcworkspace/contents.xcworkspacedata
69 | /*.gcno
--------------------------------------------------------------------------------
/.spi.yml:
--------------------------------------------------------------------------------
1 | version: 1
2 | builder:
3 | configs:
4 | - documentation_targets: [WhisperKit]
--------------------------------------------------------------------------------
/.swiftpm/configuration/Package.resolved:
--------------------------------------------------------------------------------
1 | {
2 | "pins" : [
3 | {
4 | "identity" : "swift-argument-parser",
5 | "kind" : "remoteSourceControl",
6 | "location" : "https://github.com/apple/swift-argument-parser.git",
7 | "state" : {
8 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41",
9 | "version" : "1.3.0"
10 | }
11 | },
12 | {
13 | "identity" : "swift-transformers",
14 | "kind" : "remoteSourceControl",
15 | "location" : "https://github.com/huggingface/swift-transformers.git",
16 | "state" : {
17 | "revision" : "74b94211bdc741694ed7e700a1104c72e5ba68fe",
18 | "version" : "0.1.7"
19 | }
20 | }
21 | ],
22 | "version" : 2
23 | }
24 |
--------------------------------------------------------------------------------
/BENCHMARKS.md:
--------------------------------------------------------------------------------
1 | # WhisperKit Benchmarks
2 |
3 | This document describes how to run the benchmarks for WhisperKit. The benchmarks can be run on a specific device or all connected devices. The results are saved in JSON files and can be uploaded to the [argmaxinc/whisperkit-evals-dataset](https://huggingface.co/datasets/argmaxinc/whisperkit-evals-dataset) dataset on HuggingFace as a pull request. Below are the steps to run the benchmarks locally in order to reproduce the results shown in our [WhisperKit Benchmarks](https://huggingface.co/spaces/argmaxinc/whisperkit-benchmarks) space.
4 |
5 | ## Download the Source
6 |
7 | To download the code to run the test suite, run:
8 |
9 | ```sh
10 | git clone git@github.com:argmaxinc/WhisperKit.git
11 | ```
12 |
13 | ## Local Environment
14 |
15 | Before running the benchmarks, you'll need to set up your local environment with the necessary dependencies. To do this, run:
16 |
17 | ```sh
18 | make setup
19 | ```
20 |
21 | See [Contributing](CONTRIBUTING.md) for more information.
22 |
23 |
24 | ## Xcode Environment
25 |
26 | When running the tests, the model to test is provided to Xcode from Fastlane as an environment variable:
27 |
28 | 1. Open the example project:
29 |
30 | ```sh
31 | xed Examples/WhisperAX
32 | ```
33 |
34 | 2. At the top, you will see the app icon and `WhisperAX` written next to it. Click on `WhisperAX` and select `Edit Scheme` at the bottom.
35 |
36 | 3. Under `Environment Variables`, you will see an entry with `MODEL_NAME` as the name and `$(MODEL_NAME)` as the value.
37 |
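For reference, test code can read this variable at runtime through `ProcessInfo`. The snippet below is only a minimal sketch of that mechanism; the fallback value is illustrative and not necessarily what the benchmark suite uses:

```swift
import Foundation

// Minimal sketch: resolve the model under test from the scheme's
// environment, falling back to a placeholder default when MODEL_NAME
// is not set by Fastlane/Xcode.
let modelName = ProcessInfo.processInfo.environment["MODEL_NAME"] ?? "tiny"
print("Benchmarking model: \(modelName)")
```
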
38 | ## Devices
39 |
40 | > [!IMPORTANT]
41 | > An active developer account is required to run the tests on physical devices.
42 |
43 | Before running tests, all external devices need to be connected and paired to your Mac, as well as registered with your developer account. Ensure the devices are in Developer Mode. If nothing appears after connecting the devices via cable, press `Command + Shift + 2` in Xcode to open the Devices and Simulators window and track their progress.
44 |
45 | ## Datasets
46 |
47 | The datasets for the test suite can be set in a global array called `datasets` in the file [`Tests/WhisperKitTests/RegressionTests.swift`](Tests/WhisperKitTests/RegressionTests.swift). It is prefilled with the datasets that are currently available.
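
To illustrate the shape of that array, here is a hypothetical sketch; the identifiers below are placeholders, and the real entries live in `RegressionTests.swift`:

```swift
// Hypothetical example of the global `datasets` array; replace the
// placeholder names with the dataset identifiers already listed in
// Tests/WhisperKitTests/RegressionTests.swift.
let datasets: [String] = [
    "librispeech-10mins",   // placeholder
    "earnings22-10mins",    // placeholder
]
```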
48 |
49 | ## Models
50 |
51 | The models for the test suite can be set in the [`Fastfile`](fastlane/Fastfile). Simply find `BENCHMARK_CONFIGS` and modify the `models` array under the benchmark you want to run.
52 |
53 | ## Makefile and Fastlane
54 |
55 | The tests are run using [Fastlane](fastlane/Fastfile), which is controlled by a [Makefile](Makefile). The Makefile contains the following commands:
56 |
57 | ### List Connected Devices
58 |
59 | Before running the tests, it is a good idea to list the connected devices to catch any connection issues. Simply run:
60 |
61 | ```sh
62 | make list-devices
63 | ```
64 |
65 | The output will be a list with entries that look something like this:
66 |
67 | ```ruby
68 | {
69 | :name=>"My Mac",
70 | :type=>"Apple M2 Pro",
71 | :platform=>"macOS",
72 | :os_version=>"15.0.1",
73 | :product=>"Mac14,12",
74 | :id=>"XXXXXXXX-1234-5678-9012-XXXXXXXXXXXX",
75 | :state=>"connected"
76 | }
77 | ```
78 |
79 | Verify that the devices are connected and the state is `connected`.
80 |
81 | ### Running Benchmarks
82 |
83 | After completing the above steps, you can run the tests. Note that there are two different test configurations: one named `full` and the other named `debug`. To check for potential errors, run the `debug` tests:
84 |
85 | ```sh
86 | make benchmark-devices DEBUG=true
87 | ```
88 |
89 | Otherwise run the `full` tests:
90 |
91 | ```sh
92 | make benchmark-devices
93 | ```
94 |
95 | Optionally, for either configuration, you can specify the list of devices to test using the `DEVICES` option:
96 |
97 | ```sh
98 | make benchmark-devices DEVICES="iPhone 15 Pro Max,My Mac"
99 | ```
100 |
101 | The `DEVICES` option is a comma-separated list of device names. The device names can be found by running `make list-devices` and using the value for the `:name` key.
102 |
103 | ### Results
104 |
105 | After the tests are run, the generated results can be found under `fastlane/benchmark_data`, including the `.xcresult` file with logs and attachments for each device. There will also be a folder called `fastlane/upload_folder/benchmark_data` that contains only the JSON results from `fastlane/benchmark_data`, which can be used for further analysis.
106 |
107 | We will periodically run these tests on a range of devices and upload the results to the [argmaxinc/whisperkit-evals-dataset](https://huggingface.co/datasets/argmaxinc/whisperkit-evals-dataset), which will propagate to the [WhisperKit Benchmarks](https://huggingface.co/spaces/argmaxinc/whisperkit-benchmarks) space and be available for comparison.
108 |
109 |
110 | ## Troubleshooting
111 |
112 |
113 | If you encounter issues while running the tests, here are a few things to try:
114 |
115 | 1. Open the project in Xcode and run the tests directly from there.
116 | 1. To do this, open the example app (from the command line, type `xed Examples/WhisperAX`) and run the test named `RegressionTests/testModelPerformanceWithDebugConfig` from the test navigator.
117 | 2. If the tests run successfully, you can rule out any issues with the device or the models.
118 | 3. If they don't run successfully, Xcode will provide more detailed error messages.
119 | 2. Try specifying a single device to run the tests on. This can be done by running `make list-devices` and then running the tests with the `DEVICES` option set to the name of the device you want to test on. For example, `make benchmark-devices DEVICES="My Mac"`. This will also enable you to see the logs for that specific device.
120 | 3. If you are still encountering issues, please reach out to us on the [Discord](https://discord.gg/G5F5GZGecC) or create an [issue](https://github.com/argmaxinc/WhisperKit/issues) on GitHub.
121 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to WhisperKit
2 |
3 | ## Overview
4 |
5 | We welcome and encourage contributions to WhisperKit! Whether you're fixing bugs, improving documentation, or adding new features from the roadmap, your help is appreciated. This guide will help you get started with contributing to WhisperKit.
6 |
7 | ## Getting Started
8 |
9 | 1. **Fork the Repository**: Start by [forking](https://github.com/argmaxinc/WhisperKit/fork) the WhisperKit repository on GitHub to your personal account.
10 |
11 | 2. **Clone Your Fork**: Clone your fork to your local machine to start making changes.
12 |
13 | ```bash
14 | git clone https://github.com/[your-username]/whisperkit.git
15 | cd whisperkit
16 | ```
17 |
18 | ## Setting Up Your Development Environment
19 |
20 | 1. **Install Dependencies**: Use the provided `Makefile` to set up your environment. Run `make setup` to install necessary dependencies.
21 |
22 | ```bash
23 | make setup
24 | ```
25 |
26 | 2. **Download Models**: Run `make download-model` to download the required model for running and testing locally.
27 |
28 | ```bash
29 | make download-model MODEL=tiny
30 | ```
31 |
32 | ## Making Changes
33 |
34 | 1. **Make Your Changes**: Implement your changes, add new features, or fix bugs. Ensure you adhere to the existing coding style. If you're adding new features, make sure to update or add any documentation or tests as needed.
35 |
36 | 2. **Build and Test**: You can use the `Makefile` to build and test your changes. Run `make build` to build WhisperKit and `make test` to run tests.
37 |
38 | ```bash
39 | make build
40 | make test
41 | ```
42 |
43 | You can also run and test directly from Xcode. We've provided an example app that contains various use cases; just open the `Examples/WhisperAX/WhisperAX.xcodeproj` file in Xcode and run the app.
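
   If you want a quick end-to-end smoke test of a change outside the example app, something along these lines can help (for instance inside a unit test). Treat it as a rough sketch based on README-style usage rather than the canonical API, since the exact initializer and the return type of `transcribe(audioPath:)` have varied between releases:

```swift
import WhisperKit

// Rough smoke-test sketch (assumes the convenience initializer and
// transcribe(audioPath:) shown in the README; adjust to the current API).
// The audio path below is a placeholder for any local wav/m4a file.
Task {
    let pipe = try await WhisperKit(model: "tiny")
    let results = try await pipe.transcribe(audioPath: "Tests/WhisperKitTests/Resources/jfk.wav")
    print(results.map(\.text).joined(separator: " "))
}
```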
44 |
45 | ## Submitting Your Changes
46 |
47 | 1. **Commit Your Changes**: Once you're satisfied with your changes, commit them with a clear and concise commit message.
48 |
49 | ```bash
50 | git commit -am "Add a new feature"
51 | ```
52 |
53 | 2. **Push to Your Fork**: Push your changes to your fork on GitHub.
54 |
55 | ```bash
56 | git push origin my-branch
57 | ```
58 |
59 | 3. **Create a Pull Request**: Go to the WhisperKit repository on GitHub and create a new pull request from your fork. Ensure your pull request has a clear title and description.
60 |
61 | 4. **Code Review**: Wait for the maintainers to review your pull request. Be responsive to feedback and make any necessary changes.
62 |
63 | ## Guidelines
64 |
65 | - **Code Style**: Follow the existing code style in the project.
66 | - **Commit Messages**: Write meaningful commit messages that clearly describe the changes.
67 | - **Documentation**: Update documentation if you're adding new features or making changes that affect how users interact with WhisperKit.
68 | - **Tests**: Add or update tests for new features or bug fixes.
69 |
70 | ## Final Steps
71 |
72 | After your pull request has been reviewed and approved, a maintainer will merge it into the main branch. Congratulations, you've successfully contributed to WhisperKit!
73 |
74 | Thank you for making WhisperKit better for everyone! ❤️🔥
75 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/Debug.xcconfig:
--------------------------------------------------------------------------------
1 | // Run `make setup` to add your team here
2 | DEVELOPMENT_TEAM=
3 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/contents.xcworkspacedata:
--------------------------------------------------------------------------------
1 |
2 |
4 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 | 	<key>IDEDidComputeMac32BitWarning</key>
6 | 	<true/>
7 | </dict>
8 | </plist>
9 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved:
--------------------------------------------------------------------------------
1 | {
2 | "originHash" : "831ad63194a5262b2549d58e383a520f9cbbc80b4a75660fbbcc56d65edfdab4",
3 | "pins" : [
4 | {
5 | "identity" : "swift-argument-parser",
6 | "kind" : "remoteSourceControl",
7 | "location" : "https://github.com/apple/swift-argument-parser.git",
8 | "state" : {
9 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41",
10 | "version" : "1.3.0"
11 | }
12 | },
13 | {
14 | "identity" : "swift-transformers",
15 | "kind" : "remoteSourceControl",
16 | "location" : "https://github.com/huggingface/swift-transformers.git",
17 | "state" : {
18 | "revision" : "fc6543263e4caed9bf6107466d625cfae9357f08",
19 | "version" : "0.1.8"
20 | }
21 | }
22 | ],
23 | "version" : 3
24 | }
25 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX.xcodeproj/xcshareddata/xcschemes/WhisperAX.xcscheme:
--------------------------------------------------------------------------------
1 |
2 |
5 |
8 |
9 |
15 |
21 |
22 |
23 |
24 |
25 |
31 |
32 |
35 |
41 |
42 |
43 |
46 |
52 |
53 |
54 |
55 |
56 |
66 |
68 |
74 |
75 |
76 |
77 |
81 |
82 |
86 |
87 |
88 |
89 |
95 |
97 |
103 |
104 |
105 |
106 |
108 |
109 |
112 |
113 |
114 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Info.plist:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | NSPrivacyAccessedAPITypes
6 |
7 | NSPrivacyAccessedAPIType
8 | NSPrivacyAccessedAPICategoryUserDefaults
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Preview Content/Preview Assets.xcassets/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "info" : {
3 | "author" : "xcode",
4 | "version" : 1
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/100.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/102.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/102.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 2.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/108.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/108.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/114.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/114.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/136.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/136.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/152.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/152.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/16.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/167.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/167.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/172.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/172.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/180.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/180.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/192.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/192.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/196.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/196.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/216.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/216.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/234.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/234.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/256.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/258.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/258.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/32.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/40.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/40.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/44.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/44.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/48.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/512.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/55.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/55.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/66.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/66.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/76.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/76.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87 1.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/88.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/88.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/92.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/92.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "images" : [
3 | {
4 | "filename" : "40.png",
5 | "idiom" : "universal",
6 | "platform" : "ios",
7 | "scale" : "2x",
8 | "size" : "20x20"
9 | },
10 | {
11 | "filename" : "60.png",
12 | "idiom" : "universal",
13 | "platform" : "ios",
14 | "scale" : "3x",
15 | "size" : "20x20"
16 | },
17 | {
18 | "filename" : "58 1.png",
19 | "idiom" : "universal",
20 | "platform" : "ios",
21 | "scale" : "2x",
22 | "size" : "29x29"
23 | },
24 | {
25 | "filename" : "87 1.png",
26 | "idiom" : "universal",
27 | "platform" : "ios",
28 | "scale" : "3x",
29 | "size" : "29x29"
30 | },
31 | {
32 | "filename" : "76.png",
33 | "idiom" : "universal",
34 | "platform" : "ios",
35 | "scale" : "2x",
36 | "size" : "38x38"
37 | },
38 | {
39 | "filename" : "114.png",
40 | "idiom" : "universal",
41 | "platform" : "ios",
42 | "scale" : "3x",
43 | "size" : "38x38"
44 | },
45 | {
46 | "filename" : "80 1.png",
47 | "idiom" : "universal",
48 | "platform" : "ios",
49 | "scale" : "2x",
50 | "size" : "40x40"
51 | },
52 | {
53 | "filename" : "120.png",
54 | "idiom" : "universal",
55 | "platform" : "ios",
56 | "scale" : "3x",
57 | "size" : "40x40"
58 | },
59 | {
60 | "filename" : "120 1.png",
61 | "idiom" : "universal",
62 | "platform" : "ios",
63 | "scale" : "2x",
64 | "size" : "60x60"
65 | },
66 | {
67 | "filename" : "180.png",
68 | "idiom" : "universal",
69 | "platform" : "ios",
70 | "scale" : "3x",
71 | "size" : "60x60"
72 | },
73 | {
74 | "filename" : "128 1.png",
75 | "idiom" : "universal",
76 | "platform" : "ios",
77 | "scale" : "2x",
78 | "size" : "64x64"
79 | },
80 | {
81 | "filename" : "192.png",
82 | "idiom" : "universal",
83 | "platform" : "ios",
84 | "scale" : "3x",
85 | "size" : "64x64"
86 | },
87 | {
88 | "filename" : "136.png",
89 | "idiom" : "universal",
90 | "platform" : "ios",
91 | "scale" : "2x",
92 | "size" : "68x68"
93 | },
94 | {
95 | "filename" : "152.png",
96 | "idiom" : "universal",
97 | "platform" : "ios",
98 | "scale" : "2x",
99 | "size" : "76x76"
100 | },
101 | {
102 | "filename" : "167.png",
103 | "idiom" : "universal",
104 | "platform" : "ios",
105 | "scale" : "2x",
106 | "size" : "83.5x83.5"
107 | },
108 | {
109 | "filename" : "1024 1.png",
110 | "idiom" : "universal",
111 | "platform" : "ios",
112 | "size" : "1024x1024"
113 | },
114 | {
115 | "filename" : "16.png",
116 | "idiom" : "mac",
117 | "scale" : "1x",
118 | "size" : "16x16"
119 | },
120 | {
121 | "filename" : "32.png",
122 | "idiom" : "mac",
123 | "scale" : "2x",
124 | "size" : "16x16"
125 | },
126 | {
127 | "filename" : "32.png",
128 | "idiom" : "mac",
129 | "scale" : "1x",
130 | "size" : "32x32"
131 | },
132 | {
133 | "filename" : "64.png",
134 | "idiom" : "mac",
135 | "scale" : "2x",
136 | "size" : "32x32"
137 | },
138 | {
139 | "filename" : "128.png",
140 | "idiom" : "mac",
141 | "scale" : "1x",
142 | "size" : "128x128"
143 | },
144 | {
145 | "filename" : "256.png",
146 | "idiom" : "mac",
147 | "scale" : "2x",
148 | "size" : "128x128"
149 | },
150 | {
151 | "filename" : "256.png",
152 | "idiom" : "mac",
153 | "scale" : "1x",
154 | "size" : "256x256"
155 | },
156 | {
157 | "filename" : "512.png",
158 | "idiom" : "mac",
159 | "scale" : "2x",
160 | "size" : "256x256"
161 | },
162 | {
163 | "filename" : "512.png",
164 | "idiom" : "mac",
165 | "scale" : "1x",
166 | "size" : "512x512"
167 | },
168 | {
169 | "filename" : "1024.png",
170 | "idiom" : "mac",
171 | "scale" : "2x",
172 | "size" : "512x512"
173 | },
174 | {
175 | "filename" : "44.png",
176 | "idiom" : "universal",
177 | "platform" : "watchos",
178 | "scale" : "2x",
179 | "size" : "22x22"
180 | },
181 | {
182 | "filename" : "48.png",
183 | "idiom" : "universal",
184 | "platform" : "watchos",
185 | "scale" : "2x",
186 | "size" : "24x24"
187 | },
188 | {
189 | "filename" : "55.png",
190 | "idiom" : "universal",
191 | "platform" : "watchos",
192 | "scale" : "2x",
193 | "size" : "27.5x27.5"
194 | },
195 | {
196 | "filename" : "58.png",
197 | "idiom" : "universal",
198 | "platform" : "watchos",
199 | "scale" : "2x",
200 | "size" : "29x29"
201 | },
202 | {
203 | "filename" : "60 1.png",
204 | "idiom" : "universal",
205 | "platform" : "watchos",
206 | "scale" : "2x",
207 | "size" : "30x30"
208 | },
209 | {
210 | "filename" : "64 1.png",
211 | "idiom" : "universal",
212 | "platform" : "watchos",
213 | "scale" : "2x",
214 | "size" : "32x32"
215 | },
216 | {
217 | "filename" : "66.png",
218 | "idiom" : "universal",
219 | "platform" : "watchos",
220 | "scale" : "2x",
221 | "size" : "33x33"
222 | },
223 | {
224 | "filename" : "80.png",
225 | "idiom" : "universal",
226 | "platform" : "watchos",
227 | "scale" : "2x",
228 | "size" : "40x40"
229 | },
230 | {
231 | "filename" : "87.png",
232 | "idiom" : "universal",
233 | "platform" : "watchos",
234 | "scale" : "2x",
235 | "size" : "43.5x43.5"
236 | },
237 | {
238 | "filename" : "88.png",
239 | "idiom" : "universal",
240 | "platform" : "watchos",
241 | "scale" : "2x",
242 | "size" : "44x44"
243 | },
244 | {
245 | "filename" : "92.png",
246 | "idiom" : "universal",
247 | "platform" : "watchos",
248 | "scale" : "2x",
249 | "size" : "46x46"
250 | },
251 | {
252 | "filename" : "100.png",
253 | "idiom" : "universal",
254 | "platform" : "watchos",
255 | "scale" : "2x",
256 | "size" : "50x50"
257 | },
258 | {
259 | "filename" : "102.png",
260 | "idiom" : "universal",
261 | "platform" : "watchos",
262 | "scale" : "2x",
263 | "size" : "51x51"
264 | },
265 | {
266 | "filename" : "108.png",
267 | "idiom" : "universal",
268 | "platform" : "watchos",
269 | "scale" : "2x",
270 | "size" : "54x54"
271 | },
272 | {
273 | "filename" : "172.png",
274 | "idiom" : "universal",
275 | "platform" : "watchos",
276 | "scale" : "2x",
277 | "size" : "86x86"
278 | },
279 | {
280 | "filename" : "196.png",
281 | "idiom" : "universal",
282 | "platform" : "watchos",
283 | "scale" : "2x",
284 | "size" : "98x98"
285 | },
286 | {
287 | "filename" : "216.png",
288 | "idiom" : "universal",
289 | "platform" : "watchos",
290 | "scale" : "2x",
291 | "size" : "108x108"
292 | },
293 | {
294 | "filename" : "234.png",
295 | "idiom" : "universal",
296 | "platform" : "watchos",
297 | "scale" : "2x",
298 | "size" : "117x117"
299 | },
300 | {
301 | "filename" : "258.png",
302 | "idiom" : "universal",
303 | "platform" : "watchos",
304 | "scale" : "2x",
305 | "size" : "129x129"
306 | },
307 | {
308 | "filename" : "1024 2.png",
309 | "idiom" : "universal",
310 | "platform" : "watchos",
311 | "size" : "1024x1024"
312 | }
313 | ],
314 | "info" : {
315 | "author" : "xcode",
316 | "version" : 1
317 | }
318 | }
319 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "info" : {
3 | "author" : "xcode",
4 | "version" : 1
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/Info.plist:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/Resources/WhisperAX.entitlements:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | com.apple.developer.kernel.increased-memory-limit
6 |
7 | com.apple.security.app-sandbox
8 |
9 | com.apple.security.device.audio-input
10 |
11 | com.apple.security.files.downloads.read-only
12 |
13 | com.apple.security.files.user-selected.read-write
14 |
15 | com.apple.security.network.client
16 |
17 | com.apple.security.network.server
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAX/WhisperAXApp.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import SwiftUI
5 |
6 | @main
7 | struct WhisperAXApp: App {
8 | var body: some Scene {
9 | WindowGroup {
10 | ContentView()
11 | #if os(macOS)
12 | .frame(minWidth: 1000, minHeight: 700)
13 | #endif
14 | }
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXTests/WhisperAXTests.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import XCTest
5 |
6 | final class WhisperAXTests: XCTestCase {
7 | override func setUpWithError() throws {
8 | // Put setup code here. This method is called before the invocation of each test method in the class.
9 | }
10 |
11 | override func tearDownWithError() throws {
12 | // Put teardown code here. This method is called after the invocation of each test method in the class.
13 | }
14 |
15 | func testExample() throws {
16 | // This is an example of a functional test case.
17 | // Use XCTAssert and related functions to verify your tests produce the correct results.
18 | // Any test you write for XCTest can be annotated as throws and async.
19 | // Mark your test throws to produce an unexpected failure when your test encounters an uncaught error.
20 | // Mark your test async to allow awaiting for asynchronous code to complete. Check the results with assertions afterwards.
21 | }
22 |
23 | func testPerformanceExample() throws {
24 | // This is an example of a performance test case.
25 | measure {
26 | // Put the code you want to measure the time of here.
27 | }
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXTests/WhisperKitTests:
--------------------------------------------------------------------------------
1 | ../../../Tests/WhisperKitTests
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXUITests/WhisperAXUITests.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import XCTest
5 |
6 | final class WhisperAXUITests: XCTestCase {
7 | override func setUpWithError() throws {
8 | // Put setup code here. This method is called before the invocation of each test method in the class.
9 |
10 | // In UI tests it is usually best to stop immediately when a failure occurs.
11 | continueAfterFailure = false
12 |
13 | // In UI tests it’s important to set the initial state - such as interface orientation - required for your tests before they run. The setUp method is a good place to do this.
14 | }
15 |
16 | override func tearDownWithError() throws {
17 | // Put teardown code here. This method is called after the invocation of each test method in the class.
18 | }
19 |
20 | func testExample() throws {
21 | // UI tests must launch the application that they test.
22 | let app = XCUIApplication()
23 | app.launch()
24 |
25 | // Use XCTAssert and related functions to verify your tests produce the correct results.
26 | }
27 |
28 | func testLaunchPerformance() throws {
29 | if #available(macOS 10.15, iOS 13.0, tvOS 13.0, watchOS 7.0, *) {
30 | // This measures how long it takes to launch your application.
31 | measure(metrics: [XCTApplicationLaunchMetric()]) {
32 | XCUIApplication().launch()
33 | }
34 | }
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXUITests/WhisperAXUITestsLaunchTests.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import XCTest
5 |
6 | final class WhisperAXUITestsLaunchTests: XCTestCase {
7 | override class var runsForEachTargetApplicationUIConfiguration: Bool {
8 | true
9 | }
10 |
11 | override func setUpWithError() throws {
12 | continueAfterFailure = false
13 | }
14 |
15 | func testLaunch() throws {
16 | let app = XCUIApplication()
17 | app.launch()
18 |
19 | // Insert steps here to perform after app launch but before taking a screenshot,
20 | // such as logging into a test account or navigating somewhere in the app
21 |
22 | let attachment = XCTAttachment(screenshot: app.screenshot())
23 | attachment.name = "Launch Screen"
24 | attachment.lifetime = .keepAlways
25 | add(attachment)
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AccentColor.colorset/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "colors" : [
3 | {
4 | "idiom" : "universal"
5 | }
6 | ],
7 | "info" : {
8 | "author" : "xcode",
9 | "version" : 1
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AppIcon.appiconset/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "images" : [
3 | {
4 | "filename" : "appstore.png",
5 | "idiom" : "universal",
6 | "platform" : "watchos",
7 | "size" : "1024x1024"
8 | }
9 | ],
10 | "info" : {
11 | "author" : "xcode",
12 | "version" : 1
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AppIcon.appiconset/appstore.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AppIcon.appiconset/appstore.png
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "info" : {
3 | "author" : "xcode",
4 | "version" : 1
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/Preview Content/Preview Assets.xcassets/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "info" : {
3 | "author" : "xcode",
4 | "version" : 1
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchApp/WhisperAXWatchApp.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import SwiftUI
5 |
6 | @main
7 | struct WhisperAXWatchApp: App {
8 | var body: some Scene {
9 | WindowGroup {
10 | WhisperAXWatchView()
11 | }
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchAppTests/WhisperAX_Watch_AppTests.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | @testable import WhisperAX_Watch_App
5 | import XCTest
6 |
7 | final class WhisperAX_Watch_AppTests: XCTestCase {
8 | override func setUpWithError() throws {
9 | // Put setup code here. This method is called before the invocation of each test method in the class.
10 | }
11 |
12 | override func tearDownWithError() throws {
13 | // Put teardown code here. This method is called after the invocation of each test method in the class.
14 | }
15 |
16 | func testExample() throws {
17 | // This is an example of a functional test case.
18 | // Use XCTAssert and related functions to verify your tests produce the correct results.
19 | // Any test you write for XCTest can be annotated as throws and async.
20 | // Mark your test throws to produce an unexpected failure when your test encounters an uncaught error.
21 | // Tests marked async will run the test method on an arbitrary thread managed by the Swift runtime.
22 | }
23 |
24 | func testPerformanceExample() throws {
25 | // This is an example of a performance test case.
26 | self.measure {
27 | // Put the code you want to measure the time of here.
28 | }
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchAppUITests/WhisperAX_Watch_AppUITests.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import XCTest
5 |
6 | final class WhisperAX_Watch_AppUITests: XCTestCase {
7 | override func setUpWithError() throws {
8 | // Put setup code here. This method is called before the invocation of each test method in the class.
9 |
10 | // In UI tests it is usually best to stop immediately when a failure occurs.
11 | continueAfterFailure = false
12 |
13 | // In UI tests it’s important to set the initial state - such as interface orientation - required for your tests before they run. The setUp method is a good place to do this.
14 | }
15 |
16 | override func tearDownWithError() throws {
17 | // Put teardown code here. This method is called after the invocation of each test method in the class.
18 | }
19 |
20 | func testExample() throws {
21 | // UI tests must launch the application that they test.
22 | let app = XCUIApplication()
23 | app.launch()
24 |
25 | // Use XCTAssert and related functions to verify your tests produce the correct results.
26 | }
27 |
28 | func testLaunchPerformance() throws {
29 | if #available(macOS 10.15, iOS 13.0, tvOS 13.0, watchOS 7.0, *) {
30 | // This measures how long it takes to launch your application.
31 | measure(metrics: [XCTApplicationLaunchMetric()]) {
32 | XCUIApplication().launch()
33 | }
34 | }
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/Examples/WhisperAX/WhisperAXWatchAppUITests/WhisperAX_Watch_AppUITestsLaunchTests.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import XCTest
5 |
6 | final class WhisperAX_Watch_AppUITestsLaunchTests: XCTestCase {
7 | override class var runsForEachTargetApplicationUIConfiguration: Bool {
8 | true
9 | }
10 |
11 | override func setUpWithError() throws {
12 | continueAfterFailure = false
13 | }
14 |
15 | func testLaunch() throws {
16 | let app = XCUIApplication()
17 | app.launch()
18 |
19 | // Insert steps here to perform after app launch but before taking a screenshot,
20 | // such as logging into a test account or navigating somewhere in the app
21 |
22 | let attachment = XCTAttachment(screenshot: app.screenshot())
23 | attachment.name = "Launch Screen"
24 | attachment.lifetime = .keepAlways
25 | add(attachment)
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 argmax, inc.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: setup setup-huggingface-cli setup-model-repo download-models download-model build build-cli test clean-package-caches list-devices benchmark-connected-devices benchmark-device benchmark-devices extract-xcresult
2 |
3 | PIP_COMMAND := pip3
4 | PYTHON_COMMAND := python3
5 |
6 | # Define model repository and directories
7 | MODEL_REPO := argmaxinc/whisperkit-coreml
8 | MODEL_REPO_DIR := ./Models/whisperkit-coreml
9 | BASE_COMPILED_DIR := ./Models
10 |
11 | GIT_HASH := $(shell git rev-parse --short HEAD)
12 |
13 | setup:
14 | @echo "Setting up environment..."
15 | @which $(PIP_COMMAND)
16 | @which $(PYTHON_COMMAND)
17 | @echo "Checking for Homebrew..."
18 | @which brew > /dev/null || (echo "Error: Homebrew is not installed. Install it from https://brew.sh and try again" && exit 1)
19 | @echo "Homebrew is installed."
20 | @echo "Checking for huggingface-cli..."
21 | @which huggingface-cli > /dev/null || (echo "Installing huggingface-cli..." && brew install huggingface-cli)
22 | @echo "huggingface-cli is installed."
23 | @echo "Checking for git-lfs..."
24 | @which git-lfs > /dev/null || (echo "Installing git-lfs..." && brew install git-lfs)
25 | @echo "git-lfs is installed."
26 | @echo "Checking for trash..."
27 | @which trash > /dev/null || (echo "Installing trash..." && brew install trash)
28 | @echo "trash is installed."
29 | 	@echo "Checking for fastlane..."
30 | @which fastlane > /dev/null || (echo "Installing fastlane..." && brew install fastlane)
31 | @echo "fastlane is installed."
32 | @$(MAKE) generate-whisperax-xcconfig
33 | @echo "Done 🚀"
34 |
35 |
36 | generate-whisperax-xcconfig:
37 | @echo "Updating DEVELOPMENT_TEAM in Examples/WhisperAX/Debug.xcconfig..."
38 | @TEAM_ID=$$(defaults read com.apple.dt.Xcode IDEProvisioningTeams | plutil -convert json -r -o - -- - | jq -r 'to_entries[0].value | sort_by(.teamType == "Individual") | .[0].teamID' 2>/dev/null); \
39 | if [ -z "$$TEAM_ID" ]; then \
40 | echo "Error: No Development Team ID found. Please log into Xcode with your Apple ID and select a team."; \
41 | else \
42 | echo "DEVELOPMENT_TEAM=$$TEAM_ID" > Examples/WhisperAX/Debug.xcconfig; \
43 | echo "DEVELOPMENT_TEAM has been updated in Examples/WhisperAX/Debug.xcconfig with your Development Team ID: $$TEAM_ID"; \
44 | fi
45 |
46 |
47 | setup-huggingface-cli:
48 | @if huggingface-cli whoami; then \
49 | echo "Already logged in to Hugging Face."; \
50 | else \
51 | echo "Not logged in to Hugging Face."; \
52 | if [ -z "$$HF_TOKEN" ]; then \
53 | echo "Environment variable HF_TOKEN is not set. Running normal login."; \
54 | huggingface-cli login; \
55 | else \
56 | echo "Using HF_TOKEN from environment variable."; \
57 | huggingface-cli login --token $$HF_TOKEN; \
58 | fi; \
59 | fi
60 |
61 |
62 | setup-model-repo:
63 | @echo "Setting up repository..."
64 | @mkdir -p $(BASE_COMPILED_DIR)
65 | @if [ -d "$(MODEL_REPO_DIR)/.git" ]; then \
66 | echo "Repository exists, resetting..."; \
67 | export GIT_LFS_SKIP_SMUDGE=1; \
68 | cd $(MODEL_REPO_DIR) && git fetch --all && git reset --hard origin/main && git clean -fdx; \
69 | else \
70 | echo "Repository not found, initializing..."; \
71 | export GIT_LFS_SKIP_SMUDGE=1; \
72 | git clone https://huggingface.co/$(MODEL_REPO) $(MODEL_REPO_DIR); \
73 | fi
74 |
75 |
76 | # Download all models
77 | download-models: setup-model-repo
78 | @echo "Downloading all models..."
79 | @cd $(MODEL_REPO_DIR) && \
80 | git lfs pull
81 |
82 |
83 | # Download a specific model
84 | download-model:
85 | @if [ -z "$(MODEL)" ]; then \
86 | echo "Error: MODEL is not set. Usage: make download-model MODEL=base"; \
87 | exit 1; \
88 | fi
89 | @echo "Downloading model $(MODEL)..."
90 | @$(MAKE) setup-model-repo
91 | @echo "Fetching model $(MODEL)..."
92 | @cd $(MODEL_REPO_DIR) && \
93 | git lfs pull --include="openai_whisper-$(MODEL)/*"
94 |
95 | build:
96 | @echo "Building WhisperKit..."
97 | @swift build -v
98 |
99 |
100 | build-cli:
101 | @echo "Building WhisperKit CLI..."
102 | @swift build -c release --product whisperkit-cli
103 |
104 |
105 | test:
106 | @echo "Running tests..."
107 | @swift test -v
108 |
109 |
110 | list-devices:
111 | fastlane ios list_devices
112 |
113 |
114 | # Usage:
115 | # make benchmark-devices # Benchmark all connected devices
116 | # make benchmark-devices DEBUG=true # Benchmark all connected devices with small test matrix
117 | # make benchmark-devices DEVICES="iPhone 15 Pro Max,My Mac" # Benchmark specific device names from `make list-devices`
118 | DEVICES ?=
119 | DEBUG ?= false
120 | benchmark-devices: generate-whisperax-xcconfig
121 | @if [ -n "$(DEVICES)" ]; then \
122 | echo "Benchmarking specific devices: $(DEVICES)"; \
123 | fastlane benchmark devices:"$(DEVICES)" debug:$(DEBUG); \
124 | else \
125 | echo "Benchmarking all connected devices"; \
126 | fastlane benchmark debug:$(DEBUG); \
127 | fi
128 |
129 | upload-benchmark-results:
130 | @echo "Uploading benchmark results..."
131 | @fastlane upload_results
132 |
133 | clean-package-caches:
134 | @trash ~/Library/Developer/Xcode/DerivedData/WhisperKit* || true
135 | @swift package purge-cache
136 | @swift package reset
--------------------------------------------------------------------------------
/Package.resolved:
--------------------------------------------------------------------------------
1 | {
2 | "pins" : [
3 | {
4 | "identity" : "swift-argument-parser",
5 | "kind" : "remoteSourceControl",
6 | "location" : "https://github.com/apple/swift-argument-parser.git",
7 | "state" : {
8 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41",
9 | "version" : "1.3.0"
10 | }
11 | },
12 | {
13 | "identity" : "swift-transformers",
14 | "kind" : "remoteSourceControl",
15 | "location" : "https://github.com/huggingface/swift-transformers.git",
16 | "state" : {
17 | "revision" : "fc6543263e4caed9bf6107466d625cfae9357f08",
18 | "version" : "0.1.8"
19 | }
20 | }
21 | ],
22 | "version" : 2
23 | }
24 |
--------------------------------------------------------------------------------
/Package.swift:
--------------------------------------------------------------------------------
1 | // swift-tools-version: 5.9
2 | // The swift-tools-version declares the minimum version of Swift required to build this package.
3 |
4 | import PackageDescription
5 |
6 | let package = Package(
7 | name: "whisperkit",
8 | platforms: [
9 | .iOS(.v16),
10 | .macOS(.v13),
11 | ],
12 | products: [
13 | .library(
14 | name: "WhisperKit",
15 | targets: ["WhisperKit"]
16 | ),
17 | .executable(
18 | name: "whisperkit-cli",
19 | targets: ["WhisperKitCLI"]
20 | ),
21 | ],
22 | dependencies: [
23 | .package(url: "https://github.com/huggingface/swift-transformers.git", .upToNextMinor(from: "0.1.8")),
24 | .package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.3.0"),
25 | ],
26 | targets: [
27 | .target(
28 | name: "WhisperKit",
29 | dependencies: [
30 | .product(name: "Transformers", package: "swift-transformers"),
31 | ]
32 | ),
33 | .executableTarget(
34 | name: "WhisperKitCLI",
35 | dependencies: [
36 | "WhisperKit",
37 | .product(name: "ArgumentParser", package: "swift-argument-parser"),
38 | ]
39 | ),
40 | .testTarget(
41 | name: "WhisperKitTests",
42 | dependencies: [
43 | "WhisperKit",
44 | .product(name: "Transformers", package: "swift-transformers"),
45 | ],
46 | path: ".",
47 | exclude: [
48 | "Examples",
49 | "Sources",
50 | "Makefile",
51 | "README.md",
52 | "LICENSE",
53 | "CONTRIBUTING.md",
54 | ],
55 | resources: [
56 | .process("Tests/WhisperKitTests/Resources"),
57 | .copy("Models/whisperkit-coreml"),
58 | ]
59 | ),
60 | ]
61 | )
62 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | # WhisperKit
13 |
14 | [](https://github.com/argmaxinc/whisperkit/actions/workflows/pre-release-tests.yml)
15 | [](LICENSE.md)
16 | [](https://swiftpackageindex.com/argmaxinc/WhisperKit) [](https://swiftpackageindex.com/argmaxinc/WhisperKit)
17 | [](https://discord.gg/G5F5GZGecC)
18 |
19 |
20 |
21 |
22 | WhisperKit is an [Argmax](https://www.takeargmax.com) framework for deploying state-of-the-art speech-to-text systems (e.g. [Whisper](https://github.com/openai/whisper)) on device with advanced features such as real-time streaming, word timestamps, voice activity detection, and more.
23 |
24 | [[TestFlight Demo App]](https://testflight.apple.com/join/LPVOyJZW) [[Python Tools]](https://github.com/argmaxinc/whisperkittools) [[Benchmarks & Device Support]](https://huggingface.co/spaces/argmaxinc/whisperkit-benchmarks) [[WhisperKit Android]](https://github.com/argmaxinc/WhisperKitAndroid)
25 |
26 | > [!IMPORTANT]
27 | > If you are looking for more features such as speaker diarization and upgraded performance, check out [WhisperKit Pro](https://huggingface.co/argmaxinc/whisperkit-pro) and [SpeakerKit Pro](https://huggingface.co/argmaxinc/speakerkit-pro)! For commercial use or evaluation, please reach out to [whisperkitpro@argmaxinc.com](mailto:whisperkitpro@argmaxinc.com).
28 |
29 | ## Table of Contents
30 |
31 | - [Installation](#installation)
32 | - [Swift Package Manager](#swift-package-manager)
33 | - [Prerequisites](#prerequisites)
34 | - [Xcode Steps](#xcode-steps)
35 | - [Package.swift](#packageswift)
36 | - [Homebrew](#homebrew)
37 | - [Getting Started](#getting-started)
38 | - [Quick Example](#quick-example)
39 | - [Model Selection](#model-selection)
40 | - [Generating Models](#generating-models)
41 | - [Swift CLI](#swift-cli)
42 | - [Contributing \& Roadmap](#contributing--roadmap)
43 | - [License](#license)
44 | - [Citation](#citation)
45 |
46 | ## Installation
47 |
48 | ### Swift Package Manager
49 |
50 | WhisperKit can be integrated into your Swift project using the Swift Package Manager.
51 |
52 | ### Prerequisites
53 |
54 | - macOS 14.0 or later.
55 | - Xcode 15.0 or later.
56 |
57 | ### Xcode Steps
58 |
59 | 1. Open your Swift project in Xcode.
60 | 2. Navigate to `File` > `Add Package Dependencies...`.
61 | 3. Enter the package repository URL: `https://github.com/argmaxinc/whisperkit`.
62 | 4. Choose the version range or specific version.
63 | 5. Click `Finish` to add WhisperKit to your project.
64 |
65 | ### Package.swift
66 |
67 | If you're using WhisperKit as part of a Swift package, you can include it in your Package.swift dependencies as follows:
68 |
69 | ```swift
70 | dependencies: [
71 | .package(url: "https://github.com/argmaxinc/WhisperKit.git", from: "0.9.0"),
72 | ],
73 | ```
74 |
75 | Then add `WhisperKit` as a dependency for your target:
76 |
77 | ```swift
78 | .target(
79 | name: "YourApp",
80 | dependencies: ["WhisperKit"]
81 | ),
82 | ```
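Putting the two snippets together, a minimal `Package.swift` for an app target might look like this (package and target names are placeholders; the platform versions mirror WhisperKit's own minimums):

```swift
// swift-tools-version: 5.9
import PackageDescription

let package = Package(
    name: "YourApp", // placeholder name
    platforms: [
        .iOS(.v16),   // WhisperKit's minimum iOS version
        .macOS(.v13), // WhisperKit's minimum macOS version
    ],
    dependencies: [
        .package(url: "https://github.com/argmaxinc/WhisperKit.git", from: "0.9.0"),
    ],
    targets: [
        .target(
            name: "YourApp",
            dependencies: ["WhisperKit"]
        ),
    ]
)
```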
83 |
84 | ### Homebrew
85 |
86 | You can install the `WhisperKit` command line app using [Homebrew](https://brew.sh) by running the following command:
87 |
88 | ```bash
89 | brew install whisperkit-cli
90 | ```
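Once installed, the binary can be invoked directly. A minimal sketch, assuming the same `transcribe` subcommand and `--audio-path` flag used by the Swift CLI section below (the audio path is a placeholder):

```bash
whisperkit-cli transcribe --audio-path path/to/your/audio.wav
```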
91 |
92 | ## Getting Started
93 |
94 | To get started with WhisperKit, you need to initialize it in your project.
95 |
96 | ### Quick Example
97 |
98 | This example demonstrates how to transcribe a local audio file:
99 |
100 | ```swift
101 | import WhisperKit
102 |
103 | // Initialize WhisperKit with default settings
104 | Task {
105 | let pipe = try? await WhisperKit()
106 | let transcription = try? await pipe!.transcribe(audioPath: "path/to/your/audio.{wav,mp3,m4a,flac}")?.text
107 | print(transcription)
108 | }
109 | ```
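If you prefer surfacing errors over optional chaining, the same calls can be wrapped in `do`/`catch`. This is a minimal sketch mirroring the snippet above; the audio path is a placeholder:

```swift
import WhisperKit

Task {
    do {
        // Initialize WhisperKit with default settings, propagating any setup errors
        let pipe = try await WhisperKit()
        if let text = try await pipe.transcribe(audioPath: "path/to/your/audio.wav")?.text {
            print(text)
        }
    } catch {
        print("Transcription failed: \(error)")
    }
}
```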
110 |
111 | ### Model Selection
112 |
113 | If no model is specified, WhisperKit automatically downloads the recommended model for the device. You can also select a specific model by passing in the model name:
114 |
115 | ```swift
116 | let pipe = try? await WhisperKit(WhisperKitConfig(model: "large-v3"))
117 | ```
118 |
119 | This method also supports glob search, so you can use wildcards to select a model:
120 |
121 | ```swift
122 | let pipe = try? await WhisperKit(WhisperKitConfig(model: "distil*large-v3"))
123 | ```
124 |
125 | Note that the model search must return a single model from the source repo, otherwise an error will be thrown.
126 |
127 | For a list of available models, see our [HuggingFace repo](https://huggingface.co/argmaxinc/whisperkit-coreml).
128 |
129 | ### Generating Models
130 |
131 | WhisperKit also comes with the supporting repo [`whisperkittools`](https://github.com/argmaxinc/whisperkittools), which lets you create and deploy your own fine-tuned versions of Whisper in CoreML format to HuggingFace. Once generated, they can be loaded by simply changing the repo name to the one used to upload the model:
132 |
133 | ```swift
134 | let config = WhisperKitConfig(model: "large-v3", modelRepo: "username/your-model-repo")
135 | let pipe = try? await WhisperKit(config)
136 | ```
137 |
138 | ### Swift CLI
139 |
140 | The Swift CLI allows for quick testing and debugging outside of an Xcode project. To set it up, first clone the repository:
141 |
142 | ```bash
143 | git clone https://github.com/argmaxinc/whisperkit.git
144 | cd whisperkit
145 | ```
146 |
147 | Then, set up the environment and download your desired model.
148 |
149 | ```bash
150 | make setup
151 | make download-model MODEL=large-v3
152 | ```
153 |
154 | **Note**:
155 |
156 | 1. This will download only the model specified by `MODEL` (see what's available in our [HuggingFace repo](https://huggingface.co/argmaxinc/whisperkit-coreml), where we use the prefix `openai_whisper-{MODEL}`)
157 | 2. Before running `download-model`, make sure [git-lfs](https://git-lfs.com) is installed
158 |
159 | If you would like to download all available models to your local folder, use this command instead:
160 |
161 | ```bash
162 | make download-models
163 | ```
164 |
165 | You can then run them via the CLI with:
166 |
167 | ```bash
168 | swift run whisperkit-cli transcribe --model-path "Models/whisperkit-coreml/openai_whisper-large-v3" --audio-path "path/to/your/audio.{wav,mp3,m4a,flac}"
169 | ```
170 |
171 | This should print a transcription of the audio file. If you would like to stream the audio directly from a microphone, use:
172 |
173 | ```bash
174 | swift run whisperkit-cli transcribe --model-path "Models/whisperkit-coreml/openai_whisper-large-v3" --stream
175 | ```
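The CLI is built on `swift-argument-parser` (see `Package.swift`), so the full set of options for any subcommand can be listed with the auto-generated `--help` flag:

```bash
swift run whisperkit-cli transcribe --help
```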
176 |
177 | ## Contributing & Roadmap
178 |
179 | Our goal is to make WhisperKit better and better over time and we'd love your help! Just search the code for "TODO" for a variety of features that are yet to be built. Please refer to our [contribution guidelines](CONTRIBUTING.md) for submitting issues, pull requests, and coding standards; they also include a public roadmap of features we are looking forward to building.
180 |
181 | ## License
182 |
183 | WhisperKit is released under the MIT License. See [LICENSE](LICENSE) for more details.
184 |
185 | ## Citation
186 |
187 | If you use WhisperKit for something cool or just find it useful, please drop us a note at [info@argmaxinc.com](mailto:info@argmaxinc.com)!
188 |
189 | If you use WhisperKit for academic work, here is the BibTeX:
190 |
191 | ```bibtex
192 | @misc{whisperkit-argmax,
193 | title = {WhisperKit},
194 | author = {Argmax, Inc.},
195 | year = {2024},
196 | URL = {https://github.com/argmaxinc/WhisperKit}
197 | }
198 | ```
199 |
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/Audio/AudioChunker.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Accelerate
5 | import AVFoundation
6 | import Foundation
7 |
8 | /// Responsible for chunking audio into smaller pieces
9 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
10 | public protocol AudioChunking {
11 | func chunkAll(audioArray: [Float], maxChunkLength: Int, decodeOptions: DecodingOptions?) async throws -> [AudioChunk]
12 | }
13 |
14 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
15 | public extension AudioChunking {
16 | func updateSeekOffsetsForResults(
17 | chunkedResults: [Result<[TranscriptionResult], Swift.Error>],
18 | audioChunks: [AudioChunk]
19 | ) -> [TranscriptionResult] {
20 | var updatedTranscriptionResults = [TranscriptionResult]()
21 | for (index, chunkedResult) in chunkedResults.enumerated() {
22 | switch chunkedResult {
23 | case let .success(results):
24 | let seekTime = Float(audioChunks[index].seekOffsetIndex) / Float(WhisperKit.sampleRate)
25 | for result in results {
26 | var updatedSegments = [TranscriptionSegment]()
27 | for segment in result.segments {
28 | let updatedSegment = updateSegmentTimings(segment: segment, seekTime: seekTime)
29 | updatedSegments.append(updatedSegment)
30 | }
31 | var updatedResult = result
32 | updatedResult.seekTime = seekTime
33 | updatedResult.segments = updatedSegments
34 | updatedTranscriptionResults.append(updatedResult)
35 | }
36 | case let .failure(error):
37 | Logging.debug("Error transcribing chunk \(index): \(error)")
38 | }
39 | }
40 | return updatedTranscriptionResults
41 | }
42 | }
43 |
44 | /// An audio chunker that splits audio into smaller pieces based on voice activity detection
45 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
46 | open class VADAudioChunker: AudioChunking {
47 | /// Prevents hallucinations at the end of the clip by stopping up to 1.0s early
48 | private let windowPadding: Int
49 | private let vad: VoiceActivityDetector
50 |
51 | public init(windowPadding: Int = 16000, vad: VoiceActivityDetector? = nil) {
52 | self.windowPadding = windowPadding
53 | self.vad = vad ?? EnergyVAD()
54 | }
55 |
56 | private func splitOnMiddleOfLongestSilence(audioArray: [Float], startIndex: Int, endIndex: Int) -> Int {
57 | // NOTE: we want to check just the 2nd part for the silence to attempt to get closest to a max length chunk
58 | let audioMidIndex = startIndex + (endIndex - startIndex) / 2
59 | let vadAudioSlice = Array(audioArray[audioMidIndex..<endIndex])
60 | let voiceActivity = vad.voiceActivity(in: vadAudioSlice)
61 | if let silence = vad.findLongestSilence(in: voiceActivity) {
62 | // If silence is found, split at the middle of the longest silent stretch
63 | let silenceMidIndex = silence.startIndex + (silence.endIndex - silence.startIndex) / 2
64 | return audioMidIndex + vad.voiceActivityIndexToAudioSampleIndex(silenceMidIndex)
65 | }
66 | return endIndex
67 | }
68 |
69 | public func chunkAll(audioArray: [Float], maxChunkLength: Int, decodeOptions: DecodingOptions?) async throws -> [AudioChunk] {
70 | // If the audio array length is less than or equal to maxLength, return it as a single chunk
71 | if audioArray.count <= maxChunkLength {
72 | return [AudioChunk(seekOffsetIndex: 0, audioSamples: audioArray)]
73 | }
74 |
75 | // First create chunks from seek clips
76 | let seekClips = prepareSeekClips(contentFrames: audioArray.count, decodeOptions: decodeOptions)
77 |
78 | var chunkedAudio = [AudioChunk]()
79 | for (seekClipStart, seekClipEnd) in seekClips {
80 | // Loop through the current clip until we reach the end
81 | // Typically this will be the full audio file, unless seek points are explicitly provided
82 | var startIndex = seekClipStart
83 | while startIndex < seekClipEnd - windowPadding {
84 | guard startIndex >= 0 && startIndex < audioArray.count else {
85 | throw WhisperError.audioProcessingFailed("startIndex is outside the buffer size")
86 | }
87 |
88 | // Make sure we still need chunking for this seek clip, otherwise use the original seek clip end
89 | var endIndex = seekClipEnd
90 | if startIndex + maxChunkLength < endIndex {
91 | // Adjust the end index based on VAD
92 | endIndex = splitOnMiddleOfLongestSilence(
93 | audioArray: audioArray,
94 | startIndex: startIndex,
95 | endIndex: min(audioArray.count, startIndex + maxChunkLength)
96 | )
97 | }
98 |
99 | guard endIndex > startIndex else {
100 | break
101 | }
102 | Logging.debug("Found chunk from \(formatTimestamp(Float(startIndex) / Float(WhisperKit.sampleRate))) to \(formatTimestamp(Float(endIndex) / Float(WhisperKit.sampleRate)))")
103 | let audioSlice = AudioChunk(seekOffsetIndex: startIndex, audioSamples: Array(audioArray[startIndex..<endIndex]))
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/Audio/AudioStreamTranscriber.swift:
--------------------------------------------------------------------------------
22 | public typealias AudioStreamTranscriberCallback = (AudioStreamTranscriber.State, AudioStreamTranscriber.State) -> Void
23 |
24 | /// Responsible for streaming audio from the microphone, processing it, and transcribing it in real-time.
25 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
26 | public actor AudioStreamTranscriber {
27 | private var state: AudioStreamTranscriber.State = .init() {
28 | didSet {
29 | stateChangeCallback?(oldValue, state)
30 | }
31 | }
32 |
33 | private let stateChangeCallback: AudioStreamTranscriberCallback?
34 |
35 | private let requiredSegmentsForConfirmation: Int
36 | private let useVAD: Bool
37 | private let silenceThreshold: Float
38 | private let compressionCheckWindow: Int
39 | private let transcribeTask: TranscribeTask
40 | private let audioProcessor: any AudioProcessing
41 | private let decodingOptions: DecodingOptions
42 |
43 | public init(
44 | audioEncoder: any AudioEncoding,
45 | featureExtractor: any FeatureExtracting,
46 | segmentSeeker: any SegmentSeeking,
47 | textDecoder: any TextDecoding,
48 | tokenizer: any WhisperTokenizer,
49 | audioProcessor: any AudioProcessing,
50 | decodingOptions: DecodingOptions,
51 | requiredSegmentsForConfirmation: Int = 2,
52 | silenceThreshold: Float = 0.3,
53 | compressionCheckWindow: Int = 60,
54 | useVAD: Bool = true,
55 | stateChangeCallback: AudioStreamTranscriberCallback?
56 | ) {
57 | self.transcribeTask = TranscribeTask(
58 | currentTimings: TranscriptionTimings(),
59 | progress: Progress(),
60 | audioEncoder: audioEncoder,
61 | featureExtractor: featureExtractor,
62 | segmentSeeker: segmentSeeker,
63 | textDecoder: textDecoder,
64 | tokenizer: tokenizer
65 | )
66 | self.audioProcessor = audioProcessor
67 | self.decodingOptions = decodingOptions
68 | self.requiredSegmentsForConfirmation = requiredSegmentsForConfirmation
69 | self.silenceThreshold = silenceThreshold
70 | self.compressionCheckWindow = compressionCheckWindow
71 | self.useVAD = useVAD
72 | self.stateChangeCallback = stateChangeCallback
73 | }
74 |
75 | public func startStreamTranscription() async throws {
76 | guard !state.isRecording else { return }
77 | guard await AudioProcessor.requestRecordPermission() else {
78 | Logging.error("Microphone access was not granted.")
79 | return
80 | }
81 | state.isRecording = true
82 | try audioProcessor.startRecordingLive { [weak self] _ in
83 | Task { [weak self] in
84 | await self?.onAudioBufferCallback()
85 | }
86 | }
87 | await realtimeLoop()
88 | Logging.info("Realtime transcription has started")
89 | }
90 |
91 | public func stopStreamTranscription() {
92 | state.isRecording = false
93 | audioProcessor.stopRecording()
94 | Logging.info("Realtime transcription has ended")
95 | }
96 |
97 | private func realtimeLoop() async {
98 | while state.isRecording {
99 | do {
100 | try await transcribeCurrentBuffer()
101 | } catch {
102 | Logging.error("Error: \(error.localizedDescription)")
103 | break
104 | }
105 | }
106 | }
107 |
108 | private func onAudioBufferCallback() {
109 | state.bufferEnergy = audioProcessor.relativeEnergy
110 | }
111 |
112 | private func onProgressCallback(_ progress: TranscriptionProgress) {
113 | let fallbacks = Int(progress.timings.totalDecodingFallbacks)
114 | if progress.text.count < state.currentText.count {
115 | if fallbacks == state.currentFallbacks {
116 | state.unconfirmedText.append(state.currentText)
117 | } else {
118 | Logging.info("Fallback occurred: \(fallbacks)")
119 | }
120 | }
121 | state.currentText = progress.text
122 | state.currentFallbacks = fallbacks
123 | }
124 |
125 | private func transcribeCurrentBuffer() async throws {
126 | // Retrieve the current audio buffer from the audio processor
127 | let currentBuffer = audioProcessor.audioSamples
128 |
129 | // Calculate the size and duration of the next buffer segment
130 | let nextBufferSize = currentBuffer.count - state.lastBufferSize
131 | let nextBufferSeconds = Float(nextBufferSize) / Float(WhisperKit.sampleRate)
132 |
133 | // Only run the transcribe if the next buffer has at least 1 second of audio
134 | guard nextBufferSeconds > 1 else {
135 | if state.currentText == "" {
136 | state.currentText = "Waiting for speech..."
137 | }
138 | return try await Task.sleep(nanoseconds: 100_000_000) // sleep for 100ms for next buffer
139 | }
140 |
141 | if useVAD {
142 | let voiceDetected = AudioProcessor.isVoiceDetected(
143 | in: audioProcessor.relativeEnergy,
144 | nextBufferInSeconds: nextBufferSeconds,
145 | silenceThreshold: silenceThreshold
146 | )
147 | // Only run the transcribe if the next buffer has voice
148 | if !voiceDetected {
149 | Logging.debug("No voice detected, skipping transcribe")
150 | if state.currentText == "" {
151 | state.currentText = "Waiting for speech..."
152 | }
153 | // Sleep for 100ms and check the next buffer
154 | return try await Task.sleep(nanoseconds: 100_000_000)
155 | }
156 | }
157 |
158 | // Run transcribe
159 | state.lastBufferSize = currentBuffer.count
160 |
161 | let transcription = try await transcribeAudioSamples(Array(currentBuffer))
162 |
163 | state.currentText = ""
164 | state.unconfirmedText = []
165 | let segments = transcription.segments
166 |
167 | // Logic for moving segments to confirmedSegments
168 | if segments.count > requiredSegmentsForConfirmation {
169 | // Calculate the number of segments to confirm
170 | let numberOfSegmentsToConfirm = segments.count - requiredSegmentsForConfirmation
171 |
172 | // Confirm the required number of segments
173 | let confirmedSegmentsArray = Array(segments.prefix(numberOfSegmentsToConfirm))
174 | let remainingSegments = Array(segments.suffix(requiredSegmentsForConfirmation))
175 |
176 | // Update lastConfirmedSegmentEnd based on the last confirmed segment
177 | if let lastConfirmedSegment = confirmedSegmentsArray.last, lastConfirmedSegment.end > state.lastConfirmedSegmentEndSeconds {
178 | state.lastConfirmedSegmentEndSeconds = lastConfirmedSegment.end
179 |
180 | // Add confirmed segments to the confirmedSegments array
181 | if !state.confirmedSegments.contains(confirmedSegmentsArray) {
182 | state.confirmedSegments.append(contentsOf: confirmedSegmentsArray)
183 | }
184 | }
185 |
186 | // Update transcriptions to reflect the remaining segments
187 | state.unconfirmedSegments = remainingSegments
188 | } else {
189 | // Handle the case where segments are fewer or equal to required
190 | state.unconfirmedSegments = segments
191 | }
192 | }
193 |
194 | private func transcribeAudioSamples(_ samples: [Float]) async throws -> TranscriptionResult {
195 | var options = decodingOptions
196 | options.clipTimestamps = [state.lastConfirmedSegmentEndSeconds]
197 | let checkWindow = compressionCheckWindow
198 | return try await transcribeTask.run(audioArray: samples, decodeOptions: options) { [weak self] progress in
199 | Task { [weak self] in
200 | await self?.onProgressCallback(progress)
201 | }
202 | return AudioStreamTranscriber.shouldStopEarly(progress: progress, options: options, compressionCheckWindow: checkWindow)
203 | }
204 | }
205 |
206 | private static func shouldStopEarly(
207 | progress: TranscriptionProgress,
208 | options: DecodingOptions,
209 | compressionCheckWindow: Int
210 | ) -> Bool? {
211 | let currentTokens = progress.tokens
212 | if currentTokens.count > compressionCheckWindow {
213 | let checkTokens: [Int] = currentTokens.suffix(compressionCheckWindow)
214 | let compressionRatio = compressionRatio(of: checkTokens)
215 | if compressionRatio > options.compressionRatioThreshold ?? 0.0 {
216 | return false
217 | }
218 | }
219 | if let avgLogprob = progress.avgLogprob, let logProbThreshold = options.logProbThreshold {
220 | if avgLogprob < logProbThreshold {
221 | return false
222 | }
223 | }
224 | return nil
225 | }
226 | }
227 |
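A minimal sketch of wiring up `AudioStreamTranscriber`: the initializer labels come from the class above, but the component values (encoder, feature extractor, tokenizer, etc.) are assumed to be obtained elsewhere, e.g. from an already-loaded WhisperKit pipeline, and are simply passed in as parameters here:

```swift
import WhisperKit

@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
func startStreaming(
    audioEncoder: any AudioEncoding,
    featureExtractor: any FeatureExtracting,
    segmentSeeker: any SegmentSeeking,
    textDecoder: any TextDecoding,
    tokenizer: any WhisperTokenizer,
    audioProcessor: any AudioProcessing
) async throws {
    let transcriber = AudioStreamTranscriber(
        audioEncoder: audioEncoder,
        featureExtractor: featureExtractor,
        segmentSeeker: segmentSeeker,
        textDecoder: textDecoder,
        tokenizer: tokenizer,
        audioProcessor: audioProcessor,
        decodingOptions: DecodingOptions(),
        stateChangeCallback: { _, newState in
            // Observe confirmed/unconfirmed segments and partial text as they update
            print(newState.currentText)
        }
    )
    // Requests microphone permission, starts recording, and loops until stopped
    try await transcriber.startStreamTranscription()
}
```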
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/Audio/EnergyVAD.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 |
6 | /// Voice activity detection based on energy threshold
7 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
8 | final class EnergyVAD: VoiceActivityDetector {
9 | var energyThreshold: Float
10 |
11 | /// Initialize a new EnergyVAD instance
12 | /// - Parameters:
13 | /// - sampleRate: Audio sample rate
14 | /// - frameLength: Frame length in seconds
15 | /// - frameOverlap: Frame overlap in seconds; this includes an extra `frameOverlap` seconds of audio in each `frameLength` window, which helps catch audio that starts exactly at chunk boundaries
16 | /// - energyThreshold: minimal energy threshold
17 | convenience init(
18 | sampleRate: Int = WhisperKit.sampleRate,
19 | frameLength: Float = 0.1,
20 | frameOverlap: Float = 0.0,
21 | energyThreshold: Float = 0.02
22 | ) {
23 | self.init(
24 | sampleRate: sampleRate,
25 | // Compute frame length and overlap in number of samples
26 | frameLengthSamples: Int(frameLength * Float(sampleRate)),
27 | frameOverlapSamples: Int(frameOverlap * Float(sampleRate)),
28 | energyThreshold: energyThreshold
29 | )
30 | }
31 |
32 | required init(
33 | sampleRate: Int = 16000,
34 | frameLengthSamples: Int,
35 | frameOverlapSamples: Int = 0,
36 | energyThreshold: Float = 0.02
37 | ) {
38 | self.energyThreshold = energyThreshold
39 | super.init(sampleRate: sampleRate, frameLengthSamples: frameLengthSamples, frameOverlapSamples: frameOverlapSamples)
40 | }
41 |
42 | override func voiceActivity(in waveform: [Float]) -> [Bool] {
43 | let chunkRatio = Double(waveform.count) / Double(frameLengthSamples)
44 |
45 | // Round up if uneven, the final chunk will not be a full `frameLengthSamples` long
46 | let count = Int(chunkRatio.rounded(.up))
47 |
48 | let chunkedVoiceActivity = AudioProcessor.calculateVoiceActivityInChunks(
49 | of: waveform,
50 | chunkCount: count,
51 | frameLengthSamples: frameLengthSamples,
52 | frameOverlapSamples: frameOverlapSamples,
53 | energyThreshold: energyThreshold
54 | )
55 |
56 | return chunkedVoiceActivity
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/Audio/VoiceActivityDetector.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 |
6 | /// A base class for Voice Activity Detection (VAD), used to identify and separate segments of audio that contain human speech from those that do not.
7 | /// Subclasses must implement the `voiceActivity(in:)` method to provide specific voice activity detection functionality.
8 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
9 | open class VoiceActivityDetector {
10 | /// The sample rate of the audio signal, in samples per second.
11 | public let sampleRate: Int
12 |
13 | /// The length of each frame in samples.
14 | public let frameLengthSamples: Int
15 |
16 | /// The number of samples overlapping between consecutive frames.
17 | public let frameOverlapSamples: Int
18 |
19 | /// Initializes a new `VoiceActivityDetector` instance with the specified parameters.
20 | /// - Parameters:
21 | /// - sampleRate: The sample rate of the audio signal in samples per second. Defaults to 16000.
22 | /// - frameLengthSamples: The length of each frame in samples.
23 | /// - frameOverlapSamples: The number of samples overlapping between consecutive frames. Defaults to 0.
24 | /// - Note: Subclasses should override the `voiceActivity(in:)` method to provide specific VAD functionality.
25 | public init(
26 | sampleRate: Int = 16000,
27 | frameLengthSamples: Int,
28 | frameOverlapSamples: Int = 0
29 | ) {
30 | self.sampleRate = sampleRate
31 | self.frameLengthSamples = frameLengthSamples
32 | self.frameOverlapSamples = frameOverlapSamples
33 | }
34 |
35 | /// Analyzes the provided audio waveform to determine which segments contain voice activity.
36 | /// - Parameter waveform: An array of `Float` values representing the audio waveform.
37 | /// - Returns: An array of `Bool` values where `true` indicates the presence of voice activity and `false` indicates silence.
38 | open func voiceActivity(in waveform: [Float]) -> [Bool] {
39 | fatalError("`voiceActivity` must be implemented by subclass")
40 | }
41 |
42 | /// Calculates and returns a list of active audio chunks, each represented by a start and end index.
43 | /// - Parameter waveform: An array of `Float` values representing the audio waveform.
44 | /// - Returns: An array of tuples where each tuple contains the start and end indices of an active audio chunk.
45 | public func calculateActiveChunks(in waveform: [Float]) -> [(startIndex: Int, endIndex: Int)] {
46 | let vad: [Bool] = voiceActivity(in: waveform)
47 | var result = [(startIndex: Int, endIndex: Int)]()
48 |
49 | // Temporary variables to hold the start of the current non-silent segment
50 | var currentStartIndex: Int?
51 |
52 | for (index, vadChunk) in vad.enumerated() {
53 | if vadChunk {
54 | let chunkStart = index * frameLengthSamples
55 | let chunkEnd = min(chunkStart + frameLengthSamples, waveform.count)
56 |
57 | if currentStartIndex != nil {
58 | // If we already have a starting point, just update the end point in the last added segment
59 | result[result.count - 1].endIndex = chunkEnd
60 | } else {
61 | // If there is no current start, this is a new segment
62 | currentStartIndex = chunkStart
63 | result.append((startIndex: chunkStart, endIndex: chunkEnd))
64 | }
65 | } else {
66 | // Reset currentStartIndex when encountering a silent chunk
67 | currentStartIndex = nil
68 | }
69 | }
70 |
71 | return result
72 | }
73 |
74 | /// Converts a voice activity index to the corresponding audio sample index.
75 | /// - Parameter index: The voice activity index to convert.
76 | /// - Returns: The corresponding audio sample index.
77 | public func voiceActivityIndexToAudioSampleIndex(_ index: Int) -> Int {
78 | return index * frameLengthSamples
79 | }
80 |
81 | public func voiceActivityIndexToSeconds(_ index: Int) -> Float {
82 | return Float(voiceActivityIndexToAudioSampleIndex(index)) / Float(sampleRate)
83 | }
84 |
85 | /// Identifies the longest continuous period of silence within the provided voice activity detection results.
86 | /// - Parameter vadResult: An array of `Bool` values representing voice activity detection results.
87 | /// - Returns: A tuple containing the start and end indices of the longest silence period, or `nil` if no silence is found.
88 | public func findLongestSilence(in vadResult: [Bool]) -> (startIndex: Int, endIndex: Int)? {
89 | var longestStartIndex: Int?
90 | var longestEndIndex: Int?
91 | var longestCount = 0
92 | var index = 0
93 | while index < vadResult.count {
94 | let value = vadResult[index]
95 | if value {
96 | // found non-silence, skip
97 | index += 1
98 | } else {
99 | // found beginning of silence, find the end
100 | var endIndex = index
101 | while endIndex < vadResult.count, !vadResult[endIndex] {
102 | endIndex += 1
103 | }
104 | let count = endIndex - index
105 | if count > longestCount {
106 | longestCount = count
107 | longestStartIndex = index
108 | longestEndIndex = endIndex
109 | }
110 | index = endIndex
111 | }
112 | }
113 | if let longestStartIndex, let longestEndIndex {
114 | return (startIndex: longestStartIndex, endIndex: longestEndIndex)
115 | } else {
116 | return nil
117 | }
118 | }
119 |
120 | // MARK: - Utility
121 |
122 | func voiceActivityClipTimestamps(in waveform: [Float]) -> [Float] {
123 | let nonSilentChunks = calculateActiveChunks(in: waveform)
124 | var clipTimestamps = [Float]()
125 |
126 | for chunk in nonSilentChunks {
127 | let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
128 | let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
129 |
130 | clipTimestamps.append(contentsOf: [startTimestamp, endTimestamp])
131 | }
132 |
133 | return clipTimestamps
134 | }
135 |
136 | func calculateNonSilentSeekClips(in waveform: [Float]) -> [(start: Int, end: Int)] {
137 | let clipTimestamps = voiceActivityClipTimestamps(in: waveform)
138 | let options = DecodingOptions(clipTimestamps: clipTimestamps)
139 | let seekClips = prepareSeekClips(contentFrames: waveform.count, decodeOptions: options)
140 | return seekClips
141 | }
142 |
143 | func calculateSeekTimestamps(in waveform: [Float]) -> [(startTime: Float, endTime: Float)] {
144 | let nonSilentChunks = calculateActiveChunks(in: waveform)
145 | var seekTimestamps = [(startTime: Float, endTime: Float)]()
146 |
147 | for chunk in nonSilentChunks {
148 | let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
149 | let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
150 |
151 | seekTimestamps.append(contentsOf: [(startTime: startTimestamp, endTime: endTimestamp)])
152 | }
153 |
154 | return seekTimestamps
155 | }
156 | }
157 |
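As the documentation above notes, a custom detector only needs to override `voiceActivity(in:)`; active-chunk calculation, clip timestamps, and longest-silence search are inherited. A minimal sketch of such a subclass (the absolute-amplitude threshold is illustrative and not part of WhisperKit):

```swift
import Foundation

@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
final class AmplitudeVAD: VoiceActivityDetector {
    /// Frames whose peak absolute amplitude exceeds this value are treated as speech (illustrative threshold)
    let amplitudeThreshold: Float = 0.05

    override func voiceActivity(in waveform: [Float]) -> [Bool] {
        var result = [Bool]()
        var start = 0
        while start < waveform.count {
            // Each frame may look ahead by `frameOverlapSamples` to catch speech at frame boundaries
            let end = min(start + frameLengthSamples + frameOverlapSamples, waveform.count)
            let peak = waveform[start..<end].map { abs($0) }.max() ?? 0
            result.append(peak > amplitudeThreshold)
            start += frameLengthSamples
        }
        return result
    }
}

// Usage: 0.1s frames at 16 kHz, then read back the active regions as sample ranges
// let vad = AmplitudeVAD(frameLengthSamples: 1600)
// let activeChunks = vad.calculateActiveChunks(in: samples)
```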
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/AudioEncoder.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import CoreML
5 |
6 | public protocol AudioEncoderOutputType {}
7 | extension MLMultiArray: AudioEncoderOutputType {}
8 |
9 | /// AudioEncoding protocol defines the requirements for an audio encoding implementation.
10 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
11 | public protocol AudioEncoding {
12 | /// The size of the embedding produced by the encoder.
13 | var embedSize: Int? { get }
14 |
15 | /// Encodes the given audio features asynchronously.
16 | /// - Parameter features: The audio features to be encoded.
17 | /// - Returns: An optional tensor containing the encoded features.
18 | func encodeFeatures(_ features: any FeatureExtractorOutputType) async throws -> (any AudioEncoderOutputType)?
19 | }
20 |
21 | /// Backwards-compatible AudioEncoder implementation
22 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
23 | public class AudioEncoder: AudioEncoding, WhisperMLModel {
24 | public var model: MLModel?
25 |
26 | public var embedSize: Int? {
27 | guard let inputDescription = model?.modelDescription.outputDescriptionsByName["encoder_output_embeds"] else { return nil }
28 | guard inputDescription.type == .multiArray else { return nil }
29 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil }
30 | let shape = shapeConstraint.shape.map { $0.intValue }
31 | return shape[1]
32 | }
33 |
34 | public var sequenceLength: Int? {
35 | guard let inputDescription = model?.modelDescription.outputDescriptionsByName["encoder_output_embeds"] else { return nil }
36 | guard inputDescription.type == .multiArray else { return nil }
37 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil }
38 | let shape = shapeConstraint.shape.map { $0.intValue }
39 | return shape[3]
40 | }
41 |
42 | public init() {}
43 |
44 | public func encodeFeatures(_ features: any FeatureExtractorOutputType) async throws -> (any AudioEncoderOutputType)? {
45 | guard let features = features as? MLMultiArray else {
46 | throw WhisperError.audioProcessingFailed("AudioEncoder input must be MLMultiArray")
47 | }
48 |
49 | return try await encodeFeatures(features)
50 | }
51 |
52 | public func encodeFeatures(_ features: MLMultiArray) async throws -> MLMultiArray? {
53 | guard let model else {
54 | throw WhisperError.modelsUnavailable()
55 | }
56 | try Task.checkCancellation()
57 |
58 | let interval = Logging.beginSignpost("EncodeAudio", signposter: Logging.AudioEncoding.signposter)
59 | defer { Logging.endSignpost("EncodeAudio", interval: interval, signposter: Logging.AudioEncoding.signposter) }
60 |
61 | let modelInputs = AudioEncoderInput(melspectrogram_features: features)
62 | let outputFeatures = try await model.asyncPrediction(from: modelInputs, options: MLPredictionOptions())
63 | let output = AudioEncoderOutput(features: outputFeatures)
64 | return output.encoder_output_embeds
65 | }
66 | }
67 |
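A minimal sketch of driving `AudioEncoder` directly, outside the full pipeline: the compiled model URL and mel-spectrogram features are placeholders, and in typical use the surrounding WhisperKit pipeline loads and feeds this model for you:

```swift
import CoreML
import WhisperKit

@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
func encode(melFeatures: MLMultiArray, compiledModelURL: URL) async throws -> MLMultiArray? {
    let encoder = AudioEncoder()
    // Expects a compiled Core ML model (.mlmodelc) for the audio encoder
    encoder.model = try MLModel(contentsOf: compiledModelURL)
    return try await encoder.encodeFeatures(melFeatures)
}
```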
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/Configurations.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 |
6 | /// Configuration to initialize WhisperKit
7 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
8 | open class WhisperKitConfig {
9 | /// Name for whisper model to use
10 | public var model: String?
11 | /// Base URL for downloading models
12 | public var downloadBase: URL?
13 | /// Repository for downloading models
14 | public var modelRepo: String?
15 | /// Token for downloading models from repo (if required)
16 | public var modelToken: String?
17 |
18 | /// Folder to store models
19 | public var modelFolder: String?
20 | /// Folder to store tokenizers
21 | public var tokenizerFolder: URL?
22 |
23 | /// Model compute options, see `ModelComputeOptions`
24 | public var computeOptions: ModelComputeOptions?
25 | /// Audio input config to define how to process audio input
26 | public var audioInputConfig: AudioInputConfig?
27 | /// Audio processor for the model
28 | public var audioProcessor: (any AudioProcessing)?
29 | public var featureExtractor: (any FeatureExtracting)?
30 | public var audioEncoder: (any AudioEncoding)?
31 | public var textDecoder: (any TextDecoding)?
32 | public var logitsFilters: [any LogitsFiltering]?
33 | public var segmentSeeker: (any SegmentSeeking)?
34 | public var voiceActivityDetector: VoiceActivityDetector?
35 |
36 | /// Enable extra verbosity for logging
37 | public var verbose: Bool
38 | /// Maximum log level
39 | public var logLevel: Logging.LogLevel
40 |
41 | /// Enable model prewarming
42 | public var prewarm: Bool?
43 | /// Load models if available
44 | public var load: Bool?
45 | /// Download models if not available
46 | public var download: Bool
47 | /// Use background download session
48 | public var useBackgroundDownloadSession: Bool
49 |
50 | public init(model: String? = nil,
51 | downloadBase: URL? = nil,
52 | modelRepo: String? = nil,
53 | modelToken: String? = nil,
54 | modelFolder: String? = nil,
55 | tokenizerFolder: URL? = nil,
56 | computeOptions: ModelComputeOptions? = nil,
57 | audioInputConfig: AudioInputConfig? = nil,
58 | audioProcessor: (any AudioProcessing)? = nil,
59 | featureExtractor: (any FeatureExtracting)? = nil,
60 | audioEncoder: (any AudioEncoding)? = nil,
61 | textDecoder: (any TextDecoding)? = nil,
62 | logitsFilters: [any LogitsFiltering]? = nil,
63 | segmentSeeker: (any SegmentSeeking)? = nil,
64 | voiceActivityDetector: VoiceActivityDetector? = nil,
65 | verbose: Bool = true,
66 | logLevel: Logging.LogLevel = .info,
67 | prewarm: Bool? = nil,
68 | load: Bool? = nil,
69 | download: Bool = true,
70 | useBackgroundDownloadSession: Bool = false)
71 | {
72 | self.model = model
73 | self.downloadBase = downloadBase
74 | self.modelRepo = modelRepo
75 | self.modelToken = modelToken
76 | self.modelFolder = modelFolder
77 | self.tokenizerFolder = tokenizerFolder
78 | self.computeOptions = computeOptions
79 | self.audioInputConfig = audioInputConfig
80 | self.audioProcessor = audioProcessor
81 | self.featureExtractor = featureExtractor
82 | self.audioEncoder = audioEncoder
83 | self.textDecoder = textDecoder
84 | self.logitsFilters = logitsFilters
85 | self.segmentSeeker = segmentSeeker
86 | self.voiceActivityDetector = voiceActivityDetector
87 | self.verbose = verbose
88 | self.logLevel = logLevel
89 | self.prewarm = prewarm
90 | self.load = load
91 | self.download = download
92 | self.useBackgroundDownloadSession = useBackgroundDownloadSession
93 | }
94 | }
95 |
96 | /// Options for how to transcribe an audio file using WhisperKit.
97 | ///
98 | /// - Parameters:
99 | /// - verbose: Whether to display the text being decoded to the console.
100 | /// If true, displays all details; if false, displays minimal details;
101 | /// - task: Whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')
102 | /// - language: Language spoken in the audio
103 | /// - temperature: Temperature to use for sampling.
104 | /// - temperatureIncrementOnFallback: Increment which will be
105 | /// successively added to temperature upon failures according to either `compressionRatioThreshold`
106 | /// or `logProbThreshold`.
107 | /// - temperatureFallbackCount: Number of times to increment temperature on fallback.
108 | /// - sampleLength: The maximum number of tokens to sample.
109 | /// - topK: Number of candidates when sampling with non-zero temperature.
110 | /// - usePrefillPrompt: If true, the prefill tokens will be forced according to task and language settings.
111 | /// - usePrefillCache: If true, the kv cache will be prefilled based on the prefill data mlmodel.
112 | /// - detectLanguage: Use this in conjunction with `usePrefillPrompt: true` to detect the language of the input audio.
113 | /// - skipSpecialTokens: Whether to skip special tokens in the output.
114 | /// - withoutTimestamps: Whether to include timestamps in the transcription result.
115 | /// - wordTimestamps: Whether to include word-level timestamps in the transcription result.
116 | /// - maxInitialTimestamp: Maximal initial timestamp.
117 | /// - clipTimestamps: Array of timestamps (in seconds) to split the audio into segments for transcription.
118 | /// - promptTokens: Array of token IDs to use as the conditioning prompt for the decoder. These are prepended to the prefill tokens.
119 | /// - prefixTokens: Array of token IDs to use as the initial prefix for the decoder. These are appended to the prefill tokens.
120 | /// - suppressBlank: If true, blank tokens will be suppressed during decoding.
121 | /// - supressTokens: List of token IDs to suppress during decoding.
122 | /// - compressionRatioThreshold: If the compression ratio of the transcription text is above this value, it is too repetitive and treated as failed.
123 | /// - logProbThreshold: If the average log probability over sampled tokens is below this value, treat as failed.
124 | /// - firstTokenLogProbThreshold: If the log probability over the first sampled token is below this value, treat as failed.
125 | /// - noSpeechThreshold: If the no speech probability is higher than this value AND the average log
126 | /// probability over sampled tokens is below `logProbThreshold`, consider the segment as silent.
127 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
128 | public struct DecodingOptions: Codable {
129 | public var verbose: Bool
130 | public var task: DecodingTask
131 | public var language: String?
132 | public var temperature: Float
133 | public var temperatureIncrementOnFallback: Float
134 | public var temperatureFallbackCount: Int
135 | public var sampleLength: Int
136 | public var topK: Int
137 | public var usePrefillPrompt: Bool
138 | public var usePrefillCache: Bool
139 | public var detectLanguage: Bool
140 | public var skipSpecialTokens: Bool
141 | public var withoutTimestamps: Bool
142 | public var wordTimestamps: Bool
143 | public var maxInitialTimestamp: Float?
144 | public var clipTimestamps: [Float]
145 | public var promptTokens: [Int]?
146 | public var prefixTokens: [Int]?
147 | public var suppressBlank: Bool
148 | public var supressTokens: [Int]
149 | public var compressionRatioThreshold: Float?
150 | public var logProbThreshold: Float?
151 | public var firstTokenLogProbThreshold: Float?
152 | public var noSpeechThreshold: Float?
153 | public var concurrentWorkerCount: Int
154 | public var chunkingStrategy: ChunkingStrategy?
155 |
156 | public init(
157 | verbose: Bool = false,
158 | task: DecodingTask = .transcribe,
159 | language: String? = nil,
160 | temperature: Float = 0.0,
161 | temperatureIncrementOnFallback: Float = 0.2,
162 | temperatureFallbackCount: Int = 5,
163 | sampleLength: Int = Constants.maxTokenContext,
164 | topK: Int = 5,
165 | usePrefillPrompt: Bool = true,
166 | usePrefillCache: Bool = true,
167 | detectLanguage: Bool? = nil,
168 | skipSpecialTokens: Bool = false,
169 | withoutTimestamps: Bool = false,
170 | wordTimestamps: Bool = false,
171 | maxInitialTimestamp: Float? = nil,
172 | clipTimestamps: [Float] = [],
173 | promptTokens: [Int]? = nil,
174 | prefixTokens: [Int]? = nil,
175 | suppressBlank: Bool = false,
176 | supressTokens: [Int]? = nil,
177 | compressionRatioThreshold: Float? = 2.4,
178 | logProbThreshold: Float? = -1.0,
179 | firstTokenLogProbThreshold: Float? = -1.5,
180 | noSpeechThreshold: Float? = 0.6,
181 | concurrentWorkerCount: Int? = nil,
182 | chunkingStrategy: ChunkingStrategy? = nil
183 | ) {
184 | self.verbose = verbose
185 | self.task = task
186 | self.language = language
187 | self.temperature = temperature
188 | self.temperatureIncrementOnFallback = temperatureIncrementOnFallback
189 | self.temperatureFallbackCount = temperatureFallbackCount
190 | self.sampleLength = sampleLength
191 | self.topK = topK
192 | self.usePrefillPrompt = usePrefillPrompt
193 | self.usePrefillCache = usePrefillCache
194 | self.detectLanguage = detectLanguage ?? !usePrefillPrompt // If prefill is false, detect language by default
195 | self.skipSpecialTokens = skipSpecialTokens
196 | self.withoutTimestamps = withoutTimestamps
197 | self.wordTimestamps = wordTimestamps
198 | self.maxInitialTimestamp = maxInitialTimestamp
199 | self.clipTimestamps = clipTimestamps
200 | self.promptTokens = promptTokens
201 | self.prefixTokens = prefixTokens
202 | self.suppressBlank = suppressBlank
203 | self.supressTokens = supressTokens ?? [] // nonSpeechTokens() // TODO: implement these as default
204 | self.compressionRatioThreshold = compressionRatioThreshold
205 | self.logProbThreshold = logProbThreshold
206 | self.firstTokenLogProbThreshold = firstTokenLogProbThreshold
207 | self.noSpeechThreshold = noSpeechThreshold
208 | // Set platform-specific default worker count if not explicitly provided
209 | // Non-macOS devices have shown regressions with >4 workers, default to 4 for safety
210 | #if os(macOS)
211 | self.concurrentWorkerCount = concurrentWorkerCount ?? 16
212 | #else
213 | self.concurrentWorkerCount = concurrentWorkerCount ?? 4
214 | #endif
215 | self.chunkingStrategy = chunkingStrategy
216 | }
217 | }
218 |
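A minimal usage sketch (not from the source files above), assuming a `WhisperKit` instance and the `decodeOptions:` parameter of its `transcribe` API; the model name and audio path are placeholders.

import WhisperKit

func translateExample() async throws {
    let whisperKit = try await WhisperKit(WhisperKitConfig(model: "base"))
    let options = DecodingOptions(
        task: .translate,          // X -> English translation
        language: "de",            // language spoken in the audio
        temperature: 0.0,          // greedy decoding, with fallback increments on failure
        wordTimestamps: true,      // include word-level timestamps
        chunkingStrategy: .vad     // split long audio on voice activity
    )
    let results = try await whisperKit.transcribe(audioPath: "audio.wav", decodeOptions: options)
    print(results.map(\.text).joined(separator: " "))
}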
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/FeatureExtractor.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Accelerate
5 | import AVFoundation
6 | import CoreGraphics
7 | import CoreML
8 | import Foundation
9 |
10 | public protocol FeatureExtractorOutputType {}
11 | extension MLMultiArray: FeatureExtractorOutputType {}
12 |
13 | public protocol FeatureExtracting {
14 | associatedtype OutputType: FeatureExtractorOutputType
15 |
16 | var melCount: Int? { get }
17 | var windowSamples: Int? { get }
18 | func logMelSpectrogram(fromAudio inputAudio: MLMultiArray) async throws -> OutputType?
19 | }
20 |
21 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
22 | open class FeatureExtractor: FeatureExtracting, WhisperMLModel {
23 | public var model: MLModel?
24 |
25 | public init() {}
26 |
27 | public var melCount: Int? {
28 | guard let inputDescription = model?.modelDescription.outputDescriptionsByName["melspectrogram_features"] else { return nil }
29 | guard inputDescription.type == .multiArray else { return nil }
30 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil }
31 | let shape = shapeConstraint.shape.map { $0.intValue }
32 | return shape[1]
33 | }
34 |
35 | public var windowSamples: Int? {
36 | guard let inputDescription = model?.modelDescription.inputDescriptionsByName["audio"] else { return nil }
37 | guard inputDescription.type == .multiArray else { return nil }
38 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil }
39 | let shape = shapeConstraint.shape.map { $0.intValue }
40 | return shape[0] // The audio input is a 1D array
41 | }
42 |
43 | public func logMelSpectrogram(fromAudio inputAudio: MLMultiArray) async throws -> MLMultiArray? {
44 | guard let model else {
45 | throw WhisperError.modelsUnavailable()
46 | }
47 | try Task.checkCancellation()
48 |
49 | let interval = Logging.beginSignpost("ExtractAudioFeatures", signposter: Logging.FeatureExtractor.signposter)
50 | defer { Logging.endSignpost("ExtractAudioFeatures", interval: interval, signposter: Logging.FeatureExtractor.signposter) }
51 |
52 | let modelInputs = MelSpectrogramInput(audio: inputAudio)
53 | let outputFeatures = try await model.asyncPrediction(from: modelInputs, options: MLPredictionOptions())
54 | let output = MelSpectrogramOutput(features: outputFeatures)
55 | return output.melspectrogramFeatures
56 | }
57 | }
58 |
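A short sketch (assumptions: a compiled mel spectrogram model at `modelURL` and a preloaded `audio` array) showing that the shape-derived properties return nil until a model is assigned.

import CoreML

func featureExtractorExample(modelURL: URL, audio: MLMultiArray) async throws {
    let extractor = FeatureExtractor()
    extractor.model = try MLModel(contentsOf: modelURL) // or use the WhisperMLModel loading helpers

    // Both properties read the model description, so they are nil before a model is set
    print("Expected audio window samples:", extractor.windowSamples ?? -1)
    print("Mel bands produced:", extractor.melCount ?? -1)

    if let mel = try await extractor.logMelSpectrogram(fromAudio: audio) {
        print("Mel spectrogram shape:", mel.shape)
    }
}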
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/ResultWriter.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 |
6 | public protocol ResultWriting {
7 | var outputDir: String { get }
8 | func write(result: TranscriptionResult, to file: String, options: [String: Any]?) -> Result<String, Error>
9 | func formatTime(seconds: Float, alwaysIncludeHours: Bool, decimalMarker: String) -> String
10 | }
11 |
12 | public extension ResultWriting {
13 | /// Format a time value as a string
14 | func formatTime(seconds: Float, alwaysIncludeHours: Bool, decimalMarker: String) -> String {
15 | let hrs = Int(seconds / 3600)
16 | let mins = Int((seconds.truncatingRemainder(dividingBy: 3600)) / 60)
17 | let secs = Int(seconds.truncatingRemainder(dividingBy: 60))
18 | let msec = Int((seconds - floor(seconds)) * 1000)
19 |
20 | if alwaysIncludeHours || hrs > 0 {
21 | return String(format: "%02d:%02d:%02d\(decimalMarker)%03d", hrs, mins, secs, msec)
22 | } else {
23 | return String(format: "%02d:%02d\(decimalMarker)%03d", mins, secs, msec)
24 | }
25 | }
26 |
27 | func formatSegment(index: Int, start: Float, end: Float, text: String) -> String {
28 | let startFormatted = formatTime(seconds: Float(start), alwaysIncludeHours: true, decimalMarker: ",")
29 | let endFormatted = formatTime(seconds: Float(end), alwaysIncludeHours: true, decimalMarker: ",")
30 | return "\(index)\n\(startFormatted) --> \(endFormatted)\n\(text)\n\n"
31 | }
32 |
33 | func formatTiming(start: Float, end: Float, text: String) -> String {
34 | let startFormatted = formatTime(seconds: Float(start), alwaysIncludeHours: false, decimalMarker: ".")
35 | let endFormatted = formatTime(seconds: Float(end), alwaysIncludeHours: false, decimalMarker: ".")
36 | return "\(startFormatted) --> \(endFormatted)\n\(text)\n\n"
37 | }
38 | }
39 |
40 | open class WriteJSON: ResultWriting {
41 | public let outputDir: String
42 |
43 | public init(outputDir: String) {
44 | self.outputDir = outputDir
45 | }
46 |
47 | /// Write a transcription result to a JSON file
48 | /// - Parameters:
49 | /// - result: Completed transcription result
50 | /// - file: Name of the file to write, without the extension
51 | /// - options: Not used
52 | /// - Returns: The URL of the written file, or an error if the write failed
53 | public func write(result: TranscriptionResult, to file: String, options: [String: Any]? = nil) -> Result<String, Error> {
54 | let reportPathURL = URL(fileURLWithPath: outputDir)
55 | let reportURL = reportPathURL.appendingPathComponent("\(file).json")
56 | let jsonEncoder = JSONEncoder()
57 | jsonEncoder.outputFormatting = .prettyPrinted
58 | do {
59 | let reportJson = try jsonEncoder.encode(result)
60 | try reportJson.write(to: reportURL)
61 | } catch {
62 | return .failure(error)
63 | }
64 |
65 | return .success(reportURL.absoluteString)
66 | }
67 | }
68 |
69 | open class WriteSRT: ResultWriting {
70 | public let outputDir: String
71 |
72 | public init(outputDir: String) {
73 | self.outputDir = outputDir
74 | }
75 |
76 | public func write(result: TranscriptionResult, to file: String, options: [String: Any]? = nil) -> Result<String, Error> {
77 | let outputPathURL = URL(fileURLWithPath: outputDir)
78 | let outputFileURL = outputPathURL.appendingPathComponent("\(file).srt")
79 |
80 | do {
81 | var srtContent = ""
82 | var index = 1
83 | for segment in result.segments {
84 | if let wordTimings = segment.words, !wordTimings.isEmpty {
85 | for wordTiming in wordTimings {
86 | srtContent += formatSegment(index: index, start: wordTiming.start, end: wordTiming.end, text: wordTiming.word)
87 | index += 1
88 | }
89 | } else {
90 | // Use segment timing if word timings are not available
91 | srtContent += formatSegment(index: index, start: segment.start, end: segment.end, text: segment.text)
92 | index += 1
93 | }
94 | }
95 |
96 | try srtContent.write(to: outputFileURL, atomically: true, encoding: .utf8)
97 | return .success(outputFileURL.absoluteString)
98 | } catch {
99 | return .failure(error)
100 | }
101 | }
102 | }
103 |
104 | open class WriteVTT: ResultWriting {
105 | public let outputDir: String
106 |
107 | public init(outputDir: String) {
108 | self.outputDir = outputDir
109 | }
110 |
111 | public func write(result: TranscriptionResult, to file: String, options: [String: Any]? = nil) -> Result<String, Error> {
112 | let outputPathURL = URL(fileURLWithPath: outputDir)
113 | let outputFileURL = outputPathURL.appendingPathComponent("\(file).vtt")
114 |
115 | do {
116 | var vttContent = "WEBVTT\n\n"
117 | for segment in result.segments {
118 | if let wordTimings = segment.words, !wordTimings.isEmpty {
119 | for wordTiming in wordTimings {
120 | vttContent += formatTiming(start: wordTiming.start, end: wordTiming.end, text: wordTiming.word)
121 | }
122 | } else {
123 | // Use segment timing if word timings are not available
124 | vttContent += formatTiming(start: segment.start, end: segment.end, text: segment.text)
125 | }
126 | }
127 |
128 | try vttContent.write(to: outputFileURL, atomically: true, encoding: .utf8)
129 | return .success(outputFileURL.absoluteString)
130 | } catch {
131 | return .failure(error)
132 | }
133 | }
134 | }
135 |
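A minimal sketch, assuming a completed `TranscriptionResult` named `result`: all three writers share the same `write(result:to:options:)` entry point, so they can be driven from one loop.

func writeOutputs(result: TranscriptionResult, outputDir: String) {
    let writers: [any ResultWriting] = [
        WriteJSON(outputDir: outputDir),
        WriteSRT(outputDir: outputDir),
        WriteVTT(outputDir: outputDir),
    ]
    for writer in writers {
        // "transcript" is the base filename; each writer appends its own extension
        switch writer.write(result: result, to: "transcript", options: nil) {
            case let .success(path):
                print("Wrote \(path)")
            case let .failure(error):
                print("Write failed: \(error)")
        }
    }
}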
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/Text/LogitsFilter.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Accelerate
5 | import CoreML
6 | import Foundation
7 | import Tokenizers
8 |
9 | public protocol LogitsFiltering {
10 | func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray
11 | }
12 |
13 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
14 | open class SuppressTokensFilter: LogitsFiltering {
15 | let suppressTokens: [Int]
16 | private let suppressTokenIndexes: [[NSNumber]]
17 |
18 | public init(suppressTokens: [Int]) {
19 | self.suppressTokens = suppressTokens
20 | self.suppressTokenIndexes = suppressTokens.map { [0, 0, $0 as NSNumber] }
21 | }
22 |
23 | public func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray {
24 | logits.fill(indexes: suppressTokenIndexes, with: -FloatType.infinity)
25 | return logits
26 | }
27 | }
28 |
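A small sketch of how a filter is applied at each decoding step; `logits` is assumed to be the decoder's [1, 1, vocabSize] output for the current step and `decodedTokens` the tokens sampled so far.

func applyFilters(
    _ filters: [any LogitsFiltering],
    to logits: MLMultiArray,
    decodedTokens: [Int]
) -> MLMultiArray {
    // Each filter mutates and returns the logits, so they can be chained in order
    var filtered = logits
    for filter in filters {
        filtered = filter.filterLogits(filtered, withTokens: decodedTokens)
    }
    return filtered
}

// Example: applyFilters([SuppressTokensFilter(suppressTokens: suppressTokenIds)], to: logits, decodedTokens: tokens)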
29 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
30 | open class SuppressBlankFilter: LogitsFiltering {
31 | let specialTokens: SpecialTokens
32 | let sampleBegin: Int
33 | private let suppressTokenIndexes: [[NSNumber]]
34 |
35 | public init(
36 | specialTokens: SpecialTokens,
37 | sampleBegin: Int
38 | ) {
39 | self.specialTokens = specialTokens
40 | self.sampleBegin = sampleBegin
41 | self.suppressTokenIndexes = [
42 | [0, 0, specialTokens.whitespaceToken as NSNumber],
43 | [0, 0, specialTokens.endToken as NSNumber],
44 | ]
45 | }
46 |
47 | public func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray {
48 | guard tokens.count == sampleBegin else {
49 | return logits
50 | }
51 | logits.fill(indexes: suppressTokenIndexes, with: -FloatType.infinity)
52 | return logits
53 | }
54 | }
55 |
56 | /// Implementation based on https://github.com/openai/whisper/blob/master/whisper/decoding.py#L441
57 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
58 | open class TimestampRulesFilter: LogitsFiltering {
59 | let specialTokens: SpecialTokens
60 | let sampleBegin: Int
61 | let maxInitialTimestampIndex: Int?
62 | let isModelMultilingual: Bool
63 |
64 | public init(
65 | specialTokens: SpecialTokens,
66 | sampleBegin: Int,
67 | maxInitialTimestampIndex: Int?,
68 | isModelMultilingual: Bool
69 | ) {
70 | self.specialTokens = specialTokens
71 | self.sampleBegin = sampleBegin
72 | self.maxInitialTimestampIndex = maxInitialTimestampIndex
73 | self.isModelMultilingual = isModelMultilingual
74 | }
75 |
76 | public func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray {
77 | guard let sampleBegin = sampleBegin(for: tokens),
78 | sampleBegin <= tokens.count
79 | else {
80 | // Early return if we are still prefilling the prompt
81 | return logits
82 | }
83 |
84 | // suppress <|notimestamps|> which is handled by `withoutTimestamps`
85 | logits.fill(indexes: [[0, 0, specialTokens.noTimestampsToken as NSNumber]], with: -FloatType.infinity)
86 |
87 | if tokens.count > sampleBegin {
88 | // timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
89 | let sampledTokens = tokens[sampleBegin...]
90 | let lastWasTimestamp = sampledTokens.count >= 1 && sampledTokens.last! >= specialTokens.timeTokenBegin
91 | let penultimateWasTimestamp = sampledTokens.count < 2 || sampledTokens.dropLast().last! >= specialTokens.timeTokenBegin
92 | if lastWasTimestamp {
93 | if penultimateWasTimestamp {
94 | // has to be non-timestamp
95 | logits.fillLastDimension(indexes: specialTokens.timeTokenBegin..<logits.count, with: -FloatType.infinity)
96 | } else {
97 | // cannot be normal text tokens
98 | logits.fillLastDimension(indexes: 0..<specialTokens.endToken, with: -FloatType.infinity)
99 | }
100 | }
101 |
102 | let timestamps = sampledTokens.filter { $0 >= specialTokens.timeTokenBegin }
103 | if let lastTimestamp = timestamps.last {
104 | // timestamps shouldn't decrease; forbid timestamp tokens smaller than the last
105 | // also force each segment to have a nonzero length, to prevent infinite looping
106 | let timestampLast =
107 | if lastWasTimestamp && !penultimateWasTimestamp {
108 | lastTimestamp
109 | } else {
110 | lastTimestamp + 1
111 | }
112 | logits.fillLastDimension(indexes: specialTokens.timeTokenBegin..<timestampLast, with: -FloatType.infinity)
113 | }
114 | }
115 |
116 | // TODO: Implement the initial timestamp constraints below correctly,
117 | // currently the prefill prompt forces the first timestamp to <|0.00|> every time
118 | // if tokens.count == sampleBegin {
119 | // // suppress generating non-timestamp tokens at the beginning
120 | // logits.fillLastDimension(indexes: 0..<specialTokens.timeTokenBegin, with: -FloatType.infinity)
121 | //
122 | // // apply the `maxInitialTimestamp` option
123 | // if let maxInitialTimestampIndex {
124 | // let lastAllowed = specialTokens.timeTokenBegin + maxInitialTimestampIndex
125 | // logits.fillLastDimension(indexes: (lastAllowed + 1)..<logits.count, with: -FloatType.infinity)
126 | // }
127 | // }
128 |
129 | // if sum of probability over timestamps is above any other token, sample timestamp
130 | if sumOfProbabilityOverTimestampsIsAboveAnyOtherToken(logits: logits, timeTokenBegin: specialTokens.timeTokenBegin) {
131 | logits.fillLastDimension(indexes: 0..<specialTokens.timeTokenBegin, with: -FloatType.infinity)
132 | }
133 | return logits
134 | }
135 | private func sampleBegin(for tokens: [Int]) -> Int? {
136 | if isModelMultilingual {
137 | // NOTE: for multilingual model we don't want to supress "<|transcribe|>" or "<|translate|>" tokens
138 | if let taskTokenIndex = tokens.prefix(3).firstIndex(where: { $0 == specialTokens.transcribeToken || $0 == specialTokens.translateToken }) {
139 | return max(taskTokenIndex + 1, sampleBegin)
140 | } else {
141 | return nil
142 | }
143 | } else {
144 | return sampleBegin
145 | }
146 | }
147 |
148 | private func sumOfProbabilityOverTimestampsIsAboveAnyOtherToken(logits: MLMultiArray, timeTokenBegin: Int) -> Bool {
149 | let timeTokenBeginOffset = logits.linearOffset(for: [0, 0, timeTokenBegin as NSNumber])
150 |
151 | let logprobsInputPointer = UnsafeMutableRawBufferPointer(
152 | start: logits.dataPointer,
153 | count: logits.count * MemoryLayout<FloatType>.stride
154 | )
155 |
156 | guard let logprobsInputDescriptor = BNNSNDArrayDescriptor(
157 | data: logprobsInputPointer,
158 | scalarType: FloatType.self,
159 | shape: .vector(logits.count, stride: 1)
160 | ) else {
161 | Logging.error("Cannot create `logprobsInputDescriptor`")
162 | return false
163 | }
164 |
165 | let logprobs = BNNSNDArrayDescriptor.allocateUninitialized(
166 | scalarType: FloatType.self,
167 | shape: .vector(logits.count, stride: 1)
168 | )
169 | defer { logprobs.deallocate() }
170 |
171 | do {
172 | try BNNS.applyActivation(
173 | activation: BNNS.ActivationFunction.logSoftmax,
174 | input: logprobsInputDescriptor,
175 | output: logprobs,
176 | batchSize: 1
177 | )
178 |
179 | let timeTokenCount = logits.count - timeTokenBeginOffset
180 | let noTimeTokenCount = timeTokenBeginOffset
181 | let logSumExpInputPointer = UnsafeMutableRawBufferPointer(
182 | start: logprobs.data!.advanced(by: timeTokenBeginOffset * MemoryLayout<FloatType>.stride),
183 | count: timeTokenCount * MemoryLayout<FloatType>.stride
184 | )
185 |
186 | guard let logSumExpInputDescriptor = BNNSNDArrayDescriptor(
187 | data: logSumExpInputPointer,
188 | scalarType: FloatType.self,
189 | shape: .vector(timeTokenCount, stride: 1)
190 | ) else {
191 | Logging.error("Cannot create `logSumExpInputDescriptor`")
192 | return false
193 | }
194 |
195 | let timestampLogProb = BNNSNDArrayDescriptor.allocateUninitialized(
196 | scalarType: FloatType.self,
197 | shape: .vector(1, stride: 1)
198 | )
199 | defer { timestampLogProb.deallocate() }
200 |
201 | try BNNS.applyReduction(
202 | .logSumExp,
203 | input: logSumExpInputDescriptor,
204 | output: timestampLogProb,
205 | weights: nil
206 | )
207 |
208 | let maxTextTokenLogProbInputPointer = UnsafeMutableRawBufferPointer(
209 | start: logprobs.data,
210 | count: noTimeTokenCount * MemoryLayout<FloatType>.stride
211 | )
212 |
213 | guard let maxTextTokenLogProbInputDescriptor = BNNSNDArrayDescriptor(
214 | data: maxTextTokenLogProbInputPointer,
215 | scalarType: FloatType.self,
216 | shape: .vector(noTimeTokenCount, stride: 1)
217 | ) else {
218 | Logging.error("Cannot create `maxTextTokenLogProbInputDescriptor`")
219 | return false
220 | }
221 |
222 | let maxTextTokenLogProb = BNNSNDArrayDescriptor.allocateUninitialized(
223 | scalarType: FloatType.self,
224 | shape: .vector(1, stride: 1)
225 | )
226 | defer { maxTextTokenLogProb.deallocate() }
227 |
228 | try BNNS.applyReduction(
229 | .max,
230 | input: maxTextTokenLogProbInputDescriptor,
231 | output: maxTextTokenLogProb,
232 | weights: nil
233 | )
234 |
235 | guard let timestampLogProbValue = timestampLogProb.makeArray(of: FloatType.self)?.first,
236 | let maxTextTokenLogProbValue = maxTextTokenLogProb.makeArray(of: FloatType.self)?.first
237 | else {
238 | Logging.error("Cannot create logProb arrays")
239 | return false
240 | }
241 | return timestampLogProbValue > maxTextTokenLogProbValue
242 | } catch {
243 | Logging.error("TimestampRulesFilter error: \(error)")
244 | return false
245 | }
246 | }
247 | }
248 |
249 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
250 | open class LanguageLogitsFilter: LogitsFiltering {
251 | let allLanguageTokens: Set<Int>
252 | let logitsDim: Int
253 | let sampleBegin: Int
254 | let nonLanguageTokenIndexes: [[NSNumber]]
255 |
256 | public init(allLanguageTokens: Set<Int>, logitsDim: Int, sampleBegin: Int) {
257 | self.allLanguageTokens = allLanguageTokens
258 | self.logitsDim = logitsDim
259 | self.sampleBegin = sampleBegin
260 | self.nonLanguageTokenIndexes = LanguageLogitsFilter.getNonLanguageTokenIndexes(logitsDim: self.logitsDim, allLanguageTokens: self.allLanguageTokens)
261 | }
262 |
263 | /// Retain the logits that correspond to language tokens and suppress non-language tokens
264 | public func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray {
265 | guard tokens.count >= sampleBegin else {
266 | return logits
267 | }
268 | logits.fill(indexes: nonLanguageTokenIndexes, with: -FloatType.infinity)
269 | return logits
270 | }
271 |
272 | private static func getNonLanguageTokenIndexes(logitsDim: Int, allLanguageTokens: Set<Int>) -> [[NSNumber]] {
273 | var indexes: [[NSNumber]] = []
274 | for i in 0..<logitsDim {
275 | if !allLanguageTokens.contains(i) {
276 | indexes.append([0, 0, i as NSNumber])
277 | }
278 | }
279 | return indexes
280 | }
281 | }
282 |
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/Text/TokenSampler.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Accelerate
5 | import CoreML
6 | import Foundation
7 |
8 | public protocol TokenSampling {
9 | func update(tokens: [Int], logits: MLMultiArray, logProbs: [Float]) -> SamplingResult
10 | func finalize(tokens: [Int], logProbs: [Float]) -> SamplingResult
11 | }
12 |
13 | public struct SamplingResult {
14 | public var tokens: [Int]
15 | public var logProbs: [Float]
16 | public var completed: Bool
17 |
18 | public init(
19 | tokens: [Int],
20 | logProbs: [Float],
21 | completed: Bool
22 | ) {
23 | self.tokens = tokens
24 | self.logProbs = logProbs
25 | self.completed = completed
26 | }
27 | }
28 |
29 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
30 | open class GreedyTokenSampler: TokenSampling {
31 | public var temperature: FloatType
32 | public var eotToken: Int
33 | public var decodingOptions: DecodingOptions
34 |
35 | public init(temperature: FloatType, eotToken: Int, decodingOptions: DecodingOptions) {
36 | self.temperature = temperature
37 | self.eotToken = eotToken
38 | self.decodingOptions = decodingOptions
39 | }
40 |
41 | #if canImport(CoreML.MLState)
42 | @available(macOS 15, iOS 18, watchOS 11, visionOS 2, *)
43 | private func sampleWithMLTensor(logits: MLMultiArray) -> (token: Int, logprob: Float) {
44 | // Use MLTensor operations if available for sampling
45 | // Reference: https://github.com/huggingface/swift-transformers/blob/preview/Sources/Generation/Decoders.swift
46 | var logitsTensor = MLTensor(MLShapedArray(logits)).cast(to: Float.self)
47 | var nextTokenTensor: MLTensor
48 | var nextLogprobTensor: MLTensor
49 |
50 | if temperature != 0.0 {
51 | // Scale logits by temperature if > 0
52 | logitsTensor = logitsTensor / temperature
53 | }
54 |
55 | // Always softmax once
56 | let softmaxScores = logitsTensor.softmax(alongAxis: -1)
57 |
58 | if temperature != 0.0 {
59 | // top-k multinomial sampling
60 | let (topKProbs, topKIndices) = softmaxScores.topK(decodingOptions.topK)
61 |
62 | let rnd = topKProbs.sum() * Float.random(in: 0..<1)
63 | var accumTopKProbs = topKProbs.cumulativeSum(alongAxis: -1)
64 | accumTopKProbs += (accumTopKProbs .< rnd) * 100.0
65 | let topKIndex = accumTopKProbs.argsort()[..., 0]
66 |
67 | nextTokenTensor = topKIndices.gathering(
68 | atIndices: topKIndex,
69 | alongAxis: topKIndices.rank - 1
70 | )
71 | nextLogprobTensor = topKProbs.gathering(
72 | atIndices: topKIndex,
73 | alongAxis: topKIndices.rank - 1
74 | ).log()
75 | } else {
76 | nextTokenTensor = logitsTensor.argmax(alongAxis: -1)
77 | nextLogprobTensor = softmaxScores.gathering(atIndices: nextTokenTensor, alongAxis: -1).log()
78 | }
79 |
80 | return (
81 | token: nextTokenTensor.asIntArray()[0],
82 | logprob: nextLogprobTensor.asFloatArray()[0]
83 | )
84 | }
85 | #endif
86 |
87 | private func sampleWithBNNS(logits: MLMultiArray) -> (token: Int, logprob: Float) {
88 | // TODO: BNNS operations here are deprecated, replace with vDSP or MLX
89 | var softmaxOutput: BNNSNDArrayDescriptor?
90 | var argmaxOutput: BNNSNDArrayDescriptor?
91 | var softmaxInput: BNNSNDArrayDescriptor?
92 | var softmaxInputNeedsDeallocate = false
93 |
94 | var nextToken: Int?
95 |
96 | do {
97 | let logitsRawPointer = UnsafeMutableRawBufferPointer(
98 | start: logits.dataPointer,
99 | count: logits.count * MemoryLayout<FloatType>.stride
100 | )
101 |
102 | let logitsDescriptor = BNNSNDArrayDescriptor(
103 | data: logitsRawPointer,
104 | scalarType: FloatType.self,
105 | shape: .vector(logits.count, stride: 1)
106 | )!
107 |
108 | softmaxInput = logitsDescriptor
109 |
110 | // Scale logits by temperature if > 0
111 | if temperature != 0.0 {
112 | let scaledLogits = BNNSNDArrayDescriptor.allocateUninitialized(
113 | scalarType: FloatType.self,
114 | shape: .vector(logits.count, stride: 1)
115 | )
116 |
117 | try! BNNS.applyActivation(
118 | activation: BNNS.ActivationFunction.linear(alpha: Float(1 / temperature)),
119 | input: logitsDescriptor,
120 | output: scaledLogits,
121 | batchSize: 1
122 | )
123 |
124 | softmaxInput = scaledLogits
125 | softmaxInputNeedsDeallocate = true
126 | }
127 |
128 | // Always softmax once
129 | softmaxOutput = BNNSNDArrayDescriptor.allocateUninitialized(
130 | scalarType: Float.self,
131 | shape: .vector(logits.count, stride: 1)
132 | )
133 |
134 | try BNNS.applyActivation(
135 | activation: BNNS.ActivationFunction.softmax,
136 | input: softmaxInput!,
137 | output: softmaxOutput!,
138 | batchSize: 1
139 | )
140 |
141 | if temperature != 0.0 {
142 | // top-k multinomial sampling
143 | let k = decodingOptions.topK
144 | let bestValues = BNNSNDArrayDescriptor.allocateUninitialized(
145 | scalarType: Float.self,
146 | shape: .vector(k, stride: 1)
147 | )
148 | let bestIndices = BNNSNDArrayDescriptor.allocateUninitialized(
149 | scalarType: Int32.self,
150 | shape: .vector(k, stride: 1)
151 | )
152 |
153 | try! BNNS.applyTopK(
154 | k: k,
155 | input: softmaxOutput!,
156 | bestValues: bestValues,
157 | bestIndices: bestIndices,
158 | axis: 0,
159 | batchSize: 1
160 | )
161 |
162 | let bestValuesResult = bestValues.makeArray(of: Float.self)!
163 | let bestIndicesResult = bestIndices.makeArray(of: Int32.self)!
164 |
165 | bestValues.deallocate()
166 | bestIndices.deallocate()
167 |
168 | // multinomial sample from top-k
169 | let sumOfbestIndicesResult = bestValuesResult.reduce(0, +)
170 | let rnd = Float.random(in: 0..<sumOfbestIndicesResult)
171 | var accumulator = Float(0.0)
172 | var chosenIndex = 0
173 | for (index, value) in bestValuesResult.enumerated() {
174 | accumulator += value
175 | if rnd < accumulator {
176 | chosenIndex = index
177 | break
178 | }
179 | }
180 |
181 | nextToken = Int(bestIndicesResult[chosenIndex])
182 | } else {
183 | argmaxOutput = BNNSNDArrayDescriptor.allocateUninitialized(
184 | scalarType: Float.self,
185 | shape: .vector(1, stride: 1)
186 | )
187 |
188 | try! BNNS.applyReduction(
189 | .argMax,
190 | input: softmaxOutput!,
191 | output: argmaxOutput!,
192 | weights: nil
193 | )
194 |
195 | let argmaxResult = argmaxOutput!.makeArray(of: Float.self)!
196 | nextToken = Int(argmaxResult[0])
197 | }
198 | } catch {
199 | Logging.error("Sampling error: \(error)")
200 | }
201 |
202 | // Log of softmax probability of the chosen token
203 | let softmaxResult = softmaxOutput!.makeArray(of: Float.self)!
204 | let nextLogprob = log(Float(softmaxResult[nextToken!]))
205 |
206 | // Deallocate memory
207 | if softmaxInputNeedsDeallocate {
208 | softmaxInput?.deallocate()
209 | }
210 | softmaxOutput?.deallocate()
211 | argmaxOutput?.deallocate()
212 |
213 | return (token: nextToken!, logprob: nextLogprob)
214 | }
215 |
216 | public func update(tokens: [Int], logits: MLMultiArray, logProbs: [Float]) -> SamplingResult {
217 | var nextTokens = tokens
218 | var nextLogprobs = logProbs
219 | var completed = false
220 |
221 | var result: (token: Int, logprob: Float)
222 | #if canImport(CoreML.MLState)
223 | if #available(macOS 15.0, iOS 18.0, watchOS 11.0, visionOS 2.0, *) {
224 | result = sampleWithMLTensor(logits: logits)
225 | } else {
226 | result = sampleWithBNNS(logits: logits)
227 | }
228 | #else
229 | result = sampleWithBNNS(logits: logits)
230 | #endif
231 |
232 | nextTokens = tokens + [result.token]
233 | nextLogprobs = logProbs + [result.logprob]
234 | completed = result.token == eotToken
235 |
236 | return SamplingResult(
237 | tokens: nextTokens,
238 | logProbs: nextLogprobs,
239 | completed: completed
240 | )
241 | }
242 |
243 | public func finalize(tokens: [Int], logProbs: [Float]) -> SamplingResult {
244 | var finalTokens = tokens
245 | var finalLogProbs = logProbs
246 | if tokens.last != eotToken {
247 | finalTokens.append(eotToken)
248 | finalLogProbs.append(0)
249 | }
250 |
251 | return SamplingResult(tokens: finalTokens, logProbs: finalLogProbs, completed: true)
252 | }
253 | }
254 |
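A sketch of the shape of a greedy decoding loop (not from the source): `decodeLogits` stands in for a real text-decoder forward pass, and `eotToken` is assumed to come from the tokenizer's special tokens.

func greedyDecodingSketch(eotToken: Int, decodeLogits: ([Int]) throws -> MLMultiArray) throws -> [Int] {
    let options = DecodingOptions() // greedy defaults: temperature 0, topK 5
    let sampler = GreedyTokenSampler(temperature: 0.0, eotToken: eotToken, decodingOptions: options)

    var tokens: [Int] = []
    var logProbs: [Float] = []
    for _ in 0..<options.sampleLength {
        let logits = try decodeLogits(tokens) // hypothetical decoder forward pass for the next step
        let result = sampler.update(tokens: tokens, logits: logits, logProbs: logProbs)
        tokens = result.tokens
        logProbs = result.logProbs
        if result.completed { break } // the EOT token was sampled
    }
    return sampler.finalize(tokens: tokens, logProbs: logProbs).tokens
}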
255 | open class BeamSearchTokenSampler: TokenSampling {
256 | public var beamSize: Int
257 | public var eotToken: Int
258 | public var patience: Float
259 | var maxCandidates: Int
260 | var finishedSequences: [Float]
261 |
262 | public init(
263 | beamSize: Int,
264 | eotToken: Int,
265 | patience: Float = 1
266 | ) {
267 | self.beamSize = beamSize
268 | self.eotToken = eotToken
269 | self.patience = patience
270 | self.maxCandidates = Int(Float(beamSize) * patience)
271 | self.finishedSequences = []
272 | if self.maxCandidates <= 0 {
273 | self.maxCandidates = 1
274 | fatalError("Invalid beam size \(beamSize) or patience \(patience)")
275 | }
276 | }
277 |
278 | public func reset() {
279 | finishedSequences = []
280 | }
281 |
282 | public func update(tokens: [Int], logits: MLMultiArray, logProbs: [Float]) -> SamplingResult {
283 | // TODO: Implement
284 | fatalError("Not implemented: \(#function)")
285 | }
286 |
287 | public func finalize(tokens: [Int], logProbs: [Float]) -> SamplingResult {
288 | // TODO: Implement
289 | fatalError("Not implemented: \(#function)")
290 | }
291 | }
292 |
--------------------------------------------------------------------------------
/Sources/WhisperKit/Core/Utils/Concurrency.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 |
6 | /// An actor that provides thread-safe early stopping functionality using UUIDs as keys
7 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
8 | public actor EarlyStopActor {
9 | private var shouldStop = [UUID: Bool]()
10 |
11 | public init() {}
12 |
13 | /// Sets the stop flag for a given UUID
14 | /// - Parameters:
15 | /// - value: The boolean value to set
16 | /// - uuid: The UUID key
17 | public func set(_ value: Bool, for uuid: UUID) {
18 | shouldStop[uuid] = value
19 | }
20 |
21 | /// Gets the stop flag for a given UUID
22 | /// - Parameter uuid: The UUID key
23 | /// - Returns: The current stop flag value, or false if not set
24 | public func get(for uuid: UUID) -> Bool {
25 | return shouldStop[uuid] ?? false
26 | }
27 |
28 | /// Removes and returns the stop flag for a given UUID
29 | /// - Parameter uuid: The UUID key
30 | /// - Returns: The removed stop flag value, if it existed
31 | public func remove(for uuid: UUID) -> Bool? {
32 | return shouldStop.removeValue(forKey: uuid)
33 | }
34 | }
35 |
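A brief sketch of how the actor is used to coordinate cancellation across tasks, keyed by a per-transcription UUID.

func earlyStopExample() async {
    let earlyStop = EarlyStopActor()
    let taskId = UUID()

    // Another task can request that a specific transcription stop early by its UUID
    await earlyStop.set(true, for: taskId)

    if await earlyStop.get(for: taskId) {
        print("Early stop requested for \(taskId)")
    }

    // Clean up the flag once the decoding loop exits
    _ = await earlyStop.remove(for: taskId)
}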
--------------------------------------------------------------------------------
/Sources/WhisperKitCLI/CLIArguments.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import ArgumentParser
5 |
6 | struct CLIArguments: ParsableArguments {
7 | @Option(help: "Paths to audio files")
8 | var audioPath = [String]()
9 |
10 | @Option(help: "Path to a folder containing audio files")
11 | var audioFolder: String?
12 |
13 | @Option(help: "Path of model files")
14 | var modelPath: String?
15 |
16 | @Option(help: "Model to download if no modelPath is provided")
17 | var model: String?
18 |
19 | @Option(help: "Text to add in front of the model name to specify between different types of the same variant (values: \"openai\", \"distil\")")
20 | var modelPrefix: String = "openai"
21 |
22 | @Option(help: "Path to save the downloaded model")
23 | var downloadModelPath: String?
24 |
25 | @Option(help: "Path to save the downloaded tokenizer files")
26 | var downloadTokenizerPath: String?
27 |
28 | @Option(help: "Compute units for audio encoder model with {all,cpuOnly,cpuAndGPU,cpuAndNeuralEngine,random}")
29 | var audioEncoderComputeUnits: ComputeUnits = .cpuAndNeuralEngine
30 |
31 | @Option(help: "Compute units for text decoder model with {all,cpuOnly,cpuAndGPU,cpuAndNeuralEngine,random}")
32 | var textDecoderComputeUnits: ComputeUnits = .cpuAndNeuralEngine
33 |
34 | @Flag(help: "Verbose mode")
35 | var verbose: Bool = false
36 |
37 | @Option(help: "Task to perform (transcribe or translate)")
38 | var task: String = "transcribe"
39 |
40 | @Option(help: "Language spoken in the audio")
41 | var language: String?
42 |
43 | @Option(help: "Temperature to use for sampling")
44 | var temperature: Float = 0
45 |
46 | @Option(help: "Temperature to increase on fallbacks during decoding")
47 | var temperatureIncrementOnFallback: Float = 0.2
48 |
49 | @Option(help: "Number of times to increase temperature when falling back during decoding")
50 | var temperatureFallbackCount: Int = 5
51 |
52 | @Option(help: "Number of candidates when sampling with non-zero temperature")
53 | var bestOf: Int = 5
54 |
55 | @Flag(help: "Force initial prompt tokens based on language, task, and timestamp options")
56 | var usePrefillPrompt: Bool = false
57 |
58 | @Flag(help: "Use decoder prefill data for faster initial decoding")
59 | var usePrefillCache: Bool = false
60 |
61 | @Flag(help: "Skip special tokens in the output")
62 | var skipSpecialTokens: Bool = false
63 |
64 | @Flag(help: "Force no timestamps when decoding")
65 | var withoutTimestamps: Bool = false
66 |
67 | @Flag(help: "Add timestamps for each word in the output")
68 | var wordTimestamps: Bool = false
69 |
70 | @Option(help: "Force prefix text when decoding")
71 | var prefix: String?
72 |
73 | @Option(help: "Condition on this text when decoding")
74 | var prompt: String?
75 |
76 | @Option(parsing: .upToNextOption, help: "List of timestamps (in seconds) of start and end values to transcribe as separate clips in a single audio file (example: --clip-timestamps 0 10.2 34.5 60.0)")
77 | var clipTimestamps: [Float] = []
78 |
79 | @Option(parsing: .upToNextOption, help: "List of tokens to suppress in the output (example: --supress-tokens 1 2 3)")
80 | var supressTokens: [Int] = []
81 |
82 | @Option(help: "Gzip compression ratio threshold for decoding failure")
83 | var compressionRatioThreshold: Float?
84 |
85 | @Option(help: "Average log probability threshold for decoding failure")
86 | var logprobThreshold: Float?
87 |
88 | @Option(help: "Log probability threshold for first token decoding failure")
89 | var firstTokenLogProbThreshold: Float?
90 |
91 | @Option(help: "Probability threshold to consider a segment as silence")
92 | var noSpeechThreshold: Float?
93 |
94 | @Flag(help: "Output a report of the results")
95 | var report: Bool = false
96 |
97 | @Option(help: "Directory to save the report")
98 | var reportPath: String = "."
99 |
100 | @Flag(help: "Process audio directly from the microphone")
101 | var stream: Bool = false
102 |
103 | @Flag(help: "Simulate streaming transcription using the input audio file")
104 | var streamSimulated: Bool = false
105 |
106 | @Option(help: "Maximum number of concurrent inference tasks; can help when processing more than one audio file at a time. 0 means unlimited. Default: 4")
107 | var concurrentWorkerCount: Int = 4
108 |
109 | @Option(help: "Chunking strategy for audio processing, `none` means no chunking, `vad` means using voice activity detection. Default: `vad`")
110 | var chunkingStrategy: String = "vad"
111 | }
112 |
--------------------------------------------------------------------------------
/Sources/WhisperKitCLI/CLIUtils.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import ArgumentParser
5 | import CoreML
6 | import Foundation
7 |
8 | enum ComputeUnits: String, ExpressibleByArgument, CaseIterable {
9 | case all, cpuAndGPU, cpuOnly, cpuAndNeuralEngine, random
10 | var asMLComputeUnits: MLComputeUnits {
11 | switch self {
12 | case .all: return .all
13 | case .cpuAndGPU: return .cpuAndGPU
14 | case .cpuOnly: return .cpuOnly
15 | case .cpuAndNeuralEngine: return .cpuAndNeuralEngine
16 | case .random: return Bool.random() ? .cpuAndGPU : .cpuAndNeuralEngine
17 | }
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/Sources/WhisperKitCLI/WhisperKitCLI.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import ArgumentParser
5 | import Foundation
6 |
7 | let VERSION: String = "development"
8 |
9 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
10 | @main
11 | struct WhisperKitCLI: AsyncParsableCommand {
12 | static let configuration = CommandConfiguration(
13 | commandName: "whisperkit-cli",
14 | abstract: "WhisperKit CLI",
15 | discussion: "Swift native speech recognition with Whisper for Apple Silicon",
16 | version: VERSION,
17 | subcommands: [TranscribeCLI.self]
18 | )
19 | }
20 |
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Evaluate/DistanceCalculation.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 |
6 | /// Compute the last row of the edit distance dynamic programming matrix
7 | /// between s1 and s2.
8 | func computeLastRow(_ s1Chars: [Unicode.Scalar], _ s2Chars: [Unicode.Scalar]) -> [Int] {
9 | var prevRow = Array(0...s2Chars.endIndex)
10 |
11 | for i in 1...s1Chars.endIndex {
12 | var currentRow = [Int](repeating: 0, count: s2Chars.endIndex + 1)
13 | currentRow[0] = i
14 |
15 | for j in 1...s2Chars.endIndex {
16 | let cost = s1Chars[i - 1] == s2Chars[j - 1] ? 0 : 1
17 | currentRow[j] = min(
18 | prevRow[j] + 1, // Deletion
19 | currentRow[j - 1] + 1, // Insertion
20 | prevRow[j - 1] + cost // Substitution
21 | )
22 | }
23 | prevRow = currentRow
24 | }
25 |
26 | return prevRow
27 | }
28 |
29 | func needlemanWunsch(_ xArray: [Unicode.Scalar], _ yArray: [Unicode.Scalar]) -> [EditOp] {
30 | let m = xArray.count
31 | let n = yArray.count
32 |
33 | var dp = [[Int]](repeating: [Int](repeating: 0, count: n + 1), count: m + 1)
34 | for i in 1...m {
35 | dp[i][0] = i
36 | }
37 | for j in 1...n {
38 | dp[0][j] = j
39 | }
40 |
41 | for i in 1...m {
42 | for j in 1...n {
43 | let cost = xArray[i - 1] == yArray[j - 1] ? 0 : 1
44 | dp[i][j] = min(
45 | dp[i - 1][j] + 1, // Deletion
46 | dp[i][j - 1] + 1, // Insertion
47 | dp[i - 1][j - 1] + cost // Substitution
48 | )
49 | }
50 | }
51 |
52 | var i = m
53 | var j = n
54 | var ops = [EditOp]()
55 |
56 | while i > 0, j > 0 {
57 | if dp[i][j] == dp[i - 1][j - 1], xArray[i - 1] == yArray[j - 1] {
58 | // Match operation is omitted
59 | i -= 1
60 | j -= 1
61 | } else if dp[i][j] == dp[i - 1][j - 1] + 1 {
62 | ops.append(EditOp.replace) // Substitution
63 | i -= 1
64 | j -= 1
65 | } else if dp[i][j] == dp[i][j - 1] + 1 {
66 | ops.append(EditOp.insert) // Insertion
67 | j -= 1
68 | } else {
69 | ops.append(EditOp.delete) // Deletion
70 | i -= 1
71 | }
72 | }
73 |
74 | while i > 0 {
75 | ops.append(EditOp.delete)
76 | i -= 1
77 | }
78 | while j > 0 {
79 | ops.append(EditOp.insert)
80 | j -= 1
81 | }
82 |
83 | return ops.reversed()
84 | }
85 |
86 | func hirschberg(_ reference: [Unicode.Scalar], _ s2: [Unicode.Scalar]) -> [EditOp] {
87 | func hirschbergRec(_ x: [Unicode.Scalar], _ y: [Unicode.Scalar]) -> [EditOp] {
88 | let m = x.endIndex
89 | let n = y.endIndex
90 |
91 | if m == 0 {
92 | let result = y.map { _ in EditOp.insert }
93 | return result
94 | }
95 | if n == 0 {
96 | let result = x.map { _ in EditOp.delete }
97 | return result
98 | }
99 | if m == 1 || n == 1 {
100 | let result = needlemanWunsch(x, y)
101 | return result
102 | }
103 |
104 | let i = m / 2
105 | let xPrefix = Array(x[x.startIndex..<i])
106 | let xSuffix = Array(x[i..<x.endIndex])
107 |
108 | // Score the left half forwards and the reversed right half backwards
109 | let scoreL = computeLastRow(xPrefix, y)
110 | let scoreR = computeLastRow(Array(xSuffix.reversed()), Array(y.reversed()))
111 |
112 | // Find the split point in y that minimizes the total edit cost
113 | var k = 0
114 | var minCost = Int.max
115 | for j in 0...n {
116 | let cost = scoreL[j] + scoreR[n - j]
117 | if cost < minCost {
118 | minCost = cost
119 | k = j
120 | }
121 | }
122 |
123 | let yPrefix = Array(y[y.startIndex..<k])
124 | let ySuffix = Array(y[k..<y.endIndex])
125 |
126 | // Recurse on both halves and concatenate the edit operations
127 | let result = hirschbergRec(xPrefix, yPrefix) + hirschbergRec(xSuffix, ySuffix)
128 | return result
129 | }
130 |
131 | return hirschbergRec(reference, s2)
132 | }
133 |
134 | /// Compute the edit operations between two strings using Myers' O(ND) difference algorithm
135 | func levenshtein(_ sourceText: [Unicode.Scalar], _ targetText: [Unicode.Scalar]) -> [EditOp] {
136 | let n = sourceText.count
137 | let m = targetText.count
138 | let maxD = n + m
139 | let vSize = 2 * maxD + 1
140 | var v = [Int](repeating: 0, count: vSize)
141 | var trace = [[Int]]()
142 |
143 | let offset = maxD
144 |
145 | for d in 0...maxD {
146 | let vSnapshot = v
147 | for k in stride(from: -d, through: d, by: 2) {
148 | let kIndex = k + offset
149 | var x: Int
150 | if k == -d || (k != d && v[kIndex - 1] < v[kIndex + 1]) {
151 | x = v[kIndex + 1]
152 | } else {
153 | x = v[kIndex - 1] + 1
154 | }
155 | var y = x - k
156 | while x < n, y < m, sourceText[x] == targetText[y] {
157 | x += 1
158 | y += 1
159 | }
160 | v[kIndex] = x
161 | if x >= n, y >= m {
162 | trace.append(vSnapshot)
163 | return backtrack(trace: trace, sourceText: sourceText, targetText: targetText)
164 | }
165 | }
166 | trace.append(vSnapshot)
167 | }
168 | return []
169 | }
170 |
171 | func backtrack(trace: [[Int]], sourceText: [Unicode.Scalar], targetText: [Unicode.Scalar]) -> [EditOp] {
172 | var editOps = [EditOp]()
173 | let n = sourceText.count
174 | let m = targetText.count
175 | let offset = trace[0].count / 2
176 | var x = n
177 | var y = m
178 |
179 | for d in stride(from: trace.count - 1, through: 0, by: -1) {
180 | let v = trace[d]
181 | let k = x - y
182 | let kIndex = k + offset
183 |
184 | var prevK: Int
185 | if k == -d || (k != d && v[kIndex - 1] < v[kIndex + 1]) {
186 | prevK = k + 1
187 | } else {
188 | prevK = k - 1
189 | }
190 | let prevX = v[prevK + offset]
191 | let prevY = prevX - prevK
192 |
193 | while x > prevX, y > prevY {
194 | // Match or Replace
195 | if sourceText[x - 1] == targetText[y - 1] {
196 | editOps.append(.blank)
197 | } else {
198 | editOps.append(.replace)
199 | }
200 | x -= 1
201 | y -= 1
202 | }
203 |
204 | if d > 0 {
205 | if x == prevX {
206 | // Insertion
207 | editOps.append(.insert)
208 | y -= 1
209 | } else {
210 | // Deletion
211 | editOps.append(.delete)
212 | x -= 1
213 | }
214 | }
215 | }
216 |
217 | return editOps.reversed()
218 | }
219 |
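A small sketch of how these helpers compose (they are internal to the test target, so this only runs alongside them): the last element of the DP row and the length of the Needleman-Wunsch operation list should both equal the Levenshtein distance.

func editDistanceSketch() {
    let reference = Array("kitten".unicodeScalars)
    let hypothesis = Array("sitting".unicodeScalars)

    // Final element of the last DP row is the Levenshtein distance (3 here)
    let lastRow = computeLastRow(reference, hypothesis)
    print("Distance via last row:", lastRow.last ?? -1)

    // Matches are omitted from the op list, so its length is also the distance
    let ops = needlemanWunsch(reference, hypothesis)
    print("Distance via ops:", ops.count)
}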
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Evaluate/WERUtils.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Foundation
5 |
6 | /// Return the operations needed to transform s1 into s2 using Wagner-Fischer algo.
7 | /// "i" = insertion, "d" = deletion, "r" = replacement
8 | enum EditOp: UInt8 {
9 | case blank
10 | case replace
11 | case delete
12 | case insert
13 | }
14 |
15 | enum WERUtils {
16 | static func wordsToChars(reference: [[String]], hypothesis: [[String]]) -> ([String], [String]) {
17 | // tokenize each word into an integer
18 | let vocabulary = Set((reference + hypothesis).flatMap { $0 })
19 | let word2char = Dictionary(uniqueKeysWithValues: vocabulary.enumerated().map { index, value in
20 | (value, index)
21 | })
22 |
23 | let referenceCharsEfficient = reference.map { sentence in
24 | String(sentence.lazy.compactMap { word in
25 | if let charCode = word2char[word], let unicodeScalar = UnicodeScalar(charCode) {
26 | return Character(unicodeScalar)
27 | }
28 | return nil
29 | })
30 | }
31 |
32 | let hypothesisCharsEfficient = hypothesis.map { sentence in
33 | String(sentence.lazy.compactMap { word in
34 | if let charCode = word2char[word], let unicodeScalar = UnicodeScalar(charCode) {
35 | return Character(unicodeScalar)
36 | }
37 | return nil
38 | })
39 | }
40 |
41 | return (referenceCharsEfficient, hypothesisCharsEfficient)
42 | }
43 |
44 | static func processWords(reference: [String], hypothesis: [String]) -> (Double, [[String?]]) {
45 | var refTransformed = NormalizationUtils.removeMultipleSpaces(sentences: reference)
46 | refTransformed = NormalizationUtils.strip(sentences: refTransformed)
47 | let refTransformedReduced = NormalizationUtils.reduceToListOfListOfWordsWithSpaces(sentences: refTransformed)
48 |
49 | var hypTransformed = NormalizationUtils.removeMultipleSpaces(sentences: hypothesis)
50 | hypTransformed = NormalizationUtils.strip(sentences: hypTransformed)
51 | let hypTransformedReduced = NormalizationUtils.reduceToListOfListOfWordsWithSpaces(sentences: hypTransformed)
52 |
53 | let (refAsChars, hypAsChars) = WERUtils.wordsToChars(reference: refTransformedReduced, hypothesis: hypTransformedReduced)
54 |
55 | let refArrays = refAsChars.map { Array($0.unicodeScalars) }
56 | let hypArrays = hypAsChars.map { Array($0.unicodeScalars) }
57 |
58 | var (numHits, numSubstitutions, numDeletions, numInsertions) = (0, 0, 0, 0)
59 | var (numRfWords, numHypWords) = (0, 0)
60 | var diffResult: [[String?]] = []
61 |
62 | for (referenceSentence, hypothesisSentence) in zip(refArrays, hypArrays) {
63 | let editOps = levenshtein(referenceSentence, hypothesisSentence)
64 |
65 | // count the number of edits of each type
66 | var substitutions = 0
67 | var deletions = 0
68 | var insertions = 0
69 |
70 | var referenceIndex = 0
71 | var hypothesisIndex = 0
72 | for op in editOps {
73 | switch op {
74 | case .replace:
75 | diffResult.append([String(refTransformedReduced[0][referenceIndex]), "-"])
76 | diffResult.append([String(hypTransformedReduced[0][hypothesisIndex]), "+"])
77 | substitutions += 1
78 | referenceIndex += 1
79 | hypothesisIndex += 1
80 | case .delete:
81 | diffResult.append([String(refTransformedReduced[0][referenceIndex]), "-"])
82 | deletions += 1
83 | referenceIndex += 1
84 | case .insert:
85 | diffResult.append([String(hypTransformedReduced[0][hypothesisIndex]), "+"])
86 | insertions += 1
87 | hypothesisIndex += 1
88 | case .blank:
89 | diffResult.append([String(refTransformedReduced[0][referenceIndex]), nil])
90 | referenceIndex += 1
91 | hypothesisIndex += 1
92 | }
93 | }
94 |
95 | let hits: Int = referenceSentence.count - (substitutions + deletions)
96 |
97 | numHits += hits
98 | numSubstitutions += substitutions
99 | numDeletions += deletions
100 | numInsertions += insertions
101 | numRfWords += referenceSentence.count
102 | numHypWords += hypothesisSentence.count
103 | }
104 |
105 | let wer = Double(numSubstitutions + numDeletions + numInsertions) / Double(numHits + numSubstitutions + numDeletions)
106 |
107 | return (wer, diffResult)
108 | }
109 |
110 | static func evaluate(originalTranscript: String, generatedTranscript: String, normalizeOriginal: Bool = true) -> (wer: Double, diff: [[String?]]) {
111 | let normalizer = EnglishTextNormalizer()
112 | let reference = normalizeOriginal ? normalizer.normalize(text: originalTranscript) : originalTranscript
113 | let hypothesis = normalizer.normalize(text: generatedTranscript)
114 |
115 | let (wer, diff) = WERUtils.processWords(
116 | reference: [reference],
117 | hypothesis: [hypothesis]
118 | )
119 | return (wer, diff)
120 | }
121 |
122 | static func processDiff(originalTranscript: String, generatedTranscript: String) -> [[String?]] {
123 | let (_, diff) = evaluate(originalTranscript: originalTranscript, generatedTranscript: generatedTranscript)
124 | return diff
125 | }
126 |
127 | static func diffString(from diff: [[String?]]) -> String {
128 | return diff.compactMap { entry -> String? in
129 | guard let word = entry[0], word != " " else { return nil }
130 | if let changeType = entry[1] {
131 | return "\(changeType)\(word)"
132 | }
133 | return word
134 | }.joined(separator: " ")
135 | }
136 | }
137 |
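A brief sketch of the top-level WER entry point; the transcripts below are made up for illustration.

func werSketch() {
    let original = "The quick brown fox jumps over the lazy dog"
    let generated = "the quick brown fox jumped over a lazy dog"

    let (wer, diff) = WERUtils.evaluate(originalTranscript: original, generatedTranscript: generated)
    print(String(format: "WER: %.3f", wer))
    print(WERUtils.diffString(from: diff)) // words prefixed with "-"/"+" mark deletions and insertions
}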
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/FunctionalTests.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import CoreML
5 | import WhisperKit
6 | import XCTest
7 |
8 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
9 | final class FunctionalTests: XCTestCase {
10 | func testInitLarge() async throws {
11 | try await XCTAssertNoThrowAsync(
12 | await WhisperKit(modelFolder: largev3ModelPath(), logLevel: .error)
13 | )
14 | }
15 |
16 | func testRealTimeFactorTiny() async throws {
17 | let modelPath = try await tinyModelPath()
18 |
19 | let metrics: [XCTMetric] = [XCTMemoryMetric(), XCTStorageMetric(), XCTClockMetric()]
20 |
21 | let measureOptions = XCTMeasureOptions.default
22 | measureOptions.iterationCount = 5
23 |
24 | let audioFilePath = try XCTUnwrap(
25 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"),
26 | "Audio file not found"
27 | )
28 |
29 | let whisperKit = try await WhisperKit(WhisperKitConfig(modelFolder: modelPath))
30 |
31 | measure(metrics: metrics, options: measureOptions) {
32 | let dispatchSemaphore = DispatchSemaphore(value: 0)
33 | Task {
34 | let transcriptionResult: [TranscriptionResult] = try await whisperKit.transcribe(audioPath: audioFilePath)
35 | let transcriptionResultText = transcriptionResult.map(\.text).joined(separator: " ")
36 | XCTAssertGreaterThan(transcriptionResultText.count, 0)
37 | dispatchSemaphore.signal()
38 | }
39 | dispatchSemaphore.wait()
40 | }
41 | }
42 |
43 | func testRealTimeFactorLarge() async throws {
44 | let modelPath = try largev3ModelPath()
45 |
46 | let metrics: [XCTMetric] = [XCTMemoryMetric(), XCTStorageMetric(), XCTClockMetric()]
47 |
48 | let measureOptions = XCTMeasureOptions.default
49 | measureOptions.iterationCount = 5
50 |
51 | let audioFilePath = try XCTUnwrap(
52 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"),
53 | "Audio file not found"
54 | )
55 |
56 | let whisperKit = try await WhisperKit(WhisperKitConfig(modelFolder: modelPath, verbose: false))
57 |
58 | measure(metrics: metrics, options: measureOptions) {
59 | let dispatchSemaphore = DispatchSemaphore(value: 0)
60 | Task {
61 | let transcriptionResult: [TranscriptionResult] = try await whisperKit.transcribe(audioPath: audioFilePath)
62 | XCTAssertGreaterThan(transcriptionResult.text.count, 0)
63 | dispatchSemaphore.signal()
64 | }
65 | dispatchSemaphore.wait()
66 | }
67 | }
68 |
69 | func testBaseImplementation() throws {
70 | let audioFilePath = try XCTUnwrap(
71 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"),
72 | "Audio file not found"
73 | )
74 |
75 | let dispatchSemaphore = DispatchSemaphore(value: 0)
76 |
77 | Task {
78 | let whisperKit = try await XCTUnwrapAsync(await WhisperKit(model: "large-v3"))
79 | let transcriptionResult: [TranscriptionResult] = try await whisperKit.transcribe(audioPath: audioFilePath)
80 | XCTAssertGreaterThan(transcriptionResult.text.count, 0)
81 | dispatchSemaphore.signal()
82 | }
83 |
84 | dispatchSemaphore.wait()
85 | }
86 |
87 | func testAsyncImplementation() async throws {
88 | let audioFilePath = try XCTUnwrap(
89 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"),
90 | "Audio file not found"
91 | )
92 | let whisperKit = try await WhisperKit(WhisperKitConfig(model: "large-v3"))
93 | let transcriptionResult: [TranscriptionResult] = try await whisperKit.transcribe(audioPath: audioFilePath)
94 |
95 | XCTAssertGreaterThan(transcriptionResult.text.count, 0)
96 | }
97 |
98 | func testBatchTranscribeAudioPaths() async throws {
99 | let audioPaths = try [
100 | XCTUnwrap(
101 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"),
102 | "Audio file not found"
103 | ),
104 | XCTUnwrap(
105 | Bundle.current(for: self).path(forResource: "es_test_clip", ofType: "wav"),
106 | "Audio file not found"
107 | ),
108 | XCTUnwrap(
109 | Bundle.current(for: self).path(forResource: "ja_test_clip", ofType: "wav"),
110 | "Audio file not found"
111 | ),
112 | ]
113 | let whisperKit = try await WhisperKit(WhisperKitConfig(modelFolder: tinyModelPath()))
114 | let transcriptionResults: [Result<[TranscriptionResult], Swift.Error>] = await whisperKit.transcribeWithResults(audioPaths: audioPaths)
115 |
116 | XCTAssertEqual(transcriptionResults.count, 3)
117 | XCTAssertTrue(transcriptionResults.allSatisfy { $0.isSuccess })
118 | XCTAssertEqual(
119 | try transcriptionResults[0].normalizedText(prefix: 5),
120 | "and so my fellow americans"
121 | )
122 | XCTAssertEqual(
123 | try transcriptionResults[1].normalizedText(prefix: 2),
124 | "this is"
125 | )
126 | XCTAssertEqual(
127 | try transcriptionResults[2].normalizedText(prefix: 1),
128 | "tokyo"
129 | )
130 | }
131 |
132 | func testBatchTranscribeAudioPathsWithErrors() async throws {
133 | let audioPaths = try [
134 | "/path/to/file1.wav",
135 | XCTUnwrap(
136 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"),
137 | "Audio file not found"
138 | ),
139 | "/path/to/file2.wav",
140 | ]
141 | let whisperKit = try await WhisperKit(WhisperKitConfig(modelFolder: tinyModelPath()))
142 | let transcriptionResults: [Result<[TranscriptionResult], Swift.Error>] = await whisperKit.transcribeWithResults(audioPaths: audioPaths)
143 |
144 | XCTAssertEqual(transcriptionResults.count, 3)
145 | XCTAssertEqual(
146 | transcriptionResults[0].whisperError(),
147 | .loadAudioFailed("Resource path does not exist /path/to/file1.wav")
148 | )
149 | XCTAssertEqual(
150 | try transcriptionResults[1].normalizedText(prefix: 5),
151 | "and so my fellow americans"
152 | )
153 | XCTAssertEqual(
154 | transcriptionResults[2].whisperError(),
155 | .loadAudioFailed("Resource path does not exist /path/to/file2.wav")
156 | )
157 | }
158 |
159 | func testBatchTranscribeAudioArrays() async throws {
160 | let audioPaths = try [
161 | XCTUnwrap(
162 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"),
163 | "Audio file not found"
164 | ),
165 | XCTUnwrap(
166 | Bundle.current(for: self).path(forResource: "es_test_clip", ofType: "wav"),
167 | "Audio file not found"
168 | ),
169 | XCTUnwrap(
170 | Bundle.current(for: self).path(forResource: "ja_test_clip", ofType: "wav"),
171 | "Audio file not found"
172 | ),
173 | ]
174 | let audioArrays = try audioPaths
175 | .map { try AudioProcessor.loadAudio(fromPath: $0) }
176 | .map { AudioProcessor.convertBufferToArray(buffer: $0) }
177 |
178 | let whisperKit = try await WhisperKit(WhisperKitConfig(modelFolder: tinyModelPath()))
179 | let transcriptionResults: [Result<[TranscriptionResult], Swift.Error>] = await whisperKit.transcribeWithResults(audioArrays: audioArrays)
180 |
181 | XCTAssertEqual(transcriptionResults.count, 3)
182 | XCTAssertTrue(transcriptionResults.allSatisfy { $0.isSuccess })
183 | XCTAssertEqual(
184 | try transcriptionResults[0].normalizedText(prefix: 5),
185 | "and so my fellow americans"
186 | )
187 | XCTAssertEqual(
188 | try transcriptionResults[1].normalizedText(prefix: 2),
189 | "this is"
190 | )
191 | XCTAssertEqual(
192 | try transcriptionResults[2].normalizedText(prefix: 1),
193 | "tokyo"
194 | )
195 | }
196 |
197 | func testModelSearchPathLarge() async throws {
198 | let audioFilePath = try XCTUnwrap(
199 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"),
200 | "Audio file not found"
201 | )
202 |
203 | var config = WhisperKitConfig(model: "large-v3", verbose: true, logLevel: .debug)
204 | let pipe1 = try await WhisperKit(config)
205 | let transcriptionResult1: [TranscriptionResult] = try await pipe1.transcribe(audioPath: audioFilePath)
206 | XCTAssertFalse(transcriptionResult1.text.isEmpty)
207 |
208 | config = WhisperKitConfig(model: "distil*large-v3", verbose: true, logLevel: .debug)
209 | let pipe2 = try await WhisperKit(config)
210 | let transcriptionResult2: [TranscriptionResult] = try await pipe2.transcribe(audioPath: audioFilePath)
211 | XCTAssertFalse(transcriptionResult2.text.isEmpty)
212 |
213 | config = WhisperKitConfig(model: "distil*large-v3", verbose: true, logLevel: .debug)
214 | let pipe3 = try await WhisperKit(config)
215 | let transcriptionResult3: [TranscriptionResult] = try await pipe3.transcribe(audioPath: audioFilePath)
216 | XCTAssertFalse(transcriptionResult3.text.isEmpty)
217 | }
218 | }
219 |
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/8_Channel_ID.m4a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Tests/WhisperKitTests/Resources/8_Channel_ID.m4a
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/config-v02.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "whisperkit-coreml",
3 | "version": "0.2",
4 | "device_support": [
5 | {
6 | "identifiers": ["iPhone11", "iPhone12", "Watch7", "Watch8"],
7 | "models": {
8 | "default": "openai_whisper-tiny",
9 | "supported": [
10 | "openai_whisper-tiny",
11 | "openai_whisper-tiny.en",
12 | "openai_whisper-base",
13 | "openai_whisper-base.en"
14 | ]
15 | }
16 | },
17 | {
18 | "identifiers": ["iPhone13", "iPad13,18", "iPad13,1"],
19 | "models": {
20 | "default": "openai_whisper-base",
21 | "supported": [
22 | "openai_whisper-tiny",
23 | "openai_whisper-tiny.en",
24 | "openai_whisper-base",
25 | "openai_whisper-base.en",
26 | "openai_whisper-small",
27 | "openai_whisper-small.en"
28 | ]
29 | }
30 | },
31 | {
32 | "identifiers": [
33 | "iPhone14",
34 | "iPhone15",
35 | "iPhone16",
36 | "iPhone17",
37 | "iPad14,1",
38 | "iPad14,2"
39 | ],
40 | "models": {
41 | "default": "openai_whisper-base",
42 | "supported": [
43 | "openai_whisper-tiny",
44 | "openai_whisper-tiny.en",
45 | "openai_whisper-base",
46 | "openai_whisper-base.en",
47 | "openai_whisper-small",
48 | "openai_whisper-small.en",
49 | "openai_whisper-large-v2_949MB",
50 | "openai_whisper-large-v2_turbo_955MB",
51 | "openai_whisper-large-v3_947MB",
52 | "openai_whisper-large-v3_turbo_954MB",
53 | "distil-whisper_distil-large-v3_594MB",
54 | "distil-whisper_distil-large-v3_turbo_600MB",
55 | "openai_whisper-large-v3-v20240930_626MB",
56 | "openai_whisper-large-v3-v20240930_turbo_632MB"
57 | ]
58 | }
59 | },
60 | {
61 | "identifiers": [
62 | "Mac13",
63 | "iMac21",
64 | "MacBookAir10,1",
65 | "MacBookPro17",
66 | "MacBookPro18",
67 | "Macmini9",
68 | "iPad13,16",
69 | "iPad13,4",
70 | "iPad13,8"
71 | ],
72 | "models": {
73 | "default": "openai_whisper-large-v3-v20240930",
74 | "supported": [
75 | "openai_whisper-tiny",
76 | "openai_whisper-tiny.en",
77 | "openai_whisper-base",
78 | "openai_whisper-base.en",
79 | "openai_whisper-small",
80 | "openai_whisper-small.en",
81 | "openai_whisper-large-v2",
82 | "openai_whisper-large-v2_949MB",
83 | "openai_whisper-large-v3",
84 | "openai_whisper-large-v3_947MB",
85 | "distil-whisper_distil-large-v3",
86 | "distil-whisper_distil-large-v3_594MB",
87 | "openai_whisper-large-v3-v20240930",
88 | "openai_whisper-large-v3-v20240930_626MB"
89 | ]
90 | }
91 | },
92 | {
93 | "identifiers": [
94 | "Mac14",
95 | "Mac15",
96 | "Mac16",
97 | "iPad14,3",
98 | "iPad14,4",
99 | "iPad14,5",
100 | "iPad14,6",
101 | "iPad14,8",
102 | "iPad14,9",
103 | "iPad14,10",
104 | "iPad14,11",
105 | "iPad16"
106 | ],
107 | "models": {
108 | "default": "openai_whisper-large-v3-v20240930",
109 | "supported": [
110 | "openai_whisper-tiny",
111 | "openai_whisper-tiny.en",
112 | "openai_whisper-base",
113 | "openai_whisper-base.en",
114 | "openai_whisper-small",
115 | "openai_whisper-small.en",
116 | "openai_whisper-large-v2",
117 | "openai_whisper-large-v2_949MB",
118 | "openai_whisper-large-v2_turbo",
119 | "openai_whisper-large-v2_turbo_955MB",
120 | "openai_whisper-large-v3",
121 | "openai_whisper-large-v3_947MB",
122 | "openai_whisper-large-v3_turbo",
123 | "openai_whisper-large-v3_turbo_954MB",
124 | "distil-whisper_distil-large-v3",
125 | "distil-whisper_distil-large-v3_594MB",
126 | "distil-whisper_distil-large-v3_turbo",
127 | "distil-whisper_distil-large-v3_turbo_600MB",
128 | "openai_whisper-large-v3-v20240930",
129 | "openai_whisper-large-v3-v20240930_turbo",
130 | "openai_whisper-large-v3-v20240930_626MB",
131 | "openai_whisper-large-v3-v20240930_turbo_632MB"
132 | ]
133 | }
134 | }
135 | ]
136 | }
137 |
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/config-v03.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "whisperkit-coreml",
3 | "version": "0.3",
4 | "device_support": [
5 | {
6 | "chips": "A12, A13, S9, S10",
7 | "identifiers": [
8 | "iPhone11",
9 | "iPhone12",
10 | "Watch7",
11 | "Watch8"
12 | ],
13 | "models": {
14 | "default": "openai_whisper-tiny",
15 | "supported": [
16 | "openai_whisper-tiny",
17 | "openai_whisper-tiny.en",
18 | "openai_whisper-base",
19 | "openai_whisper-base.en"
20 | ]
21 | }
22 | },
23 | {
24 | "chips": "A14",
25 | "identifiers": [
26 | "iPhone13",
27 | "iPad13,1",
28 | "iPad13,2",
29 | "iPad13,18",
30 | "iPad13,19"
31 | ],
32 | "models": {
33 | "default": "openai_whisper-base",
34 | "supported": [
35 | "openai_whisper-tiny",
36 | "openai_whisper-tiny.en",
37 | "openai_whisper-base",
38 | "openai_whisper-base.en",
39 | "openai_whisper-small",
40 | "openai_whisper-small.en"
41 | ]
42 | }
43 | },
44 | {
45 | "chips": "A15, A16, A17 Pro, A18",
46 | "identifiers": [
47 | "iPhone14",
48 | "iPhone15",
49 | "iPhone16",
50 | "iPhone17",
51 | "iPad14,1",
52 | "iPad14,2",
53 | "iPad15,7",
54 | "iPad15,8",
55 | "iPad16,1",
56 | "iPad16,2"
57 | ],
58 | "models": {
59 | "default": "openai_whisper-base",
60 | "supported": [
61 | "openai_whisper-tiny",
62 | "openai_whisper-tiny.en",
63 | "openai_whisper-base",
64 | "openai_whisper-base.en",
65 | "openai_whisper-small",
66 | "openai_whisper-small.en",
67 | "openai_whisper-large-v2_949MB",
68 | "openai_whisper-large-v2_turbo_955MB",
69 | "openai_whisper-large-v3_947MB",
70 | "openai_whisper-large-v3_turbo_954MB",
71 | "distil-whisper_distil-large-v3_594MB",
72 | "distil-whisper_distil-large-v3_turbo_600MB",
73 | "openai_whisper-large-v3-v20240930_626MB",
74 | "openai_whisper-large-v3-v20240930_turbo_632MB"
75 | ]
76 | }
77 | },
78 | {
79 | "chips": "M1",
80 | "identifiers": [
81 | "MacBookPro17,1",
82 | "MacBookPro18,1",
83 | "MacBookPro18,2",
84 | "MacBookPro18,3",
85 | "MacBookPro18,4",
86 | "MacBookAir10,1",
87 | "Macmini9,1",
88 | "iMac21,1",
89 | "iMac21,2",
90 | "Mac13",
91 | "iPad13,4",
92 | "iPad13,5",
93 | "iPad13,6",
94 | "iPad13,7",
95 | "iPad13,8",
96 | "iPad13,9",
97 | "iPad13,10",
98 | "iPad13,11",
99 | "iPad13,16",
100 | "iPad13,17"
101 | ],
102 | "models": {
103 | "default": "openai_whisper-large-v3-v20240930_626MB",
104 | "supported": [
105 | "openai_whisper-tiny",
106 | "openai_whisper-tiny.en",
107 | "openai_whisper-base",
108 | "openai_whisper-base.en",
109 | "openai_whisper-small",
110 | "openai_whisper-small.en",
111 | "openai_whisper-large-v2",
112 | "openai_whisper-large-v2_949MB",
113 | "openai_whisper-large-v3",
114 | "openai_whisper-large-v3_947MB",
115 | "distil-whisper_distil-large-v3",
116 | "distil-whisper_distil-large-v3_594MB",
117 | "openai_whisper-large-v3-v20240930_626MB"
118 | ]
119 | }
120 | },
121 | {
122 | "chips": "M2, M3, M4",
123 | "identifiers": [
124 | "Mac14",
125 | "Mac15",
126 | "Mac16",
127 | "iPad14,3",
128 | "iPad14,4",
129 | "iPad14,5",
130 | "iPad14,6",
131 | "iPad14,8",
132 | "iPad14,9",
133 | "iPad14,10",
134 | "iPad14,11",
135 | "iPad15",
136 | "iPad16"
137 | ],
138 | "models": {
139 | "default": "openai_whisper-large-v3-v20240930",
140 | "supported": [
141 | "openai_whisper-tiny",
142 | "openai_whisper-tiny.en",
143 | "openai_whisper-base",
144 | "openai_whisper-base.en",
145 | "openai_whisper-small",
146 | "openai_whisper-small.en",
147 | "openai_whisper-large-v2",
148 | "openai_whisper-large-v2_949MB",
149 | "openai_whisper-large-v2_turbo",
150 | "openai_whisper-large-v2_turbo_955MB",
151 | "openai_whisper-large-v3",
152 | "openai_whisper-large-v3_947MB",
153 | "openai_whisper-large-v3_turbo",
154 | "openai_whisper-large-v3_turbo_954MB",
155 | "distil-whisper_distil-large-v3",
156 | "distil-whisper_distil-large-v3_594MB",
157 | "distil-whisper_distil-large-v3_turbo",
158 | "distil-whisper_distil-large-v3_turbo_600MB",
159 | "openai_whisper-large-v3-v20240930",
160 | "openai_whisper-large-v3-v20240930_turbo",
161 | "openai_whisper-large-v3-v20240930_626MB",
162 | "openai_whisper-large-v3-v20240930_turbo_632MB"
163 | ]
164 | }
165 | }
166 | ],
167 | "model_checksums": {
168 | "distil-whisper_distil-large-v3": "9cd8271143b919402ae776c30b479565",
169 | "distil-whisper_distil-large-v3_594MB": "ca532f45ddbf8a3d241132cc5cf41639",
170 | "distil-whisper_distil-large-v3_turbo": "b8638452c6568dfe33a33bfcc2bc6aca",
171 | "distil-whisper_distil-large-v3_turbo_600MB": "81746b4b1afbbb01a8ae9ea452460d88",
172 | "openai_whisper-base.en": "fbcfd586f15e2952251b1d3257f18471",
173 | "openai_whisper-base": "36e60501ad0f01c1a5719e83a1f63f20",
174 | "openai_whisper-large-v2": "21b86c07318aeeef54598f15b7903979",
175 | "openai_whisper-large-v2_949MB": "71bad4e1566749d1060eda42308d9fb4",
176 | "openai_whisper-large-v2_turbo": "7734959b6550e7b5c2d732bf2b7acd23",
177 | "openai_whisper-large-v2_turbo_955MB": "cb6411862a48ec75325572081f01e5b5",
178 | "openai_whisper-large-v3-v20240930": "17ebd78ff7edfa59001b554e9cc4c021",
179 | "openai_whisper-large-v3-v20240930_547MB": "c945dad68449ac3c78ecb2d561ac189d",
180 | "openai_whisper-large-v3-v20240930_626MB": "578fe5a07f4eb7e4187c920bca571aa5",
181 | "openai_whisper-large-v3-v20240930_turbo": "dfbf09ab741af1d5400ddbd07bb37dad",
182 | "openai_whisper-large-v3-v20240930_turbo_632MB": "33954440dbd785ca1828afe25514f5a5",
183 | "openai_whisper-large-v3": "a6f24dc72785722e9cea89e227856dfe",
184 | "openai_whisper-large-v3_947MB": "ef6b0e9622a046ce2361b4c72307877f",
185 | "openai_whisper-large-v3_turbo": "c550fbdea70c5784d322c0a427f8b5cd",
186 | "openai_whisper-large-v3_turbo_954MB": "e639c4bb98d905064ef5dd38757dd9d1",
187 | "openai_whisper-small.en": "38efe6a00706bbdb995795c67a836e5e",
188 | "openai_whisper-small": "f1d21adb950bc9be5d5343bcdeccd23b",
189 | "openai_whisper-tiny.en": "e1183fd55448923b1ce43a2da67aa21f",
190 | "openai_whisper-tiny": "7147518a3d68ddbea0691e04cfffa4ff"
191 | }
192 | }
193 |
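The configs above map device-identifier prefixes (annotated with chip families in v0.3) to a default model plus the list of models supported on that hardware, with `model_checksums` available for download verification. As a rough illustration of how a client might consume a file with this shape, here is a minimal Swift sketch; the `ModelSupportConfig` types, the `recommendedModel` helper, and the prefix-matching rule are assumptions for this example, not WhisperKit's actual API.

```swift
import Foundation

// Minimal Codable mirror of the schema shown above (assumed, for illustration only).
struct ModelSupportConfig: Codable {
    struct DeviceSupport: Codable {
        struct Models: Codable {
            let `default`: String
            let supported: [String]
        }

        let chips: String?        // present in v0.3, absent in v0.2
        let identifiers: [String] // device identifier prefixes, e.g. "iPhone16", "Mac14"
        let models: Models
    }

    let name: String
    let version: String
    let deviceSupport: [DeviceSupport]
    let modelChecksums: [String: String]? // only present in v0.3

    enum CodingKeys: String, CodingKey {
        case name, version
        case deviceSupport = "device_support"
        case modelChecksums = "model_checksums"
    }
}

/// Naive lookup: return the default model of the first entry whose identifier
/// list contains a prefix of the given device identifier (e.g. "Mac14,12" -> "Mac14").
func recommendedModel(for deviceIdentifier: String, in config: ModelSupportConfig) -> String? {
    config.deviceSupport
        .first { entry in
            entry.identifiers.contains { deviceIdentifier.hasPrefix($0) }
        }?
        .models.default
}

// Example usage, assuming the JSON above is saved locally as config-v03.json:
// let data = try Data(contentsOf: URL(fileURLWithPath: "config-v03.json"))
// let config = try JSONDecoder().decode(ModelSupportConfig.self, from: data)
// print(recommendedModel(for: "Mac14,12", in: config) ?? "no match")
```

A real implementation would also need a tie-breaking rule when several identifier prefixes match (for example, "iPad14,1" versus "iPad14,10"), which this sketch deliberately ignores.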
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/es_test_clip.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Tests/WhisperKitTests/Resources/es_test_clip.wav
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/ja_test_clip.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Tests/WhisperKitTests/Resources/ja_test_clip.wav
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/jfk.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Tests/WhisperKitTests/Resources/jfk.wav
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/jfk_441khz.m4a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Tests/WhisperKitTests/Resources/jfk_441khz.m4a
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/Resources/ted_60.m4a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Tests/WhisperKitTests/Resources/ted_60.m4a
--------------------------------------------------------------------------------
/Tests/WhisperKitTests/TestUtils.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
3 |
4 | import Accelerate
5 | import AVFAudio
6 | import Combine
7 | import CoreML
8 | import Foundation
9 | @testable import WhisperKit
10 | import XCTest
11 |
12 | enum TestError: Error {
13 | case missingFile(String)
14 | case missingDirectory(String)
15 | }
16 |
17 | @discardableResult
18 | func XCTUnwrapAsync<T>(
19 | _ expression: @autoclosure () async throws -> T,
20 | _ message: @autoclosure () -> String = "",
21 | file: StaticString = #filePath,
22 | line: UInt = #line
23 | ) async throws -> T {
24 | let evaluated = try? await expression()
25 | return try XCTUnwrap(evaluated, message(), file: file, line: line)
26 | }
27 |
28 | @discardableResult
29 | func XCTUnwrapAsync<T>(
30 | _ expression: @autoclosure () async throws -> T?,
31 | _ message: @autoclosure () -> String = "",
32 | file: StaticString = #filePath,
33 | line: UInt = #line
34 | ) async throws -> T {
35 | let evaluated = try? await expression()
36 | return try XCTUnwrap(evaluated, message(), file: file, line: line)
37 | }
38 |
39 | func XCTAssertNoThrowAsync<T>(
40 | _ expression: @autoclosure () async throws -> T,
41 | _ message: @autoclosure () -> String = "",
42 | file: StaticString = #filePath,
43 | line: UInt = #line
44 | ) async {
45 | do {
46 | _ = try await expression()
47 | } catch {
48 | XCTFail(message(), file: file, line: line)
49 | }
50 | }
51 |
52 | func XCTAssertNoThrowAsync<T>(
53 | _ expression: @autoclosure () async throws -> T?,
54 | _ message: @autoclosure () -> String = "",
55 | file: StaticString = #filePath,
56 | line: UInt = #line
57 | ) async {
58 | do {
59 | _ = try await expression()
60 | } catch {
61 | XCTFail(message(), file: file, line: line)
62 | }
63 | }
64 |
65 | func XCTAssertNoThrowAsync(
66 | _ expression: @autoclosure () async throws -> Void,
67 | _ message: @autoclosure () -> String = "",
68 | file: StaticString = #filePath,
69 | line: UInt = #line
70 | ) async {
71 | do {
72 | try await expression()
73 | } catch {
74 | XCTFail(message(), file: file, line: line)
75 | }
76 | }
77 |
78 | // MARK: Helpers
79 |
80 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
81 | extension Bundle {
82 | static func current(for classObject: AnyObject? = nil) -> Bundle {
83 | #if SWIFT_PACKAGE
84 | return Bundle.module
85 | #else
86 | // Use bundle for class type if passed in
87 | if let classObject = classObject {
88 | return Bundle(for: type(of: classObject))
89 | } else {
90 | return Bundle.main
91 | }
92 | #endif
93 | }
94 | }
95 |
96 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
97 | extension FileManager {
98 | func allocatedSizeOfDirectory(at url: URL) throws -> Int64 {
99 | guard let enumerator = enumerator(at: url, includingPropertiesForKeys: [.totalFileAllocatedSizeKey, .fileAllocatedSizeKey]) else {
100 | throw NSError(domain: NSCocoaErrorDomain, code: NSFileReadUnknownError, userInfo: nil)
101 | }
102 |
103 | var accumulatedSize: Int64 = 0
104 | for case let fileURL as URL in enumerator {
105 | let resourceValues = try fileURL.resourceValues(forKeys: [.totalFileAllocatedSizeKey, .fileAllocatedSizeKey])
106 | accumulatedSize += Int64(resourceValues.totalFileAllocatedSize ?? resourceValues.fileAllocatedSize ?? 0)
107 | }
108 | return accumulatedSize
109 | }
110 | }
111 |
112 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
113 | extension MLMultiArray {
114 | /// Create `MLMultiArray` of shape [1, 1, arr.count] and fill up the last
115 |     /// dimension with values from arr.
116 | static func logits(_ arr: [FloatType]) throws -> MLMultiArray {
117 | let logits = try MLMultiArray(shape: [1, 1, arr.count] as [NSNumber], dataType: .float16)
118 | let ptr = UnsafeMutablePointer<FloatType>(OpaquePointer(logits.dataPointer))
119 | for (index, value) in arr.enumerated() {
120 | let linearOffset = logits.linearOffset(for: [0, 0, index as NSNumber])
121 | ptr[linearOffset] = value
122 | }
123 | return logits
124 | }
125 |
126 | /// Get the data from `MLMultiArray` for given dimension
127 | func data(for dimension: Int) -> [FloatType] {
128 | let count = shape[dimension].intValue
129 | let indexes = stride(from: 0, to: count, by: 1).map { [0, 0, $0 as NSNumber] }
130 | var result = [FloatType]()
131 | let ptr = UnsafeMutablePointer<FloatType>(OpaquePointer(dataPointer))
132 | for index in indexes {
133 | let linearOffset = linearOffset(for: index as [NSNumber])
134 | result.append(ptr[linearOffset])
135 | }
136 | return result
137 | }
138 | }
139 |
140 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
141 | extension XCTestCase {
142 | func transcribe(
143 | with variant: ModelVariant,
144 | options: DecodingOptions,
145 | callback: TranscriptionCallback = nil,
146 | audioFile: String = "jfk.wav",
147 | file: StaticString = #file,
148 | line: UInt = #line
149 | ) async throws -> [TranscriptionResult] {
150 | let modelName: String
151 | switch variant {
152 | case .largev3:
153 | modelName = "large-v3"
154 | default:
155 | modelName = "tiny"
156 | }
157 | let config = WhisperKitConfig(model: modelName, verbose: true, logLevel: .debug)
158 | let whisperKit = try await WhisperKit(config)
159 | trackForMemoryLeaks(on: whisperKit, file: file, line: line)
160 |
161 | let audioComponents = audioFile.components(separatedBy: ".")
162 | guard let audioFileURL = Bundle.current(for: self).path(forResource: audioComponents.first, ofType: audioComponents.last) else {
163 | throw TestError.missingFile("Missing audio file")
164 | }
165 | return try await whisperKit.transcribe(audioPath: audioFileURL, decodeOptions: options, callback: callback)
166 | }
167 |
168 | func tinyModelPath() async throws -> String {
169 | let modelDir = try await WhisperKit.download(variant: "tiny").path()
170 | return modelDir
171 | }
172 |
173 | func largev3ModelPath() throws -> String {
174 | let modelDir = "whisperkit-coreml/openai_whisper-large-v3" // use faster-to-compile model for tests
175 | guard let modelPath = Bundle.current(for: self).urls(forResourcesWithExtension: "mlmodelc", subdirectory: modelDir)?.first?.deletingLastPathComponent().path else {
176 | throw TestError.missingFile("Failed to load model, ensure \"Models/\(modelDir)\" exists via Makefile command: `make download-models`")
177 | }
178 | return modelPath
179 | }
180 |
181 | func largev3TurboModelPath() throws -> String {
182 | let modelDir = "whisperkit-coreml/openai_whisper-large-v3_turbo"
183 | guard let modelPath = Bundle.current(for: self).urls(forResourcesWithExtension: "mlmodelc", subdirectory: modelDir)?.first?.deletingLastPathComponent().path else {
184 | throw TestError.missingFile("Failed to load model, ensure \"Models/\(modelDir)\" exists via Makefile command: `make download-models`")
185 | }
186 | return modelPath
187 | }
188 |
189 | func allModelPaths() throws -> [String] {
190 | let fileManager = FileManager.default
191 | var modelPaths: [String] = []
192 | let directory = "whisperkit-coreml"
193 | let resourceKeys: [URLResourceKey] = [.isDirectoryKey]
194 | guard let baseurl = Bundle.current(for: self).resourceURL?.appendingPathComponent(directory) else {
195 | throw TestError.missingDirectory("Base URL for directory \(directory) not found.")
196 | }
197 | let directoryContents = try fileManager.contentsOfDirectory(at: baseurl, includingPropertiesForKeys: resourceKeys, options: .skipsHiddenFiles)
198 | for folderURL in directoryContents {
199 | let resourceValues = try folderURL.resourceValues(forKeys: Set(resourceKeys))
200 | if resourceValues.isDirectory == true {
201 | // Check if the directory contains actual data files, or if it contains pointer files.
202 | // As a proxy, use the MelSpectrogram.mlmodelc/coremldata.bin file.
203 | let proxyFileToCheck = folderURL.appendingPathComponent("MelSpectrogram.mlmodelc/coremldata.bin")
204 | if try isGitLFSPointerFile(url: proxyFileToCheck) {
205 | continue
206 | }
207 |
208 | // Check if the directory name contains the quantization pattern
209 | // Only test large quantized models
210 | let dirName = folderURL.lastPathComponent
211 | if !(dirName.contains("q") && !dirName.contains("large")) {
212 | modelPaths.append(folderURL.absoluteString)
213 | }
214 | }
215 | }
216 | return modelPaths
217 | }
218 |
219 | /// Function to check if the beginning of the file matches a Git LFS pointer pattern
220 | func isGitLFSPointerFile(url: URL) throws -> Bool {
221 | let fileHandle = try FileHandle(forReadingFrom: url)
222 | // Read the first few bytes of the file to get enough for the Git LFS pointer signature
223 | let data = fileHandle.readData(ofLength: 512) // Read first 512 bytes
224 | fileHandle.closeFile()
225 | if let string = String(data: data, encoding: .utf8),
226 | string.starts(with: "version https://git-lfs.github.com/")
227 | {
228 | return true
229 | }
230 | return false
231 | }
232 |
233 | func trackForMemoryLeaks(on instance: AnyObject, file: StaticString = #filePath, line: UInt = #line) {
234 | addTeardownBlock { [weak instance] in
235 | XCTAssertNil(instance, "Detected potential memory leak", file: file, line: line)
236 | }
237 | }
238 |
239 | /// Helper to create an extended audio buffer by repeating the original buffer
240 | func createExtendedBuffer(from originalBuffer: AVAudioPCMBuffer, repeatCount: Int) -> AVAudioPCMBuffer {
241 | let frameCount = originalBuffer.frameLength
242 | let totalFrames = frameCount * AVAudioFrameCount(repeatCount)
243 |
244 | // Create new buffer with same format but longer length
245 | let extendedBuffer = AVAudioPCMBuffer(
246 | pcmFormat: originalBuffer.format,
247 | frameCapacity: totalFrames
248 | )!
249 | extendedBuffer.frameLength = totalFrames
250 |
251 | // For each channel
252 | for channel in 0..<Int(originalBuffer.format.channelCount) {
253 | guard let sourceData = originalBuffer.floatChannelData?[channel],
254 | let destinationData = extendedBuffer.floatChannelData?[channel]
255 | else {
256 | continue
257 | }
258 | 
259 | // Copy the original frames into the extended buffer
260 | // `repeatCount` times, one channel at a time
261 | for repetition in 0..<repeatCount {
262 | let destinationOffset = Int(frameCount) * repetition
263 | memcpy(
264 | destinationData + destinationOffset,
265 | sourceData,
266 | Int(frameCount) * MemoryLayout<Float>.size
267 | )
268 | }
269 | }
270 | 
271 | return extendedBuffer
272 | }
273 | 
274 | /// Helper to load an audio file into an `AVAudioPCMBuffer`, preserving
275 | /// the file's original processing format and channel layout (no mono
276 | /// downmixing is performed here)
277 | func loadAudioBuffer(fromPath audioFilePath: String) throws -> AVAudioPCMBuffer {
278 | guard FileManager.default.fileExists(atPath: audioFilePath) else {
279 | throw WhisperError.loadAudioFailed("Resource path does not exist \(audioFilePath)")
280 | }
281 |
282 | let audioFileURL = URL(fileURLWithPath: audioFilePath)
283 |
284 | // Create an audio file with original format preserved
285 | let audioFile = try AVAudioFile(forReading: audioFileURL)
286 |
287 | // Create a buffer with the original format (preserving all channels)
288 | guard let buffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat,
289 | frameCapacity: AVAudioFrameCount(audioFile.length))
290 | else {
291 | throw WhisperError.loadAudioFailed("Unable to create audio buffer")
292 | }
293 |
294 | // Read the entire file into the buffer
295 | try audioFile.read(into: buffer)
296 |
297 | return buffer
298 | }
299 |
300 | /// Helper to measure channel processing operations
301 | func measureChannelProcessing(buffer: AVAudioPCMBuffer, mode: AudioInputConfig.ChannelMode, iterations: Int = 5) -> Double {
302 | // Add warm-up iterations
303 | for _ in 0..<3 {
304 | _ = AudioProcessor.convertToMono(buffer, mode: mode)
305 | }
306 |
307 | var totalTime: Double = 0
308 | // Then measure the actual timing
309 | for _ in 0..<iterations {
310 | let startTime = CFAbsoluteTimeGetCurrent()
311 | _ = AudioProcessor.convertToMono(buffer, mode: mode)
312 | totalTime += CFAbsoluteTimeGetCurrent() - startTime
313 | }
314 | return totalTime / Double(iterations)
315 | }
316 | }
317 | 
318 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
319 | extension SpecialTokens {
320 | /// Convenience factory for tests; token IDs default to 0 unless overridden
321 | static func `default`(
322 | endToken: Int = 0,
323 | englishToken: Int = 0,
324 | noSpeechToken: Int = 0,
325 | noTimestampsToken: Int = 0,
326 | specialTokenBegin: Int = 0,
327 | startOfPreviousToken: Int = 0,
328 | startOfTranscriptToken: Int = 0,
329 | timeTokenBegin: Int = 0,
330 | transcribeToken: Int = 0,
331 | translateToken: Int = 0,
332 | whitespaceToken: Int = 0
333 | ) -> SpecialTokens {
334 | SpecialTokens(
335 | endToken: endToken,
336 | englishToken: englishToken,
337 | noSpeechToken: noSpeechToken,
338 | noTimestampsToken: noTimestampsToken,
339 | specialTokenBegin: specialTokenBegin,
340 | startOfPreviousToken: startOfPreviousToken,
341 | startOfTranscriptToken: startOfTranscriptToken,
342 | timeTokenBegin: timeTokenBegin,
343 | transcribeToken: transcribeToken,
344 | translateToken: translateToken,
345 | whitespaceToken: whitespaceToken
346 | )
347 | }
348 | }
349 |
350 | extension Result {
351 | var isSuccess: Bool {
352 | switch self {
353 | case .success:
354 | return true
355 | case .failure:
356 | return false
357 | }
358 | }
359 |
360 | func whisperError() -> WhisperError? {
361 | switch self {
362 | case .success:
363 | return nil
364 | case let .failure(error):
365 | return error as? WhisperError
366 | }
367 | }
368 | }
369 |
370 | extension Result where Success == [TranscriptionResult] {
371 | func normalizedText(prefix: Int) throws -> String {
372 | try get().text.normalized.split(separator: " ").prefix(prefix).joined(separator: " ")
373 | }
374 | }
375 |
376 | extension Collection where Element == TranscriptionResult {
377 | var text: String {
378 | map(\.text).joined(separator: " ")
379 | }
380 | }
381 |
382 | extension Collection where Element == TranscriptionResult {
383 | var segments: [TranscriptionSegment] {
384 | flatMap(\.segments)
385 | }
386 | }
387 |
388 | public extension Publisher {
389 | func withPrevious() -> AnyPublisher<(previous: Output?, current: Output), Failure> {
390 | scan((Output?, Output)?.none) { ($0?.1, $1) }
391 | .compactMap { $0 }
392 | .eraseToAnyPublisher()
393 | }
394 | }
395 |
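The `withPrevious()` publisher extension defined above pairs each emitted value with the one emitted before it, which makes it easy to assert on state transitions in Combine-based tests. A small self-contained usage sketch follows; the subject, values, and printed output are illustrative only:

```swift
import Combine

var cancellables = Set<AnyCancellable>()
let states = PassthroughSubject<String, Never>()

states
    .withPrevious()
    .sink { previous, current in
        // The first element arrives with `previous == nil`
        print("\(previous ?? "<none>") -> \(current)")
    }
    .store(in: &cancellables)

states.send("loading")      // prints "<none> -> loading"
states.send("transcribing") // prints "loading -> transcribing"
states.send("finished")     // prints "transcribing -> finished"
```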
--------------------------------------------------------------------------------
/fastlane/README.md:
--------------------------------------------------------------------------------
1 | fastlane documentation
2 | ----
3 |
4 | # Installation
5 |
6 | Make sure you have the latest version of the Xcode command line tools installed:
7 |
8 | ```sh
9 | xcode-select --install
10 | ```
11 |
12 | For _fastlane_ installation instructions, see [Installing _fastlane_](https://docs.fastlane.tools/#installing-fastlane)
13 |
14 | # Available Actions
15 |
16 | ## iOS
17 |
18 | ### ios list_devices
19 |
20 | ```sh
21 | [bundle exec] fastlane ios list_devices
22 | ```
23 |
24 | List all connected devices
25 |
26 | ### ios benchmark
27 |
28 | ```sh
29 | [bundle exec] fastlane ios benchmark
30 | ```
31 |
32 | Benchmark devices with options
33 |
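Lane options are passed on the command line as `key:value` pairs and forwarded to the lane by _fastlane_; the exact keys this lane accepts are defined in the project's `Fastfile`, so the option shown below is only illustrative:

```sh
[bundle exec] fastlane ios benchmark devices:"iPhone 15 Pro"
```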
34 | ### ios extract_results
35 |
36 | ```sh
37 | [bundle exec] fastlane ios extract_results
38 | ```
39 |
40 | Extract benchmark results
41 |
42 | ### ios upload_results
43 |
44 | ```sh
45 | [bundle exec] fastlane ios upload_results
46 | ```
47 |
48 | Upload benchmark results
49 |
50 | ----
51 |
52 | This README.md is auto-generated and will be re-generated every time [_fastlane_](https://fastlane.tools) is run.
53 |
54 | More information about _fastlane_ can be found on [fastlane.tools](https://fastlane.tools).
55 |
56 | The documentation of _fastlane_ can be found on [docs.fastlane.tools](https://docs.fastlane.tools).
57 |
--------------------------------------------------------------------------------