├── .github └── workflows │ ├── development-tests.yml │ ├── expo-update.yml │ ├── homebrew-update.yml │ ├── pre-release-tests.yml │ └── unit-tests.yml ├── .gitignore ├── .spi.yml ├── .swiftpm └── configuration │ └── Package.resolved ├── BENCHMARKS.md ├── CONTRIBUTING.md ├── Examples └── WhisperAX │ ├── Debug.xcconfig │ ├── WhisperAX.xcodeproj │ ├── project.pbxproj │ ├── project.xcworkspace │ │ ├── contents.xcworkspacedata │ │ └── xcshareddata │ │ │ ├── IDEWorkspaceChecks.plist │ │ │ └── swiftpm │ │ │ └── Package.resolved │ └── xcshareddata │ │ └── xcschemes │ │ └── WhisperAX.xcscheme │ ├── WhisperAX │ ├── Info.plist │ ├── Preview Content │ │ └── Preview Assets.xcassets │ │ │ └── Contents.json │ ├── Resources │ │ ├── Assets.xcassets │ │ │ ├── AppIcon.appiconset │ │ │ │ ├── 100.png │ │ │ │ ├── 102.png │ │ │ │ ├── 1024 1.png │ │ │ │ ├── 1024 2.png │ │ │ │ ├── 1024.png │ │ │ │ ├── 108.png │ │ │ │ ├── 114.png │ │ │ │ ├── 120 1.png │ │ │ │ ├── 120.png │ │ │ │ ├── 128 1.png │ │ │ │ ├── 128.png │ │ │ │ ├── 136.png │ │ │ │ ├── 152.png │ │ │ │ ├── 16.png │ │ │ │ ├── 167.png │ │ │ │ ├── 172.png │ │ │ │ ├── 180.png │ │ │ │ ├── 192.png │ │ │ │ ├── 196.png │ │ │ │ ├── 216.png │ │ │ │ ├── 234.png │ │ │ │ ├── 256.png │ │ │ │ ├── 258.png │ │ │ │ ├── 32.png │ │ │ │ ├── 40.png │ │ │ │ ├── 44.png │ │ │ │ ├── 48.png │ │ │ │ ├── 512.png │ │ │ │ ├── 55.png │ │ │ │ ├── 58 1.png │ │ │ │ ├── 58.png │ │ │ │ ├── 60 1.png │ │ │ │ ├── 60.png │ │ │ │ ├── 64 1.png │ │ │ │ ├── 64.png │ │ │ │ ├── 66.png │ │ │ │ ├── 76.png │ │ │ │ ├── 80 1.png │ │ │ │ ├── 80.png │ │ │ │ ├── 87 1.png │ │ │ │ ├── 87.png │ │ │ │ ├── 88.png │ │ │ │ ├── 92.png │ │ │ │ └── Contents.json │ │ │ └── Contents.json │ │ ├── Info.plist │ │ └── WhisperAX.entitlements │ ├── Views │ │ └── ContentView.swift │ └── WhisperAXApp.swift │ ├── WhisperAXTests │ ├── WhisperAXTests.swift │ └── WhisperKitTests │ ├── WhisperAXUITests │ ├── WhisperAXUITests.swift │ └── WhisperAXUITestsLaunchTests.swift │ ├── WhisperAXWatchApp │ ├── Assets.xcassets │ │ ├── AccentColor.colorset │ │ │ └── Contents.json │ │ ├── AppIcon.appiconset │ │ │ ├── Contents.json │ │ │ └── appstore.png │ │ └── Contents.json │ ├── Preview Content │ │ └── Preview Assets.xcassets │ │ │ └── Contents.json │ ├── WhisperAXExampleView.swift │ └── WhisperAXWatchApp.swift │ ├── WhisperAXWatchAppTests │ └── WhisperAX_Watch_AppTests.swift │ └── WhisperAXWatchAppUITests │ ├── WhisperAX_Watch_AppUITests.swift │ └── WhisperAX_Watch_AppUITestsLaunchTests.swift ├── LICENSE ├── Makefile ├── Package.resolved ├── Package.swift ├── README.md ├── Sources ├── WhisperKit │ └── Core │ │ ├── Audio │ │ ├── AudioChunker.swift │ │ ├── AudioProcessor.swift │ │ ├── AudioStreamTranscriber.swift │ │ ├── EnergyVAD.swift │ │ └── VoiceActivityDetector.swift │ │ ├── AudioEncoder.swift │ │ ├── Configurations.swift │ │ ├── FeatureExtractor.swift │ │ ├── Models.swift │ │ ├── ResultWriter.swift │ │ ├── Text │ │ ├── LogitsFilter.swift │ │ ├── SegmentSeeker.swift │ │ └── TokenSampler.swift │ │ ├── TextDecoder.swift │ │ ├── TranscribeTask.swift │ │ ├── Utils │ │ ├── Concurrency.swift │ │ └── Utils.swift │ │ └── WhisperKit.swift └── WhisperKitCLI │ ├── CLIArguments.swift │ ├── CLIUtils.swift │ ├── TranscribeCLI.swift │ └── WhisperKitCLI.swift ├── Tests └── WhisperKitTests │ ├── Evaluate │ ├── DistanceCalculation.swift │ ├── NormalizeEn.swift │ ├── SpellingMapping.swift │ └── WERUtils.swift │ ├── FunctionalTests.swift │ ├── RegressionTestUtils.swift │ ├── RegressionTests.swift │ ├── Resources │ ├── 8_Channel_ID.m4a │ ├── config-v02.json │ ├── 
config-v03.json │ ├── es_test_clip.wav │ ├── ja_test_clip.wav │ ├── jfk.wav │ ├── jfk_441khz.m4a │ └── ted_60.m4a │ ├── TestUtils.swift │ └── UnitTests.swift └── fastlane ├── Fastfile └── README.md /.github/workflows/development-tests.yml: -------------------------------------------------------------------------------- 1 | name: Development Tests 2 | 3 | on: 4 | pull_request: 5 | pull_request_review: 6 | types: [submitted] 7 | workflow_dispatch: 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | build-and-test: 15 | name: "Build and Test" 16 | uses: ./.github/workflows/unit-tests.yml 17 | with: 18 | ios-version: "18.2" 19 | ios-device: "iPhone 16" 20 | macos-runner: "macos-15" 21 | 22 | check-approvals: 23 | runs-on: ubuntu-latest 24 | outputs: 25 | reviews: ${{ steps.reviews.outputs.state }} 26 | permissions: 27 | pull-requests: read 28 | contents: read 29 | steps: 30 | - uses: actions/checkout@v4 31 | - name: Check Approvals 32 | id: reviews 33 | env: 34 | GH_TOKEN: ${{ github.token }} 35 | pr: ${{ github.event.pull_request.number }} 36 | run: | 37 | echo "Checking PR approval for: $pr" 38 | state=$(gh pr view $pr --json reviewDecision --jq '.reviewDecision') 39 | echo "Review decision state: $state" 40 | echo "state=$state" >> "$GITHUB_OUTPUT" 41 | 42 | pre-merge-tests: 43 | name: "Pre-merge Tests" 44 | needs: [check-approvals] 45 | if: needs.check-approvals.outputs.reviews == 'APPROVED' || github.event_name == 'workflow_dispatch' 46 | strategy: 47 | matrix: 48 | include: 49 | - os: macos-13-xlarge 50 | ios-version: "17.2" 51 | ios-device: "iPhone 14" 52 | xcode-version: "15.2" 53 | - os: macos-14 54 | ios-version: "17.2" 55 | ios-device: "iPhone 15" 56 | xcode-version: "15.2" 57 | uses: ./.github/workflows/unit-tests.yml 58 | with: 59 | macos-runner: ${{ matrix.os }} 60 | ios-version: ${{ matrix.ios-version }} 61 | ios-device: ${{ matrix.ios-device }} 62 | xcode-version: ${{ matrix.xcode-version }} 63 | -------------------------------------------------------------------------------- /.github/workflows/expo-update.yml: -------------------------------------------------------------------------------- 1 | # Tested on MacOS with: 2 | # act -s COMMITTER_TOKEN="$(gh auth token)" release --container-architecture linux/amd64 -P ubuntu-latest=catthehacker/ubuntu:act-latest -e <(echo '{ "release": { "tag_name": "v0.0.0" }}') 3 | name: Update whisper-kit-expo 4 | 5 | on: 6 | release: 7 | types: [released] 8 | 9 | jobs: 10 | update-whisperkit: 11 | runs-on: ubuntu-latest 12 | env: 13 | TAG: ${{ github.event.release.tag_name }} 14 | BRANCH_NAME: update-whisperkit-${{ github.event.release.tag_name }} 15 | GH_TOKEN: ${{ secrets.COMMITTER_TOKEN }} 16 | steps: 17 | - name: Checkout whisper-kit-expo 18 | uses: actions/checkout@v4 19 | with: 20 | repository: seb-sep/whisper-kit-expo 21 | token: ${{ secrets.COMMITTER_TOKEN }} 22 | ref: main 23 | 24 | - name: Setup Node 25 | uses: actions/setup-node@v4 26 | with: 27 | node-version: '20.x' 28 | 29 | - name: New branch 30 | run: | 31 | git checkout -b $BRANCH_NAME 32 | echo ${{ github.event.release }} 33 | echo "Release tag is $TAG" 34 | 35 | - name: Update package.json version 36 | run: | 37 | PACKAGE_PATH="package.json" 38 | if [ ! -f "$PACKAGE_PATH" ]; then 39 | echo "Could not find package.json at path: $PACKAGE_PATH." 
40 | exit 1 41 | fi 42 | RELEASE_TAG=${TAG#v} 43 | jq --arg newver "$RELEASE_TAG" '.whisperKit.version = $newver' "$PACKAGE_PATH" > tmp.$$.json && mv tmp.$$.json "$PACKAGE_PATH" 44 | cat "$PACKAGE_PATH" 45 | 46 | - name: Commit changes 47 | run: | 48 | git config --global user.email "164233781+argmaxincbot@users.noreply.github.com" 49 | git config --global user.name "argmaxincbot" 50 | git add ./package.json 51 | git commit -m "Update WhisperKit to $TAG" 52 | git push origin $BRANCH_NAME 53 | - name: PR with changes 54 | env: 55 | GH_TOKEN: ${{ secrets.COMMITTER_TOKEN }} 56 | run: | 57 | gh pr create --title "Update WhisperKit to $TAG" --body "Update WhisperKit to $TAG" --base main --head $BRANCH_NAME 58 | -------------------------------------------------------------------------------- /.github/workflows/homebrew-update.yml: -------------------------------------------------------------------------------- 1 | name: Bump Homebrew Formula 2 | 3 | on: 4 | push: 5 | tags: 'v*' 6 | 7 | jobs: 8 | homebrew: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: mislav/bump-homebrew-formula-action@v3 12 | with: 13 | formula-name: whisperkit-cli 14 | env: 15 | COMMITTER_TOKEN: ${{ secrets.COMMITTER_TOKEN }} 16 | -------------------------------------------------------------------------------- /.github/workflows/pre-release-tests.yml: -------------------------------------------------------------------------------- 1 | name: Pre-Release Tests 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | workflow_dispatch: 7 | 8 | jobs: 9 | build-and-test-all-platforms: 10 | name: "Build and Test All Platforms" 11 | strategy: 12 | matrix: 13 | include: 14 | - os: macos-13-xlarge 15 | ios-version: "17.2" # TODO: Download older simulators for macOS 13 16 | ios-device: "iPhone 14" 17 | xcode-version: "15.2" 18 | - os: macos-14 19 | ios-version: "17.2" 20 | ios-device: "iPhone 15" 21 | xcode-version: "15.2" 22 | - os: macos-15 23 | ios-version: "18.2" # Latest available version 24 | ios-device: "iPhone 16" 25 | xcode-version: "latest-stable" 26 | uses: ./.github/workflows/unit-tests.yml 27 | with: 28 | macos-runner: ${{ matrix.os }} 29 | ios-version: ${{ matrix.ios-version }} 30 | ios-device: ${{ matrix.ios-device }} 31 | xcode-version: ${{ matrix.xcode-version }} -------------------------------------------------------------------------------- /.github/workflows/unit-tests.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | ios-version: 7 | required: true 8 | type: string 9 | ios-device: 10 | required: true 11 | type: string 12 | macos-runner: 13 | required: true 14 | type: string 15 | xcode-version: 16 | required: false 17 | type: string 18 | 19 | jobs: 20 | unit-tests: 21 | name: "${{ matrix.run-config['name'] }} on ${{ inputs.macos-runner }}" 22 | runs-on: ${{ inputs.macos-runner }} 23 | strategy: 24 | matrix: 25 | run-config: 26 | - { 27 | name: "macOS", 28 | condition: true, 29 | clean-destination: "generic/platform=macOS", 30 | test-destination: "platform=macOS,arch=arm64", 31 | } 32 | - { 33 | name: "iOS", 34 | condition: true, 35 | clean-destination: "generic/platform=iOS", 36 | test-destination: "platform=iOS Simulator,OS=${{ inputs.ios-version }},name=${{ inputs.ios-device }}", 37 | } 38 | - { 39 | name: "watchOS", 40 | condition: "${{ inputs.macos-runner == 'macos-15' }}", 41 | clean-destination: "generic/platform=watchOS", 42 | test-destination: "platform=watchOS Simulator,name=Apple Watch Ultra 2 (49mm)", 43 | 
} 44 | - { 45 | name: "visionOS", 46 | condition: "${{ inputs.macos-runner == 'macos-15' }}", 47 | clean-destination: "generic/platform=visionOS", 48 | test-destination: "platform=visionOS Simulator,name=Apple Vision Pro", 49 | } 50 | timeout-minutes: 30 51 | steps: 52 | - uses: actions/checkout@v4 53 | - uses: maxim-lobanov/setup-xcode@v1 54 | with: 55 | xcode-version: ${{ inputs.xcode-version || 'latest-stable' }} 56 | - name: Setup environment 57 | run: make setup 58 | - name: Setup Cache 59 | id: model-cache 60 | uses: actions/cache@v4 61 | with: 62 | path: Models 63 | key: ${{ runner.os }}-models 64 | - name: Download Models 65 | if: steps.model-cache.outputs.cache-hit != 'true' 66 | run: make download-model MODEL=tiny 67 | - name: Install and discover destinations 68 | if: ${{ matrix.run-config['condition'] == true }} 69 | run: | 70 | if [[ "${{ matrix.run-config['name'] }}" != "macOS" ]]; then 71 | xcodebuild -downloadPlatform ${{ matrix.run-config['name'] }} 72 | fi 73 | echo "Runtimes for testing:" 74 | xcrun simctl list runtimes 75 | echo "Destinations for testing:" 76 | xcodebuild test-without-building -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -showdestinations 77 | - name: Boot Simulator and Wait 78 | if: ${{ matrix.run-config['condition'] == true }} && ${{ matrix.run-config['name'] != 'macOS' }} && ${{ inputs.macos-runner == 'macos-15' }} 79 | # Slower runners require some time to fully boot the simulator 80 | # Parse the simulator name from the destination string, boot it, and wait 81 | run: | 82 | simulator_name=$(echo '${{ matrix.run-config['test-destination'] }}' | sed -n 's/.*name=\([^,]*\).*/\1/p') 83 | xcrun simctl boot "$simulator_name" || true 84 | sleep 15 85 | xcrun simctl list devices 86 | - name: Build and Test - ${{ matrix.run-config['name'] }} 87 | if: ${{ matrix.run-config['condition'] == true }} 88 | run: | 89 | set -o pipefail 90 | xcodebuild clean build-for-testing -scheme whisperkit-Package -destination '${{ matrix.run-config['clean-destination'] }}' | xcpretty 91 | xcodebuild test -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination '${{ matrix.run-config['test-destination'] }}' 92 | - name: Upload Test Results 93 | if: failure() 94 | uses: actions/upload-artifact@v4 95 | with: 96 | name: test-results-${{ matrix.run-config['name']}}-on-${{ inputs.macos-runner }} 97 | path: | 98 | ~/Library/Developer/Xcode/DerivedData/**/Logs/Test/*.xcresult 99 | retention-days: 5 100 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | /.build 3 | /Packages 4 | .vscode/ 5 | xcuserdata/ 6 | DerivedData/ 7 | .swiftpm/configuration/registries.json 8 | .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata 9 | .swiftpm/xcode/xcshareddata/ 10 | **/*.xcscheme 11 | .netrc 12 | .env 13 | 14 | # Core ML Model Files 15 | Models 16 | **/*.mlpackage 17 | **/*.mlmodel 18 | **/*.mlmodelc 19 | **/*.zip 20 | **/*.tar.gz 21 | 22 | # Audio files (add manually if needed) 23 | **/*.wav 24 | **/*.mp3 25 | **/*.m4a 26 | **/*.flac 27 | 28 | ## Xcode 29 | # Build generated 30 | build/ 31 | DerivedData/ 32 | 33 | # Various settings 34 | *.pbxuser 35 | !default.pbxuser 36 | *.mode1v3 37 | !default.mode1v3 38 | *.mode2v3 39 | !default.mode2v3 40 | *.perspectivev3 41 | !default.perspectivev3 42 | xcuserdata/ 43 | 44 | # Other 45 | *.moved-aside 46 | *.xccheckout 47 | *.xcscmblueprint 48 | 49 | # 
Obj-C/Swift specific 50 | *.hmap 51 | *.ipa 52 | *.dSYM.zip 53 | *.dSYM 54 | 55 | # fastlane 56 | fastlane/report.xml 57 | fastlane/Preview.html 58 | fastlane/screenshots 59 | fastlane/test_output 60 | fastlane/benchmark_data 61 | fastlane/upload_folder 62 | 63 | ### Xcode Patch ### 64 | **/*.xcconfig 65 | *.xcodeproj/* 66 | !*.xcodeproj/project.pbxproj 67 | !*.xcodeproj/xcshareddata/ 68 | !*.xcworkspace/contents.xcworkspacedata 69 | /*.gcno -------------------------------------------------------------------------------- /.spi.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | builder: 3 | configs: 4 | - documentation_targets: [WhisperKit] -------------------------------------------------------------------------------- /.swiftpm/configuration/Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "pins" : [ 3 | { 4 | "identity" : "swift-argument-parser", 5 | "kind" : "remoteSourceControl", 6 | "location" : "https://github.com/apple/swift-argument-parser.git", 7 | "state" : { 8 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41", 9 | "version" : "1.3.0" 10 | } 11 | }, 12 | { 13 | "identity" : "swift-transformers", 14 | "kind" : "remoteSourceControl", 15 | "location" : "https://github.com/huggingface/swift-transformers.git", 16 | "state" : { 17 | "revision" : "74b94211bdc741694ed7e700a1104c72e5ba68fe", 18 | "version" : "0.1.7" 19 | } 20 | } 21 | ], 22 | "version" : 2 23 | } 24 | -------------------------------------------------------------------------------- /BENCHMARKS.md: -------------------------------------------------------------------------------- 1 | # WhisperKit Benchmarks 2 | 3 | This document describes how to run the benchmarks for WhisperKit. The benchmarks can be run on a specific device or all connected devices. The results are saved in JSON files and can be uploaded to the [argmaxinc/whisperkit-evals-dataset](https://huggingface.co/datasets/argmaxinc/whisperkit-evals-dataset) dataset on HuggingFace as a pull request. Below are the steps to run the benchmarks locally to reproduce the results shown in our [WhisperKit Benchmarks](https://huggingface.co/spaces/argmaxinc/whisperkit-benchmarks) space. 4 | 5 | ## Download the Source 6 | 7 | To download the code to run the test suite, run: 8 | 9 | ```sh 10 | git clone git@github.com:argmaxinc/WhisperKit.git 11 | ``` 12 | 13 | ## Local Environment 14 | 15 | Before running the benchmarks, you'll need to set up your local environment with the necessary dependencies. To do this, run: 16 | 17 | ```sh 18 | make setup 19 | ``` 20 | 21 | See [Contributing](CONTRIBUTING.md) for more information. 22 | 23 | 24 | ## Xcode Environment 25 | 26 | When running the tests, the model to test is provided to Xcode by Fastlane as an environment variable: 27 | 28 | 1. Open the example project: 29 | 30 | ```sh 31 | xed Examples/WhisperAX 32 | ``` 33 | 34 | 2. At the top, you will see the app icon and `WhisperAX` written next to it. Click on `WhisperAX` and select `Edit Scheme` at the bottom. 35 | 36 | 3. Under `Environment Variables`, you will see an entry with `MODEL_NAME` as the name and `$(MODEL_NAME)` as the value. 37 | 38 | ## Devices 39 | 40 | > [!IMPORTANT] 41 | > An active developer account is required to run the tests on physical devices. 42 | 43 | Before running tests, all external devices need to be connected and paired to your Mac, as well as registered with your developer account.
Ensure the devices are in Developer Mode. If nothing appears after connecting the devices via cable, press `Command + Shift + 2` to open the list of devices and track their progress. 44 | 45 | ## Datasets 46 | 47 | The datasets for the test suite can be set in a global array called `datasets` in the file [`Tests/WhisperKitTests/RegressionTests.swift`](Tests/WhisperKitTests/RegressionTests.swift). It is prefilled with the datasets that are currently available. 48 | 49 | ## Models 50 | 51 | The models for the test suite can be set in the [`Fastfile`](fastlane/Fastfile). Simply find `BENCHMARK_CONFIGS` and modify the `models` array under the benchmark you want to run. 52 | 53 | ## Makefile and Fastlane 54 | 55 | The tests are run using [Fastlane](fastlane/Fastfile), which is controlled by a [Makefile](Makefile). The Makefile contains the following commands: 56 | 57 | ### List Connected Devices 58 | 59 | Before running the tests, it might be a good idea to list the connected devices to resolve any connection issues. Simply run: 60 | 61 | ```sh 62 | make list-devices 63 | ``` 64 | 65 | The output will be a list with entries that look something like this: 66 | 67 | ```ruby 68 | { 69 | :name=>"My Mac", 70 | :type=>"Apple M2 Pro", 71 | :platform=>"macOS", 72 | :os_version=>"15.0.1", 73 | :product=>"Mac14,12", 74 | :id=>"XXXXXXXX-1234-5678-9012-XXXXXXXXXXXX", 75 | :state=>"connected" 76 | } 77 | ``` 78 | 79 | Verify that the devices are connected and the state is `connected`. 80 | 81 | ### Running Benchmarks 82 | 83 | After completing the above steps, you can run the tests. Note that there are two different test configurations: one named `full` and the other named `debug`. To check for potential errors, run the `debug` tests: 84 | 85 | ```sh 86 | make benchmark-devices DEBUG=true 87 | ``` 88 | 89 | Otherwise, run the `full` tests: 90 | 91 | ```sh 92 | make benchmark-devices 93 | ``` 94 | 95 | Optionally, for both tests, you can specify the list of devices for the tests using the `DEVICES` option: 96 | 97 | ```sh 98 | make benchmark-devices DEVICES="iPhone 15 Pro Max,My Mac" 99 | ``` 100 | 101 | The `DEVICES` option is a comma-separated list of device names. The device names can be found by running `make list-devices` and using the value for the `:name` key. 102 | 103 | ### Results 104 | 105 | After the tests are run, the generated results can be found under `fastlane/benchmark_data`, including the .xcresult file with logs and attachments for each device. There will also be a folder called `fastlane/upload_folder/benchmark_data` that contains only the JSON results from `fastlane/benchmark_data`, which can be used for further analysis. 106 | 107 | We will periodically run these tests on a range of devices and upload the results to the [argmaxinc/whisperkit-evals-dataset](https://huggingface.co/datasets/argmaxinc/whisperkit-evals-dataset), which will propagate to the [WhisperKit Benchmarks](https://huggingface.co/spaces/argmaxinc/whisperkit-benchmarks) space and be available for comparison. 108 | 109 | 110 | # Troubleshooting 111 | 112 | 113 | If you encounter issues while running the tests, here are a few things to try: 114 | 115 | 1. Open the project in Xcode and run the tests directly from there. 116 | 1. To do this, open the example app (from command line type: `xed Examples/WhisperAX`) and run the test named `RegressionTests/testModelPerformanceWithDebugConfig` from the test navigator. 117 | 2. If the tests run successfully, you can rule out any issues with the device or the models. 118 | 3.
If they don't run successfully, Xcode will provide more detailed error messages. 119 | 2. Try specifying a single device to run the tests on. This can be done by running `make list-devices` and then running the tests with the `DEVICES` option set to the name of the device you want to test on. For example, `make benchmark-devices DEVICES="My Mac"`. This will also enable you to see the logs for that specific device. 120 | 3. If you are still encountering issues, please reach out to us on the [Discord](https://discord.gg/G5F5GZGecC) or create an [issue](https://github.com/argmaxinc/WhisperKit/issues) on GitHub. 121 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to WhisperKit 2 | 3 | ## Overview 4 | 5 | We welcome and encourage contributions to WhisperKit! Whether you're fixing bugs, improving documentation, or adding new features from the roadmap, your help is appreciated. This guide will help you get started with contributing to WhisperKit. 6 | 7 | ## Getting Started 8 | 9 | 1. **Fork the Repository**: Start by [forking](https://github.com/argmaxinc/WhisperKit/fork) the WhisperKit repository on GitHub to your personal account. 10 | 11 | 2. **Clone Your Fork**: Clone your fork to your local machine to start making changes. 12 | 13 | ```bash 14 | git clone https://github.com/[your-username]/whisperkit.git 15 | cd whisperkit 16 | ``` 17 | 18 | ## Setting Up Your Development Environment 19 | 20 | 1. **Install Dependencies**: Use the provided `Makefile` to set up your environment. Run `make setup` to install necessary dependencies. 21 | 22 | ```bash 23 | make setup 24 | ``` 25 | 26 | 2. **Download Models**: Run `make download-model` to download the required models to run and test locally. 27 | 28 | ```bash 29 | make download-model MODEL=tiny 30 | ``` 31 | 32 | ## Making Changes 33 | 34 | 1. **Make Your Changes**: Implement your changes, add new features, or fix bugs. Ensure you adhere to the existing coding style. If you're adding new features, make sure to update or add any documentation or tests as needed. 35 | 36 | 2. **Build and Test**: You can use the `Makefile` to build and test your changes. Run `make build` to build WhisperKit and `make test` to run tests. 37 | 38 | ```bash 39 | make build 40 | make test 41 | ``` 42 | 43 | You can also run and test directly from Xcode. We've provided an example app that contains various use cases; just open the `Examples/WhisperAX/WhisperAX.xcodeproj` file in Xcode and run the app. 44 | 45 | ## Submitting Your Changes 46 | 47 | 1. **Commit Your Changes**: Once you're satisfied with your changes, commit them with a clear and concise commit message. 48 | 49 | ```bash 50 | git commit -am "Add a new feature" 51 | ``` 52 | 53 | 2. **Push to Your Fork**: Push your changes to your fork on GitHub. 54 | 55 | ```bash 56 | git push origin my-branch 57 | ``` 58 | 59 | 3. **Create a Pull Request**: Go to the WhisperKit repository on GitHub and create a new pull request from your fork. Ensure your pull request has a clear title and description. 60 | 61 | 4. **Code Review**: Wait for the maintainers to review your pull request. Be responsive to feedback and make any necessary changes. 62 | 63 | ## Guidelines 64 | 65 | - **Code Style**: Follow the existing code style in the project. 66 | - **Commit Messages**: Write meaningful commit messages that clearly describe the changes.
67 | - **Documentation**: Update documentation if you're adding new features or making changes that affect how users interact with WhisperKit. 68 | - **Tests**: Add or update tests for new features or bug fixes. 69 | 70 | ## Final Steps 71 | 72 | After your pull request has been reviewed and approved, a maintainer will merge it into the main branch. Congratulations, you've successfully contributed to WhisperKit! 73 | 74 | Thank you for making WhisperKit better for everyone! ❤️‍🔥 75 | -------------------------------------------------------------------------------- /Examples/WhisperAX/Debug.xcconfig: -------------------------------------------------------------------------------- 1 | // Run `make setup` to add your team here 2 | DEVELOPMENT_TEAM= 3 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/contents.xcworkspacedata: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | IDEDidComputeMac32BitWarning 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "originHash" : "831ad63194a5262b2549d58e383a520f9cbbc80b4a75660fbbcc56d65edfdab4", 3 | "pins" : [ 4 | { 5 | "identity" : "swift-argument-parser", 6 | "kind" : "remoteSourceControl", 7 | "location" : "https://github.com/apple/swift-argument-parser.git", 8 | "state" : { 9 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41", 10 | "version" : "1.3.0" 11 | } 12 | }, 13 | { 14 | "identity" : "swift-transformers", 15 | "kind" : "remoteSourceControl", 16 | "location" : "https://github.com/huggingface/swift-transformers.git", 17 | "state" : { 18 | "revision" : "fc6543263e4caed9bf6107466d625cfae9357f08", 19 | "version" : "0.1.8" 20 | } 21 | } 22 | ], 23 | "version" : 3 24 | } 25 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX.xcodeproj/xcshareddata/xcschemes/WhisperAX.xcscheme: -------------------------------------------------------------------------------- 1 | 2 | 5 | 8 | 9 | 15 | 21 | 22 | 23 | 24 | 25 | 31 | 32 | 35 | 41 | 42 | 43 | 46 | 52 | 53 | 54 | 55 | 56 | 66 | 68 | 74 | 75 | 76 | 77 | 81 | 82 | 86 | 87 | 88 | 89 | 95 | 97 | 103 | 104 | 105 | 106 | 108 | 109 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Info.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | NSPrivacyAccessedAPITypes 6 | 7 | NSPrivacyAccessedAPIType 8 | NSPrivacyAccessedAPICategoryUserDefaults 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Preview Content/Preview Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- 
/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/100.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/102.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/102.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024 2.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/1024.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/108.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/108.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/114.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/114.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/120.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/128.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/136.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/136.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/152.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/16.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/167.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/167.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/172.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/172.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/180.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/180.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/192.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/196.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/196.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/216.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/216.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/234.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/234.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/256.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/258.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/258.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/32.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/40.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/40.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/44.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/48.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/512.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/55.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/58.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/60.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/64.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/66.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/66.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/76.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/80.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87 1.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/87.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/88.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/88.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/92.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/92.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/AppIcon.appiconset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "images" : [ 3 | { 4 | "filename" : "40.png", 5 | "idiom" : "universal", 6 | "platform" : "ios", 7 | "scale" : "2x", 8 | "size" : "20x20" 9 | }, 10 | { 11 | "filename" : "60.png", 12 | "idiom" : "universal", 13 | "platform" : "ios", 14 | "scale" : "3x", 15 | "size" : "20x20" 16 | }, 17 | { 18 | "filename" : "58 1.png", 19 | "idiom" : "universal", 20 | "platform" : "ios", 21 | "scale" : "2x", 22 | "size" : "29x29" 23 | }, 24 | { 25 | "filename" : "87 1.png", 26 | "idiom" : "universal", 27 | "platform" : "ios", 28 | "scale" : "3x", 29 | "size" : "29x29" 30 | }, 31 | { 32 | "filename" : "76.png", 33 | "idiom" : "universal", 34 | "platform" : "ios", 35 | "scale" : "2x", 36 | "size" : "38x38" 37 | }, 38 | { 39 | "filename" : "114.png", 40 | "idiom" : "universal", 41 | "platform" : "ios", 42 | "scale" : "3x", 43 | "size" : "38x38" 44 | }, 45 | { 46 | "filename" : "80 1.png", 47 | "idiom" : "universal", 48 | "platform" : "ios", 49 | "scale" : "2x", 50 | "size" : "40x40" 51 | }, 52 | { 53 | "filename" : "120.png", 54 | "idiom" : "universal", 55 | "platform" : "ios", 56 | "scale" : "3x", 57 | "size" : "40x40" 58 | }, 59 | { 60 | "filename" : "120 1.png", 61 | "idiom" : "universal", 62 | "platform" : "ios", 63 | "scale" : "2x", 64 | "size" : "60x60" 65 | }, 66 | { 67 | "filename" : "180.png", 68 | "idiom" : "universal", 69 | "platform" : "ios", 70 | "scale" : "3x", 71 | "size" : "60x60" 72 | }, 73 | { 74 | "filename" : "128 1.png", 75 | "idiom" : "universal", 76 | "platform" : "ios", 77 | "scale" : "2x", 78 | "size" : "64x64" 79 | }, 80 | { 81 | "filename" : "192.png", 82 | "idiom" : "universal", 83 | "platform" : "ios", 84 | "scale" : "3x", 85 | "size" : "64x64" 86 | }, 87 | { 88 | "filename" : "136.png", 89 | "idiom" : "universal", 90 | "platform" : "ios", 91 | "scale" : "2x", 92 | "size" : "68x68" 93 | }, 94 | { 95 | "filename" : "152.png", 96 | "idiom" : "universal", 97 | "platform" : "ios", 98 | "scale" : "2x", 99 | "size" : "76x76" 100 | }, 101 | { 102 | "filename" : "167.png", 103 | "idiom" : "universal", 104 | "platform" : "ios", 105 | "scale" : "2x", 106 | "size" : "83.5x83.5" 107 | }, 108 | { 109 | "filename" : "1024 1.png", 110 | "idiom" : "universal", 111 | "platform" : "ios", 112 | "size" : "1024x1024" 113 | }, 114 | { 115 | 
"filename" : "16.png", 116 | "idiom" : "mac", 117 | "scale" : "1x", 118 | "size" : "16x16" 119 | }, 120 | { 121 | "filename" : "32.png", 122 | "idiom" : "mac", 123 | "scale" : "2x", 124 | "size" : "16x16" 125 | }, 126 | { 127 | "filename" : "32.png", 128 | "idiom" : "mac", 129 | "scale" : "1x", 130 | "size" : "32x32" 131 | }, 132 | { 133 | "filename" : "64.png", 134 | "idiom" : "mac", 135 | "scale" : "2x", 136 | "size" : "32x32" 137 | }, 138 | { 139 | "filename" : "128.png", 140 | "idiom" : "mac", 141 | "scale" : "1x", 142 | "size" : "128x128" 143 | }, 144 | { 145 | "filename" : "256.png", 146 | "idiom" : "mac", 147 | "scale" : "2x", 148 | "size" : "128x128" 149 | }, 150 | { 151 | "filename" : "256.png", 152 | "idiom" : "mac", 153 | "scale" : "1x", 154 | "size" : "256x256" 155 | }, 156 | { 157 | "filename" : "512.png", 158 | "idiom" : "mac", 159 | "scale" : "2x", 160 | "size" : "256x256" 161 | }, 162 | { 163 | "filename" : "512.png", 164 | "idiom" : "mac", 165 | "scale" : "1x", 166 | "size" : "512x512" 167 | }, 168 | { 169 | "filename" : "1024.png", 170 | "idiom" : "mac", 171 | "scale" : "2x", 172 | "size" : "512x512" 173 | }, 174 | { 175 | "filename" : "44.png", 176 | "idiom" : "universal", 177 | "platform" : "watchos", 178 | "scale" : "2x", 179 | "size" : "22x22" 180 | }, 181 | { 182 | "filename" : "48.png", 183 | "idiom" : "universal", 184 | "platform" : "watchos", 185 | "scale" : "2x", 186 | "size" : "24x24" 187 | }, 188 | { 189 | "filename" : "55.png", 190 | "idiom" : "universal", 191 | "platform" : "watchos", 192 | "scale" : "2x", 193 | "size" : "27.5x27.5" 194 | }, 195 | { 196 | "filename" : "58.png", 197 | "idiom" : "universal", 198 | "platform" : "watchos", 199 | "scale" : "2x", 200 | "size" : "29x29" 201 | }, 202 | { 203 | "filename" : "60 1.png", 204 | "idiom" : "universal", 205 | "platform" : "watchos", 206 | "scale" : "2x", 207 | "size" : "30x30" 208 | }, 209 | { 210 | "filename" : "64 1.png", 211 | "idiom" : "universal", 212 | "platform" : "watchos", 213 | "scale" : "2x", 214 | "size" : "32x32" 215 | }, 216 | { 217 | "filename" : "66.png", 218 | "idiom" : "universal", 219 | "platform" : "watchos", 220 | "scale" : "2x", 221 | "size" : "33x33" 222 | }, 223 | { 224 | "filename" : "80.png", 225 | "idiom" : "universal", 226 | "platform" : "watchos", 227 | "scale" : "2x", 228 | "size" : "40x40" 229 | }, 230 | { 231 | "filename" : "87.png", 232 | "idiom" : "universal", 233 | "platform" : "watchos", 234 | "scale" : "2x", 235 | "size" : "43.5x43.5" 236 | }, 237 | { 238 | "filename" : "88.png", 239 | "idiom" : "universal", 240 | "platform" : "watchos", 241 | "scale" : "2x", 242 | "size" : "44x44" 243 | }, 244 | { 245 | "filename" : "92.png", 246 | "idiom" : "universal", 247 | "platform" : "watchos", 248 | "scale" : "2x", 249 | "size" : "46x46" 250 | }, 251 | { 252 | "filename" : "100.png", 253 | "idiom" : "universal", 254 | "platform" : "watchos", 255 | "scale" : "2x", 256 | "size" : "50x50" 257 | }, 258 | { 259 | "filename" : "102.png", 260 | "idiom" : "universal", 261 | "platform" : "watchos", 262 | "scale" : "2x", 263 | "size" : "51x51" 264 | }, 265 | { 266 | "filename" : "108.png", 267 | "idiom" : "universal", 268 | "platform" : "watchos", 269 | "scale" : "2x", 270 | "size" : "54x54" 271 | }, 272 | { 273 | "filename" : "172.png", 274 | "idiom" : "universal", 275 | "platform" : "watchos", 276 | "scale" : "2x", 277 | "size" : "86x86" 278 | }, 279 | { 280 | "filename" : "196.png", 281 | "idiom" : "universal", 282 | "platform" : "watchos", 283 | "scale" : "2x", 284 | "size" : "98x98" 
285 | }, 286 | { 287 | "filename" : "216.png", 288 | "idiom" : "universal", 289 | "platform" : "watchos", 290 | "scale" : "2x", 291 | "size" : "108x108" 292 | }, 293 | { 294 | "filename" : "234.png", 295 | "idiom" : "universal", 296 | "platform" : "watchos", 297 | "scale" : "2x", 298 | "size" : "117x117" 299 | }, 300 | { 301 | "filename" : "258.png", 302 | "idiom" : "universal", 303 | "platform" : "watchos", 304 | "scale" : "2x", 305 | "size" : "129x129" 306 | }, 307 | { 308 | "filename" : "1024 2.png", 309 | "idiom" : "universal", 310 | "platform" : "watchos", 311 | "size" : "1024x1024" 312 | } 313 | ], 314 | "info" : { 315 | "author" : "xcode", 316 | "version" : 1 317 | } 318 | } 319 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/Info.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/Resources/WhisperAX.entitlements: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | com.apple.developer.kernel.increased-memory-limit 6 | 7 | com.apple.security.app-sandbox 8 | 9 | com.apple.security.device.audio-input 10 | 11 | com.apple.security.files.downloads.read-only 12 | 13 | com.apple.security.files.user-selected.read-write 14 | 15 | com.apple.security.network.client 16 | 17 | com.apple.security.network.server 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAX/WhisperAXApp.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import SwiftUI 5 | 6 | @main 7 | struct WhisperAXApp: App { 8 | var body: some Scene { 9 | WindowGroup { 10 | ContentView() 11 | #if os(macOS) 12 | .frame(minWidth: 1000, minHeight: 700) 13 | #endif 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXTests/WhisperAXTests.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import XCTest 5 | 6 | final class WhisperAXTests: XCTestCase { 7 | override func setUpWithError() throws { 8 | // Put setup code here. This method is called before the invocation of each test method in the class. 9 | } 10 | 11 | override func tearDownWithError() throws { 12 | // Put teardown code here. This method is called after the invocation of each test method in the class. 13 | } 14 | 15 | func testExample() throws { 16 | // This is an example of a functional test case. 17 | // Use XCTAssert and related functions to verify your tests produce the correct results. 18 | // Any test you write for XCTest can be annotated as throws and async. 19 | // Mark your test throws to produce an unexpected failure when your test encounters an uncaught error. 
20 | // Mark your test async to allow awaiting for asynchronous code to complete. Check the results with assertions afterwards. 21 | } 22 | 23 | func testPerformanceExample() throws { 24 | // This is an example of a performance test case. 25 | measure { 26 | // Put the code you want to measure the time of here. 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXTests/WhisperKitTests: -------------------------------------------------------------------------------- 1 | ../../../Tests/WhisperKitTests -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXUITests/WhisperAXUITests.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import XCTest 5 | 6 | final class WhisperAXUITests: XCTestCase { 7 | override func setUpWithError() throws { 8 | // Put setup code here. This method is called before the invocation of each test method in the class. 9 | 10 | // In UI tests it is usually best to stop immediately when a failure occurs. 11 | continueAfterFailure = false 12 | 13 | // In UI tests it’s important to set the initial state - such as interface orientation - required for your tests before they run. The setUp method is a good place to do this. 14 | } 15 | 16 | override func tearDownWithError() throws { 17 | // Put teardown code here. This method is called after the invocation of each test method in the class. 18 | } 19 | 20 | func testExample() throws { 21 | // UI tests must launch the application that they test. 22 | let app = XCUIApplication() 23 | app.launch() 24 | 25 | // Use XCTAssert and related functions to verify your tests produce the correct results. 26 | } 27 | 28 | func testLaunchPerformance() throws { 29 | if #available(macOS 10.15, iOS 13.0, tvOS 13.0, watchOS 7.0, *) { 30 | // This measures how long it takes to launch your application. 31 | measure(metrics: [XCTApplicationLaunchMetric()]) { 32 | XCUIApplication().launch() 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXUITests/WhisperAXUITestsLaunchTests.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 
3 | 4 | import XCTest 5 | 6 | final class WhisperAXUITestsLaunchTests: XCTestCase { 7 | override class var runsForEachTargetApplicationUIConfiguration: Bool { 8 | true 9 | } 10 | 11 | override func setUpWithError() throws { 12 | continueAfterFailure = false 13 | } 14 | 15 | func testLaunch() throws { 16 | let app = XCUIApplication() 17 | app.launch() 18 | 19 | // Insert steps here to perform after app launch but before taking a screenshot, 20 | // such as logging into a test account or navigating somewhere in the app 21 | 22 | let attachment = XCTAttachment(screenshot: app.screenshot()) 23 | attachment.name = "Launch Screen" 24 | attachment.lifetime = .keepAlways 25 | add(attachment) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AccentColor.colorset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "colors" : [ 3 | { 4 | "idiom" : "universal" 5 | } 6 | ], 7 | "info" : { 8 | "author" : "xcode", 9 | "version" : 1 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AppIcon.appiconset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "images" : [ 3 | { 4 | "filename" : "appstore.png", 5 | "idiom" : "universal", 6 | "platform" : "watchos", 7 | "size" : "1024x1024" 8 | } 9 | ], 10 | "info" : { 11 | "author" : "xcode", 12 | "version" : 1 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AppIcon.appiconset/appstore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/AppIcon.appiconset/appstore.png -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXWatchApp/Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXWatchApp/Preview Content/Preview Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXWatchApp/WhisperAXWatchApp.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import SwiftUI 5 | 6 | @main 7 | struct WhisperAXWatchApp: App { 8 | var body: some Scene { 9 | WindowGroup { 10 | WhisperAXWatchView() 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXWatchAppTests/WhisperAX_Watch_AppTests.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 
3 | 4 | @testable import WhisperAX_Watch_App 5 | import XCTest 6 | 7 | final class WhisperAX_Watch_AppTests: XCTestCase { 8 | override func setUpWithError() throws { 9 | // Put setup code here. This method is called before the invocation of each test method in the class. 10 | } 11 | 12 | override func tearDownWithError() throws { 13 | // Put teardown code here. This method is called after the invocation of each test method in the class. 14 | } 15 | 16 | func testExample() throws { 17 | // This is an example of a functional test case. 18 | // Use XCTAssert and related functions to verify your tests produce the correct results. 19 | // Any test you write for XCTest can be annotated as throws and async. 20 | // Mark your test throws to produce an unexpected failure when your test encounters an uncaught error. 21 | // Tests marked async will run the test method on an arbitrary thread managed by the Swift runtime. 22 | } 23 | 24 | func testPerformanceExample() throws { 25 | // This is an example of a performance test case. 26 | self.measure { 27 | // Put the code you want to measure the time of here. 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXWatchAppUITests/WhisperAX_Watch_AppUITests.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import XCTest 5 | 6 | final class WhisperAX_Watch_AppUITests: XCTestCase { 7 | override func setUpWithError() throws { 8 | // Put setup code here. This method is called before the invocation of each test method in the class. 9 | 10 | // In UI tests it is usually best to stop immediately when a failure occurs. 11 | continueAfterFailure = false 12 | 13 | // In UI tests it’s important to set the initial state - such as interface orientation - required for your tests before they run. The setUp method is a good place to do this. 14 | } 15 | 16 | override func tearDownWithError() throws { 17 | // Put teardown code here. This method is called after the invocation of each test method in the class. 18 | } 19 | 20 | func testExample() throws { 21 | // UI tests must launch the application that they test. 22 | let app = XCUIApplication() 23 | app.launch() 24 | 25 | // Use XCTAssert and related functions to verify your tests produce the correct results. 26 | } 27 | 28 | func testLaunchPerformance() throws { 29 | if #available(macOS 10.15, iOS 13.0, tvOS 13.0, watchOS 7.0, *) { 30 | // This measures how long it takes to launch your application. 31 | measure(metrics: [XCTApplicationLaunchMetric()]) { 32 | XCUIApplication().launch() 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /Examples/WhisperAX/WhisperAXWatchAppUITests/WhisperAX_Watch_AppUITestsLaunchTests.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 
3 | 4 | import XCTest 5 | 6 | final class WhisperAX_Watch_AppUITestsLaunchTests: XCTestCase { 7 | override class var runsForEachTargetApplicationUIConfiguration: Bool { 8 | true 9 | } 10 | 11 | override func setUpWithError() throws { 12 | continueAfterFailure = false 13 | } 14 | 15 | func testLaunch() throws { 16 | let app = XCUIApplication() 17 | app.launch() 18 | 19 | // Insert steps here to perform after app launch but before taking a screenshot, 20 | // such as logging into a test account or navigating somewhere in the app 21 | 22 | let attachment = XCTAttachment(screenshot: app.screenshot()) 23 | attachment.name = "Launch Screen" 24 | attachment.lifetime = .keepAlways 25 | add(attachment) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 argmax, inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: setup setup-huggingface-cli setup-model-repo download-models download-model build build-cli test clean-package-caches list-devices benchmark-connected-devices benchmark-device benchmark-devices extract-xcresult 2 | 3 | PIP_COMMAND := pip3 4 | PYTHON_COMMAND := python3 5 | 6 | # Define model repository and directories 7 | MODEL_REPO := argmaxinc/whisperkit-coreml 8 | MODEL_REPO_DIR := ./Models/whisperkit-coreml 9 | BASE_COMPILED_DIR := ./Models 10 | 11 | GIT_HASH := $(shell git rev-parse --short HEAD) 12 | 13 | setup: 14 | @echo "Setting up environment..." 15 | @which $(PIP_COMMAND) 16 | @which $(PYTHON_COMMAND) 17 | @echo "Checking for Homebrew..." 18 | @which brew > /dev/null || (echo "Error: Homebrew is not installed. Install it from https://brew.sh and try again" && exit 1) 19 | @echo "Homebrew is installed." 20 | @echo "Checking for huggingface-cli..." 21 | @which huggingface-cli > /dev/null || (echo "Installing huggingface-cli..." && brew install huggingface-cli) 22 | @echo "huggingface-cli is installed." 23 | @echo "Checking for git-lfs..." 24 | @which git-lfs > /dev/null || (echo "Installing git-lfs..." && brew install git-lfs) 25 | @echo "git-lfs is installed." 26 | @echo "Checking for trash..." 27 | @which trash > /dev/null || (echo "Installing trash..." 
&& brew install trash) 28 | @echo "trash is installed." 29 | @echo "Checking for fastlane" 30 | @which fastlane > /dev/null || (echo "Installing fastlane..." && brew install fastlane) 31 | @echo "fastlane is installed." 32 | @$(MAKE) generate-whisperax-xcconfig 33 | @echo "Done 🚀" 34 | 35 | 36 | generate-whisperax-xcconfig: 37 | @echo "Updating DEVELOPMENT_TEAM in Examples/WhisperAX/Debug.xcconfig..." 38 | @TEAM_ID=$$(defaults read com.apple.dt.Xcode IDEProvisioningTeams | plutil -convert json -r -o - -- - | jq -r 'to_entries[0].value | sort_by(.teamType == "Individual") | .[0].teamID' 2>/dev/null); \ 39 | if [ -z "$$TEAM_ID" ]; then \ 40 | echo "Error: No Development Team ID found. Please log into Xcode with your Apple ID and select a team."; \ 41 | else \ 42 | echo "DEVELOPMENT_TEAM=$$TEAM_ID" > Examples/WhisperAX/Debug.xcconfig; \ 43 | echo "DEVELOPMENT_TEAM has been updated in Examples/WhisperAX/Debug.xcconfig with your Development Team ID: $$TEAM_ID"; \ 44 | fi 45 | 46 | 47 | setup-huggingface-cli: 48 | @if huggingface-cli whoami; then \ 49 | echo "Already logged in to Hugging Face."; \ 50 | else \ 51 | echo "Not logged in to Hugging Face."; \ 52 | if [ -z "$$HF_TOKEN" ]; then \ 53 | echo "Environment variable HF_TOKEN is not set. Running normal login."; \ 54 | huggingface-cli login; \ 55 | else \ 56 | echo "Using HF_TOKEN from environment variable."; \ 57 | huggingface-cli login --token $$HF_TOKEN; \ 58 | fi; \ 59 | fi 60 | 61 | 62 | setup-model-repo: 63 | @echo "Setting up repository..." 64 | @mkdir -p $(BASE_COMPILED_DIR) 65 | @if [ -d "$(MODEL_REPO_DIR)/.git" ]; then \ 66 | echo "Repository exists, resetting..."; \ 67 | export GIT_LFS_SKIP_SMUDGE=1; \ 68 | cd $(MODEL_REPO_DIR) && git fetch --all && git reset --hard origin/main && git clean -fdx; \ 69 | else \ 70 | echo "Repository not found, initializing..."; \ 71 | export GIT_LFS_SKIP_SMUDGE=1; \ 72 | git clone https://huggingface.co/$(MODEL_REPO) $(MODEL_REPO_DIR); \ 73 | fi 74 | 75 | 76 | # Download all models 77 | download-models: setup-model-repo 78 | @echo "Downloading all models..." 79 | @cd $(MODEL_REPO_DIR) && \ 80 | git lfs pull 81 | 82 | 83 | # Download a specific model 84 | download-model: 85 | @if [ -z "$(MODEL)" ]; then \ 86 | echo "Error: MODEL is not set. Usage: make download-model MODEL=base"; \ 87 | exit 1; \ 88 | fi 89 | @echo "Downloading model $(MODEL)..." 90 | @$(MAKE) setup-model-repo 91 | @echo "Fetching model $(MODEL)..." 92 | @cd $(MODEL_REPO_DIR) && \ 93 | git lfs pull --include="openai_whisper-$(MODEL)/*" 94 | 95 | build: 96 | @echo "Building WhisperKit..." 97 | @swift build -v 98 | 99 | 100 | build-cli: 101 | @echo "Building WhisperKit CLI..." 102 | @swift build -c release --product whisperkit-cli 103 | 104 | 105 | test: 106 | @echo "Running tests..." 
107 | @swift test -v 108 | 109 | 110 | list-devices: 111 | fastlane ios list_devices 112 | 113 | 114 | # Usage: 115 | # make benchmark-devices # Benchmark all connected devices 116 | # make benchmark-devices DEBUG=true # Benchmark all connected devices with small test matrix 117 | # make benchmark-devices DEVICES="iPhone 15 Pro Max,My Mac" # Benchmark specific device names from `make list-devices` 118 | DEVICES ?= 119 | DEBUG ?= false 120 | benchmark-devices: generate-whisperax-xcconfig 121 | @if [ -n "$(DEVICES)" ]; then \ 122 | echo "Benchmarking specific devices: $(DEVICES)"; \ 123 | fastlane benchmark devices:"$(DEVICES)" debug:$(DEBUG); \ 124 | else \ 125 | echo "Benchmarking all connected devices"; \ 126 | fastlane benchmark debug:$(DEBUG); \ 127 | fi 128 | 129 | upload-benchmark-results: 130 | @echo "Uploading benchmark results..." 131 | @fastlane upload_results 132 | 133 | clean-package-caches: 134 | @trash ~/Library/Developer/Xcode/DerivedData/WhisperKit* || true 135 | @swift package purge-cache 136 | @swift package reset -------------------------------------------------------------------------------- /Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "pins" : [ 3 | { 4 | "identity" : "swift-argument-parser", 5 | "kind" : "remoteSourceControl", 6 | "location" : "https://github.com/apple/swift-argument-parser.git", 7 | "state" : { 8 | "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41", 9 | "version" : "1.3.0" 10 | } 11 | }, 12 | { 13 | "identity" : "swift-transformers", 14 | "kind" : "remoteSourceControl", 15 | "location" : "https://github.com/huggingface/swift-transformers.git", 16 | "state" : { 17 | "revision" : "fc6543263e4caed9bf6107466d625cfae9357f08", 18 | "version" : "0.1.8" 19 | } 20 | } 21 | ], 22 | "version" : 2 23 | } 24 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 5.9 2 | // The swift-tools-version declares the minimum version of Swift required to build this package. 
3 | 4 | import PackageDescription 5 | 6 | let package = Package( 7 | name: "whisperkit", 8 | platforms: [ 9 | .iOS(.v16), 10 | .macOS(.v13), 11 | ], 12 | products: [ 13 | .library( 14 | name: "WhisperKit", 15 | targets: ["WhisperKit"] 16 | ), 17 | .executable( 18 | name: "whisperkit-cli", 19 | targets: ["WhisperKitCLI"] 20 | ), 21 | ], 22 | dependencies: [ 23 | .package(url: "https://github.com/huggingface/swift-transformers.git", .upToNextMinor(from: "0.1.8")), 24 | .package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.3.0"), 25 | ], 26 | targets: [ 27 | .target( 28 | name: "WhisperKit", 29 | dependencies: [ 30 | .product(name: "Transformers", package: "swift-transformers"), 31 | ] 32 | ), 33 | .executableTarget( 34 | name: "WhisperKitCLI", 35 | dependencies: [ 36 | "WhisperKit", 37 | .product(name: "ArgumentParser", package: "swift-argument-parser"), 38 | ] 39 | ), 40 | .testTarget( 41 | name: "WhisperKitTests", 42 | dependencies: [ 43 | "WhisperKit", 44 | .product(name: "Transformers", package: "swift-transformers"), 45 | ], 46 | path: ".", 47 | exclude: [ 48 | "Examples", 49 | "Sources", 50 | "Makefile", 51 | "README.md", 52 | "LICENSE", 53 | "CONTRIBUTING.md", 54 | ], 55 | resources: [ 56 | .process("Tests/WhisperKitTests/Resources"), 57 | .copy("Models/whisperkit-coreml"), 58 | ] 59 | ), 60 | ] 61 | ) 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | 5 | WhisperKit 6 | 7 | 8 | 9 | WhisperKit 10 | 11 | 12 | # WhisperKit 13 | 14 | [![Tests](https://github.com/argmaxinc/whisperkit/actions/workflows/unit-tests.yml/badge.svg)](https://github.com/argmaxinc/whisperkit/actions/workflows/pre-release-tests.yml) 15 | [![License](https://img.shields.io/github/license/argmaxinc/whisperkit?logo=github&logoColor=969da4&label=License&labelColor=353a41&color=32d058)](LICENSE.md) 16 | [![Supported Swift Version](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fargmaxinc%2FWhisperKit%2Fbadge%3Ftype%3Dswift-versions&labelColor=353a41&color=32d058)](https://swiftpackageindex.com/argmaxinc/WhisperKit) [![Supported Platforms](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fargmaxinc%2FWhisperKit%2Fbadge%3Ftype%3Dplatforms&labelColor=353a41&color=32d058)](https://swiftpackageindex.com/argmaxinc/WhisperKit) 17 | [![Discord](https://img.shields.io/discord/1171912382512115722?style=flat&logo=discord&logoColor=969da4&label=Discord&labelColor=353a41&color=32d058&link=https%3A%2F%2Fdiscord.gg%2FG5F5GZGecC)](https://discord.gg/G5F5GZGecC) 18 | 19 | 20 |
21 | 22 | WhisperKit is an [Argmax](https://www.takeargmax.com) framework for deploying state-of-the-art speech-to-text systems (e.g. [Whisper](https://github.com/openai/whisper)) on device with advanced features such as real-time streaming, word timestamps, voice activity detection, and more. 23 | 24 | [[TestFlight Demo App]](https://testflight.apple.com/join/LPVOyJZW) [[Python Tools]](https://github.com/argmaxinc/whisperkittools) [[Benchmarks & Device Support]](https://huggingface.co/spaces/argmaxinc/whisperkit-benchmarks) [[WhisperKit Android]](https://github.com/argmaxinc/WhisperKitAndroid) 25 | 26 | > [!IMPORTANT] 27 | > If you are looking for more features such as speaker diarization and upgraded performance, check out [WhisperKit Pro](https://huggingface.co/argmaxinc/whisperkit-pro) and [SpeakerKit Pro](https://huggingface.co/argmaxinc/speakerkit-pro)! For commercial use or evaluation, please reach out to [whisperkitpro@argmaxinc.com](mailto:whisperkitpro@argmaxinc.com). 28 | 29 | ## Table of Contents 30 | 31 | - [Installation](#installation) 32 | - [Swift Package Manager](#swift-package-manager) 33 | - [Prerequisites](#prerequisites) 34 | - [Xcode Steps](#xcode-steps) 35 | - [Package.swift](#packageswift) 36 | - [Homebrew](#homebrew) 37 | - [Getting Started](#getting-started) 38 | - [Quick Example](#quick-example) 39 | - [Model Selection](#model-selection) 40 | - [Generating Models](#generating-models) 41 | - [Swift CLI](#swift-cli) 42 | - [Contributing \& Roadmap](#contributing--roadmap) 43 | - [License](#license) 44 | - [Citation](#citation) 45 | 46 | ## Installation 47 | 48 | ### Swift Package Manager 49 | 50 | WhisperKit can be integrated into your Swift project using the Swift Package Manager. 51 | 52 | ### Prerequisites 53 | 54 | - macOS 14.0 or later. 55 | - Xcode 15.0 or later. 56 | 57 | ### Xcode Steps 58 | 59 | 1. Open your Swift project in Xcode. 60 | 2. Navigate to `File` > `Add Package Dependencies...`. 61 | 3. Enter the package repository URL: `https://github.com/argmaxinc/whisperkit`. 62 | 4. Choose the version range or specific version. 63 | 5. Click `Finish` to add WhisperKit to your project. 64 | 65 | ### Package.swift 66 | 67 | If you're using WhisperKit as part of a swift package, you can include it in your Package.swift dependencies as follows: 68 | 69 | ```swift 70 | dependencies: [ 71 | .package(url: "https://github.com/argmaxinc/WhisperKit.git", from: "0.9.0"), 72 | ], 73 | ``` 74 | 75 | Then add `WhisperKit` as a dependency for your target: 76 | 77 | ```swift 78 | .target( 79 | name: "YourApp", 80 | dependencies: ["WhisperKit"] 81 | ), 82 | ``` 83 | 84 | ### Homebrew 85 | 86 | You can install `WhisperKit` command line app using [Homebrew](https://brew.sh) by running the following command: 87 | 88 | ```bash 89 | brew install whisperkit-cli 90 | ``` 91 | 92 | ## Getting Started 93 | 94 | To get started with WhisperKit, you need to initialize it in your project. 95 | 96 | ### Quick Example 97 | 98 | This example demonstrates how to transcribe a local audio file: 99 | 100 | ```swift 101 | import WhisperKit 102 | 103 | // Initialize WhisperKit with default settings 104 | Task { 105 | let pipe = try? await WhisperKit() 106 | let transcription = try? await pipe!.transcribe(audioPath: "path/to/your/audio.{wav,mp3,m4a,flac}")?.text 107 | print(transcription) 108 | } 109 | ``` 110 | 111 | ### Model Selection 112 | 113 | WhisperKit automatically downloads the recommended model for the device if not specified. 
You can also select a specific model by passing in the model name: 114 | 115 | ```swift 116 | let pipe = try? await WhisperKit(WhisperKitConfig(model: "large-v3")) 117 | ``` 118 | 119 | This method also supports glob search, so you can use wildcards to select a model: 120 | 121 | ```swift 122 | let pipe = try? await WhisperKit(WhisperKitConfig(model: "distil*large-v3")) 123 | ``` 124 | 125 | Note that the model search must return a single model from the source repo, otherwise an error will be thrown. 126 | 127 | For a list of available models, see our [HuggingFace repo](https://huggingface.co/argmaxinc/whisperkit-coreml). 128 | 129 | ### Generating Models 130 | 131 | WhisperKit also comes with the supporting repo [`whisperkittools`](https://github.com/argmaxinc/whisperkittools) which lets you create and deploy your own fine tuned versions of Whisper in CoreML format to HuggingFace. Once generated, they can be loaded by simply changing the repo name to the one used to upload the model: 132 | 133 | ```swift 134 | let config = WhisperKitConfig(model: "large-v3", modelRepo: "username/your-model-repo") 135 | let pipe = try? await WhisperKit(config) 136 | ``` 137 | 138 | ### Swift CLI 139 | 140 | The Swift CLI allows for quick testing and debugging outside of an Xcode project. To install it, run the following: 141 | 142 | ```bash 143 | git clone https://github.com/argmaxinc/whisperkit.git 144 | cd whisperkit 145 | ``` 146 | 147 | Then, setup the environment and download your desired model. 148 | 149 | ```bash 150 | make setup 151 | make download-model MODEL=large-v3 152 | ``` 153 | 154 | **Note**: 155 | 156 | 1. This will download only the model specified by `MODEL` (see what's available in our [HuggingFace repo](https://huggingface.co/argmaxinc/whisperkit-coreml), where we use the prefix `openai_whisper-{MODEL}`) 157 | 2. Before running `download-model`, make sure [git-lfs](https://git-lfs.com) is installed 158 | 159 | If you would like download all available models to your local folder, use this command instead: 160 | 161 | ```bash 162 | make download-models 163 | ``` 164 | 165 | You can then run them via the CLI with: 166 | 167 | ```bash 168 | swift run whisperkit-cli transcribe --model-path "Models/whisperkit-coreml/openai_whisper-large-v3" --audio-path "path/to/your/audio.{wav,mp3,m4a,flac}" 169 | ``` 170 | 171 | Which should print a transcription of the audio file. If you would like to stream the audio directly from a microphone, use: 172 | 173 | ```bash 174 | swift run whisperkit-cli transcribe --model-path "Models/whisperkit-coreml/openai_whisper-large-v3" --stream 175 | ``` 176 | 177 | ## Contributing & Roadmap 178 | 179 | Our goal is to make WhisperKit better and better over time and we'd love your help! Just search the code for "TODO" for a variety of features that are yet to be built. Please refer to our [contribution guidelines](CONTRIBUTING.md) for submitting issues, pull requests, and coding standards, where we also have a public roadmap of features we are looking forward to building in the future. 180 | 181 | ## License 182 | 183 | WhisperKit is released under the MIT License. See [LICENSE](LICENSE) for more details. 184 | 185 | ## Citation 186 | 187 | If you use WhisperKit for something cool or just find it useful, please drop us a note at [info@argmaxinc.com](mailto:info@argmaxinc.com)! 
188 | 189 | If you use WhisperKit for academic work, here is the BibTeX: 190 | 191 | ```bibtex 192 | @misc{whisperkit-argmax, 193 | title = {WhisperKit}, 194 | author = {Argmax, Inc.}, 195 | year = {2024}, 196 | URL = {https://github.com/argmaxinc/WhisperKit} 197 | } 198 | ``` 199 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/Audio/AudioChunker.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Accelerate 5 | import AVFoundation 6 | import Foundation 7 | 8 | /// Responsible for chunking audio into smaller pieces 9 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 10 | public protocol AudioChunking { 11 | func chunkAll(audioArray: [Float], maxChunkLength: Int, decodeOptions: DecodingOptions?) async throws -> [AudioChunk] 12 | } 13 | 14 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 15 | public extension AudioChunking { 16 | func updateSeekOffsetsForResults( 17 | chunkedResults: [Result<[TranscriptionResult], Swift.Error>], 18 | audioChunks: [AudioChunk] 19 | ) -> [TranscriptionResult] { 20 | var updatedTranscriptionResults = [TranscriptionResult]() 21 | for (index, chunkedResult) in chunkedResults.enumerated() { 22 | switch chunkedResult { 23 | case let .success(results): 24 | let seekTime = Float(audioChunks[index].seekOffsetIndex) / Float(WhisperKit.sampleRate) 25 | for result in results { 26 | var updatedSegments = [TranscriptionSegment]() 27 | for segment in result.segments { 28 | let updatedSegment = updateSegmentTimings(segment: segment, seekTime: seekTime) 29 | updatedSegments.append(updatedSegment) 30 | } 31 | var updatedResult = result 32 | updatedResult.seekTime = seekTime 33 | updatedResult.segments = updatedSegments 34 | updatedTranscriptionResults.append(updatedResult) 35 | } 36 | case let .failure(error): 37 | Logging.debug("Error transcribing chunk \(index): \(error)") 38 | } 39 | } 40 | return updatedTranscriptionResults 41 | } 42 | } 43 | 44 | /// A audio chunker that splits audio into smaller pieces based on voice activity detection 45 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 46 | open class VADAudioChunker: AudioChunking { 47 | /// prevent hallucinations at the end of the clip by stopping up to 1.0s early 48 | private let windowPadding: Int 49 | private let vad: VoiceActivityDetector 50 | 51 | public init(windowPadding: Int = 16000, vad: VoiceActivityDetector? = nil) { 52 | self.windowPadding = windowPadding 53 | self.vad = vad ?? EnergyVAD() 54 | } 55 | 56 | private func splitOnMiddleOfLongestSilence(audioArray: [Float], startIndex: Int, endIndex: Int) -> Int { 57 | // NOTE: we want to check just the 2nd part for the silence to attempt to get closest to a max length chunk 58 | let audioMidIndex = startIndex + (endIndex - startIndex) / 2 59 | let vadAudioSlice = Array(audioArray[audioMidIndex.. 
[AudioChunk] { 70 | // If the audio array length is less than or equal to maxLength, return it as a single chunk 71 | if audioArray.count <= maxChunkLength { 72 | return [AudioChunk(seekOffsetIndex: 0, audioSamples: audioArray)] 73 | } 74 | 75 | // First create chunks from seek clips 76 | let seekClips = prepareSeekClips(contentFrames: audioArray.count, decodeOptions: decodeOptions) 77 | 78 | var chunkedAudio = [AudioChunk]() 79 | for (seekClipStart, seekClipEnd) in seekClips { 80 | // Loop through the current clip until we reach the end 81 | // Typically this will be the full audio file, unless seek points are explicitly provided 82 | var startIndex = seekClipStart 83 | while startIndex < seekClipEnd - windowPadding { 84 | guard startIndex >= 0 && startIndex < audioArray.count else { 85 | throw WhisperError.audioProcessingFailed("startIndex is outside the buffer size") 86 | } 87 | 88 | // Make sure we still need chunking for this seek clip, otherwise use the original seek clip end 89 | var endIndex = seekClipEnd 90 | if startIndex + maxChunkLength < endIndex { 91 | // Adjust the end index based on VAD 92 | endIndex = splitOnMiddleOfLongestSilence( 93 | audioArray: audioArray, 94 | startIndex: startIndex, 95 | endIndex: min(audioArray.count, startIndex + maxChunkLength) 96 | ) 97 | } 98 | 99 | guard endIndex > startIndex else { 100 | break 101 | } 102 | Logging.debug("Found chunk from \(formatTimestamp(Float(startIndex) / Float(WhisperKit.sampleRate))) to \(formatTimestamp(Float(endIndex) / Float(WhisperKit.sampleRate)))") 103 | let audioSlice = AudioChunk(seekOffsetIndex: startIndex, audioSamples: Array(audioArray[startIndex.. Void 23 | 24 | /// Responsible for streaming audio from the microphone, processing it, and transcribing it in real-time. 25 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 26 | public actor AudioStreamTranscriber { 27 | private var state: AudioStreamTranscriber.State = .init() { 28 | didSet { 29 | stateChangeCallback?(oldValue, state) 30 | } 31 | } 32 | 33 | private let stateChangeCallback: AudioStreamTranscriberCallback? 34 | 35 | private let requiredSegmentsForConfirmation: Int 36 | private let useVAD: Bool 37 | private let silenceThreshold: Float 38 | private let compressionCheckWindow: Int 39 | private let transcribeTask: TranscribeTask 40 | private let audioProcessor: any AudioProcessing 41 | private let decodingOptions: DecodingOptions 42 | 43 | public init( 44 | audioEncoder: any AudioEncoding, 45 | featureExtractor: any FeatureExtracting, 46 | segmentSeeker: any SegmentSeeking, 47 | textDecoder: any TextDecoding, 48 | tokenizer: any WhisperTokenizer, 49 | audioProcessor: any AudioProcessing, 50 | decodingOptions: DecodingOptions, 51 | requiredSegmentsForConfirmation: Int = 2, 52 | silenceThreshold: Float = 0.3, 53 | compressionCheckWindow: Int = 60, 54 | useVAD: Bool = true, 55 | stateChangeCallback: AudioStreamTranscriberCallback? 
56 | ) { 57 | self.transcribeTask = TranscribeTask( 58 | currentTimings: TranscriptionTimings(), 59 | progress: Progress(), 60 | audioEncoder: audioEncoder, 61 | featureExtractor: featureExtractor, 62 | segmentSeeker: segmentSeeker, 63 | textDecoder: textDecoder, 64 | tokenizer: tokenizer 65 | ) 66 | self.audioProcessor = audioProcessor 67 | self.decodingOptions = decodingOptions 68 | self.requiredSegmentsForConfirmation = requiredSegmentsForConfirmation 69 | self.silenceThreshold = silenceThreshold 70 | self.compressionCheckWindow = compressionCheckWindow 71 | self.useVAD = useVAD 72 | self.stateChangeCallback = stateChangeCallback 73 | } 74 | 75 | public func startStreamTranscription() async throws { 76 | guard !state.isRecording else { return } 77 | guard await AudioProcessor.requestRecordPermission() else { 78 | Logging.error("Microphone access was not granted.") 79 | return 80 | } 81 | state.isRecording = true 82 | try audioProcessor.startRecordingLive { [weak self] _ in 83 | Task { [weak self] in 84 | await self?.onAudioBufferCallback() 85 | } 86 | } 87 | await realtimeLoop() 88 | Logging.info("Realtime transcription has started") 89 | } 90 | 91 | public func stopStreamTranscription() { 92 | state.isRecording = false 93 | audioProcessor.stopRecording() 94 | Logging.info("Realtime transcription has ended") 95 | } 96 | 97 | private func realtimeLoop() async { 98 | while state.isRecording { 99 | do { 100 | try await transcribeCurrentBuffer() 101 | } catch { 102 | Logging.error("Error: \(error.localizedDescription)") 103 | break 104 | } 105 | } 106 | } 107 | 108 | private func onAudioBufferCallback() { 109 | state.bufferEnergy = audioProcessor.relativeEnergy 110 | } 111 | 112 | private func onProgressCallback(_ progress: TranscriptionProgress) { 113 | let fallbacks = Int(progress.timings.totalDecodingFallbacks) 114 | if progress.text.count < state.currentText.count { 115 | if fallbacks == state.currentFallbacks { 116 | state.unconfirmedText.append(state.currentText) 117 | } else { 118 | Logging.info("Fallback occured: \(fallbacks)") 119 | } 120 | } 121 | state.currentText = progress.text 122 | state.currentFallbacks = fallbacks 123 | } 124 | 125 | private func transcribeCurrentBuffer() async throws { 126 | // Retrieve the current audio buffer from the audio processor 127 | let currentBuffer = audioProcessor.audioSamples 128 | 129 | // Calculate the size and duration of the next buffer segment 130 | let nextBufferSize = currentBuffer.count - state.lastBufferSize 131 | let nextBufferSeconds = Float(nextBufferSize) / Float(WhisperKit.sampleRate) 132 | 133 | // Only run the transcribe if the next buffer has at least 1 second of audio 134 | guard nextBufferSeconds > 1 else { 135 | if state.currentText == "" { 136 | state.currentText = "Waiting for speech..." 137 | } 138 | return try await Task.sleep(nanoseconds: 100_000_000) // sleep for 100ms for next buffer 139 | } 140 | 141 | if useVAD { 142 | let voiceDetected = AudioProcessor.isVoiceDetected( 143 | in: audioProcessor.relativeEnergy, 144 | nextBufferInSeconds: nextBufferSeconds, 145 | silenceThreshold: silenceThreshold 146 | ) 147 | // Only run the transcribe if the next buffer has voice 148 | if !voiceDetected { 149 | Logging.debug("No voice detected, skipping transcribe") 150 | if state.currentText == "" { 151 | state.currentText = "Waiting for speech..." 
152 | } 153 | // Sleep for 100ms and check the next buffer 154 | return try await Task.sleep(nanoseconds: 100_000_000) 155 | } 156 | } 157 | 158 | // Run transcribe 159 | state.lastBufferSize = currentBuffer.count 160 | 161 | let transcription = try await transcribeAudioSamples(Array(currentBuffer)) 162 | 163 | state.currentText = "" 164 | state.unconfirmedText = [] 165 | let segments = transcription.segments 166 | 167 | // Logic for moving segments to confirmedSegments 168 | if segments.count > requiredSegmentsForConfirmation { 169 | // Calculate the number of segments to confirm 170 | let numberOfSegmentsToConfirm = segments.count - requiredSegmentsForConfirmation 171 | 172 | // Confirm the required number of segments 173 | let confirmedSegmentsArray = Array(segments.prefix(numberOfSegmentsToConfirm)) 174 | let remainingSegments = Array(segments.suffix(requiredSegmentsForConfirmation)) 175 | 176 | // Update lastConfirmedSegmentEnd based on the last confirmed segment 177 | if let lastConfirmedSegment = confirmedSegmentsArray.last, lastConfirmedSegment.end > state.lastConfirmedSegmentEndSeconds { 178 | state.lastConfirmedSegmentEndSeconds = lastConfirmedSegment.end 179 | 180 | // Add confirmed segments to the confirmedSegments array 181 | if !state.confirmedSegments.contains(confirmedSegmentsArray) { 182 | state.confirmedSegments.append(contentsOf: confirmedSegmentsArray) 183 | } 184 | } 185 | 186 | // Update transcriptions to reflect the remaining segments 187 | state.unconfirmedSegments = remainingSegments 188 | } else { 189 | // Handle the case where segments are fewer or equal to required 190 | state.unconfirmedSegments = segments 191 | } 192 | } 193 | 194 | private func transcribeAudioSamples(_ samples: [Float]) async throws -> TranscriptionResult { 195 | var options = decodingOptions 196 | options.clipTimestamps = [state.lastConfirmedSegmentEndSeconds] 197 | let checkWindow = compressionCheckWindow 198 | return try await transcribeTask.run(audioArray: samples, decodeOptions: options) { [weak self] progress in 199 | Task { [weak self] in 200 | await self?.onProgressCallback(progress) 201 | } 202 | return AudioStreamTranscriber.shouldStopEarly(progress: progress, options: options, compressionCheckWindow: checkWindow) 203 | } 204 | } 205 | 206 | private static func shouldStopEarly( 207 | progress: TranscriptionProgress, 208 | options: DecodingOptions, 209 | compressionCheckWindow: Int 210 | ) -> Bool? { 211 | let currentTokens = progress.tokens 212 | if currentTokens.count > compressionCheckWindow { 213 | let checkTokens: [Int] = currentTokens.suffix(compressionCheckWindow) 214 | let compressionRatio = compressionRatio(of: checkTokens) 215 | if compressionRatio > options.compressionRatioThreshold ?? 0.0 { 216 | return false 217 | } 218 | } 219 | if let avgLogprob = progress.avgLogprob, let logProbThreshold = options.logProbThreshold { 220 | if avgLogprob < logProbThreshold { 221 | return false 222 | } 223 | } 224 | return nil 225 | } 226 | } 227 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/Audio/EnergyVAD.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 
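// Illustrative usage sketch (comments only; assumes a hypothetical 16 kHz `audioSamples: [Float]`
// buffer and the default initializer values shown below). EnergyVAD is the detector that
// VADAudioChunker falls back to when no custom VAD is supplied, and in-package code could use it as:
//
//     let vad = EnergyVAD(frameLength: 0.1, energyThreshold: 0.02)
//     let activity = vad.voiceActivity(in: audioSamples)             // one Bool per 0.1 s frame
//     let speechChunks = vad.calculateActiveChunks(in: audioSamples) // (startIndex, endIndex) in samples
//
// `voiceActivity(in:)` is overridden below; `calculateActiveChunks(in:)` comes from the
// VoiceActivityDetector base class defined in VoiceActivityDetector.swift.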
3 | 4 | import Foundation 5 | 6 | /// Voice activity detection based on energy threshold 7 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 8 | final class EnergyVAD: VoiceActivityDetector { 9 | var energyThreshold: Float 10 | 11 | /// Initialize a new EnergyVAD instance 12 | /// - Parameters: 13 | /// - sampleRate: Audio sample rate 14 | /// - frameLength: Frame length in seconds 15 | /// - frameOverlap: frame overlap in seconds, this will include `frameOverlap` length audio into the `frameLength` and is helpful to catch audio that starts exactly at chunk boundaries 16 | /// - energyThreshold: minimal energy threshold 17 | convenience init( 18 | sampleRate: Int = WhisperKit.sampleRate, 19 | frameLength: Float = 0.1, 20 | frameOverlap: Float = 0.0, 21 | energyThreshold: Float = 0.02 22 | ) { 23 | self.init( 24 | sampleRate: sampleRate, 25 | // Compute frame length and overlap in number of samples 26 | frameLengthSamples: Int(frameLength * Float(sampleRate)), 27 | frameOverlapSamples: Int(frameOverlap * Float(sampleRate)), 28 | energyThreshold: energyThreshold 29 | ) 30 | } 31 | 32 | required init( 33 | sampleRate: Int = 16000, 34 | frameLengthSamples: Int, 35 | frameOverlapSamples: Int = 0, 36 | energyThreshold: Float = 0.02 37 | ) { 38 | self.energyThreshold = energyThreshold 39 | super.init(sampleRate: sampleRate, frameLengthSamples: frameLengthSamples, frameOverlapSamples: frameOverlapSamples) 40 | } 41 | 42 | override func voiceActivity(in waveform: [Float]) -> [Bool] { 43 | let chunkRatio = Double(waveform.count) / Double(frameLengthSamples) 44 | 45 | // Round up if uneven, the final chunk will not be a full `frameLengthSamples` long 46 | let count = Int(chunkRatio.rounded(.up)) 47 | 48 | let chunkedVoiceActivity = AudioProcessor.calculateVoiceActivityInChunks( 49 | of: waveform, 50 | chunkCount: count, 51 | frameLengthSamples: frameLengthSamples, 52 | frameOverlapSamples: frameOverlapSamples, 53 | energyThreshold: energyThreshold 54 | ) 55 | 56 | return chunkedVoiceActivity 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/Audio/VoiceActivityDetector.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Foundation 5 | 6 | /// A base class for Voice Activity Detection (VAD), used to identify and separate segments of audio that contain human speech from those that do not. 7 | /// Subclasses must implement the `voiceActivity(in:)` method to provide specific voice activity detection functionality. 8 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 9 | open class VoiceActivityDetector { 10 | /// The sample rate of the audio signal, in samples per second. 11 | public let sampleRate: Int 12 | 13 | /// The length of each frame in samples. 14 | public let frameLengthSamples: Int 15 | 16 | /// The number of samples overlapping between consecutive frames. 17 | public let frameOverlapSamples: Int 18 | 19 | /// Initializes a new `VoiceActivityDetector` instance with the specified parameters. 20 | /// - Parameters: 21 | /// - sampleRate: The sample rate of the audio signal in samples per second. Defaults to 16000. 22 | /// - frameLengthSamples: The length of each frame in samples. 23 | /// - frameOverlapSamples: The number of samples overlapping between consecutive frames. Defaults to 0. 
24 | /// - Note: Subclasses should override the `voiceActivity(in:)` method to provide specific VAD functionality. 25 | public init( 26 | sampleRate: Int = 16000, 27 | frameLengthSamples: Int, 28 | frameOverlapSamples: Int = 0 29 | ) { 30 | self.sampleRate = sampleRate 31 | self.frameLengthSamples = frameLengthSamples 32 | self.frameOverlapSamples = frameOverlapSamples 33 | } 34 | 35 | /// Analyzes the provided audio waveform to determine which segments contain voice activity. 36 | /// - Parameter waveform: An array of `Float` values representing the audio waveform. 37 | /// - Returns: An array of `Bool` values where `true` indicates the presence of voice activity and `false` indicates silence. 38 | open func voiceActivity(in waveform: [Float]) -> [Bool] { 39 | fatalError("`voiceActivity` must be implemented by subclass") 40 | } 41 | 42 | /// Calculates and returns a list of active audio chunks, each represented by a start and end index. 43 | /// - Parameter waveform: An array of `Float` values representing the audio waveform. 44 | /// - Returns: An array of tuples where each tuple contains the start and end indices of an active audio chunk. 45 | public func calculateActiveChunks(in waveform: [Float]) -> [(startIndex: Int, endIndex: Int)] { 46 | let vad: [Bool] = voiceActivity(in: waveform) 47 | var result = [(startIndex: Int, endIndex: Int)]() 48 | 49 | // Temporary variables to hold the start of the current non-silent segment 50 | var currentStartIndex: Int? 51 | 52 | for (index, vadChunk) in vad.enumerated() { 53 | if vadChunk { 54 | let chunkStart = index * frameLengthSamples 55 | let chunkEnd = min(chunkStart + frameLengthSamples, waveform.count) 56 | 57 | if currentStartIndex != nil { 58 | // If we already have a starting point, just update the end point in the last added segment 59 | result[result.count - 1].endIndex = chunkEnd 60 | } else { 61 | // If there is no current start, this is a new segment 62 | currentStartIndex = chunkStart 63 | result.append((startIndex: chunkStart, endIndex: chunkEnd)) 64 | } 65 | } else { 66 | // Reset currentStartIndex when encountering a silent chunk 67 | currentStartIndex = nil 68 | } 69 | } 70 | 71 | return result 72 | } 73 | 74 | /// Converts a voice activity index to the corresponding audio sample index. 75 | /// - Parameter index: The voice activity index to convert. 76 | /// - Returns: The corresponding audio sample index. 77 | public func voiceActivityIndexToAudioSampleIndex(_ index: Int) -> Int { 78 | return index * frameLengthSamples 79 | } 80 | 81 | public func voiceActivityIndexToSeconds(_ index: Int) -> Float { 82 | return Float(voiceActivityIndexToAudioSampleIndex(index)) / Float(sampleRate) 83 | } 84 | 85 | /// Identifies the longest continuous period of silence within the provided voice activity detection results. 86 | /// - Parameter vadResult: An array of `Bool` values representing voice activity detection results. 87 | /// - Returns: A tuple containing the start and end indices of the longest silence period, or `nil` if no silence is found. 88 | public func findLongestSilence(in vadResult: [Bool]) -> (startIndex: Int, endIndex: Int)? { 89 | var longestStartIndex: Int? 90 | var longestEndIndex: Int? 
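        // Single pass over the frame-level VAD flags: speech frames are skipped, and for each run
        // of silence frames the start/end indices are recorded, keeping the longest run seen.
        // Indices here are VAD frame indices, not audio sample indices; callers can convert them
        // with voiceActivityIndexToAudioSampleIndex(_:) when needed.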
91 | var longestCount = 0 92 | var index = 0 93 | while index < vadResult.count { 94 | let value = vadResult[index] 95 | if value { 96 | // found non-silence, skip 97 | index += 1 98 | } else { 99 | // found beginning of silence, find the end 100 | var endIndex = index 101 | while endIndex < vadResult.count, !vadResult[endIndex] { 102 | endIndex += 1 103 | } 104 | let count = endIndex - index 105 | if count > longestCount { 106 | longestCount = count 107 | longestStartIndex = index 108 | longestEndIndex = endIndex 109 | } 110 | index = endIndex 111 | } 112 | } 113 | if let longestStartIndex, let longestEndIndex { 114 | return (startIndex: longestStartIndex, endIndex: longestEndIndex) 115 | } else { 116 | return nil 117 | } 118 | } 119 | 120 | // MARK: - Utility 121 | 122 | func voiceActivityClipTimestamps(in waveform: [Float]) -> [Float] { 123 | let nonSilentChunks = calculateActiveChunks(in: waveform) 124 | var clipTimestamps = [Float]() 125 | 126 | for chunk in nonSilentChunks { 127 | let startTimestamp = Float(chunk.startIndex) / Float(sampleRate) 128 | let endTimestamp = Float(chunk.endIndex) / Float(sampleRate) 129 | 130 | clipTimestamps.append(contentsOf: [startTimestamp, endTimestamp]) 131 | } 132 | 133 | return clipTimestamps 134 | } 135 | 136 | func calculateNonSilentSeekClips(in waveform: [Float]) -> [(start: Int, end: Int)] { 137 | let clipTimestamps = voiceActivityClipTimestamps(in: waveform) 138 | let options = DecodingOptions(clipTimestamps: clipTimestamps) 139 | let seekClips = prepareSeekClips(contentFrames: waveform.count, decodeOptions: options) 140 | return seekClips 141 | } 142 | 143 | func calculateSeekTimestamps(in waveform: [Float]) -> [(startTime: Float, endTime: Float)] { 144 | let nonSilentChunks = calculateActiveChunks(in: waveform) 145 | var seekTimestamps = [(startTime: Float, endTime: Float)]() 146 | 147 | for chunk in nonSilentChunks { 148 | let startTimestamp = Float(chunk.startIndex) / Float(sampleRate) 149 | let endTimestamp = Float(chunk.endIndex) / Float(sampleRate) 150 | 151 | seekTimestamps.append(contentsOf: [(startTime: startTimestamp, endTime: endTimestamp)]) 152 | } 153 | 154 | return seekTimestamps 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/AudioEncoder.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import CoreML 5 | 6 | public protocol AudioEncoderOutputType {} 7 | extension MLMultiArray: AudioEncoderOutputType {} 8 | 9 | /// AudioEncoding protocol defines the requirements for an audio encoding implementation. 10 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 11 | public protocol AudioEncoding { 12 | /// The size of the embedding produced by the encoder. 13 | var embedSize: Int? { get } 14 | 15 | /// Encodes the given audio features asynchronously. 16 | /// - Parameter features: The audio features to be encoded. 17 | /// - Returns: An optional tensor containing the encoded features. 18 | func encodeFeatures(_ features: any FeatureExtractorOutputType) async throws -> (any AudioEncoderOutputType)? 19 | } 20 | 21 | /// Backwards-compatible AudioEncoder implementation 22 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 23 | public class AudioEncoder: AudioEncoding, WhisperMLModel { 24 | public var model: MLModel? 25 | 26 | public var embedSize: Int? 
{ 27 | guard let inputDescription = model?.modelDescription.outputDescriptionsByName["encoder_output_embeds"] else { return nil } 28 | guard inputDescription.type == .multiArray else { return nil } 29 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil } 30 | let shape = shapeConstraint.shape.map { $0.intValue } 31 | return shape[1] 32 | } 33 | 34 | public var sequenceLength: Int? { 35 | guard let inputDescription = model?.modelDescription.outputDescriptionsByName["encoder_output_embeds"] else { return nil } 36 | guard inputDescription.type == .multiArray else { return nil } 37 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil } 38 | let shape = shapeConstraint.shape.map { $0.intValue } 39 | return shape[3] 40 | } 41 | 42 | public init() {} 43 | 44 | public func encodeFeatures(_ features: any FeatureExtractorOutputType) async throws -> (any AudioEncoderOutputType)? { 45 | guard let features = features as? MLMultiArray else { 46 | throw WhisperError.audioProcessingFailed("AudioEncoder input must be MLMultiArray") 47 | } 48 | 49 | return try await encodeFeatures(features) 50 | } 51 | 52 | public func encodeFeatures(_ features: MLMultiArray) async throws -> MLMultiArray? { 53 | guard let model else { 54 | throw WhisperError.modelsUnavailable() 55 | } 56 | try Task.checkCancellation() 57 | 58 | let interval = Logging.beginSignpost("EncodeAudio", signposter: Logging.AudioEncoding.signposter) 59 | defer { Logging.endSignpost("EncodeAudio", interval: interval, signposter: Logging.AudioEncoding.signposter) } 60 | 61 | let modelInputs = AudioEncoderInput(melspectrogram_features: features) 62 | let outputFeatures = try await model.asyncPrediction(from: modelInputs, options: MLPredictionOptions()) 63 | let output = AudioEncoderOutput(features: outputFeatures) 64 | return output.encoder_output_embeds 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/Configurations.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Foundation 5 | 6 | /// Configuration to initialize WhisperKit 7 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 8 | open class WhisperKitConfig { 9 | /// Name for whisper model to use 10 | public var model: String? 11 | /// Base URL for downloading models 12 | public var downloadBase: URL? 13 | /// Repository for downloading models 14 | public var modelRepo: String? 15 | /// Token for downloading models from repo (if required) 16 | public var modelToken: String? 17 | 18 | /// Folder to store models 19 | public var modelFolder: String? 20 | /// Folder to store tokenizers 21 | public var tokenizerFolder: URL? 22 | 23 | /// Model compute options, see `ModelComputeOptions` 24 | public var computeOptions: ModelComputeOptions? 25 | /// Audio input config to define how to process audio input 26 | public var audioInputConfig: AudioInputConfig? 27 | /// Audio processor for the model 28 | public var audioProcessor: (any AudioProcessing)? 29 | public var featureExtractor: (any FeatureExtracting)? 30 | public var audioEncoder: (any AudioEncoding)? 31 | public var textDecoder: (any TextDecoding)? 32 | public var logitsFilters: [any LogitsFiltering]? 33 | public var segmentSeeker: (any SegmentSeeking)? 34 | public var voiceActivityDetector: VoiceActivityDetector? 
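    // Illustrative note: the component slots above (audioProcessor, featureExtractor, audioEncoder,
    // textDecoder, logitsFilters, segmentSeeker, voiceActivityDetector) allow callers to inject
    // custom implementations at initialization time. For example, a hypothetical subclass
    // `MyCustomVAD: VoiceActivityDetector` (not a type in this package) could be wired in with:
    //
    //     let config = WhisperKitConfig(model: "base", voiceActivityDetector: MyCustomVAD())
    //
    // All other parameters keep their defaults from the initializer below.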
35 | 36 | /// Enable extra verbosity for logging 37 | public var verbose: Bool 38 | /// Maximum log level 39 | public var logLevel: Logging.LogLevel 40 | 41 | /// Enable model prewarming 42 | public var prewarm: Bool? 43 | /// Load models if available 44 | public var load: Bool? 45 | /// Download models if not available 46 | public var download: Bool 47 | /// Use background download session 48 | public var useBackgroundDownloadSession: Bool 49 | 50 | public init(model: String? = nil, 51 | downloadBase: URL? = nil, 52 | modelRepo: String? = nil, 53 | modelToken: String? = nil, 54 | modelFolder: String? = nil, 55 | tokenizerFolder: URL? = nil, 56 | computeOptions: ModelComputeOptions? = nil, 57 | audioInputConfig: AudioInputConfig? = nil, 58 | audioProcessor: (any AudioProcessing)? = nil, 59 | featureExtractor: (any FeatureExtracting)? = nil, 60 | audioEncoder: (any AudioEncoding)? = nil, 61 | textDecoder: (any TextDecoding)? = nil, 62 | logitsFilters: [any LogitsFiltering]? = nil, 63 | segmentSeeker: (any SegmentSeeking)? = nil, 64 | voiceActivityDetector: VoiceActivityDetector? = nil, 65 | verbose: Bool = true, 66 | logLevel: Logging.LogLevel = .info, 67 | prewarm: Bool? = nil, 68 | load: Bool? = nil, 69 | download: Bool = true, 70 | useBackgroundDownloadSession: Bool = false) 71 | { 72 | self.model = model 73 | self.downloadBase = downloadBase 74 | self.modelRepo = modelRepo 75 | self.modelToken = modelToken 76 | self.modelFolder = modelFolder 77 | self.tokenizerFolder = tokenizerFolder 78 | self.computeOptions = computeOptions 79 | self.audioInputConfig = audioInputConfig 80 | self.audioProcessor = audioProcessor 81 | self.featureExtractor = featureExtractor 82 | self.audioEncoder = audioEncoder 83 | self.textDecoder = textDecoder 84 | self.logitsFilters = logitsFilters 85 | self.segmentSeeker = segmentSeeker 86 | self.voiceActivityDetector = voiceActivityDetector 87 | self.verbose = verbose 88 | self.logLevel = logLevel 89 | self.prewarm = prewarm 90 | self.load = load 91 | self.download = download 92 | self.useBackgroundDownloadSession = useBackgroundDownloadSession 93 | } 94 | } 95 | 96 | /// Options for how to transcribe an audio file using WhisperKit. 97 | /// 98 | /// - Parameters: 99 | /// - verbose: Whether to display the text being decoded to the console. 100 | /// If true, displays all details; if false, displays minimal details; 101 | /// - task: Whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate') 102 | /// - language: Language spoken in the audio 103 | /// - temperature: Temperature to use for sampling. 104 | /// - temperatureIncrementOnFallback: Increment which will be 105 | /// successively added to temperature upon failures according to either `compressionRatioThreshold` 106 | /// or `logProbThreshold`. 107 | /// - temperatureFallbackCount: Number of times to increment temperature on fallback. 108 | /// - sampleLength: The maximum number of tokens to sample. 109 | /// - topK: Number of candidates when sampling with non-zero temperature. 110 | /// - usePrefillPrompt: If true, the prefill tokens will be forced according to task and language settings. 111 | /// - usePrefillCache: If true, the kv cache will be prefilled based on the prefill data mlmodel. 112 | /// - detectLanguage: Use this in conjuntion with `usePrefillPrompt: true` to detect the language of the input audio. 113 | /// - skipSpecialTokens: Whether to skip special tokens in the output. 
114 | /// - withoutTimestamps: Whether to include timestamps in the transcription result. 115 | /// - wordTimestamps: Whether to include word-level timestamps in the transcription result. 116 | /// - maxInitialTimestamp: Maximal initial timestamp. 117 | /// - clipTimestamps: Array of timestamps (in seconds) to split the audio into segments for transcription. 118 | /// - promptTokens: Array of token IDs to use as the conditioning prompt for the decoder. These are prepended to the prefill tokens. 119 | /// - prefixTokens: Array of token IDs to use as the initial prefix for the decoder. These are appended to the prefill tokens. 120 | /// - suppressBlank: If true, blank tokens will be suppressed during decoding. 121 | /// - supressTokens: List of token IDs to suppress during decoding. 122 | /// - compressionRatioThreshold: If the compression ratio of the transcription text is above this value, it is too repetitive and treated as failed. 123 | /// - logProbThreshold: If the average log probability over sampled tokens is below this value, treat as failed. 124 | /// - firstTokenLogProbThreshold: If the log probability over the first sampled token is below this value, treat as failed. 125 | /// - noSpeechThreshold: If the no speech probability is higher than this value AND the average log 126 | /// probability over sampled tokens is below `logProbThreshold`, consider the segment as silent. 127 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 128 | public struct DecodingOptions: Codable { 129 | public var verbose: Bool 130 | public var task: DecodingTask 131 | public var language: String? 132 | public var temperature: Float 133 | public var temperatureIncrementOnFallback: Float 134 | public var temperatureFallbackCount: Int 135 | public var sampleLength: Int 136 | public var topK: Int 137 | public var usePrefillPrompt: Bool 138 | public var usePrefillCache: Bool 139 | public var detectLanguage: Bool 140 | public var skipSpecialTokens: Bool 141 | public var withoutTimestamps: Bool 142 | public var wordTimestamps: Bool 143 | public var maxInitialTimestamp: Float? 144 | public var clipTimestamps: [Float] 145 | public var promptTokens: [Int]? 146 | public var prefixTokens: [Int]? 147 | public var suppressBlank: Bool 148 | public var supressTokens: [Int] 149 | public var compressionRatioThreshold: Float? 150 | public var logProbThreshold: Float? 151 | public var firstTokenLogProbThreshold: Float? 152 | public var noSpeechThreshold: Float? 153 | public var concurrentWorkerCount: Int 154 | public var chunkingStrategy: ChunkingStrategy? 155 | 156 | public init( 157 | verbose: Bool = false, 158 | task: DecodingTask = .transcribe, 159 | language: String? = nil, 160 | temperature: Float = 0.0, 161 | temperatureIncrementOnFallback: Float = 0.2, 162 | temperatureFallbackCount: Int = 5, 163 | sampleLength: Int = Constants.maxTokenContext, 164 | topK: Int = 5, 165 | usePrefillPrompt: Bool = true, 166 | usePrefillCache: Bool = true, 167 | detectLanguage: Bool? = nil, 168 | skipSpecialTokens: Bool = false, 169 | withoutTimestamps: Bool = false, 170 | wordTimestamps: Bool = false, 171 | maxInitialTimestamp: Float? = nil, 172 | clipTimestamps: [Float] = [], 173 | promptTokens: [Int]? = nil, 174 | prefixTokens: [Int]? = nil, 175 | suppressBlank: Bool = false, 176 | supressTokens: [Int]? = nil, 177 | compressionRatioThreshold: Float? = 2.4, 178 | logProbThreshold: Float? = -1.0, 179 | firstTokenLogProbThreshold: Float? = -1.5, 180 | noSpeechThreshold: Float? = 0.6, 181 | concurrentWorkerCount: Int? 
= nil, 182 | chunkingStrategy: ChunkingStrategy? = nil 183 | ) { 184 | self.verbose = verbose 185 | self.task = task 186 | self.language = language 187 | self.temperature = temperature 188 | self.temperatureIncrementOnFallback = temperatureIncrementOnFallback 189 | self.temperatureFallbackCount = temperatureFallbackCount 190 | self.sampleLength = sampleLength 191 | self.topK = topK 192 | self.usePrefillPrompt = usePrefillPrompt 193 | self.usePrefillCache = usePrefillCache 194 | self.detectLanguage = detectLanguage ?? !usePrefillPrompt // If prefill is false, detect language by default 195 | self.skipSpecialTokens = skipSpecialTokens 196 | self.withoutTimestamps = withoutTimestamps 197 | self.wordTimestamps = wordTimestamps 198 | self.maxInitialTimestamp = maxInitialTimestamp 199 | self.clipTimestamps = clipTimestamps 200 | self.promptTokens = promptTokens 201 | self.prefixTokens = prefixTokens 202 | self.suppressBlank = suppressBlank 203 | self.supressTokens = supressTokens ?? [] // nonSpeechTokens() // TODO: implement these as default 204 | self.compressionRatioThreshold = compressionRatioThreshold 205 | self.logProbThreshold = logProbThreshold 206 | self.firstTokenLogProbThreshold = firstTokenLogProbThreshold 207 | self.noSpeechThreshold = noSpeechThreshold 208 | // Set platform-specific default worker count if not explicitly provided 209 | // Non-macOS devices have shown regressions with >4 workers, default to 4 for safety 210 | #if os(macOS) 211 | self.concurrentWorkerCount = concurrentWorkerCount ?? 16 212 | #else 213 | self.concurrentWorkerCount = concurrentWorkerCount ?? 4 214 | #endif 215 | self.chunkingStrategy = chunkingStrategy 216 | } 217 | } 218 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/FeatureExtractor.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Accelerate 5 | import AVFoundation 6 | import CoreGraphics 7 | import CoreML 8 | import Foundation 9 | 10 | public protocol FeatureExtractorOutputType {} 11 | extension MLMultiArray: FeatureExtractorOutputType {} 12 | 13 | public protocol FeatureExtracting { 14 | associatedtype OutputType: FeatureExtractorOutputType 15 | 16 | var melCount: Int? { get } 17 | var windowSamples: Int? { get } 18 | func logMelSpectrogram(fromAudio inputAudio: MLMultiArray) async throws -> OutputType? 19 | } 20 | 21 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 22 | open class FeatureExtractor: FeatureExtracting, WhisperMLModel { 23 | public var model: MLModel? 24 | 25 | public init() {} 26 | 27 | public var melCount: Int? { 28 | guard let inputDescription = model?.modelDescription.outputDescriptionsByName["melspectrogram_features"] else { return nil } 29 | guard inputDescription.type == .multiArray else { return nil } 30 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil } 31 | let shape = shapeConstraint.shape.map { $0.intValue } 32 | return shape[1] 33 | } 34 | 35 | public var windowSamples: Int? 
{ 36 | guard let inputDescription = model?.modelDescription.inputDescriptionsByName["audio"] else { return nil } 37 | guard inputDescription.type == .multiArray else { return nil } 38 | guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil } 39 | let shape = shapeConstraint.shape.map { $0.intValue } 40 | return shape[0] // The audio input is a 1D array 41 | } 42 | 43 | public func logMelSpectrogram(fromAudio inputAudio: MLMultiArray) async throws -> MLMultiArray? { 44 | guard let model else { 45 | throw WhisperError.modelsUnavailable() 46 | } 47 | try Task.checkCancellation() 48 | 49 | let interval = Logging.beginSignpost("ExtractAudioFeatures", signposter: Logging.FeatureExtractor.signposter) 50 | defer { Logging.endSignpost("ExtractAudioFeatures", interval: interval, signposter: Logging.FeatureExtractor.signposter) } 51 | 52 | let modelInputs = MelSpectrogramInput(audio: inputAudio) 53 | let outputFeatures = try await model.asyncPrediction(from: modelInputs, options: MLPredictionOptions()) 54 | let output = MelSpectrogramOutput(features: outputFeatures) 55 | return output.melspectrogramFeatures 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/ResultWriter.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Foundation 5 | 6 | public protocol ResultWriting { 7 | var outputDir: String { get } 8 | func write(result: TranscriptionResult, to file: String, options: [String: Any]?) -> Result<String, Swift.Error> 9 | func formatTime(seconds: Float, alwaysIncludeHours: Bool, decimalMarker: String) -> String 10 | } 11 | 12 | public extension ResultWriting { 13 | /// Format a time value as a string 14 | func formatTime(seconds: Float, alwaysIncludeHours: Bool, decimalMarker: String) -> String { 15 | let hrs = Int(seconds / 3600) 16 | let mins = Int((seconds.truncatingRemainder(dividingBy: 3600)) / 60) 17 | let secs = Int(seconds.truncatingRemainder(dividingBy: 60)) 18 | let msec = Int((seconds - floor(seconds)) * 1000) 19 | 20 | if alwaysIncludeHours || hrs > 0 { 21 | return String(format: "%02d:%02d:%02d\(decimalMarker)%03d", hrs, mins, secs, msec) 22 | } else { 23 | return String(format: "%02d:%02d\(decimalMarker)%03d", mins, secs, msec) 24 | } 25 | } 26 | 27 | func formatSegment(index: Int, start: Float, end: Float, text: String) -> String { 28 | let startFormatted = formatTime(seconds: Float(start), alwaysIncludeHours: true, decimalMarker: ",") 29 | let endFormatted = formatTime(seconds: Float(end), alwaysIncludeHours: true, decimalMarker: ",") 30 | return "\(index)\n\(startFormatted) --> \(endFormatted)\n\(text)\n\n" 31 | } 32 | 33 | func formatTiming(start: Float, end: Float, text: String) -> String { 34 | let startFormatted = formatTime(seconds: Float(start), alwaysIncludeHours: false, decimalMarker: ".") 35 | let endFormatted = formatTime(seconds: Float(end), alwaysIncludeHours: false, decimalMarker: ".") 36 | return "\(startFormatted) --> \(endFormatted)\n\(text)\n\n" 37 | } 38 | } 39 | 40 | open class WriteJSON: ResultWriting { 41 | public let outputDir: String 42 | 43 | public init(outputDir: String) { 44 | self.outputDir = outputDir 45 | } 46 | 47 | /// Write a transcription result to a JSON file 48 | /// - Parameters: 49 | /// - result: Completed transcription result 50 | /// - file: Name of the file to write, without the extension 51 | /// -
options: Not used 52 | /// - Returns: The URL of the written file, or an error if the write failed 53 | public func write(result: TranscriptionResult, to file: String, options: [String: Any]? = nil) -> Result<String, Swift.Error> { 54 | let reportPathURL = URL(fileURLWithPath: outputDir) 55 | let reportURL = reportPathURL.appendingPathComponent("\(file).json") 56 | let jsonEncoder = JSONEncoder() 57 | jsonEncoder.outputFormatting = .prettyPrinted 58 | do { 59 | let reportJson = try jsonEncoder.encode(result) 60 | try reportJson.write(to: reportURL) 61 | } catch { 62 | return .failure(error) 63 | } 64 | 65 | return .success(reportURL.absoluteString) 66 | } 67 | } 68 | 69 | open class WriteSRT: ResultWriting { 70 | public let outputDir: String 71 | 72 | public init(outputDir: String) { 73 | self.outputDir = outputDir 74 | } 75 | 76 | public func write(result: TranscriptionResult, to file: String, options: [String: Any]? = nil) -> Result<String, Swift.Error> { 77 | let outputPathURL = URL(fileURLWithPath: outputDir) 78 | let outputFileURL = outputPathURL.appendingPathComponent("\(file).srt") 79 | 80 | do { 81 | var srtContent = "" 82 | var index = 1 83 | for segment in result.segments { 84 | if let wordTimings = segment.words, !wordTimings.isEmpty { 85 | for wordTiming in wordTimings { 86 | srtContent += formatSegment(index: index, start: wordTiming.start, end: wordTiming.end, text: wordTiming.word) 87 | index += 1 88 | } 89 | } else { 90 | // Use segment timing if word timings are not available 91 | srtContent += formatSegment(index: index, start: segment.start, end: segment.end, text: segment.text) 92 | index += 1 93 | } 94 | } 95 | 96 | try srtContent.write(to: outputFileURL, atomically: true, encoding: .utf8) 97 | return .success(outputFileURL.absoluteString) 98 | } catch { 99 | return .failure(error) 100 | } 101 | } 102 | } 103 | 104 | open class WriteVTT: ResultWriting { 105 | public let outputDir: String 106 | 107 | public init(outputDir: String) { 108 | self.outputDir = outputDir 109 | } 110 | 111 | public func write(result: TranscriptionResult, to file: String, options: [String: Any]? = nil) -> Result<String, Swift.Error> { 112 | let outputPathURL = URL(fileURLWithPath: outputDir) 113 | let outputFileURL = outputPathURL.appendingPathComponent("\(file).vtt") 114 | 115 | do { 116 | var vttContent = "WEBVTT\n\n" 117 | for segment in result.segments { 118 | if let wordTimings = segment.words, !wordTimings.isEmpty { 119 | for wordTiming in wordTimings { 120 | vttContent += formatTiming(start: wordTiming.start, end: wordTiming.end, text: wordTiming.word) 121 | } 122 | } else { 123 | // Use segment timing if word timings are not available 124 | vttContent += formatTiming(start: segment.start, end: segment.end, text: segment.text) 125 | } 126 | } 127 | 128 | try vttContent.write(to: outputFileURL, atomically: true, encoding: .utf8) 129 | return .success(outputFileURL.absoluteString) 130 | } catch { 131 | return .failure(error) 132 | } 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/Text/LogitsFilter.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
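// NOTE: every filter in this file conforms to `LogitsFiltering` and works the same way: it writes
// -FloatType.infinity into the logits entries of tokens that are not allowed at the current decoding
// step, so the sampler can never select them. A minimal, illustrative sketch of how a caller might
// chain such filters (not the actual TextDecoder implementation; `filters`, `logits`, and `tokens`
// are assumed to be provided by the caller):
//
//     var filteredLogits = logits
//     for filter in filters {
//         filteredLogits = filter.filterLogits(filteredLogits, withTokens: tokens)
//     }
//     // filteredLogits is then passed to a TokenSampling implementation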
3 | 4 | import Accelerate 5 | import CoreML 6 | import Foundation 7 | import Tokenizers 8 | 9 | public protocol LogitsFiltering { 10 | func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray 11 | } 12 | 13 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 14 | open class SuppressTokensFilter: LogitsFiltering { 15 | let suppressTokens: [Int] 16 | private let suppressTokenIndexes: [[NSNumber]] 17 | 18 | public init(suppressTokens: [Int]) { 19 | self.suppressTokens = suppressTokens 20 | self.suppressTokenIndexes = suppressTokens.map { [0, 0, $0 as NSNumber] } 21 | } 22 | 23 | public func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray { 24 | logits.fill(indexes: suppressTokenIndexes, with: -FloatType.infinity) 25 | return logits 26 | } 27 | } 28 | 29 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 30 | open class SuppressBlankFilter: LogitsFiltering { 31 | let specialTokens: SpecialTokens 32 | let sampleBegin: Int 33 | private let suppressTokenIndexes: [[NSNumber]] 34 | 35 | public init( 36 | specialTokens: SpecialTokens, 37 | sampleBegin: Int 38 | ) { 39 | self.specialTokens = specialTokens 40 | self.sampleBegin = sampleBegin 41 | self.suppressTokenIndexes = [ 42 | [0, 0, specialTokens.whitespaceToken as NSNumber], 43 | [0, 0, specialTokens.endToken as NSNumber], 44 | ] 45 | } 46 | 47 | public func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray { 48 | guard tokens.count == sampleBegin else { 49 | return logits 50 | } 51 | logits.fill(indexes: suppressTokenIndexes, with: -FloatType.infinity) 52 | return logits 53 | } 54 | } 55 | 56 | /// Implementation based on https://github.com/openai/whisper/blob/master/whisper/decoding.py#L441 57 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 58 | open class TimestampRulesFilter: LogitsFiltering { 59 | let specialTokens: SpecialTokens 60 | let sampleBegin: Int 61 | let maxInitialTimestampIndex: Int? 62 | let isModelMultilingual: Bool 63 | 64 | public init( 65 | specialTokens: SpecialTokens, 66 | sampleBegin: Int, 67 | maxInitialTimestampIndex: Int?, 68 | isModelMultilingual: Bool 69 | ) { 70 | self.specialTokens = specialTokens 71 | self.sampleBegin = sampleBegin 72 | self.maxInitialTimestampIndex = maxInitialTimestampIndex 73 | self.isModelMultilingual = isModelMultilingual 74 | } 75 | 76 | public func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray { 77 | guard let sampleBegin = sampleBegin(for: tokens), 78 | sampleBegin <= tokens.count 79 | else { 80 | // Early return if we are still prefilling the prompt 81 | return logits 82 | } 83 | 84 | // suppress <|notimestamps|> which is handled by `withoutTimestamps` 85 | logits.fill(indexes: [[0, 0, specialTokens.noTimestampsToken as NSNumber]], with: -FloatType.infinity) 86 | 87 | if tokens.count > sampleBegin { 88 | // timestamps have to appear in pairs, except directly before EOT; mask logits accordingly 89 | let sampledTokens = tokens[sampleBegin...] 90 | let lastWasTimestamp = sampledTokens.count >= 1 && sampledTokens.last! >= specialTokens.timeTokenBegin 91 | let penultimateWasTimestamp = sampledTokens.count < 2 || sampledTokens.dropLast().last! 
>= specialTokens.timeTokenBegin 92 | if lastWasTimestamp { 93 | if penultimateWasTimestamp { 94 | // has to be non-timestamp 95 | logits.fillLastDimension(indexes: specialTokens.timeTokenBegin..<logits.count, with: -FloatType.infinity) 96 | } else { 97 | // cannot be normal text tokens 98 | logits.fillLastDimension(indexes: 0..<specialTokens.endToken, with: -FloatType.infinity) 99 | } 100 | } 101 | 102 | let timestamps = sampledTokens.filter { $0 >= specialTokens.timeTokenBegin } 103 | if let lastTimestamp = timestamps.last { 104 | // timestamps shouldn't decrease; forbid timestamp tokens smaller than the last 105 | // also force each segment to have a nonzero length, to prevent infinite looping 106 | let timestampLast = 107 | if lastWasTimestamp && !penultimateWasTimestamp { 108 | lastTimestamp 109 | } else { 110 | lastTimestamp + 1 111 | } 112 | logits.fillLastDimension(indexes: specialTokens.timeTokenBegin..<timestampLast, with: -FloatType.infinity) 113 | } 114 | } 115 | 116 | // TODO: Implement the following initial timestamp logic, 117 | // currently unused because the prefill prompt supplies the initial timestamp token every time 118 | // if tokens.count == sampleBegin { 119 | // // suppress generating non-timestamp tokens at the beginning 120 | // logits.fillLastDimension(indexes: 0..<specialTokens.timeTokenBegin, with: -FloatType.infinity) 121 | // // apply the `maxInitialTimestamp` option 122 | // if let maxInitialTimestampIndex { 123 | // let lastAllowed = specialTokens.timeTokenBegin + maxInitialTimestampIndex + 1 124 | // logits.fillLastDimension(indexes: lastAllowed..<logits.count, with: -FloatType.infinity) 125 | // } 126 | // } 127 | 128 | // if sum of probability over timestamps is above any other token, sample timestamp 129 | if sumOfProbabilityOverTimestampsIsAboveAnyOtherToken(logits: logits, timeTokenBegin: specialTokens.timeTokenBegin) { 130 | logits.fillLastDimension(indexes: 0..<specialTokens.timeTokenBegin, with: -FloatType.infinity) 131 | } 132 | 133 | return logits 134 | } 135 | private func sampleBegin(for tokens: [Int]) -> Int? { 136 | if isModelMultilingual { 137 | // NOTE: for multilingual model we don't want to supress "<|transcribe|>" or "<|translate|>" tokens 138 | if let taskTokenIndex = tokens.prefix(3).firstIndex(where: { $0 == specialTokens.transcribeToken || $0 == specialTokens.translateToken }) { 139 | return max(taskTokenIndex + 1, sampleBegin) 140 | } else { 141 | return nil 142 | } 143 | } else { 144 | return sampleBegin 145 | } 146 | } 147 | 148 | private func sumOfProbabilityOverTimestampsIsAboveAnyOtherToken(logits: MLMultiArray, timeTokenBegin: Int) -> Bool { 149 | let timeTokenBeginOffset = logits.linearOffset(for: [0, 0, timeTokenBegin as NSNumber]) 150 | 151 | let logprobsInputPointer = UnsafeMutableRawBufferPointer( 152 | start: logits.dataPointer, 153 | count: logits.count * MemoryLayout<FloatType>.stride 154 | ) 155 | 156 | guard let logprobsInputDescriptor = BNNSNDArrayDescriptor( 157 | data: logprobsInputPointer, 158 | scalarType: FloatType.self, 159 | shape: .vector(logits.count, stride: 1) 160 | ) else { 161 | Logging.error("Cannot create `logprobsInputDescriptor`") 162 | return false 163 | } 164 | 165 | let logprobs = BNNSNDArrayDescriptor.allocateUninitialized( 166 | scalarType: FloatType.self, 167 | shape: .vector(logits.count, stride: 1) 168 | ) 169 | defer { logprobs.deallocate() } 170 | 171 | do { 172 | try BNNS.applyActivation( 173 | activation: BNNS.ActivationFunction.logSoftmax, 174 | input: logprobsInputDescriptor, 175 | output: logprobs, 176 | batchSize: 1 177 | ) 178 | 179 | let timeTokenCount = logits.count - timeTokenBeginOffset 180 | let noTimeTokenCount = timeTokenBeginOffset 181 | let logSumExpInputPointer = UnsafeMutableRawBufferPointer( 182 | start: logprobs.data!.advanced(by: timeTokenBeginOffset * MemoryLayout<FloatType>.stride), 183 | count: timeTokenCount * MemoryLayout<FloatType>.stride 184 | ) 185 | 186 | guard let logSumExpInputDescriptor = BNNSNDArrayDescriptor( 187 | data: logSumExpInputPointer, 188 | scalarType: FloatType.self, 189 | shape: .vector(timeTokenCount, stride: 1) 190 | ) else { 191 | Logging.error("Cannot create `logSumExpInputDescriptor`") 192 | return false 193 | } 194 | 195 | let timestampLogProb = BNNSNDArrayDescriptor.allocateUninitialized( 196 | scalarType: FloatType.self, 197 | shape: .vector(1, stride: 1) 198 | ) 199 | defer { timestampLogProb.deallocate() } 200 | 201 | try BNNS.applyReduction( 202 | .logSumExp, 203 | input: logSumExpInputDescriptor, 204 | output: timestampLogProb, 205 | weights: nil 206 | ) 207 | 208 | let maxTextTokenLogProbInputPointer = UnsafeMutableRawBufferPointer( 209 | start: logprobs.data, 210 | count: noTimeTokenCount * MemoryLayout<FloatType>.stride 211 | ) 212 | 213 | guard let maxTextTokenLogProbInputDescriptor =
BNNSNDArrayDescriptor( 214 | data: maxTextTokenLogProbInputPointer, 215 | scalarType: FloatType.self, 216 | shape: .vector(noTimeTokenCount, stride: 1) 217 | ) else { 218 | Logging.error("Cannot create `maxTextTokenLogProbInputDescriptor`") 219 | return false 220 | } 221 | 222 | let maxTextTokenLogProb = BNNSNDArrayDescriptor.allocateUninitialized( 223 | scalarType: FloatType.self, 224 | shape: .vector(1, stride: 1) 225 | ) 226 | defer { maxTextTokenLogProb.deallocate() } 227 | 228 | try BNNS.applyReduction( 229 | .max, 230 | input: maxTextTokenLogProbInputDescriptor, 231 | output: maxTextTokenLogProb, 232 | weights: nil 233 | ) 234 | 235 | guard let timestampLogProbValue = timestampLogProb.makeArray(of: FloatType.self)?.first, 236 | let maxTextTokenLogProbValue = maxTextTokenLogProb.makeArray(of: FloatType.self)?.first 237 | else { 238 | Logging.error("Cannot create logProb arrays") 239 | return false 240 | } 241 | return timestampLogProbValue > maxTextTokenLogProbValue 242 | } catch { 243 | Logging.error("TimestampRulesFilter error: \(error)") 244 | return false 245 | } 246 | } 247 | } 248 | 249 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 250 | open class LanguageLogitsFilter: LogitsFiltering { 251 | let allLanguageTokens: Set<Int> 252 | let logitsDim: Int 253 | let sampleBegin: Int 254 | let nonLanguageTokenIndexes: [[NSNumber]] 255 | 256 | public init(allLanguageTokens: Set<Int>, logitsDim: Int, sampleBegin: Int) { 257 | self.allLanguageTokens = allLanguageTokens 258 | self.logitsDim = logitsDim 259 | self.sampleBegin = sampleBegin 260 | self.nonLanguageTokenIndexes = LanguageLogitsFilter.getNonLanguageTokenIndexes(logitsDim: self.logitsDim, allLanguageTokens: self.allLanguageTokens) 261 | } 262 | 263 | /// Retain the logits that correspond to language tokens and suppress non-language tokens 264 | public func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray { 265 | guard tokens.count >= sampleBegin else { 266 | return logits 267 | } 268 | logits.fill(indexes: nonLanguageTokenIndexes, with: -FloatType.infinity) 269 | return logits 270 | } 271 | 272 | private static func getNonLanguageTokenIndexes(logitsDim: Int, allLanguageTokens: Set<Int>) -> [[NSNumber]] { 273 | var indexes: [[NSNumber]] = [] 274 | for i in 0..<logitsDim { 275 | if !allLanguageTokens.contains(i) { 276 | indexes.append([0, 0, i as NSNumber]) 277 | } 278 | } 279 | return indexes 280 | } 281 | } 282 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/Text/TokenSampler.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Accelerate 5 | import CoreML 6 | import Foundation 7 | 8 | public protocol TokenSampling { 9 | func update(tokens: [Int], logits: MLMultiArray, logProbs: [Float]) ->
SamplingResult 10 | func finalize(tokens: [Int], logProbs: [Float]) -> SamplingResult 11 | } 12 | 13 | public struct SamplingResult { 14 | public var tokens: [Int] 15 | public var logProbs: [Float] 16 | public var completed: Bool 17 | 18 | public init( 19 | tokens: [Int], 20 | logProbs: [Float], 21 | completed: Bool 22 | ) { 23 | self.tokens = tokens 24 | self.logProbs = logProbs 25 | self.completed = completed 26 | } 27 | } 28 | 29 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 30 | open class GreedyTokenSampler: TokenSampling { 31 | public var temperature: FloatType 32 | public var eotToken: Int 33 | public var decodingOptions: DecodingOptions 34 | 35 | public init(temperature: FloatType, eotToken: Int, decodingOptions: DecodingOptions) { 36 | self.temperature = temperature 37 | self.eotToken = eotToken 38 | self.decodingOptions = decodingOptions 39 | } 40 | 41 | #if canImport(CoreML.MLState) 42 | @available(macOS 15, iOS 18, watchOS 11, visionOS 2, *) 43 | private func sampleWithMLTensor(logits: MLMultiArray) -> (token: Int, logprob: Float) { 44 | // Use MLTensor operations if available for sampling 45 | // Reference: https://github.com/huggingface/swift-transformers/blob/preview/Sources/Generation/Decoders.swift 46 | var logitsTensor = MLTensor(MLShapedArray(logits)).cast(to: Float.self) 47 | var nextTokenTensor: MLTensor 48 | var nextLogprobTensor: MLTensor 49 | 50 | if temperature != 0.0 { 51 | // Scale logits by temperature if > 0 52 | logitsTensor = logitsTensor / temperature 53 | } 54 | 55 | // Always softmax once 56 | let softmaxScores = logitsTensor.softmax(alongAxis: -1) 57 | 58 | if temperature != 0.0 { 59 | // top-k multinomial sampling 60 | let (topKProbs, topKIndices) = softmaxScores.topK(decodingOptions.topK) 61 | 62 | let rnd = topKProbs.sum() * Float.random(in: 0..<1) 63 | var accumTopKProbs = topKProbs.cumulativeSum(alongAxis: -1) 64 | accumTopKProbs += (accumTopKProbs .< rnd) * 100.0 65 | let topKIndex = accumTopKProbs.argsort()[..., 0] 66 | 67 | nextTokenTensor = topKIndices.gathering( 68 | atIndices: topKIndex, 69 | alongAxis: topKIndices.rank - 1 70 | ) 71 | nextLogprobTensor = topKProbs.gathering( 72 | atIndices: topKIndex, 73 | alongAxis: topKIndices.rank - 1 74 | ).log() 75 | } else { 76 | nextTokenTensor = logitsTensor.argmax(alongAxis: -1) 77 | nextLogprobTensor = softmaxScores.gathering(atIndices: nextTokenTensor, alongAxis: -1).log() 78 | } 79 | 80 | return ( 81 | token: nextTokenTensor.asIntArray()[0], 82 | logprob: nextLogprobTensor.asFloatArray()[0] 83 | ) 84 | } 85 | #endif 86 | 87 | private func sampleWithBNNS(logits: MLMultiArray) -> (token: Int, logprob: Float) { 88 | // TODO: BNNS operations here are deprecated, replace with vDSP or MLX 89 | var softmaxOutput: BNNSNDArrayDescriptor? 90 | var argmaxOutput: BNNSNDArrayDescriptor? 91 | var softmaxInput: BNNSNDArrayDescriptor? 92 | var softmaxInputNeedsDeallocate = false 93 | 94 | var nextToken: Int? 95 | 96 | do { 97 | let logitsRawPointer = UnsafeMutableRawBufferPointer( 98 | start: logits.dataPointer, 99 | count: logits.count * MemoryLayout.stride 100 | ) 101 | 102 | let logitsDescriptor = BNNSNDArrayDescriptor( 103 | data: logitsRawPointer, 104 | scalarType: FloatType.self, 105 | shape: .vector(logits.count, stride: 1) 106 | )! 
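// The steps below implement temperature sampling with BNNS primitives: when temperature > 0, the
// logits are first scaled by 1/temperature (a linear activation), softmax converts the scaled logits
// into probabilities (i.e. softmax(logits / temperature)), and the next token is drawn from the
// top-K of that distribution; when temperature == 0, the greedy argmax path is used instead,
// mirroring the MLTensor implementation above.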
107 | 108 | softmaxInput = logitsDescriptor 109 | 110 | // Scale logits by temperature if > 0 111 | if temperature != 0.0 { 112 | let scaledLogits = BNNSNDArrayDescriptor.allocateUninitialized( 113 | scalarType: FloatType.self, 114 | shape: .vector(logits.count, stride: 1) 115 | ) 116 | 117 | try! BNNS.applyActivation( 118 | activation: BNNS.ActivationFunction.linear(alpha: Float(1 / temperature)), 119 | input: logitsDescriptor, 120 | output: scaledLogits, 121 | batchSize: 1 122 | ) 123 | 124 | softmaxInput = scaledLogits 125 | softmaxInputNeedsDeallocate = true 126 | } 127 | 128 | // Always softmax once 129 | softmaxOutput = BNNSNDArrayDescriptor.allocateUninitialized( 130 | scalarType: Float.self, 131 | shape: .vector(logits.count, stride: 1) 132 | ) 133 | 134 | try BNNS.applyActivation( 135 | activation: BNNS.ActivationFunction.softmax, 136 | input: softmaxInput!, 137 | output: softmaxOutput!, 138 | batchSize: 1 139 | ) 140 | 141 | if temperature != 0.0 { 142 | // top-k multinomial sampling 143 | let k = decodingOptions.topK 144 | let bestValues = BNNSNDArrayDescriptor.allocateUninitialized( 145 | scalarType: Float.self, 146 | shape: .vector(k, stride: 1) 147 | ) 148 | let bestIndices = BNNSNDArrayDescriptor.allocateUninitialized( 149 | scalarType: Int32.self, 150 | shape: .vector(k, stride: 1) 151 | ) 152 | 153 | try! BNNS.applyTopK( 154 | k: k, 155 | input: softmaxOutput!, 156 | bestValues: bestValues, 157 | bestIndices: bestIndices, 158 | axis: 0, 159 | batchSize: 1 160 | ) 161 | 162 | let bestValuesResult = bestValues.makeArray(of: Float.self)! 163 | let bestIndicesResult = bestIndices.makeArray(of: Int32.self)! 164 | 165 | bestValues.deallocate() 166 | bestIndices.deallocate() 167 | 168 | // multinomial sample from top-k 169 | let sumOfbestIndicesResult = bestValuesResult.reduce(0, +) 170 | let rnd = Float.random(in: 0.. 
SamplingResult { 217 | var nextTokens = tokens 218 | var nextLogprobs = logProbs 219 | var completed = false 220 | 221 | var result: (token: Int, logprob: Float) 222 | #if canImport(CoreML.MLState) 223 | if #available(macOS 15.0, iOS 18.0, watchOS 11.0, visionOS 2.0, *) { 224 | result = sampleWithMLTensor(logits: logits) 225 | } else { 226 | result = sampleWithBNNS(logits: logits) 227 | } 228 | #else 229 | result = sampleWithBNNS(logits: logits) 230 | #endif 231 | 232 | nextTokens = tokens + [result.token] 233 | nextLogprobs = logProbs + [result.logprob] 234 | completed = result.token == eotToken 235 | 236 | return SamplingResult( 237 | tokens: nextTokens, 238 | logProbs: nextLogprobs, 239 | completed: completed 240 | ) 241 | } 242 | 243 | public func finalize(tokens: [Int], logProbs: [Float]) -> SamplingResult { 244 | var finalTokens = tokens 245 | var finalLogProbs = logProbs 246 | if tokens.last != eotToken { 247 | finalTokens.append(eotToken) 248 | finalLogProbs.append(0) 249 | } 250 | 251 | return SamplingResult(tokens: finalTokens, logProbs: finalLogProbs, completed: true) 252 | } 253 | } 254 | 255 | open class BeamSearchTokenSampler: TokenSampling { 256 | public var beamSize: Int 257 | public var eotToken: Int 258 | public var patience: Float 259 | var maxCandidates: Int 260 | var finishedSequences: [Float] 261 | 262 | public init( 263 | beamSize: Int, 264 | eotToken: Int, 265 | patience: Float = 1 266 | ) { 267 | self.beamSize = beamSize 268 | self.eotToken = eotToken 269 | self.patience = patience 270 | self.maxCandidates = Int(Float(beamSize) * patience) 271 | self.finishedSequences = [] 272 | if self.maxCandidates <= 0 { 273 | self.maxCandidates = 1 274 | fatalError("Invalid beam size \(beamSize) or patience \(patience)") 275 | } 276 | } 277 | 278 | public func reset() { 279 | finishedSequences = [] 280 | } 281 | 282 | public func update(tokens: [Int], logits: MLMultiArray, logProbs: [Float]) -> SamplingResult { 283 | // TODO: Implement 284 | fatalError("Not implemented: \(#function)") 285 | } 286 | 287 | public func finalize(tokens: [Int], logProbs: [Float]) -> SamplingResult { 288 | // TODO: Implement 289 | fatalError("Not implemented: \(#function)") 290 | } 291 | } 292 | -------------------------------------------------------------------------------- /Sources/WhisperKit/Core/Utils/Concurrency.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Foundation 5 | 6 | /// An actor that provides thread-safe early stopping functionality using UUIDs as keys 7 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 8 | public actor EarlyStopActor { 9 | private var shouldStop = [UUID: Bool]() 10 | 11 | public init() {} 12 | 13 | /// Sets the stop flag for a given UUID 14 | /// - Parameters: 15 | /// - value: The boolean value to set 16 | /// - uuid: The UUID key 17 | public func set(_ value: Bool, for uuid: UUID) { 18 | shouldStop[uuid] = value 19 | } 20 | 21 | /// Gets the stop flag for a given UUID 22 | /// - Parameter uuid: The UUID key 23 | /// - Returns: The current stop flag value, or false if not set 24 | public func get(for uuid: UUID) -> Bool { 25 | return shouldStop[uuid] ?? false 26 | } 27 | 28 | /// Removes and returns the stop flag for a given UUID 29 | /// - Parameter uuid: The UUID key 30 | /// - Returns: The removed stop flag value, if it existed 31 | public func remove(for uuid: UUID) -> Bool? 
{ 32 | return shouldStop.removeValue(forKey: uuid) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /Sources/WhisperKitCLI/CLIArguments.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import ArgumentParser 5 | 6 | struct CLIArguments: ParsableArguments { 7 | @Option(help: "Paths to audio files") 8 | var audioPath = [String]() 9 | 10 | @Option(help: "Path to a folder containing audio files") 11 | var audioFolder: String? 12 | 13 | @Option(help: "Path of model files") 14 | var modelPath: String? 15 | 16 | @Option(help: "Model to download if no modelPath is provided") 17 | var model: String? 18 | 19 | @Option(help: "Text to add in front of the model name to specify between different types of the same variant (values: \"openai\", \"distil\")") 20 | var modelPrefix: String = "openai" 21 | 22 | @Option(help: "Path to save the downloaded model") 23 | var downloadModelPath: String? 24 | 25 | @Option(help: "Path to save the downloaded tokenizer files") 26 | var downloadTokenizerPath: String? 27 | 28 | @Option(help: "Compute units for audio encoder model with {all,cpuOnly,cpuAndGPU,cpuAndNeuralEngine,random}") 29 | var audioEncoderComputeUnits: ComputeUnits = .cpuAndNeuralEngine 30 | 31 | @Option(help: "Compute units for text decoder model with {all,cpuOnly,cpuAndGPU,cpuAndNeuralEngine,random}") 32 | var textDecoderComputeUnits: ComputeUnits = .cpuAndNeuralEngine 33 | 34 | @Flag(help: "Verbose mode") 35 | var verbose: Bool = false 36 | 37 | @Option(help: "Task to perform (transcribe or translate)") 38 | var task: String = "transcribe" 39 | 40 | @Option(help: "Language spoken in the audio") 41 | var language: String? 42 | 43 | @Option(help: "Temperature to use for sampling") 44 | var temperature: Float = 0 45 | 46 | @Option(help: "Temperature to increase on fallbacks during decoding") 47 | var temperatureIncrementOnFallback: Float = 0.2 48 | 49 | @Option(help: "Number of times to increase temperature when falling back during decoding") 50 | var temperatureFallbackCount: Int = 5 51 | 52 | @Option(help: "Number of candidates when sampling with non-zero temperature") 53 | var bestOf: Int = 5 54 | 55 | @Flag(help: "Force initial prompt tokens based on language, task, and timestamp options") 56 | var usePrefillPrompt: Bool = false 57 | 58 | @Flag(help: "Use decoder prefill data for faster initial decoding") 59 | var usePrefillCache: Bool = false 60 | 61 | @Flag(help: "Skip special tokens in the output") 62 | var skipSpecialTokens: Bool = false 63 | 64 | @Flag(help: "Force no timestamps when decoding") 65 | var withoutTimestamps: Bool = false 66 | 67 | @Flag(help: "Add timestamps for each word in the output") 68 | var wordTimestamps: Bool = false 69 | 70 | @Option(help: "Force prefix text when decoding") 71 | var prefix: String? 72 | 73 | @Option(help: "Condition on this text when decoding") 74 | var prompt: String? 
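// Note: `prompt` and `prefix` presumably feed DecodingOptions.promptTokens and DecodingOptions.prefixTokens
// (see the DecodingOptions documentation above): prompt tokens are prepended to the prefill tokens,
// while prefix tokens are appended to them.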
75 | 76 | @Option(parsing: .upToNextOption, help: "List of timestamps (in seconds) of start and end values to transcribe as separate clips in a single audio file (example: --clip-timestamps 0 10.2 34.5 60.0)") 77 | var clipTimestamps: [Float] = [] 78 | 79 | @Option(parsing: .upToNextOption, help: "List of tokens to suppress in the output (example: --supress-tokens 1 2 3)") 80 | var supressTokens: [Int] = [] 81 | 82 | @Option(help: "Gzip compression ratio threshold for decoding failure") 83 | var compressionRatioThreshold: Float? 84 | 85 | @Option(help: "Average log probability threshold for decoding failure") 86 | var logprobThreshold: Float? 87 | 88 | @Option(help: "Log probability threshold for first token decoding failure") 89 | var firstTokenLogProbThreshold: Float? 90 | 91 | @Option(help: "Probability threshold to consider a segment as silence") 92 | var noSpeechThreshold: Float? 93 | 94 | @Flag(help: "Output a report of the results") 95 | var report: Bool = false 96 | 97 | @Option(help: "Directory to save the report") 98 | var reportPath: String = "." 99 | 100 | @Flag(help: "Process audio directly from the microphone") 101 | var stream: Bool = false 102 | 103 | @Flag(help: "Simulate streaming transcription using the input audio file") 104 | var streamSimulated: Bool = false 105 | 106 | @Option(help: "Maximum concurrent inference; might be helpful when processing more than 1 audio file at the same time. 0 means unlimited. Default: 4") 107 | var concurrentWorkerCount: Int = 4 108 | 109 | @Option(help: "Chunking strategy for audio processing, `none` means no chunking, `vad` means using voice activity detection. Default: `vad`") 110 | var chunkingStrategy: String = "vad" 111 | } 112 | -------------------------------------------------------------------------------- /Sources/WhisperKitCLI/CLIUtils.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import ArgumentParser 5 | import CoreML 6 | import Foundation 7 | 8 | enum ComputeUnits: String, ExpressibleByArgument, CaseIterable { 9 | case all, cpuAndGPU, cpuOnly, cpuAndNeuralEngine, random 10 | var asMLComputeUnits: MLComputeUnits { 11 | switch self { 12 | case .all: return .all 13 | case .cpuAndGPU: return .cpuAndGPU 14 | case .cpuOnly: return .cpuOnly 15 | case .cpuAndNeuralEngine: return .cpuAndNeuralEngine 16 | case .random: return Bool.random() ? .cpuAndGPU : .cpuAndNeuralEngine 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Sources/WhisperKitCLI/WhisperKitCLI.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved.
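// Example invocation (illustrative only: the audio path is a placeholder, the `transcribe` subcommand
// name is assumed from the TranscribeCLI subcommand registered below, and the flag spellings assume
// ArgumentParser's default kebab-case mapping of the properties declared in CLIArguments.swift):
//
//     whisperkit-cli transcribe \
//         --audio-path /path/to/audio.wav \
//         --model tiny \
//         --language en \
//         --word-timestamps \
//         --report --report-path ./reports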
3 | 4 | import ArgumentParser 5 | import Foundation 6 | 7 | let VERSION: String = "development" 8 | 9 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 10 | @main 11 | struct WhisperKitCLI: AsyncParsableCommand { 12 | static let configuration = CommandConfiguration( 13 | commandName: "whisperkit-cli", 14 | abstract: "WhisperKit CLI", 15 | discussion: "Swift native speech recognition with Whisper for Apple Silicon", 16 | version: VERSION, 17 | subcommands: [TranscribeCLI.self] 18 | ) 19 | } 20 | -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Evaluate/DistanceCalculation.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Foundation 5 | 6 | /// Compute the last row of the edit distance dynamic programming matrix 7 | /// between s1 and s2. 8 | func computeLastRow(_ s1Chars: [Unicode.Scalar], _ s2Chars: [Unicode.Scalar]) -> [Int] { 9 | var prevRow = Array(0...s2Chars.endIndex) 10 | 11 | for i in 1...s1Chars.endIndex { 12 | var currentRow = [Int](repeating: 0, count: s2Chars.endIndex + 1) 13 | currentRow[0] = i 14 | 15 | for j in 1...s2Chars.endIndex { 16 | let cost = s1Chars[i - 1] == s2Chars[j - 1] ? 0 : 1 17 | currentRow[j] = min( 18 | prevRow[j] + 1, // Deletion 19 | currentRow[j - 1] + 1, // Insertion 20 | prevRow[j - 1] + cost // Substitution 21 | ) 22 | } 23 | prevRow = currentRow 24 | } 25 | 26 | return prevRow 27 | } 28 | 29 | func needlemanWunsch(_ xArray: [Unicode.Scalar], _ yArray: [Unicode.Scalar]) -> [EditOp] { 30 | let m = xArray.count 31 | let n = yArray.count 32 | 33 | var dp = [[Int]](repeating: [Int](repeating: 0, count: n + 1), count: m + 1) 34 | for i in 1...m { 35 | dp[i][0] = i 36 | } 37 | for j in 1...n { 38 | dp[0][j] = j 39 | } 40 | 41 | for i in 1...m { 42 | for j in 1...n { 43 | let cost = xArray[i - 1] == yArray[j - 1] ? 0 : 1 44 | dp[i][j] = min( 45 | dp[i - 1][j] + 1, // Deletion 46 | dp[i][j - 1] + 1, // Insertion 47 | dp[i - 1][j - 1] + cost // Substitution 48 | ) 49 | } 50 | } 51 | 52 | var i = m 53 | var j = n 54 | var ops = [EditOp]() 55 | 56 | while i > 0, j > 0 { 57 | if dp[i][j] == dp[i - 1][j - 1], xArray[i - 1] == yArray[j - 1] { 58 | // Match operation is omitted 59 | i -= 1 60 | j -= 1 61 | } else if dp[i][j] == dp[i - 1][j - 1] + 1 { 62 | ops.append(EditOp.replace) // Substitution 63 | i -= 1 64 | j -= 1 65 | } else if dp[i][j] == dp[i][j - 1] + 1 { 66 | ops.append(EditOp.insert) // Insertion 67 | j -= 1 68 | } else { 69 | ops.append(EditOp.delete) // Deletion 70 | i -= 1 71 | } 72 | } 73 | 74 | while i > 0 { 75 | ops.append(EditOp.delete) 76 | i -= 1 77 | } 78 | while j > 0 { 79 | ops.append(EditOp.insert) 80 | j -= 1 81 | } 82 | 83 | return ops.reversed() 84 | } 85 | 86 | func hirschberg(_ reference: [Unicode.Scalar], _ s2: [Unicode.Scalar]) -> [EditOp] { 87 | func hirschbergRec(_ x: [Unicode.Scalar], _ y: [Unicode.Scalar]) -> [EditOp] { 88 | let m = x.endIndex 89 | let n = y.endIndex 90 | 91 | if m == 0 { 92 | let result = y.map { _ in EditOp.insert } 93 | return result 94 | } 95 | if n == 0 { 96 | let result = x.map { _ in EditOp.delete } 97 | return result 98 | } 99 | if m == 1 || n == 1 { 100 | let result = needlemanWunsch(x, y) 101 | return result 102 | } 103 | 104 | let i = m / 2 105 | let xPrefix = Array(x[x.startIndex.. 
[EditOp] { 136 | let n = sourceText.count 137 | let m = targetText.count 138 | let maxD = n + m 139 | let vSize = 2 * maxD + 1 140 | var v = [Int](repeating: 0, count: vSize) 141 | var trace = [[Int]]() 142 | 143 | let offset = maxD 144 | 145 | for d in 0...maxD { 146 | let vSnapshot = v 147 | for k in stride(from: -d, through: d, by: 2) { 148 | let kIndex = k + offset 149 | var x: Int 150 | if k == -d || (k != d && v[kIndex - 1] < v[kIndex + 1]) { 151 | x = v[kIndex + 1] 152 | } else { 153 | x = v[kIndex - 1] + 1 154 | } 155 | var y = x - k 156 | while x < n, y < m, sourceText[x] == targetText[y] { 157 | x += 1 158 | y += 1 159 | } 160 | v[kIndex] = x 161 | if x >= n, y >= m { 162 | trace.append(vSnapshot) 163 | return backtrack(trace: trace, sourceText: sourceText, targetText: targetText) 164 | } 165 | } 166 | trace.append(vSnapshot) 167 | } 168 | return [] 169 | } 170 | 171 | func backtrack(trace: [[Int]], sourceText: [Unicode.Scalar], targetText: [Unicode.Scalar]) -> [EditOp] { 172 | var editOps = [EditOp]() 173 | let n = sourceText.count 174 | let m = targetText.count 175 | let offset = trace[0].count / 2 176 | var x = n 177 | var y = m 178 | 179 | for d in stride(from: trace.count - 1, through: 0, by: -1) { 180 | let v = trace[d] 181 | let k = x - y 182 | let kIndex = k + offset 183 | 184 | var prevK: Int 185 | if k == -d || (k != d && v[kIndex - 1] < v[kIndex + 1]) { 186 | prevK = k + 1 187 | } else { 188 | prevK = k - 1 189 | } 190 | let prevX = v[prevK + offset] 191 | let prevY = prevX - prevK 192 | 193 | while x > prevX, y > prevY { 194 | // Match or Replace 195 | if sourceText[x - 1] == targetText[y - 1] { 196 | editOps.append(.blank) 197 | } else { 198 | editOps.append(.replace) 199 | } 200 | x -= 1 201 | y -= 1 202 | } 203 | 204 | if d > 0 { 205 | if x == prevX { 206 | // Insertion 207 | editOps.append(.insert) 208 | y -= 1 209 | } else { 210 | // Deletion 211 | editOps.append(.delete) 212 | x -= 1 213 | } 214 | } 215 | } 216 | 217 | return editOps.reversed() 218 | } 219 | -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Evaluate/WERUtils.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Foundation 5 | 6 | /// Return the operations needed to transform s1 into s2 using Wagner-Fischer algo. 
7 | /// "i" = insertion, "d" = deletion, "r" = replacement 8 | enum EditOp: UInt8 { 9 | case blank 10 | case replace 11 | case delete 12 | case insert 13 | } 14 | 15 | enum WERUtils { 16 | static func wordsToChars(reference: [[String]], hypothesis: [[String]]) -> ([String], [String]) { 17 | // tokenize each word into an integer 18 | let vocabulary = Set((reference + hypothesis).flatMap { $0 }) 19 | let word2char = Dictionary(uniqueKeysWithValues: vocabulary.enumerated().map { index, value in 20 | (value, index) 21 | }) 22 | 23 | let referenceCharsEfficient = reference.map { sentence in 24 | String(sentence.lazy.compactMap { word in 25 | if let charCode = word2char[word], let unicodeScalar = UnicodeScalar(charCode) { 26 | return Character(unicodeScalar) 27 | } 28 | return nil 29 | }) 30 | } 31 | 32 | let hypothesisCharsEfficient = hypothesis.map { sentence in 33 | String(sentence.lazy.compactMap { word in 34 | if let charCode = word2char[word], let unicodeScalar = UnicodeScalar(charCode) { 35 | return Character(unicodeScalar) 36 | } 37 | return nil 38 | }) 39 | } 40 | 41 | return (referenceCharsEfficient, hypothesisCharsEfficient) 42 | } 43 | 44 | static func processWords(reference: [String], hypothesis: [String]) -> (Double, [[String?]]) { 45 | var refTransformed = NormalizationUtils.removeMultipleSpaces(sentences: reference) 46 | refTransformed = NormalizationUtils.strip(sentences: refTransformed) 47 | let refTransformedReduced = NormalizationUtils.reduceToListOfListOfWordsWithSpaces(sentences: refTransformed) 48 | 49 | var hypTransformed = NormalizationUtils.removeMultipleSpaces(sentences: hypothesis) 50 | hypTransformed = NormalizationUtils.strip(sentences: hypTransformed) 51 | let hypTransformedReduced = NormalizationUtils.reduceToListOfListOfWordsWithSpaces(sentences: hypTransformed) 52 | 53 | let (refAsChars, hypAsChars) = WERUtils.wordsToChars(reference: refTransformedReduced, hypothesis: hypTransformedReduced) 54 | 55 | let refArrays = refAsChars.map { Array($0.unicodeScalars) } 56 | let hypArrays = hypAsChars.map { Array($0.unicodeScalars) } 57 | 58 | var (numHits, numSubstitutions, numDeletions, numInsertions) = (0, 0, 0, 0) 59 | var (numRfWords, numHypWords) = (0, 0) 60 | var diffResult: [[String?]] = [] 61 | 62 | for (referenceSentence, hypothesisSentence) in zip(refArrays, hypArrays) { 63 | let editOps = levenshtein(referenceSentence, hypothesisSentence) 64 | 65 | // count the number of edits of each type 66 | var substitutions = 0 67 | var deletions = 0 68 | var insertions = 0 69 | 70 | var referenceIndex = 0 71 | var hypothesisIndex = 0 72 | for op in editOps { 73 | switch op { 74 | case .replace: 75 | diffResult.append([String(refTransformedReduced[0][referenceIndex]), "-"]) 76 | diffResult.append([String(hypTransformedReduced[0][hypothesisIndex]), "+"]) 77 | substitutions += 1 78 | referenceIndex += 1 79 | hypothesisIndex += 1 80 | case .delete: 81 | diffResult.append([String(refTransformedReduced[0][referenceIndex]), "-"]) 82 | deletions += 1 83 | referenceIndex += 1 84 | case .insert: 85 | diffResult.append([String(hypTransformedReduced[0][hypothesisIndex]), "+"]) 86 | insertions += 1 87 | hypothesisIndex += 1 88 | case .blank: 89 | diffResult.append([String(refTransformedReduced[0][referenceIndex]), nil]) 90 | referenceIndex += 1 91 | hypothesisIndex += 1 92 | } 93 | } 94 | 95 | let hits: Int = referenceSentence.count - (substitutions + deletions) 96 | 97 | numHits += hits 98 | numSubstitutions += substitutions 99 | numDeletions += deletions 100 | numInsertions += 
insertions 101 | numRfWords += referenceSentence.count 102 | numHypWords += hypothesisSentence.count 103 | } 104 | 105 | let wer = Double(numSubstitutions + numDeletions + numInsertions) / Double(numHits + numSubstitutions + numDeletions) 106 | 107 | return (wer, diffResult) 108 | } 109 | 110 | static func evaluate(originalTranscript: String, generatedTranscript: String, normalizeOriginal: Bool = true) -> (wer: Double, diff: [[String?]]) { 111 | let normalizer = EnglishTextNormalizer() 112 | let reference = normalizeOriginal ? normalizer.normalize(text: originalTranscript) : originalTranscript 113 | let hypothesis = normalizer.normalize(text: generatedTranscript) 114 | 115 | let (wer, diff) = WERUtils.processWords( 116 | reference: [reference], 117 | hypothesis: [hypothesis] 118 | ) 119 | return (wer, diff) 120 | } 121 | 122 | static func processDiff(originalTranscript: String, generatedTranscript: String) -> [[String?]] { 123 | let (_, diff) = evaluate(originalTranscript: originalTranscript, generatedTranscript: generatedTranscript) 124 | return diff 125 | } 126 | 127 | static func diffString(from diff: [[String?]]) -> String { 128 | return diff.compactMap { entry -> String? in 129 | guard let word = entry[0], word != " " else { return nil } 130 | if let changeType = entry[1] { 131 | return "\(changeType)\(word)" 132 | } 133 | return word 134 | }.joined(separator: " ") 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /Tests/WhisperKitTests/FunctionalTests.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import CoreML 5 | import WhisperKit 6 | import XCTest 7 | 8 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 9 | final class FunctionalTests: XCTestCase { 10 | func testInitLarge() async throws { 11 | try await XCTAssertNoThrowAsync( 12 | await WhisperKit(modelFolder: largev3ModelPath(), logLevel: .error) 13 | ) 14 | } 15 | 16 | func testRealTimeFactorTiny() async throws { 17 | let modelPath = try await tinyModelPath() 18 | 19 | let metrics: [XCTMetric] = [XCTMemoryMetric(), XCTStorageMetric(), XCTClockMetric()] 20 | 21 | let measureOptions = XCTMeasureOptions.default 22 | measureOptions.iterationCount = 5 23 | 24 | let audioFilePath = try XCTUnwrap( 25 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"), 26 | "Audio file not found" 27 | ) 28 | 29 | let whisperKit = try await WhisperKit(WhisperKitConfig(modelFolder: modelPath)) 30 | 31 | measure(metrics: metrics, options: measureOptions) { 32 | let dispatchSemaphore = DispatchSemaphore(value: 0) 33 | Task { 34 | let transcriptionResult: [TranscriptionResult] = try await whisperKit.transcribe(audioPath: audioFilePath) 35 | let transcriptionResultText = transcriptionResult.map(\.text).joined(separator: " ") 36 | XCTAssertGreaterThan(transcriptionResultText.count, 0) 37 | dispatchSemaphore.signal() 38 | } 39 | dispatchSemaphore.wait() 40 | } 41 | } 42 | 43 | func testRealTimeFactorLarge() async throws { 44 | let modelPath = try largev3ModelPath() 45 | 46 | let metrics: [XCTMetric] = [XCTMemoryMetric(), XCTStorageMetric(), XCTClockMetric()] 47 | 48 | let measureOptions = XCTMeasureOptions.default 49 | measureOptions.iterationCount = 5 50 | 51 | let audioFilePath = try XCTUnwrap( 52 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"), 53 | "Audio file not found" 54 | ) 55 | 56 | let whisperKit 
= try await WhisperKit(WhisperKitConfig(modelFolder: modelPath, verbose: false)) 57 | 58 | measure(metrics: metrics, options: measureOptions) { 59 | let dispatchSemaphore = DispatchSemaphore(value: 0) 60 | Task { 61 | let transcriptionResult: [TranscriptionResult] = try await whisperKit.transcribe(audioPath: audioFilePath) 62 | XCTAssertGreaterThan(transcriptionResult.text.count, 0) 63 | dispatchSemaphore.signal() 64 | } 65 | dispatchSemaphore.wait() 66 | } 67 | } 68 | 69 | func testBaseImplementation() throws { 70 | let audioFilePath = try XCTUnwrap( 71 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"), 72 | "Audio file not found" 73 | ) 74 | 75 | let dispatchSemaphore = DispatchSemaphore(value: 0) 76 | 77 | Task { 78 | let whisperKit = try await XCTUnwrapAsync(await WhisperKit(model: "large-v3")) 79 | let transcriptionResult: [TranscriptionResult] = try await whisperKit.transcribe(audioPath: audioFilePath) 80 | XCTAssertGreaterThan(transcriptionResult.text.count, 0) 81 | dispatchSemaphore.signal() 82 | } 83 | 84 | dispatchSemaphore.wait() 85 | } 86 | 87 | func testAsyncImplementation() async throws { 88 | let audioFilePath = try XCTUnwrap( 89 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"), 90 | "Audio file not found" 91 | ) 92 | let whisperKit = try await WhisperKit(WhisperKitConfig(model: "large-v3")) 93 | let transcriptionResult: [TranscriptionResult] = try await whisperKit.transcribe(audioPath: audioFilePath) 94 | 95 | XCTAssertGreaterThan(transcriptionResult.text.count, 0) 96 | } 97 | 98 | func testBatchTranscribeAudioPaths() async throws { 99 | let audioPaths = try [ 100 | XCTUnwrap( 101 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"), 102 | "Audio file not found" 103 | ), 104 | XCTUnwrap( 105 | Bundle.current(for: self).path(forResource: "es_test_clip", ofType: "wav"), 106 | "Audio file not found" 107 | ), 108 | XCTUnwrap( 109 | Bundle.current(for: self).path(forResource: "ja_test_clip", ofType: "wav"), 110 | "Audio file not found" 111 | ), 112 | ] 113 | let whisperKit = try await WhisperKit(WhisperKitConfig(modelFolder: tinyModelPath())) 114 | let transcriptionResults: [Result<[TranscriptionResult], Swift.Error>] = await whisperKit.transcribeWithResults(audioPaths: audioPaths) 115 | 116 | XCTAssertEqual(transcriptionResults.count, 3) 117 | XCTAssertTrue(transcriptionResults.allSatisfy { $0.isSuccess }) 118 | XCTAssertEqual( 119 | try transcriptionResults[0].normalizedText(prefix: 5), 120 | "and so my fellow americans" 121 | ) 122 | XCTAssertEqual( 123 | try transcriptionResults[1].normalizedText(prefix: 2), 124 | "this is" 125 | ) 126 | XCTAssertEqual( 127 | try transcriptionResults[2].normalizedText(prefix: 1), 128 | "tokyo" 129 | ) 130 | } 131 | 132 | func testBatchTranscribeAudioPathsWithErrors() async throws { 133 | let audioPaths = try [ 134 | "/path/to/file1.wav", 135 | XCTUnwrap( 136 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"), 137 | "Audio file not found" 138 | ), 139 | "/path/to/file2.wav", 140 | ] 141 | let whisperKit = try await WhisperKit(WhisperKitConfig(modelFolder: tinyModelPath())) 142 | let transcriptionResults: [Result<[TranscriptionResult], Swift.Error>] = await whisperKit.transcribeWithResults(audioPaths: audioPaths) 143 | 144 | XCTAssertEqual(transcriptionResults.count, 3) 145 | XCTAssertEqual( 146 | transcriptionResults[0].whisperError(), 147 | .loadAudioFailed("Resource path does not exist /path/to/file1.wav") 148 | ) 149 | XCTAssertEqual( 150 | try 
transcriptionResults[1].normalizedText(prefix: 5), 151 | "and so my fellow americans" 152 | ) 153 | XCTAssertEqual( 154 | transcriptionResults[2].whisperError(), 155 | .loadAudioFailed("Resource path does not exist /path/to/file2.wav") 156 | ) 157 | } 158 | 159 | func testBatchTranscribeAudioArrays() async throws { 160 | let audioPaths = try [ 161 | XCTUnwrap( 162 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"), 163 | "Audio file not found" 164 | ), 165 | XCTUnwrap( 166 | Bundle.current(for: self).path(forResource: "es_test_clip", ofType: "wav"), 167 | "Audio file not found" 168 | ), 169 | XCTUnwrap( 170 | Bundle.current(for: self).path(forResource: "ja_test_clip", ofType: "wav"), 171 | "Audio file not found" 172 | ), 173 | ] 174 | let audioArrays = try audioPaths 175 | .map { try AudioProcessor.loadAudio(fromPath: $0) } 176 | .map { AudioProcessor.convertBufferToArray(buffer: $0) } 177 | 178 | let whisperKit = try await WhisperKit(WhisperKitConfig(modelFolder: tinyModelPath())) 179 | let transcriptionResults: [Result<[TranscriptionResult], Swift.Error>] = await whisperKit.transcribeWithResults(audioArrays: audioArrays) 180 | 181 | XCTAssertEqual(transcriptionResults.count, 3) 182 | XCTAssertTrue(transcriptionResults.allSatisfy { $0.isSuccess }) 183 | XCTAssertEqual( 184 | try transcriptionResults[0].normalizedText(prefix: 5), 185 | "and so my fellow americans" 186 | ) 187 | XCTAssertEqual( 188 | try transcriptionResults[1].normalizedText(prefix: 2), 189 | "this is" 190 | ) 191 | XCTAssertEqual( 192 | try transcriptionResults[2].normalizedText(prefix: 1), 193 | "tokyo" 194 | ) 195 | } 196 | 197 | func testModelSearchPathLarge() async throws { 198 | let audioFilePath = try XCTUnwrap( 199 | Bundle.current(for: self).path(forResource: "jfk", ofType: "wav"), 200 | "Audio file not found" 201 | ) 202 | 203 | var config = WhisperKitConfig(model: "large-v3", verbose: true, logLevel: .debug) 204 | let pipe1 = try await WhisperKit(config) 205 | let transcriptionResult1: [TranscriptionResult] = try await pipe1.transcribe(audioPath: audioFilePath) 206 | XCTAssertFalse(transcriptionResult1.text.isEmpty) 207 | 208 | config = WhisperKitConfig(model: "distil*large-v3", verbose: true, logLevel: .debug) 209 | let pipe2 = try await WhisperKit(config) 210 | let transcriptionResult2: [TranscriptionResult] = try await pipe2.transcribe(audioPath: audioFilePath) 211 | XCTAssertFalse(transcriptionResult2.text.isEmpty) 212 | 213 | config = WhisperKitConfig(model: "distil*large-v3", verbose: true, logLevel: .debug) 214 | let pipe3 = try await WhisperKit(config) 215 | let transcriptionResult3: [TranscriptionResult] = try await pipe3.transcribe(audioPath: audioFilePath) 216 | XCTAssertFalse(transcriptionResult3.text.isEmpty) 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/8_Channel_ID.m4a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Tests/WhisperKitTests/Resources/8_Channel_ID.m4a -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/config-v02.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "whisperkit-coreml", 3 | "version": "0.2", 4 | "device_support": [ 5 | { 6 | "identifiers": ["iPhone11", "iPhone12", "Watch7", "Watch8"], 7 | "models": { 8 | "default": 
"openai_whisper-tiny", 9 | "supported": [ 10 | "openai_whisper-tiny", 11 | "openai_whisper-tiny.en", 12 | "openai_whisper-base", 13 | "openai_whisper-base.en" 14 | ] 15 | } 16 | }, 17 | { 18 | "identifiers": ["iPhone13", "iPad13,18", "iPad13,1"], 19 | "models": { 20 | "default": "openai_whisper-base", 21 | "supported": [ 22 | "openai_whisper-tiny", 23 | "openai_whisper-tiny.en", 24 | "openai_whisper-base", 25 | "openai_whisper-base.en", 26 | "openai_whisper-small", 27 | "openai_whisper-small.en" 28 | ] 29 | } 30 | }, 31 | { 32 | "identifiers": [ 33 | "iPhone14", 34 | "iPhone15", 35 | "iPhone16", 36 | "iPhone17", 37 | "iPad14,1", 38 | "iPad14,2" 39 | ], 40 | "models": { 41 | "default": "openai_whisper-base", 42 | "supported": [ 43 | "openai_whisper-tiny", 44 | "openai_whisper-tiny.en", 45 | "openai_whisper-base", 46 | "openai_whisper-base.en", 47 | "openai_whisper-small", 48 | "openai_whisper-small.en", 49 | "openai_whisper-large-v2_949MB", 50 | "openai_whisper-large-v2_turbo_955MB", 51 | "openai_whisper-large-v3_947MB", 52 | "openai_whisper-large-v3_turbo_954MB", 53 | "distil-whisper_distil-large-v3_594MB", 54 | "distil-whisper_distil-large-v3_turbo_600MB", 55 | "openai_whisper-large-v3-v20240930_626MB", 56 | "openai_whisper-large-v3-v20240930_turbo_632MB" 57 | ] 58 | } 59 | }, 60 | { 61 | "identifiers": [ 62 | "Mac13", 63 | "iMac21", 64 | "MacBookAir10,1", 65 | "MacBookPro17", 66 | "MacBookPro18", 67 | "Macmini9", 68 | "iPad13,16", 69 | "iPad13,4", 70 | "iPad13,8" 71 | ], 72 | "models": { 73 | "default": "openai_whisper-large-v3-v20240930", 74 | "supported": [ 75 | "openai_whisper-tiny", 76 | "openai_whisper-tiny.en", 77 | "openai_whisper-base", 78 | "openai_whisper-base.en", 79 | "openai_whisper-small", 80 | "openai_whisper-small.en", 81 | "openai_whisper-large-v2", 82 | "openai_whisper-large-v2_949MB", 83 | "openai_whisper-large-v3", 84 | "openai_whisper-large-v3_947MB", 85 | "distil-whisper_distil-large-v3", 86 | "distil-whisper_distil-large-v3_594MB", 87 | "openai_whisper-large-v3-v20240930", 88 | "openai_whisper-large-v3-v20240930_626MB" 89 | ] 90 | } 91 | }, 92 | { 93 | "identifiers": [ 94 | "Mac14", 95 | "Mac15", 96 | "Mac16", 97 | "iPad14,3", 98 | "iPad14,4", 99 | "iPad14,5", 100 | "iPad14,6", 101 | "iPad14,8", 102 | "iPad14,9", 103 | "iPad14,10", 104 | "iPad14,11", 105 | "iPad16" 106 | ], 107 | "models": { 108 | "default": "openai_whisper-large-v3-v20240930", 109 | "supported": [ 110 | "openai_whisper-tiny", 111 | "openai_whisper-tiny.en", 112 | "openai_whisper-base", 113 | "openai_whisper-base.en", 114 | "openai_whisper-small", 115 | "openai_whisper-small.en", 116 | "openai_whisper-large-v2", 117 | "openai_whisper-large-v2_949MB", 118 | "openai_whisper-large-v2_turbo", 119 | "openai_whisper-large-v2_turbo_955MB", 120 | "openai_whisper-large-v3", 121 | "openai_whisper-large-v3_947MB", 122 | "openai_whisper-large-v3_turbo", 123 | "openai_whisper-large-v3_turbo_954MB", 124 | "distil-whisper_distil-large-v3", 125 | "distil-whisper_distil-large-v3_594MB", 126 | "distil-whisper_distil-large-v3_turbo", 127 | "distil-whisper_distil-large-v3_turbo_600MB", 128 | "openai_whisper-large-v3-v20240930", 129 | "openai_whisper-large-v3-v20240930_turbo", 130 | "openai_whisper-large-v3-v20240930_626MB", 131 | "openai_whisper-large-v3-v20240930_turbo_632MB" 132 | ] 133 | } 134 | } 135 | ] 136 | } 137 | -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/config-v03.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "whisperkit-coreml", 3 | "version": "0.3", 4 | "device_support": [ 5 | { 6 | "chips": "A12, A13, S9, S10", 7 | "identifiers": [ 8 | "iPhone11", 9 | "iPhone12", 10 | "Watch7", 11 | "Watch8" 12 | ], 13 | "models": { 14 | "default": "openai_whisper-tiny", 15 | "supported": [ 16 | "openai_whisper-tiny", 17 | "openai_whisper-tiny.en", 18 | "openai_whisper-base", 19 | "openai_whisper-base.en" 20 | ] 21 | } 22 | }, 23 | { 24 | "chips": "A14", 25 | "identifiers": [ 26 | "iPhone13", 27 | "iPad13,1", 28 | "iPad13,2", 29 | "iPad13,18", 30 | "iPad13,19" 31 | ], 32 | "models": { 33 | "default": "openai_whisper-base", 34 | "supported": [ 35 | "openai_whisper-tiny", 36 | "openai_whisper-tiny.en", 37 | "openai_whisper-base", 38 | "openai_whisper-base.en", 39 | "openai_whisper-small", 40 | "openai_whisper-small.en" 41 | ] 42 | } 43 | }, 44 | { 45 | "chips": "A15, A16, A17 Pro, A18", 46 | "identifiers": [ 47 | "iPhone14", 48 | "iPhone15", 49 | "iPhone16", 50 | "iPhone17", 51 | "iPad14,1", 52 | "iPad14,2", 53 | "iPad15,7", 54 | "iPad15,8", 55 | "iPad16,1", 56 | "iPad16,2" 57 | ], 58 | "models": { 59 | "default": "openai_whisper-base", 60 | "supported": [ 61 | "openai_whisper-tiny", 62 | "openai_whisper-tiny.en", 63 | "openai_whisper-base", 64 | "openai_whisper-base.en", 65 | "openai_whisper-small", 66 | "openai_whisper-small.en", 67 | "openai_whisper-large-v2_949MB", 68 | "openai_whisper-large-v2_turbo_955MB", 69 | "openai_whisper-large-v3_947MB", 70 | "openai_whisper-large-v3_turbo_954MB", 71 | "distil-whisper_distil-large-v3_594MB", 72 | "distil-whisper_distil-large-v3_turbo_600MB", 73 | "openai_whisper-large-v3-v20240930_626MB", 74 | "openai_whisper-large-v3-v20240930_turbo_632MB" 75 | ] 76 | } 77 | }, 78 | { 79 | "chips": "M1", 80 | "identifiers": [ 81 | "MacBookPro17,1", 82 | "MacBookPro18,1", 83 | "MacBookPro18,2", 84 | "MacBookPro18,3", 85 | "MacBookPro18,4", 86 | "MacBookAir10,1", 87 | "Macmini9,1", 88 | "iMac21,1", 89 | "iMac21,2", 90 | "Mac13", 91 | "iPad13,4", 92 | "iPad13,5", 93 | "iPad13,6", 94 | "iPad13,7", 95 | "iPad13,8", 96 | "iPad13,9", 97 | "iPad13,10", 98 | "iPad13,11", 99 | "iPad13,16", 100 | "iPad13,17" 101 | ], 102 | "models": { 103 | "default": "openai_whisper-large-v3-v20240930_626MB", 104 | "supported": [ 105 | "openai_whisper-tiny", 106 | "openai_whisper-tiny.en", 107 | "openai_whisper-base", 108 | "openai_whisper-base.en", 109 | "openai_whisper-small", 110 | "openai_whisper-small.en", 111 | "openai_whisper-large-v2", 112 | "openai_whisper-large-v2_949MB", 113 | "openai_whisper-large-v3", 114 | "openai_whisper-large-v3_947MB", 115 | "distil-whisper_distil-large-v3", 116 | "distil-whisper_distil-large-v3_594MB", 117 | "openai_whisper-large-v3-v20240930_626MB" 118 | ] 119 | } 120 | }, 121 | { 122 | "chips": "M2, M3, M4", 123 | "identifiers": [ 124 | "Mac14", 125 | "Mac15", 126 | "Mac16", 127 | "iPad14,3", 128 | "iPad14,4", 129 | "iPad14,5", 130 | "iPad14,6", 131 | "iPad14,8", 132 | "iPad14,9", 133 | "iPad14,10", 134 | "iPad14,11", 135 | "iPad15", 136 | "iPad16" 137 | ], 138 | "models": { 139 | "default": "openai_whisper-large-v3-v20240930", 140 | "supported": [ 141 | "openai_whisper-tiny", 142 | "openai_whisper-tiny.en", 143 | "openai_whisper-base", 144 | "openai_whisper-base.en", 145 | "openai_whisper-small", 146 | "openai_whisper-small.en", 147 | "openai_whisper-large-v2", 148 | "openai_whisper-large-v2_949MB", 149 | "openai_whisper-large-v2_turbo", 150 | 
"openai_whisper-large-v2_turbo_955MB", 151 | "openai_whisper-large-v3", 152 | "openai_whisper-large-v3_947MB", 153 | "openai_whisper-large-v3_turbo", 154 | "openai_whisper-large-v3_turbo_954MB", 155 | "distil-whisper_distil-large-v3", 156 | "distil-whisper_distil-large-v3_594MB", 157 | "distil-whisper_distil-large-v3_turbo", 158 | "distil-whisper_distil-large-v3_turbo_600MB", 159 | "openai_whisper-large-v3-v20240930", 160 | "openai_whisper-large-v3-v20240930_turbo", 161 | "openai_whisper-large-v3-v20240930_626MB", 162 | "openai_whisper-large-v3-v20240930_turbo_632MB" 163 | ] 164 | } 165 | } 166 | ], 167 | "model_checksums": { 168 | "distil-whisper_distil-large-v3": "9cd8271143b919402ae776c30b479565", 169 | "distil-whisper_distil-large-v3_594MB": "ca532f45ddbf8a3d241132cc5cf41639", 170 | "distil-whisper_distil-large-v3_turbo": "b8638452c6568dfe33a33bfcc2bc6aca", 171 | "distil-whisper_distil-large-v3_turbo_600MB": "81746b4b1afbbb01a8ae9ea452460d88", 172 | "openai_whisper-base.en": "fbcfd586f15e2952251b1d3257f18471", 173 | "openai_whisper-base": "36e60501ad0f01c1a5719e83a1f63f20", 174 | "openai_whisper-large-v2": "21b86c07318aeeef54598f15b7903979", 175 | "openai_whisper-large-v2_949MB": "71bad4e1566749d1060eda42308d9fb4", 176 | "openai_whisper-large-v2_turbo": "7734959b6550e7b5c2d732bf2b7acd23", 177 | "openai_whisper-large-v2_turbo_955MB": "cb6411862a48ec75325572081f01e5b5", 178 | "openai_whisper-large-v3-v20240930": "17ebd78ff7edfa59001b554e9cc4c021", 179 | "openai_whisper-large-v3-v20240930_547MB": "c945dad68449ac3c78ecb2d561ac189d", 180 | "openai_whisper-large-v3-v20240930_626MB": "578fe5a07f4eb7e4187c920bca571aa5", 181 | "openai_whisper-large-v3-v20240930_turbo": "dfbf09ab741af1d5400ddbd07bb37dad", 182 | "openai_whisper-large-v3-v20240930_turbo_632MB": "33954440dbd785ca1828afe25514f5a5", 183 | "openai_whisper-large-v3": "a6f24dc72785722e9cea89e227856dfe", 184 | "openai_whisper-large-v3_947MB": "ef6b0e9622a046ce2361b4c72307877f", 185 | "openai_whisper-large-v3_turbo": "c550fbdea70c5784d322c0a427f8b5cd", 186 | "openai_whisper-large-v3_turbo_954MB": "e639c4bb98d905064ef5dd38757dd9d1", 187 | "openai_whisper-small.en": "38efe6a00706bbdb995795c67a836e5e", 188 | "openai_whisper-small": "f1d21adb950bc9be5d5343bcdeccd23b", 189 | "openai_whisper-tiny.en": "e1183fd55448923b1ce43a2da67aa21f", 190 | "openai_whisper-tiny": "7147518a3d68ddbea0691e04cfffa4ff" 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/es_test_clip.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Tests/WhisperKitTests/Resources/es_test_clip.wav -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/ja_test_clip.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Tests/WhisperKitTests/Resources/ja_test_clip.wav -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/jfk.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Tests/WhisperKitTests/Resources/jfk.wav -------------------------------------------------------------------------------- 
/Tests/WhisperKitTests/Resources/jfk_441khz.m4a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Tests/WhisperKitTests/Resources/jfk_441khz.m4a -------------------------------------------------------------------------------- /Tests/WhisperKitTests/Resources/ted_60.m4a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argmaxinc/WhisperKit/cd16844c270b7e78bf21d24b1d9ff7bc88e904e4/Tests/WhisperKitTests/Resources/ted_60.m4a -------------------------------------------------------------------------------- /Tests/WhisperKitTests/TestUtils.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright © 2024 Argmax, Inc. All rights reserved. 3 | 4 | import Accelerate 5 | import AVFAudio 6 | import Combine 7 | import CoreML 8 | import Foundation 9 | @testable import WhisperKit 10 | import XCTest 11 | 12 | enum TestError: Error { 13 | case missingFile(String) 14 | case missingDirectory(String) 15 | } 16 | 17 | @discardableResult 18 | func XCTUnwrapAsync<T>( 19 | _ expression: @autoclosure () async throws -> T, 20 | _ message: @autoclosure () -> String = "", 21 | file: StaticString = #filePath, 22 | line: UInt = #line 23 | ) async throws -> T { 24 | let evaluated = try? await expression() 25 | return try XCTUnwrap(evaluated, message(), file: file, line: line) 26 | } 27 | 28 | @discardableResult 29 | func XCTUnwrapAsync<T>( 30 | _ expression: @autoclosure () async throws -> T?, 31 | _ message: @autoclosure () -> String = "", 32 | file: StaticString = #filePath, 33 | line: UInt = #line 34 | ) async throws -> T { 35 | let evaluated = try? await expression() 36 | return try XCTUnwrap(evaluated, message(), file: file, line: line) 37 | } 38 | 39 | func XCTAssertNoThrowAsync<T>( 40 | _ expression: @autoclosure () async throws -> T, 41 | _ message: @autoclosure () -> String = "", 42 | file: StaticString = #filePath, 43 | line: UInt = #line 44 | ) async { 45 | do { 46 | _ = try await expression() 47 | } catch { 48 | XCTFail(message(), file: file, line: line) 49 | } 50 | } 51 | 52 | func XCTAssertNoThrowAsync<T>( 53 | _ expression: @autoclosure () async throws -> T?, 54 | _ message: @autoclosure () -> String = "", 55 | file: StaticString = #filePath, 56 | line: UInt = #line 57 | ) async { 58 | do { 59 | _ = try await expression() 60 | } catch { 61 | XCTFail(message(), file: file, line: line) 62 | } 63 | } 64 | 65 | func XCTAssertNoThrowAsync( 66 | _ expression: @autoclosure () async throws -> Void, 67 | _ message: @autoclosure () -> String = "", 68 | file: StaticString = #filePath, 69 | line: UInt = #line 70 | ) async { 71 | do { 72 | try await expression() 73 | } catch { 74 | XCTFail(message(), file: file, line: line) 75 | } 76 | } 77 | 78 | // MARK: Helpers 79 | 80 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 81 | extension Bundle { 82 | static func current(for classObject: AnyObject?
= nil) -> Bundle { 83 | #if SWIFT_PACKAGE 84 | return Bundle.module 85 | #else 86 | // Use bundle for class type if passed in 87 | if let classObject = classObject { 88 | return Bundle(for: type(of: classObject)) 89 | } else { 90 | return Bundle.main 91 | } 92 | #endif 93 | } 94 | } 95 | 96 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 97 | extension FileManager { 98 | func allocatedSizeOfDirectory(at url: URL) throws -> Int64 { 99 | guard let enumerator = enumerator(at: url, includingPropertiesForKeys: [.totalFileAllocatedSizeKey, .fileAllocatedSizeKey]) else { 100 | throw NSError(domain: NSCocoaErrorDomain, code: NSFileReadUnknownError, userInfo: nil) 101 | } 102 | 103 | var accumulatedSize: Int64 = 0 104 | for case let fileURL as URL in enumerator { 105 | let resourceValues = try fileURL.resourceValues(forKeys: [.totalFileAllocatedSizeKey, .fileAllocatedSizeKey]) 106 | accumulatedSize += Int64(resourceValues.totalFileAllocatedSize ?? resourceValues.fileAllocatedSize ?? 0) 107 | } 108 | return accumulatedSize 109 | } 110 | } 111 | 112 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 113 | extension MLMultiArray { 114 | /// Create `MLMultiArray` of shape [1, 1, arr.count] and fill up the last 115 | /// dimension with values from arr. 116 | static func logits(_ arr: [FloatType]) throws -> MLMultiArray { 117 | let logits = try MLMultiArray(shape: [1, 1, arr.count] as [NSNumber], dataType: .float16) 118 | let ptr = UnsafeMutablePointer<FloatType>(OpaquePointer(logits.dataPointer)) 119 | for (index, value) in arr.enumerated() { 120 | let linearOffset = logits.linearOffset(for: [0, 0, index as NSNumber]) 121 | ptr[linearOffset] = value 122 | } 123 | return logits 124 | } 125 | 126 | /// Get the data from `MLMultiArray` for the given dimension 127 | func data(for dimension: Int) -> [FloatType] { 128 | let count = shape[dimension].intValue 129 | let indexes = stride(from: 0, to: count, by: 1).map { [0, 0, $0 as NSNumber] } 130 | var result = [FloatType]() 131 | let ptr = UnsafeMutablePointer<FloatType>(OpaquePointer(dataPointer)) 132 | for index in indexes { 133 | let linearOffset = linearOffset(for: index as [NSNumber]) 134 | result.append(ptr[linearOffset]) 135 | } 136 | return result 137 | } 138 | } 139 | 140 | @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) 141 | extension XCTestCase { 142 | func transcribe( 143 | with variant: ModelVariant, 144 | options: DecodingOptions, 145 | callback: TranscriptionCallback = nil, 146 | audioFile: String = "jfk.wav", 147 | file: StaticString = #file, 148 | line: UInt = #line 149 | ) async throws -> [TranscriptionResult] { 150 | let modelName: String 151 | switch variant { 152 | case .largev3: 153 | modelName = "large-v3" 154 | default: 155 | modelName = "tiny" 156 | } 157 | let config = WhisperKitConfig(model: modelName, verbose: true, logLevel: .debug) 158 | let whisperKit = try await WhisperKit(config) 159 | trackForMemoryLeaks(on: whisperKit, file: file, line: line) 160 | 161 | let audioComponents = audioFile.components(separatedBy: ".") 162 | guard let audioFileURL = Bundle.current(for: self).path(forResource: audioComponents.first, ofType: audioComponents.last) else { 163 | throw TestError.missingFile("Missing audio file") 164 | } 165 | return try await whisperKit.transcribe(audioPath: audioFileURL, decodeOptions: options, callback: callback) 166 | } 167 | 168 | func tinyModelPath() async throws -> String { 169 | let modelDir = try await WhisperKit.download(variant: "tiny").path() 170 | return modelDir 171 | } 172 | 173 | func
largev3ModelPath() throws -> String { 174 | let modelDir = "whisperkit-coreml/openai_whisper-large-v3" // use a faster-to-compile model for tests 175 | guard let modelPath = Bundle.current(for: self).urls(forResourcesWithExtension: "mlmodelc", subdirectory: modelDir)?.first?.deletingLastPathComponent().path else { 176 | throw TestError.missingFile("Failed to load model, ensure \"Models/\(modelDir)\" exists via Makefile command: `make download-models`") 177 | } 178 | return modelPath 179 | } 180 | 181 | func largev3TurboModelPath() throws -> String { 182 | let modelDir = "whisperkit-coreml/openai_whisper-large-v3_turbo" 183 | guard let modelPath = Bundle.current(for: self).urls(forResourcesWithExtension: "mlmodelc", subdirectory: modelDir)?.first?.deletingLastPathComponent().path else { 184 | throw TestError.missingFile("Failed to load model, ensure \"Models/\(modelDir)\" exists via Makefile command: `make download-models`") 185 | } 186 | return modelPath 187 | } 188 | 189 | func allModelPaths() throws -> [String] { 190 | let fileManager = FileManager.default 191 | var modelPaths: [String] = [] 192 | let directory = "whisperkit-coreml" 193 | let resourceKeys: [URLResourceKey] = [.isDirectoryKey] 194 | guard let baseurl = Bundle.current(for: self).resourceURL?.appendingPathComponent(directory) else { 195 | throw TestError.missingDirectory("Base URL for directory \(directory) not found.") 196 | } 197 | let directoryContents = try fileManager.contentsOfDirectory(at: baseurl, includingPropertiesForKeys: resourceKeys, options: .skipsHiddenFiles) 198 | for folderURL in directoryContents { 199 | let resourceValues = try folderURL.resourceValues(forKeys: Set(resourceKeys)) 200 | if resourceValues.isDirectory == true { 201 | // Check if the directory contains actual data files, or if it contains pointer files. 202 | // As a proxy, use the MelSpectrogram.mlmodelc/coremldata.bin file.
203 | let proxyFileToCheck = folderURL.appendingPathComponent("MelSpectrogram.mlmodelc/coremldata.bin") 204 | if try isGitLFSPointerFile(url: proxyFileToCheck) { 205 | continue 206 | } 207 | 208 | // Check if the directory name contains the quantization pattern 209 | // Only test large quantized models 210 | let dirName = folderURL.lastPathComponent 211 | if !(dirName.contains("q") && !dirName.contains("large")) { 212 | modelPaths.append(folderURL.absoluteString) 213 | } 214 | } 215 | } 216 | return modelPaths 217 | } 218 | 219 | /// Function to check if the beginning of the file matches a Git LFS pointer pattern 220 | func isGitLFSPointerFile(url: URL) throws -> Bool { 221 | let fileHandle = try FileHandle(forReadingFrom: url) 222 | // Read the first few bytes of the file to get enough for the Git LFS pointer signature 223 | let data = fileHandle.readData(ofLength: 512) // Read first 512 bytes 224 | fileHandle.closeFile() 225 | if let string = String(data: data, encoding: .utf8), 226 | string.starts(with: "version https://git-lfs.github.com/") 227 | { 228 | return true 229 | } 230 | return false 231 | } 232 | 233 | func trackForMemoryLeaks(on instance: AnyObject, file: StaticString = #filePath, line: UInt = #line) { 234 | addTeardownBlock { [weak instance] in 235 | XCTAssertNil(instance, "Detected potential memory leak", file: file, line: line) 236 | } 237 | } 238 | 239 | /// Helper to create an extended audio buffer by repeating the original buffer 240 | func createExtendedBuffer(from originalBuffer: AVAudioPCMBuffer, repeatCount: Int) -> AVAudioPCMBuffer { 241 | let frameCount = originalBuffer.frameLength 242 | let totalFrames = frameCount * AVAudioFrameCount(repeatCount) 243 | 244 | // Create new buffer with same format but longer length 245 | let extendedBuffer = AVAudioPCMBuffer( 246 | pcmFormat: originalBuffer.format, 247 | frameCapacity: totalFrames 248 | )! 249 | extendedBuffer.frameLength = totalFrames 250 | 251 | // For each channel 252 | for channel in 0.. AVAudioPCMBuffer { 278 | guard FileManager.default.fileExists(atPath: audioFilePath) else { 279 | throw WhisperError.loadAudioFailed("Resource path does not exist \(audioFilePath)") 280 | } 281 | 282 | let audioFileURL = URL(fileURLWithPath: audioFilePath) 283 | 284 | // Create an audio file with original format preserved 285 | let audioFile = try AVAudioFile(forReading: audioFileURL) 286 | 287 | // Create a buffer with the original format (preserving all channels) 288 | guard let buffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, 289 | frameCapacity: AVAudioFrameCount(audioFile.length)) 290 | else { 291 | throw WhisperError.loadAudioFailed("Unable to create audio buffer") 292 | } 293 | 294 | // Read the entire file into the buffer 295 | try audioFile.read(into: buffer) 296 | 297 | return buffer 298 | } 299 | 300 | /// Helper to measure channel processing operations 301 | func measureChannelProcessing(buffer: AVAudioPCMBuffer, mode: AudioInputConfig.ChannelMode, iterations: Int = 5) -> Double { 302 | // Add warm-up iterations 303 | for _ in 0..<3 { 304 | _ = AudioProcessor.convertToMono(buffer, mode: mode) 305 | } 306 | 307 | var totalTime: Double = 0 308 | // Then measure the actual timing 309 | for _ in 0.. 
SpecialTokens { 334 | SpecialTokens( 335 | endToken: endToken, 336 | englishToken: englishToken, 337 | noSpeechToken: noSpeechToken, 338 | noTimestampsToken: noTimestampsToken, 339 | specialTokenBegin: specialTokenBegin, 340 | startOfPreviousToken: startOfPreviousToken, 341 | startOfTranscriptToken: startOfTranscriptToken, 342 | timeTokenBegin: timeTokenBegin, 343 | transcribeToken: transcribeToken, 344 | translateToken: translateToken, 345 | whitespaceToken: whitespaceToken 346 | ) 347 | } 348 | } 349 | 350 | extension Result { 351 | var isSuccess: Bool { 352 | switch self { 353 | case .success: 354 | return true 355 | case .failure: 356 | return false 357 | } 358 | } 359 | 360 | func whisperError() -> WhisperError? { 361 | switch self { 362 | case .success: 363 | return nil 364 | case let .failure(error): 365 | return error as? WhisperError 366 | } 367 | } 368 | } 369 | 370 | extension Result where Success == [TranscriptionResult] { 371 | func normalizedText(prefix: Int) throws -> String { 372 | try get().text.normalized.split(separator: " ").prefix(prefix).joined(separator: " ") 373 | } 374 | } 375 | 376 | extension Collection where Element == TranscriptionResult { 377 | var text: String { 378 | map(\.text).joined(separator: " ") 379 | } 380 | } 381 | 382 | extension Collection where Element == TranscriptionResult { 383 | var segments: [TranscriptionSegment] { 384 | flatMap(\.segments) 385 | } 386 | } 387 | 388 | public extension Publisher { 389 | func withPrevious() -> AnyPublisher<(previous: Output?, current: Output), Failure> { 390 | scan((Output?, Output)?.none) { ($0?.1, $1) } 391 | .compactMap { $0 } 392 | .eraseToAnyPublisher() 393 | } 394 | } 395 | -------------------------------------------------------------------------------- /fastlane/README.md: -------------------------------------------------------------------------------- 1 | fastlane documentation 2 | ---- 3 | 4 | # Installation 5 | 6 | Make sure you have the latest version of the Xcode command line tools installed: 7 | 8 | ```sh 9 | xcode-select --install 10 | ``` 11 | 12 | For _fastlane_ installation instructions, see [Installing _fastlane_](https://docs.fastlane.tools/#installing-fastlane) 13 | 14 | # Available Actions 15 | 16 | ## iOS 17 | 18 | ### ios list_devices 19 | 20 | ```sh 21 | [bundle exec] fastlane ios list_devices 22 | ``` 23 | 24 | List all connected devices 25 | 26 | ### ios benchmark 27 | 28 | ```sh 29 | [bundle exec] fastlane ios benchmark 30 | ``` 31 | 32 | Benchmark devices with options 33 | 34 | ### ios extract_results 35 | 36 | ```sh 37 | [bundle exec] fastlane ios extract_results 38 | ``` 39 | 40 | Extract benchmark results 41 | 42 | ### ios upload_results 43 | 44 | ```sh 45 | [bundle exec] fastlane ios upload_results 46 | ``` 47 | 48 | Upload benchmark results 49 | 50 | ---- 51 | 52 | This README.md is auto-generated and will be re-generated every time [_fastlane_](https://fastlane.tools) is run. 53 | 54 | More information about _fastlane_ can be found on [fastlane.tools](https://fastlane.tools). 55 | 56 | The documentation of _fastlane_ can be found on [docs.fastlane.tools](https://docs.fastlane.tools). 57 | --------------------------------------------------------------------------------
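The lanes listed above make up the benchmarking workflow end to end. A typical local run might chain them as follows; this assumes fastlane is installed via Bundler as suggested by the `[bundle exec]` prefix in the README, and any lane options are defined in the Fastfile rather than reproduced here.

```sh
# Run from the repository root (the directory containing fastlane/).
bundle exec fastlane ios list_devices     # list all connected devices
bundle exec fastlane ios benchmark        # benchmark devices (lane options live in the Fastfile)
bundle exec fastlane ios extract_results  # extract benchmark results
bundle exec fastlane ios upload_results   # upload benchmark results
```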