├── models
├── .gitignore
├── for-tests-ggml-base.bin
├── for-tests-ggml-large.bin
├── for-tests-ggml-small.bin
├── for-tests-ggml-tiny.bin
├── for-tests-ggml-base.en.bin
├── for-tests-ggml-medium.bin
├── for-tests-ggml-small.en.bin
├── for-tests-ggml-tiny.en.bin
├── for-tests-ggml-medium.en.bin
├── download-ggml-model.cmd
├── download-ggml-model.sh
├── README.md
├── convert-h5-to-ggml.py
└── convert-pt-to-ggml.py
├── samples
├── .gitignore
├── jfk.wav
└── README.md
├── bindings
├── javascript
│   ├── .gitignore
│   ├── CMakeLists.txt
│   └── emscripten.cpp
└── CMakeLists.txt
├── tests
├── .gitignore
├── en-2-ref.txt
├── es-0-ref.txt
├── en-0-ref.txt
├── en-1-ref.txt
├── CMakeLists.txt
└── run-tests.sh
├── .gitmodules
├── examples
├── whisper.objc
│   ├── whisper.objc
│   │   ├── Assets.xcassets
│   │   │   ├── Contents.json
│   │   │   ├── AccentColor.colorset
│   │   │   │   └── Contents.json
│   │   │   └── AppIcon.appiconset
│   │   │   │   └── Contents.json
│   │   ├── AppDelegate.h
│   │   ├── SceneDelegate.h
│   │   ├── main.m
│   │   ├── Info.plist
│   │   ├── ViewController.h
│   │   ├── AppDelegate.m
│   │   ├── Base.lproj
│   │   │   ├── LaunchScreen.storyboard
│   │   │   └── Main.storyboard
│   │   ├── SceneDelegate.m
│   │   └── ViewController.m
│   ├── whisper.objc.xcodeproj
│   │   └── project.xcworkspace
│   │   │   ├── contents.xcworkspacedata
│   │   │   └── xcshareddata
│   │   │   └── IDEWorkspaceChecks.plist
│   └── README.md
├── main
│   ├── CMakeLists.txt
│   └── README.md
├── bench
│   ├── CMakeLists.txt
│   ├── README.md
│   └── bench.cpp
├── stream
│   ├── CMakeLists.txt
│   └── README.md
├── command
│   ├── CMakeLists.txt
│   └── README.md
├── whisper.wasm
│   ├── CMakeLists.txt
│   └── README.md
├── stream.wasm
│   ├── README.md
│   ├── CMakeLists.txt
│   ├── emscripten.cpp
│   └── index-tmpl.html
├── command.wasm
│   ├── README.md
│   ├── CMakeLists.txt
│   └── emscripten.cpp
├── CMakeLists.txt
├── talk.wasm
│   ├── gpt-2.h
│   ├── CMakeLists.txt
│   ├── README.md
│   └── emscripten.cpp
├── generate-karaoke.sh
├── whisper.nvim
│   ├── whisper.nvim
│   └── README.md
├── twitch.sh
├── livestream.sh
├── yt-wsp.sh
└── helpers.js
├── extra
├── sha-all.sh
├── convert-all.sh
├── deploy-wasm.sh
└── bench-all.sh
├── .gitignore
├── cmake
├── GitVars.cmake
└── BuildTypes.cmake
├── LICENSE
├── .github
└── workflows
│   └── build.yml
├── Makefile
├── CMakeLists.txt
└── whisper.h
/models/.gitignore:
--------------------------------------------------------------------------------
1 | *.bin
2 |
--------------------------------------------------------------------------------
/samples/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 |
--------------------------------------------------------------------------------
/bindings/javascript/.gitignore:
--------------------------------------------------------------------------------
1 | publish.log
2 |
--------------------------------------------------------------------------------
/tests/.gitignore:
--------------------------------------------------------------------------------
1 | *.wav
2 | *.ogg
3 | *.wav.txt
4 |
--------------------------------------------------------------------------------
/bindings/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | if (EMSCRIPTEN)
2 | add_subdirectory(javascript)
3 | endif()
4 |
--------------------------------------------------------------------------------
/samples/jfk.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berthubert/whisper.cpp/master/samples/jfk.wav
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "bindings/ios"]
2 | path = bindings/ios
3 | url = https://github.com/ggerganov/whisper.spm
4 |
--------------------------------------------------------------------------------
/models/for-tests-ggml-base.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berthubert/whisper.cpp/master/models/for-tests-ggml-base.bin
--------------------------------------------------------------------------------
/models/for-tests-ggml-large.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berthubert/whisper.cpp/master/models/for-tests-ggml-large.bin
--------------------------------------------------------------------------------
/models/for-tests-ggml-small.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berthubert/whisper.cpp/master/models/for-tests-ggml-small.bin
--------------------------------------------------------------------------------
/models/for-tests-ggml-tiny.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berthubert/whisper.cpp/master/models/for-tests-ggml-tiny.bin
--------------------------------------------------------------------------------
/models/for-tests-ggml-base.en.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berthubert/whisper.cpp/master/models/for-tests-ggml-base.en.bin
--------------------------------------------------------------------------------
/models/for-tests-ggml-medium.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berthubert/whisper.cpp/master/models/for-tests-ggml-medium.bin
--------------------------------------------------------------------------------
/models/for-tests-ggml-small.en.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berthubert/whisper.cpp/master/models/for-tests-ggml-small.en.bin
--------------------------------------------------------------------------------
/models/for-tests-ggml-tiny.en.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berthubert/whisper.cpp/master/models/for-tests-ggml-tiny.en.bin
--------------------------------------------------------------------------------
/models/for-tests-ggml-medium.en.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berthubert/whisper.cpp/master/models/for-tests-ggml-medium.en.bin
--------------------------------------------------------------------------------
/examples/whisper.objc/whisper.objc/Assets.xcassets/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "info" : {
3 | "author" : "xcode",
4 | "version" : 1
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/examples/main/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(TARGET main)
2 | add_executable(${TARGET} main.cpp)
3 | target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
4 |
--------------------------------------------------------------------------------
/examples/bench/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(TARGET bench)
2 | add_executable(${TARGET} bench.cpp)
3 | target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
4 |
--------------------------------------------------------------------------------
/extra/sha-all.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Compute the SHA1 of all model files in ./models/ggml-*.bin
4 |
5 | for f in ./models/ggml-*.bin; do
6 | shasum "$f" -a 1
7 | done
8 |
--------------------------------------------------------------------------------
/examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/contents.xcworkspacedata:
--------------------------------------------------------------------------------
1 |
2 |
4 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/examples/whisper.objc/whisper.objc/Assets.xcassets/AccentColor.colorset/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "colors" : [
3 | {
4 | "idiom" : "universal"
5 | }
6 | ],
7 | "info" : {
8 | "author" : "xcode",
9 | "version" : 1
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/examples/whisper.objc/whisper.objc/AppDelegate.h:
--------------------------------------------------------------------------------
1 | //
2 | // AppDelegate.h
3 | // whisper.objc
4 | //
5 | // Created by Georgi Gerganov on 23.10.22.
6 | //
7 |
8 | #import
9 |
10 | @interface AppDelegate : UIResponder
11 |
12 |
13 | @end
14 |
15 |
--------------------------------------------------------------------------------
/examples/whisper.objc/whisper.objc/Assets.xcassets/AppIcon.appiconset/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 | "images" : [
3 | {
4 | "idiom" : "universal",
5 | "platform" : "ios",
6 | "size" : "1024x1024"
7 | }
8 | ],
9 | "info" : {
10 | "author" : "xcode",
11 | "version" : 1
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/examples/stream/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | if (WHISPER_SUPPORT_SDL2)
2 | # stream
3 | set(TARGET stream)
4 | add_executable(${TARGET} stream.cpp)
5 | target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
6 | target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
7 | endif ()
8 |
--------------------------------------------------------------------------------
/examples/command/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | if (WHISPER_SUPPORT_SDL2)
2 | # command
3 | set(TARGET command)
4 | add_executable(${TARGET} command.cpp)
5 | target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
6 | target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
7 | endif ()
8 |
--------------------------------------------------------------------------------
/extra/convert-all.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
4 |
5 | for model in "${models[@]}"; do
6 | python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
7 | mv -v models/ggml-model.bin models/ggml-$model.bin
8 | done
9 |
--------------------------------------------------------------------------------
/examples/whisper.objc/whisper.objc/SceneDelegate.h:
--------------------------------------------------------------------------------
1 | //
2 | // SceneDelegate.h
3 | // whisper.objc
4 | //
5 | // Created by Georgi Gerganov on 23.10.22.
6 | //
7 |
8 | #import
9 |
10 | @interface SceneDelegate : UIResponder
11 |
12 | @property (strong, nonatomic) UIWindow * window;
13 |
14 | @end
15 |
16 |
--------------------------------------------------------------------------------
/examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | IDEDidComputeMac32BitWarning
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/samples/README.md:
--------------------------------------------------------------------------------
1 | # Audio samples
2 |
3 | This folder contains various audio files used for testing.
4 | If you want to quickly get some more samples, simply run `make samples`. This will download several public audio files and convert them to the appropriate 16-bit WAV format using `ffmpeg`.
5 |
6 | https://github.com/ggerganov/whisper.cpp/blob/a09ce6e8899198015729ffc49ae10f67370906b1/Makefile#L104-L123
7 |
--------------------------------------------------------------------------------
/examples/whisper.wasm/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(TARGET whisper.wasm)
2 |
3 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
4 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
5 | configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/whisper.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/whisper.js COPYONLY)
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | .cache/
3 | .vs/
4 | .vscode/
5 | .DS_Store
6 |
7 | build/
8 | build-em/
9 | build-debug/
10 | build-release/
11 |
build-sanitize-addr/
12 | build-sanitize-thread/
13 |
14 | main
15 | stream
16 | command
17 | bench
18 | sync.sh
19 | libwhisper.so
20 | compile_commands.json
21 |
22 | examples/arm_neon.h
23 | examples/whisper.objc/whisper.objc.xcodeproj/xcshareddata
24 | examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
25 | examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
26 |
--------------------------------------------------------------------------------
/examples/stream.wasm/README.md:
--------------------------------------------------------------------------------
1 | # stream.wasm
2 |
3 | Real-time transcription in the browser using WebAssembly
4 |
5 | Online demo: https://whisper.ggerganov.com/stream/
6 |
7 | ## Build instructions
8 |
9 | ```bash
10 | # build using Emscripten (v3.1.2)
11 | git clone https://github.com/ggerganov/whisper.cpp
12 | cd whisper.cpp
13 | mkdir build-em && cd build-em
14 | emcmake cmake ..
15 | make -j
16 |
17 | # copy the produced page to your HTTP path
18 | cp bin/stream.wasm/* /path/to/html/
19 | cp bin/libstream.worker.js /path/to/html/
20 | ```
21 |
--------------------------------------------------------------------------------
/examples/whisper.objc/whisper.objc/main.m:
--------------------------------------------------------------------------------
1 | //
2 | // main.m
3 | // whisper.objc
4 | //
5 | // Created by Georgi Gerganov on 23.10.22.
6 | //
7 |
8 | #import
9 | #import "AppDelegate.h"
10 |
11 | int main(int argc, char * argv[]) {
12 | NSString * appDelegateClassName;
13 | @autoreleasepool {
14 | // Setup code that might create autoreleased objects goes here.
15 | appDelegateClassName = NSStringFromClass([AppDelegate class]);
16 | }
17 | return UIApplicationMain(argc, argv, nil, appDelegateClassName);
18 | }
19 |
--------------------------------------------------------------------------------
/examples/command.wasm/README.md:
--------------------------------------------------------------------------------
1 | # command.wasm
2 |
3 | This is a basic Voice Assistant example that accepts voice commands from the microphone.
4 | It runs fully in the browser via WebAssembly.
5 |
6 | Online demo: https://whisper.ggerganov.com/command/
7 |
8 | Terminal version: [examples/command](/examples/command)
9 |
10 | ## Build instructions
11 |
12 | ```bash
13 | # build using Emscripten (v3.1.2)
14 | git clone https://github.com/ggerganov/whisper.cpp
15 | cd whisper.cpp
16 | mkdir build-em && cd build-em
17 | emcmake cmake ..
18 | make -j
19 |
20 | # copy the produced page to your HTTP path
21 | cp bin/command.wasm/* /path/to/html/
22 | cp bin/libcommand.worker.js /path/to/html/
23 | ```
24 |
--------------------------------------------------------------------------------
/examples/whisper.objc/README.md:
--------------------------------------------------------------------------------
1 | # whisper.objc
2 |
3 | Minimal Obj-C application for automatic offline speech recognition.
4 | The inference runs locally, on-device.
5 |
6 | https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4
7 |
8 | Real-time transcription demo:
9 |
10 | https://user-images.githubusercontent.com/1991296/204126266-ce4177c6-6eca-4bd9-bca8-0e46d9da2364.mp4
11 |
12 | ## Usage
13 |
14 | ```bash
15 | git clone https://github.com/ggerganov/whisper.cpp
16 | open whisper.cpp/examples/whisper.objc/whisper.objc.xcodeproj/
17 | ```
18 |
19 | Make sure to build the project in `Release`:
20 |
21 | image
22 |
--------------------------------------------------------------------------------
/examples/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # dependencies
2 |
3 | find_package(Threads REQUIRED)
4 |
5 | # third-party
6 |
7 | if (WHISPER_SUPPORT_SDL2)
8 | # SDL2
9 | find_package(SDL2 REQUIRED)
10 |
11 | string(STRIP "${SDL2_LIBRARIES}" SDL2_LIBRARIES)
12 |
13 | message(STATUS "SDL2_INCLUDE_DIRS = ${SDL2_INCLUDE_DIRS}")
14 | message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
15 | endif()
16 |
17 | # examples
18 |
19 | include_directories(${CMAKE_CURRENT_SOURCE_DIR})
20 |
21 | if (EMSCRIPTEN)
22 | add_subdirectory(whisper.wasm)
23 | add_subdirectory(stream.wasm)
24 | add_subdirectory(command.wasm)
25 | add_subdirectory(talk.wasm)
26 | else()
27 | add_subdirectory(main)
28 | add_subdirectory(stream)
29 | add_subdirectory(command)
30 | add_subdirectory(bench)
31 | endif()
32 |
--------------------------------------------------------------------------------
/cmake/GitVars.cmake:
--------------------------------------------------------------------------------
1 | find_package(Git)
2 |
3 | # the commit's SHA1
4 | execute_process(COMMAND
5 | "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
6 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
7 | OUTPUT_VARIABLE GIT_SHA1
8 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
9 |
10 | # the date of the commit
11 | execute_process(COMMAND
12 | "${GIT_EXECUTABLE}" log -1
--format=%ad --date=local
13 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
14 | OUTPUT_VARIABLE GIT_DATE
15 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
16 |
17 | # the subject of the commit
18 | execute_process(COMMAND
19 | "${GIT_EXECUTABLE}" log -1 --format=%s
20 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
21 | OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
22 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
23 |
--------------------------------------------------------------------------------
/examples/talk.wasm/gpt-2.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | // TODO: Change to C-style API and move to ./examples for easy reuse.
4 |
5 | #include
6 | #include
7 | #include
8 |
9 | struct gpt_vocab {
10 | using id = int32_t;
11 | using token = std::string;
12 |
13 | std::map token_to_id;
14 | std::map id_to_token;
15 | };
16 |
17 | struct gpt2_context;
18 |
19 | struct gpt2_context * gpt2_init(const char * path_model);
20 | void gpt2_free(struct gpt2_context * ctx);
21 |
22 | const char * gpt2_get_prompt(struct gpt2_context * ctx);
23 | void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);
24 |
25 | std::vector gpt2_tokenize(const gpt2_context * ctx, const char * text);
26 |
27 | std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens);
28 |
--------------------------------------------------------------------------------
/tests/en-2-ref.txt:
--------------------------------------------------------------------------------
1 | This is the Micro Machine Man presenting the most midget miniature motorcade of Micro Machines. Each one has dramatic details, terrific trim, precision paint jobs, plus incredible Micro Machine Pocket Playsets. There's a police station, fire station, restaurant, service station, and more. Perfect pocket portables to take anyplace.
And there are many miniature playsets to play with, and each one comes with its own special edition Micro Machine vehicle and fun, fantastic features that miraculously move. Raise the boat lift at the airport marina, man the gun turret at the army base, clean your car at the car wash, raise the toll bridge. And these playsets fit together to form a Micro Machine world. Micro Machine Pocket Playsets, so tremendously tiny, so perfectly precise, so dazzlingly detailed, you'll want to pocket them all. Micro Machines are Micro Machine Pocket Playsets sold separately from Galoob. The smaller they are, the better they are.
--------------------------------------------------------------------------------
/examples/whisper.objc/whisper.objc/Info.plist:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | NSMicrophoneUsageDescription
6 | This app requires microphone access in order to transcribe speech
7 | UIApplicationSceneManifest
8 |
9 | UIApplicationSupportsMultipleScenes
10 |
11 | UISceneConfigurations
12 |
13 | UIWindowSceneSessionRoleApplication
14 |
15 |
16 | UISceneConfigurationName
17 | Default Configuration
18 | UISceneDelegateClassName
19 | SceneDelegate
20 | UISceneStoryboardFile
21 | Main
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/examples/stream/README.md:
--------------------------------------------------------------------------------
1 | # stream
2 |
3 | This is a naive example of performing real-time inference on audio from your microphone.
4 | The `stream` tool samples the audio every half a second and runs the transcription continuously.
5 | More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
6 |
7 | ```bash
8 | ./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
9 | ```
10 |
11 | https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
12 |
13 | The `stream` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
14 |
15 | ```bash
16 | # Install SDL2 on Linux
17 | sudo apt-get install libsdl2-dev
18 |
19 | # Install SDL2 on Mac OS
20 | brew install sdl2
21 |
22 | make stream
23 | ```
24 |
25 | ## Web version
26 |
27 | This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
28 |
--------------------------------------------------------------------------------
/bindings/javascript/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(TARGET libwhisper)
2 |
3 | add_executable(${TARGET}
4 | emscripten.cpp
5 | )
6 |
7 | target_link_libraries(${TARGET} PRIVATE
8 | whisper
9 | )
10 |
11 | unset(EXTRA_FLAGS)
12 |
13 | if (WHISPER_WASM_SINGLE_FILE)
14 | set(EXTRA_FLAGS "-s SINGLE_FILE=1")
15 | message(STATUS "Embedding WASM inside whisper.js")
16 |
17 | add_custom_command(
18 | TARGET ${TARGET} POST_BUILD
19 | COMMAND ${CMAKE_COMMAND} -E copy
20 | ${CMAKE_BINARY_DIR}/bin/libwhisper.js
21 | ${CMAKE_CURRENT_SOURCE_DIR}/whisper.js
22 | )
23 | endif()
24 |
25 | set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
26 | --bind \
27 | -s USE_PTHREADS=1 \
28 | -s PTHREAD_POOL_SIZE=8 \
29 | -s INITIAL_MEMORY=1610612736 \
30 | -s TOTAL_MEMORY=1610612736 \
31 | -s FORCE_FILESYSTEM=1 \
32 | -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
33 | ${EXTRA_FLAGS} \
34 | ")
35 |
--------------------------------------------------------------------------------
/examples/whisper.objc/whisper.objc/ViewController.h:
--------------------------------------------------------------------------------
1 | //
2 | // ViewController.h
3 | // whisper.objc
4 | //
5 | // Created by Georgi
Gerganov on 23.10.22.
6 | //
7 |
8 | #import
9 |
10 | #import
11 | #import
12 |
13 | #define NUM_BUFFERS 3
14 | #define MAX_AUDIO_SEC 30
15 | #define SAMPLE_RATE 16000
16 |
17 | struct whisper_context;
18 |
19 | typedef struct
20 | {
21 | int ggwaveId;
22 | bool isCapturing;
23 | bool isTranscribing;
24 | bool isRealtime;
25 | UILabel * labelReceived;
26 |
27 | AudioQueueRef queue;
28 | AudioStreamBasicDescription dataFormat;
29 | AudioQueueBufferRef buffers[NUM_BUFFERS];
30 |
31 | int n_samples;
32 | int16_t * audioBufferI16;
33 | float * audioBufferF32;
34 |
35 | struct whisper_context * ctx;
36 |
37 | void * vc;
38 | } StateInp;
39 |
40 | @interface ViewController : UIViewController
41 | {
42 | StateInp stateInp;
43 | }
44 |
45 | @end
46 |
--------------------------------------------------------------------------------
/examples/command/README.md:
--------------------------------------------------------------------------------
1 | # command
2 |
3 | This is a basic Voice Assistant example that accepts voice commands from the microphone.
4 | More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/issues/171).
5 |
6 | ```bash
7 | # Run with default arguments and small model
8 | ./command -m ./models/ggml-small.en.bin -t 8
9 |
10 | # On Raspberry Pi, use tiny or base models + "-ac 768" for better performance
11 | ./command -m ./models/ggml-tiny.en.bin -ac 768 -t 4 -c 0
12 | ```
13 |
14 | https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
15 |
16 | Web version: [examples/command.wasm](/examples/command.wasm)
17 |
18 | ## Building
19 |
20 | The `command` tool depends on the SDL2 library to capture audio from the microphone.
You can build it like this:
21 |
22 | ```bash
23 | # Install SDL2 on Linux
24 | sudo apt-get install libsdl2-dev
25 |
26 | # Install SDL2 on Mac OS
27 | brew install sdl2
28 |
29 | make command
30 | ```
31 |
--------------------------------------------------------------------------------
/extra/deploy-wasm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # This is a helper script to deploy all WebAssembly examples to my node
4 | # Run from the build directory:
5 | #
6 | # cd build-em
7 | # ../extra/deploy-wasm.sh
8 | #
9 |
10 | # check if emcmake is available
11 | if ! command -v emcmake &> /dev/null
12 | then
13 | echo "Error: emscripten environment is not set up"
14 | exit
15 | fi
16 |
17 | emcmake cmake .. && make -j
18 | if [ $? -ne 0 ]; then
19 | echo "Error: build failed"
20 | exit
21 | fi
22 |
23 | # copy all wasm files to the node
24 | scp bin/whisper.wasm/* root@linode0:/var/www/html/whisper/ && scp bin/libwhisper.worker.js root@linode0:/var/www/html/whisper/
25 | scp bin/stream.wasm/* root@linode0:/var/www/html/whisper/stream/ && scp bin/libstream.worker.js root@linode0:/var/www/html/whisper/stream/
26 | scp bin/command.wasm/* root@linode0:/var/www/html/whisper/command/ && scp bin/libcommand.worker.js root@linode0:/var/www/html/whisper/command/
27 | scp bin/talk.wasm/* root@linode0:/var/www/html/whisper/talk/ && scp bin/libtalk.worker.js root@linode0:/var/www/html/whisper/talk/
28 |
29 | echo "Done"
30 | exit
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Georgi Gerganov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/examples/stream.wasm/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # libstream
3 | #
4 |
5 | set(TARGET libstream)
6 |
7 | add_executable(${TARGET}
8 | emscripten.cpp
9 | )
10 |
11 | target_link_libraries(${TARGET} PRIVATE
12 | whisper
13 | )
14 |
15 | unset(EXTRA_FLAGS)
16 |
17 | if (WHISPER_WASM_SINGLE_FILE)
18 | set(EXTRA_FLAGS "-s SINGLE_FILE=1")
19 | message(STATUS "Embedding WASM inside stream.js")
20 |
21 | add_custom_command(
22 | TARGET ${TARGET} POST_BUILD
23 | COMMAND ${CMAKE_COMMAND} -E copy
24 | ${CMAKE_BINARY_DIR}/bin/libstream.js
25 | ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/stream.wasm/stream.js
26 | )
27 | endif()
28 |
29 | set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
30 | --bind \
31 | -s USE_PTHREADS=1 \
32 | -s PTHREAD_POOL_SIZE=8 \
33 | -s INITIAL_MEMORY=1024MB \
34 | -s TOTAL_MEMORY=1024MB \
35 | -s FORCE_FILESYSTEM=1 \
36 | -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
37 | ${EXTRA_FLAGS} \
38 | ")
39 |
40 | #
41 | # stream.wasm
42 | #
43 |
44 | set(TARGET stream.wasm)
45 |
46 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
47 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
48 |
--------------------------------------------------------------------------------
/examples/talk.wasm/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # libtalk
3 | #
4 |
5 | set(TARGET libtalk)
6 |
7 | add_executable(${TARGET}
8 | emscripten.cpp
9 | gpt-2.cpp
10 | )
11 |
12 | target_link_libraries(${TARGET} PRIVATE
13 | whisper
14 | )
15 |
16 | unset(EXTRA_FLAGS)
17 |
18 | if (WHISPER_WASM_SINGLE_FILE)
19 | set(EXTRA_FLAGS "-s SINGLE_FILE=1")
20 | message(STATUS "Embedding WASM inside talk.js")
21 |
22 | add_custom_command(
23 | TARGET ${TARGET} POST_BUILD
24 | COMMAND ${CMAKE_COMMAND} -E copy
25 | ${CMAKE_BINARY_DIR}/bin/libtalk.js
26 | ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/talk.wasm/talk.js
27 | )
28 | endif()
29 |
30 | set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
31 | --bind \
32 | -s USE_PTHREADS=1 \
33 | -s PTHREAD_POOL_SIZE=8 \
34 | -s INITIAL_MEMORY=1600MB \
35 | -s TOTAL_MEMORY=1600MB \
36 | -s FORCE_FILESYSTEM=1 \
37 | -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
38 | ${EXTRA_FLAGS} \
39 | ")
40 |
41 | #
42 | # talk.wasm
43 | #
44 |
45 | set(TARGET talk.wasm)
46 |
47 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
48 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
49 |
--------------------------------------------------------------------------------
/examples/command.wasm/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # libcommand
3 | #
4 |
5 | set(TARGET
libcommand)
6 |
7 | add_executable(${TARGET}
8 | emscripten.cpp
9 | )
10 |
11 | target_link_libraries(${TARGET} PRIVATE
12 | whisper
13 | )
14 |
15 | unset(EXTRA_FLAGS)
16 |
17 | if (WHISPER_WASM_SINGLE_FILE)
18 | set(EXTRA_FLAGS "-s SINGLE_FILE=1")
19 | message(STATUS "Embedding WASM inside command.js")
20 |
21 | add_custom_command(
22 | TARGET ${TARGET} POST_BUILD
23 | COMMAND ${CMAKE_COMMAND} -E copy
24 | ${CMAKE_BINARY_DIR}/bin/libcommand.js
25 | ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/command.wasm/command.js
26 | )
27 | endif()
28 |
29 | set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
30 | --bind \
31 | -s USE_PTHREADS=1 \
32 | -s PTHREAD_POOL_SIZE=8 \
33 | -s INITIAL_MEMORY=1024MB \
34 | -s TOTAL_MEMORY=1024MB \
35 | -s FORCE_FILESYSTEM=1 \
36 | -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
37 | ${EXTRA_FLAGS} \
38 | ")
39 |
40 | #
41 | # command.wasm
42 | #
43 |
44 | set(TARGET command.wasm)
45 |
46 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
47 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
48 |
--------------------------------------------------------------------------------
/examples/generate-karaoke.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Simple tool to record audio from the microphone and generate a karaoke video
4 | # Usage:
5 | #
6 | # cd whisper.cpp
7 | # make
8 | #
9 | # ./examples/generate-karaoke.sh [model] [step_ms]
10 | #
11 | # Press Ctrl+C to stop recording
12 | #
13 |
14 | executable="./main"
15 | model="base.en"
16 | model_path="models/ggml-$model.bin"
17 |
18 | # require sox and ffmpeg to be installed
19 | if ! command -v sox &> /dev/null
20 | then
21 | echo "sox could not be found"
22 | exit 1
23 | fi
24 |
25 | if !
command -v ffmpeg &> /dev/null 26 | then 27 | echo "ffmpeg could not be found" 28 | exit 2 29 | fi 30 | 31 | if [ ! -f "$executable" ]; then 32 | echo "'$executable' does not exist. Please build it first." 33 | exit 3 34 | fi 35 | 36 | if [ ! -f "$model_path" ]; then 37 | echo "'$model_path' does not exist. Please download it first." 38 | exit 4 39 | fi 40 | 41 | # record some raw audio 42 | sox -d rec.wav 43 | 44 | # resample to 16kHz 45 | ffmpeg -y -i ./rec.wav -ar 16000 -ac 1 -c:a pcm_s16le ./rec16.wav > /dev/null 2>&1 46 | 47 | # run Whisper, using the executable and model path defined above 48 | echo "Processing ..." 49 | "$executable" -m "$model_path" rec16.wav -owts > /dev/null 2>&1 50 | 51 | # generate Karaoke video 52 | echo "Generating video ..." 53 | source rec16.wav.wts > /dev/null 2>&1 54 | 55 | # play the video 56 | echo "Playing ./rec16.wav.mp4 ..." 57 | ffplay -loglevel 0 -autoexit ./rec16.wav.mp4 58 | 59 | echo "Done" 60 | exit 0 61 | -------------------------------------------------------------------------------- /examples/whisper.objc/whisper.objc/AppDelegate.m: -------------------------------------------------------------------------------- 1 | // 2 | // AppDelegate.m 3 | // whisper.objc 4 | // 5 | // Created by Georgi Gerganov on 23.10.22. 6 | // 7 | 8 | #import "AppDelegate.h" 9 | 10 | @interface AppDelegate () 11 | 12 | @end 13 | 14 | @implementation AppDelegate 15 | 16 | 17 | - (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions { 18 | // Override point for customization after application launch. 19 | return YES; 20 | } 21 | 22 | 23 | #pragma mark - UISceneSession lifecycle 24 | 25 | 26 | - (UISceneConfiguration *)application:(UIApplication *)application configurationForConnectingSceneSession:(UISceneSession *)connectingSceneSession options:(UISceneConnectionOptions *)options { 27 | // Called when a new scene session is being created. 28 | // Use this method to select a configuration to create the new scene with.
29 | return [[UISceneConfiguration alloc] initWithName:@"Default Configuration" sessionRole:connectingSceneSession.role]; 30 | } 31 | 32 | 33 | - (void)application:(UIApplication *)application didDiscardSceneSessions:(NSSet *)sceneSessions { 34 | // Called when the user discards a scene session. 35 | // If any sessions were discarded while the application was not running, this will be called shortly after application:didFinishLaunchingWithOptions. 36 | // Use this method to release any resources that were specific to the discarded scenes, as they will not return. 37 | } 38 | 39 | 40 | @end 41 | -------------------------------------------------------------------------------- /models/download-ggml-model.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | pushd %~dp0 4 | set models_path=%CD% 5 | for %%d in (%~dp0..) do set root_path=%%~fd 6 | popd 7 | 8 | set argc=0 9 | for %%x in (%*) do set /A argc+=1 10 | 11 | set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large 12 | 13 | if %argc% neq 1 ( 14 | echo. 15 | echo Usage: download-ggml-model.cmd model 16 | CALL :list_models 17 | goto :eof 18 | ) 19 | 20 | set model=%1 21 | 22 | for %%b in (%models%) do ( 23 | if "%%b"=="%model%" ( 24 | CALL :download_model 25 | goto :eof 26 | ) 27 | ) 28 | 29 | echo Invalid model: %model% 30 | CALL :list_models 31 | goto :eof 32 | 33 | :download_model 34 | echo Downloading ggml model %model%... 35 | 36 | cd %models_path% 37 | 38 | if exist "ggml-%model%.bin" ( 39 | echo Model %model% already exists. Skipping download. 40 | goto :eof 41 | ) 42 | 43 | PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://ggml.ggerganov.com/ggml-model-whisper-%model%.bin -OutFile ggml-%model%.bin" 44 | 45 | if %ERRORLEVEL% neq 0 ( 46 | echo Failed to download ggml model %model% 47 | echo Please try again later or download the original Whisper model files and convert them yourself. 
48 | goto :eof 49 | ) 50 | 51 | echo Done! Model %model% saved in %root_path%\models\ggml-%model%.bin 52 | echo You can now use it like this: 53 | echo main.exe -m %root_path%\models\ggml-%model%.bin -f %root_path%\samples\jfk.wav 54 | 55 | goto :eof 56 | 57 | :list_models 58 | echo. 59 | echo Available models: 60 | (for %%a in (%models%) do ( 61 | echo %%a 62 | )) 63 | echo. 64 | exit /b 65 | -------------------------------------------------------------------------------- /extra/bench-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Helper script to run the bench tool on all models and print the results in share-able format 4 | 5 | printf "Usage: ./bench.sh [n_threads]\n" 6 | 7 | if [ -z "$1" ]; then 8 | n_threads=4 9 | else 10 | n_threads=$1 11 | fi 12 | 13 | models=( "tiny" "base" "small" "medium" "large" ) 14 | 15 | printf "\n" 16 | printf "Running benchmark for all models\n" 17 | printf "This can take a while!\n" 18 | printf "\n" 19 | 20 | printf "| CPU | OS | Config | Model | Threads | Load [ms] | Encode [ms] | Commit |\n" 21 | printf "| --- | -- | ------ | ----- | ------- | --------- | ----------- | ------ |\n" 22 | 23 | for model in "${models[@]}"; do 24 | # run once to heat-up the cache 25 | ./bench -m ./models/ggml-$model.bin -t $n_threads 2>/dev/null 1>/dev/null 26 | 27 | # actual run 28 | # store stderr output in a variable in order to parse it later 29 | output=$(./bench -m ./models/ggml-$model.bin -t $n_threads 2>&1) 30 | 31 | # parse the output: 32 | load_time=$(echo "$output" | grep "load time" | awk '{print $5}') 33 | encode_time=$(echo "$output" | grep "encode time" | awk '{print $5}') 34 | system_info=$(echo "$output" | grep "system_info") 35 | n_threads=$(echo "$output" | grep "system_info" | awk '{print $4}') 36 | 37 | config="" 38 | 39 | if [[ $system_info == *"AVX2 = 1"* ]]; then 40 | config="$config AVX2" 41 | fi 42 | 43 | if [[ $system_info == *"NEON = 1"* ]]; then 44 | 
config="$config NEON" 45 | fi 46 | 47 | if [[ $system_info == *"BLAS = 1"* ]]; then 48 | config="$config BLAS" 49 | fi 50 | 51 | commit=$(git rev-parse --short HEAD) 52 | 53 | printf "| | | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n" 54 | done 55 | 56 | -------------------------------------------------------------------------------- /examples/whisper.objc/whisper.objc/Base.lproj/LaunchScreen.storyboard: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /tests/es-0-ref.txt: -------------------------------------------------------------------------------- 1 | Hola, como están todos? Mi nombre es Julián Virrueta Mendoza y en este podcast les vengo a hablar sobre la contaminación del agua. Bueno, empezaré por decir que el ser humano no está midiendo las consecuencias de sus actos. No hay duda que uno de los mayores problemas a los que se enfrentan muchas poblaciones actualmente es la contaminación del agua. Principalmente porque como bien sabemos el agua prácticamente es fundamental para la vida, por lo que la contaminación puede ser algo muy negativo para el desarrollo tanto económico como social de los pueblos o de las poblaciones próximas en ese lugar contaminado. Los comienzos de la contaminación, como lo definen muchos expertos en la materia, la contaminación del agua es causada por las actividades humanas. Es un fenómeno ambiental de importancia, el cual se comienza a producir desde los primeros intentos de industrialización para transformarse luego en un problema tan habitual como generalizado. Generalmente la contaminación del agua se produce a través de la introducción directa o indirecta en los acuíferos o caos de agua, ríos, mares, lagos, océanos, etc. 
o de diversas sustancias que pueden ser consideradas como contaminantes. Pero existen dos formas principales de contaminación del agua. Una de ellas tiene que ver con la contaminación natural del agua que se corresponde con el ciclo natural de esta durante el que puede entrar en contacto con ciertos constituyentes contaminantes como sustancias minerales y orgánicas disueltas o en suspensión que se vierten en la corteza terrestre, la atmósfera y en las aguas. Pero todo esto se puede contradecir si el ser humano comía sus consecuencias, si no tirara basura a los lagos, a los ríos, no tirara botes de aceite, no contaminara. Bueno amigos, yo los invito a que no contaminen el agua y que sepan cuidar la naturaleza. Los saluda su buen amigo y compañero Julián Virreta. Nos vemos. ¡Claro! -------------------------------------------------------------------------------- /examples/main/README.md: -------------------------------------------------------------------------------- 1 | # main 2 | 3 | This is the main example demonstrating most of the functionality of the Whisper model. 4 | It can be used as a reference for using the `whisper.cpp` library in other projects. 5 | 6 | ``` 7 | ./main -h 8 | 9 | usage: ./main [options] file0.wav file1.wav ... 
10 | 11 | options: 12 | -h, --help [default] show this help message and exit 13 | -t N, --threads N [4 ] number of threads to use during computation 14 | -p N, --processors N [1 ] number of processors to use during computation 15 | -ot N, --offset-t N [0 ] time offset in milliseconds 16 | -on N, --offset-n N [0 ] segment index offset 17 | -d N, --duration N [0 ] duration of audio to process in milliseconds 18 | -mc N, --max-context N [-1 ] maximum number of text context tokens to store 19 | -ml N, --max-len N [0 ] maximum segment length in characters 20 | -wt N, --word-thold N [0.01 ] word timestamp probability threshold 21 | -su, --speed-up [false ] speed up audio by x2 (reduced accuracy) 22 | -tr, --translate [false ] translate from source language to english 23 | -otxt, --output-txt [false ] output result in a text file 24 | -ovtt, --output-vtt [false ] output result in a vtt file 25 | -osrt, --output-srt [false ] output result in a srt file 26 | -owts, --output-words [false ] output script for generating karaoke video 27 | -ps, --print-special [false ] print special tokens 28 | -pc, --print-colors [false ] print colors 29 | -nt, --no-timestamps [true ] do not print timestamps 30 | -l LANG, --language LANG [en ] spoken language 31 | -m FNAME, --model FNAME [models/ggml-base.en.bin] model path 32 | -f FNAME, --file FNAME [ ] input WAV file path 33 | ``` 34 | -------------------------------------------------------------------------------- /examples/whisper.wasm/README.md: -------------------------------------------------------------------------------- 1 | # whisper.wasm 2 | 3 | Inference of [OpenAI's Whisper ASR model](https://github.com/openai/whisper) inside the browser 4 | 5 | This example uses a WebAssembly (WASM) port of the [whisper.cpp](https://github.com/ggerganov/whisper.cpp) 6 | implementation of the transformer to run the inference inside a web page. The audio data does not leave your computer - 7 | it is processed locally on your machine. 
The performance is not great but you should be able to achieve x2 or x3 8 | real-time for the `tiny` and `base` models on a modern CPU and browser (i.e. transcribe 60 seconds of audio in about 9 | 20-30 seconds). 10 | 11 | This WASM port utilizes [WASM SIMD 128-bit intrinsics](https://emcc.zcopy.site/docs/porting/simd/) so you have to make 12 | sure that [your browser supports them](https://webassembly.org/roadmap/). 13 | 14 | The example is capable of running all models up to size `small` inclusive. Beyond that, the memory requirements and 15 | performance are unsatisfactory. The implementation currently supports only the `Greedy` sampling strategy. Both 16 | transcription and translation are supported. 17 | 18 | Since the model data is quite big (74MB for the `tiny` model) you need to manually load the model into the web-page. 19 | 20 | The example supports both loading audio from a file and recording audio from the microphone. The maximum length of the 21 | audio is limited to 120 seconds. 22 | 23 | ## Live demo 24 | 25 | Link: https://whisper.ggerganov.com 26 | 27 | ![image](https://user-images.githubusercontent.com/1991296/197348344-1a7fead8-3dae-4922-8b06-df223a206603.png) 28 | 29 | ## Build instructions 30 | 31 | ```bash (v3.1.2) 32 | # build using Emscripten 33 | git clone https://github.com/ggerganov/whisper.cpp 34 | cd whisper.cpp 35 | mkdir build-em && cd build-em 36 | emcmake cmake .. 37 | make -j 38 | 39 | # copy the produced page to your HTTP path 40 | cp bin/whisper.wasm/* /path/to/html/ 41 | cp bin/libwhisper.worker.js /path/to/html/ 42 | ``` 43 | -------------------------------------------------------------------------------- /tests/en-0-ref.txt: -------------------------------------------------------------------------------- 1 | My fellow Americans, this day has brought terrible news and great sadness to our country. At 9 o'clock this morning, Mission Control in Houston lost contact with our space shuttle, Columbia.
A short time later, debris was seen falling from the skies above Texas. The Colombians lost. There are no survivors. On board was a crew of seven. Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark, Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon, a colonel in the Israeli Air Force. These men and women assumed great risk in the service to all humanity. In an age when spaceflight has come to seem almost routine, it is easy to overlook the dangers of travel by rocket and the difficulties of navigating the fierce outer atmosphere of the Earth. These astronauts knew the dangers, and they faced them willingly, knowing they had a high and noble purpose in life. Because of their courage and daring and idealism, we will miss them all the more. All Americans today are thinking as well of the families of these men and women who have been given this sudden shock and grief. You're not alone. Our entire nation grieves with you. And those you love will always have the respect and gratitude of this country. The cause in which they died will continue. Mankind is led into the darkness beyond our world by the inspiration of discovery and the longing to understand. Our journey into space will go on. In the skies today, we saw destruction and tragedy. Yet farther than we can see, there is comfort and hope. In the words of the prophet Isaiah, "Lift your eyes and look to the heavens. Who created all these? He who brings out the starry hosts one by one and calls them each by name." Because of His great power and mighty strength, not one of them is missing. The same Creator who names the stars also knows the names of the seven souls we mourn today. The crew of the shuttle Columbia did not return safely to Earth, yet we can pray that all are safely home. May God bless the grieving families. And may God continue to bless America. 
[Silence] -------------------------------------------------------------------------------- /tests/en-1-ref.txt: -------------------------------------------------------------------------------- 1 | Henry F. Phillips from Wikipedia, the free encyclopedia at en.wikipedia.org. Henry F. Phillips from Wikipedia, the free encyclopedia. Henry F. Phillips 1890-1958, a U.S. businessman from Portland, Oregon, has the honor of having the Phillips head screw and screwdriver named after him. The importance of the cross head screw design lies in its self-centering property, useful on automated production lines that use powered screwdrivers. Phillips' major contribution was in driving the cross head concept forward to the point where it was adopted by screw makers and automobile companies. Although he received patents for the design in 1936, U.S. Patent #2,046,343, U.S. Patents #2,046,837 to #2,046,840, it was so widely copied that by 1949 Phillips lost his patent. The American Screw Company was responsible for devising a means of manufacturing the screw, and successfully patented and licensed their method. Other screw makers of the 1930s dismissed the Phillips concept since it calls for a relatively complex recessed socket shape in the head of the screw, as distinct from the simple milled slot of a slotted type screw. The Phillips Screw Company and the American Screw Company went on to devise the Pawsadrive screw, which differs from the Phillips in that it is designed to accommodate greater torque than the Phillips. An image accompanied this article, captioned "Phillips Screw Head." The following is an info box which accompanies this article. Info box, part of the series on screw drive types. Slotted, commonly erroneously flat head. Phillips, cross head. Pawsadrive, super drive. Torques. Hex, Allen. Robertson. Tri-wing. Torx set. Spanner head. Triple square, XZN. Others, poly drive, spline drive, double hex. Many images accompanied this info box. 
This page was last modified on the 9th of April, 2008, at 1704. All text is available under the terms of the GNU Free Documentation License. See copyrights for details. Wikipedia is a registered trademark of the Wikimedia Foundation Incorporated, a U.S. registered 501(c)(3) tax-deductible nonprofit charity. This sound file and all text in the article are licensed under the GNU Free Documentation License, available at www.gnu.org/copyleft/fdl.html. -------------------------------------------------------------------------------- /examples/bench/README.md: -------------------------------------------------------------------------------- 1 | # bench 2 | 3 | A very basic tool for benchmarking the inference performance on your device. The tool simply runs the Encoder part of the transformer on some random audio data and records the execution time. This way we can have an objective comparison of the performance of the model for various setups. 4 | 5 | Benchmark results are tracked in the following Github issue: https://github.com/ggerganov/whisper.cpp/issues/89 6 | 7 | ```bash 8 | # build the bench tool 9 | $ make bench 10 | 11 | # run it on the small.en model using 4 threads 12 | $ ./bench -m ./models/ggml-small.en.bin -t 4 13 | 14 | whisper_model_load: loading model from './models/ggml-small.en.bin' 15 | whisper_model_load: n_vocab = 51864 16 | whisper_model_load: n_audio_ctx = 1500 17 | whisper_model_load: n_audio_state = 768 18 | whisper_model_load: n_audio_head = 12 19 | whisper_model_load: n_audio_layer = 12 20 | whisper_model_load: n_text_ctx = 448 21 | whisper_model_load: n_text_state = 768 22 | whisper_model_load: n_text_head = 12 23 | whisper_model_load: n_text_layer = 12 24 | whisper_model_load: n_mels = 80 25 | whisper_model_load: f16 = 1 26 | whisper_model_load: type = 3 27 | whisper_model_load: mem_required = 1048.00 MB 28 | whisper_model_load: adding 1607 extra tokens 29 | whisper_model_load: ggml ctx size = 533.05 MB 30 | whisper_model_load: memory size = 
68.48 MB 31 | whisper_model_load: model size = 464.44 MB 32 | 33 | whisper_print_timings: load time = 240.82 ms 34 | whisper_print_timings: mel time = 0.00 ms 35 | whisper_print_timings: sample time = 0.00 ms 36 | whisper_print_timings: encode time = 1062.21 ms / 88.52 ms per layer 37 | whisper_print_timings: decode time = 0.00 ms / 0.00 ms per layer 38 | whisper_print_timings: total time = 1303.04 ms 39 | 40 | system_info: n_threads = 4 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | 41 | 42 | If you wish, you can submit these results here: 43 | 44 | https://github.com/ggerganov/whisper.cpp/issues/89 45 | 46 | Please include the following information: 47 | 48 | - CPU model 49 | - Operating system 50 | - Compiler 51 | 52 | ``` 53 | -------------------------------------------------------------------------------- /examples/whisper.nvim/whisper.nvim: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # INSTRUCTIONS 4 | # 5 | # This simple script is called by Neovim to capture audio from the microphone and transcribe it with Whisper. 6 | # In order for this to work, you need to clone the whisper.cpp repo and build the 'stream' tool 7 | # 8 | # git clone https://github.com/ggerganov/whisper.cpp 9 | # cd whisper.cpp 10 | # make stream 11 | # 12 | # Also, make sure the current script is in your PATH env variable. 
You should be able to run the following command: 13 | # 14 | # whisper.nvim 15 | # 16 | # Next, export the path to the whisper.cpp repository via the WHISPER_CPP_HOME env variable: 17 | # 18 | # export WHISPER_CPP_HOME=/path/to/whisper.cpp 19 | # 20 | # Finally, add the following lines to your ~/.config/nvim/init.vim: 21 | # 22 | # inoremap :!whisper.nvim:let @a = system("cat /tmp/whisper.nvim \| tail -n 1 \| xargs -0 \| tr -d '\\n' \| sed -e 's/^[[:space:]]*//'")a 23 | # nnoremap :!whisper.nvim:let @a = system("cat /tmp/whisper.nvim \| tail -n 1 \| xargs -0 \| tr -d '\\n' \| sed -e 's/^[[:space:]]*//'")"ap 24 | # vnoremap c:!whisper.nvim:let @a = system("cat /tmp/whisper.nvim \| tail -n 1 \| xargs -0 \| tr -d '\\n' \| sed -e 's/^[[:space:]]*//'")a 25 | # 26 | # This allows you to press Ctrl-G in order to capture audio from the microphone and transcribe it. 27 | # When you are done speaking - press Ctrl-C 28 | # 29 | 30 | # the Whisper model to use 31 | model="base.en" 32 | 33 | # export the path to the whisper.cpp repo in the WHISPER_CPP_HOME env variable 34 | # https://github.com/ggerganov/whisper.cpp 35 | cd ${WHISPER_CPP_HOME} 36 | 37 | if [ ! -f ./stream ] ; then 38 | echo "whisper.nvim: the 'stream' executable was not found! WHISPER_CPP_HOME=${WHISPER_CPP_HOME}" > /tmp/whisper.nvim 39 | exit 1 40 | fi 41 | 42 | if [ ! -f ./models/ggml-${model}.bin ] ; then 43 | echo "whisper.nvim: the '$model' model was not found! 
WHISPER_CPP_HOME=${WHISPER_CPP_HOME}" > /tmp/whisper.nvim 44 | exit 2 45 | fi 46 | 47 | # fine-tune the parameters according to your machine specs 48 | ./stream -t 8 -m models/ggml-${model}.bin --step 350 --length 10000 -f /tmp/whisper.nvim 2> /dev/null 49 | 50 | exit 0 51 | -------------------------------------------------------------------------------- /cmake/BuildTypes.cmake: -------------------------------------------------------------------------------- 1 | # Add new build types 2 | 3 | # ReleaseGG - Release with enabled asserts 4 | 5 | SET(CMAKE_CXX_FLAGS_RELEASEGG 6 | "-O3" 7 | CACHE STRING "Flags used by the c++ compiler during release builds with enabled asserts." 8 | FORCE ) 9 | SET(CMAKE_C_FLAGS_RELEASEGG 10 | "-O3" 11 | CACHE STRING "Flags used by the compiler during release builds with enabled asserts." 12 | FORCE ) 13 | SET(CMAKE_EXE_LINKER_FLAGS_RELEASEGG 14 | "" 15 | CACHE STRING "Flags used for linking binaries during release builds with enabled asserts." 16 | FORCE ) 17 | SET(CMAKE_SHARED_LINKER_FLAGS_RELEASEGG 18 | "" 19 | CACHE STRING "Flags used by the shared libraries linker during release builds with enabled asserts." 20 | FORCE ) 21 | MARK_AS_ADVANCED( 22 | CMAKE_CXX_FLAGS_RELEASEGG 23 | CMAKE_C_FLAGS_RELEASEGG 24 | CMAKE_EXE_LINKER_FLAGS_RELEASEGG 25 | CMAKE_SHARED_LINKER_FLAGS_RELEASEGG ) 26 | 27 | # RelWithDebInfoGG - RelWithDebInfo with enabled asserts 28 | 29 | SET(CMAKE_CXX_FLAGS_RELWITHDEBINFOGG 30 | "-O2 -g" 31 | CACHE STRING "Flags used by the c++ compiler during release builds with debug symbols and enabled asserts." 32 | FORCE ) 33 | SET(CMAKE_C_FLAGS_RELWITHDEBINFOGG 34 | "-O2 -g" 35 | CACHE STRING "Flags used by the compiler during release builds with debug symbols and enabled asserts." 36 | FORCE ) 37 | SET(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG 38 | "" 39 | CACHE STRING "Flags used for linking binaries during release builds with debug symbols and enabled asserts."
40 | FORCE ) 41 | SET(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG 42 | "" 43 | CACHE STRING "Flags used by the shared libraries linker during release builds with debug symbols and enabled asserts." 44 | FORCE ) 45 | MARK_AS_ADVANCED( 46 | CMAKE_CXX_FLAGS_RELWITHDEBINFOGG 47 | CMAKE_C_FLAGS_RELWITHDEBINFOGG 48 | CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG 49 | CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG ) 50 | 51 | if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) 52 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) 53 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "ReleaseGG" "RelWithDebInfoGG") 54 | endif() 55 | -------------------------------------------------------------------------------- /models/download-ggml-model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script downloads Whisper model files that have already been converted to ggml format. 4 | # This way you don't have to convert them yourself. 
5 | 6 | #src="https://ggml.ggerganov.com" 7 | #pfx="ggml-model-whisper" 8 | 9 | src="https://huggingface.co/datasets/ggerganov/whisper.cpp" 10 | pfx="resolve/main/ggml" 11 | 12 | # get the path of this script 13 | function get_script_path() { 14 | if [ -x "$(command -v realpath)" ]; then 15 | echo "$(dirname $(realpath $0))" 16 | else 17 | local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)" 18 | echo "$ret" 19 | fi 20 | } 21 | 22 | models_path=$(get_script_path) 23 | 24 | # Whisper models 25 | models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" ) 26 | 27 | # list available models 28 | function list_models { 29 | printf "\n" 30 | printf " Available models:" 31 | for model in "${models[@]}"; do 32 | printf " $model" 33 | done 34 | printf "\n\n" 35 | } 36 | 37 | if [ "$#" -ne 1 ]; then 38 | printf "Usage: $0 \n" 39 | list_models 40 | 41 | exit 1 42 | fi 43 | 44 | model=$1 45 | 46 | if [[ ! " ${models[@]} " =~ " ${model} " ]]; then 47 | printf "Invalid model: $model\n" 48 | list_models 49 | 50 | exit 1 51 | fi 52 | 53 | # download ggml model 54 | 55 | printf "Downloading ggml model $model from '$src' ...\n" 56 | 57 | cd $models_path 58 | 59 | if [ -f "ggml-$model.bin" ]; then 60 | printf "Model $model already exists. Skipping download.\n" 61 | exit 0 62 | fi 63 | 64 | if [ -x "$(command -v wget)" ]; then 65 | wget --quiet --show-progress -O ggml-$model.bin $src/$pfx-$model.bin 66 | elif [ -x "$(command -v curl)" ]; then 67 | curl -L --output ggml-$model.bin $src/$pfx-$model.bin 68 | else 69 | printf "Either wget or curl is required to download models.\n" 70 | exit 1 71 | fi 72 | 73 | 74 | if [ $? -ne 0 ]; then 75 | printf "Failed to download ggml model $model \n" 76 | printf "Please try again later or download the original Whisper model files and convert them yourself.\n" 77 | exit 1 78 | fi 79 | 80 | printf "Done! 
Model '$model' saved in 'models/ggml-$model.bin'\n" 81 | printf "You can now use it like this:\n\n" 82 | printf " $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n" 83 | printf "\n" 84 | -------------------------------------------------------------------------------- /examples/whisper.objc/whisper.objc/SceneDelegate.m: -------------------------------------------------------------------------------- 1 | // 2 | // SceneDelegate.m 3 | // whisper.objc 4 | // 5 | // Created by Georgi Gerganov on 23.10.22. 6 | // 7 | 8 | #import "SceneDelegate.h" 9 | 10 | @interface SceneDelegate () 11 | 12 | @end 13 | 14 | @implementation SceneDelegate 15 | 16 | 17 | - (void)scene:(UIScene *)scene willConnectToSession:(UISceneSession *)session options:(UISceneConnectionOptions *)connectionOptions { 18 | // Use this method to optionally configure and attach the UIWindow `window` to the provided UIWindowScene `scene`. 19 | // If using a storyboard, the `window` property will automatically be initialized and attached to the scene. 20 | // This delegate does not imply the connecting scene or session are new (see `application:configurationForConnectingSceneSession` instead). 21 | } 22 | 23 | 24 | - (void)sceneDidDisconnect:(UIScene *)scene { 25 | // Called as the scene is being released by the system. 26 | // This occurs shortly after the scene enters the background, or when its session is discarded. 27 | // Release any resources associated with this scene that can be re-created the next time the scene connects. 28 | // The scene may re-connect later, as its session was not necessarily discarded (see `application:didDiscardSceneSessions` instead). 29 | } 30 | 31 | 32 | - (void)sceneDidBecomeActive:(UIScene *)scene { 33 | // Called when the scene has moved from an inactive state to an active state. 34 | // Use this method to restart any tasks that were paused (or not yet started) when the scene was inactive. 
35 | } 36 | 37 | 38 | - (void)sceneWillResignActive:(UIScene *)scene { 39 | // Called when the scene will move from an active state to an inactive state. 40 | // This may occur due to temporary interruptions (ex. an incoming phone call). 41 | } 42 | 43 | 44 | - (void)sceneWillEnterForeground:(UIScene *)scene { 45 | // Called as the scene transitions from the background to the foreground. 46 | // Use this method to undo the changes made on entering the background. 47 | } 48 | 49 | 50 | - (void)sceneDidEnterBackground:(UIScene *)scene { 51 | // Called as the scene transitions from the foreground to the background. 52 | // Use this method to save data, release shared resources, and store enough scene-specific state information 53 | // to restore the scene back to its current state. 54 | } 55 | 56 | 57 | @end 58 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (EMSCRIPTEN) 2 | return() 3 | endif() 4 | 5 | set(TEST_TARGET test-main-tiny) 6 | add_test(NAME ${TEST_TARGET} 7 | COMMAND $ 8 | -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-tiny.bin -l fr 9 | -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) 10 | set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "tiny;gh") 11 | 12 | set(TEST_TARGET test-main-tiny.en) 13 | add_test(NAME ${TEST_TARGET} 14 | COMMAND $ 15 | -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-tiny.en.bin 16 | -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) 17 | set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "tiny;en;gh") 18 | 19 | set(TEST_TARGET test-main-base) 20 | add_test(NAME ${TEST_TARGET} 21 | COMMAND $ 22 | -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-base.bin -l fr 23 | -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) 24 | set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "base") 25 | 26 | set(TEST_TARGET test-main-base.en) 27 | add_test(NAME ${TEST_TARGET} 28 | COMMAND $ 29 | -m 
${PROJECT_SOURCE_DIR}/models/for-tests-ggml-base.en.bin 30 | -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) 31 | set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "base;en") 32 | 33 | set(TEST_TARGET test-main-small) 34 | add_test(NAME ${TEST_TARGET} 35 | COMMAND $ 36 | -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-small.bin -l fr 37 | -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) 38 | set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "small") 39 | 40 | set(TEST_TARGET test-main-small.en) 41 | add_test(NAME ${TEST_TARGET} 42 | COMMAND $ 43 | -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-small.en.bin 44 | -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) 45 | set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "small;en") 46 | 47 | set(TEST_TARGET test-main-medium) 48 | add_test(NAME ${TEST_TARGET} 49 | COMMAND $ 50 | -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-medium.bin -l fr 51 | -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) 52 | set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "medium") 53 | 54 | set(TEST_TARGET test-main-medium.en) 55 | add_test(NAME ${TEST_TARGET} 56 | COMMAND $ 57 | -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-medium.en.bin 58 | -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) 59 | set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "medium;en") 60 | 61 | set(TEST_TARGET test-main-large) 62 | add_test(NAME ${TEST_TARGET} 63 | COMMAND $ 64 | -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-large.bin 65 | -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) 66 | set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "large") 67 | -------------------------------------------------------------------------------- /examples/twitch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Transcribe twitch.tv livestream by feeding audio input to whisper.cpp at regular intervals 4 | # Thanks to @keyehzy 5 | # ref: https://github.com/ggerganov/whisper.cpp/issues/209 6 | # 7 | # The script currently depends on the 
third-party tool "streamlink" 8 | # On macOS, you can install it via "brew install streamlink" 9 | # 10 | 11 | set -eo pipefail 12 | 13 | step=10 14 | model=base.en 15 | threads=4 16 | 17 | help() 18 | { 19 | echo "Example program for captioning a livestream from twitch.tv." 20 | echo 21 | echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]" 22 | echo "options:" 23 | echo "-s Step in seconds (default is $step)." 24 | echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large' (default is '$model')." 25 | echo "-t Number of threads to use." 26 | echo "-h Print this help page." 27 | echo 28 | } 29 | 30 | check_requirements() 31 | { 32 | if ! command -v ./main &>/dev/null; then 33 | echo "whisper.cpp main executable is required (make)" 34 | exit 1 35 | fi 36 | 37 | if ! command -v streamlink &>/dev/null; then 38 | echo "streamlink is required (https://streamlink.github.io)" 39 | exit 1 40 | fi 41 | 42 | if ! command -v ffmpeg &>/dev/null; then 43 | echo "ffmpeg is required (https://ffmpeg.org)" 44 | exit 1 45 | fi 46 | } 47 | 48 | check_requirements 49 | 50 | while getopts ":s:m:t:h" option; do 51 | case $option in 52 | s) 53 | step=$OPTARG;; 54 | m) 55 | model=$OPTARG;; 56 | t) 57 | threads=$OPTARG;; 58 | h) 59 | help 60 | exit;; 61 | \?) 62 | help 63 | exit;; 64 | esac 65 | done 66 | 67 | url=${@:$OPTIND:1} 68 | 69 | if [ -z "$url" ]; then 70 | help 71 | exit 72 | fi 73 | 74 | echo "Piping from streamlink url=$url model=$model step=$step threads=$threads" 75 | streamlink $url best -O 2>/dev/null | ffmpeg -loglevel quiet -i - -y -probesize 32 -ar 16000 -ac 1 -acodec pcm_s16le /tmp/whisper-live0.wav & 76 | 77 | if [ $? -ne 0 ]; then 78 | printf "error: ffmpeg failed\n" 79 | exit 1 80 | fi 81 | 82 | echo "Buffering stream... (this should take $step seconds)" 83 | sleep $(($step)) 84 | 85 | set +e 86 | 87 | echo "Starting..." 
88 | 89 | i=0 90 | SECONDS=0 91 | while true 92 | do 93 | err=1 94 | while [ $err -ne 0 ]; do 95 | if [ $i -gt 0 ]; then 96 | ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.wav -y -ss $(($i*$step-1)).5 -t $step -c copy /tmp/whisper-live.wav 2> /tmp/whisper-live.err 97 | else 98 | ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.wav -y -ss $(($i*$step)) -t $step -c copy /tmp/whisper-live.wav 2> /tmp/whisper-live.err 99 | fi 100 | err=$(cat /tmp/whisper-live.err | wc -l) 101 | done 102 | 103 | ./main -t $threads -m ./models/ggml-$model.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1 104 | 105 | while [ $SECONDS -lt $((($i+1)*$step)) ]; do 106 | sleep 1 107 | done 108 | ((i=i+1)) 109 | done 110 | -------------------------------------------------------------------------------- /examples/bench/bench.cpp: -------------------------------------------------------------------------------- 1 | #include "whisper.h" 2 | 3 | #include <cstdio> 4 | #include <string> 5 | #include <thread> 6 | 7 | // command-line parameters 8 | struct whisper_params { 9 | int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); 10 | 11 | std::string model = "models/ggml-base.en.bin"; 12 | }; 13 | 14 | void whisper_print_usage(int argc, char ** argv, const whisper_params & params); 15 | 16 | bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { 17 | for (int i = 1; i < argc; i++) { 18 | std::string arg = argv[i]; 19 | 20 | if (arg == "-h" || arg == "--help") { 21 | whisper_print_usage(argc, argv, params); 22 | exit(0); 23 | } 24 | else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); } 25 | else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } 26 | else { 27 | fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); 28 | whisper_print_usage(argc, argv, params); 29 | exit(0); 30 | } 31 | } 32 | 33 | return true; 34 | } 35 | 36 | void 
whisper_print_usage(int argc, char ** argv, const whisper_params & params) { 37 | fprintf(stderr, "\n"); 38 | fprintf(stderr, "usage: %s [options]\n", argv[0]); 39 | fprintf(stderr, "\n"); 40 | fprintf(stderr, "options:\n"); 41 | fprintf(stderr, " -h, --help [default] show this help message and exit\n"); 42 | fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads); 43 | fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); 44 | fprintf(stderr, "\n"); 45 | } 46 | 47 | int main(int argc, char ** argv) { 48 | whisper_params params; 49 | 50 | if (whisper_params_parse(argc, argv, params) == false) { 51 | return 1; 52 | } 53 | 54 | // whisper init 55 | 56 | struct whisper_context * ctx = whisper_init(params.model.c_str()); 57 | 58 | { 59 | fprintf(stderr, "\n"); 60 | fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info()); 61 | } 62 | 63 | if (ctx == nullptr) { 64 | fprintf(stderr, "error: failed to initialize whisper context\n"); 65 | return 2; 66 | } 67 | 68 | if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) { 69 | fprintf(stderr, "error: failed to set mel: %d\n", ret); 70 | return 3; 71 | } 72 | 73 | if (int ret = whisper_encode(ctx, 0, params.n_threads)) { 74 | fprintf(stderr, "error: failed to encode model: %d\n", ret); 75 | return 4; 76 | } 77 | 78 | whisper_print_timings(ctx); 79 | whisper_free(ctx); 80 | 81 | fprintf(stderr, "\n"); 82 | fprintf(stderr, "If you wish, you can submit these results here:\n"); 83 | fprintf(stderr, "\n"); 84 | fprintf(stderr, " https://github.com/ggerganov/whisper.cpp/issues/89\n"); 85 | fprintf(stderr, "\n"); 86 | fprintf(stderr, "Please include the following information:\n"); 87 | fprintf(stderr, "\n"); 88 | fprintf(stderr, " - CPU model\n"); 89 | fprintf(stderr, " - Operating system\n"); 90 | fprintf(stderr, " - Compiler\n"); 91 | 
fprintf(stderr, "\n"); 92 | 93 | return 0; 94 | } 95 | -------------------------------------------------------------------------------- /examples/livestream.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Transcribe audio livestream by feeding ffmpeg output to whisper.cpp at regular intervals 4 | # Idea by @semiformal-net 5 | # ref: https://github.com/ggerganov/whisper.cpp/issues/185 6 | # 7 | 8 | set -eo pipefail 9 | 10 | url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8" 11 | fmt=aac # the audio format extension of the stream (TODO: auto detect) 12 | step_s=30 13 | model="base.en" 14 | 15 | check_requirements() 16 | { 17 | if ! command -v ./main &>/dev/null; then 18 | echo "whisper.cpp main executable is required (make)" 19 | exit 1 20 | fi 21 | 22 | if ! command -v ffmpeg &>/dev/null; then 23 | echo "ffmpeg is required (https://ffmpeg.org)" 24 | exit 1 25 | fi 26 | } 27 | 28 | check_requirements 29 | 30 | 31 | if [ -z "$1" ]; then 32 | echo "Usage: $0 stream_url [step_s] [model]" 33 | echo "" 34 | echo " Example:" 35 | echo " $0 $url $step_s $model" 36 | echo "" 37 | echo "No url specified, using default: $url" 38 | else 39 | url="$1" 40 | fi 41 | 42 | if [ -n "$2" ]; then 43 | step_s="$2" 44 | fi 45 | 46 | if [ -n "$3" ]; then 47 | model="$3" 48 | fi 49 | 50 | # Whisper models 51 | models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" ) 52 | 53 | # list available models 54 | function list_models { 55 | printf "\n" 56 | printf " Available models:" 57 | for model in "${models[@]}"; do 58 | printf " $model" 59 | done 60 | printf "\n\n" 61 | } 62 | 63 | if [[ ! 
" ${models[@]} " =~ " ${model} " ]]; then 64 | printf "Invalid model: $model\n" 65 | list_models 66 | 67 | exit 1 68 | fi 69 | 70 | running=1 71 | 72 | trap "running=0" SIGINT SIGTERM 73 | 74 | printf "[+] Transcribing stream with model '$model', step_s $step_s (press Ctrl+C to stop):\n\n" 75 | 76 | # continuous stream in native fmt (this file will grow forever!) 77 | ffmpeg -loglevel quiet -y -re -probesize 32 -i $url -c copy /tmp/whisper-live0.${fmt} & 78 | if [ $? -ne 0 ]; then 79 | printf "Error: ffmpeg failed to capture audio stream\n" 80 | exit 1 81 | fi 82 | 83 | printf "Buffering audio. Please wait...\n\n" 84 | sleep $(($step_s)) 85 | 86 | # do not stop script on error 87 | set +e 88 | 89 | i=0 90 | SECONDS=0 91 | while [ $running -eq 1 ]; do 92 | # extract the next piece from the main file above and transcode to wav. -ss sets start time and nudges it by -0.5s to catch missing words (??) 93 | err=1 94 | while [ $err -ne 0 ]; do 95 | if [ $i -gt 0 ]; then 96 | ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.${fmt} -y -ar 16000 -ac 1 -c:a pcm_s16le -ss $(($i*$step_s-1)).5 -t $step_s /tmp/whisper-live.wav 2> /tmp/whisper-live.err 97 | else 98 | ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.${fmt} -y -ar 16000 -ac 1 -c:a pcm_s16le -ss $(($i*$step_s)) -t $step_s /tmp/whisper-live.wav 2> /tmp/whisper-live.err 99 | fi 100 | err=$(cat /tmp/whisper-live.err | wc -l) 101 | done 102 | 103 | ./main -t 8 -m ./models/ggml-base.en.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1 104 | 105 | while [ $SECONDS -lt $((($i+1)*$step_s)) ]; do 106 | sleep 1 107 | done 108 | ((i=i+1)) 109 | done 110 | 111 | killall -v ffmpeg 112 | killall -v main 113 | -------------------------------------------------------------------------------- /models/README.md: -------------------------------------------------------------------------------- 1 | ## Whisper model files in custom ggml format 2 | 3 | The [original 
Whisper PyTorch models provided by OpenAI](https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17-L27) 4 | have been converted to custom `ggml` format in order to be able to load them in C/C++. The conversion has been performed 5 | using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either obtain the original models and generate 6 | the `ggml` files yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh) 7 | script to download the already converted models. Currently, they are hosted at the following locations: 8 | 9 | - https://huggingface.co/datasets/ggerganov/whisper.cpp 10 | - https://ggml.ggerganov.com 11 | 12 | Sample usage: 13 | 14 | ```bash 15 | $ ./download-ggml-model.sh base.en 16 | Downloading ggml model base.en ... 17 | models/ggml-base.en.bin 100%[=============================================>] 141.11M 5.41MB/s in 22s 18 | Done! Model 'base.en' saved in 'models/ggml-base.en.bin' 19 | You can now use it like this: 20 | 21 | $ ./main -m models/ggml-base.en.bin -f samples/jfk.wav 22 | ``` 23 | 24 | A third option to obtain the model files is to download them from Hugging Face: 25 | 26 | https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main 27 | 28 | ## Available models 29 | 30 | | Model | Disk | Mem | SHA | 31 | | --- | --- | --- | --- | 32 | | tiny | 75 MB | ~390 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` | 33 | | tiny.en | 75 MB | ~390 MB | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` | 34 | | base | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` | 35 | | base.en | 142 MB | ~500 MB | `137c40403d78fd54d454da0f9bd998f78703390c` | 36 | | small | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` | 37 | | small.en | 466 MB | ~1.0 GB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` | 38 | | medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` | 39 | | medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` 
| 40 | | large-v1 | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` | 41 | | large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` | 42 | 43 | ## Model files for testing purposes 44 | 45 | The model files prefixed with `for-tests-` are empty (i.e. they do not contain any weights) and are used by the CI for 46 | testing purposes. They are included directly in this repository for convenience, and the GitHub Actions CI uses them to 47 | run various sanitizer tests. 48 | 49 | ## Fine-tuned models 50 | 51 | There are community efforts for creating fine-tuned Whisper models using extra training data. For example, this 52 | [blog post](https://huggingface.co/blog/fine-tune-whisper) describes a method for fine-tuning using the Hugging Face (HF) 53 | Transformers implementation of Whisper. The produced models are in a slightly different format than the original 54 | OpenAI format. To read the HF models, you can use the [convert-h5-to-ggml.py](convert-h5-to-ggml.py) script like this: 55 | 56 | ```bash 57 | git clone https://github.com/openai/whisper 58 | git clone https://github.com/ggerganov/whisper.cpp 59 | 60 | # clone HF fine-tuned model (this is just an example) 61 | git clone https://huggingface.co/openai/whisper-base.en 62 | 63 | # convert the model to ggml 64 | python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-base.en/ ./whisper . 65 | ``` 66 | -------------------------------------------------------------------------------- /examples/talk.wasm/README.md: -------------------------------------------------------------------------------- 1 | # talk.wasm 2 | 3 | Talk with an Artificial Intelligence in your browser: 4 | 5 | [https://user-images.githubusercontent.com/1991296/203411580-fedb4839-05e4-4474-8364-aaf1e9a9b615.mp4](https://user-images.githubusercontent.com/1991296/203845553-f7b44e13-9a15-4fc8-b518-ae8f4c6770fe.mp4) 6 | 7 | Online demo: https://whisper.ggerganov.com/talk/ 8 | 9 | ## How does it work? 
10 | 11 | This demo leverages two modern neural network models to create a high-quality voice chat directly in your browser: 12 | 13 | - [OpenAI's Whisper](https://github.com/openai/whisper) speech recognition model is used to process your voice and understand what you are saying 14 | - Upon receiving some voice input, the AI generates a text response using [OpenAI's GPT-2](https://github.com/openai/gpt-2) language model 15 | - The AI then vocalizes the response using the browser's [Web Speech API](https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API) 16 | 17 | The web page does all the processing locally on your machine. Running these heavy neural network models in the 18 | browser is made possible by implementing them efficiently in C/C++ and using the browser's WebAssembly SIMD capabilities for 19 | extra performance: 20 | 21 | - The Whisper C++ implementation is here: [whisper.h](/whisper.h) / [whisper.cpp](/whisper.cpp) 22 | - The GPT-2 C++ implementation is here: [gpt-2.h](gpt-2.h) / [gpt-2.cpp](gpt-2.cpp) 23 | - Both models use a custom tensor library implemented in C: [ggml.h](/ggml.h) / [ggml.c](/ggml.c) 24 | - The HTML/JS layer is here: [index-tmpl.html](index-tmpl.html) 25 | - The Emscripten bridge between C/C++ and JS is here: [emscripten.cpp](emscripten.cpp) 26 | 27 | In order to run the models, the web page first needs to download the model data, which is about 350 MB. The model data 28 | is then cached in your browser's cache and can be reused in future visits without downloading it again. 
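The download-then-cache flow described above can be sketched roughly as follows. This is a simplified, hypothetical illustration rather than the demo's actual code: the model URL, the cache name, and the injected `cache`/`fetchFn` parameters are made up for the example.

```javascript
// Sketch of a cache-then-network model download. The cache and the fetch
// function are passed in as parameters, so the logic itself is runtime-agnostic;
// MODEL_URL is an illustrative placeholder, not the demo's real file name.
const MODEL_URL = "ggml-model.bin";

async function loadModelData(cache, fetchFn, url = MODEL_URL) {
  const cached = await cache.match(url); // hit: reuse the previous download
  if (cached) {
    return cached;
  }
  const response = await fetchFn(url);   // miss: download once (~350 MB)
  await cache.put(url, response);        // store it for future visits
  return response;
}

// In a browser this would be driven by the Cache Storage API, e.g.:
//
//   const cache = await caches.open("whisper-models");
//   const data  = await loadModelData(cache, fetch);
```

The injected parameters are only there to keep the sketch self-contained; the real demo wires this up through its Emscripten/JS layer.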
29 | 30 | ## Requirements 31 | 32 | In order to run this demo efficiently, you need to have the following: 33 | 34 | - Latest Chrome or Firefox browser (Safari is not supported) 35 | - Run this on a desktop or laptop with a modern CPU (a mobile phone will likely not be good enough) 36 | - Speak phrases that are no longer than 10 seconds - this is the audio context of the AI 37 | - The web page uses about 1.6 GB of RAM 38 | 39 | Notice that this demo is using the smallest GPT-2 model, so the generated text responses are not always very good. 40 | Also, the prompting strategy can likely be improved to achieve better results. 41 | 42 | The demo is quite computationally heavy, so you need a fast CPU. It's unusual to run these transformer models in a 43 | browser. Typically, they run on powerful GPUs. 44 | 45 | Currently, mobile browsers do not support the fixed-width SIMD WebAssembly capability, so you cannot run this demo 46 | on a phone or a tablet. Hopefully, this will become supported in the near future. 47 | 48 | ## Todo 49 | 50 | - Better UI (contributions are welcome) 51 | - Better GPT-2 prompting 52 | 53 | ## Build instructions 54 | 55 | ```bash 56 | # build using Emscripten (v3.1.2) 57 | git clone https://github.com/ggerganov/whisper.cpp 58 | cd whisper.cpp 59 | mkdir build-em && cd build-em 60 | emcmake cmake .. 61 | make -j 62 | 63 | # copy the produced page to your HTTP path 64 | cp bin/talk.wasm/* /path/to/html/ 65 | cp bin/libtalk.worker.js /path/to/html/ 66 | ``` 67 | 68 | ## Feedback 69 | 70 | If you have any comments or ideas for improvement, please drop a comment in the following discussion: 71 | 72 | https://github.com/ggerganov/whisper.cpp/discussions/167 73 | -------------------------------------------------------------------------------- /tests/run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script runs the selected model against a collection of audio files from the web. 
4 | # It downloads, converts and transcribes each file and then compares the result with the expected reference 5 | # transcription. The comparison is performed using git's diff command and shows the differences at the character level. 6 | # It can be used to quickly verify that the model is working as expected across a wide range of audio files. 7 | # I.e. like an integration test. The verification is done by visual inspection of the diff output. 8 | # 9 | # The reference data can be for example generated using the original OpenAI Whisper implementation, or entered manually. 10 | # 11 | # Feel free to suggest extra audio files to add to the list. 12 | # Make sure they are between 1-3 minutes long since we don't want to make the test too slow. 13 | # 14 | # Usage: 15 | # 16 | # ./tests/run-tests.sh 17 | # 18 | 19 | cd `dirname $0` 20 | 21 | # Whisper models 22 | models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" ) 23 | 24 | # list available models 25 | function list_models { 26 | printf "\n" 27 | printf " Available models:" 28 | for model in "${models[@]}"; do 29 | printf " $model" 30 | done 31 | printf "\n\n" 32 | } 33 | 34 | if [ $# -eq 0 ]; then 35 | printf "Usage: $0 [model]\n\n" 36 | printf "No model specified. Aborting\n" 37 | list_models 38 | exit 1 39 | fi 40 | 41 | model=$1 42 | main="../main" 43 | 44 | if [ ! -f ../models/ggml-$model.bin ]; then 45 | printf "Model $model not found. Aborting\n" 46 | list_models 47 | exit 1 48 | fi 49 | 50 | if [ ! -f $main ]; then 51 | printf "Executable $main not found. 
Aborting\n" 52 | exit 1 53 | fi 54 | 55 | # add various audio files for testing purposes here 56 | # the order of the files is important so don't change the existing order 57 | # when adding new files, make sure to add the expected "ref.txt" file with the correct transcript 58 | urls_en=( 59 | "https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg" 60 | "https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg" 61 | "https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav" 62 | ) 63 | 64 | urls_es=( 65 | "https://upload.wikimedia.org/wikipedia/commons/c/c1/La_contaminacion_del_agua.ogg" 66 | ) 67 | 68 | urls_it=( 69 | ) 70 | 71 | urls_pt=( 72 | ) 73 | 74 | urls_de=( 75 | ) 76 | 77 | urls_jp=( 78 | ) 79 | 80 | urls_ru=( 81 | ) 82 | 83 | function run_lang() { 84 | lang=$1 85 | shift 86 | urls=("$@") 87 | 88 | i=0 89 | for url in "${urls[@]}"; do 90 | echo "- [$lang] Processing '$url' ..." 91 | 92 | ext="${url##*.}" 93 | fname_src="$lang-${i}.${ext}" 94 | fname_dst="$lang-${i}-16khz.wav" 95 | 96 | if [ ! -f $fname_src ]; then 97 | wget --quiet --show-progress -O $fname_src $url 98 | fi 99 | 100 | if [ ! -f $fname_dst ]; then 101 | ffmpeg -loglevel -0 -y -i $fname_src -ar 16000 -ac 1 -c:a pcm_s16le $fname_dst 102 | if [ $? -ne 0 ]; then 103 | echo "Error: ffmpeg failed to convert $fname_src to $fname_dst" 104 | exit 1 105 | fi 106 | fi 107 | 108 | $main -m ../models/ggml-$model.bin -f $fname_dst -l $lang -otxt 2> /dev/null 109 | 110 | git diff --no-index --word-diff=color --word-diff-regex=. 
$lang-$i-ref.txt $fname_dst.txt 111 | 112 | i=$(($i+1)) 113 | done 114 | } 115 | 116 | run_lang "en" "${urls_en[@]}" 117 | 118 | if [[ $model != *.en ]]; then 119 | run_lang "es" "${urls_es[@]}" 120 | run_lang "it" "${urls_it[@]}" 121 | run_lang "pt" "${urls_pt[@]}" 122 | run_lang "de" "${urls_de[@]}" 123 | run_lang "jp" "${urls_jp[@]}" 124 | run_lang "ru" "${urls_ru[@]}" 125 | fi 126 | -------------------------------------------------------------------------------- /bindings/javascript/emscripten.cpp: -------------------------------------------------------------------------------- 1 | #include "whisper.h" 2 | 3 | #include <emscripten.h> 4 | #include <emscripten/bind.h> 5 | 6 | #include <thread> 7 | #include <vector> 8 | 9 | std::thread g_worker; 10 | 11 | std::vector<struct whisper_context *> g_contexts(4, nullptr); 12 | 13 | EMSCRIPTEN_BINDINGS(whisper) { 14 | emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { 15 | if (g_worker.joinable()) { 16 | g_worker.join(); 17 | } 18 | 19 | for (size_t i = 0; i < g_contexts.size(); ++i) { 20 | if (g_contexts[i] == nullptr) { 21 | g_contexts[i] = whisper_init(path_model.c_str()); 22 | if (g_contexts[i] != nullptr) { 23 | return i + 1; 24 | } else { 25 | return (size_t) 0; 26 | } 27 | } 28 | } 29 | 30 | return (size_t) 0; 31 | })); 32 | 33 | emscripten::function("free", emscripten::optional_override([](size_t index) { 34 | if (g_worker.joinable()) { 35 | g_worker.join(); 36 | } 37 | 38 | --index; 39 | 40 | if (index < g_contexts.size()) { 41 | whisper_free(g_contexts[index]); 42 | g_contexts[index] = nullptr; 43 | } 44 | })); 45 | 46 | emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) { 47 | if (g_worker.joinable()) { 48 | g_worker.join(); 49 | } 50 | 51 | --index; 52 | 53 | if (index >= g_contexts.size()) { 54 | return -1; 55 | } 56 | 57 | if (g_contexts[index] == nullptr) { 58 | return -2; 59 | } 60 | 61 | struct whisper_full_params params = 
whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY); 62 | 63 | params.print_realtime = true; 64 | params.print_progress = false; 65 | params.print_timestamps = true; 66 | params.print_special = false; 67 | params.translate = translate; 68 | params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en"; 69 | params.n_threads = std::min(8, (int) std::thread::hardware_concurrency()); 70 | params.offset_ms = 0; 71 | 72 | std::vector<float> pcmf32; 73 | const int n = audio["length"].as<int>(); 74 | 75 | emscripten::val heap = emscripten::val::module_property("HEAPU8"); 76 | emscripten::val memory = heap["buffer"]; 77 | 78 | pcmf32.resize(n); 79 | 80 | emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(pcmf32.data()), n); 81 | memoryView.call<void>("set", audio); 82 | 83 | // print system information 84 | { 85 | printf("system_info: n_threads = %d / %d | %s\n", 86 | params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info()); 87 | 88 | printf("%s: processing %d samples, %.1f sec, %d threads, %d processors, lang = %s, task = %s ...\n", 89 | __func__, int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, 90 | params.n_threads, 1, 91 | params.language, 92 | params.translate ? 
"translate" : "transcribe"); 93 | 94 | printf("\n"); 95 | } 96 | 97 | // run the worker 98 | { 99 | g_worker = std::thread([index, params, pcmf32 = std::move(pcmf32)]() { 100 | whisper_reset_timings(g_contexts[index]); 101 | whisper_full(g_contexts[index], params, pcmf32.data(), pcmf32.size()); 102 | whisper_print_timings(g_contexts[index]); 103 | }); 104 | } 105 | 106 | return 0; 107 | })); 108 | } 109 | -------------------------------------------------------------------------------- /examples/whisper.nvim/README.md: -------------------------------------------------------------------------------- 1 | # whisper.nvim 2 | 3 | Speech-to-text in Neovim 4 | 5 | The transcription is performed on the CPU and no data leaves your computer. Works best on Apple Silicon devices. 6 | 7 | https://user-images.githubusercontent.com/1991296/198382564-784e9663-2037-4d04-99b8-f39136929b7e.mp4 8 | 9 | ## Usage 10 | 11 | - Simply press `Ctrl-G` in `INSERT`, `VISUAL` or `NORMAL` mode and say something 12 | - When you are done - press `Ctrl-C` to end the transcription and insert the transcribed text under the cursor 13 | 14 | ## Installation 15 | 16 | *Note: this is a bit tedious and hacky atm, but I hope it will be improved with time* 17 | 18 | - Clone this repo and build the `stream` tool: 19 | 20 | ``` 21 | git clone https://github.com/ggerganov/whisper.cpp 22 | cd whisper.cpp 23 | make stream 24 | ``` 25 | 26 | - Download the `base.en` Whisper model (140 MB): 27 | 28 | ``` 29 | ./models/download-ggml-model.sh base.en 30 | ``` 31 | 32 | - Place the [whisper.nvim](whisper.nvim) script somewhere in your PATH and give it execute permissions: 33 | 34 | ``` 35 | cp examples/whisper.nvim/whisper.nvim ~/bin/ 36 | chmod u+x ~/bin/whisper.nvim 37 | ``` 38 | 39 | - Fine-tune the script to your preference and machine parameters: 40 | 41 | ``` 42 | ./stream -t 8 -m models/ggml-base.en.bin --step 350 --length 10000 -f /tmp/whisper.nvim 2> /dev/null 43 | ``` 44 | 45 | On slower machines, try to 
increase the `step` parameter. 46 | 47 | - Add the following shortcuts to your `~/.config/nvim/init.vim`: 48 | 49 | ``` 50 | inoremap :!whisper.nvim:let @a = system("cat /tmp/whisper.nvim \| tail -n 1 \| xargs -0 \| tr -d '\\n' \| sed -e 's/^[[:space:]]*//'")a 51 | nnoremap :!whisper.nvim:let @a = system("cat /tmp/whisper.nvim \| tail -n 1 \| xargs -0 \| tr -d '\\n' \| sed -e 's/^[[:space:]]*//'")"ap 52 | vnoremap c:!whisper.nvim:let @a = system("cat /tmp/whisper.nvim \| tail -n 1 \| xargs -0 \| tr -d '\\n' \| sed -e 's/^[[:space:]]*//'")a 53 | ``` 54 | 55 | Explanation: pressing `Ctrl-G` runs the [whisper.nvim](whisper.nvim) script which in turn calls the `stream` binary to transcribe your speech through the microphone. The results from the transcription are continuously dumped into `/tmp/whisper.nvim`. After you kill the program with `Ctrl-C`, the vim command grabs the last line from the `/tmp/whisper.nvim` file and puts it under the cursor. 56 | 57 | Probably there is a much more intelligent way to achieve all this, but this is what I could hack in an hour. Any suggestions how to improve this are welcome. 58 | 59 | You are now ready to use speech-to-text in Neovim! 60 | 61 | ## TODO 62 | 63 | There are a lot of ways to improve this idea and I don't have much experience with Vim plugin programming, so contributions are welcome! 
64 | 65 | - [ ] **Wrap this into a plugin** 66 | 67 | It would be great to make a standalone plugin out of this that can be installed with `vim-plug` or similar 68 | 69 | - [ ] **Simplify the `init.vim` mappings (maybe factor out the common call into a separate function)** 70 | - [ ] **Add Copilot/GPT-3 integration** 71 | 72 | This is probably a very long shot, but I think it would be very cool to have the functionality to select some code and then hit Ctrl-G and say something like: 73 | 74 | *"refactor this using stl containers"* 75 | 76 | or 77 | 78 | *"optimize by sorting the data first"* 79 | 80 | The plugin would then make an appropriate query using the selected text and code context to Copilot or GPT-3 and return the result. 81 | 82 | Here is a proof-of-concept: 83 | 84 | https://user-images.githubusercontent.com/1991296/199078847-0278fcde-5667-4748-ba0d-7d55381d6047.mp4 85 | 86 | https://user-images.githubusercontent.com/1991296/200067939-f98d2ac2-7519-438a-85f9-79db0841ba4f.mp4 87 | 88 | For an explanation of how this works, see: https://twitter.com/ggerganov/status/1587168771789258756 89 | 90 | ## Discussion 91 | 92 | If you find this idea interesting, you can join the discussion here: https://github.com/ggerganov/whisper.cpp/discussions/108 93 | -------------------------------------------------------------------------------- /examples/yt-wsp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Small shell script to make it easier to automatically download and transcribe live stream VODs. 4 | # This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggerganov/whisper.cpp 5 | # Use `./examples/yt-wsp.sh help` to print help info. 
6 | # 7 | # Sample usage: 8 | # 9 | # git clone https://github.com/ggerganov/whisper.cpp 10 | # cd whisper.cpp 11 | # make 12 | # ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890 13 | # 14 | 15 | # MIT License 16 | 17 | # Copyright (c) 2022 Daniils Petrovs 18 | 19 | # Permission is hereby granted, free of charge, to any person obtaining a copy 20 | # of this software and associated documentation files (the "Software"), to deal 21 | # in the Software without restriction, including without limitation the rights 22 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 23 | # copies of the Software, and to permit persons to whom the Software is 24 | # furnished to do so, subject to the following conditions: 25 | 26 | # The above copyright notice and this permission notice shall be included in all 27 | # copies or substantial portions of the Software. 28 | 29 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 32 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 33 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 34 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 35 | # SOFTWARE. 36 | 37 | set -Eeuo pipefail 38 | 39 | # You can find how to download models in the OG repo: https://github.com/ggerganov/whisper.cpp/#usage 40 | MODEL_PATH="${MODEL_PATH:-models/ggml-base.en.bin}" # Set to a multilingual model if you want to translate from foreign lang to en 41 | WHISPER_EXECUTABLE="${WHISPER_EXECUTABLE:-whisper}" # Where to find the whisper.cpp executable 42 | WHISPER_LANG="${WHISPER_LANG:-en}" # Set to desired lang to translate from 43 | 44 | msg() { 45 | echo >&2 -e "${1-}" 46 | } 47 | 48 | cleanup() { 49 | msg "Cleaning up..." 
50 | rm -rf "${temp_dir}" "vod-resampled.wav" "vod-resampled.wav.srt" 51 | } 52 | 53 | print_help() { 54 | echo "Usage: ./examples/yt-wsp.sh " 55 | echo "See configurable env variables in the script" 56 | echo "This will produce an MP4 muxed file called res.mp4 in the working directory" 57 | echo "Requirements: ffmpeg yt-dlp whisper" 58 | echo "Whisper needs to be built into the main binary with make, then you can rename it to something like 'whisper' and add it to your PATH for convenience." 59 | echo "E.g. in the root of Whisper.cpp, run: 'make && cp ./main /usr/local/bin/whisper'" 60 | } 61 | 62 | check_requirements() { 63 | if ! command -v ffmpeg &>/dev/null; then 64 | echo "ffmpeg is required (https://ffmpeg.org)." 65 | exit 1 66 | fi 67 | 68 | if ! command -v yt-dlp &>/dev/null; then 69 | echo "yt-dlp is required (https://github.com/yt-dlp/yt-dlp)." 70 | exit 1 71 | fi 72 | 73 | if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then 74 | WHISPER_EXECUTABLE="./main" 75 | if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then 76 | echo "Whisper is required (https://github.com/ggerganov/whisper.cpp):" 77 | echo "Sample usage:" 78 | echo "" 79 | echo " git clone https://github.com/ggerganov/whisper.cpp" 80 | echo " cd whisper.cpp" 81 | echo " make" 82 | echo " ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890" 83 | echo "" 84 | exit 1 85 | fi 86 | fi 87 | } 88 | 89 | if [[ $# -lt 1 ]]; then 90 | print_help 91 | exit 1 92 | fi 93 | 94 | if [[ "$1" == "help" ]]; then 95 | print_help 96 | exit 0 97 | fi 98 | 99 | temp_dir="tmp" 100 | source_url="$1" 101 | 102 | check_requirements 103 | 104 | msg "Downloading VOD..." 
105 | 106 | # Optionally add --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER] for members only VODs 107 | yt-dlp \ 108 | -f "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best" \ 109 | --embed-thumbnail \ 110 | --embed-chapters \ 111 | --xattrs \ 112 | "${source_url}" -o "${temp_dir}/vod.mp4" 113 | 114 | msg "Extracting audio and resampling..." 115 | 116 | ffmpeg -i "${temp_dir}/vod.mp4" \ 117 | -hide_banner \ 118 | -loglevel error \ 119 | -ar 16000 \ 120 | -ac 1 \ 121 | -c:a \ 122 | pcm_s16le -y "vod-resampled.wav" 123 | 124 | msg "Transcribing to subtitle file..." 125 | msg "Whisper specified at: ${WHISPER_EXECUTABLE}" 126 | 127 | $WHISPER_EXECUTABLE \ 128 | -m "${MODEL_PATH}" \ 129 | -l "${WHISPER_LANG}" \ 130 | -f "vod-resampled.wav" \ 131 | -t 8 \ 132 | -osrt \ 133 | --translate 134 | 135 | msg "Embedding subtitle track..." 136 | 137 | ffmpeg -i "${temp_dir}/vod.mp4" \ 138 | -hide_banner \ 139 | -loglevel error \ 140 | -i "vod-resampled.wav.srt" \ 141 | -c copy \ 142 | -c:s mov_text \ 143 | -y res.mp4 144 | 145 | cleanup 146 | 147 | msg "Done! 
Your finished file is ready: res.mp4" 148 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push] 3 | 4 | jobs: 5 | ubuntu-latest: 6 | runs-on: ubuntu-latest 7 | 8 | steps: 9 | - name: Clone 10 | uses: actions/checkout@v1 11 | 12 | - name: Dependencies 13 | run: | 14 | sudo apt-get update 15 | sudo apt-get install build-essential 16 | sudo apt-get install libsdl2-dev 17 | 18 | - name: Build 19 | run: | 20 | make 21 | make stream 22 | 23 | macOS-latest: 24 | runs-on: macOS-latest 25 | 26 | steps: 27 | - name: Clone 28 | uses: actions/checkout@v1 29 | 30 | - name: Dependencies 31 | run: | 32 | brew update 33 | brew install sdl2 34 | 35 | - name: Build 36 | run: | 37 | make 38 | make stream 39 | 40 | ubuntu-latest-gcc: 41 | runs-on: ubuntu-latest 42 | 43 | strategy: 44 | matrix: 45 | build: [Debug, Release] 46 | 47 | steps: 48 | - name: Clone 49 | uses: actions/checkout@v1 50 | 51 | - name: Dependencies 52 | run: | 53 | sudo apt-get update 54 | sudo apt-get install build-essential 55 | sudo apt-get install cmake 56 | sudo apt-get install libsdl2-dev 57 | 58 | - name: Configure 59 | run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} 60 | 61 | - name: Build 62 | run: | 63 | make 64 | ctest -L gh --output-on-failure 65 | 66 | ubuntu-latest-clang: 67 | runs-on: ubuntu-latest 68 | 69 | strategy: 70 | matrix: 71 | build: [Debug, Release] 72 | 73 | steps: 74 | - name: Clone 75 | uses: actions/checkout@v1 76 | 77 | - name: Dependencies 78 | run: | 79 | sudo apt-get update 80 | sudo apt-get install build-essential 81 | sudo apt-get install cmake 82 | sudo apt-get install libsdl2-dev 83 | 84 | - name: Configure 85 | run: cmake . 
-DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang 86 | 87 | - name: Build 88 | run: | 89 | make 90 | ctest -L gh --output-on-failure 91 | 92 | ubuntu-latest-gcc-sanitized: 93 | runs-on: ubuntu-latest 94 | 95 | strategy: 96 | matrix: 97 | sanitizer: [ADDRESS, THREAD, UNDEFINED] 98 | 99 | steps: 100 | - name: Clone 101 | uses: actions/checkout@v1 102 | 103 | - name: Dependencies 104 | run: | 105 | sudo apt-get update 106 | sudo apt-get install build-essential 107 | sudo apt-get install cmake 108 | 109 | - name: Configure 110 | run: cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON 111 | 112 | - name: Build 113 | run: | 114 | make 115 | ctest -L gh --output-on-failure 116 | 117 | windows: 118 | runs-on: windows-latest 119 | 120 | strategy: 121 | matrix: 122 | build: [RelWithDebInfo] 123 | arch: [Win32, x64] 124 | blas: [ON] 125 | sdl2: [ON] 126 | include: 127 | - arch: Win32 128 | obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip 129 | s2arc: x86 130 | - arch: x64 131 | obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip 132 | s2arc: x64 133 | - sdl2: ON 134 | s2ver: 2.26.0 135 | 136 | steps: 137 | - name: Clone 138 | uses: actions/checkout@v1 139 | 140 | - name: Add msbuild to PATH 141 | uses: microsoft/setup-msbuild@v1 142 | 143 | - name: Fetch OpenBLAS 144 | if: matrix.blas == 'ON' 145 | run: | 146 | C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }} 147 | 7z x blas.zip -oblas -y 148 | copy blas/include/cblas.h . 149 | copy blas/include/openblas_config.h . 
150 | echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV 151 | 152 | - name: Fetch SDL2 and set SDL2_DIR 153 | if: matrix.sdl2 == 'ON' 154 | run: | 155 | C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip 156 | 7z x sdl2.zip 157 | echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV 158 | 159 | - name: Configure 160 | run: > 161 | cmake -S . -B ./build -A ${{ matrix.arch }} 162 | -DCMAKE_BUILD_TYPE=${{ matrix.build }} 163 | -DWHISPER_SUPPORT_OPENBLAS=${{ matrix.blas }} 164 | -DCMAKE_LIBRARY_PATH="$env:blasdir/lib" 165 | -DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }} 166 | 167 | - name: Build 168 | run: | 169 | cd ./build 170 | msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }} 171 | 172 | - name: Copy libopenblas.dll 173 | if: matrix.blas == 'ON' 174 | run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }} 175 | 176 | - name: Copy SDL2.dll 177 | if: matrix.sdl2 == 'ON' 178 | run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }} 179 | 180 | - name: Upload binaries 181 | if: matrix.blas == 'ON' && matrix.sdl2 == 'ON' 182 | uses: actions/upload-artifact@v1 183 | with: 184 | name: whisper-bin-${{ matrix.arch }} 185 | path: build/bin/${{ matrix.build }} 186 | -------------------------------------------------------------------------------- /examples/helpers.js: -------------------------------------------------------------------------------- 1 | // Common JavaScript functions used by the examples 2 | 3 | function convertTypedArray(src, type) { 4 | var buffer = new ArrayBuffer(src.byteLength); 5 | var baseView = new src.constructor(buffer).set(src); 6 | return new type(buffer); 7 | } 8 | 9 | var printTextarea = (function() { 10 | var element = document.getElementById('output'); 11 | if (element) element.value = ''; // 
clear browser cache 12 | return function(text) { 13 | if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' '); 14 | console.log(text); 15 | if (element) { 16 | element.value += text + "\n"; 17 | element.scrollTop = element.scrollHeight; // focus on bottom 18 | } 19 | }; 20 | })(); 21 | 22 | async function clearCache() { 23 | if (confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.')) { 24 | indexedDB.deleteDatabase(dbName); 25 | } 26 | } 27 | 28 | // fetch a remote file from remote URL using the Fetch API 29 | async function fetchRemote(url, cbProgress, cbPrint) { 30 | cbPrint('fetchRemote: downloading with fetch()...'); 31 | 32 | const response = await fetch( 33 | url, 34 | { 35 | method: 'GET', 36 | headers: { 37 | 'Content-Type': 'application/octet-stream', 38 | }, 39 | } 40 | ); 41 | 42 | if (!response.ok) { 43 | cbPrint('fetchRemote: failed to fetch ' + url); 44 | return; 45 | } 46 | 47 | const contentLength = response.headers.get('content-length'); 48 | const total = parseInt(contentLength, 10); 49 | const reader = response.body.getReader(); 50 | 51 | var chunks = []; 52 | var receivedLength = 0; 53 | var progressLast = -1; 54 | 55 | while (true) { 56 | const { done, value } = await reader.read(); 57 | 58 | if (done) { 59 | break; 60 | } 61 | 62 | chunks.push(value); 63 | receivedLength += value.length; 64 | 65 | if (contentLength) { 66 | cbProgress(receivedLength/total); 67 | 68 | var progressCur = Math.round((receivedLength / total) * 10); 69 | if (progressCur != progressLast) { 70 | cbPrint('fetchRemote: fetching ' + 10*progressCur + '% ...'); 71 | progressLast = progressCur; 72 | } 73 | } 74 | } 75 | 76 | var position = 0; 77 | var chunksAll = new Uint8Array(receivedLength); 78 | 79 | for (var chunk of chunks) { 80 | chunksAll.set(chunk, position); 81 | position += chunk.length; 82 | } 83 | 84 | return chunksAll; 85 | } 86 | 87 | // load remote data 88 | // - check if the data is already in 
the IndexedDB 89 | // - if not, fetch it from the remote URL and store it in the IndexedDB 90 | function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) { 91 | // query the storage quota and print it 92 | navigator.storage.estimate().then(function (estimate) { 93 | cbPrint('loadRemote: storage quota: ' + estimate.quota + ' bytes'); 94 | cbPrint('loadRemote: storage usage: ' + estimate.usage + ' bytes'); 95 | }); 96 | 97 | // check if the data is already in the IndexedDB 98 | var rq = indexedDB.open(dbName, dbVersion); 99 | 100 | rq.onupgradeneeded = function (event) { 101 | var db = event.target.result; 102 | if (db.version == 1) { 103 | var os = db.createObjectStore('models', { autoIncrement: false }); 104 | cbPrint('loadRemote: created IndexedDB ' + db.name + ' version ' + db.version); 105 | } else { 106 | // clear the database 107 | var os = event.currentTarget.transaction.objectStore('models'); 108 | os.clear(); 109 | cbPrint('loadRemote: cleared IndexedDB ' + db.name + ' version ' + db.version); 110 | } 111 | }; 112 | 113 | rq.onsuccess = function (event) { 114 | var db = event.target.result; 115 | var tx = db.transaction(['models'], 'readonly'); 116 | var os = tx.objectStore('models'); 117 | var rq = os.get(url); 118 | 119 | rq.onsuccess = function (event) { 120 | if (rq.result) { 121 | cbPrint('loadRemote: "' + url + '" is already in the IndexedDB'); 122 | cbReady(dst, rq.result); 123 | } else { 124 | // data is not in the IndexedDB 125 | cbPrint('loadRemote: "' + url + '" is not in the IndexedDB'); 126 | 127 | // alert and ask the user to confirm 128 | if (!confirm( 129 | 'You are about to download ' + size_mb + ' MB of data.\n' + 130 | 'The model data will be cached in the browser for future use.\n\n' + 131 | 'Press OK to continue.')) { 132 | cbCancel(); 133 | return; 134 | } 135 | 136 | fetchRemote(url, cbProgress, cbPrint).then(function (data) { 137 | if (data) { 138 | // store the data in the IndexedDB 139 | var rq = 
indexedDB.open(dbName, dbVersion); 140 | rq.onsuccess = function (event) { 141 | var db = event.target.result; 142 | var tx = db.transaction(['models'], 'readwrite'); 143 | var os = tx.objectStore('models'); 144 | var rq = os.put(data, url); 145 | 146 | rq.onsuccess = function (event) { 147 | cbPrint('loadRemote: "' + url + '" stored in the IndexedDB'); 148 | cbReady(dst, data); 149 | }; 150 | 151 | rq.onerror = function (event) { 152 | cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB'); 153 | cbCancel(); 154 | }; 155 | }; 156 | } 157 | }); 158 | } 159 | }; 160 | 161 | rq.onerror = function (event) { 162 | cbPrint('loadRemote: failed to get data from the IndexedDB'); 163 | cbCancel(); 164 | }; 165 | }; 166 | 167 | rq.onerror = function (event) { 168 | cbPrint('loadRemote: failed to open IndexedDB'); 169 | cbCancel(); 170 | }; 171 | 172 | rq.onblocked = function (event) { 173 | cbPrint('loadRemote: failed to open IndexedDB: blocked'); 174 | cbCancel(); 175 | }; 176 | 177 | rq.onabort = function (event) { 178 | cbPrint('loadRemote: failed to open IndexedDB: abort'); 179 | 180 | }; 181 | } 182 | 183 | -------------------------------------------------------------------------------- /examples/stream.wasm/emscripten.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "whisper.h" 3 | 4 | #include <emscripten.h> 5 | #include <emscripten/bind.h> 6 | 7 | #include <atomic> 8 | #include <cmath> 9 | #include <mutex> 10 | #include <string> 11 | #include <thread> 12 | #include <vector> 13 | 14 | constexpr int N_THREAD = 8; 15 | 16 | std::vector<struct whisper_context *> g_contexts(4, nullptr); 17 | 18 | std::mutex g_mutex; 19 | std::thread g_worker; 20 | 21 | std::atomic<bool> g_running(false); 22 | 23 | std::string g_status = ""; 24 | std::string g_status_forced = ""; 25 | std::string g_transcribed = ""; 26 | 27 | std::vector<float> g_pcmf32; 28 | 29 | void stream_set_status(const std::string & status) { 30 | std::lock_guard<std::mutex> lock(g_mutex); 31 | g_status = status; 32 | } 33 | 34 | void stream_main(size_t index) { 35
| stream_set_status("loading data ..."); 36 | 37 | struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY); 38 | 39 | wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency()); 40 | wparams.offset_ms = 0; 41 | wparams.translate = false; 42 | wparams.no_context = true; 43 | wparams.single_segment = true; 44 | wparams.print_realtime = false; 45 | wparams.print_progress = false; 46 | wparams.print_timestamps = true; 47 | wparams.print_special = false; 48 | 49 | wparams.max_tokens = 32; 50 | wparams.audio_ctx = 768; // partial encoder context for better performance 51 | 52 | wparams.language = "en"; 53 | 54 | printf("stream: using %d threads\n", wparams.n_threads); 55 | 56 | std::vector<float> pcmf32; 57 | 58 | // whisper context 59 | auto & ctx = g_contexts[index]; 60 | 61 | // 5 seconds interval 62 | const int64_t window_samples = 5*WHISPER_SAMPLE_RATE; 63 | 64 | while (g_running) { 65 | stream_set_status("waiting for audio ..."); 66 | 67 | { 68 | std::unique_lock<std::mutex> lock(g_mutex); 69 | 70 | if (g_pcmf32.size() < 1024) { 71 | lock.unlock(); 72 | 73 | std::this_thread::sleep_for(std::chrono::milliseconds(10)); 74 | 75 | continue; 76 | } 77 | 78 | pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end()); 79 | g_pcmf32.clear(); 80 | } 81 | 82 | { 83 | const auto t_start = std::chrono::high_resolution_clock::now(); 84 | 85 | stream_set_status("running whisper ..."); 86 | 87 | int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()); 88 | if (ret != 0) { 89 | printf("whisper_full() failed: %d\n", ret); 90 | break; 91 | } 92 | 93 | const auto t_end = std::chrono::high_resolution_clock::now(); 94 | 95 | printf("stream: whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count()); 96 | } 97 | 98 | { 99 | std::string text_heard; 100 | 101 | { 102 | const int n_segments = whisper_full_n_segments(ctx); 103
for (int i = n_segments - 1; i < n_segments; ++i) { 104 | const char * text = whisper_full_get_segment_text(ctx, i); 105 | 106 | const int64_t t0 = whisper_full_get_segment_t0(ctx, i); 107 | const int64_t t1 = whisper_full_get_segment_t1(ctx, i); 108 | 109 | printf("transcribed: %s\n", text); 110 | 111 | text_heard += text; 112 | } 113 | } 114 | 115 | { 116 | std::lock_guard<std::mutex> lock(g_mutex); 117 | g_transcribed = text_heard; 118 | } 119 | } 120 | } 121 | 122 | if (index < g_contexts.size()) { 123 | whisper_free(g_contexts[index]); 124 | g_contexts[index] = nullptr; 125 | } 126 | } 127 | 128 | EMSCRIPTEN_BINDINGS(stream) { 129 | emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { 130 | for (size_t i = 0; i < g_contexts.size(); ++i) { 131 | if (g_contexts[i] == nullptr) { 132 | g_contexts[i] = whisper_init(path_model.c_str()); 133 | if (g_contexts[i] != nullptr) { 134 | g_running = true; 135 | if (g_worker.joinable()) { 136 | g_worker.join(); 137 | } 138 | g_worker = std::thread([i]() { 139 | stream_main(i); 140 | }); 141 | 142 | return i + 1; 143 | } else { 144 | return (size_t) 0; 145 | } 146 | } 147 | } 148 | 149 | return (size_t) 0; 150 | })); 151 | 152 | emscripten::function("free", emscripten::optional_override([](size_t index) { 153 | if (g_running) { 154 | g_running = false; 155 | } 156 | })); 157 | 158 | emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) { 159 | --index; 160 | 161 | if (index >= g_contexts.size()) { 162 | return -1; 163 | } 164 | 165 | if (g_contexts[index] == nullptr) { 166 | return -2; 167 | } 168 | 169 | { 170 | std::lock_guard<std::mutex> lock(g_mutex); 171 | const int n = audio["length"].as<int>(); 172 | 173 | emscripten::val heap = emscripten::val::module_property("HEAPU8"); 174 | emscripten::val memory = heap["buffer"]; 175 | 176 | g_pcmf32.resize(n); 177 | 178 | emscripten::val memoryView = audio["constructor"].new_(memory, 
reinterpret_cast<uintptr_t>(g_pcmf32.data()), n); 179 | memoryView.call<void>("set", audio); 180 | } 181 | 182 | return 0; 183 | })); 184 | 185 | emscripten::function("get_transcribed", emscripten::optional_override([]() { 186 | std::string transcribed; 187 | 188 | { 189 | std::lock_guard<std::mutex> lock(g_mutex); 190 | transcribed = std::move(g_transcribed); 191 | } 192 | 193 | return transcribed; 194 | })); 195 | 196 | emscripten::function("get_status", emscripten::optional_override([]() { 197 | std::string status; 198 | 199 | { 200 | std::lock_guard<std::mutex> lock(g_mutex); 201 | status = g_status_forced.empty() ? g_status : g_status_forced; 202 | } 203 | 204 | return status; 205 | })); 206 | 207 | emscripten::function("set_status", emscripten::optional_override([](const std::string & status) { 208 | { 209 | std::lock_guard<std::mutex> lock(g_mutex); 210 | g_status_forced = status; 211 | } 212 | })); 213 | } 214 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ifndef UNAME_S 2 | UNAME_S := $(shell uname -s) 3 | endif 4 | 5 | ifndef UNAME_P 6 | UNAME_P := $(shell uname -p) 7 | endif 8 | 9 | ifndef UNAME_M 10 | UNAME_M := $(shell uname -m) 11 | endif 12 | 13 | # Mac OS + Arm can report x86_64 14 | # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 15 | ifeq ($(UNAME_S),Darwin) 16 | ifneq ($(UNAME_P),arm) 17 | SYSCTL_M := $(shell sysctl -n hw.optional.arm64) 18 | ifeq ($(SYSCTL_M),1) 19 | # UNAME_P := arm 20 | # UNAME_M := arm64 21 | warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789) 22 | endif 23 | endif 24 | endif 25 | 26 | # 27 | # Compile flags 28 | # 29 | 30 | CFLAGS = -I. -O3 -std=c11 -fPIC 31 | CXXFLAGS = -I.
-I./examples -O3 -std=c++11 -fPIC 32 | LDFLAGS = 33 | 34 | # OS specific 35 | # TODO: support Windows 36 | ifeq ($(UNAME_S),Linux) 37 | CFLAGS += -pthread 38 | CXXFLAGS += -pthread 39 | endif 40 | ifeq ($(UNAME_S),Darwin) 41 | CFLAGS += -pthread 42 | CXXFLAGS += -pthread 43 | endif 44 | ifeq ($(UNAME_S),FreeBSD) 45 | CFLAGS += -pthread 46 | CXXFLAGS += -pthread 47 | endif 48 | ifeq ($(UNAME_S),Haiku) 49 | CFLAGS += -pthread 50 | CXXFLAGS += -pthread 51 | endif 52 | 53 | # Architecture specific 54 | # TODO: probably these flags need to be tweaked on some architectures 55 | # feel free to update the Makefile for your architecture and send a pull request or issue 56 | ifeq ($(UNAME_M),x86_64) 57 | ifeq ($(UNAME_S),Darwin) 58 | CFLAGS += -mfma -mf16c 59 | AVX1_M := $(shell sysctl machdep.cpu.features) 60 | ifneq (,$(findstring AVX1.0,$(AVX1_M))) 61 | CFLAGS += -mavx 62 | endif 63 | AVX2_M := $(shell sysctl machdep.cpu.leaf7_features) 64 | ifneq (,$(findstring AVX2,$(AVX2_M))) 65 | CFLAGS += -mavx2 66 | endif 67 | else ifeq ($(UNAME_S),Linux) 68 | AVX1_M := $(shell grep "avx " /proc/cpuinfo) 69 | ifneq (,$(findstring avx,$(AVX1_M))) 70 | CFLAGS += -mavx 71 | endif 72 | AVX2_M := $(shell grep "avx2 " /proc/cpuinfo) 73 | ifneq (,$(findstring avx2,$(AVX2_M))) 74 | CFLAGS += -mavx2 75 | endif 76 | FMA_M := $(shell grep "fma " /proc/cpuinfo) 77 | ifneq (,$(findstring fma,$(FMA_M))) 78 | CFLAGS += -mfma 79 | endif 80 | F16C_M := $(shell grep "f16c " /proc/cpuinfo) 81 | ifneq (,$(findstring f16c,$(F16C_M))) 82 | CFLAGS += -mf16c 83 | endif 84 | else ifeq ($(UNAME_S),Haiku) 85 | AVX1_M := $(shell sysinfo -cpu | grep "AVX ") 86 | ifneq (,$(findstring avx,$(AVX1_M))) 87 | CFLAGS += -mavx 88 | endif 89 | AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ") 90 | ifneq (,$(findstring avx2,$(AVX2_M))) 91 | CFLAGS += -mavx2 92 | endif 93 | FMA_M := $(shell sysinfo -cpu | grep "FMA ") 94 | ifneq (,$(findstring fma,$(FMA_M))) 95 | CFLAGS += -mfma 96 | endif 97 | F16C_M := $(shell sysinfo -cpu 
| grep "F16C ") 98 | ifneq (,$(findstring f16c,$(F16C_M))) 99 | CFLAGS += -mf16c 100 | endif 101 | else 102 | CFLAGS += -mfma -mf16c -mavx -mavx2 103 | endif 104 | endif 105 | ifeq ($(UNAME_M),amd64) 106 | CFLAGS += -mavx -mavx2 -mfma -mf16c 107 | endif 108 | ifndef WHISPER_NO_ACCELERATE 109 | # Mac M1 - include Accelerate framework 110 | ifeq ($(UNAME_S),Darwin) 111 | CFLAGS += -DGGML_USE_ACCELERATE 112 | LDFLAGS += -framework Accelerate 113 | endif 114 | endif 115 | ifdef WHISPER_OPENBLAS 116 | CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas 117 | LDFLAGS += -lopenblas 118 | endif 119 | ifdef WHISPER_GPROF 120 | CFLAGS += -pg 121 | CXXFLAGS += -pg 122 | endif 123 | ifneq ($(filter aarch64%,$(UNAME_M)),) 124 | endif 125 | ifneq ($(filter armv6%,$(UNAME_M)),) 126 | # Raspberry Pi 1, 2, 3 127 | CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access 128 | endif 129 | ifneq ($(filter armv7%,$(UNAME_M)),) 130 | # Raspberry Pi 4 131 | CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations 132 | endif 133 | ifneq ($(filter armv8%,$(UNAME_M)),) 134 | # Raspberry Pi 4 135 | CFLAGS += -mfp16-format=ieee -mno-unaligned-access 136 | endif 137 | 138 | default: main 139 | 140 | # 141 | # Build library 142 | # 143 | 144 | ggml.o: ggml.c ggml.h 145 | $(CC) $(CFLAGS) -c ggml.c -o ggml.o 146 | 147 | whisper.o: whisper.cpp whisper.h 148 | $(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o 149 | 150 | libwhisper.a: ggml.o whisper.o 151 | $(AR) rcs libwhisper.a ggml.o whisper.o 152 | 153 | libwhisper.so: ggml.o whisper.o 154 | $(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS) 155 | 156 | clean: 157 | rm -f *.o main stream command bench libwhisper.a libwhisper.so 158 | 159 | # 160 | # Examples 161 | # 162 | 163 | CC_SDL=`sdl2-config --cflags --libs` 164 | 165 | main: examples/main/main.cpp ggml.o whisper.o 166 | $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o whisper.o -o main $(LDFLAGS) 167 | 
./main -h 168 | 169 | stream: examples/stream/stream.cpp ggml.o whisper.o 170 | $(CXX) $(CXXFLAGS) examples/stream/stream.cpp ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS) 171 | 172 | command: examples/command/command.cpp ggml.o whisper.o 173 | $(CXX) $(CXXFLAGS) examples/command/command.cpp ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS) 174 | 175 | bench: examples/bench/bench.cpp ggml.o whisper.o 176 | $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS) 177 | 178 | # 179 | # Audio samples 180 | # 181 | 182 | # download a few audio samples into folder "./samples": 183 | .PHONY: samples 184 | samples: 185 | @echo "Downloading samples..." 186 | @mkdir -p samples 187 | @wget --quiet --show-progress -O samples/gb0.ogg https://upload.wikimedia.org/wikipedia/commons/2/22/George_W._Bush%27s_weekly_radio_address_%28November_1%2C_2008%29.oga 188 | @wget --quiet --show-progress -O samples/gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg 189 | @wget --quiet --show-progress -O samples/hp0.ogg https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg 190 | @wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav 191 | @echo "Converting to 16-bit WAV ..." 
192 | @ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav 193 | @ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav 194 | @ffmpeg -loglevel -0 -y -i samples/hp0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/hp0.wav 195 | @ffmpeg -loglevel -0 -y -i samples/mm1.wav -ar 16000 -ac 1 -c:a pcm_s16le samples/mm0.wav 196 | @rm samples/mm1.wav 197 | 198 | # 199 | # Models 200 | # 201 | 202 | # if not already downloaded, the following targets download the specified model and 203 | # runs it on all samples in the folder "./samples": 204 | 205 | .PHONY: tiny.en 206 | .PHONY: tiny 207 | .PHONY: base.en 208 | .PHONY: base 209 | .PHONY: small.en 210 | .PHONY: small 211 | .PHONY: medium.en 212 | .PHONY: medium 213 | .PHONY: large-v1 214 | .PHONY: large 215 | 216 | tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main 217 | bash ./models/download-ggml-model.sh $@ 218 | @echo "" 219 | @echo "===============================================" 220 | @echo "Running $@ on all samples in ./samples ..." 221 | @echo "===============================================" 222 | @echo "" 223 | @for f in samples/*.wav; do \ 224 | echo "----------------------------------------------" ; \ 225 | echo "[+] Running $@ on $$f ... 
(run 'ffplay $$f' to listen)" ; \ 226 | echo "----------------------------------------------" ; \ 227 | echo "" ; \ 228 | ./main -m models/ggml-$@.bin -f $$f ; \ 229 | echo "" ; \ 230 | done 231 | 232 | # 233 | # Tests 234 | # 235 | 236 | .PHONY: tests 237 | tests: 238 | bash ./tests/run-tests.sh 239 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 3.0) 2 | project(whisper.cpp VERSION 1.0.0) 3 | 4 | set(CMAKE_EXPORT_COMPILE_COMMANDS "on") 5 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 6 | set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") 7 | 8 | if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) 9 | set(WHISPER_STANDALONE ON) 10 | include(cmake/GitVars.cmake) 11 | include(cmake/BuildTypes.cmake) 12 | 13 | # configure project version 14 | if (EXISTS "${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl") 15 | configure_file(${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl ${CMAKE_SOURCE_DIR}/bindings/ios/Makefile @ONLY) 16 | endif() 17 | else() 18 | set(WHISPER_STANDALONE OFF) 19 | endif() 20 | 21 | if (EMSCRIPTEN) 22 | set(BUILD_SHARED_LIBS_DEFAULT OFF) 23 | 24 | option(WHISPER_WASM_SINGLE_FILE "whisper: embed WASM inside the generated whisper.js" ON) 25 | else() 26 | if (MINGW) 27 | set(BUILD_SHARED_LIBS_DEFAULT OFF) 28 | else() 29 | set(BUILD_SHARED_LIBS_DEFAULT ON) 30 | endif() 31 | endif() 32 | 33 | # options 34 | 35 | option(BUILD_SHARED_LIBS "whisper: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT}) 36 | 37 | option(WHISPER_ALL_WARNINGS "whisper: enable all compiler warnings" ON) 38 | option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF) 39 | 40 | option(WHISPER_SANITIZE_THREAD "whisper: enable thread sanitizer" OFF) 41 | option(WHISPER_SANITIZE_ADDRESS "whisper: enable address sanitizer" OFF) 42 | 
option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF) 43 | 44 | option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE}) 45 | option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE}) 46 | 47 | option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF) 48 | 49 | if (APPLE) 50 | option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF) 51 | option(WHISPER_NO_AVX "whisper: disable AVX" OFF) 52 | option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF) 53 | else() 54 | option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF) 55 | endif() 56 | 57 | option(WHISPER_PERF "whisper: enable perf timings" OFF) 58 | 59 | # sanitizers 60 | 61 | if (NOT MSVC) 62 | if (WHISPER_SANITIZE_THREAD) 63 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread") 64 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread") 65 | endif() 66 | 67 | if (WHISPER_SANITIZE_ADDRESS) 68 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer") 69 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer") 70 | endif() 71 | 72 | if (WHISPER_SANITIZE_UNDEFINED) 73 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined") 74 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined") 75 | endif() 76 | endif() 77 | 78 | #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math") 79 | #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") 80 | 81 | # dependencies 82 | 83 | set(CMAKE_C_STANDARD 11) 84 | set(CMAKE_CXX_STANDARD 20) 85 | 86 | find_package(Threads REQUIRED) 87 | 88 | # on APPLE - include Accelerate framework 89 | if (APPLE AND NOT WHISPER_NO_ACCELERATE) 90 | find_library(ACCELERATE_FRAMEWORK Accelerate) 91 | if (ACCELERATE_FRAMEWORK) 92 | message(STATUS "Accelerate framework found") 93 | 94 | set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) 95 | set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE) 96 | else() 97 | 
message(WARNING "Accelerate framework not found") 98 | endif() 99 | endif() 100 | 101 | if (WHISPER_SUPPORT_OPENBLAS) 102 | find_library(OPENBLAS_LIB 103 | NAMES openblas libopenblas 104 | ) 105 | if (OPENBLAS_LIB) 106 | message(STATUS "OpenBLAS found") 107 | 108 | set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${OPENBLAS_LIB}) 109 | set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS) 110 | else() 111 | message(WARNING "OpenBLAS not found") 112 | endif() 113 | endif() 114 | 115 | # compiler flags 116 | 117 | if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) 118 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) 119 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo") 120 | endif () 121 | 122 | if (WHISPER_ALL_WARNINGS) 123 | if (NOT MSVC) 124 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \ 125 | -Wall \ 126 | -Wextra \ 127 | -Wpedantic \ 128 | -Wshadow \ 129 | -Wcast-qual \ 130 | -Wstrict-prototypes \ 131 | -Wpointer-arith \ 132 | ") 133 | else() 134 | # todo : msvc 135 | endif() 136 | endif() 137 | 138 | if (NOT MSVC) 139 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla") 140 | #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations") 141 | endif() 142 | 143 | message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") 144 | 145 | if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") 146 | message(STATUS "ARM detected") 147 | else() 148 | message(STATUS "x86 detected") 149 | if (MSVC) 150 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") 151 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2") 152 | else() 153 | if (EMSCRIPTEN) 154 | # we require support for WASM SIMD 128-bit 155 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -msimd128") 156 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") 157 | else() 158 | if(NOT WHISPER_NO_AVX) 159 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") 
160 | endif() 161 | if(NOT WHISPER_NO_AVX2) 162 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") 163 | endif() 164 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c") 165 | endif() 166 | endif() 167 | endif() 168 | 169 | if (WHISPER_PERF) 170 | set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF) 171 | endif() 172 | 173 | # 174 | # whisper - this is the main library of the project 175 | # 176 | 177 | set(TARGET whisper) 178 | 179 | add_library(${TARGET} 180 | ggml.c 181 | whisper.cpp 182 | ) 183 | 184 | target_include_directories(${TARGET} PUBLIC 185 | . 186 | ) 187 | 188 | if (MSVC) 189 | target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT}) 190 | 191 | set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -D_CRT_SECURE_NO_WARNINGS) 192 | else() 193 | target_link_libraries(${TARGET} PRIVATE m ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT}) 194 | endif() 195 | 196 | if (BUILD_SHARED_LIBS) 197 | target_link_libraries(${TARGET} PUBLIC 198 | ${CMAKE_DL_LIBS} 199 | ) 200 | 201 | target_compile_definitions(${TARGET} PUBLIC 202 | WHISPER_SHARED 203 | ) 204 | endif() 205 | 206 | target_compile_definitions(${TARGET} PUBLIC 207 | ${WHISPER_EXTRA_FLAGS} 208 | ) 209 | 210 | install(TARGETS ${TARGET} 211 | LIBRARY DESTINATION lib 212 | ARCHIVE DESTINATION lib/static 213 | ) 214 | 215 | # 216 | # bindings 217 | # 218 | 219 | add_subdirectory(bindings) 220 | 221 | # 222 | # programs, examples and tests 223 | # 224 | 225 | if (WHISPER_STANDALONE) 226 | if (WHISPER_BUILD_TESTS) 227 | enable_testing() 228 | add_subdirectory(tests) 229 | endif () 230 | 231 | if (WHISPER_BUILD_EXAMPLES) 232 | add_subdirectory(examples) 233 | endif() 234 | endif () 235 | -------------------------------------------------------------------------------- /models/convert-h5-to-ggml.py: -------------------------------------------------------------------------------- 1 | # Convert Hugging Face fine-tuned models to ggml format 2 | # 3 | # Usage: 4 | # 5 | # git clone 
https://github.com/openai/whisper 6 | # git clone https://github.com/ggerganov/whisper.cpp 7 | # git clone https://huggingface.co/openai/whisper-medium 8 | # 9 | # python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper . 10 | # 11 | # This script is similar to "convert-pt-to-ggml.py" 12 | # 13 | # For more info: 14 | # 15 | # https://github.com/ggerganov/whisper.cpp/issues/157 16 | # 17 | 18 | import io 19 | import os 20 | import sys 21 | import struct 22 | import json 23 | import code 24 | import torch 25 | import numpy as np 26 | 27 | from transformers import WhisperForConditionalGeneration 28 | 29 | conv_map = { 30 | 'self_attn.k_proj' : 'attn.key', 31 | 'self_attn.q_proj' : 'attn.query', 32 | 'self_attn.v_proj' : 'attn.value', 33 | 'self_attn.out_proj' : 'attn.out', 34 | 'self_attn_layer_norm' : 'attn_ln', 35 | 'encoder_attn.q_proj' : 'cross_attn.query', 36 | 'encoder_attn.v_proj' : 'cross_attn.value', 37 | 'encoder_attn.out_proj' : 'cross_attn.out', 38 | 'encoder_attn_layer_norm' : 'cross_attn_ln', 39 | 'fc1' : 'mlp.0', 40 | 'fc2' : 'mlp.2', 41 | 'final_layer_norm' : 'mlp_ln', 42 | 'encoder.layer_norm.bias' : 'encoder.ln_post.bias', 43 | 'encoder.layer_norm.weight' : 'encoder.ln_post.weight', 44 | 'encoder.embed_positions.weight': 'encoder.positional_embedding', 45 | 'decoder.layer_norm.bias' : 'decoder.ln.bias', 46 | 'decoder.layer_norm.weight' : 'decoder.ln.weight', 47 | 'decoder.embed_positions.weight': 'decoder.positional_embedding', 48 | 'decoder.embed_tokens.weight' : 'decoder.token_embedding.weight', 49 | 'proj_out.weight' : 'decoder.proj.weight', 50 | } 51 | 52 | # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py 53 | def bytes_to_unicode(): 54 | """ 55 | Returns a list of utf-8 bytes and a corresponding list of unicode strings. 56 | The reversible bpe codes work on unicode strings. 57 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
58 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 59 | This is a significant percentage of your normal, say, 32K bpe vocab. 60 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 61 | And avoids mapping to whitespace/control characters the bpe code barfs on. 62 | """ 63 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 64 | cs = bs[:] 65 | n = 0 66 | for b in range(2**8): 67 | if b not in bs: 68 | bs.append(b) 69 | cs.append(2**8+n) 70 | n += 1 71 | cs = [chr(n) for n in cs] 72 | return dict(zip(bs, cs)) 73 | 74 | if len(sys.argv) < 4: 75 | print("Usage: convert-h5-to-ggml.py dir_model path-to-whisper-repo dir-output [use-f32]\n") 76 | sys.exit(1) 77 | 78 | dir_model = sys.argv[1] 79 | dir_whisper = sys.argv[2] 80 | dir_out = sys.argv[3] 81 | 82 | with open(dir_model + "/vocab.json", "r") as f: 83 | encoder = json.load(f) 84 | with open(dir_model + "/added_tokens.json", "r") as f: 85 | encoder_added = json.load(f) 86 | with open(dir_model + "/config.json", "r") as f: 87 | hparams = json.load(f) 88 | 89 | model = WhisperForConditionalGeneration.from_pretrained(dir_model) 90 | 91 | #code.interact(local=locals()) 92 | 93 | n_mels = hparams["num_mel_bins"] 94 | with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as f: 95 | filters = torch.from_numpy(f[f"mel_{n_mels}"]) 96 | 97 | dir_tokenizer = dir_model 98 | 99 | fname_out = dir_out + "/ggml-model.bin" 100 | 101 | with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f: 102 | tokens = json.load(f) 103 | 104 | # use 16-bit or 32-bit floats 105 | use_f16 = True 106 | if len(sys.argv) > 4: 107 | use_f16 = False 108 | fname_out = dir_out + "/ggml-model-f32.bin" 109 | 110 | fout = open(fname_out, "wb") 111 | 112 | fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex 113 | fout.write(struct.pack("i", hparams["vocab_size"])) 114 | 
fout.write(struct.pack("i", hparams["max_source_positions"])) 115 | fout.write(struct.pack("i", hparams["d_model"])) 116 | fout.write(struct.pack("i", hparams["encoder_attention_heads"])) 117 | fout.write(struct.pack("i", hparams["encoder_layers"])) 118 | fout.write(struct.pack("i", hparams["max_length"])) 119 | fout.write(struct.pack("i", hparams["d_model"])) 120 | fout.write(struct.pack("i", hparams["decoder_attention_heads"])) 121 | fout.write(struct.pack("i", hparams["decoder_layers"])) 122 | fout.write(struct.pack("i", hparams["num_mel_bins"])) 123 | fout.write(struct.pack("i", use_f16)) 124 | 125 | fout.write(struct.pack("i", filters.shape[0])) 126 | fout.write(struct.pack("i", filters.shape[1])) 127 | for i in range(filters.shape[0]): 128 | for j in range(filters.shape[1]): 129 | fout.write(struct.pack("f", filters[i][j])) 130 | 131 | byte_encoder = bytes_to_unicode() 132 | byte_decoder = {v:k for k, v in byte_encoder.items()} 133 | 134 | fout.write(struct.pack("i", len(tokens))) 135 | 136 | tokens = sorted(tokens.items(), key=lambda x: x[1]) 137 | for key in tokens: 138 | text = bytearray([byte_decoder[c] for c in key[0]]) 139 | fout.write(struct.pack("i", len(text))) 140 | fout.write(text) 141 | 142 | list_vars = model.state_dict() 143 | for name in list_vars.keys(): 144 | # this seems to not be used 145 | # ref: https://github.com/huggingface/transformers/blob/9a5b84a0076a04fe9596da72e8668069d4f09ea0/src/transformers/models/whisper/modeling_whisper.py#L1099-L1106 146 | if name == "proj_out.weight": 147 | print('Skipping', name) 148 | continue 149 | 150 | src = name 151 | 152 | nn = name 153 | if name != "proj_out.weight": 154 | nn = nn.split(".")[1:] 155 | else: 156 | nn = nn.split(".") 157 | 158 | if nn[1] == "layers": 159 | nn[1] = "blocks" 160 | if ".".join(nn[3:-1]) == "encoder_attn.k_proj": 161 | mapped = "attn.key" if nn[0] == "encoder" else "cross_attn.key" 162 | else: 163 | mapped = conv_map[".".join(nn[3:-1])] 164 | name = ".".join(nn[:3] + 
[mapped] + nn[-1:]) 165 | else: 166 | name = ".".join(nn) 167 | name = conv_map[name] if name in conv_map else name 168 | 169 | print(src, ' -> ', name) 170 | data = list_vars[src].squeeze().numpy() 171 | data = data.astype(np.float16) 172 | 173 | # reshape conv bias from [n] to [n, 1] 174 | if name == "encoder.conv1.bias" or \ 175 | name == "encoder.conv2.bias": 176 | data = data.reshape(data.shape[0], 1) 177 | print(" Reshaped variable: " + name + " to shape: ", data.shape) 178 | 179 | n_dims = len(data.shape) 180 | print(name, n_dims, data.shape) 181 | 182 | # looks like the whisper models are in f16 by default 183 | # so we need to convert the small tensors to f32 until we fully support f16 in ggml 184 | # ftype == 0 -> float32, ftype == 1 -> float16 185 | ftype = 1; 186 | if use_f16: 187 | if n_dims < 2 or \ 188 | name == "encoder.conv1.bias" or \ 189 | name == "encoder.conv2.bias" or \ 190 | name == "encoder.positional_embedding" or \ 191 | name == "decoder.positional_embedding": 192 | print(" Converting to float32") 193 | data = data.astype(np.float32) 194 | ftype = 0 195 | else: 196 | data = data.astype(np.float32) 197 | ftype = 0 198 | 199 | # header 200 | str = name.encode('utf-8') 201 | fout.write(struct.pack("iii", n_dims, len(str), ftype)) 202 | for i in range(n_dims): 203 | fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) 204 | fout.write(str); 205 | 206 | # data 207 | data.tofile(fout) 208 | 209 | fout.close() 210 | 211 | print("Done. 
Output file: " + fname_out) 212 | print("") 213 | -------------------------------------------------------------------------------- /examples/whisper.objc/whisper.objc/Base.lproj/Main.storyboard: -------------------------------------------------------------------------------- [storyboard XML not recoverable from this extraction] -------------------------------------------------------------------------------- /examples/whisper.objc/whisper.objc/ViewController.m: -------------------------------------------------------------------------------- 1 | // 2 | // ViewController.m 3 | // whisper.objc 4 | // 5 | // Created by Georgi Gerganov on 23.10.22. 6 | // 7 | 8 | #import "ViewController.h" 9 | 10 | #import "whisper.h" 11 | 12 | #define NUM_BYTES_PER_BUFFER 16*1024 13 | 14 | // callback used to process captured audio 15 | void AudioInputCallback(void * inUserData, 16 | AudioQueueRef inAQ, 17 | AudioQueueBufferRef inBuffer, 18 | const AudioTimeStamp * inStartTime, 19 | UInt32 inNumberPacketDescriptions, 20 | const AudioStreamPacketDescription * inPacketDescs); 21 | 22 | @interface ViewController () 23 | 24 | @property (weak, nonatomic) IBOutlet UILabel *labelStatusInp; 25 | @property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture; 26 | @property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe; 27 | @property (weak, nonatomic) IBOutlet UIButton *buttonRealtime; 28 | @property (weak, nonatomic) IBOutlet UITextView *textviewResult; 29 | 30 | @end 31 | 32 | @implementation ViewController 33 | 34 | - (void)setupAudioFormat:(AudioStreamBasicDescription*)format 35 | { 36 | format->mSampleRate = WHISPER_SAMPLE_RATE; 37 | format->mFormatID = kAudioFormatLinearPCM; 38 | format->mFramesPerPacket = 1; 39 | 
format->mChannelsPerFrame = 1; 40 | format->mBytesPerFrame = 2; 41 | format->mBytesPerPacket = 2; 42 | format->mBitsPerChannel = 16; 43 | format->mReserved = 0; 44 | format->mFormatFlags = kLinearPCMFormatFlagIsSignedInteger; 45 | } 46 | 47 | - (void)viewDidLoad { 48 | [super viewDidLoad]; 49 | 50 | // whisper.cpp initialization 51 | { 52 | // load the model 53 | NSString *modelPath = [[NSBundle mainBundle] pathForResource:@"ggml-base.en" ofType:@"bin"]; 54 | 55 | // check if the model exists 56 | if (![[NSFileManager defaultManager] fileExistsAtPath:modelPath]) { 57 | NSLog(@"Model file not found"); 58 | return; 59 | } 60 | 61 | NSLog(@"Loading model from %@", modelPath); 62 | 63 | // create ggml context 64 | stateInp.ctx = whisper_init([modelPath UTF8String]); 65 | 66 | // check if the model was loaded successfully 67 | if (stateInp.ctx == NULL) { 68 | NSLog(@"Failed to load model"); 69 | return; 70 | } 71 | } 72 | 73 | // initialize audio format and buffers 74 | { 75 | [self setupAudioFormat:&stateInp.dataFormat]; 76 | 77 | stateInp.n_samples = 0; 78 | stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t)); 79 | stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float)); 80 | } 81 | 82 | stateInp.isTranscribing = false; 83 | stateInp.isRealtime = false; 84 | } 85 | 86 | -(IBAction) stopCapturing { 87 | NSLog(@"Stop capturing"); 88 | 89 | _labelStatusInp.text = @"Status: Idle"; 90 | 91 | [_buttonToggleCapture setTitle:@"Start capturing" forState:UIControlStateNormal]; 92 | [_buttonToggleCapture setBackgroundColor:[UIColor grayColor]]; 93 | 94 | stateInp.isCapturing = false; 95 | 96 | AudioQueueStop(stateInp.queue, true); 97 | for (int i = 0; i < NUM_BUFFERS; i++) { 98 | AudioQueueFreeBuffer(stateInp.queue, stateInp.buffers[i]); 99 | } 100 | 101 | AudioQueueDispose(stateInp.queue, true); 102 | } 103 | 104 | - (IBAction)toggleCapture:(id)sender { 105 | if (stateInp.isCapturing) { 106 | // stop capturing 107 | [self 
stopCapturing]; 108 | 109 | return; 110 | } 111 | 112 | // initiate audio capturing 113 | NSLog(@"Start capturing"); 114 | 115 | stateInp.n_samples = 0; 116 | stateInp.vc = (__bridge void *)(self); 117 | 118 | OSStatus status = AudioQueueNewInput(&stateInp.dataFormat, 119 | AudioInputCallback, 120 | &stateInp, 121 | CFRunLoopGetCurrent(), 122 | kCFRunLoopCommonModes, 123 | 0, 124 | &stateInp.queue); 125 | 126 | if (status == 0) { 127 | for (int i = 0; i < NUM_BUFFERS; i++) { 128 | AudioQueueAllocateBuffer(stateInp.queue, NUM_BYTES_PER_BUFFER, &stateInp.buffers[i]); 129 | AudioQueueEnqueueBuffer (stateInp.queue, stateInp.buffers[i], 0, NULL); 130 | } 131 | 132 | stateInp.isCapturing = true; 133 | status = AudioQueueStart(stateInp.queue, NULL); 134 | if (status == 0) { 135 | _labelStatusInp.text = @"Status: Capturing"; 136 | [sender setTitle:@"Stop Capturing" forState:UIControlStateNormal]; 137 | [_buttonToggleCapture setBackgroundColor:[UIColor redColor]]; 138 | } 139 | } 140 | 141 | if (status != 0) { 142 | [self stopCapturing]; 143 | } 144 | } 145 | 146 | - (IBAction)onTranscribePrepare:(id)sender { 147 | _textviewResult.text = @"Processing - please wait ..."; 148 | 149 | if (stateInp.isRealtime) { 150 | [self onRealtime:(id)sender]; 151 | } 152 | 153 | if (stateInp.isCapturing) { 154 | [self stopCapturing]; 155 | } 156 | } 157 | 158 | - (IBAction)onRealtime:(id)sender { 159 | stateInp.isRealtime = !stateInp.isRealtime; 160 | 161 | if (stateInp.isRealtime) { 162 | [_buttonRealtime setBackgroundColor:[UIColor greenColor]]; 163 | } else { 164 | [_buttonRealtime setBackgroundColor:[UIColor grayColor]]; 165 | } 166 | 167 | NSLog(@"Realtime: %@", stateInp.isRealtime ? 
@"ON" : @"OFF"); 168 | } 169 | 170 | - (IBAction)onTranscribe:(id)sender { 171 | if (stateInp.isTranscribing) { 172 | return; 173 | } 174 | 175 | NSLog(@"Processing %d samples", stateInp.n_samples); 176 | 177 | stateInp.isTranscribing = true; 178 | 179 | // dispatch the model to a background thread 180 | dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{ 181 | // process captured audio 182 | // convert I16 to F32 183 | for (int i = 0; i < self->stateInp.n_samples; i++) { 184 | self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f; 185 | } 186 | 187 | // run the model 188 | struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); 189 | 190 | // get maximum number of threads on this device (max 8) 191 | const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]); 192 | 193 | params.print_realtime = true; 194 | params.print_progress = false; 195 | params.print_timestamps = true; 196 | params.print_special = false; 197 | params.translate = false; 198 | params.language = "en"; 199 | params.n_threads = max_threads; 200 | params.offset_ms = 0; 201 | params.no_context = true; 202 | params.single_segment = self->stateInp.isRealtime; 203 | 204 | CFTimeInterval startTime = CACurrentMediaTime(); 205 | 206 | whisper_reset_timings(self->stateInp.ctx); 207 | 208 | if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) { 209 | NSLog(@"Failed to run the model"); 210 | self->_textviewResult.text = @"Failed to run the model"; 211 | 212 | return; 213 | } 214 | 215 | whisper_print_timings(self->stateInp.ctx); 216 | 217 | CFTimeInterval endTime = CACurrentMediaTime(); 218 | 219 | NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads); 220 | 221 | // result text 222 | NSString *result = @""; 223 | 224 | int n_segments = whisper_full_n_segments(self->stateInp.ctx); 225 | for (int i = 0; i 
< n_segments; i++) { 226 | const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i); 227 | 228 | // append the text to the result 229 | result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]]; 230 | } 231 | 232 | const float tRecording = (float)self->stateInp.n_samples / (float)self->stateInp.dataFormat.mSampleRate; 233 | 234 | // append processing time 235 | result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[recording time: %5.3f s]", tRecording]]; 236 | result = [result stringByAppendingString:[NSString stringWithFormat:@" \n[processing time: %5.3f s]", endTime - startTime]]; 237 | 238 | // dispatch the result to the main thread 239 | dispatch_async(dispatch_get_main_queue(), ^{ 240 | self->_textviewResult.text = result; 241 | self->stateInp.isTranscribing = false; 242 | }); 243 | }); 244 | } 245 | 246 | // 247 | // Callback implementation 248 | // 249 | 250 | void AudioInputCallback(void * inUserData, 251 | AudioQueueRef inAQ, 252 | AudioQueueBufferRef inBuffer, 253 | const AudioTimeStamp * inStartTime, 254 | UInt32 inNumberPacketDescriptions, 255 | const AudioStreamPacketDescription * inPacketDescs) 256 | { 257 | StateInp * stateInp = (StateInp*)inUserData; 258 | 259 | if (!stateInp->isCapturing) { 260 | NSLog(@"Not capturing, ignoring audio"); 261 | return; 262 | } 263 | 264 | const int n = inBuffer->mAudioDataByteSize / 2; 265 | 266 | NSLog(@"Captured %d new samples", n); 267 | 268 | if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) { 269 | NSLog(@"Too much audio data, ignoring"); 270 | 271 | dispatch_async(dispatch_get_main_queue(), ^{ 272 | ViewController * vc = (__bridge ViewController *)(stateInp->vc); 273 | [vc stopCapturing]; 274 | }); 275 | 276 | return; 277 | } 278 | 279 | for (int i = 0; i < n; i++) { 280 | stateInp->audioBufferI16[stateInp->n_samples + i] = ((short*)inBuffer->mAudioData)[i]; 281 | } 282 | 283 | stateInp->n_samples += n; 284 | 285 | // put the buffer back 
in the queue 286 | AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL); 287 | 288 | if (stateInp->isRealtime) { 289 | // dispatch onTranscribe() to the main thread 290 | dispatch_async(dispatch_get_main_queue(), ^{ 291 | ViewController * vc = (__bridge ViewController *)(stateInp->vc); 292 | [vc onTranscribe:nil]; 293 | }); 294 | } 295 | } 296 | 297 | @end 298 | -------------------------------------------------------------------------------- /models/convert-pt-to-ggml.py: -------------------------------------------------------------------------------- 1 | # Convert Whisper transformer model from PyTorch to ggml format 2 | # 3 | # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium 4 | # 5 | # You need to clone the original repo in ~/path/to/repo/whisper/ 6 | # 7 | # git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ 8 | # 9 | # It is needed for various assets used by the algorithm: 10 | # 11 | # - tokenizer 12 | # - mel filters 13 | # 14 | # Also, you need to have the original models in ~/.cache/whisper/ 15 | # See the original repo for more details. 16 | # 17 | # This script loads the specified model and whisper assets and saves them in ggml format.
18 | # The output is a single binary file containing the following information: 19 | # 20 | # - hparams 21 | # - mel filters 22 | # - tokenizer vocab 23 | # - model variables 24 | # 25 | # For each variable, write the following: 26 | # 27 | # - Number of dimensions (int) 28 | # - Name length (int) 29 | # - Dimensions (int[n_dims]) 30 | # - Name (char[name_length]) 31 | # - Data (float[n_dims]) 32 | # 33 | 34 | import io 35 | import os 36 | import sys 37 | import struct 38 | import json 39 | import code 40 | import torch 41 | import numpy as np 42 | 43 | #from transformers import GPTJForCausalLM 44 | #from transformers import GPT2TokenizerFast 45 | 46 | # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110 47 | #LANGUAGES = { 48 | # "en": "english", 49 | # "zh": "chinese", 50 | # "de": "german", 51 | # "es": "spanish", 52 | # "ru": "russian", 53 | # "ko": "korean", 54 | # "fr": "french", 55 | # "ja": "japanese", 56 | # "pt": "portuguese", 57 | # "tr": "turkish", 58 | # "pl": "polish", 59 | # "ca": "catalan", 60 | # "nl": "dutch", 61 | # "ar": "arabic", 62 | # "sv": "swedish", 63 | # "it": "italian", 64 | # "id": "indonesian", 65 | # "hi": "hindi", 66 | # "fi": "finnish", 67 | # "vi": "vietnamese", 68 | # "iw": "hebrew", 69 | # "uk": "ukrainian", 70 | # "el": "greek", 71 | # "ms": "malay", 72 | # "cs": "czech", 73 | # "ro": "romanian", 74 | # "da": "danish", 75 | # "hu": "hungarian", 76 | # "ta": "tamil", 77 | # "no": "norwegian", 78 | # "th": "thai", 79 | # "ur": "urdu", 80 | # "hr": "croatian", 81 | # "bg": "bulgarian", 82 | # "lt": "lithuanian", 83 | # "la": "latin", 84 | # "mi": "maori", 85 | # "ml": "malayalam", 86 | # "cy": "welsh", 87 | # "sk": "slovak", 88 | # "te": "telugu", 89 | # "fa": "persian", 90 | # "lv": "latvian", 91 | # "bn": "bengali", 92 | # "sr": "serbian", 93 | # "az": "azerbaijani", 94 | # "sl": "slovenian", 95 | # "kn": "kannada", 96 | # "et": "estonian", 97 | # "mk": "macedonian", 
98 | # "br": "breton", 99 | # "eu": "basque", 100 | # "is": "icelandic", 101 | # "hy": "armenian", 102 | # "ne": "nepali", 103 | # "mn": "mongolian", 104 | # "bs": "bosnian", 105 | # "kk": "kazakh", 106 | # "sq": "albanian", 107 | # "sw": "swahili", 108 | # "gl": "galician", 109 | # "mr": "marathi", 110 | # "pa": "punjabi", 111 | # "si": "sinhala", 112 | # "km": "khmer", 113 | # "sn": "shona", 114 | # "yo": "yoruba", 115 | # "so": "somali", 116 | # "af": "afrikaans", 117 | # "oc": "occitan", 118 | # "ka": "georgian", 119 | # "be": "belarusian", 120 | # "tg": "tajik", 121 | # "sd": "sindhi", 122 | # "gu": "gujarati", 123 | # "am": "amharic", 124 | # "yi": "yiddish", 125 | # "lo": "lao", 126 | # "uz": "uzbek", 127 | # "fo": "faroese", 128 | # "ht": "haitian creole", 129 | # "ps": "pashto", 130 | # "tk": "turkmen", 131 | # "nn": "nynorsk", 132 | # "mt": "maltese", 133 | # "sa": "sanskrit", 134 | # "lb": "luxembourgish", 135 | # "my": "myanmar", 136 | # "bo": "tibetan", 137 | # "tl": "tagalog", 138 | # "mg": "malagasy", 139 | # "as": "assamese", 140 | # "tt": "tatar", 141 | # "haw": "hawaiian", 142 | # "ln": "lingala", 143 | # "ha": "hausa", 144 | # "ba": "bashkir", 145 | # "jw": "javanese", 146 | # "su": "sundanese", 147 | #} 148 | 149 | ## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292 150 | #def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"): 151 | # os.environ["TOKENIZERS_PARALLELISM"] = "false" 152 | # path = os.path.join(path_to_whisper_repo, "whisper/assets", name) 153 | # tokenizer = GPT2TokenizerFast.from_pretrained(path) 154 | # 155 | # specials = [ 156 | # "<|startoftranscript|>", 157 | # *[f"<|{lang}|>" for lang in LANGUAGES.keys()], 158 | # "<|translate|>", 159 | # "<|transcribe|>", 160 | # "<|startoflm|>", 161 | # "<|startofprev|>", 162 | # "<|nocaptions|>", 163 | # "<|notimestamps|>", 164 | # ] 165 | # 166 | # 
tokenizer.add_special_tokens(dict(additional_special_tokens=specials)) 167 | # return tokenizer 168 | 169 | # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py 170 | def bytes_to_unicode(): 171 | """ 172 | Returns a list of utf-8 bytes and a corresponding list of unicode strings. 173 | The reversible bpe codes work on unicode strings. 174 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 175 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 176 | This is a significant percentage of your normal, say, 32K bpe vocab. 177 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 178 | And avoids mapping to whitespace/control characters the bpe code barfs on. 179 | """ 180 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 181 | cs = bs[:] 182 | n = 0 183 | for b in range(2**8): 184 | if b not in bs: 185 | bs.append(b) 186 | cs.append(2**8+n) 187 | n += 1 188 | cs = [chr(n) for n in cs] 189 | return dict(zip(bs, cs)) 190 | 191 | 192 | if len(sys.argv) < 4: 193 | print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n") 194 | sys.exit(1) 195 | 196 | fname_inp = sys.argv[1] 197 | dir_whisper = sys.argv[2] 198 | dir_out = sys.argv[3] 199 | 200 | # try to load PyTorch binary data 201 | try: 202 | model_bytes = open(fname_inp, "rb").read() 203 | with io.BytesIO(model_bytes) as fp: 204 | checkpoint = torch.load(fp, map_location="cpu") 205 | except: 206 | print("Error: failed to load PyTorch model file: %s" % fname_inp) 207 | sys.exit(1) 208 | 209 | hparams = checkpoint["dims"] 210 | print("hparams:", hparams) 211 | 212 | list_vars = checkpoint["model_state_dict"] 213 | 214 | #print(list_vars['encoder.positional_embedding']) 215 | #print(list_vars['encoder.conv1.weight']) 216 | #print(list_vars['encoder.conv1.weight'].shape) 217 | 218 | # load mel filters 219 | 
n_mels = hparams["n_mels"] 220 | with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as f: 221 | filters = torch.from_numpy(f[f"mel_{n_mels}"]) 222 | #print (filters) 223 | 224 | #code.interact(local=locals()) 225 | 226 | multilingual = hparams["n_vocab"] == 51865 227 | dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2") 228 | 229 | #tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2") 230 | #print(tokenizer) 231 | #print(tokenizer.name_or_path) 232 | #print(len(tokenizer.additional_special_tokens)) 233 | 234 | # output in the same directory as the model 235 | fname_out = dir_out + "/ggml-model.bin" 236 | 237 | with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f: 238 | tokens = json.load(f) 239 | 240 | # use 16-bit or 32-bit floats 241 | use_f16 = True 242 | if len(sys.argv) > 4: 243 | use_f16 = False 244 | fname_out = dir_out + "/ggml-model-f32.bin" 245 | 246 | fout = open(fname_out, "wb") 247 | 248 | fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex 249 | fout.write(struct.pack("i", hparams["n_vocab"])) 250 | fout.write(struct.pack("i", hparams["n_audio_ctx"])) 251 | fout.write(struct.pack("i", hparams["n_audio_state"])) 252 | fout.write(struct.pack("i", hparams["n_audio_head"])) 253 | fout.write(struct.pack("i", hparams["n_audio_layer"])) 254 | fout.write(struct.pack("i", hparams["n_text_ctx"])) 255 | fout.write(struct.pack("i", hparams["n_text_state"])) 256 | fout.write(struct.pack("i", hparams["n_text_head"])) 257 | fout.write(struct.pack("i", hparams["n_text_layer"])) 258 | fout.write(struct.pack("i", hparams["n_mels"])) 259 | fout.write(struct.pack("i", use_f16)) 260 | 261 | # write mel filters 262 | fout.write(struct.pack("i", filters.shape[0])) 263 | fout.write(struct.pack("i", filters.shape[1])) 264 | for i in range(filters.shape[0]): 265 | for j in range(filters.shape[1]): 266 | fout.write(struct.pack("f", filters[i][j])) 
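The write sequence above fixes the file's prefix layout: the magic number, eleven `int32` hyper-parameters, then the mel filter bank as two `int32` dimensions followed by row-major `float32` values. The sketch below sanity-checks that layout with a stand-alone reader — it is illustrative only, not part of the repo, and the `names` list simply mirrors the write order in this script:

```python
import io
import struct

def read_ggml_prefix(fp):
    # magic number, then the eleven int32 hyper-parameters in write order
    magic, = struct.unpack("i", fp.read(4))
    assert magic == 0x67676d6c, "not a ggml file"
    names = ["n_vocab", "n_audio_ctx", "n_audio_state", "n_audio_head",
             "n_audio_layer", "n_text_ctx", "n_text_state", "n_text_head",
             "n_text_layer", "n_mels", "use_f16"]
    hparams = {k: struct.unpack("i", fp.read(4))[0] for k in names}
    # mel filter bank: two int32 dims, then row-major float32 values
    n_mel, n_fft = struct.unpack("ii", fp.read(8))
    filters = struct.unpack(f"{n_mel * n_fft}f", fp.read(4 * n_mel * n_fft))
    return hparams, (n_mel, n_fft), filters

# round-trip against a tiny synthetic file using the same layout
# (a 2x3 filter bank stands in for the real 80-bin one)
buf = io.BytesIO()
buf.write(struct.pack("i", 0x67676d6c))
for v in (51865, 1500, 512, 8, 6, 448, 512, 8, 6, 2, 1):
    buf.write(struct.pack("i", v))
buf.write(struct.pack("ii", 2, 3))
for x in (0.0, 0.1, 0.2, 0.3, 0.4, 0.5):
    buf.write(struct.pack("f", x))
buf.seek(0)

hparams, shape, filters = read_ggml_prefix(buf)
print(hparams["n_vocab"], hparams["n_mels"], shape)  # 51865 2 (2, 3)
```

After the filter bank, the real file continues with the token count, the length-prefixed token strings, and finally the per-tensor records written by the loop below.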
267 | 268 | byte_encoder = bytes_to_unicode() 269 | byte_decoder = {v:k for k, v in byte_encoder.items()} 270 | 271 | fout.write(struct.pack("i", len(tokens))) 272 | 273 | for key in tokens: 274 | text = bytearray([byte_decoder[c] for c in key]) 275 | fout.write(struct.pack("i", len(text))) 276 | fout.write(text) 277 | 278 | for name in list_vars.keys(): 279 | data = list_vars[name].squeeze().numpy() 280 | print("Processing variable: " + name + " with shape: ", data.shape) 281 | 282 | # reshape conv bias from [n] to [n, 1] 283 | if name == "encoder.conv1.bias" or \ 284 | name == "encoder.conv2.bias": 285 | data = data.reshape(data.shape[0], 1) 286 | print(" Reshaped variable: " + name + " to shape: ", data.shape) 287 | 288 | n_dims = len(data.shape); 289 | 290 | # looks like the whisper models are in f16 by default 291 | # so we need to convert the small tensors to f32 until we fully support f16 in ggml 292 | # ftype == 0 -> float32, ftype == 1 -> float16 293 | ftype = 1; 294 | if use_f16: 295 | if n_dims < 2 or \ 296 | name == "encoder.conv1.bias" or \ 297 | name == "encoder.conv2.bias" or \ 298 | name == "encoder.positional_embedding" or \ 299 | name == "decoder.positional_embedding": 300 | print(" Converting to float32") 301 | data = data.astype(np.float32) 302 | ftype = 0 303 | else: 304 | data = data.astype(np.float32) 305 | ftype = 0 306 | 307 | #if name.startswith("encoder"): 308 | # if name.endswith("mlp.0.weight") or \ 309 | # name.endswith("mlp.2.weight"): 310 | # print(" Transposing") 311 | # data = data.transpose() 312 | 313 | # header 314 | str = name.encode('utf-8') 315 | fout.write(struct.pack("iii", n_dims, len(str), ftype)) 316 | for i in range(n_dims): 317 | fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) 318 | fout.write(str); 319 | 320 | # data 321 | data.tofile(fout) 322 | 323 | fout.close() 324 | 325 | print("Done. 
Output file: " + fname_out) 326 | print("") 327 | -------------------------------------------------------------------------------- /whisper.h: -------------------------------------------------------------------------------- 1 | #ifndef WHISPER_H 2 | #define WHISPER_H 3 | 4 | #include <stdint.h> 5 | #include <stdbool.h> 6 | 7 | #ifdef WHISPER_SHARED 8 | # ifdef _WIN32 9 | # ifdef WHISPER_BUILD 10 | # define WHISPER_API __declspec(dllexport) 11 | # else 12 | # define WHISPER_API __declspec(dllimport) 13 | # endif 14 | # else 15 | # define WHISPER_API __attribute__ ((visibility ("default"))) 16 | # endif 17 | #else 18 | # define WHISPER_API 19 | #endif 20 | 21 | #define WHISPER_SAMPLE_RATE 16000 22 | #define WHISPER_N_FFT 400 23 | #define WHISPER_N_MEL 80 24 | #define WHISPER_HOP_LENGTH 160 25 | #define WHISPER_CHUNK_SIZE 30 26 | 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | // 32 | // C interface 33 | // 34 | // The following interface is thread-safe as long as the same whisper_context is not used by multiple threads 35 | // concurrently. 36 | // 37 | // Basic usage: 38 | // 39 | // #include "whisper.h" 40 | // 41 | // ... 42 | // 43 | // struct whisper_context * ctx = whisper_init("/path/to/ggml-base.en.bin"); 44 | // 45 | // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { 46 | // fprintf(stderr, "failed to process audio\n"); 47 | // return 7; 48 | // } 49 | // 50 | // const int n_segments = whisper_full_n_segments(ctx); 51 | // for (int i = 0; i < n_segments; ++i) { 52 | // const char * text = whisper_full_get_segment_text(ctx, i); 53 | // printf("%s", text); 54 | // } 55 | // 56 | // whisper_free(ctx); 57 | // 58 | // ... 59 | // 60 | // This is a demonstration of the most straightforward usage of the library. 61 | // "pcmf32" contains the RAW audio data in 32-bit floating point format.
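The constants near the top of this header fix the geometry of the audio front-end. A quick back-of-the-envelope check (plain Python, for illustration only — not part of the repo) shows what they imply for one processing chunk:

```python
# constants as #define'd in whisper.h
WHISPER_SAMPLE_RATE = 16000   # Hz
WHISPER_N_FFT       = 400     # FFT window size in samples
WHISPER_N_MEL       = 80      # mel bins per spectrogram frame
WHISPER_HOP_LENGTH  = 160     # samples between frames (10 ms)
WHISPER_CHUNK_SIZE  = 30      # seconds per processing chunk

samples_per_chunk = WHISPER_SAMPLE_RATE * WHISPER_CHUNK_SIZE   # raw samples per chunk
frames_per_chunk  = samples_per_chunk // WHISPER_HOP_LENGTH    # spectrogram frames per chunk
mels_per_chunk    = frames_per_chunk * WHISPER_N_MEL           # mel values fed to the encoder

print(samples_per_chunk, frames_per_chunk, mels_per_chunk)  # 480000 3000 240000
```

So each 30-second chunk becomes 3000 frames of 80 mel bins, which the encoder's strided convolution halves to the 1500-position audio context.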
62 | // 63 | // The interface also allows for more fine-grained control over the computation, but it requires a deeper 64 | // understanding of how the model works. 65 | // 66 | 67 | struct whisper_context; 68 | 69 | typedef int whisper_token; 70 | 71 | typedef struct whisper_token_data { 72 | whisper_token id; // token id 73 | whisper_token tid; // forced timestamp token id 74 | 75 | float p; // probability of the token 76 | float pt; // probability of the timestamp token 77 | float ptsum; // sum of probabilities of all timestamp tokens 78 | 79 | // token-level timestamp data 80 | // do not use if you haven't computed token-level timestamps 81 | int64_t t0; // start time of the token 82 | int64_t t1; // end time of the token 83 | 84 | float vlen; // voice length of the token 85 | } whisper_token_data; 86 | 87 | // Allocates all memory needed for the model and loads the model from the given file. 88 | // Returns NULL on failure. 89 | WHISPER_API struct whisper_context * whisper_init(const char * path_model); 90 | 91 | // Frees all memory allocated by the model. 92 | WHISPER_API void whisper_free(struct whisper_context * ctx); 93 | 94 | // Convert RAW PCM audio to log mel spectrogram. 95 | // The resulting spectrogram is stored inside the provided whisper context. 96 | // Returns 0 on success 97 | WHISPER_API int whisper_pcm_to_mel( 98 | struct whisper_context * ctx, 99 | const float * samples, 100 | int n_samples, 101 | int n_threads); 102 | 103 | // This can be used to set a custom log mel spectrogram inside the provided whisper context. 104 | // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram. 105 | // n_mel must be 80 106 | // Returns 0 on success 107 | WHISPER_API int whisper_set_mel( 108 | struct whisper_context * ctx, 109 | const float * data, 110 | int n_len, 111 | int n_mel); 112 | 113 | // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context. 
114 | // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first. 115 | // offset can be used to specify the offset of the first frame in the spectrogram. 116 | // Returns 0 on success 117 | WHISPER_API int whisper_encode( 118 | struct whisper_context * ctx, 119 | int offset, 120 | int n_threads); 121 | 122 | // Run the Whisper decoder to obtain the logits and probabilities for the next token. 123 | // Make sure to call whisper_encode() first. 124 | // tokens + n_tokens is the provided context for the decoder. 125 | // n_past is the number of tokens to use from previous decoder calls. 126 | // Returns 0 on success 127 | WHISPER_API int whisper_decode( 128 | struct whisper_context * ctx, 129 | const whisper_token * tokens, 130 | int n_tokens, 131 | int n_past, 132 | int n_threads); 133 | 134 | // Token sampling methods. 135 | // These are provided for convenience and can be used after each call to whisper_decode(). 136 | // You can also implement your own sampling method using the whisper_get_probs() function. 137 | // whisper_sample_best() returns the token with the highest probability 138 | // whisper_sample_timestamp() returns the most probable timestamp token 139 | WHISPER_API whisper_token_data whisper_sample_best(struct whisper_context * ctx); 140 | WHISPER_API whisper_token_data whisper_sample_timestamp(struct whisper_context * ctx, bool is_initial); 141 | 142 | // Return the id of the specified language, returns -1 if not found 143 | WHISPER_API int whisper_lang_id(const char * lang); 144 | 145 | WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length 146 | WHISPER_API int whisper_n_vocab (struct whisper_context * ctx); 147 | WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx); 148 | WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx); 149 | 150 | // The probabilities for the next token 151 | WHISPER_API float * whisper_get_probs(struct whisper_context * ctx); 152 | 153 | // Token Id -> String. 
Uses the vocabulary in the provided context 154 | WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token); 155 | 156 | // Special tokens 157 | WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx); 158 | WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx); 159 | WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx); 160 | WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx); 161 | WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx); 162 | WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx); 163 | 164 | // Task tokens 165 | WHISPER_API whisper_token whisper_token_translate (void); 166 | WHISPER_API whisper_token whisper_token_transcribe(void); 167 | 168 | // Performance information 169 | WHISPER_API void whisper_print_timings(struct whisper_context * ctx); 170 | WHISPER_API void whisper_reset_timings(struct whisper_context * ctx); 171 | 172 | // Print system information 173 | WHISPER_API const char * whisper_print_system_info(void); 174 | 175 | //////////////////////////////////////////////////////////////////////////// 176 | 177 | // Available sampling strategies 178 | enum whisper_sampling_strategy { 179 | WHISPER_SAMPLING_GREEDY, // Always select the most probable token 180 | WHISPER_SAMPLING_BEAM_SEARCH, // TODO: not implemented yet! 
181 | }; 182 | 183 | // Text segment callback 184 | // Called on every newly generated text segment 185 | // Use the whisper_full_...() functions to obtain the text segments 186 | typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data); 187 | 188 | // Encoder begin callback 189 | // If not NULL, called before the encoder starts 190 | // If it returns false, the computation is aborted 191 | typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data); 192 | 193 | // Parameters for the whisper_full() function 194 | // If you change the order or add new parameters, make sure to update the default values in whisper.cpp: 195 | // whisper_full_default_params() 196 | struct whisper_full_params { 197 | enum whisper_sampling_strategy strategy; 198 | 199 | int n_threads; 200 | int n_max_text_ctx; 201 | int offset_ms; // start offset in ms 202 | int duration_ms; // audio duration to process in ms 203 | 204 | bool translate; 205 | bool no_context; 206 | bool single_segment; // force single segment output (useful for streaming) 207 | bool print_special; 208 | bool print_progress; 209 | bool print_realtime; 210 | bool print_timestamps; 211 | 212 | // [EXPERIMENTAL] token-level timestamps 213 | bool token_timestamps; // enable token-level timestamps 214 | float thold_pt; // timestamp token probability threshold (~0.01) 215 | float thold_ptsum; // timestamp token sum probability threshold (~0.01) 216 | int max_len; // max segment length in characters 217 | int max_tokens; // max tokens per segment (0 = no limit) 218 | 219 | // [EXPERIMENTAL] speed-up techniques 220 | bool speed_up; // speed-up the audio by 2x using Phase Vocoder 221 | int audio_ctx; // overwrite the audio context size (0 = use default) 222 | 223 | // tokens to provide the whisper model as initial prompt 224 | // these are prepended to any existing text context from a previous call 225 | const whisper_token * prompt_tokens; 226 | int
prompt_n_tokens; 227 | 228 | const char * language; 229 | 230 | struct { 231 | int n_past; 232 | } greedy; 233 | 234 | struct { 235 | int n_past; 236 | int beam_width; 237 | int n_best; 238 | } beam_search; 239 | 240 | whisper_new_segment_callback new_segment_callback; 241 | void * new_segment_callback_user_data; 242 | 243 | whisper_encoder_begin_callback encoder_begin_callback; 244 | void * encoder_begin_callback_user_data; 245 | }; 246 | 247 | WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy); 248 | 249 | // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text 250 | // Uses the specified decoding strategy to obtain the text. 251 | WHISPER_API int whisper_full( 252 | struct whisper_context * ctx, 253 | struct whisper_full_params params, 254 | const float * samples, 255 | int n_samples); 256 | 257 | // Split the input audio in chunks and process each chunk separately using whisper_full() 258 | // It seems this approach can offer some speedup in some cases. 259 | // However, the transcription accuracy can be worse at the beginning and end of each chunk. 260 | WHISPER_API int whisper_full_parallel( 261 | struct whisper_context * ctx, 262 | struct whisper_full_params params, 263 | const float * samples, 264 | int n_samples, 265 | int n_processors); 266 | 267 | // Number of generated text segments. 268 | // A segment can be a few words, a sentence, or even a paragraph. 269 | WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx); 270 | 271 | // Get the start and end time of the specified segment. 272 | WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment); 273 | WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment); 274 | 275 | // Get the text of the specified segment. 
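whisper_full_parallel() divides the input audio between n_processors workers and runs whisper_full() on each chunk. The exact partitioning lives in whisper.cpp; the sketch below only illustrates the idea of an even contiguous split (the remainder handling is our assumption):

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Partition n_samples into n_chunks contiguous (offset, count) ranges,
// assigning any remainder to the last chunk.
std::vector<std::pair<size_t, size_t>> split_samples(size_t n_samples, size_t n_chunks) {
    std::vector<std::pair<size_t, size_t>> ranges;

    const size_t per = n_samples / n_chunks;
    for (size_t i = 0; i < n_chunks; ++i) {
        const size_t off = i * per;
        const size_t cnt = (i + 1 == n_chunks) ? (n_samples - off) : per;
        ranges.emplace_back(off, cnt);
    }

    return ranges;
}
```

Chunk boundaries are why the comment above warns that accuracy can suffer at the beginning and end of each chunk: the decoder loses cross-boundary context.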
276 | WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment); 277 | 278 | // Get number of tokens in the specified segment. 279 | WHISPER_API int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment); 280 | 281 | // Get the token text of the specified token in the specified segment. 282 | WHISPER_API const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token); 283 | WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token); 284 | 285 | // Get token data for the specified token in the specified segment. 286 | // This contains probabilities, timestamps, etc. 287 | WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token); 288 | 289 | // Get the probability of the specified token in the specified segment. 290 | WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token); 291 | 292 | #ifdef __cplusplus 293 | } 294 | #endif 295 | 296 | #endif 297 | -------------------------------------------------------------------------------- /examples/talk.wasm/emscripten.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "gpt-2.h" 3 | #include "whisper.h" 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | constexpr int N_THREAD = 8; 17 | 18 | struct gpt2_context * g_gpt2; 19 | std::vector g_contexts(4, nullptr); 20 | 21 | std::mutex g_mutex; 22 | std::thread g_worker; 23 | std::atomic g_running(false); 24 | 25 | bool g_force_speak = false; 26 | std::string g_text_to_speak = ""; 27 | std::string g_status = ""; 28 | std::string g_status_forced = ""; 29 | 30 | std::vector g_pcmf32; 31 | 32 | std::string to_timestamp(int64_t t) { 33 | int64_t sec = t/100; 34 | int64_t msec = t - 
sec*100; 35 | int64_t min = sec/60; 36 | sec = sec - min*60; 37 | 38 | char buf[32]; 39 | snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec); 40 | 41 | return std::string(buf); 42 | } 43 | 44 | void talk_set_status(const std::string & status) { 45 | std::lock_guard lock(g_mutex); 46 | g_status = status; 47 | } 48 | 49 | void talk_main(size_t index) { 50 | talk_set_status("loading data ..."); 51 | 52 | struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY); 53 | 54 | wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency()); 55 | wparams.offset_ms = 0; 56 | wparams.translate = false; 57 | wparams.no_context = true; 58 | wparams.single_segment = true; 59 | wparams.print_realtime = false; 60 | wparams.print_progress = false; 61 | wparams.print_timestamps = true; 62 | wparams.print_special = false; 63 | 64 | wparams.max_tokens = 32; 65 | wparams.audio_ctx = 768; // partial encoder context for better performance 66 | 67 | wparams.language = "en"; 68 | 69 | g_gpt2 = gpt2_init("gpt-2.bin"); 70 | 71 | printf("talk: using %d threads\n", wparams.n_threads); 72 | 73 | std::vector pcmf32; 74 | 75 | // whisper context 76 | auto & ctx = g_contexts[index]; 77 | 78 | const int64_t step_samples = 2*WHISPER_SAMPLE_RATE; 79 | const int64_t window_samples = 9*WHISPER_SAMPLE_RATE; 80 | const int64_t step_ms = (step_samples*1000)/WHISPER_SAMPLE_RATE; 81 | 82 | auto t_last = std::chrono::high_resolution_clock::now(); 83 | 84 | talk_set_status("listening ..."); 85 | 86 | while (g_running) { 87 | 88 | const auto t_now = std::chrono::high_resolution_clock::now(); 89 | if (std::chrono::duration_cast(t_now - t_last).count() < step_ms) { 90 | { 91 | std::lock_guard lock(g_mutex); 92 | g_pcmf32.clear(); 93 | } 94 | std::this_thread::sleep_for(std::chrono::milliseconds(10)); 95 | continue; 96 | } 97 | 98 | talk_set_status("listening ..."); 99 | 100 | { 101 | std::unique_lock 
<std::mutex> lock(g_mutex); 102 | 103 | if (g_pcmf32.size() < step_samples) { 104 | lock.unlock(); 105 | 106 | std::this_thread::sleep_for(std::chrono::milliseconds(10)); 107 | 108 | continue; 109 | } 110 | 111 | pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end()); 112 | } 113 | 114 | // VAD: if the energy during the last second is above the threshold, then skip 115 | { 116 | float energy_all = 0.0f; 117 | float energy_1s = 0.0f; 118 | 119 | for (size_t i = 0; i < pcmf32.size(); i++) { 120 | energy_all += fabsf(pcmf32[i]); 121 | 122 | if (i >= pcmf32.size() - WHISPER_SAMPLE_RATE) { 123 | energy_1s += fabsf(pcmf32[i]); 124 | } 125 | } 126 | 127 | energy_all /= pcmf32.size(); 128 | energy_1s /= WHISPER_SAMPLE_RATE; 129 | 130 | if (energy_1s > 0.1f*energy_all && !g_force_speak) { 131 | std::this_thread::sleep_for(std::chrono::milliseconds(10)); 132 | continue; 133 | } 134 | } 135 | 136 | talk_set_status("processing audio (whisper)..."); 137 | 138 | t_last = t_now; 139 | 140 | if (!g_force_speak) { 141 | const auto t_start = std::chrono::high_resolution_clock::now(); 142 | 143 | int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()); 144 | if (ret != 0) { 145 | printf("whisper_full() failed: %d\n", ret); 146 | break; 147 | } 148 | 149 | const auto t_end = std::chrono::high_resolution_clock::now(); 150 | 151 | printf("whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration(t_end - t_start).count()); 152 | } 153 | 154 | { 155 | std::string text_heard; 156 | 157 | if (!g_force_speak) { 158 | const int n_segments = whisper_full_n_segments(ctx); 159 | for (int i = n_segments - 1; i < n_segments; ++i) { 160 | const char * text = whisper_full_get_segment_text(ctx, i); 161 | 162 | const int64_t t0 = whisper_full_get_segment_t0(ctx, i); 163 | const int64_t t1 = whisper_full_get_segment_t1(ctx, i); 164 | 165 | printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text); 166 | 167 | text_heard
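The VAD block above decides whether the speaker has finished by comparing the mean absolute amplitude of the trailing second against the mean over the whole window. A self-contained version of that heuristic, here returning true once speech appears to have stopped (the function name and the inverted return sense are ours):

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Energy-based end-of-speech check: true when the mean |amplitude| of the
// last n_last samples drops to at most `thold` times the mean over the
// whole buffer.
bool voice_stopped(const std::vector<float> & pcmf32, size_t n_last, float thold) {
    if (n_last == 0 || pcmf32.size() < n_last) {
        return false; // not enough audio to decide
    }

    float energy_all  = 0.0f;
    float energy_last = 0.0f;

    for (size_t i = 0; i < pcmf32.size(); i++) {
        energy_all += fabsf(pcmf32[i]);
        if (i >= pcmf32.size() - n_last) {
            energy_last += fabsf(pcmf32[i]);
        }
    }

    energy_all  /= pcmf32.size();
    energy_last /= n_last;

    return energy_last <= thold * energy_all;
}
```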
+= text; 168 | } 169 | } 170 | 171 | g_force_speak = false; 172 | 173 | // remove text between brackets using regex 174 | { 175 | std::regex re("\\[.*?\\]"); 176 | text_heard = std::regex_replace(text_heard, re, ""); 177 | } 178 | 179 | // remove text between parentheses using regex 180 | { 181 | std::regex re("\\(.*?\\)"); 182 | text_heard = std::regex_replace(text_heard, re, ""); 183 | } 184 | 185 | // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' ' 186 | text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), ""); 187 | 188 | // take first line 189 | text_heard = text_heard.substr(0, text_heard.find_first_of("\n")); 190 | 191 | // remove leading and trailing whitespace 192 | text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), ""); 193 | text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), ""); 194 | 195 | talk_set_status("'" + text_heard + "' - thinking how to respond (gpt-2) ..."); 196 | 197 | const std::vector tokens = gpt2_tokenize(g_gpt2, text_heard.c_str()); 198 | 199 | printf("whisper: number of tokens: %d, '%s'\n", (int) tokens.size(), text_heard.c_str()); 200 | 201 | std::string text_to_speak; 202 | std::string prompt_base; 203 | 204 | { 205 | std::lock_guard<std::mutex> lock(g_mutex); 206 | prompt_base = gpt2_get_prompt(g_gpt2); 207 | } 208 | 209 | if (tokens.size() > 0) { 210 | text_to_speak = gpt2_gen_text(g_gpt2, (prompt_base + text_heard + "\n").c_str(), 32); 211 | text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), ""); 212 | text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n")); 213 | 214 | std::lock_guard<std::mutex> lock(g_mutex); 215 | 216 | // remove first 2 lines of base prompt 217 | { 218 | const size_t pos = prompt_base.find_first_of("\n"); 219 | if (pos != std::string::npos) { 220 | prompt_base = prompt_base.substr(pos + 1); 221 | } 222 | } 223 | { 224 | const size_t pos =
prompt_base.find_first_of("\n"); 225 | if (pos != std::string::npos) { 226 | prompt_base = prompt_base.substr(pos + 1); 227 | } 228 | } 229 | prompt_base += text_heard + "\n" + text_to_speak + "\n"; 230 | } else { 231 | text_to_speak = gpt2_gen_text(g_gpt2, prompt_base.c_str(), 32); 232 | text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), ""); 233 | text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n")); 234 | 235 | std::lock_guard lock(g_mutex); 236 | 237 | const size_t pos = prompt_base.find_first_of("\n"); 238 | if (pos != std::string::npos) { 239 | prompt_base = prompt_base.substr(pos + 1); 240 | } 241 | prompt_base += text_to_speak + "\n"; 242 | } 243 | 244 | printf("gpt-2: %s\n", text_to_speak.c_str()); 245 | 246 | //printf("========================\n"); 247 | //printf("gpt-2: prompt_base:\n'%s'\n", prompt_base.c_str()); 248 | //printf("========================\n"); 249 | 250 | { 251 | std::lock_guard lock(g_mutex); 252 | t_last = std::chrono::high_resolution_clock::now(); 253 | g_text_to_speak = text_to_speak; 254 | g_pcmf32.clear(); 255 | gpt2_set_prompt(g_gpt2, prompt_base.c_str()); 256 | } 257 | 258 | talk_set_status("speaking ..."); 259 | } 260 | } 261 | 262 | gpt2_free(g_gpt2); 263 | 264 | if (index < g_contexts.size()) { 265 | whisper_free(g_contexts[index]); 266 | g_contexts[index] = nullptr; 267 | } 268 | } 269 | 270 | EMSCRIPTEN_BINDINGS(talk) { 271 | emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { 272 | for (size_t i = 0; i < g_contexts.size(); ++i) { 273 | if (g_contexts[i] == nullptr) { 274 | g_contexts[i] = whisper_init(path_model.c_str()); 275 | if (g_contexts[i] != nullptr) { 276 | g_running = true; 277 | if (g_worker.joinable()) { 278 | g_worker.join(); 279 | } 280 | g_worker = std::thread([i]() { 281 | talk_main(i); 282 | }); 283 | 284 | return i + 1; 285 | } else { 286 | return (size_t) 0; 287 | } 288 | } 289 | } 290 | 291 | 
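The init binding above hands JavaScript the 1-based value `i + 1` so that 0 can signal failure, and set_audio undoes the offset with `--index`. The same convention in isolation (a toy sketch, not the binding code itself):

```cpp
#include <cstddef>
#include <vector>

// Toy handle table using the bindings' 1-based convention: acquire()
// returns index + 1 (0 means "no free slot"); get() subtracts 1 and
// range-checks, so handle 0 and out-of-range handles are rejected.
struct HandleTable {
    std::vector<void *> slots;

    explicit HandleTable(size_t n) : slots(n, nullptr) {}

    size_t acquire(void * p) {
        for (size_t i = 0; i < slots.size(); ++i) {
            if (slots[i] == nullptr) {
                slots[i] = p;
                return i + 1;
            }
        }
        return 0; // table full
    }

    void * get(size_t handle) const {
        const size_t index = handle - 1; // wraps to SIZE_MAX for handle == 0
        return index < slots.size() ? slots[index] : nullptr;
    }
};
```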
return (size_t) 0; 292 | })); 293 | 294 | emscripten::function("free", emscripten::optional_override([](size_t index) { 295 | if (g_running) { 296 | g_running = false; 297 | } 298 | })); 299 | 300 | emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) { 301 | --index; 302 | 303 | if (index >= g_contexts.size()) { 304 | return -1; 305 | } 306 | 307 | if (g_contexts[index] == nullptr) { 308 | return -2; 309 | } 310 | 311 | { 312 | std::lock_guard lock(g_mutex); 313 | const int n = audio["length"].as(); 314 | 315 | emscripten::val heap = emscripten::val::module_property("HEAPU8"); 316 | emscripten::val memory = heap["buffer"]; 317 | 318 | g_pcmf32.resize(n); 319 | 320 | emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast(g_pcmf32.data()), n); 321 | memoryView.call("set", audio); 322 | } 323 | 324 | return 0; 325 | })); 326 | 327 | emscripten::function("force_speak", emscripten::optional_override([](size_t index) { 328 | { 329 | std::lock_guard lock(g_mutex); 330 | g_force_speak = true; 331 | } 332 | })); 333 | 334 | emscripten::function("get_text_context", emscripten::optional_override([]() { 335 | std::string text_context; 336 | 337 | { 338 | std::lock_guard lock(g_mutex); 339 | text_context = gpt2_get_prompt(g_gpt2); 340 | } 341 | 342 | return text_context; 343 | })); 344 | 345 | emscripten::function("get_text_to_speak", emscripten::optional_override([]() { 346 | std::string text_to_speak; 347 | 348 | { 349 | std::lock_guard lock(g_mutex); 350 | text_to_speak = std::move(g_text_to_speak); 351 | } 352 | 353 | return text_to_speak; 354 | })); 355 | 356 | emscripten::function("get_status", emscripten::optional_override([]() { 357 | std::string status; 358 | 359 | { 360 | std::lock_guard lock(g_mutex); 361 | status = g_status_forced.empty() ? 
g_status : g_status_forced; 362 | } 363 | 364 | return status; 365 | })); 366 | 367 | emscripten::function("set_status", emscripten::optional_override([](const std::string & status) { 368 | { 369 | std::lock_guard lock(g_mutex); 370 | g_status_forced = status; 371 | } 372 | })); 373 | 374 | emscripten::function("set_prompt", emscripten::optional_override([](const std::string & prompt) { 375 | { 376 | std::lock_guard lock(g_mutex); 377 | gpt2_set_prompt(g_gpt2, prompt.c_str()); 378 | } 379 | })); 380 | } 381 | -------------------------------------------------------------------------------- /examples/command.wasm/emscripten.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "whisper.h" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | constexpr int N_THREAD = 8; 16 | 17 | std::vector g_contexts(4, nullptr); 18 | 19 | std::mutex g_mutex; 20 | std::thread g_worker; 21 | 22 | std::atomic g_running(false); 23 | 24 | std::string g_status = ""; 25 | std::string g_status_forced = ""; 26 | std::string g_transcribed = ""; 27 | 28 | std::vector g_pcmf32; 29 | 30 | static std::string trim(const std::string & s) { 31 | std::regex e("^\\s+|\\s+$"); 32 | return std::regex_replace(s, e, ""); 33 | } 34 | 35 | static void high_pass_filter(std::vector & data, float cutoff, float sample_rate) { 36 | const float rc = 1.0f / (2.0f * M_PI * cutoff); 37 | const float dt = 1.0f / sample_rate; 38 | const float alpha = dt / (rc + dt); 39 | 40 | float y = data[0]; 41 | 42 | for (size_t i = 1; i < data.size(); i++) { 43 | y = alpha * (y + data[i] - data[i - 1]); 44 | data[i] = y; 45 | } 46 | } 47 | 48 | // compute similarity between two strings using Levenshtein distance 49 | static float similarity(const std::string & s0, const std::string & s1) { 50 | const size_t len0 = s0.size() + 1; 51 | const size_t len1 = s1.size() + 1; 52 | 53 | 
std::vector col(len1, 0); 54 | std::vector prevCol(len1, 0); 55 | 56 | for (size_t i = 0; i < len1; i++) { 57 | prevCol[i] = i; 58 | } 59 | 60 | for (size_t i = 0; i < len0; i++) { 61 | col[0] = i; 62 | for (size_t j = 1; j < len1; j++) { 63 | col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1)); 64 | } 65 | col.swap(prevCol); 66 | } 67 | 68 | const float dist = prevCol[len1 - 1]; 69 | 70 | return 1.0f - (dist / std::max(s0.size(), s1.size())); 71 | } 72 | 73 | void command_set_status(const std::string & status) { 74 | std::lock_guard lock(g_mutex); 75 | g_status = status; 76 | } 77 | 78 | bool command_vad_simple(std::vector & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) { 79 | const int n_samples = pcmf32.size(); 80 | const int n_samples_last = (sample_rate * last_ms) / 1000; 81 | 82 | if (n_samples_last >= n_samples) { 83 | // not enough samples - assume no speech 84 | return false; 85 | } 86 | 87 | if (freq_thold > 0.0f) { 88 | high_pass_filter(pcmf32, freq_thold, sample_rate); 89 | } 90 | 91 | float energy_all = 0.0f; 92 | float energy_last = 0.0f; 93 | 94 | for (size_t i = 0; i < n_samples; i++) { 95 | energy_all += fabsf(pcmf32[i]); 96 | 97 | if (i >= n_samples - n_samples_last) { 98 | energy_last += fabsf(pcmf32[i]); 99 | } 100 | } 101 | 102 | energy_all /= n_samples; 103 | energy_last /= n_samples_last; 104 | 105 | if (verbose) { 106 | fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold); 107 | } 108 | 109 | if (energy_last > vad_thold*energy_all) { 110 | return false; 111 | } 112 | 113 | return true; 114 | } 115 | 116 | std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector & pcmf32, float & prob, int64_t & t_ms) { 117 | const auto t_start = std::chrono::high_resolution_clock::now(); 118 | 119 | prob = 0.0f; 120 | 
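similarity() above computes a two-row Levenshtein edit distance and normalizes it to [0, 1]. A self-contained version of the same algorithm — we start the outer loop at i = 1 so the comparison `s0[i - 1]` never indexes before the string, and we guard the empty/empty case (both adjustments are ours):

```cpp
#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

// Levenshtein-based similarity: 1.0 for identical strings, 0.0 for
// completely different ones. Two rolling rows keep memory at O(len1).
float similarity(const std::string & s0, const std::string & s1) {
    if (s0.empty() && s1.empty()) {
        return 1.0f;
    }

    const size_t len0 = s0.size() + 1;
    const size_t len1 = s1.size() + 1;

    std::vector<int> col(len1, 0);
    std::vector<int> prevCol(len1, 0);

    for (size_t j = 0; j < len1; j++) {
        prevCol[j] = (int) j;
    }

    for (size_t i = 1; i < len0; i++) {
        col[0] = (int) i;
        for (size_t j = 1; j < len1; j++) {
            col[j] = std::min({ 1 + col[j - 1],
                                1 + prevCol[j],
                                prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1) });
        }
        col.swap(prevCol);
    }

    const float dist = (float) prevCol[len1 - 1];

    return 1.0f - dist / std::max(s0.size(), s1.size());
}
```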
t_ms = 0; 121 | 122 | if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { 123 | return ""; 124 | } 125 | 126 | int prob_n = 0; 127 | std::string result; 128 | 129 | const int n_segments = whisper_full_n_segments(ctx); 130 | for (int i = 0; i < n_segments; ++i) { 131 | const char * text = whisper_full_get_segment_text(ctx, i); 132 | 133 | result += text; 134 | 135 | const int n_tokens = whisper_full_n_tokens(ctx, i); 136 | for (int j = 0; j < n_tokens; ++j) { 137 | const auto token = whisper_full_get_token_data(ctx, i, j); 138 | 139 | prob += token.p; 140 | ++prob_n; 141 | } 142 | } 143 | 144 | if (prob_n > 0) { 145 | prob /= prob_n; 146 | } 147 | 148 | const auto t_end = std::chrono::high_resolution_clock::now(); 149 | t_ms = std::chrono::duration_cast(t_end - t_start).count(); 150 | 151 | return result; 152 | } 153 | 154 | void command_get_audio(int ms, int sample_rate, std::vector & audio) { 155 | const int64_t n_samples = (ms * sample_rate) / 1000; 156 | 157 | int64_t n_take = 0; 158 | if (g_pcmf32.size() < n_samples) { 159 | n_take = g_pcmf32.size(); 160 | } else { 161 | n_take = n_samples; 162 | } 163 | 164 | audio.resize(n_take); 165 | std::copy(g_pcmf32.end() - n_take, g_pcmf32.end(), audio.begin()); 166 | } 167 | 168 | void command_main(size_t index) { 169 | command_set_status("loading data ..."); 170 | 171 | struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY); 172 | 173 | wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency()); 174 | wparams.offset_ms = 0; 175 | wparams.translate = false; 176 | wparams.no_context = true; 177 | wparams.single_segment = true; 178 | wparams.print_realtime = false; 179 | wparams.print_progress = false; 180 | wparams.print_timestamps = true; 181 | wparams.print_special = false; 182 | 183 | wparams.max_tokens = 32; 184 | wparams.audio_ctx = 768; // partial encoder context for better performance 185 | 186 | wparams.language 
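command_get_audio() copies at most the requested number of the newest samples out of the shared capture buffer, or the whole buffer if it is shorter. The same take-the-tail logic in isolation (the helper name is ours):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Return the newest n_samples of a growing capture buffer, or the whole
// buffer when fewer samples are available.
std::vector<float> take_last(const std::vector<float> & buf, size_t n_samples) {
    const size_t n_take = std::min(buf.size(), n_samples);

    std::vector<float> out(n_take);
    std::copy(buf.end() - (std::ptrdiff_t) n_take, buf.end(), out.begin());

    return out;
}
```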
= "en"; 187 | 188 | printf("command: using %d threads\n", wparams.n_threads); 189 | 190 | bool is_running = true; 191 | bool have_prompt = false; 192 | bool ask_prompt = true; 193 | bool print_energy = false; 194 | 195 | float prob0 = 0.0f; 196 | float prob = 0.0f; 197 | 198 | std::vector pcmf32_cur; 199 | std::vector pcmf32_prompt; 200 | 201 | const std::string k_prompt = "Ok Whisper, start listening for commands."; 202 | 203 | // whisper context 204 | auto & ctx = g_contexts[index]; 205 | 206 | const int32_t vad_ms = 2000; 207 | const int32_t prompt_ms = 5000; 208 | const int32_t command_ms = 4000; 209 | 210 | const float vad_thold = 0.1f; 211 | const float freq_thold = -1.0f; 212 | 213 | while (g_running) { 214 | // delay 215 | std::this_thread::sleep_for(std::chrono::milliseconds(100)); 216 | 217 | if (ask_prompt) { 218 | fprintf(stdout, "\n"); 219 | fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m"); 220 | fprintf(stdout, "\n"); 221 | 222 | { 223 | char txt[1024]; 224 | snprintf(txt, sizeof(txt), "Say the following phrase: '%s'", k_prompt.c_str()); 225 | command_set_status(txt); 226 | } 227 | 228 | ask_prompt = false; 229 | } 230 | 231 | int64_t t_ms = 0; 232 | 233 | { 234 | command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur); 235 | 236 | if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) { 237 | fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__); 238 | command_set_status("Speech detected! 
Processing ..."); 239 | 240 | if (!have_prompt) { 241 | command_get_audio(prompt_ms, WHISPER_SAMPLE_RATE, pcmf32_cur); 242 | 243 | const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob0, t_ms)); 244 | 245 | fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms); 246 | 247 | const float sim = similarity(txt, k_prompt); 248 | 249 | if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) { 250 | fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__); 251 | ask_prompt = true; 252 | } else { 253 | fprintf(stdout, "\n"); 254 | fprintf(stdout, "%s: The prompt has been recognized!\n", __func__); 255 | fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__); 256 | fprintf(stdout, "\n"); 257 | 258 | { 259 | char txt[1024]; 260 | snprintf(txt, sizeof(txt), "Success! Waiting for voice commands ..."); 261 | command_set_status(txt); 262 | } 263 | 264 | // save the audio for the prompt 265 | pcmf32_prompt = pcmf32_cur; 266 | have_prompt = true; 267 | } 268 | } else { 269 | command_get_audio(command_ms, WHISPER_SAMPLE_RATE, pcmf32_cur); 270 | 271 | // prepend the prompt audio 272 | pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end()); 273 | 274 | const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob, t_ms)); 275 | 276 | prob = 100.0f*(prob - prob0); 277 | 278 | fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str()); 279 | 280 | // find the prompt in the text 281 | float best_sim = 0.0f; 282 | size_t best_len = 0; 283 | for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) { 284 | const auto prompt = txt.substr(0, n); 285 | 286 | const float sim = similarity(prompt, k_prompt); 287 | 288 | //fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim); 289 | 290 | if (sim > best_sim) { 291 | best_sim = sim; 292 | best_len = n; 293 | } 294 | } 295 | 296 
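To strip the activation phrase from the transcription, the loop above tries prefix lengths from 0.8x to 1.2x the prompt length and keeps the most similar one. A standalone sketch of that search, parameterized over the similarity function (the helper name and the clamp to the text length are ours):

```cpp
#include <algorithm>
#include <cstddef>
#include <string>

// Scan candidate prefix lengths around the expected prompt length and
// return the one whose prefix best matches the prompt; the caller then
// drops that many characters to isolate the command. `sim` returns 1.0
// for identical strings.
template <typename Sim>
size_t best_prompt_len(const std::string & txt, const std::string & prompt, Sim sim) {
    float  best_sim = 0.0f;
    size_t best_len = 0;

    const size_t lo = (size_t) (0.8 * prompt.size());
    const size_t hi = std::min(txt.size(), (size_t) (1.2 * prompt.size()));

    for (size_t n = lo; n <= hi; ++n) {
        const float s = sim(txt.substr(0, n), prompt);
        if (s > best_sim) {
            best_sim = s;
            best_len = n;
        }
    }

    return best_len;
}
```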
| const std::string command = ::trim(txt.substr(best_len)); 297 | 298 | fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms); 299 | fprintf(stdout, "\n"); 300 | 301 | { 302 | char txt[1024]; 303 | snprintf(txt, sizeof(txt), "Command '%s', (t = %d ms)", command.c_str(), (int) t_ms); 304 | command_set_status(txt); 305 | } 306 | { 307 | std::lock_guard lock(g_mutex); 308 | g_transcribed = command; 309 | } 310 | } 311 | 312 | g_pcmf32.clear(); 313 | } 314 | } 315 | } 316 | 317 | if (index < g_contexts.size()) { 318 | whisper_free(g_contexts[index]); 319 | g_contexts[index] = nullptr; 320 | } 321 | } 322 | 323 | EMSCRIPTEN_BINDINGS(command) { 324 | emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { 325 | for (size_t i = 0; i < g_contexts.size(); ++i) { 326 | if (g_contexts[i] == nullptr) { 327 | g_contexts[i] = whisper_init(path_model.c_str()); 328 | if (g_contexts[i] != nullptr) { 329 | g_running = true; 330 | if (g_worker.joinable()) { 331 | g_worker.join(); 332 | } 333 | g_worker = std::thread([i]() { 334 | command_main(i); 335 | }); 336 | 337 | return i + 1; 338 | } else { 339 | return (size_t) 0; 340 | } 341 | } 342 | } 343 | 344 | return (size_t) 0; 345 | })); 346 | 347 | emscripten::function("free", emscripten::optional_override([](size_t index) { 348 | if (g_running) { 349 | g_running = false; 350 | } 351 | })); 352 | 353 | emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) { 354 | --index; 355 | 356 | if (index >= g_contexts.size()) { 357 | return -1; 358 | } 359 | 360 | if (g_contexts[index] == nullptr) { 361 | return -2; 362 | } 363 | 364 | { 365 | std::lock_guard lock(g_mutex); 366 | const int n = audio["length"].as(); 367 | 368 | emscripten::val heap = emscripten::val::module_property("HEAPU8"); 369 | emscripten::val memory = heap["buffer"]; 370 | 371 | g_pcmf32.resize(n); 372 | 373 | 
emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast(g_pcmf32.data()), n); 374 | memoryView.call("set", audio); 375 | } 376 | 377 | return 0; 378 | })); 379 | 380 | emscripten::function("get_transcribed", emscripten::optional_override([]() { 381 | std::string transcribed; 382 | 383 | { 384 | std::lock_guard lock(g_mutex); 385 | transcribed = std::move(g_transcribed); 386 | } 387 | 388 | return transcribed; 389 | })); 390 | 391 | emscripten::function("get_status", emscripten::optional_override([]() { 392 | std::string status; 393 | 394 | { 395 | std::lock_guard lock(g_mutex); 396 | status = g_status_forced.empty() ? g_status : g_status_forced; 397 | } 398 | 399 | return status; 400 | })); 401 | 402 | emscripten::function("set_status", emscripten::optional_override([](const std::string & status) { 403 | { 404 | std::lock_guard lock(g_mutex); 405 | g_status_forced = status; 406 | } 407 | })); 408 | } 409 | -------------------------------------------------------------------------------- /examples/stream.wasm/index-tmpl.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | stream : Real-time Whisper transcription in WebAssembly 5 | 6 | 27 | 28 | 29 |
30 | stream : Real-time Whisper transcription in WebAssembly 31 | 32 |

33 | 34 | You can find more about this project on GitHub. 35 | 36 |

37 | 38 |
39 | 40 | Select the model you would like to use, click the "Start" button and start speaking 41 | 42 |

43 | 44 |
45 | Whisper model: 46 | 47 | 48 | 49 | 50 | 53 |
54 | 55 |
56 | 57 |
58 | 59 | 60 | 61 |
62 | 63 |
64 | 65 |
66 | Status: not started 67 | 68 |
[The transcribed text will be displayed here]
69 |
70 | 71 |
72 | 73 | Debug output: 74 | 75 | 76 |
77 | 78 | Troubleshooting 79 | 80 |

81 | 82 | The page does some heavy computations, so make sure: 83 | 84 |
    85 |   • To use a modern web browser (e.g. Chrome, Firefox)
    86 |   • To use a fast desktop or laptop computer (i.e. not a mobile phone)
    87 |   • Your browser supports WASM Fixed-width SIMD
    88 |
89 | 90 |
91 | 92 | | 93 | Build time: @GIT_DATE@ | 94 | Commit hash: @GIT_SHA1@ | 95 | Commit subject: @GIT_COMMIT_SUBJECT@ | 96 | Source Code | 97 | 98 |
99 |
100 | 101 | 102 | 384 | 385 | 386 | 387 | --------------------------------------------------------------------------------