├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── README.md
├── examples
│   ├── CMakeLists.txt
│   ├── complex
│   │   ├── CMakeLists.txt
│   │   ├── complex.cpp
│   │   ├── complex.qml
│   │   └── resources.qrc
│   └── simple
│       ├── CMakeLists.txt
│       ├── resources.qrc
│       ├── simple.cpp
│       └── simple.qml
├── models
│   ├── ggml-tiny-en-q4-0.bin
│   └── models.qrc
├── src
│   ├── QmlMacros.h
│   ├── SpeechToText.cpp
│   ├── SpeechToText.h
│   ├── VoiceActivityDetector.cpp
│   ├── VoiceActivityDetector.h
│   ├── WhisperBackend.cpp
│   ├── WhisperBackend.h
│   └── private
│       └── quantization.h
└── tests
    ├── CMakeLists.txt
    ├── bin
    │   ├── quantize.exe
    │   └── whisper.dll
    └── tst_quant.cpp

/.gitignore:
--------------------------------------------------------------------------------
1 | *.user
2 | /build
3 | /third-party
4 | 
5 | 
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "whisper.cpp"]
2 | path = whisper.cpp
3 | url = https://github.com/ggerganov/whisper.cpp.git
4 | 
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.1)
2 | project(qt-whisper LANGUAGES CXX)
3 | set(CMAKE_AUTOMOC OFF)
4 | set(CMAKE_AUTORCC OFF)
5 | set(CMAKE_CXX_STANDARD 20)
6 | 
7 | set(QT_WHISPER_TARGET qt-whisper)
8 | set(QT_WHISPER_LIB ${QT_WHISPER_TARGET})
9 | option(QT_WHISPER_EMBED_MODEL "Embed the compressed model weights into the library" OFF)
10 | 
11 | add_subdirectory(whisper.cpp)
12 | 
13 | 
14 | find_package(QT NAMES Qt5 Qt6 COMPONENTS Core Multimedia Concurrent Quick REQUIRED)
15 | find_package(Qt${QT_VERSION_MAJOR} COMPONENTS Core Multimedia Concurrent Quick REQUIRED)
16 | 
17 | 
18 | file(GLOB SOURCE_CPP
19 |     "src/*.cpp" "src/*.h" "src/private/*.h"
20 | )
21 | if(QT_WHISPER_EMBED_MODEL)
22 |     set(SOURCE_RCC "models/models.qrc")
23 | endif()
24 | 
25 | qt_add_library(${QT_WHISPER_TARGET} STATIC MANUAL_FINALIZATION ${SOURCE_CPP} ${SOURCE_RCC}
26 | )
27 | 
28 | 
29 | target_link_libraries(${QT_WHISPER_TARGET} PRIVATE whisper)
30 | target_link_libraries(${QT_WHISPER_TARGET} PUBLIC Qt6::Core Qt6::Multimedia Qt6::Concurrent Qt6::Quick)
31 | target_include_directories(${QT_WHISPER_TARGET} INTERFACE "src" "whisper.cpp")
32 | target_include_directories(${QT_WHISPER_TARGET} PRIVATE "src/private")
33 | 
34 | set_property(TARGET ${QT_WHISPER_TARGET} PROPERTY AUTOMOC ON)
35 | set_property(TARGET ${QT_WHISPER_TARGET} PROPERTY AUTORCC ${QT_WHISPER_EMBED_MODEL})
36 | if(${QT_WHISPER_EMBED_MODEL})
37 |     target_compile_definitions(${QT_WHISPER_TARGET} PRIVATE EMBED_MODEL)
38 | endif()
39 | qt_finalize_target(${QT_WHISPER_TARGET})
40 | 
41 | 
42 | # Add examples and tests if built as a standalone project
43 | if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
44 |     macro(download_whisper_tiny)
45 |         file(DOWNLOAD "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin" ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny.bin SHOW_PROGRESS EXPECTED_HASH SHA256=be07e048e1e599ad46341c8d2a135645097a538221678b7acdd1b1919c6e1b21)
46 |     endmacro()
47 |     add_subdirectory(examples)
48 |     add_subdirectory(tests)
49 | endif()
50 | 
51 | 
52 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE,
REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 | 
189 | Copyright [yyyy] [name of copyright owner]
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Qt Whisper
2 | This project is a Qt & QML wrapper for [whisper.cpp](https://github.com/ggerganov/whisper.cpp) - a high-performance library for [OpenAI's Whisper](https://github.com/openai/whisper) inference.
3 | 
4 | ## Value added
5 | While whisper.cpp provides the framework for Whisper model inference, its framework-agnostic nature leaves it to the programmer to write the wrapper code needed to use Whisper in an actual application.
6 | 
7 | This project provides a ready-to-use QML object that performs inference away from the GUI thread. Note that while the project is functional, some features are still a work in progress:
8 | :heavy_check_mark: Threaded inference - the GUI thread is never blocked while the model runs.
9 | :heavy_check_mark: Voice Activity Detection - audio capture waits for speech to start and stops automatically after speech ends.
10 | :heavy_check_mark: Embedded small model - the library can be built with a small model embedded into the binary for quick prototyping.
11 | :warning: VAD ML models - no readily available ML models that are easy to embed into an application; a simple energy-based detector is implemented instead.
12 | :heavy_check_mark: Model quantization - models can be quantized and reloaded at runtime.
13 | :x: Building QML plugin
14 | 
15 | ## Usage
16 | Pull the repo in as a submodule:
17 | ```
18 | git submodule add https://github.com/Ugibugi/qt-whisper.git
19 | git submodule update --init --recursive
20 | ```
21 | Then in your CMakeLists.txt:
22 | 
23 | ```cmake
24 | add_subdirectory(qt-whisper)
25 | ...
26 | add_executable(mytarget ...)
27 | target_link_libraries(mytarget PRIVATE ${QT_WHISPER_LIB} ...)
28 | 
29 | ```
30 | 
31 | Then register the type in your main.cpp (to be removed once QML plugin support is added):
32 | 
33 | ```cpp
34 | #include <SpeechToText.h>
35 | ...
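// The elided part is the usual QGuiApplication / QQmlApplicationEngine boilerplate;
// examples/simple/simple.cpp in this repository shows the complete file.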
36 | // in int main()
37 | qmlRegisterType<SpeechToText>("qtwhisper", 1, 0, "SpeechToText");
38 | 
39 | ```
40 | 
41 | And it's ready to use:
42 | 
43 | ```qml
44 | import QtQuick
45 | import QtQuick.Controls
46 | import qtwhisper
47 | 
48 | ApplicationWindow {
49 |     visible: true
50 |     width: 800
51 |     height: 600
52 |     Button {
53 |         text: "Start"
54 |         // Check the current state of inference
55 |         enabled: stt.state === SpeechToText.Ready
56 |         onClicked: {
57 |             // Start listening for speech - inference runs once speech is detected and
58 |             // capture stops automatically after you stop speaking
59 |             stt.start()
60 |         }
61 |     }
62 |     SpeechToText {
63 |         id: stt
64 |         onResultReady: function (recognisedSpeech) {
65 |             console.log(recognisedSpeech) // print out the result
66 |         }
67 |     }
68 | }
69 | ```
70 | 
71 | See the examples folder for more in-depth usage.
72 | 
73 | ## Notes on usage
74 | ### Read this if the app crashes when trying to run inference
75 | whisper.cpp uses vector instruction sets that may not be supported by your device. Pass one or more of the whisper.cpp CMake flags `WHISPER_NO_AVX2`, `WHISPER_NO_AVX`, `WHISPER_NO_F16C`, `WHISPER_NO_FMA` to disable the corresponding instructions.
--------------------------------------------------------------------------------
/examples/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(simple)
2 | add_subdirectory(complex)
3 | 
--------------------------------------------------------------------------------
/examples/complex/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(complex complex.cpp resources.qrc)
2 | set_target_properties(complex PROPERTIES AUTORCC ON AUTOMOC ON)
3 | target_link_libraries(complex PRIVATE Qt6::Core Qt6::Quick ${QT_WHISPER_TARGET})
4 | download_whisper_tiny()
5 | 
--------------------------------------------------------------------------------
/examples/complex/complex.cpp:
--------------------------------------------------------------------------------
1 | #include <QGuiApplication>
2 | #include <QQmlApplicationEngine>
3 | 
4 | #include <SpeechToText.h>
5 | 
6 | int main(int argc, char *argv[])
7 | {
8 |     QGuiApplication app(argc, argv);
9 | 
10 |     QQmlApplicationEngine engine;
11 | 
12 | 
13 |     const QUrl url(QStringLiteral("qrc:complex.qml"));
14 | 
15 |     qmlRegisterType<SpeechToText>("qtwhisper", 1, 0, "SpeechToText");
16 |     qmlRegisterUncreatableType<WhisperInfo>("qtwhisper", 1, 0, "WhisperInfo", "");
17 | 
18 |     // QML startup
19 |     QObject::connect(&engine, &QQmlApplicationEngine::objectCreated,
20 |         &app, [url](QObject *obj, const QUrl &objUrl){
21 |         if (!obj && url == objUrl)
22 |             QCoreApplication::exit(-1);
23 |     }, Qt::QueuedConnection);
24 | 
25 |     engine.load(url);
26 |     return app.exec();
27 | }
28 | 
--------------------------------------------------------------------------------
/examples/complex/complex.qml:
--------------------------------------------------------------------------------
1 | import QtQuick
2 | import QtQuick.Controls.Material
3 | import QtQuick.Layouts
4 | import qtwhisper
5 | 
6 | ApplicationWindow {
7 |     id: root
8 |     visible: true
9 |     width: 800
10 |     height: 600
11 | 
12 |     Material.theme: Material.Dark
13 | 
14 |     RowLayout {
15 |         id: headerRow
16 |         anchors {
17 |             top: parent.top
18 |         }
19 |         width: parent.width
20 |         height: 200
21 |         ColumnLayout {
22 |             Repeater {
23 |                 id: rep
24 |                 property var names: ["Normal", "Q4_0", "Q4_1", "Q5_0", "Q5_1", "Q8_0"]
25 |                 model: [0, 2, 3, 8, 9, 7]
26 | 
27 |                 RadioDelegate {
28 |                     text: rep.names[index]
29 |                     enabled: stt.backendInfo.requantizable
30 |                              && (stt.state == SpeechToText.Ready)
31 |                     checked:
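// checked when the float type reported by the backend matches this entry;
// rep.model holds the ggml_ftype codes for rep.names: 0 = F32 ("Normal"), 2 = Q4_0, 3 = Q4_1, 8 = Q5_0, 9 = Q5_1, 7 = Q8_0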
stt.backendInfo.floatType === rep.model[index] 32 | onClicked: stt.quantize(rep.model[index]) 33 | } 34 | } 35 | } 36 | Item { 37 | Layout.fillWidth: true 38 | } 39 | 40 | ColumnLayout { 41 | Label { 42 | text: "Collected Info:" 43 | font.pixelSize: 20 44 | } 45 | Label { 46 | text: stt.backendInfo.floatTypeString 47 | Layout.leftMargin: 10 48 | Layout.maximumWidth: 200 49 | font.pixelSize: 15 50 | wrapMode: Text.Wrap 51 | } 52 | Label { 53 | text: stt.backendInfo.modelTypeString 54 | Layout.leftMargin: 10 55 | font.pixelSize: 15 56 | } 57 | } 58 | } 59 | 60 | ColumnLayout { 61 | anchors.centerIn: parent 62 | width: parent.width / 2 63 | height: parent.height / 3 64 | 65 | Item { 66 | Layout.alignment: Qt.AlignHCenter | Qt.AlignVCenter 67 | Layout.fillWidth: true 68 | Layout.fillHeight: true 69 | Label { 70 | id: result 71 | 72 | horizontalAlignment: Text.AlignHCenter 73 | anchors.fill: parent 74 | 75 | wrapMode: Text.Wrap 76 | font.capitalization: Font.AllUppercase 77 | font.pixelSize: 20 78 | text: "(((Recognised text)))" 79 | opacity: { 80 | switch (stt.state) { 81 | case SpeechToText.Tuning: 82 | case SpeechToText.WaitingForSpeech: 83 | case SpeechToText.SpeechDetected: 84 | return 0.5 85 | case SpeechToText.Busy: 86 | return 0.0 87 | default: 88 | return 1.0 89 | } 90 | } 91 | } 92 | BusyIndicator { 93 | anchors.fill: parent 94 | running: stt.state === SpeechToText.Busy 95 | } 96 | } 97 | 98 | Button { 99 | text: "Start" 100 | Layout.alignment: Qt.AlignHCenter | Qt.AlignVCenter 101 | enabled: stt.state === SpeechToText.Ready 102 | onClicked: { 103 | stt.start() 104 | } 105 | } 106 | 107 | Label { 108 | id: prompt 109 | Layout.alignment: Qt.AlignHCenter | Qt.AlignVCenter 110 | horizontalAlignment: Text.AlignHCenter 111 | 112 | font.pixelSize: 20 113 | text: { 114 | switch (stt.state) { 115 | case SpeechToText.NoModel: 116 | return "No model loaded" 117 | case SpeechToText.WaitingForModel: 118 | return "Loading model, please wait." 
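// Typical state flow: NoModel -> WaitingForModel -> Ready, then after start():
// Tuning -> WaitingForSpeech -> SpeechDetected -> Busy -> back to Ready (see getState() in src/SpeechToText.cpp)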
119 |                     case SpeechToText.Busy:
120 |                         return "Inference in progress"
121 |                     case SpeechToText.Tuning:
122 |                         return "Tuning out background noise"
123 |                     case SpeechToText.SpeechDetected:
124 |                         return "Speech detected"
125 |                     case SpeechToText.WaitingForSpeech:
126 |                         return "Speak to start detection"
127 |                     case SpeechToText.Ready:
128 |                         return "Press start to start listening"
129 |                     default:
130 |                         return "Unknown state: " + stt.state
131 |                 }
132 |             }
133 |         }
134 |     }
135 | 
136 |     SpeechToText {
137 |         id: stt
138 |         modelPath: "ggml-tiny.bin"
139 |         onResultReady: function (r) {
140 |             result.text = r
141 |         }
142 |     }
143 | }
--------------------------------------------------------------------------------
/examples/complex/resources.qrc:
--------------------------------------------------------------------------------
1 | <RCC>
2 |     <qresource prefix="/">
3 |         <file>complex.qml</file>
4 |     </qresource>
5 | </RCC>
6 | 
--------------------------------------------------------------------------------
/examples/simple/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(simple simple.cpp resources.qrc)
2 | set_target_properties(simple PROPERTIES AUTORCC ON AUTOMOC ON)
3 | target_link_libraries(simple PRIVATE Qt6::Core Qt6::Quick ${QT_WHISPER_TARGET})
4 | 
--------------------------------------------------------------------------------
/examples/simple/resources.qrc:
--------------------------------------------------------------------------------
1 | <RCC>
2 |     <qresource prefix="/">
3 |         <file>simple.qml</file>
4 |     </qresource>
5 | </RCC>
6 | 
--------------------------------------------------------------------------------
/examples/simple/simple.cpp:
--------------------------------------------------------------------------------
1 | #include <QGuiApplication>
2 | #include <QQmlApplicationEngine>
3 | 
4 | #include <SpeechToText.h>
5 | 
6 | int main(int argc, char *argv[])
7 | {
8 |     QGuiApplication app(argc, argv);
9 | 
10 |     QQmlApplicationEngine engine;
11 | 
12 | 
13 |     const QUrl url(QStringLiteral("qrc:simple.qml"));
14 | 
15 |     qmlRegisterType<SpeechToText>("qtwhisper", 1, 0, "SpeechToText");
16 |     qmlRegisterUncreatableType<WhisperInfo>("qtwhisper", 1, 0, "WhisperInfo", "");
17 | 
18 |     // QML startup
19 |     QObject::connect(&engine, &QQmlApplicationEngine::objectCreated,
20 |         &app, [url](QObject *obj, const QUrl &objUrl){
21 |         if (!obj && url == objUrl)
22 |             QCoreApplication::exit(-1);
23 |     }, Qt::QueuedConnection);
24 | 
25 |     engine.load(url);
26 |     return app.exec();
27 | }
28 | 
--------------------------------------------------------------------------------
/examples/simple/simple.qml:
--------------------------------------------------------------------------------
1 | import QtQuick
2 | import QtQuick.Controls.Material
3 | import QtQuick.Layouts
4 | import qtwhisper
5 | 
6 | ApplicationWindow {
7 |     id: root
8 |     visible: true
9 |     width: 800
10 |     height: 600
11 |     Material.theme: Material.Dark
12 | 
13 |     ColumnLayout {
14 |         anchors.centerIn: parent
15 |         width: parent.width / 2
16 |         height: parent.height / 3
17 | 
18 |         Item {
19 |             Layout.alignment: Qt.AlignHCenter | Qt.AlignVCenter
20 |             Layout.fillWidth: true
21 |             Layout.fillHeight: true
22 |             Label {
23 |                 id: result
24 | 
25 |                 horizontalAlignment: Text.AlignHCenter
26 |                 anchors.fill: parent
27 | 
28 |                 wrapMode: Text.Wrap
29 |                 font.capitalization: Font.AllUppercase
30 |                 font.pixelSize: 20
31 |                 text: "(((Recognised text)))"
32 |                 opacity: {
33 |                     switch (stt.state) {
34 |                     case SpeechToText.Tuning:
35 |                     case SpeechToText.WaitingForSpeech:
36 |                     case SpeechToText.SpeechDetected:
37 |                         return 0.5
38 |                     case SpeechToText.Busy:
39 |                         return 0.0
40 |                     default:
41 |                         return 1.0
42 |                     }
43 |                 }
44 |             }
45 |             BusyIndicator {
46 | 
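// Spinner shown on top of the result label while whisper inference runs on the worker thread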
anchors.fill: parent 47 | running: stt.state === SpeechToText.Busy 48 | } 49 | } 50 | 51 | Button { 52 | text: "Start" 53 | Layout.alignment: Qt.AlignHCenter | Qt.AlignVCenter 54 | enabled: stt.state === SpeechToText.Ready 55 | onClicked: { 56 | stt.start() 57 | } 58 | } 59 | 60 | Label { 61 | id: prompt 62 | Layout.alignment: Qt.AlignHCenter | Qt.AlignVCenter 63 | horizontalAlignment: Text.AlignHCenter 64 | 65 | font.pixelSize: 20 66 | text: { 67 | switch (stt.state) { 68 | case SpeechToText.NoModel: 69 | return "No model loaded" 70 | case SpeechToText.WaitingForModel: 71 | return "Loading model, please wait." 72 | case SpeechToText.Busy: 73 | return "Inference in progress" 74 | case SpeechToText.Tuning: 75 | return "Tuning out background noise" 76 | case SpeechToText.SpeechDetected: 77 | return "Speech detected" 78 | case SpeechToText.WaitingForSpeech: 79 | return "Speak to start detection" 80 | case SpeechToText.Ready: 81 | return "Press start to start listening" 82 | default: 83 | return "Unknown state: " + stt.state 84 | } 85 | } 86 | } 87 | } 88 | 89 | SpeechToText { 90 | id: stt 91 | onResultReady: function (r) { 92 | result.text = r 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /models/ggml-tiny-en-q4-0.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radkoder/qt-whisper/6f0ba6f360c358c6940a61724a429b0b8c136908/models/ggml-tiny-en-q4-0.bin -------------------------------------------------------------------------------- /models/models.qrc: -------------------------------------------------------------------------------- 1 | 2 | 3 | ggml-tiny-en-q4-0.bin 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/QmlMacros.h: -------------------------------------------------------------------------------- 1 | #ifndef QMLMACROS_H 2 | #define QMLMACROS_H 3 | 4 | /*! 5 | * \def QML_WRITABLE_PROPERTY(type, name) 6 | * \ingroup QT_QML_HELPERS 7 | * \hideinitializer 8 | * \details Creates a \c Q_PROPERTY that will be readable / writable from QML. 9 | * 10 | * \param type The C++ type of the property 11 | * \param name The name for the property 12 | * \param capitalName - capitalized name of the property to insert into function names 13 | * 14 | * It generates for this goal : 15 | * \code 16 | * {type} m_{name}; // private member variable 17 | * {type} get_{name} () const; // public getter method 18 | * void set_{name} ({type}); // public setter slot 19 | * void {name}Changed ({type}); // notifier signal 20 | * \endcode 21 | * 22 | * \b Note : Any change from either C++ or QML side will trigger the 23 | * notification. 24 | */ 25 | #define QML_WRITABLE_PROPERTY(type, name, capitalName) \ 26 | protected: \ 27 | Q_PROPERTY(type name READ get ## capitalName WRITE set ## capitalName NOTIFY name ## Changed) \ 28 | private: \ 29 | type _ ## name{ }; \ 30 | \ 31 | public: \ 32 | type get ## capitalName() const { return _ ## name; } \ 33 | Q_SIGNALS: \ 34 | void name ## Changed(type name); \ 35 | public Q_SLOTS: \ 36 | void set ## capitalName(type name){ \ 37 | if (_ ## name != name) { \ 38 | _ ## name = name; \ 39 | emit name ## Changed(_ ## name); \ 40 | } \ 41 | } \ 42 | \ 43 | private: 44 | 45 | 46 | /*! 47 | * \def QML_READONLY_PROPERTY(type, name) 48 | * \ingroup QT_QML_HELPERS 49 | * \hideinitializer 50 | * \details Creates a \c Q_PROPERTY that will be readable from QML and writable 51 | * from C++. 
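 * For example, \c QML_READONLY_PROPERTY(bool, busy, Busy) (as used in WhisperBackend.h) declares a
 * read-only QML property \c busy backed by \c _busy, together with \c getBusy(), a C++-only
 * \c setBusy() and the \c busyChanged() notifier signal.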
52 | * 53 | * \param type The C++ type of the property 54 | * \param name The name for the property 55 | * 56 | * It generates for this goal : 57 | * \code 58 | * {type} m_{name}; // private member variable 59 | * {type} get_{name} () const; // public getter method 60 | * void update_{name} ({type}); // public setter method 61 | * void {name}Changed ({type}); // notifier signal 62 | * \endcode 63 | * 64 | * \b Note : Any change from C++ side will trigger the notification to QML. 65 | */ 66 | #define QML_READONLY_PROPERTY(type, name, capitalName) \ 67 | protected: \ 68 | Q_PROPERTY(type name READ get ## capitalName NOTIFY name ## Changed) \ 69 | private: \ 70 | type _ ## name{ }; \ 71 | \ 72 | public: \ 73 | type get ## capitalName() const { return _ ## name; } \ 74 | void set ## capitalName(const type& name){ \ 75 | if (_ ## name != name) { \ 76 | _ ## name = name; \ 77 | emit name ## Changed(_ ## name); \ 78 | } \ 79 | } \ 80 | Q_SIGNALS: \ 81 | void name ## Changed(type name); \ 82 | \ 83 | private: 84 | 85 | 86 | /*! 87 | * \def QML_CONSTANT_PROPERTY(type, name) 88 | * \ingroup QT_QML_HELPERS 89 | * \hideinitializer 90 | * \details Creates a \c Q_PROPERTY for a constant value exposed from C++ to 91 | * QML. 92 | * 93 | * \param type The C++ type of the property 94 | * \param name The name for the property 95 | * 96 | * It generates for this goal : 97 | * \code 98 | * {type} m_{name}; // private member variable 99 | * {type} get_{name} () const; // public getter method 100 | * \endcode 101 | * 102 | * \b Note : There is no change notifier because value is constant. 103 | */ 104 | 105 | #define QML_CONSTANT_PROPERTY(type, name, capitalizedName) \ 106 | protected: \ 107 | Q_PROPERTY(type name READ get ## capitalizedName CONSTANT) \ 108 | private: \ 109 | type m_ ## name; \ 110 | \ 111 | public: \ 112 | type get ## capitalizedName() const { return m_ ## name; } \ 113 | \ 114 | private: 115 | 116 | 117 | #endif // QMLMACROS_H 118 | -------------------------------------------------------------------------------- /src/SpeechToText.cpp: -------------------------------------------------------------------------------- 1 | #include "SpeechToText.h" 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | constexpr int SAMPLE_RATE = 16000; 8 | constexpr const char *MODEL_RESOURCE = ":/ggml-tiny-en-q4-0.bin"; 9 | 10 | #define ASSERT_STATE(x) Q_ASSERT((updateState(),getState()) == x); 11 | 12 | SpeechToText::SpeechToText() 13 | { 14 | 15 | qRegisterMetaType(); 16 | qRegisterMetaType(); 17 | qRegisterMetaType >(); 18 | 19 | 20 | connect(this, &SpeechToText::modelPathChanged, this, &SpeechToText::loadModel); 21 | ASSERT_STATE(State::NoModel); 22 | 23 | #ifdef EMBED_MODEL 24 | Q_INIT_RESOURCE(models); 25 | setHasEmbeddedModel(true); 26 | setModelPath(MODEL_RESOURCE); 27 | #else 28 | setHasEmbeddedModel(false); 29 | #endif 30 | 31 | //State UpdateTimers 32 | _stateUpdateTimer.setInterval(30); 33 | _stateUpdateTimer.callOnTimeout(this,&SpeechToText::updateState); 34 | _stateUpdateTimer.start(); 35 | 36 | } 37 | 38 | void SpeechToText::start() 39 | { 40 | auto device = QMediaDevices::defaultAudioInput(); 41 | QAudioFormat fmt; 42 | 43 | fmt.setSampleFormat(QAudioFormat::Float); 44 | fmt.setSampleRate(SAMPLE_RATE); 45 | fmt.setChannelConfig(QAudioFormat::ChannelConfigMono); 46 | fmt.setChannelCount(1); 47 | 48 | if (!device.isFormatSupported(fmt)) { 49 | qDebug() << "Format " << fmt << " not supported"; 50 | } 51 | 52 | 53 | 54 | _source.reset(new QAudioSource{ device, fmt }); 55 | _audioDevice = 
_source->start(); 56 | ASSERT_STATE(State::WaitingForSpeech); 57 | connect(_source.get(),&QAudioSource::stateChanged,this,[=](QAudio::State s){ 58 | qDebug() << "Audio source" << _source.get() << " state:" << s; 59 | }); 60 | connect(_audioDevice, &QIODevice::readyRead, this, [ = ](){ 61 | 62 | auto bytes = _audioDevice->readAll(); 63 | float samples_count = bytes.size() / _source->format().bytesPerSample(); 64 | auto time_count = samples_count / _source->format().sampleRate(); 65 | qDebug() << "Read " << bytes.size() << "bytes" << samples_count << "Samples" << time_count << "Seconds"; 66 | std::vector frame{ reinterpret_cast(bytes.cbegin()), 67 | reinterpret_cast(bytes.cend()) }; 68 | 69 | _vad.feedSamples(std::move(frame)); 70 | }); 71 | connect(&_vad, &VoiceActivityDetector::speechDetected, this, [ = ](std::vector samples){ 72 | qDebug() << "Speech detected " << samples.size() << "samples"; 73 | QTimer::singleShot(1,this,&SpeechToText::stop); 74 | auto r = QMetaObject::invokeMethod(_whisper, "threadedInference", Qt::QueuedConnection, 75 | Q_ARG(std::vector, samples)); 76 | if (!r) { 77 | qFatal("Failed to invoke threaded inference"); 78 | } 79 | }); 80 | } // SpeechToText::start 81 | 82 | void SpeechToText::stop() 83 | { 84 | // immidieatly stop the audio recording 85 | if(_source){ 86 | _source->stop(); 87 | _source.reset(); 88 | } 89 | 90 | 91 | // if waiting for speech - simply disconnect the slots 92 | disconnect(&_vad,nullptr,this,nullptr); 93 | _vad.reset(); 94 | } 95 | 96 | SpeechToText::~SpeechToText() 97 | { 98 | unloadModel(); 99 | _whisperThread.quit(); 100 | _whisperThread.wait(); 101 | } 102 | 103 | void SpeechToText::loadModel(const QString &path) 104 | { 105 | if (_whisper || getState() == State::Busy) { 106 | // Unload model before loading 107 | connect(this,&SpeechToText::modelUnloaded,this,[=](){ 108 | ASSERT_STATE(State::NoModel); 109 | loadModel(path); 110 | },static_cast(Qt::AutoConnection | Qt::SingleShotConnection)); 111 | unloadModel(); 112 | 113 | return; 114 | } 115 | _whisper = new WhisperBackend(path); 116 | _whisper->moveToThread(&_whisperThread); 117 | 118 | 119 | connect(_whisper, &WhisperBackend::resultReady, this, [ = ](auto s){ 120 | ASSERT_STATE(State::Ready); 121 | emit resultReady(s); 122 | }); 123 | connect(_whisper, &WhisperBackend::error, this, [ = ](auto s){ 124 | ASSERT_STATE(State::Ready); 125 | emit SpeechToText::errorOccured(s); 126 | }); 127 | connect(_whisper, &WhisperBackend::modelLoaded, this, &SpeechToText::backendInfoChanged); 128 | 129 | 130 | QMetaObject::invokeMethod(_whisper, "loadModel", Qt::QueuedConnection); 131 | 132 | 133 | if (!_whisperThread.isRunning()) 134 | _whisperThread.start(); 135 | ASSERT_STATE(State::WaitingForModel); 136 | } 137 | 138 | void SpeechToText::unloadModel() 139 | { 140 | stop(); 141 | if (_whisper) 142 | { 143 | disconnect(_whisper,nullptr,this,nullptr); 144 | connect(_whisper, &QObject::destroyed, this, &SpeechToText::modelUnloaded); 145 | _whisper->deleteLater(); 146 | } 147 | } 148 | 149 | const WhisperInfo *SpeechToText::getBackendInfo() const 150 | { 151 | Q_ASSERT(_whisper); 152 | return _whisper->info(); 153 | } 154 | 155 | SpeechToText::State SpeechToText::getState() const 156 | { 157 | #define O(state, cond) \ 158 | if(cond) return state 159 | 160 | // whisper related states 161 | O(State::NoModel, _whisper.isNull()); // No model is loaded, need to call loadModel first 162 | O(State::WaitingForModel, _whisper->info()->getModelType()==WhisperInfo::MODEL_UNKNOWN); // Model is being loaded in the 
background thread 163 | O(State::Busy,_whisper->getBusy()); // Model is performing inference in the background thread 164 | 165 | // VAD related states 166 | O(State::Tuning, _vad.getAdjustInProgress()); // VAD is listening for sound in order to adjust itself for background noise 167 | O(State::SpeechDetected,_vad.getVoiceInProgress()); // VAD is detecting voice in current samples 168 | O(State::WaitingForSpeech, _source); // if source is not deleted - the sound is being recorded and relayed to VAD 169 | 170 | // default state 171 | O(State::Ready,true); // Nothing is happening - the object is idle 172 | #undef O 173 | } 174 | 175 | void SpeechToText::quantize(int mode) 176 | { 177 | Q_ASSERT(_whisper); 178 | QMetaObject::invokeMethod(_whisper, "unloadModel", Qt::QueuedConnection); 179 | QMetaObject::invokeMethod(_whisper, "loadModel", Qt::QueuedConnection, static_cast(mode)); 180 | } 181 | 182 | void SpeechToText::updateState() 183 | { 184 | static State s = State::NoModel; 185 | if(getState() != s){ 186 | emit stateChanged(getState()); 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /src/SpeechToText.h: -------------------------------------------------------------------------------- 1 | #ifndef SPEECHTOTEXT_H 2 | #define SPEECHTOTEXT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "WhisperBackend.h" 11 | #include "VoiceActivityDetector.h" 12 | #include "QmlMacros.h" 13 | 14 | class SpeechToText : public QObject 15 | { 16 | Q_OBJECT 17 | QML_ELEMENT; 18 | public: 19 | enum State { 20 | NoModel, 21 | Ready, 22 | Tuning, 23 | WaitingForSpeech, 24 | SpeechDetected, 25 | WaitingForModel, 26 | Busy 27 | }; 28 | Q_ENUM(State); 29 | private: 30 | QML_WRITABLE_PROPERTY(QString, modelPath, ModelPath) 31 | QML_READONLY_PROPERTY(bool, hasEmbeddedModel, HasEmbeddedModel) 32 | Q_PROPERTY(const WhisperInfo * backendInfo READ getBackendInfo NOTIFY backendInfoChanged) 33 | Q_PROPERTY(State state READ getState NOTIFY stateChanged) 34 | public: 35 | SpeechToText(); 36 | Q_INVOKABLE void start(); 37 | Q_INVOKABLE void stop(); 38 | 39 | 40 | ~SpeechToText(); 41 | void loadModel(const QString& path); 42 | void unloadModel(); 43 | 44 | const WhisperInfo *getBackendInfo() const; 45 | State getState() const; 46 | Q_INVOKABLE void quantize(int mode); 47 | 48 | 49 | 50 | public slots: 51 | void updateState(); 52 | 53 | signals: 54 | void resultReady(const QString& str); 55 | void modelUnloaded(); 56 | void modelLoaded(); 57 | void errorOccured(const QString& str); 58 | void stateChanged(State s); 59 | 60 | void backendInfoChanged(); 61 | 62 | private: 63 | QPointer _whisper = nullptr; 64 | VoiceActivityDetector _vad; 65 | std::unique_ptr _source = nullptr; 66 | std::vector _audioBuffer; 67 | QIODevice *_audioDevice = nullptr; 68 | bool _stopFlag = false; 69 | QThread _whisperThread; 70 | QTimer _stateUpdateTimer; 71 | }; 72 | 73 | 74 | #endif // SPEECHTOTEXT_H 75 | -------------------------------------------------------------------------------- /src/VoiceActivityDetector.cpp: -------------------------------------------------------------------------------- 1 | #include "VoiceActivityDetector.h" 2 | #include 3 | VoiceActivityDetector::VoiceActivityDetector(const Params& params, QObject *parent) 4 | : QObject{parent}, _params{params}, _patience_counter{params.patience}, 5 | _detected_samples_counter{params.minimum_samples}, _adjustment_counter{params.adjust_samples} 6 | { 7 | qRegisterMetaType >(); 8 | } 9 | 10 | void 
VoiceActivityDetector::feedSamples(const std::vector &data) 11 | { 12 | _adjustment_counter = std::max(_adjustment_counter - 1, 0); 13 | if (_adjustment_counter > 0) { 14 | setAdjustInProgress(true); 15 | adjust(data); 16 | return; 17 | } 18 | setAdjustInProgress(false); 19 | 20 | auto energy = std::inner_product(data.begin(), data.end(), data.begin(), 0.0f) / data.size(); 21 | const bool current_score = energy > threshold(); 22 | 23 | if (current_score) { 24 | // reset patience 25 | _patience_counter = _params.patience; 26 | 27 | // start the new potential segment if not already started 28 | setVoiceInProgress(true); 29 | 30 | // count consecutive accepted samples 31 | if (--_detected_samples_counter < 0) { 32 | _segment_approved = true; 33 | } 34 | } else { 35 | // decrement patience counter 36 | _patience_counter = std::max(_patience_counter - 1, 0); 37 | 38 | // reset accepted samples counter 39 | _detected_samples_counter = _params.minimum_samples; 40 | } 41 | 42 | // Capture voice if speech is detected 43 | if (getVoiceInProgress()) { 44 | _voice_buffer.insert(_voice_buffer.end(), data.begin(), data.end()); 45 | } 46 | 47 | 48 | // if patience runs out, signal speech detection and reset buffers 49 | if (_patience_counter <= 0 && getVoiceInProgress()) { 50 | if (_segment_approved) { 51 | emit speechDetected(_voice_buffer); 52 | } 53 | reset(); 54 | } 55 | qDebug() << "Energy: " << energy << "Threshold: " < &data) 69 | { 70 | auto energy = std::inner_product(data.begin(), data.end(), data.begin(), 0.0f) / data.size(); 71 | auto diff = std::abs(energy - _mean_energy); 72 | 73 | _mean_energy = _mean_energy * _params.beta + (1 - _params.beta) * energy; 74 | _std_energy = _std_energy * _params.beta + (1 - _params.beta) * diff; 75 | } 76 | 77 | float VoiceActivityDetector::threshold() const 78 | { 79 | 80 | // Expecting exponential distribution 81 | // Tukey anomaly criterion 82 | auto lambda = 1/_mean_energy; 83 | return 2*std::log(10)/lambda; 84 | } 85 | 86 | VoiceActivityDetector::Params VoiceActivityDetector::defaultParams() 87 | { 88 | return Params{ 89 | 50, //patience 90 | 50, // minimum samples 91 | 0.5f, // tuning coefficient 92 | 4.0f, // treshold coefficient 93 | 200 94 | }; 95 | } 96 | -------------------------------------------------------------------------------- /src/VoiceActivityDetector.h: -------------------------------------------------------------------------------- 1 | #ifndef VOICEACTIVITYDETECTOR_H 2 | #define VOICEACTIVITYDETECTOR_H 3 | 4 | #include 5 | #include "QmlMacros.h" 6 | 7 | class VoiceActivityDetector : public QObject 8 | { 9 | Q_OBJECT 10 | QML_READONLY_PROPERTY(bool, voiceInProgress, VoiceInProgress) 11 | QML_READONLY_PROPERTY(bool, adjustInProgress, AdjustInProgress) 12 | public: 13 | struct Params { 14 | /// Longest streak of samples with no voice before speech is considered to have ended 15 | int patience; 16 | /// Minimum streak of samples with speach for a segment to be considered as containing speech 17 | int minimum_samples; 18 | /// Tuning coefficient - higher coefficient requires longer tuning 19 | float beta; 20 | /// Treshold coeffitient - real threashold is calculated by mean(E) + k*std(E) 21 | float threshold; 22 | /// How many samples from the beginning of audio should be used for tuning 23 | int adjust_samples; 24 | }; 25 | explicit VoiceActivityDetector(const Params& params = defaultParams(), QObject *parent = nullptr); 26 | /// Feed series of samples to the detection 27 | void feedSamples(const std::vector& data); 28 | /// Reset the 
speech detection state 29 | void reset(); 30 | /// Adjust the treshold of speech detection assuming that the given data is background noise 31 | void adjust(const std::vector& data); 32 | /// Current speech threshold calculated from the background noise 33 | float threshold() const; 34 | /// Default parameters for the Voice Activity Detector 35 | static Params defaultParams(); 36 | public slots: 37 | 38 | 39 | signals: 40 | /// Fired when the given samples are considered to contain speech 41 | void speechDetected(std::vector samples); 42 | private: 43 | /// Parameters passed in during construction 44 | Params _params; 45 | /// Internal counter for patience 46 | int _patience_counter = 0; 47 | /// Internal counter for positive samples 48 | int _detected_samples_counter = 0; 49 | /// Wether a given speech segment (a series of samples) was approved as speech. 50 | bool _segment_approved = false; 51 | /// Buffer for storing speech samples 52 | std::vector _voice_buffer; 53 | /// Current mean sample energy for background noise 54 | float _mean_energy = 0; 55 | /// Current standard deviation of energy for background noise 56 | float _std_energy = 0; 57 | /// Counter for samples used in adjustment 58 | int _adjustment_counter; 59 | }; 60 | 61 | #endif // VOICEACTIVITYDETECTOR_H 62 | -------------------------------------------------------------------------------- /src/WhisperBackend.cpp: -------------------------------------------------------------------------------- 1 | #include "WhisperBackend.h" 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "quantization.h" 12 | 13 | WhisperBackend::WhisperBackend(const QString& filePath, QObject *parent) 14 | : _numThreads{2} 15 | { 16 | setBusy(true); 17 | _og_filepath = filePath; 18 | setBusy(false); 19 | } 20 | 21 | WhisperBackend::~WhisperBackend() 22 | { 23 | unloadModel(); 24 | } 25 | 26 | void WhisperBackend::loadModel(WhisperInfo::FloatType ftype) 27 | { 28 | setBusy(true); 29 | qDebug() << "load model called with quantization type: " << ftype; 30 | _params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); 31 | _params.progress_callback = [] (whisper_context *ctx, whisper_state *state, int progress, void *user_data){ 32 | qDebug() << "Inference progress: " << progress; 33 | }; 34 | 35 | QFile file{ _og_filepath }; 36 | file.open(QIODeviceBase::ReadOnly); 37 | QByteArray bytes; 38 | 39 | if (ftype == GGML_FTYPE_ALL_F32) { 40 | bytes = file.readAll(); 41 | } else { 42 | QBuffer buffer; 43 | buffer.open(QBuffer::WriteOnly); 44 | auto err = qtw::buffer_quantize(file, buffer, ftype); 45 | buffer.close(); 46 | if (err != 0) { 47 | emit error(QString{ "Model quantization failed with code: %1" }.arg(err)); 48 | return; 49 | } 50 | bytes = buffer.buffer(); 51 | } 52 | file.close(); 53 | 54 | _ctx = whisper_init_from_buffer(bytes.data(), bytes.size()); 55 | 56 | if (_ctx == nullptr) { 57 | emit error("Failed to initialize whisper context"); 58 | return; 59 | } 60 | collectInfo(); 61 | 62 | setBusy(false); 63 | emit modelLoaded(); 64 | } // WhisperBackend::loadModel 65 | 66 | void WhisperBackend::unloadModel() 67 | { 68 | whisper_free(_ctx); 69 | _ctx = nullptr; 70 | } 71 | 72 | void WhisperBackend::threadedInference(std::vector samples) 73 | { 74 | setBusy(true); 75 | if (whisper_full_parallel(_ctx, _params, samples.data(), static_cast(samples.size()), getNumThreads()) != 0) { 76 | fprintf(stderr, "failed to process audio\n"); 77 | } 78 | 79 | QString s; 80 | const int n_seg = 
whisper_full_n_segments(_ctx); 81 | for (int i = 0; i < n_seg; i++) { 82 | const char *text = whisper_full_get_segment_text(_ctx, i); 83 | s.append(text); 84 | } 85 | 86 | setBusy(false); 87 | setLastResult(s); 88 | 89 | emit resultReady(s); 90 | } 91 | 92 | const WhisperInfo *WhisperBackend::info() const 93 | { 94 | return &_info; 95 | } 96 | 97 | void WhisperBackend::collectInfo() 98 | { 99 | Q_ASSERT(_ctx); 100 | 101 | _info.setFloatType(static_cast( whisper_model_ftype(_ctx))); 102 | _info.setModelType(static_cast( whisper_model_type(_ctx))); 103 | } 104 | 105 | QString WhisperInfo::floatTypeString() const 106 | { 107 | switch (_floatType) { 108 | default: 109 | case GGML_FTYPE_UNKNOWN: return "Unknown float type"; 110 | 111 | case GGML_FTYPE_ALL_F32: return "32-bit float"; 112 | 113 | case GGML_FTYPE_MOSTLY_F16: return "mostly 16-bit float (except 1d tensors)"; 114 | 115 | case GGML_FTYPE_MOSTLY_Q4_0: return "(Q4_0) 16-bit blocks of 4-bit quantized weights (16-bit float multiplier)"; // except 1d tensors 116 | 117 | case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: 118 | case GGML_FTYPE_MOSTLY_Q4_1: return 119 | "(Q4_1) 16-bit blocks of 4-bit quantized weights (16-bit float multiplier and offset)"; // except 1d tensors 120 | 121 | case GGML_FTYPE_MOSTLY_Q8_0: return "(Q8_0) 32-bit blocks of 8-bit quantized weights (32-bit float multiplier)"; // except 1d tensors 122 | 123 | case GGML_FTYPE_MOSTLY_Q5_0: return "(Q5_0) blocks of 32 5-bit quantized weights (16-bit float multiplier)"; // except 1d tensors 124 | 125 | case GGML_FTYPE_MOSTLY_Q5_1: return 126 | "(Q5_1) blocks of 32 5-bit quantized weights (16-bit float multiplier and offset)"; // except 1d tensors 127 | } 128 | } 129 | 130 | QString WhisperInfo::modelTypeString() const 131 | { 132 | switch (_modelType) { 133 | default: 134 | case MODEL_UNKNOWN: 135 | return "Unknown model"; 136 | case MODEL_TINY: 137 | return "Tiny model"; 138 | case MODEL_BASE: 139 | return "Base model"; 140 | case MODEL_SMALL: 141 | return "Small model"; 142 | case MODEL_MEDIUM: 143 | return "Medium model"; 144 | case MODEL_LARGE: 145 | return "Large model"; 146 | } 147 | } 148 | 149 | bool WhisperInfo::requantizable() const 150 | { 151 | return (_floatType == GGML_FTYPE_ALL_F32) || (_floatType == GGML_FTYPE_MOSTLY_F16); 152 | } 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /src/WhisperBackend.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "whisper.h" 4 | #include "ggml.h" 5 | #include "QmlMacros.h" 6 | 7 | class WhisperInfo : public QObject { 8 | Q_OBJECT 9 | public: 10 | /// copied from whisper.cpp 11 | enum ModelType { 12 | MODEL_UNKNOWN, 13 | MODEL_TINY, 14 | MODEL_BASE, 15 | MODEL_SMALL, 16 | MODEL_MEDIUM, 17 | MODEL_LARGE, 18 | }; 19 | using FloatType = ggml_ftype; 20 | Q_ENUM(ModelType) 21 | Q_ENUM(FloatType) 22 | 23 | QString floatTypeString() const; 24 | QString modelTypeString() const; 25 | bool requantizable() const; 26 | 27 | private: 28 | QML_READONLY_PROPERTY(ModelType, modelType, ModelType) 29 | QML_READONLY_PROPERTY(FloatType, floatType, FloatType) 30 | Q_PROPERTY(bool requantizable READ requantizable NOTIFY floatTypeChanged) 31 | Q_PROPERTY(QString modelTypeString READ modelTypeString NOTIFY modelTypeChanged) 32 | Q_PROPERTY(QString floatTypeString READ floatTypeString NOTIFY floatTypeChanged) 33 | 34 | }; 35 | 36 | class WhisperBackend : public QObject { 37 | Q_OBJECT 38 | QML_READONLY_PROPERTY(bool, busy, Busy) 
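// busy is true while loadModel() or threadedInference() is executing on the worker thread;
// numThreads (below) is passed as the last argument to whisper_full_parallel() in threadedInference().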
39 | QML_WRITABLE_PROPERTY(int, numThreads, NumThreads) 40 | QML_READONLY_PROPERTY(QString, lastResult, LastResult) 41 | public: 42 | WhisperBackend(const QString &filePath, QObject *parent = nullptr); 43 | ~WhisperBackend(); 44 | Q_INVOKABLE void loadModel(WhisperInfo::FloatType = GGML_FTYPE_ALL_F32); 45 | Q_INVOKABLE void unloadModel(); 46 | Q_INVOKABLE void threadedInference(std::vector samples); 47 | const WhisperInfo *info() const; 48 | static int bufferQuantize(QIODevice & in, QIODevice & out, ggml_ftype type); 49 | signals: 50 | void resultReady(QString result); 51 | void error(QString s); 52 | void modelLoaded(); 53 | private: 54 | void collectInfo(); 55 | 56 | 57 | QString _og_filepath; 58 | 59 | whisper_context *_ctx = nullptr; 60 | whisper_full_params _params; 61 | WhisperInfo _info; 62 | }; 63 | -------------------------------------------------------------------------------- /src/private/quantization.h: -------------------------------------------------------------------------------- 1 | #ifndef QUANTIZATION_H 2 | #define QUANTIZATION_H 3 | #include 4 | #include 5 | #include 6 | 7 | namespace qtw { 8 | 9 | typedef size_t (*quantizer_func)(const float * src, void * dst, int n, int k, int64_t * hist); 10 | 11 | struct TensorHeader { 12 | int32_t n_dims; 13 | int32_t name_len; 14 | int32_t ttype; 15 | std::vector dims; 16 | QByteArray name; 17 | void read(QIODevice& in){ 18 | in.read(reinterpret_cast(&n_dims), sizeof(n_dims)); 19 | in.read(reinterpret_cast(&name_len), sizeof(name_len)); 20 | in.read(reinterpret_cast(&ttype), sizeof(ttype)); 21 | 22 | dims.resize(n_dims,1); 23 | for (auto& d : dims) { 24 | in.read(reinterpret_cast(&d), sizeof(d)); 25 | } 26 | 27 | name.resize(name_len,0); 28 | in.read(name.data(), name_len); 29 | 30 | } 31 | void write(QIODevice& out) 32 | { 33 | out.write(reinterpret_cast(&n_dims), sizeof(n_dims)); 34 | out.write(reinterpret_cast(&name_len), sizeof(name_len)); 35 | out.write(reinterpret_cast(&ttype), sizeof(ttype)); 36 | for (auto d : dims) { 37 | out.write(reinterpret_cast(&d), sizeof(d)); 38 | } 39 | out.write(name.constData(), name_len); 40 | 41 | } 42 | }; 43 | 44 | quantizer_func get_quantizer(ggml_type t){ 45 | switch(t){ 46 | case GGML_TYPE_Q4_0: 47 | return ggml_quantize_q4_0; 48 | case GGML_TYPE_Q4_1: 49 | return ggml_quantize_q4_1; 50 | case GGML_TYPE_Q5_0: 51 | return ggml_quantize_q5_0; 52 | case GGML_TYPE_Q5_1: 53 | return ggml_quantize_q5_1; 54 | case GGML_TYPE_Q8_0: 55 | return ggml_quantize_q8_0; 56 | default: 57 | return nullptr; 58 | } 59 | } 60 | 61 | void write_through(QIODevice& in, QIODevice& out, size_t n) 62 | { 63 | auto written = out.write(in.read(n)); 64 | Q_ASSERT(written == n); 65 | } 66 | 67 | int buffer_quantize(QIODevice& in, QIODevice& out, ggml_ftype ftype) 68 | { 69 | // error codes for the function 70 | constexpr int INVALID_MAGIC = 1; 71 | constexpr int INVALID_QUANTIZATION_TYPE = 2; 72 | constexpr int UNSUPPORTED_TENSOR_TYPE = 3; 73 | constexpr int UNSUPPORTED_QUANT_TYPE = 4; 74 | 75 | // verify magic 76 | { 77 | uint32_t magic; 78 | in.read((char *) &magic, sizeof(magic)); 79 | if (magic != GGML_FILE_MAGIC) { 80 | return INVALID_MAGIC; 81 | } 82 | 83 | out.write((char *) &magic, sizeof(magic)); 84 | } 85 | 86 | 87 | // load hparams 88 | { 89 | int32_t hparams[11]; 90 | in.read((char *) hparams, sizeof(hparams)); 91 | 92 | // Change the declared model float type to target 93 | const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; 94 | 95 | out.write((const char *) hparams, 
sizeof(hparams) - sizeof(int32_t)); 96 | out.write((const char *) &ftype_dst, sizeof(ftype_dst)); 97 | } 98 | 99 | // load mel filters 100 | { 101 | int32_t n_mel, n_fft; 102 | 103 | in.read((char *) &n_mel, sizeof(n_mel)); 104 | in.read((char *) &n_fft, sizeof(n_fft)); 105 | 106 | out.write((char *) &n_mel, sizeof(n_mel)); 107 | out.write((char *) &n_fft, sizeof(n_fft)); 108 | 109 | write_through(in,out, n_mel * n_fft * sizeof(float)); 110 | } 111 | 112 | // load vocab 113 | { 114 | int32_t n_vocab = 0; 115 | in.read((char *) &n_vocab, sizeof(n_vocab)); 116 | out.write((char *) &n_vocab, sizeof(n_vocab)); 117 | 118 | for (int i = 0; i < n_vocab; i++) { 119 | uint32_t len; 120 | in.read((char *) &len, sizeof(len)); 121 | out.write((char *) &len, sizeof(len)); 122 | 123 | write_through(in,out,len); 124 | } 125 | } 126 | 127 | // regexes of tensor names to not be quantized 128 | const QList to_skip = { 129 | // "encoder.*", 130 | QRegularExpression{ "encoder.conv1.bias" }, 131 | QRegularExpression{ "encoder.conv2.bias" }, 132 | QRegularExpression{ "encoder.positional_embedding" }, 133 | QRegularExpression{ "decoder.positional_embedding" } 134 | }; 135 | 136 | // regexes of tensor names to be quantized 137 | const QList to_quant = { 138 | QRegularExpression{ ".*" } 139 | }; 140 | // quantization 141 | { 142 | // Map the ggml float type to ggml type 143 | ggml_type qtype = GGML_TYPE_F32; 144 | switch (ftype) { 145 | case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; 146 | break; 147 | case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; 148 | break; 149 | case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; 150 | break; 151 | case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; 152 | break; 153 | case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; 154 | break; 155 | default: 156 | return INVALID_QUANTIZATION_TYPE; 157 | } 158 | 159 | 160 | // rest of the file is just tensors 161 | TensorHeader tensor_header; 162 | std::vector weight_buffer; 163 | 164 | while (in.bytesAvailable() > 0) 165 | { 166 | // read tensor header - dimentions, type, name 167 | tensor_header.read(in); 168 | 169 | auto n_elements = std::reduce(tensor_header.dims.begin(), tensor_header.dims.end(), 1, std::multiplies{ }); 170 | Q_ASSERT(n_elements < std::vector{}.max_size()); 171 | 172 | // Decide wheter to quantize a tensor based on white / black lists 173 | bool quantize = std::any_of(to_quant.begin(), to_quant.end(), [&](auto re){ 174 | return re.match(QString::fromUtf8(tensor_header.name)).hasMatch(); 175 | }); 176 | quantize &= std::none_of(to_skip.begin(), to_skip.end(), [&](auto re){ 177 | return re.match(QString::fromUtf8(tensor_header.name)).hasMatch(); 178 | }); 179 | quantize &= (tensor_header.n_dims == 2); 180 | 181 | if (!quantize) { 182 | //If the tensor is not to be quantized - just write it trough 183 | // Write tensor header 184 | tensor_header.write(out); 185 | 186 | // write tensor data 187 | const int bytes_per_elem = (tensor_header.ttype == 0) ? 
188 |                 write_through(in, out, n_elements * bytes_per_elem);
189 |             }
190 |             else
191 |             {
192 |                 if (tensor_header.ttype != GGML_TYPE_F32 && tensor_header.ttype != GGML_TYPE_F16) {
193 |                     return UNSUPPORTED_TENSOR_TYPE;
194 |                 }
195 | 
196 |                 weight_buffer.resize(n_elements);
197 |                 if (tensor_header.ttype == GGML_TYPE_F16) {
198 |                     // if the tensor is in float-16, convert it to float-32
199 |                     std::vector<ggml_fp16_t> buff(n_elements);
200 |                     in.read(reinterpret_cast<char *>(buff.data()), n_elements * sizeof(ggml_fp16_t));
201 |                     std::transform(buff.begin(), buff.end(), weight_buffer.begin(), ggml_fp16_to_fp32);
202 |                 } else {
203 |                     // else just read it normally
204 |                     in.read(reinterpret_cast<char *>(weight_buffer.data()), n_elements * sizeof(float));
205 |                 }
206 |                 // set the tensor type to the target type
207 |                 tensor_header.ttype = qtype;
208 | 
209 |                 std::vector<float> quants(n_elements); // output buffer; the element type was stripped in the original and is assumed here (4 bytes per element is always enough for the quantized data)
210 |                 std::vector<int64_t> hist_cur(1 << 4, 0);
211 |                 size_t cur_size = 0;
212 | 
213 |                 // Select the quantizing function based on the quant type
214 |                 quantizer_func quantizer = get_quantizer(static_cast<ggml_type>(tensor_header.ttype));
215 | 
216 |                 if(!quantizer){
217 |                     return UNSUPPORTED_QUANT_TYPE;
218 |                 }
219 | 
220 |                 cur_size = quantizer(weight_buffer.data(),
221 |                     quants.data(), n_elements, tensor_header.dims[0], hist_cur.data());
222 | 
223 |                 // write the quantized tensor
224 |                 tensor_header.write(out);
225 |                 out.write(reinterpret_cast<const char *>(quants.data()), cur_size);
226 |             }
227 |         }
228 |     }
229 | 
230 |     return 0;
231 | } // qtw::buffer_quantize
232 | } // namespace qtw
233 | #endif // QUANTIZATION_H
234 | 
--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | find_package(Qt6 REQUIRED COMPONENTS Test)
2 | 
3 | enable_testing(true)
4 | 
5 | 
6 | qt_add_executable(quantizer_test MANUAL_FINALIZATION tst_quant.cpp)
7 | set_target_properties(quantizer_test PROPERTIES AUTOMOC ON)
8 | qt_finalize_target(quantizer_test)
9 | 
10 | add_test(NAME quantizer_test COMMAND quantizer_test)
11 | 
12 | target_link_libraries(quantizer_test PRIVATE Qt6::Core Qt6::Quick ${QT_WHISPER_TARGET} Qt::Test)
13 | 
14 | ### Dependencies
15 | file(DOWNLOAD "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin" ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny.bin SHOW_PROGRESS EXPECTED_HASH SHA256=be07e048e1e599ad46341c8d2a135645097a538221678b7acdd1b1919c6e1b21)
16 | add_custom_command(
17 |     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q4_0.bin
18 |     COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bin/quantize.exe ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny.bin ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q4_0.bin q4_0
19 |     VERBATIM)
20 | add_custom_command(
21 |     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q4_1.bin
22 |     COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bin/quantize.exe ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny.bin ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q4_1.bin q4_1
23 |     VERBATIM)
24 | add_custom_command(
25 |     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q5_0.bin
26 |     COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bin/quantize.exe ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny.bin ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q5_0.bin q5_0
27 |     VERBATIM)
28 | add_custom_command(
29 |     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q5_1.bin
30 |     COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bin/quantize.exe ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny.bin ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q5_1.bin q5_1
31 |     VERBATIM)
32 | add_custom_command(
33 |     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q8_0.bin
34 |     COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bin/quantize.exe ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny.bin ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q8_0.bin q8_0
35 |     VERBATIM)
36 | add_custom_target(quantized_models
37 |     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q4_0.bin
38 |             ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q4_1.bin
39 |             ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q5_0.bin
40 |             ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q5_1.bin
41 |             ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q8_0.bin)
42 | 
43 | add_dependencies(quantizer_test quantized_models)
44 | #file(DOWNLOAD "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-model-whisper-tiny-q5_1.bin" ${CMAKE_CURRENT_BINARY_DIR}/ggml-tiny-q5_1.bin SHOW_PROGRESS EXPECTED_HASH SHA256=818710568da3ca15689e31a743197b520007872ff9576237bda97bd1b469c3d7)
45 | 
--------------------------------------------------------------------------------
/tests/bin/quantize.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radkoder/qt-whisper/6f0ba6f360c358c6940a61724a429b0b8c136908/tests/bin/quantize.exe
--------------------------------------------------------------------------------
/tests/bin/whisper.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radkoder/qt-whisper/6f0ba6f360c358c6940a61724a429b0b8c136908/tests/bin/whisper.dll
--------------------------------------------------------------------------------
/tests/tst_quant.cpp:
--------------------------------------------------------------------------------
1 | #include <QtTest>    // NOTE: the original header name was stripped; QtTest is assumed (QTEST_MAIN, QCOMPARE, QVERIFY)
2 | #include "private/quantization.h"
3 | #include "qbuffer.h"
4 | #include "ggml.h"
5 | #include <QtCore>    // NOTE: assumed; QFile and QFileInfo are used below
6 | 
7 | class QuantizerTest : public QObject
8 | {
9 |     Q_OBJECT
10 |     const char *base_model_name = "ggml-tiny.bin";
11 |     const char *q40_model_name = "ggml-tiny-q4_0.bin";
12 |     const char *q41_model_name = "ggml-tiny-q4_1.bin";
13 |     const char *q50_model_name = "ggml-tiny-q5_0.bin";
14 |     const char *q51_model_name = "ggml-tiny-q5_1.bin";
15 |     const char *q80_model_name = "ggml-tiny-q8_0.bin";
16 |     ggml_context* _ctx = nullptr;
17 | 
18 |     void quantize(const char* in, const char* ref_name, ggml_ftype type){
19 | 
20 |         QFile modelFile{ in };
21 |         modelFile.open(QIODeviceBase::ReadOnly);
22 |         QBuffer result;
23 |         result.open(QIODeviceBase::WriteOnly);
24 | 
25 |         auto error_code = qtw::buffer_quantize(modelFile, result, type);
26 |         modelFile.close();
27 |         result.close();
28 | 
29 |         QFile quantized{ ref_name };
30 |         quantized.open(QIODeviceBase::ReadOnly);
31 |         auto ref = quantized.readAll();
32 |         quantized.close();
33 | 
34 |         QCOMPARE(error_code, 0);
35 |         QCOMPARE(ref.size(), result.buffer().size());
36 |         QCOMPARE(ref.compare(result.buffer()), 0);
37 |     }
38 | 
39 | private slots:
40 | 
41 |     void initTestCase()
42 |     {
43 |         QVERIFY(QFileInfo{ base_model_name }.size() > 0);
44 |         QVERIFY(QFileInfo{ q40_model_name }.size() > 0);
45 |         QVERIFY(QFileInfo{ q41_model_name }.size() > 0);
46 |         QVERIFY(QFileInfo{ q50_model_name }.size() > 0);
47 |         QVERIFY(QFileInfo{ q51_model_name }.size() > 0);
48 |         QVERIFY(QFileInfo{ q80_model_name }.size() > 0);
49 | 
50 |         // initializes the float-16 lookup table - critical for quantization
51 |         _ctx = ggml_init({});
52 |         QVERIFY(_ctx);
53 | 
54 |     }
55 |     void q4_0()
56 |     {
57 |         quantize(base_model_name, q40_model_name, GGML_FTYPE_MOSTLY_Q4_0);
58 |     }
59 |     void q4_1()
60 |     {
61 |         quantize(base_model_name, q41_model_name, GGML_FTYPE_MOSTLY_Q4_1);
62 |     }
63 |     void q5_0()
64 |     {
65 |         quantize(base_model_name, q50_model_name, GGML_FTYPE_MOSTLY_Q5_0);
66 |     }
67 |     void q5_1()
68 |     {
69 |         quantize(base_model_name, q51_model_name, GGML_FTYPE_MOSTLY_Q5_1);
70 |     }
71 |     void q8_0()
72 |     {
73 |         quantize(base_model_name, q80_model_name, GGML_FTYPE_MOSTLY_Q8_0);
74 |     }
75 | };
76 | 
77 | QTEST_MAIN(QuantizerTest)
78 | #include "tst_quant.moc"
79 | 
--------------------------------------------------------------------------------