├── README.rst ├── android_mic_streaming ├── .gitignore ├── README.md ├── app │ ├── .gitignore │ ├── build.gradle │ ├── proguard-rules.pro │ └── src │ │ └── main │ │ ├── AndroidManifest.xml │ │ ├── java │ │ └── org │ │ │ └── deepspeechdemo │ │ │ └── MainActivity.kt │ │ └── res │ │ ├── drawable-v24 │ │ └── ic_launcher_foreground.xml │ │ ├── drawable │ │ └── ic_launcher_background.xml │ │ ├── layout │ │ └── activity_main.xml │ │ ├── mipmap-anydpi-v26 │ │ ├── ic_launcher.xml │ │ └── ic_launcher_round.xml │ │ ├── mipmap-hdpi │ │ ├── ic_launcher.png │ │ └── ic_launcher_round.png │ │ ├── mipmap-mdpi │ │ ├── ic_launcher.png │ │ └── ic_launcher_round.png │ │ ├── mipmap-xhdpi │ │ ├── ic_launcher.png │ │ └── ic_launcher_round.png │ │ ├── mipmap-xxhdpi │ │ ├── ic_launcher.png │ │ └── ic_launcher_round.png │ │ ├── mipmap-xxxhdpi │ │ ├── ic_launcher.png │ │ └── ic_launcher_round.png │ │ └── values │ │ ├── colors.xml │ │ ├── strings.xml │ │ └── styles.xml ├── build.gradle ├── gradle.properties ├── gradle │ └── wrapper │ │ ├── gradle-wrapper.jar │ │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat └── settings.gradle ├── autosub ├── LICENSE ├── README.md ├── autosub │ ├── __init__.py │ ├── audioProcessing.py │ ├── featureExtraction.py │ ├── main.py │ ├── segmentAudio.py │ ├── trainAudio.py │ └── writeToFile.py ├── requirements.txt └── setup.py ├── batch_processing ├── Readme.md ├── driver.py ├── requirements.txt ├── setup.ps1 ├── test.ps1 └── test_tf.py ├── electron ├── .gitignore ├── Readme.md ├── package-lock.json ├── package.json ├── public │ ├── create-window.js │ ├── download.js │ ├── electron.js │ ├── index.html │ ├── manifest.json │ ├── preload.js │ └── recognize-wav.js ├── src │ ├── App.js │ └── index.js └── test.sh ├── ffmpeg_vad_streaming ├── README.MD ├── index.js ├── package.json └── test.sh ├── hotword_adjusting ├── README.md └── hotword_adjusting.py ├── mic_vad_streaming ├── README.rst ├── mic_vad_streaming.py ├── requirements.txt └── test.sh ├── net_framework ├── .gitignore └── DeepSpeechWPF │ ├── App.config │ ├── App.xaml │ ├── App.xaml.cs │ ├── DeepSpeech.WPF.csproj │ ├── DeepSpeech.WPF.sln │ ├── MainWindow.xaml │ ├── MainWindow.xaml.cs │ ├── Properties │ ├── AssemblyInfo.cs │ ├── Resources.Designer.cs │ ├── Resources.resx │ ├── Settings.Designer.cs │ └── Settings.settings │ ├── ViewModels │ ├── BindableBase.cs │ └── MainWindowViewModel.cs │ └── packages.config ├── nim_mic_vad_streaming ├── README.md ├── linux_nim_vad_streaming │ ├── README.md │ └── vad_stream.nim └── win_nim_vad_streaming │ ├── README.md │ └── vad_stream.nim ├── nodejs_mic_vad_streaming ├── .gitignore ├── Readme.md ├── package.json ├── start.js └── test.sh ├── nodejs_wav ├── Readme.md ├── index.js ├── package.json └── test.sh ├── tests.sh ├── uwp ├── .gitignore ├── DeepSpeechUWP.sln └── DeepSpeechUWP │ ├── .gitignore │ ├── App.xaml │ ├── App.xaml.cs │ ├── Assets │ ├── LockScreenLogo.scale-200.png │ ├── SplashScreen.scale-200.png │ ├── Square150x150Logo.scale-200.png │ ├── Square44x44Logo.scale-200.png │ ├── Square44x44Logo.targetsize-24_altform-unplated.png │ ├── StoreLogo.png │ └── Wide310x150Logo.scale-200.png │ ├── DeepSpeechUWP.csproj │ ├── MainPage.xaml │ ├── MainPage.xaml.cs │ ├── Package.appxmanifest │ ├── Properties │ ├── AssemblyInfo.cs │ └── Default.rd.xml │ └── models │ └── .gitkeep ├── vad_transcriber ├── README.md ├── audioTranscript_cmd.py ├── audioTranscript_gui.py ├── requirements.txt ├── test.sh ├── wavSplit.py └── wavTranscriber.py └── web_microphone_websocket ├── .gitignore ├── Readme.md ├── 
package.json ├── public ├── downsampling_worker.js ├── favicon.ico ├── index.html ├── logo192.png ├── logo512.png ├── manifest.json └── robots.txt ├── server.js ├── src ├── App.js ├── App.test.js ├── index.css ├── index.js └── setupTests.js ├── test.sh └── test ├── config.js └── server.test.js /README.rst: -------------------------------------------------------------------------------- 1 | DeepSpeech 0.9.x Examples 2 | ========================== 3 | 4 | These are various examples on how to use or integrate DeepSpeech using our packages. 5 | 6 | It is a good way to just try out DeepSpeech before learning how it works in detail, as well as a source of inspiration for ways you can integrate it into your application or solve common tasks like voice activity detection (VAD) or microphone streaming. 7 | 8 | Contributions are welcome! 9 | 10 | **Note:** These examples target DeepSpeech **0.9.x** only. If you're using a different release, you need to go to the corresponding branch for the release: 11 | 12 | * `v0.9.x `_ 13 | * `v0.8.x `_ 14 | * `v0.7.x `_ 15 | * `v0.6.x `_ 16 | * `master branch `_ 17 | 18 | **List of examples** 19 | 20 | Python: 21 | ------- 22 | 23 | * `Microphone VAD streaming `_ 24 | * `VAD transcriber `_ 25 | * `AutoSub `_ 26 | 27 | JavaScript: 28 | ----------- 29 | 30 | * `FFMPEG VAD streaming `_ 31 | * `Node.JS microphone VAD streaming `_ 32 | * `Node.JS wav `_ 33 | * `Web Microphone Websocket streaming `_ 34 | * `Electron wav transcriber `_ 35 | 36 | Windows/C#: 37 | ----------- 38 | 39 | * `.NET framework `_ 40 | * `Universal Windows Platform (UWP) `_. 41 | 42 | Java/Android: 43 | ------------- 44 | 45 | * `mozilla/androidspeech library `_ 46 | 47 | Nim: 48 | ---- 49 | 50 | * `nim_mic_vad_streaming `_. 51 | -------------------------------------------------------------------------------- /android_mic_streaming/.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | .gradle 3 | /.idea 4 | /local.properties 5 | .DS_Store 6 | /build 7 | /captures 8 | .externalNativeBuild 9 | .cxx 10 | -------------------------------------------------------------------------------- /android_mic_streaming/README.md: -------------------------------------------------------------------------------- 1 | # Android Microphone Streaming 2 | 3 | Android demo application that streams audio from the microphone to deepspeech and transcribes it. 4 | 5 | ## Prerequisites 6 | 7 | #### Download model 8 | 9 | Download the pre-trained English model and extract it: 10 | ``` 11 | curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.tflite 12 | curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer 13 | ``` 14 | 15 | Move the model files `deepspeech-0.9.3-models.pbmm`, `deepspeech-0.9.3-models.scorer`, to the demo application's data directory on your android device. 16 | Mind that the data directory will only be present after installing and launching the app once. 17 | 18 | ``` 19 | adb push deepspeech-0.9.3-models.tflite deepspeech-0.9.3-models.scorer /storage/emulated/0/Android/data/org.deepspeechdemo/files/ 20 | ``` 21 | 22 | You can also copy the files from your file browser to the device. 23 | 24 | #### Android device with USB Debugging 25 | 26 | Connect an android device and make sure to enable USB-Debugging in the developer settings of the device. 
If haven't already, you can activate your developer settings by following [this guide from android](https://developer.android.com/studio/debug/dev-options#enable). 27 | 28 | ## Installation 29 | 30 | To install the example app on your connected android device you can either use the command line or Android Studio. 31 | 32 | ### Command Line 33 | 34 | ``` 35 | cd android_mic_streaming 36 | ./gradlew installDebug 37 | ``` 38 | 39 | ### Android Studio 40 | 41 | Open the `android_mic_streaming` directory in Android Studio. 42 | Run the app and your connected android device. 43 | 44 | ## Usage 45 | 46 | Start recording by pressing the button and the app will transcribe the spoken text. 47 | 48 | ## Fine-tuning the Recognition 49 | 50 | Based on your use case or the language you are using you might change the values of `BEAM_WIDTH`, `LM_ALPHA` and `LM_BETA` to improve the speech recogintion. 51 | 52 | You can also alter the `NUM_BUFFER_ELEMENTS` to change the size of the audio data buffer that is fed into the model. -------------------------------------------------------------------------------- /android_mic_streaming/app/.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | -------------------------------------------------------------------------------- /android_mic_streaming/app/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'com.android.application' 2 | 3 | apply plugin: 'kotlin-android' 4 | 5 | apply plugin: 'kotlin-android-extensions' 6 | 7 | android { 8 | compileSdkVersion 29 9 | buildToolsVersion "29.0.2" 10 | defaultConfig { 11 | applicationId "org.deepspeechdemo" 12 | minSdkVersion 22 13 | targetSdkVersion 29 14 | versionCode 1 15 | versionName "1.0" 16 | testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner" 17 | } 18 | buildTypes { 19 | release { 20 | minifyEnabled false 21 | proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro' 22 | } 23 | } 24 | // Specify tflite file should not be compressed for the app apk 25 | aaptOptions { 26 | noCompress "tflite" 27 | } 28 | } 29 | 30 | dependencies { 31 | implementation fileTree(dir: 'libs', include: ['*.jar']) 32 | implementation"org.jetbrains.kotlin:kotlin-stdlib-jdk7:$kotlin_version" 33 | implementation 'androidx.appcompat:appcompat:1.0.2' 34 | implementation 'androidx.core:core-ktx:1.0.2' 35 | implementation 'androidx.constraintlayout:constraintlayout:1.1.3' 36 | 37 | implementation 'org.mozilla.deepspeech:libdeepspeech:0.9.3' 38 | 39 | testImplementation 'junit:junit:4.12' 40 | androidTestImplementation 'androidx.test.ext:junit:1.1.0' 41 | androidTestImplementation 'androidx.test.espresso:espresso-core:3.1.1' 42 | } 43 | 44 | -------------------------------------------------------------------------------- /android_mic_streaming/app/proguard-rules.pro: -------------------------------------------------------------------------------- 1 | # Add project specific ProGuard rules here. 2 | # You can control the set of applied configuration files using the 3 | # proguardFiles setting in build.gradle. 
4 | # 5 | # For more details, see 6 | # http://developer.android.com/guide/developing/tools/proguard.html 7 | 8 | # If your project uses WebView with JS, uncomment the following 9 | # and specify the fully qualified class name to the JavaScript interface 10 | # class: 11 | #-keepclassmembers class fqcn.of.javascript.interface.for.webview { 12 | # public *; 13 | #} 14 | 15 | # Uncomment this to preserve the line number information for 16 | # debugging stack traces. 17 | #-keepattributes SourceFile,LineNumberTable 18 | 19 | # If you keep the line number information, uncomment this to 20 | # hide the original source file name. 21 | #-renamesourcefileattribute SourceFile 22 | -------------------------------------------------------------------------------- /android_mic_streaming/app/src/main/AndroidManifest.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /android_mic_streaming/app/src/main/java/org/deepspeechdemo/MainActivity.kt: -------------------------------------------------------------------------------- 1 | package org.deepspeechdemo 2 | 3 | import android.Manifest 4 | import android.content.pm.PackageManager 5 | import android.media.AudioFormat 6 | import android.media.AudioRecord 7 | import android.media.MediaRecorder 8 | import android.os.Build 9 | import android.os.Bundle 10 | import android.view.View 11 | import androidx.appcompat.app.AppCompatActivity 12 | import androidx.core.app.ActivityCompat 13 | import kotlinx.android.synthetic.main.activity_main.* 14 | import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel 15 | import java.io.File 16 | import java.util.concurrent.atomic.AtomicBoolean 17 | 18 | class MainActivity : AppCompatActivity() { 19 | private var model: DeepSpeechModel? = null 20 | 21 | private var transcriptionThread: Thread? = null 22 | private var isRecording: AtomicBoolean = AtomicBoolean(false) 23 | 24 | private val TFLITE_MODEL_FILENAME = "deepspeech-0.9.3-models.tflite" 25 | private val SCORER_FILENAME = "deepspeech-0.9.3-models.scorer" 26 | 27 | private fun checkAudioPermission() { 28 | // Permission is automatically granted on SDK < 23 upon installation. 29 | if (Build.VERSION.SDK_INT >= 23) { 30 | val permission = Manifest.permission.RECORD_AUDIO 31 | 32 | if (checkSelfPermission(permission) != PackageManager.PERMISSION_GRANTED) { 33 | ActivityCompat.requestPermissions(this, arrayOf(permission), 3) 34 | } 35 | } 36 | } 37 | 38 | private fun transcribe() { 39 | // We read from the recorder in chunks of 2048 shorts. With a model that expects its input 40 | // at 16000Hz, this corresponds to 2048/16000 = 0.128s or 128ms. 
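// Each pass of the loop below blocks until one full buffer has been read, feeds it to
// the model and refreshes the intermediate transcription on screen, so a larger buffer
// means fewer (but longer) updates, while a smaller one updates more often at the cost
// of extra intermediateDecode() calls.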
41 | val audioBufferSize = 2048 42 | val audioData = ShortArray(audioBufferSize) 43 | 44 | runOnUiThread { btnStartInference.text = "Stop Recording" } 45 | 46 | model?.let { model -> 47 | val streamContext = model.createStream() 48 | 49 | val recorder = AudioRecord( 50 | MediaRecorder.AudioSource.VOICE_RECOGNITION, 51 | model.sampleRate(), 52 | AudioFormat.CHANNEL_IN_MONO, 53 | AudioFormat.ENCODING_PCM_16BIT, 54 | audioBufferSize 55 | ) 56 | recorder.startRecording() 57 | 58 | while (isRecording.get()) { 59 | recorder.read(audioData, 0, audioBufferSize) 60 | model.feedAudioContent(streamContext, audioData, audioData.size) 61 | val decoded = model.intermediateDecode(streamContext) 62 | runOnUiThread { transcription.text = decoded } 63 | } 64 | 65 | val decoded = model.finishStream(streamContext) 66 | 67 | runOnUiThread { 68 | btnStartInference.text = "Start Recording" 69 | transcription.text = decoded 70 | } 71 | 72 | recorder.stop() 73 | recorder.release() 74 | } 75 | } 76 | 77 | private fun createModel(): Boolean { 78 | val modelsPath = getExternalFilesDir(null).toString() 79 | val tfliteModelPath = "$modelsPath/$TFLITE_MODEL_FILENAME" 80 | val scorerPath = "$modelsPath/$SCORER_FILENAME" 81 | 82 | for (path in listOf(tfliteModelPath, scorerPath)) { 83 | if (!File(path).exists()) { 84 | status.append("Model creation failed: $path does not exist.\n") 85 | return false 86 | } 87 | } 88 | 89 | model = DeepSpeechModel(tfliteModelPath) 90 | model?.enableExternalScorer(scorerPath) 91 | 92 | return true 93 | } 94 | 95 | private fun startListening() { 96 | if (isRecording.compareAndSet(false, true)) { 97 | transcriptionThread = Thread(Runnable { transcribe() }, "Transcription Thread") 98 | transcriptionThread?.start() 99 | } 100 | } 101 | 102 | override fun onCreate(savedInstanceState: Bundle?) { 103 | super.onCreate(savedInstanceState) 104 | setContentView(R.layout.activity_main) 105 | checkAudioPermission() 106 | 107 | // Create application data directory on the device 108 | val modelsPath = getExternalFilesDir(null).toString() 109 | 110 | status.text = "Ready. Copy model files to \"$modelsPath\" if running for the first time.\n" 111 | } 112 | 113 | private fun stopListening() { 114 | isRecording.set(false) 115 | } 116 | 117 | fun onRecordClick(v: View?) 
{ 118 | if (model == null) { 119 | if (!createModel()) { 120 | return 121 | } 122 | status.append("Created model.\n") 123 | } 124 | 125 | if (isRecording.get()) { 126 | stopListening() 127 | } else { 128 | startListening() 129 | } 130 | } 131 | 132 | override fun onDestroy() { 133 | super.onDestroy() 134 | if (model != null) { 135 | model?.freeModel() 136 | } 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /android_mic_streaming/app/src/main/res/drawable-v24/ic_launcher_foreground.xml: -------------------------------------------------------------------------------- 1 | 7 | 12 | 13 | 19 | 22 | 25 | 26 | 27 | 28 | 34 | 35 | -------------------------------------------------------------------------------- /android_mic_streaming/app/src/main/res/drawable/ic_launcher_background.xml: -------------------------------------------------------------------------------- 1 | 2 | 7 | 10 | 15 | 20 | 25 | 30 | 35 | 40 | 45 | 50 | 55 | 60 | 65 | 70 | 75 | 80 | 85 | 90 | 95 | 100 | 105 | 110 | 115 | 120 | 125 | 130 | 135 | 140 | 145 | 150 | 155 | 160 | 165 | 170 | 171 | -------------------------------------------------------------------------------- /android_mic_streaming/app/src/main/res/layout/activity_main.xml: -------------------------------------------------------------------------------- 1 | 2 | 8 | 9 | 14 | 15 | 23 | 24 | 33 | 34 | 37 | 38 | 46 | 47 | 56 | 57 | 61 | 62 | 29 | No file selected... 30 | 31 | 32 | 33 | 34 | Select an audio input: 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | Results 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /uwp/DeepSpeechUWP/Package.appxmanifest: -------------------------------------------------------------------------------- 1 |  2 | 3 | 8 | 9 | 13 | 14 | 15 | 16 | 17 | DeepSpeechUWP 18 | erikz 19 | Assets\StoreLogo.png 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 34 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /uwp/DeepSpeechUWP/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 
8 | [assembly: AssemblyTitle("DeepSpeechUWP")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("")] 12 | [assembly: AssemblyProduct("DeepSpeechUWP")] 13 | [assembly: AssemblyCopyright("Copyright © 2020")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Version information for an assembly consists of the following four values: 18 | // 19 | // Major Version 20 | // Minor Version 21 | // Build Number 22 | // Revision 23 | // 24 | // You can specify all the values or you can default the Build and Revision Numbers 25 | // by using the '*' as shown below: 26 | // [assembly: AssemblyVersion("1.0.*")] 27 | [assembly: AssemblyVersion("1.0.0.0")] 28 | [assembly: AssemblyFileVersion("1.0.0.0")] 29 | [assembly: ComVisible(false)] -------------------------------------------------------------------------------- /uwp/DeepSpeechUWP/Properties/Default.rd.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | 19 | 20 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /uwp/DeepSpeechUWP/models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/DeepSpeech-examples/0bfefeb8e4769e3c895b9bc6c5a34cfbdfcbd645/uwp/DeepSpeechUWP/models/.gitkeep -------------------------------------------------------------------------------- /vad_transcriber/README.md: -------------------------------------------------------------------------------- 1 | ## Transcribing longer audio clips 2 | 3 | The Command and GUI tools perform transcription on long wav files. 4 | They take in a wav file of any duration, use the WebRTC Voice Activity Detector (VAD) 5 | to split it into smaller chunks and finally save a consolidated transcript. 6 | 7 | ### 0. Prerequisites 8 | #### 0.1 Install requiered packages 9 | Install the package which contains rec on the machine: 10 | 11 | Fedora: 12 | 13 | ``` sudo dnf install sox ``` 14 | 15 | Tested on: 29 16 | 17 | Ubuntu/Debian 18 | 19 | ``` sudo apt install sox ``` 20 | 21 | A list of distributions where the package is available can be found at: https://pkgs.org/download/sox 22 | 23 | #### 0.1 Download Deepspeech 24 | Either clone from git via git clone, or Download a version from the release page 25 | 26 | For the next steps we assume you have extracted the files to `~/Deepspeech` 27 | 28 | 29 | #### 0.2 Setup your environment 30 | 31 | Ubuntu/Debian: 32 | 33 | ``` 34 | ~/Deepspeech$ sudo apt install virtualenv 35 | ~/Deepspeech$ cd examples/vad_transcriber 36 | ~/Deepspeech/examples/vad_transcriber$ virtualenv -p python3 venv 37 | ~/Deepspeech/examples/vad_transcriber$ source venv/bin/activate 38 | (venv) ~/Deepspeech/examples/vad_transcriber$ pip3 install -r requirements.txt 39 | ``` 40 | 41 | Fedora 42 | 43 | ``` 44 | ~/Deepspeech$ sudo dnf install python-virtualenv 45 | ~/Deepspeech$ cd examples/vad_transcriber 46 | ~/Deepspeech/examples/vad_transcriber$ virtualenv -p python3 venv 47 | ~/Deepspeech/examples/vad_transcriber$ source venv/bin/activate 48 | (venv) ~/Deepspeech/examples/vad_transcriber$ pip3 install -r requirements.txt 49 | ``` 50 | 51 | Tested on: 29 52 | 53 | ### 1. Command line tool 54 | 55 | The command line tool processes a wav file of any duration and returns a trancript 56 | which will the saved in the same directory as the input audio file. 
57 | 58 | The command line tool gives you control over the aggressiveness of the VAD. 59 | Set the aggressiveness mode, to an integer between 0 and 3. 60 | 0 being the least aggressive about filtering out non-speech, 3 is the most aggressive. 61 | 62 | ``` 63 | (venv) ~/Deepspeech/examples/vad_transcriber 64 | $ python3 audioTranscript_cmd.py --aggressive 1 --audio ./audio/guido-van-rossum.wav --model ./models/0.4.1/ 65 | 66 | 67 | Filename Duration(s) Inference Time(s) Model Load Time(s) Scorer Load Time(s) 68 | sample_rec.wav 13.710 20.797 5.593 17.742 69 | 70 | ``` 71 | 72 | **Note:** Only `wav` files with a 16kHz sample rate are supported for now, you can convert your files to the appropriate format with ffmpeg if available on your system. 73 | 74 | ffmpeg -i infile.mp3 -ar 16000 -ac 1 outfile.wav 75 | 76 | ### 2. Minimalistic GUI 77 | 78 | The GUI tool does the same job as the CLI tool. The VAD is fixed at an aggressiveness of 1. 79 | The output is displayed in the transcription window and saved into the directory as the input 80 | audio file as well. 81 | 82 | ``` 83 | (venv) ~/Deepspeech/examples/vad_transcriber 84 | $ python3 audioTranscript_gui.py 85 | 86 | ``` 87 | 88 | ![Deepspeech Transcriber](../../doc/audioTranscript.png) 89 | 90 | 91 | #### 2.1. Sporadic failures in pyqt 92 | Some systems have encountered **_Cannot mix incompatible Qt library with this with this library_** issue. 93 | In such a scenario, the GUI tool will not work. The following steps is known to have solved the issue in most cases 94 | ``` 95 | (venv) ~/Deepspeech/examples/vad_transcriber$ pip3 uninstall pyqt5 96 | (venv) ~/Deepspeech/examples/vad_transcriber$ sudo apt install python3-pyqt5 canberra-gtk-module 97 | (venv) ~/Deepspeech/examples/vad_transcriber$ export PYTHONPATH=/usr/lib/python3/dist-packages/ 98 | (venv) ~/Deepspeech/examples/vad_transcriber$ python3 audioTranscript_gui.py 99 | 100 | ``` 101 | #### 2.2 Useful Tips 102 | ##### The GUI programm immediately crashes when you press start recording 103 | This happens when you don't load the models via the "Browse Models" button, before pressing the "Start recording" button. 104 | 105 | ##### What does error XYZ mean? 106 | You can find a list of error codes and what they mean at https://mozilla-voice-stt.readthedocs.io/en/latest/Error-Codes.html 107 | 108 | -------------------------------------------------------------------------------- /vad_transcriber/audioTranscript_cmd.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import logging 4 | import argparse 5 | import subprocess 6 | import shlex 7 | import numpy as np 8 | import wavTranscriber 9 | 10 | # Debug helpers 11 | logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) 12 | 13 | 14 | def main(args): 15 | parser = argparse.ArgumentParser(description='Transcribe long audio files using webRTC VAD or use the streaming interface') 16 | parser.add_argument('--aggressive', type=int, choices=range(4), required=False, 17 | help='Determines how aggressive filtering out non-speech is. 
(Interger between 0-3)') 18 | parser.add_argument('--audio', required=False, 19 | help='Path to the audio file to run (WAV format)') 20 | parser.add_argument('--model', required=True, 21 | help='Path to directory that contains all model files (output_graph and scorer)') 22 | parser.add_argument('--stream', required=False, action='store_true', 23 | help='To use deepspeech streaming interface') 24 | args = parser.parse_args() 25 | if args.stream is True: 26 | print("Opening mic for streaming") 27 | elif args.audio is not None: 28 | logging.debug("Transcribing audio file @ %s" % args.audio) 29 | else: 30 | parser.print_help() 31 | parser.exit() 32 | 33 | # Point to a path containing the pre-trained models & resolve ~ if used 34 | dirName = os.path.expanduser(args.model) 35 | 36 | # Resolve all the paths of model files 37 | output_graph, scorer = wavTranscriber.resolve_models(dirName) 38 | 39 | # Load output_graph, alpahbet and scorer 40 | model_retval = wavTranscriber.load_model(output_graph, scorer) 41 | 42 | if args.audio is not None: 43 | title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'Scorer Load Time(s)'] 44 | print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4])) 45 | 46 | inference_time = 0.0 47 | 48 | # Run VAD on the input file 49 | waveFile = args.audio 50 | segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(waveFile, args.aggressive) 51 | f = open(waveFile.rstrip(".wav") + ".txt", 'w') 52 | logging.debug("Saving Transcript @: %s" % waveFile.rstrip(".wav") + ".txt") 53 | 54 | for i, segment in enumerate(segments): 55 | # Run deepspeech on the chunk that just completed VAD 56 | logging.debug("Processing chunk %002d" % (i,)) 57 | audio = np.frombuffer(segment, dtype=np.int16) 58 | output = wavTranscriber.stt(model_retval[0], audio, sample_rate) 59 | inference_time += output[1] 60 | logging.debug("Transcript: %s" % output[0]) 61 | 62 | f.write(output[0] + " ") 63 | 64 | # Summary of the files processed 65 | f.close() 66 | 67 | # Extract filename from the full file path 68 | filename, ext = os.path.split(os.path.basename(waveFile)) 69 | logging.debug("************************************************************************************************************") 70 | logging.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4])) 71 | logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2])) 72 | logging.debug("************************************************************************************************************") 73 | print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2])) 74 | else: 75 | sctx = model_retval[0].createStream() 76 | subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'), 77 | stdout=subprocess.PIPE, 78 | bufsize=0) 79 | print('You can start speaking now. 
Press Control-C to stop recording.') 80 | 81 | try: 82 | while True: 83 | data = subproc.stdout.read(512) 84 | sctx.feedAudioContent(np.frombuffer(data, np.int16)) 85 | except KeyboardInterrupt: 86 | print('Transcription: ', sctx.finishStream()) 87 | subproc.terminate() 88 | subproc.wait() 89 | 90 | 91 | if __name__ == '__main__': 92 | main(sys.argv[1:]) 93 | -------------------------------------------------------------------------------- /vad_transcriber/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeech==0.9.3 2 | webrtcvad 3 | pyqt5 4 | -------------------------------------------------------------------------------- /vad_transcriber/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -xe 4 | 5 | THIS=$(dirname "$0") 6 | 7 | pushd ${THIS} 8 | source ../tests.sh 9 | 10 | pip install --user $(get_python_wheel_url "$1") 11 | pip install --user -r <(grep -v deepspeech requirements.txt) 12 | 13 | python audioTranscript_cmd.py \ 14 | --audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \ 15 | --aggressive 0 \ 16 | --model $HOME/DeepSpeech/models/ 17 | 18 | python audioTranscript_cmd.py \ 19 | --audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \ 20 | --aggressive 0 \ 21 | --model $HOME/DeepSpeech/models/ \ 22 | --stream 23 | popd 24 | -------------------------------------------------------------------------------- /vad_transcriber/wavSplit.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import contextlib 3 | import wave 4 | 5 | 6 | def read_wave(path): 7 | """Reads a .wav file. 8 | 9 | Takes the path, and returns (PCM audio data, sample rate). 10 | """ 11 | with contextlib.closing(wave.open(path, 'rb')) as wf: 12 | num_channels = wf.getnchannels() 13 | assert num_channels == 1 14 | sample_width = wf.getsampwidth() 15 | assert sample_width == 2 16 | sample_rate = wf.getframerate() 17 | assert sample_rate in (8000, 16000, 32000) 18 | frames = wf.getnframes() 19 | pcm_data = wf.readframes(frames) 20 | duration = frames / sample_rate 21 | return pcm_data, sample_rate, duration 22 | 23 | 24 | def write_wave(path, audio, sample_rate): 25 | """Writes a .wav file. 26 | 27 | Takes path, PCM audio data, and sample rate. 28 | """ 29 | with contextlib.closing(wave.open(path, 'wb')) as wf: 30 | wf.setnchannels(1) 31 | wf.setsampwidth(2) 32 | wf.setframerate(sample_rate) 33 | wf.writeframes(audio) 34 | 35 | 36 | class Frame(object): 37 | """Represents a "frame" of audio data.""" 38 | def __init__(self, bytes, timestamp, duration): 39 | self.bytes = bytes 40 | self.timestamp = timestamp 41 | self.duration = duration 42 | 43 | 44 | def frame_generator(frame_duration_ms, audio, sample_rate): 45 | """Generates audio frames from PCM audio data. 46 | 47 | Takes the desired frame duration in milliseconds, the PCM data, and 48 | the sample rate. 49 | 50 | Yields Frames of the requested duration. 51 | """ 52 | n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) 53 | offset = 0 54 | timestamp = 0.0 55 | duration = (float(n) / sample_rate) / 2.0 56 | while offset + n < len(audio): 57 | yield Frame(audio[offset:offset + n], timestamp, duration) 58 | timestamp += duration 59 | offset += n 60 | 61 | 62 | def vad_collector(sample_rate, frame_duration_ms, 63 | padding_duration_ms, vad, frames): 64 | """Filters out non-voiced audio frames. 65 | 66 | Given a webrtcvad.Vad and a source of audio frames, yields only 67 | the voiced audio. 
68 | 69 | Uses a padded, sliding window algorithm over the audio frames. 70 | When more than 90% of the frames in the window are voiced (as 71 | reported by the VAD), the collector triggers and begins yielding 72 | audio frames. Then the collector waits until 90% of the frames in 73 | the window are unvoiced to detrigger. 74 | 75 | The window is padded at the front and back to provide a small 76 | amount of silence or the beginnings/endings of speech around the 77 | voiced frames. 78 | 79 | Arguments: 80 | 81 | sample_rate - The audio sample rate, in Hz. 82 | frame_duration_ms - The frame duration in milliseconds. 83 | padding_duration_ms - The amount to pad the window, in milliseconds. 84 | vad - An instance of webrtcvad.Vad. 85 | frames - a source of audio frames (sequence or generator). 86 | 87 | Returns: A generator that yields PCM audio data. 88 | """ 89 | num_padding_frames = int(padding_duration_ms / frame_duration_ms) 90 | # We use a deque for our sliding window/ring buffer. 91 | ring_buffer = collections.deque(maxlen=num_padding_frames) 92 | # We have two states: TRIGGERED and NOTTRIGGERED. We start in the 93 | # NOTTRIGGERED state. 94 | triggered = False 95 | 96 | voiced_frames = [] 97 | for frame in frames: 98 | is_speech = vad.is_speech(frame.bytes, sample_rate) 99 | 100 | if not triggered: 101 | ring_buffer.append((frame, is_speech)) 102 | num_voiced = len([f for f, speech in ring_buffer if speech]) 103 | # If we're NOTTRIGGERED and more than 90% of the frames in 104 | # the ring buffer are voiced frames, then enter the 105 | # TRIGGERED state. 106 | if num_voiced > 0.9 * ring_buffer.maxlen: 107 | triggered = True 108 | # We want to yield all the audio we see from now until 109 | # we are NOTTRIGGERED, but we have to start with the 110 | # audio that's already in the ring buffer. 111 | for f, s in ring_buffer: 112 | voiced_frames.append(f) 113 | ring_buffer.clear() 114 | else: 115 | # We're in the TRIGGERED state, so collect the audio data 116 | # and add it to the ring buffer. 117 | voiced_frames.append(frame) 118 | ring_buffer.append((frame, is_speech)) 119 | num_unvoiced = len([f for f, speech in ring_buffer if not speech]) 120 | # If more than 90% of the frames in the ring buffer are 121 | # unvoiced, then enter NOTTRIGGERED and yield whatever 122 | # audio we've collected. 123 | if num_unvoiced > 0.9 * ring_buffer.maxlen: 124 | triggered = False 125 | yield b''.join([f.bytes for f in voiced_frames]) 126 | ring_buffer.clear() 127 | voiced_frames = [] 128 | if triggered: 129 | pass 130 | # If we have any leftover voiced audio when we run out of input, 131 | # yield it. 132 | if voiced_frames: 133 | yield b''.join([f.bytes for f in voiced_frames]) 134 | 135 | -------------------------------------------------------------------------------- /vad_transcriber/wavTranscriber.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import webrtcvad 3 | import logging 4 | import wavSplit 5 | from deepspeech import Model 6 | from timeit import default_timer as timer 7 | 8 | ''' 9 | Load the pre-trained model into the memory 10 | @param models: Output Grapgh Protocol Buffer file 11 | @param scorer: Scorer file 12 | 13 | @Retval 14 | Returns a list [DeepSpeech Object, Model Load Time, Scorer Load Time] 15 | ''' 16 | def load_model(models, scorer): 17 | model_load_start = timer() 18 | ds = Model(models) 19 | model_load_end = timer() - model_load_start 20 | logging.debug("Loaded model in %0.3fs." 
% (model_load_end)) 21 | 22 | scorer_load_start = timer() 23 | ds.enableExternalScorer(scorer) 24 | scorer_load_end = timer() - scorer_load_start 25 | logging.debug('Loaded external scorer in %0.3fs.' % (scorer_load_end)) 26 | 27 | return [ds, model_load_end, scorer_load_end] 28 | 29 | ''' 30 | Run Inference on input audio file 31 | @param ds: Deepspeech object 32 | @param audio: Input audio for running inference on 33 | @param fs: Sample rate of the input audio file 34 | 35 | @Retval: 36 | Returns a list [Inference, Inference Time, Audio Length] 37 | 38 | ''' 39 | def stt(ds, audio, fs): 40 | inference_time = 0.0 41 | audio_length = len(audio) * (1 / fs) 42 | 43 | # Run Deepspeech 44 | logging.debug('Running inference...') 45 | inference_start = timer() 46 | output = ds.stt(audio) 47 | inference_end = timer() - inference_start 48 | inference_time += inference_end 49 | logging.debug('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length)) 50 | 51 | return [output, inference_time] 52 | 53 | ''' 54 | Resolve directory path for the models and fetch each of them. 55 | @param dirName: Path to the directory containing pre-trained models 56 | 57 | @Retval: 58 | Retunns a tuple containing each of the model files (pb, scorer) 59 | ''' 60 | def resolve_models(dirName): 61 | pb = glob.glob(dirName + "/*.pbmm")[0] 62 | logging.debug("Found Model: %s" % pb) 63 | 64 | scorer = glob.glob(dirName + "/*.scorer")[0] 65 | logging.debug("Found scorer: %s" % scorer) 66 | 67 | return pb, scorer 68 | 69 | ''' 70 | Generate VAD segments. Filters out non-voiced audio frames. 71 | @param waveFile: Input wav file to run VAD on.0 72 | 73 | @Retval: 74 | Returns tuple of 75 | segments: a bytearray of multiple smaller audio frames 76 | (The longer audio split into mutiple smaller one's) 77 | sample_rate: Sample rate of the input audio file 78 | audio_length: Duraton of the input audio file 79 | 80 | ''' 81 | def vad_segment_generator(wavFile, aggressiveness): 82 | logging.debug("Caught the wav file @: %s" % (wavFile)) 83 | audio, sample_rate, audio_length = wavSplit.read_wave(wavFile) 84 | assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!" 85 | vad = webrtcvad.Vad(int(aggressiveness)) 86 | frames = wavSplit.frame_generator(30, audio, sample_rate) 87 | frames = list(frames) 88 | segments = wavSplit.vad_collector(sample_rate, 30, 300, vad, frames) 89 | 90 | return segments, sample_rate, audio_length 91 | -------------------------------------------------------------------------------- /web_microphone_websocket/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | /deepspeech-0.6.0-models 4 | 5 | # dependencies 6 | /node_modules 7 | /.pnp 8 | .pnp.js 9 | 10 | # testing 11 | /coverage 12 | 13 | # production 14 | /build 15 | 16 | # misc 17 | .DS_Store 18 | .env.local 19 | .env.development.local 20 | .env.test.local 21 | .env.production.local 22 | 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | -------------------------------------------------------------------------------- /web_microphone_websocket/Readme.md: -------------------------------------------------------------------------------- 1 | # Web Microphone Websocket 2 | 3 | This is an example of a ReactJS web application streaming microphone audio from the browser 4 | to a NodeJS server and transmitting the DeepSpeech results back to the browser. 
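Under the hood the browser and the Node server exchange a small set of socket.io events: the client sends 16-bit, 16 kHz PCM chunks as `stream-data` (plus `stream-end` and `stream-reset` control events) and the server answers with `recognize` results. The sketch below shows that round trip from a plain Node script; it assumes the server is already running on port 4000 and uses a placeholder `audio.raw` file containing raw 16 kHz / 16-bit mono PCM.

```javascript
// Minimal sketch of the websocket protocol used by server.js (not part of the app).
const fs = require('fs');
const io = require('socket.io-client');

const socket = io.connect('http://localhost:4000', {});

socket.on('recognize', (results) => {
    // results also carries recogTime and audioLength (see finishStream() in server.js)
    console.log('recognized:', results.text);
});

socket.on('connect', () => {
    fs.createReadStream('audio.raw', {highWaterMark: 4096})   // hypothetical input file
        .on('data', (chunk) => socket.emit('stream-data', chunk))
        .on('end', () => socket.emit('stream-end'));
});
```

The React client in `src/App.js` does the same thing with live microphone audio, after `public/downsampling_worker.js` has converted it to 16 kHz PCM.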
5 | 6 | #### Download the pre-trained model (1.8GB): 7 | 8 | ``` 9 | wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm 10 | wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer 11 | ``` 12 | 13 | #### Install: 14 | 15 | ``` 16 | yarn install 17 | ``` 18 | 19 | #### Run ReactJS Client: 20 | 21 | ``` 22 | yarn start 23 | ``` 24 | 25 | #### Run NodeJS Server (in a separate terminal window): 26 | 27 | ``` 28 | node server.js 29 | ``` -------------------------------------------------------------------------------- /web_microphone_websocket/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "web-microphone-websocket", 3 | "version": "0.1.0", 4 | "private": true, 5 | "dependencies": { 6 | "@testing-library/jest-dom": "^4.2.4", 7 | "@testing-library/react": "^9.3.2", 8 | "@testing-library/user-event": "^7.1.2", 9 | "chai": "^4.2.0", 10 | "chai-http": "^4.3.0", 11 | "deepspeech": "^0.9.3", 12 | "defaults": "^1.0.3", 13 | "mocha": "^6.1.4", 14 | "node-vad": "^1.1.4", 15 | "react": "^16.12.0", 16 | "react-dom": "^16.12.0", 17 | "react-scripts": "^3.4.0", 18 | "should": "^13.2.3", 19 | "should-http": "^0.1.1", 20 | "socket.io": "^2.3.0", 21 | "socket.io-client": "^2.3.0" 22 | }, 23 | "scripts": { 24 | "start": "react-scripts start", 25 | "build": "react-scripts build", 26 | "test:client": "react-scripts test --env=jsdom --watchAll=false --coverage", 27 | "test:server": "NODE_ENV=dev mocha --recursive ./test/config.js ./test --exit", 28 | "eject": "react-scripts eject" 29 | }, 30 | "eslintConfig": { 31 | "extends": "react-app" 32 | }, 33 | "browserslist": { 34 | "production": [ 35 | ">0.2%", 36 | "not dead", 37 | "not op_mini all" 38 | ], 39 | "development": [ 40 | "last 1 chrome version", 41 | "last 1 firefox version", 42 | "last 1 safari version" 43 | ] 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /web_microphone_websocket/public/downsampling_worker.js: -------------------------------------------------------------------------------- 1 | // source: 2 | // https://github.com/Picovoice/web-voice-processor/blob/master/src/downsampling_worker.js 3 | 4 | onmessage = function (e) { 5 | switch (e.data.command) { 6 | case "init": 7 | init(e.data.inputSampleRate); 8 | break; 9 | case "process": 10 | process(e.data.inputFrame); 11 | break; 12 | case "reset": 13 | reset(); 14 | break; 15 | } 16 | }; 17 | 18 | let inputSampleRate; 19 | let inputBuffer = []; 20 | 21 | function init(x) { 22 | inputSampleRate = x; 23 | } 24 | 25 | function process(inputFrame) { 26 | for (let i = 0; i < inputFrame.length; i++) { 27 | inputBuffer.push((inputFrame[i]) * 32767); 28 | } 29 | 30 | const PV_SAMPLE_RATE = 16000; 31 | const PV_FRAME_LENGTH = 512; 32 | 33 | while ((inputBuffer.length * PV_SAMPLE_RATE / inputSampleRate) > PV_FRAME_LENGTH) { 34 | let outputFrame = new Int16Array(PV_FRAME_LENGTH); 35 | let sum = 0; 36 | let num = 0; 37 | let outputIndex = 0; 38 | let inputIndex = 0; 39 | 40 | while (outputIndex < PV_FRAME_LENGTH) { 41 | sum = 0; 42 | num = 0; 43 | while (inputIndex < Math.min(inputBuffer.length, (outputIndex + 1) * inputSampleRate / PV_SAMPLE_RATE)) { 44 | sum += inputBuffer[inputIndex]; 45 | num++; 46 | inputIndex++; 47 | } 48 | outputFrame[outputIndex] = sum / num; 49 | outputIndex++; 50 | } 51 | 52 | postMessage(outputFrame); 53 | 54 | inputBuffer = inputBuffer.slice(inputIndex); 55 | } 56 | } 57 | 58 | 
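// process() above is a simple block-averaging resampler: incoming Float32 samples are
// scaled to the 16-bit range, averaged down from the browser's native rate (typically
// 44.1 or 48 kHz) to 16 kHz, and posted back to the main thread as 512-sample Int16
// frames, which is the format the DeepSpeech server expects. reset() below just clears
// the leftover input buffer.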
function reset() { 59 | inputBuffer = []; 60 | } -------------------------------------------------------------------------------- /web_microphone_websocket/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/DeepSpeech-examples/0bfefeb8e4769e3c895b9bc6c5a34cfbdfcbd645/web_microphone_websocket/public/favicon.ico -------------------------------------------------------------------------------- /web_microphone_websocket/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 12 | 13 | 17 | 18 | 27 | DeepSpeech - Web Microphone Websocket Example 28 | 29 | 30 | 31 |
32 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /web_microphone_websocket/public/logo192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/DeepSpeech-examples/0bfefeb8e4769e3c895b9bc6c5a34cfbdfcbd645/web_microphone_websocket/public/logo192.png -------------------------------------------------------------------------------- /web_microphone_websocket/public/logo512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/DeepSpeech-examples/0bfefeb8e4769e3c895b9bc6c5a34cfbdfcbd645/web_microphone_websocket/public/logo512.png -------------------------------------------------------------------------------- /web_microphone_websocket/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "React App", 3 | "name": "Create React App Sample", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | }, 10 | { 11 | "src": "logo192.png", 12 | "type": "image/png", 13 | "sizes": "192x192" 14 | }, 15 | { 16 | "src": "logo512.png", 17 | "type": "image/png", 18 | "sizes": "512x512" 19 | } 20 | ], 21 | "start_url": ".", 22 | "display": "standalone", 23 | "theme_color": "#000000", 24 | "background_color": "#ffffff" 25 | } 26 | -------------------------------------------------------------------------------- /web_microphone_websocket/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | -------------------------------------------------------------------------------- /web_microphone_websocket/server.js: -------------------------------------------------------------------------------- 1 | const http = require('http'); 2 | const socketIO = require('socket.io'); 3 | const DeepSpeech = require('deepspeech'); 4 | const VAD = require('node-vad'); 5 | 6 | let DEEPSPEECH_MODEL = __dirname + '/deepspeech-0.9.3-models'; // path to deepspeech english model directory 7 | 8 | let SILENCE_THRESHOLD = 200; // how many milliseconds of inactivity before processing the audio 9 | 10 | const SERVER_PORT = 4000; // websocket server port 11 | 12 | // const VAD_MODE = VAD.Mode.NORMAL; 13 | // const VAD_MODE = VAD.Mode.LOW_BITRATE; 14 | // const VAD_MODE = VAD.Mode.AGGRESSIVE; 15 | const VAD_MODE = VAD.Mode.VERY_AGGRESSIVE; 16 | const vad = new VAD(VAD_MODE); 17 | 18 | function createModel(modelDir) { 19 | let modelPath = modelDir + '.pbmm'; 20 | let scorerPath = modelDir + '.scorer'; 21 | let model = new DeepSpeech.Model(modelPath); 22 | model.enableExternalScorer(scorerPath); 23 | return model; 24 | } 25 | 26 | let englishModel = createModel(DEEPSPEECH_MODEL); 27 | 28 | let modelStream; 29 | let recordedChunks = 0; 30 | let silenceStart = null; 31 | let recordedAudioLength = 0; 32 | let endTimeout = null; 33 | let silenceBuffers = []; 34 | 35 | function processAudioStream(data, callback) { 36 | vad.processAudio(data, 16000).then((res) => { 37 | switch (res) { 38 | case VAD.Event.ERROR: 39 | console.log("VAD ERROR"); 40 | break; 41 | case VAD.Event.NOISE: 42 | console.log("VAD NOISE"); 43 | break; 44 | case VAD.Event.SILENCE: 45 | processSilence(data, callback); 46 | break; 47 | case VAD.Event.VOICE: 48 | processVoice(data); 49 | break; 50 | default: 51 | console.log('default', res); 52 | 53 | } 
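// While a recording is in progress, SILENCE frames are still fed to the stream and,
// after SILENCE_THRESHOLD ms of quiet, trigger an intermediate decode via the callback;
// VOICE frames start or extend the recording (see processSilence/processVoice below).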
54 | }); 55 | 56 | // timeout after 1s of inactivity 57 | clearTimeout(endTimeout); 58 | endTimeout = setTimeout(function() { 59 | console.log('timeout'); 60 | resetAudioStream(); 61 | },1000); 62 | } 63 | 64 | function endAudioStream(callback) { 65 | console.log('[end]'); 66 | let results = intermediateDecode(); 67 | if (results) { 68 | if (callback) { 69 | callback(results); 70 | } 71 | } 72 | } 73 | 74 | function resetAudioStream() { 75 | clearTimeout(endTimeout); 76 | console.log('[reset]'); 77 | intermediateDecode(); // ignore results 78 | recordedChunks = 0; 79 | silenceStart = null; 80 | } 81 | 82 | function processSilence(data, callback) { 83 | if (recordedChunks > 0) { // recording is on 84 | process.stdout.write('-'); // silence detected while recording 85 | 86 | feedAudioContent(data); 87 | 88 | if (silenceStart === null) { 89 | silenceStart = new Date().getTime(); 90 | } 91 | else { 92 | let now = new Date().getTime(); 93 | if (now - silenceStart > SILENCE_THRESHOLD) { 94 | silenceStart = null; 95 | console.log('[end]'); 96 | let results = intermediateDecode(); 97 | if (results) { 98 | if (callback) { 99 | callback(results); 100 | } 101 | } 102 | } 103 | } 104 | } 105 | else { 106 | process.stdout.write('.'); // silence detected while not recording 107 | bufferSilence(data); 108 | } 109 | } 110 | 111 | function bufferSilence(data) { 112 | // VAD has a tendency to cut the first bit of audio data from the start of a recording 113 | // so keep a buffer of that first bit of audio and in addBufferedSilence() reattach it to the beginning of the recording 114 | silenceBuffers.push(data); 115 | if (silenceBuffers.length >= 3) { 116 | silenceBuffers.shift(); 117 | } 118 | } 119 | 120 | function addBufferedSilence(data) { 121 | let audioBuffer; 122 | if (silenceBuffers.length) { 123 | silenceBuffers.push(data); 124 | let length = 0; 125 | silenceBuffers.forEach(function (buf) { 126 | length += buf.length; 127 | }); 128 | audioBuffer = Buffer.concat(silenceBuffers, length); 129 | silenceBuffers = []; 130 | } 131 | else audioBuffer = data; 132 | return audioBuffer; 133 | } 134 | 135 | function processVoice(data) { 136 | silenceStart = null; 137 | if (recordedChunks === 0) { 138 | console.log(''); 139 | process.stdout.write('[start]'); // recording started 140 | } 141 | else { 142 | process.stdout.write('='); // still recording 143 | } 144 | recordedChunks++; 145 | 146 | data = addBufferedSilence(data); 147 | feedAudioContent(data); 148 | } 149 | 150 | function createStream() { 151 | modelStream = englishModel.createStream(); 152 | recordedChunks = 0; 153 | recordedAudioLength = 0; 154 | } 155 | 156 | function finishStream() { 157 | if (modelStream) { 158 | let start = new Date(); 159 | let text = modelStream.finishStream(); 160 | if (text) { 161 | console.log(''); 162 | console.log('Recognized Text:', text); 163 | let recogTime = new Date().getTime() - start.getTime(); 164 | return { 165 | text, 166 | recogTime, 167 | audioLength: Math.round(recordedAudioLength) 168 | }; 169 | } 170 | } 171 | silenceBuffers = []; 172 | modelStream = null; 173 | } 174 | 175 | function intermediateDecode() { 176 | let results = finishStream(); 177 | createStream(); 178 | return results; 179 | } 180 | 181 | function feedAudioContent(chunk) { 182 | recordedAudioLength += (chunk.length / 2) * (1 / 16000) * 1000; 183 | modelStream.feedAudioContent(chunk); 184 | } 185 | 186 | const app = http.createServer(function (req, res) { 187 | res.writeHead(200); 188 | res.write('web-microphone-websocket'); 189 | res.end(); 
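// This plain HTTP response doubles as a health check: test/server.test.js expects
// GET / to return 200 with exactly this body.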
190 | }); 191 | 192 | const io = socketIO(app, {}); 193 | io.set('origins', '*:*'); 194 | 195 | io.on('connection', function(socket) { 196 | console.log('client connected'); 197 | 198 | socket.once('disconnect', () => { 199 | console.log('client disconnected'); 200 | }); 201 | 202 | createStream(); 203 | 204 | socket.on('stream-data', function(data) { 205 | processAudioStream(data, (results) => { 206 | socket.emit('recognize', results); 207 | }); 208 | }); 209 | 210 | socket.on('stream-end', function() { 211 | endAudioStream((results) => { 212 | socket.emit('recognize', results); 213 | }); 214 | }); 215 | 216 | socket.on('stream-reset', function() { 217 | resetAudioStream(); 218 | }); 219 | }); 220 | 221 | app.listen(SERVER_PORT, 'localhost', () => { 222 | console.log('Socket server listening on:', SERVER_PORT); 223 | }); 224 | 225 | module.exports = app; -------------------------------------------------------------------------------- /web_microphone_websocket/src/App.js: -------------------------------------------------------------------------------- 1 | import React, {Component} from 'react'; 2 | import io from 'socket.io-client'; 3 | 4 | const DOWNSAMPLING_WORKER = './downsampling_worker.js'; 5 | 6 | class App extends Component { 7 | constructor(props) { 8 | super(props); 9 | this.state = { 10 | connected: false, 11 | recording: false, 12 | recordingStart: 0, 13 | recordingTime: 0, 14 | recognitionOutput: [] 15 | }; 16 | } 17 | 18 | componentDidMount() { 19 | let recognitionCount = 0; 20 | 21 | this.socket = io.connect('http://localhost:4000', {}); 22 | 23 | this.socket.on('connect', () => { 24 | console.log('socket connected'); 25 | this.setState({connected: true}); 26 | }); 27 | 28 | this.socket.on('disconnect', () => { 29 | console.log('socket disconnected'); 30 | this.setState({connected: false}); 31 | this.stopRecording(); 32 | }); 33 | 34 | this.socket.on('recognize', (results) => { 35 | console.log('recognized:', results); 36 | const {recognitionOutput} = this.state; 37 | results.id = recognitionCount++; 38 | recognitionOutput.unshift(results); 39 | this.setState({recognitionOutput}); 40 | }); 41 | } 42 | 43 | render() { 44 | return (
<div className="App"> 45 | <div> 46 | <button disabled={!this.state.connected || this.state.recording} 47 | onClick={this.startRecording}> 48 | Start Recording</button> 49 | 
50 | <button disabled={!this.state.recording} 51 | onClick={this.stopRecording}> 52 | Stop Recording</button> 53 | 
54 | {this.renderTime()} 55 | </div> 56 | {this.renderRecognitionOutput()} 57 | </div>); 58 | } 59 | 
60 | renderTime() { 61 | return (<span> 62 | {(Math.round(this.state.recordingTime / 100) / 10).toFixed(1)}s 63 | </span>); 64 | } 65 | 
66 | renderRecognitionOutput() { 67 | return (<ul> 68 | {this.state.recognitionOutput.map((r) => { 69 | return (<li key={r.id}>{r.text}</li>); 70 | })} 71 | </ul>
) 72 | } 73 | 74 | createAudioProcessor(audioContext, audioSource) { 75 | let processor = audioContext.createScriptProcessor(4096, 1, 1); 76 | 77 | const sampleRate = audioSource.context.sampleRate; 78 | 79 | let downsampler = new Worker(DOWNSAMPLING_WORKER); 80 | downsampler.postMessage({command: "init", inputSampleRate: sampleRate}); 81 | downsampler.onmessage = (e) => { 82 | if (this.socket.connected) { 83 | this.socket.emit('stream-data', e.data.buffer); 84 | } 85 | }; 86 | 87 | processor.onaudioprocess = (event) => { 88 | var data = event.inputBuffer.getChannelData(0); 89 | downsampler.postMessage({command: "process", inputFrame: data}); 90 | }; 91 | 92 | processor.shutdown = () => { 93 | processor.disconnect(); 94 | this.onaudioprocess = null; 95 | }; 96 | 97 | processor.connect(audioContext.destination); 98 | 99 | return processor; 100 | } 101 | 102 | startRecording = e => { 103 | if (!this.state.recording) { 104 | this.recordingInterval = setInterval(() => { 105 | let recordingTime = new Date().getTime() - this.state.recordingStart; 106 | this.setState({recordingTime}); 107 | }, 100); 108 | 109 | this.setState({ 110 | recording: true, 111 | recordingStart: new Date().getTime(), 112 | recordingTime: 0 113 | }, () => { 114 | this.startMicrophone(); 115 | }); 116 | } 117 | }; 118 | 119 | startMicrophone() { 120 | this.audioContext = new AudioContext(); 121 | 122 | const success = (stream) => { 123 | console.log('started recording'); 124 | this.mediaStream = stream; 125 | this.mediaStreamSource = this.audioContext.createMediaStreamSource(stream); 126 | this.processor = this.createAudioProcessor(this.audioContext, this.mediaStreamSource); 127 | this.mediaStreamSource.connect(this.processor); 128 | }; 129 | 130 | const fail = (e) => { 131 | console.error('recording failure', e); 132 | }; 133 | 134 | if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) { 135 | navigator.mediaDevices.getUserMedia({ 136 | video: false, 137 | audio: true 138 | }) 139 | .then(success) 140 | .catch(fail); 141 | } 142 | else { 143 | navigator.getUserMedia({ 144 | video: false, 145 | audio: true 146 | }, success, fail); 147 | } 148 | } 149 | 150 | stopRecording = e => { 151 | if (this.state.recording) { 152 | if (this.socket.connected) { 153 | this.socket.emit('stream-reset'); 154 | } 155 | clearInterval(this.recordingInterval); 156 | this.setState({ 157 | recording: false 158 | }, () => { 159 | this.stopMicrophone(); 160 | }); 161 | } 162 | }; 163 | 164 | stopMicrophone() { 165 | if (this.mediaStream) { 166 | this.mediaStream.getTracks()[0].stop(); 167 | } 168 | if (this.mediaStreamSource) { 169 | this.mediaStreamSource.disconnect(); 170 | } 171 | if (this.processor) { 172 | this.processor.shutdown(); 173 | } 174 | if (this.audioContext) { 175 | this.audioContext.close(); 176 | } 177 | } 178 | } 179 | 180 | export default App; 181 | -------------------------------------------------------------------------------- /web_microphone_websocket/src/App.test.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import { render } from '@testing-library/react'; 3 | import App from './App'; 4 | 5 | test('Renders the start recording button', () => { 6 | const { getByText } = render(); 7 | const startButton = getByText(/Start Recording/); 8 | expect(startButton).toBeInTheDocument(); 9 | }); 10 | 11 | test('Renders the stop recording button', () => { 12 | const { getByText } = render(); 13 | const stopButton = getByText(/Stop Recording/); 14 | 
expect(stopButton).toBeInTheDocument(); 15 | }); 16 | -------------------------------------------------------------------------------- /web_microphone_websocket/src/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 4 | 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', 5 | sans-serif; 6 | -webkit-font-smoothing: antialiased; 7 | -moz-osx-font-smoothing: grayscale; 8 | } 9 | 10 | code { 11 | font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New', 12 | monospace; 13 | } 14 | -------------------------------------------------------------------------------- /web_microphone_websocket/src/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import './index.css'; 4 | import App from './App'; 5 | 6 | ReactDOM.render(, document.getElementById('root')); -------------------------------------------------------------------------------- /web_microphone_websocket/src/setupTests.js: -------------------------------------------------------------------------------- 1 | // jest-dom adds custom jest matchers for asserting on DOM nodes. 2 | // allows you to do things like: 3 | // expect(element).toHaveTextContent(/react/i) 4 | // learn more: https://github.com/testing-library/jest-dom 5 | import '@testing-library/jest-dom/extend-expect'; 6 | -------------------------------------------------------------------------------- /web_microphone_websocket/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -xe 4 | 5 | THIS=$(dirname "$0") 6 | 7 | pushd ${THIS} 8 | source ../tests.sh 9 | 10 | npm install $(get_npm_package_url) 11 | npm install 12 | 13 | ln -s $HOME/DeepSpeech/models deepspeech-0.9.3-models 14 | 15 | yarn run test:client 16 | yarn run test:server 17 | 18 | popd 19 | -------------------------------------------------------------------------------- /web_microphone_websocket/test/config.js: -------------------------------------------------------------------------------- 1 | console.log('starting test server'); 2 | before(function(done) { 3 | app = require('../server'); 4 | app.on('listening', function() { 5 | console.log('listening'); 6 | done(); 7 | }); 8 | this.timeout(5000); 9 | }); 10 | -------------------------------------------------------------------------------- /web_microphone_websocket/test/server.test.js: -------------------------------------------------------------------------------- 1 | const chai = require('chai'); 2 | const chaiHttp = require('chai-http'); 3 | const should = require('should-http'); 4 | chai.use(chaiHttp); 5 | const expect = chai.expect; 6 | const fs = require('fs'); 7 | const io = require('socket.io-client'); 8 | 9 | const url = 'http://localhost:4000'; 10 | 11 | let audioFile1 = process.env.HOME + '/DeepSpeech/audio/2830-3980-0043.wav'; 12 | let audioFile2 = process.env.HOME + '/DeepSpeech/audio/8455-210777-0068.wav'; 13 | let audioFile3 = process.env.HOME + '/DeepSpeech/audio/4507-16021-0012.wav'; 14 | 15 | let socket; 16 | 17 | before(function(done) { 18 | console.log('before'); 19 | socket = io.connect(url, {}); 20 | done(); 21 | }); 22 | 23 | describe('GET /', function() { 24 | it('should return web-microphone-websocket', function(done) { 25 | chai.request(url) 26 | .get('/') 27 | .end(function(err, res){ 28 | 
res.should.have.status(200); 29 | expect(res.text).to.be.equal('web-microphone-websocket'); 30 | done(); 31 | }); 32 | }); 33 | }); 34 | 35 | describe('Websocket Audio', function() { 36 | 37 | it('audioFile1: experience proof this', function(done) { 38 | socket.once('recognize', (results) => { 39 | expect(results.text).to.be.equal('experience proof this'); 40 | done(); 41 | }); 42 | 43 | fs.createReadStream(audioFile1, {highWaterMark: 4096}) 44 | .on('data', function (chunk) { 45 | socket.emit('microphone-data', chunk); 46 | }) 47 | .on('end', function () { 48 | socket.emit('microphone-end'); 49 | }); 50 | }); 51 | 52 | it('audioFile2: your power is sufficient i said', function(done) { 53 | socket.once('recognize', (results) => { 54 | expect(results.text).to.be.equal('your power is sufficient i said'); 55 | done(); 56 | }); 57 | 58 | fs.createReadStream(audioFile2, {highWaterMark: 4096}) 59 | .on('data', function (chunk) { 60 | socket.emit('microphone-data', chunk); 61 | }) 62 | .on('end', function () { 63 | socket.emit('microphone-end'); 64 | }); 65 | }); 66 | 67 | it('audioFile3: why should one halt on the way', function(done) { 68 | socket.once('recognize', (results) => { 69 | expect(results.text).to.be.equal('why should one halt on the way'); 70 | done(); 71 | }); 72 | 73 | fs.createReadStream(audioFile3, {highWaterMark: 4096}) 74 | .on('data', function (chunk) { 75 | socket.emit('microphone-data', chunk); 76 | }) 77 | .on('end', function () { 78 | socket.emit('microphone-end'); 79 | }); 80 | 81 | }); 82 | }); 83 | --------------------------------------------------------------------------------