├── README.rst ├── android_mic_streaming ├── .gitignore ├── README.md ├── app │ ├── .gitignore │ ├── build.gradle │ ├── proguard-rules.pro │ └── src │ │ └── main │ │ ├── AndroidManifest.xml │ │ ├── java │ │ └── org │ │ │ └── deepspeechdemo │ │ │ └── MainActivity.kt │ │ └── res │ │ ├── drawable-v24 │ │ └── ic_launcher_foreground.xml │ │ ├── drawable │ │ └── ic_launcher_background.xml │ │ ├── layout │ │ └── activity_main.xml │ │ ├── mipmap-anydpi-v26 │ │ ├── ic_launcher.xml │ │ └── ic_launcher_round.xml │ │ ├── mipmap-hdpi │ │ ├── ic_launcher.png │ │ └── ic_launcher_round.png │ │ ├── mipmap-mdpi │ │ ├── ic_launcher.png │ │ └── ic_launcher_round.png │ │ ├── mipmap-xhdpi │ │ ├── ic_launcher.png │ │ └── ic_launcher_round.png │ │ ├── mipmap-xxhdpi │ │ ├── ic_launcher.png │ │ └── ic_launcher_round.png │ │ ├── mipmap-xxxhdpi │ │ ├── ic_launcher.png │ │ └── ic_launcher_round.png │ │ └── values │ │ ├── colors.xml │ │ ├── strings.xml │ │ └── styles.xml ├── build.gradle ├── gradle.properties ├── gradle │ └── wrapper │ │ ├── gradle-wrapper.jar │ │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat └── settings.gradle ├── autosub ├── LICENSE ├── README.md ├── autosub │ ├── __init__.py │ ├── audioProcessing.py │ ├── featureExtraction.py │ ├── main.py │ ├── segmentAudio.py │ ├── trainAudio.py │ └── writeToFile.py ├── requirements.txt └── setup.py ├── batch_processing ├── Readme.md ├── driver.py ├── requirements.txt ├── setup.ps1 ├── test.ps1 └── test_tf.py ├── electron ├── .gitignore ├── Readme.md ├── package-lock.json ├── package.json ├── public │ ├── create-window.js │ ├── download.js │ ├── electron.js │ ├── index.html │ ├── manifest.json │ ├── preload.js │ └── recognize-wav.js ├── src │ ├── App.js │ └── index.js └── test.sh ├── ffmpeg_vad_streaming ├── README.MD ├── index.js ├── package.json └── test.sh ├── hotword_adjusting ├── README.md └── hotword_adjusting.py ├── mic_vad_streaming ├── README.rst ├── mic_vad_streaming.py ├── requirements.txt └── test.sh ├── net_framework ├── .gitignore └── DeepSpeechWPF │ ├── App.config │ ├── App.xaml │ ├── App.xaml.cs │ ├── DeepSpeech.WPF.csproj │ ├── DeepSpeech.WPF.sln │ ├── MainWindow.xaml │ ├── MainWindow.xaml.cs │ ├── Properties │ ├── AssemblyInfo.cs │ ├── Resources.Designer.cs │ ├── Resources.resx │ ├── Settings.Designer.cs │ └── Settings.settings │ ├── ViewModels │ ├── BindableBase.cs │ └── MainWindowViewModel.cs │ └── packages.config ├── nim_mic_vad_streaming ├── README.md ├── linux_nim_vad_streaming │ ├── README.md │ └── vad_stream.nim └── win_nim_vad_streaming │ ├── README.md │ └── vad_stream.nim ├── nodejs_mic_vad_streaming ├── .gitignore ├── Readme.md ├── package.json ├── start.js └── test.sh ├── nodejs_wav ├── Readme.md ├── index.js ├── package.json └── test.sh ├── tests.sh ├── uwp ├── .gitignore ├── DeepSpeechUWP.sln └── DeepSpeechUWP │ ├── .gitignore │ ├── App.xaml │ ├── App.xaml.cs │ ├── Assets │ ├── LockScreenLogo.scale-200.png │ ├── SplashScreen.scale-200.png │ ├── Square150x150Logo.scale-200.png │ ├── Square44x44Logo.scale-200.png │ ├── Square44x44Logo.targetsize-24_altform-unplated.png │ ├── StoreLogo.png │ └── Wide310x150Logo.scale-200.png │ ├── DeepSpeechUWP.csproj │ ├── MainPage.xaml │ ├── MainPage.xaml.cs │ ├── Package.appxmanifest │ ├── Properties │ ├── AssemblyInfo.cs │ └── Default.rd.xml │ └── models │ └── .gitkeep ├── vad_transcriber ├── README.md ├── audioTranscript_cmd.py ├── audioTranscript_gui.py ├── requirements.txt ├── test.sh ├── wavSplit.py └── wavTranscriber.py └── web_microphone_websocket ├── .gitignore ├── Readme.md ├── 
package.json ├── public ├── downsampling_worker.js ├── favicon.ico ├── index.html ├── logo192.png ├── logo512.png ├── manifest.json └── robots.txt ├── server.js ├── src ├── App.js ├── App.test.js ├── index.css ├── index.js └── setupTests.js ├── test.sh └── test ├── config.js └── server.test.js /README.rst: -------------------------------------------------------------------------------- 1 | DeepSpeech 0.9.x Examples 2 | ========================== 3 | 4 | These are various examples on how to use or integrate DeepSpeech using our packages. 5 | 6 | It is a good way to just try out DeepSpeech before learning how it works in detail, as well as a source of inspiration for ways you can integrate it into your application or solve common tasks like voice activity detection (VAD) or microphone streaming. 7 | 8 | Contributions are welcome! 9 | 10 | **Note:** These examples target DeepSpeech **0.9.x** only. If you're using a different release, you need to go to the corresponding branch for the release: 11 | 12 | * `v0.9.x `_ 13 | * `v0.8.x `_ 14 | * `v0.7.x `_ 15 | * `v0.6.x `_ 16 | * `master branch `_ 17 | 18 | **List of examples** 19 | 20 | Python: 21 | ------- 22 | 23 | * `Microphone VAD streaming `_ 24 | * `VAD transcriber `_ 25 | * `AutoSub `_ 26 | 27 | JavaScript: 28 | ----------- 29 | 30 | * `FFMPEG VAD streaming `_ 31 | * `Node.JS microphone VAD streaming `_ 32 | * `Node.JS wav `_ 33 | * `Web Microphone Websocket streaming `_ 34 | * `Electron wav transcriber `_ 35 | 36 | Windows/C#: 37 | ----------- 38 | 39 | * `.NET framework `_ 40 | * `Universal Windows Platform (UWP) `_. 41 | 42 | Java/Android: 43 | ------------- 44 | 45 | * `mozilla/androidspeech library `_ 46 | 47 | Nim: 48 | ---- 49 | 50 | * `nim_mic_vad_streaming `_. 51 | -------------------------------------------------------------------------------- /android_mic_streaming/.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | .gradle 3 | /.idea 4 | /local.properties 5 | .DS_Store 6 | /build 7 | /captures 8 | .externalNativeBuild 9 | .cxx 10 | -------------------------------------------------------------------------------- /android_mic_streaming/README.md: -------------------------------------------------------------------------------- 1 | # Android Microphone Streaming 2 | 3 | Android demo application that streams audio from the microphone to deepspeech and transcribes it. 4 | 5 | ## Prerequisites 6 | 7 | #### Download model 8 | 9 | Download the pre-trained English model and extract it: 10 | ``` 11 | curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.tflite 12 | curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer 13 | ``` 14 | 15 | Move the model files `deepspeech-0.9.3-models.pbmm`, `deepspeech-0.9.3-models.scorer`, to the demo application's data directory on your android device. 16 | Mind that the data directory will only be present after installing and launching the app once. 17 | 18 | ``` 19 | adb push deepspeech-0.9.3-models.tflite deepspeech-0.9.3-models.scorer /storage/emulated/0/Android/data/org.deepspeechdemo/files/ 20 | ``` 21 | 22 | You can also copy the files from your file browser to the device. 23 | 24 | #### Android device with USB Debugging 25 | 26 | Connect an android device and make sure to enable USB-Debugging in the developer settings of the device. 
If haven't already, you can activate your developer settings by following [this guide from android](https://developer.android.com/studio/debug/dev-options#enable). 27 | 28 | ## Installation 29 | 30 | To install the example app on your connected android device you can either use the command line or Android Studio. 31 | 32 | ### Command Line 33 | 34 | ``` 35 | cd android_mic_streaming 36 | ./gradlew installDebug 37 | ``` 38 | 39 | ### Android Studio 40 | 41 | Open the `android_mic_streaming` directory in Android Studio. 42 | Run the app and your connected android device. 43 | 44 | ## Usage 45 | 46 | Start recording by pressing the button and the app will transcribe the spoken text. 47 | 48 | ## Fine-tuning the Recognition 49 | 50 | Based on your use case or the language you are using you might change the values of `BEAM_WIDTH`, `LM_ALPHA` and `LM_BETA` to improve the speech recogintion. 51 | 52 | You can also alter the `NUM_BUFFER_ELEMENTS` to change the size of the audio data buffer that is fed into the model. -------------------------------------------------------------------------------- /android_mic_streaming/app/.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | -------------------------------------------------------------------------------- /android_mic_streaming/app/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'com.android.application' 2 | 3 | apply plugin: 'kotlin-android' 4 | 5 | apply plugin: 'kotlin-android-extensions' 6 | 7 | android { 8 | compileSdkVersion 29 9 | buildToolsVersion "29.0.2" 10 | defaultConfig { 11 | applicationId "org.deepspeechdemo" 12 | minSdkVersion 22 13 | targetSdkVersion 29 14 | versionCode 1 15 | versionName "1.0" 16 | testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner" 17 | } 18 | buildTypes { 19 | release { 20 | minifyEnabled false 21 | proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro' 22 | } 23 | } 24 | // Specify tflite file should not be compressed for the app apk 25 | aaptOptions { 26 | noCompress "tflite" 27 | } 28 | } 29 | 30 | dependencies { 31 | implementation fileTree(dir: 'libs', include: ['*.jar']) 32 | implementation"org.jetbrains.kotlin:kotlin-stdlib-jdk7:$kotlin_version" 33 | implementation 'androidx.appcompat:appcompat:1.0.2' 34 | implementation 'androidx.core:core-ktx:1.0.2' 35 | implementation 'androidx.constraintlayout:constraintlayout:1.1.3' 36 | 37 | implementation 'org.mozilla.deepspeech:libdeepspeech:0.9.3' 38 | 39 | testImplementation 'junit:junit:4.12' 40 | androidTestImplementation 'androidx.test.ext:junit:1.1.0' 41 | androidTestImplementation 'androidx.test.espresso:espresso-core:3.1.1' 42 | } 43 | 44 | -------------------------------------------------------------------------------- /android_mic_streaming/app/proguard-rules.pro: -------------------------------------------------------------------------------- 1 | # Add project specific ProGuard rules here. 2 | # You can control the set of applied configuration files using the 3 | # proguardFiles setting in build.gradle. 
4 | # 5 | # For more details, see 6 | # http://developer.android.com/guide/developing/tools/proguard.html 7 | 8 | # If your project uses WebView with JS, uncomment the following 9 | # and specify the fully qualified class name to the JavaScript interface 10 | # class: 11 | #-keepclassmembers class fqcn.of.javascript.interface.for.webview { 12 | # public *; 13 | #} 14 | 15 | # Uncomment this to preserve the line number information for 16 | # debugging stack traces. 17 | #-keepattributes SourceFile,LineNumberTable 18 | 19 | # If you keep the line number information, uncomment this to 20 | # hide the original source file name. 21 | #-renamesourcefileattribute SourceFile 22 | -------------------------------------------------------------------------------- /android_mic_streaming/app/src/main/AndroidManifest.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /android_mic_streaming/app/src/main/java/org/deepspeechdemo/MainActivity.kt: -------------------------------------------------------------------------------- 1 | package org.deepspeechdemo 2 | 3 | import android.Manifest 4 | import android.content.pm.PackageManager 5 | import android.media.AudioFormat 6 | import android.media.AudioRecord 7 | import android.media.MediaRecorder 8 | import android.os.Build 9 | import android.os.Bundle 10 | import android.view.View 11 | import androidx.appcompat.app.AppCompatActivity 12 | import androidx.core.app.ActivityCompat 13 | import kotlinx.android.synthetic.main.activity_main.* 14 | import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel 15 | import java.io.File 16 | import java.util.concurrent.atomic.AtomicBoolean 17 | 18 | class MainActivity : AppCompatActivity() { 19 | private var model: DeepSpeechModel? = null 20 | 21 | private var transcriptionThread: Thread? = null 22 | private var isRecording: AtomicBoolean = AtomicBoolean(false) 23 | 24 | private val TFLITE_MODEL_FILENAME = "deepspeech-0.9.3-models.tflite" 25 | private val SCORER_FILENAME = "deepspeech-0.9.3-models.scorer" 26 | 27 | private fun checkAudioPermission() { 28 | // Permission is automatically granted on SDK < 23 upon installation. 29 | if (Build.VERSION.SDK_INT >= 23) { 30 | val permission = Manifest.permission.RECORD_AUDIO 31 | 32 | if (checkSelfPermission(permission) != PackageManager.PERMISSION_GRANTED) { 33 | ActivityCompat.requestPermissions(this, arrayOf(permission), 3) 34 | } 35 | } 36 | } 37 | 38 | private fun transcribe() { 39 | // We read from the recorder in chunks of 2048 shorts. With a model that expects its input 40 | // at 16000Hz, this corresponds to 2048/16000 = 0.128s or 128ms. 
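// Each pass of the loop below blocks until one full buffer has been read, feeds it to
// the model and refreshes the intermediate transcription on screen, so a larger buffer
// means fewer (but longer) updates, while a smaller one updates more often at the cost
// of extra intermediateDecode() calls.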
41 | val audioBufferSize = 2048 42 | val audioData = ShortArray(audioBufferSize) 43 | 44 | runOnUiThread { btnStartInference.text = "Stop Recording" } 45 | 46 | model?.let { model -> 47 | val streamContext = model.createStream() 48 | 49 | val recorder = AudioRecord( 50 | MediaRecorder.AudioSource.VOICE_RECOGNITION, 51 | model.sampleRate(), 52 | AudioFormat.CHANNEL_IN_MONO, 53 | AudioFormat.ENCODING_PCM_16BIT, 54 | audioBufferSize 55 | ) 56 | recorder.startRecording() 57 | 58 | while (isRecording.get()) { 59 | recorder.read(audioData, 0, audioBufferSize) 60 | model.feedAudioContent(streamContext, audioData, audioData.size) 61 | val decoded = model.intermediateDecode(streamContext) 62 | runOnUiThread { transcription.text = decoded } 63 | } 64 | 65 | val decoded = model.finishStream(streamContext) 66 | 67 | runOnUiThread { 68 | btnStartInference.text = "Start Recording" 69 | transcription.text = decoded 70 | } 71 | 72 | recorder.stop() 73 | recorder.release() 74 | } 75 | } 76 | 77 | private fun createModel(): Boolean { 78 | val modelsPath = getExternalFilesDir(null).toString() 79 | val tfliteModelPath = "$modelsPath/$TFLITE_MODEL_FILENAME" 80 | val scorerPath = "$modelsPath/$SCORER_FILENAME" 81 | 82 | for (path in listOf(tfliteModelPath, scorerPath)) { 83 | if (!File(path).exists()) { 84 | status.append("Model creation failed: $path does not exist.\n") 85 | return false 86 | } 87 | } 88 | 89 | model = DeepSpeechModel(tfliteModelPath) 90 | model?.enableExternalScorer(scorerPath) 91 | 92 | return true 93 | } 94 | 95 | private fun startListening() { 96 | if (isRecording.compareAndSet(false, true)) { 97 | transcriptionThread = Thread(Runnable { transcribe() }, "Transcription Thread") 98 | transcriptionThread?.start() 99 | } 100 | } 101 | 102 | override fun onCreate(savedInstanceState: Bundle?) { 103 | super.onCreate(savedInstanceState) 104 | setContentView(R.layout.activity_main) 105 | checkAudioPermission() 106 | 107 | // Create application data directory on the device 108 | val modelsPath = getExternalFilesDir(null).toString() 109 | 110 | status.text = "Ready. Copy model files to \"$modelsPath\" if running for the first time.\n" 111 | } 112 | 113 | private fun stopListening() { 114 | isRecording.set(false) 115 | } 116 | 117 | fun onRecordClick(v: View?) 
{ 118 | if (model == null) { 119 | if (!createModel()) { 120 | return 121 | } 122 | status.append("Created model.\n") 123 | } 124 | 125 | if (isRecording.get()) { 126 | stopListening() 127 | } else { 128 | startListening() 129 | } 130 | } 131 | 132 | override fun onDestroy() { 133 | super.onDestroy() 134 | if (model != null) { 135 | model?.freeModel() 136 | } 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /android_mic_streaming/app/src/main/res/drawable-v24/ic_launcher_foreground.xml: -------------------------------------------------------------------------------- 1 | 7 | 12 | 13 | 19 | 22 | 25 | 26 | 27 | 28 | 34 | 35 | -------------------------------------------------------------------------------- /android_mic_streaming/app/src/main/res/drawable/ic_launcher_background.xml: -------------------------------------------------------------------------------- 1 | 2 | 7 | 10 | 15 | 20 | 25 | 30 | 35 | 40 | 45 | 50 | 55 | 60 | 65 | 70 | 75 | 80 | 85 | 90 | 95 | 100 | 105 | 110 | 115 | 120 | 125 | 130 | 135 | 140 | 145 | 150 | 155 | 160 | 165 | 170 | 171 | -------------------------------------------------------------------------------- /android_mic_streaming/app/src/main/res/layout/activity_main.xml: -------------------------------------------------------------------------------- 1 | 2 | 8 | 9 | 14 | 15 | 23 | 24 | 33 | 34 | 37 | 38 | 46 | 47 | 56 | 57 | 61 | 62 | 29 | No file selected... 30 | 31 | 32 | 33 | 34 | Select an audio input: 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | Results 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /uwp/DeepSpeechUWP/Package.appxmanifest: -------------------------------------------------------------------------------- 1 |  2 | 3 | 8 | 9 | 13 | 14 | 15 | 16 | 17 | DeepSpeechUWP 18 | erikz 19 | Assets\StoreLogo.png 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 34 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /uwp/DeepSpeechUWP/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 
8 | [assembly: AssemblyTitle("DeepSpeechUWP")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("")] 12 | [assembly: AssemblyProduct("DeepSpeechUWP")] 13 | [assembly: AssemblyCopyright("Copyright © 2020")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Version information for an assembly consists of the following four values: 18 | // 19 | // Major Version 20 | // Minor Version 21 | // Build Number 22 | // Revision 23 | // 24 | // You can specify all the values or you can default the Build and Revision Numbers 25 | // by using the '*' as shown below: 26 | // [assembly: AssemblyVersion("1.0.*")] 27 | [assembly: AssemblyVersion("1.0.0.0")] 28 | [assembly: AssemblyFileVersion("1.0.0.0")] 29 | [assembly: ComVisible(false)] -------------------------------------------------------------------------------- /uwp/DeepSpeechUWP/Properties/Default.rd.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | 19 | 20 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /uwp/DeepSpeechUWP/models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/DeepSpeech-examples/0bfefeb8e4769e3c895b9bc6c5a34cfbdfcbd645/uwp/DeepSpeechUWP/models/.gitkeep -------------------------------------------------------------------------------- /vad_transcriber/README.md: -------------------------------------------------------------------------------- 1 | ## Transcribing longer audio clips 2 | 3 | The Command and GUI tools perform transcription on long wav files. 4 | They take in a wav file of any duration, use the WebRTC Voice Activity Detector (VAD) 5 | to split it into smaller chunks and finally save a consolidated transcript. 6 | 7 | ### 0. Prerequisites 8 | #### 0.1 Install requiered packages 9 | Install the package which contains rec on the machine: 10 | 11 | Fedora: 12 | 13 | ``` sudo dnf install sox ``` 14 | 15 | Tested on: 29 16 | 17 | Ubuntu/Debian 18 | 19 | ``` sudo apt install sox ``` 20 | 21 | A list of distributions where the package is available can be found at: https://pkgs.org/download/sox 22 | 23 | #### 0.1 Download Deepspeech 24 | Either clone from git via git clone, or Download a version from the release page 25 | 26 | For the next steps we assume you have extracted the files to `~/Deepspeech` 27 | 28 | 29 | #### 0.2 Setup your environment 30 | 31 | Ubuntu/Debian: 32 | 33 | ``` 34 | ~/Deepspeech$ sudo apt install virtualenv 35 | ~/Deepspeech$ cd examples/vad_transcriber 36 | ~/Deepspeech/examples/vad_transcriber$ virtualenv -p python3 venv 37 | ~/Deepspeech/examples/vad_transcriber$ source venv/bin/activate 38 | (venv) ~/Deepspeech/examples/vad_transcriber$ pip3 install -r requirements.txt 39 | ``` 40 | 41 | Fedora 42 | 43 | ``` 44 | ~/Deepspeech$ sudo dnf install python-virtualenv 45 | ~/Deepspeech$ cd examples/vad_transcriber 46 | ~/Deepspeech/examples/vad_transcriber$ virtualenv -p python3 venv 47 | ~/Deepspeech/examples/vad_transcriber$ source venv/bin/activate 48 | (venv) ~/Deepspeech/examples/vad_transcriber$ pip3 install -r requirements.txt 49 | ``` 50 | 51 | Tested on: 29 52 | 53 | ### 1. Command line tool 54 | 55 | The command line tool processes a wav file of any duration and returns a trancript 56 | which will the saved in the same directory as the input audio file. 
57 | 58 | The command line tool gives you control over the aggressiveness of the VAD. 59 | Set the aggressiveness mode, to an integer between 0 and 3. 60 | 0 being the least aggressive about filtering out non-speech, 3 is the most aggressive. 61 | 62 | ``` 63 | (venv) ~/Deepspeech/examples/vad_transcriber 64 | $ python3 audioTranscript_cmd.py --aggressive 1 --audio ./audio/guido-van-rossum.wav --model ./models/0.4.1/ 65 | 66 | 67 | Filename Duration(s) Inference Time(s) Model Load Time(s) Scorer Load Time(s) 68 | sample_rec.wav 13.710 20.797 5.593 17.742 69 | 70 | ``` 71 | 72 | **Note:** Only `wav` files with a 16kHz sample rate are supported for now, you can convert your files to the appropriate format with ffmpeg if available on your system. 73 | 74 | ffmpeg -i infile.mp3 -ar 16000 -ac 1 outfile.wav 75 | 76 | ### 2. Minimalistic GUI 77 | 78 | The GUI tool does the same job as the CLI tool. The VAD is fixed at an aggressiveness of 1. 79 | The output is displayed in the transcription window and saved into the directory as the input 80 | audio file as well. 81 | 82 | ``` 83 | (venv) ~/Deepspeech/examples/vad_transcriber 84 | $ python3 audioTranscript_gui.py 85 | 86 | ``` 87 | 88 | ![Deepspeech Transcriber](../../doc/audioTranscript.png) 89 | 90 | 91 | #### 2.1. Sporadic failures in pyqt 92 | Some systems have encountered **_Cannot mix incompatible Qt library with this with this library_** issue. 93 | In such a scenario, the GUI tool will not work. The following steps is known to have solved the issue in most cases 94 | ``` 95 | (venv) ~/Deepspeech/examples/vad_transcriber$ pip3 uninstall pyqt5 96 | (venv) ~/Deepspeech/examples/vad_transcriber$ sudo apt install python3-pyqt5 canberra-gtk-module 97 | (venv) ~/Deepspeech/examples/vad_transcriber$ export PYTHONPATH=/usr/lib/python3/dist-packages/ 98 | (venv) ~/Deepspeech/examples/vad_transcriber$ python3 audioTranscript_gui.py 99 | 100 | ``` 101 | #### 2.2 Useful Tips 102 | ##### The GUI programm immediately crashes when you press start recording 103 | This happens when you don't load the models via the "Browse Models" button, before pressing the "Start recording" button. 104 | 105 | ##### What does error XYZ mean? 106 | You can find a list of error codes and what they mean at https://mozilla-voice-stt.readthedocs.io/en/latest/Error-Codes.html 107 | 108 | -------------------------------------------------------------------------------- /vad_transcriber/audioTranscript_cmd.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import logging 4 | import argparse 5 | import subprocess 6 | import shlex 7 | import numpy as np 8 | import wavTranscriber 9 | 10 | # Debug helpers 11 | logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) 12 | 13 | 14 | def main(args): 15 | parser = argparse.ArgumentParser(description='Transcribe long audio files using webRTC VAD or use the streaming interface') 16 | parser.add_argument('--aggressive', type=int, choices=range(4), required=False, 17 | help='Determines how aggressive filtering out non-speech is. 
(Interger between 0-3)') 18 | parser.add_argument('--audio', required=False, 19 | help='Path to the audio file to run (WAV format)') 20 | parser.add_argument('--model', required=True, 21 | help='Path to directory that contains all model files (output_graph and scorer)') 22 | parser.add_argument('--stream', required=False, action='store_true', 23 | help='To use deepspeech streaming interface') 24 | args = parser.parse_args() 25 | if args.stream is True: 26 | print("Opening mic for streaming") 27 | elif args.audio is not None: 28 | logging.debug("Transcribing audio file @ %s" % args.audio) 29 | else: 30 | parser.print_help() 31 | parser.exit() 32 | 33 | # Point to a path containing the pre-trained models & resolve ~ if used 34 | dirName = os.path.expanduser(args.model) 35 | 36 | # Resolve all the paths of model files 37 | output_graph, scorer = wavTranscriber.resolve_models(dirName) 38 | 39 | # Load output_graph, alpahbet and scorer 40 | model_retval = wavTranscriber.load_model(output_graph, scorer) 41 | 42 | if args.audio is not None: 43 | title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'Scorer Load Time(s)'] 44 | print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4])) 45 | 46 | inference_time = 0.0 47 | 48 | # Run VAD on the input file 49 | waveFile = args.audio 50 | segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(waveFile, args.aggressive) 51 | f = open(waveFile.rstrip(".wav") + ".txt", 'w') 52 | logging.debug("Saving Transcript @: %s" % waveFile.rstrip(".wav") + ".txt") 53 | 54 | for i, segment in enumerate(segments): 55 | # Run deepspeech on the chunk that just completed VAD 56 | logging.debug("Processing chunk %002d" % (i,)) 57 | audio = np.frombuffer(segment, dtype=np.int16) 58 | output = wavTranscriber.stt(model_retval[0], audio, sample_rate) 59 | inference_time += output[1] 60 | logging.debug("Transcript: %s" % output[0]) 61 | 62 | f.write(output[0] + " ") 63 | 64 | # Summary of the files processed 65 | f.close() 66 | 67 | # Extract filename from the full file path 68 | filename, ext = os.path.split(os.path.basename(waveFile)) 69 | logging.debug("************************************************************************************************************") 70 | logging.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4])) 71 | logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2])) 72 | logging.debug("************************************************************************************************************") 73 | print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2])) 74 | else: 75 | sctx = model_retval[0].createStream() 76 | subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'), 77 | stdout=subprocess.PIPE, 78 | bufsize=0) 79 | print('You can start speaking now. 
Press Control-C to stop recording.') 80 | 81 | try: 82 | while True: 83 | data = subproc.stdout.read(512) 84 | sctx.feedAudioContent(np.frombuffer(data, np.int16)) 85 | except KeyboardInterrupt: 86 | print('Transcription: ', sctx.finishStream()) 87 | subproc.terminate() 88 | subproc.wait() 89 | 90 | 91 | if __name__ == '__main__': 92 | main(sys.argv[1:]) 93 | -------------------------------------------------------------------------------- /vad_transcriber/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeech==0.9.3 2 | webrtcvad 3 | pyqt5 4 | -------------------------------------------------------------------------------- /vad_transcriber/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -xe 4 | 5 | THIS=$(dirname "$0") 6 | 7 | pushd ${THIS} 8 | source ../tests.sh 9 | 10 | pip install --user $(get_python_wheel_url "$1") 11 | pip install --user -r <(grep -v deepspeech requirements.txt) 12 | 13 | python audioTranscript_cmd.py \ 14 | --audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \ 15 | --aggressive 0 \ 16 | --model $HOME/DeepSpeech/models/ 17 | 18 | python audioTranscript_cmd.py \ 19 | --audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \ 20 | --aggressive 0 \ 21 | --model $HOME/DeepSpeech/models/ \ 22 | --stream 23 | popd 24 | -------------------------------------------------------------------------------- /vad_transcriber/wavSplit.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import contextlib 3 | import wave 4 | 5 | 6 | def read_wave(path): 7 | """Reads a .wav file. 8 | 9 | Takes the path, and returns (PCM audio data, sample rate). 10 | """ 11 | with contextlib.closing(wave.open(path, 'rb')) as wf: 12 | num_channels = wf.getnchannels() 13 | assert num_channels == 1 14 | sample_width = wf.getsampwidth() 15 | assert sample_width == 2 16 | sample_rate = wf.getframerate() 17 | assert sample_rate in (8000, 16000, 32000) 18 | frames = wf.getnframes() 19 | pcm_data = wf.readframes(frames) 20 | duration = frames / sample_rate 21 | return pcm_data, sample_rate, duration 22 | 23 | 24 | def write_wave(path, audio, sample_rate): 25 | """Writes a .wav file. 26 | 27 | Takes path, PCM audio data, and sample rate. 28 | """ 29 | with contextlib.closing(wave.open(path, 'wb')) as wf: 30 | wf.setnchannels(1) 31 | wf.setsampwidth(2) 32 | wf.setframerate(sample_rate) 33 | wf.writeframes(audio) 34 | 35 | 36 | class Frame(object): 37 | """Represents a "frame" of audio data.""" 38 | def __init__(self, bytes, timestamp, duration): 39 | self.bytes = bytes 40 | self.timestamp = timestamp 41 | self.duration = duration 42 | 43 | 44 | def frame_generator(frame_duration_ms, audio, sample_rate): 45 | """Generates audio frames from PCM audio data. 46 | 47 | Takes the desired frame duration in milliseconds, the PCM data, and 48 | the sample rate. 49 | 50 | Yields Frames of the requested duration. 51 | """ 52 | n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) 53 | offset = 0 54 | timestamp = 0.0 55 | duration = (float(n) / sample_rate) / 2.0 56 | while offset + n < len(audio): 57 | yield Frame(audio[offset:offset + n], timestamp, duration) 58 | timestamp += duration 59 | offset += n 60 | 61 | 62 | def vad_collector(sample_rate, frame_duration_ms, 63 | padding_duration_ms, vad, frames): 64 | """Filters out non-voiced audio frames. 65 | 66 | Given a webrtcvad.Vad and a source of audio frames, yields only 67 | the voiced audio. 
68 | 69 | Uses a padded, sliding window algorithm over the audio frames. 70 | When more than 90% of the frames in the window are voiced (as 71 | reported by the VAD), the collector triggers and begins yielding 72 | audio frames. Then the collector waits until 90% of the frames in 73 | the window are unvoiced to detrigger. 74 | 75 | The window is padded at the front and back to provide a small 76 | amount of silence or the beginnings/endings of speech around the 77 | voiced frames. 78 | 79 | Arguments: 80 | 81 | sample_rate - The audio sample rate, in Hz. 82 | frame_duration_ms - The frame duration in milliseconds. 83 | padding_duration_ms - The amount to pad the window, in milliseconds. 84 | vad - An instance of webrtcvad.Vad. 85 | frames - a source of audio frames (sequence or generator). 86 | 87 | Returns: A generator that yields PCM audio data. 88 | """ 89 | num_padding_frames = int(padding_duration_ms / frame_duration_ms) 90 | # We use a deque for our sliding window/ring buffer. 91 | ring_buffer = collections.deque(maxlen=num_padding_frames) 92 | # We have two states: TRIGGERED and NOTTRIGGERED. We start in the 93 | # NOTTRIGGERED state. 94 | triggered = False 95 | 96 | voiced_frames = [] 97 | for frame in frames: 98 | is_speech = vad.is_speech(frame.bytes, sample_rate) 99 | 100 | if not triggered: 101 | ring_buffer.append((frame, is_speech)) 102 | num_voiced = len([f for f, speech in ring_buffer if speech]) 103 | # If we're NOTTRIGGERED and more than 90% of the frames in 104 | # the ring buffer are voiced frames, then enter the 105 | # TRIGGERED state. 106 | if num_voiced > 0.9 * ring_buffer.maxlen: 107 | triggered = True 108 | # We want to yield all the audio we see from now until 109 | # we are NOTTRIGGERED, but we have to start with the 110 | # audio that's already in the ring buffer. 111 | for f, s in ring_buffer: 112 | voiced_frames.append(f) 113 | ring_buffer.clear() 114 | else: 115 | # We're in the TRIGGERED state, so collect the audio data 116 | # and add it to the ring buffer. 117 | voiced_frames.append(frame) 118 | ring_buffer.append((frame, is_speech)) 119 | num_unvoiced = len([f for f, speech in ring_buffer if not speech]) 120 | # If more than 90% of the frames in the ring buffer are 121 | # unvoiced, then enter NOTTRIGGERED and yield whatever 122 | # audio we've collected. 123 | if num_unvoiced > 0.9 * ring_buffer.maxlen: 124 | triggered = False 125 | yield b''.join([f.bytes for f in voiced_frames]) 126 | ring_buffer.clear() 127 | voiced_frames = [] 128 | if triggered: 129 | pass 130 | # If we have any leftover voiced audio when we run out of input, 131 | # yield it. 132 | if voiced_frames: 133 | yield b''.join([f.bytes for f in voiced_frames]) 134 | 135 | -------------------------------------------------------------------------------- /vad_transcriber/wavTranscriber.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import webrtcvad 3 | import logging 4 | import wavSplit 5 | from deepspeech import Model 6 | from timeit import default_timer as timer 7 | 8 | ''' 9 | Load the pre-trained model into the memory 10 | @param models: Output Grapgh Protocol Buffer file 11 | @param scorer: Scorer file 12 | 13 | @Retval 14 | Returns a list [DeepSpeech Object, Model Load Time, Scorer Load Time] 15 | ''' 16 | def load_model(models, scorer): 17 | model_load_start = timer() 18 | ds = Model(models) 19 | model_load_end = timer() - model_load_start 20 | logging.debug("Loaded model in %0.3fs." 
% (model_load_end)) 21 | 22 | scorer_load_start = timer() 23 | ds.enableExternalScorer(scorer) 24 | scorer_load_end = timer() - scorer_load_start 25 | logging.debug('Loaded external scorer in %0.3fs.' % (scorer_load_end)) 26 | 27 | return [ds, model_load_end, scorer_load_end] 28 | 29 | ''' 30 | Run Inference on input audio file 31 | @param ds: Deepspeech object 32 | @param audio: Input audio for running inference on 33 | @param fs: Sample rate of the input audio file 34 | 35 | @Retval: 36 | Returns a list [Inference, Inference Time, Audio Length] 37 | 38 | ''' 39 | def stt(ds, audio, fs): 40 | inference_time = 0.0 41 | audio_length = len(audio) * (1 / fs) 42 | 43 | # Run Deepspeech 44 | logging.debug('Running inference...') 45 | inference_start = timer() 46 | output = ds.stt(audio) 47 | inference_end = timer() - inference_start 48 | inference_time += inference_end 49 | logging.debug('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length)) 50 | 51 | return [output, inference_time] 52 | 53 | ''' 54 | Resolve directory path for the models and fetch each of them. 55 | @param dirName: Path to the directory containing pre-trained models 56 | 57 | @Retval: 58 | Retunns a tuple containing each of the model files (pb, scorer) 59 | ''' 60 | def resolve_models(dirName): 61 | pb = glob.glob(dirName + "/*.pbmm")[0] 62 | logging.debug("Found Model: %s" % pb) 63 | 64 | scorer = glob.glob(dirName + "/*.scorer")[0] 65 | logging.debug("Found scorer: %s" % scorer) 66 | 67 | return pb, scorer 68 | 69 | ''' 70 | Generate VAD segments. Filters out non-voiced audio frames. 71 | @param waveFile: Input wav file to run VAD on.0 72 | 73 | @Retval: 74 | Returns tuple of 75 | segments: a bytearray of multiple smaller audio frames 76 | (The longer audio split into mutiple smaller one's) 77 | sample_rate: Sample rate of the input audio file 78 | audio_length: Duraton of the input audio file 79 | 80 | ''' 81 | def vad_segment_generator(wavFile, aggressiveness): 82 | logging.debug("Caught the wav file @: %s" % (wavFile)) 83 | audio, sample_rate, audio_length = wavSplit.read_wave(wavFile) 84 | assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!" 85 | vad = webrtcvad.Vad(int(aggressiveness)) 86 | frames = wavSplit.frame_generator(30, audio, sample_rate) 87 | frames = list(frames) 88 | segments = wavSplit.vad_collector(sample_rate, 30, 300, vad, frames) 89 | 90 | return segments, sample_rate, audio_length 91 | -------------------------------------------------------------------------------- /web_microphone_websocket/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | /deepspeech-0.6.0-models 4 | 5 | # dependencies 6 | /node_modules 7 | /.pnp 8 | .pnp.js 9 | 10 | # testing 11 | /coverage 12 | 13 | # production 14 | /build 15 | 16 | # misc 17 | .DS_Store 18 | .env.local 19 | .env.development.local 20 | .env.test.local 21 | .env.production.local 22 | 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | -------------------------------------------------------------------------------- /web_microphone_websocket/Readme.md: -------------------------------------------------------------------------------- 1 | # Web Microphone Websocket 2 | 3 | This is an example of a ReactJS web application streaming microphone audio from the browser 4 | to a NodeJS server and transmitting the DeepSpeech results back to the browser. 
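Under the hood the browser and the Node server exchange a small set of socket.io events: the client sends 16-bit, 16 kHz PCM chunks as `stream-data` (plus `stream-end` and `stream-reset` control events) and the server answers with `recognize` results. The sketch below shows that round trip from a plain Node script; it assumes the server is already running on port 4000 and uses a placeholder `audio.raw` file containing raw 16 kHz / 16-bit mono PCM.

```javascript
// Minimal sketch of the websocket protocol used by server.js (not part of the app).
const fs = require('fs');
const io = require('socket.io-client');

const socket = io.connect('http://localhost:4000', {});

socket.on('recognize', (results) => {
    // results also carries recogTime and audioLength (see finishStream() in server.js)
    console.log('recognized:', results.text);
});

socket.on('connect', () => {
    fs.createReadStream('audio.raw', {highWaterMark: 4096})   // hypothetical input file
        .on('data', (chunk) => socket.emit('stream-data', chunk))
        .on('end', () => socket.emit('stream-end'));
});
```

The React client in `src/App.js` does the same thing with live microphone audio, after `public/downsampling_worker.js` has converted it to 16 kHz PCM.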
5 | 6 | #### Download the pre-trained model (1.8GB): 7 | 8 | ``` 9 | wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm 10 | wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer 11 | ``` 12 | 13 | #### Install: 14 | 15 | ``` 16 | yarn install 17 | ``` 18 | 19 | #### Run ReactJS Client: 20 | 21 | ``` 22 | yarn start 23 | ``` 24 | 25 | #### Run NodeJS Server (in a separate terminal window): 26 | 27 | ``` 28 | node server.js 29 | ``` -------------------------------------------------------------------------------- /web_microphone_websocket/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "web-microphone-websocket", 3 | "version": "0.1.0", 4 | "private": true, 5 | "dependencies": { 6 | "@testing-library/jest-dom": "^4.2.4", 7 | "@testing-library/react": "^9.3.2", 8 | "@testing-library/user-event": "^7.1.2", 9 | "chai": "^4.2.0", 10 | "chai-http": "^4.3.0", 11 | "deepspeech": "^0.9.3", 12 | "defaults": "^1.0.3", 13 | "mocha": "^6.1.4", 14 | "node-vad": "^1.1.4", 15 | "react": "^16.12.0", 16 | "react-dom": "^16.12.0", 17 | "react-scripts": "^3.4.0", 18 | "should": "^13.2.3", 19 | "should-http": "^0.1.1", 20 | "socket.io": "^2.3.0", 21 | "socket.io-client": "^2.3.0" 22 | }, 23 | "scripts": { 24 | "start": "react-scripts start", 25 | "build": "react-scripts build", 26 | "test:client": "react-scripts test --env=jsdom --watchAll=false --coverage", 27 | "test:server": "NODE_ENV=dev mocha --recursive ./test/config.js ./test --exit", 28 | "eject": "react-scripts eject" 29 | }, 30 | "eslintConfig": { 31 | "extends": "react-app" 32 | }, 33 | "browserslist": { 34 | "production": [ 35 | ">0.2%", 36 | "not dead", 37 | "not op_mini all" 38 | ], 39 | "development": [ 40 | "last 1 chrome version", 41 | "last 1 firefox version", 42 | "last 1 safari version" 43 | ] 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /web_microphone_websocket/public/downsampling_worker.js: -------------------------------------------------------------------------------- 1 | // source: 2 | // https://github.com/Picovoice/web-voice-processor/blob/master/src/downsampling_worker.js 3 | 4 | onmessage = function (e) { 5 | switch (e.data.command) { 6 | case "init": 7 | init(e.data.inputSampleRate); 8 | break; 9 | case "process": 10 | process(e.data.inputFrame); 11 | break; 12 | case "reset": 13 | reset(); 14 | break; 15 | } 16 | }; 17 | 18 | let inputSampleRate; 19 | let inputBuffer = []; 20 | 21 | function init(x) { 22 | inputSampleRate = x; 23 | } 24 | 25 | function process(inputFrame) { 26 | for (let i = 0; i < inputFrame.length; i++) { 27 | inputBuffer.push((inputFrame[i]) * 32767); 28 | } 29 | 30 | const PV_SAMPLE_RATE = 16000; 31 | const PV_FRAME_LENGTH = 512; 32 | 33 | while ((inputBuffer.length * PV_SAMPLE_RATE / inputSampleRate) > PV_FRAME_LENGTH) { 34 | let outputFrame = new Int16Array(PV_FRAME_LENGTH); 35 | let sum = 0; 36 | let num = 0; 37 | let outputIndex = 0; 38 | let inputIndex = 0; 39 | 40 | while (outputIndex < PV_FRAME_LENGTH) { 41 | sum = 0; 42 | num = 0; 43 | while (inputIndex < Math.min(inputBuffer.length, (outputIndex + 1) * inputSampleRate / PV_SAMPLE_RATE)) { 44 | sum += inputBuffer[inputIndex]; 45 | num++; 46 | inputIndex++; 47 | } 48 | outputFrame[outputIndex] = sum / num; 49 | outputIndex++; 50 | } 51 | 52 | postMessage(outputFrame); 53 | 54 | inputBuffer = inputBuffer.slice(inputIndex); 55 | } 56 | } 57 | 58 | 
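// process() above is a simple block-averaging resampler: incoming Float32 samples are
// scaled to the 16-bit range, averaged down from the browser's native rate (typically
// 44.1 or 48 kHz) to 16 kHz, and posted back to the main thread as 512-sample Int16
// frames, which is the format the DeepSpeech server expects. reset() below just clears
// the leftover input buffer.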
function reset() { 59 | inputBuffer = []; 60 | } -------------------------------------------------------------------------------- /web_microphone_websocket/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/DeepSpeech-examples/0bfefeb8e4769e3c895b9bc6c5a34cfbdfcbd645/web_microphone_websocket/public/favicon.ico -------------------------------------------------------------------------------- /web_microphone_websocket/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 12 | 13 | 17 | 18 | 27 | DeepSpeech - Web Microphone Websocket Example 28 | 29 | 30 | 31 |
32 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /web_microphone_websocket/public/logo192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/DeepSpeech-examples/0bfefeb8e4769e3c895b9bc6c5a34cfbdfcbd645/web_microphone_websocket/public/logo192.png -------------------------------------------------------------------------------- /web_microphone_websocket/public/logo512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/DeepSpeech-examples/0bfefeb8e4769e3c895b9bc6c5a34cfbdfcbd645/web_microphone_websocket/public/logo512.png -------------------------------------------------------------------------------- /web_microphone_websocket/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "React App", 3 | "name": "Create React App Sample", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | }, 10 | { 11 | "src": "logo192.png", 12 | "type": "image/png", 13 | "sizes": "192x192" 14 | }, 15 | { 16 | "src": "logo512.png", 17 | "type": "image/png", 18 | "sizes": "512x512" 19 | } 20 | ], 21 | "start_url": ".", 22 | "display": "standalone", 23 | "theme_color": "#000000", 24 | "background_color": "#ffffff" 25 | } 26 | -------------------------------------------------------------------------------- /web_microphone_websocket/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | -------------------------------------------------------------------------------- /web_microphone_websocket/server.js: -------------------------------------------------------------------------------- 1 | const http = require('http'); 2 | const socketIO = require('socket.io'); 3 | const DeepSpeech = require('deepspeech'); 4 | const VAD = require('node-vad'); 5 | 6 | let DEEPSPEECH_MODEL = __dirname + '/deepspeech-0.9.3-models'; // path to deepspeech english model directory 7 | 8 | let SILENCE_THRESHOLD = 200; // how many milliseconds of inactivity before processing the audio 9 | 10 | const SERVER_PORT = 4000; // websocket server port 11 | 12 | // const VAD_MODE = VAD.Mode.NORMAL; 13 | // const VAD_MODE = VAD.Mode.LOW_BITRATE; 14 | // const VAD_MODE = VAD.Mode.AGGRESSIVE; 15 | const VAD_MODE = VAD.Mode.VERY_AGGRESSIVE; 16 | const vad = new VAD(VAD_MODE); 17 | 18 | function createModel(modelDir) { 19 | let modelPath = modelDir + '.pbmm'; 20 | let scorerPath = modelDir + '.scorer'; 21 | let model = new DeepSpeech.Model(modelPath); 22 | model.enableExternalScorer(scorerPath); 23 | return model; 24 | } 25 | 26 | let englishModel = createModel(DEEPSPEECH_MODEL); 27 | 28 | let modelStream; 29 | let recordedChunks = 0; 30 | let silenceStart = null; 31 | let recordedAudioLength = 0; 32 | let endTimeout = null; 33 | let silenceBuffers = []; 34 | 35 | function processAudioStream(data, callback) { 36 | vad.processAudio(data, 16000).then((res) => { 37 | switch (res) { 38 | case VAD.Event.ERROR: 39 | console.log("VAD ERROR"); 40 | break; 41 | case VAD.Event.NOISE: 42 | console.log("VAD NOISE"); 43 | break; 44 | case VAD.Event.SILENCE: 45 | processSilence(data, callback); 46 | break; 47 | case VAD.Event.VOICE: 48 | processVoice(data); 49 | break; 50 | default: 51 | console.log('default', res); 52 | 53 | } 
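// While a recording is in progress, SILENCE frames are still fed to the stream and,
// after SILENCE_THRESHOLD ms of quiet, trigger an intermediate decode via the callback;
// VOICE frames start or extend the recording (see processSilence/processVoice below).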
54 | }); 55 | 56 | // timeout after 1s of inactivity 57 | clearTimeout(endTimeout); 58 | endTimeout = setTimeout(function() { 59 | console.log('timeout'); 60 | resetAudioStream(); 61 | },1000); 62 | } 63 | 64 | function endAudioStream(callback) { 65 | console.log('[end]'); 66 | let results = intermediateDecode(); 67 | if (results) { 68 | if (callback) { 69 | callback(results); 70 | } 71 | } 72 | } 73 | 74 | function resetAudioStream() { 75 | clearTimeout(endTimeout); 76 | console.log('[reset]'); 77 | intermediateDecode(); // ignore results 78 | recordedChunks = 0; 79 | silenceStart = null; 80 | } 81 | 82 | function processSilence(data, callback) { 83 | if (recordedChunks > 0) { // recording is on 84 | process.stdout.write('-'); // silence detected while recording 85 | 86 | feedAudioContent(data); 87 | 88 | if (silenceStart === null) { 89 | silenceStart = new Date().getTime(); 90 | } 91 | else { 92 | let now = new Date().getTime(); 93 | if (now - silenceStart > SILENCE_THRESHOLD) { 94 | silenceStart = null; 95 | console.log('[end]'); 96 | let results = intermediateDecode(); 97 | if (results) { 98 | if (callback) { 99 | callback(results); 100 | } 101 | } 102 | } 103 | } 104 | } 105 | else { 106 | process.stdout.write('.'); // silence detected while not recording 107 | bufferSilence(data); 108 | } 109 | } 110 | 111 | function bufferSilence(data) { 112 | // VAD has a tendency to cut the first bit of audio data from the start of a recording 113 | // so keep a buffer of that first bit of audio and in addBufferedSilence() reattach it to the beginning of the recording 114 | silenceBuffers.push(data); 115 | if (silenceBuffers.length >= 3) { 116 | silenceBuffers.shift(); 117 | } 118 | } 119 | 120 | function addBufferedSilence(data) { 121 | let audioBuffer; 122 | if (silenceBuffers.length) { 123 | silenceBuffers.push(data); 124 | let length = 0; 125 | silenceBuffers.forEach(function (buf) { 126 | length += buf.length; 127 | }); 128 | audioBuffer = Buffer.concat(silenceBuffers, length); 129 | silenceBuffers = []; 130 | } 131 | else audioBuffer = data; 132 | return audioBuffer; 133 | } 134 | 135 | function processVoice(data) { 136 | silenceStart = null; 137 | if (recordedChunks === 0) { 138 | console.log(''); 139 | process.stdout.write('[start]'); // recording started 140 | } 141 | else { 142 | process.stdout.write('='); // still recording 143 | } 144 | recordedChunks++; 145 | 146 | data = addBufferedSilence(data); 147 | feedAudioContent(data); 148 | } 149 | 150 | function createStream() { 151 | modelStream = englishModel.createStream(); 152 | recordedChunks = 0; 153 | recordedAudioLength = 0; 154 | } 155 | 156 | function finishStream() { 157 | if (modelStream) { 158 | let start = new Date(); 159 | let text = modelStream.finishStream(); 160 | if (text) { 161 | console.log(''); 162 | console.log('Recognized Text:', text); 163 | let recogTime = new Date().getTime() - start.getTime(); 164 | return { 165 | text, 166 | recogTime, 167 | audioLength: Math.round(recordedAudioLength) 168 | }; 169 | } 170 | } 171 | silenceBuffers = []; 172 | modelStream = null; 173 | } 174 | 175 | function intermediateDecode() { 176 | let results = finishStream(); 177 | createStream(); 178 | return results; 179 | } 180 | 181 | function feedAudioContent(chunk) { 182 | recordedAudioLength += (chunk.length / 2) * (1 / 16000) * 1000; 183 | modelStream.feedAudioContent(chunk); 184 | } 185 | 186 | const app = http.createServer(function (req, res) { 187 | res.writeHead(200); 188 | res.write('web-microphone-websocket'); 189 | res.end(); 
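// This plain HTTP response doubles as a health check: test/server.test.js expects
// GET / to return 200 with exactly this body.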
190 | }); 191 | 192 | const io = socketIO(app, {}); 193 | io.set('origins', '*:*'); 194 | 195 | io.on('connection', function(socket) { 196 | console.log('client connected'); 197 | 198 | socket.once('disconnect', () => { 199 | console.log('client disconnected'); 200 | }); 201 | 202 | createStream(); 203 | 204 | socket.on('stream-data', function(data) { 205 | processAudioStream(data, (results) => { 206 | socket.emit('recognize', results); 207 | }); 208 | }); 209 | 210 | socket.on('stream-end', function() { 211 | endAudioStream((results) => { 212 | socket.emit('recognize', results); 213 | }); 214 | }); 215 | 216 | socket.on('stream-reset', function() { 217 | resetAudioStream(); 218 | }); 219 | }); 220 | 221 | app.listen(SERVER_PORT, 'localhost', () => { 222 | console.log('Socket server listening on:', SERVER_PORT); 223 | }); 224 | 225 | module.exports = app; -------------------------------------------------------------------------------- /web_microphone_websocket/src/App.js: -------------------------------------------------------------------------------- 1 | import React, {Component} from 'react'; 2 | import io from 'socket.io-client'; 3 | 4 | const DOWNSAMPLING_WORKER = './downsampling_worker.js'; 5 | 6 | class App extends Component { 7 | constructor(props) { 8 | super(props); 9 | this.state = { 10 | connected: false, 11 | recording: false, 12 | recordingStart: 0, 13 | recordingTime: 0, 14 | recognitionOutput: [] 15 | }; 16 | } 17 | 18 | componentDidMount() { 19 | let recognitionCount = 0; 20 | 21 | this.socket = io.connect('http://localhost:4000', {}); 22 | 23 | this.socket.on('connect', () => { 24 | console.log('socket connected'); 25 | this.setState({connected: true}); 26 | }); 27 | 28 | this.socket.on('disconnect', () => { 29 | console.log('socket disconnected'); 30 | this.setState({connected: false}); 31 | this.stopRecording(); 32 | }); 33 | 34 | this.socket.on('recognize', (results) => { 35 | console.log('recognized:', results); 36 | const {recognitionOutput} = this.state; 37 | results.id = recognitionCount++; 38 | recognitionOutput.unshift(results); 39 | this.setState({recognitionOutput}); 40 | }); 41 | } 42 | 43 | render() { 44 | return (
<div className="App"> 45 | <div> 46 | <button disabled={!this.state.connected || this.state.recording} 47 | onClick={this.startRecording}> 48 | Start Recording</button> 49 | 
50 | <button disabled={!this.state.recording} 51 | onClick={this.stopRecording}> 52 | Stop Recording</button> 53 | 
54 | {this.renderTime()} 55 | </div> 56 | {this.renderRecognitionOutput()} 57 | </div>); 58 | } 59 | 
60 | renderTime() { 61 | return (<span> 62 | {(Math.round(this.state.recordingTime / 100) / 10).toFixed(1)}s 63 | </span>); 64 | } 65 | 
66 | renderRecognitionOutput() { 67 | return (<ul> 68 | {this.state.recognitionOutput.map((r) => { 69 | return (<li key={r.id}>{r.text}</li>); 70 | })} 71 | </ul>
) 72 | } 73 | 74 | createAudioProcessor(audioContext, audioSource) { 75 | let processor = audioContext.createScriptProcessor(4096, 1, 1); 76 | 77 | const sampleRate = audioSource.context.sampleRate; 78 | 79 | let downsampler = new Worker(DOWNSAMPLING_WORKER); 80 | downsampler.postMessage({command: "init", inputSampleRate: sampleRate}); 81 | downsampler.onmessage = (e) => { 82 | if (this.socket.connected) { 83 | this.socket.emit('stream-data', e.data.buffer); 84 | } 85 | }; 86 | 87 | processor.onaudioprocess = (event) => { 88 | var data = event.inputBuffer.getChannelData(0); 89 | downsampler.postMessage({command: "process", inputFrame: data}); 90 | }; 91 | 92 | processor.shutdown = () => { 93 | processor.disconnect(); 94 | this.onaudioprocess = null; 95 | }; 96 | 97 | processor.connect(audioContext.destination); 98 | 99 | return processor; 100 | } 101 | 102 | startRecording = e => { 103 | if (!this.state.recording) { 104 | this.recordingInterval = setInterval(() => { 105 | let recordingTime = new Date().getTime() - this.state.recordingStart; 106 | this.setState({recordingTime}); 107 | }, 100); 108 | 109 | this.setState({ 110 | recording: true, 111 | recordingStart: new Date().getTime(), 112 | recordingTime: 0 113 | }, () => { 114 | this.startMicrophone(); 115 | }); 116 | } 117 | }; 118 | 119 | startMicrophone() { 120 | this.audioContext = new AudioContext(); 121 | 122 | const success = (stream) => { 123 | console.log('started recording'); 124 | this.mediaStream = stream; 125 | this.mediaStreamSource = this.audioContext.createMediaStreamSource(stream); 126 | this.processor = this.createAudioProcessor(this.audioContext, this.mediaStreamSource); 127 | this.mediaStreamSource.connect(this.processor); 128 | }; 129 | 130 | const fail = (e) => { 131 | console.error('recording failure', e); 132 | }; 133 | 134 | if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) { 135 | navigator.mediaDevices.getUserMedia({ 136 | video: false, 137 | audio: true 138 | }) 139 | .then(success) 140 | .catch(fail); 141 | } 142 | else { 143 | navigator.getUserMedia({ 144 | video: false, 145 | audio: true 146 | }, success, fail); 147 | } 148 | } 149 | 150 | stopRecording = e => { 151 | if (this.state.recording) { 152 | if (this.socket.connected) { 153 | this.socket.emit('stream-reset'); 154 | } 155 | clearInterval(this.recordingInterval); 156 | this.setState({ 157 | recording: false 158 | }, () => { 159 | this.stopMicrophone(); 160 | }); 161 | } 162 | }; 163 | 164 | stopMicrophone() { 165 | if (this.mediaStream) { 166 | this.mediaStream.getTracks()[0].stop(); 167 | } 168 | if (this.mediaStreamSource) { 169 | this.mediaStreamSource.disconnect(); 170 | } 171 | if (this.processor) { 172 | this.processor.shutdown(); 173 | } 174 | if (this.audioContext) { 175 | this.audioContext.close(); 176 | } 177 | } 178 | } 179 | 180 | export default App; 181 | -------------------------------------------------------------------------------- /web_microphone_websocket/src/App.test.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import { render } from '@testing-library/react'; 3 | import App from './App'; 4 | 5 | test('Renders the start recording button', () => { 6 | const { getByText } = render(); 7 | const startButton = getByText(/Start Recording/); 8 | expect(startButton).toBeInTheDocument(); 9 | }); 10 | 11 | test('Renders the stop recording button', () => { 12 | const { getByText } = render(); 13 | const stopButton = getByText(/Stop Recording/); 14 | 
expect(stopButton).toBeInTheDocument(); 15 | }); 16 | -------------------------------------------------------------------------------- /web_microphone_websocket/src/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 4 | 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', 5 | sans-serif; 6 | -webkit-font-smoothing: antialiased; 7 | -moz-osx-font-smoothing: grayscale; 8 | } 9 | 10 | code { 11 | font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New', 12 | monospace; 13 | } 14 | -------------------------------------------------------------------------------- /web_microphone_websocket/src/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import './index.css'; 4 | import App from './App'; 5 | 6 | ReactDOM.render(, document.getElementById('root')); -------------------------------------------------------------------------------- /web_microphone_websocket/src/setupTests.js: -------------------------------------------------------------------------------- 1 | // jest-dom adds custom jest matchers for asserting on DOM nodes. 2 | // allows you to do things like: 3 | // expect(element).toHaveTextContent(/react/i) 4 | // learn more: https://github.com/testing-library/jest-dom 5 | import '@testing-library/jest-dom/extend-expect'; 6 | -------------------------------------------------------------------------------- /web_microphone_websocket/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -xe 4 | 5 | THIS=$(dirname "$0") 6 | 7 | pushd ${THIS} 8 | source ../tests.sh 9 | 10 | npm install $(get_npm_package_url) 11 | npm install 12 | 13 | ln -s $HOME/DeepSpeech/models deepspeech-0.9.3-models 14 | 15 | yarn run test:client 16 | yarn run test:server 17 | 18 | popd 19 | -------------------------------------------------------------------------------- /web_microphone_websocket/test/config.js: -------------------------------------------------------------------------------- 1 | console.log('starting test server'); 2 | before(function(done) { 3 | app = require('../server'); 4 | app.on('listening', function() { 5 | console.log('listening'); 6 | done(); 7 | }); 8 | this.timeout(5000); 9 | }); 10 | -------------------------------------------------------------------------------- /web_microphone_websocket/test/server.test.js: -------------------------------------------------------------------------------- 1 | const chai = require('chai'); 2 | const chaiHttp = require('chai-http'); 3 | const should = require('should-http'); 4 | chai.use(chaiHttp); 5 | const expect = chai.expect; 6 | const fs = require('fs'); 7 | const io = require('socket.io-client'); 8 | 9 | const url = 'http://localhost:4000'; 10 | 11 | let audioFile1 = process.env.HOME + '/DeepSpeech/audio/2830-3980-0043.wav'; 12 | let audioFile2 = process.env.HOME + '/DeepSpeech/audio/8455-210777-0068.wav'; 13 | let audioFile3 = process.env.HOME + '/DeepSpeech/audio/4507-16021-0012.wav'; 14 | 15 | let socket; 16 | 17 | before(function(done) { 18 | console.log('before'); 19 | socket = io.connect(url, {}); 20 | done(); 21 | }); 22 | 23 | describe('GET /', function() { 24 | it('should return web-microphone-websocket', function(done) { 25 | chai.request(url) 26 | .get('/') 27 | .end(function(err, res){ 28 | 
res.should.have.status(200); 29 | expect(res.text).to.be.equal('web-microphone-websocket'); 30 | done(); 31 | }); 32 | }); 33 | }); 34 | 35 | describe('Websocket Audio', function() { 36 | 37 | it('audioFile1: experience proof this', function(done) { 38 | socket.once('recognize', (results) => { 39 | expect(results.text).to.be.equal('experience proof this'); 40 | done(); 41 | }); 42 | 43 | fs.createReadStream(audioFile1, {highWaterMark: 4096}) 44 | .on('data', function (chunk) { 45 | socket.emit('microphone-data', chunk); 46 | }) 47 | .on('end', function () { 48 | socket.emit('microphone-end'); 49 | }); 50 | }); 51 | 52 | it('audioFile2: your power is sufficient i said', function(done) { 53 | socket.once('recognize', (results) => { 54 | expect(results.text).to.be.equal('your power is sufficient i said'); 55 | done(); 56 | }); 57 | 58 | fs.createReadStream(audioFile2, {highWaterMark: 4096}) 59 | .on('data', function (chunk) { 60 | socket.emit('microphone-data', chunk); 61 | }) 62 | .on('end', function () { 63 | socket.emit('microphone-end'); 64 | }); 65 | }); 66 | 67 | it('audioFile3: why should one halt on the way', function(done) { 68 | socket.once('recognize', (results) => { 69 | expect(results.text).to.be.equal('why should one halt on the way'); 70 | done(); 71 | }); 72 | 73 | fs.createReadStream(audioFile3, {highWaterMark: 4096}) 74 | .on('data', function (chunk) { 75 | socket.emit('microphone-data', chunk); 76 | }) 77 | .on('end', function () { 78 | socket.emit('microphone-end'); 79 | }); 80 | 81 | }); 82 | }); 83 | --------------------------------------------------------------------------------