├── LICENSE ├── cheatsheet.md ├── install.sh ├── logo.png ├── main.py ├── models └── vosk │ └── small │ ├── README │ ├── am │ └── final.mdl │ ├── conf │ ├── mfcc.conf │ └── model.conf │ ├── graph │ ├── Gr.fst │ ├── HCLr.fst │ ├── disambig_tid.int │ └── phones │ │ └── word_boundary.int │ └── ivector │ ├── final.dubm │ ├── final.ie │ ├── final.mat │ ├── global_cmvn.stats │ ├── online_cmvn.conf │ └── splice.conf ├── readme.md ├── recorder.py ├── requirements.txt └── vosper.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 appvoid 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /cheatsheet.md: -------------------------------------------------------------------------------- 1 | 2 | ### Error Rate Reference (English) 3 | *- Lower is better, sorted by performance -* 4 | | model | librispeech | tedlium | average | 5 | | ----- | ----------- | ------- | ------- | 6 | |small-en-us-0-15|9.85|10.38|10.115| 7 | |en-us-0-22-lgraph|7.82|8.20|8.01| 8 | |whisper-tiny|7.6|7.0|7.3| 9 | |en-us-0-22|5.69|6.05|5.87| 10 | |whisper-tiny-en|5.6|6.0|5.8| 11 | |whisper-base|5.0|5.5|5.25| 12 | |whisper-base-en|4.2|4.9|4.55| 13 | |whisper-small|3.4|4.3|3.85| 14 | |whisper-medium-en|3.1|4.1|3.6| 15 | |whisper-small-en|3.1|4.0|3.55| 16 | |whisper-medium|2.9|3.8|3.35| 17 | |whisper-large|2.7|4.0|3.35| 18 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo '| | | . |_ -| . | -_| _| 3 | \_/|___|___| _|___|_| 4 | |_| ' 5 | echo 'Setting up Python libraries...'
6 | pip3 install -r requirements.txt 7 | echo 'Installing ffmpeg for Whisper, type your password:' 8 | sudo apt update && sudo apt install ffmpeg 9 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/logo.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # vosper: a simple tool to easily get high-quality Automatic Speech Recognition using SOTA models 2 | import vosper, os; vosper = vosper.new() 3 | 4 | while 'listening': 5 | text = vosper.listen() 6 | if ('-' in text): print(text) 7 | elif (text != ''): os.system('clear'); print('- '+ text) 8 | -------------------------------------------------------------------------------- /models/vosk/small/README: -------------------------------------------------------------------------------- 1 | US English model for mobile Vosk applications 2 | 3 | Copyright 2020 Alpha Cephei Inc 4 | 5 | Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean) 6 | Speed: 0.11xRT (desktop) 7 | Latency: 0.15s (right context) 8 | 9 | 10 | -------------------------------------------------------------------------------- /models/vosk/small/am/final.mdl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/am/final.mdl -------------------------------------------------------------------------------- /models/vosk/small/conf/mfcc.conf: -------------------------------------------------------------------------------- 1 | --sample-frequency=16000 2 | --use-energy=false 3 | --num-mel-bins=40 4 | --num-ceps=40 5 | --low-freq=20 6 | --high-freq=7600 7 | --allow-downsample=true 8 | -------------------------------------------------------------------------------- /models/vosk/small/conf/model.conf: -------------------------------------------------------------------------------- 1 | --min-active=200 2 | --max-active=3000 3 | --beam=10.0 4 | --lattice-beam=2.0 5 | --acoustic-scale=1.0 6 | --frame-subsampling-factor=3 7 | --endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10 8 | --endpoint.rule2.min-trailing-silence=0.5 9 | --endpoint.rule3.min-trailing-silence=0.75 10 | --endpoint.rule4.min-trailing-silence=1.0 11 | -------------------------------------------------------------------------------- /models/vosk/small/graph/Gr.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/graph/Gr.fst -------------------------------------------------------------------------------- /models/vosk/small/graph/HCLr.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/graph/HCLr.fst -------------------------------------------------------------------------------- /models/vosk/small/graph/disambig_tid.int: -------------------------------------------------------------------------------- 1 | 10015 2 | 10016 3 | 10017 4 | 10018 5 | 10019 6 | 10020 7 | 10021 8 | 10022 9 | 10023 10 | 10024 11 | 10025 12 | 10026 13 | 10027 14 | 10028 15 | 10029 16 | 10030 17 | 10031 18 | 
-------------------------------------------------------------------------------- /models/vosk/small/graph/phones/word_boundary.int: -------------------------------------------------------------------------------- 1 | 1 nonword 2 | 2 begin 3 | 3 end 4 | 4 internal 5 | 5 singleton 6 | 6 nonword 7 | 7 begin 8 | 8 end 9 | 9 internal 10 | 10 singleton 11 | 11 begin 12 | 12 end 13 | 13 internal 14 | 14 singleton 15 | 15 begin 16 | 16 end 17 | 17 internal 18 | 18 singleton 19 | 19 begin 20 | 20 end 21 | 21 internal 22 | 22 singleton 23 | 23 begin 24 | 24 end 25 | 25 internal 26 | 26 singleton 27 | 27 begin 28 | 28 end 29 | 29 internal 30 | 30 singleton 31 | 31 begin 32 | 32 end 33 | 33 internal 34 | 34 singleton 35 | 35 begin 36 | 36 end 37 | 37 internal 38 | 38 singleton 39 | 39 begin 40 | 40 end 41 | 41 internal 42 | 42 singleton 43 | 43 begin 44 | 44 end 45 | 45 internal 46 | 46 singleton 47 | 47 begin 48 | 48 end 49 | 49 internal 50 | 50 singleton 51 | 51 begin 52 | 52 end 53 | 53 internal 54 | 54 singleton 55 | 55 begin 56 | 56 end 57 | 57 internal 58 | 58 singleton 59 | 59 begin 60 | 60 end 61 | 61 internal 62 | 62 singleton 63 | 63 begin 64 | 64 end 65 | 65 internal 66 | 66 singleton 67 | 67 begin 68 | 68 end 69 | 69 internal 70 | 70 singleton 71 | 71 begin 72 | 72 end 73 | 73 internal 74 | 74 singleton 75 | 75 begin 76 | 76 end 77 | 77 internal 78 | 78 singleton 79 | 79 begin 80 | 80 end 81 | 81 internal 82 | 82 singleton 83 | 83 begin 84 | 84 end 85 | 85 internal 86 | 86 singleton 87 | 87 begin 88 | 88 end 89 | 89 internal 90 | 90 singleton 91 | 91 begin 92 | 92 end 93 | 93 internal 94 | 94 singleton 95 | 95 begin 96 | 96 end 97 | 97 internal 98 | 98 singleton 99 | 99 begin 100 | 100 end 101 | 101 internal 102 | 102 singleton 103 | 103 begin 104 | 104 end 105 | 105 internal 106 | 106 singleton 107 | 107 begin 108 | 108 end 109 | 109 internal 110 | 110 singleton 111 | 111 begin 112 | 112 end 113 | 113 internal 114 | 114 singleton 115 | 115 begin 116 | 116 end 117 | 117 internal 118 | 118 singleton 119 | 119 begin 120 | 120 end 121 | 121 internal 122 | 122 singleton 123 | 123 begin 124 | 124 end 125 | 125 internal 126 | 126 singleton 127 | 127 begin 128 | 128 end 129 | 129 internal 130 | 130 singleton 131 | 131 begin 132 | 132 end 133 | 133 internal 134 | 134 singleton 135 | 135 begin 136 | 136 end 137 | 137 internal 138 | 138 singleton 139 | 139 begin 140 | 140 end 141 | 141 internal 142 | 142 singleton 143 | 143 begin 144 | 144 end 145 | 145 internal 146 | 146 singleton 147 | 147 begin 148 | 148 end 149 | 149 internal 150 | 150 singleton 151 | 151 begin 152 | 152 end 153 | 153 internal 154 | 154 singleton 155 | 155 begin 156 | 156 end 157 | 157 internal 158 | 158 singleton 159 | 159 begin 160 | 160 end 161 | 161 internal 162 | 162 singleton 163 | 163 begin 164 | 164 end 165 | 165 internal 166 | 166 singleton 167 | -------------------------------------------------------------------------------- /models/vosk/small/ivector/final.dubm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/ivector/final.dubm -------------------------------------------------------------------------------- /models/vosk/small/ivector/final.ie: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/ivector/final.ie 
-------------------------------------------------------------------------------- /models/vosk/small/ivector/final.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/ivector/final.mat -------------------------------------------------------------------------------- /models/vosk/small/ivector/global_cmvn.stats: -------------------------------------------------------------------------------- 1 | [ 2 | 1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09 3 | 1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ] 4 | -------------------------------------------------------------------------------- /models/vosk/small/ivector/online_cmvn.conf: -------------------------------------------------------------------------------- 1 | # configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh 2 | -------------------------------------------------------------------------------- /models/vosk/small/ivector/splice.conf: -------------------------------------------------------------------------------- 1 | --left-context=3 2 | --right-context=3 3 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # vosper 4 | 🕑 Real-Time Whisper Voice Recognition with vosk model feedback 🎙 5 | 6 | ### 🌏 News 7 | > New logo: Hopefully everyone likes it.
8 | > vosper 2.0: The codebase was rewritten and more customization was added!
9 | > Pip is coming: A proper, easier way to install and update vosper will be available soon.<br>
10 | > vosper 2.1: The codebase is being refactored into a highly optimized implementation.<br>
11 | 12 | ### 📑 Features 13 | - Minimal approach 14 | - Easy installation 15 | - Easy modification 16 | - Fast text feedback thanks to vosk 17 | - Detects human voices (it records audio for Whisper only when needed) 18 | ### ⭐ Installation 19 | ``` 20 | git clone https://github.com/appvoid/vosper.git && 21 | cd vosper && 22 | chmod +x install.sh && 23 | ./install.sh 24 | ``` 25 | ### ▶ Usage 26 | ```python3 main.py # It's pretty minimal...``` 27 | ### ☕ **Donations and Support** 28 | [**paypal donation** ](https://www.paypal.com/donate/?hosted_button_id=CDZH8GJET9SNU) or [ **patreon support** ](https://www.patreon.com/bePatron?u=52880328) 29 | ### 🔭 Full Roadmap 30 | - [x] Vosk Real-Time inference and Whisper VAD support 31 | - [x] Class-Based implementation 32 | - [x] Easier way to choose a whisper model 33 | - [x] Improved code quality, comments, readability, etc... 34 | - [x] Verbosity switch 35 | - [x] Customizable settings 36 | - [ ] Custom VAD model support 37 | - [ ] Python's pip installation method 38 | - [ ] Keyboard support 39 | - [ ] Documentation 40 | 41 | ### 🔴 Disclaimer 42 | Real-Time usage scenarios (a voice assistant, for example) require a GPU with at least ~2-4 GB of VRAM. The more VRAM you have, the larger the model you can load: transcription quality improves, but inference gets slower. 43 | 44 | [![Star History Chart](https://api.star-history.com/svg?repos=appvoid/vosper&type=Date)](https://star-history.com/#appvoid/vosper&Date) 45 | -------------------------------------------------------------------------------- /recorder.py: -------------------------------------------------------------------------------- 1 | # import required libraries 2 | import sounddevice as sd 3 | from scipy.io.wavfile import write 4 | from os import system as cmd 5 | 6 | class new: 7 | def __init__(self, waiting_time=4, filename='speaker'): 8 | # initialize recording so we can use it later 9 | self.recording = sd.rec(int(0 * 44100), samplerate=44100, channels=2) 10 | self.waiting_time = waiting_time # set a default waiting time 11 | self.filename = filename 12 | 13 | def record(self, waiting_time=4): 14 | # we set the same waiting time for the method 15 | self.waiting_time = waiting_time 16 | # set up recording 17 | self.recording = sd.rec(int(self.waiting_time * 44100), samplerate=44100, channels=2) 18 | 19 | def save(self): 20 | # save the new file 21 | write(f'{self.filename}.wav', 44100, self.recording) 22 | 23 | def stop(self): 24 | # remove the old file so previous data doesn't get mixed up 25 | cmd(f'rm {self.filename}.wav >/dev/null 2>&1') 26 | # stop recording 27 | sd.stop() 28 | # finally, we save the file 29 | self.save() 30 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | vosk 2 | scipy 3 | sounddevice 4 | setuptools-rust 5 | git+https://github.com/openai/whisper.git -------------------------------------------------------------------------------- /vosper.py: -------------------------------------------------------------------------------- 1 | # VOSK + Whisper speech recognition system 2 | '''This module uses vosk for instant user feedback as well as a VAD solution, 3 | while OpenAI Whisper handles the actual transcription.''' 4 | 5 | # libraries 6 | import os, pyaudio, whisper, recorder 7 | from vosk import SetLogLevel, Model, KaldiRecognizer 8 | SetLogLevel(-1) # mutes vosk verbosity 9 | os.system('clear') 10 | welcome_msg = '''\ \ / / _ \/ __| '_ \ / _ \ '__| 11 | \ V
/ (_) \__ \ |_) | __/ | 12 | \_/ \___/|___/ .__/ \___|_| 13 | |_| 14 | by appvoid 15 | ''' 16 | 17 | # debugging purposes 18 | def log(msg, verbosity): 19 | if verbosity: 20 | print(msg) 21 | 22 | class new: 23 | def load_vosk(self, model='small'): 24 | # load vosk model 25 | model_voice = Model(f'{os.getcwd()}/models/vosk/{model}') 26 | recognizer = KaldiRecognizer(model_voice, 16000) 27 | return recognizer 28 | 29 | def stream(self): 30 | mic = pyaudio.PyAudio() 31 | # microphone streaming 32 | 33 | '''this code is setting up an audio stream that 34 | will capture mono audio at a sample rate of 16000 Hz 35 | with 16-bit integer samples. It will capture audio 36 | in chunks of 4096 samples at a time.''' 37 | 38 | _stream = mic.open( 39 | channels=1, 40 | rate=16000, 41 | input=True, 42 | format=pyaudio.paInt16, 43 | frames_per_buffer=4096 44 | ) 45 | _stream.start_stream() 46 | os.system('clear') 47 | return _stream 48 | 49 | def __init__(self, vosk_model='small', whisper_model='small.en', waiting_time=4, filename='speaker', verbosity=True): 50 | self.verbosity = verbosity 51 | log('- loading models...', self.verbosity) 52 | self.recorder = recorder.new(waiting_time, filename=filename) 53 | self.whisper = whisper.load_model(whisper_model) 54 | self.vosk = self.load_vosk(vosk_model) 55 | self.recording_whisper = False 56 | self.filename = filename 57 | self.mic = self.stream() 58 | 59 | log(welcome_msg, self.verbosity) 60 | log(f'- waiting time: {waiting_time} seconds\n- vosk model: {vosk_model}\n- whisper model: {whisper_model}\n- recording file: {filename}', self.verbosity) 61 | 62 | def listen(self): 63 | # we get data from the microphone stream 64 | data = self.mic 65 | data = data.read(4096) 66 | 67 | # we check with the vosk recognizer whether the person stopped talking 68 | if self.vosk.AcceptWaveform(data): 69 | self.recorder.stop() # we stop recording to save cpu compute 70 | text = self.vosk.Result()[14:-3] 71 | # we also check if the input is worth the whisper gpu compute 72 | characters_threshold = 3 73 | if (len(text) > characters_threshold): 74 | text = self.whisper.transcribe(f'{self.filename}.wav')['text'].strip() 75 | # we turn off the whisper recording flag 76 | self.recording_whisper = False 77 | 78 | else: # else, we show the vosk text instead 79 | text = self.vosk.PartialResult()[17:-3] 80 | if (self.recording_whisper == False): 81 | self.recorder.stop() 82 | # we mark whisper as available for recording 83 | self.recording_whisper = True 84 | # we save 5 seconds of audio for whisper to transcribe 85 | self.recorder.record(5) 86 | 87 | # it's a simple but quite unbreakable spell 88 | # for text checking to avoid printing empty strings 89 | if text != '-' and text != '- ': 90 | return text 91 | else: 92 | return '' 93 | --------------------------------------------------------------------------------
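Note (not part of the repository): `vosper.listen()` pulls the transcript out of the recognizer output by slicing fixed character offsets (`Result()[14:-3]` and `PartialResult()[17:-3]`), which only works as long as vosk emits JSON in exactly that shape. A minimal sketch of a more robust alternative, assuming a `vosk.KaldiRecognizer` instance, parses the JSON with the standard library instead; the helper names here are hypothetical:

```
import json

def final_text(recognizer):
    # Result() returns a JSON string such as {"text": "hello world"}
    return json.loads(recognizer.Result()).get('text', '')

def partial_text(recognizer):
    # PartialResult() returns a JSON string such as {"partial": "hello wor"}
    return json.loads(recognizer.PartialResult()).get('partial', '')
```

With helpers like these, `listen()` could call `final_text(self.vosk)` and `partial_text(self.vosk)` instead of relying on character offsets, and the empty-string checks at the end of the method would keep working unchanged.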