├── LICENSE
├── cheatsheet.md
├── install.sh
├── logo.png
├── main.py
├── models
│   └── vosk
│       └── small
│           ├── README
│           ├── am
│           │   └── final.mdl
│           ├── conf
│           │   ├── mfcc.conf
│           │   └── model.conf
│           ├── graph
│           │   ├── Gr.fst
│           │   ├── HCLr.fst
│           │   ├── disambig_tid.int
│           │   └── phones
│           │       └── word_boundary.int
│           └── ivector
│               ├── final.dubm
│               ├── final.ie
│               ├── final.mat
│               ├── global_cmvn.stats
│               ├── online_cmvn.conf
│               └── splice.conf
├── readme.md
├── recorder.py
├── requirements.txt
└── vosper.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 appvoid
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/cheatsheet.md:
--------------------------------------------------------------------------------
1 |
2 | ### Error Rate Reference (English)
3 | *- Word error rate (%); lower is better, sorted from worst to best average -*
4 | | model | librispeech | tedlium | average |
5 | | ----- | ----------- | ------- | ------- |
6 | |small-en-us-0-15|9.85|10.38|10.115|
7 | |en-us-0-22-lgraph|7.82|8.20|8.01|
8 | |whisper-tiny|7.6|7.0|7.3|
9 | |en-us-0-22|5.69|6.05|5.87|
10 | |whisper-tiny-en|5.6|6.0|5.8|
11 | |whisper-base|5.0|5.5|5.25|
12 | |whisper-base-en|4.2|4.9|4.55|
13 | |whisper-small|3.4|4.3|3.85|
14 | |whisper-medium-en|3.1|4.1|3.6|
15 | |whisper-small-en|3.1|4.0|3.55|
16 | |whisper-medium|2.9|3.8|3.35|
17 | |whisper-large|2.7|4.0|3.35|
18 |
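19 | In vosper, these models are selected when the recognizer is constructed. Below is a minimal sketch rather than official documentation: the keyword names come from `vosper.new()` in `vosper.py`, and `base.en` is just an illustrative Whisper size (any name accepted by `whisper.load_model` should work).
20 | 
21 | ```python
22 | import vosper
23 | 
24 | # pick a row from the table above to trade accuracy against speed/VRAM,
25 | # e.g. 'base.en' instead of the default 'small.en'
26 | engine = vosper.new(vosk_model='small', whisper_model='base.en')
27 | 
28 | while True:
29 |     text = engine.listen()
30 |     if text:
31 |         print(text)
32 | ```
33 | 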
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | echo '| | | . |_ -| . | -_| _|
3 | \_/|___|___| _|___|_|
4 | |_| '
5 | echo 'Setting up Python libraries...'
6 | pip3 install -r requirements.txt
7 | echo 'Installing ffmpeg for Whisper, type your password:'
8 | sudo apt update && sudo apt install ffmpeg
9 |
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/logo.png
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # vosper: a simple tool to easily get high-quality Automatic Speech Recognition using SOTA models
2 | import vosper, os; vosper = vosper.new()
3 |
4 | while 'listening':
5 | text = vosper.listen()
6 | if ('-' in text): print(text)
7 | elif (text != ''): os.system('clear'); print('- '+ text)
8 |
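9 | # optional customization, sketched here as comments only: these keyword arguments
10 | # mirror vosper.new()'s signature in vosper.py; the values below are illustrative,
11 | # not recommended settings.
12 | # vosper = vosper.new(vosk_model='small', whisper_model='base.en',
13 | #                     waiting_time=5, filename='speaker', verbosity=False)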
--------------------------------------------------------------------------------
/models/vosk/small/README:
--------------------------------------------------------------------------------
1 | US English model for mobile Vosk applications
2 |
3 | Copyright 2020 Alpha Cephei Inc
4 |
5 | Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean)
6 | Speed: 0.11xRT (desktop)
7 | Latency: 0.15s (right context)
8 |
9 |
10 |
--------------------------------------------------------------------------------
/models/vosk/small/am/final.mdl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/am/final.mdl
--------------------------------------------------------------------------------
/models/vosk/small/conf/mfcc.conf:
--------------------------------------------------------------------------------
1 | --sample-frequency=16000
2 | --use-energy=false
3 | --num-mel-bins=40
4 | --num-ceps=40
5 | --low-freq=20
6 | --high-freq=7600
7 | --allow-downsample=true
8 |
--------------------------------------------------------------------------------
/models/vosk/small/conf/model.conf:
--------------------------------------------------------------------------------
1 | --min-active=200
2 | --max-active=3000
3 | --beam=10.0
4 | --lattice-beam=2.0
5 | --acoustic-scale=1.0
6 | --frame-subsampling-factor=3
7 | --endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
8 | --endpoint.rule2.min-trailing-silence=0.5
9 | --endpoint.rule3.min-trailing-silence=0.75
10 | --endpoint.rule4.min-trailing-silence=1.0
11 |
--------------------------------------------------------------------------------
/models/vosk/small/graph/Gr.fst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/graph/Gr.fst
--------------------------------------------------------------------------------
/models/vosk/small/graph/HCLr.fst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/graph/HCLr.fst
--------------------------------------------------------------------------------
/models/vosk/small/graph/disambig_tid.int:
--------------------------------------------------------------------------------
1 | 10015
2 | 10016
3 | 10017
4 | 10018
5 | 10019
6 | 10020
7 | 10021
8 | 10022
9 | 10023
10 | 10024
11 | 10025
12 | 10026
13 | 10027
14 | 10028
15 | 10029
16 | 10030
17 | 10031
18 |
--------------------------------------------------------------------------------
/models/vosk/small/graph/phones/word_boundary.int:
--------------------------------------------------------------------------------
1 | 1 nonword
2 | 2 begin
3 | 3 end
4 | 4 internal
5 | 5 singleton
6 | 6 nonword
7 | 7 begin
8 | 8 end
9 | 9 internal
10 | 10 singleton
11 | 11 begin
12 | 12 end
13 | 13 internal
14 | 14 singleton
15 | 15 begin
16 | 16 end
17 | 17 internal
18 | 18 singleton
19 | 19 begin
20 | 20 end
21 | 21 internal
22 | 22 singleton
23 | 23 begin
24 | 24 end
25 | 25 internal
26 | 26 singleton
27 | 27 begin
28 | 28 end
29 | 29 internal
30 | 30 singleton
31 | 31 begin
32 | 32 end
33 | 33 internal
34 | 34 singleton
35 | 35 begin
36 | 36 end
37 | 37 internal
38 | 38 singleton
39 | 39 begin
40 | 40 end
41 | 41 internal
42 | 42 singleton
43 | 43 begin
44 | 44 end
45 | 45 internal
46 | 46 singleton
47 | 47 begin
48 | 48 end
49 | 49 internal
50 | 50 singleton
51 | 51 begin
52 | 52 end
53 | 53 internal
54 | 54 singleton
55 | 55 begin
56 | 56 end
57 | 57 internal
58 | 58 singleton
59 | 59 begin
60 | 60 end
61 | 61 internal
62 | 62 singleton
63 | 63 begin
64 | 64 end
65 | 65 internal
66 | 66 singleton
67 | 67 begin
68 | 68 end
69 | 69 internal
70 | 70 singleton
71 | 71 begin
72 | 72 end
73 | 73 internal
74 | 74 singleton
75 | 75 begin
76 | 76 end
77 | 77 internal
78 | 78 singleton
79 | 79 begin
80 | 80 end
81 | 81 internal
82 | 82 singleton
83 | 83 begin
84 | 84 end
85 | 85 internal
86 | 86 singleton
87 | 87 begin
88 | 88 end
89 | 89 internal
90 | 90 singleton
91 | 91 begin
92 | 92 end
93 | 93 internal
94 | 94 singleton
95 | 95 begin
96 | 96 end
97 | 97 internal
98 | 98 singleton
99 | 99 begin
100 | 100 end
101 | 101 internal
102 | 102 singleton
103 | 103 begin
104 | 104 end
105 | 105 internal
106 | 106 singleton
107 | 107 begin
108 | 108 end
109 | 109 internal
110 | 110 singleton
111 | 111 begin
112 | 112 end
113 | 113 internal
114 | 114 singleton
115 | 115 begin
116 | 116 end
117 | 117 internal
118 | 118 singleton
119 | 119 begin
120 | 120 end
121 | 121 internal
122 | 122 singleton
123 | 123 begin
124 | 124 end
125 | 125 internal
126 | 126 singleton
127 | 127 begin
128 | 128 end
129 | 129 internal
130 | 130 singleton
131 | 131 begin
132 | 132 end
133 | 133 internal
134 | 134 singleton
135 | 135 begin
136 | 136 end
137 | 137 internal
138 | 138 singleton
139 | 139 begin
140 | 140 end
141 | 141 internal
142 | 142 singleton
143 | 143 begin
144 | 144 end
145 | 145 internal
146 | 146 singleton
147 | 147 begin
148 | 148 end
149 | 149 internal
150 | 150 singleton
151 | 151 begin
152 | 152 end
153 | 153 internal
154 | 154 singleton
155 | 155 begin
156 | 156 end
157 | 157 internal
158 | 158 singleton
159 | 159 begin
160 | 160 end
161 | 161 internal
162 | 162 singleton
163 | 163 begin
164 | 164 end
165 | 165 internal
166 | 166 singleton
167 |
--------------------------------------------------------------------------------
/models/vosk/small/ivector/final.dubm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/ivector/final.dubm
--------------------------------------------------------------------------------
/models/vosk/small/ivector/final.ie:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/ivector/final.ie
--------------------------------------------------------------------------------
/models/vosk/small/ivector/final.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/ivector/final.mat
--------------------------------------------------------------------------------
/models/vosk/small/ivector/global_cmvn.stats:
--------------------------------------------------------------------------------
1 | [
2 | 1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09
3 | 1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ]
4 |
--------------------------------------------------------------------------------
/models/vosk/small/ivector/online_cmvn.conf:
--------------------------------------------------------------------------------
1 | # configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
2 |
--------------------------------------------------------------------------------
/models/vosk/small/ivector/splice.conf:
--------------------------------------------------------------------------------
1 | --left-context=3
2 | --right-context=3
3 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # vosper
4 | 🕑 Real-Time Whisper Voice Recognition with vosk model feedback 🎙
5 |
6 | ### 🌏 News
7 | > - New logo: Hopefully everyone likes it.
8 | > - vosper 2.0: The codebase was rewritten and more customization was added!
9 | > - Pip is coming: A proper, easier installation and update alternative will be launched soon.
10 | > - vosper 2.1: The codebase is being refactored for a highly optimized implementation.
11 |
12 | ### 📑 Features
13 | - Minimal approach
14 | - Easy installation
15 | - Easy modification
16 | - Fast text feedback thanks to vosk
17 | - Detects human voices (audio is recorded for Whisper only when needed)
18 | ### ⭐ Installation
19 | ```
20 | git clone https://github.com/appvoid/vosper.git &&
21 | cd vosper &&
22 | chmod +x install.sh &&
23 | ./install.sh
24 | ```
25 | ### ▶ Usage
26 | Run `python3 main.py` (it's pretty minimal...)
27 | ### ☕ **Donations and Support**
28 | [**paypal donation**](https://www.paypal.com/donate/?hosted_button_id=CDZH8GJET9SNU) or [**patreon support**](https://www.patreon.com/bePatron?u=52880328)
29 | ### 🔭 Full Roadmap
30 | - [x] Vosk Real-Time inference and Whisper VAD support
31 | - [x] Class-Based implementation
32 | - [x] Easier way to choose a whisper model
33 | - [x] Improved code quality, comments, readability, etc...
34 | - [x] Verbosity switch
35 | - [x] Customizable settings
36 | - [ ] Custom VAD model support
37 | - [ ] Python's pip installation method
38 | - [ ] Keyboard support
39 | - [ ] Documentation
40 |
41 | ### 🔴 Disclaimer
42 | Real-time usage scenarios (like a voice assistant, for example) require a GPU with at least 2-4 GB of VRAM. The more VRAM you have, the larger the model you can load: larger models transcribe better but run more slowly.
43 |
44 | [Star history](https://star-history.com/#appvoid/vosper&Date)
45 |
--------------------------------------------------------------------------------
/recorder.py:
--------------------------------------------------------------------------------
1 | # import required libraries
2 | import sounddevice as sd
3 | from scipy.io.wavfile import write
4 | from os import system as cmd
5 |
6 | class new:
7 | def __init__(self, waiting_time=4, filename='speaker'):
8 | # create an empty (zero-length) recording so the attribute exists before record() is called
9 | self.recording = sd.rec(int(0 * 44100), samplerate=44100, channels=2)
10 | self.waiting_time = waiting_time # set a default waiting time
11 | self.filename = filename
12 |
13 | def record(self, waiting_time=4):
14 | # we set the same waiting time for the method
15 | self.waiting_time = waiting_time
16 | # setup recording
17 | self.recording = sd.rec(int(self.waiting_time * 44100), samplerate=44100, channels=2)
18 |
19 | def save(self):
20 | # save the new file
21 | write(f'{self.filename}.wav', 44100, self.recording)
22 |
23 | def stop(self):
24 | # remove file so previous data doesn't get mixed up
25 | cmd(f'rm {self.filename}.wav >/dev/null 2>&1')
26 | # stop recording
27 | sd.stop()
28 | # finally, we save the file
29 | self.save()
30 |
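31 | # usage sketch (illustrative only, exercising the class above): sounddevice's
32 | # rec() returns immediately, so sd.wait() is used to block until the clip has
33 | # finished before writing it to disk.
34 | #
35 | #   import recorder, sounddevice as sd
36 | #   rec = recorder.new(waiting_time=4, filename='speaker')
37 | #   rec.record(5)   # start a non-blocking 5-second recording
38 | #   sd.wait()       # wait for the recording to finish
39 | #   rec.save()      # write speaker.wav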
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | vosk
2 | scipy
3 | sounddevice
4 | setuptools-rust
5 | git+https://github.com/openai/whisper.git
6 | pyaudio
--------------------------------------------------------------------------------
/vosper.py:
--------------------------------------------------------------------------------
1 | # VOSK + Whisper speech recognition system
2 | '''This module uses vosk for instant user feedback as well as a VAD-like gate,
3 | while OpenAI Whisper handles the actual transcription.'''
4 |
5 | # libraries
6 | import os, pyaudio, whisper, recorder
7 | from vosk import SetLogLevel, Model, KaldiRecognizer
8 | SetLogLevel(-1) # mutes vosk verbosity
9 | os.system('clear')
10 | welcome_msg = '''\ \ / / _ \/ __| '_ \ / _ \ '__|
11 | \ V / (_) \__ \ |_) | __/ |
12 | \_/ \___/|___/ .__/ \___|_|
13 | |_|
14 | by appvoid
15 | '''
16 |
17 | # debugging purposes
18 | def log(msg, verbosity):
19 | if verbosity:
20 | print(msg)
21 |
22 | class new:
23 | def load_vosk(self, model='small'):
24 | # load vosk model
25 | model_voice = Model(f'{os.getcwd()}/models/vosk/{model}')
26 | recognizer = KaldiRecognizer(model_voice, 16000)
27 | return recognizer
28 |
29 | def stream(self):
30 | mic = pyaudio.PyAudio()
31 | # microphone streaming
32 |
33 | '''this code is setting up an audio stream that
34 | will capture mono audio at a sample rate of 16000 Hz
35 | with 16-bit integer samples. It will capture audio
36 | in chunks of 4096 samples at a time.'''
37 |
38 | _stream = mic.open(
39 | channels=1,
40 | rate=16000,
41 | input=True,
42 | format=pyaudio.paInt16,
43 | frames_per_buffer=4096
44 | )
45 | _stream.start_stream()
46 | os.system('clear')
47 | return _stream
48 |
49 | def __init__(self, vosk_model='small', whisper_model='small.en', waiting_time=4, filename='speaker', verbosity=True):
50 | self.verbosity = verbosity
51 | log('- loading models...', self.verbosity)
52 | self.recorder = recorder.new(waiting_time, filename=filename) # pass the configured recording filename through
53 | self.whisper = whisper.load_model(whisper_model)
54 | self.vosk = self.load_vosk(vosk_model)
55 | self.recording_whisper = False
56 | self.filename = filename
57 | self.mic = self.stream()
58 |
59 | log(welcome_msg, self.verbosity)
60 | log(f'- waiting time: {waiting_time} seconds\n- vosk model: {vosk_model}\n- whisper model: {whisper_model}\n- recording file: {filename}', self.verbosity)
61 |
62 | def listen(self):
63 | # read a chunk of audio from the microphone stream
64 | data = self.mic
65 | data = data.read(4096)
66 |
67 | # check whether the speaker stopped talking, according to the vosk recognizer
68 | if self.vosk.AcceptWaveform(data):
69 | self.recorder.stop() # we stop recording to save cpu compute
70 | text = self.vosk.Result()[14:-3] # strip the json wrapper to keep just the recognized text
71 | # we also check if the input is worth the whisper gpu compute
72 | characters_threshold = 3
73 | if (len(text) > characters_threshold):
74 | text = self.whisper.transcribe(f'{self.filename}.wav')['text'].strip()
75 | # reset the whisper recording flag
76 | self.recording_whisper = False
77 |
78 | else: # else, we show vosk text instead
79 | text = self.vosk.PartialResult()[17:-3] # strip the json wrapper to keep just the partial text
80 | if not self.recording_whisper:
81 | self.recorder.stop()
82 | # flag that whisper recording is now active
83 | self.recording_whisper = True
84 | # record up to 5 seconds of audio for whisper to transcribe later
85 | self.recorder.record(5)
86 |
87 | # it's a simple but quite unbreakable spell
88 | # for text checking to avoid printing empty strings
89 | if text != '-' and text != '- ':
90 | return text
91 | else:
92 | return ''
93 |
--------------------------------------------------------------------------------