├── LICENSE ├── cheatsheet.md ├── install.sh ├── logo.png ├── main.py ├── models └── vosk │ └── small │ ├── README │ ├── am │ └── final.mdl │ ├── conf │ ├── mfcc.conf │ └── model.conf │ ├── graph │ ├── Gr.fst │ ├── HCLr.fst │ ├── disambig_tid.int │ └── phones │ │ └── word_boundary.int │ └── ivector │ ├── final.dubm │ ├── final.ie │ ├── final.mat │ ├── global_cmvn.stats │ ├── online_cmvn.conf │ └── splice.conf ├── readme.md ├── recorder.py ├── requirements.txt └── vosper.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 appvoid 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /cheatsheet.md: -------------------------------------------------------------------------------- 1 | 2 | ### Error Rate Reference (English) 3 | *- Lower is better, sorted by performance -* 4 | | model | librispeech | tedlium | average | 5 | | ----- | ----------- | ------- | ------- | 6 | |small-en-us-0-15|9.85|10.38|10.115| 7 | |en-us-0-22-lgraph|7.82|8.20|8.01| 8 | |whisper-tiny|7.6|7.0|7.3| 9 | |en-us-0-22|5.69|6.05|5.87| 10 | |whisper-tiny-en|5.6|6.0|5.8| 11 | |whisper-base|5.0|5.5|5.25| 12 | |whisper-base-en|4.2|4.9|4.55| 13 | |whisper-small|3.4|4.3|3.85| 14 | |whisper-medium-en|3.1|4.1|3.6| 15 | |whisper-small-en|3.1|4.0|3.55| 16 | |whisper-medium|2.9|3.8|3.35| 17 | |whisper-large|2.7|4.0|3.35| 18 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo '| | | . |_ -| . | -_| _| 3 | \_/|___|___| _|___|_| 4 | |_| ' 5 | echo 'Setting up Python libraries...'
6 | pip3 install -r requirements.txt 7 | echo 'Installing ffmpeg for Whisper, type your password:' 8 | sudo apt update && sudo apt install ffmpeg 9 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/logo.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # vosper: a simple tool to easily get high-quality Automatic Speech Recognition using SOTA models 2 | import vosper, os; vosper = vosper.new() 3 | 4 | while 'listening': 5 | text = vosper.listen() 6 | if ('-' in text): print(text) 7 | elif (text != ''): os.system('clear'); print('- '+ text) 8 | -------------------------------------------------------------------------------- /models/vosk/small/README: -------------------------------------------------------------------------------- 1 | US English model for mobile Vosk applications 2 | 3 | Copyright 2020 Alpha Cephei Inc 4 | 5 | Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean) 6 | Speed: 0.11xRT (desktop) 7 | Latency: 0.15s (right context) 8 | 9 | 10 | -------------------------------------------------------------------------------- /models/vosk/small/am/final.mdl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/am/final.mdl -------------------------------------------------------------------------------- /models/vosk/small/conf/mfcc.conf: -------------------------------------------------------------------------------- 1 | --sample-frequency=16000 2 | --use-energy=false 3 | --num-mel-bins=40 4 | --num-ceps=40 5 | --low-freq=20 6 | --high-freq=7600 7 | --allow-downsample=true 8 | -------------------------------------------------------------------------------- /models/vosk/small/conf/model.conf: -------------------------------------------------------------------------------- 1 | --min-active=200 2 | --max-active=3000 3 | --beam=10.0 4 | --lattice-beam=2.0 5 | --acoustic-scale=1.0 6 | --frame-subsampling-factor=3 7 | --endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10 8 | --endpoint.rule2.min-trailing-silence=0.5 9 | --endpoint.rule3.min-trailing-silence=0.75 10 | --endpoint.rule4.min-trailing-silence=1.0 11 | -------------------------------------------------------------------------------- /models/vosk/small/graph/Gr.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/graph/Gr.fst -------------------------------------------------------------------------------- /models/vosk/small/graph/HCLr.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/graph/HCLr.fst -------------------------------------------------------------------------------- /models/vosk/small/graph/disambig_tid.int: -------------------------------------------------------------------------------- 1 | 10015 2 | 10016 3 | 10017 4 | 10018 5 | 10019 6 | 10020 7 | 10021 8 | 10022 9 | 10023 10 | 10024 11 | 10025 12 | 10026 13 | 10027 14 | 10028 15 | 10029 16 | 10030 17 | 10031 18 | 
-------------------------------------------------------------------------------- /models/vosk/small/graph/phones/word_boundary.int: -------------------------------------------------------------------------------- 1 | 1 nonword 2 | 2 begin 3 | 3 end 4 | 4 internal 5 | 5 singleton 6 | 6 nonword 7 | 7 begin 8 | 8 end 9 | 9 internal 10 | 10 singleton 11 | 11 begin 12 | 12 end 13 | 13 internal 14 | 14 singleton 15 | 15 begin 16 | 16 end 17 | 17 internal 18 | 18 singleton 19 | 19 begin 20 | 20 end 21 | 21 internal 22 | 22 singleton 23 | 23 begin 24 | 24 end 25 | 25 internal 26 | 26 singleton 27 | 27 begin 28 | 28 end 29 | 29 internal 30 | 30 singleton 31 | 31 begin 32 | 32 end 33 | 33 internal 34 | 34 singleton 35 | 35 begin 36 | 36 end 37 | 37 internal 38 | 38 singleton 39 | 39 begin 40 | 40 end 41 | 41 internal 42 | 42 singleton 43 | 43 begin 44 | 44 end 45 | 45 internal 46 | 46 singleton 47 | 47 begin 48 | 48 end 49 | 49 internal 50 | 50 singleton 51 | 51 begin 52 | 52 end 53 | 53 internal 54 | 54 singleton 55 | 55 begin 56 | 56 end 57 | 57 internal 58 | 58 singleton 59 | 59 begin 60 | 60 end 61 | 61 internal 62 | 62 singleton 63 | 63 begin 64 | 64 end 65 | 65 internal 66 | 66 singleton 67 | 67 begin 68 | 68 end 69 | 69 internal 70 | 70 singleton 71 | 71 begin 72 | 72 end 73 | 73 internal 74 | 74 singleton 75 | 75 begin 76 | 76 end 77 | 77 internal 78 | 78 singleton 79 | 79 begin 80 | 80 end 81 | 81 internal 82 | 82 singleton 83 | 83 begin 84 | 84 end 85 | 85 internal 86 | 86 singleton 87 | 87 begin 88 | 88 end 89 | 89 internal 90 | 90 singleton 91 | 91 begin 92 | 92 end 93 | 93 internal 94 | 94 singleton 95 | 95 begin 96 | 96 end 97 | 97 internal 98 | 98 singleton 99 | 99 begin 100 | 100 end 101 | 101 internal 102 | 102 singleton 103 | 103 begin 104 | 104 end 105 | 105 internal 106 | 106 singleton 107 | 107 begin 108 | 108 end 109 | 109 internal 110 | 110 singleton 111 | 111 begin 112 | 112 end 113 | 113 internal 114 | 114 singleton 115 | 115 begin 116 | 116 end 117 | 117 internal 118 | 118 singleton 119 | 119 begin 120 | 120 end 121 | 121 internal 122 | 122 singleton 123 | 123 begin 124 | 124 end 125 | 125 internal 126 | 126 singleton 127 | 127 begin 128 | 128 end 129 | 129 internal 130 | 130 singleton 131 | 131 begin 132 | 132 end 133 | 133 internal 134 | 134 singleton 135 | 135 begin 136 | 136 end 137 | 137 internal 138 | 138 singleton 139 | 139 begin 140 | 140 end 141 | 141 internal 142 | 142 singleton 143 | 143 begin 144 | 144 end 145 | 145 internal 146 | 146 singleton 147 | 147 begin 148 | 148 end 149 | 149 internal 150 | 150 singleton 151 | 151 begin 152 | 152 end 153 | 153 internal 154 | 154 singleton 155 | 155 begin 156 | 156 end 157 | 157 internal 158 | 158 singleton 159 | 159 begin 160 | 160 end 161 | 161 internal 162 | 162 singleton 163 | 163 begin 164 | 164 end 165 | 165 internal 166 | 166 singleton 167 | -------------------------------------------------------------------------------- /models/vosk/small/ivector/final.dubm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/ivector/final.dubm -------------------------------------------------------------------------------- /models/vosk/small/ivector/final.ie: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/ivector/final.ie 
-------------------------------------------------------------------------------- /models/vosk/small/ivector/final.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appvoid/vosper/b25884b4111b8602b6a4234aac5c880d0d8ef8df/models/vosk/small/ivector/final.mat -------------------------------------------------------------------------------- /models/vosk/small/ivector/global_cmvn.stats: -------------------------------------------------------------------------------- 1 | [ 2 | 1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09 3 | 1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ] 4 | -------------------------------------------------------------------------------- /models/vosk/small/ivector/online_cmvn.conf: -------------------------------------------------------------------------------- 1 | # configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh 2 | -------------------------------------------------------------------------------- /models/vosk/small/ivector/splice.conf: -------------------------------------------------------------------------------- 1 | --left-context=3 2 | --right-context=3 3 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # vosper 4 | 🕑 Real-Time Whisper Voice Recognition with vosk model feedback 🎙 5 | 6 | ### 🌏 News 7 | > New logo: Hopefully everyone likes it.
8 | > vosper 2.0: The codebase was rewritten and more customization was added!
9 | > Pip is coming: A proper, easier way to install and update vosper will be available soon.<br>
10 | > vosper 2.1: The codebase is being refactored into a highly optimized implementation.<br>
11 | 12 | ### 📑 Features 13 | - Minimal approach 14 | - Easy installation 15 | - Easy modification 16 | - Fast text feedback thanks to vosk 17 | - Detects human voices (it records audio for Whisper only when needed) 18 | ### ⭐ Installation 19 | ``` 20 | git clone https://github.com/appvoid/vosper.git && 21 | cd vosper && 22 | chmod +x install.sh && 23 | ./install.sh 24 | ``` 25 | ### ▶ Usage 26 | ```python3 main.py # It's pretty minimal...``` 27 | ### ☕ **Donations and Support** 28 | [**paypal donation** ](https://www.paypal.com/donate/?hosted_button_id=CDZH8GJET9SNU) or [ **patreon support** ](https://www.patreon.com/bePatron?u=52880328) 29 | ### 🔭 Full Roadmap 30 | - [x] Vosk Real-Time inference and Whisper VAD support 31 | - [x] Class-Based implementation 32 | - [x] Easier way to choose a whisper model 33 | - [x] Improved code quality, comments, readability, etc... 34 | - [x] Verbosity switch 35 | - [x] Customizable settings 36 | - [ ] Custom VAD model support 37 | - [ ] Python's pip installation method 38 | - [ ] Keyboard support 39 | - [ ] Documentation 40 | 41 | ### 🔴 Disclaimer 42 | Real-Time usage scenarios (a voice assistant, for example) require a GPU with at least ~2-4 GB of VRAM. The more VRAM you have, the larger the model you can load: transcription quality improves, but inference gets slower. 43 | 44 | [![Star History Chart](https://api.star-history.com/svg?repos=appvoid/vosper&type=Date)](https://star-history.com/#appvoid/vosper&Date) 45 | -------------------------------------------------------------------------------- /recorder.py: -------------------------------------------------------------------------------- 1 | # import required libraries 2 | import sounddevice as sd 3 | from scipy.io.wavfile import write 4 | from os import system as cmd 5 | 6 | class new: 7 | def __init__(self, waiting_time=4, filename='speaker'): 8 | # initialize recording so we can use it later 9 | self.recording = sd.rec(int(0 * 44100), samplerate=44100, channels=2) 10 | self.waiting_time = waiting_time # set a default waiting time 11 | self.filename = filename 12 | 13 | def record(self, waiting_time=4): 14 | # we set the same waiting time for the method 15 | self.waiting_time = waiting_time 16 | # set up recording 17 | self.recording = sd.rec(int(self.waiting_time * 44100), samplerate=44100, channels=2) 18 | 19 | def save(self): 20 | # save the new file 21 | write(f'{self.filename}.wav', 44100, self.recording) 22 | 23 | def stop(self): 24 | # remove the old file so previous data doesn't get mixed up 25 | cmd(f'rm {self.filename}.wav >/dev/null 2>&1') 26 | # stop recording 27 | sd.stop() 28 | # finally, we save the file 29 | self.save() 30 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | vosk 2 | scipy 3 | sounddevice 4 | setuptools-rust 5 | git+https://github.com/openai/whisper.git -------------------------------------------------------------------------------- /vosper.py: -------------------------------------------------------------------------------- 1 | # VOSK + Whisper speech recognition system 2 | '''This module uses vosk for instant user feedback as well as a VAD solution, 3 | while OpenAI Whisper handles the actual transcription.''' 4 | 5 | # libraries 6 | import os, pyaudio, whisper, recorder 7 | from vosk import SetLogLevel, Model, KaldiRecognizer 8 | SetLogLevel(-1) # mutes vosk verbosity 9 | os.system('clear') 10 | welcome_msg = '''\ \ / / _ \/ __| '_ \ / _ \ '__| 11 | \ V
/ (_) \__ \ |_) | __/ | 12 | \_/ \___/|___/ .__/ \___|_| 13 | |_| 14 | by appvoid 15 | ''' 16 | 17 | # debugging purposes 18 | def log(msg, verbosity): 19 | if verbosity: 20 | print(msg) 21 | 22 | class new: 23 | def load_vosk(self, model='small'): 24 | # load vosk model 25 | model_voice = Model(f'{os.getcwd()}/models/vosk/{model}') 26 | recognizer = KaldiRecognizer(model_voice, 16000) 27 | return recognizer 28 | 29 | def stream(self): 30 | mic = pyaudio.PyAudio() 31 | # microphone streaming 32 | 33 | '''this code is setting up an audio stream that 34 | will capture mono audio at a sample rate of 16000 Hz 35 | with 16-bit integer samples. It will capture audio 36 | in chunks of 4096 samples at a time.''' 37 | 38 | _stream = mic.open( 39 | channels=1, 40 | rate=16000, 41 | input=True, 42 | format=pyaudio.paInt16, 43 | frames_per_buffer=4096 44 | ) 45 | _stream.start_stream() 46 | os.system('clear') 47 | return _stream 48 | 49 | def __init__(self, vosk_model='small', whisper_model='small.en', waiting_time=4, filename='speaker', verbosity=True): 50 | self.verbosity = verbosity 51 | log('- loading models...', self.verbosity) 52 | self.recorder = recorder.new(waiting_time, filename=filename) 53 | self.whisper = whisper.load_model(whisper_model) 54 | self.vosk = self.load_vosk(vosk_model) 55 | self.recording_whisper = False 56 | self.filename = filename 57 | self.mic = self.stream() 58 | 59 | log(welcome_msg, self.verbosity) 60 | log(f'- waiting time: {waiting_time} seconds\n- vosk model: {vosk_model}\n- whisper model: {whisper_model}\n- recording file: {filename}', self.verbosity) 61 | 62 | def listen(self): 63 | # we get data from the microphone stream 64 | data = self.mic 65 | data = data.read(4096) 66 | 67 | # we check with the vosk recognizer whether the person stopped talking 68 | if self.vosk.AcceptWaveform(data): 69 | self.recorder.stop() # we stop recording to save cpu compute 70 | text = self.vosk.Result()[14:-3] 71 | # we also check if the input is worth the whisper gpu compute 72 | characters_threshold = 3 73 | if (len(text) > characters_threshold): 74 | text = self.whisper.transcribe(f'{self.filename}.wav')['text'].strip() 75 | # we turn off the whisper recording flag 76 | self.recording_whisper = False 77 | 78 | else: # else, we show the vosk text instead 79 | text = self.vosk.PartialResult()[17:-3] 80 | if (self.recording_whisper == False): 81 | self.recorder.stop() 82 | # we mark whisper as available for recording 83 | self.recording_whisper = True 84 | # we save 5 seconds of audio for whisper to transcribe 85 | self.recorder.record(5) 86 | 87 | # it's a simple but quite unbreakable spell 88 | # for text checking to avoid printing empty strings 89 | if text != '-' and text != '- ': 90 | return text 91 | else: 92 | return '' 93 | --------------------------------------------------------------------------------
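Note (not part of the repository): `vosper.listen()` pulls the transcript out of the recognizer output by slicing fixed character offsets (`Result()[14:-3]` and `PartialResult()[17:-3]`), which only works as long as vosk emits JSON in exactly that shape. A minimal sketch of a more robust alternative, assuming a `vosk.KaldiRecognizer` instance, parses the JSON with the standard library instead; the helper names here are hypothetical:

```
import json

def final_text(recognizer):
    # Result() returns a JSON string such as {"text": "hello world"}
    return json.loads(recognizer.Result()).get('text', '')

def partial_text(recognizer):
    # PartialResult() returns a JSON string such as {"partial": "hello wor"}
    return json.loads(recognizer.PartialResult()).get('partial', '')
```

With helpers like these, `listen()` could call `final_text(self.vosk)` and `partial_text(self.vosk)` instead of relying on character offsets, and the empty-string checks at the end of the method would keep working unchanged.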