├── .gitignore
├── README.md
├── assistant_config.mjs
├── babel.config.js
├── package-lock.json
├── package.json
├── public
│   ├── favicon.ico
│   └── index.html
├── sounds
│   ├── de
│   │   ├── hotword_answer_1.mp3
│   │   └── meme_hotword_answer.mp3
│   └── en
│       ├── hotword_answer_1.mp3
│       └── meme_hotword_answer.mp3
├── src
│   ├── App.vue
│   ├── assets
│   │   ├── cyborg_corgi.jpg
│   │   └── cyborg_corgi.webp
│   ├── background.js
│   ├── components
│   │   └── AiAssistant.vue
│   ├── google_stt.mjs
│   ├── main.js
│   ├── plugins
│   │   └── vuetify.js
│   ├── test.mjs
│   ├── tts.mjs
│   ├── voice_assistant_google.mjs
│   └── voice_assistant_vosk.mjs
├── vue.config.js
└── yarn.lock

/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | node_modules
3 | /dist
4 | 
5 | 
6 | # local env files
7 | .env
8 | .env.local
9 | .env.*.local
10 | 
11 | # Log files
12 | npm-debug.log*
13 | yarn-debug.log*
14 | yarn-error.log*
15 | pnpm-debug.log*
16 | 
17 | # Editor directories and files
18 | .idea
19 | .vscode
20 | *.suo
21 | *.ntvs*
22 | *.njsproj
23 | *.sln
24 | *.sw?
25 | 
26 | # Electron-builder output
27 | /dist_electron
28 | ./temp.wav
29 | language_models
30 | hotword_trigger.txt
31 | latest_recording.txt
32 | google_client_secret.json
33 | sounds/*/gpt_answer.mp3
34 | public/sounds
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ai-voice-assistant
2 | 
3 | Electron-based AI voice assistant utilizing Google Cloud Text-to-Speech and Speech-to-Text and OpenAI ChatGPT.
4 | On machines with enough power, you can use Vosk for completely offline voice recognition.
5 | 
6 | ## Prerequisites
7 | 
8 | - Debian-based OS (tested on Ubuntu and Raspberry Pi OS)
9 | - Node 16 (installation via nvm is recommended: https://github.com/nvm-sh/nvm)
10 | ## Project setup
11 | ```
12 | npm install --legacy-peer-deps
13 | ```
14 | 
15 | ## Google Setup
16 | - install the gcloud CLI (https://cloud.google.com/sdk/docs/install?hl=de#deb)
17 | - you need a Google account with the Cloud Text-to-Speech API enabled (https://cloud.google.com/text-to-speech/docs)
18 | - you also need the Cloud Speech-to-Text API enabled if you are not using Vosk (can be configured in assistant_config.mjs)
19 | - after enabling both, create a service account for this project and download a credentials JSON file that looks like this:
20 | 
21 | ```
22 | {
23 |   "type": "service_account",
24 |   "project_id": "{{your_project_id}}",
25 |   "private_key_id": "{{your_private_key_id}}",
26 |   "private_key": "{{your_private_key}}",
27 |   "client_email": "local-account@{{your_project_id}}.iam.gserviceaccount.com",
28 |   "client_id": "{{your_client_id}}",
29 |   "auth_uri": "https://accounts.google.com/o/oauth2/auth",
30 |   "token_uri": "https://oauth2.googleapis.com/token",
31 |   "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
32 |   "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/local-account%40{{your_project_id}}.iam.gserviceaccount.com",
33 |   "universe_domain": "googleapis.com"
34 | }
35 | 
36 | ```
37 | 
38 | - put this file in the root directory of the project and adjust your `.env` file to point to it:
39 | ```
40 | GOOGLE_APPLICATION_CREDENTIALS="/home/your-projects-directory/raspberry-ai-voice-assistant/google_client_secret.json"
41 | ```
42 | ## ChatGPT Setup
43 | - create a `.env` file with this content, replacing `{{your-api-key}}`
44 | - the key can be obtained on the OpenAI homepage (https://platform.openai.com/account/api-keys)
45 | 
46 | ```
47 | OPEN_AI_APIKEY="{{your-api-key}}"
48 | 
49 | ```
50 | 
51 | ## Language Model Setup
52 | 
53 | - the voice assistant can also be used with fully offline voice recognition
54 | - language models are loaded into RAM, so this requires a laptop or a recent Raspberry Pi
55 | - to enable Vosk, adjust `assistant_config.mjs`
56 | - download models here: https://alphacephei.com/vosk/models
57 | - create a folder `language_models` in the project root
58 | - extract the downloaded language model into it
59 | - adjust `modelPaths` in `src/voice_assistant_vosk.mjs`
60 | 
61 | ## Run the voice assistant
62 | ```
63 | npm run electron:serve
64 | ```
65 | 
66 | ## Customization
67 | 
68 | - check `assistant_config.mjs`
69 | 
70 | ```
71 | const assistantName = 'Felix';
72 | 
73 | export default {
74 |   'useLocalSpeechToText': false, // if true, uses Vosk (free); if false, uses Google Speech-to-Text
75 |   'lowMemoryVariant': false, // if useLocalSpeechToText is true, this determines whether the smaller language models are used
76 |   'language': 'de', // language used for speech-to-text and for answers from ChatGPT
77 |   'answerWordLimit': 30, // limits the requested word count of an answer from ChatGPT
78 |   'assistantName': assistantName, // you activate the voice assistant by saying "Hey, ${name}"
79 |   "memeTrigger": 'lustiges Bild', // if you say this, the assistant fetches a meme from reddit and shows it
80 |   "gptSystemMessage": `Du bist ein virtueller Sprachassistent. Dein Name ist ${assistantName}. Du gibst kurze, genaue Antworten. Das aktuelle Datum ist: ${new Date().toISOString()}\n\n`
81 | };
82 | ```
--------------------------------------------------------------------------------
/assistant_config.mjs:
--------------------------------------------------------------------------------
1 | const assistantName = 'Felix';
2 | 
3 | export default {
4 |   'useLocalSpeechToText': false, // if true, uses Vosk (free); if false, uses Google Speech-to-Text
5 |   'lowMemoryVariant': false, // if useLocalSpeechToText is true, this determines whether the smaller language models are used
6 |   'language': 'de', // language used for speech-to-text and for answers from ChatGPT
7 |   'answerWordLimit': 30, // limits the requested word count of an answer from ChatGPT
8 |   'assistantName': assistantName, // you activate the voice assistant by saying "Hey, ${name}"
9 |   "memeTrigger": 'lustiges Bild', // if you say this, the assistant fetches a meme from reddit and shows it
10 |   "gptSystemMessage": `Du bist ein virtueller Sprachassistent. Dein Name ist ${assistantName}. Du gibst kurze, genaue Antworten. Das aktuelle Datum ist: ${new Date().toISOString()}\n\n`
11 | };
12 | 
13 | // example en-US config
14 | // const assistantName = 'buddy';
15 | 
16 | // export default {
17 | //   'useLocalSpeechToText': false, // if true, uses Vosk (free); if false, uses Google Speech-to-Text
18 | //   'lowMemoryVariant': false, // if useLocalSpeechToText is true, this determines whether the smaller language models are used
19 | //   'language': 'en', // language used for speech-to-text and for answers from ChatGPT
20 | //   'answerWordLimit': 50, // limits the requested word count of an answer from ChatGPT
21 | //   'assistantName': assistantName, // you activate the voice assistant by saying "Hey, ${name}"
22 | //   "memeTrigger": 'funny image', // if you say this, the assistant fetches a meme from reddit and shows it
23 | //   "gptSystemMessage": `You are a virtual voice assistant. Your name is ${assistantName}. You give short, concrete answers.
Current date is: ${new Date().toISOString()}\n\n` 24 | // }; -------------------------------------------------------------------------------- /babel.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | presets: [ 3 | '@vue/cli-plugin-babel/preset' 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ai-voice-assistant", 3 | "version": "0.1.0", 4 | "private": true, 5 | "scripts": { 6 | "serve": "vue-cli-service serve", 7 | "build": "vue-cli-service build", 8 | "lint": "vue-cli-service lint", 9 | "electron:build": "vue-cli-service electron:build", 10 | "electron:serve": "vue-cli-service electron:serve", 11 | "postinstall": "electron-builder install-app-deps", 12 | "postuninstall": "electron-builder install-app-deps" 13 | }, 14 | "main": "background.js", 15 | "dependencies": { 16 | "@google-cloud/speech": "5.5.0", 17 | "@google-cloud/text-to-speech": "4.2.1", 18 | "audic": "3.0.1", 19 | "chatgpt": "5.2.2", 20 | "core-js": "^3.6.5", 21 | "dotenv": "16.0.3", 22 | "get-mp3-duration": "1.0.0", 23 | "isomorphic-fetch": "3.0.0", 24 | "node-fetch": "3.3.1", 25 | "node-record-lpcm16": "1.0.1", 26 | "play-sound": "1.1.5", 27 | "sound-play": "1.1.0", 28 | "systeminformation": "4.34.23", 29 | "vosk": "^0.3.39", 30 | "vue": "^2.6.11", 31 | "vuetify": "^2.6.0" 32 | }, 33 | "devDependencies": { 34 | "@vue/cli-plugin-babel": "5.0.8", 35 | "@vue/cli-plugin-eslint": "5.0.8", 36 | "@vue/cli-service": "5.0.8", 37 | "babel-eslint": "^10.1.0", 38 | "electron": "24.0.0", 39 | "electron-devtools-installer": "^3.1.0", 40 | "eslint": "7.32.0", 41 | "eslint-plugin-vue": "^6.2.2", 42 | "sass": "~1.32.0", 43 | "sass-loader": "^10.0.0", 44 | "vue-cli-plugin-electron-builder": "1.4.6", 45 | "vue-cli-plugin-vuetify": "~2.5.8", 46 | "vue-template-compiler": "^2.6.11", 47 | "vuetify-loader": "^1.7.0" 48 | }, 49 | "eslintConfig": { 50 | "root": true, 51 | "env": { 52 | "node": true 53 | }, 54 | "extends": [ 55 | "plugin:vue/essential", 56 | "eslint:recommended" 57 | ], 58 | "parserOptions": { 59 | "parser": "babel-eslint" 60 | }, 61 | "rules": {} 62 | }, 63 | "browserslist": [ 64 | "> 1%", 65 | "last 2 versions", 66 | "not dead" 67 | ] 68 | } 69 | -------------------------------------------------------------------------------- /public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oshell/raspberry-ai-voice-assistant/3e9d7e801662a492cefdaf680788365a209c8ece/public/favicon.ico -------------------------------------------------------------------------------- /public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | <%= htmlWebpackPlugin.options.title %> 9 | 10 | 11 | 12 | 13 | 16 |
17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /sounds/de/hotword_answer_1.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oshell/raspberry-ai-voice-assistant/3e9d7e801662a492cefdaf680788365a209c8ece/sounds/de/hotword_answer_1.mp3 -------------------------------------------------------------------------------- /sounds/de/meme_hotword_answer.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oshell/raspberry-ai-voice-assistant/3e9d7e801662a492cefdaf680788365a209c8ece/sounds/de/meme_hotword_answer.mp3 -------------------------------------------------------------------------------- /sounds/en/hotword_answer_1.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oshell/raspberry-ai-voice-assistant/3e9d7e801662a492cefdaf680788365a209c8ece/sounds/en/hotword_answer_1.mp3 -------------------------------------------------------------------------------- /sounds/en/meme_hotword_answer.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oshell/raspberry-ai-voice-assistant/3e9d7e801662a492cefdaf680788365a209c8ece/sounds/en/meme_hotword_answer.mp3 -------------------------------------------------------------------------------- /src/App.vue: -------------------------------------------------------------------------------- 1 | 8 | 9 | 24 | 25 | -------------------------------------------------------------------------------- /src/assets/cyborg_corgi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oshell/raspberry-ai-voice-assistant/3e9d7e801662a492cefdaf680788365a209c8ece/src/assets/cyborg_corgi.jpg -------------------------------------------------------------------------------- /src/assets/cyborg_corgi.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oshell/raspberry-ai-voice-assistant/3e9d7e801662a492cefdaf680788365a209c8ece/src/assets/cyborg_corgi.webp -------------------------------------------------------------------------------- /src/background.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | import { app, protocol, BrowserWindow, ipcMain } from 'electron' 3 | import { createProtocol } from 'vue-cli-plugin-electron-builder/lib' 4 | import installExtension, { VUEJS_DEVTOOLS } from 'electron-devtools-installer' 5 | import nodeChildProcess from 'child_process'; 6 | import si from 'systeminformation'; 7 | import 'isomorphic-fetch'; 8 | import config from '../assistant_config.mjs' 9 | 10 | const isDevelopment = process.env.NODE_ENV !== 'production' 11 | 12 | let win; 13 | 14 | // Scheme must be registered before the app is ready 15 | protocol.registerSchemesAsPrivileged([ 16 | { scheme: 'app', privileges: { secure: true, standard: true } } 17 | ]) 18 | 19 | async function createWindow() { 20 | // Create the browser window. 
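// (note: nodeIntegration/contextIsolation below give the renderer direct
// Node and ipc access, which the Vue component relies on; also be aware that
// devTools: false makes the openDevTools() call further down a no-op, so
// DevTools stay closed even in development)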
21 | win = new BrowserWindow({ 22 | webPreferences: { 23 | nodeIntegration: true, 24 | contextIsolation: false, 25 | enableRemoteModule: true, 26 | devTools: false 27 | }, 28 | show: false, 29 | fullscreen: true 30 | }) 31 | 32 | win.show(); 33 | if (process.env.WEBPACK_DEV_SERVER_URL) { 34 | // Load the url of the dev server if in development mode 35 | await win.loadURL(process.env.WEBPACK_DEV_SERVER_URL) 36 | if (!process.env.IS_TEST) win.webContents.openDevTools() 37 | } else { 38 | createProtocol('app') 39 | // Load the index.html when not in development 40 | win.loadURL('app://./index.html') 41 | } 42 | } 43 | 44 | // Quit when all windows are closed. 45 | app.on('window-all-closed', () => { 46 | // On macOS it is common for applications and their menu bar 47 | // to stay active until the user quits explicitly with Cmd + Q 48 | if (process.platform !== 'darwin') { 49 | app.quit() 50 | } 51 | }) 52 | 53 | app.on('activate', () => { 54 | // On macOS it's common to re-create a window in the app when the 55 | // dock icon is clicked and there are no other windows open. 56 | if (BrowserWindow.getAllWindows().length === 0) createWindow() 57 | }) 58 | 59 | // This method will be called when Electron has finished 60 | // initialization and is ready to create browser windows. 61 | // Some APIs can only be used after this event occurs. 62 | app.on('ready', async () => { 63 | if (isDevelopment && !process.env.IS_TEST) { 64 | // Install Vue Devtools 65 | try { 66 | await installExtension(VUEJS_DEVTOOLS) 67 | } catch (e) { 68 | console.error('Vue Devtools failed to install:', e.toString()) 69 | } 70 | } 71 | createWindow() 72 | }) 73 | 74 | // Exit cleanly on request from parent process in development mode. 75 | if (isDevelopment) { 76 | if (process.platform === 'win32') { 77 | process.on('message', (data) => { 78 | if (data === 'graceful-exit') { 79 | app.quit() 80 | } 81 | }) 82 | } else { 83 | process.on('SIGTERM', () => { 84 | app.quit() 85 | }) 86 | } 87 | } 88 | 89 | 90 | function handleNotification(data) { 91 | try { 92 | const events = JSON.parse(data); 93 | events.forEach(event => { 94 | console.log(`${event.name}: ${event.value}`); 95 | win.webContents.send(event.name, event.value); 96 | }); 97 | } catch (error) { 98 | console.log(`ERR: parsing failed: ${data}`); 99 | } 100 | } 101 | 102 | const speechToTextService = config.useLocalSpeechToText ? 
'vosk' : 'google';
103 | 
104 | const startSpeechToTextChildProcess = true;
105 | if (startSpeechToTextChildProcess) {
106 |   let script = nodeChildProcess.spawn('node', [`src/voice_assistant_${speechToTextService}.mjs`]);
107 |   script.stdout.on('data', handleNotification);
108 | 
109 |   process.on('exit', function() {
110 |     console.log('Killing child process on exit!');
111 |     script.kill();
112 |   });
113 | }
114 | const temperatureCheckInterval = 5000;
115 | 
116 | function checkCpuTemperature() {
117 |   si.cpuTemperature().then((result) => {
118 |     if (result) {
119 |       const mainTemp = result.main;
120 |       if (win && win.webContents) {
121 |         win.webContents.send('cpu', mainTemp);
122 |       }
123 |     }
124 |   });
125 | }
126 | 
127 | checkCpuTemperature();
128 | setInterval(checkCpuTemperature, temperatureCheckInterval);
--------------------------------------------------------------------------------
/src/components/AiAssistant.vue:
--------------------------------------------------------------------------------
1 | 
60 | 
61 | 
193 | 
194 | 
--------------------------------------------------------------------------------
/src/google_stt.mjs:
--------------------------------------------------------------------------------
1 | import recorder from 'node-record-lpcm16';
2 | import speech from '@google-cloud/speech';
3 | 
4 | // Creates a client
5 | const client = new speech.SpeechClient();
6 | 
7 | const encoding = 'LINEAR16';
8 | const sampleRateHertz = 16000;
9 | let request = {};
10 | let callback = () => {};
11 | let recognizeStream = null;
12 | 
13 | 
14 | function startGoogleSpeechToText(func, langCode) {
15 |   request = {
16 |     config: {
17 |       encoding: encoding,
18 |       sampleRateHertz: sampleRateHertz,
19 |       languageCode: langCode,
20 |     },
21 |     interimResults: false, // If you want interim results, set this to true
22 |   };
23 |   callback = func;
24 | 
25 |   recognizeStream = client
26 |     .streamingRecognize(request)
27 |     .on('error', console.error)
28 |     .on('data', data => {
29 |       const success = data.results[0] && data.results[0].alternatives[0];
30 |       if (success) {
31 |         const result = data.results[0].alternatives[0].transcript;
32 |         callback(result);
33 |       }
34 |     });
35 |   // Start recording and send the microphone input to the Speech API.
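  // The recorder captures raw LINEAR16 PCM at 16 kHz from the default
  // microphone and pipes it into the streamingRecognize duplex stream above;
  // because interimResults is false, the 'data' handler only fires for final
  // transcripts, each of which is handed to the registered callback.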
36 |   // Ensure SoX is installed, see https://www.npmjs.com/package/node-record-lpcm16#dependencies
37 |   recorder
38 |     .record({
39 |       sampleRateHertz: sampleRateHertz,
40 |       threshold: 0,
41 |       verbose: false,
42 |       recordProgram: 'rec', // Try also "arecord" or "sox"
43 |       silence: '1.0',
44 |     })
45 |     .stream()
46 |     .on('error', console.error)
47 |     .pipe(recognizeStream);
48 | }
49 | 
50 | export default startGoogleSpeechToText;
51 | 
--------------------------------------------------------------------------------
/src/main.js:
--------------------------------------------------------------------------------
1 | import Vue from 'vue'
2 | import App from './App.vue'
3 | import vuetify from './plugins/vuetify'
4 | 
5 | Vue.config.productionTip = false
6 | 
7 | new Vue({
8 |   vuetify,
9 |   render: h => h(App)
10 | }).$mount('#app')
11 | 
--------------------------------------------------------------------------------
/src/plugins/vuetify.js:
--------------------------------------------------------------------------------
1 | import Vue from 'vue';
2 | import Vuetify from 'vuetify/lib/framework';
3 | 
4 | Vue.use(Vuetify);
5 | 
6 | export default new Vuetify({
7 |   theme: {
8 |     dark: true // Vuetify 2 option; `defaultTheme` only exists in Vuetify 3
9 |   }
10 | });
11 | 
--------------------------------------------------------------------------------
/src/test.mjs:
--------------------------------------------------------------------------------
1 | import fs from 'fs';
2 | import Audic from 'audic';
3 | import getMP3Duration from 'get-mp3-duration';
4 | const lang = 'en';
5 | 
6 | 
7 | function triggerEvent(name, value) {
8 |   const event = { name, value };
9 |   const events = [event];
10 |   const data = JSON.stringify(events);
11 |   console.log(data);
12 | }
13 | 
14 | 
15 | async function playSound(name) {
16 |   const mp3File = `./sounds/${lang}/${name}.mp3`;
17 |   const buffer = fs.readFileSync(mp3File);
18 |   const duration = getMP3Duration(buffer);
19 |   const audic = new Audic(mp3File);
20 |   // the 'ended' event does not fire reliably;
21 |   // the workaround is to read the duration of the mp3 in ms,
22 |   // then, when the sound starts playing, set a timeout
23 |   // that triggers the end event of the tts
24 |   audic.addEventListener('playing', () => {
25 |     setTimeout(() => {
26 |       triggerEvent('tts_end', true);
27 |       process.exit();
28 |     }, duration);
29 |   });
30 |   audic.play();
31 | }
32 | 
33 | playSound('hotword_answer_1');
--------------------------------------------------------------------------------
/src/tts.mjs:
--------------------------------------------------------------------------------
1 | import fs from 'fs';
2 | import util from "util";
3 | import dotenv from "dotenv";
4 | import textToSpeech from "@google-cloud/text-to-speech";
5 | 
6 | dotenv.config();
7 | 
8 | const ttsApi = new textToSpeech.TextToSpeechClient();
9 | 
10 | const languageMapping = {
11 |   en: 'en-US',
12 |   de: 'de-DE'
13 | }
14 | 
15 | const voices = {
16 |   de: 'de-DE-Neural2-B', // A,C,D
17 |   en: 'en-US-Neural2-I' // A,D,I
18 | }
19 | 
20 | async function synthesizeSpeech(text, name, lang) {
21 |   // Construct the request
22 |   const request = {
23 |     input: { text: text },
24 |     // Select the language and SSML voice gender (optional)
25 |     voice: { languageCode: languageMapping[lang], name: voices[lang], ssmlGender: 'MALE' },
26 |     // select the type of audio encoding
27 |     audioConfig: { audioEncoding: 'MP3' },
28 |   };
29 | 
30 |   // Performs the text-to-speech request
31 |   const [response] = await ttsApi.synthesizeSpeech(request);
32 |   // Write the binary audio content to a local file
33 |   const writeFile = util.promisify(fs.writeFile);
34 | await writeFile(`./sounds/${lang}/${name}.mp3`, response.audioContent, 'binary'); 35 | console.log('Done!'); 36 | } 37 | 38 | async function listVoices(lang) { 39 | const [result] = await ttsApi.listVoices({}); 40 | const voices = result.voices; 41 | 42 | console.log('Voices:'); 43 | voices.forEach(voice => { 44 | if (!voice.name.includes('Neural')) return; 45 | if (!voice.languageCodes.includes(languageMapping[lang])) return; 46 | console.log(`Name: ${voice.name}`); 47 | console.log(` SSML Voice Gender: ${voice.ssmlGender}`); 48 | console.log(` Natural Sample Rate Hertz: ${voice.naturalSampleRateHertz}`); 49 | console.log(' Supported languages:'); 50 | voice.languageCodes.forEach(languageCode => { 51 | console.log(` ${languageCode}`); 52 | }); 53 | }); 54 | } 55 | 56 | const text = "Spiel 'Die da', von den Fantastischen Vier"; 57 | const name = 'temp_4'; 58 | const lang = 'de'; 59 | 60 | synthesizeSpeech(text, name, lang); 61 | -------------------------------------------------------------------------------- /src/voice_assistant_google.mjs: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import 'isomorphic-fetch'; 3 | import util from "util"; 4 | import dotenv from "dotenv"; 5 | import { ChatGPTAPI } from 'chatgpt'; 6 | import textToSpeech from "@google-cloud/text-to-speech"; 7 | import Audic from 'audic'; 8 | import getMP3Duration from 'get-mp3-duration'; 9 | import config from '../assistant_config.mjs'; 10 | import startGoogleSpeechToText from './google_stt.mjs'; 11 | 12 | dotenv.config(); 13 | 14 | const lang = config.language; 15 | const answerWordLimit = config.answerWordLimit; 16 | const assistantName = config.assistantName; 17 | const memeTrigger = config.memeTrigger; 18 | const systemMessage = config.gptSystemMessage; 19 | 20 | let lastRequestId = null; 21 | const eventTimeoutMs = 500; 22 | const debug = false; 23 | 24 | const languageMapping = { 25 | en: 'en-US', 26 | de: 'de-DE' 27 | } 28 | 29 | const voices = { 30 | de: 'de-DE-Neural2-B',//A,C,D 31 | en: 'en-US-Neural2-I'//A,D,I 32 | } 33 | 34 | const messagePostFix = { 35 | en: `Answer in less than ${answerWordLimit} words if possible.`, 36 | de: `Antworte in unter ${answerWordLimit} Wörtern, wenn möglich.` 37 | } 38 | 39 | const continueMatches = { 40 | de: 'nochmal', 41 | en: 'next' 42 | } 43 | 44 | const chatGPTAPI = new ChatGPTAPI({ apiKey: process.env.OPEN_AI_APIKEY, systemMessage }) 45 | const ttsApi = new textToSpeech.TextToSpeechClient(); 46 | let memeLoop = false; 47 | 48 | function sleep(ms) { 49 | return new Promise(resolve => setTimeout(resolve, ms)); 50 | } 51 | 52 | function triggerEvent(name, value) { 53 | const event = { name, value }; 54 | const events = [event]; 55 | const data = JSON.stringify(events); 56 | console.log(data); 57 | } 58 | 59 | async function synthesizeSpeech(text) { 60 | // Construct the request 61 | const request = { 62 | input: { text: text }, 63 | // Select the language and SSML voice gender (optional) 64 | voice: { languageCode: languageMapping[lang], name: voices[lang], ssmlGender: 'MALE' }, 65 | // select the type of audio encoding 66 | audioConfig: { audioEncoding: 'MP3' }, 67 | }; 68 | 69 | // Performs the text-to-speech request 70 | const [response] = await ttsApi.synthesizeSpeech(request); 71 | // Write the binary audio content to a local file 72 | const writeFile = util.promisify(fs.writeFile); 73 | await writeFile(`./sounds/${lang}/gpt_answer.mp3`, response.audioContent, 'binary'); 74 | await sleep(eventTimeoutMs); 
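  // (the short sleep above appears to give the renderer a moment to process
  // pending events before the caller emits 'tts' and plays
  // sounds/<lang>/gpt_answer.mp3 via playSound - an assumption, as the delay
  // is not explained in the source)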
75 | }
76 | 
77 | async function askChatGpt(message) {
78 |   const opts = {};
79 | 
80 |   message = `${message}. ${messagePostFix[lang]}`;
81 |   if (lastRequestId) {
82 |     opts.parentMessageId = lastRequestId;
83 |   }
84 |   const response = await chatGPTAPI.sendMessage(message, opts);
85 |   lastRequestId = response.id;
86 |   return response.text;
87 | }
88 | 
89 | 
90 | async function fetchMeme() {
91 |   const memeApi = 'https://meme-api.com/gimme';
92 |   const response = await fetch(memeApi);
93 |   const result = await response.json();
94 | 
95 |   triggerEvent('meme', result.url);
96 | }
97 | 
98 | let active = false;
99 | let disabled = false;
100 | const minimumDisabledMs = 5000;
101 | 
102 | const voiceRecognition = {
103 |   hotwords: {
104 |     activate: [],
105 |     activateMeme: ''
106 |   },
107 |   initHotwords: () => {
108 |     const prefixes = ['hey ', 'he ', 'the ', 'hi ', '']
109 |     voiceRecognition.hotwords.activate = prefixes.map(prefix => {
110 |       return `${prefix}${assistantName.toLocaleLowerCase()}`;
111 |     });
112 |     voiceRecognition.hotwords.activateMeme = memeTrigger;
113 |   },
114 |   googleSttCallback: (text) => {
115 |     if (text && debug) {
116 |       triggerEvent('google_stt_debug', text);
117 |       return;
118 |     }
119 |     voiceRecognition.checkHotword(text);
120 |     voiceRecognition.handleInput(text);
121 |     voiceRecognition.checkStop(text);
122 |   },
123 |   start: () => {
124 |     voiceRecognition.initHotwords();
125 |     const googleLang = languageMapping[lang];
126 |     startGoogleSpeechToText(voiceRecognition.googleSttCallback, googleLang);
127 |   },
128 |   handleInput: async (text) => {
129 |     if (!active || disabled) return;
130 |     if (text.includes('stop') && text.length <= 10) {
131 |       triggerEvent('stop', true);
132 |       active = false;
133 |       return;
134 |     }
135 | 
136 |     text = text.trim();
137 |     let inputTooShort = text && text.length < 6;
138 |     let notEnoughWords = text.split(' ').length < 3;
139 |     let containsHotWord = text.includes(assistantName) && text.length < 12;
140 |     if (!text || inputTooShort || notEnoughWords || containsHotWord) {
141 |       return;
142 |     }
143 | 
144 |     if (text.includes('stop') && text.length <= 20) {
145 |       triggerEvent('stop', true);
146 |       active = false;
147 |       return;
148 |     }
149 | 
150 |     let questionEvent = {
151 |       name: 'question',
152 |       value: text
153 |     };
154 | 
155 |     const gptStartEvent = {
156 |       name: 'gpt_start',
157 |       value: true
158 |     };
159 | 
160 |     let events = [questionEvent, gptStartEvent];
161 |     let data = JSON.stringify(events);
162 |     console.log(data);
163 |     active = false;
164 | 
165 |     const answer = await askChatGpt(text);
166 | 
167 |     triggerEvent('answer', answer);
168 |     await synthesizeSpeech(answer);
169 |     triggerEvent('tts', true);
170 |     playSound('gpt_answer', true);
171 | 
172 |   },
173 |   checkHotword: async (text) => {
174 |     if (active || disabled) return;
175 | 
176 |     text = normalizeResult(text);
177 |     let match = false;
178 |     voiceRecognition.hotwords.activate.forEach(hotword => {
179 |       if (text.includes(hotword)) {
180 |         match = true;
181 |       }
182 |     });
183 | 
184 |     if (match) {
185 |       active = true;
186 |       setTimeout(() => {
187 |         disabled = false;
188 |       }, eventTimeoutMs);
189 |       triggerEvent('voice_input_start', true);
190 |     }
191 | 
192 |     let memeMatch = false;
193 |     if (text.includes(voiceRecognition.hotwords.activateMeme)) {
194 |       memeMatch = true;
195 |     }
196 | 
197 |     if (memeMatch) {
198 |       memeLoop = true;
199 |       disabled = true;
200 |       triggerEvent('meme_hotword', true);
201 |       fetchMeme();
202 |       playSound('meme_hotword_answer', false);
203 |       setTimeout(() => {
204 |         disabled =
false; 205 | }, minimumDisabledMs) 206 | } 207 | 208 | let memeContinueMatch = text.includes(continueMatches[lang]); 209 | 210 | if (memeContinueMatch && memeLoop) { 211 | disabled = true; 212 | triggerEvent('meme', true); 213 | fetchMeme(); 214 | setTimeout(() => { 215 | disabled = false; 216 | }, 1000); 217 | } 218 | 219 | let stopMatch = text.includes('stop'); 220 | 221 | if (stopMatch && memeLoop) { 222 | memeLoop = false; 223 | triggerEvent('meme_stop', true); 224 | } 225 | }, 226 | checkStop: async (text) => { 227 | if (!disabled) return; 228 | text = normalizeResult(text); 229 | let match = false; 230 | if (text.includes('stop')) { 231 | match = true; 232 | } 233 | 234 | if (match) { 235 | active = false; 236 | disabled = false; 237 | triggerEvent('stop', true); 238 | } 239 | } 240 | } 241 | 242 | async function playSound(name) { 243 | disabled = true; 244 | const mp3File = `./sounds/${lang}/${name}.mp3`; 245 | const buffer = fs.readFileSync(mp3File); 246 | const duration = getMP3Duration(buffer); 247 | const audic = new Audic(mp3File); 248 | // ended event does not work correctly 249 | // workaround is getting duration of mp3 in ms 250 | // then when sound starts playing we set timeout 251 | // to trigger end event of tts 252 | audic.addEventListener('playing', () => { 253 | setTimeout(() => { 254 | disabled = false; 255 | triggerEvent('tts_end', true); 256 | }, duration); 257 | }); 258 | audic.play(); 259 | } 260 | 261 | function normalizeResult(text) { 262 | text = text.trim(); 263 | return text.toLocaleLowerCase(); 264 | } 265 | 266 | console.log(JSON.stringify([{ name: 'LOG:', value: 'Google Speech-to-Text started!' }])); 267 | voiceRecognition.start(); 268 | -------------------------------------------------------------------------------- /src/voice_assistant_vosk.mjs: -------------------------------------------------------------------------------- 1 | import vosk from 'vosk'; 2 | import fs from 'fs'; 3 | import mic from 'mic'; 4 | import { fileURLToPath } from 'url'; 5 | import path from 'path'; 6 | import 'isomorphic-fetch'; 7 | import util from "util"; 8 | import dotenv from "dotenv"; 9 | import { ChatGPTAPI } from 'chatgpt'; 10 | import textToSpeech from "@google-cloud/text-to-speech"; 11 | import Audic from 'audic'; 12 | import getMP3Duration from 'get-mp3-duration'; 13 | import config from '../assistant_config.mjs'; 14 | 15 | dotenv.config(); 16 | const __filename = fileURLToPath(import.meta.url); 17 | const __dirname = path.dirname(__filename); 18 | 19 | const lang = config.language; 20 | const answerWordLimit = config.answerWordLimit; 21 | const lowMemoryVariant = config.lowMemoryVariant; 22 | const assistantName = config.assistantName; 23 | const memeTrigger = config.memeTrigger; 24 | const systemMessage = config.gptSystemMessage; 25 | 26 | let lastRequestId = null; 27 | const eventTimeoutMs = 200; 28 | 29 | const debug = false; 30 | 31 | let modelPaths = { 32 | de: __dirname + "/../language_models/vosk-model-de-0.21", 33 | en: __dirname + "/../language_models/vosk-model-en-us-0.22" 34 | } 35 | 36 | if (lowMemoryVariant) { 37 | modelPaths = { 38 | de: __dirname + "/../language_models/vosk-model-small-de-0.15", 39 | en: __dirname + "/../language_models/vosk-model-en-us-0.22-lgraph" 40 | } 41 | } 42 | 43 | const languageMapping = { 44 | en: 'en-US', 45 | de: 'de-DE' 46 | } 47 | 48 | const voices = { 49 | de: 'de-DE-Neural2-B',//A,C,D 50 | en: 'en-US-Neural2-I'//A,D,I 51 | } 52 | 53 | const messagePostFix = { 54 | en: `Answer in less than ${answerWordLimit} words if 
possible.`, 55 | de: `Antworte in unter ${answerWordLimit} Wörtern, wenn möglich.` 56 | } 57 | 58 | const continueMatches = { 59 | de: 'nochmal', 60 | en: 'next' 61 | } 62 | 63 | const MODEL_PATH = modelPaths[lang]; 64 | const SAMPLE_RATE = 16000; 65 | 66 | if (!fs.existsSync(MODEL_PATH)) { 67 | console.log("Please download the model from https://alphacephei.com/vosk/models and unpack as " + MODEL_PATH + " in the current folder.") 68 | process.exit() 69 | } 70 | 71 | vosk.setLogLevel(0); 72 | const model = new vosk.Model(MODEL_PATH); 73 | let rec = new vosk.Recognizer({ model: model, sampleRate: SAMPLE_RATE }); 74 | 75 | 76 | const chatGPTAPI = new ChatGPTAPI({ apiKey: process.env.OPEN_AI_APIKEY, systemMessage }) 77 | const ttsApi = new textToSpeech.TextToSpeechClient(); 78 | let memeLoop = false; 79 | 80 | function sleep(ms) { 81 | return new Promise(resolve => setTimeout(resolve, ms)); 82 | } 83 | 84 | function triggerEvent(name, value) { 85 | const event = { name, value }; 86 | const events = [event]; 87 | const data = JSON.stringify(events); 88 | console.log(data); 89 | } 90 | 91 | async function synthesizeSpeech(text) { 92 | // Construct the request 93 | const request = { 94 | input: { text: text }, 95 | // Select the language and SSML voice gender (optional) 96 | voice: { languageCode: languageMapping[lang], name: voices[lang], ssmlGender: 'MALE' }, 97 | // select the type of audio encoding 98 | audioConfig: { audioEncoding: 'MP3' }, 99 | }; 100 | 101 | // Performs the text-to-speech request 102 | const [response] = await ttsApi.synthesizeSpeech(request); 103 | // Write the binary audio content to a local file 104 | const writeFile = util.promisify(fs.writeFile); 105 | await writeFile(`./sounds/${lang}/gpt_answer.mp3`, response.audioContent, 'binary'); 106 | await sleep(eventTimeoutMs); 107 | } 108 | 109 | async function askChatGpt(message) { 110 | const opts = {}; 111 | 112 | message = `${message}. 
${messagePostFix[lang]}`;
113 |   if (lastRequestId) {
114 |     opts.parentMessageId = lastRequestId;
115 |   }
116 |   const response = await chatGPTAPI.sendMessage(message, opts);
117 |   lastRequestId = response.id;
118 |   return response.text;
119 | }
120 | 
121 | 
122 | async function fetchMeme() {
123 |   const memeApi = 'https://meme-api.com/gimme';
124 |   const response = await fetch(memeApi);
125 |   const result = await response.json();
126 | 
127 |   triggerEvent('meme', result.url);
128 | }
129 | 
130 | const micInstance = mic({
131 |   rate: String(SAMPLE_RATE),
132 |   channels: '1',
133 |   debug: false,
134 |   device: 'default',
135 | });
136 | 
137 | let active = false;
138 | let disabled = false;
139 | let recordingCache = '';
140 | let cacheCounter = 0;
141 | const minimumDisabledMs = 5000;
142 | const maxAttemptsRecording = 5;
143 | 
144 | const voiceRecognition = {
145 |   hotwords: {
146 |     activate: [],
147 |     activateMeme: ''
148 |   },
149 |   initHotwords: () => {
150 |     const prefixes = ['hey', 'he', 'the']
151 |     voiceRecognition.hotwords.activate = prefixes.map(prefix => {
152 |       return `${prefix} ${assistantName.toLocaleLowerCase()}`; // Vosk output is lowercase, so the hotword must be too (as in the Google variant)
153 |     });
154 |     voiceRecognition.hotwords.activateMeme = memeTrigger;
155 |   },
156 |   start: () => {
157 |     voiceRecognition.initHotwords();
158 |     const micInputStream = micInstance.getAudioStream();
159 | 
160 |     micInputStream.on('data', data => {
161 |       voiceRecognition.checkHotword(data);
162 |       voiceRecognition.handleInput(data);
163 |       voiceRecognition.checkStop(data);
164 |     });
165 | 
166 |     micInputStream.on('audioProcessExitComplete', function () {
167 |       console.log(rec.finalResult());
168 |       rec.free();
169 |       model.free();
170 |     });
171 | 
172 |     process.on('SIGINT', function () {
173 |       console.log("\nStopping");
174 |       micInstance.stop();
175 |     });
176 | 
177 |     micInstance.start();
178 |   },
179 |   handleInput: async (data) => {
180 |     if (disabled) {
181 |       rec.reset();
182 |     }
183 |     if (!active || disabled) return;
184 |     const isSilent = rec.acceptWaveform(data);
185 | 
186 |     let isFinalAttempt = false;
187 |     let result = rec.partialResult();
188 | 
189 |     if (result.partial.includes('stop') && result.partial.length <= 10) {
190 |       triggerEvent('stop', true);
191 |       rec.reset();
192 |       active = false;
193 |       return;
194 |     }
195 | 
196 |     let inputTooShort = result.partial && result.partial.length < 6;
197 |     let notEnoughWords = result.partial.split(' ').length < 3;
198 |     if (!result.partial || inputTooShort || notEnoughWords) {
199 |       return;
200 |     }
201 |     if (result.partial === recordingCache) {
202 |       cacheCounter++;
203 |     } else {
204 |       recordingCache = result.partial;
205 |       cacheCounter = 0;
206 |     }
207 | 
208 |     if (cacheCounter > maxAttemptsRecording) {
209 |       isFinalAttempt = true;
210 |       result = rec.finalResult();
211 |     }
212 | 
213 | 
214 |     if (isSilent || isFinalAttempt) {
215 |       result = isFinalAttempt ?
result : rec.result(); 216 | result = normalizeResult(result); 217 | 218 | if (result.text.includes('stop') && result.text.length <= 20) { 219 | triggerEvent('stop', true); 220 | rec.reset(); 221 | active = false; 222 | return; 223 | } 224 | 225 | 226 | if (result.text && debug) { 227 | triggerEvent('voice_input_debug', result.text) 228 | } 229 | 230 | let questionEvent = { 231 | name: 'question', 232 | value: result.text 233 | }; 234 | 235 | const gptStartEvent = { 236 | name: 'gpt_start', 237 | value: true 238 | }; 239 | 240 | let events = [questionEvent, gptStartEvent]; 241 | data = JSON.stringify(events); 242 | console.log(data); 243 | active = false; 244 | 245 | const answer = await askChatGpt(result.text); 246 | 247 | triggerEvent('answer', answer); 248 | await synthesizeSpeech(answer); 249 | triggerEvent('tts', true); 250 | playSound('gpt_answer', true); 251 | } else { 252 | if (result.partial && debug) { 253 | triggerEvent('voice_input_partial', result.partial) 254 | } 255 | 256 | } 257 | }, 258 | checkHotword: async (data) => { 259 | if (disabled) { 260 | rec.reset(); 261 | } 262 | if (active || disabled) return; 263 | let result = ''; 264 | if (rec.acceptWaveform(data)) { 265 | result = rec.result(); 266 | } else { 267 | result = rec.partialResult(); 268 | result.text = result.partial; 269 | } 270 | 271 | result = normalizeResult(result); 272 | let match = false; 273 | voiceRecognition.hotwords.activate.forEach(hotword => { 274 | if (result.text.includes(hotword)) { 275 | match = true; 276 | } 277 | }); 278 | 279 | if (result.text && debug) { 280 | triggerEvent('voice_input_hotword', result.text); 281 | } 282 | 283 | if (match) { 284 | rec.reset(); 285 | active = true; 286 | disabled = false; 287 | triggerEvent('voice_input_start', true); 288 | } 289 | 290 | let memeMatch = false; 291 | if (result.text.includes(voiceRecognition.hotwords.activateMeme)) { 292 | memeMatch = true; 293 | } 294 | 295 | if (memeMatch) { 296 | memeLoop = true; 297 | rec.reset(); 298 | disabled = true; 299 | triggerEvent('meme_hotword', true); 300 | fetchMeme(); 301 | playSound('meme_hotword_answer', false); 302 | setTimeout(() => { 303 | disabled = false; 304 | }, minimumDisabledMs) 305 | } 306 | 307 | let memeContinueMatch = result.text.includes(continueMatches[lang]); 308 | 309 | if (memeContinueMatch && memeLoop) { 310 | disabled = true; 311 | triggerEvent('meme', true); 312 | fetchMeme(); 313 | setTimeout(() => { 314 | disabled = false; 315 | }, 1000); 316 | } 317 | 318 | let stopMatch = result.text.includes('stop'); 319 | 320 | if (stopMatch && memeLoop) { 321 | memeLoop = false; 322 | rec.reset(); 323 | triggerEvent('meme_stop', true); 324 | } 325 | }, 326 | checkStop: async (data) => { 327 | if (!disabled) return; 328 | let result = ''; 329 | if (rec.acceptWaveform(data)) { 330 | result = rec.result(); 331 | } else { 332 | result = rec.partialResult(); 333 | result.text = result.partial; 334 | } 335 | 336 | result = normalizeResult(result); 337 | let match = false; 338 | if (result.text.includes('stop')) { 339 | match = true; 340 | } 341 | 342 | if (match) { 343 | rec.reset(); 344 | active = false; 345 | disabled = false; 346 | triggerEvent('stop', true); 347 | } 348 | } 349 | } 350 | 351 | async function playSound(name) { 352 | disabled = true; 353 | const mp3File = `./sounds/${lang}/${name}.mp3`; 354 | const buffer = fs.readFileSync(mp3File); 355 | const duration = getMP3Duration(buffer); 356 | const audic = new Audic(mp3File); 357 | // ended event does not work correctly 358 | // workaround is 
getting duration of mp3 in ms
359 |   // then when sound starts playing we set timeout
360 |   // to trigger end event of tts
361 |   audic.addEventListener('playing', () => {
362 |     setTimeout(() => {
363 |       disabled = false;
364 |       triggerEvent('tts_end', true);
365 |     }, duration);
366 |   });
367 |   audic.play();
368 | }
369 | 
370 | function normalizeResult(result) {
371 |   if (lang === 'de') {
372 |     if (result.text.startsWith('einen')) {
373 |       result.text = result.text === 'einen' ? '' : result.text.substring(6); // strip the "einen " prefix (6 chars)
374 |     }
375 | 
376 |     result.text = result.text.replace('wie kann ich helfen', '');
377 |   }
378 |   if (lang === 'en') {
379 |     if (result.text.startsWith('a ')) {
380 |       result.text = result.text.substring(2); // strip the "a " prefix (2 chars)
381 |     }
382 |     if (result.text.startsWith('please')) {
383 |       result.text = result.text === 'please' ? '' : result.text.substring(7);
384 |     }
385 |     if (result.text.startsWith('the')) {
386 |       result.text = result.text === 'the' ? '' : result.text.substring(4);
387 |     }
388 | 
389 |     result.text = result.text.replace('how can i help', ''); // Vosk output is lowercase
390 |   }
391 | 
392 |   result.text = result.text.trim();
393 | 
394 |   return result;
395 | }
396 | 
397 | console.log(JSON.stringify([{ name: 'LOG:', value: 'Vosk Speech-to-Text started!' }]));
398 | voiceRecognition.start();
399 | 
--------------------------------------------------------------------------------
/vue.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   transpileDependencies: [
3 |     'vuetify'
4 |   ],
5 |   pluginOptions: {
6 |     electronBuilder: {
7 |       nodeIntegration: true
8 |     }
9 |   }
10 | }
11 | 
--------------------------------------------------------------------------------
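Note on the renderer side: the script blocks of src/App.vue and src/components/AiAssistant.vue did not survive this dump. Based on the channels that background.js forwards via win.webContents.send ('question', 'answer', 'tts', 'tts_end', 'meme', 'cpu', 'stop', ...), the component presumably subscribes roughly as in the following sketch (hypothetical code, not the actual component):

```js
// Hypothetical sketch of the AiAssistant component's script section; only the
// channel names are taken from the triggerEvent/webContents.send calls above.
import { ipcRenderer } from 'electron'; // available because nodeIntegration is enabled

export default {
  name: 'AiAssistant',
  data: () => ({ question: '', answer: '', memeUrl: '', cpuTemp: null, speaking: false }),
  created() {
    ipcRenderer.on('question', (_event, text) => { this.question = text; });
    ipcRenderer.on('answer', (_event, text) => { this.answer = text; });
    ipcRenderer.on('tts', () => { this.speaking = true; });
    ipcRenderer.on('tts_end', () => { this.speaking = false; });
    // 'meme' carries a url from fetchMeme, but is also sent with `true`
    // when the meme loop continues, hence the type check
    ipcRenderer.on('meme', (_event, url) => { if (typeof url === 'string') this.memeUrl = url; });
    ipcRenderer.on('cpu', (_event, temp) => { this.cpuTemp = temp; });
  }
};
```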