├── .gitignore
├── README.md
├── assistant_config.mjs
├── babel.config.js
├── package-lock.json
├── package.json
├── public
│   ├── favicon.ico
│   └── index.html
├── sounds
│   ├── de
│   │   ├── hotword_answer_1.mp3
│   │   └── meme_hotword_answer.mp3
│   └── en
│       ├── hotword_answer_1.mp3
│       └── meme_hotword_answer.mp3
├── src
│   ├── App.vue
│   ├── assets
│   │   ├── cyborg_corgi.jpg
│   │   └── cyborg_corgi.webp
│   ├── background.js
│   ├── components
│   │   └── AiAssistant.vue
│   ├── google_stt.mjs
│   ├── main.js
│   ├── plugins
│   │   └── vuetify.js
│   ├── test.mjs
│   ├── tts.mjs
│   ├── voice_assistant_google.mjs
│   └── voice_assistant_vosk.mjs
├── vue.config.js
└── yarn.lock
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | node_modules
3 | /dist
4 |
5 |
6 | # local env files
7 | .env
8 | .env.local
9 | .env.*.local
10 |
11 | # Log files
12 | npm-debug.log*
13 | yarn-debug.log*
14 | yarn-error.log*
15 | pnpm-debug.log*
16 |
17 | # Editor directories and files
18 | .idea
19 | .vscode
20 | *.suo
21 | *.ntvs*
22 | *.njsproj
23 | *.sln
24 | *.sw?
25 |
26 | # Electron-builder output
27 | /dist_electron
28 | temp.wav
29 | language_models
30 | hotword_trigger.txt
31 | latest_recording.txt
32 | google_client_secret.json
33 | sounds/*/gpt_answer.mp3
34 | public/sounds
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ai-voice-assistant
2 |
3 | Electron-based AI voice assistant utilizing Google Cloud Text-to-Speech and Speech-to-Text and OpenAI ChatGPT.
4 | On machines with enough power, you can use Vosk for fully offline voice recognition.
5 |
6 | ## Prerequisites
7 |
8 | - Debian-based OS (tested on Ubuntu and Raspberry Pi OS)
9 | - Node 16 (install via nvm recommended: https://github.com/nvm-sh/nvm)
10 | ## Project setup
11 | ```
12 | npm install --legacy-peer-deps
13 | ```
14 |
15 | ## Google Setup
16 | - install gcloud cli (https://cloud.google.com/sdk/docs/install?hl=de#deb)
17 | - you need a Google account with the Cloud Text-to-Speech API enabled (https://cloud.google.com/text-to-speech/docs)
18 | - you also need the Cloud Speech-to-Text API enabled if you are not using Vosk (configurable in `assistant_config.mjs`)
19 | - after enabling both, you need to create a service account for this project and download a credentials JSON file that looks like this:
20 |
21 | ```
22 | {
23 | "type": "service_account",
24 | "project_id": "{{your_project_id}}",
25 | "private_key_id": "{{your_private_key_id}}",
26 | "private_key": "{{your_private_key}}",
27 | "client_email": "local-account@{{your_project_id}}.iam.gserviceaccount.com",
28 | "client_id": "{{your_client_id}}",
29 | "auth_uri": "https://accounts.google.com/o/oauth2/auth",
30 | "token_uri": "https://oauth2.googleapis.com/token",
31 | "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
32 | "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/local-account%40{{your_project_id}}.iam.gserviceaccount.com",
33 | "universe_domain": "googleapis.com"
34 | }
35 |
36 | ```
37 |
38 | - put this file in the root directory of the project and adjust your .env file to point to this credentials file
39 | ```
40 | GOOGLE_APPLICATION_CREDENTIALS="/home/your-projects-directory/raspberry-ai-voice-assistant/google_client_secret.json"
41 | ```
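
To verify that the credentials are picked up, you can run a small one-off Node script (a minimal sketch; the file name is hypothetical, and `listVoices` is the same call used in `src/tts.mjs`):

```
// check_google_credentials.mjs (hypothetical helper, not part of the repo)
import dotenv from "dotenv";
import textToSpeech from "@google-cloud/text-to-speech";

dotenv.config(); // makes GOOGLE_APPLICATION_CREDENTIALS from .env visible to the client

const client = new textToSpeech.TextToSpeechClient();
const [result] = await client.listVoices({});
console.log(`Credentials OK, ${result.voices.length} voices available`);
```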
42 | ## ChatGPT Setup
43 | - create a `.env` file with this content, replacing `{{your-api-key}}` with your API key
44 | - the key can be obtained on the OpenAI homepage (https://platform.openai.com/account/api-keys)
45 |
46 | ```
47 | OPEN_AI_APIKEY="{{your-api-key}}"
48 |
49 | ```
50 |
51 | ## Language Model Setup
52 |
53 | - the voice assistant can be used with fully offline voice recognition
54 | - language models are loaded into RAM, so this requires a laptop or a recent Raspberry Pi
55 | - to enable Vosk, adjust `assistant_config.mjs`
56 | - download models here: https://alphacephei.com/vosk/models
57 | - create a folder `language_models` in the project root
58 | - extract the downloaded language model in there
59 | - adjust `modelPaths` in `src/voice_assistant_vosk.mjs` (see the snippet below)
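
For reference, these are the default `modelPaths` from `src/voice_assistant_vosk.mjs` (`lowMemoryVariant: true` switches to the smaller models):

```
let modelPaths = {
  de: __dirname + "/../language_models/vosk-model-de-0.21",
  en: __dirname + "/../language_models/vosk-model-en-us-0.22"
}
```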
60 |
61 | ## Run the voice assistant
62 | ```
63 | npm run electron:serve
64 | ```
65 |
66 | ## Customization
67 |
68 | - check `assistant_config.mjs`
69 |
70 | ```
71 | const assistantName = 'Felix';
72 |
73 | export default {
74 | 'useLocalSpeechToText': false, // if true, uses Vosk (free); if false, uses Google Speech-to-Text
75 | 'lowMemoryVariant': false, // if useLocalSpeechToText is true, this selects the smaller language models
76 | 'language': 'de', // language used for speech-to-text and for answers from ChatGPT
77 | 'answerWordLimit': 30, // limits the requested word count of an answer from ChatGPT
78 | 'assistantName': assistantName, // activate the voice assistant by saying "Hey, ${name}"
79 | "memeTrigger": 'listiges Bild', // saying this makes the assistant fetch a meme from Reddit and show it
80 | "gptSystemMessage": `Du bist ein virtueller Sprachassistent. Dein Name ist ${assistantName}. Du gibst kurze, genaue Antworten. Das aktuelle Datum ist: ${new Date().toISOString()}\n\n`
81 | };
82 | ```
--------------------------------------------------------------------------------
/assistant_config.mjs:
--------------------------------------------------------------------------------
1 | const assistantName = 'Felix';
2 |
3 | export default {
4 | 'useLocalSpeechToText': false, // if true, uses Vosk (free); if false, uses Google Speech-to-Text
5 | 'lowMemoryVariant': false, // if useLocalSpeechToText is true, this selects the smaller language models
6 | 'language': 'de', // language used for speech-to-text and for answers from ChatGPT
7 | 'answerWordLimit': 30, // limits the requested word count of an answer from ChatGPT
8 | 'assistantName': assistantName, // activate the voice assistant by saying "Hey, ${name}"
9 | "memeTrigger": 'listiges Bild', // saying this makes the assistant fetch a meme from Reddit and show it
10 | "gptSystemMessage": `Du bist ein virtueller Sprachassistent. Dein Name ist ${assistantName}. Du gibst kurze, genaue Antworten. Das aktuelle Datum ist: ${new Date().toISOString()}\n\n`
11 | };
12 |
13 | // example en-US config
14 | // const assistantName = 'buddy';
15 |
16 | // export default {
17 | // 'useLocalSpeechToText': false, // if true, uses Vosk (free); if false, uses Google Speech-to-Text
18 | // 'lowMemoryVariant': false, // if useLocalSpeechToText is true, this selects the smaller language models
19 | // 'language': 'en', // language used for speech-to-text and for answers from ChatGPT
20 | // 'answerWordLimit': 50, // limits the requested word count of an answer from ChatGPT
21 | // 'assistantName': assistantName, // activate the voice assistant by saying "Hey, ${name}"
22 | // "memeTrigger": 'funny image', // saying this makes the assistant fetch a meme from Reddit and show it
23 | // "gptSystemMessage": `You are a virtual voice assistant. Your name is ${assistantName}. You give short, concrete answers. Current date is: ${new Date().toISOString()}\n\n`
24 | // };
--------------------------------------------------------------------------------
/babel.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | presets: [
3 | '@vue/cli-plugin-babel/preset'
4 | ]
5 | }
6 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ai-voice-assistant",
3 | "version": "0.1.0",
4 | "private": true,
5 | "scripts": {
6 | "serve": "vue-cli-service serve",
7 | "build": "vue-cli-service build",
8 | "lint": "vue-cli-service lint",
9 | "electron:build": "vue-cli-service electron:build",
10 | "electron:serve": "vue-cli-service electron:serve",
11 | "postinstall": "electron-builder install-app-deps",
12 | "postuninstall": "electron-builder install-app-deps"
13 | },
14 | "main": "background.js",
15 | "dependencies": {
16 | "@google-cloud/speech": "5.5.0",
17 | "@google-cloud/text-to-speech": "4.2.1",
18 | "audic": "3.0.1",
19 | "chatgpt": "5.2.2",
20 | "core-js": "^3.6.5",
21 | "dotenv": "16.0.3",
22 | "get-mp3-duration": "1.0.0",
23 | "isomorphic-fetch": "3.0.0",
24 | "node-fetch": "3.3.1",
25 | "node-record-lpcm16": "1.0.1",
26 | "play-sound": "1.1.5",
27 | "sound-play": "1.1.0",
28 | "systeminformation": "4.34.23",
29 | "vosk": "^0.3.39",
30 | "vue": "^2.6.11",
31 | "vuetify": "^2.6.0"
32 | },
33 | "devDependencies": {
34 | "@vue/cli-plugin-babel": "5.0.8",
35 | "@vue/cli-plugin-eslint": "5.0.8",
36 | "@vue/cli-service": "5.0.8",
37 | "babel-eslint": "^10.1.0",
38 | "electron": "24.0.0",
39 | "electron-devtools-installer": "^3.1.0",
40 | "eslint": "7.32.0",
41 | "eslint-plugin-vue": "^6.2.2",
42 | "sass": "~1.32.0",
43 | "sass-loader": "^10.0.0",
44 | "vue-cli-plugin-electron-builder": "1.4.6",
45 | "vue-cli-plugin-vuetify": "~2.5.8",
46 | "vue-template-compiler": "^2.6.11",
47 | "vuetify-loader": "^1.7.0"
48 | },
49 | "eslintConfig": {
50 | "root": true,
51 | "env": {
52 | "node": true
53 | },
54 | "extends": [
55 | "plugin:vue/essential",
56 | "eslint:recommended"
57 | ],
58 | "parserOptions": {
59 | "parser": "babel-eslint"
60 | },
61 | "rules": {}
62 | },
63 | "browserslist": [
64 | "> 1%",
65 | "last 2 versions",
66 | "not dead"
67 | ]
68 | }
69 |
--------------------------------------------------------------------------------
/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oshell/raspberry-ai-voice-assistant/3e9d7e801662a492cefdaf680788365a209c8ece/public/favicon.ico
--------------------------------------------------------------------------------
/public/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="">
3 | <head>
4 | <meta charset="utf-8">
5 | <meta http-equiv="X-UA-Compatible" content="IE=edge">
6 | <meta name="viewport" content="width=device-width,initial-scale=1.0">
7 | <link rel="icon" href="<%= BASE_URL %>favicon.ico">
8 | <title><%= htmlWebpackPlugin.options.title %></title>
9 | <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:100,300,400,500,700,900">
10 | <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@mdi/font@latest/css/materialdesignicons.min.css">
11 | </head>
12 | <body>
13 | <noscript>
14 | <strong>We're sorry but <%= htmlWebpackPlugin.options.title %> doesn't work properly without JavaScript enabled. Please enable it to continue.</strong>
15 | </noscript>
16 | <div id="app"></div>
17 | <!-- built files will be auto injected -->
18 | </body>
19 | </html>
20 |
--------------------------------------------------------------------------------
/sounds/de/hotword_answer_1.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oshell/raspberry-ai-voice-assistant/3e9d7e801662a492cefdaf680788365a209c8ece/sounds/de/hotword_answer_1.mp3
--------------------------------------------------------------------------------
/sounds/de/meme_hotword_answer.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oshell/raspberry-ai-voice-assistant/3e9d7e801662a492cefdaf680788365a209c8ece/sounds/de/meme_hotword_answer.mp3
--------------------------------------------------------------------------------
/sounds/en/hotword_answer_1.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oshell/raspberry-ai-voice-assistant/3e9d7e801662a492cefdaf680788365a209c8ece/sounds/en/hotword_answer_1.mp3
--------------------------------------------------------------------------------
/sounds/en/meme_hotword_answer.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oshell/raspberry-ai-voice-assistant/3e9d7e801662a492cefdaf680788365a209c8ece/sounds/en/meme_hotword_answer.mp3
--------------------------------------------------------------------------------
/src/App.vue:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
24 |
25 |
--------------------------------------------------------------------------------
/src/assets/cyborg_corgi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oshell/raspberry-ai-voice-assistant/3e9d7e801662a492cefdaf680788365a209c8ece/src/assets/cyborg_corgi.jpg
--------------------------------------------------------------------------------
/src/assets/cyborg_corgi.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oshell/raspberry-ai-voice-assistant/3e9d7e801662a492cefdaf680788365a209c8ece/src/assets/cyborg_corgi.webp
--------------------------------------------------------------------------------
/src/background.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 | import { app, protocol, BrowserWindow, ipcMain } from 'electron'
3 | import { createProtocol } from 'vue-cli-plugin-electron-builder/lib'
4 | import installExtension, { VUEJS_DEVTOOLS } from 'electron-devtools-installer'
5 | import nodeChildProcess from 'child_process';
6 | import si from 'systeminformation';
7 | import 'isomorphic-fetch';
8 | import config from '../assistant_config.mjs'
9 |
10 | const isDevelopment = process.env.NODE_ENV !== 'production'
11 |
12 | let win;
13 |
14 | // Scheme must be registered before the app is ready
15 | protocol.registerSchemesAsPrivileged([
16 | { scheme: 'app', privileges: { secure: true, standard: true } }
17 | ])
18 |
19 | async function createWindow() {
20 | // Create the browser window.
21 | win = new BrowserWindow({
22 | webPreferences: {
23 | nodeIntegration: true,
24 | contextIsolation: false,
25 | enableRemoteModule: true,
26 | devTools: false
27 | },
28 | show: false,
29 | fullscreen: true
30 | })
31 |
32 | win.show();
33 | if (process.env.WEBPACK_DEV_SERVER_URL) {
34 | // Load the url of the dev server if in development mode
35 | await win.loadURL(process.env.WEBPACK_DEV_SERVER_URL)
36 | if (!process.env.IS_TEST) win.webContents.openDevTools()
37 | } else {
38 | createProtocol('app')
39 | // Load the index.html when not in development
40 | win.loadURL('app://./index.html')
41 | }
42 | }
43 |
44 | // Quit when all windows are closed.
45 | app.on('window-all-closed', () => {
46 | // On macOS it is common for applications and their menu bar
47 | // to stay active until the user quits explicitly with Cmd + Q
48 | if (process.platform !== 'darwin') {
49 | app.quit()
50 | }
51 | })
52 |
53 | app.on('activate', () => {
54 | // On macOS it's common to re-create a window in the app when the
55 | // dock icon is clicked and there are no other windows open.
56 | if (BrowserWindow.getAllWindows().length === 0) createWindow()
57 | })
58 |
59 | // This method will be called when Electron has finished
60 | // initialization and is ready to create browser windows.
61 | // Some APIs can only be used after this event occurs.
62 | app.on('ready', async () => {
63 | if (isDevelopment && !process.env.IS_TEST) {
64 | // Install Vue Devtools
65 | try {
66 | await installExtension(VUEJS_DEVTOOLS)
67 | } catch (e) {
68 | console.error('Vue Devtools failed to install:', e.toString())
69 | }
70 | }
71 | createWindow()
72 | })
73 |
74 | // Exit cleanly on request from parent process in development mode.
75 | if (isDevelopment) {
76 | if (process.platform === 'win32') {
77 | process.on('message', (data) => {
78 | if (data === 'graceful-exit') {
79 | app.quit()
80 | }
81 | })
82 | } else {
83 | process.on('SIGTERM', () => {
84 | app.quit()
85 | })
86 | }
87 | }
88 |
89 |
90 | function handleNotification(data) {
91 | try {
92 | const events = JSON.parse(data);
93 | events.forEach(event => {
94 | console.log(`${event.name}: ${event.value}`);
95 | win.webContents.send(event.name, event.value);
96 | });
97 | } catch (error) {
98 | console.log(`ERR: parsing failed: ${data}`);
99 | }
100 | }
101 |
102 | const speechToTextService = config.useLocalSpeechToText ? 'vosk' : 'google';
103 |
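// Spawn the speech-to-text assistant as a separate Node process; it emits
// JSON-encoded event arrays on stdout, which handleNotification forwards to the renderer.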
104 | const startAssistantChildProcess = true;
105 | if (startAssistantChildProcess) {
106 | let script = nodeChildProcess.spawn('node', [`src/voice_assistant_${speechToTextService}.mjs`]);
107 | script.stdout.on('data', handleNotification);
108 |
109 | process.on('exit', function() {
110 | console.log('Killing child process onexit!');
111 | script.kill();
112 | });
113 | }
114 | const temperatureCheckInterval = 5000;
115 |
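// Poll the CPU temperature via systeminformation and push it to the renderer,
// where AiAssistant.vue displays it.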
116 | function checkCpuTemperature() {
117 | si.cpuTemperature().then((result) => {
118 | if (result) {
119 | const mainTemp = result.main;
120 | if (win && win.webContents) {
121 | win.webContents.send('cpu', mainTemp);
122 | }
123 | }
124 | })
125 | }
126 |
127 | checkCpuTemperature();
128 | setInterval(checkCpuTemperature, temperatureCheckInterval);
--------------------------------------------------------------------------------
/src/components/AiAssistant.vue:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | CPU: {{ temprature }}°
8 |
9 |
10 | Time: {{ time }}
11 |
12 |
13 |
14 |
15 |
16 |
17 |
20 |
21 |
22 |
25 |
26 |
27 |
28 |
29 |
30 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 | Meme Generator
44 |
45 |
46 |
47 |
48 | {{ translations.thanks }}
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
193 |
194 |
--------------------------------------------------------------------------------
/src/google_stt.mjs:
--------------------------------------------------------------------------------
1 | import recorder from 'node-record-lpcm16';
2 | import speech from '@google-cloud/speech';
3 |
4 | // Creates a client
5 | const client = new speech.SpeechClient();
6 |
7 | const encoding = 'LINEAR16';
8 | const sampleRateHertz = 16000;
9 | let request = {};
10 | let callback = () => {};
11 | let recognizeStream = null;
12 |
13 |
14 | function startGoogleSpeechToText(func, langCode) {
15 | request = {
16 | config: {
17 | encoding: encoding,
18 | sampleRateHertz: sampleRateHertz,
19 | languageCode: langCode,
20 | },
21 | interimResults: false, // If you want interim results, set this to true
22 | };
23 | callback = func;
24 |
25 | recognizeStream = client
26 | .streamingRecognize(request)
27 | .on('error', console.error)
28 | .on('data', data => {
29 | const success = data.results[0] && data.results[0].alternatives[0];
30 | if (success) {
31 | const result = data.results[0].alternatives[0].transcript;
32 | callback(result);
33 | }
34 | });
35 | // Start recording and send the microphone input to the Speech API.
36 | // Ensure SoX is installed, see https://www.npmjs.com/package/node-record-lpcm16#dependencies
37 | recorder
38 | .record({
39 | sampleRateHertz: sampleRateHertz,
40 | threshold: 0,
41 | verbose: false,
42 | recordProgram: 'rec', // Try also "arecord" or "sox"
43 | silence: '1.0',
44 | })
45 | .stream()
46 | .on('error', console.error)
47 | .pipe(recognizeStream);
48 | }
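// usage: startGoogleSpeechToText(transcript => console.log(transcript), 'de-DE');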
49 |
50 | export default startGoogleSpeechToText;
51 |
--------------------------------------------------------------------------------
/src/main.js:
--------------------------------------------------------------------------------
1 | import Vue from 'vue'
2 | import App from './App.vue'
3 | import vuetify from './plugins/vuetify'
4 |
5 | Vue.config.productionTip = false
6 |
7 | new Vue({
8 | vuetify,
9 | render: h => h(App)
10 | }).$mount('#app')
11 |
--------------------------------------------------------------------------------
/src/plugins/vuetify.js:
--------------------------------------------------------------------------------
1 | import Vue from 'vue';
2 | import Vuetify from 'vuetify/lib/framework';
3 |
4 | Vue.use(Vuetify);
5 |
6 | export default new Vuetify({
7 | theme: {
8 | dark: true
9 | }
10 | });
11 |
--------------------------------------------------------------------------------
/src/test.mjs:
--------------------------------------------------------------------------------
1 | import fs from 'fs';
2 | import Audic from 'audic';
3 | import getMP3Duration from 'get-mp3-duration';
4 | const lang = 'en';
5 |
6 |
7 | function triggerEvent(name, value) {
8 | const event = { name, value };
9 | const events = [event];
10 | const data = JSON.stringify(events);
11 | console.log(data);
12 | }
13 |
14 |
15 | async function playSound(name) {
16 | const mp3File = `./sounds/${lang}/${name}.mp3`;
17 | const buffer = fs.readFileSync(mp3File);
18 | const duration = getMP3Duration(buffer);
19 | const audic = new Audic(mp3File);
20 | // ended event does not work correctly
21 | // workaround is getting duration of mp3 in ms
22 | // then when sound starts playing we set timeout
23 | // to trigger end event of tts
24 | audic.addEventListener('playing', () => {
25 | setTimeout(() => {
26 | triggerEvent('tts_end', true);
27 | process.exit();
28 | }, duration);
29 | });
30 | audic.play();
31 | }
32 |
33 | playSound('hotword_answer_1')
--------------------------------------------------------------------------------
/src/tts.mjs:
--------------------------------------------------------------------------------
1 | import fs from 'fs';
2 | import util from "util";
3 | import dotenv from "dotenv";
4 | import textToSpeech from "@google-cloud/text-to-speech";
5 |
6 | dotenv.config();
7 |
8 | const ttsApi = new textToSpeech.TextToSpeechClient();
9 |
10 | const languageMapping = {
11 | en: 'en-US',
12 | de: 'de-DE'
13 | }
14 |
15 | const voices = {
16 | de: 'de-DE-Neural2-B',//A,C,D
17 | en: 'en-US-Neural2-I'//A,D,I
18 | }
19 |
20 | async function synthesizeSpeech(text, name, lang) {
21 | // Construct the request
22 | const request = {
23 | input: { text: text },
24 | // Select the language and SSML voice gender (optional)
25 | voice: { languageCode: languageMapping[lang], name: voices[lang], ssmlGender: 'MALE' },
26 | // select the type of audio encoding
27 | audioConfig: { audioEncoding: 'MP3' },
28 | };
29 |
30 | // Performs the text-to-speech request
31 | const [response] = await ttsApi.synthesizeSpeech(request);
32 | // Write the binary audio content to a local file
33 | const writeFile = util.promisify(fs.writeFile);
34 | await writeFile(`./sounds/${lang}/${name}.mp3`, response.audioContent, 'binary');
35 | console.log('Done!');
36 | }
37 |
38 | async function listVoices(lang) {
39 | const [result] = await ttsApi.listVoices({});
40 | const voices = result.voices;
41 |
42 | console.log('Voices:');
43 | voices.forEach(voice => {
44 | if (!voice.name.includes('Neural')) return;
45 | if (!voice.languageCodes.includes(languageMapping[lang])) return;
46 | console.log(`Name: ${voice.name}`);
47 | console.log(` SSML Voice Gender: ${voice.ssmlGender}`);
48 | console.log(` Natural Sample Rate Hertz: ${voice.naturalSampleRateHertz}`);
49 | console.log(' Supported languages:');
50 | voice.languageCodes.forEach(languageCode => {
51 | console.log(` ${languageCode}`);
52 | });
53 | });
54 | }
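// listVoices('de') can be run manually to discover the available Neural voices for a language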
55 |
56 | const text = "Spiel 'Die da', von den Fantastischen Vier";
57 | const name = 'temp_4';
58 | const lang = 'de';
59 |
60 | synthesizeSpeech(text, name, lang);
61 |
--------------------------------------------------------------------------------
/src/voice_assistant_google.mjs:
--------------------------------------------------------------------------------
1 | import fs from 'fs';
2 | import 'isomorphic-fetch';
3 | import util from "util";
4 | import dotenv from "dotenv";
5 | import { ChatGPTAPI } from 'chatgpt';
6 | import textToSpeech from "@google-cloud/text-to-speech";
7 | import Audic from 'audic';
8 | import getMP3Duration from 'get-mp3-duration';
9 | import config from '../assistant_config.mjs';
10 | import startGoogleSpeechToText from './google_stt.mjs';
11 |
12 | dotenv.config();
13 |
14 | const lang = config.language;
15 | const answerWordLimit = config.answerWordLimit;
16 | const assistantName = config.assistantName;
17 | const memeTrigger = config.memeTrigger;
18 | const systemMessage = config.gptSystemMessage;
19 |
20 | let lastRequestId = null;
21 | const eventTimeoutMs = 500;
22 | const debug = false;
23 |
24 | const languageMapping = {
25 | en: 'en-US',
26 | de: 'de-DE'
27 | }
28 |
29 | const voices = {
30 | de: 'de-DE-Neural2-B',//A,C,D
31 | en: 'en-US-Neural2-I'//A,D,I
32 | }
33 |
34 | const messagePostFix = {
35 | en: `Answer in less than ${answerWordLimit} words if possible.`,
36 | de: `Antworte in unter ${answerWordLimit} Wörtern, wenn möglich.`
37 | }
38 |
39 | const continueMatches = {
40 | de: 'nochmal',
41 | en: 'next'
42 | }
43 |
44 | const chatGPTAPI = new ChatGPTAPI({ apiKey: process.env.OPEN_AI_APIKEY, systemMessage })
45 | const ttsApi = new textToSpeech.TextToSpeechClient();
46 | let memeLoop = false;
47 |
48 | function sleep(ms) {
49 | return new Promise(resolve => setTimeout(resolve, ms));
50 | }
51 |
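// Events are printed to stdout as a JSON array; the Electron main process
// (handleNotification in src/background.js) parses them and forwards each
// one to the renderer, e.g. [{"name":"answer","value":"..."}]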
52 | function triggerEvent(name, value) {
53 | const event = { name, value };
54 | const events = [event];
55 | const data = JSON.stringify(events);
56 | console.log(data);
57 | }
58 |
59 | async function synthesizeSpeech(text) {
60 | // Construct the request
61 | const request = {
62 | input: { text: text },
63 | // Select the language and SSML voice gender (optional)
64 | voice: { languageCode: languageMapping[lang], name: voices[lang], ssmlGender: 'MALE' },
65 | // select the type of audio encoding
66 | audioConfig: { audioEncoding: 'MP3' },
67 | };
68 |
69 | // Performs the text-to-speech request
70 | const [response] = await ttsApi.synthesizeSpeech(request);
71 | // Write the binary audio content to a local file
72 | const writeFile = util.promisify(fs.writeFile);
73 | await writeFile(`./sounds/${lang}/gpt_answer.mp3`, response.audioContent, 'binary');
74 | await sleep(eventTimeoutMs);
75 | }
76 |
77 | async function askChatGpt(message) {
78 | const opts = {};
79 |
80 | message = `${message}. ${messagePostFix[lang]}`;
81 | if (lastRequestId) {
82 | opts.parentMessageId = lastRequestId
83 | }
84 | const response = await chatGPTAPI.sendMessage(message, opts);
85 | lastRequestId = response.id;
86 | return response.text;
87 | }
88 |
89 |
90 | async function fetchMeme() {
91 | const memeApi = 'https://meme-api.com/gimme';
92 | const response = await fetch(memeApi);
93 | const result = await response.json();
94 |
95 | triggerEvent('meme', result.url);
96 | }
97 |
98 | let active = false;
99 | let disabled = false;
100 | const minimumDisabledMs = 5000;
101 |
102 | const voiceRecognition = {
103 | hotwords: {
104 | activate: [],
105 | activateMeme: ''
106 | },
107 | initHotwords: () => {
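// prefix variants (including likely speech-to-text mishearings of "hey") so
// "hey <name>", "he <name>", "the <name>", "hi <name>" or the bare name all activate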
108 | const prefixes = ['hey ', 'he ', 'the ', 'hi ', '']
109 | voiceRecognition.hotwords.activate = prefixes.map(prefix => {
110 | return `${prefix}${assistantName.toLocaleLowerCase()}`;
111 | });
112 | voiceRecognition.hotwords.activateMeme = memeTrigger;
113 | },
114 | googleSttCallback: (text) => {
115 | if (text && debug) {
116 | triggerEvent('google_stt_debug', text);
117 | return;
118 | }
119 | voiceRecognition.checkHotword(text);
120 | voiceRecognition.handleInput(text);
121 | voiceRecognition.checkStop(text);
122 | },
123 | start: () => {
124 | voiceRecognition.initHotwords();
125 | const googleLang = languageMapping[lang];
126 | startGoogleSpeechToText(voiceRecognition.googleSttCallback, googleLang);
127 | },
128 | handleInput: async (text) => {
129 | if (!active || disabled) return;
130 | if (text.includes('stop') && text.length <= 10) {
131 | triggerEvent('stop', true);
132 | active = false;
133 | return;
134 | }
135 |
136 | text = text.trim();
137 | let inputTooShort = text && text.length < 6;
138 | let notEnoughWords = text.split(' ').length < 3;
139 | let containsHotWord = text.includes(assistantName) && text.length < 12;
140 | if (!text || inputTooShort || notEnoughWords || containsHotWord) {
141 | return;
142 | }
143 |
144 | if (text.includes('stop') && text.length <= 20) {
145 | triggerEvent('stop', true);
146 | active = false;
147 | return;
148 | }
149 |
150 | let questionEvent = {
151 | name: 'question',
152 | value: text
153 | };
154 |
155 | const gptStartEvent = {
156 | name: 'gpt_start',
157 | value: true
158 | };
159 |
160 | let events = [questionEvent, gptStartEvent];
161 | let data = JSON.stringify(events);
162 | console.log(data);
163 | active = false;
164 |
165 | const answer = await askChatGpt(text);
166 |
167 | triggerEvent('answer', answer);
168 | await synthesizeSpeech(answer);
169 | triggerEvent('tts', true);
170 | playSound('gpt_answer', true);
171 |
172 | },
173 | checkHotword: async (text) => {
174 | if (active || disabled) return;
175 |
176 | text = normalizeResult(text);
177 | let match = false;
178 | voiceRecognition.hotwords.activate.forEach(hotword => {
179 | if (text.includes(hotword)) {
180 | match = true;
181 | }
182 | });
183 |
184 | if (match) {
185 | active = true;
186 | setTimeout(() => {
187 | disabled = false;
188 | }, eventTimeoutMs);
189 | triggerEvent('voice_input_start', true);
190 | }
191 |
192 | let memeMatch = false;
193 | if (text.includes(voiceRecognition.hotwords.activateMeme)) {
194 | memeMatch = true;
195 | }
196 |
197 | if (memeMatch) {
198 | memeLoop = true;
199 | disabled = true;
200 | triggerEvent('meme_hotword', true);
201 | fetchMeme();
202 | playSound('meme_hotword_answer', false);
203 | setTimeout(() => {
204 | disabled = false;
205 | }, minimumDisabledMs)
206 | }
207 |
208 | let memeContinueMatch = text.includes(continueMatches[lang]);
209 |
210 | if (memeContinueMatch && memeLoop) {
211 | disabled = true;
212 | triggerEvent('meme', true);
213 | fetchMeme();
214 | setTimeout(() => {
215 | disabled = false;
216 | }, 1000);
217 | }
218 |
219 | let stopMatch = text.includes('stop');
220 |
221 | if (stopMatch && memeLoop) {
222 | memeLoop = false;
223 | triggerEvent('meme_stop', true);
224 | }
225 | },
226 | checkStop: async (text) => {
227 | if (!disabled) return;
228 | text = normalizeResult(text);
229 | let match = false;
230 | if (text.includes('stop')) {
231 | match = true;
232 | }
233 |
234 | if (match) {
235 | active = false;
236 | disabled = false;
237 | triggerEvent('stop', true);
238 | }
239 | }
240 | }
241 |
242 | async function playSound(name) {
243 | disabled = true;
244 | const mp3File = `./sounds/${lang}/${name}.mp3`;
245 | const buffer = fs.readFileSync(mp3File);
246 | const duration = getMP3Duration(buffer);
247 | const audic = new Audic(mp3File);
248 | // ended event does not work correctly
249 | // workaround is getting duration of mp3 in ms
250 | // then when sound starts playing we set timeout
251 | // to trigger end event of tts
252 | audic.addEventListener('playing', () => {
253 | setTimeout(() => {
254 | disabled = false;
255 | triggerEvent('tts_end', true);
256 | }, duration);
257 | });
258 | audic.play();
259 | }
260 |
261 | function normalizeResult(text) {
262 | text = text.trim();
263 | return text.toLocaleLowerCase();
264 | }
265 |
266 | console.log(JSON.stringify([{ name: 'LOG:', value: 'Google Speech-to-Text started!' }]));
267 | voiceRecognition.start();
268 |
--------------------------------------------------------------------------------
/src/voice_assistant_vosk.mjs:
--------------------------------------------------------------------------------
1 | import vosk from 'vosk';
2 | import fs from 'fs';
3 | import mic from 'mic';
4 | import { fileURLToPath } from 'url';
5 | import path from 'path';
6 | import 'isomorphic-fetch';
7 | import util from "util";
8 | import dotenv from "dotenv";
9 | import { ChatGPTAPI } from 'chatgpt';
10 | import textToSpeech from "@google-cloud/text-to-speech";
11 | import Audic from 'audic';
12 | import getMP3Duration from 'get-mp3-duration';
13 | import config from '../assistant_config.mjs';
14 |
15 | dotenv.config();
16 | const __filename = fileURLToPath(import.meta.url);
17 | const __dirname = path.dirname(__filename);
18 |
19 | const lang = config.language;
20 | const answerWordLimit = config.answerWordLimit;
21 | const lowMemoryVariant = config.lowMemoryVariant;
22 | const assistantName = config.assistantName;
23 | const memeTrigger = config.memeTrigger;
24 | const systemMessage = config.gptSystemMessage;
25 |
26 | let lastRequestId = null;
27 | const eventTimeoutMs = 200;
28 |
29 | const debug = false;
30 |
31 | let modelPaths = {
32 | de: __dirname + "/../language_models/vosk-model-de-0.21",
33 | en: __dirname + "/../language_models/vosk-model-en-us-0.22"
34 | }
35 |
36 | if (lowMemoryVariant) {
37 | modelPaths = {
38 | de: __dirname + "/../language_models/vosk-model-small-de-0.15",
39 | en: __dirname + "/../language_models/vosk-model-en-us-0.22-lgraph"
40 | }
41 | }
42 |
43 | const languageMapping = {
44 | en: 'en-US',
45 | de: 'de-DE'
46 | }
47 |
48 | const voices = {
49 | de: 'de-DE-Neural2-B',//A,C,D
50 | en: 'en-US-Neural2-I'//A,D,I
51 | }
52 |
53 | const messagePostFix = {
54 | en: `Answer in less than ${answerWordLimit} words if possible.`,
55 | de: `Antworte in unter ${answerWordLimit} Wörtern, wenn möglich.`
56 | }
57 |
58 | const continueMatches = {
59 | de: 'nochmal',
60 | en: 'next'
61 | }
62 |
63 | const MODEL_PATH = modelPaths[lang];
64 | const SAMPLE_RATE = 16000;
65 |
66 | if (!fs.existsSync(MODEL_PATH)) {
67 | console.log("Please download the model from https://alphacephei.com/vosk/models and unpack as " + MODEL_PATH + " in the current folder.")
68 | process.exit()
69 | }
70 |
71 | vosk.setLogLevel(0);
72 | const model = new vosk.Model(MODEL_PATH);
73 | let rec = new vosk.Recognizer({ model: model, sampleRate: SAMPLE_RATE });
74 |
75 |
76 | const chatGPTAPI = new ChatGPTAPI({ apiKey: process.env.OPEN_AI_APIKEY, systemMessage })
77 | const ttsApi = new textToSpeech.TextToSpeechClient();
78 | let memeLoop = false;
79 |
80 | function sleep(ms) {
81 | return new Promise(resolve => setTimeout(resolve, ms));
82 | }
83 |
84 | function triggerEvent(name, value) {
85 | const event = { name, value };
86 | const events = [event];
87 | const data = JSON.stringify(events);
88 | console.log(data);
89 | }
90 |
91 | async function synthesizeSpeech(text) {
92 | // Construct the request
93 | const request = {
94 | input: { text: text },
95 | // Select the language and SSML voice gender (optional)
96 | voice: { languageCode: languageMapping[lang], name: voices[lang], ssmlGender: 'MALE' },
97 | // select the type of audio encoding
98 | audioConfig: { audioEncoding: 'MP3' },
99 | };
100 |
101 | // Performs the text-to-speech request
102 | const [response] = await ttsApi.synthesizeSpeech(request);
103 | // Write the binary audio content to a local file
104 | const writeFile = util.promisify(fs.writeFile);
105 | await writeFile(`./sounds/${lang}/gpt_answer.mp3`, response.audioContent, 'binary');
106 | await sleep(eventTimeoutMs);
107 | }
108 |
109 | async function askChatGpt(message) {
110 | const opts = {};
111 |
112 | message = `${message}. ${messagePostFix[lang]}`;
113 | if (lastRequestId) {
114 | opts.parentMessageId = lastRequestId
115 | }
116 | const response = await chatGPTAPI.sendMessage(message, opts);
117 | lastRequestId = response.id;
118 | return response.text;
119 | }
120 |
121 |
122 | async function fetchMeme() {
123 | const memeApi = 'https://meme-api.com/gimme';
124 | const response = await fetch(memeApi);
125 | const result = await response.json();
126 |
127 | triggerEvent('meme', result.url);
128 | }
129 |
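// capture raw 16 kHz mono PCM from the default input device and feed it to Vosk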
130 | const micInstance = mic({
131 | rate: String(SAMPLE_RATE),
132 | channels: '1',
133 | debug: false,
134 | device: 'default',
135 | });
136 |
137 | let active = false;
138 | let disabled = false;
139 | let recordingCache = '';
140 | let cacheCounter = 0;
141 | const minimumDisabledMs = 5000;
142 | const maxAttemptsRecording = 5;
143 |
144 | const voiceRecognition = {
145 | hotwords: {
146 | activate: [],
147 | activateMeme: ''
148 | },
149 | initHotwords: () => {
150 | const prefixes = ['hey', 'he', 'the']
151 | voiceRecognition.hotwords.activate = prefixes.map(prefix => {
152 | return `${prefix} ${assistantName.toLocaleLowerCase()}`;
153 | });
154 | voiceRecognition.hotwords.activateMeme = memeTrigger;
155 | },
156 | start: () => {
157 | voiceRecognition.initHotwords();
158 | const micInputStream = micInstance.getAudioStream();
159 |
160 | micInputStream.on('data', data => {
161 | voiceRecognition.checkHotword(data);
162 | voiceRecognition.handleInput(data);
163 | voiceRecognition.checkStop(data);
164 | });
165 |
166 | micInputStream.on('audioProcessExitComplete', function () {
167 | console.log(rec.finalResult());
168 | rec.free();
169 | model.free();
170 | });
171 |
172 | process.on('SIGINT', function () {
173 | console.log("\nStopping");
174 | micInstance.stop();
175 | });
176 |
177 | micInstance.start();
178 | },
179 | handleInput: async (data) => {
180 | if (disabled) {
181 | rec.reset();
182 | }
183 | if (!active || disabled) return;
184 | const isSilent = rec.acceptWaveform(data);
185 |
186 | let isFinalAttempt = false;
187 | let result = rec.partialResult();
188 |
189 | if (result.partial.includes('stop') && result.partial.length <= 10) {
190 | triggerEvent('stop', true);
191 | rec.reset();
192 | active = false;
193 | return;
194 | }
195 |
196 | let inputTooShort = result.partial && result.partial.length < 6;
197 | let notEnoughWords = result.partial.split(' ').length < 3;
198 | if (!result.partial || inputTooShort || notEnoughWords) {
199 | return;
200 | }
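// if the partial transcript has not changed for maxAttemptsRecording callbacks,
// assume the user stopped speaking and force a final result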
201 | if (result.partial === recordingCache) {
202 | cacheCounter++;
203 | } else {
204 | recordingCache = result.partial;
205 | cacheCounter = 0;
206 | }
207 |
208 | if (cacheCounter > maxAttemptsRecording) {
209 | isFinalAttempt = true;
210 | result = rec.finalResult();
211 | }
212 |
213 |
214 | if (isSilent || isFinalAttempt) {
215 | result = isFinalAttempt ? result : rec.result();
216 | result = normalizeResult(result);
217 |
218 | if (result.text.includes('stop') && result.text.length <= 20) {
219 | triggerEvent('stop', true);
220 | rec.reset();
221 | active = false;
222 | return;
223 | }
224 |
225 |
226 | if (result.text && debug) {
227 | triggerEvent('voice_input_debug', result.text)
228 | }
229 |
230 | let questionEvent = {
231 | name: 'question',
232 | value: result.text
233 | };
234 |
235 | const gptStartEvent = {
236 | name: 'gpt_start',
237 | value: true
238 | };
239 |
240 | let events = [questionEvent, gptStartEvent];
241 | data = JSON.stringify(events);
242 | console.log(data);
243 | active = false;
244 |
245 | const answer = await askChatGpt(result.text);
246 |
247 | triggerEvent('answer', answer);
248 | await synthesizeSpeech(answer);
249 | triggerEvent('tts', true);
250 | playSound('gpt_answer', true);
251 | } else {
252 | if (result.partial && debug) {
253 | triggerEvent('voice_input_partial', result.partial)
254 | }
255 |
256 | }
257 | },
258 | checkHotword: async (data) => {
259 | if (disabled) {
260 | rec.reset();
261 | }
262 | if (active || disabled) return;
263 | let result = '';
264 | if (rec.acceptWaveform(data)) {
265 | result = rec.result();
266 | } else {
267 | result = rec.partialResult();
268 | result.text = result.partial;
269 | }
270 |
271 | result = normalizeResult(result);
272 | let match = false;
273 | voiceRecognition.hotwords.activate.forEach(hotword => {
274 | if (result.text.includes(hotword)) {
275 | match = true;
276 | }
277 | });
278 |
279 | if (result.text && debug) {
280 | triggerEvent('voice_input_hotword', result.text);
281 | }
282 |
283 | if (match) {
284 | rec.reset();
285 | active = true;
286 | disabled = false;
287 | triggerEvent('voice_input_start', true);
288 | }
289 |
290 | let memeMatch = false;
291 | if (result.text.includes(voiceRecognition.hotwords.activateMeme)) {
292 | memeMatch = true;
293 | }
294 |
295 | if (memeMatch) {
296 | memeLoop = true;
297 | rec.reset();
298 | disabled = true;
299 | triggerEvent('meme_hotword', true);
300 | fetchMeme();
301 | playSound('meme_hotword_answer', false);
302 | setTimeout(() => {
303 | disabled = false;
304 | }, minimumDisabledMs)
305 | }
306 |
307 | let memeContinueMatch = result.text.includes(continueMatches[lang]);
308 |
309 | if (memeContinueMatch && memeLoop) {
310 | disabled = true;
311 | triggerEvent('meme', true);
312 | fetchMeme();
313 | setTimeout(() => {
314 | disabled = false;
315 | }, 1000);
316 | }
317 |
318 | let stopMatch = result.text.includes('stop');
319 |
320 | if (stopMatch && memeLoop) {
321 | memeLoop = false;
322 | rec.reset();
323 | triggerEvent('meme_stop', true);
324 | }
325 | },
326 | checkStop: async (data) => {
327 | if (!disabled) return;
328 | let result = '';
329 | if (rec.acceptWaveform(data)) {
330 | result = rec.result();
331 | } else {
332 | result = rec.partialResult();
333 | result.text = result.partial;
334 | }
335 |
336 | result = normalizeResult(result);
337 | let match = false;
338 | if (result.text.includes('stop')) {
339 | match = true;
340 | }
341 |
342 | if (match) {
343 | rec.reset();
344 | active = false;
345 | disabled = false;
346 | triggerEvent('stop', true);
347 | }
348 | }
349 | }
350 |
351 | async function playSound(name) {
352 | disabled = true;
353 | const mp3File = `./sounds/${lang}/${name}.mp3`;
354 | const buffer = fs.readFileSync(mp3File);
355 | const duration = getMP3Duration(buffer);
356 | const audic = new Audic(mp3File);
357 | // ended event does not work correctly
358 | // workaround is getting duration of mp3 in ms
359 | // then when sound starts playing we set timeout
360 | // to trigger end event of tts
361 | audic.addEventListener('playing', () => {
362 | setTimeout(() => {
363 | disabled = false;
364 | triggerEvent('tts_end', true);
365 | }, duration);
366 | });
367 | audic.play();
368 | }
369 |
370 | function normalizeResult(result) {
371 | if (lang === 'de') {
372 | if (result.text.startsWith('einen')) {
373 | result.text = result.text === 'einen' ? '' : result.text.substring(6);
374 | }
375 |
376 | result.text = result.text.replace('wie kann ich helfen', '');
377 | }
378 | if (lang === 'en') {
379 | if (result.text.startsWith('a ')) {
380 | result.text = result.text.substring(2);
381 | }
382 | if (result.text.startsWith('please')) {
383 | result.text = result.text === 'please' ? '' : result.text.substring(7);
384 | }
385 | if (result.text.startsWith('the')) {
386 | result.text = result.text === 'the' ? '' : result.text.substring(4);
387 | }
388 |
389 | result.text = result.text.replace('how can I help', '');
390 | }
391 |
392 | result.text = result.text.trim();
393 |
394 | return result;
395 | }
396 |
397 | console.log(JSON.stringify([{ name: 'LOG:', value: 'Vosk Speech-to-Text started!' }]));
398 | voiceRecognition.start();
399 |
--------------------------------------------------------------------------------
/vue.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | transpileDependencies: [
3 | 'vuetify'
4 | ],
5 | pluginOptions: {
6 | electronBuilder: {
7 | nodeIntegration: true
8 | }
9 | }
10 | }
11 |
--------------------------------------------------------------------------------