├── jitpack.yml ├── settings.gradle ├── native ├── linux │ └── libjni_porcupine.so ├── windows │ └── libjni_porcupine.dll └── jni │ └── wakeup_Porcupine.h ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── .idea ├── dictionaries │ └── wdavi.xml ├── compiler.xml ├── vcs.xml ├── misc.xml ├── gradle.xml ├── jarRepositories.xml ├── modules │ ├── VocalCord_main.iml │ └── VocalCord_test.iml └── uiDesigner.xml ├── src └── main │ ├── java │ ├── wakeup │ │ ├── Porcupine.java.clean │ │ └── Porcupine.java │ ├── vocalcord │ │ ├── AudioSendMultiplexer.java │ │ ├── TTSEngine.java │ │ ├── TTSCache.java │ │ ├── STTEngine.java │ │ ├── UserStream.java │ │ ├── VocalCord.java │ │ └── CommandChain.java │ └── example │ │ └── ExampleBot.java │ └── c │ └── porcupine.c ├── LICENSE ├── docs ├── linux_compile.md └── windows_compile.md ├── gradlew.bat ├── gradlew └── README.md /jitpack.yml: -------------------------------------------------------------------------------- 1 | jdk: 2 | - openjdk12 -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'VocalCord' 2 | 3 | -------------------------------------------------------------------------------- /native/linux/libjni_porcupine.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/widavies/VocalCord/HEAD/native/linux/libjni_porcupine.so -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/widavies/VocalCord/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /native/windows/libjni_porcupine.dll: 
/**
 * Dependency-free twin of the real {@code wakeup.Porcupine} binding.
 *
 * It declares the same native methods but never loads a native library and never calls
 * {@code init}; the handle is fixed at zero.
 * NOTE(review): presumably kept so the JNI header can be generated without the rest of the
 * project on the classpath - confirm against the compile guides.
 */
public class Porcupine {
    /** Handle passed to the native {@code process} and {@code delete} calls; always 0 in this stub. */
    private final long object;

    /**
     * Creates the stub with a zero handle.
     *
     * @throws Exception never thrown by this stub; declared to mirror the real binding
     */
    public Porcupine() throws Exception {
        this.object = 0L;
    }

    /**
     * Forwards one frame of PCM samples to the native {@code process} call.
     *
     * @param pcm the samples to hand to the native side
     * @return whatever the native call returns
     * @throws Exception wrapping any {@link Exception} raised by the native call
     */
    public int processFrame(short[] pcm) throws Exception {
        final int outcome;
        try {
            outcome = process(this.object, pcm);
        } catch (Exception cause) {
            throw new Exception(cause);
        }
        return outcome;
    }

    /** Passes the stored handle to the native {@code delete} call. */
    public void delete() {
        delete(this.object);
    }

    public native int getFrameLength();

    public native int getSampleRate();

    private native long init(String dllLocation, String modelFilePath, float sensitivities, String[] wakePhrasePaths);

    private native int process(long object, short[] pcm);

    private native void delete(long object);
}
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/linux_compile.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | The following guide describes how to create the ```native/linux/libjni_porcupine.so``` dynamic library. This library is a light JNI 3 | wrapper over the ```libpv_porcupine.so``` Porcupine dynamic library. Many thanks to [Olical](https://github.com/Olical/clojure-wake-word-detection) for 4 | this compilation process. If someone wants to make a script that does this all automatically, have at it. 5 | 6 | ### Compiling for Linux 7 | 1) First, make a directory for Porcupine to reside in: ```mkdir -p wake-engine``` 8 | 2) Clone: ```cd wake-engine && git clone git@github.com:Picovoice/Porcupine.git``` 9 | 3) Return to the top-level directory: ```VocalCord/``` 10 | 4) Create the JNI header file: ```javac -h native/jni src/main/java/wakeup/Porcupine.java``` 11 | 5) Compile: ```gcc -shared -O3 -I/usr/include -I/usr/lib/jvm/{YOUR-JVM-EDITION}/include -I/usr/lib/jvm/{YOUR-JVM-EDITION}/include/linux -Iwake-engine/Porcupine/include -Inative/jni src/main/c/porcupine.c -o native/linux/libjni_porcupine.so -fPIC``` 12 | - Example: ```gcc -shared -O3 -I/usr/include -I/usr/lib/jvm/java-14-oracle/include -I/usr/lib/jvm/java-14-oracle/include/linux -Iwake-engine/Porcupine/include -Inative/jni src/main/c/porcupine.c -o native/linux/libjni_porcupine.so -fPIC``` 13 | -------------------------------------------------------------------------------- /src/main/java/wakeup/Porcupine.java: -------------------------------------------------------------------------------- 1 | package wakeup; 2 | 3 | import
vocalcord.VocalCord; 4 | 5 | // sudo apt-get install openjdk-8-jdk-headless 6 | public class Porcupine { 7 | private final long object; 8 | 9 | private static final VocalCord.Config CONFIG = VocalCord.getConfig(); 10 | 11 | static { 12 | System.load(CONFIG.jniLocation); 13 | } 14 | 15 | public Porcupine() throws Exception { 16 | try { 17 | object = init(CONFIG.porcupineLocation, CONFIG.porcupineParams, CONFIG.sensitivity, CONFIG.wakePhrasePaths); 18 | } catch(Exception e) { 19 | throw new Exception(e); 20 | } 21 | } 22 | 23 | public int processFrame(short[] pcm) throws Exception { 24 | try { 25 | return process(object, pcm); 26 | } catch (Exception e) { 27 | throw new Exception(e); 28 | } 29 | } 30 | 31 | public void delete() { 32 | delete(object); 33 | } 34 | 35 | public native int getFrameLength(); 36 | 37 | public native int getSampleRate(); 38 | 39 | private native long init(String dllLocation, String modelFilePath, float sensitivities, String[] wakePhrasePaths); 40 | 41 | private native int process(long object, short[] pcm); 42 | 43 | private native void delete(long object); 44 | } 45 | -------------------------------------------------------------------------------- /.idea/jarRepositories.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 9 | 10 | 14 | 15 | 19 | 20 | 24 | 25 | 29 | 30 | -------------------------------------------------------------------------------- /native/jni/wakeup_Porcupine.h: -------------------------------------------------------------------------------- 1 | /* DO NOT EDIT THIS FILE - it is machine generated */ 2 | #include 3 | /* Header for class wakeup_Porcupine */ 4 | 5 | #ifndef _Included_wakeup_Porcupine 6 | #define _Included_wakeup_Porcupine 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | /* 11 | * Class: wakeup_Porcupine 12 | * Method: getFrameLength 13 | * Signature: ()I 14 | */ 15 | JNIEXPORT jint JNICALL Java_wakeup_Porcupine_getFrameLength 16 | (JNIEnv *, jobject); 
17 | 18 | /* 19 | * Class: wakeup_Porcupine 20 | * Method: getSampleRate 21 | * Signature: ()I 22 | */ 23 | JNIEXPORT jint JNICALL Java_wakeup_Porcupine_getSampleRate 24 | (JNIEnv *, jobject); 25 | 26 | /* 27 | * Class: wakeup_Porcupine 28 | * Method: init 29 | * Signature: (Ljava/lang/String;Ljava/lang/String;F[Ljava/lang/String;)J 30 | */ 31 | JNIEXPORT jlong JNICALL Java_wakeup_Porcupine_init 32 | (JNIEnv *, jobject, jstring, jstring, jfloat, jobjectArray); 33 | 34 | /* 35 | * Class: wakeup_Porcupine 36 | * Method: process 37 | * Signature: (J[S)I 38 | */ 39 | JNIEXPORT jint JNICALL Java_wakeup_Porcupine_process 40 | (JNIEnv *, jobject, jlong, jshortArray); 41 | 42 | /* 43 | * Class: wakeup_Porcupine 44 | * Method: delete 45 | * Signature: (J)V 46 | */ 47 | JNIEXPORT void JNICALL Java_wakeup_Porcupine_delete 48 | (JNIEnv *, jobject, jlong); 49 | 50 | #ifdef __cplusplus 51 | } 52 | #endif 53 | #endif 54 | -------------------------------------------------------------------------------- /src/main/java/vocalcord/AudioSendMultiplexer.java: -------------------------------------------------------------------------------- 1 | package vocalcord; 2 | 3 | import net.dv8tion.jda.api.audio.AudioSendHandler; 4 | 5 | import javax.annotation.Nullable; 6 | import java.nio.ByteBuffer; 7 | 8 | // Used for multiplex mode Switch and Blend 9 | public class AudioSendMultiplexer implements AudioSendHandler { 10 | 11 | private final VocalCord.Config.SendMultiplex sendMultiplex; 12 | private final TTSEngine ttsEngine; 13 | 14 | private AudioSendHandler currentProvider; 15 | 16 | public AudioSendMultiplexer(TTSEngine engine, VocalCord.Config.SendMultiplex multiplex) { 17 | this.ttsEngine = engine; 18 | this.sendMultiplex = multiplex; 19 | } 20 | 21 | @Override 22 | public boolean canProvide() { 23 | if(sendMultiplex.mode == VocalCord.Config.SendMultiplex.MultiplexMode.Switch) { 24 | if(ttsEngine.canProvide()) { 25 | currentProvider = ttsEngine; 26 | return true; 27 | } else 
if(sendMultiplex.handlers[0].canProvide()) { 28 | currentProvider = sendMultiplex.handlers[0]; 29 | return true; 30 | } 31 | } 32 | 33 | return false; 34 | } 35 | 36 | @Nullable 37 | @Override 38 | public ByteBuffer provide20MsAudio() { 39 | return currentProvider.provide20MsAudio(); 40 | } 41 | 42 | @Override 43 | public boolean isOpus() { 44 | return currentProvider.isOpus(); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /docs/windows_compile.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | The following guide describes how to create the ```native/windows/libjni_porcupine.dll``` dynamic library. This library is a light JNI 3 | wrapper over the ```libpv_porcupine.dll``` Porcupine dynamic library. If someone wants to make a script that does this all automatically, have at it. 4 | 5 | ### Compiling for Windows 6 | 1) First, make a directory for Porcupine to reside in: ```mkdir -p wake-engine``` 7 | 2) Clone: ```cd wake-engine && git clone git@github.com:Picovoice/Porcupine.git``` 8 | 2) Ensure back in top directory: ```VocalCord/``` 9 | 3) Create the JNI header file: ```javac -h native/jni src/main/java/wakeup/Porcupine.java``` 10 | 4) You will need to have Microsoft Visual Studio installed using C/C++ build tools. Then, click start, go to the Visual Studio 2019 folder and 11 | launch ```x86_x64 Cross Tools Command Prompt for VS 2019```. Then, within that prompt, CD to the ```VocalCord/``` directory. You can also optionally use MinGW 64-bit for Windows (skip to step 8). 
12 | 5) Compile using: ```cl /I "wake-engine\Porcupine\include" /I "native\jni" /I "C:\Program Files\Java\{YOUR-JAVA-VERSION-HERE}\include" /I "C:\Program Files\Java\{YOUR-JAVA-VERSION-HERE}\include\win32" /LD src/main/c/porcupine.c``` 13 | - Example: ```cl /I "wake-engine\Porcupine\include" /I "native\jni" /I "C:\Program Files\Java\jdk-14.0.1\include" /I "C:\Program Files\Java\jdk-14.0.1\include\win32" /LD src/main/c/porcupine.c``` 14 | 7) This will create a bunch of files in the ```VocalCord``` directory. I can't figure out how to get them to go to ```native/windows``` automatically (```/OUT``` isn't working for me), 15 | so you will need to manually rename ```porcupine.dll``` to ```libjni_porcupine.dll``` and move it into the ```native/windows``` directory. 16 | 8) If you are using the [64-bit version of MinGW](http://mingw-w64.org/doku.php/download/mingw-builds) for Windows, you can compile using: ```gcc -shared -O3 -I "C:\Program Files\Java\{YOUR-JAVA-VERSION}\include" -I "C:\Program Files\Java\{YOUR-JAVA-VERSION}\include\win32" -I wake-engine/Porcupine/include -I native/jni src/main/c/porcupine.c -o native/windows/libjni_porcupine.dll``` 17 | - Example: ```gcc -shared -O3 -I "C:\Program Files\Java\jdk-14.0.1\include" -I "C:\Program Files\Java\jdk-14.0.1\include\win32" -I wake-engine/Porcupine/include -I native/jni src/main/c/porcupine.c -o native/windows/libjni_porcupine.dll``` -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if
"%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS= 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 
55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /src/main/java/vocalcord/TTSEngine.java: -------------------------------------------------------------------------------- 1 | package vocalcord; 2 | 3 | import com.google.cloud.texttospeech.v1beta1.*; 4 | import com.google.protobuf.ByteString; 5 | import net.dv8tion.jda.api.audio.AudioSendHandler; 6 | 7 | import javax.annotation.Nullable; 8 | import java.nio.ByteBuffer; 9 | 10 | /* 11 | * Text to speech engine 12 | */ 13 | class TTSEngine implements AudioSendHandler { 14 | 15 | public static final int AUDIO_FRAME = 3840; // 48000 / 50 (number of 20 ms in a second) * 2 (16-bit samples) * 2 (channels) 16 | 17 | private byte[] out; 18 | private int index; 19 | private ByteBuffer lastFrame; 20 | 21 | private TTSCache ttsCache; 22 | 23 | TTSEngine() { 24 | this.out = new byte[0]; 25 | 26 | // Load the cache 27 | if(VocalCord.getConfig().usingTTS && VocalCord.getConfig().usingTTSCache) { 28 | try { 29 | ttsCache = new TTSCache(); 30 | } catch(Exception e) { 31 | e.printStackTrace(); 32 | } 33 | } 34 | } 35 | 36 | byte[] tts(String 
text) throws Exception { 37 | try(TextToSpeechClient client = TextToSpeechClient.create()) { 38 | SynthesisInput input = SynthesisInput.newBuilder().setSsml(text).build(); 39 | 40 | VoiceSelectionParams voice = VoiceSelectionParams.newBuilder().setLanguageCode(VocalCord.getConfig().languageCode).setSsmlGender(VocalCord.getConfig().voiceGender).build(); 41 | 42 | AudioConfig audioConfig = AudioConfig.newBuilder().setAudioEncoding(AudioEncoding.LINEAR16).setSampleRateHertz(48_000).build(); 43 | 44 | SynthesizeSpeechResponse response = client.synthesizeSpeech(input, voice, audioConfig); 45 | 46 | ByteString audioContents = response.getAudioContent(); 47 | 48 | byte[] pcm = audioContents.toByteArray(); 49 | 50 | // Three things need to happen - big endian, stereo, pad to a multiple of 3840 51 | // Add a frame of silence at the beginning so that the sound doesn't clip weirdly 52 | byte[] converted = new byte[AUDIO_FRAME + pcm.length * 2 + (AUDIO_FRAME - pcm.length * 2 % AUDIO_FRAME)]; 53 | // ensures converted is a multiple of AUDIO_FRAME 54 | for(int i = AUDIO_FRAME; i < pcm.length; i += 2) { 55 | short reversed = Short.reverseBytes((short) ((pcm[i] << 8) | (pcm[i + 1] & 0xFF))); 56 | byte low = (byte) (reversed >> 8); 57 | byte high = (byte) (reversed & 0x00FF); 58 | 59 | // reverse bytes and double to convert to stereo 60 | converted[i * 2] = low; 61 | converted[i * 2 + 1] = high; 62 | converted[i * 2 + 2] = low; 63 | converted[i * 2 + 3] = high; 64 | } 65 | 66 | return converted; 67 | } 68 | } 69 | 70 | void say(String phrase) throws Exception { 71 | this.index = Integer.MAX_VALUE; 72 | 73 | if(ttsCache != null) { 74 | TTSCache.CacheResponse response = ttsCache.checkCache(phrase); 75 | 76 | byte[] data = tts(phrase); 77 | 78 | if(response.pcmIfCached != null) { 79 | this.out = response.pcmIfCached; 80 | } else { 81 | this.out = data; 82 | } 83 | 84 | if(response.shouldCache) { 85 | ttsCache.cache(phrase, data); 86 | } 87 | } else { 88 | this.out = tts(phrase); 89 | 
} 90 | 91 | this.index = 0; 92 | } 93 | 94 | @Override 95 | public boolean canProvide() { 96 | boolean provide = index < out.length; 97 | 98 | if(provide) { 99 | lastFrame = ByteBuffer.wrap(out, index, AUDIO_FRAME); 100 | index += AUDIO_FRAME; 101 | 102 | if(index >= out.length) { 103 | VocalCord.getConfig().callbacks.onTTSCompleted(); 104 | } 105 | } 106 | 107 | return provide; 108 | } 109 | 110 | @Nullable 111 | @Override 112 | public ByteBuffer provide20MsAudio() { 113 | return lastFrame; 114 | } 115 | 116 | @Override 117 | public boolean isOpus() { 118 | return false; 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/main/java/vocalcord/TTSCache.java: -------------------------------------------------------------------------------- 1 | package vocalcord; 2 | 3 | import java.io.*; 4 | import java.util.HashMap; 5 | import java.util.concurrent.*; 6 | 7 | class TTSCache { 8 | 9 | private final File cacheFile; 10 | 11 | private static final int FREQUENT_THRESHOLD = 15; 12 | 13 | private final HashMap cachedPhrases; 14 | 15 | // Phrases that TTSCache is currently monitoring to decide if they are frequent enough to be cached 16 | private final HashMap considerations = new HashMap<>(); 17 | 18 | // Manages some caching related jobs, like IO or periodically cleaning up the hashmap 19 | private final ThreadPoolExecutor cacheService = new ThreadPoolExecutor(2, 4, 30, TimeUnit.SECONDS, new LinkedBlockingQueue<>()); 20 | 21 | static class CacheResponse { 22 | byte[] pcmIfCached; 23 | boolean shouldCache; 24 | 25 | public static CacheResponse phraseAlreadyCached(byte[] pcmIfCached) { 26 | CacheResponse r = new CacheResponse(); 27 | r.pcmIfCached = pcmIfCached; 28 | return r; 29 | } 30 | 31 | public static CacheResponse shouldCachePhrase() { 32 | CacheResponse r = new CacheResponse(); 33 | r.shouldCache = true; 34 | return r; 35 | } 36 | 37 | public static CacheResponse doNothing() { 38 | CacheResponse r = new 
/**
 * Cache of synthesized phrases, persisted to {@code ~/.vocalcord/vocalcord_phrases.cache}.
 *
 * A phrase becomes eligible for caching once it has been requested {@link #FREQUENT_THRESHOLD}
 * times between two daily resets of the request counters. Phrases are compared after lower-casing
 * and removing all whitespace.
 *
 * The maps are read and written by callers, by the daily reset task and by the background save
 * jobs, so every access happens while holding this instance's monitor.
 */
class TTSCache {

    private static final int FREQUENT_THRESHOLD = 15;

    private static final java.util.regex.Pattern WHITESPACE = java.util.regex.Pattern.compile("\\s+");

    private final File cacheFile;

    // Scrubbed phrase -> converted audio
    private final HashMap<String, byte[]> cachedPhrases;

    // Phrases that TTSCache is currently monitoring to decide if they are frequent enough to be cached
    private final HashMap<String, Integer> considerations = new HashMap<>();

    // Manages some caching related jobs, like IO or periodically cleaning up the hashmap
    private final ThreadPoolExecutor cacheService = new ThreadPoolExecutor(2, 4, 30, TimeUnit.SECONDS, new LinkedBlockingQueue<>());

    // Serializes writes so two save jobs never interleave output into the same file
    private final Object saveLock = new Object();

    /** Outcome of {@link #checkCache(String)}. */
    static class CacheResponse {
        // Cached audio, or null when the phrase is not cached
        byte[] pcmIfCached;
        // true when the caller should synthesize the phrase and pass it to cache()
        boolean shouldCache;

        public static CacheResponse phraseAlreadyCached(byte[] pcmIfCached) {
            CacheResponse r = new CacheResponse();
            r.pcmIfCached = pcmIfCached;
            return r;
        }

        public static CacheResponse shouldCachePhrase() {
            CacheResponse r = new CacheResponse();
            r.shouldCache = true;
            return r;
        }

        public static CacheResponse doNothing() {
            CacheResponse r = new CacheResponse();
            r.shouldCache = false;
            r.pcmIfCached = null;
            return r;
        }
    }

    /**
     * Opens the cache, creating the cache directory and an empty cache file when they are missing.
     *
     * @throws RuntimeException if the directory or the file cannot be created
     * @throws Exception        if creating the file fails with an I/O error
     */
    TTSCache() throws Exception {
        cacheFile = new File(System.getProperty("user.home") + File.separator + ".vocalcord" + File.separator + "vocalcord_phrases.cache");

        if(cacheFile.exists()) {
            cachedPhrases = load();
        } else {
            // mkdir() returns false for a directory that already exists, so check for it first;
            // otherwise a deleted cache file could never be recreated.
            File directory = cacheFile.getParentFile();
            boolean directoryReady = directory.isDirectory() || directory.mkdirs();

            if(!directoryReady || !(cacheFile.createNewFile() || cacheFile.isFile())) {
                throw new RuntimeException("Error creating cache file");
            }

            cachedPhrases = new HashMap<>();
        }

        // Let idle save workers end after the keep-alive instead of staying around forever
        cacheService.allowCoreThreadTimeOut(true);

        // Clear considerations every day, this means that a phrase can only be frequent if FREQUENT_THRESHOLD
        // is acquired in a day. The first reset runs after one day: clearing the still-empty map at
        // start-up did nothing useful and could wipe counts recorded right after construction.
        // The worker is a daemon so this housekeeping never keeps the JVM alive.
        ScheduledExecutorService streamDaemon = Executors.newScheduledThreadPool(1, task -> {
            Thread worker = new Thread(task, "vocalcord-tts-cache-reset");
            worker.setDaemon(true);
            return worker;
        });
        streamDaemon.scheduleAtFixedRate(this::clearConsiderations, 1, 1, TimeUnit.DAYS);
    }

    /**
     * Looks up {@code phrase} and counts the request when it is not cached yet.
     *
     * @param phrase the phrase about to be spoken
     * @return the cached audio, or a request to cache once the phrase reached the threshold,
     *         or a response with neither
     */
    synchronized CacheResponse checkCache(String phrase) {
        String cleaned = scrubPhrase(phrase);

        byte[] pcm = cachedPhrases.get(cleaned);

        if(pcm != null) {
            return CacheResponse.phraseAlreadyCached(pcm);
        }

        int count = considerations.merge(cleaned, 1, Integer::sum);

        return count >= FREQUENT_THRESHOLD ? CacheResponse.shouldCachePhrase() : CacheResponse.doNothing();
    }

    /**
     * Stores {@code pcm} for {@code phrase} and schedules a background write of the cache file.
     *
     * @param phrase the phrase the audio belongs to
     * @param pcm    the converted audio
     */
    void cache(String phrase, byte[] pcm) {
        synchronized(this) {
            cachedPhrases.put(scrubPhrase(phrase), pcm);
        }

        cacheService.execute(this::save);
    }

    /** Normalizes a phrase into its cache key: lower case (locale independent), all whitespace removed. */
    private static String scrubPhrase(String phrase) {
        return WHITESPACE.matcher(phrase.toLowerCase(java.util.Locale.ROOT)).replaceAll("");
    }

    /** Forgets all request counters. */
    private synchronized void clearConsiderations() {
        considerations.clear();
    }

    /** Writes the current cache contents to the cache file; failures are logged and otherwise ignored. */
    private void save() {
        synchronized(saveLock) {
            // Copy under the instance monitor so serialization never walks a map that is being modified.
            // Copying inside saveLock keeps the writes in snapshot order.
            HashMap<String, byte[]> snapshot;
            synchronized(this) {
                snapshot = new HashMap<>(cachedPhrases);
            }

            try(ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(cacheFile))) {
                oos.writeObject(snapshot);
            } catch(Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Reads the cache file.
     *
     * @return the stored phrases, or an empty map when the file is empty or unreadable
     */
    private HashMap<String, byte[]> load() {
        try(ObjectInputStream ois = new ObjectInputStream(new FileInputStream(cacheFile))) {
            @SuppressWarnings("unchecked") // the file is only ever written by save() with this exact type
            HashMap<String, byte[]> map = (HashMap<String, byte[]>) ois.readObject();
            return map;
        } catch(Exception e) {
            System.out.println("No cached phrases loaded. This probably isn't an error.");
        }

        return new HashMap<>();
    }

}
48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? 
-ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" 
"$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Escape application args 158 | save () { 159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 160 | echo " " 161 | } 162 | APP_ARGS=$(save "$@") 163 | 164 | # Collect all arguments for the java command, following the shell quoting and substitution rules 165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 166 | 167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 169 | cd "$(dirname "$0")" 170 | fi 171 | 172 | exec "$JAVACMD" "$@" 173 | -------------------------------------------------------------------------------- /src/main/java/vocalcord/STTEngine.java: -------------------------------------------------------------------------------- 1 | package vocalcord; 2 | 3 | import com.google.cloud.speech.v1.*; 4 | import com.google.protobuf.ByteString; 5 | import net.dv8tion.jda.api.audio.AudioReceiveHandler; 6 | import net.dv8tion.jda.api.audio.UserAudio; 7 | 8 | import javax.annotation.Nonnull; 9 | import java.util.ArrayList; 10 | import java.util.HashMap; 11 | import java.util.List; 12 | import java.util.concurrent.*; 13 | 14 | class STTEngine implements AudioReceiveHandler { 15 | 16 | private final HashMap streams = new HashMap<>(); 17 | 18 | private final ThreadPoolExecutor workPool = new ThreadPoolExecutor(8, 16, 1, TimeUnit.MINUTES, new LinkedBlockingQueue<>()); 19 | 20 | private class StreamMonitor implements Runnable { 21 | 22 | @Override 23 | public void run() { 24 | for(String userId : streams.keySet()) { 25 | UserStream us = 
streams.get(userId); 26 | 27 | if(us.readyForTranscription()) { 28 | byte[] audio = us.getAudioForGoogle(); 29 | us.sleep(); 30 | 31 | workPool.execute(() -> { 32 | List results = speechRecognition(audio); 33 | 34 | VocalCord.Callbacks callbacks = VocalCord.getConfig().callbacks; 35 | 36 | CommandChain chain = callbacks.onTranscribed(); 37 | if(chain != null) { 38 | CommandChain.TaskCandidate max = null; 39 | double maxSimilarity = -1; 40 | 41 | for(SpeechRecognitionResult result : results) { 42 | for(SpeechRecognitionAlternative sra : result.getAlternativesList()) { 43 | CommandChain.TaskCandidate ccs = chain.score(sra.getTranscript()); 44 | if(ccs != null && ccs.score > maxSimilarity) { 45 | maxSimilarity = ccs.score; 46 | max = ccs; 47 | } 48 | } 49 | } 50 | 51 | chain.fulfillTaskCandidate(us.getUser(), max); 52 | } else { 53 | if(results.size() > 0 && results.get(0).getAlternativesList().size() > 0) { 54 | callbacks.onTranscribed(us.getUser(), results.get(0).getAlternatives(0).getTranscript()); 55 | } 56 | } 57 | }); 58 | } else if(us.shouldDestroy()) { 59 | us.sleep(); 60 | us.destroy(); 61 | streams.remove(us.getUser().getId()); 62 | } 63 | } 64 | } 65 | } 66 | 67 | STTEngine() { 68 | ScheduledExecutorService streamDaemon = Executors.newScheduledThreadPool(1); 69 | streamDaemon.scheduleAtFixedRate(new StreamMonitor(), 0, 1000, TimeUnit.MICROSECONDS); // 1 ms 70 | } 71 | 72 | @Override 73 | public boolean canReceiveUser() { 74 | return true; 75 | } 76 | 77 | @Override 78 | public void handleUserAudio(@Nonnull UserAudio userAudio) { 79 | if(!streams.containsKey(userAudio.getUser().getId())) { 80 | try { 81 | /* 82 | * Don't track the audio of users who aren't allowed to wake, otherwise resources would be needlessly wasted 83 | * with another Porcupine instance 84 | */ 85 | if(VocalCord.getConfig().callbacks.canWakeBot(userAudio.getUser())) { 86 | UserStream stream = new UserStream(userAudio.getUser()); 87 | stream.putAudio(workPool, userAudio.getAudioData(1)); 
88 | streams.put(userAudio.getUser().getId(), stream); 89 | } 90 | } catch(Exception e) { 91 | e.printStackTrace(); 92 | } 93 | } else { 94 | streams.get(userAudio.getUser().getId()).putAudio(workPool, userAudio.getAudioData(1)); 95 | } 96 | } 97 | 98 | private List speechRecognition(byte[] pcm) { 99 | try(SpeechClient speech = SpeechClient.create()) { 100 | ByteString audioBytes = ByteString.copyFrom(pcm); 101 | 102 | // Configure request with local raw PCM speechRecognition 103 | RecognitionConfig config = RecognitionConfig.newBuilder() 104 | .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16) 105 | .setLanguageCode("en-US") 106 | .setSampleRateHertz(16000) 107 | .build(); 108 | RecognitionAudio audio = RecognitionAudio.newBuilder() 109 | .setContent(audioBytes) 110 | .build(); 111 | 112 | // Use blocking call to get speechRecognition transcript 113 | RecognizeResponse response = speech.recognize(config, audio); 114 | 115 | return response.getResultsList(); 116 | } catch(Exception e) { 117 | e.printStackTrace(); 118 | System.err.println("Failed to run Google Cloud speech recognition. 
Err: " + e.getMessage()); 119 | } 120 | return new ArrayList<>(); 121 | } 122 | 123 | // private double volumeRMS(byte[] raw) { // needs more testing 124 | // double sum = 0d; 125 | // if(raw.length == 0) { 126 | // return sum; 127 | // } else { 128 | // for(byte aRaw : raw) { 129 | // sum += aRaw; 130 | // } 131 | // } 132 | // double average = sum / raw.length; 133 | // 134 | // double sumMeanSquare = 0d; 135 | // for(byte aRaw : raw) { 136 | // sumMeanSquare += Math.pow(aRaw - average, 2d); 137 | // } 138 | // double averageMeanSquare = sumMeanSquare / raw.length; 139 | // return Math.sqrt(averageMeanSquare); 140 | // } 141 | 142 | } 143 | -------------------------------------------------------------------------------- /src/main/java/vocalcord/UserStream.java: -------------------------------------------------------------------------------- 1 | package vocalcord; 2 | 3 | import net.dv8tion.jda.api.audio.AudioReceiveHandler; 4 | import net.dv8tion.jda.api.entities.User; 5 | import wakeup.Porcupine; 6 | 7 | import javax.sound.sampled.AudioFormat; 8 | import javax.sound.sampled.AudioInputStream; 9 | import javax.sound.sampled.AudioSystem; 10 | import java.io.ByteArrayInputStream; 11 | import java.util.concurrent.ThreadPoolExecutor; 12 | 13 | public class UserStream { 14 | 15 | private boolean awake; 16 | private byte[] phrase; 17 | private boolean phraseBegun; 18 | private long lastReceivedPacket; 19 | private int index; 20 | private long lastAudioReceived = -1; 21 | 22 | private final VocalCord.Config config = VocalCord.getConfig(); 23 | 24 | private Porcupine porcupine; 25 | 26 | /* 27 | * Converts Discord PCM to the audio format required by Porcupine 28 | */ 29 | private static class PorcupineAdapter { 30 | private static final int AUDIO_FRAME = 512; 31 | private final short[] pcm; 32 | private int index = 0; 33 | 34 | public PorcupineAdapter(byte[] raw) { 35 | // Down-samples audio to 16 KHz and combines bytes into shorts 36 | pcm = new short[raw.length / 12 + 
(AUDIO_FRAME - raw.length / 12 % AUDIO_FRAME)]; 37 | 38 | for(int i = 0, j = 0; i < raw.length; i += 12, j++) { 39 | pcm[j] = (short) ((raw[i] << 8) | (raw[i + 1] & 0xFF)); 40 | } 41 | } 42 | 43 | public boolean hasNext() { 44 | return index < pcm.length; 45 | } 46 | 47 | public short[] take() { 48 | short[] frame = new short[AUDIO_FRAME]; 49 | System.arraycopy(pcm, index, frame, index, index + AUDIO_FRAME - index); 50 | index += AUDIO_FRAME; 51 | return frame; 52 | } 53 | } 54 | 55 | private final User user; 56 | 57 | public UserStream(User user) throws Exception { 58 | this.user = user; 59 | 60 | if(!VocalCord.getConfig().captioning) { 61 | porcupine = new Porcupine(); 62 | 63 | if(porcupine.getFrameLength() != 512 || porcupine.getSampleRate() != 16000) { 64 | throw new RuntimeException("The underlying porcupine binaries do not have the expected configuration."); 65 | } 66 | } 67 | } 68 | 69 | public void putAudio(ThreadPoolExecutor workPool, byte[] audio) { 70 | lastAudioReceived = System.nanoTime(); 71 | 72 | if(!awake && !VocalCord.getConfig().captioning) { 73 | try { 74 | PorcupineAdapter pa = new PorcupineAdapter(audio); 75 | while(pa.hasNext()) { 76 | int keywordIndex = porcupine.processFrame(pa.take()); 77 | 78 | if(keywordIndex != -1) { 79 | workPool.execute(() -> VocalCord.getConfig().callbacks.onWake(this, keywordIndex)); 80 | wakeup(); 81 | } 82 | } 83 | 84 | } catch(Exception e) { 85 | e.printStackTrace(); 86 | } 87 | } else if(!awake && VocalCord.getConfig().captioning) { 88 | wakeup(); 89 | } else { 90 | // Resize the array if needed 91 | if(index + audio.length >= phrase.length) { 92 | byte[] resized = new byte[phrase.length * 2]; 93 | System.arraycopy(phrase, 0, resized, 0, phrase.length); 94 | phrase = resized; 95 | } 96 | 97 | // Concatenate on the audio data 98 | System.arraycopy(audio, 0, phrase, index, audio.length); 99 | index += audio.length; 100 | lastReceivedPacket = System.nanoTime(); 101 | 102 | if(index >= 3840 * 50 / 2) { // need half 
a second of audio to consider phraseBegun 103 | phraseBegun = true; 104 | } 105 | } 106 | } 107 | 108 | private void wakeup() { 109 | awake = true; 110 | phrase = new byte[3840 * 50 * 5]; // by default, holds 5 seconds of data 111 | phraseBegun = false; 112 | lastReceivedPacket = System.nanoTime(); 113 | index = 0; 114 | } 115 | 116 | boolean readyForTranscription() { 117 | if(!awake) return false; 118 | 119 | // if no packet received in last 2 seconds & phraseBegun, then transcribe 120 | // if no packet received in last 5 seconds & !phraseBegun, then don't transcribe 121 | // if captioning, transcribe every 5 minutes anyway 122 | // if certain limit reached (15 seconds), transcribe 123 | // future: trailing audio volume is 30% or something less than the average of the whole 124 | 125 | long elapsedMs = (System.nanoTime() - lastReceivedPacket) / 1_000_000; 126 | 127 | if(phraseBegun && elapsedMs >= config.postPhraseTimeout) { 128 | return true; 129 | } else if(!phraseBegun && elapsedMs >= config.postWakeLimit) { 130 | sleep(); 131 | return false; // user never started talking after waking bot 132 | } else if(phraseBegun && VocalCord.getConfig().captioning && phrase.length > 3840 * 50 * VocalCord.getConfig().captioningChunkSize) { 133 | return true; 134 | } else if(phrase.length > 3840 * 50 * config.maxPhraseTime) { 135 | return true; 136 | } 137 | 138 | return false; 139 | } 140 | 141 | boolean shouldDestroy() { 142 | double elapsedMinutes = (System.nanoTime() - lastAudioReceived) / 1_000_000_000.0 / 60.0; 143 | 144 | return !awake && elapsedMinutes > 3840 * 50 * config.userStreamLife; 145 | } 146 | 147 | public void sleep() { 148 | awake = false; 149 | } 150 | 151 | byte[] getAudioForGoogle() { 152 | 153 | try { 154 | AudioFormat target = new AudioFormat(16000f, 16, 1, true, false); 155 | AudioInputStream is = AudioSystem.getAudioInputStream(target, 156 | new AudioInputStream(new ByteArrayInputStream(phrase), AudioReceiveHandler.OUTPUT_FORMAT, 157 | 
phrase.length)); 158 | 159 | return is.readAllBytes(); 160 | } catch(Exception e) { 161 | e.printStackTrace(); 162 | return null; 163 | } 164 | } 165 | 166 | void destroy() { 167 | porcupine.delete(); 168 | } 169 | 170 | public User getUser() { 171 | return user; 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /src/main/java/example/ExampleBot.java: -------------------------------------------------------------------------------- 1 | package example; 2 | 3 | import com.google.cloud.texttospeech.v1beta1.SsmlVoiceGender; 4 | import net.dv8tion.jda.api.JDA; 5 | import net.dv8tion.jda.api.JDABuilder; 6 | import net.dv8tion.jda.api.entities.Message; 7 | import net.dv8tion.jda.api.entities.User; 8 | import net.dv8tion.jda.api.entities.VoiceChannel; 9 | import net.dv8tion.jda.api.events.message.MessageReceivedEvent; 10 | import net.dv8tion.jda.api.hooks.ListenerAdapter; 11 | import vocalcord.CommandChain; 12 | import vocalcord.UserStream; 13 | import vocalcord.VocalCord; 14 | 15 | import java.util.ArrayList; 16 | 17 | public class ExampleBot extends ListenerAdapter implements VocalCord.Callbacks { 18 | 19 | private final VocalCord cord; 20 | 21 | public ExampleBot() { 22 | /* 23 | * This code will create the bot, make sure to specify the absolute paths of the files you downloaded to native to ensure all libraries 24 | * are loaded correctly. This is also where you can config VocalCord settings. 
25 | */ 26 | 27 | // Windows 28 | // cord = VocalCord.newConfig(this).withWakeDetection("C:\\Users\\wdavi\\IdeaProjects\\VocalCord\\native\\windows\\libjni_porcupine.dll", 29 | // "C:\\Users\\wdavi\\IdeaProjects\\VocalCord\\native\\windows\\libpv_porcupine.dll", "C:\\Users\\wdavi\\IdeaProjects\\VocalCord\\native\\porcupine_params.pv", 30 | // 0.5f, "C:\\Users\\wdavi\\IdeaProjects\\VocalCord\\phrases\\windows.ppn").withTTS(SsmlVoiceGender.MALE, 31 | // true).build(); 32 | 33 | // Windows (with closed captioning instead of wake detection) 34 | cord = VocalCord.newConfig(this).withClosedCaptioning().withTTS(SsmlVoiceGender.MALE, 35 | false).build(); 36 | 37 | // Linux (using WSL) 38 | // cord = VocalCord.newConfig(this).withWakeDetection("/mnt/c/Users/wdavi/IdeaProjects/VocalCord/native/linux/libjni_porcupine.so", 39 | // "/mnt/c/Users/wdavi/IdeaProjects/VocalCord/native/linux/libpv_porcupine.so", "/mnt/c/Users/wdavi/IdeaProjects/VocalCord/native/porcupine_params.pv", 40 | // 0.5f, "/mnt/c/Users/wdavi/IdeaProjects/VocalCord/phrases/linux.ppn").withTTS(SsmlVoiceGender.MALE, true).build(); 41 | } 42 | 43 | public static void main(String[] args) throws Exception { 44 | // Creates a JDA Discord instance and makes your bot go online 45 | JDA api = JDABuilder.createDefault(/* YOUR BOT TOKEN HERE */args[0]).build(); 46 | api.addEventListener(new ExampleBot()); 47 | } 48 | 49 | /* 50 | * This callback defines which users are allowed to access VocalCord. 51 | * Note, you want to be restrictive on this, especially for large servers, 52 | * running wake word detection on like 50+ users simultaneously is untested and 53 | * may affect performance. 54 | */ 55 | @Override 56 | public boolean canWakeBot(User user) { 57 | return true; 58 | } 59 | 60 | /* 61 | * This method is called when an authorized user (canWakeBot(..) 
returned true) 62 | * woke up the bot, the keywordIndex defines which keyword woke the bot (this depends 63 | * on the order you specified keywords to when setting up VocalCord) If you only have one 64 | * keyword, this will be 0. This method is useful for giving the user some feedback that the 65 | * bot is listening, here for example, the bot will say "Yes?" when it's woken up. Immediately after 66 | * this call, VocalCord will start generating a voice transcript of what the user said. If you want to cancel 67 | * voice recognition here, you can call userStream.sleep() 68 | */ 69 | @Override 70 | public void onWake(UserStream userStream, int keywordIndex) { 71 | cord.say("Yes?"); 72 | } 73 | 74 | /* 75 | * Note: There are two onTranscribed(..) methods, you should only use one of them (this one is better) 76 | * This callback is where you'll store all your voice commands. Importantly, voice transcripts aren't always 77 | * 100% accurate. If you hard code a list of commands, being off by just one word wouldn't register the command, 78 | * or trying to use lots of String.contains(..) calls could easily intermix commands. This callback employs 79 | * CommandChain, which will generate document vectors and a document term matrix in order to compute the cosine 80 | * similarity between a candidate transcription. Essentially, CommandChain will automatically run an algorithm to 81 | * determine which command was most likely said. This means that a user doesn't have to be 100% accurate on matching a command, 82 | * and instead only needs to capture the meaning of a command. 
83 | */ 84 | @Override 85 | public CommandChain onTranscribed() { 86 | return new CommandChain.Builder().addPhrase("hello world", (user, transcript, args) -> { 87 | cord.say(user.getName()+" said something"); 88 | }) 89 | .addPhrase("knock knock", (user, transcript, args) -> { 90 | cord.say("Who's there?"); 91 | }).withFallback(((user, transcript, args) -> { 92 | cord.say("I'm sorry, I didn't get that"); 93 | })).withMinThreshold(0.5f).build(); 94 | } 95 | 96 | @Override 97 | public void onMessageReceived(MessageReceivedEvent event) { 98 | // Don't process messages from other bots 99 | if(event.getAuthor().isBot()) return; 100 | 101 | Message message = event.getMessage(); 102 | String content = message.getContentRaw(); 103 | 104 | if(content.startsWith("!say")) { 105 | cord.say(content.substring(5)); 106 | } 107 | 108 | /* 109 | * This is a basic summon command that will summon the bot to 110 | * whatever voice channel the author is in, this is a really basic 111 | * summon command, but you can develop more advanced scenarios where 112 | * the bot follows you around or whatever. 
113 | */ 114 | if(content.equals("!summon")) { 115 | event.getMessage().getChannel().sendMessage("On my way!").queue(); 116 | try { 117 | VoiceChannel authorVoiceChannel = event.getMember().getVoiceState().getChannel(); 118 | cord.connect(authorVoiceChannel); 119 | } catch(Exception e) { 120 | e.printStackTrace(); 121 | } 122 | } 123 | } 124 | 125 | @Override 126 | public void onTTSCompleted() { 127 | // If you want to do anything after the bot stops speaking 128 | } 129 | } -------------------------------------------------------------------------------- /.idea/modules/VocalCord_main.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /.idea/modules/VocalCord_test.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /src/main/c/porcupine.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #ifdef _WIN32 7 | #include 8 | 9 | typedef int (__cdecl * porcupine_init)(const char*, int32_t,const char*const*,const float *, pv_porcupine_t**); 
10 | typedef void (__cdecl * porcupine_delete)(pv_porcupine_t*); 11 | typedef pv_status_t (__cdecl * porcupine_process)(pv_porcupine_t*object,const int16_t*,int32_t*); 12 | typedef int32_t (__cdecl * porcupine_frame_length)(); 13 | typedef int (__cdecl * porcupine_sample_rate)(); 14 | 15 | porcupine_init f_init; 16 | porcupine_delete f_delete; 17 | porcupine_process f_process; 18 | porcupine_frame_length f_frame_length; 19 | porcupine_sample_rate f_sample_rate; 20 | 21 | bool loadDLL(const char * pv_porcupine_dll_location) { 22 | // Load DLL 23 | HINSTANCE proc = LoadLibrary(pv_porcupine_dll_location); 24 | 25 | if(!proc) { 26 | printf("Failed to load %s, is it in the right location?\n", pv_porcupine_dll_location); 27 | return false; 28 | } 29 | 30 | f_init = (porcupine_init)GetProcAddress(proc, "pv_porcupine_init"); 31 | f_process = (porcupine_process)GetProcAddress(proc, "pv_porcupine_process"); 32 | f_delete = (porcupine_delete)GetProcAddress(proc, "pv_porcupine_delete"); 33 | f_sample_rate = (porcupine_sample_rate)GetProcAddress(proc, "pv_sample_rate"); 34 | f_frame_length = (porcupine_frame_length)GetProcAddress(proc, "pv_porcupine_frame_length"); 35 | 36 | if(!f_init || !f_process || !f_delete || !f_sample_rate || !f_frame_length) { 37 | printf("Failed to locate required functions from the DLL. Is it corrupt?\n"); 38 | return false; 39 | } else { 40 | printf("Loaded porcupine JNI Porcupine DLL wrapper successfully.\n"); 41 | return true; 42 | } 43 | } 44 | #else // linux (maybe works for mac?) 
45 | #include 46 | 47 | int (*f_init)(const char*, int32_t,const char*const*,const float *, pv_porcupine_t**); 48 | void (*f_delete)(pv_porcupine_t*); 49 | pv_status_t (*f_process)(pv_porcupine_t*object,const int16_t*,int32_t*); 50 | int32_t (*f_frame_length)(); 51 | int (*f_sample_rate)(); 52 | 53 | bool loadSO(const char * pv_porcupine_so_location) { 54 | void * handle = dlopen(pv_porcupine_so_location, RTLD_LAZY); 55 | if(!handle) { 56 | printf("Failed to load %s, is it in the right location?\n", pv_porcupine_so_location); 57 | return false; 58 | } 59 | 60 | dlerror(); // clear any existing errors 61 | f_init = dlsym(handle, "pv_porcupine_init"); 62 | f_process = dlsym(handle, "pv_porcupine_process"); 63 | f_delete = dlsym(handle, "pv_porcupine_delete"); 64 | f_sample_rate = dlsym(handle, "pv_sample_rate"); 65 | f_frame_length = dlsym(handle, "pv_porcupine_frame_length"); 66 | 67 | if(dlerror() != NULL) { 68 | printf("Failed to locate required functions from the SO. Is it corrupt?\n"); 69 | return false; 70 | } else { 71 | printf("Loaded porcupine JNI Porcupine DLL wrapper successfully.\n"); 72 | return true; 73 | } 74 | } 75 | #endif 76 | 77 | JNIEXPORT jint JNICALL Java_wakeup_Porcupine_getFrameLength (JNIEnv *env, jobject obj) { 78 | #ifdef _WIN32 79 | return f_frame_length(); 80 | #else 81 | return (*f_frame_length)(); 82 | #endif 83 | } 84 | 85 | JNIEXPORT jint JNICALL Java_wakeup_Porcupine_getSampleRate (JNIEnv *env, jobject obj) { 86 | #ifdef _WIN32 87 | return f_sample_rate(); 88 | #else 89 | return (*f_sample_rate)(); 90 | #endif 91 | } 92 | 93 | JNIEXPORT jlong JNICALL Java_wakeup_Porcupine_init 94 | (JNIEnv *env, jobject obj, jstring porcupine_location_raw, jstring model_raw, jfloat sens, jobjectArray wakePhrasesRaw) { 95 | 96 | const char * porcupine_location = (*env)->GetStringUTFChars(env, porcupine_location_raw, 0); 97 | 98 | #ifdef _WIN32 99 | loadDLL(porcupine_location); 100 | #else 101 | loadSO(porcupine_location); 102 | #endif 103 | 104 | 
const char * model = (*env)->GetStringUTFChars(env, model_raw, 0); 105 | 106 | pv_porcupine_t *handle; 107 | 108 | int numWakePhrases = (*env)->GetArrayLength(env, wakePhrasesRaw); 109 | 110 | const char ** wakeup_phrase_paths = malloc(numWakePhrases * sizeof(char *)); 111 | 112 | float * sensArr = malloc(sizeof(float) * numWakePhrases); 113 | 114 | jstring ** raw_paths = malloc(sizeof(jstring*) * numWakePhrases); 115 | 116 | printf("Loading Porcupine with model path %s, sensitivity %f, and %d wake phrases\n", model, sens, numWakePhrases); 117 | 118 | for(int i = 0; i < numWakePhrases; i++) { 119 | sensArr[i] = sens; 120 | 121 | jstring path_raw = (jstring) (*env)->GetObjectArrayElement(env, wakePhrasesRaw, i); 122 | const char * path = (*env)->GetStringUTFChars(env, path_raw, 0); 123 | 124 | raw_paths[i] = &path_raw; 125 | wakeup_phrase_paths[i] = path; 126 | 127 | printf("Wake phrase added: %s\n", path); 128 | } 129 | 130 | #ifdef _WIN32 131 | const pv_status_t status = f_init(model, numWakePhrases, wakeup_phrase_paths, sensArr, &handle); 132 | #else 133 | const pv_status_t status = (*f_init)(model, numWakePhrases, wakeup_phrase_paths, sensArr, &handle); 134 | #endif 135 | 136 | if (status != PV_STATUS_SUCCESS) { 137 | printf("Error: Failed to initialise the Porcupine instance, code: %d\n", status); 138 | } 139 | 140 | for(int i = 0; i < numWakePhrases; i++) { 141 | (*env)->ReleaseStringUTFChars(env, *(raw_paths[i]), wakeup_phrase_paths[i]); 142 | } 143 | 144 | free(raw_paths); 145 | free(sensArr); 146 | free(wakeup_phrase_paths); 147 | 148 | (*env)->ReleaseStringUTFChars(env, model_raw, model); 149 | (*env)->ReleaseStringUTFChars(env, porcupine_location_raw, porcupine_location); 150 | 151 | return (long long)handle; 152 | } 153 | 154 | JNIEXPORT jint JNICALL Java_wakeup_Porcupine_process (JNIEnv *env, jobject obj, jlong handle, jshortArray pcm_raw) { 155 | jshort *pcm = (*env)->GetShortArrayElements(env, pcm_raw, 0); 156 | int32_t keyword_index; 157 | 158 | 
#ifdef _WIN32 159 | f_process((pv_porcupine_t*)handle, (int16_t *)pcm, &keyword_index); 160 | #else 161 | (*f_process)((pv_porcupine_t*)handle, (int16_t *)pcm, &keyword_index); 162 | #endif 163 | 164 | (*env)->ReleaseShortArrayElements(env, pcm_raw, pcm, 0); 165 | 166 | return keyword_index; 167 | } 168 | 169 | JNIEXPORT void JNICALL Java_wakeup_Porcupine_delete 170 | (JNIEnv *env, jobject obj, jlong handle) { 171 | #ifdef _WIN32 172 | f_delete((pv_porcupine_t*)handle); 173 | #else 174 | (*f_delete)((pv_porcupine_t*)handle); 175 | #endif 176 | } 177 | 178 | // For testing as a standalone executable without JNI 179 | //int main() { 180 | // loadDLL(); 181 | // 182 | // const char *model = "C:\\Users\\wdavi\\IdeaProjects\\VocalCord\\wake-engine\\Porcupine\\lib\\common\\porcupine_params.pv"; 183 | // const char * keyword = "C:\\Users\\wdavi\\IdeaProjects\\VocalCord\\wake-engine\\wake_phrase_win32.ppn"; 184 | // 185 | // printf("Initializing Porcupine, using keyword directory %s...\n", (char *)keyword); 186 | // printf("Settings file %s\n", model); 187 | // 188 | // pv_porcupine_t *handle; 189 | // 190 | // float sensArr[1]; 191 | // sensArr[0] = 1; 192 | // 193 | // const char * keyword_paths[1] = { keyword }; 194 | // 195 | // const pv_status_t status = f_init(model, 1, keyword_paths, sensArr, &handle); //pv_porcupine_init(model, 1, keyword_paths, sensArr, &handle); 196 | // 197 | // if (status != PV_STATUS_SUCCESS) { 198 | // printf("Error: Failed to initialise the Porcupine instance."); 199 | // } 200 | // 201 | // int32_t keyword_index; 202 | // 203 | // const int16_t pcm[512]; 204 | // 205 | // f_process((pv_porcupine_t*)handle, pcm, &keyword_index); 206 | // printf("%d", keyword_index); 207 | // 208 | // return (long long)handle; 209 | //} -------------------------------------------------------------------------------- /.idea/uiDesigner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 
11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VocalCord 2 | Giving Discord bots a voice with speech recognition and text to speech. 3 | 4 | # Future plans (help wanted) 5 | I'd like to create a "no code needed" version of this project, because as you may be able to tell, the manual setup described in this README is not exactly a cake walk. 6 | Specifically, I'd like to create a service that handles hosting, Google Speech-To-Text, and Porcupine (wakeword detection) for you. 7 | The service would have to be paid (something like $10/mo) because Google STT, hosting, and Porcupine all cost money. 8 | 9 | I don't want to get started on this until I know there is sufficient demand for it. I've created a [discussion thread](https://github.com/widavies/VocalCord/discussions/12#discussion-3354581) for this. 10 | 11 | # Donate 12 | If you find this project useful, a donation helps out a lot! 13 | [Donate](https://www.paypal.me/WillDaviesMN) 14 | 15 | # How does it work? 16 | - VocalCord is a _library_, not a standalone bot. VocalCord is built on the excellent [JDA](https://github.com/DV8FromTheWorld/JDA), providing a dead simple wrapper to receive voice transcripts and generate speech audio. VocalCord is a tool to build whatever your imagination decides. 
17 | - [Porcupine](https://github.com/Picovoice/porcupine) is used for wake detection, it's incredibly accurate and works consistently well. 18 | - [Google Speech to Text](https://cloud.google.com/speech-to-text) is used for speech recognition, it's decently fast and provides accurate transcripts. 19 | - [Google Text to Speech](https://cloud.google.com/text-to-speech/) is used for text to speech generation, it works great and is fast. 20 | - VocalCord officially supports Windows and Linux 21 | - Thanks to [Olical](https://github.com/Olical/clojure-wake-word-detection) for some great examples that really helped in developing the bot. 22 | 23 | # Setup 24 | ### Porcupine wake phrases 25 | Porcupine requires you to build a wake phrase AI model for every wake phrase you'd like to use. This process can take about 3 hours, 26 | so if you're eager to get started, do this right away. 27 | 28 | 1) Create a Porcupine account at [Picovoice Console](https://console.picovoice.ai/ppn) 29 | 2) Under the "Wake Words" utility, enter your wake phrase into the "Phrase" box. I haven't had much feedback yet about how carried away you can get with wake words, but as it takes three hours, I would recommend trying to choose crisp, unambiguous words that Porcupine is unlikely to get confused with similar words. 30 | 3) For Linux, select ```Linux (x86_64)```. For Windows, select ```Windows (x86_64)```. 31 | 4) Click "Train" to begin training the model. Check back in about 3 hours. 32 | 5) VocalCord supports multiple wake phrases at once or even different wake phrases for different users. Generate a wake phrase model for each wakeup phrase you'd like to use. 33 | ### Discord Bot 34 | 1) Go to the [Discord Developer Console](https://discordapp.com/developers/applications) and click "New application". 35 | 2) On the left sidebar of the application view, select "Bot" 36 | 3) Click "Add Bot" 37 | 4) Click "Copy" under the token header. 
This is your Discord bot token, put it in a safe place (keep it secret!). 38 | 5) Select the "OAuth2" tab on the left sidebar 39 | 6) Under "Scopes" make sure "bot" is checked. 40 | 7) Enable any permissions your bot will utilize under the "Bot permissions" header. You will need to check ```Connect```, ```Speak```, and ```Use Voice Activity``` to use speech recognition and generation facilities. 41 | 8) Discord will auto generate a link for you, copy this link and paste it into your browser. From here, you may select which server you'd like to add the bot to. 42 | ### Google APIs 43 | 1) Navigate to [Google Cloud Console](https://console.cloud.google.com/) 44 | 2) In the top left, select the projects drop down and create a new project. 45 | 3) Once your project is created, click the "APIs & Services" card. 46 | 4) From here, select the "Dashboard" tab on the left sidebar, click "Enable APIs and Services" 47 | 5) Search for and enable ```Cloud Speech-to-Text API``` and ```Cloud Text-to-Speech API``` 48 | 6) On the left sidebar, select "Credentials", then under "Service Accounts", select "Manage service accounts". Give your service account a name, and leave everything else at its default. You will need to click the "Create Key" button, make sure JSON is selected, and hit "Create". This will download a JSON file. This is your credentials for using Google APIs, keep it secret! Save it to a location where you will remember where it is. 49 | ### Environment 50 | #### Windows 51 | 1) You will need to add an environment variable named ```GOOGLE_APPLICATION_CREDENTIALS``` where its value is the path to the Google Credential JSON file you downloaded in the last step. 52 | 2) On Windows, open the start menu and search "Edit the system environment variables". 
Click "Environment Variables" and under System Variables, click "New" 53 | 3) For "Variable name", enter ```GOOGLE_APPLICATION_CREDENTIALS``` 54 | 4) For "Variable value", enter the path to your Google Credentials JSON, for example: ```C:\Users\wdavi\IdeaProjects\VocalCord\vocalcord-gcs.json```. It does not matter where you put this .json file on your system, as long as the PATH points correctly to it. 55 | #### Linux 56 | 1) Edit your ```.bashrc``` file by entering ```sudo nano ~/.bashrc``` 57 | 2) Add the line ```export GOOGLE_APPLICATION_CREDENTIALS="path-to-google-creds.JSON"``` to the end of the file and save. Example: ```export GOOGLE_APPLICATION_CREDENTIALS="/mnt/c/Users/wdavi/IdeaProjects/VocalCord/vocalcord-gcs.json"``` 58 | 3) Restart your terminal for this change to take effect. 59 | ### Java Project 60 | The recommended IDE is [InteliJ IDEA](https://www.jetbrains.com/idea/download/). 61 | 62 | 1) Download [Java SDK 12.0.2](https://jdk.java.net/archive/) and extract to ```C:\Program Files\Java```. Your installation folder should be something like ```C:\Program Files\Java\jdk-12.0.2```. If you're on Linux, run ```sudo apt-get install openjdk-12-jdk``` 63 | 1) Click ```New > New Project``` 64 | 2) On the left side panel, select ```Gradle```, and check ```Java```. 
65 | 3) Give the project a name and hit ```Finish``` 66 | 4) Ensure you are using JDK 12 67 | - ```File > Settings > Build, Execution, Deployment > Gradle > Gradle JVM``` should be set to your JDK 12 68 | - ```Right click project > Open Module Settings > Project > Project SDK``` should be set to your JDK 12 69 | - ```Right click project > Open Module Settings > Project > Project language level``` should be ```12 - No new language features``` 70 | - ```Right click project > Open Module Settings > Modules > Module SDK``` should be set to your JDK 12 71 | 5) Edit your ```build.gradle``` file to install ```VocalCord```: 72 | ``` 73 | repositories { 74 | mavenCentral() 75 | maven { url 'https://jitpack.io' } 76 | jcenter() 77 | } 78 | 79 | dependencies { 80 | implementation 'net.dv8tion:JDA:4.1.1_136' 81 | implementation 'com.google.cloud:google-cloud-speech:1.22.6' 82 | implementation 'com.google.cloud:google-cloud-texttospeech:1.0.2' 83 | implementation 'com.github.wdavies973:VocalCord:2.3' 84 | } 85 | ``` 86 | ### Dynamic Libraries 87 | VocalCord uses Porcupine for wake detection, however Porcupine does not support Java. Instead, VocalCord uses the Java Native Interface (JNI) to wrap the Porcupine C library in Java bindings. You will need to obtain the Porcupine dynamic library, as well as the VocalCord wrapper dynamic library. VocalCord will load the wrapper library, which will in turn load the Porcupine dynamic library. 
88 | #### Linux 89 | 1) Create a folder within your root project directory called "native", within this create a subdirectory labeled "linux" 90 | 2) [Download libjni_porcupine.so](https://github.com/wdavies973/VocalCord/raw/master/native/linux/libjni_porcupine.so) 91 | 3) [Download libpv_porcupine.so](https://github.com/Picovoice/porcupine/raw/master/lib/linux/x86_64/libpv_porcupine.so) 92 | 4) [Download porcupine_params.pv](https://github.com/Picovoice/porcupine/raw/master/lib/common/porcupine_params.pv) 93 | 5) Move ```libjni_porcupine.so``` and ```libpv_porcupine.so``` into ```native/linux``` 94 | 6) Move ```porcupine_params.pv``` into ```native``` 95 | 7) Your ```native``` directory should look like [this](https://imgur.com/a/tQJPF4n). 96 | #### Windows 97 | 1) Create a folder within your root project directory called "native", within this create a subdirectory labeled "windows" 98 | 2) [Download libjni_porcupine.dll](https://github.com/wdavies973/VocalCord/raw/master/native/windows/libjni_porcupine.dll) 99 | 3) [Download libpv_porcupine.dll](https://github.com/Picovoice/porcupine/raw/master/lib/windows/amd64/libpv_porcupine.dll) 100 | 4) [Download porcupine_params.pv](https://github.com/Picovoice/porcupine/raw/master/lib/common/porcupine_params.pv) 101 | 5) Move ```libjni_porcupine.dll``` and ```libpv_porcupine.dll``` into ```native/windows``` 102 | 6) Move ```porcupine_params.pv``` into ```native``` 103 | 7) Your ```native``` directory should look like [this](https://imgur.com/a/tQJPF4n). 104 | #### Porcupine 105 | Once Porcupine's wake phrase training is done, you should also move your ```wake_phrase.ppn``` file to ```native/``` 106 | ### Setup Complete! 107 | You are now ready to configure your application and begin hacking. 108 | # Running a basic example 109 | You can find a basic example [here](https://github.com/wdavies973/VocalCord/blob/master/src/main/java/example/ExampleBot.java). 
110 | # Configuration 111 | You can read up on most configuration options in the [VocalCord docs](https://github.com/wdavies973/VocalCord/blob/master/src/main/java/vocalcord/VocalCord.java) 112 | # Using a music bot? 113 | JDA enforces a restriction of only one ```AudioSendHandler``` at once. This introduces a problem if you want to use TTS and a music bot. To address this problem, VocalCord implements an audio send multiplexer, which essentially will mix the audio between your [music send handler](https://github.com/sedmelluq/lavaplayer#jda-integration) and VocalCord's internal TTS SendHandler. Currently, there are two send multiplex modes, ```Switch```, which will pause your music while TTS is occurring, and ```Blend``` which will lower the volume of your music bot while TTS is occurring. ```Blend``` is not implemented yet. 114 | # Roadmap 115 | Upcoming features: 116 | - ```Blend``` multiplexing mode 117 | - Option to use offline [Picovoice Cheetah](https://github.com/Picovoice/cheetah) voice recognition for faster voice recognition. 
118 | - Continuation phrases so the bot can carry out an ongoing conversation 119 | - Improvements to command chain 120 | # Issues/Suggestions/Help 121 | If you need help or have any suggestions, feel free to contact me at wdavies973@gmail.com 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /src/main/java/vocalcord/VocalCord.java: -------------------------------------------------------------------------------- 1 | package vocalcord; 2 | 3 | import com.google.cloud.texttospeech.v1beta1.SsmlVoiceGender; 4 | import net.dv8tion.jda.api.audio.AudioSendHandler; 5 | import net.dv8tion.jda.api.entities.TextChannel; 6 | import net.dv8tion.jda.api.entities.User; 7 | import net.dv8tion.jda.api.entities.VoiceChannel; 8 | import net.dv8tion.jda.api.managers.AudioManager; 9 | 10 | import java.util.ArrayList; 11 | 12 | /** 13 | * This is the main class you will use to interact with VocalCord 14 | */ 15 | public class VocalCord { 16 | 17 | public interface Callbacks { 18 | /** 19 | * You should only use ONE onTranscribed(..) method. This callback is where you'll store all your voice commands. 20 | * This function is only called when onTranscribed() returns false, and will provide you the speaker user object 21 | * with a transcript of what they said. It is not recommended you use this callback 22 | * over the other for anything except the most simple bots. 23 | * @param user The user whose voice was transcribed into transcript 24 | * @param transcript A speech transcript of what the user said 25 | */ 26 | default void onTranscribed(User user, String transcript) { 27 | } 28 | 29 | /** 30 | * You should only use ONE onTranscribed(..) method. This function is preferred. 31 | * This callback is where you'll store all your voice commands. Importantly, voice transcripts aren't always 32 | * 100% accurate. 
If you hard code a list of commands, being off by just one word wouldn't register the command, 33 | * or trying to use lots of String.contains(..) calls could easily intermix commands. This callback employs 34 | * CommandChain, which will generate document vectors and a document term matrix in order to compute the cosine 35 | * similarity between a candidate transcription. Essentially, CommandChain will automatically run an algorithm to 36 | * determine which command was most likely said. This means that a user doesn't have to be 100% accurate on matching a command, 37 | * and instead only needs to capture the meaning of a command. 38 | */ 39 | default CommandChain onTranscribed() { 40 | return null; 41 | } 42 | 43 | /** 44 | * Checks whether the user has permission to use voice commands. This command is important because 45 | * every user VocalCord is using wake detection on is a drain on resources, returning true will 46 | * allow VocalCord to track the user's audio stream, but returning false will not track a user's audio 47 | * stream. Basically, only allow users to wake the bot that you trust, because Google Speech To Text can 48 | * cost money over a certain threshold, and universally returning "true" could make it easier for your 49 | * bot to get overloaded. 50 | * @param user The user object of a user who is talking in a voice channel 51 | * @return true if the user can use voice commands, false if they can't 52 | */ 53 | boolean canWakeBot(User user); 54 | 55 | /** 56 | * This command is called when VocalCord detects a user has used a wake phrase. 
57 | * @param userStream A wrapper around a User object, you can call sleep() to cancel voice recognition or getUser() to get the user 58 | * @param keywordIndex The index of the wake phrase that was used to wake the bot, this matches the order you provided wake phrase paths 59 | */ 60 | void onWake(UserStream userStream, int keywordIndex); 61 | 62 | /** 63 | * Triggered when the bot has finished speaking the last phrase it was told to speak 64 | */ 65 | default void onTTSCompleted() {} 66 | } 67 | 68 | private static Config CONFIG; 69 | 70 | public static Config newConfig(Callbacks callbacks) { 71 | CONFIG = new Config(); 72 | CONFIG.callbacks = callbacks; 73 | return CONFIG; 74 | } 75 | 76 | public static Config getConfig() { 77 | return CONFIG; 78 | } 79 | 80 | public static class Config { 81 | public Callbacks callbacks; 82 | /* 83 | * Wake detection 84 | */ 85 | public String jniLocation, porcupineLocation; // dynamic library locations 86 | public String porcupineParams; 87 | public String[] wakePhrasePaths; 88 | public float sensitivity; 89 | 90 | /* 91 | * Captioning 92 | */ 93 | public boolean captioning; 94 | public int captioningChunkSize = 5; 95 | 96 | /* 97 | * TTS settings 98 | */ 99 | String languageCode = "en-US"; 100 | boolean usingTTS, usingTTSCache; 101 | SsmlVoiceGender voiceGender; 102 | SendMultiplex sendMultiplex; 103 | 104 | /* 105 | * Phrase detection stuff 106 | */ 107 | int postWakeLimit = 4000; 108 | int postPhraseTimeout = 600; 109 | int maxPhraseTime = 20; 110 | int userStreamLife = 15; 111 | 112 | public static class SendMultiplex { 113 | enum MultiplexMode { 114 | None, 115 | Switch, 116 | Blend; 117 | } 118 | 119 | MultiplexMode mode = MultiplexMode.None; 120 | AudioSendHandler[] handlers; 121 | float blendBalance; 122 | 123 | private SendMultiplex() { 124 | } 125 | 126 | public static SendMultiplex None() { 127 | return new SendMultiplex(); 128 | } 129 | 130 | public static SendMultiplex Switch(AudioSendHandler sendHandler) { 131 | 
if(sendHandler == null) { 132 | throw new RuntimeException("Send handler must not be null."); 133 | } 134 | SendMultiplex m = new SendMultiplex(); 135 | m.handlers = new AudioSendHandler[1]; 136 | m.handlers[0] = sendHandler; 137 | m.mode = MultiplexMode.Switch; 138 | return m; 139 | } 140 | 141 | /* 142 | * Not implemented yet: this method always throws UnsupportedOperationException. Blend ratio between 0 and 1 143 | */ 144 | public static SendMultiplex Blend(float blendRatio, AudioSendHandler... sendHandlers) { 145 | throw new UnsupportedOperationException("Blend mode is not supported yet."); 146 | 147 | // if(blendRatio < 0 || blendRatio > 1) { 148 | // throw new RuntimeException("Blend ratio must be between 0 and 1."); 149 | // } else if(sendHandlers == null || sendHandlers.length == 0) { 150 | // throw new RuntimeException("Must provide at least one audio send handler."); 151 | // } 152 | // 153 | // SendMultiplex m = new SendMultiplex(); 154 | // m.handlers = sendHandlers; 155 | // m.mode = MultiplexMode.Blend; 156 | // m.blendBalance = blendRatio; 157 | // return m; 158 | } 159 | } 160 | 161 | private Config() { 162 | } 163 | 164 | /** 165 | * Settings for using wake detection 166 | * @param jniLocation The absolute path to your "libjni_porcupine.dll/libjni_porcupine.so" file 167 | * @param porcupineLocation The absolute path to your "libpv_porcupine.dll/libpv_porcupine.so" file 168 | * @param porcupineParams The absolute path to your "porcupine_params.pv" file 169 | * @param sensitivity A sensitivity between 0 and 1, 0 leans towards more false negatives, 1 leans towards more false positives 170 | * @param wakePhrasePaths An array of absolute paths to your "wake_phrase.ppn" files 171 | * @return Builder object 172 | */ 173 | public Config withWakeDetection(String jniLocation, String porcupineLocation, String porcupineParams, float sensitivity, String... 
wakePhrasePaths) { 174 | this.jniLocation = jniLocation; 175 | this.porcupineLocation = porcupineLocation; 176 | this.sensitivity = sensitivity; 177 | this.porcupineParams = porcupineParams; 178 | this.wakePhrasePaths = wakePhrasePaths; 179 | this.captioning = false; // using wake detection turns closed captioning off 180 | return this; 181 | } 182 | 183 | /** 184 | * An alternative to wake detection, instead the bot will always be 185 | * transcribing. This will disable wake detection. Using wake detection 186 | * will disable closed captioning. 187 | */ 188 | public Config withClosedCaptioning() { 189 | captioning = true; 190 | return this; 191 | } 192 | 193 | /** 194 | * An alternative to wake detection, instead the bot will always be 195 | * transcribing. This will disable wake detection. Using wake detection 196 | * will disable closed captioning. 197 | * @param chunkSize If a user is speaking, what size (in seconds) of chunks should their 198 | * transcripts be broken up into. 5 is a good value. 199 | * @return Builder object 200 | */ 201 | public Config withClosedCaptioning(int chunkSize) { 202 | captioning = true; 203 | captioningChunkSize = chunkSize; 204 | return this; 205 | } 206 | 207 | /** 208 | * Optionally change the default settings for phrase detection. This allows you to fine-tune wake detection. 209 | * @param postWakeLimit After a user wakes the bot, they have this many milliseconds to start speaking a command before VocalCord will cancel the phrase detection and put the stream to sleep 210 | * @param postPhraseTimeout When the user is speaking a voice command, how many milliseconds of silence should occur before VocalCord should stop listening and start working on a transcript? 
211 | * @param maxPhraseTime The maximum amount of time a voice command may last, in seconds 212 | * @return Builder object 213 | */ 214 | public Config withPhraseDetectionSettings(int postWakeLimit, int postPhraseTimeout, int maxPhraseTime) { 215 | this.postWakeLimit = postWakeLimit; 216 | this.postPhraseTimeout = postPhraseTimeout; 217 | this.maxPhraseTime = maxPhraseTime; 218 | return this; 219 | } 220 | 221 | /** 222 | * What language TTS and STT should use. If you're using English, you don't need to call this 223 | * @param languageCode A language code from: https://www.cardinalpath.com/resources/tools/google-analytics-language-codes/ 224 | * @return Builder object 225 | */ 226 | public Config withLanguage(String languageCode) { 227 | this.languageCode = languageCode; 228 | return this; 229 | } 230 | 231 | /** 232 | * Enables TTS support. This also resets the send multiplexer to SendMultiplex.None() 233 | * @param voiceGender The voice gender to use 234 | * @param useCaching Caching will cache frequent phrases to speed up TTS times, this is really helpful for things like an onWake(..) 235 | * whose speed will affect the overall speed of a voice command 236 | * @return Builder object 237 | */ 238 | public Config withTTS(SsmlVoiceGender voiceGender, boolean useCaching) { 239 | usingTTS = true; 240 | this.usingTTSCache = useCaching; 241 | this.voiceGender = voiceGender; 242 | this.sendMultiplex = SendMultiplex.None(); 243 | return this; 244 | } 245 | 246 | /** 247 | * JDA enforces a restriction of only ONE AudioSendHandler at a time. This is a bit tricky because TTS also 248 | * needs to use an AudioSendHandler. In order to fix this, VocalCord implements Multiplexing. I.E. if you want to 249 | * use a music bot or something, VocalCord is capable of mixing the signal with the TTS when needed. 250 | * @param voiceGender The voice gender to use 251 | * @param useCaching Caching will cache frequent phrases to speed up TTS times, this is really helpful for things like an onWake(..) 
252 | * whose speed will affect the overall speed of a voice command 253 | * @param sendMultiplex A send multiplexer defining how you want to mix the audio 254 | * @return Builder object 255 | */ 256 | public Config withTTSMultiplex(SsmlVoiceGender voiceGender, boolean useCaching, SendMultiplex sendMultiplex) { 257 | withTTS(voiceGender, useCaching); 258 | this.sendMultiplex = sendMultiplex; 259 | return this; 260 | } 261 | 262 | /** 263 | * Constructs the VocalCord object 264 | * @return VocalCord object, you should connect to the channel after this 265 | */ 266 | public VocalCord build() { 267 | // Verify arguments 268 | return new VocalCord(); 269 | } 270 | 271 | } 272 | 273 | private TTSEngine ttsEngine; 274 | 275 | /** 276 | * Instructs VocalCord to say something in the channel, must have called "connect" beforehand 277 | * @param text TTS text 278 | */ 279 | public void say(String text) { 280 | if(ttsEngine == null) { 281 | throw new RuntimeException("TTS not configured. Use withTTS(..) 
when configuring VocalCord to use TTS."); 282 | } 283 | 284 | try { 285 | ttsEngine.say(text); 286 | } catch(Exception e) { 287 | e.printStackTrace(); 288 | } 289 | } 290 | 291 | /** 292 | * Connects vocal cord to a voice channel 293 | * @param voiceChannel The voice channel to connect to 294 | */ 295 | public void connect(VoiceChannel voiceChannel) { 296 | AudioManager manager = voiceChannel.getGuild().getAudioManager(); 297 | manager.openAudioConnection(voiceChannel); 298 | 299 | Config cfg = VocalCord.getConfig(); 300 | 301 | if(cfg.usingTTS) { 302 | ttsEngine = new TTSEngine(); 303 | 304 | if(cfg.sendMultiplex.mode != Config.SendMultiplex.MultiplexMode.None) { 305 | manager.setSendingHandler(new AudioSendMultiplexer(ttsEngine, cfg.sendMultiplex)); 306 | } else { 307 | manager.setSendingHandler(ttsEngine); 308 | } 309 | 310 | manager.setReceivingHandler(new STTEngine()); 311 | } 312 | } 313 | 314 | // TODO 315 | // cheetah voice detection engine 316 | // continuation phrase 317 | // blend multiplexer 318 | // include documentation about disconnecting the bot 319 | // fix timings a bit 320 | } 321 | -------------------------------------------------------------------------------- /src/main/java/vocalcord/CommandChain.java: -------------------------------------------------------------------------------- 1 | package vocalcord; 2 | 3 | import net.dv8tion.jda.api.entities.User; 4 | 5 | import java.util.*; 6 | import java.util.regex.Pattern; 7 | 8 | /** 9 | * The CommandChain class addresses a common problem with trying to match a particular voice command to a transcript of what the user said. 10 | * Consider the naive approach to this, such as using a ".equals()" or a ".contains()". The problem with this is voice transcripts aren't always 11 | * 100% accurate, so a direct ".equals()" may not always catch the command. A ".contains()" will make it easy for commands to get intermixed. 
12 | * The CommandChain fixes this problem by using some document similarity maths to instead try to match the meaning of the transcript to a voice command 13 | * instead of word for word. 14 | */ 15 | public class CommandChain { 16 | 17 | private PhraseVector commandsVector; // a vector of ALL command words 18 | private double minThreshold = 0.5; 19 | private final ArrayList commands = new ArrayList<>(); 20 | private VoiceTask fallback; 21 | 22 | public interface VoiceTask { 23 | /** 24 | * This task is run when a voice command is detected 25 | * 26 | * @param user The user that spoke the command 27 | * @param transcript The complete transcript of what the user said 28 | * @param args A series of voice arguments, see {@link Builder#addPhrase(String, VoiceTask)} 29 | */ 30 | void run(User user, String transcript, VoiceArgument[] args); 31 | } 32 | 33 | public static class VoiceArgument { 34 | T argument; 35 | Class type; 36 | 37 | public VoiceArgument(Class type, T argument) { 38 | this.argument = argument; 39 | this.type = type; 40 | } 41 | 42 | public E value() { 43 | //noinspection unchecked 44 | return (E) type.cast(argument); 45 | } 46 | 47 | @Override 48 | public String toString() { 49 | return argument.toString(); 50 | } 51 | } 52 | 53 | /** 54 | * User to construct a CommandChain 55 | */ 56 | public static class Builder { 57 | private final CommandChain chain = new CommandChain(); 58 | 59 | /** 60 | * Adds a voice command. If the specified {@code phrase} has a meaning similar to the transcript, {@code job} is run. 61 | *

62 | * There are a few useful special characters that you can use for this command. Let's say you have a voice command 63 | * "kick user John" or "set volume to 100". Each commands takes an argument, "John" in the first case, and "100" in 64 | * the second case. Naturally, you'd want the voice command "kick user John" to also work for "kick user Mark" or the 65 | * voice command "set volume 100" to also work for "set volume 50". To facilitate this, you can use the following 66 | * special character sequences: 67 | *

68 | * %s - matches a string argument 69 | * %i - matches an integer argument 70 | * %d - matches a double argument 71 | *

72 | * So for example, in our present situation of kicking a user, you would specify the {@code phrase} as "kick user %s". 73 | * In this case, the voice command will match a user transcript that starts with "kick user" and ends with any string. 74 | * Then, when your VoiceTask is run, it will be given a VoiceArgument with the value of whatever "%s" was. You can 75 | * use as many of these special character sequences as you'd like. For example, "set volume to 100 and kick John" would 76 | * be represented as "set volume to %i and kick %s". If this command matches, your VoiceTask will be called with two 77 | * VoiceArguments, the first a volume integer, and the second a user to kick. 78 | * 79 | * Note: Only use these for very simple arguments that are 1-2 words at most! 80 | * 81 | * Disclaimer: I handle these special characters in a semi-naive way, I could use a much better implementation using neural nets or 82 | * something if you're interested. Let me know if it gets any really trivial cases wrong and I hope it works decently well for you! 83 | * 84 | * @param phrase The phrase that should trigger the command 85 | * @param task The task that should be run when the phrase is detected 86 | * @return Builder object 87 | */ 88 | public Builder addPhrase(String phrase, VoiceTask task) { 89 | chain.commands.add(new VoiceCommand(phrase, task)); 90 | return this; 91 | } 92 | 93 | /** 94 | * If no voice command matches an incoming transcript with cosine similarity better than {@link CommandChain#minThreshold}, 95 | * this fallback task is run. It can be used for things like the bot saying "Sorry, I didn't get that" If null, VocalCord 96 | * won't execute anything. 97 | * 98 | * @param task The task to run when the user said something, but no voice command matched close enough. 
99 | * @return Builder object 100 | */ 101 | public Builder withFallback(VoiceTask task) { 102 | chain.fallback = task; 103 | return this; 104 | } 105 | 106 | /** 107 | * Adjust the min cosine threshold for a voice command to be even considered to be a possible candidate 108 | * 109 | * @param minThreshold A value between 0 and 1, a value more towards 0 will let a voice transcript still trigger a voice command even if they are vastly different, a value of 1 will only allow essentially perfect matches 110 | * @return Builder object 111 | */ 112 | public Builder withMinThreshold(float minThreshold) { 113 | chain.minThreshold = minThreshold; 114 | return this; 115 | } 116 | 117 | /** 118 | * Constructs the command chain object 119 | * 120 | * @return Returns the CommandChain object 121 | */ 122 | public CommandChain build() { 123 | if(chain.commands.size() == 0) throw new RuntimeException("Must provide at least one command"); 124 | 125 | chain.commandsVector = new PhraseVector(new HashMap<>()); 126 | 127 | for(VoiceCommand cmd : chain.commands) { 128 | chain.commandsVector = chain.commandsVector.merge(cmd.phraseVector); 129 | } 130 | 131 | return chain; 132 | } 133 | } 134 | 135 | /* 136 | * Internal code 137 | */ 138 | 139 | private CommandChain() { 140 | } 141 | 142 | // finds the best matching candidate voice command 143 | TaskCandidate score(String transcript) { 144 | double maxScore = -1; 145 | int maxIndex = -1; 146 | TaskCandidate leading = null; 147 | 148 | PhraseVector transcriptVector = new PhraseVector(transcript); 149 | PhraseVector docVector = commandsVector.merge(transcriptVector); // the entire document vector 150 | 151 | for(int i = 0; i < commands.size(); i++) { 152 | TaskCandidate candidate = new TaskCandidate(commands.get(i), transcript); 153 | 154 | double score = candidate.score(docVector); 155 | if(score > maxScore) { 156 | maxScore = score; 157 | maxIndex = i; 158 | leading = candidate; 159 | } 160 | } 161 | 162 | if(maxIndex == -1) { 163 | 
return null; 164 | } else { 165 | return leading; 166 | } 167 | } 168 | 169 | // runs the best matching candidate voice command; when nothing matched or the best score is below minThreshold, runs the fallback instead (or nothing at all if no fallback is set) 170 | void fulfillTaskCandidate(User user, TaskCandidate candidate) { 171 | if(candidate == null || candidate.score < minThreshold) { 172 | if(fallback != null) fallback.run(user, candidate == null ? "" : candidate.transcript, new VoiceArgument[0]); 173 | } else { 174 | // the candidate met the threshold, run it with its resolved voice arguments 175 | candidate.command.task.run(user, candidate.transcript, candidate.args); 176 | } 177 | } 178 | // a registered phrase plus its task, its non-parameter words ("tokenized") and the context of each %s/%i/%d parameter ("params") 179 | private static class VoiceCommand { 180 | String phrase; 181 | PhraseVector phraseVector; 182 | ArrayList params; 183 | VoiceTask task; 184 | 185 | final ArrayList tokenized = new ArrayList<>(); 186 | 187 | public VoiceCommand(String phrase, VoiceTask task) { 188 | this.phrase = phrase; 189 | this.task = task; 190 | this.phraseVector = new PhraseVector(phrase); 191 | 192 | // Compute parameter contexts 193 | // ignore stop words 194 | // ignore other parameters 195 | params = new ArrayList<>(); 196 | // keep only letters, digits, '.', '%', space and '-' (the range must be A-Z; A-z would also keep [ \ ] ^ _ and `) 197 | ArrayList tokens = new ArrayList<>(Arrays.asList(phrase.replaceAll("[^a-zA-Z0-9.% -]", "").trim().toLowerCase().split("\\s+"))); 198 | 199 | // remove stop words 200 | tokens.removeIf(STOPS::contains); 201 | 202 | // create a parameter context 203 | for(int i = 0; i < tokens.size(); i++) { 204 | String word = tokens.get(i); 205 | 206 | if(word.equals("%s") || word.equals("%i") || word.equals("%d")) { 207 | params.add(new Param(tokens, i)); 208 | } else { 209 | tokenized.add(word); 210 | } 211 | } 212 | } 213 | } 214 | 215 | static class TaskCandidate { 216 | double score; // how closely the TaskCandidate matched a spoken transcript 217 | String transcript; // the exact transcript that was spoken 218 | VoiceCommand command; 219 | VoiceArgument[] args; // the arguments to the VoiceCommand 220 | 221 | private PhraseVector resolvedVector; // a vector version of the command with all parameters resolved 222 | 223 | public TaskCandidate(VoiceCommand 
command, String transcript) { 224 | this.transcript = transcript; 225 | this.command = command; 226 | // only a single pass, with zero allowable context errors, is attempted 227 | for(int i = 0; i < 1; i++) { 228 | this.args = resolve(i); 229 | 230 | if(this.args != null) { 231 | break; 232 | } 233 | } 234 | // parameters could not be resolved: score against the phrase vector without parameters 235 | if(this.args == null) { 236 | resolvedVector = command.phraseVector; 237 | } 238 | } 239 | 240 | private VoiceArgument[] resolve(int numAllowableErrors) { 241 | /* 242 | * Resolve any parameters in VoiceCommand 243 | */ 244 | 245 | // step 1, tokenize transcript (keep only letters, digits, '.', space and '-'; the range must be A-Z, A-z would also keep [ \ ] ^ _ and `) 246 | ArrayList tokens = new ArrayList<>(Arrays.asList(transcript.replaceAll("[^a-zA-Z0-9. -]", "").trim().toLowerCase().split("\\s+"))); 247 | 248 | // step 2, remove all stop words from transcript 249 | tokens.removeIf(STOPS::contains); 250 | 251 | // step 3, generate a "selection list" of parameter candidates, each with an index to where it occurs in tokens 252 | class ParamCandidate { 253 | final String token; 254 | final int position; 255 | public ParamCandidate(String token, int position) { 256 | this.token = token; 257 | this.position = position; 258 | } 259 | } 260 | 261 | ArrayList paramCandidates = new ArrayList<>(); 262 | 263 | for(int i = 0; i < tokens.size(); i++) { 264 | if(command.tokenized.contains(tokens.get(i))) continue; 265 | 266 | paramCandidates.add(new ParamCandidate(tokens.get(i), i)); 267 | } 268 | 269 | // step 4, loop through parameters in the voice command 270 | VoiceArgument[] args = new VoiceArgument[command.params.size()]; 271 | 272 | int index = 0; 273 | 274 | for(Param p : command.params) { 275 | // The best way to think about this step is that Param "p" is going to take its most 276 | // desired pick from "paramCandidates", there are three criteria that make a paramCandidate more desirable: 277 | // It occurs near the beginning of paramCandidates ("p" will pick the closest satisfactory parameter to the start) 278 | // the types match 279 | // the param's context works 280 | 281 | for(int i = 0; i < paramCandidates.size(); i++) { 282 | 
ParamCandidate candidate = paramCandidates.get(i); 283 | 284 | // Do the types match? 285 | if("%d".equals(p.param) && isDouble(candidate.token) && p.satisfiesContext(tokens, candidate.position, numAllowableErrors)) { 286 | args[index] = new VoiceArgument<>(Double.class, Double.parseDouble(candidate.token)); 287 | paramCandidates.remove(i); 288 | break; 289 | } else if("%i".equals(p.param) && isInteger(candidate.token) && p.satisfiesContext(tokens, candidate.position, numAllowableErrors)) { 290 | args[index] = new VoiceArgument<>(Integer.class, convertInteger(candidate.token)); 291 | paramCandidates.remove(i); 292 | break; 293 | } else if("%s".equals(p.param) && p.satisfiesContext(tokens, candidate.position, numAllowableErrors)) { 294 | StringBuilder sb = new StringBuilder(candidate.token); 295 | 296 | paramCandidates.remove(i); 297 | 298 | for(int tmp = i; tmp < paramCandidates.size(); tmp++) { 299 | ParamCandidate c = paramCandidates.get(tmp); 300 | 301 | if(p.satisfiesContext(tokens, c.position, 0) && tmp < paramCandidates.size() - (command.params.size() - index - 1)) { 302 | sb.append(" ").append(c.token); 303 | paramCandidates.remove(tmp); 304 | tmp--; 305 | } else { 306 | break; 307 | } 308 | } 309 | 310 | args[index] = new VoiceArgument<>(String.class, sb.toString()); 311 | 312 | break; 313 | } 314 | } 315 | 316 | index++; 317 | } 318 | 319 | // step 5, apply the parameter assignments to the command phrase and create a vector with it 320 | String resolvedPhrase = command.phrase; 321 | 322 | index = 0; 323 | for(Param p : command.params) { 324 | if(args[index] != null) { 325 | resolvedPhrase = resolvedPhrase.replaceFirst(p.param, args[index].toString()); 326 | } else { 327 | return null; 328 | } 329 | 330 | index++; 331 | } 332 | 333 | this.resolvedVector = new PhraseVector(resolvedPhrase); 334 | return args; 335 | } 336 | 337 | public double score(PhraseVector documentVector) { 338 | this.score = new PhraseVector(transcript).cosine(documentVector.words, 
resolvedVector); 339 | return this.score; 340 | } 341 | } 342 | 343 | // stores the words in the voice command that occur before and after the voice command 344 | private static class Param { 345 | private final ArrayList wordsBefore = new ArrayList<>(), wordsAfter = new ArrayList<>(); 346 | private final String param; 347 | 348 | Param(ArrayList words, int index) { 349 | for(int i = 0; i < index; i++) { 350 | if(words.get(i).equals("%s") || words.get(i).equals("%i") || words.get(i).equals("%d")) continue; 351 | wordsBefore.add(words.get(i)); 352 | } 353 | param = words.get(index); 354 | 355 | for(int i = index + 1; i < words.size(); i++) { 356 | if(words.get(i).equals("%s") || words.get(i).equals("%i") || words.get(i).equals("%d")) continue; 357 | wordsAfter.add(words.get(i)); 358 | } 359 | } 360 | 361 | public boolean satisfiesContext(ArrayList context, int consideredToken, int numErrorsAllowed) { 362 | int errors = 0; 363 | 364 | // Create temporary sets for the words 365 | ArrayList before = new ArrayList<>(); 366 | ArrayList after = new ArrayList<>(); 367 | 368 | for(int i = 0; i < consideredToken; i++) { 369 | before.add(context.get(i)); 370 | } 371 | 372 | for(int i = consideredToken + 1; i < context.size(); i++) { 373 | after.add(context.get(i)); 374 | } 375 | 376 | for(String req : wordsBefore) { 377 | if(before.contains(req)) { 378 | before.remove(req); 379 | } else { 380 | errors++; 381 | } 382 | } 383 | 384 | for(String req : wordsAfter) { 385 | if(after.contains(req)) { 386 | after.remove(req); 387 | } else { 388 | errors++; 389 | } 390 | } 391 | 392 | return errors <= numErrorsAllowed; 393 | } 394 | } 395 | 396 | private static class PhraseVector { 397 | private final HashMap words; 398 | 399 | PhraseVector(String phrase) { 400 | words = new HashMap<>(); 401 | 402 | ArrayList tokens = tokenize(phrase); 403 | for(String word : tokens) { 404 | int count = words.getOrDefault(word, 0); 405 | words.put(word, count + 1); 406 | } 407 | } 408 | 409 | private 
PhraseVector(HashMap words) { 410 | this.words = words; 411 | } 412 | // strips the %s/%i/%d placeholders and punctuation, lower-cases, splits on whitespace and drops stop words 413 | private static ArrayList tokenize(String phrase) { 414 | phrase = phrase.replaceAll("%s", "").replaceAll("%i", "").replaceAll("%d", "") 415 | .replaceAll("[^a-zA-Z0-9 -]", "").trim().toLowerCase(); 416 | 417 | ArrayList tokens = new ArrayList<>(Arrays.asList(phrase.split("\\s+"))); 418 | 419 | tokens.removeIf(STOPS::contains); 420 | 421 | return tokens; 422 | } 423 | 424 | // Union of both vocabularies. Does not preserve frequencies (for a word in both, vec's count overwrites this one's) 425 | PhraseVector merge(PhraseVector vec) { 426 | HashMap merged = new HashMap<>(); 427 | merged.putAll(words); 428 | merged.putAll(vec.words); 429 | return new PhraseVector(merged); 430 | } 431 | // this vector's word counts laid out in the key order of termMatrix (0 for words it does not contain) 432 | int[] asVector(HashMap termMatrix) { 433 | int[] vector = new int[termMatrix.size()]; 434 | 435 | int index = 0; 436 | for(String key : termMatrix.keySet()) { 437 | vector[index] = words.getOrDefault(key, 0); 438 | index++; 439 | } 440 | 441 | return vector; 442 | } 443 | // cosine similarity between this vector and vec over the terms of termMatrix; 0 when either vector has no terms 444 | double cosine(HashMap termMatrix, PhraseVector vec) { 445 | int[] vec1 = asVector(termMatrix); 446 | int[] vec2 = vec.asVector(termMatrix); 447 | 448 | if(vec1.length != vec2.length) { 449 | throw new RuntimeException("Vector lengths must be the same."); 450 | } 451 | 452 | int innerProduct = 0; 453 | double vec1Length = 0; 454 | double vec2Length = 0; 455 | 456 | for(int i = 0; i < vec1.length; i++) { 457 | innerProduct += (vec1[i] * vec2[i]); 458 | 459 | vec1Length += (vec1[i] * vec1[i]); 460 | vec2Length += (vec2[i] * vec2[i]); 461 | } 462 | 463 | return vec1Length == 0 || vec2Length == 0 ? 0 : (double) innerProduct / (Math.sqrt(vec1Length) * Math.sqrt(vec2Length)); // an all-zero vector would otherwise give 0/0 = NaN 464 | } 465 | } 466 | 467 | // common english words to remove 468 | private static final HashSet STOPS = new HashSet<>(); 469 | 470 | static { 471 | final String[] STOP_WORDS = 472 | {"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", 473 | "it", "its", "itself", 
"they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", 474 | "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", 475 | "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", 476 | "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", 477 | "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"}; 478 | 479 | STOPS.addAll(Arrays.asList(STOP_WORDS)); 480 | } 481 | 482 | private static final Pattern R_INTEGER = Pattern.compile("^[-+]?\\d+$"); 483 | private static final Pattern R_DOUBLE = Pattern.compile("\\d+\\.?\\d*"); 484 | 485 | private static final ArrayList WORDS = new ArrayList<>(); 486 | 487 | static { 488 | Collections.addAll(WORDS, "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"); 489 | } 490 | 491 | private static int convertInteger(String s) { 492 | if(WORDS.contains(s)) { 493 | return WORDS.indexOf(s) + 1; 494 | } else { 495 | return Integer.parseInt(s); 496 | } 497 | } 498 | 499 | private static boolean isInteger(String s) { 500 | return R_INTEGER.matcher(s).matches() || WORDS.contains(s); 501 | } 502 | 503 | private static boolean isDouble(String s) { 504 | return R_DOUBLE.matcher(s).matches(); 505 | } 506 | } 507 | --------------------------------------------------------------------------------