├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── build.sbt ├── examples └── data │ ├── batman-16khz-noheader.wav │ ├── batman.wav │ └── hall.mp3 ├── lib ├── tritonus_remaining-0.3.6.jar └── tritonus_share-0.3.6.jar ├── project ├── build.properties └── plugins.sbt ├── sonatype.sbt ├── src ├── main │ └── java │ │ ├── SpeechToTextWebsocketsDemo.java │ │ └── com │ │ └── github │ │ └── catalystcode │ │ └── fortis │ │ └── speechtotext │ │ ├── Mp3Transcriber.java │ │ ├── Transcriber.java │ │ ├── WavTranscriber.java │ │ ├── config │ │ ├── OutputFormat.java │ │ ├── SpeechServiceConfig.java │ │ └── SpeechType.java │ │ ├── constants │ │ ├── EnvironmentVariables.java │ │ ├── SpeechServiceConnectionHeaders.java │ │ ├── SpeechServiceContentTypes.java │ │ ├── SpeechServiceLimitations.java │ │ ├── SpeechServiceMessageFields.java │ │ ├── SpeechServiceMessageHeaders.java │ │ ├── SpeechServiceMetrics.java │ │ ├── SpeechServicePaths.java │ │ ├── SpeechServiceSpeechConfig.java │ │ └── SpeechServiceWebsocketStatusCodes.java │ │ ├── lifecycle │ │ ├── MessageReceiver.java │ │ ├── SpeechHypothesisMessage.java │ │ ├── SpeechPhraseMessage.java │ │ ├── TurnEndMessage.java │ │ └── TurnStartMessage.java │ │ ├── messages │ │ ├── AudioEndMessageCreator.java │ │ ├── BinaryMessageCreator.java │ │ ├── HeaderCreator.java │ │ ├── MessageParser.java │ │ └── TextMessageCreator.java │ │ ├── telemetry │ │ ├── AudioTelemetry.java │ │ ├── CallsTelemetry.java │ │ └── ConnectionTelemetry.java │ │ ├── utils │ │ ├── Environment.java │ │ ├── ProtocolUtils.java │ │ ├── RiffHeader.java │ │ └── Units.java │ │ └── websocket │ │ ├── MessageSender.java │ │ ├── PlatformInfo.java │ │ ├── SpeechServiceClient.java │ │ ├── TelemetryInfo.java │ │ └── nv │ │ ├── NvMessageReceiver.java │ │ ├── NvMessageSender.java │ │ └── NvSpeechServiceClient.java └── test │ └── java │ └── com │ └── github │ └── catalystcode │ └── fortis │ └── speechtotext │ ├── messages │ └── MessageParserTest.java │ └── websocket │ ├── 
PlatformInfoTest.java │ └── TelemetryInfoTest.java └── version.sbt /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | target/ 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: 2 | trusty 3 | 4 | language: 5 | java 6 | 7 | jdk: 8 | - oraclejdk8 9 | 10 | script: 11 | - sbt test 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Microsoft Partner Catalyst Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **There is now an official [Java SDK for the Bing Speech to Text API](https://docs.microsoft.com/en-us/azure/cognitive-services/speech/getstarted/getstartedjavaandroid) so this repository is deprecated.** 2 | 3 | --- 4 | 5 | A Java implementation of the [Bing Speech to Text API](https://azure.microsoft.com/en-ca/services/cognitive-services/speech/) [websocket protocol](https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/websocketprotocol) ([supporting article](https://www.microsoft.com/developerblog/2017/11/01/building-a-custom-spark-connector-for-near-real-time-speech-to-text-transcription/)). 6 | 7 | [![Travis CI status](https://api.travis-ci.org/CatalystCode/SpeechToText-WebSockets-Java.svg?branch=master)](https://travis-ci.org/CatalystCode/SpeechToText-WebSockets-Java) 8 | 9 | ## Usage example ## 10 | 11 | Run a demo via: 12 | 13 | ```sh 14 | # set up all the requisite environment variables 15 | export OXFORD_SPEECH_TOKEN="..." 16 | 17 | # stream the audio and transcribe 18 | sbt "runMain SpeechToTextWebsocketsDemo examples/data/batman.wav" 19 | sbt "runMain SpeechToTextWebsocketsDemo examples/data/hall.mp3" 20 | sbt "runMain SpeechToTextWebsocketsDemo http://bbcwssc.ic.llnwd.net/stream/bbcwssc_mp1_ws-einws en-US .mp3" 21 | ``` 22 | 23 | If you're consuming the library via Maven, make sure to also add the Tritonus (PCM audio conversion) jars to the classpath: 24 | - [tritonus_remaining-0.3.6.jar](https://github.com/CatalystCode/SpeechToText-WebSockets-Java/raw/master/lib/tritonus_remaining-0.3.6.jar) 25 | - [tritonus_share-0.3.6.jar](https://github.com/CatalystCode/SpeechToText-WebSockets-Java/raw/master/lib/tritonus_share-0.3.6.jar) 26 | 27 | ## Release process ## 28 | 29 | 1. 
Configure your credentials via the `SONATYPE_USER` and `SONATYPE_PASSWORD` environment variables. 30 | 2. Update `version.sbt` 31 | 3. Enter the SBT shell: `sbt` 32 | 4. Run `sonatypeOpen "enter staging description here"` 33 | 5. Run `publishSigned` 34 | 6. Run `sonatypeRelease` 35 | 36 | ## Other implementations ## 37 | 38 | - [NodeJS](https://github.com/noopkat/ms-bing-speech-service) 39 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | organization := "com.github.catalystcode" 2 | name := "SpeechToText-WebSockets-Java" 3 | description := "A Java implementation of the Bing Speech to Text API websocket protocol" 4 | 5 | javacOptions in (Compile, compile) ++= Seq( 6 | "-source", "1.8", 7 | "-target", "1.8") 8 | 9 | crossPaths := false 10 | autoScalaLibrary := false 11 | 12 | // Bundled dependencies 13 | libraryDependencies ++= Seq( 14 | "log4j" % "log4j" % "1.2.17", 15 | "org.json" % "json" % "20170516", 16 | "com.googlecode.soundlibs" % "jlayer" % "1.0.1-1", 17 | "com.neovisionaries" % "nv-websocket-client" % "2.2" 18 | ) 19 | 20 | // Test dependencies 21 | libraryDependencies ++= Seq( 22 | "org.junit.jupiter" % "junit-jupiter-api" % "5.0.0-M4" 23 | ).map(_ % "test") 24 | 25 | assemblyMergeStrategy in assembly := { 26 | case PathList("javax", "inject", xs @ _*) => MergeStrategy.last 27 | case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last 28 | case PathList("javax", "activation", xs @ _*) => MergeStrategy.last 29 | case PathList("org", "aopalliance", xs @ _*) => MergeStrategy.last 30 | case PathList("org", "apache", xs @ _*) => MergeStrategy.last 31 | case PathList("com", "google", xs @ _*) => MergeStrategy.last 32 | case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last 33 | case PathList("com", "codahale", xs @ _*) => MergeStrategy.last 34 | case PathList("com", "yammer", xs @ _*) => MergeStrategy.last 35 
| case "about.html" => MergeStrategy.rename 36 | case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last 37 | case "META-INF/mailcap" => MergeStrategy.last 38 | case "META-INF/mimetypes.default" => MergeStrategy.last 39 | case "plugin.properties" => MergeStrategy.last 40 | case "log4j.properties" => MergeStrategy.last 41 | case x => 42 | val oldStrategy = (assemblyMergeStrategy in assembly).value 43 | oldStrategy(x) 44 | } 45 | -------------------------------------------------------------------------------- /examples/data/batman-16khz-noheader.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CatalystCode/SpeechToText-WebSockets-Java/1c1e7209a399674a984204bd7ae5a0f09cf8bdfe/examples/data/batman-16khz-noheader.wav -------------------------------------------------------------------------------- /examples/data/batman.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CatalystCode/SpeechToText-WebSockets-Java/1c1e7209a399674a984204bd7ae5a0f09cf8bdfe/examples/data/batman.wav -------------------------------------------------------------------------------- /examples/data/hall.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CatalystCode/SpeechToText-WebSockets-Java/1c1e7209a399674a984204bd7ae5a0f09cf8bdfe/examples/data/hall.mp3 -------------------------------------------------------------------------------- /lib/tritonus_remaining-0.3.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CatalystCode/SpeechToText-WebSockets-Java/1c1e7209a399674a984204bd7ae5a0f09cf8bdfe/lib/tritonus_remaining-0.3.6.jar -------------------------------------------------------------------------------- /lib/tritonus_share-0.3.6.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CatalystCode/SpeechToText-WebSockets-Java/1c1e7209a399674a984204bd7ae5a0f09cf8bdfe/lib/tritonus_share-0.3.6.jar -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.13 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3") 2 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "1.1") 3 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") 4 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.4") 5 | -------------------------------------------------------------------------------- /sonatype.sbt: -------------------------------------------------------------------------------- 1 | pomExtra in Global := { 2 | github.com/CatalystCode/SpeechToText-Websockets-Java 3 | 4 | 5 | MIT 6 | https://opensource.org/licenses/MIT 7 | 8 | 9 | 10 | scm:git:github.com/CatalystCode/SpeechToText-Websockets-Java 11 | scm:git:git@github.com:CatalystCode/SpeechToText-Websockets-Java 12 | github.com/CatalystCode/SpeechToText-Websockets-Java 13 | 14 | 15 | 16 | c-w 17 | Clemens Wolff 18 | clewolff@microsoft.com 19 | http://github.com/c-w 20 | 21 | 22 | } 23 | 24 | credentials += Credentials( 25 | "Sonatype Nexus Repository Manager", 26 | "oss.sonatype.org", 27 | System.getenv("SONATYPE_USER"), 28 | System.getenv("SONATYPE_PASSWORD")) 29 | 30 | organizationName := "Partner Catalyst" 31 | organizationHomepage := Some(url("https://github.com/CatalystCode")) 32 | 33 | publishTo := { 34 | val isSnapshot = version.value.trim.endsWith("SNAPSHOT") 35 | val nexus = "https://oss.sonatype.org/" 36 | if (isSnapshot) Some("snapshots" at nexus + "content/repositories/snapshots") 37 | else Some("releases" at nexus + 
"service/local/staging/deploy/maven2") 38 | } 39 | 40 | publishMavenStyle := true 41 | publishArtifact in Test := false 42 | useGpg := true 43 | -------------------------------------------------------------------------------- /src/main/java/SpeechToTextWebsocketsDemo.java: -------------------------------------------------------------------------------- 1 | import com.github.catalystcode.fortis.speechtotext.Transcriber; 2 | import com.github.catalystcode.fortis.speechtotext.config.OutputFormat; 3 | import com.github.catalystcode.fortis.speechtotext.config.SpeechServiceConfig; 4 | import com.github.catalystcode.fortis.speechtotext.config.SpeechType; 5 | import org.apache.log4j.BasicConfigurator; 6 | import org.apache.log4j.Level; 7 | import org.apache.log4j.Logger; 8 | 9 | import java.io.BufferedInputStream; 10 | import java.io.FileInputStream; 11 | import java.io.IOException; 12 | import java.io.InputStream; 13 | import java.net.URL; 14 | import java.util.Locale; 15 | 16 | public class SpeechToTextWebsocketsDemo { 17 | static { 18 | BasicConfigurator.configure(); 19 | Logger.getRootLogger().setLevel(Level.INFO); 20 | } 21 | 22 | public static void main(String[] args) throws Exception { 23 | final String subscriptionKey = System.getenv("OXFORD_SPEECH_TOKEN"); 24 | final SpeechType speechType = SpeechType.CONVERSATION; 25 | final OutputFormat outputFormat = OutputFormat.SIMPLE; 26 | final String audioPath = args[0]; 27 | final Locale locale = Locale.forLanguageTag(args.length > 1 ? args[1] : "en-US"); 28 | final String audioType = args.length > 2 ? 
args[2] : audioPath; 29 | 30 | SpeechServiceConfig config = new SpeechServiceConfig(subscriptionKey, speechType, outputFormat, locale); 31 | 32 | try (InputStream audioStream = openStream(audioPath)) { 33 | Transcriber.create(audioType, config).transcribe(audioStream, SpeechToTextWebsocketsDemo::onPhrase, SpeechToTextWebsocketsDemo::onHypothesis, 34 | SpeechToTextWebsocketsDemo::onTurnStart, SpeechToTextWebsocketsDemo::onTurnEnd); 35 | } 36 | } 37 | 38 | private static InputStream openStream(String audioPath) throws IOException { 39 | InputStream inputStream = audioPath.startsWith("http://") || audioPath.startsWith("https://") 40 | ? new URL(audioPath).openConnection().getInputStream() 41 | : new FileInputStream(audioPath); 42 | 43 | return new BufferedInputStream(inputStream); 44 | } 45 | 46 | private static void onTurnEnd() { 47 | System.out.println("TurnEnd:"); 48 | } 49 | 50 | private static void onPhrase(String phrase) { 51 | System.out.println("Phrase: " + phrase); 52 | } 53 | 54 | private static void onHypothesis(String hypothesis) { 55 | System.out.println("Hypothesis: " + hypothesis); 56 | } 57 | 58 | private static void onTurnStart(String serviceTag) { 59 | System.out.println("TurnStart: " + serviceTag); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/github/catalystcode/fortis/speechtotext/Mp3Transcriber.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext; 2 | 3 | import com.github.catalystcode.fortis.speechtotext.config.SpeechServiceConfig; 4 | import com.github.catalystcode.fortis.speechtotext.websocket.MessageSender; 5 | import com.github.catalystcode.fortis.speechtotext.websocket.SpeechServiceClient; 6 | import javazoom.jl.converter.Converter; 7 | import javazoom.jl.decoder.JavaLayerException; 8 | import org.apache.log4j.Logger; 9 | 10 | import java.io.*; 11 | import java.nio.ByteBuffer; 12 
| import java.nio.channels.FileChannel; 13 | import java.nio.file.Path; 14 | import java.nio.file.Paths; 15 | 16 | import static com.github.catalystcode.fortis.speechtotext.utils.Environment.getMp3BufferSize; 17 | import static java.nio.ByteBuffer.allocate; 18 | import static java.nio.file.Files.createTempFile; 19 | import static java.nio.file.Files.deleteIfExists; 20 | 21 | class Mp3Transcriber extends Transcriber { 22 | private static final Logger log = Logger.getLogger(Mp3Transcriber.class); 23 | 24 | private final int bufferSize; 25 | 26 | Mp3Transcriber(SpeechServiceConfig config, SpeechServiceClient client) { 27 | super(config, client); 28 | this.bufferSize = getMp3BufferSize(); 29 | } 30 | 31 | @Override 32 | protected void sendAudio(InputStream mp3Stream, MessageSender sender) throws IOException { 33 | byte[] streamBuf = new byte[bufferSize]; 34 | ByteBuffer mp3Buf = allocate(bufferSize); 35 | int mp3BufPos = 0; 36 | int read; 37 | while ((read = mp3Stream.read(streamBuf)) != -1) { 38 | if (mp3BufPos + read >= bufferSize) { 39 | if (mp3BufPos > 0) { 40 | log.debug("Buffer full, starting to process " + mp3BufPos + " bytes"); 41 | String mp3Path = newTempFile(".mp3"); 42 | writeBytes(mp3Path, mp3Buf, mp3BufPos); 43 | sendAudioAsync(mp3Path, sender); 44 | mp3Buf.clear(); 45 | } 46 | mp3Buf.put(streamBuf, 0, read); 47 | mp3BufPos = read; 48 | } else { 49 | mp3Buf.put(streamBuf, 0, read); 50 | mp3BufPos += read; 51 | log.debug("Buffered " + mp3BufPos + "/" + bufferSize + " bytes from MP3 stream"); 52 | } 53 | } 54 | sender.sendAudioEnd(); 55 | } 56 | 57 | private static void convertAudio(String mp3Path, String wavPath) throws JavaLayerException { 58 | log.debug("Starting to convert " + mp3Path + " to " + wavPath); 59 | new Converter().convert(mp3Path, wavPath); 60 | log.debug("Converted " + mp3Path + " to " + wavPath); 61 | } 62 | 63 | private static void writeBytes(String path, ByteBuffer buf, int length) throws IOException { 64 | try (FileOutputStream 
outputStream = new FileOutputStream(path)) { 65 | try (FileChannel channel = outputStream.getChannel()) { 66 | buf.flip(); 67 | channel.write(buf); 68 | } 69 | } 70 | log.debug("Wrote " + length + " bytes to " + path); 71 | } 72 | 73 | private void sendAudioAsync(String mp3Path, MessageSender sender) { 74 | new Thread(() -> { 75 | String wavPath; 76 | try { 77 | wavPath = newTempFile(".wav"); 78 | } catch (IOException ex) { 79 | log.error("Error creating temp file", ex); 80 | return; 81 | } 82 | 83 | try { 84 | convertAudio(mp3Path, wavPath); 85 | } catch (JavaLayerException ex) { 86 | log.error("Error converting MP3 to WAV", ex); 87 | return; 88 | } 89 | 90 | try (InputStream wavStream = new BufferedInputStream(new FileInputStream(wavPath))) { 91 | sender.sendAudio(wavStream); 92 | } catch (Exception ex) { 93 | log.error("Error sending audio", ex); 94 | } finally { 95 | deleteTempFile(mp3Path); 96 | deleteTempFile(wavPath); 97 | } 98 | }).run(); 99 | } 100 | 101 | private String newTempFile(String suffix) throws IOException { 102 | return createTempFile(getClass().getName(), suffix).toString(); 103 | } 104 | 105 | private static void deleteTempFile(String tempFile) { 106 | Path path = Paths.get(tempFile); 107 | try { 108 | deleteIfExists(path); 109 | } catch (IOException ex) { 110 | log.error("Error deleting temp file: " + tempFile, ex); 111 | } 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/com/github/catalystcode/fortis/speechtotext/Transcriber.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext; 2 | 3 | import com.github.catalystcode.fortis.speechtotext.config.SpeechServiceConfig; 4 | import com.github.catalystcode.fortis.speechtotext.lifecycle.MessageReceiver; 5 | import com.github.catalystcode.fortis.speechtotext.websocket.MessageSender; 6 | import 
com.github.catalystcode.fortis.speechtotext.websocket.SpeechServiceClient; 7 | import com.github.catalystcode.fortis.speechtotext.websocket.nv.NvSpeechServiceClient; 8 | 9 | import java.io.IOException; 10 | import java.io.InputStream; 11 | import java.util.function.Consumer; 12 | 13 | public abstract class Transcriber { 14 | protected final SpeechServiceConfig config; 15 | private final SpeechServiceClient client; 16 | 17 | Transcriber(SpeechServiceConfig config, SpeechServiceClient client) { 18 | this.config = config; 19 | this.client = client; 20 | } 21 | 22 | public void transcribe(InputStream audioStream, Consumer onResult, Consumer onHypothesis) throws Exception { 23 | transcribe(audioStream, onResult, onHypothesis); 24 | } 25 | 26 | public void transcribe(InputStream audioStream, Consumer onResult, Consumer onHypothesis, 27 | Consumer onTurnStart, Runnable onTurnEnd) throws Exception { 28 | MessageReceiver receiver = new MessageReceiver(onResult, onHypothesis, onTurnStart, onTurnEnd, client.getEndLatch()); 29 | try { 30 | MessageSender sender = client.start(config, receiver); 31 | receiver.setSender(sender); 32 | sender.sendConfiguration(); 33 | sendAudio(audioStream, sender); 34 | client.awaitEnd(); 35 | } finally { 36 | client.stop(); 37 | } 38 | } 39 | 40 | protected abstract void sendAudio(InputStream audioStream, MessageSender sender) throws IOException; 41 | 42 | public static Transcriber create(String audioPath, SpeechServiceConfig config) { 43 | return create(audioPath, config, new NvSpeechServiceClient()); 44 | } 45 | 46 | public static Transcriber create(SpeechServiceConfig config) { 47 | return create(config, new NvSpeechServiceClient()); 48 | } 49 | 50 | private static Transcriber create(String audioPath, SpeechServiceConfig config, SpeechServiceClient client) { 51 | if (audioPath.endsWith(".wav")) { 52 | return new WavTranscriber(config, client); 53 | } 54 | 55 | if (audioPath.endsWith(".mp3")) { 56 | return new Mp3Transcriber(config, client); 57 
| } 58 | 59 | throw new IllegalArgumentException("Unsupported audio file type: " + audioPath); 60 | } 61 | 62 | private static Transcriber create(SpeechServiceConfig config, SpeechServiceClient client) { 63 | return new WavTranscriber(config, client); 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/com/github/catalystcode/fortis/speechtotext/WavTranscriber.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext; 2 | 3 | import com.github.catalystcode.fortis.speechtotext.config.SpeechServiceConfig; 4 | import com.github.catalystcode.fortis.speechtotext.websocket.MessageSender; 5 | import com.github.catalystcode.fortis.speechtotext.websocket.SpeechServiceClient; 6 | 7 | import java.io.InputStream; 8 | 9 | class WavTranscriber extends Transcriber { 10 | WavTranscriber(SpeechServiceConfig config, SpeechServiceClient client) { 11 | super(config, client); 12 | } 13 | 14 | @Override 15 | protected void sendAudio(InputStream wavStream, MessageSender sender) { 16 | sendAudioAsync(wavStream, sender); 17 | } 18 | 19 | private void sendAudioAsync(InputStream wavStream, MessageSender sender) { 20 | new Thread(() -> { 21 | sender.sendAudio(wavStream); 22 | sender.sendAudioEnd(); 23 | }).run(); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/com/github/catalystcode/fortis/speechtotext/config/OutputFormat.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext.config; 2 | 3 | @SuppressWarnings("unused") 4 | public enum OutputFormat { 5 | SIMPLE("simple"), 6 | DETAILED("detailed"), 7 | ; 8 | 9 | public final String value; 10 | 11 | OutputFormat(String value) { 12 | this.value = value; 13 | } 14 | } 15 | 
-------------------------------------------------------------------------------- /src/main/java/com/github/catalystcode/fortis/speechtotext/config/SpeechServiceConfig.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext.config; 2 | 3 | import java.util.Locale; 4 | 5 | import static com.github.catalystcode.fortis.speechtotext.utils.Environment.getSpeechPlatformHost; 6 | import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceConnectionHeaders.*; 7 | 8 | public class SpeechServiceConfig { 9 | private final String subscriptionKey; 10 | private final SpeechType speechType; 11 | private final OutputFormat outputFormat; 12 | private final Locale locale; 13 | private final String host; 14 | 15 | public SpeechServiceConfig(String subscriptionKey, SpeechType speechType, OutputFormat outputFormat, Locale locale) { 16 | this.subscriptionKey = subscriptionKey; 17 | this.speechType = speechType; 18 | this.outputFormat = outputFormat; 19 | this.locale = locale; 20 | this.host = getSpeechPlatformHost(); 21 | } 22 | 23 | public String getConnectionUrl(String connectionId) { 24 | return host + speechType.endpoint + 25 | '?' 
+ LANGUAGE + '=' + locale.toLanguageTag() + 26 | '&' + FORMAT + '=' + outputFormat.value + 27 | '&' + CONNECTION_ID + '=' + connectionId + 28 | '&' + SUBSCRIPTION_KEY + '=' + subscriptionKey; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/github/catalystcode/fortis/speechtotext/config/SpeechType.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext.config; 2 | 3 | @SuppressWarnings("unused") 4 | public enum SpeechType { 5 | INTERACTIVE("/speech/recognition/interactive/cognitiveservices/v1"), 6 | DICTATION("/speech/recognition/dictation/cognitiveservices/v1"), 7 | CONVERSATION("/speech/recognition/conversation/cognitiveservices/v1"), 8 | ; 9 | 10 | public final String endpoint; 11 | 12 | SpeechType(String endpoint) { 13 | this.endpoint = endpoint; 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/com/github/catalystcode/fortis/speechtotext/constants/EnvironmentVariables.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext.constants; 2 | 3 | public final class EnvironmentVariables { 4 | private EnvironmentVariables() {} 5 | 6 | private static final String PREFIX = "SSTWSJAVA"; 7 | public static final String HOST = PREFIX + "_HOST"; 8 | public static final String LIBRARY_VERSION = PREFIX + "_LIBRARY_VERSION"; 9 | public static final String DEVICE_MANUFACTURER = PREFIX + "_DEVICE_MANUFACTURER"; 10 | public static final String DEVICE_MODEL = PREFIX + "_DEVICE_MODEL"; 11 | public static final String DEVICE_VERSION = PREFIX + "_DEVICE_VERSION"; 12 | public static final String MP3_BUFFER_SIZE = PREFIX + "_MP3_BUFER_SIZE"; 13 | } 14 | -------------------------------------------------------------------------------- 
/**
 * Numeric limits and audio parameters shared across the library.
 *
 * <p>Constant holder only; it cannot be instantiated.
 */
public final class SpeechServiceLimitations {
    private SpeechServiceLimitations() {}

    // NOTE(review): name suggests an upper bound on the length of reported error messages — confirm at the use site.
    public static final int MAX_ERROR_MESSAGE_NUM_CHARACTERS = 50;

    // NOTE(review): name suggests the largest audio payload per message, in bytes — confirm at the use site.
    public static final int MAX_BYTES_PER_AUDIO_CHUNK = 8192;

    // NOTE(review): presumably samples per second (16 kHz) — confirm at the use site.
    public static final int SAMPLE_RATE = 16000;

    // NOTE(review): presumably 1 = mono — confirm at the use site.
    public static final short NUM_CHANNELS = 1;
}
/**
 * Header names and delimiter strings used when building and parsing messages.
 *
 * <p>Constant holder only; it cannot be instantiated.
 */
public final class SpeechServiceMessageHeaders {
    /** Separator between two header lines. */
    public static final String HEADER_DELIM = "\r\n";

    /** Separator between the header section and the body: two header separators in a row. */
    public static final String BODY_DELIM = HEADER_DELIM + HEADER_DELIM;

    public static final String PATH = "Path";
    public static final String REQUEST_ID = "X-RequestId";
    public static final String TIMESTAMP = "X-Timestamp";
    public static final String CONTENT_TYPE = "Content-Type";

    private SpeechServiceMessageHeaders() {}
}
/**
 * Key names for the speech configuration payload, grouped by the section they are named after.
 *
 * <p>Constant holder only; it cannot be instantiated.
 */
public final class SpeechServiceSpeechConfig {
    /** The system, os and device sections all use the same key for their version. */
    private static final String VERSION_KEY = "version";

    // Section names
    public static final String CONTEXT = "context";
    public static final String SYSTEM = "system";
    public static final String OS = "os";
    public static final String DEVICE = "device";

    // Keys named after the "system" section
    public static final String SYSTEM_VERSION = VERSION_KEY;

    // Keys named after the "os" section
    public static final String OS_PLATFORM = "platform";
    public static final String OS_NAME = "name";
    public static final String OS_VERSION = VERSION_KEY;

    // Keys named after the "device" section
    public static final String DEVICE_MANUFACTURER = "manufacturer";
    public static final String DEVICE_MODEL = "model";
    public static final String DEVICE_VERSION = VERSION_KEY;

    private SpeechServiceSpeechConfig() {}
}
java.util.concurrent.CountDownLatch;
import java.util.function.Consumer;

import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceMessageHeaders.PATH;
import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceMessageHeaders.REQUEST_ID;
import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServicePaths.*;


/**
 * Dispatches text messages received from the speech service to the callbacks
 * supplied by the caller, based on the message's {@code Path} header.
 */
public class MessageReceiver {
    private static final Logger log = Logger.getLogger(MessageReceiver.class);
    private final Consumer<String> onResult;
    private final Consumer<String> onHypothesis;
    private final Consumer<String> onTurnStart;
    private final Runnable onTurnEnd;
    private final CountDownLatch endLatch;
    private MessageSender sender;

    /**
     * Creates a receiver without turn-start and turn-end callbacks.
     *
     * @param onResult receives the text of each successfully recognized phrase
     * @param onHypothesis receives the text of each hypothesis; may be {@code null}
     * @param endLatch counted down once when a turn-end message is handled
     */
    public MessageReceiver(Consumer<String> onResult, Consumer<String> onHypothesis, CountDownLatch endLatch) {
        this(onResult, onHypothesis, null, null, endLatch);
    }

    /**
     * @param onResult receives the text of each successfully recognized phrase
     * @param onHypothesis receives the text of each hypothesis; may be {@code null}
     * @param onTurnStart receives the service tag of a turn-start message; may be {@code null}
     * @param onTurnEnd run when a turn-end message arrives; may be {@code null}
     * @param endLatch counted down once when a turn-end message is handled
     */
    public MessageReceiver(Consumer<String> onResult, Consumer<String> onHypothesis,
                           Consumer<String> onTurnStart, Runnable onTurnEnd, CountDownLatch endLatch) {
        this.onResult = onResult;
        this.onHypothesis = onHypothesis;
        this.onTurnStart = onTurnStart;
        this.onTurnEnd = onTurnEnd;
        this.endLatch = endLatch;
    }

    /**
     * Parses one raw message, records the call for telemetry and routes it to
     * the handler matching its path. Messages with any other path are only
     * recorded and logged.
     *
     * @param message raw text message (headers, blank line, JSON body)
     * @throws IllegalArgumentException if the message cannot be split into headers and body
     * @throws NullPointerException if the message has no {@code X-RequestId} or
     *     {@code Path} header, because the telemetry maps reject {@code null} keys
     */
    public void onMessage(String message) {
        Map<String, String> headers = MessageParser.parseHeaders(message);
        JSONObject body = MessageParser.parseBody(message);

        String path = headers.get(PATH);
        String requestId = headers.get(REQUEST_ID);
        CallsTelemetry.forId(requestId).recordCall(path);
        log.debug("Got message at path " + path + " with payload '" + body + "'");

        if (TURN_START.equalsIgnoreCase(path)) {
            TurnStartMessage.handle(body, onTurnStart);
        } else if (SPEECH_HYPOTHESIS.equalsIgnoreCase(path)) {
            SpeechHypothesisMessage.handle(body, onHypothesis);
        } else if (SPEECH_PHRASE.equalsIgnoreCase(path)) {
            SpeechPhraseMessage.handle(body, onResult);
        } else if (TURN_END.equalsIgnoreCase(path)) {
            TurnEndMessage.handle(sender, endLatch, onTurnEnd);
        }
    }

    /**
     * Sets the sender used to emit telemetry when a turn ends. Must be called
     * before the first turn-end message is handled.
     */
    public void setSender(MessageSender sender) {
        this.sender = sender;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/lifecycle/SpeechHypothesisMessage.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.lifecycle;

import org.json.JSONObject;

import java.util.function.Consumer;

import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceMessageFields.HYPOTHESIS_TEXT;

/** Handler for hypothesis (partial recognition) messages. */
final class SpeechHypothesisMessage {
    private SpeechHypothesisMessage() {}

    /**
     * Passes the hypothesis text of {@code message} to {@code onHypothesis}.
     * Does nothing when no callback was registered.
     */
    static void handle(JSONObject message, Consumer<String> onHypothesis) {
        if (onHypothesis == null) {
            return;
        }

        String displayText = message.getString(HYPOTHESIS_TEXT);
        onHypothesis.accept(displayText);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/lifecycle/SpeechPhraseMessage.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.lifecycle;

import org.apache.log4j.Logger;
import org.json.JSONObject;

import java.util.function.Consumer;

import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceMessageFields.*;

/** Handler for phrase (final recognition) messages. */
final class SpeechPhraseMessage {
    private static final Logger log = Logger.getLogger(SpeechPhraseMessage.class);
    private SpeechPhraseMessage() {}

    /**
     * Passes the display text of a successfully recognized phrase to
     * {@code onResult}. Messages with any other status are logged and dropped.
     */
    static void handle(JSONObject message, Consumer<String> onResult) {
        if (!isSuccess(message)) {
            return;
        }

        // Same guard as the sibling handlers: a missing callback means there
        // is nobody to notify, not an error. The status is still logged above.
        if (onResult == null) {
            return;
        }

        String displayText = message.getString(DISPLAY_TEXT);
        onResult.accept(displayText);
    }

    /**
     * Returns {@code true} only for the success status. End-of-dictation
     * statuses are logged at info level, anything else as a warning.
     */
    private static boolean isSuccess(JSONObject message) {
        String status = message.getString(RECOGNITION_STATUS);

        if (END_OF_DICTATION_STATUS.equalsIgnoreCase(status) ||
            END_OF_DICTATION_SILENCE_STATUS.equalsIgnoreCase(status)) {
            log.info("Detected end of speech");
            return false;
        }

        if (!SUCCESS_STATUS.equalsIgnoreCase(status)) {
            log.warn("Unable to recognize audio: " + message);
            return false;
        }

        return true;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/lifecycle/TurnEndMessage.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.lifecycle;

import com.github.catalystcode.fortis.speechtotext.websocket.MessageSender;

import java.util.concurrent.CountDownLatch;

/** Handler for turn-end messages. */
final class TurnEndMessage {
    private TurnEndMessage() {}

    /**
     * Runs the optional turn-end callback, sends telemetry and releases the
     * latch. The latch is counted down even when the callback or the
     * telemetry send throws (including a {@code null} sender), so a thread
     * waiting on it is never left blocked; the exception still propagates.
     */
    static void handle(MessageSender sender, CountDownLatch turnEndLatch, Runnable onTurnEnd) {
        try {
            if (onTurnEnd != null) {
                onTurnEnd.run();
            }
        } finally {
            try {
                sender.sendTelemetry();
            } finally {
                turnEndLatch.countDown();
            }
        }
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/lifecycle/TurnStartMessage.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.lifecycle;

import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceMessageFields.*;

import java.util.function.Consumer;

import org.json.JSONObject;

/** Handler for turn-start messages. */
final class TurnStartMessage {
    private TurnStartMessage() {}

    /**
     * Passes the service tag found in the message's context object to
     * {@code onTurnStart}. Does nothing when no callback was registered.
     */
    static void handle(JSONObject message, Consumer<String> onTurnStart) {
        if (onTurnStart == null) {
            return;
        }

        JSONObject context = message.getJSONObject(CONTEXT);
        String serviceTag = context.getString(SERVICE_TAG);
        onTurnStart.accept(serviceTag);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/messages/AudioEndMessageCreator.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.messages;

import java.nio.ByteBuffer;

import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceContentTypes.WAV;
import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServicePaths.AUDIO;

/** Builds the audio message with an empty payload that marks the end of the audio. */
public final class AudioEndMessageCreator {
    private AudioEndMessageCreator() {}

    // Created in the "not first message" state so that no RIFF header is ever
    // added; in that state the creator never changes, so sharing it is safe.
    private static final BinaryMessageCreator binaryMessageCreator = new BinaryMessageCreator(false);

    /**
     * @param requestId request the audio belongs to
     * @return a binary audio message containing headers only
     */
    public static ByteBuffer createAudioEndMessage(String requestId) {
        return binaryMessageCreator.createBinaryMessage(AUDIO, requestId, WAV, new byte[0], 0);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/messages/BinaryMessageCreator.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.messages;

import java.nio.ByteBuffer;

import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceLimitations.NUM_CHANNELS;
import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceLimitations.SAMPLE_RATE;
import static com.github.catalystcode.fortis.speechtotext.messages.HeaderCreator.addHeaders;
import static com.github.catalystcode.fortis.speechtotext.utils.RiffHeader.RIFF_HEADER_LENGTH;
import static
com.github.catalystcode.fortis.speechtotext.utils.RiffHeader.putRiffHeader;
import static java.nio.ByteBuffer.allocate;
import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Builds binary messages: a 2-byte header length, the UTF-8 encoded text
 * headers, then the payload. The first message produced by an instance
 * created with {@link #BinaryMessageCreator()} also carries a synthesized
 * RIFF header in front of the audio bytes.
 */
public class BinaryMessageCreator {
    private boolean isFirstMessage;

    /** Creates a creator whose first message will carry a RIFF header. */
    public BinaryMessageCreator() {
        this(true);
    }

    /** @param isFirstMessage whether the next message is treated as the first one */
    BinaryMessageCreator(boolean isFirstMessage) {
        this.isFirstMessage = isFirstMessage;
    }

    /**
     * Assembles one binary message.
     *
     * @param path value of the {@code Path} header
     * @param requestId value of the {@code X-RequestId} header
     * @param contentType value of the {@code Content-Type} header
     * @param wavBytes buffer holding the audio bytes
     * @param count number of valid bytes in {@code wavBytes}
     * @return the buffer as written; it is not flipped, so its position is
     *     just past the last byte written
     */
    public ByteBuffer createBinaryMessage(String path, String requestId, String contentType, byte[] wavBytes, int count) {
        byte[] headerBytes = formatHeaders(path, requestId, contentType);
        ByteBuffer message = allocateBuffer(count, headerBytes.length);
        putHeader(headerBytes, message);
        putContent(wavBytes, count, message);
        updateState();
        return message;
    }

    /** Renders the text headers and encodes them as UTF-8. */
    private static byte[] formatHeaders(String path, String requestId, String contentType) {
        StringBuilder headerText = addHeaders(new StringBuilder(), path, requestId, contentType);
        return headerText.toString().getBytes(UTF_8);
    }

    /**
     * Writes the payload. For the first message a fresh RIFF header is
     * written and the first {@code RIFF_HEADER_LENGTH} bytes of
     * {@code wavBytes} are skipped; later messages copy {@code count} bytes
     * from offset 0.
     *
     * <p>NOTE(review): a first chunk shorter than {@code RIFF_HEADER_LENGTH}
     * gives a negative length and makes {@code buf.put} throw; confirm that
     * callers never send such a chunk.
     */
    private void putContent(byte[] wavBytes, int count, ByteBuffer buf) {
        if (count <= 0) {
            return;
        }

        if (!isFirstMessage) {
            buf.put(wavBytes, 0, count);
            return;
        }

        putRiffHeader(buf, SAMPLE_RATE, NUM_CHANNELS);
        buf.put(wavBytes, RIFF_HEADER_LENGTH, count - RIFF_HEADER_LENGTH);
    }

    /** After any message has been built, later messages are no longer "first". */
    private void updateState() {
        isFirstMessage = false;
    }

    /** Writes the header length as a short, followed by the header bytes. */
    private static void putHeader(byte[] header, ByteBuffer buf) {
        short headerLength = (short) header.length;
        buf.putShort(headerLength);
        buf.put(header);
    }

    /**
     * Allocates room for the length prefix, the headers, the payload and,
     * for the first message, a RIFF header.
     *
     * <p>NOTE(review): for the first message this reserves
     * {@code RIFF_HEADER_LENGTH} bytes more than {@link #putContent} writes,
     * because the payload is shortened by the same amount there; confirm
     * whether the trailing unwritten bytes are intended.
     */
    private ByteBuffer allocateBuffer(int numWavBytes, int numHeaderBytes) {
        int riffBytes = isFirstMessage ? RIFF_HEADER_LENGTH : 0;
        return allocate(2 + numHeaderBytes + riffBytes + numWavBytes);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/messages/HeaderCreator.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.messages;

import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceMessageHeaders.*;
import static com.github.catalystcode.fortis.speechtotext.utils.ProtocolUtils.newTimestamp;

/** Renders the standard header lines shared by text and binary messages. */
final class HeaderCreator {
    private HeaderCreator() {}

    /**
     * Appends the path, request id, timestamp and content type headers, in
     * that order, each terminated by {@code HEADER_DELIM}.
     *
     * @return {@code sb}, to allow chaining
     */
    static StringBuilder addHeaders(StringBuilder sb, String path, String requestId, String contentType) {
        appendHeader(sb, PATH, path);
        appendHeader(sb, REQUEST_ID, requestId);
        appendHeader(sb, TIMESTAMP, newTimestamp());
        appendHeader(sb, CONTENT_TYPE, contentType);
        return sb;
    }

    /** Appends one {@code Name: value} line. */
    private static void appendHeader(StringBuilder sb, String name, String value) {
        sb.append(name).append(": ").append(value).append(HEADER_DELIM);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/messages/MessageParser.java:
--------------------------------------------------------------------------------
package
com.github.catalystcode.fortis.speechtotext.messages;

import org.json.JSONObject;

import java.util.HashMap;
import java.util.Map;

import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceMessageHeaders.BODY_DELIM;
import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceMessageHeaders.HEADER_DELIM;

/**
 * Parses text messages of the form
 * {@code Name: value<CRLF>...<CRLF><CRLF>body} into headers and JSON body.
 */
public final class MessageParser {
    private MessageParser() {}

    /**
     * Extracts the headers of {@code message}.
     *
     * <p>Each header line is split at its first colon, so values may contain
     * colons themselves (timestamps, for example). Name and value are trimmed.
     *
     * @param message full text message
     * @return header names mapped to their values
     * @throws IllegalArgumentException if the message has no header/body
     *     separator, has an empty body, or contains a header line without a colon
     */
    public static Map<String, String> parseHeaders(String message) {
        String[] headerLines = splitHeaderAndBody(message)[0].split(HEADER_DELIM);
        Map<String, String> headers = new HashMap<>(headerLines.length);
        for (String headerLine : headerLines) {
            // Limit 2: everything after the first colon belongs to the value,
            // including any further or trailing colons.
            String[] nameAndValue = headerLine.split(":", 2);
            if (nameAndValue.length < 2) {
                throw new IllegalArgumentException("Header '" + headerLine + "' does not have a name and value");
            }
            headers.put(nameAndValue[0].trim(), nameAndValue[1].trim());
        }
        return headers;
    }

    /**
     * Extracts and parses the JSON body of {@code message}.
     *
     * @param message full text message
     * @return the parsed body
     * @throws IllegalArgumentException if the message has no header/body
     *     separator or has an empty body
     */
    public static JSONObject parseBody(String message) {
        return new JSONObject(splitHeaderAndBody(message)[1]);
    }

    /**
     * Splits {@code message} at the first {@code BODY_DELIM} only, so a body
     * that itself contains a blank line is still accepted.
     *
     * @return a two-element array: header section, body
     */
    private static String[] splitHeaderAndBody(String message) {
        int delimStart = message.indexOf(BODY_DELIM);
        int bodyStart = delimStart + BODY_DELIM.length();
        if (delimStart < 0 || bodyStart == message.length()) {
            throw new IllegalArgumentException("Message '" + message + "' does not have header and body");
        }
        return new String[] {message.substring(0, delimStart), message.substring(bodyStart)};
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/messages/TextMessageCreator.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.messages;

import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceMessageHeaders.HEADER_DELIM;
import static com.github.catalystcode.fortis.speechtotext.messages.HeaderCreator.addHeaders;

/** Builds text messages: header lines, a blank line, then the body. */
public final class TextMessageCreator {
    private TextMessageCreator() {}

    /**
     * @param path value of the {@code Path} header
     * @param requestId value of the {@code X-RequestId} header
     * @param contentType value of the {@code Content-Type} header
     * @param message body text
     * @return the complete text message
     */
    public static String createTextMessage(String path, String requestId, String contentType, String message) {
        // addHeaders ends every header line with HEADER_DELIM; one more
        // HEADER_DELIM produces the blank line that separates the body.
        return addHeaders(new StringBuilder(), path, requestId, contentType)
            .append(HEADER_DELIM)
            .append(message)
            .toString();
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/telemetry/AudioTelemetry.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.telemetry;

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

import static com.github.catalystcode.fortis.speechtotext.utils.ProtocolUtils.newTimestamp;

/**
 * Timestamps describing the audio upload of one request. Instances are
 * obtained per request id through {@link #forId(String)} and kept in a
 * process-wide pool; nothing in this class removes entries from that pool.
 */
public final class AudioTelemetry {
    private static final ConcurrentMap<String, AudioTelemetry> POOL = new ConcurrentHashMap<>();

    private String audioStarted;
    private String audioEnded;
    private String audioErrored;

    private AudioTelemetry() {}

    /** Records the start time; only the first call has an effect. */
    public void recordAudioStarted() {
        if (audioStarted == null) {
            audioStarted = newTimestamp();
        }
    }

    /** Records the end time; has no effect once an end time is set. */
    public void recordAudioEnded() {
        if (audioEnded == null) {
            audioEnded = newTimestamp();
        }
    }

    /** Records a failure: overwrites the end time and stores the error message. */
    public void recordAudioFailed(String message) {
        audioEnded = newTimestamp();
        audioErrored = message;
    }

    /** @return the recorded error message, or {@code null} if none */
    public String getAudioErrored() {
        return audioErrored;
    }

    /** @return the recorded end timestamp, or {@code null} if none */
    public String getAudioEnded() {
        return audioEnded;
    }

    /** @return the recorded start timestamp, or {@code null} if none */
    public String getAudioStarted() {
        return audioStarted;
    }

    /**
     * Returns the instance for {@code requestId}, creating it on first use.
     *
     * @throws NullPointerException if {@code requestId} is {@code null}
     */
    public static AudioTelemetry forId(String requestId) {
        return POOL.computeIfAbsent(requestId, id -> new AudioTelemetry());
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/telemetry/CallsTelemetry.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.telemetry;

import java.util.Map;
import java.util.Queue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;

import static com.github.catalystcode.fortis.speechtotext.utils.ProtocolUtils.newTimestamp;

/**
 * Per-request record of when messages were received on each path. Instances
 * are obtained per request id through {@link #forId(String)} and kept in a
 * process-wide pool; nothing in this class removes entries from that pool.
 */
public final class CallsTelemetry {
    private static final ConcurrentMap<String, CallsTelemetry> POOL = new ConcurrentHashMap<>();

    private final ConcurrentMap<String, Queue<String>> callTimestamps = new ConcurrentHashMap<>();

    private CallsTelemetry() {}

    /**
     * Appends the current timestamp to the list kept for {@code endpoint}.
     *
     * @throws NullPointerException if {@code endpoint} is {@code null}
     */
    public void recordCall(String endpoint) {
        String now = newTimestamp();
        callTimestamps.computeIfAbsent(endpoint, key -> new ConcurrentLinkedQueue<>()).add(now);
    }

    /** @return the live map from endpoint to recorded timestamps (not a copy) */
    public Map<String, Queue<String>> getCallTimestamps() {
        return callTimestamps;
    }

    /**
     * Returns the instance for {@code requestId}, creating it on first use.
     *
     * @throws NullPointerException if {@code requestId} is {@code null}
     */
    public static CallsTelemetry forId(String requestId) {
        return POOL.computeIfAbsent(requestId, id -> new CallsTelemetry());
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/telemetry/ConnectionTelemetry.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.telemetry;

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

import static com.github.catalystcode.fortis.speechtotext.utils.ProtocolUtils.newTimestamp;

/**
 * Timestamps describing the establishment of one connection. Instances are
 * obtained per connection id through {@link #forId(String)} and kept in a
 * process-wide pool; nothing in this class removes entries from that pool.
 */
public final class ConnectionTelemetry {
    private static final ConcurrentMap<String, ConnectionTelemetry> POOL = new ConcurrentHashMap<>();

    private String connectionStarted;
    private String connectionEstablished;
    private String connectionErrored;

    private ConnectionTelemetry() {}

    /** Records the start time; only the first call has an effect. */
    public void recordConnectionStarted() {
        if (connectionStarted == null) {
            connectionStarted = newTimestamp();
        }
    }

    /** Records the established time; has no effect once it is set. */
    public void recordConnectionEstablished() {
        if (connectionEstablished == null) {
            connectionEstablished = newTimestamp();
        }
    }

    /**
     * Records a failure: overwrites the "established" timestamp with the
     * current time and stores the error message.
     */
    public void recordConnectionFailed(String message) {
        connectionEstablished = newTimestamp();
        connectionErrored = message;
    }

    /** @return the recorded error message, or {@code null} if none */
    public String getConnectionErrored() {
        return connectionErrored;
    }

    /** @return the recorded established/failed timestamp, or {@code null} if none */
    public String getConnectionEstablished() {
        return connectionEstablished;
    }

    /** @return the recorded start timestamp, or {@code null} if none */
    public String getConnectionStarted() {
        return connectionStarted;
    }

    /**
     * Returns the instance for {@code connectionId}, creating it on first use.
     *
     * @throws NullPointerException if {@code connectionId} is {@code null}
     */
    public static ConnectionTelemetry forId(String connectionId) {
        return POOL.computeIfAbsent(connectionId, id -> new ConnectionTelemetry());
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/utils/Environment.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.utils;

import static com.github.catalystcode.fortis.speechtotext.constants.EnvironmentVariables.*;
import static com.github.catalystcode.fortis.speechtotext.utils.Units.KB;
import static java.lang.Integer.parseInt;

/**
 * Settings read from environment variables, each with a built-in default
 * used when the variable is not set.
 */
public final class Environment {
    private Environment() {}

    /** @return the service host; defaults to {@code wss://speech.platform.bing.com} */
    public static String getSpeechPlatformHost() {
        return getenv(HOST, "wss://speech.platform.bing.com");
    }

    /** @return the library version; defaults to {@code 0.0.1} */
    public static String getLibraryVersion() {
        return getenv(LIBRARY_VERSION, "0.0.1");
    }

    /** @return the device manufacturer; defaults to {@code SpeechToText-Websockets-Java} */
    public static String getDeviceManufacturer() {
        return getenv(DEVICE_MANUFACTURER, "SpeechToText-Websockets-Java");
    }

    /** @return the device model; defaults to {@code SpeechToText-Websockets-Java} */
    public static String getDeviceModel() {
        return getenv(DEVICE_MODEL, "SpeechToText-Websockets-Java");
    }

    /** @return the device version; defaults to {@code 0.0.1} */
    public static String getDeviceVersion() {
        return getenv(DEVICE_VERSION, "0.0.1");
    }

    /**
     * @return the MP3 buffer size in bytes; defaults to 8 KB
     * @throws NumberFormatException if the variable is set to a non-integer value
     */
    public static int getMp3BufferSize() {
        return parseInt(getenv(MP3_BUFFER_SIZE, String.valueOf(8 * KB)));
    }

    /** Returns the value of {@code key}, or {@code defaultValue} when it is unset. */
    private static String getenv(String key, String defaultValue) {
        String value = System.getenv(key);
        return value != null ? value : defaultValue;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/utils/ProtocolUtils.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.utils;

import static java.time.ZonedDateTime.now;
import static java.time.format.DateTimeFormatter.ISO_INSTANT;
import static java.util.UUID.randomUUID;

/** Generators for the identifiers and timestamps placed in messages. */
public final class ProtocolUtils {
    private ProtocolUtils() {}

    /** @return a random UUID rendered without dashes */
    public static String newGuid() {
        return randomUUID().toString().replace("-", "");
    }

    /** @return the current time formatted with {@code ISO_INSTANT} */
    public static String newTimestamp() {
        return now().format(ISO_INSTANT);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/utils/RiffHeader.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.utils;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;

import static java.nio.ByteBuffer.wrap;
import static java.nio.ByteOrder.BIG_ENDIAN;
import static java.nio.ByteOrder.LITTLE_ENDIAN;

/**
 * The 44-byte RIFF/WAVE header in its PCM layout: chunk identifiers are
 * stored big-endian, all sizes and format fields little-endian.
 */
@SuppressWarnings({"unused", "WeakerAccess"})
public final class RiffHeader {
    public static final int RIFF_HEADER_LENGTH = 44;
    private static final int FORMAT_WAVE = 0x57415645;      // "WAVE"
    private static final int CHUNKID_RIFF = 0x52494646;     // "RIFF"
    private static final int SUBCHUNK1ID_FMT = 0x666d7420;  // "fmt "
    private static final int SUBCHUNK2ID_DATA = 0x64617461; // "data"
    private static final short AUDIO_FORMAT_PCM = 1;

    public final int chunkId;
    public final int chunkSize;
    public final int format;
    public final int subChunk1ID;
    public final int subChunk1Size;
    public final short audioFormat;
    public final short numChannels;
    public final int sampleRate;
    public final int byteRate;
    public final short blockAlign;
    public final short bitsPerSample;
    public final int subChunk2Id;
    public final int subChunk2Size;

    /**
     * Decodes a header from the first {@link #RIFF_HEADER_LENGTH} bytes of
     * {@code wavBytes}. The identifier fields are read but not validated.
     *
     * @throws IndexOutOfBoundsException if fewer than 44 bytes are supplied
     */
    public RiffHeader(byte[] wavBytes) {
        ByteBuffer waveHeader = wrap(wavBytes, 0, RIFF_HEADER_LENGTH);

        waveHeader.order(BIG_ENDIAN);
        chunkId = waveHeader.getInt();

        waveHeader.order(LITTLE_ENDIAN);
        chunkSize = waveHeader.getInt();

        waveHeader.order(BIG_ENDIAN);
        format = waveHeader.getInt();
        subChunk1ID = waveHeader.getInt();

        waveHeader.order(LITTLE_ENDIAN);
        subChunk1Size = waveHeader.getInt();
        audioFormat = waveHeader.getShort();
        numChannels = waveHeader.getShort();
        sampleRate = waveHeader.getInt();
        byteRate = waveHeader.getInt();
        blockAlign = waveHeader.getShort();
        bitsPerSample = waveHeader.getShort();

        waveHeader.order(BIG_ENDIAN);
        subChunk2Id = waveHeader.getInt();

        waveHeader.order(LITTLE_ENDIAN);
        subChunk2Size = waveHeader.getInt();
    }

    /**
     * Writes a 16-bit PCM header with zero chunk and data sizes into
     * {@code buf} at its current position. The buffer's byte order is left
     * as {@code LITTLE_ENDIAN} afterwards.
     *
     * @param buf destination with at least 44 bytes remaining
     * @param sampleRate samples per second
     * @param numChannels number of audio channels
     */
    public static void putRiffHeader(ByteBuffer buf, int sampleRate, short numChannels) {
        int chunkSize = 0;
        int subChunk1Size = 16;
        int subChunk2Size = 0;
        short bitsPerSample = 16;
        int bytesPerSample = bitsPerSample / 8;
        int byteRate = sampleRate * numChannels * bytesPerSample;
        short blockAlign = (short)(numChannels * bytesPerSample);

        buf.order(BIG_ENDIAN);
        buf.putInt(CHUNKID_RIFF);
        buf.order(LITTLE_ENDIAN);
        buf.putInt(chunkSize);
        buf.order(BIG_ENDIAN);
        buf.putInt(FORMAT_WAVE);
        buf.putInt(SUBCHUNK1ID_FMT);
        buf.order(LITTLE_ENDIAN);
        buf.putInt(subChunk1Size);
        buf.putShort(AUDIO_FORMAT_PCM);
        buf.putShort(numChannels);
        buf.putInt(sampleRate);
        buf.putInt(byteRate);
        buf.putShort(blockAlign);
        buf.putShort(bitsPerSample);
        buf.order(BIG_ENDIAN);
        buf.putInt(SUBCHUNK2ID_DATA);
        buf.order(LITTLE_ENDIAN);
        buf.putInt(subChunk2Size);
    }

    /**
     * Reads and decodes a header from the start of {@code wavStream}.
     *
     * <p>A single {@code read} may deliver fewer bytes than requested even
     * when more are available, so reading continues until the header is
     * complete or the stream stops delivering data.
     *
     * @throws IOException if the stream yields fewer than 44 bytes
     */
    public static RiffHeader fromStream(InputStream wavStream) throws IOException {
        byte[] header = new byte[RIFF_HEADER_LENGTH];
        int filled = 0;
        while (filled < RIFF_HEADER_LENGTH) {
            int read = wavStream.read(header, filled, RIFF_HEADER_LENGTH - filled);
            // -1 is end of stream; 0 can come from frame-aligned streams that
            // cannot satisfy the remaining length. Stop either way rather
            // than spin.
            if (read <= 0) {
                break;
            }
            filled += read;
        }
        if (filled != RIFF_HEADER_LENGTH) {
            throw new IOException("Unable to read " + RIFF_HEADER_LENGTH + " bytes of RIFF header from stream");
        }
        return new RiffHeader(header);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/utils/Units.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.utils;

/** Size units expressed in bytes. */
public final class Units {
    private Units() {}

    /** Number of bytes in one kilobyte. */
    public static final int KB = 1024;
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/websocket/MessageSender.java:
--------------------------------------------------------------------------------
package com.github.catalystcode.fortis.speechtotext.websocket;

import com.github.catalystcode.fortis.speechtotext.messages.BinaryMessageCreator;
import com.github.catalystcode.fortis.speechtotext.telemetry.AudioTelemetry;
import com.github.catalystcode.fortis.speechtotext.telemetry.CallsTelemetry;
import com.github.catalystcode.fortis.speechtotext.telemetry.ConnectionTelemetry;
import com.github.catalystcode.fortis.speechtotext.utils.RiffHeader;
import org.apache.log4j.Logger;

import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.UnsupportedAudioFileException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;

import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceContentTypes.JSON;
import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceContentTypes.WAV;
import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceLimitations.*;
import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServicePaths.*;
import static com.github.catalystcode.fortis.speechtotext.messages.AudioEndMessageCreator.createAudioEndMessage;
import static com.github.catalystcode.fortis.speechtotext.messages.TextMessageCreator.createTextMessage;
import static com.github.catalystcode.fortis.speechtotext.utils.ProtocolUtils.newGuid;
import static javax.sound.sampled.AudioFormat.Encoding.PCM_SIGNED;
import static javax.sound.sampled.AudioSystem.getAudioInputStream;

/**
 * Builds the protocol messages of one request (configuration, audio chunks,
 * end-of-audio marker, telemetry) and hands them to the transport implemented
 * by a subclass. Each instance generates its own request id.
 */
public abstract class MessageSender {
    private static final Logger log = Logger.getLogger(MessageSender.class);

    private final String connectionId;
    private final String requestId;
    private final BinaryMessageCreator binaryMessageCreator;

    /** @param connectionId id of the connection the messages are sent on */
    protected MessageSender(String connectionId) {
        this.connectionId = connectionId;
        this.requestId = newGuid();
        this.binaryMessageCreator = new BinaryMessageCreator();
    }

    /** Sends the speech configuration message describing this platform. */
    public final void sendConfiguration() {
        String config = new PlatformInfo().toJson();
        String configMessage = createTextMessage(SPEECH_CONFIG, requestId, JSON, config);
        sendTextMessage(configMessage);
        log.info("Sent speech config: " + config);
    }

    /**
     * Converts the audio to the format sent to the service and streams it in
     * chunks. If the conversion fails, the problem is logged and nothing is
     * sent.
     *
     * @param wavStream audio source; used as is when it already is an
     *     {@link AudioInputStream}
     * @throws RuntimeException if reading or sending the audio fails
     */
    public final void sendAudio(InputStream wavStream) {
        AudioInputStream pcmStream;
        try {
            pcmStream = adjustAudioEncoding(wavStream);
        } catch (UnsupportedAudioFileException | IOException ex) {
            log.error("Problem adjusting audio", ex);
            return;
        }
        send16khzMonoPcmAudio(pcmStream);
    }

    /**
     * Applies the sample-rate, channel and encoding conversions in turn, then
     * consumes the first {@code RIFF_HEADER_LENGTH} bytes of the result.
     */
    private static AudioInputStream adjustAudioEncoding(InputStream sourceWavStream) throws UnsupportedAudioFileException, IOException {
        AudioInputStream audioPcm;
        if (sourceWavStream instanceof AudioInputStream) {
            audioPcm = (AudioInputStream) sourceWavStream;
        } else {
            audioPcm = getAudioInputStream(sourceWavStream);
        }
        AudioInputStream audio16khz = to16khz(audioPcm);
        AudioInputStream audio16khzMono = toMono(audio16khz);
        AudioInputStream audio16khzMonoPcm = toPcm(audio16khzMono);
        skipRiffHeader(audio16khzMonoPcm);
        return audio16khzMonoPcm;
    }

    /** Requests the source format with only the encoding changed to signed PCM. */
    private static AudioInputStream toPcm(AudioInputStream sourceAudioStream) {
        AudioFormat sourceFormat = sourceAudioStream.getFormat();
        return getAudioInputStream(new AudioFormat(
            PCM_SIGNED,
            sourceFormat.getSampleRate(),
            sourceFormat.getSampleSizeInBits(),
            sourceFormat.getChannels(),
            sourceFormat.getFrameSize(),
            sourceFormat.getFrameRate(),
            sourceFormat.isBigEndian()), sourceAudioStream);
    }

    /** Requests the source format with only the channel count changed to {@code NUM_CHANNELS}. */
    private static AudioInputStream toMono(AudioInputStream sourceAudioStream) {
        AudioFormat sourceFormat = sourceAudioStream.getFormat();
        return getAudioInputStream(new AudioFormat(
            sourceFormat.getEncoding(),
            sourceFormat.getSampleRate(),
            sourceFormat.getSampleSizeInBits(),
            NUM_CHANNELS,
            sourceFormat.getFrameSize(),
            sourceFormat.getFrameRate(),
            sourceFormat.isBigEndian()), sourceAudioStream);
    }

    /** Requests the source format with only the sample rate changed to {@code SAMPLE_RATE}. */
    private static AudioInputStream to16khz(AudioInputStream sourceAudioStream) {
        AudioFormat sourceFormat = sourceAudioStream.getFormat();
        return getAudioInputStream(new AudioFormat(
            sourceFormat.getEncoding(),
            SAMPLE_RATE,
            sourceFormat.getSampleSizeInBits(),
            sourceFormat.getChannels(),
            sourceFormat.getFrameSize(),
            sourceFormat.getFrameRate(),
            sourceFormat.isBigEndian()), sourceAudioStream);
    }

    /** Reads one header's worth of bytes from the stream and discards the result. */
    private static void skipRiffHeader(InputStream wavStream) throws IOException {
        RiffHeader.fromStream(wavStream);
    }

    /**
     * Streams the audio in chunks of at most {@code MAX_BYTES_PER_AUDIO_CHUNK}
     * bytes and records the start of the audio in telemetry. Any failure is
     * recorded in telemetry and rethrown wrapped in a {@link RuntimeException}.
     */
    private void send16khzMonoPcmAudio(InputStream wavStream) {
        AudioTelemetry audioTelemetry = AudioTelemetry.forId(requestId);
        audioTelemetry.recordAudioStarted();
        try {
            byte[] buf = new byte[MAX_BYTES_PER_AUDIO_CHUNK];
            int chunksSent = 0;
            int read;
            while ((read = wavStream.read(buf)) != -1) {
                ByteBuffer audioChunkMessage = binaryMessageCreator.createBinaryMessage(AUDIO, requestId, WAV, buf, read);
                sendBinaryMessage(audioChunkMessage);
                chunksSent++;
            }
            log.info("Sent " + chunksSent + " audio chunks");
        } catch (Exception ex) {
            audioTelemetry.recordAudioFailed(ex.getMessage());
            throw new RuntimeException(ex);
        }
    }

    /** Sends the empty audio message that marks the end of the audio and records the end time. */
    public final void sendAudioEnd() {
        AudioTelemetry audioTelemetry = AudioTelemetry.forId(requestId);
        ByteBuffer audioEndMessage = createAudioEndMessage(requestId);
        sendBinaryMessage(audioEndMessage);
        log.debug("Sent explicit end-of-audio marker");
        audioTelemetry.recordAudioEnded();
    }

    /** Sends the telemetry collected for this request and connection. */
    public final void sendTelemetry() {
        CallsTelemetry callsTelemetry = CallsTelemetry.forId(requestId);
        ConnectionTelemetry connectionTelemetry = ConnectionTelemetry.forId(connectionId);
        AudioTelemetry audioTelemetry = AudioTelemetry.forId(requestId);
        String telemetry = new TelemetryInfo(connectionId, callsTelemetry, connectionTelemetry, audioTelemetry).toJson();
        String telemetryMessage = createTextMessage(TELEMETRY, requestId, JSON, telemetry);
        sendTextMessage(telemetryMessage);
        log.info("Sent telemetry: " + telemetry);
    }

    /** Transmits one binary message over the underlying transport. */
    protected abstract void sendBinaryMessage(ByteBuffer message);

    /** Transmits one text message over the underlying transport. */
    protected abstract void sendTextMessage(String message);
}

--------------------------------------------------------------------------------
/src/main/java/com/github/catalystcode/fortis/speechtotext/websocket/PlatformInfo.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext.websocket; 2 | 3 | import org.json.JSONObject; 4 | 5 | import static com.github.catalystcode.fortis.speechtotext.utils.Environment.*; 6 | import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceSpeechConfig.*; 7 | import static java.lang.System.getProperty; 8 | 9 | class PlatformInfo { 10 | String toJson() { 11 | JSONObject json = new JSONObject(); 12 | json.put(CONTEXT, createContext()); 13 | return json.toString(); 14 | } 15 | 16 | private JSONObject createContext() { 17 | JSONObject json = new JSONObject(); 18 | json.put(SYSTEM, createSystem()); 19 | json.put(OS, createOs()); 20 | json.put(DEVICE, createDevice()); 21 | return json; 22 | } 23 | 24 | private JSONObject createSystem() { 25 | JSONObject json = new JSONObject(); 26 | json.put(SYSTEM_VERSION, getLibraryVersion()); 27 | return json; 28 | } 29 | 30 | private JSONObject createOs() { 31 | JSONObject json = new JSONObject(); 32 | json.put(OS_PLATFORM, getProperty("os.name").split(" ")[0]); 33 | json.put(OS_NAME, getProperty("os.name")); 34 | json.put(OS_VERSION, getProperty("os.version")); 35 | return json; 36 | } 37 | 38 | private JSONObject createDevice() { 39 | JSONObject json = new JSONObject(); 40 | json.put(DEVICE_MANUFACTURER, getDeviceManufacturer()); 41 | json.put(DEVICE_MODEL, getDeviceModel()); 42 | json.put(DEVICE_VERSION, getDeviceVersion()); 43 | return json; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/com/github/catalystcode/fortis/speechtotext/websocket/SpeechServiceClient.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext.websocket; 2 | 3 | import 
com.github.catalystcode.fortis.speechtotext.config.SpeechServiceConfig; 4 | import com.github.catalystcode.fortis.speechtotext.lifecycle.MessageReceiver; 5 | 6 | import java.util.concurrent.CountDownLatch; 7 | 8 | public interface SpeechServiceClient { 9 | MessageSender start(SpeechServiceConfig config, MessageReceiver receiver) throws Exception; 10 | void stop(); 11 | void awaitEnd() throws InterruptedException; 12 | CountDownLatch getEndLatch(); 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/com/github/catalystcode/fortis/speechtotext/websocket/TelemetryInfo.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext.websocket; 2 | 3 | import com.github.catalystcode.fortis.speechtotext.telemetry.AudioTelemetry; 4 | import com.github.catalystcode.fortis.speechtotext.telemetry.CallsTelemetry; 5 | import com.github.catalystcode.fortis.speechtotext.telemetry.ConnectionTelemetry; 6 | import org.json.JSONObject; 7 | 8 | import java.util.ArrayList; 9 | import java.util.Collection; 10 | import java.util.Map; 11 | import java.util.Queue; 12 | 13 | import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceLimitations.MAX_ERROR_MESSAGE_NUM_CHARACTERS; 14 | import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceMetrics.*; 15 | 16 | class TelemetryInfo { 17 | private final String connectionId; 18 | private final CallsTelemetry callsTelemetry; 19 | private final ConnectionTelemetry connectionTelemetry; 20 | private final AudioTelemetry audioTelemetry; 21 | 22 | TelemetryInfo(String connectionId, CallsTelemetry callsTelemetry, ConnectionTelemetry connectionTelemetry, AudioTelemetry audioTelemetry) { 23 | this.connectionId = connectionId; 24 | this.callsTelemetry = callsTelemetry; 25 | this.connectionTelemetry = connectionTelemetry; 26 | this.audioTelemetry = audioTelemetry; 27 | } 28 | 29 
| String toJson() { 30 | JSONObject json = new JSONObject(); 31 | putReceivedMessages(json); 32 | putMetrics(json); 33 | return json.toString(); 34 | } 35 | 36 | private void putMetrics(JSONObject json) { 37 | Collection metrics = new ArrayList<>(); 38 | metrics.add(createConnectionMetric()); 39 | metrics.add(createMicrophoneMetric()); 40 | json.put(METRICS, metrics); 41 | } 42 | 43 | private void putReceivedMessages(JSONObject json) { 44 | Collection receivedMessages = new ArrayList<>(); 45 | for (Map.Entry> entry : callsTelemetry.getCallTimestamps().entrySet()) { 46 | String endpoint = entry.getKey(); 47 | Queue calls = entry.getValue(); 48 | JSONObject receivedMessage = new JSONObject(); 49 | if (calls.size() > 1) { 50 | receivedMessage.put(endpoint, calls); 51 | } else { 52 | receivedMessage.put(endpoint, calls.peek()); 53 | } 54 | receivedMessages.add(receivedMessage); 55 | } 56 | json.put(RECEIVED_MESSAGES, receivedMessages); 57 | } 58 | 59 | private JSONObject createConnectionMetric() { 60 | JSONObject metric = new JSONObject(); 61 | metric.put(NAME, CONNECTION_METRIC); 62 | metric.put(ID, connectionId); 63 | metric.put(START, connectionTelemetry.getConnectionStarted()); 64 | metric.put(END, connectionTelemetry.getConnectionEstablished()); 65 | addError(metric, connectionTelemetry.getConnectionErrored()); 66 | return metric; 67 | } 68 | 69 | private JSONObject createMicrophoneMetric() { 70 | JSONObject metric = new JSONObject(); 71 | metric.put(NAME, MICROPHONE_METRIC); 72 | metric.put(START, audioTelemetry.getAudioStarted()); 73 | metric.put(END, audioTelemetry.getAudioEnded()); 74 | addError(metric, audioTelemetry.getAudioErrored()); 75 | return metric; 76 | } 77 | 78 | private void addError(JSONObject metric, String error) { 79 | if (error != null) { 80 | metric.put(ERROR, error.substring(0, MAX_ERROR_MESSAGE_NUM_CHARACTERS)); 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- 
/src/main/java/com/github/catalystcode/fortis/speechtotext/websocket/nv/NvMessageReceiver.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext.websocket.nv; 2 | 3 | import com.github.catalystcode.fortis.speechtotext.lifecycle.MessageReceiver; 4 | import com.github.catalystcode.fortis.speechtotext.telemetry.ConnectionTelemetry; 5 | import com.neovisionaries.ws.client.WebSocket; 6 | import com.neovisionaries.ws.client.WebSocketAdapter; 7 | import com.neovisionaries.ws.client.WebSocketException; 8 | import com.neovisionaries.ws.client.WebSocketFrame; 9 | import org.apache.log4j.Logger; 10 | 11 | import java.util.List; 12 | import java.util.Map; 13 | import java.util.concurrent.CountDownLatch; 14 | 15 | class NvMessageReceiver extends WebSocketAdapter { 16 | private static final Logger log = Logger.getLogger(NvMessageReceiver.class); 17 | private final CountDownLatch socketCloseLatch; 18 | private final MessageReceiver receiver; 19 | private final ConnectionTelemetry telemetry; 20 | 21 | NvMessageReceiver(CountDownLatch socketCloseLatch, MessageReceiver receiver, ConnectionTelemetry telemetry) { 22 | this.socketCloseLatch = socketCloseLatch; 23 | this.receiver = receiver; 24 | this.telemetry = telemetry; 25 | } 26 | 27 | @Override 28 | public void onConnected(WebSocket websocket, Map> headers) throws Exception { 29 | telemetry.recordConnectionEstablished(); 30 | log.debug("Websocket connected"); 31 | } 32 | 33 | @Override 34 | public void onConnectError(WebSocket websocket, WebSocketException exception) throws Exception { 35 | telemetry.recordConnectionFailed(exception.getMessage()); 36 | log.error("Websocket connection failed", exception); 37 | } 38 | 39 | @Override 40 | public void onTextMessage(WebSocket websocket, String text) throws Exception { 41 | receiver.onMessage(text); 42 | } 43 | 44 | @Override 45 | public void onError(WebSocket websocket, WebSocketException cause) 
throws Exception { 46 | log.error("Websocket read error", cause); 47 | socketCloseLatch.countDown(); 48 | } 49 | 50 | @Override 51 | public void onCloseFrame(WebSocket websocket, WebSocketFrame frame) throws Exception { 52 | int closeCode = frame.getCloseCode(); 53 | String closeReason = frame.getCloseReason(); 54 | 55 | log.info("Websocket closed with status '" + closeCode + "' and reason '" + closeReason + "'"); 56 | socketCloseLatch.countDown(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/com/github/catalystcode/fortis/speechtotext/websocket/nv/NvMessageSender.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext.websocket.nv; 2 | 3 | import com.github.catalystcode.fortis.speechtotext.websocket.MessageSender; 4 | import com.neovisionaries.ws.client.WebSocket; 5 | 6 | import java.nio.ByteBuffer; 7 | 8 | class NvMessageSender extends MessageSender { 9 | private final WebSocket webSocket; 10 | 11 | NvMessageSender(String connectionId, WebSocket webSocket) { 12 | super(connectionId); 13 | this.webSocket = webSocket; 14 | } 15 | 16 | @Override 17 | protected void sendBinaryMessage(ByteBuffer message) { 18 | webSocket.sendBinary(message.array()); 19 | } 20 | 21 | @Override 22 | protected void sendTextMessage(String message) { 23 | webSocket.sendText(message); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/com/github/catalystcode/fortis/speechtotext/websocket/nv/NvSpeechServiceClient.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext.websocket.nv; 2 | 3 | import com.github.catalystcode.fortis.speechtotext.config.SpeechServiceConfig; 4 | import com.github.catalystcode.fortis.speechtotext.lifecycle.MessageReceiver; 5 | import 
com.github.catalystcode.fortis.speechtotext.telemetry.ConnectionTelemetry; 6 | import com.github.catalystcode.fortis.speechtotext.websocket.MessageSender; 7 | import com.github.catalystcode.fortis.speechtotext.websocket.SpeechServiceClient; 8 | import com.neovisionaries.ws.client.WebSocket; 9 | import com.neovisionaries.ws.client.WebSocketFactory; 10 | 11 | import java.util.concurrent.CountDownLatch; 12 | 13 | import static com.github.catalystcode.fortis.speechtotext.utils.ProtocolUtils.newGuid; 14 | 15 | public class NvSpeechServiceClient implements SpeechServiceClient { 16 | private final CountDownLatch socketCloseLatch; 17 | private WebSocket webSocket; 18 | 19 | public NvSpeechServiceClient() { 20 | this.socketCloseLatch = new CountDownLatch(1); 21 | } 22 | 23 | @Override 24 | public MessageSender start(SpeechServiceConfig config, MessageReceiver receiver) throws Exception { 25 | String connectionId = newGuid(); 26 | ConnectionTelemetry telemetry = ConnectionTelemetry.forId(connectionId); 27 | 28 | WebSocketFactory factory = new WebSocketFactory(); 29 | webSocket = factory.createSocket(config.getConnectionUrl(connectionId)); 30 | webSocket.addListener(new NvMessageReceiver(socketCloseLatch, receiver, telemetry)); 31 | telemetry.recordConnectionStarted(); 32 | webSocket.connect(); 33 | return new NvMessageSender(connectionId, webSocket); 34 | } 35 | 36 | @Override 37 | public void stop() { 38 | webSocket.disconnect(); 39 | } 40 | 41 | @Override 42 | public void awaitEnd() throws InterruptedException { 43 | socketCloseLatch.await(); 44 | } 45 | 46 | @Override 47 | public CountDownLatch getEndLatch() { 48 | return socketCloseLatch; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/test/java/com/github/catalystcode/fortis/speechtotext/messages/MessageParserTest.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext.messages; 
2 | 3 | import org.json.JSONObject; 4 | import org.junit.jupiter.api.Test; 5 | 6 | import java.util.Map; 7 | 8 | import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceMessageHeaders.CONTENT_TYPE; 9 | import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceMessageHeaders.PATH; 10 | import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceMessageHeaders.REQUEST_ID; 11 | import static org.junit.jupiter.api.Assertions.assertEquals; 12 | 13 | class MessageParserTest { 14 | private static final String turnStartMessage = "" + 15 | "X-RequestId:e7a1b5d70b814aab8e5f43d9bc3fbf96\r\n" + 16 | "Content-Type:application/json; charset=utf-8\r\n" + 17 | "Path: turn.start\r\n" + 18 | "\r\n" + 19 | "{\r\n" + 20 | " \"context\": {\r\n" + 21 | " \"serviceTag\": \"04319a8c660a4d1e8b0ba640d9b9c6ed\"\r\n" + 22 | " }\r\n" + 23 | "}"; 24 | 25 | @Test 26 | void parseHeaders() { 27 | Map headers = MessageParser.parseHeaders(turnStartMessage); 28 | assertEquals(3, headers.size()); 29 | assertEquals("turn.start", headers.get(PATH)); 30 | assertEquals("application/json; charset=utf-8", headers.get(CONTENT_TYPE)); 31 | assertEquals("e7a1b5d70b814aab8e5f43d9bc3fbf96", headers.get(REQUEST_ID)); 32 | } 33 | 34 | @Test 35 | void parseBody() { 36 | JSONObject body = MessageParser.parseBody(turnStartMessage); 37 | assertEquals("04319a8c660a4d1e8b0ba640d9b9c6ed", body.getJSONObject("context").getString("serviceTag")); 38 | } 39 | } -------------------------------------------------------------------------------- /src/test/java/com/github/catalystcode/fortis/speechtotext/websocket/PlatformInfoTest.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext.websocket; 2 | 3 | import org.junit.jupiter.api.Test; 4 | 5 | import static org.junit.jupiter.api.Assertions.*; 6 | 7 | class PlatformInfoTest { 8 | @Test 9 | void canBeConvertedToJson() { 10 | 
String config = new PlatformInfo().toJson(); 11 | assertNotNull(config); 12 | assertNotEquals("", config); 13 | assertNotEquals("{}", config); 14 | } 15 | } -------------------------------------------------------------------------------- /src/test/java/com/github/catalystcode/fortis/speechtotext/websocket/TelemetryInfoTest.java: -------------------------------------------------------------------------------- 1 | package com.github.catalystcode.fortis.speechtotext.websocket; 2 | 3 | import com.github.catalystcode.fortis.speechtotext.telemetry.AudioTelemetry; 4 | import com.github.catalystcode.fortis.speechtotext.telemetry.CallsTelemetry; 5 | import com.github.catalystcode.fortis.speechtotext.telemetry.ConnectionTelemetry; 6 | import org.json.JSONArray; 7 | import org.json.JSONObject; 8 | import org.junit.jupiter.api.Test; 9 | 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | import java.util.Set; 13 | 14 | import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServiceMetrics.*; 15 | import static com.github.catalystcode.fortis.speechtotext.constants.SpeechServicePaths.*; 16 | import static org.junit.jupiter.api.Assertions.assertEquals; 17 | import static org.junit.jupiter.api.Assertions.assertNotEquals; 18 | import static org.junit.jupiter.api.Assertions.assertNotNull; 19 | 20 | class TelemetryInfoTest { 21 | @Test 22 | void canBeConvertedToJson() { 23 | String telemetryJson = setupTelemetry("canBeConvertedToJson"); 24 | JSONObject telemetry = new JSONObject(telemetryJson); 25 | 26 | verifyReceivedMessages(telemetry); 27 | verifyMetrics(telemetry); 28 | } 29 | 30 | @Test 31 | void sameTelemetryIsUsedForRequest() { 32 | String testName = "sameTelemetryIsUsedForRequest"; 33 | String connectionId = newConnectionId(testName); 34 | String requestId = newRequestId(testName); 35 | 36 | ConnectionTelemetry connectionTelemetry1 = ConnectionTelemetry.forId(connectionId); 37 | ConnectionTelemetry connectionTelemetry2 = 
ConnectionTelemetry.forId("otherConnectionId"); 38 | ConnectionTelemetry connectionTelemetry3 = ConnectionTelemetry.forId(connectionId); 39 | assertNotEquals(connectionTelemetry1, connectionTelemetry2); 40 | assertEquals(connectionTelemetry1, connectionTelemetry3); 41 | 42 | CallsTelemetry callsTelemetry1 = CallsTelemetry.forId(requestId); 43 | CallsTelemetry callsTelemetry2 = CallsTelemetry.forId("otherRequestId"); 44 | CallsTelemetry callsTelemetry3 = CallsTelemetry.forId(requestId); 45 | assertNotEquals(callsTelemetry1, callsTelemetry2); 46 | assertEquals(callsTelemetry1, callsTelemetry3); 47 | 48 | CallsTelemetry audioTelemetry1 = CallsTelemetry.forId(requestId); 49 | CallsTelemetry audioTelemetry2 = CallsTelemetry.forId("otherRequestId"); 50 | CallsTelemetry audioTelemetry3 = CallsTelemetry.forId(requestId); 51 | assertNotEquals(audioTelemetry1, audioTelemetry2); 52 | assertEquals(audioTelemetry1, audioTelemetry3); 53 | } 54 | 55 | private String setupTelemetry(String testName) { 56 | String connectionId = newConnectionId(testName); 57 | String requestId = newRequestId(testName); 58 | CallsTelemetry callsTelemetry = CallsTelemetry.forId(requestId); 59 | ConnectionTelemetry connectionTelemetry = ConnectionTelemetry.forId(connectionId); 60 | AudioTelemetry audioTelemetry = AudioTelemetry.forId(requestId); 61 | 62 | connectionTelemetry.recordConnectionStarted(); 63 | connectionTelemetry.recordConnectionEstablished(); 64 | audioTelemetry.recordAudioStarted(); 65 | callsTelemetry.recordCall(TURN_START); 66 | callsTelemetry.recordCall(SPEECH_HYPOTHESIS); 67 | callsTelemetry.recordCall(SPEECH_HYPOTHESIS); 68 | callsTelemetry.recordCall(SPEECH_PHRASE); 69 | callsTelemetry.recordCall(SPEECH_END); 70 | callsTelemetry.recordCall(TURN_END); 71 | audioTelemetry.recordAudioEnded(); 72 | 73 | return new TelemetryInfo(connectionId, callsTelemetry, connectionTelemetry, audioTelemetry).toJson(); 74 | } 75 | 76 | private String newRequestId(String testName) { 77 | return 
getClass().getName() + "-" + testName + "-requestId"; 78 | } 79 | 80 | private String newConnectionId(String testName) { 81 | return getClass().getName() + "-" + testName + "-connectionId"; 82 | } 83 | 84 | private void verifyReceivedMessages(JSONObject telemetry) { 85 | JSONArray receivedMessages = telemetry.getJSONArray(RECEIVED_MESSAGES); 86 | for (Object obj : receivedMessages) { 87 | JSONObject receivedMessage = (JSONObject) obj; 88 | Set keys = receivedMessage.keySet(); 89 | assertEquals(1, keys.size()); 90 | String key = keys.iterator().next(); 91 | if (SPEECH_HYPOTHESIS.equalsIgnoreCase(key)) { 92 | JSONArray values = receivedMessage.getJSONArray(key); 93 | assertNotNull(values); 94 | assertEquals(2, values.length()); 95 | } else { 96 | String value = receivedMessage.getString(key); 97 | assertNotNull(value); 98 | } 99 | } 100 | } 101 | 102 | private void verifyMetrics(JSONObject telemetry) { 103 | JSONArray metrics = telemetry.getJSONArray(METRICS); 104 | assertEquals(2, metrics.length()); 105 | Map parsedMetrics = new HashMap<>(); 106 | for (Object obj : metrics) { 107 | JSONObject metric = (JSONObject) obj; 108 | parsedMetrics.put(metric.getString(NAME), metric); 109 | } 110 | assertEquals(2, parsedMetrics.size()); 111 | JSONObject connectionMetric = parsedMetrics.get(CONNECTION_METRIC); 112 | JSONObject microphoneMetric = parsedMetrics.get(MICROPHONE_METRIC); 113 | assertNotNull(connectionMetric); 114 | assertNotNull(microphoneMetric); 115 | assertNotNull(connectionMetric.getString(START)); 116 | assertNotNull(microphoneMetric.getString(START)); 117 | assertNotNull(connectionMetric.getString(END)); 118 | assertNotNull(microphoneMetric.getString(END)); 119 | } 120 | } -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version := "0.0.7" 2 | --------------------------------------------------------------------------------