/**
 * Common contract of all token classes, so that tokens of different kinds can be
 * handled uniformly from Java code.
 */
public interface WhisperToken {

    /**
     * Returns the textual form of the token.
     *
     * @return the actual String value of the token
     */
    String getToken();

    /**
     * Returns the position of the token in the dictionary.
     *
     * @return the numerical ID of the token in the dictionary
     */
    long getTokenId();
}
12 | */ 13 | public record WhisperResult(String rawText, String text, List tokens) {} 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | HELP.md 2 | target/ 3 | !.mvn/wrapper/maven-wrapper.jar 4 | !**/src/main/**/target/ 5 | !**/src/test/**/target/ 6 | 7 | ### STS ### 8 | .apt_generated 9 | .classpath 10 | .factorypath 11 | .project 12 | .settings 13 | .springBeans 14 | .sts4-cache 15 | 16 | ### IntelliJ IDEA ### 17 | .idea 18 | *.iws 19 | *.iml 20 | *.ipr 21 | 22 | ### NetBeans ### 23 | /nbproject/private/ 24 | /nbbuild/ 25 | /dist/ 26 | /nbdist/ 27 | /.nb-gradle/ 28 | build/ 29 | !**/src/main/**/build/ 30 | !**/src/test/**/build/ 31 | 32 | ### VS Code ### 33 | .vscode/ 34 | /application.properties 35 | -------------------------------------------------------------------------------- /src/main/java/divisio/whisper/token/WhisperAnyToken.java: -------------------------------------------------------------------------------- 1 | package divisio.whisper.token; 2 | 3 | /** 4 | * Base class to represent any Whisper 3 token. 5 | */ 6 | public class WhisperAnyToken implements WhisperToken { 7 | 8 | private final long tokenId; 9 | private final String token; 10 | 11 | /** 12 | * Constructor taking the token id and the corresponding token. 13 | * @param tokenId token id 14 | * @param token string representation of the token 15 | */ 16 | public WhisperAnyToken(long tokenId, String token) { 17 | this.tokenId = tokenId; 18 | this.token = token; 19 | } 20 | 21 | /** 22 | * Get the token id. 23 | * @return token id. 24 | */ 25 | @Override 26 | public long getTokenId() { 27 | return this.tokenId; 28 | } 29 | 30 | /** 31 | * Get the token. 32 | * @return token. 
33 | */ 34 | @Override 35 | public String getToken() { 36 | return this.token; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /.mvn/wrapper/maven-wrapper.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.4/apache-maven-3.8.4-bin.zip 18 | wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar 19 | -------------------------------------------------------------------------------- /src/main/java/divisio/whisper/token/Whisper3SpecialToken.java: -------------------------------------------------------------------------------- 1 | package divisio.whisper.token; 2 | 3 | import divisio.whisper.WhisperTask; 4 | 5 | /** 6 | * Helper enum to hold some special Whisper v3 (huggingface) tokens. 7 | * 8 | *

Recommended to use in conjunction with {@link WhisperTask}. 9 | */ 10 | public enum Whisper3SpecialToken implements WhisperToken { 11 | START_OF_TRANSCRIPT(50258, "<|startoftranscript|>"), 12 | END_OF_TEXT(50257, "<|endoftext|>"), 13 | TRANSLATE(50359, "<|translate|>"), 14 | TRANSCRIBE(50360, "<|transcribe|>"), 15 | NO_TIMESTAMPS(50364, "<|notimestamps|>"); 16 | 17 | private final long tokenId; 18 | private final String token; 19 | 20 | /** 21 | * Token consisting of its index and string representation. 22 | * @param tokenId The token index for Whisper v3 23 | * @param token The string representing the token 24 | */ 25 | Whisper3SpecialToken(long tokenId, String token) { 26 | this.tokenId = tokenId; 27 | this.token = token; 28 | } 29 | 30 | /** 31 | * Get the token index of this token. 32 | * @return The token index. 33 | */ 34 | @Override 35 | public long getTokenId() { 36 | return tokenId; 37 | } 38 | 39 | /** 40 | * Get the string representation of this token. 41 | * @return The string representation of this token. 42 | */ 43 | @Override 44 | public String getToken() { 45 | return token; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/divisio/whisper/Whisper3TokenDecoder.java: -------------------------------------------------------------------------------- 1 | package divisio.whisper; 2 | 3 | import java.nio.charset.StandardCharsets; 4 | import java.util.ArrayList; 5 | import java.util.HashMap; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | /** 10 | * Token decoder to turn the predicted Whisper tokens back into text. 11 | *

/**
 * Token decoder to turn the predicted Whisper tokens back into text.
 *
 * <p>Inspired by the Whisper v3 huggingface decoder, reduced to its most crucial parts to make this work.
 *
 * <p>Raw tokens spell out bytes with stand-in characters: each of the 256 possible byte values
 * is represented by exactly one character (see {@link #createCharToUnicodeMap()}). Decoding maps
 * every character back to its byte and interprets the resulting byte sequence as UTF-8.
 */
public class Whisper3TokenDecoder {

    /** Number of distinct byte values. */
    private static final int BYTE_SIZE = 256;

    /** Maps each single-character string used in raw tokens to the byte value (0..255) it stands for. */
    private static final Map<String, Integer> CHAR_TO_UNICODE_MAP = Whisper3TokenDecoder.createCharToUnicodeMap();

    /**
     * Convert the "raw" predicted tokens from Whisper into UTF-8 text.
     *
     * <p>Characters that have no entry in the byte table are kept as they are (encoded as UTF-8)
     * instead of being turned into NUL bytes, so unexpected input is not silently corrupted.
     *
     * @param rawTokens the raw predicted tokens from Whisper; must not be {@code null}.
     * @return the concatenated UTF-8 text; empty if there are no tokens.
     */
    public static String rawTokensToText(List<String> rawTokens) {
        String joined = String.join("", rawTokens);

        // A mapped character yields exactly one byte; an unmapped one needs at most
        // three UTF-8 bytes per UTF-16 char, so this buffer can never overflow.
        byte[] buffer = new byte[joined.length() * 3];
        int used = 0;
        for (int i = 0; i < joined.length(); ) {
            int codePoint = joined.codePointAt(i);
            i += Character.charCount(codePoint);

            String symbol = Character.toString(codePoint);
            Integer mapped = CHAR_TO_UNICODE_MAP.get(symbol);
            if (mapped != null) {
                buffer[used++] = mapped.byteValue();
            } else {
                byte[] utf8 = symbol.getBytes(StandardCharsets.UTF_8);
                System.arraycopy(utf8, 0, buffer, used, utf8.length);
                used += utf8.length;
            }
        }
        return new String(buffer, 0, used, StandardCharsets.UTF_8);
    }

    /**
     * Create the char-to-byte map necessary to convert raw char tokens into UTF-8 text.
     *
     * <p>Bytes listed by {@link #createByteList()} are represented by the character with the same
     * code point. Every other byte is represented, in ascending byte order, by the characters
     * starting at code point 256.
     *
     * @return an unmodifiable map from single-character string to byte value.
     */
    private static Map<String, Integer> createCharToUnicodeMap() {
        boolean[] representsItself = new boolean[BYTE_SIZE];
        Map<String, Integer> charToByte = new HashMap<>();
        for (int b : createByteList()) {
            representsItself[b] = true;
            charToByte.put(Character.toString(b), b);
        }

        int shifted = 0;
        for (int b = 0; b < BYTE_SIZE; b++) {
            if (!representsItself[b]) {
                charToByte.put(Character.toString(BYTE_SIZE + shifted), b);
                shifted++;
            }
        }
        return Map.copyOf(charToByte);
    }

    /**
     * Create the list of byte values that are represented by the character with the same code point.
     *
     * <p>The ranges are written as numbers rather than character literals so the table does not
     * depend on the encoding the source file is compiled with.
     *
     * @return a list of bytes in integer form.
     */
    private static List<Integer> createByteList() {
        List<Integer> list = new ArrayList<>(BYTE_SIZE);
        // printable ASCII range ('!' to '~')
        addRange(list, 0x21, 0x7E);
        // extended range U+00A1 to U+00AC
        addRange(list, 0xA1, 0xAC);
        // extended range U+00AE to U+00FF
        addRange(list, 0xAE, 0xFF);
        return list;
    }

    /** Appends every value from {@code first} to {@code last} (both inclusive) to {@code target}. */
    private static void addRange(List<Integer> target, int first, int last) {
        for (int value = first; value <= last; value++) {
            target.add(value);
        }
    }
}
4 | This implementation is based on the [huggingface Python implementation of Whisper v3 large.](https://huggingface.co/openai/whisper-large-v3) 5 | 6 | **Currently only runs on GPU.** 7 | 8 | The library has the ability to run inference on the GPU in Java out of the box. 9 | 10 | Alternatives: 11 | - [whisper.cpp](https://github.com/ggerganov/whisper.cpp) to run Whisper with C++ 12 | - [whisper-jni](https://github.com/GiviMAD/whisper-jni) (a JNI wrapper for whisper.cpp) 13 | 14 | ## Installation 15 | 16 | First, follow the installation instructions for the [DJL PyTorch engine](https://djl.ai/engines/pytorch/pytorch-engine/#installation). 17 | 18 | For GPU support, you also need to ensure [CUDA](https://developer.nvidia.com/cuda-toolkit) is installed on your system and included in the path. 19 | You will need a CUDA version that matches the PyTorch version of your chosen DJL PyTorch engine. To see which DJL PyTorch engine version supports 20 | which PyTorch library version, [see here](https://djl.ai/engines/pytorch/pytorch-engine/#supported-pytorch-versions). 
21 | 22 | ## Usage example 23 | 24 | Add the following to your pom file: 25 | 26 | ```xml 27 | 32 | 33 | 34 | DIVISIO 35 | https://mvn.divis.io/ 36 | 37 | 38 | 39 | 40 | 41 | divisio 42 | whisper-java 43 | 0.1 44 | 45 | 46 | 47 | 48 | divisio 49 | whisper-model 50 | 0.1 51 | 52 | 53 | 54 | 55 | ai.djl.pytorch 56 | pytorch-native-cu121 57 | 2.1.1 58 | runtime 59 | 60 | ``` 61 | 62 | Create a demo class like this: 63 | 64 | ```java 65 | package divisio.whisper; 66 | 67 | import divisio.whisper.token.Whisper3Language; 68 | import divisio.whisper.token.WhisperToken; 69 | 70 | import java.util.Arrays; 71 | 72 | public class WhisperDemo { 73 | 74 | public static void main(String[] args) throws Exception { 75 | final String filePath = args[0]; 76 | 77 | try (Whisper3 whisper = Whisper3.instance()) { 78 | WhisperResult result = whisper.task() 79 | .language(Whisper3Language.AUTO) 80 | .transcribe(filePath) 81 | .withTimestamps() 82 | .execute(); 83 | 84 | System.out.println("raw token ids: " + Arrays.toString(result.tokens().stream().mapToLong(WhisperToken::getTokenId).toArray())); 85 | System.out.println("raw text: " + result.rawText()); 86 | System.out.println("clean: " + result.text()); 87 | } 88 | } 89 | } 90 | ``` 91 | 92 | And start the program with a parameter pointing to an audio file like `/path/to/my_audio_file.wav`. 93 | 94 | **Initiating Whisper is expensive, so instances should be reused**, e.g. by instantiating them as a spring bean singleton. 95 | Additionally, the first tasks might take a little bit longer than usual, due to internal warm-ups. 96 | 97 | ## Credits 98 | 99 | This work is based upon the huggingface version of whisper3 (https://huggingface.co/openai/whisper-large-v3/blob/main/README.md) 100 | by OpenAI. It is a traced version of that model, all JAVA code has been rewritten from scratch. We used 101 | the original Python code as a reference. 102 | 103 | ## License 104 | 105 | This library is licensed under the Apache 2.0 license (see LICENSE). 
106 | 107 | -------------------------------------------------------------------------------- /src/main/java/divisio/whisper/token/Whisper3Timestamp.java: -------------------------------------------------------------------------------- 1 | package divisio.whisper.token; 2 | 3 | /** 4 | * Helper class to derive the Whisper v3 timestamp tokens for ease-of-use. 5 | */ 6 | public class Whisper3Timestamp implements WhisperToken { 7 | 8 | /** 9 | * Whisper v3 token id for timestamp 0 ms. 10 | */ 11 | private static final long FIRST_TIMESTAMP_TOKEN_ID = 50365; 12 | 13 | /** 14 | * Whisper v3 token id for timestamp 30_000 ms (30 seconds). 15 | */ 16 | private static final long LAST_TIMESTAMP_TOKEN_ID = 51865; 17 | 18 | /** 19 | * Whisper v3 first timestamp token in milliseconds. 20 | */ 21 | private static final long FIRST_TIMESTAMP_MS = 0; 22 | 23 | /** 24 | * Whisper v3 last timestamp token in milliseconds. 25 | */ 26 | private static final long LAST_TIMESTAMP_MS = 30_000; 27 | 28 | /** 29 | * Step size in milliseconds between timestamp tokens. 30 | */ 31 | private static final long MS_STEP = 20; 32 | 33 | /** 34 | * Static helper token representing the first timestamp token at 0.0 seconds. 35 | */ 36 | public static final Whisper3Timestamp MIN_TIMESTAMP_TOKEN = Whisper3Timestamp.fromTokenId(FIRST_TIMESTAMP_TOKEN_ID); 37 | 38 | /** 39 | * Static helper token representing the last timestamp token at 30.0 seconds. 40 | */ 41 | public static final Whisper3Timestamp MAX_TIMESTAMP_TOKEN = Whisper3Timestamp.fromTokenId(LAST_TIMESTAMP_TOKEN_ID); 42 | 43 | private final long tokenId; 44 | 45 | /** 46 | * Private constructor. Use {@link #fromTokenId(long)} or {@link #fromTimestampMs(long)} instead. 47 | * @param tokenId the token id of this timestamp token. 48 | */ 49 | private Whisper3Timestamp(long tokenId) { 50 | this.tokenId = tokenId; 51 | } 52 | 53 | /** 54 | * Create a {@link Whisper3Timestamp} from the given token id. 55 | * 56 | *

Checks if the given token id is valid and throws an exception otherwise. 57 | * 58 | * @param tokenId token id of the timestamp token, must be a valid Whisper v3 timestamp token. 59 | * @return instance of a valid {@link Whisper3Timestamp}. 60 | */ 61 | public static Whisper3Timestamp fromTokenId(final long tokenId) { 62 | if (tokenId < FIRST_TIMESTAMP_TOKEN_ID || tokenId > LAST_TIMESTAMP_TOKEN_ID) { 63 | throw new IllegalArgumentException( 64 | String.format("Not a valid timestamp token id, it must be between %d (inclusive) and %d (inclusive).", 65 | FIRST_TIMESTAMP_TOKEN_ID, LAST_TIMESTAMP_TOKEN_ID) 66 | ); 67 | } 68 | 69 | return new Whisper3Timestamp(tokenId); 70 | } 71 | 72 | /** 73 | * Create a {@link Whisper3Timestamp} from the given millisecond timestamp. 74 | * 75 | *

Checks if the given millisecond timestamp is valid and throws an exception otherwise. 76 | * 77 | * @param ms millisecond value of the token, must be a valid Whisper v3 timestamp token. 78 | * @return instance of a valid {@link Whisper3Timestamp}. 79 | */ 80 | public static Whisper3Timestamp fromTimestampMs(final long ms) { 81 | if (ms < FIRST_TIMESTAMP_MS || ms > LAST_TIMESTAMP_MS) { 82 | throw new IllegalArgumentException( 83 | String.format("Not a valid timestamp value, milliseconds must be between %d (inclusive) and %d (inclusive).", 84 | FIRST_TIMESTAMP_MS, LAST_TIMESTAMP_MS) 85 | ); 86 | } 87 | 88 | if (ms % MS_STEP != 0) { 89 | throw new IllegalArgumentException( 90 | String.format("Not a valid timestamp value, milliseconds must be multiples of %d.", 91 | MS_STEP) 92 | ); 93 | } 94 | 95 | return new Whisper3Timestamp(msToTokenId(ms)); 96 | } 97 | 98 | /** 99 | * Convert from token id to milliseconds. 100 | * @param tokenId the token id. 101 | * @return the millisecond value this token represents. 102 | */ 103 | private static long tokenIdToMs(long tokenId) { 104 | return (tokenId - FIRST_TIMESTAMP_TOKEN_ID) * MS_STEP; 105 | } 106 | 107 | /** 108 | * Convert from milliseconds to token id. 109 | * @param ms the millisecond value of the token. 110 | * @return the token id corresponding with this millisecond value. 111 | */ 112 | private static long msToTokenId(long ms) { 113 | return (ms / MS_STEP) + FIRST_TIMESTAMP_TOKEN_ID; 114 | } 115 | 116 | /** 117 | * Get the millisecond value of this token. 118 | * @return millisecond value of this token. 119 | */ 120 | public long getMs() { 121 | return tokenIdToMs(this.tokenId); 122 | } 123 | 124 | /** 125 | * Get the token id. 126 | * @return token id. 127 | */ 128 | @Override 129 | public long getTokenId() { 130 | return tokenId; 131 | } 132 | 133 | /** 134 | * Get the string token representation of this token. 135 | * @return string token representation of this token. 
136 | */ 137 | @Override 138 | public String getToken() { 139 | // get ms part 140 | final long ms = getMs() % 1000; 141 | // get second part 142 | final long s = getMs() / 1000; 143 | // prepend zeros, remove trailing digit (it is always zero anyway) 144 | final String msPadded = String.format("%03d", ms).substring(0, 2); 145 | return "<|" + s + "." + msPadded + "|>"; 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /src/main/java/divisio/whisper/WhisperTask.java: -------------------------------------------------------------------------------- 1 | package divisio.whisper; 2 | 3 | import ai.djl.modality.audio.Audio; 4 | import ai.djl.modality.audio.AudioFactory; 5 | import divisio.whisper.token.Whisper3Language; 6 | import divisio.whisper.token.Whisper3SpecialToken; 7 | import divisio.whisper.token.WhisperToken; 8 | import org.bytedeco.ffmpeg.global.avutil; 9 | 10 | import java.io.IOException; 11 | import java.nio.file.Path; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | /** 16 | * A class to easily configure a task and execute it using a {@link Whisper3} instance. 17 | * 18 | *

Most notably, it makes it easier to instruct {@link Whisper3} to use 19 | * certain start tokens that guide its transcription capabilities. 20 | */ 21 | public class WhisperTask { 22 | 23 | /** 24 | * The {@link Whisper3} instance to execute this task with. 25 | */ 26 | private final Whisper3 whisper; 27 | 28 | /** 29 | * The input audio for {@link Whisper3} to transcribe / translate. 30 | */ 31 | private Audio audio; 32 | 33 | /** 34 | * The language for this task. Default is {@link Whisper3Language#AUTO}, which lets Whisper detect the language. 35 | */ 36 | private Whisper3Language lang = Whisper3Language.AUTO; 37 | 38 | /** 39 | * Whether to enable the "no-timestamps" token for Whisper. True by default. 40 | */ 41 | private boolean noTimestamps = true; 42 | 43 | /** 44 | * The task token. Should be either {@link Whisper3SpecialToken#TRANSCRIBE} 45 | * or {@link Whisper3SpecialToken#TRANSLATE}. 46 | */ 47 | private Whisper3SpecialToken task = Whisper3SpecialToken.TRANSCRIBE; 48 | 49 | /** 50 | * Private constructor. Use {@link #task(Whisper3)} or {@link Whisper3#task()}to create a task instance. 51 | */ 52 | private WhisperTask(final Whisper3 whisper) { 53 | this.whisper = whisper; 54 | } 55 | 56 | /** 57 | * Create a {@code WhisperTask}. 58 | * @param whisper the {@link Whisper3} instance to configure this task for. 59 | * @return a {@code WhisperTask} to configure. 60 | */ 61 | public static WhisperTask task(Whisper3 whisper) { 62 | return new WhisperTask(whisper); 63 | } 64 | 65 | /** 66 | * Set the audio input for this task. 67 | * @param audio the audio input. 68 | * @return this task. 69 | */ 70 | private WhisperTask setAudio(Audio audio) { 71 | this.audio = audio; 72 | return this; 73 | } 74 | 75 | /** 76 | * Set the audio input language of this task. 77 | * @param lang the language of the audio input; might be {@code null} 78 | * or {@link Whisper3Language#AUTO} to let Whisper detect the language. 79 | * @return this task. 
80 | */ 81 | public WhisperTask language(Whisper3Language lang) { 82 | this.lang = lang; 83 | return this; 84 | } 85 | 86 | /** 87 | * Enable the Whisper "no-timestamps" token. 88 | * @return this task. 89 | */ 90 | public WhisperTask noTimestamps() { 91 | this.noTimestamps = true; 92 | return this; 93 | } 94 | 95 | /** 96 | * Disable the Whisper "no-timestamps" token. 97 | * @return this task. 98 | */ 99 | public WhisperTask withTimestamps() { 100 | this.noTimestamps = false; 101 | return this; 102 | } 103 | 104 | /** 105 | * Set the task to transcription for the audio file on the given path. 106 | * @param path the string path to the audio file. 107 | * @return this task. 108 | */ 109 | public WhisperTask transcribe(String path) { 110 | this.task = Whisper3SpecialToken.TRANSCRIBE; 111 | return this.setAudio(loadAudio(path)); 112 | } 113 | 114 | /** 115 | * Set the task to transcription for the audio file on the given path. 116 | * @param path the path to the audio file. 117 | * @return this task. 118 | */ 119 | public WhisperTask transcribe(Path path) { 120 | this.task = Whisper3SpecialToken.TRANSCRIBE; 121 | return this.setAudio(loadAudio(path)); 122 | } 123 | 124 | /** 125 | * Set the task to transcription for the given {@link Audio} instance. 126 | * @param audio the audio input. 127 | * @return this task. 128 | */ 129 | public WhisperTask transcribe(Audio audio) { 130 | this.task = Whisper3SpecialToken.TRANSCRIBE; 131 | return setAudio(audio); 132 | } 133 | 134 | /** 135 | * Set the task to translation for the audio file on the given path. 136 | * @param path the string path to the audio file. 137 | * @return this task. 138 | */ 139 | public WhisperTask translate(String path) { 140 | this.task = Whisper3SpecialToken.TRANSLATE; 141 | return this.setAudio(loadAudio(path)); 142 | } 143 | 144 | /** 145 | * Set the task to translation for the audio file on the given path. 146 | * @param path the path to the audio file. 147 | * @return this task. 
148 | */ 149 | public WhisperTask translate(Path path) { 150 | this.task = Whisper3SpecialToken.TRANSLATE; 151 | return this.setAudio(loadAudio(path)); 152 | } 153 | 154 | /** 155 | * Set the task to translation for the given {@link Audio} instance. 156 | * @param audio the audio input. 157 | * @return this task. 158 | */ 159 | public WhisperTask translate(Audio audio) { 160 | this.task = Whisper3SpecialToken.TRANSLATE; 161 | return setAudio(audio); 162 | } 163 | 164 | /** 165 | * Loads the file on the given path into an {@link Audio} instance. 166 | * @param path the string path to the audio file. 167 | * @return instance of an {@link Audio}. 168 | */ 169 | private static Audio loadAudio(String path) { 170 | return loadAudio(Path.of(path)); 171 | } 172 | 173 | /** 174 | * Loads the file on the given path into an {@link Audio} instance. 175 | * @param path the path to the audio file. 176 | * @return instance of an {@link Audio}. 177 | */ 178 | private static Audio loadAudio(Path path) { 179 | try { 180 | Audio audio = AudioFactory.newInstance() 181 | .setChannels(1) // fixed for Whisper3 182 | .setSampleRate(16000) // fixed for Whisper3 183 | .setSampleFormat(avutil.AV_SAMPLE_FMT_S16P) 184 | .fromFile(path); 185 | return audio; 186 | } catch (IOException e) { 187 | throw new RuntimeException("Could not load audio for whisper from path"); 188 | } 189 | } 190 | 191 | /** 192 | * Execute this configured task. 193 | * 194 | *

Constructs the start tokens necessary to guide the Whisper model 195 | * and forwards them to the model. 196 | * @return a {@link WhisperResult} containing the transcribed text. 197 | */ 198 | public WhisperResult execute() { 199 | if (this.whisper == null) { 200 | throw new IllegalStateException("Cannot execute WhisperTask without an instance of Whisper."); 201 | } 202 | 203 | if (this.audio == null) { 204 | throw new IllegalStateException("Cannot execute WhisperTask without having an input to transcribe or translate."); 205 | } 206 | 207 | List startTokens = new ArrayList<>(List.of( 208 | Whisper3SpecialToken.START_OF_TRANSCRIPT, 209 | this.lang, 210 | this.task 211 | )); 212 | 213 | if (this.noTimestamps) { 214 | startTokens.add(Whisper3SpecialToken.NO_TIMESTAMPS); 215 | } 216 | 217 | return whisper.process(this.audio, startTokens); 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /mvnw.cmd: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Licensed to the Apache Software Foundation (ASF) under one 3 | @REM or more contributor license agreements. See the NOTICE file 4 | @REM distributed with this work for additional information 5 | @REM regarding copyright ownership. The ASF licenses this file 6 | @REM to you under the Apache License, Version 2.0 (the 7 | @REM "License"); you may not use this file except in compliance 8 | @REM with the License. You may obtain a copy of the License at 9 | @REM 10 | @REM http://www.apache.org/licenses/LICENSE-2.0 11 | @REM 12 | @REM Unless required by applicable law or agreed to in writing, 13 | @REM software distributed under the License is distributed on an 14 | @REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | @REM KIND, either express or implied. 
See the License for the 16 | @REM specific language governing permissions and limitations 17 | @REM under the License. 18 | @REM ---------------------------------------------------------------------------- 19 | 20 | @REM ---------------------------------------------------------------------------- 21 | @REM Apache Maven Wrapper startup batch script, version 3.2.0 22 | @REM 23 | @REM Required ENV vars: 24 | @REM JAVA_HOME - location of a JDK home dir 25 | @REM 26 | @REM Optional ENV vars 27 | @REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands 28 | @REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a keystroke before ending 29 | @REM MAVEN_OPTS - parameters passed to the Java VM when running Maven 30 | @REM e.g. to debug Maven itself, use 31 | @REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 32 | @REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files 33 | @REM ---------------------------------------------------------------------------- 34 | 35 | @REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on' 36 | @echo off 37 | @REM set title of command window 38 | title %0 39 | @REM enable echoing by setting MAVEN_BATCH_ECHO to 'on' 40 | @if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO% 41 | 42 | @REM set %HOME% to equivalent of $HOME 43 | if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%") 44 | 45 | @REM Execute a user defined script before this one 46 | if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre 47 | @REM check for pre script, once with legacy .bat ending and once with .cmd ending 48 | if exist "%USERPROFILE%\mavenrc_pre.bat" call "%USERPROFILE%\mavenrc_pre.bat" %* 49 | if exist "%USERPROFILE%\mavenrc_pre.cmd" call "%USERPROFILE%\mavenrc_pre.cmd" %* 50 | :skipRcPre 51 | 52 | @setlocal 53 | 54 | set ERROR_CODE=0 55 | 56 | @REM To isolate internal variables from possible post scripts, we use another setlocal 57 | @setlocal 58 | 59 | @REM ==== START VALIDATION ==== 60 | if not 
"%JAVA_HOME%" == "" goto OkJHome 61 | 62 | echo. 63 | echo Error: JAVA_HOME not found in your environment. >&2 64 | echo Please set the JAVA_HOME variable in your environment to match the >&2 65 | echo location of your Java installation. >&2 66 | echo. 67 | goto error 68 | 69 | :OkJHome 70 | if exist "%JAVA_HOME%\bin\java.exe" goto init 71 | 72 | echo. 73 | echo Error: JAVA_HOME is set to an invalid directory. >&2 74 | echo JAVA_HOME = "%JAVA_HOME%" >&2 75 | echo Please set the JAVA_HOME variable in your environment to match the >&2 76 | echo location of your Java installation. >&2 77 | echo. 78 | goto error 79 | 80 | @REM ==== END VALIDATION ==== 81 | 82 | :init 83 | 84 | @REM Find the project base dir, i.e. the directory that contains the folder ".mvn". 85 | @REM Fallback to current working directory if not found. 86 | 87 | set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR% 88 | IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir 89 | 90 | set EXEC_DIR=%CD% 91 | set WDIR=%EXEC_DIR% 92 | :findBaseDir 93 | IF EXIST "%WDIR%"\.mvn goto baseDirFound 94 | cd .. 95 | IF "%WDIR%"=="%CD%" goto baseDirNotFound 96 | set WDIR=%CD% 97 | goto findBaseDir 98 | 99 | :baseDirFound 100 | set MAVEN_PROJECTBASEDIR=%WDIR% 101 | cd "%EXEC_DIR%" 102 | goto endDetectBaseDir 103 | 104 | :baseDirNotFound 105 | set MAVEN_PROJECTBASEDIR=%EXEC_DIR% 106 | cd "%EXEC_DIR%" 107 | 108 | :endDetectBaseDir 109 | 110 | IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig 111 | 112 | @setlocal EnableExtensions EnableDelayedExpansion 113 | for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! 
%%a 114 | @endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS% 115 | 116 | :endReadAdditionalConfig 117 | 118 | SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe" 119 | set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar" 120 | set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain 121 | 122 | set WRAPPER_URL="https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar" 123 | 124 | FOR /F "usebackq tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO ( 125 | IF "%%A"=="wrapperUrl" SET WRAPPER_URL=%%B 126 | ) 127 | 128 | @REM Extension to allow automatically downloading the maven-wrapper.jar from Maven-central 129 | @REM This allows using the maven wrapper in projects that prohibit checking in binary data. 130 | if exist %WRAPPER_JAR% ( 131 | if "%MVNW_VERBOSE%" == "true" ( 132 | echo Found %WRAPPER_JAR% 133 | ) 134 | ) else ( 135 | if not "%MVNW_REPOURL%" == "" ( 136 | SET WRAPPER_URL="%MVNW_REPOURL%/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar" 137 | ) 138 | if "%MVNW_VERBOSE%" == "true" ( 139 | echo Couldn't find %WRAPPER_JAR%, downloading it ... 
140 | echo Downloading from: %WRAPPER_URL% 141 | ) 142 | 143 | powershell -Command "&{"^ 144 | "$webclient = new-object System.Net.WebClient;"^ 145 | "if (-not ([string]::IsNullOrEmpty('%MVNW_USERNAME%') -and [string]::IsNullOrEmpty('%MVNW_PASSWORD%'))) {"^ 146 | "$webclient.Credentials = new-object System.Net.NetworkCredential('%MVNW_USERNAME%', '%MVNW_PASSWORD%');"^ 147 | "}"^ 148 | "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; $webclient.DownloadFile('%WRAPPER_URL%', '%WRAPPER_JAR%')"^ 149 | "}" 150 | if "%MVNW_VERBOSE%" == "true" ( 151 | echo Finished downloading %WRAPPER_JAR% 152 | ) 153 | ) 154 | @REM End of extension 155 | 156 | @REM If specified, validate the SHA-256 sum of the Maven wrapper jar file 157 | SET WRAPPER_SHA_256_SUM="" 158 | FOR /F "usebackq tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO ( 159 | IF "%%A"=="wrapperSha256Sum" SET WRAPPER_SHA_256_SUM=%%B 160 | ) 161 | IF NOT %WRAPPER_SHA_256_SUM%=="" ( 162 | powershell -Command "&{"^ 163 | "$hash = (Get-FileHash \"%WRAPPER_JAR%\" -Algorithm SHA256).Hash.ToLower();"^ 164 | "If('%WRAPPER_SHA_256_SUM%' -ne $hash){"^ 165 | " Write-Output 'Error: Failed to validate Maven wrapper SHA-256, your Maven wrapper might be compromised.';"^ 166 | " Write-Output 'Investigate or delete %WRAPPER_JAR% to attempt a clean download.';"^ 167 | " Write-Output 'If you updated your Maven version, you need to update the specified wrapperSha256Sum property.';"^ 168 | " exit 1;"^ 169 | "}"^ 170 | "}" 171 | if ERRORLEVEL 1 goto error 172 | ) 173 | 174 | @REM Provide a "standardized" way to retrieve the CLI args that will 175 | @REM work with both Windows and non-Windows executions. 
176 | set MAVEN_CMD_LINE_ARGS=%* 177 | 178 | %MAVEN_JAVA_EXE% ^ 179 | %JVM_CONFIG_MAVEN_PROPS% ^ 180 | %MAVEN_OPTS% ^ 181 | %MAVEN_DEBUG_OPTS% ^ 182 | -classpath %WRAPPER_JAR% ^ 183 | "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" ^ 184 | %WRAPPER_LAUNCHER% %MAVEN_CONFIG% %* 185 | if ERRORLEVEL 1 goto error 186 | goto end 187 | 188 | :error 189 | set ERROR_CODE=1 190 | 191 | :end 192 | @endlocal & set ERROR_CODE=%ERROR_CODE% 193 | 194 | if not "%MAVEN_SKIP_RC%"=="" goto skipRcPost 195 | @REM check for post script, once with legacy .bat ending and once with .cmd ending 196 | if exist "%USERPROFILE%\mavenrc_post.bat" call "%USERPROFILE%\mavenrc_post.bat" 197 | if exist "%USERPROFILE%\mavenrc_post.cmd" call "%USERPROFILE%\mavenrc_post.cmd" 198 | :skipRcPost 199 | 200 | @REM pause the script if MAVEN_BATCH_PAUSE is set to 'on' 201 | if "%MAVEN_BATCH_PAUSE%"=="on" pause 202 | 203 | if "%MAVEN_TERMINATE_CMD%"=="on" exit %ERROR_CODE% 204 | 205 | cmd /C exit /B %ERROR_CODE% 206 | -------------------------------------------------------------------------------- /src/main/java/divisio/whisper/token/Whisper3Language.java: -------------------------------------------------------------------------------- 1 | package divisio.whisper.token; 2 | 3 | import divisio.whisper.WhisperTask; 4 | 5 | /** 6 | * Helper enum to hold the Whisper v3 language tokens for ease-of-use. 7 | * 8 | *

Recommended to use in conjunction with {@link WhisperTask}. 9 | */ 10 | public enum Whisper3Language implements WhisperToken { 11 | // "auto" exists to allow Whisper to detect and determine the language itself (not actually a token of Whisper) 12 | AUTO(Long.MIN_VALUE, null, "auto", "automatic"), 13 | 14 | AFRIKAANS(50327, "<|af|>", "af", "afrikaans"), 15 | AMHARIC(50334, "<|am|>", "am", "amharic"), 16 | ARABIC(50272, "<|ar|>", "ar", "arabic"), 17 | ASSAMESE(50350, "<|as|>", "as", "assamese"), 18 | AZERBAIJANI(50304, "<|az|>", "az", "azerbaijani"), 19 | BASHKIR(50355, "<|ba|>", "ba", "bashkir"), 20 | BELARUSIAN(50330, "<|be|>", "be", "belarusian"), 21 | BULGARIAN(50292, "<|bg|>", "bg", "bulgarian"), 22 | BENGALI(50302, "<|bn|>", "bn", "bengali"), 23 | TIBETAN(50347, "<|bo|>", "bo", "tibetan"), 24 | BRETON(50309, "<|br|>", "br", "breton"), 25 | BOSNIAN(50315, "<|bs|>", "bs", "bosnian"), 26 | CATALAN(50270, "<|ca|>", "ca", "catalan"), 27 | CZECH(50283, "<|cs|>", "cs", "czech"), 28 | WELSH(50297, "<|cy|>", "cy", "welsh"), 29 | DANISH(50285, "<|da|>", "da", "danish"), 30 | GERMAN(50261, "<|de|>", "de", "german"), 31 | GREEK(50281, "<|el|>", "el", "greek"), 32 | ENGLISH(50259, "<|en|>", "en", "english"), 33 | SPANISH(50262, "<|es|>", "es", "spanish"), 34 | ESTONIAN(50307, "<|et|>", "et", "estonian"), 35 | BASQUE(50310, "<|eu|>", "eu", "basque"), 36 | PERSIAN(50300, "<|fa|>", "fa", "persian"), 37 | FINNISH(50277, "<|fi|>", "fi", "finnish"), 38 | FAROESE(50338, "<|fo|>", "fo", "faroese"), 39 | FRENCH(50265, "<|fr|>", "fr", "french"), 40 | GALICIAN(50319, "<|gl|>", "gl", "galician"), 41 | GUJARATI(50333, "<|gu|>", "gu", "gujarati"), 42 | HAWAIIAN(50352, "<|haw|>", "haw", "hawaiian"), 43 | HAUSA(50354, "<|ha|>", "ha", "hausa"), 44 | HEBREW(50279, "<|he|>", "he", "hebrew"), 45 | HINDI(50276, "<|hi|>", "hi", "hindi"), 46 | CROATIAN(50291, "<|hr|>", "hr", "croatian"), 47 | HAITIAN(50339, "<|ht|>", "ht", "haitian"), 48 | HUNGARIAN(50286, "<|hu|>", "hu", "hungarian"), 49 | 
ARMENIAN(50312, "<|hy|>", "hy", "armenian"), 50 | INDONESIAN(50275, "<|id|>", "id", "indonesian"), 51 | ICELANDIC(50311, "<|is|>", "is", "icelandic"), 52 | ITALIAN(50274, "<|it|>", "it", "italian"), 53 | JAPANESE(50266, "<|ja|>", "ja", "japanese"), 54 | JAVANESE(50356, "<|jw|>", "jw", "javanese"), 55 | GEORGIAN(50329, "<|ka|>", "ka", "georgian"), 56 | KAZAKH(50316, "<|kk|>", "kk", "kazakh"), 57 | KHMER(50323, "<|km|>", "km", "khmer"), 58 | KANNADA(50306, "<|kn|>", "kn", "kannada"), 59 | KOREAN(50264, "<|ko|>", "ko", "korean"), 60 | LATIN(50294, "<|la|>", "la", "latin"), 61 | LUXEMBOURGISH(50345, "<|lb|>", "lb", "luxembourgish"), 62 | LINGALA(50353, "<|ln|>", "ln", "lingala"), 63 | LAO(50336, "<|lo|>", "lo", "lao"), 64 | LITHUANIAN(50293, "<|lt|>", "lt", "lithuanian"), 65 | LATVIAN(50301, "<|lv|>", "lv", "latvian"), 66 | MALAGASY(50349, "<|mg|>", "mg", "malagasy"), 67 | MAORI(50295, "<|mi|>", "mi", "maori"), 68 | MACEDONIAN(50308, "<|mk|>", "mk", "macedonian"), 69 | MALAYALAM(50296, "<|ml|>", "ml", "malayalam"), 70 | MONGOLIAN(50314, "<|mn|>", "mn", "mongolian"), 71 | MARATHI(50320, "<|mr|>", "mr", "marathi"), 72 | MALAY(50282, "<|ms|>", "ms", "malay"), 73 | MALTESE(50343, "<|mt|>", "mt", "maltese"), 74 | MYANMAR(50346, "<|my|>", "my", "myanmar"), 75 | NEPALI(50313, "<|ne|>", "ne", "nepali"), 76 | DUTCH(50271, "<|nl|>", "nl", "dutch"), 77 | NYNORSK(50342, "<|nn|>", "nn", "nynorsk"), 78 | NORWEGIAN(50288, "<|no|>", "no", "norwegian"), 79 | OCCITAN(50328, "<|oc|>", "oc", "occitan"), 80 | PUNJABI(50321, "<|pa|>", "pa", "punjabi"), 81 | POLISH(50269, "<|pl|>", "pl", "polish"), 82 | PASHTO(50340, "<|ps|>", "ps", "pashto"), 83 | PORTUGUESE(50267, "<|pt|>", "pt", "portuguese"), 84 | ROMANIAN(50284, "<|ro|>", "ro", "romanian"), 85 | RUSSIAN(50263, "<|ru|>", "ru", "russian"), 86 | SANSKRIT(50344, "<|sa|>", "sa", "sanskrit"), 87 | SINDHI(50332, "<|sd|>", "sd", "sindhi"), 88 | SINHALA(50322, "<|si|>", "si", "sinhala"), 89 | SLOVAK(50298, "<|sk|>", "sk", "slovak"), 90 | 
SLOVENIAN(50305, "<|sl|>", "sl", "slovenian"), 91 | SHONA(50324, "<|sn|>", "sn", "shona"), 92 | SOMALI(50326, "<|so|>", "so", "somali"), 93 | ALBANIAN(50317, "<|sq|>", "sq", "albanian"), 94 | SERBIAN(50303, "<|sr|>", "sr", "serbian"), 95 | SUNDANESE(50357, "<|su|>", "su", "sundanese"), 96 | SWEDISH(50273, "<|sv|>", "sv", "swedish"), 97 | SWAHILI(50318, "<|sw|>", "sw", "swahili"), 98 | TAMIL(50287, "<|ta|>", "ta", "tamil"), 99 | TELUGU(50299, "<|te|>", "te", "telugu"), 100 | TAJIK(50331, "<|tg|>", "tg", "tajik"), 101 | THAI(50289, "<|th|>", "th", "thai"), 102 | TURKMEN(50341, "<|tk|>", "tk", "turkmen"), 103 | TAGALOG(50348, "<|tl|>", "tl", "tagalog"), 104 | TURKISH(50268, "<|tr|>", "tr", "turkish"), 105 | TATAR(50351, "<|tt|>", "tt", "tatar"), 106 | UKRAINIAN(50280, "<|uk|>", "uk", "ukrainian"), 107 | URDU(50290, "<|ur|>", "ur", "urdu"), 108 | UZBEK(50337, "<|uz|>", "uz", "uzbek"), 109 | VIETNAMESE(50278, "<|vi|>", "vi", "vietnamese"), 110 | YIDDISH(50335, "<|yi|>", "yi", "yiddish"), 111 | YORUBA(50325, "<|yo|>", "yo", "yoruba"), 112 | CANTONESE(50358, "<|yue|>", "yue", "cantonese"), 113 | CHINESE(50260, "<|zh|>", "zh", "chinese"); 114 | 115 | private final long tokenId; 116 | private final String token; 117 | private final String isoCode; 118 | private final String isoLanguageName; 119 | 120 | /** 121 | * Token consisting of its index and string representation, 122 | * with ISO 639 language codes and names to easily retrieve them 123 | * via {@link #fromIsoCode} or {@link #fromLanguageName(String)}. 
124 | * 125 | * @param tokenId The token index for Whisper v3 126 | * @param token The string representing the token 127 | * @param isoCode The shortform language code 128 | * @param isoLanguageName The full language name 129 | */ 130 | Whisper3Language(long tokenId, String token, String isoCode, String isoLanguageName) { 131 | this.tokenId = tokenId; 132 | this.token = token; 133 | this.isoCode = isoCode; 134 | this.isoLanguageName = isoLanguageName; 135 | } 136 | 137 | /** 138 | * Get the token index of this token. 139 | * @return The token index; {@code Long.MIN_VALUE} for {@link #AUTO}, which is not an actual Whisper token. 140 | */ 141 | @Override 142 | public long getTokenId() { 143 | return tokenId; 144 | } 145 | 146 | /** 147 | * Get the string representation of this token. 148 | * @return The string representation of this token, e.g. {@code "<|de|>"}; {@code null} for {@link #AUTO}. 149 | */ 150 | @Override 151 | public String getToken() { 152 | return token; 153 | } 154 | 155 | /** 156 | * Get the shortform language code. 157 | * @return The shortform language code, e.g. {@code "de"}; {@code "auto"} for {@link #AUTO}. 158 | */ 159 | public String getIsoCode() { 160 | return isoCode; 161 | } 162 | 163 | /** 164 | * Get the full language name. 165 | * @return The full language name in lower case, e.g. {@code "german"}. 166 | */ 167 | public String getIsoLanguageName() { 168 | return isoLanguageName; 169 | } 170 | 171 | /** 172 | * Retrieves the Whisper3Language enum value whose ISO 639 language code matches the given code, ignoring case. 173 | * Most codes are two letters long, but some are longer (e.g. "haw", "yue", and "auto" for {@link #AUTO}). 174 | * @param twoLetterCode A string representing the ISO 639 language code, e.g. "de". 175 | * @return The matching Whisper3Language enum value, or null if no match is found (a null argument never matches). 176 | */ 177 | public static Whisper3Language fromIsoCode(String twoLetterCode) { 178 | for (Whisper3Language lang : Whisper3Language.values()) { 179 | if (lang.getIsoCode().equalsIgnoreCase(twoLetterCode)) { 180 | return lang; 181 | } 182 | } 183 | return null; 184 | } 185 | 186 | /** 187 | * Retrieves the Whisper3Language enum value matching the given ISO 639 language name, ignoring case. 188 | * 189 | * @param isoLanguageName A string representing the ISO 639 language name, e.g. "german".
190 | * @return The matching Whisper3Language enum value, or null if no match is found. 191 | */ 192 | public static Whisper3Language fromLanguageName(String isoLanguageName) { 193 | for (Whisper3Language lang : Whisper3Language.values()) { 194 | if (lang.getIsoLanguageName().equalsIgnoreCase(isoLanguageName)) { 195 | return lang; 196 | } 197 | } 198 | return null; 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files.
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2024 DIVISIO GmbH 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /mvnw: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # ---------------------------------------------------------------------------- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | # ---------------------------------------------------------------------------- 20 | 21 | # ---------------------------------------------------------------------------- 22 | # Apache Maven Wrapper startup batch script, version 3.2.0 23 | # 24 | # Required ENV vars: 25 | # ------------------ 26 | # JAVA_HOME - location of a JDK home dir 27 | # 28 | # Optional ENV vars 29 | # ----------------- 30 | # MAVEN_OPTS - parameters passed to the Java VM when running Maven 31 | # e.g. to debug Maven itself, use 32 | # set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 33 | # MAVEN_SKIP_RC - flag to disable loading of mavenrc files 34 | # ---------------------------------------------------------------------------- 35 | 36 | if [ -z "$MAVEN_SKIP_RC" ] ; then 37 | 38 | if [ -f /usr/local/etc/mavenrc ] ; then 39 | . /usr/local/etc/mavenrc 40 | fi 41 | 42 | if [ -f /etc/mavenrc ] ; then 43 | . /etc/mavenrc 44 | fi 45 | 46 | if [ -f "$HOME/.mavenrc" ] ; then 47 | . "$HOME/.mavenrc" 48 | fi 49 | 50 | fi 51 | 52 | # OS specific support. $var _must_ be set to either true or false. 
53 | cygwin=false; 54 | darwin=false; 55 | mingw=false 56 | case "$(uname)" in 57 | CYGWIN*) cygwin=true ;; 58 | MINGW*) mingw=true;; 59 | Darwin*) darwin=true 60 | # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home 61 | # See https://developer.apple.com/library/mac/qa/qa1170/_index.html 62 | if [ -z "$JAVA_HOME" ]; then 63 | if [ -x "/usr/libexec/java_home" ]; then 64 | JAVA_HOME="$(/usr/libexec/java_home)"; export JAVA_HOME 65 | else 66 | JAVA_HOME="/Library/Java/Home"; export JAVA_HOME 67 | fi 68 | fi 69 | ;; 70 | esac 71 | 72 | if [ -z "$JAVA_HOME" ] ; then 73 | if [ -r /etc/gentoo-release ] ; then 74 | JAVA_HOME=$(java-config --jre-home) 75 | fi 76 | fi 77 | 78 | # For Cygwin, ensure paths are in UNIX format before anything is touched 79 | if $cygwin ; then 80 | [ -n "$JAVA_HOME" ] && 81 | JAVA_HOME=$(cygpath --unix "$JAVA_HOME") 82 | [ -n "$CLASSPATH" ] && 83 | CLASSPATH=$(cygpath --path --unix "$CLASSPATH") 84 | fi 85 | 86 | # For Mingw, ensure paths are in UNIX format before anything is touched 87 | if $mingw ; then 88 | [ -n "$JAVA_HOME" ] && [ -d "$JAVA_HOME" ] && 89 | JAVA_HOME="$(cd "$JAVA_HOME" || (echo "cannot cd into $JAVA_HOME."; exit 1); pwd)" 90 | fi 91 | 92 | if [ -z "$JAVA_HOME" ]; then 93 | javaExecutable="$(which javac)" 94 | if [ -n "$javaExecutable" ] && ! [ "$(expr "\"$javaExecutable\"" : '\([^ ]*\)')" = "no" ]; then 95 | # readlink(1) is not available as standard on Solaris 10. 96 | readLink=$(which readlink) 97 | if [ ! 
"$(expr "$readLink" : '\([^ ]*\)')" = "no" ]; then 98 | if $darwin ; then 99 | javaHome="$(dirname "\"$javaExecutable\"")" 100 | javaExecutable="$(cd "\"$javaHome\"" && pwd -P)/javac" 101 | else 102 | javaExecutable="$(readlink -f "\"$javaExecutable\"")" 103 | fi 104 | javaHome="$(dirname "\"$javaExecutable\"")" 105 | javaHome=$(expr "$javaHome" : '\(.*\)/bin') 106 | JAVA_HOME="$javaHome" 107 | export JAVA_HOME 108 | fi 109 | fi 110 | fi 111 | 112 | if [ -z "$JAVACMD" ] ; then 113 | if [ -n "$JAVA_HOME" ] ; then 114 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 115 | # IBM's JDK on AIX uses strange locations for the executables 116 | JAVACMD="$JAVA_HOME/jre/sh/java" 117 | else 118 | JAVACMD="$JAVA_HOME/bin/java" 119 | fi 120 | else 121 | JAVACMD="$(\unset -f command 2>/dev/null; \command -v java)" 122 | fi 123 | fi 124 | 125 | if [ ! -x "$JAVACMD" ] ; then 126 | echo "Error: JAVA_HOME is not defined correctly." >&2 127 | echo " We cannot execute $JAVACMD" >&2 128 | exit 1 129 | fi 130 | 131 | if [ -z "$JAVA_HOME" ] ; then 132 | echo "Warning: JAVA_HOME environment variable is not set." 133 | fi 134 | 135 | # traverses directory structure from process work directory to filesystem root 136 | # first directory with .mvn subdirectory is considered project base directory 137 | find_maven_basedir() { 138 | if [ -z "$1" ] 139 | then 140 | echo "Path not specified to find_maven_basedir" 141 | return 1 142 | fi 143 | 144 | basedir="$1" 145 | wdir="$1" 146 | while [ "$wdir" != '/' ] ; do 147 | if [ -d "$wdir"/.mvn ] ; then 148 | basedir=$wdir 149 | break 150 | fi 151 | # workaround for JBEAP-8937 (on Solaris 10/Sparc) 152 | if [ -d "${wdir}" ]; then 153 | wdir=$(cd "$wdir/.." 
|| exit 1; pwd)
    fi
    # end of workaround
  done
  printf '%s' "$(cd "$basedir" || exit 1; pwd)"
}

# Prints the content of the file "$1" on a single line, with every run of
# '\r' / '\n' squeezed into one space. Prints nothing if "$1" is not a regular file.
concat_lines() {
  if [ -f "$1" ]; then
    # Remove \r in case we run on Windows within Git Bash
    # and check out the repository with auto CRLF management
    # enabled. Otherwise, we may read lines that are delimited with
    # \r\n and produce $'-Xarg\r' rather than -Xarg due to word
    # splitting rules.
    tr -s '\r\n' ' ' < "$1"
  fi
}

# Prints "$1" followed by a newline, but only when MVNW_VERBOSE is exactly "true".
log() {
  if [ "$MVNW_VERBOSE" = true ]; then
    printf '%s\n' "$1"
  fi
}

BASE_DIR=$(find_maven_basedir "$(dirname "$0")")
if [ -z "$BASE_DIR" ]; then
  exit 1;
fi

# MAVEN_BASEDIR, when set, overrides the detected base directory
MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"}; export MAVEN_PROJECTBASEDIR
log "$MAVEN_PROJECTBASEDIR"

##########################################################################################
# Extension to allow automatically downloading the maven-wrapper.jar from Maven-central
# This allows using the maven wrapper in projects that prohibit checking in binary data.
##########################################################################################
wrapperJarPath="$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar"
if [ -r "$wrapperJarPath" ]; then
    log "Found $wrapperJarPath"
else
    log "Couldn't find $wrapperJarPath, downloading it ..."

    # Default download URL; MVNW_REPOURL replaces the repository root
    if [ -n "$MVNW_REPOURL" ]; then
      wrapperUrl="$MVNW_REPOURL/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar"
    else
      wrapperUrl="https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar"
    fi
    # A wrapperUrl entry in maven-wrapper.properties takes precedence over the default
    while IFS="=" read -r key value; do
      # Remove '\r' from value to allow usage on windows as IFS does not consider '\r' as a separator ( considers space, tab, new line ('\n'), and custom '=' )
      safeValue=$(echo "$value" | tr -d '\r')
      case "$key" in (wrapperUrl) wrapperUrl="$safeValue"; break ;;
      esac
    done < "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.properties"
    log "Downloading from: $wrapperUrl"

    if $cygwin; then
      wrapperJarPath=$(cygpath --path --windows "$wrapperJarPath")
    fi

    # Download with the first available tool: wget, then curl, then the Java fallback.
    # Every branch removes the target file again if the download command fails.
    if command -v wget > /dev/null; then
        log "Found wget ... using wget"
        [ "$MVNW_VERBOSE" = true ] && QUIET="" || QUIET="--quiet"
        if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then
            wget $QUIET "$wrapperUrl" -O "$wrapperJarPath" || rm -f "$wrapperJarPath"
        else
            wget $QUIET --http-user="$MVNW_USERNAME" --http-password="$MVNW_PASSWORD" "$wrapperUrl" -O "$wrapperJarPath" || rm -f "$wrapperJarPath"
        fi
    elif command -v curl > /dev/null; then
        log "Found curl ... using curl"
        [ "$MVNW_VERBOSE" = true ] && QUIET="" || QUIET="--silent"
        if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then
            curl $QUIET -o "$wrapperJarPath" "$wrapperUrl" -f -L || rm -f "$wrapperJarPath"
        else
            curl $QUIET --user "$MVNW_USERNAME:$MVNW_PASSWORD" -o "$wrapperJarPath" "$wrapperUrl" -f -L || rm -f "$wrapperJarPath"
        fi
    else
        log "Falling back to using Java to download"
        javaSource="$MAVEN_PROJECTBASEDIR/.mvn/wrapper/MavenWrapperDownloader.java"
        javaClass="$MAVEN_PROJECTBASEDIR/.mvn/wrapper/MavenWrapperDownloader.class"
        # For Cygwin, switch paths to Windows format before running javac
        if $cygwin; then
          javaSource=$(cygpath --path --windows "$javaSource")
          javaClass=$(cygpath --path --windows "$javaClass")
        fi
        if [ -e "$javaSource" ]; then
            if [ ! -e "$javaClass" ]; then
                log " - Compiling MavenWrapperDownloader.java ..."
                ("$JAVA_HOME/bin/javac" "$javaSource")
            fi
            if [ -e "$javaClass" ]; then
                log " - Running MavenWrapperDownloader.java ..."
                ("$JAVA_HOME/bin/java" -cp .mvn/wrapper MavenWrapperDownloader "$wrapperUrl" "$wrapperJarPath") || rm -f "$wrapperJarPath"
            fi
        fi
    fi
fi
##########################################################################################
# End of extension
##########################################################################################

# If specified, validate the SHA-256 sum of the Maven wrapper jar file
wrapperSha256Sum=""
while IFS="=" read -r key value; do
  # Strip '\r' exactly like the wrapperUrl lookup above does; otherwise a properties
  # file checked out with CRLF line endings yields a checksum that can never match.
  safeValue=$(echo "$value" | tr -d '\r')
  case "$key" in (wrapperSha256Sum) wrapperSha256Sum="$safeValue"; break ;;
  esac
done < "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.properties"
if [ -n "$wrapperSha256Sum" ]; then
  wrapperSha256Result=false
  # The check-file line format is: checksum, a space, a mode character (space = text),
  # then the file name -- hence the two spaces between checksum and path.
  if command -v sha256sum > /dev/null; then
    if echo "$wrapperSha256Sum  $wrapperJarPath" | sha256sum -c > /dev/null 2>&1; then
      wrapperSha256Result=true
    fi
  elif command -v shasum > /dev/null; then
    if echo "$wrapperSha256Sum  $wrapperJarPath" | shasum -a 256 -c > /dev/null 2>&1; then
      wrapperSha256Result=true
    fi
  else
    # Errors go to stderr, consistent with the validation-failure messages below
    echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available." >&2
    echo "Please install either command, or disable validation by removing 'wrapperSha256Sum' from your maven-wrapper.properties." >&2
    exit 1
  fi
  if [ $wrapperSha256Result = false ]; then
    echo "Error: Failed to validate Maven wrapper SHA-256, your Maven wrapper might be compromised." >&2
    echo "Investigate or delete $wrapperJarPath to attempt a clean download." >&2
    echo "If you updated your Maven version, you need to update the specified wrapperSha256Sum property." >&2
    exit 1
  fi
fi

# Options from .mvn/jvm.config (if the file exists) are placed in front of MAVEN_OPTS
MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS"

# For Cygwin, switch paths to Windows format before running java
if $cygwin; then
  [ -n "$JAVA_HOME" ] &&
    JAVA_HOME=$(cygpath --path --windows "$JAVA_HOME")
  [ -n "$CLASSPATH" ] &&
    CLASSPATH=$(cygpath --path --windows "$CLASSPATH")
  [ -n "$MAVEN_PROJECTBASEDIR" ] &&
    MAVEN_PROJECTBASEDIR=$(cygpath --path --windows "$MAVEN_PROJECTBASEDIR")
fi

# Provide a "standardized" way to retrieve the CLI args that will
# work with both Windows and non-Windows executions.
MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $*"
export MAVEN_CMD_LINE_ARGS

WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain

# shellcheck disable=SC2086 # safe args
exec "$JAVACMD" \
  $MAVEN_OPTS \
  $MAVEN_DEBUG_OPTS \
  -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \
  "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \
  ${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@"

--------------------------------------------------------------------------------
/src/main/java/divisio/whisper/Whisper3.java:
--------------------------------------------------------------------------------
package divisio.whisper;

import ai.djl.Device;
import ai.djl.MalformedModelException;
import ai.djl.Model;
import ai.djl.audio.processor.AudioProcessor;
import ai.djl.audio.processor.LogMelSpectrogram;
import ai.djl.audio.processor.PadOrTrim;
import ai.djl.engine.Engine;
import ai.djl.modality.audio.Audio;
import ai.djl.modality.nlp.DefaultVocabulary;
import ai.djl.ndarray.NDArray;
import ai.djl.ndarray.NDList;
import ai.djl.ndarray.NDManager;
import ai.djl.ndarray.index.NDIndex;
import ai.djl.ndarray.types.DataType;
import ai.djl.ndarray.types.Shape;
import
ai.djl.training.ParameterStore; 19 | import ai.djl.util.JsonUtils; 20 | import com.google.gson.reflect.TypeToken; 21 | import divisio.whisper.token.WhisperAnyToken; 22 | import divisio.whisper.token.Whisper3Language; 23 | import divisio.whisper.token.Whisper3SpecialToken; 24 | import divisio.whisper.token.Whisper3Timestamp; 25 | import divisio.whisper.token.WhisperToken; 26 | import org.slf4j.Logger; 27 | import org.slf4j.LoggerFactory; 28 | 29 | import java.io.FileOutputStream; 30 | import java.io.IOException; 31 | import java.io.InputStream; 32 | import java.io.Reader; 33 | import java.lang.reflect.Type; 34 | import java.nio.file.Files; 35 | import java.nio.file.Path; 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.Collections; 39 | import java.util.List; 40 | import java.util.Map; 41 | import java.util.regex.Pattern; 42 | 43 | /** 44 | * The {@code Whisper} class is used to transcribe speech to text. 45 | * 46 | *

<p>It is based on the huggingface implementation of Whisper.
 */
public class Whisper3 implements AutoCloseable {

    private static final Logger log = LoggerFactory.getLogger(Whisper3.class);

    /**
     * Cache location for whisper files.
     */
    private static final Path WHISPER_CACHE = Path.of(System.getProperty("user.home"), ".whisper");

    private static final String RESOURCE_MEL_FILTER = "whisper_v3_mel_filter.npz";
    private static final String RESOURCE_ADDED_TOKENS = "whisper_v3_added_tokens.json";
    private static final String RESOURCE_VOCAB = "whisper_v3_vocab.json";
    private static final String RESOURCE_DECODER = "whisper_v3_decoder.pt";
    private static final String RESOURCE_ENCODER = "whisper_v3_encoder.pt";
    private static final String RESOURCE_DECODER_CROSS_ATTENTION_INIT = "whisper_v3_decoder_cross_attention_initializer.pt";

    /**
     * Special token pattern. Used to remove those tokens from Whisper transcriptions.
     * Compiled once for the whole class since a {@link Pattern} is immutable.
     */
    private static final Pattern SPECIAL_TOKEN_PATTERN = Pattern.compile("<\\|[a-z0-9.]+\\|>");

    /**
     * Simple limit on decoder iterations to avoid endless loops
     * (if whisper fails to generate the EOT token).
     */
    private static final int MAX_DECODER_STEPS = 100;

    static {
        // Best effort only: if the directory cannot be created here, the later resource
        // extraction fails with the underlying I/O error, so a warning is sufficient.
        if (!WHISPER_CACHE.toFile().exists() && !WHISPER_CACHE.toFile().mkdirs()) {
            log.warn("Could not create whisper cache directory '{}'.", WHISPER_CACHE);
        }
    }

    /**
     * Base path of where to find all relevant model files.
     */
    private final Path basePath;

    /**
     * Device of this Whisper instance.
     */
    private final Device device;

    /**
     * NDManager of this Whisper instance.
     */
    private final NDManager whisperManager;

    /** Audio preprocessing steps, applied in list order by {@link #processInput}. */
    private final List<AudioProcessor> preprocessors;
    /** Maps token ids to their string representation. */
    private final DefaultVocabulary vocabulary;

    private final Model encoder;
    private final Model decoder;
    private final Model decoderCrossAttention;

    /** Boolean index selecting the logits that are always suppressed. */
    private final NDIndex suppressionMask;
    /** Scalar FLOAT16 negative infinity, written into suppressed logits. */
    private final NDArray negativeInfinity;

    /**
     * Create an instance on the engine's default device, using the model files bundled
     * on the classpath.
     * @return a new {@code Whisper3} instance
     */
    public static Whisper3 instance() {
        return instance(Engine.getInstance().defaultDevice());
    }

    /**
     * Create an instance on the given device, using the model files bundled on the classpath.
     * The bundled resources are copied to the cache directory first (existing files are kept)
     * and the model is then loaded from there.
     * @param device device to load the model onto
     * @return a new {@code Whisper3} instance
     */
    public static Whisper3 instance(Device device) {
        extractResourceToCache(RESOURCE_MEL_FILTER);
        extractResourceToCache(RESOURCE_ADDED_TOKENS);
        extractResourceToCache(RESOURCE_VOCAB);
        extractResourceToCache(RESOURCE_DECODER);
        extractResourceToCache(RESOURCE_ENCODER);
        extractResourceToCache(RESOURCE_DECODER_CROSS_ATTENTION_INIT);
        return instance(WHISPER_CACHE, device);
    }

    /**
     * Create an instance on the engine's default device, loading the model files from {@code path}.
     * @param path base path of the model files
     * @return a new {@code Whisper3} instance
     */
    public static Whisper3 instance(Path path) {
        return instance(path, Engine.getInstance().defaultDevice());
    }

    /**
     * Create an instance on the given device, loading the model files from {@code path}.
     * @param path base path of the model files
     * @param device device to load the model onto
     * @return a new {@code Whisper3} instance
     */
    public static Whisper3 instance(Path path, Device device) {
        return new Whisper3(path, device);
    }

    /**
     * Whisper constructor with a base path and device to load the model onto.
     * @param path base path of the model files
     * @param device device to load the model onto
     * @throws IllegalStateException if {@code device} is not a GPU
     * @throws RuntimeException if the preprocessors, vocabulary or models cannot be loaded
     */
    private Whisper3(Path path, Device device) {
        // early out for non-GPU until we get a CPU version running
        if (!device.isGpu()) {
            throw new IllegalStateException("Currently, Whisper for Java only works on GPU.");
        }

        this.basePath = path;
        this.device = device;
        this.whisperManager = NDManager.newBaseManager(this.device);

        whisperManager.setName("whisper");
        whisperManager.getParentManager().setName("SYSTEM");

        try {
            this.preprocessors = loadPreprocessors(this.basePath, this.whisperManager);
            this.vocabulary = loadVocabulary(this.basePath);

            this.encoder = loadModel(RESOURCE_ENCODER);
            this.decoder = loadModel(RESOURCE_DECODER);
            this.decoderCrossAttention = loadModel(RESOURCE_DECODER_CROSS_ATTENTION_INIT);

            this.suppressionMask = setupSuppressionMask(this.whisperManager, this.vocabulary.size());
            this.negativeInfinity = whisperManager.create(Float.NEGATIVE_INFINITY)
                    .toType(DataType.FLOAT16, false);

        } catch (IOException | MalformedModelException e) {
            // the instance never becomes reachable, so nobody else could close the manager
            whisperManager.close();
            throw new RuntimeException("Could not create whisper instance", e);
        }
    }

    /**
     * Create a {@link WhisperTask} using this {@code Whisper} instance.
     *
     * <p>Just a convenience wrapper method for {@link WhisperTask#task(Whisper3)}.
     * @return a {@link WhisperTask}
     */
    public WhisperTask task() {
        return WhisperTask.task(this);
    }

    /**
     * Process the given {@link Audio} with a default set of start tokens.
     * Language of the input audio is detected by whisper, it is transcribed.
     *
     * <p>To facilitate configuration, it is recommended to use {@link WhisperTask} instead of
     * calling this method directly. Create a {@link WhisperTask} using {@link Whisper3#task()}.
     *
     * @param audio the input {@link Audio} to process.
     * @return a {@link WhisperResult} containing the transcribed text.
     */
    public WhisperResult process(final Audio audio) {
        WhisperToken[] initTokens = {
                Whisper3SpecialToken.START_OF_TRANSCRIPT,
                Whisper3Language.AUTO,
                Whisper3SpecialToken.TRANSCRIBE,
                Whisper3SpecialToken.NO_TIMESTAMPS
        };
        return process(audio, initTokens);
    }

    /**
     * Process the given {@link Audio} with the desired start tokens.
     *
     * <p>To facilitate configuration, it is recommended to use {@link WhisperTask} instead of
     * calling this method directly. Create a {@link WhisperTask} using {@link Whisper3#task()}.
     *
     * @param audio the input {@link Audio} to process.
     * @param startTokens array of start tokens to guide the Whisper model.
     * @return a {@link WhisperResult} containing the transcribed text.
     */
    public WhisperResult process(final Audio audio, final WhisperToken... startTokens) {
        // Arrays.asList rather than List.of: the decoder loop skips null start tokens after
        // the first one, whereas List.of would reject any null element up front.
        return process(audio, Arrays.asList(startTokens));
    }

    /**
     * Process the given {@link Audio} with the desired start tokens.
     *
     * <p>The first start token seeds the decoder (it must not be {@code null}); if the list is
     * empty, {@link Whisper3SpecialToken#START_OF_TRANSCRIPT} is used. Each following start token
     * is forced as the next output token, except for {@code null} entries and tokens with a
     * negative id, at whose position the model chooses freely. Decoding is greedy and stops at
     * the end-of-text token or after {@value #MAX_DECODER_STEPS} decoder passes.
     *
     * <p>To facilitate configuration, it is recommended to use {@link WhisperTask} instead of
     * calling this method directly. Create a {@link WhisperTask} using {@link Whisper3#task()}.
     *
     * @param audio the input {@link Audio} to process.
     * @param startTokens list of start tokens to guide the Whisper model.
     * @return a {@link WhisperResult} containing the transcribed text.
     * @throws RuntimeException wrapping any exception raised during processing.
     */
    public WhisperResult process(final Audio audio, final List<WhisperToken> startTokens) {
        try (NDManager transcriptionManager = whisperManager.newSubManager()) {
            transcriptionManager.setName("transcription_manager");
            // audio -> mel spectrogram
            NDList processedInput = processInput(transcriptionManager, audio);

            // encoder pass
            NDArray encoderOutput = forward(encoder, processedInput).singletonOrThrow();

            // setup initial token
            long initToken = startTokens.isEmpty()
                    ? Whisper3SpecialToken.START_OF_TRANSCRIPT.getTokenId()
                    : startTokens.get(0).getTokenId();

            // initialize NDArray that will hold all tokens during the decoder process
            NDArray previousTokenIds = transcriptionManager
                    .create(new long[] { initToken })
                    .expandDims(0) // must be 2D
                    .toDevice(device, false);

            // init kv cache
            NDList keyValueCache = initKeyValueCache(transcriptionManager, encoderOutput);

            // special conditioning for timestamps; necessary to force-negate some logits later in the decoder
            final boolean withTimestamps = !startTokens.contains(Whisper3SpecialToken.NO_TIMESTAMPS);

            // bounded loop: i is the position of the most recently appended token
            for (int i = 0; i < MAX_DECODER_STEPS; i++) {
                try (NDManager decoderPassManager = whisperManager.newSubManager()) {
                    decoderPassManager.setName("decoder_pass_manager");

                    NDArray lastToken = previousTokenIds.get(0, i).reshape(1, 1);

                    // input consists of a flat list containing the last token, the encoder output,
                    // and the entire key_value_cache flattened
                    NDList decoderInputs = new NDList(lastToken, encoderOutput).addAll(keyValueCache);
                    NDList output = forward(decoder, decoderInputs);

                    // first index contains the next predicted token
                    NDArray decoderOutput = output.get(0);
                    decoderOutput.attach(decoderPassManager);
                    // all other indices are the new key_value_cache
                    NDList pastKeyValueCache = output.subNDList(1);

                    NDArray logits = decoderOutput.get("0,-1:,:").duplicate();

                    // SuppressTokensLogitsProcessor
                    // suppresses a specific set of tokens, always, for some probably good reason
                    logits.set(suppressionMask, negativeInfinity);

                    // force the start tokens (adapted from ForceTokensLogitsProcessor)
                    if (i + 1 < startTokens.size()) {
                        WhisperToken startToken = startTokens.get(i + 1);
                        // null tokens and indices < 0 are skipped
                        if (startToken != null && startToken.getTokenId() >= 0) {
                            logits.set(new NDIndex(":,:"), negativeInfinity);
                            logits.set(new NDIndex(":,{}", startToken.getTokenId()), 0);
                        }
                    }

                    // suppress specific tokens that probably are commonly spit out by whisper immediately
                    // after starting and would break it somehow
                    // (adapted from: SuppressTokensAtBeginLogitsProcessor)
                    if (previousTokenIds.size(1) == 3) {
                        // 220 = some random token? taken from huggingface whisper code
                        logits.set(new NDIndex(":,220"), negativeInfinity);
                        logits.set(new NDIndex(":,{}", Whisper3SpecialToken.END_OF_TEXT.getTokenId()), negativeInfinity);

                        // if timestamps are desired, suppress these 2 tokens that prevent proper timestamp generation
                        if (withTimestamps) {
                            logits.set(new NDIndex(":,{}", Whisper3SpecialToken.NO_TIMESTAMPS.getTokenId()), negativeInfinity);
                            logits.set(new NDIndex(":,{}", Whisper3Timestamp.MIN_TIMESTAMP_TOKEN.getTokenId()), negativeInfinity);
                        }
                    }

                    // greedy decoding, attach to previous tokens
                    NDArray currentToken = logits.argMax();
                    previousTokenIds = previousTokenIds.concat(currentToken.reshape(1, 1), 1);
                    previousTokenIds.attach(transcriptionManager);

                    // close previous cache to store a new one
                    keyValueCache.close();
                    // TODO: in python, the self-attention-cache only grows to the size of 16,
                    // while this cache can theoretically grow endlessly... check if it matters
                    keyValueCache = pastKeyValueCache;
                    keyValueCache.attach(transcriptionManager);
                }

                // if EOT, break out
                if (Whisper3SpecialToken.END_OF_TEXT.getTokenId() == previousTokenIds.get("0,-1").getLong()) {
                    break;
                }
            }

            // token ids -> raw tokens
            List<WhisperToken> parsedTokens = parseTokens(previousTokenIds);
            // transform tokens to UTF-8
            String rawResult = Whisper3TokenDecoder.rawTokensToText(
                    parsedTokens.stream().map(WhisperToken::getToken).toList()
            );
            // strip special tokens
            String cleanedResult = removeSpecialTokens(rawResult);

            return new WhisperResult(rawResult, cleanedResult, parsedTokens);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Load the model called {@code name} from the base path onto this instance's device
     * and register the model's manager with {@link #whisperManager}.
     * @param name model name, one of the {@code RESOURCE_*} model file names.
     * @return the loaded model.
     */
    private Model loadModel(String name) throws MalformedModelException, IOException {
        final Model model = Model.newInstance(name, device);
        model.load(this.basePath, null, Collections.singletonMap("mapLocation", "true"));
        this.whisperManager.attachInternal(name, model.getNDManager());
        return model;
    }

    /**
     * Run an inference forward pass (training flag {@code false}) of {@code model} on
     * {@code inputs} and attach the result to the manager of the inputs.
     * @param model the model to run.
     * @param inputs the model inputs.
     * @return the model outputs.
     */
    private NDList forward(final Model model, final NDList inputs) {
        ParameterStore parameterStore = new ParameterStore(inputs.getManager(), false);
        NDList result = model.getBlock()
                .forward(parameterStore, inputs, false);
        result.attach(inputs.getManager());
        return result;
    }

    /**
     * Build the initial key/value cache passed to the first decoder pass: for each of 32
     * groups an (initially empty) self-attention key and value followed by the cross-attention
     * key and value computed from the encoder output.
     * @param manager manager used to create the empty self-attention array.
     * @param encoderOutput output of the encoder pass.
     * @return the flat cache list, four entries per group.
     */
    private NDList initKeyValueCache(NDManager manager, NDArray encoderOutput) {
        NDList kvCrossAttentions = forward(decoderCrossAttention, new NDList(encoderOutput));
        // NOTE(review): 32 / 20 / 64 presumably are decoder layers / attention heads / head size
        // of the exported model -- confirm against the model export before changing them.
        NDArray kvSelfAttention = manager.zeros(new Shape(1, 20, 0, 64), DataType.FLOAT16);

        NDList pastKeyValues = new NDList();
        for (int i = 0; i < 32; ++i) {
            pastKeyValues.add(kvSelfAttention);
            pastKeyValues.add(kvSelfAttention);
            pastKeyValues.add(kvCrossAttentions.get(i * 2));
            pastKeyValues.add(kvCrossAttentions.get(i * 2 + 1));
        }

        return pastKeyValues;
    }

    /**
     * Create the audio preprocessing chain: pad or trim to 480000 samples, then compute
     * a log-mel spectrogram with 128 mel bins using the mel filter file below {@code basePath}.
     * @param basePath directory containing the mel filter file.
     * @param manager manager handed to the spectrogram processor.
     * @return the preprocessors in application order.
     */
    private static List<AudioProcessor> loadPreprocessors(final Path basePath, final NDManager manager) throws IOException {
        Path melFile = basePath.resolve(RESOURCE_MEL_FILTER);
        List<AudioProcessor> preprocessors = new ArrayList<>();
        preprocessors.add(new PadOrTrim(480000));
        preprocessors.add(LogMelSpectrogram.newInstance(melFile, 128, manager));
        return preprocessors;
    }

    /**
     * Read the vocabulary and added-tokens JSON files (token string -> token id) below
     * {@code basePath} and merge them into one vocabulary indexed by token id.
     * @param basePath directory containing both JSON files.
     * @return the merged vocabulary.
     * @throws RuntimeException if one of the files cannot be read.
     */
    private static DefaultVocabulary loadVocabulary(final Path basePath) {
        Map<String, Integer> vocab;
        Map<String, Integer> added;
        Type type = new TypeToken<Map<String, Integer>>() {}.getType();

        Path vocabPath = basePath.resolve(RESOURCE_VOCAB);
        Path addedTokensPath = basePath.resolve(RESOURCE_ADDED_TOKENS);
        try (Reader reader = Files.newBufferedReader(vocabPath)) {
            vocab = JsonUtils.GSON.fromJson(reader, type);
        } catch (IOException e) {
            throw new RuntimeException("Could not read vocabulary file", e);
        }
        try (Reader reader = Files.newBufferedReader(addedTokensPath)) {
            added = JsonUtils.GSON.fromJson(reader, type);
        } catch (IOException e) {
            throw new RuntimeException("Could not read added_tokens file", e);
        }
        // every token is stored at the array position given by its id
        String[] result = new String[vocab.size() + added.size()];
        vocab.forEach((key, value) -> result[value] = key);
        added.forEach((key, value) -> result[value] = key);
        return new DefaultVocabulary(Arrays.asList(result));
    }

    /**
     * Turn the raw audio samples into the encoder input: run all preprocessors in order,
     * add a leading dimension and convert to FLOAT16.
     * @param manager manager used to create the sample array.
     * @param input the audio to process.
     * @return a single-element list holding the encoder input.
     */
    private NDList processInput(NDManager manager, Audio input) {
        NDArray samples = manager.create(input.getData());
        for (AudioProcessor processor : preprocessors) {
            samples = processor.extractFeatures(samples.getManager(), samples);
        }
        samples = samples.expandDims(0).toType(DataType.FLOAT16, true);
        return new NDList(samples);
    }

    /**
     * Parse the tokens by replacing each token id in the result with the
     * corresponding token string representation in the vocabulary.
     * Parsing stops after the first end-of-text token, which is included in the result.
     * @param result the predicted token ids.
     * @return a list of tokens pairing each id with its string representation.
     */
    private List<WhisperToken> parseTokens(NDArray result) {
        List<WhisperToken> sentence = new ArrayList<>();
        for (long tokenId : result.toLongArray()) {
            String token = vocabulary.getToken(tokenId);
            sentence.add(new WhisperAnyToken(tokenId, token));

            if (Whisper3SpecialToken.END_OF_TEXT.getToken().equals(token)) {
                break;
            }
        }

        return sentence;
    }

    /**
     * Build the boolean index that selects the always-suppressed token ids in the logits.
     * @param manager manager used to create the mask array.
     * @param vocabSize number of entries in the vocabulary, i.e. the length of the mask.
     * @return an index selecting the suppressed positions of a {@code (1, vocabSize)} array.
     */
    private static NDIndex setupSuppressionMask(final NDManager manager, long vocabSize) {
        // from Whisper v3 config, therefore currently only works for Whisper v3
        // https://huggingface.co/openai/whisper-large-v3/blob/main/generation_config.json # suppress_tokens
        int[] indices = new int[]{
                1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50359, 50360, 50361, 50362, 50363
        };

        NDArray mask = manager.zeros(new Shape(vocabSize), DataType.BOOLEAN);

        // set the mask to true at the specified indices
        for (int index : indices) {
            mask.set(new NDIndex(index), 1);
        }

        return new NDIndex().addBooleanIndex(mask.expandDims(0));
    }

    /**
     * Removes special tokens from the input string.
     * A token is defined as a sequence starting with "<|", ending with "|>",
     * and containing one or more lowercase letters, digits or dots in between.
     * The remaining text is trimmed.
     *
     * @param input The input string potentially containing tokens.
     * @return A string with all tokens removed.
     */
    private String removeSpecialTokens(final String input) {
        // Replace all occurrences of the pattern in the input string with an empty string
        return SPECIAL_TOKEN_PATTERN.matcher(input).replaceAll("").trim();
    }

    /**
     * Extract the given resource to the cache location.
     * An already cached file is never overwritten.
     * @param resource the resource name.
     * @throws RuntimeException if the resource is not on the classpath or cannot be written.
     */
    private static void extractResourceToCache(String resource) {

        try (InputStream in = Whisper3.class.getResourceAsStream("/" + resource)) {
            if (in == null) {
                throw new RuntimeException("Could not find whisper resource. Are you sure you added the whisper dependency?");
            }

            Path target = WHISPER_CACHE.resolve(resource);

            // do not override if it already exists
            if (Files.exists(target)) {
                return;
            }

            try (FileOutputStream out = new FileOutputStream(target.toFile())) {
                in.transferTo(out);
            } catch (IOException e) {
                // a truncated file would be mistaken for a valid cache entry on the next start
                Files.deleteIfExists(target);
                throw e;
            }
            log.info("Whisper resource '{}' cached.", resource);
        } catch (IOException e) {
            throw new RuntimeException("Could not cache whisper resource '" + resource + "'", e);
        }
    }

    /**
     * Close the NDManager of this instance.
     */
    @Override
    public void close() throws Exception {
        this.whisperManager.close();
    }
}

--------------------------------------------------------------------------------