├── .github └── workflows │ └── gradle.yml ├── .gitignore ├── LICENSE ├── README.md ├── build.gradle ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── settings.gradle └── src ├── main ├── java │ └── com │ │ └── didalgo │ │ └── gpt3 │ │ ├── ByteSequence.java │ │ ├── ChatFormatDescriptor.java │ │ ├── CompletionType.java │ │ ├── Encoding.java │ │ ├── EncodingType.java │ │ ├── GPT3Tokenizer.java │ │ ├── ModelType.java │ │ ├── TokenCount.java │ │ ├── TokenCountSupport.java │ │ ├── TokenizableFunction.java │ │ ├── TokenizableFunctionCall.java │ │ ├── TokenizableMessage.java │ │ └── TokenizableTool.java └── resources │ └── com │ └── didalgo │ └── gpt3 │ ├── cl100k_base.tiktoken │ ├── o200k_base.tiktoken │ ├── p50k_base.tiktoken │ └── r50k_base.tiktoken └── test ├── java └── com │ └── didalgo │ └── gpt3 │ ├── ByteSequenceTest.java │ ├── GPT3TokenizerTest.java │ ├── ListConverter.java │ ├── TokenCountTest.java │ └── TokenizableFunctionTest.java └── resources └── com └── didalgo └── gpt3 ├── java.schema.json └── sql.schema.json /.github/workflows/gradle.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Gradle and cache/restore any dependencies to improve the workflow execution time 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-java-with-gradle 3 | 4 | name: Gradle Build 5 | 6 | on: [push] 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Set up JDK 17 17 | uses: actions/setup-java@v3 18 | with: 19 | java-version: '17' 20 | distribution: 'temurin' 21 | - name: Make Gradle script executable 22 | run: chmod +x gradlew 23 | - name: Build with Gradle 24 | uses: gradle/gradle-build-action@67421db6bd0bf253fb4bd25b31ebb98943c375e1 25 | with: 26 | arguments: build 27 
| -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build/ 2 | /.gradle/ 3 | /.idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 didalgo2 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPT3/4 Java Tokenizer 2 | 3 | [![License: MIT](https://img.shields.io/github/license/didalgo2/gpt3-tokenizer-java?style=flat-square)](https://opensource.org/license/mit/) 4 | ![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/didalgo2/gpt3-tokenizer-java/gradle.yml?style=flat-square) 5 | [![Maven Central](https://img.shields.io/maven-central/v/com.didalgo/gpt3-tokenizer?style=flat-square)](https://central.sonatype.com/artifact/com.didalgo/gpt3-tokenizer/0.1.8) 6 | 7 | This is a Java implementation of a GPT3/4 tokenizer, loosely ported from [Tiktoken](https://github.com/openai/tiktoken) with the help of [ChatGPT](https://openai.com/blog/chatgpt). 8 | 9 | ## Usage Examples 10 | 11 | ### Encoding Text to Tokens 12 | 13 | ```java 14 | GPT3Tokenizer tokenizer = new GPT3Tokenizer(Encoding.CL100K_BASE); 15 | List tokens = tokenizer.encode("example text here"); 16 | ``` 17 | 18 | ### Decoding Tokens to Text 19 | 20 | ```java 21 | GPT3Tokenizer tokenizer = new GPT3Tokenizer(Encoding.CL100K_BASE); 22 | List tokens = Arrays.asList(123, 456, 789); 23 | String text = tokenizer.decode(tokens); 24 | ``` 25 | 26 | ### Counting Number of Tokens in Chat Messages 27 | 28 | ```java 29 | var messages = List.of( 30 | new ChatMessage(ChatMessageRole.SYSTEM.value(), "You are a helpful assistant."), 31 | new ChatMessage(ChatMessageRole.USER.value(), "Hello there!") 32 | ); 33 | var model = ModelType.GPT_3_5_TURBO; 34 | var count = TokenCount.fromMessages(messages, model); 35 | System.out.println("Prompt tokens: " + count); 36 | ``` 37 | 38 | ### Did you know... 39 | 40 | 1. ...that all 3.5-turbo models released after _0613_ now have tokenization counts for messages consistent with gpt-4 models? 41 | 42 | 1. 
...that OpenAI Tokenizer available at https://platform.openai.com/tokenizer uses p50k_base encoding, thus it doesn't count correctly tokens for gpt-3.5 and gpt-4 models? If you look for decent alternative, you may like: https://tiktokenizer.vercel.app/, but keep in mind that tokenization for messages of gpt-3.5 models released after 0613 was changed (see point above). 43 | 44 | 1. ...that in cl100k_base encoding every sequence of up to 81 spaces is just a single token? So next time when someone tells you that passing YAML to ChatGPT is not efficient, you can argue that... 45 | ```java 46 | var tokenizer = ModelType.GPT_3_5_TURBO.getTokenizer(); 47 | var tokens = (List) null; 48 | for (var sb = new StringBuilder(" "); (tokens = tokenizer.encode(sb)).size() == 1; sb.append(' ')) 49 | System.out.printf("`%s`'s token is %s, and that's %d space(s)!\n".replace("(s)", sb.length()==1?"":"s"), sb, tokens, sb.length()); 50 | 51 | ``` 52 | ``` 53 | ` `'s token is [14984], and that's 75 spaces! 54 | ` `'s token is [56899], and that's 76 spaces! 55 | ` `'s token is [59691], and that's 77 spaces! 56 | ` `'s token is [82321], and that's 78 spaces! 57 | ` `'s token is [40584], and that's 79 spaces! 58 | ` `'s token is [98517], and that's 80 spaces! 59 | ` `'s token is [96529], and that's 81 spaces! 60 | ``` 61 | 62 | ## License 63 | 64 | This project is licensed under the MIT License. 
-------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'java' 3 | id 'application' 4 | id 'maven-publish' 5 | id 'signing' 6 | } 7 | 8 | compileJava.options.encoding = "UTF-8" 9 | compileTestJava.options.encoding = "UTF-8" 10 | 11 | group 'com.didalgo' 12 | archivesBaseName = 'gpt3-tokenizer' 13 | version '0.1.9-SNAPSHOT' 14 | 15 | repositories { 16 | mavenLocal() 17 | mavenCentral() 18 | } 19 | 20 | ext { 21 | gpt3_java_version = '0.14.0' 22 | jackson_version = '2.14.2' 23 | jupiter_version = '5.9.2' 24 | } 25 | 26 | dependencies { 27 | implementation "javax.json:javax.json-api:1.1.4" 28 | implementation "org.glassfish:javax.json:1.1.4" 29 | compileOnly "com.fasterxml.jackson.core:jackson-databind:${jackson_version}" 30 | compileOnly "com.github.victools:jsonschema-generator:4.31.1" 31 | compileOnly "com.github.victools:jsonschema-module-jackson:4.31.1" 32 | compileOnly "com.theokanning.openai-gpt3-java:api:${gpt3_java_version}" 33 | testAnnotationProcessor "org.projectlombok:lombok:1.18.26" 34 | testImplementation 'com.squareup.okhttp3:logging-interceptor:3.14.9' 35 | testImplementation "com.fasterxml.jackson.core:jackson-databind:${jackson_version}" 36 | testImplementation "com.github.victools:jsonschema-generator:4.31.1" 37 | testImplementation "com.github.victools:jsonschema-module-jackson:4.31.1" 38 | testImplementation "com.theokanning.openai-gpt3-java:service:${gpt3_java_version}" 39 | testImplementation "org.junit.jupiter:junit-jupiter-api:${jupiter_version}" 40 | testImplementation "org.junit.jupiter:junit-jupiter-params:${jupiter_version}" 41 | testImplementation "org.projectlombok:lombok:1.18.26" 42 | testRuntimeOnly "org.junit.jupiter:junit-jupiter-engine:${jupiter_version}" 43 | } 44 | 45 | test { 46 | useJUnitPlatform() 47 | } 48 | 49 | java { 50 | withJavadocJar() 51 | withSourcesJar() 52 | } 53 | 54 | 
publishing { 55 | publications.create("mavenJava", MavenPublication) { 56 | artifactId = 'gpt3-tokenizer' 57 | from components.java 58 | versionMapping { 59 | usage('java-api') { 60 | fromResolutionOf('runtimeClasspath') 61 | } 62 | usage('java-runtime') { 63 | fromResolutionResult() 64 | } 65 | } 66 | pom { 67 | name = 'Java GPT3/4 Tokenizer' 68 | description = 'Java implementation of a GPT3/4 tokenizer' 69 | url = 'https://github.com/didalgo2/gpt3-tokenizer-java' 70 | licenses { 71 | license { 72 | name = 'MIT License' 73 | url = 'https://github.com/didalgo2/gpt3-tokenizer-java/blob/main/LICENSE' 74 | } 75 | } 76 | developers { 77 | developer { 78 | id = 'didalgo' 79 | name = 'Mariusz Bernacki' 80 | email = 'didalgo@didalgo.com' 81 | } 82 | } 83 | scm { 84 | connection = 'scm:git:git://github.com/didalgo2/gpt3-tokenizer-java.git' 85 | developerConnection = 'scm:git:ssh://github.com/didalgo2/gpt3-tokenizer-java.git' 86 | url = 'https://github.com/didalgo2/gpt3-tokenizer-java/' 87 | } 88 | } 89 | } 90 | repositories { 91 | maven { 92 | def releasesRepoUrl = "https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/" 93 | def snapshotsRepoUrl = "https://s01.oss.sonatype.org/content/repositories/snapshots/" 94 | url = version.endsWith('SNAPSHOT') ? 
snapshotsRepoUrl : releasesRepoUrl 95 | credentials { 96 | username = project.properties['ossrhUser'].toString() 97 | password = project.properties['ossrhPassword'].toString() 98 | } 99 | } 100 | } 101 | } 102 | 103 | signing { 104 | sign publishing.publications.mavenJava 105 | } 106 | 107 | javadoc { 108 | options.addBooleanOption('html5', true) 109 | } 110 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/didalgolab/gpt3-tokenizer-java/85b374b723058576fd0475664a92075cca1f0f71/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.5.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Copyright © 2015-2021 the original authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | ############################################################################## 20 | # 21 | # Gradle start up script for POSIX generated by Gradle. 22 | # 23 | # Important for running: 24 | # 25 | # (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is 26 | # noncompliant, but you have some other compliant shell such as ksh or 27 | # bash, then to run this script, type that shell name before the whole 28 | # command line, like: 29 | # 30 | # ksh Gradle 31 | # 32 | # Busybox and similar reduced shells will NOT work, because this script 33 | # requires all of these POSIX shell features: 34 | # * functions; 35 | # * expansions «$var», «${var}», «${var:-default}», «${var+SET}», 36 | # «${var#prefix}», «${var%suffix}», and «$( cmd )»; 37 | # * compound commands having a testable exit status, especially «case»; 38 | # * various built-in commands including «command», «set», and «ulimit». 39 | # 40 | # Important for patching: 41 | # 42 | # (2) This script targets any POSIX shell, so it avoids extensions provided 43 | # by Bash, Ksh, etc; in particular arrays are avoided. 44 | # 45 | # The "traditional" practice of packing multiple parameters into a 46 | # space-separated string is a well documented source of bugs and security 47 | # problems, so this is (mostly) avoided, by progressively accumulating 48 | # options in "$@", and eventually passing that to Java. 49 | # 50 | # Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, 51 | # and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; 52 | # see the in-line comments for details. 53 | # 54 | # There are tweaks for specific operating systems such as AIX, CygWin, 55 | # Darwin, MinGW, and NonStop. 56 | # 57 | # (3) This script is generated from the Groovy template 58 | # https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt 59 | # within the Gradle project. 
60 | # 61 | # You can find Gradle at https://github.com/gradle/gradle/. 62 | # 63 | ############################################################################## 64 | 65 | # Attempt to set APP_HOME 66 | 67 | # Resolve links: $0 may be a link 68 | app_path=$0 69 | 70 | # Need this for daisy-chained symlinks. 71 | while 72 | APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path 73 | [ -h "$app_path" ] 74 | do 75 | ls=$( ls -ld "$app_path" ) 76 | link=${ls#*' -> '} 77 | case $link in #( 78 | /*) app_path=$link ;; #( 79 | *) app_path=$APP_HOME$link ;; 80 | esac 81 | done 82 | 83 | APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit 84 | 85 | APP_NAME="Gradle" 86 | APP_BASE_NAME=${0##*/} 87 | 88 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 89 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 90 | 91 | # Use the maximum available, or set MAX_FD != -1 to use that value. 92 | MAX_FD=maximum 93 | 94 | warn () { 95 | echo "$*" 96 | } >&2 97 | 98 | die () { 99 | echo 100 | echo "$*" 101 | echo 102 | exit 1 103 | } >&2 104 | 105 | # OS specific support (must be 'true' or 'false'). 106 | cygwin=false 107 | msys=false 108 | darwin=false 109 | nonstop=false 110 | case "$( uname )" in #( 111 | CYGWIN* ) cygwin=true ;; #( 112 | Darwin* ) darwin=true ;; #( 113 | MSYS* | MINGW* ) msys=true ;; #( 114 | NONSTOP* ) nonstop=true ;; 115 | esac 116 | 117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 118 | 119 | 120 | # Determine the Java command to use to start the JVM. 121 | if [ -n "$JAVA_HOME" ] ; then 122 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 123 | # IBM's JDK on AIX uses strange locations for the executables 124 | JAVACMD=$JAVA_HOME/jre/sh/java 125 | else 126 | JAVACMD=$JAVA_HOME/bin/java 127 | fi 128 | if [ ! 
-x "$JAVACMD" ] ; then 129 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 130 | 131 | Please set the JAVA_HOME variable in your environment to match the 132 | location of your Java installation." 133 | fi 134 | else 135 | JAVACMD=java 136 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 137 | 138 | Please set the JAVA_HOME variable in your environment to match the 139 | location of your Java installation." 140 | fi 141 | 142 | # Increase the maximum file descriptors if we can. 143 | if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then 144 | case $MAX_FD in #( 145 | max*) 146 | MAX_FD=$( ulimit -H -n ) || 147 | warn "Could not query maximum file descriptor limit" 148 | esac 149 | case $MAX_FD in #( 150 | '' | soft) :;; #( 151 | *) 152 | ulimit -n "$MAX_FD" || 153 | warn "Could not set maximum file descriptor limit to $MAX_FD" 154 | esac 155 | fi 156 | 157 | # Collect all arguments for the java command, stacking in reverse order: 158 | # * args from the command line 159 | # * the main class name 160 | # * -classpath 161 | # * -D...appname settings 162 | # * --module-path (only if needed) 163 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 
164 | 165 | # For Cygwin or MSYS, switch paths to Windows format before running java 166 | if "$cygwin" || "$msys" ; then 167 | APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) 168 | CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) 169 | 170 | JAVACMD=$( cygpath --unix "$JAVACMD" ) 171 | 172 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 173 | for arg do 174 | if 175 | case $arg in #( 176 | -*) false ;; # don't mess with options #( 177 | /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath 178 | [ -e "$t" ] ;; #( 179 | *) false ;; 180 | esac 181 | then 182 | arg=$( cygpath --path --ignore --mixed "$arg" ) 183 | fi 184 | # Roll the args list around exactly as many times as the number of 185 | # args, so each arg winds up back in the position where it started, but 186 | # possibly modified. 187 | # 188 | # NB: a `for` loop captures its iteration list before it begins, so 189 | # changing the positional parameters here affects neither the number of 190 | # iterations, nor the values presented in `arg`. 191 | shift # remove old arg 192 | set -- "$@" "$arg" # push replacement arg 193 | done 194 | fi 195 | 196 | # Collect all arguments for the java command; 197 | # * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of 198 | # shell script including quotes and variable substitutions, so put them in 199 | # double quotes to make sure that they get re-expanded; and 200 | # * put everything else in single quotes, so that it's not re-expanded. 201 | 202 | set -- \ 203 | "-Dorg.gradle.appname=$APP_BASE_NAME" \ 204 | -classpath "$CLASSPATH" \ 205 | org.gradle.wrapper.GradleWrapperMain \ 206 | "$@" 207 | 208 | # Stop when "xargs" is not available. 209 | if ! command -v xargs >/dev/null 2>&1 210 | then 211 | die "xargs is not available" 212 | fi 213 | 214 | # Use "xargs" to parse quoted args. 215 | # 216 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed. 
217 | # 218 | # In Bash we could simply go: 219 | # 220 | # readarray ARGS < <( xargs -n1 <<<"$var" ) && 221 | # set -- "${ARGS[@]}" "$@" 222 | # 223 | # but POSIX shell has neither arrays nor command substitution, so instead we 224 | # post-process each arg (as a line of input to sed) to backslash-escape any 225 | # character that might be a shell metacharacter, then use eval to reverse 226 | # that process (while maintaining the separation between arguments), and wrap 227 | # the whole thing up as a single "set" statement. 228 | # 229 | # This will of course break if any of these variables contains a newline or 230 | # an unmatched quote. 231 | # 232 | 233 | eval "set -- $( 234 | printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | 235 | xargs -n1 | 236 | sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | 237 | tr '\n' ' ' 238 | )" '"$@"' 239 | 240 | exec "$JAVACMD" "$@" 241 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 
15 | @rem 16 | 17 | @if "%DEBUG%"=="" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%"=="" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if %ERRORLEVEL% equ 0 goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 
64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if %ERRORLEVEL% equ 0 goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | set EXIT_CODE=%ERRORLEVEL% 84 | if %EXIT_CODE% equ 0 set EXIT_CODE=1 85 | if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% 86 | exit /b %EXIT_CODE% 87 | 88 | :mainEnd 89 | if "%OS%"=="Windows_NT" endlocal 90 | 91 | :omega 92 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'gpt3-tokenizer-java' 2 | 3 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/ByteSequence.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import java.nio.charset.Charset; 8 | import java.nio.charset.StandardCharsets; 9 | import java.util.Arrays; 10 | 11 | /** 12 | * Represents a sequences of bytes. 13 | * 14 | * @author Mariusz Bernacki 15 | * 16 | */ 17 | public interface ByteSequence { 18 | 19 | /** The empty {@code ByteSequence} of length 0. */ 20 | ByteSequence EMPTY = of(new byte[0]); 21 | 22 | /** 23 | * Returns the byte at the specified offset. 
24 | * 25 | * @param index the zero-based byte offset within the sequence of bytes (0 <= index < length()) 26 | * @return the byte at the specified offset 27 | * @throws IndexOutOfBoundsException if the index is out of range (index < 0 || index >= length()) 28 | */ 29 | byte byteAt(int index); 30 | 31 | /** 32 | * Returns the length of the byte sequence. 33 | * 34 | * @return the number of bytes in the sequence 35 | */ 36 | int length(); 37 | 38 | /** 39 | * Returns a new ByteSequence that is a sub-sequence of the current byte sequence. 40 | * The sub-sequence starts with the byte value at the specified {@code start} index and 41 | * extends to the byte value at index {@code end - 1}. 42 | * 43 | * @param start the beginning index, inclusive 44 | * @param end the ending index, exclusive 45 | * @return a new ByteSequence that is a sub-sequence of this byte sequence 46 | * @throws IndexOutOfBoundsException if the start or end index is invalid 47 | */ 48 | ByteSequence subSequence(int start, int end) throws IndexOutOfBoundsException; 49 | 50 | /** 51 | * Returns a hash code value for this byte sequence. 52 | * 53 | * @return a hash code value for this byte sequence 54 | */ 55 | @Override 56 | int hashCode(); 57 | 58 | /** 59 | * Compares the specified object with this byte sequence for equality. 60 | * Returns {@code true} if and only if the specified object is also a byte sequence 61 | * and both byte sequences have the same bytes in the same order. 62 | * 63 | * @param obj the object to be compared for equality with this byte sequence 64 | * @return {@code true} if the specified object is equal to this byte sequence, {@code false} otherwise 65 | */ 66 | @Override 67 | boolean equals(Object obj); 68 | 69 | /** 70 | * Returns a byte array representation of this byte sequence. 71 | * The returned array will be a copy of the internal byte array, ensuring that modifications 72 | * to the returned array do not affect the original byte sequence. 
73 | * 74 | * @return a byte array representation of this byte sequence 75 | */ 76 | byte[] toByteArray(); 77 | 78 | /** 79 | * Converts the byte sequence to a String using the specified Charset. 80 | * 81 | * @param charset the Charset to be used for the conversion 82 | * @return a String representation of this byte sequence using the specified Charset 83 | */ 84 | String toString(Charset charset); 85 | 86 | /** 87 | * Returns a new ByteSequence instance containing the specified byte array. 88 | * The provided byte array is wrapped in an ImmutableByteSequence to ensure 89 | * that the contents of the byte array are not modified after the ByteSequence 90 | * is created. 91 | * 92 | * @param bytes the byte array to be used for the new ByteSequence 93 | * @return a new ByteSequence instance containing the specified byte array 94 | * @throws NullPointerException if the provided byte array is null 95 | */ 96 | static ByteSequence of(byte[] bytes) { 97 | return new Of(Arrays.copyOf(bytes, bytes.length)); 98 | } 99 | 100 | /** 101 | * Returns an immutable ByteSequence that is a copy of the specified ByteSequence. 102 | * If the provided ByteSequence is already an instance of ImmutableByteSequence, 103 | * it is returned directly; otherwise, a new ImmutableByteSequence is created. 104 | * 105 | * @param sequence the ByteSequence to be copied 106 | * @return an immutable ByteSequence that is a copy of the specified ByteSequence 107 | * @throws NullPointerException if the provided ByteSequence is null 108 | */ 109 | static ByteSequence copyOf(ByteSequence sequence) { 110 | if (sequence instanceof Of) 111 | return sequence; 112 | else 113 | return of(sequence.toByteArray()); 114 | } 115 | 116 | /** 117 | * Creates a ByteSequence from the specified String using the UTF-8 charset. 
118 | * 119 | * @param text the String to be converted to a ByteSequence 120 | * @return a new ByteSequence that represents the specified String using the UTF-8 charset 121 | * @throws NullPointerException if the provided text is null 122 | */ 123 | static ByteSequence from(String text) { 124 | return from(text, StandardCharsets.UTF_8); 125 | } 126 | 127 | /** 128 | * Creates a ByteSequence from the specified String using the specified Charset. 129 | * 130 | * @param text the String to be converted to a ByteSequence 131 | * @param charset the Charset to be used for the conversion 132 | * @return a new ByteSequence that represents the specified String using the specified Charset 133 | * @throws NullPointerException if the provided text or charset is null 134 | */ 135 | static ByteSequence from(String text, Charset charset) { 136 | return new Of(text.getBytes(charset)); 137 | } 138 | 139 | /** 140 | * An immutable implementation of the {@code ByteSequence}. 141 | */ 142 | final class Of implements ByteSequence, Comparable { 143 | private final byte[] bytes; 144 | 145 | private Of(byte[] bytes) { 146 | this.bytes = bytes; 147 | } 148 | 149 | @Override 150 | public byte byteAt(int index) { 151 | if (index < 0 || index >= length()) { 152 | throw new IndexOutOfBoundsException("Index " + index + " is out of range (0 <= index < " + length() + ")"); 153 | } 154 | return bytes[index]; 155 | } 156 | 157 | @Override 158 | public int length() { 159 | return bytes.length; 160 | } 161 | 162 | @Override 163 | public Of subSequence(int start, int end) { 164 | return new Of(Arrays.copyOfRange(bytes, start, end)); 165 | } 166 | 167 | @Override 168 | public int hashCode() { 169 | return Arrays.hashCode(bytes); 170 | } 171 | 172 | @Override 173 | public boolean equals(Object obj) { 174 | if (obj instanceof Of other) { 175 | return Arrays.equals(bytes, other.bytes); 176 | } 177 | return false; 178 | } 179 | 180 | @Override 181 | public byte[] toByteArray() { 182 | return 
Arrays.copyOf(bytes, bytes.length); 183 | } 184 | 185 | @Override 186 | public String toString(Charset charset) { 187 | return new String(bytes, charset); 188 | } 189 | 190 | @Override 191 | public String toString() { 192 | return toString(StandardCharsets.UTF_8); 193 | } 194 | 195 | @Override 196 | public int compareTo(Of other) { 197 | return Arrays.compare(bytes, other.bytes); 198 | } 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/ChatFormatDescriptor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import java.util.Objects; 8 | 9 | /** 10 | * Describes the various chat messaging formats for the purpose of counting tokens 11 | * in chat conversations against different models. 12 | * 13 | * @author Mariusz Bernacki 14 | */ 15 | public interface ChatFormatDescriptor { 16 | 17 | Encoding encoding(); 18 | 19 | int extraTokenCountPerMessage(); 20 | 21 | int extraTokenCountPerRequest(); 22 | 23 | int extraTokenCountForFunctions(); 24 | 25 | int extraTokenCountPerFunctionCall(); 26 | 27 | static ChatFormatDescriptor forModel(String modelName) { 28 | return switch (modelName) { 29 | case "gpt-3.5-turbo" -> forModel("gpt-3.5-turbo-0125"); 30 | case "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k" -> forModel("gpt-4-0613"); 31 | case "gpt-3.5-turbo-0301" -> new Of(Encoding.forModel(modelName), 4, 3, Of.UNSUPPORTED, 3); 32 | case "gpt-4-0314", "gpt-4-32k-0314" -> new Of(Encoding.forModel(modelName), 3, 3, Of.UNSUPPORTED, 3); 33 | case "gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-0125", 34 | "gpt-4-0613", "gpt-4-32k-0613", "gpt-4-1106-preview", "gpt-4-turbo-preview", 35 | "gpt-4o", "gpt-4o-2024-05-13" -> new Of(Encoding.forModel(modelName), 3, 3, -1, 3); 36 | default -> throw new 
IllegalArgumentException(String.format("Model `%s` not found", modelName)); 37 | }; 38 | } 39 | 40 | record Of (Encoding encoding, int extraTokenCountPerMessage, int extraTokenCountPerRequest, int extraTokenCountForFunctions, int extraTokenCountPerFunctionCall) implements ChatFormatDescriptor { 41 | /** The special constant indicating that functions are not supported by the model descriptor. */ 42 | private static final int UNSUPPORTED = Integer.MIN_VALUE; 43 | 44 | public Of { 45 | Objects.requireNonNull(encoding, "encoding"); 46 | } 47 | 48 | @Override 49 | public int extraTokenCountForFunctions() { 50 | if (extraTokenCountForFunctions == UNSUPPORTED) 51 | throw new UnsupportedOperationException("Functions aren't supported by this model"); 52 | 53 | return extraTokenCountForFunctions; 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/CompletionType.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | public enum CompletionType { 8 | TEXT, CHAT 9 | } 10 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/Encoding.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 OpenAI and Tiktoken's contributors 3 | * Copyright (c) 2023 Mariusz Bernacki 4 | * SPDX-License-Identifier: MIT 5 | * SPDX-FileComment: This file is a transpiled version of the code from https://github.com/openai/tiktoken 6 | */ 7 | package com.didalgo.gpt3; 8 | 9 | import java.io.*; 10 | import java.nio.charset.StandardCharsets; 11 | import java.util.*; 12 | import java.util.regex.Pattern; 13 | 14 | 15 | /** 16 | * Represents variants of BPE encoding. 17 | *
<p>
18 | * Modifications:
19 | * <ul>
20 | *   <li>[MB] 2023-03-25: Repackaged from Tiktoken for inclusion in gpt3-tokenizer-java.</li>
21 | *   <li>[MB] 2023-04-02: Major refactoring for cleaner code and improved performance.</li>
22 | * </ul>
23 | */ 24 | public interface Encoding { 25 | 26 | String ENDOFTEXT = "<|endoftext|>"; 27 | String FIM_PREFIX = "<|fim_prefix|>"; 28 | String FIM_MIDDLE = "<|fim_middle|>"; 29 | String FIM_SUFFIX = "<|fim_suffix|>"; 30 | String ENDOFPROMPT = "<|endofprompt|>"; 31 | 32 | Encoding O200K_BASE = new Of( 33 | "o200k_base.tiktoken", new HashMap<>(), 34 | Map.of(ENDOFTEXT, 199999, ENDOFPROMPT, 200018), 35 | Pattern.compile( 36 | "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?" + 37 | "|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?" + 38 | "|\\p{N}{1,3}" + 39 | "| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*" + 40 | "|\\s*[\\r\\n]+" + 41 | "|\\s+(?!\\S)" + 42 | "|\\s+" 43 | , Pattern.UNICODE_CHARACTER_CLASS) 44 | ); 45 | 46 | Encoding CL100K_BASE = new Of( 47 | "cl100k_base.tiktoken", new HashMap<>(), 48 | Map.of(ENDOFTEXT, 100257, FIM_PREFIX, 100258, FIM_MIDDLE, 100259, FIM_SUFFIX, 100260, ENDOFPROMPT, 100276), 49 | Pattern.compile("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", Pattern.UNICODE_CHARACTER_CLASS) 50 | ); 51 | 52 | Encoding P50K_BASE = new Of( 53 | "p50k_base.tiktoken", new HashMap<>(), 54 | Map.of(ENDOFTEXT, 50256), 55 | Pattern.compile("'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", Pattern.UNICODE_CHARACTER_CLASS) 56 | ); 57 | 58 | Encoding P50K_EDIT = new Of( 59 | "p50k_base.tiktoken", new HashMap<>(), 60 | Map.of(ENDOFTEXT, 50256, FIM_PREFIX, 50281, FIM_MIDDLE, 50282, FIM_SUFFIX, 50283), 61 | Pattern.compile("'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", Pattern.UNICODE_CHARACTER_CLASS) 62 | ); 63 | 64 | Encoding R50K_BASE = new Of( 65 | "r50k_base.tiktoken", new HashMap<>(), 66 | Map.of(ENDOFTEXT, 50256), 67 | Pattern.compile("'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| 
?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", Pattern.UNICODE_CHARACTER_CLASS) 68 | ); 69 | 70 | Map mergeableRanks(); 71 | 72 | Map specialTokens(); 73 | 74 | Pattern pattern(); 75 | 76 | record Of( 77 | String tiktokenFilename, 78 | Map mergeableRanks, 79 | Map specialTokens, 80 | Pattern pattern 81 | ) implements Encoding { 82 | public Of { 83 | specialTokens = Collections.unmodifiableMap(new HashMap<>(specialTokens)); // only wrapped HashMap is efficient enough; Map.copyOf() has performance issues 84 | } 85 | 86 | @Override 87 | public Map mergeableRanks() { 88 | if (mergeableRanks.isEmpty()) { 89 | synchronized (mergeableRanks) { 90 | if (mergeableRanks.isEmpty()) 91 | Lookup.loadTiktokenBase(tiktokenFilename, mergeableRanks); 92 | } 93 | } 94 | return Collections.unmodifiableMap(this.mergeableRanks); 95 | } 96 | } 97 | 98 | static Encoding forName(String encodingName) { 99 | return switch (encodingName.toLowerCase()) { 100 | case "o200k_base" -> O200K_BASE; 101 | case "cl100k_base" -> CL100K_BASE; 102 | case "p50k_base" -> P50K_BASE; 103 | case "p50k_edit" -> P50K_EDIT; 104 | case "r50k_base" -> R50K_BASE; 105 | default -> throw new IllegalArgumentException("Unknown encoding: " + encodingName); 106 | }; 107 | } 108 | 109 | static Encoding forModel(String modelName) { 110 | String encodingName = Lookup.modelToEncoding.get(modelName); 111 | if (encodingName == null) { 112 | encodingName = Lookup.modelPrefixToEncoding.keySet().stream() 113 | .filter(modelName::startsWith) 114 | .findFirst() 115 | .map(Lookup.modelPrefixToEncoding::get) 116 | .orElseThrow(() -> new IllegalArgumentException("Unknown model name: " + modelName)); 117 | } 118 | return forName(encodingName); 119 | } 120 | 121 | final class Lookup { 122 | private static final Map modelPrefixToEncoding; 123 | private static final Map modelToEncoding; 124 | static { 125 | var mp2e = new HashMap(); 126 | mp2e.put("gpt-4o-", "o200k_base"); 127 | mp2e.put("gpt-4-", "cl100k_base"); 128 | 
mp2e.put("gpt-3.5-turbo-", "cl100k_base"); 129 | modelPrefixToEncoding = mp2e; 130 | 131 | var m2e = new HashMap(); 132 | m2e.put("gpt-4o", "o200k_base"); 133 | m2e.put("gpt-4", "cl100k_base"); 134 | m2e.put("gpt-3.5-turbo", "cl100k_base"); 135 | m2e.put("text-davinci-003", "p50k_base"); 136 | m2e.put("text-davinci-002", "p50k_base"); 137 | m2e.put("text-davinci-001", "r50k_base"); 138 | m2e.put("text-curie-001", "r50k_base"); 139 | m2e.put("text-babbage-001", "r50k_base"); 140 | m2e.put("text-ada-001", "r50k_base"); 141 | m2e.put("davinci", "r50k_base"); 142 | m2e.put("curie", "r50k_base"); 143 | m2e.put("babbage", "r50k_base"); 144 | m2e.put("ada", "r50k_base"); 145 | m2e.put("code-davinci-002", "p50k_base"); 146 | m2e.put("code-davinci-001", "p50k_base"); 147 | m2e.put("code-cushman-002", "p50k_base"); 148 | m2e.put("code-cushman-001", "p50k_base"); 149 | m2e.put("davinci-codex", "p50k_base"); 150 | m2e.put("cushman-codex", "p50k_base"); 151 | m2e.put("text-davinci-edit-001", "p50k_edit"); 152 | m2e.put("code-davinci-edit-001", "p50k_edit"); 153 | m2e.put("text-embedding-ada-002", "cl100k_base"); 154 | m2e.put("text-similarity-davinci-001", "r50k_base"); 155 | m2e.put("text-similarity-curie-001", "r50k_base"); 156 | m2e.put("text-similarity-babbage-001", "r50k_base"); 157 | m2e.put("text-similarity-ada-001", "r50k_base"); 158 | m2e.put("text-search-davinci-doc-001", "r50k_base"); 159 | m2e.put("text-search-curie-doc-001", "r50k_base"); 160 | m2e.put("text-search-babbage-doc-001", "r50k_base"); 161 | m2e.put("text-search-ada-doc-001", "r50k_base"); 162 | m2e.put("code-search-babbage-code-001", "r50k_base"); 163 | m2e.put("code-search-ada-code-001", "r50k_base"); 164 | modelToEncoding = m2e; 165 | } 166 | 167 | public static Map loadTiktokenBase(String filename, Map resultMap) { 168 | try (InputStream in = Lookup.class.getResourceAsStream(filename)) { 169 | var result = (resultMap == null)? 
new HashMap() : resultMap; 170 | new BufferedReader(new InputStreamReader(in, StandardCharsets.US_ASCII)).lines() 171 | .filter(line -> !line.isEmpty()) 172 | .forEach(line -> { 173 | int spaceIdx = line.indexOf(' '); 174 | if (spaceIdx > 0) { 175 | ByteSequence key = ByteSequence.of(Base64.getDecoder().decode(line.substring(0, spaceIdx))); 176 | int value = Integer.parseInt(line.substring(spaceIdx + 1)); 177 | result.put(key, value); 178 | } 179 | }); 180 | return result; 181 | 182 | } catch (IOException e) { 183 | throw new UncheckedIOException(e); 184 | } 185 | } 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/EncodingType.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | /** 8 | * Represents various encoding types used by the OpenAI GPT models. 9 | *
<p>
10 | * Each encoding type is associated with a unique name, accessible through the {@link #encodingName()} method. 11 | * 12 | */ 13 | public enum EncodingType { 14 | O200K_BASE("o200k_base"), 15 | CL100K_BASE("cl100k_base"), 16 | R50K_BASE("r50k_base"), 17 | P50K_BASE("p50k_base"), 18 | P50K_EDIT("p50k_edit"); 19 | 20 | private final String encodingName; 21 | 22 | EncodingType(String encodingName) { 23 | this.encodingName = encodingName; 24 | } 25 | 26 | public String encodingName() { 27 | return encodingName; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/GPT3Tokenizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 OpenAI and Tiktoken's contributors 3 | * Copyright (c) 2023 Mariusz Bernacki 4 | * SPDX-License-Identifier: MIT 5 | * SPDX-FileComment: This file is a transpiled version of the code from https://github.com/openai/tiktoken 6 | */ 7 | package com.didalgo.gpt3; 8 | 9 | import java.io.ByteArrayOutputStream; 10 | import java.util.*; 11 | import java.util.Map.Entry; 12 | import java.util.regex.Pattern; 13 | import java.util.regex.Matcher; 14 | import java.util.stream.Collectors; 15 | 16 | import static java.nio.charset.StandardCharsets.ISO_8859_1; 17 | import static java.nio.charset.StandardCharsets.UTF_8; 18 | import static java.util.stream.Collectors.toMap; 19 | 20 | /** 21 | * Java implementation of the GPT3/4 tokenizer. 22 | *
<p>
23 | * Modifications:
24 | * <ul>
25 | *   <li>[MB] 2023-03-25: Repackaged from Tiktoken for inclusion in gpt3-tokenizer-java.</li>
26 | *   <li>[MB] 2023-04-02: Major refactoring for cleaner code and improved performance.</li>
27 | * </ul>
28 | */ 29 | public class GPT3Tokenizer { 30 | private final Map encoder; 31 | private final Map decoder; 32 | private final Map specialTokensEncoder; 33 | private final Map specialTokensDecoder; 34 | private final Pattern pattern; 35 | private final Pattern specialPattern; 36 | 37 | public GPT3Tokenizer(Encoding encoding) { 38 | this.encoder = encoding.mergeableRanks(); 39 | this.decoder = encoder.entrySet().stream() 40 | .collect(toMap(Entry::getValue, Entry::getKey)); 41 | this.specialTokensEncoder = encoding.specialTokens(); 42 | this.specialTokensDecoder = specialTokensEncoder.entrySet().stream() 43 | .collect(toMap(Entry::getValue, Entry::getKey)); 44 | this.pattern = encoding.pattern(); 45 | this.specialPattern = createSpecialRegex(encoding.specialTokens()); 46 | } 47 | 48 | protected Pattern createSpecialRegex(Map specialTokensEncoder) { 49 | String joinedPattern = specialTokensEncoder.keySet().stream() 50 | .map(Pattern::quote) 51 | .collect(Collectors.joining("|")); 52 | return Pattern.compile(joinedPattern); 53 | } 54 | 55 | public String decode(List tokens) { 56 | return decodeImpl(tokens); 57 | } 58 | 59 | protected String decodeImpl(List tokens) { 60 | ByteArrayOutputStream result = new ByteArrayOutputStream(); 61 | 62 | for (Integer token : tokens) { 63 | ByteSequence bytes = decoder.get(token); 64 | if (bytes != null) 65 | result.writeBytes(bytes.toByteArray()); 66 | else 67 | result.writeBytes(specialTokensDecoder.get(token).getBytes(ISO_8859_1)); 68 | } 69 | return result.toString(UTF_8); 70 | } 71 | 72 | /** 73 | * Returns the regular expression for detecting special tokens 74 | * 75 | * @return the special tokenizing pattern 76 | */ 77 | protected Pattern getTlSpecialRegex() { 78 | return specialPattern; 79 | } 80 | 81 | /** 82 | * Returns the regular expression for tokenizing text 83 | * 84 | * @return the tokenizing pattern 85 | */ 86 | protected Pattern getTlRegex() { 87 | return pattern; 88 | } 89 | 90 | public List encode(CharSequence text) 
{ 91 | return encode(text, false); 92 | } 93 | 94 | public List encode(CharSequence text, boolean allowedSpecial) { 95 | return encode(text, allowedSpecial? specialTokensEncoder.keySet() : Set.of()); 96 | } 97 | 98 | public List encode(CharSequence text, Set allowedSpecial) { 99 | return encodeImpl(text, allowedSpecial); 100 | } 101 | 102 | protected List encodeImpl(CharSequence text, Set allowedSpecial) { 103 | Pattern specialRegex = getTlSpecialRegex(); 104 | Pattern regex = getTlRegex(); 105 | List ret = new ArrayList<>(text.length() / 4); 106 | 107 | int start = 0; 108 | int lastPieceTokenLen = 0; 109 | while (true) { 110 | Matcher nextSpecial; 111 | int startFind = start; 112 | while (true) { 113 | // Find the next allowed special token, if any 114 | nextSpecial = specialRegex.matcher(text.subSequence(startFind, text.length())); 115 | if (nextSpecial.find()) { 116 | int startMatch = startFind + nextSpecial.start(); 117 | if (allowedSpecial.contains(text.subSequence(startMatch, startMatch + nextSpecial.group().length()).toString())) { 118 | break; 119 | } 120 | startFind = startMatch + 1; 121 | } else { 122 | nextSpecial = null; 123 | break; 124 | } 125 | } 126 | int end = (nextSpecial != null)? 
(start + nextSpecial.start()) : text.length(); 127 | 128 | // Tokenize the text using the regular expression 129 | Matcher matcher = regex.matcher(text.subSequence(start, end)); 130 | while (matcher.find()) { 131 | ByteSequence piece = ByteSequence.from(matcher.group()); 132 | Integer token = encoder.get(piece); 133 | if (token != null) { 134 | lastPieceTokenLen = 1; 135 | ret.add(token); 136 | } else { 137 | lastPieceTokenLen = bytePairMerge(piece, ret); 138 | } 139 | } 140 | 141 | // Add the special token if one was found 142 | if (nextSpecial != null) { 143 | String piece = nextSpecial.group(); 144 | Integer token = specialTokensEncoder.get(piece); 145 | ret.add(token); 146 | start += nextSpecial.end(); 147 | lastPieceTokenLen = 0; 148 | } else { 149 | break; 150 | } 151 | } 152 | 153 | // lastPieceTokenLen is how many tokens came from the last regex split. This is used 154 | // for determining unstable tokens, since you can't merge across (stable) regex splits 155 | return ret; 156 | } 157 | 158 | private static class IntPair { 159 | // Simple data structure for representing a pair of indices into a byte sequence 160 | int start, end; 161 | IntPair(int start, int end) { 162 | this.start = start; 163 | this.end = end; 164 | } 165 | } 166 | 167 | protected int getRank(ByteSequence piece, List partsList, int startIdx) { 168 | if (startIdx + 2 < partsList.size()) { 169 | ByteSequence bytes = piece.subSequence(partsList.get(startIdx).start, partsList.get(startIdx + 2).start); 170 | Integer rank = encoder.get(bytes); 171 | return (rank != null)? 
rank : Integer.MAX_VALUE; 172 | } else { 173 | return Integer.MAX_VALUE; 174 | } 175 | }; 176 | 177 | protected int bytePairMerge(ByteSequence piece, Collection result) { 178 | List parts = new ArrayList<>(piece.length() + 1); 179 | for (int i = 0; i <= piece.length(); i++) { 180 | parts.add(new IntPair(i, Integer.MAX_VALUE)); 181 | } 182 | 183 | for (int i = 0; i < parts.size() - 2; i++) { 184 | int rank = getRank(piece, parts, i); 185 | if (rank != Integer.MAX_VALUE) { 186 | parts.get(i).end = rank; 187 | } 188 | } 189 | 190 | while (parts.size() > 1) { 191 | int minRank = Integer.MAX_VALUE; 192 | int minIndex = -1; 193 | for (int i = 0; i < parts.size() - 1; i++) { 194 | int rank = parts.get(i).end; 195 | if (rank < minRank) { 196 | minRank = rank; 197 | minIndex = i; 198 | } 199 | } 200 | if (minRank == Integer.MAX_VALUE) { 201 | break; 202 | } 203 | parts.remove(minIndex + 1); 204 | parts.get(minIndex).end = getRank(piece, parts, minIndex); 205 | if (minIndex > 0) { 206 | parts.get(minIndex - 1).end = getRank(piece, parts, minIndex - 1); 207 | } 208 | } 209 | 210 | int resultCount = 0; 211 | for (int i = 0; i < parts.size() - 1; i++) { 212 | IntPair range = new IntPair(parts.get(i).start, parts.get(i + 1).start); 213 | result.add(encoder.get(piece.subSequence(range.start, range.end))); 214 | resultCount++; 215 | } 216 | 217 | return resultCount; 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/ModelType.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import java.lang.ref.SoftReference; 8 | import java.util.Collections; 9 | import java.util.EnumMap; 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | import java.util.Optional; 13 | 14 | /** 15 | * ModelType represents a list of available OpenAI GPT 
models, also providing information about 16 | * their maximum token size and encoding types. 17 | * 18 | * @author Mariusz Bernacki 19 | */ 20 | public enum ModelType { 21 | // chat 22 | GPT_4_O("gpt-4o", EncodingType.O200K_BASE, 128000, CompletionType.CHAT), 23 | GPT_4_TURBO("gpt-4-turbo-preview", EncodingType.CL100K_BASE, 128000, CompletionType.CHAT), 24 | GPT_4("gpt-4", EncodingType.CL100K_BASE, 8192, CompletionType.CHAT), 25 | GPT_4_32K("gpt-4-32k", EncodingType.CL100K_BASE, 32768, CompletionType.CHAT), 26 | GPT_3_5_TURBO("gpt-3.5-turbo", EncodingType.CL100K_BASE, 16384, CompletionType.CHAT), 27 | GPT_3_5_TURBO_LEGACY("gpt-3.5-turbo", EncodingType.CL100K_BASE, 4096, CompletionType.CHAT), 28 | GPT_3_5_TURBO_16K("gpt-3.5-turbo-16k", EncodingType.CL100K_BASE, 16384, CompletionType.CHAT), 29 | 30 | // text 31 | GPT_3_5_TURBO_INSTRUCT("gpt-3.5-turbo-instruct", EncodingType.CL100K_BASE, 4097, CompletionType.TEXT), 32 | TEXT_DAVINCI_003("text-davinci-003", EncodingType.P50K_BASE, 4097, CompletionType.TEXT), 33 | TEXT_DAVINCI_002("text-davinci-002", EncodingType.P50K_BASE, 4097, CompletionType.TEXT), 34 | TEXT_DAVINCI_001("text-davinci-001", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 35 | TEXT_CURIE_001("text-curie-001", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 36 | TEXT_BABBAGE_001("text-babbage-001", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 37 | TEXT_ADA_001("text-ada-001", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 38 | DAVINCI("davinci", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 39 | CURIE("curie", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 40 | BABBAGE("babbage", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 41 | ADA("ada", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 42 | 43 | // code 44 | CODE_DAVINCI_002("code-davinci-002", EncodingType.P50K_BASE, 8001, CompletionType.TEXT), 45 | 46 | // edit 47 | TEXT_DAVINCI_EDIT_001("text-davinci-edit-001", EncodingType.P50K_EDIT, 2049, 
CompletionType.TEXT), 48 | CODE_DAVINCI_EDIT_001("code-davinci-edit-001", EncodingType.P50K_EDIT, 2049, CompletionType.TEXT), 49 | 50 | // embeddings 51 | TEXT_EMBEDDING_ADA_002("text-embedding-ada-002", EncodingType.CL100K_BASE, 8192, CompletionType.TEXT); 52 | 53 | 54 | private final String modelName; 55 | private final EncodingType encodingType; 56 | private final int maxTokens; 57 | private final CompletionType completionType; 58 | 59 | ModelType(String modelName, EncodingType encodingType, int maxTokens, CompletionType completionType) { 60 | this.modelName = modelName; 61 | this.encodingType = encodingType; 62 | this.maxTokens = maxTokens; 63 | this.completionType = completionType; 64 | } 65 | 66 | public String modelName() { 67 | return modelName; 68 | } 69 | 70 | public EncodingType encodingType() { 71 | return encodingType; 72 | } 73 | 74 | public int maxTokens() { 75 | return maxTokens; 76 | } 77 | 78 | public CompletionType completionType() { 79 | return completionType; 80 | } 81 | 82 | /** 83 | * Returns a {@link ModelType} for the given modelName, or throw exception if no 84 | * such model type exists. 85 | * 86 | * @param modelName the modelName of the model type 87 | * @return the model type 88 | * @throws IllegalArgumentException if the model with the given name doesn't exist 89 | */ 90 | public static Optional forModel(String modelName) throws IllegalArgumentException { 91 | Optional modelType = forModelExact(modelName); 92 | if (modelType.isPresent()) { 93 | return modelType; 94 | } 95 | 96 | // Truncate model version information 97 | boolean shortMatch; 98 | if ((shortMatch = modelName.matches(".*-\\d{4}$")) || modelName.matches(".*-\\d{4}-\\d{2}-\\d{2}$")) { 99 | modelName = shortMatch ? 
modelName.substring(0, modelName.length() - 5) 100 | : modelName.substring(0, modelName.length() - 11); 101 | 102 | modelType = forModelExact(modelName); 103 | if (modelType.isPresent()) { 104 | return modelType; 105 | } 106 | } 107 | throw new IllegalArgumentException("Model `" + modelName + "` not found"); 108 | } 109 | 110 | private static Optional forModelExact(String modelName) { 111 | if (specialVariants.containsKey(modelName)) { 112 | return Optional.of(specialVariants.get(modelName)); 113 | } 114 | 115 | for (final ModelType modelType : values()) { 116 | if (modelType.modelName().equals(modelName)) { 117 | return Optional.of(modelType); 118 | } 119 | } 120 | return Optional.empty(); 121 | } 122 | 123 | private static final class Cache { 124 | 125 | private static final Map> gptTokenizersCache = Collections.synchronizedMap(new EnumMap<>(ModelType.class)); 126 | 127 | private static GPT3Tokenizer getTokenizer(ModelType model) { 128 | GPT3Tokenizer tokenizer; 129 | SoftReference ref = Cache.gptTokenizersCache.get(model); 130 | if (ref == null || (tokenizer = ref.get()) == null) { 131 | synchronized (gptTokenizersCache) { 132 | Cache.gptTokenizersCache.put(model, new SoftReference<>(tokenizer = new GPT3Tokenizer(model.getEncoding()))); 133 | } 134 | } 135 | 136 | return tokenizer; 137 | } 138 | } 139 | 140 | public Encoding getEncoding() { 141 | return Encoding.forName(encodingType().encodingName()); 142 | } 143 | 144 | public GPT3Tokenizer getTokenizer() { 145 | return Cache.getTokenizer(this); 146 | } 147 | 148 | /** 149 | * Returns the {@code ChatFormatDescriptor} for this model, which can be used together with 150 | * {@link TokenCount} to count prompt tokens in conversation messages. 151 | *
<p>
152 | * Please NOTE that this model bag doesn't distinguish between model variants 153 | * (e.g. -0314, -0613, etc.), thus for models gpt-3.5-*-0301 or older the returned descriptor 154 | * may be imprecise. If you need precise descriptor for old gpt-3.5-turbo model please use 155 | * {@link ChatFormatDescriptor#forModel(String)} method instead. 156 | * 157 | * @return the {@code ChatFormatDescriptor} 158 | */ 159 | public ChatFormatDescriptor getChatFormatDescriptor() { 160 | return ChatFormatDescriptor.forModel(modelName()); 161 | } 162 | 163 | private static Map specialVariants = new HashMap<>(); 164 | static { 165 | specialVariants.put("gpt-3.5-turbo-0301", GPT_3_5_TURBO_LEGACY); 166 | specialVariants.put("gpt-3.5-turbo-0613", GPT_3_5_TURBO_LEGACY); 167 | 168 | specialVariants.put("gpt-4-turbo-preview", GPT_4_TURBO); 169 | specialVariants.put("gpt-4-1106-preview", GPT_4_TURBO); 170 | specialVariants.put("gpt-4-0125-preview", GPT_4_TURBO); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/TokenCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import com.theokanning.openai.completion.chat.ChatFunction; 8 | import com.theokanning.openai.completion.chat.ChatMessage; 9 | 10 | import java.util.List; 11 | import java.util.function.Function; 12 | import java.util.stream.StreamSupport; 13 | 14 | /** 15 | * Utility class for calculating token count in text and chat messages. 16 | *
<p>
17 | * This class provides methods for counting tokens in text strings and lists of 18 | * {@link ChatMessage} objects using a {@link GPT3Tokenizer}. It also supports pluggable 19 | * {@link TokenCountSupport} implementations, allowing customization of token counting logic.
<p>
20 | * 21 | * @author Mariusz Bernacki 22 | * 23 | */ 24 | public class TokenCount { 25 | 26 | /** 27 | * Calculates the total token count from a list of lines using the given tokenizer, 28 | * including newline tokens between lines. 29 | * 30 | * @param lines an iterable of lines of text 31 | * @param tokenizer the tokenizer to use for token counting 32 | * @return the total token count, including newline tokens between lines 33 | */ 34 | public static int fromLinesJoined(Iterable lines, GPT3Tokenizer tokenizer) { 35 | int tokenCount = StreamSupport.stream(lines.spliterator(), false) 36 | .mapToInt(line -> fromString(line, tokenizer) + 1) 37 | .sum(); 38 | return Math.max(0, tokenCount - 1); // subtract 1 token for the last newline character 39 | } 40 | 41 | /** 42 | * Calculates the token count for a given text string using the provided tokenizer. 43 | * 44 | * @param text the text string to tokenize 45 | * @param tokenizer the tokenizer to use for token counting 46 | * @return the token count for the input text 47 | */ 48 | public static int fromString(String text, GPT3Tokenizer tokenizer) { 49 | return getSupport().countTokensFromString(text, tokenizer); 50 | } 51 | 52 | /** 53 | * Calculates the token count for a list of chat messages using the provided tokenizer 54 | * and chat format descriptor. 55 | * 56 | * @param messages a list of chat messages 57 | * @param model the model providing the tokenizer and chat format 58 | * @return the token count for the input chat messages 59 | */ 60 | public static int fromMessages(List messages, ModelType model) { 61 | return fromMessages(messages, List.of(), model); 62 | } 63 | 64 | /** 65 | * Calculates the token count for a list of chat messages using the provided tokenizer 66 | * and chat format descriptor.
67 | * 68 | * @param messages a list of chat messages 69 | * @param functions a list of chat functions 70 | * @param model the model 71 | * @return the token count for the input chat messages 72 | */ 73 | public static int fromMessages(List messages, List functions, ModelType model) { 74 | return fromMessages(messages, functions, model.getTokenizer(), model.getChatFormatDescriptor()); 75 | } 76 | 77 | /** 78 | * Counts number of prompt tokens in messages. 79 | */ 80 | public static int fromMessages(List messages, GPT3Tokenizer tokenizer, ChatFormatDescriptor chatFormat) { 81 | return fromMessages(messages, List.of(), tokenizer, chatFormat); 82 | } 83 | 84 | /** 85 | * Calculates the token count for a list of chat messages using the provided tokenizer 86 | * and chat format descriptor. 87 | * 88 | * @param messages a list of chat messages 89 | * @param functions a list of chat functions 90 | * @param tokenizer the tokenizer to use for token counting 91 | * @param chatFormat the descriptor defining the chat format 92 | * @return the token count for the input chat messages 93 | */ 94 | public static int fromMessages(List messages, List functions, GPT3Tokenizer tokenizer, ChatFormatDescriptor chatFormat) { 95 | return fromMessages(messages, TokenizableMessage.from( 96 | ChatMessage::getRole, 97 | ChatMessage::getContent, 98 | ChatMessage::getName, 99 | chatMessage -> (chatMessage.getFunctionCall() == null)? TokenizableFunctionCall.NONE 100 | : TokenizableFunctionCall.of(chatMessage.getFunctionCall().getName(), chatMessage.getFunctionCall().getArguments().toString()) 101 | ), functions, TokenizableFunction.from( 102 | ChatFunction::getName, 103 | ChatFunction::getDescription, 104 | chatFunction -> getSupport().generateJsonSchema(chatFunction.getParametersClass()) 105 | ), chatFormat, tokenizer); 106 | } 107 | 108 | /** 109 | * Counts number of prompt tokens in messages. 
110 | */ 111 | public static int fromMessages( 112 | List messages, 113 | List tools, 114 | ChatFormatDescriptor chatFormat, 115 | GPT3Tokenizer tokenizer) { 116 | 117 | return fromMessages(messages, Function.identity(), tools, Function.identity(), chatFormat, tokenizer); 118 | } 119 | 120 | /** 121 | * Counts number of prompt tokens in messages. 122 | */ 123 | public static int fromMessages( 124 | List messages, 125 | Function messageCoercer, 126 | List tools, 127 | Function toolCoercer, 128 | ChatFormatDescriptor chatFormat, 129 | GPT3Tokenizer tokenizer) { 130 | 131 | return getSupport().countTokensFromMessages(messages, messageCoercer, tools, toolCoercer, tokenizer, chatFormat); 132 | } 133 | 134 | /** 135 | * Returns the tokenization support object. 136 | * 137 | * @return the instance of {@link TokenCountSupport} 138 | */ 139 | private static TokenCountSupport getSupport() { 140 | return TokenCountSupport.getSupport(); 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/TokenCountSupport.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import com.fasterxml.jackson.databind.JsonNode; 8 | import com.fasterxml.jackson.databind.ObjectMapper; 9 | import com.github.victools.jsonschema.generator.*; 10 | import com.github.victools.jsonschema.module.jackson.JacksonModule; 11 | import com.github.victools.jsonschema.module.jackson.JacksonOption; 12 | 13 | import javax.json.Json; 14 | import javax.json.JsonObject; 15 | import javax.json.JsonString; 16 | import javax.json.JsonValue; 17 | import java.io.StringReader; 18 | import java.util.Comparator; 19 | import java.util.List; 20 | import java.util.Map; 21 | import java.util.ServiceLoader; 22 | import java.util.function.Function; 23 | 24 | import static 
java.util.stream.Collectors.groupingBy; 25 | import static javax.json.JsonValue.EMPTY_JSON_ARRAY; 26 | import static javax.json.JsonValue.EMPTY_JSON_OBJECT; 27 | import static javax.json.JsonValue.ValueType.STRING; 28 | 29 | /** 30 | * Supports the pluggable token counting logic. 31 | */ 32 | public class TokenCountSupport { 33 | 34 | private static final FunctionDocumenter standardDocumenter = new StandardFunctionDocumenter(); 35 | 36 | public int countTokensFromString(String text, GPT3Tokenizer tokenizer) { 37 | return tokenizer.encode(text).size(); 38 | } 39 | 40 | public int countTokensFromMessages( 41 | List messages, 42 | Function messageCoercer, 43 | List tools, 44 | Function toolCoercer, 45 | GPT3Tokenizer tokenizer, 46 | ChatFormatDescriptor chatFormat) 47 | { 48 | var toolsPrompt = ""; 49 | if (!tools.isEmpty()) { 50 | var tokenizable = tools.stream() 51 | .map(toolCoercer) 52 | .toList(); 53 | toolsPrompt = generateDocumentation(tokenizable); 54 | } 55 | 56 | int tokenCount = 0; 57 | for (int index = 0; index < messages.size(); index++) { 58 | var tokenizable = messageCoercer.apply(messages.get(index)); 59 | tokenCount += chatFormat.extraTokenCountPerMessage(); 60 | 61 | var role = tokenizable.role(); 62 | if (role != null && !role.isEmpty()) 63 | tokenCount += tokenizer.encode(role).size(); 64 | 65 | var content = tokenizable.content(); 66 | if (content != null && role != null && index == 0 && "system".equals(role.toString())) { 67 | content += "\n\n" + toolsPrompt; 68 | toolsPrompt = ""; 69 | } 70 | if (content != null) 71 | tokenCount += tokenizer.encode(content).size(); 72 | 73 | var functionCall = tokenizable.functionCall(); 74 | if (functionCall.isPresent()) { 75 | tokenCount += tokenizer.encode(functionCall.name()).size(); 76 | tokenCount += tokenizer.encode(functionCall.arguments()).size(); 77 | tokenCount += chatFormat.extraTokenCountPerFunctionCall(); 78 | } 79 | } 80 | tokenCount += chatFormat.extraTokenCountPerRequest(); // Every reply is 
primed with assistant\n 81 | 82 | if (!tools.isEmpty()) { 83 | if (!toolsPrompt.isEmpty()) { 84 | tokenCount += chatFormat.extraTokenCountPerMessage(); 85 | tokenCount += tokenizer.encode("system").size(); 86 | tokenCount += tokenizer.encode(toolsPrompt).size(); 87 | } 88 | tokenCount += chatFormat.extraTokenCountForFunctions(); 89 | } 90 | 91 | return tokenCount; 92 | } 93 | 94 | public JsonObject generateJsonSchema(Class valueType) { 95 | JsonNode schemaNode = JsonSchemaUtils.generateSchema(valueType); 96 | return Json.createReader(new StringReader(schemaNode.toString())).readObject(); 97 | } 98 | 99 | public static TokenCountSupport getSupport() { 100 | return LazyHolder.INSTANCE; 101 | } 102 | 103 | private static final class LazyHolder { 104 | private static final TokenCountSupport INSTANCE = ServiceLoader.load(TokenCountSupport.class) 105 | .findFirst().orElseGet(TokenCountSupport::new); 106 | } 107 | 108 | private static final class JsonSchemaUtils { 109 | private static final Comparator> DECLARATION_ORDER = (__, ___) -> 0; 110 | private static final ObjectMapper mapper = new ObjectMapper(); 111 | private static final SchemaGenerator generator; 112 | static { 113 | SchemaGeneratorConfigBuilder configBuilder = new SchemaGeneratorConfigBuilder(mapper, SchemaVersion.DRAFT_2019_09, OptionPreset.PLAIN_JSON) 114 | .with(new JacksonModule(JacksonOption.RESPECT_JSONPROPERTY_REQUIRED)); 115 | configBuilder.forTypesInGeneral().withPropertySorter(DECLARATION_ORDER); 116 | generator = new SchemaGenerator(configBuilder.build()); 117 | } 118 | 119 | public static JsonNode generateSchema(Class valueType) { 120 | return generator.generateSchema(valueType); 121 | } 122 | } 123 | 124 | public String generateDocumentation(List tools) { 125 | StringBuilder sb = new StringBuilder(); 126 | 127 | sb.append("# Tools\n\n"); 128 | 129 | Map> toolsByCategory = tools.stream() 130 | .collect(groupingBy(TokenizableTool::toolCategory)); 131 | 132 | for (Map.Entry> categoryEntry : 
toolsByCategory.entrySet()) { 133 | sb.append("## ").append(categoryEntry.getKey()).append("\n\n"); 134 | 135 | Map> toolsByNamespace = categoryEntry.getValue().stream() 136 | .collect(groupingBy(TokenizableTool::toolNamespace)); 137 | 138 | for (Map.Entry> namespaceEntry : toolsByNamespace.entrySet()) { 139 | sb.append("namespace ").append(namespaceEntry.getKey()).append(" {\n\n"); 140 | for (TokenizableTool tool : namespaceEntry.getValue()) { 141 | sb.append(tool.generateDocumentation()).append("\n\n"); 142 | } 143 | sb.append("} // namespace ").append(namespaceEntry.getKey()).append("\n\n"); 144 | } 145 | } 146 | 147 | return sb.toString().stripTrailing(); 148 | } 149 | 150 | 151 | public interface FunctionDocumenter { 152 | CharSequence generateDocumentation(TokenizableFunction function); 153 | } 154 | 155 | public FunctionDocumenter getFunctionDocumenter(TokenizableFunction function) { 156 | return standardDocumenter; 157 | } 158 | 159 | private static class StandardFunctionDocumenter implements FunctionDocumenter { 160 | 161 | @Override 162 | public String generateDocumentation(TokenizableFunction function) { 163 | JsonObject params = function.parameters(); 164 | StringBuilder buf = new StringBuilder(); 165 | if (!function.description().isEmpty()) 166 | putDescription(buf, function.description()); 167 | 168 | putName(buf, function.name()); 169 | putParameters(buf, params, ""); 170 | putEnd(buf); 171 | 172 | return buf.toString(); 173 | } 174 | 175 | private static void putDescription(StringBuilder buf, JsonObject schema) { 176 | var description = schema.getString("description", "").strip(); 177 | putDescription(buf, description); 178 | } 179 | 180 | private static void putDescription(StringBuilder buf, String description) { 181 | if (!description.isEmpty()) 182 | description.lines().forEach(line -> buf.append("// ").append(line).append('\n')); 183 | } 184 | 185 | private static void putName(StringBuilder buf, String name) { 186 | buf.append("type ") 187 | 
.append(name) 188 | .append(" = (_: "); 189 | } 190 | 191 | private static void putParameters(StringBuilder buf, Map schema, String indent) { 192 | var properties = schema.getOrDefault("properties", EMPTY_JSON_OBJECT).asJsonObject(); 193 | var required = schema.getOrDefault("required", EMPTY_JSON_ARRAY).asJsonArray(); 194 | var definitions = schema.getOrDefault("definitions", EMPTY_JSON_OBJECT).asJsonObject(); 195 | putProperties( buf, 196 | properties, 197 | required.getValuesAs(JsonString::getString), 198 | definitions, 199 | indent); 200 | } 201 | 202 | private static void putProperties(StringBuilder buf, JsonObject schema, List required, Map definitions, String indent) { 203 | buf.append("{\n"); 204 | schema.forEach((name, value) -> { 205 | var valueDesc = value.asJsonObject(); 206 | if (indent.isEmpty()) 207 | putDescription(buf, valueDesc); 208 | 209 | buf.append(indent); 210 | buf.append(name); 211 | if (!isNested(indent) && !required.contains(name)) 212 | buf.append('?'); 213 | 214 | buf.append(": "); 215 | putParameterType(buf, valueDesc, indent); 216 | buf.append(",\n"); 217 | }); 218 | buf.append("}"); 219 | } 220 | 221 | private static void putParameterType(StringBuilder buf, JsonObject valueDesc, String indent) { 222 | var typeDesc = valueDesc.get("type"); 223 | if (typeDesc == null || typeDesc.getValueType() != STRING) { 224 | buf.append("any"); 225 | return; 226 | } 227 | 228 | if (valueDesc.containsKey("enum")) { 229 | buf.append(String.join(" | ", valueDesc.getJsonArray("enum").getValuesAs(JsonValue::toString))); 230 | return; 231 | } 232 | 233 | if (valueDesc.get("items") instanceof JsonObject arrayDesc && arrayDesc.containsKey("type")) { 234 | putParameterType(buf, arrayDesc, indent); 235 | buf.append("[]"); 236 | return; 237 | } 238 | 239 | var typeName = valueDesc.getString("type", "any"); 240 | switch (typeName) { 241 | case "integer", "number" -> buf.append("number"); 242 | case "boolean", "string" -> buf.append(typeName); 243 | case "object" 
-> putParameters(buf, valueDesc, " "); 244 | default -> buf.append("any"); 245 | } 246 | } 247 | 248 | private static void putEnd(StringBuilder buf) { 249 | buf.append(") => any;"); 250 | } 251 | 252 | private static boolean isNested(String indent) { 253 | return !indent.isEmpty(); 254 | } 255 | } 256 | } 257 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/TokenizableFunction.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import javax.json.JsonObject; 8 | import javax.json.JsonValue; 9 | import java.util.function.Function; 10 | 11 | /** 12 | * The TokenizableFunction interface represents a function that can be tokenized. 13 | *

14 | * A function consists of a name, description, and parameters, which are accessible through 15 | * their respective methods. 16 | *

17 | * The interface also provides methods to generate documentation for the function and to 18 | * create new instances of TokenizableFunction. 19 | * 20 | * @author Mariusz Bernacki 21 | */ 22 | public interface TokenizableFunction extends TokenizableTool { 23 | 24 | /** 25 | * Returns the name of the function. 26 | * 27 | * @return the function name 28 | */ 29 | String name(); 30 | 31 | /** 32 | * Returns the description of the function. 33 | * 34 | * @return the function description 35 | */ 36 | String description(); 37 | 38 | /** 39 | * Returns the parameters of the function as a JsonObject. 40 | * 41 | * @return the function parameters 42 | */ 43 | JsonObject parameters(); 44 | 45 | /** 46 | * Generates a documentation for the function. The generated documentation 47 | * serves as a basis for counting tokens used by the function definition 48 | * when passed in chat conversation. 49 | * 50 | * @return the function documentation 51 | */ 52 | @Override 53 | default CharSequence generateDocumentation() { 54 | return TokenCountSupport.getSupport().getFunctionDocumenter(this).generateDocumentation(this); 55 | } 56 | 57 | /** 58 | * Returns the category of the tool. In this case, it's "functions". 59 | * 60 | * @return the tool category 61 | */ 62 | @Override 63 | default String toolCategory() { 64 | return "functions"; 65 | } 66 | 67 | /** 68 | * Returns the namespace of the tool. In this case, it's "functions". 69 | * 70 | * @return the tool namespace 71 | */ 72 | @Override 73 | default String toolNamespace() { 74 | return "functions"; 75 | } 76 | 77 | /** 78 | * Creates a function that is able to convert any type of object into an instance of 79 | * {@code TokenizableFunction} using the specified relevant property accessors. 
80 | * 81 | * @param nameAccessor the function name accessor 82 | * @param descAccessor the function description accessor 83 | * @param paramsAccessor the function parameters accessor 84 | * @return a {@code TokenizableFunction} coercing function 85 | */ 86 | static Function from( 87 | Function nameAccessor, 88 | Function descAccessor, 89 | Function paramsAccessor 90 | ) { 91 | return function -> of( 92 | nameAccessor.apply(function), 93 | descAccessor.apply(function), 94 | paramsAccessor.apply(function) 95 | ); 96 | } 97 | 98 | /** 99 | * Creates a new instance of {@code TokenizableFunction} for the specified arguments. 100 | * 101 | * @param name the function name 102 | * @param description the function description 103 | * @param parameters the function parameters 104 | * @return a new {@code TokenizableFunction} object 105 | */ 106 | static TokenizableFunction of(String name, String description, JsonObject parameters) { 107 | return new Of(name, description, parameters); 108 | } 109 | 110 | record Of(String name, String description, JsonObject parameters) implements TokenizableFunction { 111 | public Of { 112 | name = firstOrElse(name, ""); 113 | description = firstOrElse(description, ""); 114 | parameters = firstOrElse(parameters, JsonValue.EMPTY_JSON_OBJECT); 115 | } 116 | 117 | private static V firstOrElse(V first, V orElse) { return (first != null) ? first : orElse; } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/TokenizableFunctionCall.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | /** 8 | * The TokenizableFunctionCall interface represents a message from the "assistant" that 9 | * intends to make a function call instead of providing the usual content. 10 | *

11 | * This interface provides access to the name of the function that the model intends to 12 | * call and the arguments for the function. The arguments are provided as a stringified 13 | * JSON object. Note that the JSON returned by the model may be invalid or may not 14 | * adhere to the schema. 15 | *

16 | * The interface also provides a method to check if the function call is present and a 17 | * factory method to create new instances of {@code TokenizableFunctionCall}. 18 | * 19 | * @author Mariusz Bernacki 20 | */ 21 | public interface TokenizableFunctionCall { 22 | 23 | /** The constant representing an absent function call. */ 24 | TokenizableFunctionCall NONE = new Of("", ""); 25 | 26 | /** 27 | * The name of the function that the model decided to call. 28 | * 29 | * @return the function name 30 | */ 31 | CharSequence name(); 32 | 33 | /** 34 | * The arguments for the function. A stringified JSON object (be aware 35 | * that the JSON returned be the model could be invalid or may not adhere to the schema) 36 | * 37 | * @return the stringified JSON function arguments 38 | */ 39 | CharSequence arguments(); 40 | 41 | /** 42 | * Checks if this object represents a non-empty function call. 43 | *

44 | * The default implementation returns the result of calling {@code !name().isEmpty()}. 45 | * 46 | * @return {@code true} if a function call is present, otherwise {@code false} 47 | */ 48 | default boolean isPresent() { 49 | return !name().isEmpty(); 50 | } 51 | 52 | /** 53 | * Creates a new {@code TokenizableFunctionCall} from the specified arguments. 54 | * 55 | * @param name the function name 56 | * @param arguments the function arguments 57 | * @return a new {@code TokenizableFunctionCall} object, or {@link #NONE} if 58 | * the provided {@code name} is empty 59 | */ 60 | static TokenizableFunctionCall of(CharSequence name, CharSequence arguments) { 61 | if (name.isEmpty()) { 62 | return NONE; 63 | } 64 | if (arguments == null) { 65 | arguments = ""; 66 | } 67 | return new Of(name, arguments); 68 | } 69 | 70 | record Of(CharSequence name, CharSequence arguments) implements TokenizableFunctionCall { } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/TokenizableMessage.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import java.util.function.Function; 8 | 9 | /** 10 | * The TokenizableMessage interface represents a message that can be tokenized or used together 11 | * with the {@link TokenCount} utility for the purpose of token counting. 12 | *

13 | * A message comprises a role, name, content, and a function call, all of which are accessible 14 | * through their respective accessor methods. All methods should always return non-null values, 15 | * with empty {@code CharSequence} for missing content, or {@link TokenizableFunctionCall#NONE} 16 | * for an absent function call. 17 | * 18 | *

Any custom class can be converted into an instance of {@link TokenizableMessage} 19 | * using the {@link #from(Function, Function, Function, Function) factory method}. 20 | *
Example: 21 | *

 22 |  * {@code
 23 |  * TokenizableMessage message = TokenizableMessage.from(
 24 |  *     MyObj::getRole,
 25 |  *     MyObj::getContent,
 26 |  *     MyObj::getName,
 27 |  *     MyObj::getFunctionCall
 28 |  * ).apply(myObj);
 29 |  * }
 30 |  * 
31 | * 32 | * @author Mariusz Bernacki 33 | */ 34 | public interface TokenizableMessage { 35 | 36 | /** 37 | * Returns the role of the message's author. Can be system, user, assistant, or function. 38 | * 39 | * @return the role of the message's author 40 | */ 41 | CharSequence role(); 42 | 43 | /** 44 | * Returns the content of the message. Content is required for all messages, 45 | * except for assistant messages with function calls. 46 | * 47 | * @return the content of the message, or empty {@code CharSequence} if not provided 48 | */ 49 | CharSequence content(); 50 | 51 | /** 52 | * Returns the name of the message's author. Name is required if role is function, 53 | * and it should be the name of the function whose response is in the content. 54 | * 55 | * @return the name of the message's author, or empty {@code CharSequence} if not 56 | * provided 57 | */ 58 | CharSequence name(); 59 | 60 | /** 61 | * Returns the function call that should be made, as generated by the model. 62 | * 63 | * @return the function call, {@link TokenizableFunctionCall#NONE} if absent 64 | */ 65 | TokenizableFunctionCall functionCall(); 66 | 67 | /** 68 | * Static method to create a new tokenizable message, based on the provided accessors. 
69 | * 70 | * @param the type of the message 71 | * @param roleAccessor the role accessor function 72 | * @param nameAccessor the name accessor function 73 | * @param contentAccessor the content accessor function 74 | * @param functionCallMaker the function call maker function 75 | * @return a function that creates a tokenizable message 76 | */ 77 | static Function from( 78 | Function roleAccessor, 79 | Function contentAccessor, 80 | Function nameAccessor, 81 | Function functionCallMaker 82 | ) { 83 | return message -> of( 84 | roleAccessor.apply(message), 85 | contentAccessor.apply(message), 86 | nameAccessor.apply(message), 87 | functionCallMaker.apply(message) 88 | ); 89 | } 90 | 91 | /** 92 | * Constructs a new assistant, system, or user message with the specified content. 93 | * 94 | * @param role the author's role 95 | * @param content the message content 96 | * @return the {@code TokenizableMessage} 97 | */ 98 | static TokenizableMessage of(CharSequence role, CharSequence content) { 99 | return of(role, content, "", TokenizableFunctionCall.NONE); 100 | } 101 | 102 | /** 103 | * Constructs a new assistant function call with the specified arguments. 104 | * 105 | * @param role the author's role 106 | * @param functionCall the function call name and arguments 107 | * @return the {@code TokenizableMessage} 108 | */ 109 | static TokenizableMessage of(CharSequence role, TokenizableFunctionCall functionCall) { 110 | return of(role, "", "", functionCall); 111 | } 112 | 113 | /** 114 | * Constructs a function message, representing a response with the specified arguments. 
115 | * 116 | * @param role the author's role 117 | * @param content the message content 118 | * @param name the author's name 119 | * @return the {@code TokenizableMessage} 120 | */ 121 | static TokenizableMessage of(CharSequence role, CharSequence content, CharSequence name) { 122 | return new Of(role, content, name, TokenizableFunctionCall.NONE); 123 | } 124 | 125 | /** 126 | * Constructs a new {@code TokenizableMessage} from the specified arguments. 127 | * 128 | * @param role the author's role 129 | * @param content the message content 130 | * @param name the author's name 131 | * @param functionCall the function call name and arguments 132 | * @return the {@code TokenizableMessage} 133 | */ 134 | static TokenizableMessage of(CharSequence role, CharSequence content, CharSequence name, TokenizableFunctionCall functionCall) { 135 | return new Of(role, content, name, functionCall); 136 | } 137 | 138 | record Of(CharSequence role, CharSequence content, CharSequence name, TokenizableFunctionCall functionCall) implements TokenizableMessage { 139 | public Of { 140 | role = firstOrElse(role, ""); 141 | content = firstOrElse(content, ""); 142 | name = firstOrElse(name, ""); 143 | functionCall = firstOrElse(functionCall, TokenizableFunctionCall.NONE); 144 | } 145 | 146 | private static V firstOrElse(V first, V orElse) { return (first != null) ? first : orElse; } 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/TokenizableTool.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | /** 8 | * An interface describing an object that provides a tool support for a language model. 9 | *

10 | * Currently only tools supported by OpenAI models are functions. 11 | * 12 | * @author Mariusz Bernacki 13 | */ 14 | public interface TokenizableTool { 15 | 16 | String toolCategory(); 17 | 18 | String toolNamespace(); 19 | 20 | CharSequence generateDocumentation(); 21 | } 22 | -------------------------------------------------------------------------------- /src/test/java/com/didalgo/gpt3/ByteSequenceTest.java: -------------------------------------------------------------------------------- 1 | package com.didalgo.gpt3; 2 | 3 | import org.junit.jupiter.api.BeforeEach; 4 | import org.junit.jupiter.api.Test; 5 | 6 | import static java.nio.charset.StandardCharsets.UTF_8; 7 | import static org.junit.jupiter.api.Assertions.*; 8 | 9 | class ByteSequenceTest { 10 | 11 | private byte[] TEST_SEQUENCE_BYTES; 12 | private ByteSequence TEST_SEQUENCE; 13 | 14 | @BeforeEach 15 | void setUp() { 16 | TEST_SEQUENCE = ByteSequence.of( TEST_SEQUENCE_BYTES = "TEST_SEQUENCE".getBytes() ); 17 | } 18 | 19 | @Test 20 | void byteAt_gives_byte_at_requested_position() { 21 | assertEquals((byte) '1', ByteSequence.from("1").byteAt(0)); 22 | assertEquals((byte) '2', ByteSequence.from("12").byteAt(1)); 23 | assertEquals((byte) '9', ByteSequence.from("123456789").byteAt(8)); 24 | } 25 | 26 | @Test 27 | void length_gives_number_of_bytes_in_sequence() { 28 | assertEquals(0, ByteSequence.EMPTY.length()); 29 | assertEquals(1, ByteSequence.from("1").length()); 30 | assertEquals(9, ByteSequence.from("123456789").length()); 31 | } 32 | 33 | @Test 34 | void subSequence_gives_subsequence_between_given_start_and_end() { 35 | assertEquals(ByteSequence.EMPTY, ByteSequence.from("123456789").subSequence(9, 9)); 36 | assertEquals(ByteSequence.from("1"), ByteSequence.from("123456789").subSequence(0, 1)); 37 | assertEquals(ByteSequence.from("9"), ByteSequence.from("123456789").subSequence(8, 9)); 38 | } 39 | 40 | @Test 41 | void hashCode_gives_identical_hashCode_for_identical_sequences() { 42 | ByteSequence 
aSequence = ByteSequence.from("TEST_SEQUENCE"); 43 | ByteSequence anotherSequence = ByteSequence.from("TEST_SEQUENCE"); 44 | assertEquals(anotherSequence.hashCode(), aSequence.hashCode()); 45 | } 46 | 47 | @Test 48 | void equals_identifies_identical_byte_sequences() { 49 | ByteSequence aSequence = ByteSequence.from("TEST_SEQUENCE"); 50 | ByteSequence anotherSequence = ByteSequence.from("TEST_SEQUENCE"); 51 | assertEquals(anotherSequence, aSequence); 52 | } 53 | 54 | @Test 55 | void toByteArray_produces_correct_byte_array_representation() { 56 | assertArrayEquals(TEST_SEQUENCE_BYTES, TEST_SEQUENCE.toByteArray()); 57 | } 58 | 59 | @Test 60 | void toString_gives_string_representation_using_given_charset() { 61 | String stringRepresentation = "TEST_SEQUENCE"; 62 | ByteSequence aSequence = ByteSequence.from(stringRepresentation); 63 | assertEquals(stringRepresentation, aSequence.toString(UTF_8)); 64 | } 65 | 66 | @Test 67 | void copyOf_creates_distinct_copy_when_not_immutable() { 68 | var copy = ByteSequence.copyOf(TEST_SEQUENCE); 69 | assertEquals(TEST_SEQUENCE, copy); 70 | assertSame(TEST_SEQUENCE, copy); 71 | } 72 | 73 | @Test 74 | void from_converts_string_to_byte_sequence_using_utf8() { 75 | var string = "TEST_SEQUENCE"; 76 | var fromString = ByteSequence.from(string); 77 | assertArrayEquals(string.getBytes(UTF_8), fromString.toByteArray()); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/test/java/com/didalgo/gpt3/GPT3TokenizerTest.java: -------------------------------------------------------------------------------- 1 | package com.didalgo.gpt3; 2 | 3 | import org.junit.jupiter.params.ParameterizedTest; 4 | import org.junit.jupiter.params.converter.ConvertWith; 5 | import org.junit.jupiter.params.provider.CsvSource; 6 | 7 | import java.util.List; 8 | 9 | import static org.junit.jupiter.api.Assertions.*; 10 | 11 | class GPT3TokenizerTest { 12 | 13 | @ParameterizedTest 14 | @CsvSource({ 15 | "gpt-4, 'Stop!', 
'[10903, 0]'", 16 | "gpt-4, 'Stop now.', '[10903, 1457, 13]'", 17 | "gpt-4, 'Stop what you''re doing.', '[10903, 1148, 499, 2351, 3815, 13]'", 18 | "gpt-4, 'Stop what you''re doing right now.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 13]'", 19 | "gpt-4, 'Stop what you''re doing right now and listen.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 13]'", 20 | "gpt-4, 'Stop what you''re doing right now and listen carefully.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 13]'", 21 | "gpt-4, 'Stop what you''re doing right now and listen carefully to me.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 13]'", 22 | "gpt-4, 'Stop what you''re doing right now and listen carefully to me, please.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 11, 4587, 13]'", 23 | "gpt-4, 'Stop what you''re doing right now and listen carefully to me, please, because this is important.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 11, 4587, 11, 1606, 420, 374, 3062, 13]'", 24 | "gpt-4, 'Stop what you''re doing right now and listen carefully to me, please, because this is very important.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 11, 4587, 11, 1606, 420, 374, 1633, 3062, 13]'", 25 | "gpt-4, 'Przestań!', '[3617, 89, 30279, 19699, 0]'", 26 | "gpt-4, 'Przerwij to.', '[3617, 7215, 87183, 311, 13]'", 27 | "gpt-4, 'Przerwij to, co robisz.', '[3617, 7215, 87183, 311, 11, 1080, 10773, 70828, 13]'", 28 | "gpt-4, 'Przerwij to, co teraz robisz.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 13]'", 29 | "gpt-4, 'Przerwij to, co teraz robisz i posłuchaj.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 13]'", 30 | "gpt-4, 'Przerwij to, co teraz robisz i posłuchaj uważnie.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 577, 10196, 6077, 11044, 13]'", 31 | "gpt-4, 
'Przerwij to, co teraz robisz i posłuchaj mnie uważnie.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 577, 10196, 6077, 11044, 13]'", 32 | "gpt-4, 'Przerwij to, co teraz robisz i posłuchaj mnie uważnie, proszę.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 577, 10196, 6077, 11044, 11, 8882, 60705, 13]'", 33 | "gpt-4, 'Przerwij to, co teraz robisz i posłuchaj mnie proszę uważnie, bo to ważne.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 8882, 60705, 577, 10196, 6077, 11044, 11, 712, 311, 10667, 6077, 818, 13]'", 34 | "gpt-4, 'Przerwij to, co teraz robisz i posłuchaj mnie proszę uważnie, bo to bardzo ważne.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 8882, 60705, 577, 10196, 6077, 11044, 11, 712, 311, 57958, 10667, 6077, 818, 13]'", 35 | "gpt-4, 'СТІЙ!', '[19871, 35095, 140, 228, 140, 247, 0]'", 36 | "gpt-4, 'Припини зараз.', '[17279, 31203, 8164, 19479, 1840, 44946, 89554, 13]'", 37 | "gpt-4, 'Припини те, що ти робиш.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 18600, 14082, 1840, 12426, 13]'", 38 | "gpt-4, 'Припини те, що ти робиш зараз.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 18600, 14082, 1840, 12426, 44946, 89554, 13]'", 39 | "gpt-4, 'Припини те, що ти зараз робиш, і послухай.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 61813, 3865, 10693, 19039, 13]'", 40 | "gpt-4, 'Припини те, що ти зараз робиш, і уважно слухай.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 14257, 5591, 38657, 13999, 35875, 3865, 10693, 19039, 13]'", 41 | "gpt-4, 'Припини те, що ти зараз робиш, і вислухай мене 
уважно.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 5927, 13810, 3114, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 13]'", 42 | "gpt-4, 'Припини те, що ти зараз робиш, і вислухай мене уважно, будь ласка.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 5927, 13810, 3114, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 11, 51570, 4929, 26539, 18437, 13433, 13]'", 43 | "gpt-4, 'Припини те, що ти зараз робиш, і вислухай мене уважно, будь ласка, тому що це важливо.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 5927, 13810, 3114, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 11, 51570, 4929, 26539, 18437, 13433, 11, 11047, 72952, 9015, 231, 1482, 39233, 1532, 5927, 38657, 11320, 5591, 1482, 13]'", 44 | "gpt-4, 'Припини те, що ти зараз робиш, і слухай мене уважно, бо це дуже важливо.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 35875, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 11, 14391, 1482, 39233, 1532, 7952, 56999, 1532, 5927, 38657, 11320, 5591, 1482, 13]'", 45 | "gpt-4, 'Σταμάτα!', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 0]'", 46 | "gpt-4, 'Σταμάτα τώρα.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 39570, 139, 236, 39179, 19481, 13]'", 47 | "gpt-4, 'Σταμάτα αυτό που κάνεις.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 13]'", 48 | "gpt-4, 'Σταμάτα αυτό που κάνεις αμέσως τώρα.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 19581, 44223, 80531, 45028, 
57971, 46742, 39570, 139, 236, 39179, 19481, 13]'", 49 | "gpt-4, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 19581, 44223, 80531, 45028, 57971, 46742, 39570, 139, 236, 39179, 19481, 72738, 90002, 8008, 105, 68437, 73986, 13]'", 50 | "gpt-4, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 19581, 44223, 80531, 45028, 57971, 46742, 39570, 139, 236, 39179, 19481, 72738, 90002, 8008, 105, 68437, 73986, 52845, 39179, 28654, 45028, 31243, 68437, 36924, 30862, 68437, 75234, 13]'", 51 | "gpt-4, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 19581, 44223, 80531, 45028, 57971, 46742, 39570, 139, 236, 39179, 19481, 72738, 90002, 8008, 105, 68437, 73986, 52845, 39179, 28654, 45028, 31243, 68437, 36924, 30862, 68437, 75234, 60247, 44223, 80531, 34369, 19481, 13]'", 52 | "gpt-4, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα, παρακαλώ.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 19581, 44223, 80531, 45028, 57971, 46742, 39570, 139, 236, 39179, 19481, 72738, 90002, 8008, 105, 68437, 73986, 52845, 39179, 28654, 45028, 31243, 68437, 36924, 30862, 68437, 75234, 60247, 44223, 80531, 34369, 19481, 11, 52845, 19481, 39179, 19481, 68437, 19481, 34586, 139, 236, 13]'", 53 | "gpt-4, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα, παρακαλώ, γιατί αυτό είναι σημαντικό.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 19581, 44223, 80531, 
45028, 57971, 46742, 39570, 139, 236, 39179, 19481, 72738, 90002, 8008, 105, 68437, 73986, 52845, 39179, 28654, 45028, 31243, 68437, 36924, 30862, 68437, 75234, 60247, 44223, 80531, 34369, 19481, 11, 52845, 19481, 39179, 19481, 68437, 19481, 34586, 139, 236, 11, 63127, 30862, 19481, 36924, 55241, 19581, 54556, 36924, 76295, 60247, 55241, 34369, 90002, 48823, 42524, 44223, 19481, 34369, 36924, 30862, 68437, 76295, 13]'", 54 | "gpt-4, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα, παρακαλώ, γιατί αυτό είναι πολύ σημαντικό.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 19581, 44223, 80531, 45028, 57971, 46742, 39570, 139, 236, 39179, 19481, 72738, 90002, 8008, 105, 68437, 73986, 52845, 39179, 28654, 45028, 31243, 68437, 36924, 30862, 68437, 75234, 60247, 44223, 80531, 34369, 19481, 11, 52845, 19481, 39179, 19481, 68437, 19481, 34586, 139, 236, 11, 63127, 30862, 19481, 36924, 55241, 19581, 54556, 36924, 76295, 60247, 55241, 34369, 90002, 52845, 28654, 34586, 139, 235, 48823, 42524, 44223, 19481, 34369, 36924, 30862, 68437, 76295, 13]'", 55 | "gpt-4, 'class MyClass { public static void main(String[] args) { System.out.println(\"Hello, world!\"); }}', '[1058, 84926, 314, 586, 1118, 742, 1925, 2292, 1318, 2897, 8, 314, 744, 2594, 2986, 446, 9906, 11, 1917, 86640, 3954]'", 56 | "gpt-3.5-turbo, 'Stop!', '[10903, 0]'", 57 | "gpt-3.5-turbo, 'Stop now.', '[10903, 1457, 13]'", 58 | "gpt-3.5-turbo, 'Stop what you''re doing.', '[10903, 1148, 499, 2351, 3815, 13]'", 59 | "gpt-3.5-turbo, 'Stop what you''re doing right now.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 13]'", 60 | "gpt-3.5-turbo, 'Stop what you''re doing right now and listen.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 13]'", 61 | "gpt-3.5-turbo, 'Stop what you''re doing right now and listen carefully.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 13]'", 62 | "gpt-3.5-turbo, 
'Stop what you''re doing right now and listen carefully to me.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 13]'", 63 | "gpt-3.5-turbo, 'Stop what you''re doing right now and listen carefully to me, please.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 11, 4587, 13]'", 64 | "gpt-3.5-turbo, 'Stop what you''re doing right now and listen carefully to me, please, because this is important.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 11, 4587, 11, 1606, 420, 374, 3062, 13]'", 65 | "gpt-3.5-turbo, 'Stop what you''re doing right now and listen carefully to me, please, because this is very important.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 11, 4587, 11, 1606, 420, 374, 1633, 3062, 13]'", 66 | "gpt-3.5-turbo, 'Przestań!', '[3617, 89, 30279, 19699, 0]'", 67 | "gpt-3.5-turbo, 'Przerwij to.', '[3617, 7215, 87183, 311, 13]'", 68 | "gpt-3.5-turbo, 'Przerwij to, co robisz.', '[3617, 7215, 87183, 311, 11, 1080, 10773, 70828, 13]'", 69 | "gpt-3.5-turbo, 'Przerwij to, co teraz robisz.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 13]'", 70 | "gpt-3.5-turbo, 'Przerwij to, co teraz robisz i posłuchaj.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 13]'", 71 | "gpt-3.5-turbo, 'Przerwij to, co teraz robisz i posłuchaj uważnie.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 577, 10196, 6077, 11044, 13]'", 72 | "gpt-3.5-turbo, 'Przerwij to, co teraz robisz i posłuchaj mnie uważnie.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 577, 10196, 6077, 11044, 13]'", 73 | "gpt-3.5-turbo, 'Przerwij to, co teraz robisz i posłuchaj mnie uważnie, proszę.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 577, 10196, 6077, 11044, 11, 8882, 60705, 13]'", 74 | "gpt-3.5-turbo, 
'Przerwij to, co teraz robisz i posłuchaj mnie proszę uważnie, bo to ważne.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 8882, 60705, 577, 10196, 6077, 11044, 11, 712, 311, 10667, 6077, 818, 13]'", 75 | "gpt-3.5-turbo, 'Przerwij to, co teraz robisz i posłuchaj mnie proszę uważnie, bo to bardzo ważne.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 8882, 60705, 577, 10196, 6077, 11044, 11, 712, 311, 57958, 10667, 6077, 818, 13]'", 76 | "gpt-3.5-turbo, 'СТІЙ!', '[19871, 35095, 140, 228, 140, 247, 0]'", 77 | "gpt-3.5-turbo, 'Припини зараз.', '[17279, 31203, 8164, 19479, 1840, 44946, 89554, 13]'", 78 | "gpt-3.5-turbo, 'Припини те, що ти робиш.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 18600, 14082, 1840, 12426, 13]'", 79 | "gpt-3.5-turbo, 'Припини те, що ти робиш зараз.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 18600, 14082, 1840, 12426, 44946, 89554, 13]'", 80 | "gpt-3.5-turbo, 'Припини те, що ти зараз робиш, і послухай.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 61813, 3865, 10693, 19039, 13]'", 81 | "gpt-3.5-turbo, 'Припини те, що ти зараз робиш, і уважно слухай.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 14257, 5591, 38657, 13999, 35875, 3865, 10693, 19039, 13]'", 82 | "gpt-3.5-turbo, 'Припини те, що ти зараз робиш, і вислухай мене уважно.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 5927, 13810, 3114, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 13]'", 83 | "gpt-3.5-turbo, 'Припини те, що ти зараз робиш, і вислухай мене уважно, будь ласка.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 
11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 5927, 13810, 3114, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 11, 51570, 4929, 26539, 18437, 13433, 13]'", 84 | "gpt-3.5-turbo, 'Припини те, що ти зараз робиш, і вислухай мене уважно, будь ласка, тому що це важливо.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 5927, 13810, 3114, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 11, 51570, 4929, 26539, 18437, 13433, 11, 11047, 72952, 9015, 231, 1482, 39233, 1532, 5927, 38657, 11320, 5591, 1482, 13]'", 85 | "gpt-3.5-turbo, 'Припини те, що ти зараз робиш, і слухай мене уважно, бо це дуже важливо.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 35875, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 11, 14391, 1482, 39233, 1532, 7952, 56999, 1532, 5927, 38657, 11320, 5591, 1482, 13]'", 86 | "gpt-3.5-turbo, 'class MyClass { public static void main(String[] args) { System.out.println(\"Hello, world!\"); }}', '[1058, 84926, 314, 586, 1118, 742, 1925, 2292, 1318, 2897, 8, 314, 744, 2594, 2986, 446, 9906, 11, 1917, 86640, 3954]'", 87 | "gpt-3.5-turbo, 'I''m', '[40, 2846]'", 88 | "gpt-3.5-turbo, 'I''m in', '[40, 2846, 304]'", 89 | "gpt-3.5-turbo, 'I''M', '[40, 28703]'", 90 | "gpt-3.5-turbo, 'I''M IN', '[40, 28703, 2006]'", 91 | "gpt-3.5-turbo, 'I''VE', '[40, 6, 4592]'", 92 | "gpt-3.5-turbo, 'I''VE DONE', '[40, 6, 4592, 55785]'", 93 | "gpt-3.5-turbo, 'I''ll', '[40, 3358]'", 94 | "gpt-3.5-turbo, 'I''ll do', '[40, 3358, 656]'", 95 | "gpt-3.5-turbo, 'I''D', '[40, 28805]'", 96 | "gpt-3.5-turbo, 'I''D DO', '[40, 28805, 9503]'", 97 | "gpt-3.5-turbo, 'I''d', '[40, 4265]'", 98 | "gpt-3.5-turbo, 'I''d done', '[40, 4265, 2884]'", 99 | "gpt-3.5-turbo, 'I''M', '[40, 28703]'", 100 | "gpt-3.5-turbo, 'I''M DONE', '[40, 28703, 55785]'", 101 | 
"gpt-3.5-turbo, 'you''re', '[9514, 2351]'", 102 | "gpt-3.5-turbo, 'you''re done', '[9514, 2351, 2884]'", 103 | "gpt-3.5-turbo, 'You''Re', '[2675, 50527]'", 104 | "gpt-3.5-turbo, 'You''Re Done', '[2675, 50527, 28457]'", 105 | "gpt-3.5-turbo, 'YOU''LL', '[57489, 6, 4178]'", 106 | "gpt-3.5-turbo, 'YOU''LL DO', '[57489, 6, 4178, 9503]'", 107 | "gpt-3.5-turbo, 'she''s', '[32158, 596]'", 108 | "gpt-3.5-turbo, 'she''s done', '[32158, 596, 2884]'", 109 | "gpt-3.5-turbo, 'SHE''S', '[50, 1837, 13575]'", 110 | "gpt-3.5-turbo, 'SHE''S DONE', '[50, 1837, 13575, 55785]'", 111 | "gpt-3.5-turbo, 'can''t', '[4919, 956]'", 112 | "gpt-3.5-turbo, 'can''t do', '[4919, 956, 656]'", 113 | "gpt-3.5-turbo, 'Can''T', '[6854, 17773]'", 114 | "gpt-3.5-turbo, 'CAN''T DO', '[43055, 17773, 9503]'", 115 | "gpt-3.5-turbo, 'c#ode', '[66, 2, 536]'", 116 | "gpt-3.5-turbo, 'java_language', '[10248, 30121]'", 117 | "gpt-3.5-turbo, 'regex{test}', '[27485, 90, 1985, 92]'", 118 | "gpt-3.5-turbo, 'python$', '[12958, 3]'", 119 | "gpt-3.5-turbo, 'python$code', '[12958, 3, 1889]'", 120 | "gpt-3.5-turbo, '3.14159265358979323846264338327950288419716939937510582097494459230781640628620899862803482534211706798214808651328230664709384460955058223172535940812848111745028410270193852110555', '[18, 13, 9335, 20128, 21598, 22905, 24531, 13895, 20911, 22956, 19230, 17267, 17824, 25962, 4468, 11739, 18572, 12935, 6550, 18248, 26007, 25687, 20128, 14777, 23713, 17264, 17361, 12171, 19416, 23574, 22379, 22091, 17590, 8546, 27309, 25873, 10410, 26956, 21164, 16544, 12879, 22644, 25202, 24344, 21138, 13506, 23670, 12245, 23309, 19192, 18058, 4386, 21235, 8546, 10617, 17058, 4278, 19597, 25454, 20767, 6550, 2131]'", 121 | "gpt-3.5-turbo, '😊', '[76460, 232]'", 122 | "gpt-3.5-turbo, '😂😍', '[76460, 224, 76460, 235]'", 123 | "gpt-3.5-turbo, '🤔😘😉', '[9468, 97, 242, 76460, 246, 76460, 231]'", 124 | "gpt-3.5-turbo, '🤯😴😜😝', '[9468, 97, 107, 76460, 112, 76460, 250, 76460, 251]'", 125 | "gpt-3.5-turbo, '😷🙄😶🤑😒', '[76460, 115, 9468, 
247, 226, 76460, 114, 9468, 97, 239, 76460, 240]'", 126 | "gpt-3.5-turbo, '🤢🥺🥴🥵🥶🤕', '[9468, 97, 95, 9468, 98, 118, 9468, 98, 112, 9468, 98, 113, 9468, 98, 114, 9468, 97, 243]'", 127 | "gpt-3.5-turbo, '😭🤬🤪😈👹😻😼', '[76460, 255, 9468, 97, 105, 9468, 97, 103, 76460, 230, 9468, 239, 117, 76460, 119, 76460, 120]'", 128 | "gpt-3.5-turbo, '🤖💩👻👽🤡👺👾🧟‍♀️', '[9468, 97, 244, 93273, 102, 9468, 239, 119, 9468, 239, 121, 9468, 97, 94, 9468, 239, 118, 9468, 239, 122, 9468, 100, 253, 378, 235, 32990, 31643]'", 129 | "gpt-3.5-turbo, '🙏🏽🤲🏽👐🏽💪🏽👍🏽👎🏽✌🏽🤘🏽🤞🏽', '[9468, 247, 237, 9468, 237, 121, 9468, 97, 110, 9468, 237, 121, 9468, 80010, 9468, 237, 121, 93273, 103, 9468, 237, 121, 9468, 239, 235, 9468, 237, 121, 9468, 239, 236, 9468, 237, 121, 38798, 234, 9468, 237, 121, 9468, 97, 246, 9468, 237, 121, 9468, 97, 252, 9468, 237, 121]'", 130 | "gpt-3.5-turbo, '🌞🌈☀️❄️☔️🌊🍁🍂🌺🌸', '[9468, 234, 252, 9468, 234, 230, 18107, 222, 31643, 49633, 226, 31643, 18107, 242, 31643, 9468, 234, 232, 9468, 235, 223, 9468, 235, 224, 9468, 234, 118, 9468, 234, 116]'", 131 | "gpt-4o, 'Stop!', '[13523, 0]'", 132 | "gpt-4o, 'Stop now.', '[13523, 1954, 13]'", 133 | "gpt-4o, 'Stop what you''re doing.', '[13523, 1412, 7163, 5306, 13]'", 134 | "gpt-4o, 'Stop what you''re doing right now.', '[13523, 1412, 7163, 5306, 1849, 1954, 13]'", 135 | "gpt-4o, 'Stop what you''re doing right now and listen.', '[13523, 1412, 7163, 5306, 1849, 1954, 326, 11425, 13]'", 136 | "gpt-4o, 'Stop what you''re doing right now and listen carefully.', '[13523, 1412, 7163, 5306, 1849, 1954, 326, 11425, 18455, 13]'", 137 | "gpt-4o, 'Stop what you''re doing right now and listen carefully to me.', '[13523, 1412, 7163, 5306, 1849, 1954, 326, 11425, 18455, 316, 668, 13]'", 138 | "gpt-4o, 'Stop what you''re doing right now and listen carefully to me, please.', '[13523, 1412, 7163, 5306, 1849, 1954, 326, 11425, 18455, 316, 668, 11, 4843, 13]'", 139 | "gpt-4o, 'Stop what you''re doing right now and listen carefully to me, please, because this is 
important.', '[13523, 1412, 7163, 5306, 1849, 1954, 326, 11425, 18455, 316, 668, 11, 4843, 11, 2236, 495, 382, 3378, 13]'", 140 | "gpt-4o, 'Stop what you''re doing right now and listen carefully to me, please, because this is very important.', '[13523, 1412, 7163, 5306, 1849, 1954, 326, 11425, 18455, 316, 668, 11, 4843, 11, 2236, 495, 382, 1869, 3378, 13]'", 141 | "gpt-4o, 'Σταμάτα!', '[10720, 6319, 27992, 6319, 0]'", 142 | "gpt-4o, 'Σταμάτα τώρα.', '[10720, 6319, 27992, 6319, 153383, 13]'", 143 | "gpt-4o, 'Σταμάτα αυτό που κάνεις.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 13]'", 144 | "gpt-4o, 'Σταμάτα αυτό που κάνεις αμέσως τώρα.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 3793, 17752, 157975, 153383, 13]'", 145 | "gpt-4o, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 3793, 17752, 157975, 153383, 6381, 21285, 187344, 13]'", 146 | "gpt-4o, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 3793, 17752, 157975, 153383, 6381, 21285, 187344, 15098, 18785, 1664, 35337, 13]'", 147 | "gpt-4o, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 3793, 17752, 157975, 153383, 6381, 21285, 187344, 15098, 18785, 1664, 35337, 4278, 80486, 13]'", 148 | "gpt-4o, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα, παρακαλώ.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 3793, 17752, 157975, 153383, 6381, 21285, 187344, 15098, 18785, 1664, 35337, 4278, 80486, 11, 38699, 29680, 132706, 13]'", 149 | "gpt-4o, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα, παρακαλώ, γιατί αυτό είναι σημαντικό.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 3793, 17752, 157975, 153383, 6381, 21285, 187344, 15098, 18785, 1664, 35337, 4278, 80486, 11, 38699, 29680, 132706, 11, 120892, 43845, 17278, 114750, 33191, 13]'", 150 | "gpt-4o, 
'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα, παρακαλώ, γιατί αυτό είναι πολύ σημαντικό.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 3793, 17752, 157975, 153383, 6381, 21285, 187344, 15098, 18785, 1664, 35337, 4278, 80486, 11, 38699, 29680, 132706, 11, 120892, 43845, 17278, 60896, 114750, 33191, 13]'" 151 | }) 152 | void can_encode_or_decode_test_vectors_correctly(String model, 153 | String text, 154 | @ConvertWith(ListConverter.class) List tokens) { 155 | var enc = new GPT3Tokenizer(Encoding.forModel(model)); 156 | assertEquals(tokens, enc.encode(text)); 157 | assertEquals(text, enc.decode(tokens)); 158 | } 159 | } -------------------------------------------------------------------------------- /src/test/java/com/didalgo/gpt3/ListConverter.java: -------------------------------------------------------------------------------- 1 | package com.didalgo.gpt3; 2 | 3 | import org.junit.jupiter.params.converter.ArgumentConversionException; 4 | import org.junit.jupiter.params.converter.SimpleArgumentConverter; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | public class ListConverter extends SimpleArgumentConverter { 10 | 11 | @Override 12 | protected Object convert(Object source, Class targetType) throws ArgumentConversionException { 13 | if (source instanceof String input && List.class.isAssignableFrom(targetType)) { 14 | if (input.startsWith("[") && input.endsWith("]")) 15 | input = input.substring(1, input.length() - 1); 16 | 17 | return Arrays.stream(input.split(",")) 18 | .map(String::trim) 19 | .map(Integer::valueOf) 20 | .toList(); 21 | } 22 | throw new IllegalArgumentException("Conversion from " + source.getClass() + " to " 23 | + targetType + " not supported."); 24 | } 25 | } -------------------------------------------------------------------------------- /src/test/java/com/didalgo/gpt3/TokenCountTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 
(c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import com.fasterxml.jackson.annotation.JsonProperty; 8 | import com.fasterxml.jackson.annotation.JsonPropertyDescription; 9 | import com.fasterxml.jackson.core.JsonProcessingException; 10 | import com.fasterxml.jackson.databind.ObjectMapper; 11 | import com.theokanning.openai.completion.chat.ChatFunction; 12 | import com.theokanning.openai.completion.chat.ChatFunctionCall; 13 | import com.theokanning.openai.completion.chat.ChatMessage; 14 | import com.theokanning.openai.completion.chat.ChatMessageRole; 15 | import lombok.Getter; 16 | import lombok.Setter; 17 | import org.junit.jupiter.api.Test; 18 | import org.junit.jupiter.params.ParameterizedTest; 19 | import org.junit.jupiter.params.provider.CsvSource; 20 | 21 | import java.util.List; 22 | 23 | import static org.junit.jupiter.api.Assertions.assertEquals; 24 | 25 | public class TokenCountTest { 26 | 27 | GPT3Tokenizer tokenizer = new GPT3Tokenizer(Encoding.CL100K_BASE); 28 | 29 | @Test 30 | void fromLinesJoined_gives_total_token_count_including_newlines() { 31 | assertEquals(0, TokenCount.fromLinesJoined(List.of(), tokenizer)); 32 | assertEquals(1, TokenCount.fromLinesJoined(List.of("1"), tokenizer)); 33 | assertEquals(3, TokenCount.fromLinesJoined(List.of("1", "2"), tokenizer)); 34 | assertEquals(5, TokenCount.fromLinesJoined(List.of("1", "2", "3"), tokenizer)); 35 | } 36 | 37 | @ParameterizedTest 38 | @CsvSource({ 39 | "121, gpt-3.5-turbo-0301", 40 | "115, gpt-3.5-turbo-0613", 41 | "115, gpt-3.5-turbo-16k-0613", 42 | "115, gpt-4-0314", 43 | "115, gpt-4-0613" 44 | }) 45 | void fromMessages_gives_correct_token_count(int expectedTokenCount, String modelName) { 46 | List messages = List.of( 47 | new ChatMessage("system", "You are a helpful, pattern-following assistant that translates corporate jargon into plain English."), 48 | new ChatMessage("user", "New synergies will help drive top-line growth."), 49 | new 
ChatMessage("assistant", "Things working well together will increase revenue."), 50 | new ChatMessage("user", "Let's circle back when we have more bandwidth to touch base on opportunities for increased leverage."), 51 | new ChatMessage("assistant", "Let's talk later when we're less busy about how to do better."), 52 | new ChatMessage("user", "This late pivot means we don't have time to boil the ocean for the client deliverable.") 53 | ); 54 | assertEquals(expectedTokenCount, TokenCount.fromMessages(messages, tokenizer, ChatFormatDescriptor.forModel(modelName))); 55 | } 56 | 57 | @Test 58 | void fromMessages_gives_expected_token_count_when_used_with_functions() throws JsonProcessingException { 59 | final int EXPECTED_TOKEN_COUNT = 232; 60 | 61 | var functionArgs = "{\n \"source_code\": \"import java.time.LocalDate;\\n\\npublic class Main {\\n public static void main(String[] args) {\\n LocalDate currentDate = LocalDate.now();\\n System.out.println(currentDate);\\n }\\n}\"\n}"; 62 | var jsonNode = new ObjectMapper().readTree(functionArgs); 63 | var messages = List.of( 64 | new ChatMessage(ChatMessageRole.SYSTEM.value(), "You are a helpful assistant. 
Follow user instructions carefully."), 65 | new ChatMessage(ChatMessageRole.USER.value(), "Please use Java to check current date."), 66 | new ChatMessage(ChatMessageRole.ASSISTANT.value(), null, null, new ChatFunctionCall("java", jsonNode)), 67 | new ChatMessage(ChatMessageRole.FUNCTION.value(), "TODAY", "java") 68 | ); 69 | var functions = List.of( 70 | new ChatFunction.Builder() 71 | .name("java") 72 | .description("Evaluate Java code.") 73 | .executor(JavaFunction.class, (__ -> null)) 74 | .build(), 75 | new ChatFunction.Builder() 76 | .name("sql") 77 | .description("Evaluate SQL code.") 78 | .executor(SqlFunction.class, (__ -> null)) 79 | .build() 80 | ); 81 | assertEquals(EXPECTED_TOKEN_COUNT, TokenCount.fromMessages(messages, functions, ModelType.GPT_3_5_TURBO_16K)); 82 | } 83 | 84 | @Getter 85 | @Setter 86 | public static class JavaFunction { 87 | 88 | @JsonProperty("source_code") 89 | @JsonPropertyDescription("the code to evaluate") 90 | private String sourceCode; 91 | 92 | @JsonProperty("version") 93 | @JsonPropertyDescription("the Java version number, i.e. 
17") 94 | private Integer version; 95 | } 96 | 97 | @Getter 98 | @Setter 99 | public static class SqlFunction { 100 | 101 | @JsonProperty(value = "TYPE", required = true) 102 | @JsonPropertyDescription("the type of SQL query") 103 | private SqlType type; 104 | 105 | @JsonProperty(value = "SQL", required = true) 106 | @JsonPropertyDescription("the SQL object") 107 | private Sql sql; 108 | 109 | public enum SqlType { 110 | SELECT, UPDATE, DELETE, ALTER 111 | } 112 | } 113 | 114 | @Getter 115 | @Setter 116 | public static class Sql { 117 | 118 | @JsonProperty("columns") 119 | private List columns; 120 | 121 | @JsonProperty("condition") 122 | private String condition; 123 | 124 | @JsonProperty("limit") 125 | private Integer limit; 126 | 127 | @JsonProperty("ORDER BY") 128 | @JsonPropertyDescription("the result ordering") 129 | private OrderBy orderBy; 130 | } 131 | 132 | @Getter 133 | @Setter 134 | public static class OrderBy { 135 | 136 | @JsonProperty(value = "column", required = true) 137 | private String column; 138 | 139 | @JsonProperty("order") 140 | private Order order; 141 | 142 | public enum Order { 143 | ASC, DESC 144 | } 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /src/test/java/com/didalgo/gpt3/TokenizableFunctionTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import org.junit.jupiter.params.ParameterizedTest; 8 | import org.junit.jupiter.params.provider.Arguments; 9 | import org.junit.jupiter.params.provider.MethodSource; 10 | 11 | import javax.json.Json; 12 | import javax.json.JsonObject; 13 | import javax.json.JsonReader; 14 | 15 | import java.io.StringReader; 16 | import java.util.stream.Stream; 17 | 18 | import static org.junit.jupiter.api.Assertions.*; 19 | 20 | class TokenizableFunctionTest { 21 | 22 | @ParameterizedTest 
23 | @MethodSource("provideTestData") 24 | void toString_converts_function_schema_to_internal_representation_the_model_was_trained_on(String name, String description, String jsonSchema, String representation) { 25 | var function = TokenizableFunction.of(name, description, toJsonObject(jsonSchema)); 26 | assertEquals(representation, function.generateDocumentation()); 27 | } 28 | 29 | private static JsonObject toJsonObject(String json) { 30 | try (JsonReader reader = Json.createReader(new StringReader(json))) { 31 | return reader.readObject(); 32 | } 33 | } 34 | 35 | static Stream provideTestData() { 36 | return Stream.of( 37 | Arguments.of( 38 | "invoke", 39 | "Invokes specialized function which no one knows how it works", 40 | """ 41 | { 42 | "type": "object", 43 | "properties": { 44 | "stringParameter": { 45 | "type": "string", 46 | "description": "The free-form text parameter" 47 | }, 48 | "booleanParameter": { 49 | "type": "boolean", 50 | "description": "Switch lights on/off" 51 | } 52 | }, 53 | "required": [ 54 | "stringParameter" 55 | ] 56 | } 57 | """, 58 | """ 59 | // Invokes specialized function which no one knows how it works 60 | type invoke = (_: { 61 | // The free-form text parameter 62 | stringParameter: string, 63 | // Switch lights on/off 64 | booleanParameter?: boolean, 65 | }) => any;""" 66 | )); 67 | } 68 | } -------------------------------------------------------------------------------- /src/test/resources/com/didalgo/gpt3/java.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "source_code": { 5 | "type": "string", 6 | "description": "the code to evaluate" 7 | }, 8 | "version": { 9 | "type": "integer", 10 | "description": "the Java version number, i.e. 
17" 11 | } 12 | }, 13 | "required": [ 14 | "source_code" 15 | ] 16 | } -------------------------------------------------------------------------------- /src/test/resources/com/didalgo/gpt3/sql.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "TYPE": { 5 | "type": "string", 6 | "enum": ["SELECT","UPDATE","DELETE","ALTER"], 7 | "description": "the type of SQL query" 8 | }, 9 | "SQL": { 10 | "type": "object", 11 | "description": "the SQL object", 12 | "properties": { 13 | "columns": { 14 | "type": "array", 15 | "items": { 16 | "type": "string" 17 | } 18 | }, 19 | "condition": { 20 | "type": "string", 21 | "maxLength": 1000 22 | }, 23 | "limit": { 24 | "type": "number" 25 | }, 26 | "ORDER BY": { 27 | "type": "object", 28 | "description": "the result ordering", 29 | "properties": { 30 | "column": { 31 | "type": "string" 32 | }, 33 | "order": { 34 | "type": "string", 35 | "enum": ["ASC", "DESC"] 36 | } 37 | }, 38 | "required": ["column"] 39 | } 40 | } 41 | } 42 | }, 43 | "required": [ 44 | "TYPE", "SQL" 45 | ] 46 | } --------------------------------------------------------------------------------