├── .github └── workflows │ └── gradle.yml ├── .gitignore ├── LICENSE ├── README.md ├── build.gradle ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── settings.gradle └── src ├── main ├── java │ └── com │ │ └── didalgo │ │ └── gpt3 │ │ ├── ByteSequence.java │ │ ├── ChatFormatDescriptor.java │ │ ├── CompletionType.java │ │ ├── Encoding.java │ │ ├── EncodingType.java │ │ ├── GPT3Tokenizer.java │ │ ├── ModelType.java │ │ ├── TokenCount.java │ │ ├── TokenCountSupport.java │ │ ├── TokenizableFunction.java │ │ ├── TokenizableFunctionCall.java │ │ ├── TokenizableMessage.java │ │ └── TokenizableTool.java └── resources │ └── com │ └── didalgo │ └── gpt3 │ ├── cl100k_base.tiktoken │ ├── o200k_base.tiktoken │ ├── p50k_base.tiktoken │ └── r50k_base.tiktoken └── test ├── java └── com │ └── didalgo │ └── gpt3 │ ├── ByteSequenceTest.java │ ├── GPT3TokenizerTest.java │ ├── ListConverter.java │ ├── TokenCountTest.java │ └── TokenizableFunctionTest.java └── resources └── com └── didalgo └── gpt3 ├── java.schema.json └── sql.schema.json /.github/workflows/gradle.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Gradle and cache/restore any dependencies to improve the workflow execution time 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-java-with-gradle 3 | 4 | name: Gradle Build 5 | 6 | on: [push] 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Set up JDK 17 17 | uses: actions/setup-java@v3 18 | with: 19 | java-version: '17' 20 | distribution: 'temurin' 21 | - name: Make Gradle script executable 22 | run: chmod +x gradlew 23 | - name: Build with Gradle 24 | uses: gradle/gradle-build-action@67421db6bd0bf253fb4bd25b31ebb98943c375e1 25 | with: 26 | arguments: build 27 
| -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build/ 2 | /.gradle/ 3 | /.idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 didalgo2 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPT3/4 Java Tokenizer 2 | 3 | [![License: MIT](https://img.shields.io/github/license/didalgo2/gpt3-tokenizer-java?style=flat-square)](https://opensource.org/license/mit/) 4 | ![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/didalgo2/gpt3-tokenizer-java/gradle.yml?style=flat-square) 5 | [![Maven Central](https://img.shields.io/maven-central/v/com.didalgo/gpt3-tokenizer?style=flat-square)](https://central.sonatype.com/artifact/com.didalgo/gpt3-tokenizer/0.1.8) 6 | 7 | This is a Java implementation of a GPT3/4 tokenizer, loosely ported from [Tiktoken](https://github.com/openai/tiktoken) with the help of [ChatGPT](https://openai.com/blog/chatgpt). 8 | 9 | ## Usage Examples 10 | 11 | ### Encoding Text to Tokens 12 | 13 | ```java 14 | GPT3Tokenizer tokenizer = new GPT3Tokenizer(Encoding.CL100K_BASE); 15 | List tokens = tokenizer.encode("example text here"); 16 | ``` 17 | 18 | ### Decoding Tokens to Text 19 | 20 | ```java 21 | GPT3Tokenizer tokenizer = new GPT3Tokenizer(Encoding.CL100K_BASE); 22 | List tokens = Arrays.asList(123, 456, 789); 23 | String text = tokenizer.decode(tokens); 24 | ``` 25 | 26 | ### Counting Number of Tokens in Chat Messages 27 | 28 | ```java 29 | var messages = List.of( 30 | new ChatMessage(ChatMessageRole.SYSTEM.value(), "You are a helpful assistant."), 31 | new ChatMessage(ChatMessageRole.USER.value(), "Hello there!") 32 | ); 33 | var model = ModelType.GPT_3_5_TURBO; 34 | var count = TokenCount.fromMessages(messages, model); 35 | System.out.println("Prompt tokens: " + count); 36 | ``` 37 | 38 | ### Did you know... 39 | 40 | 1. ...that all 3.5-turbo models released after _0613_ now have tokenization counts for messages consistent with gpt-4 models? 41 | 42 | 1. 
...that OpenAI Tokenizer available at https://platform.openai.com/tokenizer uses p50k_base encoding, thus it doesn't count correctly tokens for gpt-3.5 and gpt-4 models? If you look for decent alternative, you may like: https://tiktokenizer.vercel.app/, but keep in mind that tokenization for messages of gpt-3.5 models released after 0613 was changed (see point above). 43 | 44 | 1. ...that in cl100k_base encoding every sequence of up to 81 spaces is just a single token? So next time when someone tells you that passing YAML to ChatGPT is not efficient, you can argue that... 45 | ```java 46 | var tokenizer = ModelType.GPT_3_5_TURBO.getTokenizer(); 47 | var tokens = (List) null; 48 | for (var sb = new StringBuilder(" "); (tokens = tokenizer.encode(sb)).size() == 1; sb.append(' ')) 49 | System.out.printf("`%s`'s token is %s, and that's %d space(s)!\n".replace("(s)", sb.length()==1?"":"s"), sb, tokens, sb.length()); 50 | 51 | ``` 52 | ``` 53 | ` `'s token is [14984], and that's 75 spaces! 54 | ` `'s token is [56899], and that's 76 spaces! 55 | ` `'s token is [59691], and that's 77 spaces! 56 | ` `'s token is [82321], and that's 78 spaces! 57 | ` `'s token is [40584], and that's 79 spaces! 58 | ` `'s token is [98517], and that's 80 spaces! 59 | ` `'s token is [96529], and that's 81 spaces! 60 | ``` 61 | 62 | ## License 63 | 64 | This project is licensed under the MIT License. 
-------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'java' 3 | id 'application' 4 | id 'maven-publish' 5 | id 'signing' 6 | } 7 | 8 | compileJava.options.encoding = "UTF-8" 9 | compileTestJava.options.encoding = "UTF-8" 10 | 11 | group 'com.didalgo' 12 | archivesBaseName = 'gpt3-tokenizer' 13 | version '0.1.9-SNAPSHOT' 14 | 15 | repositories { 16 | mavenLocal() 17 | mavenCentral() 18 | } 19 | 20 | ext { 21 | gpt3_java_version = '0.14.0' 22 | jackson_version = '2.14.2' 23 | jupiter_version = '5.9.2' 24 | } 25 | 26 | dependencies { 27 | implementation "javax.json:javax.json-api:1.1.4" 28 | implementation "org.glassfish:javax.json:1.1.4" 29 | compileOnly "com.fasterxml.jackson.core:jackson-databind:${jackson_version}" 30 | compileOnly "com.github.victools:jsonschema-generator:4.31.1" 31 | compileOnly "com.github.victools:jsonschema-module-jackson:4.31.1" 32 | compileOnly "com.theokanning.openai-gpt3-java:api:${gpt3_java_version}" 33 | testAnnotationProcessor "org.projectlombok:lombok:1.18.26" 34 | testImplementation 'com.squareup.okhttp3:logging-interceptor:3.14.9' 35 | testImplementation "com.fasterxml.jackson.core:jackson-databind:${jackson_version}" 36 | testImplementation "com.github.victools:jsonschema-generator:4.31.1" 37 | testImplementation "com.github.victools:jsonschema-module-jackson:4.31.1" 38 | testImplementation "com.theokanning.openai-gpt3-java:service:${gpt3_java_version}" 39 | testImplementation "org.junit.jupiter:junit-jupiter-api:${jupiter_version}" 40 | testImplementation "org.junit.jupiter:junit-jupiter-params:${jupiter_version}" 41 | testImplementation "org.projectlombok:lombok:1.18.26" 42 | testRuntimeOnly "org.junit.jupiter:junit-jupiter-engine:${jupiter_version}" 43 | } 44 | 45 | test { 46 | useJUnitPlatform() 47 | } 48 | 49 | java { 50 | withJavadocJar() 51 | withSourcesJar() 52 | } 53 | 54 | 
publishing { 55 | publications.create("mavenJava", MavenPublication) { 56 | artifactId = 'gpt3-tokenizer' 57 | from components.java 58 | versionMapping { 59 | usage('java-api') { 60 | fromResolutionOf('runtimeClasspath') 61 | } 62 | usage('java-runtime') { 63 | fromResolutionResult() 64 | } 65 | } 66 | pom { 67 | name = 'Java GPT3/4 Tokenizer' 68 | description = 'Java implementation of a GPT3/4 tokenizer' 69 | url = 'https://github.com/didalgo2/gpt3-tokenizer-java' 70 | licenses { 71 | license { 72 | name = 'MIT License' 73 | url = 'https://github.com/didalgo2/gpt3-tokenizer-java/blob/main/LICENSE' 74 | } 75 | } 76 | developers { 77 | developer { 78 | id = 'didalgo' 79 | name = 'Mariusz Bernacki' 80 | email = 'didalgo@didalgo.com' 81 | } 82 | } 83 | scm { 84 | connection = 'scm:git:git://github.com/didalgo2/gpt3-tokenizer-java.git' 85 | developerConnection = 'scm:git:ssh://github.com/didalgo2/gpt3-tokenizer-java.git' 86 | url = 'https://github.com/didalgo2/gpt3-tokenizer-java/' 87 | } 88 | } 89 | } 90 | repositories { 91 | maven { 92 | def releasesRepoUrl = "https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/" 93 | def snapshotsRepoUrl = "https://s01.oss.sonatype.org/content/repositories/snapshots/" 94 | url = version.endsWith('SNAPSHOT') ? 
snapshotsRepoUrl : releasesRepoUrl 95 | credentials { 96 | username = project.properties['ossrhUser'].toString() 97 | password = project.properties['ossrhPassword'].toString() 98 | } 99 | } 100 | } 101 | } 102 | 103 | signing { 104 | sign publishing.publications.mavenJava 105 | } 106 | 107 | javadoc { 108 | options.addBooleanOption('html5', true) 109 | } 110 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/didalgolab/gpt3-tokenizer-java/85b374b723058576fd0475664a92075cca1f0f71/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.5.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Copyright © 2015-2021 the original authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | ############################################################################## 20 | # 21 | # Gradle start up script for POSIX generated by Gradle. 22 | # 23 | # Important for running: 24 | # 25 | # (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is 26 | # noncompliant, but you have some other compliant shell such as ksh or 27 | # bash, then to run this script, type that shell name before the whole 28 | # command line, like: 29 | # 30 | # ksh Gradle 31 | # 32 | # Busybox and similar reduced shells will NOT work, because this script 33 | # requires all of these POSIX shell features: 34 | # * functions; 35 | # * expansions «$var», «${var}», «${var:-default}», «${var+SET}», 36 | # «${var#prefix}», «${var%suffix}», and «$( cmd )»; 37 | # * compound commands having a testable exit status, especially «case»; 38 | # * various built-in commands including «command», «set», and «ulimit». 39 | # 40 | # Important for patching: 41 | # 42 | # (2) This script targets any POSIX shell, so it avoids extensions provided 43 | # by Bash, Ksh, etc; in particular arrays are avoided. 44 | # 45 | # The "traditional" practice of packing multiple parameters into a 46 | # space-separated string is a well documented source of bugs and security 47 | # problems, so this is (mostly) avoided, by progressively accumulating 48 | # options in "$@", and eventually passing that to Java. 49 | # 50 | # Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, 51 | # and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; 52 | # see the in-line comments for details. 53 | # 54 | # There are tweaks for specific operating systems such as AIX, CygWin, 55 | # Darwin, MinGW, and NonStop. 56 | # 57 | # (3) This script is generated from the Groovy template 58 | # https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt 59 | # within the Gradle project. 
60 | # 61 | # You can find Gradle at https://github.com/gradle/gradle/. 62 | # 63 | ############################################################################## 64 | 65 | # Attempt to set APP_HOME 66 | 67 | # Resolve links: $0 may be a link 68 | app_path=$0 69 | 70 | # Need this for daisy-chained symlinks. 71 | while 72 | APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path 73 | [ -h "$app_path" ] 74 | do 75 | ls=$( ls -ld "$app_path" ) 76 | link=${ls#*' -> '} 77 | case $link in #( 78 | /*) app_path=$link ;; #( 79 | *) app_path=$APP_HOME$link ;; 80 | esac 81 | done 82 | 83 | APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit 84 | 85 | APP_NAME="Gradle" 86 | APP_BASE_NAME=${0##*/} 87 | 88 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 89 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 90 | 91 | # Use the maximum available, or set MAX_FD != -1 to use that value. 92 | MAX_FD=maximum 93 | 94 | warn () { 95 | echo "$*" 96 | } >&2 97 | 98 | die () { 99 | echo 100 | echo "$*" 101 | echo 102 | exit 1 103 | } >&2 104 | 105 | # OS specific support (must be 'true' or 'false'). 106 | cygwin=false 107 | msys=false 108 | darwin=false 109 | nonstop=false 110 | case "$( uname )" in #( 111 | CYGWIN* ) cygwin=true ;; #( 112 | Darwin* ) darwin=true ;; #( 113 | MSYS* | MINGW* ) msys=true ;; #( 114 | NONSTOP* ) nonstop=true ;; 115 | esac 116 | 117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 118 | 119 | 120 | # Determine the Java command to use to start the JVM. 121 | if [ -n "$JAVA_HOME" ] ; then 122 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 123 | # IBM's JDK on AIX uses strange locations for the executables 124 | JAVACMD=$JAVA_HOME/jre/sh/java 125 | else 126 | JAVACMD=$JAVA_HOME/bin/java 127 | fi 128 | if [ ! 
-x "$JAVACMD" ] ; then 129 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 130 | 131 | Please set the JAVA_HOME variable in your environment to match the 132 | location of your Java installation." 133 | fi 134 | else 135 | JAVACMD=java 136 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 137 | 138 | Please set the JAVA_HOME variable in your environment to match the 139 | location of your Java installation." 140 | fi 141 | 142 | # Increase the maximum file descriptors if we can. 143 | if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then 144 | case $MAX_FD in #( 145 | max*) 146 | MAX_FD=$( ulimit -H -n ) || 147 | warn "Could not query maximum file descriptor limit" 148 | esac 149 | case $MAX_FD in #( 150 | '' | soft) :;; #( 151 | *) 152 | ulimit -n "$MAX_FD" || 153 | warn "Could not set maximum file descriptor limit to $MAX_FD" 154 | esac 155 | fi 156 | 157 | # Collect all arguments for the java command, stacking in reverse order: 158 | # * args from the command line 159 | # * the main class name 160 | # * -classpath 161 | # * -D...appname settings 162 | # * --module-path (only if needed) 163 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 
164 | 165 | # For Cygwin or MSYS, switch paths to Windows format before running java 166 | if "$cygwin" || "$msys" ; then 167 | APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) 168 | CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) 169 | 170 | JAVACMD=$( cygpath --unix "$JAVACMD" ) 171 | 172 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 173 | for arg do 174 | if 175 | case $arg in #( 176 | -*) false ;; # don't mess with options #( 177 | /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath 178 | [ -e "$t" ] ;; #( 179 | *) false ;; 180 | esac 181 | then 182 | arg=$( cygpath --path --ignore --mixed "$arg" ) 183 | fi 184 | # Roll the args list around exactly as many times as the number of 185 | # args, so each arg winds up back in the position where it started, but 186 | # possibly modified. 187 | # 188 | # NB: a `for` loop captures its iteration list before it begins, so 189 | # changing the positional parameters here affects neither the number of 190 | # iterations, nor the values presented in `arg`. 191 | shift # remove old arg 192 | set -- "$@" "$arg" # push replacement arg 193 | done 194 | fi 195 | 196 | # Collect all arguments for the java command; 197 | # * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of 198 | # shell script including quotes and variable substitutions, so put them in 199 | # double quotes to make sure that they get re-expanded; and 200 | # * put everything else in single quotes, so that it's not re-expanded. 201 | 202 | set -- \ 203 | "-Dorg.gradle.appname=$APP_BASE_NAME" \ 204 | -classpath "$CLASSPATH" \ 205 | org.gradle.wrapper.GradleWrapperMain \ 206 | "$@" 207 | 208 | # Stop when "xargs" is not available. 209 | if ! command -v xargs >/dev/null 2>&1 210 | then 211 | die "xargs is not available" 212 | fi 213 | 214 | # Use "xargs" to parse quoted args. 215 | # 216 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed. 
217 | # 218 | # In Bash we could simply go: 219 | # 220 | # readarray ARGS < <( xargs -n1 <<<"$var" ) && 221 | # set -- "${ARGS[@]}" "$@" 222 | # 223 | # but POSIX shell has neither arrays nor command substitution, so instead we 224 | # post-process each arg (as a line of input to sed) to backslash-escape any 225 | # character that might be a shell metacharacter, then use eval to reverse 226 | # that process (while maintaining the separation between arguments), and wrap 227 | # the whole thing up as a single "set" statement. 228 | # 229 | # This will of course break if any of these variables contains a newline or 230 | # an unmatched quote. 231 | # 232 | 233 | eval "set -- $( 234 | printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | 235 | xargs -n1 | 236 | sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | 237 | tr '\n' ' ' 238 | )" '"$@"' 239 | 240 | exec "$JAVACMD" "$@" 241 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 
15 | @rem 16 | 17 | @if "%DEBUG%"=="" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%"=="" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if %ERRORLEVEL% equ 0 goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 
64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if %ERRORLEVEL% equ 0 goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | set EXIT_CODE=%ERRORLEVEL% 84 | if %EXIT_CODE% equ 0 set EXIT_CODE=1 85 | if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% 86 | exit /b %EXIT_CODE% 87 | 88 | :mainEnd 89 | if "%OS%"=="Windows_NT" endlocal 90 | 91 | :omega 92 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'gpt3-tokenizer-java' 2 | 3 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/ByteSequence.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import java.nio.charset.Charset; 8 | import java.nio.charset.StandardCharsets; 9 | import java.util.Arrays; 10 | 11 | /** 12 | * Represents a sequences of bytes. 13 | * 14 | * @author Mariusz Bernacki 15 | * 16 | */ 17 | public interface ByteSequence { 18 | 19 | /** The empty {@code ByteSequence} of length 0. */ 20 | ByteSequence EMPTY = of(new byte[0]); 21 | 22 | /** 23 | * Returns the byte at the specified offset. 
24 | * 25 | * @param index the zero-based byte offset within the sequence of bytes (0 <= index < length()) 26 | * @return the byte at the specified offset 27 | * @throws IndexOutOfBoundsException if the index is out of range (index < 0 || index >= length()) 28 | */ 29 | byte byteAt(int index); 30 | 31 | /** 32 | * Returns the length of the byte sequence. 33 | * 34 | * @return the number of bytes in the sequence 35 | */ 36 | int length(); 37 | 38 | /** 39 | * Returns a new ByteSequence that is a sub-sequence of the current byte sequence. 40 | * The sub-sequence starts with the byte value at the specified {@code start} index and 41 | * extends to the byte value at index {@code end - 1}. 42 | * 43 | * @param start the beginning index, inclusive 44 | * @param end the ending index, exclusive 45 | * @return a new ByteSequence that is a sub-sequence of this byte sequence 46 | * @throws IndexOutOfBoundsException if the start or end index is invalid 47 | */ 48 | ByteSequence subSequence(int start, int end) throws IndexOutOfBoundsException; 49 | 50 | /** 51 | * Returns a hash code value for this byte sequence. 52 | * 53 | * @return a hash code value for this byte sequence 54 | */ 55 | @Override 56 | int hashCode(); 57 | 58 | /** 59 | * Compares the specified object with this byte sequence for equality. 60 | * Returns {@code true} if and only if the specified object is also a byte sequence 61 | * and both byte sequences have the same bytes in the same order. 62 | * 63 | * @param obj the object to be compared for equality with this byte sequence 64 | * @return {@code true} if the specified object is equal to this byte sequence, {@code false} otherwise 65 | */ 66 | @Override 67 | boolean equals(Object obj); 68 | 69 | /** 70 | * Returns a byte array representation of this byte sequence. 71 | * The returned array will be a copy of the internal byte array, ensuring that modifications 72 | * to the returned array do not affect the original byte sequence. 
73 | * 74 | * @return a byte array representation of this byte sequence 75 | */ 76 | byte[] toByteArray(); 77 | 78 | /** 79 | * Converts the byte sequence to a String using the specified Charset. 80 | * 81 | * @param charset the Charset to be used for the conversion 82 | * @return a String representation of this byte sequence using the specified Charset 83 | */ 84 | String toString(Charset charset); 85 | 86 | /** 87 | * Returns a new ByteSequence instance containing the specified byte array. 88 | * The provided byte array is wrapped in an ImmutableByteSequence to ensure 89 | * that the contents of the byte array are not modified after the ByteSequence 90 | * is created. 91 | * 92 | * @param bytes the byte array to be used for the new ByteSequence 93 | * @return a new ByteSequence instance containing the specified byte array 94 | * @throws NullPointerException if the provided byte array is null 95 | */ 96 | static ByteSequence of(byte[] bytes) { 97 | return new Of(Arrays.copyOf(bytes, bytes.length)); 98 | } 99 | 100 | /** 101 | * Returns an immutable ByteSequence that is a copy of the specified ByteSequence. 102 | * If the provided ByteSequence is already an instance of ImmutableByteSequence, 103 | * it is returned directly; otherwise, a new ImmutableByteSequence is created. 104 | * 105 | * @param sequence the ByteSequence to be copied 106 | * @return an immutable ByteSequence that is a copy of the specified ByteSequence 107 | * @throws NullPointerException if the provided ByteSequence is null 108 | */ 109 | static ByteSequence copyOf(ByteSequence sequence) { 110 | if (sequence instanceof Of) 111 | return sequence; 112 | else 113 | return of(sequence.toByteArray()); 114 | } 115 | 116 | /** 117 | * Creates a ByteSequence from the specified String using the UTF-8 charset. 
118 | * 119 | * @param text the String to be converted to a ByteSequence 120 | * @return a new ByteSequence that represents the specified String using the UTF-8 charset 121 | * @throws NullPointerException if the provided text is null 122 | */ 123 | static ByteSequence from(String text) { 124 | return from(text, StandardCharsets.UTF_8); 125 | } 126 | 127 | /** 128 | * Creates a ByteSequence from the specified String using the specified Charset. 129 | * 130 | * @param text the String to be converted to a ByteSequence 131 | * @param charset the Charset to be used for the conversion 132 | * @return a new ByteSequence that represents the specified String using the specified Charset 133 | * @throws NullPointerException if the provided text or charset is null 134 | */ 135 | static ByteSequence from(String text, Charset charset) { 136 | return new Of(text.getBytes(charset)); 137 | } 138 | 139 | /** 140 | * An immutable implementation of the {@code ByteSequence}. 141 | */ 142 | final class Of implements ByteSequence, Comparable { 143 | private final byte[] bytes; 144 | 145 | private Of(byte[] bytes) { 146 | this.bytes = bytes; 147 | } 148 | 149 | @Override 150 | public byte byteAt(int index) { 151 | if (index < 0 || index >= length()) { 152 | throw new IndexOutOfBoundsException("Index " + index + " is out of range (0 <= index < " + length() + ")"); 153 | } 154 | return bytes[index]; 155 | } 156 | 157 | @Override 158 | public int length() { 159 | return bytes.length; 160 | } 161 | 162 | @Override 163 | public Of subSequence(int start, int end) { 164 | return new Of(Arrays.copyOfRange(bytes, start, end)); 165 | } 166 | 167 | @Override 168 | public int hashCode() { 169 | return Arrays.hashCode(bytes); 170 | } 171 | 172 | @Override 173 | public boolean equals(Object obj) { 174 | if (obj instanceof Of other) { 175 | return Arrays.equals(bytes, other.bytes); 176 | } 177 | return false; 178 | } 179 | 180 | @Override 181 | public byte[] toByteArray() { 182 | return 
Arrays.copyOf(bytes, bytes.length); 183 | } 184 | 185 | @Override 186 | public String toString(Charset charset) { 187 | return new String(bytes, charset); 188 | } 189 | 190 | @Override 191 | public String toString() { 192 | return toString(StandardCharsets.UTF_8); 193 | } 194 | 195 | @Override 196 | public int compareTo(Of other) { 197 | return Arrays.compare(bytes, other.bytes); 198 | } 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/ChatFormatDescriptor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import java.util.Objects; 8 | 9 | /** 10 | * Describes the various chat messaging formats for the purpose of counting tokens 11 | * in chat conversations against different models. 12 | * 13 | * @author Mariusz Bernacki 14 | */ 15 | public interface ChatFormatDescriptor { 16 | 17 | Encoding encoding(); 18 | 19 | int extraTokenCountPerMessage(); 20 | 21 | int extraTokenCountPerRequest(); 22 | 23 | int extraTokenCountForFunctions(); 24 | 25 | int extraTokenCountPerFunctionCall(); 26 | 27 | static ChatFormatDescriptor forModel(String modelName) { 28 | return switch (modelName) { 29 | case "gpt-3.5-turbo" -> forModel("gpt-3.5-turbo-0125"); 30 | case "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k" -> forModel("gpt-4-0613"); 31 | case "gpt-3.5-turbo-0301" -> new Of(Encoding.forModel(modelName), 4, 3, Of.UNSUPPORTED, 3); 32 | case "gpt-4-0314", "gpt-4-32k-0314" -> new Of(Encoding.forModel(modelName), 3, 3, Of.UNSUPPORTED, 3); 33 | case "gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-0125", 34 | "gpt-4-0613", "gpt-4-32k-0613", "gpt-4-1106-preview", "gpt-4-turbo-preview", 35 | "gpt-4o", "gpt-4o-2024-05-13" -> new Of(Encoding.forModel(modelName), 3, 3, -1, 3); 36 | default -> throw new 
IllegalArgumentException(String.format("Model `%s` not found", modelName)); 37 | }; 38 | } 39 | 40 | record Of (Encoding encoding, int extraTokenCountPerMessage, int extraTokenCountPerRequest, int extraTokenCountForFunctions, int extraTokenCountPerFunctionCall) implements ChatFormatDescriptor { 41 | /** The special constant indicating that functions are not supported by the model descriptor. */ 42 | private static final int UNSUPPORTED = Integer.MIN_VALUE; 43 | 44 | public Of { 45 | Objects.requireNonNull(encoding, "encoding"); 46 | } 47 | 48 | @Override 49 | public int extraTokenCountForFunctions() { 50 | if (extraTokenCountForFunctions == UNSUPPORTED) 51 | throw new UnsupportedOperationException("Functions aren't supported by this model"); 52 | 53 | return extraTokenCountForFunctions; 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/CompletionType.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | public enum CompletionType { 8 | TEXT, CHAT 9 | } 10 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/Encoding.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 OpenAI and Tiktoken's contributors 3 | * Copyright (c) 2023 Mariusz Bernacki 4 | * SPDX-License-Identifier: MIT 5 | * SPDX-FileComment: This file is a transpiled version of the code from https://github.com/openai/tiktoken 6 | */ 7 | package com.didalgo.gpt3; 8 | 9 | import java.io.*; 10 | import java.nio.charset.StandardCharsets; 11 | import java.util.*; 12 | import java.util.regex.Pattern; 13 | 14 | 15 | /** 16 | * Represents variants of BPE encoding. 17 | *
<p>
18 | * Modifications:
19 | * <ul>
20 | *   <li>[MB] 2023-03-25: Repackaged from Tiktoken for inclusion in gpt3-tokenizer-java.</li>
21 | *   <li>[MB] 2023-04-02: Major refactoring for cleaner code and improved performance.</li>
22 | * </ul>
23 | */ 24 | public interface Encoding { 25 | 26 | String ENDOFTEXT = "<|endoftext|>"; 27 | String FIM_PREFIX = "<|fim_prefix|>"; 28 | String FIM_MIDDLE = "<|fim_middle|>"; 29 | String FIM_SUFFIX = "<|fim_suffix|>"; 30 | String ENDOFPROMPT = "<|endofprompt|>"; 31 | 32 | Encoding O200K_BASE = new Of( 33 | "o200k_base.tiktoken", new HashMap<>(), 34 | Map.of(ENDOFTEXT, 199999, ENDOFPROMPT, 200018), 35 | Pattern.compile( 36 | "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?" + 37 | "|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?" + 38 | "|\\p{N}{1,3}" + 39 | "| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*" + 40 | "|\\s*[\\r\\n]+" + 41 | "|\\s+(?!\\S)" + 42 | "|\\s+" 43 | , Pattern.UNICODE_CHARACTER_CLASS) 44 | ); 45 | 46 | Encoding CL100K_BASE = new Of( 47 | "cl100k_base.tiktoken", new HashMap<>(), 48 | Map.of(ENDOFTEXT, 100257, FIM_PREFIX, 100258, FIM_MIDDLE, 100259, FIM_SUFFIX, 100260, ENDOFPROMPT, 100276), 49 | Pattern.compile("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", Pattern.UNICODE_CHARACTER_CLASS) 50 | ); 51 | 52 | Encoding P50K_BASE = new Of( 53 | "p50k_base.tiktoken", new HashMap<>(), 54 | Map.of(ENDOFTEXT, 50256), 55 | Pattern.compile("'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", Pattern.UNICODE_CHARACTER_CLASS) 56 | ); 57 | 58 | Encoding P50K_EDIT = new Of( 59 | "p50k_base.tiktoken", new HashMap<>(), 60 | Map.of(ENDOFTEXT, 50256, FIM_PREFIX, 50281, FIM_MIDDLE, 50282, FIM_SUFFIX, 50283), 61 | Pattern.compile("'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", Pattern.UNICODE_CHARACTER_CLASS) 62 | ); 63 | 64 | Encoding R50K_BASE = new Of( 65 | "r50k_base.tiktoken", new HashMap<>(), 66 | Map.of(ENDOFTEXT, 50256), 67 | Pattern.compile("'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| 
?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", Pattern.UNICODE_CHARACTER_CLASS) 68 | ); 69 | 70 | Map mergeableRanks(); 71 | 72 | Map specialTokens(); 73 | 74 | Pattern pattern(); 75 | 76 | record Of( 77 | String tiktokenFilename, 78 | Map mergeableRanks, 79 | Map specialTokens, 80 | Pattern pattern 81 | ) implements Encoding { 82 | public Of { 83 | specialTokens = Collections.unmodifiableMap(new HashMap<>(specialTokens)); // only wrapped HashMap is efficient enough; Map.copyOf() has performance issues 84 | } 85 | 86 | @Override 87 | public Map mergeableRanks() { 88 | if (mergeableRanks.isEmpty()) { 89 | synchronized (mergeableRanks) { 90 | if (mergeableRanks.isEmpty()) 91 | Lookup.loadTiktokenBase(tiktokenFilename, mergeableRanks); 92 | } 93 | } 94 | return Collections.unmodifiableMap(this.mergeableRanks); 95 | } 96 | } 97 | 98 | static Encoding forName(String encodingName) { 99 | return switch (encodingName.toLowerCase()) { 100 | case "o200k_base" -> O200K_BASE; 101 | case "cl100k_base" -> CL100K_BASE; 102 | case "p50k_base" -> P50K_BASE; 103 | case "p50k_edit" -> P50K_EDIT; 104 | case "r50k_base" -> R50K_BASE; 105 | default -> throw new IllegalArgumentException("Unknown encoding: " + encodingName); 106 | }; 107 | } 108 | 109 | static Encoding forModel(String modelName) { 110 | String encodingName = Lookup.modelToEncoding.get(modelName); 111 | if (encodingName == null) { 112 | encodingName = Lookup.modelPrefixToEncoding.keySet().stream() 113 | .filter(modelName::startsWith) 114 | .findFirst() 115 | .map(Lookup.modelPrefixToEncoding::get) 116 | .orElseThrow(() -> new IllegalArgumentException("Unknown model name: " + modelName)); 117 | } 118 | return forName(encodingName); 119 | } 120 | 121 | final class Lookup { 122 | private static final Map modelPrefixToEncoding; 123 | private static final Map modelToEncoding; 124 | static { 125 | var mp2e = new HashMap(); 126 | mp2e.put("gpt-4o-", "o200k_base"); 127 | mp2e.put("gpt-4-", "cl100k_base"); 128 | 
mp2e.put("gpt-3.5-turbo-", "cl100k_base"); 129 | modelPrefixToEncoding = mp2e; 130 | 131 | var m2e = new HashMap(); 132 | m2e.put("gpt-4o", "o200k_base"); 133 | m2e.put("gpt-4", "cl100k_base"); 134 | m2e.put("gpt-3.5-turbo", "cl100k_base"); 135 | m2e.put("text-davinci-003", "p50k_base"); 136 | m2e.put("text-davinci-002", "p50k_base"); 137 | m2e.put("text-davinci-001", "r50k_base"); 138 | m2e.put("text-curie-001", "r50k_base"); 139 | m2e.put("text-babbage-001", "r50k_base"); 140 | m2e.put("text-ada-001", "r50k_base"); 141 | m2e.put("davinci", "r50k_base"); 142 | m2e.put("curie", "r50k_base"); 143 | m2e.put("babbage", "r50k_base"); 144 | m2e.put("ada", "r50k_base"); 145 | m2e.put("code-davinci-002", "p50k_base"); 146 | m2e.put("code-davinci-001", "p50k_base"); 147 | m2e.put("code-cushman-002", "p50k_base"); 148 | m2e.put("code-cushman-001", "p50k_base"); 149 | m2e.put("davinci-codex", "p50k_base"); 150 | m2e.put("cushman-codex", "p50k_base"); 151 | m2e.put("text-davinci-edit-001", "p50k_edit"); 152 | m2e.put("code-davinci-edit-001", "p50k_edit"); 153 | m2e.put("text-embedding-ada-002", "cl100k_base"); 154 | m2e.put("text-similarity-davinci-001", "r50k_base"); 155 | m2e.put("text-similarity-curie-001", "r50k_base"); 156 | m2e.put("text-similarity-babbage-001", "r50k_base"); 157 | m2e.put("text-similarity-ada-001", "r50k_base"); 158 | m2e.put("text-search-davinci-doc-001", "r50k_base"); 159 | m2e.put("text-search-curie-doc-001", "r50k_base"); 160 | m2e.put("text-search-babbage-doc-001", "r50k_base"); 161 | m2e.put("text-search-ada-doc-001", "r50k_base"); 162 | m2e.put("code-search-babbage-code-001", "r50k_base"); 163 | m2e.put("code-search-ada-code-001", "r50k_base"); 164 | modelToEncoding = m2e; 165 | } 166 | 167 | public static Map loadTiktokenBase(String filename, Map resultMap) { 168 | try (InputStream in = Lookup.class.getResourceAsStream(filename)) { 169 | var result = (resultMap == null)? 
new HashMap() : resultMap; 170 | new BufferedReader(new InputStreamReader(in, StandardCharsets.US_ASCII)).lines() 171 | .filter(line -> !line.isEmpty()) 172 | .forEach(line -> { 173 | int spaceIdx = line.indexOf(' '); 174 | if (spaceIdx > 0) { 175 | ByteSequence key = ByteSequence.of(Base64.getDecoder().decode(line.substring(0, spaceIdx))); 176 | int value = Integer.parseInt(line.substring(spaceIdx + 1)); 177 | result.put(key, value); 178 | } 179 | }); 180 | return result; 181 | 182 | } catch (IOException e) { 183 | throw new UncheckedIOException(e); 184 | } 185 | } 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/EncodingType.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | /** 8 | * Represents various encoding types used by the OpenAI GPT models. 9 | *
<p>
10 | * Each encoding type is associated with a unique name, accessible through the {@link #encodingName()} method. 11 | * 12 | */ 13 | public enum EncodingType { 14 | O200K_BASE("o200k_base"), 15 | CL100K_BASE("cl100k_base"), 16 | R50K_BASE("r50k_base"), 17 | P50K_BASE("p50k_base"), 18 | P50K_EDIT("p50k_edit"); 19 | 20 | private final String encodingName; 21 | 22 | EncodingType(String encodingName) { 23 | this.encodingName = encodingName; 24 | } 25 | 26 | public String encodingName() { 27 | return encodingName; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/GPT3Tokenizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 OpenAI and Tiktoken's contributors 3 | * Copyright (c) 2023 Mariusz Bernacki 4 | * SPDX-License-Identifier: MIT 5 | * SPDX-FileComment: This file is a transpiled version of the code from https://github.com/openai/tiktoken 6 | */ 7 | package com.didalgo.gpt3; 8 | 9 | import java.io.ByteArrayOutputStream; 10 | import java.util.*; 11 | import java.util.Map.Entry; 12 | import java.util.regex.Pattern; 13 | import java.util.regex.Matcher; 14 | import java.util.stream.Collectors; 15 | 16 | import static java.nio.charset.StandardCharsets.ISO_8859_1; 17 | import static java.nio.charset.StandardCharsets.UTF_8; 18 | import static java.util.stream.Collectors.toMap; 19 | 20 | /** 21 | * Java implementation of the GPT3/4 tokenizer. 22 | *
<p>
23 | * Modifications:
24 | * <ul>
25 | *   <li>[MB] 2023-03-25: Repackaged from Tiktoken for inclusion in gpt3-tokenizer-java.</li>
26 | *   <li>[MB] 2023-04-02: Major refactoring for cleaner code and improved performance.</li>
27 | * </ul>
28 | */ 29 | public class GPT3Tokenizer { 30 | private final Map encoder; 31 | private final Map decoder; 32 | private final Map specialTokensEncoder; 33 | private final Map specialTokensDecoder; 34 | private final Pattern pattern; 35 | private final Pattern specialPattern; 36 | 37 | public GPT3Tokenizer(Encoding encoding) { 38 | this.encoder = encoding.mergeableRanks(); 39 | this.decoder = encoder.entrySet().stream() 40 | .collect(toMap(Entry::getValue, Entry::getKey)); 41 | this.specialTokensEncoder = encoding.specialTokens(); 42 | this.specialTokensDecoder = specialTokensEncoder.entrySet().stream() 43 | .collect(toMap(Entry::getValue, Entry::getKey)); 44 | this.pattern = encoding.pattern(); 45 | this.specialPattern = createSpecialRegex(encoding.specialTokens()); 46 | } 47 | 48 | protected Pattern createSpecialRegex(Map specialTokensEncoder) { 49 | String joinedPattern = specialTokensEncoder.keySet().stream() 50 | .map(Pattern::quote) 51 | .collect(Collectors.joining("|")); 52 | return Pattern.compile(joinedPattern); 53 | } 54 | 55 | public String decode(List tokens) { 56 | return decodeImpl(tokens); 57 | } 58 | 59 | protected String decodeImpl(List tokens) { 60 | ByteArrayOutputStream result = new ByteArrayOutputStream(); 61 | 62 | for (Integer token : tokens) { 63 | ByteSequence bytes = decoder.get(token); 64 | if (bytes != null) 65 | result.writeBytes(bytes.toByteArray()); 66 | else 67 | result.writeBytes(specialTokensDecoder.get(token).getBytes(ISO_8859_1)); 68 | } 69 | return result.toString(UTF_8); 70 | } 71 | 72 | /** 73 | * Returns the regular expression for detecting special tokens 74 | * 75 | * @return the special tokenizing pattern 76 | */ 77 | protected Pattern getTlSpecialRegex() { 78 | return specialPattern; 79 | } 80 | 81 | /** 82 | * Returns the regular expression for tokenizing text 83 | * 84 | * @return the tokenizing pattern 85 | */ 86 | protected Pattern getTlRegex() { 87 | return pattern; 88 | } 89 | 90 | public List encode(CharSequence text) 
{ 91 | return encode(text, false); 92 | } 93 | 94 | public List encode(CharSequence text, boolean allowedSpecial) { 95 | return encode(text, allowedSpecial? specialTokensEncoder.keySet() : Set.of()); 96 | } 97 | 98 | public List encode(CharSequence text, Set allowedSpecial) { 99 | return encodeImpl(text, allowedSpecial); 100 | } 101 | 102 | protected List encodeImpl(CharSequence text, Set allowedSpecial) { 103 | Pattern specialRegex = getTlSpecialRegex(); 104 | Pattern regex = getTlRegex(); 105 | List ret = new ArrayList<>(text.length() / 4); 106 | 107 | int start = 0; 108 | int lastPieceTokenLen = 0; 109 | while (true) { 110 | Matcher nextSpecial; 111 | int startFind = start; 112 | while (true) { 113 | // Find the next allowed special token, if any 114 | nextSpecial = specialRegex.matcher(text.subSequence(startFind, text.length())); 115 | if (nextSpecial.find()) { 116 | int startMatch = startFind + nextSpecial.start(); 117 | if (allowedSpecial.contains(text.subSequence(startMatch, startMatch + nextSpecial.group().length()).toString())) { 118 | break; 119 | } 120 | startFind = startMatch + 1; 121 | } else { 122 | nextSpecial = null; 123 | break; 124 | } 125 | } 126 | int end = (nextSpecial != null)? 
(start + nextSpecial.start()) : text.length(); 127 | 128 | // Tokenize the text using the regular expression 129 | Matcher matcher = regex.matcher(text.subSequence(start, end)); 130 | while (matcher.find()) { 131 | ByteSequence piece = ByteSequence.from(matcher.group()); 132 | Integer token = encoder.get(piece); 133 | if (token != null) { 134 | lastPieceTokenLen = 1; 135 | ret.add(token); 136 | } else { 137 | lastPieceTokenLen = bytePairMerge(piece, ret); 138 | } 139 | } 140 | 141 | // Add the special token if one was found 142 | if (nextSpecial != null) { 143 | String piece = nextSpecial.group(); 144 | Integer token = specialTokensEncoder.get(piece); 145 | ret.add(token); 146 | start += nextSpecial.end(); 147 | lastPieceTokenLen = 0; 148 | } else { 149 | break; 150 | } 151 | } 152 | 153 | // lastPieceTokenLen is how many tokens came from the last regex split. This is used 154 | // for determining unstable tokens, since you can't merge across (stable) regex splits 155 | return ret; 156 | } 157 | 158 | private static class IntPair { 159 | // Simple data structure for representing a pair of indices into a byte sequence 160 | int start, end; 161 | IntPair(int start, int end) { 162 | this.start = start; 163 | this.end = end; 164 | } 165 | } 166 | 167 | protected int getRank(ByteSequence piece, List partsList, int startIdx) { 168 | if (startIdx + 2 < partsList.size()) { 169 | ByteSequence bytes = piece.subSequence(partsList.get(startIdx).start, partsList.get(startIdx + 2).start); 170 | Integer rank = encoder.get(bytes); 171 | return (rank != null)? 
rank : Integer.MAX_VALUE; 172 | } else { 173 | return Integer.MAX_VALUE; 174 | } 175 | }; 176 | 177 | protected int bytePairMerge(ByteSequence piece, Collection result) { 178 | List parts = new ArrayList<>(piece.length() + 1); 179 | for (int i = 0; i <= piece.length(); i++) { 180 | parts.add(new IntPair(i, Integer.MAX_VALUE)); 181 | } 182 | 183 | for (int i = 0; i < parts.size() - 2; i++) { 184 | int rank = getRank(piece, parts, i); 185 | if (rank != Integer.MAX_VALUE) { 186 | parts.get(i).end = rank; 187 | } 188 | } 189 | 190 | while (parts.size() > 1) { 191 | int minRank = Integer.MAX_VALUE; 192 | int minIndex = -1; 193 | for (int i = 0; i < parts.size() - 1; i++) { 194 | int rank = parts.get(i).end; 195 | if (rank < minRank) { 196 | minRank = rank; 197 | minIndex = i; 198 | } 199 | } 200 | if (minRank == Integer.MAX_VALUE) { 201 | break; 202 | } 203 | parts.remove(minIndex + 1); 204 | parts.get(minIndex).end = getRank(piece, parts, minIndex); 205 | if (minIndex > 0) { 206 | parts.get(minIndex - 1).end = getRank(piece, parts, minIndex - 1); 207 | } 208 | } 209 | 210 | int resultCount = 0; 211 | for (int i = 0; i < parts.size() - 1; i++) { 212 | IntPair range = new IntPair(parts.get(i).start, parts.get(i + 1).start); 213 | result.add(encoder.get(piece.subSequence(range.start, range.end))); 214 | resultCount++; 215 | } 216 | 217 | return resultCount; 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/ModelType.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import java.lang.ref.SoftReference; 8 | import java.util.Collections; 9 | import java.util.EnumMap; 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | import java.util.Optional; 13 | 14 | /** 15 | * ModelType represents a list of available OpenAI GPT 
models, also providing information about 16 | * their maximum token size and encoding types. 17 | * 18 | * @author Mariusz Bernacki 19 | */ 20 | public enum ModelType { 21 | // chat 22 | GPT_4_O("gpt-4o", EncodingType.O200K_BASE, 128000, CompletionType.CHAT), 23 | GPT_4_TURBO("gpt-4-turbo-preview", EncodingType.CL100K_BASE, 128000, CompletionType.CHAT), 24 | GPT_4("gpt-4", EncodingType.CL100K_BASE, 8192, CompletionType.CHAT), 25 | GPT_4_32K("gpt-4-32k", EncodingType.CL100K_BASE, 32768, CompletionType.CHAT), 26 | GPT_3_5_TURBO("gpt-3.5-turbo", EncodingType.CL100K_BASE, 16384, CompletionType.CHAT), 27 | GPT_3_5_TURBO_LEGACY("gpt-3.5-turbo", EncodingType.CL100K_BASE, 4096, CompletionType.CHAT), 28 | GPT_3_5_TURBO_16K("gpt-3.5-turbo-16k", EncodingType.CL100K_BASE, 16384, CompletionType.CHAT), 29 | 30 | // text 31 | GPT_3_5_TURBO_INSTRUCT("gpt-3.5-turbo-instruct", EncodingType.CL100K_BASE, 4097, CompletionType.TEXT), 32 | TEXT_DAVINCI_003("text-davinci-003", EncodingType.P50K_BASE, 4097, CompletionType.TEXT), 33 | TEXT_DAVINCI_002("text-davinci-002", EncodingType.P50K_BASE, 4097, CompletionType.TEXT), 34 | TEXT_DAVINCI_001("text-davinci-001", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 35 | TEXT_CURIE_001("text-curie-001", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 36 | TEXT_BABBAGE_001("text-babbage-001", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 37 | TEXT_ADA_001("text-ada-001", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 38 | DAVINCI("davinci", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 39 | CURIE("curie", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 40 | BABBAGE("babbage", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 41 | ADA("ada", EncodingType.R50K_BASE, 2049, CompletionType.TEXT), 42 | 43 | // code 44 | CODE_DAVINCI_002("code-davinci-002", EncodingType.P50K_BASE, 8001, CompletionType.TEXT), 45 | 46 | // edit 47 | TEXT_DAVINCI_EDIT_001("text-davinci-edit-001", EncodingType.P50K_EDIT, 2049, 
CompletionType.TEXT), 48 | CODE_DAVINCI_EDIT_001("code-davinci-edit-001", EncodingType.P50K_EDIT, 2049, CompletionType.TEXT), 49 | 50 | // embeddings 51 | TEXT_EMBEDDING_ADA_002("text-embedding-ada-002", EncodingType.CL100K_BASE, 8192, CompletionType.TEXT); 52 | 53 | 54 | private final String modelName; 55 | private final EncodingType encodingType; 56 | private final int maxTokens; 57 | private final CompletionType completionType; 58 | 59 | ModelType(String modelName, EncodingType encodingType, int maxTokens, CompletionType completionType) { 60 | this.modelName = modelName; 61 | this.encodingType = encodingType; 62 | this.maxTokens = maxTokens; 63 | this.completionType = completionType; 64 | } 65 | 66 | public String modelName() { 67 | return modelName; 68 | } 69 | 70 | public EncodingType encodingType() { 71 | return encodingType; 72 | } 73 | 74 | public int maxTokens() { 75 | return maxTokens; 76 | } 77 | 78 | public CompletionType completionType() { 79 | return completionType; 80 | } 81 | 82 | /** 83 | * Returns a {@link ModelType} for the given modelName, or throw exception if no 84 | * such model type exists. 85 | * 86 | * @param modelName the modelName of the model type 87 | * @return the model type 88 | * @throws IllegalArgumentException if the model with the given name doesn't exist 89 | */ 90 | public static Optional forModel(String modelName) throws IllegalArgumentException { 91 | Optional modelType = forModelExact(modelName); 92 | if (modelType.isPresent()) { 93 | return modelType; 94 | } 95 | 96 | // Truncate model version information 97 | boolean shortMatch; 98 | if ((shortMatch = modelName.matches(".*-\\d{4}$")) || modelName.matches(".*-\\d{4}-\\d{2}-\\d{2}$")) { 99 | modelName = shortMatch ? 
modelName.substring(0, modelName.length() - 5) 100 | : modelName.substring(0, modelName.length() - 11); 101 | 102 | modelType = forModelExact(modelName); 103 | if (modelType.isPresent()) { 104 | return modelType; 105 | } 106 | } 107 | throw new IllegalArgumentException("Model `" + modelName + "` not found"); 108 | } 109 | 110 | private static Optional forModelExact(String modelName) { 111 | if (specialVariants.containsKey(modelName)) { 112 | return Optional.of(specialVariants.get(modelName)); 113 | } 114 | 115 | for (final ModelType modelType : values()) { 116 | if (modelType.modelName().equals(modelName)) { 117 | return Optional.of(modelType); 118 | } 119 | } 120 | return Optional.empty(); 121 | } 122 | 123 | private static final class Cache { 124 | 125 | private static final Map> gptTokenizersCache = Collections.synchronizedMap(new EnumMap<>(ModelType.class)); 126 | 127 | private static GPT3Tokenizer getTokenizer(ModelType model) { 128 | GPT3Tokenizer tokenizer; 129 | SoftReference ref = Cache.gptTokenizersCache.get(model); 130 | if (ref == null || (tokenizer = ref.get()) == null) { 131 | synchronized (gptTokenizersCache) { 132 | Cache.gptTokenizersCache.put(model, new SoftReference<>(tokenizer = new GPT3Tokenizer(model.getEncoding()))); 133 | } 134 | } 135 | 136 | return tokenizer; 137 | } 138 | } 139 | 140 | public Encoding getEncoding() { 141 | return Encoding.forName(encodingType().encodingName()); 142 | } 143 | 144 | public GPT3Tokenizer getTokenizer() { 145 | return Cache.getTokenizer(this); 146 | } 147 | 148 | /** 149 | * Returns the {@code ChatFormatDescriptor} for this model, which can be used together with 150 | * {@link TokenCount} to count prompt tokens in conversation messages. 151 | *
<p>
152 | * Please NOTE that this model bag doesn't distinguish between model variants 153 | * (e.g. -0314, -0613, etc.), thus for models gpt-3.5-*-0301 or older the returned descriptor 154 | * may be imprecise. If you need precise descriptor for old gpt-3.5-turbo model please use 155 | * {@link ChatFormatDescriptor#forModel(String)} method instead. 156 | * 157 | * @return the {@code ChatFormatDescriptor} 158 | */ 159 | public ChatFormatDescriptor getChatFormatDescriptor() { 160 | return ChatFormatDescriptor.forModel(modelName()); 161 | } 162 | 163 | private static Map specialVariants = new HashMap<>(); 164 | static { 165 | specialVariants.put("gpt-3.5-turbo-0301", GPT_3_5_TURBO_LEGACY); 166 | specialVariants.put("gpt-3.5-turbo-0613", GPT_3_5_TURBO_LEGACY); 167 | 168 | specialVariants.put("gpt-4-turbo-preview", GPT_4_TURBO); 169 | specialVariants.put("gpt-4-1106-preview", GPT_4_TURBO); 170 | specialVariants.put("gpt-4-0125-preview", GPT_4_TURBO); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/TokenCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import com.theokanning.openai.completion.chat.ChatFunction; 8 | import com.theokanning.openai.completion.chat.ChatMessage; 9 | 10 | import java.util.List; 11 | import java.util.function.Function; 12 | import java.util.stream.StreamSupport; 13 | 14 | /** 15 | * Utility class for calculating token count in text and chat messages. 16 | *
<p>
17 | * This class provides methods for counting tokens in text strings and lists of 18 | * {@link ChatMessage} objects using a {@link GPT3Tokenizer}. It also supports pluggable 19 | * {@link TokenCountSupport} implementations, allowing customization of token counting logic.
<p>
20 | * 21 | * @author Mariusz Bernacki 22 | * 23 | */ 24 | public class TokenCount { 25 | 26 | /** 27 | * Calculates the total token count from a list of lines using the given tokenizer, 28 | * including newline tokens between lines. 29 | * 30 | * @param lines an iterable of lines of text 31 | * @param tokenizer the tokenizer to use for token counting 32 | * @return the total token count, including newline tokens between lines 33 | */ 34 | public static int fromLinesJoined(Iterable lines, GPT3Tokenizer tokenizer) { 35 | int tokenCount = StreamSupport.stream(lines.spliterator(), false) 36 | .mapToInt(line -> fromString(line, tokenizer) + 1) 37 | .sum(); 38 | return Math.max(0, tokenCount - 1); // subtract 1 token for the last newline character 39 | } 40 | 41 | /** 42 | * Calculates the token count for a given text string using the provided tokenizer. 43 | * 44 | * @param text the text string to tokenize 45 | * @param tokenizer the tokenizer to use for token counting 46 | * @return the token count for the input text 47 | */ 48 | public static int fromString(String text, GPT3Tokenizer tokenizer) { 49 | return getSupport().countTokensFromString(text, tokenizer); 50 | } 51 | 52 | /** 53 | * Calculates the token count for a list of chat messages using the provided tokenizer 54 | * and chat format descriptor. 55 | * 56 | * @param messages a list of chat messages 57 | * @param model the model providing the tokenizer and chat format 58 | * @return the token count for the input chat messages 59 | */ 60 | public static int fromMessages(List messages, ModelType model) { 61 | return fromMessages(messages, List.of(), model); 62 | } 63 | 64 | /** 65 | * Calculates the token count for a list of chat messages using the provided tokenizer 66 | * and chat format descriptor.
67 | * 68 | * @param messages a list of chat messages 69 | * @param functions a list of chat functions 70 | * @param model the model 71 | * @return the token count for the input chat messages 72 | */ 73 | public static int fromMessages(List messages, List functions, ModelType model) { 74 | return fromMessages(messages, functions, model.getTokenizer(), model.getChatFormatDescriptor()); 75 | } 76 | 77 | /** 78 | * Counts number of prompt tokens in messages. 79 | */ 80 | public static int fromMessages(List messages, GPT3Tokenizer tokenizer, ChatFormatDescriptor chatFormat) { 81 | return fromMessages(messages, List.of(), tokenizer, chatFormat); 82 | } 83 | 84 | /** 85 | * Calculates the token count for a list of chat messages using the provided tokenizer 86 | * and chat format descriptor. 87 | * 88 | * @param messages a list of chat messages 89 | * @param functions a list of chat functions 90 | * @param tokenizer the tokenizer to use for token counting 91 | * @param chatFormat the descriptor defining the chat format 92 | * @return the token count for the input chat messages 93 | */ 94 | public static int fromMessages(List messages, List functions, GPT3Tokenizer tokenizer, ChatFormatDescriptor chatFormat) { 95 | return fromMessages(messages, TokenizableMessage.from( 96 | ChatMessage::getRole, 97 | ChatMessage::getContent, 98 | ChatMessage::getName, 99 | chatMessage -> (chatMessage.getFunctionCall() == null)? TokenizableFunctionCall.NONE 100 | : TokenizableFunctionCall.of(chatMessage.getFunctionCall().getName(), chatMessage.getFunctionCall().getArguments().toString()) 101 | ), functions, TokenizableFunction.from( 102 | ChatFunction::getName, 103 | ChatFunction::getDescription, 104 | chatFunction -> getSupport().generateJsonSchema(chatFunction.getParametersClass()) 105 | ), chatFormat, tokenizer); 106 | } 107 | 108 | /** 109 | * Counts number of prompt tokens in messages. 
110 | */ 111 | public static int fromMessages( 112 | List messages, 113 | List tools, 114 | ChatFormatDescriptor chatFormat, 115 | GPT3Tokenizer tokenizer) { 116 | 117 | return fromMessages(messages, Function.identity(), tools, Function.identity(), chatFormat, tokenizer); 118 | } 119 | 120 | /** 121 | * Counts number of prompt tokens in messages. 122 | */ 123 | public static int fromMessages( 124 | List messages, 125 | Function messageCoercer, 126 | List tools, 127 | Function toolCoercer, 128 | ChatFormatDescriptor chatFormat, 129 | GPT3Tokenizer tokenizer) { 130 | 131 | return getSupport().countTokensFromMessages(messages, messageCoercer, tools, toolCoercer, tokenizer, chatFormat); 132 | } 133 | 134 | /** 135 | * Returns the tokenization support object. 136 | * 137 | * @return the instance of {@link TokenCountSupport} 138 | */ 139 | private static TokenCountSupport getSupport() { 140 | return TokenCountSupport.getSupport(); 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/TokenCountSupport.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import com.fasterxml.jackson.databind.JsonNode; 8 | import com.fasterxml.jackson.databind.ObjectMapper; 9 | import com.github.victools.jsonschema.generator.*; 10 | import com.github.victools.jsonschema.module.jackson.JacksonModule; 11 | import com.github.victools.jsonschema.module.jackson.JacksonOption; 12 | 13 | import javax.json.Json; 14 | import javax.json.JsonObject; 15 | import javax.json.JsonString; 16 | import javax.json.JsonValue; 17 | import java.io.StringReader; 18 | import java.util.Comparator; 19 | import java.util.List; 20 | import java.util.Map; 21 | import java.util.ServiceLoader; 22 | import java.util.function.Function; 23 | 24 | import static 
java.util.stream.Collectors.groupingBy; 25 | import static javax.json.JsonValue.EMPTY_JSON_ARRAY; 26 | import static javax.json.JsonValue.EMPTY_JSON_OBJECT; 27 | import static javax.json.JsonValue.ValueType.STRING; 28 | 29 | /** 30 | * Supports the pluggable token counting logic. 31 | */ 32 | public class TokenCountSupport { 33 | 34 | private static final FunctionDocumenter standardDocumenter = new StandardFunctionDocumenter(); 35 | 36 | public int countTokensFromString(String text, GPT3Tokenizer tokenizer) { 37 | return tokenizer.encode(text).size(); 38 | } 39 | 40 | public int countTokensFromMessages( 41 | List messages, 42 | Function messageCoercer, 43 | List tools, 44 | Function toolCoercer, 45 | GPT3Tokenizer tokenizer, 46 | ChatFormatDescriptor chatFormat) 47 | { 48 | var toolsPrompt = ""; 49 | if (!tools.isEmpty()) { 50 | var tokenizable = tools.stream() 51 | .map(toolCoercer) 52 | .toList(); 53 | toolsPrompt = generateDocumentation(tokenizable); 54 | } 55 | 56 | int tokenCount = 0; 57 | for (int index = 0; index < messages.size(); index++) { 58 | var tokenizable = messageCoercer.apply(messages.get(index)); 59 | tokenCount += chatFormat.extraTokenCountPerMessage(); 60 | 61 | var role = tokenizable.role(); 62 | if (role != null && !role.isEmpty()) 63 | tokenCount += tokenizer.encode(role).size(); 64 | 65 | var content = tokenizable.content(); 66 | if (content != null && role != null && index == 0 && "system".equals(role.toString())) { 67 | content += "\n\n" + toolsPrompt; 68 | toolsPrompt = ""; 69 | } 70 | if (content != null) 71 | tokenCount += tokenizer.encode(content).size(); 72 | 73 | var functionCall = tokenizable.functionCall(); 74 | if (functionCall.isPresent()) { 75 | tokenCount += tokenizer.encode(functionCall.name()).size(); 76 | tokenCount += tokenizer.encode(functionCall.arguments()).size(); 77 | tokenCount += chatFormat.extraTokenCountPerFunctionCall(); 78 | } 79 | } 80 | tokenCount += chatFormat.extraTokenCountPerRequest(); // Every reply is 
primed with assistant\n 81 | 82 | if (!tools.isEmpty()) { 83 | if (!toolsPrompt.isEmpty()) { 84 | tokenCount += chatFormat.extraTokenCountPerMessage(); 85 | tokenCount += tokenizer.encode("system").size(); 86 | tokenCount += tokenizer.encode(toolsPrompt).size(); 87 | } 88 | tokenCount += chatFormat.extraTokenCountForFunctions(); 89 | } 90 | 91 | return tokenCount; 92 | } 93 | 94 | public JsonObject generateJsonSchema(Class valueType) { 95 | JsonNode schemaNode = JsonSchemaUtils.generateSchema(valueType); 96 | return Json.createReader(new StringReader(schemaNode.toString())).readObject(); 97 | } 98 | 99 | public static TokenCountSupport getSupport() { 100 | return LazyHolder.INSTANCE; 101 | } 102 | 103 | private static final class LazyHolder { 104 | private static final TokenCountSupport INSTANCE = ServiceLoader.load(TokenCountSupport.class) 105 | .findFirst().orElseGet(TokenCountSupport::new); 106 | } 107 | 108 | private static final class JsonSchemaUtils { 109 | private static final Comparator> DECLARATION_ORDER = (__, ___) -> 0; 110 | private static final ObjectMapper mapper = new ObjectMapper(); 111 | private static final SchemaGenerator generator; 112 | static { 113 | SchemaGeneratorConfigBuilder configBuilder = new SchemaGeneratorConfigBuilder(mapper, SchemaVersion.DRAFT_2019_09, OptionPreset.PLAIN_JSON) 114 | .with(new JacksonModule(JacksonOption.RESPECT_JSONPROPERTY_REQUIRED)); 115 | configBuilder.forTypesInGeneral().withPropertySorter(DECLARATION_ORDER); 116 | generator = new SchemaGenerator(configBuilder.build()); 117 | } 118 | 119 | public static JsonNode generateSchema(Class valueType) { 120 | return generator.generateSchema(valueType); 121 | } 122 | } 123 | 124 | public String generateDocumentation(List tools) { 125 | StringBuilder sb = new StringBuilder(); 126 | 127 | sb.append("# Tools\n\n"); 128 | 129 | Map> toolsByCategory = tools.stream() 130 | .collect(groupingBy(TokenizableTool::toolCategory)); 131 | 132 | for (Map.Entry> categoryEntry : 
toolsByCategory.entrySet()) { 133 | sb.append("## ").append(categoryEntry.getKey()).append("\n\n"); 134 | 135 | Map> toolsByNamespace = categoryEntry.getValue().stream() 136 | .collect(groupingBy(TokenizableTool::toolNamespace)); 137 | 138 | for (Map.Entry> namespaceEntry : toolsByNamespace.entrySet()) { 139 | sb.append("namespace ").append(namespaceEntry.getKey()).append(" {\n\n"); 140 | for (TokenizableTool tool : namespaceEntry.getValue()) { 141 | sb.append(tool.generateDocumentation()).append("\n\n"); 142 | } 143 | sb.append("} // namespace ").append(namespaceEntry.getKey()).append("\n\n"); 144 | } 145 | } 146 | 147 | return sb.toString().stripTrailing(); 148 | } 149 | 150 | 151 | public interface FunctionDocumenter { 152 | CharSequence generateDocumentation(TokenizableFunction function); 153 | } 154 | 155 | public FunctionDocumenter getFunctionDocumenter(TokenizableFunction function) { 156 | return standardDocumenter; 157 | } 158 | 159 | private static class StandardFunctionDocumenter implements FunctionDocumenter { 160 | 161 | @Override 162 | public String generateDocumentation(TokenizableFunction function) { 163 | JsonObject params = function.parameters(); 164 | StringBuilder buf = new StringBuilder(); 165 | if (!function.description().isEmpty()) 166 | putDescription(buf, function.description()); 167 | 168 | putName(buf, function.name()); 169 | putParameters(buf, params, ""); 170 | putEnd(buf); 171 | 172 | return buf.toString(); 173 | } 174 | 175 | private static void putDescription(StringBuilder buf, JsonObject schema) { 176 | var description = schema.getString("description", "").strip(); 177 | putDescription(buf, description); 178 | } 179 | 180 | private static void putDescription(StringBuilder buf, String description) { 181 | if (!description.isEmpty()) 182 | description.lines().forEach(line -> buf.append("// ").append(line).append('\n')); 183 | } 184 | 185 | private static void putName(StringBuilder buf, String name) { 186 | buf.append("type ") 187 | 
.append(name) 188 | .append(" = (_: "); 189 | } 190 | 191 | private static void putParameters(StringBuilder buf, Map schema, String indent) { 192 | var properties = schema.getOrDefault("properties", EMPTY_JSON_OBJECT).asJsonObject(); 193 | var required = schema.getOrDefault("required", EMPTY_JSON_ARRAY).asJsonArray(); 194 | var definitions = schema.getOrDefault("definitions", EMPTY_JSON_OBJECT).asJsonObject(); 195 | putProperties( buf, 196 | properties, 197 | required.getValuesAs(JsonString::getString), 198 | definitions, 199 | indent); 200 | } 201 | 202 | private static void putProperties(StringBuilder buf, JsonObject schema, List required, Map definitions, String indent) { 203 | buf.append("{\n"); 204 | schema.forEach((name, value) -> { 205 | var valueDesc = value.asJsonObject(); 206 | if (indent.isEmpty()) 207 | putDescription(buf, valueDesc); 208 | 209 | buf.append(indent); 210 | buf.append(name); 211 | if (!isNested(indent) && !required.contains(name)) 212 | buf.append('?'); 213 | 214 | buf.append(": "); 215 | putParameterType(buf, valueDesc, indent); 216 | buf.append(",\n"); 217 | }); 218 | buf.append("}"); 219 | } 220 | 221 | private static void putParameterType(StringBuilder buf, JsonObject valueDesc, String indent) { 222 | var typeDesc = valueDesc.get("type"); 223 | if (typeDesc == null || typeDesc.getValueType() != STRING) { 224 | buf.append("any"); 225 | return; 226 | } 227 | 228 | if (valueDesc.containsKey("enum")) { 229 | buf.append(String.join(" | ", valueDesc.getJsonArray("enum").getValuesAs(JsonValue::toString))); 230 | return; 231 | } 232 | 233 | if (valueDesc.get("items") instanceof JsonObject arrayDesc && arrayDesc.containsKey("type")) { 234 | putParameterType(buf, arrayDesc, indent); 235 | buf.append("[]"); 236 | return; 237 | } 238 | 239 | var typeName = valueDesc.getString("type", "any"); 240 | switch (typeName) { 241 | case "integer", "number" -> buf.append("number"); 242 | case "boolean", "string" -> buf.append(typeName); 243 | case "object" 
-> putParameters(buf, valueDesc, " "); 244 | default -> buf.append("any"); 245 | } 246 | } 247 | 248 | private static void putEnd(StringBuilder buf) { 249 | buf.append(") => any;"); 250 | } 251 | 252 | private static boolean isNested(String indent) { 253 | return !indent.isEmpty(); 254 | } 255 | } 256 | } 257 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/TokenizableFunction.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import javax.json.JsonObject; 8 | import javax.json.JsonValue; 9 | import java.util.function.Function; 10 | 11 | /** 12 | * The TokenizableFunction interface represents a function that can be tokenized. 13 | *

14 | * A function consists of a name, description, and parameters, which are accessible through 15 | * their respective methods. 16 | *

17 | * The interface also provides methods to generate documentation for the function and to 18 | * create new instances of TokenizableFunction. 19 | * 20 | * @author Mariusz Bernacki 21 | */ 22 | public interface TokenizableFunction extends TokenizableTool { 23 | 24 | /** 25 | * Returns the name of the function. 26 | * 27 | * @return the function name 28 | */ 29 | String name(); 30 | 31 | /** 32 | * Returns the description of the function. 33 | * 34 | * @return the function description 35 | */ 36 | String description(); 37 | 38 | /** 39 | * Returns the parameters of the function as a JsonObject. 40 | * 41 | * @return the function parameters 42 | */ 43 | JsonObject parameters(); 44 | 45 | /** 46 | * Generates a documentation for the function. The generated documentation 47 | * serves as a basis for counting tokens used by the function definition 48 | * when passed in chat conversation. 49 | * 50 | * @return the function documentation 51 | */ 52 | @Override 53 | default CharSequence generateDocumentation() { 54 | return TokenCountSupport.getSupport().getFunctionDocumenter(this).generateDocumentation(this); 55 | } 56 | 57 | /** 58 | * Returns the category of the tool. In this case, it's "functions". 59 | * 60 | * @return the tool category 61 | */ 62 | @Override 63 | default String toolCategory() { 64 | return "functions"; 65 | } 66 | 67 | /** 68 | * Returns the namespace of the tool. In this case, it's "functions". 69 | * 70 | * @return the tool namespace 71 | */ 72 | @Override 73 | default String toolNamespace() { 74 | return "functions"; 75 | } 76 | 77 | /** 78 | * Creates a function that is able to convert any type of object into an instance of 79 | * {@code TokenizableFunction} using the specified relevant property accessors. 
80 | * 81 | * @param nameAccessor the function name accessor 82 | * @param descAccessor the function description accessor 83 | * @param paramsAccessor the function parameters accessor 84 | * @return a {@code TokenizableFunction} coercing function 85 | */ 86 | static Function from( 87 | Function nameAccessor, 88 | Function descAccessor, 89 | Function paramsAccessor 90 | ) { 91 | return function -> of( 92 | nameAccessor.apply(function), 93 | descAccessor.apply(function), 94 | paramsAccessor.apply(function) 95 | ); 96 | } 97 | 98 | /** 99 | * Creates a new instance of {@code TokenizableFunction} for the specified arguments. 100 | * 101 | * @param name the function name 102 | * @param description the function description 103 | * @param parameters the function parameters 104 | * @return a new {@code TokenizableFunction} object 105 | */ 106 | static TokenizableFunction of(String name, String description, JsonObject parameters) { 107 | return new Of(name, description, parameters); 108 | } 109 | 110 | record Of(String name, String description, JsonObject parameters) implements TokenizableFunction { 111 | public Of { 112 | name = firstOrElse(name, ""); 113 | description = firstOrElse(description, ""); 114 | parameters = firstOrElse(parameters, JsonValue.EMPTY_JSON_OBJECT); 115 | } 116 | 117 | private static V firstOrElse(V first, V orElse) { return (first != null) ? first : orElse; } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/TokenizableFunctionCall.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | /** 8 | * The TokenizableFunctionCall interface represents a message from the "assistant" that 9 | * intends to make a function call instead of providing the usual content. 10 | *

11 | * This interface provides access to the name of the function that the model intends to 12 | * call and the arguments for the function. The arguments are provided as a stringified 13 | * JSON object. Note that the JSON returned by the model may be invalid or may not 14 | * adhere to the schema. 15 | *

16 | * The interface also provides a method to check if the function call is present and a 17 | * factory method to create new instances of {@code TokenizableFunctionCall}. 18 | * 19 | * @author Mariusz Bernacki 20 | */ 21 | public interface TokenizableFunctionCall { 22 | 23 | /** The constant representing an absent function call. */ 24 | TokenizableFunctionCall NONE = new Of("", ""); 25 | 26 | /** 27 | * The name of the function that the model decided to call. 28 | * 29 | * @return the function name 30 | */ 31 | CharSequence name(); 32 | 33 | /** 34 | * The arguments for the function. A stringified JSON object (be aware 35 | * that the JSON returned be the model could be invalid or may not adhere to the schema) 36 | * 37 | * @return the stringified JSON function arguments 38 | */ 39 | CharSequence arguments(); 40 | 41 | /** 42 | * Checks if this object represents a non-empty function call. 43 | *

44 | * The default implementation returns the result of calling {@code !name().isEmpty()}. 45 | * 46 | * @return {@code true} if a function call is present, otherwise {@code false} 47 | */ 48 | default boolean isPresent() { 49 | return !name().isEmpty(); 50 | } 51 | 52 | /** 53 | * Creates a new {@code TokenizableFunctionCall} from the specified arguments. 54 | * 55 | * @param name the function name 56 | * @param arguments the function arguments 57 | * @return a new {@code TokenizableFunctionCall} object, or {@link #NONE} if 58 | * the provided {@code name} is empty 59 | */ 60 | static TokenizableFunctionCall of(CharSequence name, CharSequence arguments) { 61 | if (name.isEmpty()) { 62 | return NONE; 63 | } 64 | if (arguments == null) { 65 | arguments = ""; 66 | } 67 | return new Of(name, arguments); 68 | } 69 | 70 | record Of(CharSequence name, CharSequence arguments) implements TokenizableFunctionCall { } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/TokenizableMessage.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import java.util.function.Function; 8 | 9 | /** 10 | * The TokenizableMessage interface represents a message that can be tokenized or used together 11 | * with the {@link TokenCount} utility for the purpose of token counting. 12 | *

13 | * A message comprises a role, name, content, and a function call, all of which are accessible 14 | * through their respective accessor methods. All methods should always return non-null values, 15 | * with empty {@code CharSequence} for missing content, or {@link TokenizableFunctionCall#NONE} 16 | * for an absent function call. 17 | * 18 | *

Any custom class can be converted into an instance of {@link TokenizableMessage} 19 | * using the {@link #from(Function, Function, Function, Function) factory method}. 20 | *
Example: 21 | *

 22 |  * {@code
 23 |  * TokenizableMessage message = TokenizableMessage.from(
 24 |  *     MyObj::getRole,
 25 |  *     MyObj::getContent,
 26 |  *     MyObj::getName,
 27 |  *     MyObj::getFunctionCall
 28 |  * ).apply(myObj);
 29 |  * }
 30 |  * 
31 | * 32 | * @author Mariusz Bernacki 33 | */ 34 | public interface TokenizableMessage { 35 | 36 | /** 37 | * Returns the role of the message's author. Can be system, user, assistant, or function. 38 | * 39 | * @return the role of the message's author 40 | */ 41 | CharSequence role(); 42 | 43 | /** 44 | * Returns the content of the message. Content is required for all messages, 45 | * except for assistant messages with function calls. 46 | * 47 | * @return the content of the message, or empty {@code CharSequence} if not provided 48 | */ 49 | CharSequence content(); 50 | 51 | /** 52 | * Returns the name of the message's author. Name is required if role is function, 53 | * and it should be the name of the function whose response is in the content. 54 | * 55 | * @return the name of the message's author, or empty {@code CharSequence} if not 56 | * provided 57 | */ 58 | CharSequence name(); 59 | 60 | /** 61 | * Returns the function call that should be made, as generated by the model. 62 | * 63 | * @return the function call, {@link TokenizableFunctionCall#NONE} if absent 64 | */ 65 | TokenizableFunctionCall functionCall(); 66 | 67 | /** 68 | * Static method to create a new tokenizable message, based on the provided accessors. 
69 | * 70 | * @param the type of the message 71 | * @param roleAccessor the role accessor function 72 | * @param nameAccessor the name accessor function 73 | * @param contentAccessor the content accessor function 74 | * @param functionCallMaker the function call maker function 75 | * @return a function that creates a tokenizable message 76 | */ 77 | static Function from( 78 | Function roleAccessor, 79 | Function contentAccessor, 80 | Function nameAccessor, 81 | Function functionCallMaker 82 | ) { 83 | return message -> of( 84 | roleAccessor.apply(message), 85 | contentAccessor.apply(message), 86 | nameAccessor.apply(message), 87 | functionCallMaker.apply(message) 88 | ); 89 | } 90 | 91 | /** 92 | * Constructs a new assistant, system, or user message with the specified content. 93 | * 94 | * @param role the author's role 95 | * @param content the message content 96 | * @return the {@code TokenizableMessage} 97 | */ 98 | static TokenizableMessage of(CharSequence role, CharSequence content) { 99 | return of(role, content, "", TokenizableFunctionCall.NONE); 100 | } 101 | 102 | /** 103 | * Constructs a new assistant function call with the specified arguments. 104 | * 105 | * @param role the author's role 106 | * @param functionCall the function call name and arguments 107 | * @return the {@code TokenizableMessage} 108 | */ 109 | static TokenizableMessage of(CharSequence role, TokenizableFunctionCall functionCall) { 110 | return of(role, "", "", functionCall); 111 | } 112 | 113 | /** 114 | * Constructs a function message, representing a response with the specified arguments. 
115 | * 116 | * @param role the author's role 117 | * @param content the message content 118 | * @param name the author's name 119 | * @return the {@code TokenizableMessage} 120 | */ 121 | static TokenizableMessage of(CharSequence role, CharSequence content, CharSequence name) { 122 | return new Of(role, content, name, TokenizableFunctionCall.NONE); 123 | } 124 | 125 | /** 126 | * Constructs a new {@code TokenizableMessage} from the specified arguments. 127 | * 128 | * @param role the author's role 129 | * @param content the message content 130 | * @param name the author's name 131 | * @param functionCall the function call name and arguments 132 | * @return the {@code TokenizableMessage} 133 | */ 134 | static TokenizableMessage of(CharSequence role, CharSequence content, CharSequence name, TokenizableFunctionCall functionCall) { 135 | return new Of(role, content, name, functionCall); 136 | } 137 | 138 | record Of(CharSequence role, CharSequence content, CharSequence name, TokenizableFunctionCall functionCall) implements TokenizableMessage { 139 | public Of { 140 | role = firstOrElse(role, ""); 141 | content = firstOrElse(content, ""); 142 | name = firstOrElse(name, ""); 143 | functionCall = firstOrElse(functionCall, TokenizableFunctionCall.NONE); 144 | } 145 | 146 | private static V firstOrElse(V first, V orElse) { return (first != null) ? first : orElse; } 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/main/java/com/didalgo/gpt3/TokenizableTool.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | /** 8 | * An interface describing an object that provides a tool support for a language model. 9 | *

10 | * Currently only tools supported by OpenAI models are functions. 11 | * 12 | * @author Mariusz Bernacki 13 | */ 14 | public interface TokenizableTool { 15 | 16 | String toolCategory(); 17 | 18 | String toolNamespace(); 19 | 20 | CharSequence generateDocumentation(); 21 | } 22 | -------------------------------------------------------------------------------- /src/test/java/com/didalgo/gpt3/ByteSequenceTest.java: -------------------------------------------------------------------------------- 1 | package com.didalgo.gpt3; 2 | 3 | import org.junit.jupiter.api.BeforeEach; 4 | import org.junit.jupiter.api.Test; 5 | 6 | import static java.nio.charset.StandardCharsets.UTF_8; 7 | import static org.junit.jupiter.api.Assertions.*; 8 | 9 | class ByteSequenceTest { 10 | 11 | private byte[] TEST_SEQUENCE_BYTES; 12 | private ByteSequence TEST_SEQUENCE; 13 | 14 | @BeforeEach 15 | void setUp() { 16 | TEST_SEQUENCE = ByteSequence.of( TEST_SEQUENCE_BYTES = "TEST_SEQUENCE".getBytes() ); 17 | } 18 | 19 | @Test 20 | void byteAt_gives_byte_at_requested_position() { 21 | assertEquals((byte) '1', ByteSequence.from("1").byteAt(0)); 22 | assertEquals((byte) '2', ByteSequence.from("12").byteAt(1)); 23 | assertEquals((byte) '9', ByteSequence.from("123456789").byteAt(8)); 24 | } 25 | 26 | @Test 27 | void length_gives_number_of_bytes_in_sequence() { 28 | assertEquals(0, ByteSequence.EMPTY.length()); 29 | assertEquals(1, ByteSequence.from("1").length()); 30 | assertEquals(9, ByteSequence.from("123456789").length()); 31 | } 32 | 33 | @Test 34 | void subSequence_gives_subsequence_between_given_start_and_end() { 35 | assertEquals(ByteSequence.EMPTY, ByteSequence.from("123456789").subSequence(9, 9)); 36 | assertEquals(ByteSequence.from("1"), ByteSequence.from("123456789").subSequence(0, 1)); 37 | assertEquals(ByteSequence.from("9"), ByteSequence.from("123456789").subSequence(8, 9)); 38 | } 39 | 40 | @Test 41 | void hashCode_gives_identical_hashCode_for_identical_sequences() { 42 | ByteSequence 
aSequence = ByteSequence.from("TEST_SEQUENCE"); 43 | ByteSequence anotherSequence = ByteSequence.from("TEST_SEQUENCE"); 44 | assertEquals(anotherSequence.hashCode(), aSequence.hashCode()); 45 | } 46 | 47 | @Test 48 | void equals_identifies_identical_byte_sequences() { 49 | ByteSequence aSequence = ByteSequence.from("TEST_SEQUENCE"); 50 | ByteSequence anotherSequence = ByteSequence.from("TEST_SEQUENCE"); 51 | assertEquals(anotherSequence, aSequence); 52 | } 53 | 54 | @Test 55 | void toByteArray_produces_correct_byte_array_representation() { 56 | assertArrayEquals(TEST_SEQUENCE_BYTES, TEST_SEQUENCE.toByteArray()); 57 | } 58 | 59 | @Test 60 | void toString_gives_string_representation_using_given_charset() { 61 | String stringRepresentation = "TEST_SEQUENCE"; 62 | ByteSequence aSequence = ByteSequence.from(stringRepresentation); 63 | assertEquals(stringRepresentation, aSequence.toString(UTF_8)); 64 | } 65 | 66 | @Test 67 | void copyOf_creates_distinct_copy_when_not_immutable() { 68 | var copy = ByteSequence.copyOf(TEST_SEQUENCE); 69 | assertEquals(TEST_SEQUENCE, copy); 70 | assertSame(TEST_SEQUENCE, copy); 71 | } 72 | 73 | @Test 74 | void from_converts_string_to_byte_sequence_using_utf8() { 75 | var string = "TEST_SEQUENCE"; 76 | var fromString = ByteSequence.from(string); 77 | assertArrayEquals(string.getBytes(UTF_8), fromString.toByteArray()); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/test/java/com/didalgo/gpt3/GPT3TokenizerTest.java: -------------------------------------------------------------------------------- 1 | package com.didalgo.gpt3; 2 | 3 | import org.junit.jupiter.params.ParameterizedTest; 4 | import org.junit.jupiter.params.converter.ConvertWith; 5 | import org.junit.jupiter.params.provider.CsvSource; 6 | 7 | import java.util.List; 8 | 9 | import static org.junit.jupiter.api.Assertions.*; 10 | 11 | class GPT3TokenizerTest { 12 | 13 | @ParameterizedTest 14 | @CsvSource({ 15 | "gpt-4, 'Stop!', 
'[10903, 0]'", 16 | "gpt-4, 'Stop now.', '[10903, 1457, 13]'", 17 | "gpt-4, 'Stop what you''re doing.', '[10903, 1148, 499, 2351, 3815, 13]'", 18 | "gpt-4, 'Stop what you''re doing right now.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 13]'", 19 | "gpt-4, 'Stop what you''re doing right now and listen.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 13]'", 20 | "gpt-4, 'Stop what you''re doing right now and listen carefully.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 13]'", 21 | "gpt-4, 'Stop what you''re doing right now and listen carefully to me.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 13]'", 22 | "gpt-4, 'Stop what you''re doing right now and listen carefully to me, please.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 11, 4587, 13]'", 23 | "gpt-4, 'Stop what you''re doing right now and listen carefully to me, please, because this is important.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 11, 4587, 11, 1606, 420, 374, 3062, 13]'", 24 | "gpt-4, 'Stop what you''re doing right now and listen carefully to me, please, because this is very important.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 11, 4587, 11, 1606, 420, 374, 1633, 3062, 13]'", 25 | "gpt-4, 'Przestań!', '[3617, 89, 30279, 19699, 0]'", 26 | "gpt-4, 'Przerwij to.', '[3617, 7215, 87183, 311, 13]'", 27 | "gpt-4, 'Przerwij to, co robisz.', '[3617, 7215, 87183, 311, 11, 1080, 10773, 70828, 13]'", 28 | "gpt-4, 'Przerwij to, co teraz robisz.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 13]'", 29 | "gpt-4, 'Przerwij to, co teraz robisz i posłuchaj.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 13]'", 30 | "gpt-4, 'Przerwij to, co teraz robisz i posłuchaj uważnie.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 577, 10196, 6077, 11044, 13]'", 31 | "gpt-4, 
'Przerwij to, co teraz robisz i posłuchaj mnie uważnie.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 577, 10196, 6077, 11044, 13]'", 32 | "gpt-4, 'Przerwij to, co teraz robisz i posłuchaj mnie uważnie, proszę.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 577, 10196, 6077, 11044, 11, 8882, 60705, 13]'", 33 | "gpt-4, 'Przerwij to, co teraz robisz i posłuchaj mnie proszę uważnie, bo to ważne.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 8882, 60705, 577, 10196, 6077, 11044, 11, 712, 311, 10667, 6077, 818, 13]'", 34 | "gpt-4, 'Przerwij to, co teraz robisz i posłuchaj mnie proszę uważnie, bo to bardzo ważne.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 8882, 60705, 577, 10196, 6077, 11044, 11, 712, 311, 57958, 10667, 6077, 818, 13]'", 35 | "gpt-4, 'СТІЙ!', '[19871, 35095, 140, 228, 140, 247, 0]'", 36 | "gpt-4, 'Припини зараз.', '[17279, 31203, 8164, 19479, 1840, 44946, 89554, 13]'", 37 | "gpt-4, 'Припини те, що ти робиш.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 18600, 14082, 1840, 12426, 13]'", 38 | "gpt-4, 'Припини те, що ти робиш зараз.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 18600, 14082, 1840, 12426, 44946, 89554, 13]'", 39 | "gpt-4, 'Припини те, що ти зараз робиш, і послухай.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 61813, 3865, 10693, 19039, 13]'", 40 | "gpt-4, 'Припини те, що ти зараз робиш, і уважно слухай.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 14257, 5591, 38657, 13999, 35875, 3865, 10693, 19039, 13]'", 41 | "gpt-4, 'Припини те, що ти зараз робиш, і вислухай мене 
уважно.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 5927, 13810, 3114, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 13]'", 42 | "gpt-4, 'Припини те, що ти зараз робиш, і вислухай мене уважно, будь ласка.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 5927, 13810, 3114, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 11, 51570, 4929, 26539, 18437, 13433, 13]'", 43 | "gpt-4, 'Припини те, що ти зараз робиш, і вислухай мене уважно, будь ласка, тому що це важливо.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 5927, 13810, 3114, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 11, 51570, 4929, 26539, 18437, 13433, 11, 11047, 72952, 9015, 231, 1482, 39233, 1532, 5927, 38657, 11320, 5591, 1482, 13]'", 44 | "gpt-4, 'Припини те, що ти зараз робиш, і слухай мене уважно, бо це дуже важливо.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 35875, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 11, 14391, 1482, 39233, 1532, 7952, 56999, 1532, 5927, 38657, 11320, 5591, 1482, 13]'", 45 | "gpt-4, 'Σταμάτα!', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 0]'", 46 | "gpt-4, 'Σταμάτα τώρα.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 39570, 139, 236, 39179, 19481, 13]'", 47 | "gpt-4, 'Σταμάτα αυτό που κάνεις.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 13]'", 48 | "gpt-4, 'Σταμάτα αυτό που κάνεις αμέσως τώρα.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 19581, 44223, 80531, 45028, 
57971, 46742, 39570, 139, 236, 39179, 19481, 13]'", 49 | "gpt-4, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 19581, 44223, 80531, 45028, 57971, 46742, 39570, 139, 236, 39179, 19481, 72738, 90002, 8008, 105, 68437, 73986, 13]'", 50 | "gpt-4, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 19581, 44223, 80531, 45028, 57971, 46742, 39570, 139, 236, 39179, 19481, 72738, 90002, 8008, 105, 68437, 73986, 52845, 39179, 28654, 45028, 31243, 68437, 36924, 30862, 68437, 75234, 13]'", 51 | "gpt-4, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 19581, 44223, 80531, 45028, 57971, 46742, 39570, 139, 236, 39179, 19481, 72738, 90002, 8008, 105, 68437, 73986, 52845, 39179, 28654, 45028, 31243, 68437, 36924, 30862, 68437, 75234, 60247, 44223, 80531, 34369, 19481, 13]'", 52 | "gpt-4, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα, παρακαλώ.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 19581, 44223, 80531, 45028, 57971, 46742, 39570, 139, 236, 39179, 19481, 72738, 90002, 8008, 105, 68437, 73986, 52845, 39179, 28654, 45028, 31243, 68437, 36924, 30862, 68437, 75234, 60247, 44223, 80531, 34369, 19481, 11, 52845, 19481, 39179, 19481, 68437, 19481, 34586, 139, 236, 13]'", 53 | "gpt-4, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα, παρακαλώ, γιατί αυτό είναι σημαντικό.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 19581, 44223, 80531, 
45028, 57971, 46742, 39570, 139, 236, 39179, 19481, 72738, 90002, 8008, 105, 68437, 73986, 52845, 39179, 28654, 45028, 31243, 68437, 36924, 30862, 68437, 75234, 60247, 44223, 80531, 34369, 19481, 11, 52845, 19481, 39179, 19481, 68437, 19481, 34586, 139, 236, 11, 63127, 30862, 19481, 36924, 55241, 19581, 54556, 36924, 76295, 60247, 55241, 34369, 90002, 48823, 42524, 44223, 19481, 34369, 36924, 30862, 68437, 76295, 13]'", 54 | "gpt-4, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα, παρακαλώ, γιατί αυτό είναι πολύ σημαντικό.', '[138, 96, 36924, 19481, 44223, 75234, 36924, 19481, 19581, 54556, 36924, 76295, 52845, 73986, 72738, 75234, 34369, 31243, 30862, 46742, 19581, 44223, 80531, 45028, 57971, 46742, 39570, 139, 236, 39179, 19481, 72738, 90002, 8008, 105, 68437, 73986, 52845, 39179, 28654, 45028, 31243, 68437, 36924, 30862, 68437, 75234, 60247, 44223, 80531, 34369, 19481, 11, 52845, 19481, 39179, 19481, 68437, 19481, 34586, 139, 236, 11, 63127, 30862, 19481, 36924, 55241, 19581, 54556, 36924, 76295, 60247, 55241, 34369, 90002, 52845, 28654, 34586, 139, 235, 48823, 42524, 44223, 19481, 34369, 36924, 30862, 68437, 76295, 13]'", 55 | "gpt-4, 'class MyClass { public static void main(String[] args) { System.out.println(\"Hello, world!\"); }}', '[1058, 84926, 314, 586, 1118, 742, 1925, 2292, 1318, 2897, 8, 314, 744, 2594, 2986, 446, 9906, 11, 1917, 86640, 3954]'", 56 | "gpt-3.5-turbo, 'Stop!', '[10903, 0]'", 57 | "gpt-3.5-turbo, 'Stop now.', '[10903, 1457, 13]'", 58 | "gpt-3.5-turbo, 'Stop what you''re doing.', '[10903, 1148, 499, 2351, 3815, 13]'", 59 | "gpt-3.5-turbo, 'Stop what you''re doing right now.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 13]'", 60 | "gpt-3.5-turbo, 'Stop what you''re doing right now and listen.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 13]'", 61 | "gpt-3.5-turbo, 'Stop what you''re doing right now and listen carefully.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 13]'", 62 | "gpt-3.5-turbo, 
'Stop what you''re doing right now and listen carefully to me.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 13]'", 63 | "gpt-3.5-turbo, 'Stop what you''re doing right now and listen carefully to me, please.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 11, 4587, 13]'", 64 | "gpt-3.5-turbo, 'Stop what you''re doing right now and listen carefully to me, please, because this is important.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 11, 4587, 11, 1606, 420, 374, 3062, 13]'", 65 | "gpt-3.5-turbo, 'Stop what you''re doing right now and listen carefully to me, please, because this is very important.', '[10903, 1148, 499, 2351, 3815, 1314, 1457, 323, 9020, 15884, 311, 757, 11, 4587, 11, 1606, 420, 374, 1633, 3062, 13]'", 66 | "gpt-3.5-turbo, 'Przestań!', '[3617, 89, 30279, 19699, 0]'", 67 | "gpt-3.5-turbo, 'Przerwij to.', '[3617, 7215, 87183, 311, 13]'", 68 | "gpt-3.5-turbo, 'Przerwij to, co robisz.', '[3617, 7215, 87183, 311, 11, 1080, 10773, 70828, 13]'", 69 | "gpt-3.5-turbo, 'Przerwij to, co teraz robisz.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 13]'", 70 | "gpt-3.5-turbo, 'Przerwij to, co teraz robisz i posłuchaj.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 13]'", 71 | "gpt-3.5-turbo, 'Przerwij to, co teraz robisz i posłuchaj uważnie.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 577, 10196, 6077, 11044, 13]'", 72 | "gpt-3.5-turbo, 'Przerwij to, co teraz robisz i posłuchaj mnie uważnie.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 577, 10196, 6077, 11044, 13]'", 73 | "gpt-3.5-turbo, 'Przerwij to, co teraz robisz i posłuchaj mnie uważnie, proszę.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 577, 10196, 6077, 11044, 11, 8882, 60705, 13]'", 74 | "gpt-3.5-turbo, 
'Przerwij to, co teraz robisz i posłuchaj mnie proszę uważnie, bo to ważne.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 8882, 60705, 577, 10196, 6077, 11044, 11, 712, 311, 10667, 6077, 818, 13]'", 75 | "gpt-3.5-turbo, 'Przerwij to, co teraz robisz i posłuchaj mnie proszę uważnie, bo to bardzo ważne.', '[3617, 7215, 87183, 311, 11, 1080, 2024, 1394, 10773, 70828, 602, 1153, 4697, 1412, 1662, 74173, 8882, 60705, 577, 10196, 6077, 11044, 11, 712, 311, 57958, 10667, 6077, 818, 13]'", 76 | "gpt-3.5-turbo, 'СТІЙ!', '[19871, 35095, 140, 228, 140, 247, 0]'", 77 | "gpt-3.5-turbo, 'Припини зараз.', '[17279, 31203, 8164, 19479, 1840, 44946, 89554, 13]'", 78 | "gpt-3.5-turbo, 'Припини те, що ти робиш.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 18600, 14082, 1840, 12426, 13]'", 79 | "gpt-3.5-turbo, 'Припини те, що ти робиш зараз.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 18600, 14082, 1840, 12426, 44946, 89554, 13]'", 80 | "gpt-3.5-turbo, 'Припини те, що ти зараз робиш, і послухай.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 61813, 3865, 10693, 19039, 13]'", 81 | "gpt-3.5-turbo, 'Припини те, що ти зараз робиш, і уважно слухай.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 14257, 5591, 38657, 13999, 35875, 3865, 10693, 19039, 13]'", 82 | "gpt-3.5-turbo, 'Припини те, що ти зараз робиш, і вислухай мене уважно.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 5927, 13810, 3114, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 13]'", 83 | "gpt-3.5-turbo, 'Припини те, що ти зараз робиш, і вислухай мене уважно, будь ласка.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 
11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 5927, 13810, 3114, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 11, 51570, 4929, 26539, 18437, 13433, 13]'", 84 | "gpt-3.5-turbo, 'Припини те, що ти зараз робиш, і вислухай мене уважно, будь ласка, тому що це важливо.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 5927, 13810, 3114, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 11, 51570, 4929, 26539, 18437, 13433, 11, 11047, 72952, 9015, 231, 1482, 39233, 1532, 5927, 38657, 11320, 5591, 1482, 13]'", 85 | "gpt-3.5-turbo, 'Припини те, що ти зараз робиш, і слухай мене уважно, бо це дуже важливо.', '[17279, 31203, 8164, 19479, 1840, 11047, 1532, 11, 9015, 231, 1482, 11047, 1840, 44946, 89554, 18600, 14082, 1840, 12426, 11, 84954, 35875, 3865, 10693, 19039, 69844, 1532, 14257, 5591, 38657, 13999, 11, 14391, 1482, 39233, 1532, 7952, 56999, 1532, 5927, 38657, 11320, 5591, 1482, 13]'", 86 | "gpt-3.5-turbo, 'class MyClass { public static void main(String[] args) { System.out.println(\"Hello, world!\"); }}', '[1058, 84926, 314, 586, 1118, 742, 1925, 2292, 1318, 2897, 8, 314, 744, 2594, 2986, 446, 9906, 11, 1917, 86640, 3954]'", 87 | "gpt-3.5-turbo, 'I''m', '[40, 2846]'", 88 | "gpt-3.5-turbo, 'I''m in', '[40, 2846, 304]'", 89 | "gpt-3.5-turbo, 'I''M', '[40, 28703]'", 90 | "gpt-3.5-turbo, 'I''M IN', '[40, 28703, 2006]'", 91 | "gpt-3.5-turbo, 'I''VE', '[40, 6, 4592]'", 92 | "gpt-3.5-turbo, 'I''VE DONE', '[40, 6, 4592, 55785]'", 93 | "gpt-3.5-turbo, 'I''ll', '[40, 3358]'", 94 | "gpt-3.5-turbo, 'I''ll do', '[40, 3358, 656]'", 95 | "gpt-3.5-turbo, 'I''D', '[40, 28805]'", 96 | "gpt-3.5-turbo, 'I''D DO', '[40, 28805, 9503]'", 97 | "gpt-3.5-turbo, 'I''d', '[40, 4265]'", 98 | "gpt-3.5-turbo, 'I''d done', '[40, 4265, 2884]'", 99 | "gpt-3.5-turbo, 'I''M', '[40, 28703]'", 100 | "gpt-3.5-turbo, 'I''M DONE', '[40, 28703, 55785]'", 101 | 
"gpt-3.5-turbo, 'you''re', '[9514, 2351]'", 102 | "gpt-3.5-turbo, 'you''re done', '[9514, 2351, 2884]'", 103 | "gpt-3.5-turbo, 'You''Re', '[2675, 50527]'", 104 | "gpt-3.5-turbo, 'You''Re Done', '[2675, 50527, 28457]'", 105 | "gpt-3.5-turbo, 'YOU''LL', '[57489, 6, 4178]'", 106 | "gpt-3.5-turbo, 'YOU''LL DO', '[57489, 6, 4178, 9503]'", 107 | "gpt-3.5-turbo, 'she''s', '[32158, 596]'", 108 | "gpt-3.5-turbo, 'she''s done', '[32158, 596, 2884]'", 109 | "gpt-3.5-turbo, 'SHE''S', '[50, 1837, 13575]'", 110 | "gpt-3.5-turbo, 'SHE''S DONE', '[50, 1837, 13575, 55785]'", 111 | "gpt-3.5-turbo, 'can''t', '[4919, 956]'", 112 | "gpt-3.5-turbo, 'can''t do', '[4919, 956, 656]'", 113 | "gpt-3.5-turbo, 'Can''T', '[6854, 17773]'", 114 | "gpt-3.5-turbo, 'CAN''T DO', '[43055, 17773, 9503]'", 115 | "gpt-3.5-turbo, 'c#ode', '[66, 2, 536]'", 116 | "gpt-3.5-turbo, 'java_language', '[10248, 30121]'", 117 | "gpt-3.5-turbo, 'regex{test}', '[27485, 90, 1985, 92]'", 118 | "gpt-3.5-turbo, 'python$', '[12958, 3]'", 119 | "gpt-3.5-turbo, 'python$code', '[12958, 3, 1889]'", 120 | "gpt-3.5-turbo, '3.14159265358979323846264338327950288419716939937510582097494459230781640628620899862803482534211706798214808651328230664709384460955058223172535940812848111745028410270193852110555', '[18, 13, 9335, 20128, 21598, 22905, 24531, 13895, 20911, 22956, 19230, 17267, 17824, 25962, 4468, 11739, 18572, 12935, 6550, 18248, 26007, 25687, 20128, 14777, 23713, 17264, 17361, 12171, 19416, 23574, 22379, 22091, 17590, 8546, 27309, 25873, 10410, 26956, 21164, 16544, 12879, 22644, 25202, 24344, 21138, 13506, 23670, 12245, 23309, 19192, 18058, 4386, 21235, 8546, 10617, 17058, 4278, 19597, 25454, 20767, 6550, 2131]'", 121 | "gpt-3.5-turbo, '😊', '[76460, 232]'", 122 | "gpt-3.5-turbo, '😂😍', '[76460, 224, 76460, 235]'", 123 | "gpt-3.5-turbo, '🤔😘😉', '[9468, 97, 242, 76460, 246, 76460, 231]'", 124 | "gpt-3.5-turbo, '🤯😴😜😝', '[9468, 97, 107, 76460, 112, 76460, 250, 76460, 251]'", 125 | "gpt-3.5-turbo, '😷🙄😶🤑😒', '[76460, 115, 9468, 
247, 226, 76460, 114, 9468, 97, 239, 76460, 240]'", 126 | "gpt-3.5-turbo, '🤢🥺🥴🥵🥶🤕', '[9468, 97, 95, 9468, 98, 118, 9468, 98, 112, 9468, 98, 113, 9468, 98, 114, 9468, 97, 243]'", 127 | "gpt-3.5-turbo, '😭🤬🤪😈👹😻😼', '[76460, 255, 9468, 97, 105, 9468, 97, 103, 76460, 230, 9468, 239, 117, 76460, 119, 76460, 120]'", 128 | "gpt-3.5-turbo, '🤖💩👻👽🤡👺👾🧟‍♀️', '[9468, 97, 244, 93273, 102, 9468, 239, 119, 9468, 239, 121, 9468, 97, 94, 9468, 239, 118, 9468, 239, 122, 9468, 100, 253, 378, 235, 32990, 31643]'", 129 | "gpt-3.5-turbo, '🙏🏽🤲🏽👐🏽💪🏽👍🏽👎🏽✌🏽🤘🏽🤞🏽', '[9468, 247, 237, 9468, 237, 121, 9468, 97, 110, 9468, 237, 121, 9468, 80010, 9468, 237, 121, 93273, 103, 9468, 237, 121, 9468, 239, 235, 9468, 237, 121, 9468, 239, 236, 9468, 237, 121, 38798, 234, 9468, 237, 121, 9468, 97, 246, 9468, 237, 121, 9468, 97, 252, 9468, 237, 121]'", 130 | "gpt-3.5-turbo, '🌞🌈☀️❄️☔️🌊🍁🍂🌺🌸', '[9468, 234, 252, 9468, 234, 230, 18107, 222, 31643, 49633, 226, 31643, 18107, 242, 31643, 9468, 234, 232, 9468, 235, 223, 9468, 235, 224, 9468, 234, 118, 9468, 234, 116]'", 131 | "gpt-4o, 'Stop!', '[13523, 0]'", 132 | "gpt-4o, 'Stop now.', '[13523, 1954, 13]'", 133 | "gpt-4o, 'Stop what you''re doing.', '[13523, 1412, 7163, 5306, 13]'", 134 | "gpt-4o, 'Stop what you''re doing right now.', '[13523, 1412, 7163, 5306, 1849, 1954, 13]'", 135 | "gpt-4o, 'Stop what you''re doing right now and listen.', '[13523, 1412, 7163, 5306, 1849, 1954, 326, 11425, 13]'", 136 | "gpt-4o, 'Stop what you''re doing right now and listen carefully.', '[13523, 1412, 7163, 5306, 1849, 1954, 326, 11425, 18455, 13]'", 137 | "gpt-4o, 'Stop what you''re doing right now and listen carefully to me.', '[13523, 1412, 7163, 5306, 1849, 1954, 326, 11425, 18455, 316, 668, 13]'", 138 | "gpt-4o, 'Stop what you''re doing right now and listen carefully to me, please.', '[13523, 1412, 7163, 5306, 1849, 1954, 326, 11425, 18455, 316, 668, 11, 4843, 13]'", 139 | "gpt-4o, 'Stop what you''re doing right now and listen carefully to me, please, because this is 
important.', '[13523, 1412, 7163, 5306, 1849, 1954, 326, 11425, 18455, 316, 668, 11, 4843, 11, 2236, 495, 382, 3378, 13]'", 140 | "gpt-4o, 'Stop what you''re doing right now and listen carefully to me, please, because this is very important.', '[13523, 1412, 7163, 5306, 1849, 1954, 326, 11425, 18455, 316, 668, 11, 4843, 11, 2236, 495, 382, 1869, 3378, 13]'", 141 | "gpt-4o, 'Σταμάτα!', '[10720, 6319, 27992, 6319, 0]'", 142 | "gpt-4o, 'Σταμάτα τώρα.', '[10720, 6319, 27992, 6319, 153383, 13]'", 143 | "gpt-4o, 'Σταμάτα αυτό που κάνεις.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 13]'", 144 | "gpt-4o, 'Σταμάτα αυτό που κάνεις αμέσως τώρα.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 3793, 17752, 157975, 153383, 13]'", 145 | "gpt-4o, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 3793, 17752, 157975, 153383, 6381, 21285, 187344, 13]'", 146 | "gpt-4o, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 3793, 17752, 157975, 153383, 6381, 21285, 187344, 15098, 18785, 1664, 35337, 13]'", 147 | "gpt-4o, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 3793, 17752, 157975, 153383, 6381, 21285, 187344, 15098, 18785, 1664, 35337, 4278, 80486, 13]'", 148 | "gpt-4o, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα, παρακαλώ.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 3793, 17752, 157975, 153383, 6381, 21285, 187344, 15098, 18785, 1664, 35337, 4278, 80486, 11, 38699, 29680, 132706, 13]'", 149 | "gpt-4o, 'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα, παρακαλώ, γιατί αυτό είναι σημαντικό.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 3793, 17752, 157975, 153383, 6381, 21285, 187344, 15098, 18785, 1664, 35337, 4278, 80486, 11, 38699, 29680, 132706, 11, 120892, 43845, 17278, 114750, 33191, 13]'", 150 | "gpt-4o, 
'Σταμάτα αυτό που κάνεις αμέσως τώρα και άκου προσεκτικά εμένα, παρακαλώ, γιατί αυτό είναι πολύ σημαντικό.', '[10720, 6319, 27992, 6319, 43845, 13042, 60174, 44533, 3793, 17752, 157975, 153383, 6381, 21285, 187344, 15098, 18785, 1664, 35337, 4278, 80486, 11, 38699, 29680, 132706, 11, 120892, 43845, 17278, 60896, 114750, 33191, 13]'" 151 | }) 152 | void can_encode_or_decode_test_vectors_correctly(String model, 153 | String text, 154 | @ConvertWith(ListConverter.class) List tokens) { 155 | var enc = new GPT3Tokenizer(Encoding.forModel(model)); 156 | assertEquals(tokens, enc.encode(text)); 157 | assertEquals(text, enc.decode(tokens)); 158 | } 159 | } -------------------------------------------------------------------------------- /src/test/java/com/didalgo/gpt3/ListConverter.java: -------------------------------------------------------------------------------- 1 | package com.didalgo.gpt3; 2 | 3 | import org.junit.jupiter.params.converter.ArgumentConversionException; 4 | import org.junit.jupiter.params.converter.SimpleArgumentConverter; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | public class ListConverter extends SimpleArgumentConverter { 10 | 11 | @Override 12 | protected Object convert(Object source, Class targetType) throws ArgumentConversionException { 13 | if (source instanceof String input && List.class.isAssignableFrom(targetType)) { 14 | if (input.startsWith("[") && input.endsWith("]")) 15 | input = input.substring(1, input.length() - 1); 16 | 17 | return Arrays.stream(input.split(",")) 18 | .map(String::trim) 19 | .map(Integer::valueOf) 20 | .toList(); 21 | } 22 | throw new IllegalArgumentException("Conversion from " + source.getClass() + " to " 23 | + targetType + " not supported."); 24 | } 25 | } -------------------------------------------------------------------------------- /src/test/java/com/didalgo/gpt3/TokenCountTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 
(c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import com.fasterxml.jackson.annotation.JsonProperty; 8 | import com.fasterxml.jackson.annotation.JsonPropertyDescription; 9 | import com.fasterxml.jackson.core.JsonProcessingException; 10 | import com.fasterxml.jackson.databind.ObjectMapper; 11 | import com.theokanning.openai.completion.chat.ChatFunction; 12 | import com.theokanning.openai.completion.chat.ChatFunctionCall; 13 | import com.theokanning.openai.completion.chat.ChatMessage; 14 | import com.theokanning.openai.completion.chat.ChatMessageRole; 15 | import lombok.Getter; 16 | import lombok.Setter; 17 | import org.junit.jupiter.api.Test; 18 | import org.junit.jupiter.params.ParameterizedTest; 19 | import org.junit.jupiter.params.provider.CsvSource; 20 | 21 | import java.util.List; 22 | 23 | import static org.junit.jupiter.api.Assertions.assertEquals; 24 | 25 | public class TokenCountTest { 26 | 27 | GPT3Tokenizer tokenizer = new GPT3Tokenizer(Encoding.CL100K_BASE); 28 | 29 | @Test 30 | void fromLinesJoined_gives_total_token_count_including_newlines() { 31 | assertEquals(0, TokenCount.fromLinesJoined(List.of(), tokenizer)); 32 | assertEquals(1, TokenCount.fromLinesJoined(List.of("1"), tokenizer)); 33 | assertEquals(3, TokenCount.fromLinesJoined(List.of("1", "2"), tokenizer)); 34 | assertEquals(5, TokenCount.fromLinesJoined(List.of("1", "2", "3"), tokenizer)); 35 | } 36 | 37 | @ParameterizedTest 38 | @CsvSource({ 39 | "121, gpt-3.5-turbo-0301", 40 | "115, gpt-3.5-turbo-0613", 41 | "115, gpt-3.5-turbo-16k-0613", 42 | "115, gpt-4-0314", 43 | "115, gpt-4-0613" 44 | }) 45 | void fromMessages_gives_correct_token_count(int expectedTokenCount, String modelName) { 46 | List messages = List.of( 47 | new ChatMessage("system", "You are a helpful, pattern-following assistant that translates corporate jargon into plain English."), 48 | new ChatMessage("user", "New synergies will help drive top-line growth."), 49 | new 
ChatMessage("assistant", "Things working well together will increase revenue."), 50 | new ChatMessage("user", "Let's circle back when we have more bandwidth to touch base on opportunities for increased leverage."), 51 | new ChatMessage("assistant", "Let's talk later when we're less busy about how to do better."), 52 | new ChatMessage("user", "This late pivot means we don't have time to boil the ocean for the client deliverable.") 53 | ); 54 | assertEquals(expectedTokenCount, TokenCount.fromMessages(messages, tokenizer, ChatFormatDescriptor.forModel(modelName))); 55 | } 56 | 57 | @Test 58 | void fromMessages_gives_expected_token_count_when_used_with_functions() throws JsonProcessingException { 59 | final int EXPECTED_TOKEN_COUNT = 232; 60 | 61 | var functionArgs = "{\n \"source_code\": \"import java.time.LocalDate;\\n\\npublic class Main {\\n public static void main(String[] args) {\\n LocalDate currentDate = LocalDate.now();\\n System.out.println(currentDate);\\n }\\n}\"\n}"; 62 | var jsonNode = new ObjectMapper().readTree(functionArgs); 63 | var messages = List.of( 64 | new ChatMessage(ChatMessageRole.SYSTEM.value(), "You are a helpful assistant. 
Follow user instructions carefully."), 65 | new ChatMessage(ChatMessageRole.USER.value(), "Please use Java to check current date."), 66 | new ChatMessage(ChatMessageRole.ASSISTANT.value(), null, null, new ChatFunctionCall("java", jsonNode)), 67 | new ChatMessage(ChatMessageRole.FUNCTION.value(), "TODAY", "java") 68 | ); 69 | var functions = List.of( 70 | new ChatFunction.Builder() 71 | .name("java") 72 | .description("Evaluate Java code.") 73 | .executor(JavaFunction.class, (__ -> null)) 74 | .build(), 75 | new ChatFunction.Builder() 76 | .name("sql") 77 | .description("Evaluate SQL code.") 78 | .executor(SqlFunction.class, (__ -> null)) 79 | .build() 80 | ); 81 | assertEquals(EXPECTED_TOKEN_COUNT, TokenCount.fromMessages(messages, functions, ModelType.GPT_3_5_TURBO_16K)); 82 | } 83 | 84 | @Getter 85 | @Setter 86 | public static class JavaFunction { 87 | 88 | @JsonProperty("source_code") 89 | @JsonPropertyDescription("the code to evaluate") 90 | private String sourceCode; 91 | 92 | @JsonProperty("version") 93 | @JsonPropertyDescription("the Java version number, i.e. 
17") 94 | private Integer version; 95 | } 96 | 97 | @Getter 98 | @Setter 99 | public static class SqlFunction { 100 | 101 | @JsonProperty(value = "TYPE", required = true) 102 | @JsonPropertyDescription("the type of SQL query") 103 | private SqlType type; 104 | 105 | @JsonProperty(value = "SQL", required = true) 106 | @JsonPropertyDescription("the SQL object") 107 | private Sql sql; 108 | 109 | public enum SqlType { 110 | SELECT, UPDATE, DELETE, ALTER 111 | } 112 | } 113 | 114 | @Getter 115 | @Setter 116 | public static class Sql { 117 | 118 | @JsonProperty("columns") 119 | private List columns; 120 | 121 | @JsonProperty("condition") 122 | private String condition; 123 | 124 | @JsonProperty("limit") 125 | private Integer limit; 126 | 127 | @JsonProperty("ORDER BY") 128 | @JsonPropertyDescription("the result ordering") 129 | private OrderBy orderBy; 130 | } 131 | 132 | @Getter 133 | @Setter 134 | public static class OrderBy { 135 | 136 | @JsonProperty(value = "column", required = true) 137 | private String column; 138 | 139 | @JsonProperty("order") 140 | private Order order; 141 | 142 | public enum Order { 143 | ASC, DESC 144 | } 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /src/test/java/com/didalgo/gpt3/TokenizableFunctionTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Mariusz Bernacki 3 | * SPDX-License-Identifier: MIT 4 | */ 5 | package com.didalgo.gpt3; 6 | 7 | import org.junit.jupiter.params.ParameterizedTest; 8 | import org.junit.jupiter.params.provider.Arguments; 9 | import org.junit.jupiter.params.provider.MethodSource; 10 | 11 | import javax.json.Json; 12 | import javax.json.JsonObject; 13 | import javax.json.JsonReader; 14 | 15 | import java.io.StringReader; 16 | import java.util.stream.Stream; 17 | 18 | import static org.junit.jupiter.api.Assertions.*; 19 | 20 | class TokenizableFunctionTest { 21 | 22 | @ParameterizedTest 
23 | @MethodSource("provideTestData") 24 | void toString_converts_function_schema_to_internal_representation_the_model_was_trained_on(String name, String description, String jsonSchema, String representation) { 25 | var function = TokenizableFunction.of(name, description, toJsonObject(jsonSchema)); 26 | assertEquals(representation, function.generateDocumentation()); 27 | } 28 | 29 | private static JsonObject toJsonObject(String json) { 30 | try (JsonReader reader = Json.createReader(new StringReader(json))) { 31 | return reader.readObject(); 32 | } 33 | } 34 | 35 | static Stream provideTestData() { 36 | return Stream.of( 37 | Arguments.of( 38 | "invoke", 39 | "Invokes specialized function which no one knows how it works", 40 | """ 41 | { 42 | "type": "object", 43 | "properties": { 44 | "stringParameter": { 45 | "type": "string", 46 | "description": "The free-form text parameter" 47 | }, 48 | "booleanParameter": { 49 | "type": "boolean", 50 | "description": "Switch lights on/off" 51 | } 52 | }, 53 | "required": [ 54 | "stringParameter" 55 | ] 56 | } 57 | """, 58 | """ 59 | // Invokes specialized function which no one knows how it works 60 | type invoke = (_: { 61 | // The free-form text parameter 62 | stringParameter: string, 63 | // Switch lights on/off 64 | booleanParameter?: boolean, 65 | }) => any;""" 66 | )); 67 | } 68 | } -------------------------------------------------------------------------------- /src/test/resources/com/didalgo/gpt3/java.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "source_code": { 5 | "type": "string", 6 | "description": "the code to evaluate" 7 | }, 8 | "version": { 9 | "type": "integer", 10 | "description": "the Java version number, i.e. 
17" 11 | } 12 | }, 13 | "required": [ 14 | "source_code" 15 | ] 16 | } -------------------------------------------------------------------------------- /src/test/resources/com/didalgo/gpt3/sql.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "TYPE": { 5 | "type": "string", 6 | "enum": ["SELECT","UPDATE","DELETE","ALTER"], 7 | "description": "the type of SQL query" 8 | }, 9 | "SQL": { 10 | "type": "object", 11 | "description": "the SQL object", 12 | "properties": { 13 | "columns": { 14 | "type": "array", 15 | "items": { 16 | "type": "string" 17 | } 18 | }, 19 | "condition": { 20 | "type": "string", 21 | "maxLength": 1000 22 | }, 23 | "limit": { 24 | "type": "number" 25 | }, 26 | "ORDER BY": { 27 | "type": "object", 28 | "description": "the result ordering", 29 | "properties": { 30 | "column": { 31 | "type": "string" 32 | }, 33 | "order": { 34 | "type": "string", 35 | "enum": ["ASC", "DESC"] 36 | } 37 | }, 38 | "required": ["column"] 39 | } 40 | } 41 | } 42 | }, 43 | "required": [ 44 | "TYPE", "SQL" 45 | ] 46 | } --------------------------------------------------------------------------------