├── .formatter.exs ├── .github └── workflows │ └── release.yml ├── .gitignore ├── LICENSE ├── README.md ├── config └── config.exs ├── lib ├── tiktoken.ex └── tiktoken │ ├── cl100k.ex │ ├── encoding.ex │ ├── native.ex │ ├── o200k.ex │ ├── p50k.ex │ ├── p50k_edit.ex │ └── r50k.ex ├── mix.exs ├── mix.lock ├── native └── tiktoken │ ├── .cargo │ └── config │ ├── .gitignore │ ├── Cargo.lock │ ├── Cargo.toml │ ├── Cross.toml │ ├── README.md │ └── src │ └── lib.rs └── test ├── test_helper.exs └── tiktoken_test.exs /.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] 4 | ] 5 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build precompiled NIFs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - '*' 9 | 10 | jobs: 11 | build_release: 12 | name: NIF ${{ matrix.nif }} - ${{ matrix.job.target }} (${{ matrix.job.os }}) 13 | runs-on: ${{ matrix.job.os }} 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | nif: ["2.16", "2.15"] 18 | job: 19 | - { target: arm-unknown-linux-gnueabihf , os: ubuntu-20.04 , use-cross: true } 20 | - { target: aarch64-unknown-linux-gnu , os: ubuntu-20.04 , use-cross: true } 21 | - { target: aarch64-unknown-linux-musl , os: ubuntu-20.04 , use-cross: true } 22 | - { target: aarch64-apple-darwin , os: macos-15 } 23 | - { target: riscv64gc-unknown-linux-gnu , os: ubuntu-20.04 , use-cross: true } 24 | - { target: x86_64-apple-darwin , os: macos-15 } 25 | - { target: x86_64-unknown-linux-gnu , os: ubuntu-20.04 } 26 | - { target: x86_64-unknown-linux-musl , os: ubuntu-20.04 , use-cross: true } 27 | - { target: x86_64-pc-windows-gnu , os: windows-2019 } 28 | - { target: x86_64-pc-windows-msvc , os: windows-2019 } 29 | 30 | steps: 31 | - name: Checkout source code 32 | uses: actions/checkout@v3 33 | 34 | - name: Extract project version 35 | shell: bash 36 | run: | 37 | # Get the project version from mix.exs 38 | echo "PROJECT_VERSION=$(sed -n 's/^ @version "\(.*\)"/\1/p' mix.exs | head -n1)" >> $GITHUB_ENV 39 | 40 | - name: Install Rust toolchain 41 | uses: dtolnay/rust-toolchain@stable 42 | with: 43 | toolchain: stable 44 | target: ${{ matrix.job.target }} 45 | 46 | - name: Build the project 47 | id: build-crate 48 | uses: philss/rustler-precompiled-action@v1.1.4 49 | with: 50 | project-name: tiktoken 51 | project-version: ${{ env.PROJECT_VERSION }} 52 | target: ${{ matrix.job.target }} 53 | nif-version: ${{ matrix.nif }} 54 | use-cross: ${{ matrix.job.use-cross }} 55 | project-dir: "native/tiktoken" 56 | 57 | - name: Artifact upload 58 | uses: actions/upload-artifact@v4 59 | with: 60 | name: ${{ steps.build-crate.outputs.file-name }} 61 | path: ${{ steps.build-crate.outputs.file-path }} 62 | 63 | - name: Publish archives and packages 64 | uses: softprops/action-gh-release@v1 65 | with: 66 | files: | 67 | ${{ steps.build-crate.outputs.file-path }} 68 | if: startsWith(github.ref, 'refs/tags/') 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 
5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where third-party dependencies like ExDoc output generated docs. 11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 20 | *.ez 21 | 22 | # Ignore package tarball (built via "mix hex.build"). 23 | tiktoken-*.tar 24 | 25 | # Temporary files, for example, from tests. 26 | /tmp/ 27 | 28 | checksum-* 29 | tiktoken-* 30 | 31 | priv/native/libtiktoken.so 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Connor Jacobsen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tiktoken 2 | 3 | Elixir bindings for OpenAI's [Tiktoken](https://github.com/openai/tiktoken). 4 | 5 | Built on [@zurawiki](https://github.com/zurawiki)'s [tiktoken-rs](https://github.com/zurawiki/tiktoken-rs). 6 | 7 | ## Installation 8 | 9 | If [available in Hex](https://hex.pm/docs/publish), the package can be installed 10 | by adding `tiktoken` to your list of dependencies in `mix.exs`: 11 | 12 | ```elixir 13 | def deps do 14 | [ 15 | {:tiktoken, "~> 0.3"} 16 | ] 17 | end 18 | ``` 19 | 20 | ## Required dependencies to compile the shared library 21 | 22 | You will need to have a `Rust` compiler and `Python` installed. 23 | 24 | For example on `debian`/`ubuntu` this means: 25 | 26 | ```bash 27 | apt install cargo libpython3-dev 28 | ``` 29 | 30 | To install more up-to-date versions of `Rust` check [rustup](https://rustup.rs/). 
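Once those dependencies are in place and the NIF has compiled, the public API can be exercised directly from `iex -S mix`. The snippet below is a minimal sketch mirroring the test suite (the bound values are illustrative; any model name known to `tiktoken-rs` works):

```elixir
# Encoding resolves the model name to the right tokenizer (cl100k_base here).
{:ok, ids} = Tiktoken.encode("gpt-3.5-turbo", "Tell me more about Elixir!")

# Decoding the ids round-trips back to the original text.
{:ok, "Tell me more about Elixir!"} = Tiktoken.decode("gpt-3.5-turbo", ids)

# Counting tokens without keeping the ids around.
{:ok, count} = Tiktoken.count_tokens("gpt-3.5-turbo", "Tell me more about Elixir!")

# Unsupported model names return an error tuple instead of raising.
{:error, {:unsupported_model, "gpt2"}} = Tiktoken.encode("gpt2", "Tell me more about Elixir!")
```

`Tiktoken.encoding_for_model/1` returns the underlying encoding module (`Tiktoken.CL100K` and friends) if you prefer to call an encoding directly.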
31 | 
32 | The libraries the bindings are built on use `PyO3`, so you may need to set the `PYO3_PYTHON` environment variable before compiling:
33 | 
34 | ```bash
35 | export PYO3_PYTHON=python
36 | # or depending on your OS
37 | export PYO3_PYTHON=python3
38 | ```
39 | 
40 | ## Force compilation of the shared library
41 | 
42 | If the precompiled shared library is not available, you can force
43 | `rustler_precompiled` to compile it by adding the following to your `config.exs`:
44 | 
45 | ```elixir
46 | config :rustler_precompiled, :force_build, tiktoken: true
47 | ```
48 | 
49 | Documentation can be generated with [ExDoc](https://github.com/elixir-lang/ex_doc)
50 | and published on [HexDocs](https://hexdocs.pm). Once published, the docs can
51 | be found at <https://hexdocs.pm/tiktoken>.
52 | 
-------------------------------------------------------------------------------- /config/config.exs: --------------------------------------------------------------------------------
1 | import Config
2 | 
3 | config :rustler_precompiled, :force_build, tiktoken: true
4 | 
-------------------------------------------------------------------------------- /lib/tiktoken.ex: --------------------------------------------------------------------------------
1 | defmodule Tiktoken do
2 |   @moduledoc """
3 |   Elixir bindings for OpenAI's tiktoken tokenizer: each function resolves a model name to its encoding module and delegates to it.
4 |   """
5 | 
6 |   @model_to_encoding %{
7 |     "p50k_base" => Tiktoken.P50K,
8 |     "p50k_edit" => Tiktoken.P50KEdit,
9 |     "r50k_base" => Tiktoken.R50K,
10 |     "cl100k_base" => Tiktoken.CL100K,
11 |     "o200k_base" => Tiktoken.O200K
12 |   }
13 | 
14 |   def encoding_for_model(model) do
15 |     encoding_name = Tiktoken.Native.encoding_for_model(model)
16 |     @model_to_encoding[encoding_name]
17 |   end
18 | 
19 |   def encode_ordinary(model, text) do
20 |     delegate_call(model, :encode_ordinary, [text])
21 |   end
22 | 
23 |   def encode(model, text, allowed_special \\ []) do
24 |     delegate_call(model, :encode, [text, allowed_special])
25 |   end
26 | 
27 |   def encode_with_special_tokens(model, text) do
28 |     delegate_call(model, :encode_with_special_tokens, [text])
29 |   end
30 | 
31 |   def decode(model, ids) do
32 |     delegate_call(model, :decode, [ids])
33 |   end
34 | 
35 |   def count_tokens(model, text, allowed_special \\ []) do
36 |     delegate_call(model, :count_tokens, [text, allowed_special])
37 |   end
38 | 
39 |   defp delegate_call(model, function, args) do
40 |     if mod = encoding_for_model(model) do
41 |       apply(mod, function, args)
42 |     else
43 |       {:error, {:unsupported_model, model}}
44 |     end
45 |   end
46 | 
47 |   # These two clauses can be removed once a tiktoken-rs release > 0.5.8 is available
48 |   def context_size_for_model("gpt-3.5-turbo-1106"), do: 16_385
49 |   def context_size_for_model("gpt-4-0125-preview"), do: 128_000
50 | 
51 |   def context_size_for_model(model) do
52 |     Tiktoken.Native.context_size_for_model(model)
53 |   end
54 | end
55 | 
-------------------------------------------------------------------------------- /lib/tiktoken/cl100k.ex: --------------------------------------------------------------------------------
1 | defmodule Tiktoken.CL100K do
2 |   @behaviour Tiktoken.Encoding
3 | 
4 |   @impl Tiktoken.Encoding
5 |   def encode_ordinary(text) do
6 |     Tiktoken.Native.cl100k_encode_ordinary(text)
7 |   end
8 | 
9 |   @impl Tiktoken.Encoding
10 |   def encode(text, allowed_special \\ []) do
11 |     Tiktoken.Native.cl100k_encode(text, allowed_special)
12 |   end
13 | 
14 |   @impl Tiktoken.Encoding
15 |   def encode_with_special_tokens(text) do
16 |     Tiktoken.Native.cl100k_encode_with_special_tokens(text)
17 |   end
18 | 
19 |   @impl Tiktoken.Encoding
20 |   def decode(ids) do
21 |
Tiktoken.Native.cl100k_decode(ids) 22 | end 23 | 24 | @impl Tiktoken.Encoding 25 | def count_tokens(text, allowed_special \\ []) do 26 | Tiktoken.Native.cl100k_count_tokens(text, allowed_special) 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/tiktoken/encoding.ex: -------------------------------------------------------------------------------- 1 | defmodule Tiktoken.Encoding do 2 | @callback encode_ordinary(String.t()) :: {:ok, [integer()]} | {:error, String.t()} 3 | 4 | @callback encode(String.t(), [binary()]) :: {:ok, [integer()]} | {:error, String.t()} 5 | 6 | @callback encode_with_special_tokens(String.t()) :: {:ok, [integer()]} | {:error, String.t()} 7 | 8 | @callback decode([integer()]) :: {:ok, String.t()} | {:error, String.t()} 9 | 10 | @callback count_tokens(String.t(), [binary()]) :: {:ok, integer()} | {:error, String.t()} 11 | end 12 | -------------------------------------------------------------------------------- /lib/tiktoken/native.ex: -------------------------------------------------------------------------------- 1 | defmodule Tiktoken.Native do 2 | @moduledoc false 3 | 4 | version = Mix.Project.config()[:version] 5 | url = Mix.Project.config()[:source_url] 6 | 7 | use RustlerPrecompiled, 8 | otp_app: :tiktoken, 9 | crate: "tiktoken", 10 | base_url: "#{url}/releases/download/v#{version}", 11 | version: version 12 | 13 | def encoding_for_model(_model), do: err() 14 | 15 | def p50k_encode_ordinary(_input), do: err() 16 | def p50k_encode(_input, _allowed_special), do: err() 17 | def p50k_encode_with_special_tokens(_input), do: err() 18 | def p50k_decode(_ids), do: err() 19 | def p50k_count_tokens(_input, _allowed_special), do: err() 20 | 21 | def p50k_edit_encode_ordinary(_input), do: err() 22 | def p50k_edit_encode(_input, _allowed_special), do: err() 23 | def p50k_edit_encode_with_special_tokens(_input), do: err() 24 | def p50k_edit_decode(_ids), do: err() 25 | def p50k_edit_count_tokens(_input, _allowed_special), do: err() 26 | 27 | def r50k_encode_ordinary(_input), do: err() 28 | def r50k_encode(_input, _allowed_special), do: err() 29 | def r50k_encode_with_special_tokens(_input), do: err() 30 | def r50k_decode(_ids), do: err() 31 | def r50k_count_tokens(_input, _allowed_special), do: err() 32 | 33 | def cl100k_encode_ordinary(_input), do: err() 34 | def cl100k_encode(_input, _allowed_special), do: err() 35 | def cl100k_encode_with_special_tokens(_input), do: err() 36 | def cl100k_decode(_ids), do: err() 37 | def cl100k_count_tokens(_input, _allowed_special), do: err() 38 | 39 | def o200k_encode_ordinary(_input), do: err() 40 | def o200k_encode(_input, _allowed_special), do: err() 41 | def o200k_encode_with_special_tokens(_input), do: err() 42 | def o200k_decode(_ids), do: err() 43 | def o200k_count_tokens(_input, _allowed_special), do: err() 44 | 45 | def context_size_for_model(_model), do: err() 46 | 47 | defp err, do: :erlang.nif_error(:nif_not_loaded) 48 | end 49 | -------------------------------------------------------------------------------- /lib/tiktoken/o200k.ex: -------------------------------------------------------------------------------- 1 | defmodule Tiktoken.O200K do 2 | @behaviour Tiktoken.Encoding 3 | 4 | @impl Tiktoken.Encoding 5 | def encode_ordinary(text) do 6 | Tiktoken.Native.o200k_encode_ordinary(text) 7 | end 8 | 9 | @impl Tiktoken.Encoding 10 | def encode(text, allowed_special \\ []) do 11 | Tiktoken.Native.o200k_encode(text, allowed_special) 12 | end 13 | 14 | @impl Tiktoken.Encoding 
15 | def encode_with_special_tokens(text) do 16 | Tiktoken.Native.o200k_encode_with_special_tokens(text) 17 | end 18 | 19 | @impl Tiktoken.Encoding 20 | def decode(ids) do 21 | Tiktoken.Native.o200k_decode(ids) 22 | end 23 | 24 | @impl Tiktoken.Encoding 25 | def count_tokens(text, allowed_special \\ []) do 26 | Tiktoken.Native.o200k_count_tokens(text, allowed_special) 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/tiktoken/p50k.ex: -------------------------------------------------------------------------------- 1 | defmodule Tiktoken.P50K do 2 | @behaviour Tiktoken.Encoding 3 | 4 | @impl Tiktoken.Encoding 5 | def encode_ordinary(text) do 6 | Tiktoken.Native.p50k_encode_ordinary(text) 7 | end 8 | 9 | @impl Tiktoken.Encoding 10 | def encode(text, allowed_special \\ []) do 11 | Tiktoken.Native.p50k_encode(text, allowed_special) 12 | end 13 | 14 | @impl Tiktoken.Encoding 15 | def encode_with_special_tokens(text) do 16 | Tiktoken.Native.p50k_encode_with_special_tokens(text) 17 | end 18 | 19 | @impl Tiktoken.Encoding 20 | def decode(ids) do 21 | Tiktoken.Native.p50k_decode(ids) 22 | end 23 | 24 | @impl Tiktoken.Encoding 25 | def count_tokens(text, allowed_special \\ []) do 26 | Tiktoken.Native.p50k_count_tokens(text, allowed_special) 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/tiktoken/p50k_edit.ex: -------------------------------------------------------------------------------- 1 | defmodule Tiktoken.P50KEdit do 2 | @behaviour Tiktoken.Encoding 3 | 4 | @impl Tiktoken.Encoding 5 | def encode_ordinary(text) do 6 | Tiktoken.Native.p50k_edit_encode_ordinary(text) 7 | end 8 | 9 | @impl Tiktoken.Encoding 10 | def encode(text, allowed_special \\ []) do 11 | Tiktoken.Native.p50k_edit_encode(text, allowed_special) 12 | end 13 | 14 | @impl Tiktoken.Encoding 15 | def encode_with_special_tokens(text) do 16 | Tiktoken.Native.p50k_edit_encode_with_special_tokens(text) 17 | end 18 | 19 | @impl Tiktoken.Encoding 20 | def decode(ids) do 21 | Tiktoken.Native.p50k_edit_decode(ids) 22 | end 23 | 24 | @impl Tiktoken.Encoding 25 | def count_tokens(text, allowed_special \\ []) do 26 | Tiktoken.Native.p50k_edit_count_tokens(text, allowed_special) 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/tiktoken/r50k.ex: -------------------------------------------------------------------------------- 1 | defmodule Tiktoken.R50K do 2 | @behaviour Tiktoken.Encoding 3 | 4 | @impl Tiktoken.Encoding 5 | def encode_ordinary(text) do 6 | Tiktoken.Native.r50k_encode_ordinary(text) 7 | end 8 | 9 | @impl Tiktoken.Encoding 10 | def encode(text, allowed_special \\ []) do 11 | Tiktoken.Native.r50k_encode(text, allowed_special) 12 | end 13 | 14 | @impl Tiktoken.Encoding 15 | def encode_with_special_tokens(text) do 16 | Tiktoken.Native.r50k_encode_with_special_tokens(text) 17 | end 18 | 19 | @impl Tiktoken.Encoding 20 | def decode(ids) do 21 | Tiktoken.Native.r50k_decode(ids) 22 | end 23 | 24 | @impl Tiktoken.Encoding 25 | def count_tokens(text, allowed_special \\ []) do 26 | Tiktoken.Native.r50k_count_tokens(text, allowed_special) 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule Tiktoken.MixProject do 2 | use Mix.Project 3 | 4 | @version "0.4.1" 5 | @url 
"https://github.com/connorjacobsen/tiktoken-elixir" 6 | 7 | def project do 8 | [ 9 | app: :tiktoken, 10 | version: @version, 11 | elixir: "~> 1.13", 12 | description: "Elixir bindings for Tiktoken tokenizer", 13 | source_url: @url, 14 | homepage_url: @url, 15 | start_permanent: Mix.env() == :prod, 16 | package: package(), 17 | deps: deps(), 18 | licenses: licenses(), 19 | aliases: aliases() 20 | ] 21 | end 22 | 23 | # Run "mix help compile.app" to learn about applications. 24 | def application do 25 | [ 26 | extra_applications: [:logger] 27 | ] 28 | end 29 | 30 | # Run "mix help deps" to learn about dependencies. 31 | defp deps do 32 | [ 33 | {:ex_doc, "~> 0.27", only: :dev, runtime: false}, 34 | {:rustler, "~> 0.30.0"}, 35 | {:rustler_precompiled, "~> 0.8"} 36 | ] 37 | end 38 | 39 | defp licenses, do: ~w(MIT) 40 | 41 | defp package do 42 | [ 43 | files: ~w(lib .formatter.exs mix.exs README* LICENSE* native checksum-*.exs), 44 | licenses: ["MIT"], 45 | links: %{"GitHub" => "https://github.com/connorjacobsen/tiktoken-elixir"} 46 | ] 47 | end 48 | 49 | defp aliases do 50 | [ 51 | fmt: [ 52 | "format", 53 | "cmd cargo fmt --manifest-path native/tiktoken/Cargo.toml" 54 | ] 55 | ] 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "castore": {:hex, :castore, "1.0.4", "ff4d0fb2e6411c0479b1d965a814ea6d00e51eb2f58697446e9c41a97d940b28", [:mix], [], "hexpm", "9418c1b8144e11656f0be99943db4caf04612e3eaecefb5dae9a2a87565584f8"}, 3 | "earmark_parser": {:hex, :earmark_parser, "1.4.31", "a93921cdc6b9b869f519213d5bc79d9e218ba768d7270d46fdcf1c01bacff9e2", [:mix], [], "hexpm", "317d367ee0335ef037a87e46c91a2269fef6306413f731e8ec11fc45a7efd059"}, 4 | "ex_doc": {:hex, :ex_doc, "0.29.2", "dfa97532ba66910b2a3016a4bbd796f41a86fc71dd5227e96f4c8581fdf0fdf0", [:mix], [{:earmark_parser, "~> 1.4.19", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "6b5d7139eda18a753e3250e27e4a929f8d2c880dd0d460cb9986305dea3e03af"}, 5 | "jason": {:hex, :jason, "1.4.1", "af1504e35f629ddcdd6addb3513c3853991f694921b1b9368b0bd32beb9f1b63", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fbb01ecdfd565b56261302f7e1fcc27c4fb8f32d56eab74db621fc154604a7a1"}, 6 | "makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"}, 7 | "makeup_elixir": {:hex, :makeup_elixir, "0.16.0", "f8c570a0d33f8039513fbccaf7108c5d750f47d8defd44088371191b76492b0b", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "28b2cbdc13960a46ae9a8858c4bebdec3c9a6d7b4b9e7f4ed1502f8159f338e7"}, 8 | "makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"}, 9 | "nimble_parsec": {:hex, :nimble_parsec, "1.2.3", 
"244836e6e3f1200c7f30cb56733fd808744eca61fd182f731eac4af635cc6d0b", [:mix], [], "hexpm", "c8d789e39b9131acf7b99291e93dae60ab48ef14a7ee9d58c6964f59efb570b0"}, 10 | "rustler": {:hex, :rustler, "0.30.0", "cefc49922132b072853fa9b0ca4dc2ffcb452f68fb73b779042b02d545e097fb", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:toml, "~> 0.6", [hex: :toml, repo: "hexpm", optional: false]}], "hexpm", "9ef1abb6a7dda35c47cfc649e6a5a61663af6cf842a55814a554a84607dee389"}, 11 | "rustler_precompiled": {:hex, :rustler_precompiled, "0.7.0", "5d0834fc06dbc76dd1034482f17b1797df0dba9b491cef8bb045fcaca94bcade", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "fdf43a6835f4e4de5bfbc4c019bfb8c46d124bd4635fefa3e20d9a2bbbec1512"}, 12 | "toml": {:hex, :toml, "0.7.0", "fbcd773caa937d0c7a02c301a1feea25612720ac3fa1ccb8bfd9d30d822911de", [:mix], [], "hexpm", "0690246a2478c1defd100b0c9b89b4ea280a22be9a7b313a8a058a2408a2fa70"}, 13 | } 14 | -------------------------------------------------------------------------------- /native/tiktoken/.cargo/config: -------------------------------------------------------------------------------- 1 | [target.x86_64-apple-darwin] 2 | rustflags = [ 3 | "-C", "link-arg=-undefined", 4 | "-C", "link-arg=dynamic_lookup", 5 | ] 6 | 7 | [target.aarch64-apple-darwin] 8 | rustflags = [ 9 | "-C", "link-arg=-undefined", 10 | "-C", "link-arg=dynamic_lookup", 11 | ] 12 | 13 | # See https://github.com/rust-lang/rust/issues/59302 14 | [target.x86_64-unknown-linux-musl] 15 | rustflags = [ 16 | "-C", "target-feature=-crt-static" 17 | ] 18 | 19 | # See https://github.com/rust-lang/rust/issues/59302 20 | [target.aarch64-unknown-linux-musl] 21 | rustflags = [ 22 | "-C", "target-feature=-crt-static" 23 | ] 24 | 25 | [profile.release] 26 | lto = true -------------------------------------------------------------------------------- /native/tiktoken/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /native/tiktoken/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.0.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "anyhow" 16 | version = "1.0.80" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" 19 | 20 | [[package]] 21 | name = "autocfg" 22 | version = "1.1.0" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" 25 | 26 | [[package]] 27 | name = "base64" 28 | version = "0.21.7" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" 31 | 32 | [[package]] 33 | name = "bit-set" 34 | version = "0.5.3" 35 | source = "registry+https://github.com/rust-lang/crates.io-index" 36 | checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" 37 | dependencies = [ 38 | "bit-vec", 39 | ] 40 | 41 | [[package]] 42 | name = "bit-vec" 43 | version = "0.6.3" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" 46 | 47 | [[package]] 48 | name = "bitflags" 49 | version = "1.3.2" 50 | source = "registry+https://github.com/rust-lang/crates.io-index" 51 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 52 | 53 | [[package]] 54 | name = "bstr" 55 | version = "1.7.0" 56 | source = "registry+https://github.com/rust-lang/crates.io-index" 57 | checksum = "c79ad7fb2dd38f3dabd76b09c6a5a20c038fc0213ef1e9afd30eb777f120f019" 58 | dependencies = [ 59 | "memchr", 60 | "regex-automata 0.4.1", 61 | "serde", 62 | ] 63 | 64 | [[package]] 65 | name = "cfg-if" 66 | version = "1.0.0" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 69 | 70 | [[package]] 71 | name = "fancy-regex" 72 | version = "0.12.0" 73 | source = "registry+https://github.com/rust-lang/crates.io-index" 74 | checksum = "7493d4c459da9f84325ad297371a6b2b8a162800873a22e3b6b6512e61d18c05" 75 | dependencies = [ 76 | "bit-set", 77 | "regex", 78 | ] 79 | 80 | [[package]] 81 | name = "heck" 82 | version = "0.5.0" 83 | source = "registry+https://github.com/rust-lang/crates.io-index" 84 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 85 | 86 | [[package]] 87 | name = "lazy_static" 88 | version = "1.4.0" 89 | source = "registry+https://github.com/rust-lang/crates.io-index" 90 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 91 | 92 | [[package]] 93 | name = "libc" 94 | version = "0.2.147" 95 | source = "registry+https://github.com/rust-lang/crates.io-index" 96 | checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" 97 | 98 | [[package]] 99 | name = "lock_api" 100 | version = "0.4.10" 101 | source = "registry+https://github.com/rust-lang/crates.io-index" 102 | checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" 103 | dependencies = [ 104 | "autocfg", 105 | "scopeguard", 106 | ] 107 | 108 | [[package]] 109 | name = "memchr" 110 | version = "2.6.4" 111 | source = "registry+https://github.com/rust-lang/crates.io-index" 112 | 
checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" 113 | 114 | [[package]] 115 | name = "parking_lot" 116 | version = "0.12.1" 117 | source = "registry+https://github.com/rust-lang/crates.io-index" 118 | checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" 119 | dependencies = [ 120 | "lock_api", 121 | "parking_lot_core", 122 | ] 123 | 124 | [[package]] 125 | name = "parking_lot_core" 126 | version = "0.9.8" 127 | source = "registry+https://github.com/rust-lang/crates.io-index" 128 | checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" 129 | dependencies = [ 130 | "cfg-if", 131 | "libc", 132 | "redox_syscall", 133 | "smallvec", 134 | "windows-targets", 135 | ] 136 | 137 | [[package]] 138 | name = "proc-macro2" 139 | version = "1.0.64" 140 | source = "registry+https://github.com/rust-lang/crates.io-index" 141 | checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da" 142 | dependencies = [ 143 | "unicode-ident", 144 | ] 145 | 146 | [[package]] 147 | name = "quote" 148 | version = "1.0.29" 149 | source = "registry+https://github.com/rust-lang/crates.io-index" 150 | checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" 151 | dependencies = [ 152 | "proc-macro2", 153 | ] 154 | 155 | [[package]] 156 | name = "redox_syscall" 157 | version = "0.3.5" 158 | source = "registry+https://github.com/rust-lang/crates.io-index" 159 | checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" 160 | dependencies = [ 161 | "bitflags", 162 | ] 163 | 164 | [[package]] 165 | name = "regex" 166 | version = "1.9.1" 167 | source = "registry+https://github.com/rust-lang/crates.io-index" 168 | checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575" 169 | dependencies = [ 170 | "aho-corasick", 171 | "memchr", 172 | "regex-automata 0.3.2", 173 | "regex-syntax", 174 | ] 175 | 176 | [[package]] 177 | name = "regex-automata" 178 | version = "0.3.2" 179 | source = "registry+https://github.com/rust-lang/crates.io-index" 180 | checksum = "83d3daa6976cffb758ec878f108ba0e062a45b2d6ca3a2cca965338855476caf" 181 | dependencies = [ 182 | "aho-corasick", 183 | "memchr", 184 | "regex-syntax", 185 | ] 186 | 187 | [[package]] 188 | name = "regex-automata" 189 | version = "0.4.1" 190 | source = "registry+https://github.com/rust-lang/crates.io-index" 191 | checksum = "465c6fc0621e4abc4187a2bda0937bfd4f722c2730b29562e19689ea796c9a4b" 192 | 193 | [[package]] 194 | name = "regex-syntax" 195 | version = "0.7.3" 196 | source = "registry+https://github.com/rust-lang/crates.io-index" 197 | checksum = "2ab07dc67230e4a4718e70fd5c20055a4334b121f1f9db8fe63ef39ce9b8c846" 198 | 199 | [[package]] 200 | name = "rustc-hash" 201 | version = "1.1.0" 202 | source = "registry+https://github.com/rust-lang/crates.io-index" 203 | checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" 204 | 205 | [[package]] 206 | name = "rustler" 207 | version = "0.33.0" 208 | source = "registry+https://github.com/rust-lang/crates.io-index" 209 | checksum = "45d51ae0239c57c3a3e603dd855ace6795078ef33c95c85d397a100ac62ed352" 210 | dependencies = [ 211 | "rustler_codegen", 212 | "rustler_sys", 213 | ] 214 | 215 | [[package]] 216 | name = "rustler_codegen" 217 | version = "0.33.0" 218 | source = "registry+https://github.com/rust-lang/crates.io-index" 219 | checksum = "27061f1a2150ad64717dca73902678c124b0619b0d06563294df265bc84759e1" 220 | dependencies = [ 221 | "heck", 222 | "proc-macro2", 223 | 
"quote", 224 | "syn", 225 | ] 226 | 227 | [[package]] 228 | name = "rustler_sys" 229 | version = "2.4.1" 230 | source = "registry+https://github.com/rust-lang/crates.io-index" 231 | checksum = "2062df0445156ae93cf695ef38c00683848d956b30507592143c01fe8fb52fda" 232 | dependencies = [ 233 | "regex", 234 | "unreachable", 235 | ] 236 | 237 | [[package]] 238 | name = "scopeguard" 239 | version = "1.1.0" 240 | source = "registry+https://github.com/rust-lang/crates.io-index" 241 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" 242 | 243 | [[package]] 244 | name = "serde" 245 | version = "1.0.171" 246 | source = "registry+https://github.com/rust-lang/crates.io-index" 247 | checksum = "30e27d1e4fd7659406c492fd6cfaf2066ba8773de45ca75e855590f856dc34a9" 248 | 249 | [[package]] 250 | name = "smallvec" 251 | version = "1.11.0" 252 | source = "registry+https://github.com/rust-lang/crates.io-index" 253 | checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" 254 | 255 | [[package]] 256 | name = "syn" 257 | version = "2.0.32" 258 | source = "registry+https://github.com/rust-lang/crates.io-index" 259 | checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" 260 | dependencies = [ 261 | "proc-macro2", 262 | "quote", 263 | "unicode-ident", 264 | ] 265 | 266 | [[package]] 267 | name = "tiktoken" 268 | version = "0.1.0" 269 | dependencies = [ 270 | "rustler", 271 | "tiktoken-rs", 272 | ] 273 | 274 | [[package]] 275 | name = "tiktoken-rs" 276 | version = "0.5.9" 277 | source = "registry+https://github.com/rust-lang/crates.io-index" 278 | checksum = "c314e7ce51440f9e8f5a497394682a57b7c323d0f4d0a6b1b13c429056e0e234" 279 | dependencies = [ 280 | "anyhow", 281 | "base64", 282 | "bstr", 283 | "fancy-regex", 284 | "lazy_static", 285 | "parking_lot", 286 | "rustc-hash", 287 | ] 288 | 289 | [[package]] 290 | name = "unicode-ident" 291 | version = "1.0.10" 292 | source = "registry+https://github.com/rust-lang/crates.io-index" 293 | checksum = "22049a19f4a68748a168c0fc439f9516686aa045927ff767eca0a85101fb6e73" 294 | 295 | [[package]] 296 | name = "unreachable" 297 | version = "1.0.0" 298 | source = "registry+https://github.com/rust-lang/crates.io-index" 299 | checksum = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" 300 | dependencies = [ 301 | "void", 302 | ] 303 | 304 | [[package]] 305 | name = "void" 306 | version = "1.0.2" 307 | source = "registry+https://github.com/rust-lang/crates.io-index" 308 | checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" 309 | 310 | [[package]] 311 | name = "windows-targets" 312 | version = "0.48.1" 313 | source = "registry+https://github.com/rust-lang/crates.io-index" 314 | checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" 315 | dependencies = [ 316 | "windows_aarch64_gnullvm", 317 | "windows_aarch64_msvc", 318 | "windows_i686_gnu", 319 | "windows_i686_msvc", 320 | "windows_x86_64_gnu", 321 | "windows_x86_64_gnullvm", 322 | "windows_x86_64_msvc", 323 | ] 324 | 325 | [[package]] 326 | name = "windows_aarch64_gnullvm" 327 | version = "0.48.0" 328 | source = "registry+https://github.com/rust-lang/crates.io-index" 329 | checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" 330 | 331 | [[package]] 332 | name = "windows_aarch64_msvc" 333 | version = "0.48.0" 334 | source = "registry+https://github.com/rust-lang/crates.io-index" 335 | checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" 336 | 337 | 
[[package]] 338 | name = "windows_i686_gnu" 339 | version = "0.48.0" 340 | source = "registry+https://github.com/rust-lang/crates.io-index" 341 | checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" 342 | 343 | [[package]] 344 | name = "windows_i686_msvc" 345 | version = "0.48.0" 346 | source = "registry+https://github.com/rust-lang/crates.io-index" 347 | checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" 348 | 349 | [[package]] 350 | name = "windows_x86_64_gnu" 351 | version = "0.48.0" 352 | source = "registry+https://github.com/rust-lang/crates.io-index" 353 | checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" 354 | 355 | [[package]] 356 | name = "windows_x86_64_gnullvm" 357 | version = "0.48.0" 358 | source = "registry+https://github.com/rust-lang/crates.io-index" 359 | checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" 360 | 361 | [[package]] 362 | name = "windows_x86_64_msvc" 363 | version = "0.48.0" 364 | source = "registry+https://github.com/rust-lang/crates.io-index" 365 | checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" 366 | -------------------------------------------------------------------------------- /native/tiktoken/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tiktoken" 3 | version = "0.1.0" 4 | authors = [] 5 | edition = "2021" 6 | 7 | [lib] 8 | name = "tiktoken" 9 | path = "src/lib.rs" 10 | crate-type = ["cdylib"] 11 | 12 | [dependencies] 13 | rustler = "0.33.0" 14 | tiktoken-rs = "0.5.9" 15 | -------------------------------------------------------------------------------- /native/tiktoken/Cross.toml: -------------------------------------------------------------------------------- 1 | [build.env] 2 | passthrough = [ 3 | "RUSTLER_NIF_VERSION" 4 | ] -------------------------------------------------------------------------------- /native/tiktoken/README.md: -------------------------------------------------------------------------------- 1 | # NIF for Elixir.Tiktoken 2 | 3 | ## To build the NIF module: 4 | 5 | - Your NIF will now build along with your project. 6 | 7 | ## To load the NIF: 8 | 9 | ```elixir 10 | defmodule Tiktoken do 11 | use Rustler, otp_app: :tiktoken, crate: "tiktoken" 12 | 13 | # When your NIF is loaded, it will override this function. 14 | def add(_a, _b), do: :erlang.nif_error(:nif_not_loaded) 15 | end 16 | ``` 17 | 18 | ## Examples 19 | 20 | [This](https://github.com/rusterlium/NifIo) is a complete example of a NIF written in Rust. 
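Note that in this project the NIF is not loaded with plain `Rustler` as in the template above: the `Tiktoken.Native` module uses `RustlerPrecompiled`, so a matching precompiled library is fetched from the GitHub release unless `:force_build` is configured. A condensed sketch of that module (see `lib/tiktoken/native.ex` for the full list of stubs):

```elixir
defmodule Tiktoken.Native do
  version = Mix.Project.config()[:version]
  url = Mix.Project.config()[:source_url]

  # Downloads the precompiled NIF matching this version from the GitHub
  # release, or compiles it locally when :force_build is configured.
  use RustlerPrecompiled,
    otp_app: :tiktoken,
    crate: "tiktoken",
    base_url: "#{url}/releases/download/v#{version}",
    version: version

  # Every exported NIF has a stub that is replaced once the library loads.
  def cl100k_encode_ordinary(_input), do: :erlang.nif_error(:nif_not_loaded)
end
```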
21 | 
-------------------------------------------------------------------------------- /native/tiktoken/src/lib.rs: --------------------------------------------------------------------------------
1 | use std::collections::HashSet;
2 | use std::vec::Vec;
3 | 
4 | use tiktoken_rs::CoreBPE;
5 | use tiktoken_rs::{cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base};
6 | 
7 | #[rustler::nif(schedule = "DirtyCpu")]
8 | fn encoding_for_model(model: &str) -> Option<&str> {
9 |     match tiktoken_rs::tokenizer::get_tokenizer(model) {
10 |         Some(tiktoken_rs::tokenizer::Tokenizer::O200kBase) => Some("o200k_base"),
11 |         Some(tiktoken_rs::tokenizer::Tokenizer::Cl100kBase) => Some("cl100k_base"),
12 |         Some(tiktoken_rs::tokenizer::Tokenizer::P50kBase) => Some("p50k_base"),
13 |         Some(tiktoken_rs::tokenizer::Tokenizer::R50kBase) => Some("r50k_base"),
14 |         Some(tiktoken_rs::tokenizer::Tokenizer::P50kEdit) => Some("p50k_edit"),
15 |         _ => None,
16 |     }
17 | }
18 | 
19 | thread_local! {
20 |     static R50K_BASE: CoreBPE = r50k_base().unwrap();
21 |     static P50K_BASE: CoreBPE = p50k_base().unwrap();
22 |     static P50K_EDIT: CoreBPE = p50k_edit().unwrap();
23 |     static CL100K_BASE: CoreBPE = cl100k_base().unwrap();
24 |     static O200K_BASE: CoreBPE = o200k_base().unwrap();
25 | }
26 | 
27 | // p50k
28 | 
29 | #[rustler::nif(schedule = "DirtyCpu")]
30 | fn p50k_encode_ordinary(text: &str) -> Result<Vec<usize>, String> {
31 |     Ok(P50K_BASE.with(|bpe| bpe.encode_ordinary(text)))
32 | }
33 | 
34 | #[rustler::nif(schedule = "DirtyCpu")]
35 | fn p50k_encode(text: &str, allowed_special: Vec<&str>) -> Result<Vec<usize>, String> {
36 |     let set = HashSet::from_iter(allowed_special.iter().cloned());
37 |     Ok(P50K_BASE.with(|bpe| bpe.encode(text, set)))
38 | }
39 | 
40 | #[rustler::nif(schedule = "DirtyCpu")]
41 | fn p50k_encode_with_special_tokens(text: &str) -> Result<Vec<usize>, String> {
42 |     Ok(P50K_BASE.with(|bpe| bpe.encode_with_special_tokens(text)))
43 | }
44 | 
45 | #[rustler::nif(schedule = "DirtyCpu")]
46 | fn p50k_decode(ids: Vec<usize>) -> Result<String, String> {
47 |     P50K_BASE.with(|bpe| bpe.decode(ids).map_err(|e| e.to_string()))
48 | }
49 | 
50 | #[rustler::nif(schedule = "DirtyCpu")]
51 | fn p50k_count_tokens(text: &str, allowed_special: Vec<&str>) -> Result<usize, String> {
52 |     let set: HashSet<&str> = allowed_special.into_iter().collect();
53 |     Ok(P50K_BASE.with(|bpe| bpe.encode(text, set).len()))
54 | }
55 | 
56 | // p50k edit
57 | 
58 | #[rustler::nif(schedule = "DirtyCpu")]
59 | fn p50k_edit_encode_ordinary(text: &str) -> Result<Vec<usize>, String> {
60 |     Ok(P50K_EDIT.with(|bpe| bpe.encode_ordinary(text)))
61 | }
62 | 
63 | #[rustler::nif(schedule = "DirtyCpu")]
64 | fn p50k_edit_encode(text: &str, allowed_special: Vec<&str>) -> Result<Vec<usize>, String> {
65 |     let set = HashSet::from_iter(allowed_special.iter().cloned());
66 |     Ok(P50K_EDIT.with(|bpe| bpe.encode(text, set)))
67 | }
68 | 
69 | #[rustler::nif(schedule = "DirtyCpu")]
70 | fn p50k_edit_encode_with_special_tokens(text: &str) -> Result<Vec<usize>, String> {
71 |     Ok(P50K_EDIT.with(|bpe| bpe.encode_with_special_tokens(text)))
72 | }
73 | 
74 | #[rustler::nif(schedule = "DirtyCpu")]
75 | fn p50k_edit_decode(ids: Vec<usize>) -> Result<String, String> {
76 |     P50K_EDIT.with(|bpe| bpe.decode(ids).map_err(|e| e.to_string()))
77 | }
78 | 
79 | #[rustler::nif(schedule = "DirtyCpu")]
80 | fn p50k_edit_count_tokens(text: &str, allowed_special: Vec<&str>) -> Result<usize, String> {
81 |     let set: HashSet<&str> = allowed_special.into_iter().collect();
82 |     Ok(P50K_EDIT.with(|bpe| bpe.encode(text, set).len()))
83 | }
84 | 
85 | // r50k
86 | 
87 | #[rustler::nif(schedule = "DirtyCpu")]
88 | fn r50k_encode_ordinary(text: &str) -> Result<Vec<usize>, String> {
89 |     Ok(R50K_BASE.with(|bpe| bpe.encode_ordinary(text)))
90 | }
91 | 
92 | #[rustler::nif(schedule = "DirtyCpu")]
93 | fn r50k_encode(text: &str, allowed_special: Vec<&str>) -> Result<Vec<usize>, String> {
94 |     let set = HashSet::from_iter(allowed_special.iter().cloned());
95 |     Ok(R50K_BASE.with(|bpe| bpe.encode(text, set)))
96 | }
97 | 
98 | #[rustler::nif(schedule = "DirtyCpu")]
99 | fn r50k_encode_with_special_tokens(text: &str) -> Result<Vec<usize>, String> {
100 |     Ok(R50K_BASE.with(|bpe| bpe.encode_with_special_tokens(text)))
101 | }
102 | 
103 | #[rustler::nif(schedule = "DirtyCpu")]
104 | fn r50k_decode(ids: Vec<usize>) -> Result<String, String> {
105 |     R50K_BASE.with(|bpe| bpe.decode(ids).map_err(|e| e.to_string()))
106 | }
107 | 
108 | #[rustler::nif(schedule = "DirtyCpu")]
109 | fn r50k_count_tokens(text: &str, allowed_special: Vec<&str>) -> Result<usize, String> {
110 |     let set: HashSet<&str> = allowed_special.into_iter().collect();
111 |     Ok(R50K_BASE.with(|bpe| bpe.encode(text, set).len()))
112 | }
113 | 
114 | // cl100k
115 | 
116 | #[rustler::nif(schedule = "DirtyCpu")]
117 | fn cl100k_encode_ordinary(text: &str) -> Result<Vec<usize>, String> {
118 |     Ok(CL100K_BASE.with(|bpe| bpe.encode_ordinary(text)))
119 | }
120 | 
121 | #[rustler::nif(schedule = "DirtyCpu")]
122 | fn cl100k_encode(text: &str, allowed_special: Vec<&str>) -> Result<Vec<usize>, String> {
123 |     let set = HashSet::from_iter(allowed_special.iter().cloned());
124 |     Ok(CL100K_BASE.with(|bpe| bpe.encode(text, set)))
125 | }
126 | 
127 | #[rustler::nif(schedule = "DirtyCpu")]
128 | fn cl100k_encode_with_special_tokens(text: &str) -> Result<Vec<usize>, String> {
129 |     Ok(CL100K_BASE.with(|bpe| bpe.encode_with_special_tokens(text)))
130 | }
131 | 
132 | #[rustler::nif(schedule = "DirtyCpu")]
133 | fn cl100k_decode(ids: Vec<usize>) -> Result<String, String> {
134 |     CL100K_BASE.with(|bpe| bpe.decode(ids).map_err(|e| e.to_string()))
135 | }
136 | 
137 | #[rustler::nif(schedule = "DirtyCpu")]
138 | fn cl100k_count_tokens(text: &str, allowed_special: Vec<&str>) -> Result<usize, String> {
139 |     let set: HashSet<&str> = allowed_special.into_iter().collect();
140 |     Ok(CL100K_BASE.with(|bpe| bpe.encode(text, set).len()))
141 | }
142 | 
143 | // o200k
144 | 
145 | #[rustler::nif(schedule = "DirtyCpu")]
146 | fn o200k_encode_ordinary(text: &str) -> Result<Vec<usize>, String> {
147 |     Ok(O200K_BASE.with(|bpe| bpe.encode_ordinary(text)))
148 | }
149 | 
150 | #[rustler::nif(schedule = "DirtyCpu")]
151 | fn o200k_encode(text: &str, allowed_special: Vec<&str>) -> Result<Vec<usize>, String> {
152 |     let set = HashSet::from_iter(allowed_special.iter().cloned());
153 |     Ok(O200K_BASE.with(|bpe| bpe.encode(text, set)))
154 | }
155 | 
156 | #[rustler::nif(schedule = "DirtyCpu")]
157 | fn o200k_encode_with_special_tokens(text: &str) -> Result<Vec<usize>, String> {
158 |     Ok(O200K_BASE.with(|bpe| bpe.encode_with_special_tokens(text)))
159 | }
160 | 
161 | #[rustler::nif(schedule = "DirtyCpu")]
162 | fn o200k_decode(ids: Vec<usize>) -> Result<String, String> {
163 |     O200K_BASE.with(|bpe| bpe.decode(ids).map_err(|e| e.to_string()))
164 | }
165 | 
166 | #[rustler::nif(schedule = "DirtyCpu")]
167 | fn o200k_count_tokens(text: &str, allowed_special: Vec<&str>) -> Result<usize, String> {
168 |     let set: HashSet<&str> = allowed_special.into_iter().collect();
169 |     Ok(O200K_BASE.with(|bpe| bpe.encode(text, set).len()))
170 | }
171 | 
172 | #[rustler::nif(schedule = "DirtyCpu")]
173 | fn context_size_for_model(model: &str) -> usize {
174 |     tiktoken_rs::model::get_context_size(model)
175 | }
176 | 
177 | rustler::init!(
178 |     "Elixir.Tiktoken.Native",
179 |     [
180 |         encoding_for_model,
181 |         p50k_encode_ordinary,
182 |         p50k_encode,
183 |
p50k_encode_with_special_tokens, 184 | p50k_decode, 185 | p50k_count_tokens, 186 | p50k_edit_encode_ordinary, 187 | p50k_edit_encode, 188 | p50k_edit_encode_with_special_tokens, 189 | p50k_edit_decode, 190 | p50k_edit_count_tokens, 191 | r50k_encode_ordinary, 192 | r50k_encode, 193 | r50k_encode_with_special_tokens, 194 | r50k_decode, 195 | r50k_count_tokens, 196 | cl100k_encode_ordinary, 197 | cl100k_encode, 198 | cl100k_encode_with_special_tokens, 199 | cl100k_decode, 200 | cl100k_count_tokens, 201 | o200k_encode_ordinary, 202 | o200k_encode, 203 | o200k_encode_with_special_tokens, 204 | o200k_decode, 205 | o200k_count_tokens, 206 | context_size_for_model 207 | ] 208 | ); 209 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | ExUnit.start() 2 | -------------------------------------------------------------------------------- /test/tiktoken_test.exs: -------------------------------------------------------------------------------- 1 | defmodule TiktokenTest do 2 | use ExUnit.Case 3 | doctest Tiktoken 4 | 5 | @known_models [ 6 | # chat 7 | {"gpt-3.5-turbo", Tiktoken.CL100K, 4_096}, 8 | {"gpt-3.5-turbo-0125", Tiktoken.CL100K, 4_096}, 9 | {"gpt-3.5-turbo-1106", Tiktoken.CL100K, 16_385}, 10 | {"gpt-3.5-turbo-instruct", Tiktoken.CL100K, 4_096}, 11 | {"gpt-3.5-turbo-16k", Tiktoken.CL100K, 16_384}, 12 | {"gpt-3.5-turbo-0613", Tiktoken.CL100K, 4_096}, 13 | {"gpt-3.5-turbo-16k-0613", Tiktoken.CL100K, 16_384}, 14 | {"gpt-4-0125-preview", Tiktoken.CL100K, 128_000}, 15 | {"gpt-4-turbo-preview", Tiktoken.CL100K, 8_192}, 16 | {"gpt-4-1106-preview", Tiktoken.CL100K, 128_000}, 17 | {"gpt-4-vision-preview", Tiktoken.CL100K, 8_192}, 18 | {"gpt-4-06-vision-preview", Tiktoken.CL100K, 8_192}, 19 | {"gpt-4", Tiktoken.CL100K, 8_192}, 20 | {"gpt-4-0613", Tiktoken.CL100K, 8_192}, 21 | {"gpt-4-32k", Tiktoken.CL100K, 32_768}, 22 | {"gpt-4-32k-0613", Tiktoken.CL100K, 32_768}, 23 | # text 24 | {"text-davinci-003", Tiktoken.P50K, 4_097}, 25 | {"text-davinci-002", Tiktoken.P50K, 4_097}, 26 | {"text-davinci-001", Tiktoken.R50K, 4_096}, 27 | {"text-curie-001", Tiktoken.R50K, 2_049}, 28 | {"text-babbage-001", Tiktoken.R50K, 2_049}, 29 | {"text-ada-001", Tiktoken.R50K, 2_049}, 30 | {"davinci", Tiktoken.R50K, 2_049}, 31 | {"curie", Tiktoken.R50K, 2_049}, 32 | {"babbage", Tiktoken.R50K, 2_049}, 33 | {"ada", Tiktoken.R50K, 2_049}, 34 | # code 35 | {"code-davinci-002", Tiktoken.P50K, 8_001}, 36 | {"code-davinci-001", Tiktoken.P50K, 4_096}, 37 | {"code-cushman-002", Tiktoken.P50K, 4_096}, 38 | {"code-cushman-001", Tiktoken.P50K, 2_048}, 39 | {"davinci-codex", Tiktoken.P50K, 2_049}, 40 | {"cushman-codex", Tiktoken.P50K, 4_096}, 41 | # edit 42 | {"text-davinci-edit-001", Tiktoken.P50KEdit, 4_096}, 43 | {"code-davinci-edit-001", Tiktoken.P50KEdit, 4_096}, 44 | # embeddings 45 | # {"text-embedding-3-large", Tiktoken.CL100K}, 46 | # {"text-embedding-3-small", Tiktoken.CL100K}, 47 | {"text-embedding-ada-002", Tiktoken.CL100K, 8_192}, 48 | # old embeddings 49 | {"text-similarity-davinci-001", Tiktoken.R50K, 4_096}, 50 | {"text-similarity-curie-001", Tiktoken.R50K, 4_096}, 51 | {"text-similarity-babbage-001", Tiktoken.R50K, 4_096}, 52 | {"text-similarity-ada-001", Tiktoken.R50K, 4_096}, 53 | {"text-search-davinci-doc-001", Tiktoken.R50K, 4_096}, 54 | {"text-search-curie-doc-001", Tiktoken.R50K, 4_096}, 55 | {"text-search-babbage-doc-001", Tiktoken.R50K, 4_096}, 56 | {"text-search-ada-doc-001", 
Tiktoken.R50K, 4_096}, 57 | {"code-search-babbage-code-001", Tiktoken.R50K, 4_096}, 58 | {"code-search-ada-code-001", Tiktoken.R50K, 4_096} 59 | # moderation 60 | # {"text-moderation-latest", Tiktoken.CL100K}, 61 | # {"text-moderation-stable", Tiktoken.CL100K}, 62 | # {"text-moderation-007", Tiktoken.CL100K} 63 | # open source 64 | # {"gpt2", "gpt2"} 65 | ] 66 | 67 | describe "encoding_for_model/1" do 68 | test "get the proper module for supported model" do 69 | @known_models 70 | |> Enum.each(fn {model, mod, _context_size} -> 71 | assert Tiktoken.encoding_for_model(model) == mod 72 | end) 73 | end 74 | 75 | test "get nil for unsupported model" do 76 | assert is_nil(Tiktoken.encoding_for_model("gpt2")) 77 | end 78 | end 79 | 80 | describe "encode_ordinary/2" do 81 | test "with supported model" do 82 | assert {:ok, ids} = 83 | Tiktoken.encode_ordinary("gpt-3.5-turbo", "Tell me more about Elixir!") 84 | 85 | assert length(ids) == 7 86 | end 87 | 88 | test "with unsupported model" do 89 | assert {:error, {:unsupported_model, "gpt2"}} = 90 | Tiktoken.encode_ordinary("gpt2", "Tell me more about Elixir!") 91 | end 92 | end 93 | 94 | describe "encode/2" do 95 | test "with supported model" do 96 | assert {:ok, ids} = 97 | Tiktoken.encode("gpt-3.5-turbo", "Tell me more about Elixir!") 98 | 99 | assert length(ids) == 7 100 | end 101 | 102 | test "with unsupported model" do 103 | assert {:error, {:unsupported_model, "gpt2"}} = 104 | Tiktoken.encode("gpt2", "Tell me more about Elixir!") 105 | end 106 | end 107 | 108 | describe "encode_with_special_tokens/2" do 109 | test "with supported model" do 110 | assert {:ok, ids} = 111 | Tiktoken.encode_with_special_tokens("gpt-3.5-turbo", "Tell me more about Elixir!") 112 | 113 | assert length(ids) == 7 114 | end 115 | 116 | test "with unsupported model" do 117 | assert {:error, {:unsupported_model, "gpt2"}} = 118 | Tiktoken.encode_with_special_tokens("gpt2", "Tell me more about Elixir!") 119 | end 120 | end 121 | 122 | describe "decode/2" do 123 | test "with supported model" do 124 | text = "Tell me more about Elixir!" 125 | 126 | assert {:ok, ids} = 127 | Tiktoken.encode("gpt-3.5-turbo", text) 128 | 129 | assert {:ok, ^text} = 130 | Tiktoken.decode("gpt-3.5-turbo", ids) 131 | end 132 | 133 | test "with unsupported model" do 134 | assert {:error, {:unsupported_model, "gpt2"}} = 135 | Tiktoken.decode("gpt2", [1]) 136 | end 137 | end 138 | 139 | describe "count_tokens/2" do 140 | test "with supported model" do 141 | text = "Tell me more about Elixir!" 142 | assert {:ok, count} = Tiktoken.count_tokens("gpt-3.5-turbo", text) 143 | assert count > 0 144 | assert count == length(elem(Tiktoken.encode("gpt-3.5-turbo", text), 1)) 145 | end 146 | 147 | test "with unsupported model" do 148 | assert {:error, {:unsupported_model, "gpt2"}} = 149 | Tiktoken.count_tokens("gpt2", "Hello") 150 | end 151 | 152 | test "with special tokens" do 153 | text = "Tell me more about Elixir!" 154 | special_tokens = ["<|endoftext|>"] 155 | assert {:ok, count} = Tiktoken.count_tokens("gpt-3.5-turbo", text, special_tokens) 156 | assert count > 0 157 | assert count == length(elem(Tiktoken.encode("gpt-3.5-turbo", text, special_tokens), 1)) 158 | end 159 | 160 | test "with different models" do 161 | text = "Tell me more about Elixir!" 
162 | models = ["gpt-3.5-turbo", "text-davinci-003", "text-davinci-edit-001"] 163 | 164 | Enum.each(models, fn model -> 165 | assert {:ok, count} = Tiktoken.count_tokens(model, text) 166 | assert count > 0 167 | assert count == length(elem(Tiktoken.encode(model, text), 1)) 168 | end) 169 | end 170 | end 171 | 172 | describe "context_size_for_model/1" do 173 | test "get proper context size for model" do 174 | @known_models 175 | |> Enum.each(fn {model, _mod, context_size} -> 176 | assert Tiktoken.context_size_for_model(model) == context_size 177 | end) 178 | end 179 | 180 | test "get 4096 for unknown model" do 181 | assert Tiktoken.context_size_for_model("unknown") == 4_096 182 | end 183 | end 184 | end 185 | --------------------------------------------------------------------------------
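Taken together, the public surface exercised above is small: resolve a model name to an encoding, encode/decode/count tokens, and query a model's context size. One common way to combine `count_tokens/2` with `context_size_for_model/1` is a pre-flight check before sending a prompt; the helper below is a hypothetical example, not part of the library:

```elixir
defmodule TokenBudget do
  # Returns {:ok, true | false} when the prompt fits the model's context
  # window, or {:error, {:unsupported_model, model}} for unknown models.
  def fits?(model, prompt) do
    with {:ok, count} <- Tiktoken.count_tokens(model, prompt) do
      {:ok, count <= Tiktoken.context_size_for_model(model)}
    end
  end
end

# TokenBudget.fits?("gpt-4", "Tell me more about Elixir!")
# #=> {:ok, true}
```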