├── doctest_helper.rb ├── .rspec ├── lib ├── tiktoken_ruby │ ├── version.rb │ └── encoding.rb └── tiktoken_ruby.rb ├── .standard.yml ├── sig └── tiktoken_ruby.rbs ├── bin ├── setup └── console ├── ext └── tiktoken_ruby │ ├── extconf.rb │ ├── Cargo.toml │ └── src │ ├── core_bpe_wrapper.rs │ └── lib.rs ├── .vscode └── settings.json ├── Gemfile ├── .gitignore ├── Cargo.toml ├── spec ├── spec_helper.rb └── tiktoken_ruby_spec.rb ├── Rakefile ├── .github ├── dependabot.yml └── workflows │ ├── build.yml │ └── cross_compile.yml ├── LICENSE.txt ├── script └── release ├── tiktoken_ruby.gemspec ├── README.md ├── Gemfile.lock └── Cargo.lock /doctest_helper.rb: -------------------------------------------------------------------------------- 1 | require "lib/tiktoken_ruby" 2 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | --require spec_helper 4 | -------------------------------------------------------------------------------- /lib/tiktoken_ruby/version.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Tiktoken 4 | VERSION = "0.0.13" 5 | end 6 | -------------------------------------------------------------------------------- /.standard.yml: -------------------------------------------------------------------------------- 1 | # For available configuration options, see: 2 | # https://github.com/testdouble/standard 3 | ruby_version: 2.6 4 | -------------------------------------------------------------------------------- /sig/tiktoken_ruby.rbs: -------------------------------------------------------------------------------- 1 | module TiktokenRuby 2 | VERSION: String 3 | # See the writing guide of rbs: https://github.com/ruby/rbs#guides 4 | end 5 | -------------------------------------------------------------------------------- 
/bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /ext/tiktoken_ruby/extconf.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "mkmf" 4 | require "rb_sys/mkmf" 5 | 6 | create_rust_makefile("tiktoken_ruby/tiktoken_ruby") 7 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[ruby]": { 3 | "editor.defaultFormatter": "Shopify.ruby-lsp" 4 | }, 5 | "[markdown]": { 6 | "editor.defaultFormatter": "esbenp.prettier-vscode" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source "https://rubygems.org" 4 | 5 | gemspec 6 | 7 | gem "rake" 8 | gem "rake-compiler" 9 | gem "rspec" 10 | gem "standard" 11 | gem "yard-doctest" 12 | gem "racc" 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | /doc/ 6 | /pkg/ 7 | /spec/reports/ 8 | /tmp/ 9 | *.bundle 10 | *.so 11 | *.o 12 | *.a 13 | mkmf.log 14 | target/ 15 | .yardopts 16 | 17 | # rspec failure tracking 18 | .rspec_status 19 | 20 | vendor/ 21 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | # This Cargo.toml is here to let 
externals tools (IDEs, etc.) know that this is 2 | # a Rust project. Your extensions dependencies should be added to the Cargo.toml 3 | # in the ext/ directory. 4 | 5 | [workspace] 6 | members = ["./ext/tiktoken_ruby"] 7 | resolver = "2" 8 | -------------------------------------------------------------------------------- /ext/tiktoken_ruby/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tiktoken_ruby" 3 | version = "0.1.0" 4 | edition = "2021" 5 | authors = ["IAPark "] 6 | license = "MIT" 7 | publish = false 8 | 9 | [lib] 10 | crate-type = ["cdylib"] 11 | 12 | [dependencies] 13 | magnus = { version = "0.8.2" } 14 | rb-sys = { version = "0.9.117", features = ["stable-api-compiled-fallback"] } 15 | tiktoken-rs = { version = "0.9.0" } 16 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require "bundler/setup" 5 | require "tiktoken_ruby" 6 | 7 | # You can add fixtures and/or initialization code here to make experimenting 8 | # with your gem easier. You can also use a different console, if you like. 9 | 10 | # (If you use this, don't forget to add pry to your Gemfile!) 11 | # require "pry" 12 | # Pry.start 13 | 14 | require "irb" 15 | IRB.start(__FILE__) 16 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "tiktoken_ruby" 4 | 5 | RSpec.configure do |config| 6 | # Enable flags like --only-failures and --next-failure 7 | config.example_status_persistence_file_path = ".rspec_status" 8 | 9 | # Disable RSpec exposing methods globally on `Module` and `main` 10 | config.disable_monkey_patching! 
11 | 12 | config.expect_with :rspec do |c| 13 | c.syntax = :expect 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "bundler/gem_tasks" 4 | require "rspec/core/rake_task" 5 | require "standard/rake" 6 | require "rake/extensiontask" 7 | require "rb_sys/extensiontask" 8 | 9 | GEMSPEC = Gem::Specification.load("tiktoken_ruby.gemspec") 10 | 11 | RbSys::ExtensionTask.new("tiktoken_ruby", GEMSPEC) do |ext| 12 | ext.lib_dir = "lib/tiktoken_ruby" 13 | end 14 | 15 | RSpec::Core::RakeTask.new(:spec) 16 | 17 | task :native, [:platform] do |_t, platform:| 18 | sh "bundle", "exec", "rb-sys-dock", "--platform", platform, "--build" 19 | end 20 | 21 | task build: :compile 22 | 23 | task default: %i[compile spec standard] 24 | 25 | # Packaging default (non-precompiled) gem 26 | require "rubygems/package_task" 27 | gem_path = Gem::PackageTask.new(GEMSPEC).define 28 | desc "Package the Ruby gem" 29 | task "package" => [gem_path] 30 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: monthly 7 | day: monday 8 | time: "09:00" 9 | timezone: "Etc/UTC" 10 | open-pull-requests-limit: 10 11 | groups: 12 | github-actions: 13 | patterns: 14 | - "*" 15 | 16 | - package-ecosystem: bundler 17 | directory: "/" 18 | schedule: 19 | interval: monthly 20 | day: monday 21 | time: "09:00" 22 | timezone: "Etc/UTC" 23 | open-pull-requests-limit: 10 24 | groups: 25 | bundler-dependencies: 26 | patterns: 27 | - "*" 28 | 29 | - package-ecosystem: cargo 30 | directory: "/" 31 | schedule: 32 | interval: monthly 33 | day: monday 34 | time: "09:00" 35 | timezone: 
"Etc/UTC" 36 | open-pull-requests-limit: 10 37 | groups: 38 | cargo: 39 | patterns: 40 | - "*" 41 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | on: [push, pull_request] 3 | jobs: 4 | test: 5 | strategy: 6 | fail-fast: false 7 | matrix: 8 | os: [ubuntu-latest] 9 | ruby: ["ruby-head", "3.4", "3.3", "3.2"] 10 | runs-on: ${{ matrix.os }} 11 | steps: 12 | - uses: actions/checkout@v6 13 | - uses: actions/cache@v4 14 | with: 15 | path: | 16 | ~/.cargo/registry 17 | ~/.cargo/git 18 | tmp 19 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} 20 | - uses: ruby/setup-ruby@v1 21 | with: 22 | ruby-version: ${{ matrix.ruby }} 23 | bundler-cache: true 24 | - run: bundle exec rake build 25 | - run: bundle exec rake spec 26 | lint: 27 | strategy: 28 | matrix: 29 | os: [ubuntu-latest] 30 | runs-on: ${{ matrix.os }} 31 | steps: 32 | - uses: actions/checkout@v6 33 | - uses: ruby/setup-ruby@v1 34 | with: 35 | ruby-version: 3.4 36 | bundler-cache: true 37 | - run: bundle exec rake standard 38 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2023 IAPark 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /script/release: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [ -z "${TIKTOKEN_PUBLISH_KEY}" ]; then 6 | echo "Error: TIKTOKEN_PUBLISH_KEY is not set. This is the RubyGems API key to push the gem." 7 | exit 1 8 | fi 9 | 10 | run_id="" 11 | # Parse arguments 12 | while [[ "$#" -gt 0 ]]; do 13 | case $1 in 14 | --run-id) 15 | run_id="$2" 16 | shift 2 17 | ;; 18 | *) 19 | echo "Unknown parameter passed: $1" 20 | exit 1 21 | ;; 22 | esac 23 | done 24 | 25 | if [ -z "${run_id}" ]; then 26 | echo "Error: --run-id is not provided. Please provide the GitHub Action run id for the cross-compile workflow." 
27 | exit 1 28 | fi 29 | 30 | version=$(grep VERSION lib/tiktoken_ruby/version.rb | head -n 1 | cut -d'"' -f2) 31 | echo "Building tiktoken_ruby v$version, using artifacts from run $run_id" 32 | 33 | rm -rf pkg/cross-compiled 34 | gh run download "$run_id" -D pkg/cross-compiled 35 | 36 | for gem in pkg/cross-compiled/cross-gem-*/tiktoken_ruby-"$version"*.gem ; do 37 | echo "Publishing $gem" 38 | GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "$gem" --host https://rubygems.org 39 | done 40 | 41 | # last but not least, the uncompiled gem 42 | bundle exec rake package 43 | GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "pkg/tiktoken_ruby-$version.gem" --host https://rubygems.org 44 | -------------------------------------------------------------------------------- /.github/workflows/cross_compile.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: cross-compile 3 | on: [push] 4 | 5 | jobs: 6 | ci-data: 7 | runs-on: ubuntu-latest 8 | outputs: 9 | result: ${{ steps.fetch.outputs.result }} 10 | steps: 11 | - uses: oxidize-rb/actions/fetch-ci-data@main 12 | id: fetch 13 | with: 14 | supported-ruby-platforms: | 15 | exclude: ['aarch64-linux-musl', 'x64-mingw32'] 16 | stable-ruby-versions: | 17 | exclude: [head] 18 | cross-gem: 19 | name: Compile native gem for ${{ matrix.platform }} 20 | runs-on: ubuntu-latest 21 | needs: ci-data 22 | strategy: 23 | fail-fast: false 24 | matrix: 25 | platform: ${{ fromJSON(needs.ci-data.outputs.result).supported-ruby-platforms }} 26 | steps: 27 | - uses: actions/checkout@v6 28 | 29 | - uses: ruby/setup-ruby@v1 30 | with: 31 | ruby-version: "3.2" 32 | - name: "bundle install" 33 | shell: bash 34 | run: "bundle install" 35 | - uses: oxidize-rb/actions/cross-gem@v1 36 | id: cross-gem 37 | with: 38 | platform: ${{ matrix.platform }} 39 | ruby-versions: ${{ join(fromJSON(needs.ci-data.outputs.result).stable-ruby-versions, ',') }} 40 | 41 | - uses: actions/upload-artifact@v5 42 | with: 
43 | name: cross-gem-${{ matrix.platform }} 44 | path: ${{ steps.cross-gem.outputs.gem-path }} 45 | -------------------------------------------------------------------------------- /tiktoken_ruby.gemspec: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative "lib/tiktoken_ruby/version" 4 | 5 | Gem::Specification.new do |spec| 6 | spec.name = "tiktoken_ruby" 7 | spec.version = Tiktoken::VERSION 8 | spec.authors = ["IAPark"] 9 | spec.email = ["isaac.a.park@gmail.com"] 10 | spec.summary = "Ruby wrapper for Tiktoken" 11 | spec.description = "An unofficial Ruby wrapper for Tiktoken, " \ 12 | "a BPE tokenizer written by and used by OpenAI. It can be used to " \ 13 | "count the number of tokens in text before sending it to OpenAI APIs." 14 | spec.homepage = "https://github.com/IAPark/tiktoken_ruby" 15 | spec.license = "MIT" 16 | spec.required_ruby_version = ">= 3.1.0" 17 | spec.required_rubygems_version = ">= 3.4.0" 18 | spec.platform = Gem::Platform::RUBY 19 | 20 | spec.metadata["homepage_uri"] = spec.homepage 21 | spec.metadata["source_code_uri"] = "https://github.com/IAPark/tiktoken_ruby" 22 | spec.metadata["documentation_uri"] = "https://rubydoc.info/github/IAPark/tiktoken_ruby/main" 23 | spec.files = Dir.chdir(__dir__) do 24 | `git ls-files -z`.split("\x0").reject do |f| 25 | (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|circleci)|appveyor)}) 26 | end 27 | end 28 | spec.bindir = "exe" 29 | spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) } 30 | spec.require_paths = ["lib"] 31 | spec.extensions = ["ext/tiktoken_ruby/extconf.rb"] 32 | spec.add_dependency "rb_sys", "~> 0.9" 33 | end 34 | -------------------------------------------------------------------------------- /ext/tiktoken_ruby/src/core_bpe_wrapper.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | 3 | use 
tiktoken_rs::Rank; 4 | 5 | use crate::uncicode_error; 6 | 7 | #[magnus::wrap(class = "Tiktoken::Ext::CoreBPE")] 8 | pub struct CoreBPEWrapper { 9 | core_bpe: tiktoken_rs::CoreBPE, 10 | } 11 | 12 | impl CoreBPEWrapper { 13 | pub fn new(core_bpe: tiktoken_rs::CoreBPE) -> Self { 14 | Self { core_bpe } 15 | } 16 | 17 | pub fn encode_ordinary(&self, text: String) -> Vec<Rank> { 18 | self.core_bpe.encode_ordinary(text.as_str()) 19 | } 20 | 21 | pub fn encode( 22 | &self, 23 | text: String, 24 | allowed_special: magnus::RArray, 25 | ) -> Result<Vec<Rank>, magnus::Error> { 26 | let allowed_special: Vec<String> = allowed_special.to_vec()?; 27 | let allowed_special: Vec<&str> = allowed_special.iter().map(|s| s.as_str()).collect(); 28 | let allowed_special: HashSet<&str> = HashSet::from_iter(allowed_special.iter().cloned()); 29 | 30 | Ok(self.core_bpe.encode(text.as_str(), &allowed_special).0) 31 | } 32 | 33 | pub fn encode_with_special_tokens(&self, text: String) -> Vec<Rank> { 34 | self.core_bpe.encode_with_special_tokens(text.as_str()) 35 | } 36 | 37 | pub fn decode(&self, ids: Vec<Rank>) -> Result<String, magnus::Error> { 38 | self.core_bpe.decode(ids).map_err(|e| { 39 | let error = match uncicode_error() { 40 | Ok(error) => error, 41 | Err(e) => return e, 42 | }; 43 | 44 | magnus::Error::new(error, e.to_string()) 45 | }) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /spec/tiktoken_ruby_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | RSpec.describe Tiktoken do 4 | it "has a version number" do 5 | expect(Tiktoken::VERSION).not_to be nil 6 | end 7 | 8 | it "can load an encoding" do 9 | expect(Tiktoken.get_encoding("r50k_base")).to be_a(Tiktoken::Encoding) 10 | end 11 | 12 | it "can get an encoding for a model" do 13 | expect(Tiktoken.encoding_for_model("gpt-3.5-turbo")).to be_a(Tiktoken::Encoding) 14 | end 15 | 16 | it "can get an encoding for a fine-tuned model" do 17 |
expect(Tiktoken.encoding_for_model("ft:gpt-3.5-turbo:org:suffix:abc123")).to be_a(Tiktoken::Encoding) 18 | end 19 | 20 | it "can get an encoding for a reasoning model" do 21 | expect(Tiktoken.encoding_for_model("o3")).to be_a(Tiktoken::Encoding) 22 | end 23 | 24 | it "fails gracefully when getting an encoding for an unknown model" do 25 | expect(Tiktoken.encoding_for_model("bad-model-name")).to be_nil 26 | end 27 | 28 | it "lists available encodings" do 29 | expect(Tiktoken.list_encoding_names).to be_a(Array) 30 | end 31 | 32 | Tiktoken.list_encoding_names.each do |encoding_name| 33 | describe "Encoding #{encoding_name}" do 34 | let(:encoding) { Tiktoken.get_encoding(encoding_name) } 35 | describe Tiktoken::Encoding do 36 | it "Can get the encoding" do 37 | expect(encoding).to be_a(Tiktoken::Encoding) 38 | end 39 | 40 | it "Tokenizes a string" do 41 | expect(encoding.encode("Hello world!").size).to be(3) 42 | end 43 | 44 | it "round trips a string" do 45 | tokens = encoding.encode("Hello world!") 46 | expect(encoding.decode(tokens)).to eq("Hello world!") 47 | end 48 | 49 | it "Encode ordinary tokenizes a string" do 50 | expect(encoding.encode_ordinary("Hello world!").size).to be(3) 51 | end 52 | end 53 | end 54 | end 55 | end 56 | -------------------------------------------------------------------------------- /lib/tiktoken_ruby/encoding.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | class Tiktoken::Encoding 4 | CACHE_MUTEX = Mutex.new 5 | 6 | attr_reader :name 7 | 8 | # This returns a new Tiktoken::Encoding instance for the requested encoding 9 | # @param encoding [Symbol] The name of the encoding to load 10 | # @return [Tiktoken::Encoding] The encoding instance 11 | def self.for_name(encoding) 12 | Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym) 13 | end 14 | 15 | # This returns a Tiktoken::Encoding instance for the requested encoding 16 | # It will 
reuse an existing encoding if it's already been loaded 17 | # @param encoding [Symbol] The name of the encoding to load 18 | # @return [Tiktoken::Encoding] The encoding instance 19 | def self.for_name_cached(encoding) 20 | CACHE_MUTEX.synchronize do 21 | @encodings ||= {} 22 | @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding) 23 | end 24 | end 25 | 26 | # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens 27 | # basically it's unescaped 28 | # @param text [String] The text to encode 29 | # @return [Array<Integer>] The encoded tokens 30 | def encode_ordinary(text) 31 | @ext_base_bpe.encode_ordinary(text) 32 | end 33 | 34 | # Encodes the text as a list of integer tokens. This encoding will treat special non text tokens 35 | # as text unless they're in the allowed_special array. It's basically like the text was escaped 36 | # @param text [String] The text to encode 37 | # @param allowed_special [Array<String>] An array of special tokens to allow 38 | # @return [Array<Integer>] The encoded tokens 39 | def encode(text, allowed_special: []) 40 | @ext_base_bpe.encode(text, allowed_special) 41 | end 42 | 43 | # Decodes the tokens back into text 44 | # @param tokens [Array<Integer>] The tokens to decode 45 | # @return [String] The decoded text 46 | def decode(tokens) 47 | @ext_base_bpe.decode(tokens) 48 | end 49 | 50 | private 51 | 52 | def initialize(ext_base_bpe, name) 53 | @ext_base_bpe = ext_base_bpe 54 | @name = name 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Gem Version](https://badge.fury.io/rb/tiktoken_ruby.svg)](https://badge.fury.io/rb/tiktoken_ruby) 2 | 3 | # tiktoken_ruby 4 | 5 | [Tiktoken](https://github.com/openai/tiktoken) is a BPE tokenizer from OpenAI used with their GPT models.
6 | This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used. 7 | 8 | ## Installation 9 | 10 | Install the gem and add to the application's Gemfile by executing: 11 | 12 | $ bundle add tiktoken_ruby 13 | 14 | If bundler is not being used to manage dependencies, install the gem by executing: 15 | 16 | $ gem install tiktoken_ruby 17 | 18 | ## Usage 19 | 20 | Usage should be very similar to the Python library. Here's a simple example: 21 | 22 | Encode and decode text 23 | 24 | ```ruby 25 | require 'tiktoken_ruby' 26 | enc = Tiktoken.get_encoding("cl100k_base") 27 | enc.decode(enc.encode("hello world")) #=> "hello world" 28 | ``` 29 | 30 | Encoders can also be retrieved by model name 31 | 32 | ```ruby 33 | require 'tiktoken_ruby' 34 | 35 | enc = Tiktoken.encoding_for_model("gpt-4") 36 | enc.encode("hello world").length #=> 2 37 | ``` 38 | 39 | ## Development 40 | 41 | After checking out the repo, run `bin/setup` to install dependencies. Then, run `bundle exec rake compile` to build the native extension and `bundle exec rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment. 42 | 43 | To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org). 44 | 45 | ## Contributing 46 | 47 | Bug reports and pull requests are welcome on GitHub at https://github.com/IAPark/tiktoken_ruby. 48 | 49 | To get started with development: 50 | 51 | ```sh 52 | git clone https://github.com/IAPark/tiktoken_ruby.git 53 | cd tiktoken_ruby 54 | bundle install 55 | bundle exec rake compile 56 | bundle exec rake spec 57 | ``` 58 | 59 | ## License 60 | 61 | The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
62 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 3 | specs: 4 | tiktoken_ruby (0.0.13) 5 | rb_sys (~> 0.9) 6 | 7 | GEM 8 | remote: https://rubygems.org/ 9 | specs: 10 | ast (2.4.3) 11 | diff-lcs (1.6.2) 12 | json (2.16.0) 13 | language_server-protocol (3.17.0.5) 14 | lint_roller (1.1.0) 15 | minitest (5.25.5) 16 | parallel (1.27.0) 17 | parser (3.3.10.0) 18 | ast (~> 2.4.1) 19 | racc 20 | prism (1.6.0) 21 | racc (1.8.1) 22 | rainbow (3.1.1) 23 | rake (13.3.1) 24 | rake-compiler (1.3.0) 25 | rake 26 | rake-compiler-dock (1.9.1) 27 | rb_sys (0.9.117) 28 | rake-compiler-dock (= 1.9.1) 29 | regexp_parser (2.11.3) 30 | rspec (3.13.2) 31 | rspec-core (~> 3.13.0) 32 | rspec-expectations (~> 3.13.0) 33 | rspec-mocks (~> 3.13.0) 34 | rspec-core (3.13.6) 35 | rspec-support (~> 3.13.0) 36 | rspec-expectations (3.13.5) 37 | diff-lcs (>= 1.2.0, < 2.0) 38 | rspec-support (~> 3.13.0) 39 | rspec-mocks (3.13.7) 40 | diff-lcs (>= 1.2.0, < 2.0) 41 | rspec-support (~> 3.13.0) 42 | rspec-support (3.13.6) 43 | rubocop (1.81.7) 44 | json (~> 2.3) 45 | language_server-protocol (~> 3.17.0.2) 46 | lint_roller (~> 1.1.0) 47 | parallel (~> 1.10) 48 | parser (>= 3.3.0.2) 49 | rainbow (>= 2.2.2, < 4.0) 50 | regexp_parser (>= 2.9.3, < 3.0) 51 | rubocop-ast (>= 1.47.1, < 2.0) 52 | ruby-progressbar (~> 1.7) 53 | unicode-display_width (>= 2.4.0, < 4.0) 54 | rubocop-ast (1.48.0) 55 | parser (>= 3.3.7.2) 56 | prism (~> 1.4) 57 | rubocop-performance (1.25.0) 58 | lint_roller (~> 1.1) 59 | rubocop (>= 1.75.0, < 2.0) 60 | rubocop-ast (>= 1.38.0, < 2.0) 61 | ruby-progressbar (1.13.0) 62 | standard (1.52.0) 63 | language_server-protocol (~> 3.17.0.2) 64 | lint_roller (~> 1.0) 65 | rubocop (~> 1.81.7) 66 | standard-custom (~> 1.0.0) 67 | standard-performance (~> 1.8) 68 | standard-custom (1.0.2) 69 | lint_roller (~> 1.0) 70 | rubocop (~> 1.50) 71 | 
standard-performance (1.8.0) 72 | lint_roller (~> 1.1) 73 | rubocop-performance (~> 1.25.0) 74 | unicode-display_width (3.2.0) 75 | unicode-emoji (~> 4.1) 76 | unicode-emoji (4.1.0) 77 | yard (0.9.37) 78 | yard-doctest (0.1.17) 79 | minitest 80 | yard 81 | 82 | PLATFORMS 83 | arm64-darwin-22 84 | ruby 85 | x86_64-darwin-22 86 | x86_64-linux 87 | 88 | DEPENDENCIES 89 | racc 90 | rake 91 | rake-compiler 92 | rspec 93 | standard 94 | tiktoken_ruby! 95 | yard-doctest 96 | 97 | BUNDLED WITH 98 | 2.6.9 99 | -------------------------------------------------------------------------------- /ext/tiktoken_ruby/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod core_bpe_wrapper; 2 | 3 | use core_bpe_wrapper::CoreBPEWrapper; 4 | use magnus::{function, method, prelude::*, Error, ExceptionClass, RModule, Ruby}; 5 | 6 | fn r50k_base() -> CoreBPEWrapper { 7 | let core_bpe = tiktoken_rs::r50k_base().unwrap(); 8 | CoreBPEWrapper::new(core_bpe) 9 | } 10 | fn p50k_base() -> CoreBPEWrapper { 11 | let core_bpe = tiktoken_rs::p50k_base().unwrap(); 12 | CoreBPEWrapper::new(core_bpe) 13 | } 14 | fn p50k_edit() -> CoreBPEWrapper { 15 | let core_bpe = tiktoken_rs::p50k_edit().unwrap(); 16 | CoreBPEWrapper::new(core_bpe) 17 | } 18 | fn cl100k_base() -> CoreBPEWrapper { 19 | let core_bpe = tiktoken_rs::cl100k_base().unwrap(); 20 | CoreBPEWrapper::new(core_bpe) 21 | } 22 | 23 | fn o200k_base() -> CoreBPEWrapper { 24 | let core_bpe = tiktoken_rs::o200k_base().unwrap(); 25 | CoreBPEWrapper::new(core_bpe) 26 | } 27 | 28 | fn o200k_harmony() -> CoreBPEWrapper { 29 | let core_bpe = tiktoken_rs::o200k_harmony().unwrap(); 30 | CoreBPEWrapper::new(core_bpe) 31 | } 32 | 33 | fn module() -> Result<RModule, Error> { 34 | Ruby::get().unwrap().define_module("Tiktoken") 35 | } 36 | 37 | fn uncicode_error() -> Result<ExceptionClass, Error> { 38 | module()?.define_error( 39 | "UnicodeError", 40 | Ruby::get().unwrap().exception_standard_error(), 41 | ) 42 | } 43 | 44 | #[magnus::init] 45 | fn init()
-> Result<(), Error> { 46 | let module = module()?; 47 | 48 | let factory_module = module.define_module("BpeFactory")?; 49 | factory_module.define_singleton_method("r50k_base", function!(r50k_base, 0))?; 50 | factory_module.define_singleton_method("p50k_base", function!(p50k_base, 0))?; 51 | factory_module.define_singleton_method("p50k_edit", function!(p50k_edit, 0))?; 52 | factory_module.define_singleton_method("cl100k_base", function!(cl100k_base, 0))?; 53 | factory_module.define_singleton_method("o200k_base", function!(o200k_base, 0))?; 54 | factory_module.define_singleton_method("o200k_harmony", function!(o200k_harmony, 0))?; 55 | 56 | let ext_module = module.define_module("Ext")?; 57 | let bpe_class = ext_module.define_class("CoreBPE", Ruby::get().unwrap().class_object())?; 58 | 59 | bpe_class.define_method( 60 | "encode_ordinary", 61 | method!(CoreBPEWrapper::encode_ordinary, 1), 62 | )?; 63 | bpe_class.define_method("encode", method!(CoreBPEWrapper::encode, 2))?; 64 | bpe_class.define_method( 65 | "encode_with_special_tokens", 66 | method!(CoreBPEWrapper::encode_with_special_tokens, 1), 67 | )?; 68 | 69 | bpe_class.define_method("decode", method!(CoreBPEWrapper::decode, 1))?; 70 | Ok(()) 71 | } 72 | -------------------------------------------------------------------------------- /lib/tiktoken_ruby.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative "tiktoken_ruby/version" 4 | require_relative "tiktoken_ruby/encoding" 5 | 6 | begin 7 | RUBY_VERSION =~ /(\d+\.\d+)/ 8 | require_relative "tiktoken_ruby/#{$1}/tiktoken_ruby" 9 | rescue LoadError 10 | require_relative "tiktoken_ruby/tiktoken_ruby" 11 | end 12 | 13 | module Tiktoken 14 | class << self 15 | # Returns an encoding by name. 
# If the encoding is not already loaded it will be loaded, but otherwise
# it will reuse the instance of that type that was previously loaded
# @param name [Symbol, String] The name of the encoding to load
# @return [Tiktoken::Encoding, nil] The encoding instance, or nil when the name
#   is not listed in SUPPORTED_ENCODINGS
# @example Encode and decode text
#   enc = Tiktoken.get_encoding("cl100k_base")
#   enc.decode(enc.encode("hello world")) #=> "hello world"
def get_encoding(name)
  name = name.to_sym
  # Only whitelisted names are forwarded to the encoding loader.
  return nil unless SUPPORTED_ENCODINGS.include?(name)

  Tiktoken::Encoding.for_name_cached(name)
end

# Gets the encoding for an OpenAI model
# @param model_name [Symbol, String] The name of the model to get the encoding for
# @return [Tiktoken::Encoding, nil] The encoding instance, or nil if no encoding is found
# @example Count tokens for text
#   enc = Tiktoken.encoding_for_model("gpt-4")
#   enc.encode("hello world").length #=> 2
def encoding_for_model(model_name)
  # Normalise once so Strings, Symbols and nil all take the same path
  # (nil becomes "" and simply finds no match instead of raising).
  model = model_name.to_s

  exact = MODEL_TO_ENCODING_NAME[model.to_sym]
  return get_encoding(exact) if exact

  # No exact entry: fall back to the first prefix (in table order) that matches.
  _prefix, encoding = MODEL_PREFIX_TO_ENCODING.find do |prefix, _encoding|
    model.start_with?(prefix.to_s)
  end

  get_encoding(encoding) if encoding
end

# Lists all the encodings that are supported
# @return [Array<Symbol>] The list of supported encodings (a copy; mutating it
#   does not affect which encodings can be loaded)
def list_encoding_names
  SUPPORTED_ENCODINGS.dup
end

# Lists all the models that are supported
# @return [Array<Symbol>] The list of supported models
def list_model_names
  MODEL_TO_ENCODING_NAME.keys
end

# NOTE: `private` changes the visibility of methods defined after it; it has no
# effect on the constants below.
private

# Whitelist consulted by get_encoding; frozen so it cannot be altered at runtime.
SUPPORTED_ENCODINGS = [
  :r50k_base,
  :p50k_base,
  :p50k_edit,
  :cl100k_base,
  :o200k_base,
  :o200k_harmony
].freeze

# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
# that is also MIT licensed but by OpenAI;
74 | # https://github.com/Congyuwang/tiktoken-rs/blob/main/tiktoken-rs/src/tokenizer.rs#L50 75 | # is the source of the mapping for the Rust library 76 | MODEL_TO_ENCODING_NAME = { 77 | # reasoning 78 | o1: "o200k_base", 79 | o3: "o200k_base", 80 | "o4-mini": "o200k_base", 81 | # chat 82 | "gpt-4.1": "o200k_base", 83 | "chatgpt-4o": "o200k_base", 84 | "gpt-4o": "o200k_base", 85 | "gpt-4": "cl100k_base", 86 | "gpt-3.5-turbo": "cl100k_base", 87 | "gpt-3.5": "cl100k_base", # Common shorthand 88 | "gpt-35-turbo": "cl100k_base", # Azure deployment name 89 | # base 90 | "davinci-002": "cl100k_base", 91 | "babbage-002": "cl100k_base", 92 | # embeddings 93 | "text-embedding-ada-002": "cl100k_base", 94 | "text-embedding-3-small": "cl100k_base", 95 | "text-embedding-3-large": "cl100k_base", 96 | # DEPRECATED MODELS 97 | # text (DEPRECATED) 98 | "text-davinci-003": "p50k_base", 99 | "text-davinci-002": "p50k_base", 100 | "text-davinci-001": "r50k_base", 101 | "text-curie-001": "r50k_base", 102 | "text-babbage-001": "r50k_base", 103 | "text-ada-001": "r50k_base", 104 | davinci: "r50k_base", 105 | curie: "r50k_base", 106 | babbage: "r50k_base", 107 | ada: "r50k_base", 108 | # code (DEPRECATED) 109 | "code-davinci-002": "p50k_base", 110 | "code-davinci-001": "p50k_base", 111 | "code-cushman-002": "p50k_base", 112 | "code-cushman-001": "p50k_base", 113 | "davinci-codex": "p50k_base", 114 | "cushman-codex": "p50k_base", 115 | # edit (DEPRECATED) 116 | "text-davinci-edit-001": "p50k_edit", 117 | "code-davinci-edit-001": "p50k_edit", 118 | # old embeddings (DEPRECATED) 119 | "text-similarity-davinci-001": "r50k_base", 120 | "text-similarity-curie-001": "r50k_base", 121 | "text-similarity-babbage-001": "r50k_base", 122 | "text-similarity-ada-001": "r50k_base", 123 | "text-search-davinci-doc-001": "r50k_base", 124 | "text-search-curie-doc-001": "r50k_base", 125 | "text-search-babbage-doc-001": "r50k_base", 126 | "text-search-ada-doc-001": "r50k_base", 127 | 
"code-search-babbage-code-001": "r50k_base", 128 | "code-search-ada-code-001": "r50k_base", 129 | # open source 130 | gpt2: "gpt2" 131 | } 132 | 133 | MODEL_PREFIX_TO_ENCODING = { 134 | # reasoning 135 | "o1-": "o200k_base", 136 | "o3-": "o200k_base", 137 | "o4-": "o200k_base", 138 | # chat 139 | "gpt-5-": "o200k_base", 140 | "gpt-4.5-": "o200k_base", 141 | "gpt-4.1-": "o200k_base", 142 | "chatgpt-4o-": "o200k_base", 143 | "gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13, etc. 144 | "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k 145 | "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc. 146 | "gpt-35-turbo-": "cl100k_base", # Azure deployment name 147 | "gpt-oss-": "o200k_harmony", 148 | # fine-tuned 149 | "ft:gpt-4o": "cl100k_base", 150 | "ft:gpt-4": "cl100k_base", 151 | "ft:gpt-3.5-turbo": "cl100k_base", 152 | "ft:davinci-002": "cl100k_base", 153 | "ft:babbage-002": "cl100k_base" 154 | } 155 | end 156 | end 157 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 4 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.1.4" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "anyhow" 16 | version = "1.0.100" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" 19 | 20 | [[package]] 21 | name = "base64" 22 | version = "0.22.1" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" 25 | 26 | [[package]] 27 | name = "bindgen" 28 | version = "0.69.5" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" 31 | dependencies = [ 32 | "bitflags", 33 | "cexpr", 34 | "clang-sys", 35 | "itertools", 36 | "lazy_static", 37 | "lazycell", 38 | "proc-macro2", 39 | "quote", 40 | "regex", 41 | "rustc-hash", 42 | "shlex", 43 | "syn", 44 | ] 45 | 46 | [[package]] 47 | name = "bit-set" 48 | version = "0.5.3" 49 | source = "registry+https://github.com/rust-lang/crates.io-index" 50 | checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" 51 | dependencies = [ 52 | "bit-vec", 53 | ] 54 | 55 | [[package]] 56 | name = "bit-vec" 57 | version = "0.6.3" 58 | source = "registry+https://github.com/rust-lang/crates.io-index" 59 | checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" 60 | 61 | [[package]] 62 | name = "bitflags" 63 | version = "2.10.0" 64 | source = "registry+https://github.com/rust-lang/crates.io-index" 65 | checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" 66 | 67 | [[package]] 68 | name = "bstr" 69 | version = "1.12.1" 70 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 71 | checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" 72 | dependencies = [ 73 | "memchr", 74 | "regex-automata", 75 | "serde", 76 | ] 77 | 78 | [[package]] 79 | name = "cexpr" 80 | version = "0.6.0" 81 | source = "registry+https://github.com/rust-lang/crates.io-index" 82 | checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" 83 | dependencies = [ 84 | "nom", 85 | ] 86 | 87 | [[package]] 88 | name = "cfg-if" 89 | version = "1.0.4" 90 | source = "registry+https://github.com/rust-lang/crates.io-index" 91 | checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" 92 | 93 | [[package]] 94 | name = "clang-sys" 95 | version = "1.8.1" 96 | source = "registry+https://github.com/rust-lang/crates.io-index" 97 | checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" 98 | dependencies = [ 99 | "glob", 100 | "libc", 101 | "libloading", 102 | ] 103 | 104 | [[package]] 105 | name = "either" 106 | version = "1.15.0" 107 | source = "registry+https://github.com/rust-lang/crates.io-index" 108 | checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" 109 | 110 | [[package]] 111 | name = "fancy-regex" 112 | version = "0.13.0" 113 | source = "registry+https://github.com/rust-lang/crates.io-index" 114 | checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" 115 | dependencies = [ 116 | "bit-set", 117 | "regex-automata", 118 | "regex-syntax", 119 | ] 120 | 121 | [[package]] 122 | name = "glob" 123 | version = "0.3.3" 124 | source = "registry+https://github.com/rust-lang/crates.io-index" 125 | checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" 126 | 127 | [[package]] 128 | name = "itertools" 129 | version = "0.12.1" 130 | source = "registry+https://github.com/rust-lang/crates.io-index" 131 | checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" 
132 | dependencies = [ 133 | "either", 134 | ] 135 | 136 | [[package]] 137 | name = "lazy_static" 138 | version = "1.5.0" 139 | source = "registry+https://github.com/rust-lang/crates.io-index" 140 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 141 | 142 | [[package]] 143 | name = "lazycell" 144 | version = "1.3.0" 145 | source = "registry+https://github.com/rust-lang/crates.io-index" 146 | checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" 147 | 148 | [[package]] 149 | name = "libc" 150 | version = "0.2.177" 151 | source = "registry+https://github.com/rust-lang/crates.io-index" 152 | checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" 153 | 154 | [[package]] 155 | name = "libloading" 156 | version = "0.8.9" 157 | source = "registry+https://github.com/rust-lang/crates.io-index" 158 | checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" 159 | dependencies = [ 160 | "cfg-if", 161 | "windows-link", 162 | ] 163 | 164 | [[package]] 165 | name = "magnus" 166 | version = "0.8.2" 167 | source = "registry+https://github.com/rust-lang/crates.io-index" 168 | checksum = "3b36a5b126bbe97eb0d02d07acfeb327036c6319fd816139a49824a83b7f9012" 169 | dependencies = [ 170 | "magnus-macros", 171 | "rb-sys", 172 | "rb-sys-env", 173 | "seq-macro", 174 | ] 175 | 176 | [[package]] 177 | name = "magnus-macros" 178 | version = "0.8.0" 179 | source = "registry+https://github.com/rust-lang/crates.io-index" 180 | checksum = "47607461fd8e1513cb4f2076c197d8092d921a1ea75bd08af97398f593751892" 181 | dependencies = [ 182 | "proc-macro2", 183 | "quote", 184 | "syn", 185 | ] 186 | 187 | [[package]] 188 | name = "memchr" 189 | version = "2.7.6" 190 | source = "registry+https://github.com/rust-lang/crates.io-index" 191 | checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" 192 | 193 | [[package]] 194 | name = "minimal-lexical" 195 | version = "0.2.1" 196 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 197 | checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" 198 | 199 | [[package]] 200 | name = "nom" 201 | version = "7.1.3" 202 | source = "registry+https://github.com/rust-lang/crates.io-index" 203 | checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" 204 | dependencies = [ 205 | "memchr", 206 | "minimal-lexical", 207 | ] 208 | 209 | [[package]] 210 | name = "proc-macro2" 211 | version = "1.0.103" 212 | source = "registry+https://github.com/rust-lang/crates.io-index" 213 | checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" 214 | dependencies = [ 215 | "unicode-ident", 216 | ] 217 | 218 | [[package]] 219 | name = "quote" 220 | version = "1.0.42" 221 | source = "registry+https://github.com/rust-lang/crates.io-index" 222 | checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" 223 | dependencies = [ 224 | "proc-macro2", 225 | ] 226 | 227 | [[package]] 228 | name = "rb-sys" 229 | version = "0.9.117" 230 | source = "registry+https://github.com/rust-lang/crates.io-index" 231 | checksum = "f900d1ce4629a2ebffaf5de74bd8f9c1188d4c5ed406df02f97e22f77a006f44" 232 | dependencies = [ 233 | "rb-sys-build", 234 | ] 235 | 236 | [[package]] 237 | name = "rb-sys-build" 238 | version = "0.9.117" 239 | source = "registry+https://github.com/rust-lang/crates.io-index" 240 | checksum = "ef1e9c857028f631056bcd6d88cec390c751e343ce2223ddb26d23eb4a151d59" 241 | dependencies = [ 242 | "bindgen", 243 | "lazy_static", 244 | "proc-macro2", 245 | "quote", 246 | "regex", 247 | "shell-words", 248 | "syn", 249 | ] 250 | 251 | [[package]] 252 | name = "rb-sys-env" 253 | version = "0.2.2" 254 | source = "registry+https://github.com/rust-lang/crates.io-index" 255 | checksum = "08f8d2924cf136a1315e2b4c7460a39f62ef11ee5d522df9b2750fab55b868b6" 256 | 257 | [[package]] 258 | name = "regex" 259 | version = "1.12.2" 260 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 261 | checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" 262 | dependencies = [ 263 | "aho-corasick", 264 | "memchr", 265 | "regex-automata", 266 | "regex-syntax", 267 | ] 268 | 269 | [[package]] 270 | name = "regex-automata" 271 | version = "0.4.13" 272 | source = "registry+https://github.com/rust-lang/crates.io-index" 273 | checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" 274 | dependencies = [ 275 | "aho-corasick", 276 | "memchr", 277 | "regex-syntax", 278 | ] 279 | 280 | [[package]] 281 | name = "regex-syntax" 282 | version = "0.8.8" 283 | source = "registry+https://github.com/rust-lang/crates.io-index" 284 | checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" 285 | 286 | [[package]] 287 | name = "rustc-hash" 288 | version = "1.1.0" 289 | source = "registry+https://github.com/rust-lang/crates.io-index" 290 | checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" 291 | 292 | [[package]] 293 | name = "seq-macro" 294 | version = "0.3.6" 295 | source = "registry+https://github.com/rust-lang/crates.io-index" 296 | checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" 297 | 298 | [[package]] 299 | name = "serde" 300 | version = "1.0.228" 301 | source = "registry+https://github.com/rust-lang/crates.io-index" 302 | checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" 303 | dependencies = [ 304 | "serde_core", 305 | ] 306 | 307 | [[package]] 308 | name = "serde_core" 309 | version = "1.0.228" 310 | source = "registry+https://github.com/rust-lang/crates.io-index" 311 | checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" 312 | dependencies = [ 313 | "serde_derive", 314 | ] 315 | 316 | [[package]] 317 | name = "serde_derive" 318 | version = "1.0.228" 319 | source = "registry+https://github.com/rust-lang/crates.io-index" 320 | checksum = 
"d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" 321 | dependencies = [ 322 | "proc-macro2", 323 | "quote", 324 | "syn", 325 | ] 326 | 327 | [[package]] 328 | name = "shell-words" 329 | version = "1.1.0" 330 | source = "registry+https://github.com/rust-lang/crates.io-index" 331 | checksum = "24188a676b6ae68c3b2cb3a01be17fbf7240ce009799bb56d5b1409051e78fde" 332 | 333 | [[package]] 334 | name = "shlex" 335 | version = "1.3.0" 336 | source = "registry+https://github.com/rust-lang/crates.io-index" 337 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 338 | 339 | [[package]] 340 | name = "syn" 341 | version = "2.0.110" 342 | source = "registry+https://github.com/rust-lang/crates.io-index" 343 | checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea" 344 | dependencies = [ 345 | "proc-macro2", 346 | "quote", 347 | "unicode-ident", 348 | ] 349 | 350 | [[package]] 351 | name = "tiktoken-rs" 352 | version = "0.9.1" 353 | source = "registry+https://github.com/rust-lang/crates.io-index" 354 | checksum = "3a19830747d9034cd9da43a60eaa8e552dfda7712424aebf187b7a60126bae0d" 355 | dependencies = [ 356 | "anyhow", 357 | "base64", 358 | "bstr", 359 | "fancy-regex", 360 | "lazy_static", 361 | "regex", 362 | "rustc-hash", 363 | ] 364 | 365 | [[package]] 366 | name = "tiktoken_ruby" 367 | version = "0.1.0" 368 | dependencies = [ 369 | "magnus", 370 | "rb-sys", 371 | "tiktoken-rs", 372 | ] 373 | 374 | [[package]] 375 | name = "unicode-ident" 376 | version = "1.0.22" 377 | source = "registry+https://github.com/rust-lang/crates.io-index" 378 | checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" 379 | 380 | [[package]] 381 | name = "windows-link" 382 | version = "0.2.1" 383 | source = "registry+https://github.com/rust-lang/crates.io-index" 384 | checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" 385 | 
--------------------------------------------------------------------------------