├── test ├── support │ ├── empty.csv │ ├── columns.csv │ ├── data.csv │ ├── types.csv │ ├── unnamed.csv │ ├── data.parquet │ ├── null.parquet │ └── types.parquet ├── test_helper.rb ├── active_record_test.rb ├── parquet_test.rb ├── join_test.rb ├── group_test.rb ├── csv_test.rb ├── plot_test.rb ├── types_test.rb ├── data_frame_test.rb └── vector_test.rb ├── lib ├── rover-df.rb ├── rover │ ├── version.rb │ ├── group.rb │ ├── vector.rb │ └── data_frame.rb └── rover.rb ├── .gitignore ├── Rakefile ├── Gemfile ├── rover-df.gemspec ├── .github └── workflows │ └── build.yml ├── LICENSE.txt ├── CHANGELOG.md └── README.md /test/support/empty.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/rover-df.rb: -------------------------------------------------------------------------------- 1 | require_relative "rover" 2 | -------------------------------------------------------------------------------- /test/support/columns.csv: -------------------------------------------------------------------------------- 1 | one 2 | one,two 3 | one,two,three 4 | -------------------------------------------------------------------------------- /test/support/data.csv: -------------------------------------------------------------------------------- 1 | a,b 2 | 1,one 3 | 2,two 4 | 3,"three" 5 | -------------------------------------------------------------------------------- /lib/rover/version.rb: -------------------------------------------------------------------------------- 1 | module Rover 2 | VERSION = "0.5.0" 3 | end 4 | -------------------------------------------------------------------------------- /test/support/types.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,one,1.1 3 | 2,two,2.2 4 | 3,three,3.3 5 | -------------------------------------------------------------------------------- /test/support/unnamed.csv: 
-------------------------------------------------------------------------------- 1 | unnamed2,,,"" 2 | 1,one,, 3 | 2,two,, 4 | 3,three,, 5 | -------------------------------------------------------------------------------- /test/support/data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ankane/rover/HEAD/test/support/data.parquet -------------------------------------------------------------------------------- /test/support/null.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ankane/rover/HEAD/test/support/null.parquet -------------------------------------------------------------------------------- /test/support/types.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ankane/rover/HEAD/test/support/types.parquet -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | /doc/ 6 | /pkg/ 7 | /spec/reports/ 8 | /tmp/ 9 | *.lock 10 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rake/testtask" 3 | 4 | task default: :test 5 | Rake::TestTask.new do |t| 6 | t.libs << "test" 7 | t.pattern = "test/**/*_test.rb" 8 | end 9 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gemspec 4 | 5 | gem "rake" 6 | gem "minitest", ">= 5" 7 | gem "activerecord" 8 | gem "activesupport" 9 | gem "sqlite3" 10 | gem "iruby", require: false 11 | gem "vega" 12 | gem "csv" 
13 | 14 | # do not install by default 15 | # since it tries to install arrow 16 | # with apt/homebrew install 17 | gem "red-parquet" if ENV["TEST_PARQUET"] 18 | -------------------------------------------------------------------------------- /rover-df.gemspec: -------------------------------------------------------------------------------- 1 | require_relative "lib/rover/version" 2 | 3 | Gem::Specification.new do |spec| 4 | spec.name = "rover-df" 5 | spec.version = Rover::VERSION 6 | spec.summary = "Simple, powerful data frames for Ruby" 7 | spec.homepage = "https://github.com/ankane/rover" 8 | spec.license = "MIT" 9 | 10 | spec.author = "Andrew Kane" 11 | spec.email = "andrew@ankane.org" 12 | 13 | spec.files = Dir["*.{md,txt}", "{lib}/**/*"] 14 | spec.require_path = "lib" 15 | 16 | spec.required_ruby_version = ">= 3.2" 17 | 18 | spec.add_dependency "numo-narray", ">= 0.9.1.9" 19 | end 20 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | strategy: 6 | fail-fast: false 7 | matrix: 8 | ruby: [3.4, 3.3, 3.2] 9 | runs-on: ubuntu-latest 10 | # env: 11 | # TEST_PARQUET: 1 12 | steps: 13 | - uses: actions/checkout@v4 14 | # for iruby 15 | - run: sudo apt update && sudo apt install libzmq5 16 | # - run: | 17 | # wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb 18 | # sudo apt install ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb 19 | # sudo apt update 20 | # sudo apt install libthrift-dev libarrow-dev libarrow-glib-dev libparquet-dev 21 | - uses: ruby/setup-ruby@v1 22 | with: 23 | ruby-version: ${{ matrix.ruby }} 24 | # disable caching when testing Parquet 25 | bundler-cache: true 26 | - run: bundle install 27 | - run: bundle 
exec rake test 28 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020-2025 Andrew Kane 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | require "bundler/setup" 2 | Bundler.require(:default) 3 | require "minitest/autorun" 4 | require "minitest/pride" 5 | require "active_record" 6 | require "active_support" 7 | require "active_support/core_ext/kernel/reporting" 8 | 9 | silence_warnings do 10 | require "iruby" 11 | end 12 | 13 | logger = ActiveSupport::Logger.new(ENV["VERBOSE"] ? 
class Minitest::Test
  # Asserts that +act+ is a Rover::Vector holding the same elements as +exp+.
  # When +type+ is given, the vector's type must match it as well.
  def assert_vector(exp, act, type: nil)
    assert_vector_kind(act, type)
    assert_equal exp.to_a, act.to_a
  end

  # Same as assert_vector, but elements are compared with assert_in_delta
  # instead of exact equality.
  def assert_vector_in_delta(exp, act, type: nil)
    assert_vector_kind(act, type)
    assert_elements_in_delta exp.to_a, act.to_a
  end

  # Asserts both collections have the same size and that each pair of
  # elements at the same position is within assert_in_delta's default delta.
  def assert_elements_in_delta(expected, actual)
    assert_equal expected.size, actual.size
    expected.zip(actual).each do |(wanted, got)|
      assert_in_delta wanted, got
    end
  end

  private

  # Shared preamble of the vector assertions: class check, then an optional
  # type check (skipped when +type+ is nil or false).
  def assert_vector_kind(act, type)
    assert_kind_of Rover::Vector, act
    assert_equal(type, act.type) if type
  end
end
module Rover
  # A data frame grouped by one or more columns.
  #
  # Aggregate methods (+count+, +max+, +min+, +mean+, +median+, +percentile+,
  # +sum+, +std+, +var+) return a new DataFrame with one row per distinct
  # combination of group values. The aggregate column is named after the
  # method, followed by its first argument when one is given (e.g. "max_a").
  class Group
    # df      - the data frame to group
    # columns - column names (strings or symbols) to group by
    #
    # Raises ArgumentError when no columns are given or when a column is
    # missing from the data frame.
    def initialize(df, columns)
      @df = df
      # Column names are strings, so normalize here; otherwise a symbol passed
      # to Group#group (or directly to Group.new) is reported as a missing key
      # and never matches the string keys of a row.
      @columns = Array(columns).flatten.map(&:to_s)
      check_columns
    end

    # Returns a new Group with +columns+ added to the existing group columns.
    def group(*columns)
      Group.new(@df, @columns + columns.flatten)
    end

    [:count, :max, :min, :mean, :median, :percentile, :sum, :std, :var].each do |name|
      define_method(name) do |*args|
        # e.g. "count" for count, "max_a" for max("a")
        result_column = [name, args.first].compact.join("_")

        rows = grouped_dfs.map do |group_values, df|
          group_values.merge(result_column => df.send(name, *args))
        end

        DataFrame.new(rows)
      end
    end

    # Plots the underlying data frame, using the (single) group column as the
    # +group+ option.
    #
    # Raises ArgumentError when grouped by more than one column or when a
    # +group+ option is passed explicitly.
    def plot(*args, **options)
      raise ArgumentError, "Multiple groups not supported" if @columns.size > 1
      # same message as Ruby
      raise ArgumentError, "unknown keyword: :group" if options.key?(:group)

      @df.plot(*args, **options, group: @columns.first)
    end

    private

    # Maps each distinct combination of group values (a hash of
    # column => value) to the sub data frame of the rows having those values,
    # in order of first appearance.
    #
    # TODO make more efficient
    def grouped_dfs
      # cache here so we can reuse for multiple calculations if needed
      @grouped_dfs ||= begin
        row_indexes = Hash.new { |hash, key| hash[key] = [] }
        i = 0
        @df.each_row do |row|
          row_indexes[row.slice(*@columns)] << i
          i += 1
        end

        # transform_values returns a plain hash (no default proc)
        row_indexes.transform_values { |indexes| @df[indexes] }
      end
    end

    def check_columns
      raise ArgumentError, "No columns given" if @columns.empty?

      missing_keys = @columns - @df.keys
      raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
    end
  end
end
:object}) 44 | assert_vector [1, nil, 3], df["a"] 45 | 46 | df = Rover.read_parquet("test/support/null.parquet", types: {"a" => :float}) 47 | assert df["a"][1].nan? 48 | end 49 | 50 | def test_to_parquet 51 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 52 | assert_equal df, Rover.parse_parquet(df.to_parquet) 53 | end 54 | 55 | def test_to_parquet_types 56 | df = Rover.read_parquet("test/support/types.parquet") 57 | assert df.to_parquet 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /test/join_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class JoinTest < Minitest::Test 4 | def test_inner_join 5 | df = Rover::DataFrame.new({ 6 | a: [1, 2, 3], 7 | b: ["one", "two", "three"] 8 | }) 9 | 10 | other_df = Rover::DataFrame.new({ 11 | a: [1, 1, 2], 12 | c: ["c1", "c2", "c3"] 13 | }) 14 | 15 | expected = Rover::DataFrame.new({ 16 | a: [1, 1, 2], 17 | b: ["one", "one", "two"], 18 | c: ["c1", "c2", "c3"] 19 | }) 20 | 21 | assert_equal expected, df.inner_join(other_df) 22 | end 23 | 24 | def test_inner_join_on_hash 25 | df = Rover::DataFrame.new({ 26 | a: [1, 2, 3], 27 | b: ["one", "two", "three"] 28 | }) 29 | 30 | other_df = Rover::DataFrame.new({ 31 | a2: [1, 1, 2], 32 | c: ["c1", "c2", "c3"] 33 | }) 34 | 35 | expected = Rover::DataFrame.new({ 36 | a: [1, 1, 2], 37 | b: ["one", "one", "two"], 38 | a2: [1, 1, 2], 39 | c: ["c1", "c2", "c3"] 40 | }) 41 | 42 | assert_equal expected, df.inner_join(other_df, on: {a: :a2}) 43 | end 44 | 45 | def test_inner_join_empty 46 | df = Rover::DataFrame.new({ 47 | a: [1, 2, 3], 48 | b: ["one", "two", "three"] 49 | }) 50 | 51 | other_df = Rover::DataFrame.new({ 52 | a: [4], 53 | c: ["c1"] 54 | }) 55 | 56 | result = df.inner_join(other_df) 57 | assert_equal 0, result.size 58 | assert_equal ["a", "b", "c"], result.keys 59 | end 60 | 61 | def test_inner_join_nil 62 | df = 
Rover::DataFrame.new({ 63 | a: [1, 2, 3], 64 | b: [nil, nil, nil] 65 | }, types: {b: :object}) 66 | 67 | other_df = Rover::DataFrame.new({ 68 | a: [1, 1, 2] 69 | }) 70 | 71 | expected = Rover::DataFrame.new({ 72 | a: [1, 1, 2], 73 | b: [nil, nil, nil] 74 | }, types: {b: :object}) 75 | 76 | assert_equal expected, df.inner_join(other_df) 77 | end 78 | 79 | def test_inner_join_on_bad 80 | df = Rover::DataFrame.new({ 81 | a: [1, 2, 3], 82 | b: ["one", "two", "three"] 83 | }) 84 | 85 | error = assert_raises(ArgumentError) do 86 | df.inner_join(df, on: :bad) 87 | end 88 | assert_equal "Missing keys: bad", error.message 89 | end 90 | 91 | def test_left_join 92 | df = Rover::DataFrame.new({ 93 | a: [1, 2, 3], 94 | b: ["one", "two", "three"] 95 | }) 96 | 97 | other_df = Rover::DataFrame.new({ 98 | a: [1, 1, 2], 99 | c: ["c1", "c2", "c3"] 100 | }) 101 | 102 | expected = Rover::DataFrame.new({ 103 | a: [1, 1, 2, 3], 104 | b: ["one", "one", "two", "three"], 105 | c: ["c1", "c2", "c3", nil] 106 | }) 107 | 108 | assert_equal expected, df.left_join(other_df) 109 | end 110 | end 111 | -------------------------------------------------------------------------------- /test/group_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class GroupTest < Minitest::Test 4 | def test_group 5 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "one", "two"]}) 6 | expected = Rover::DataFrame.new({"b" => ["one", "two"], "count" => [2, 1]}) 7 | assert_equal expected, df.group("b").count 8 | end 9 | 10 | def test_symbol 11 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "one", "two"]}) 12 | expected = Rover::DataFrame.new({"b" => ["one", "two"], "count" => [2, 1]}) 13 | assert_equal expected, df.group(:b).count 14 | end 15 | 16 | def test_nil 17 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "one", nil]}) 18 | expected = Rover::DataFrame.new({"b" => ["one", nil], "count" => [2, 
1]}) 19 | assert_equal expected, df.group("b").count 20 | end 21 | 22 | def test_multiple 23 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "one", "two"]}) 24 | expected = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "one", "two"], "count" => [1, 1, 1]}) 25 | assert_equal expected, df.group(["a", "b"]).count 26 | end 27 | 28 | def test_multiple_args 29 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "one", "two"]}) 30 | expected = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "one", "two"], "count" => [1, 1, 1]}) 31 | assert_equal expected, df.group("a", "b").count 32 | end 33 | 34 | def test_multiple_calls 35 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "one", "two"]}) 36 | expected = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "one", "two"], "count" => [1, 1, 1]}) 37 | assert_equal expected, df.group("a").group("b").count 38 | end 39 | 40 | def test_empty 41 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "one", "two"]}) 42 | error = assert_raises(ArgumentError) do 43 | df.group([]) 44 | end 45 | assert_equal "No columns given", error.message 46 | end 47 | 48 | def test_missing_keys 49 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "one", "two"]}) 50 | error = assert_raises(ArgumentError) do 51 | df.group("c") 52 | end 53 | assert_equal "Missing keys: c", error.message 54 | end 55 | 56 | def test_max 57 | df = Rover::DataFrame.new({"a" => [1, 100, 3], "b" => ["one", "one", "two"]}) 58 | expected = Rover::DataFrame.new({"b" => ["one", "two"], "max_a" => [100, 3]}) 59 | assert_equal expected, df.group("b").max("a") 60 | end 61 | 62 | def test_min 63 | df = Rover::DataFrame.new({"a" => [1, 100, 3], "b" => ["one", "one", "two"]}) 64 | expected = Rover::DataFrame.new({"b" => ["one", "two"], "min_a" => [1, 3]}) 65 | assert_equal expected, df.group("b").min("a") 66 | end 67 | 68 | # uses Bessel's correction for now since that's all Numo supports 69 | def test_std 70 
| df = Rover::DataFrame.new({"a" => [1, 2, 2, 3, 4, 6], "b" => ["one", "one", "two", "one", "two", "two"]}) 71 | expected = Rover::DataFrame.new({"b" => ["one", "two"], "std_a" => [1, 2]}) 72 | assert_equal expected, df.group("b").std("a") 73 | end 74 | 75 | # uses Bessel's correction for now since that's all Numo supports 76 | def test_var 77 | df = Rover::DataFrame.new({"a" => [1, 2, 4, 5], "b" => ["one", "two", "two", "one"]}) 78 | expected = Rover::DataFrame.new({"b" => ["one", "two"], "var_a" => [8, 2]}) 79 | assert_equal expected, df.group("b").var("a") 80 | end 81 | end 82 | -------------------------------------------------------------------------------- /test/csv_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class CsvTest < Minitest::Test 4 | def test_read_csv 5 | df = Rover.read_csv("test/support/data.csv") 6 | expected = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 7 | assert_equal expected, df 8 | end 9 | 10 | def test_parse_csv 11 | df = Rover.parse_csv("a,b\n1,one\n2,two\n3,three\n") 12 | expected = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 13 | assert_equal expected, df 14 | end 15 | 16 | def test_default_types 17 | df = Rover.read_csv("test/support/types.csv") 18 | assert_equal :int64, df.types["a"] 19 | assert_equal :object, df.types["b"] 20 | assert_equal :float64, df.types["c"] 21 | end 22 | 23 | def test_types 24 | df = Rover.read_csv("test/support/types.csv", types: {"a" => :int8}) 25 | assert_equal :int8, df.types["a"] 26 | end 27 | 28 | def test_types_symbol 29 | df = Rover.read_csv("test/support/types.csv", types: {a: :int8}) 30 | assert_equal :int8, df.types["a"] 31 | end 32 | 33 | def test_empty 34 | df = Rover.read_csv("test/support/empty.csv") 35 | assert_empty df 36 | assert_empty df.keys 37 | end 38 | 39 | def test_empty_headers 40 | df = Rover.read_csv("test/support/empty.csv", headers: []) 
41 | assert_empty df 42 | assert_empty df.keys 43 | end 44 | 45 | def test_headers 46 | df = Rover.read_csv("test/support/data.csv", headers: ["c", "d"]) 47 | assert_equal ["c", "d"], df.vector_names 48 | assert_equal 4, df.size 49 | end 50 | 51 | def test_headers_true 52 | df = Rover.read_csv("test/support/data.csv", headers: true) 53 | assert_equal ["a", "b"], df.vector_names 54 | assert_equal 3, df.size 55 | end 56 | 57 | def test_headers_false 58 | error = assert_raises(ArgumentError) do 59 | Rover.read_csv("test/support/data.csv", headers: false) 60 | end 61 | assert_equal "Must specify headers", error.message 62 | end 63 | 64 | def test_headers_too_few 65 | error = assert_raises(ArgumentError) do 66 | Rover.read_csv("test/support/data.csv", headers: ["a"]) 67 | end 68 | assert_equal "Expected 2 headers, given 1", error.message 69 | end 70 | 71 | def test_headers_too_many 72 | error = assert_raises(ArgumentError) do 73 | Rover.read_csv("test/support/data.csv", headers: ["a", "b", "c"]) 74 | end 75 | assert_equal "Expected 2 headers, given 3", error.message 76 | end 77 | 78 | # TODO decide on best approach, but this is current behavior 79 | def test_columns_too_many 80 | df = Rover.read_csv("test/support/columns.csv") 81 | expected = Rover::DataFrame.new({"one" => ["one", "one"], "unnamed" => ["two", "two"]}) 82 | assert_equal expected, df 83 | end 84 | 85 | def test_headers_unnamed 86 | df = Rover.read_csv("test/support/unnamed.csv") 87 | assert_equal ["unnamed2", "unnamed", "unnamed3", "unnamed4"], df.keys 88 | end 89 | 90 | def test_headers_unnamed_advanced 91 | df = Rover.parse_csv(",unnamed,,unnamed3") 92 | assert_equal ["unnamed2", "unnamed", "unnamed4", "unnamed3"], df.keys 93 | end 94 | 95 | def test_headers_duplicate 96 | df = Rover.parse_csv("a,a\n1,2\n") 97 | assert_equal Rover::DataFrame.new({"a" => [1]}), df 98 | end 99 | 100 | def test_headers_numeric 101 | df = Rover.parse_csv("1,2.5\n") 102 | assert_equal ["1", "2.5"], df.vector_names 103 | end 
104 | 105 | def test_header_converters 106 | df = Rover.read_csv("test/support/data.csv", header_converters: :symbol) 107 | assert_equal ["a", "b"], df.vector_names 108 | assert_equal 3, df.size 109 | end 110 | 111 | # same behavior as CSV 112 | def test_header_converters_headers 113 | df = Rover.read_csv("test/support/data.csv", headers: ["C", "D"], header_converters: :downcase) 114 | assert_equal ["c", "d"], df.vector_names 115 | assert_equal 4, df.size 116 | end 117 | 118 | def test_to_csv 119 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 120 | assert_equal "a,b\n1,one\n2,two\n3,three\n", df.to_csv 121 | end 122 | end 123 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.5.0 (2025-06-07) 2 | 3 | - Strings and symbols are no longer treated as different keys 4 | - Changed methods that return column names to always use strings 5 | - Changed `[]` method to return vector instead of Numo array with range or array argument 6 | - Changed `first` and `last` methods to return element instead of vector when no arguments 7 | - Changed `round`, `ceil`, and `floor` methods to always return same type as original vector 8 | - Dropped support for Ruby < 3.2 9 | 10 | ## 0.4.1 (2024-10-07) 11 | 12 | - Fixed connection leasing for Active Record 7.2+ 13 | 14 | ## 0.4.0 (2024-06-12) 15 | 16 | - Added `include?` method to vectors 17 | - Dropped support for Ruby < 3.1 18 | 19 | ## 0.3.4 (2022-07-12) 20 | 21 | - Added `cbrt`, `erf`, `erfc`, and `hypot` methods to vectors 22 | - Added `frexp` and `ldexp` methods to vectors 23 | - Added `base` argument to `log` method 24 | 25 | ## 0.3.3 (2022-07-11) 26 | 27 | - Added `ln`, `log`, `log10`, and `log2` methods to vectors 28 | - Added `exp` and `exp2` methods to vectors 29 | - Added `sin`, `cos`, `tan`, `asin`, `acos`, and `atan` methods to vectors 30 | - Added `sinh`, 
`cosh`, `tanh`, `asinh`, `acosh`, and `atanh` methods to vectors 31 | - Added `round`, `ceil`, and `floor` methods to vectors 32 | - Added `empty?` method to vectors 33 | - Added `rename` method to data frames 34 | 35 | ## 0.3.2 (2022-07-10) 36 | 37 | - Added `sqrt` method to vectors 38 | - Improved numeric operations between scalars and vectors 39 | - Improved performance of `tally` 40 | 41 | ## 0.3.1 (2022-05-18) 42 | 43 | - Added `to!` to vectors 44 | - Fixed error with `nil` and `:float64` type 45 | - Fixed `:header_converters` option with `read_csv` and `parse_csv` 46 | 47 | ## 0.3.0 (2022-04-03) 48 | 49 | - Added `deep_dup` method to data frames 50 | - Changed `:int` to `:int64`, `:uint` to `:uint64`, and `:float` to `:float64` for type methods 51 | - Changed missing column to raise `KeyError` instead of `ArgumentError` for aggregate methods 52 | - Changed passing too many headers to `read_csv` and `parse_csv` to raise `ArgumentError` 53 | - Changed empty string in CSV headers to match behavior of `nil` 54 | - Fixed `clone` and `dup` method for vectors 55 | - Dropped support for Ruby < 2.7 56 | 57 | ## 0.2.8 (2022-03-15) 58 | 59 | - Added `group` and `stacked` options to `plot` 60 | - Improved performance of `read_csv` and `parse_csv` 61 | 62 | ## 0.2.7 (2022-01-16) 63 | 64 | - Added support for booleans to Parquet methods 65 | - Added support for creating data frames from `ActiveRecord::Result` 66 | - Added `types` option to `read_parquet` and `parse_parquet` methods 67 | 68 | ## 0.2.6 (2021-10-27) 69 | 70 | - Added support for `nil` headers to `read_csv` and `parse_csv` 71 | - Added `read_parquet`, `parse_parquet`, and `to_parquet` methods 72 | 73 | ## 0.2.5 (2021-09-25) 74 | 75 | - Fixed column types with joins 76 | 77 | ## 0.2.4 (2021-06-03) 78 | 79 | - Added grouping for `std` and `var` 80 | - Fixed `==` for data frames 81 | - Fixed error with `first` and `last` for data frames 82 | - Fixed error with `last` when vector size is smaller than `n` 83 | 84 | 
## 0.2.3 (2021-02-08) 85 | 86 | - Added `select`, `reject`, and `map!` methods to vectors 87 | 88 | ## 0.2.2 (2021-01-01) 89 | 90 | - Added line, pie, area, and bar charts 91 | - Added `|` and `^` for vectors 92 | - Fixed typecasting with `map` 93 | 94 | ## 0.2.1 (2020-11-23) 95 | 96 | - Added `plot` method to data frames 97 | - Improved error message when too few headers 98 | 99 | ## 0.2.0 (2020-08-17) 100 | 101 | - Added `numeric?` and `zip` methods to vectors 102 | - Changed group calculations to return a data frame instead of a hash 103 | - Changed `each_row` to return enumerator 104 | - Improved inspect 105 | - Fixed `any?`, `all?`, and `uniq` for boolean vectors 106 | 107 | ## 0.1.1 (2020-06-10) 108 | 109 | - Added methods and options for types 110 | - Added grouping 111 | - Added one-hot encoding 112 | - Added `sample` to data frames 113 | - Added `tally`, `var`, `std`, `take`, `count`, and `length` to vectors 114 | - Improved error message for `read_csv` with no headers 115 | 116 | ## 0.1.0 (2020-05-13) 117 | 118 | - First release 119 | -------------------------------------------------------------------------------- /test/plot_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class PlotTest < Minitest::Test 4 | def test_defaults 5 | df = Rover::DataFrame.new({"a" => ["one", "two", "three"], "b" => [1, 2, 3]}) 6 | assert_plot_type "column", df.plot("a", "b") 7 | assert_plot_type "scatter", df.plot("b", "b") 8 | end 9 | 10 | def test_default_columns 11 | df = Rover::DataFrame.new({"a" => ["one", "two", "three"], "b" => [1, 2, 3]}) 12 | assert_plot_type "column", df.plot 13 | end 14 | 15 | def test_default_columns_not_two 16 | df = Rover::DataFrame.new({"a" => ["one", "two", "three"]}) 17 | error = assert_raises(ArgumentError) do 18 | df.plot 19 | end 20 | assert_equal "Must specify columns", error.message 21 | end 22 | 23 | def test_type 24 | df = Rover::DataFrame.new({"a" => 
["one", "two", "three"], "b" => [1, 2, 3]}) 25 | assert_plot_type "pie", df.plot("a", "b", type: "pie") 26 | assert_plot_type "line", df.plot("a", "b", type: "line") 27 | assert_plot_type "column", df.plot("a", "b", type: "column") 28 | assert_plot_type "bar", df.plot("a", "b", type: "bar") 29 | assert_plot_type "area", df.plot("a", "b", type: "area") 30 | assert_plot_type "scatter", df.plot("b", "b", type: "scatter") 31 | end 32 | 33 | def test_group_option 34 | df = Rover::DataFrame.new({"a" => ["one", "two", "three"], "b" => [1, 2, 3], "c" => ["group1", "group1", "group2"]}) 35 | assert_group df.plot("a", "b", type: "line", group: "c") 36 | assert_group df.plot("a", "b", type: "column", group: "c") 37 | assert_group df.plot("a", "b", type: "bar", group: "c") 38 | assert_group df.plot("a", "b", type: "area", group: "c") 39 | assert_group df.plot("b", "b", type: "scatter", group: "c") 40 | end 41 | 42 | def test_group_option_pie 43 | df = Rover::DataFrame.new({"a" => ["one", "two", "three"], "b" => [1, 2, 3], "c" => ["group1", "group1", "group2"]}) 44 | error = assert_raises(ArgumentError) do 45 | df.plot("a", "b", type: "pie", group: "c") 46 | end 47 | assert_equal "Cannot use group option with pie chart", error.message 48 | end 49 | 50 | def test_group_method 51 | df = Rover::DataFrame.new({"a" => ["one", "two", "three"], "b" => [1, 2, 3], "c" => ["group1", "group1", "group2"]}) 52 | assert_group df.group("c").plot("a", "b", type: "line") 53 | assert_group df.group("c").plot("a", "b", type: "column") 54 | assert_group df.group("c").plot("a", "b", type: "bar") 55 | assert_group df.group("c").plot("a", "b", type: "area") 56 | assert_group df.group("c").plot("b", "b", type: "scatter") 57 | end 58 | 59 | def test_group_method_multiple_columns 60 | df = Rover::DataFrame.new({"a" => ["one", "two", "three"], "b" => [1, 2, 3], "c" => ["group1", "group1", "group2"]}) 61 | error = assert_raises(ArgumentError) do 62 | df.group("c", "c").plot("a", "b") 63 | end 64 | 
# Asserts that +plot+ is a Vega::LiteChart whose mark type corresponds to the
# Rover plot type +expected+. Three plot types are checked against a different
# mark name (column => bar, pie => arc, scatter => circle); any other
# type is compared with the mark type unchanged.
def assert_plot_type(expected, plot)
  assert_kind_of Vega::LiteChart, plot

  mark_names = {"column" => "bar", "pie" => "arc", "scatter" => "circle"}
  assert_equal mark_names.fetch(expected, expected), plot.spec[:mark][:type]
end
Arrow::Table.load(path) 27 | end 28 | end 29 | 30 | def parse_parquet(str, **options) 31 | parquet_to_df(**options) do 32 | Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet) 33 | end 34 | end 35 | 36 | private 37 | 38 | def csv_to_df(types: nil, headers: nil, **csv_options) 39 | require "csv" 40 | 41 | raise ArgumentError, "Must specify headers" if headers == false 42 | 43 | # TODO use date converter in 0.5.0 - need to test performance 44 | table = yield({converters: :numeric}.merge(csv_options)) 45 | 46 | headers = nil if headers == true 47 | if headers && table.first && headers.size != table.first.size 48 | raise ArgumentError, "Expected #{table.first.size} headers, given #{headers.size}" 49 | end 50 | 51 | table_headers = (headers || table.shift || []).dup 52 | # keep same behavior as headers: true 53 | if table.first 54 | while table_headers.size < table.first.size 55 | table_headers << nil 56 | end 57 | end 58 | # TODO handle date converters 59 | table_headers = table_headers.map! { |v| v.nil? ? nil : v.to_s } 60 | 61 | if csv_options[:header_converters] 62 | table_headers = CSV.parse(CSV.generate_line(table_headers), headers: true, header_converters: csv_options[:header_converters]).headers 63 | end 64 | 65 | data = {} 66 | keys = table_headers.map { |k| [k, true] }.to_h 67 | unnamed_suffix = 1 68 | table_headers.each_with_index do |k, i| 69 | if k.nil? || k.empty? 
70 | k = "unnamed" 71 | while keys.include?(k) 72 | unnamed_suffix += 1 73 | k = "unnamed#{unnamed_suffix}" 74 | end 75 | keys[k] = true 76 | end 77 | table_headers[i] = k 78 | end 79 | 80 | table_headers.each_with_index do |k, i| 81 | # use first value for duplicate headers like headers: true 82 | next if data[k] 83 | 84 | values = [] 85 | table.each do |row| 86 | values << row[i] 87 | end 88 | data[k] = values 89 | end 90 | 91 | DataFrame.new(data, types: types) 92 | end 93 | 94 | PARQUET_TYPE_MAPPING = { 95 | "bool" => Numo::Bit, 96 | "float" => Numo::SFloat, 97 | "double" => Numo::DFloat, 98 | "int8" => Numo::Int8, 99 | "int16" => Numo::Int16, 100 | "int32" => Numo::Int32, 101 | "int64" => Numo::Int64, 102 | "string" => Numo::RObject, 103 | "uint8" => Numo::UInt8, 104 | "uint16" => Numo::UInt16, 105 | "uint32" => Numo::UInt32, 106 | "uint64" => Numo::UInt64 107 | } 108 | 109 | def parquet_to_df(types: nil) 110 | require "parquet" 111 | 112 | table = yield 113 | data = {} 114 | types ||= {} 115 | types = types.transform_keys(&:to_s) 116 | table.each_column do |column| 117 | k = column.field.name 118 | if types[k] 119 | data[k] = Vector.new(column.data.values, type: types[k]) 120 | else 121 | type = column.field.data_type.to_s 122 | numo_type = PARQUET_TYPE_MAPPING[type] 123 | raise "Unknown type: #{type}" unless numo_type 124 | 125 | # TODO automatic conversion? 
126 | # int => float 127 | # bool => object 128 | if (type.include?("int") || type == "bool") && column.n_nulls > 0 129 | raise "Nulls not supported for #{type} column: #{k}" 130 | end 131 | 132 | # TODO improve performance 133 | data[k] = numo_type.cast(column.data.values) 134 | end 135 | end 136 | DataFrame.new(data) 137 | end 138 | end 139 | end 140 | -------------------------------------------------------------------------------- /test/types_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class TypesTest < Minitest::Test 4 | def test_constructor_vector 5 | [:bool, :float32, :float64, :int8, :int16, :int32, :int64, :object, :uint8, :uint16, :uint32, :uint64].each do |type| 6 | assert_equal type, Rover::Vector.new(1..3, type: type).type 7 | end 8 | end 9 | 10 | def test_constructor_data_frame 11 | [:bool, :float32, :float64, :int8, :int16, :int32, :int64, :object, :uint8, :uint16, :uint32, :uint64].each do |type| 12 | df = Rover::DataFrame.new({"a" => 1..3}, types: {"a" => type}) 13 | assert_equal type, df["a"].type 14 | assert_equal ({"a" => type}), df.types 15 | end 16 | end 17 | 18 | def test_constructor_data_frame_symbol 19 | df = Rover::DataFrame.new({"a" => 1..3}, types: {a: :int8}) 20 | assert_equal :int8, df["a"].type 21 | end 22 | 23 | def test_constructor_data_frame_array_symbol 24 | df = Rover::DataFrame.new([{"a" => 1}, {"a" => 2}, {"a" => 3}], types: {a: :int8}) 25 | assert_equal :int8, df["a"].type 26 | end 27 | 28 | def test_constructor_nil 29 | [:float, :float32, :float64].each do |type| 30 | assert Rover::Vector.new([1, nil, 3], type: type)[1].nan? 
31 | end 32 | end 33 | 34 | def test_constructor_legacy 35 | assert_equal :int64, Rover::Vector.new(1..3, type: :int).type 36 | assert_equal :uint64, Rover::Vector.new(1..3, type: :uint).type 37 | assert_equal :float64, Rover::Vector.new(1..3, type: :float).type 38 | end 39 | 40 | def test_read_csv 41 | df = Rover.read_csv("test/support/data.csv", types: {"a" => :int8}) 42 | assert_equal :int8, df["a"].type 43 | end 44 | 45 | def test_complex64 46 | error = assert_raises(ArgumentError) do 47 | Rover::Vector.new(Numo::SComplex.cast([1])) 48 | end 49 | assert_equal "Complex types not supported yet", error.message 50 | end 51 | 52 | def test_complex128 53 | error = assert_raises(ArgumentError) do 54 | Rover::Vector.new(Numo::DComplex.cast([1])) 55 | end 56 | assert_equal "Complex types not supported yet", error.message 57 | end 58 | 59 | def test_int_large 60 | assert_equal :int64, Rover::Vector.new([2**63 - 1]).type 61 | assert_raises(RangeError) do 62 | Rover::Vector.new([2**63]) 63 | end 64 | end 65 | 66 | # an error seems more intuitive 67 | # but this is same behavior as Numo, NumPy, and Pandas 68 | def test_int_overflow 69 | assert_vector [-1], Rover::Vector.new([2**63 - 1]).to(:int32) 70 | end 71 | 72 | def test_constructor_int_nan 73 | error = assert_raises do 74 | Rover::Vector.new([1.5, 2.5, Float::NAN], type: :int) 75 | end 76 | assert_equal "float NaN out of range of integer", error.message 77 | end 78 | 79 | def test_to_int 80 | vector = Rover::Vector.new([1.5, 2.5, 3.5]).to(:int) 81 | assert_vector [1, 2, 3], vector 82 | assert_equal :int64, vector.type 83 | assert_kind_of Numo::Int64, vector.to(:int64).to_numo 84 | end 85 | 86 | def test_to_int_nan 87 | error = assert_raises do 88 | Rover::Vector.new([1.5, 2.5, Float::NAN]).to(:int) 89 | end 90 | assert_equal "float NaN out of range of integer", error.message 91 | end 92 | 93 | def test_to_int_infinite 94 | error = assert_raises do 95 | Rover::Vector.new([1.5, 2.5, Float::INFINITY]).to(:int) 96 | end 97 
| assert_equal "float Inf out of range of integer", error.message 98 | end 99 | 100 | def test_to_int_object_nil 101 | error = assert_raises do 102 | Rover::Vector.new(["1", "2", nil]).to(:int) 103 | end 104 | assert_equal "no implicit conversion from nil to integer", error.message 105 | end 106 | 107 | def test_to_int_object 108 | vector = Rover::Vector.new(["1", "2", "3"]).to(:int) 109 | assert_vector [1, 2, 3], vector 110 | assert_equal :int64, vector.type 111 | assert_kind_of Numo::Int64, vector.to_numo 112 | end 113 | 114 | def test_to_float 115 | vector = Rover::Vector.new(["1.0", "2.1", nil]).to(:float) 116 | assert_equal vector[0], 1.0 117 | assert_equal vector[1], 2.1 118 | assert_equal vector[2].nan?, true 119 | assert_equal :float64, vector.type 120 | assert_kind_of Numo::DFloat, vector.to_numo 121 | end 122 | 123 | def test_to_bool 124 | vector = Rover::Vector.new([1, 2, 0]).to(:bool) 125 | assert_vector [true, true, false], vector 126 | assert_equal :bool, vector.type 127 | assert_kind_of Numo::Bit, vector.to_numo 128 | end 129 | 130 | def test_to_object 131 | vector = Rover::Vector.new(1..3).to(:object) 132 | assert_vector [1, 2, 3], vector 133 | assert_equal :object, vector.type 134 | assert_kind_of Numo::RObject, vector.to_numo 135 | end 136 | 137 | def test_to! 
138 | vector = Rover::Vector.new(["1", "2", "3"]) 139 | vector.to!(:int) 140 | assert_vector [1, 2, 3], vector 141 | assert_equal :int64, vector.type 142 | assert_kind_of Numo::Int64, vector.to_numo 143 | end 144 | end 145 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rover 2 | 3 | Simple, powerful data frames for Ruby 4 | 5 | :mountain: Designed for data exploration and machine learning, and powered by [Numo](https://github.com/ruby-numo/numo-narray) 6 | 7 | :evergreen_tree: Uses [Vega](https://github.com/ankane/vega) for visualization 8 | 9 | [![Build Status](https://github.com/ankane/rover/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/rover/actions) 10 | 11 | ## Installation 12 | 13 | Add this line to your application’s Gemfile: 14 | 15 | ```ruby 16 | gem "rover-df" 17 | ``` 18 | 19 | ## Intro 20 | 21 | A data frame is an in-memory table. It’s a useful data structure for data analysis and machine learning. It uses columnar storage for fast operations on columns. 22 | 23 | Try it out for forecasting by clicking the button below (it can take a few minutes to start): 24 | 25 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ankane/ml-stack/master?filepath=Forecasting.ipynb) 26 | 27 | Use the `Run` button (or `SHIFT` + `ENTER`) to run each line. 
28 | 29 | ## Creating Data Frames 30 | 31 | From an array 32 | 33 | ```ruby 34 | Rover::DataFrame.new([ 35 | {a: 1, b: "one"}, 36 | {a: 2, b: "two"}, 37 | {a: 3, b: "three"} 38 | ]) 39 | ``` 40 | 41 | From a hash 42 | 43 | ```ruby 44 | Rover::DataFrame.new({ 45 | a: [1, 2, 3], 46 | b: ["one", "two", "three"] 47 | }) 48 | ``` 49 | 50 | From Active Record 51 | 52 | ```ruby 53 | Rover::DataFrame.new(User.all) 54 | ``` 55 | 56 | From a CSV 57 | 58 | ```ruby 59 | Rover.read_csv("file.csv") 60 | # or 61 | Rover.parse_csv("CSV,data,string") 62 | ``` 63 | 64 | From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem) 65 | 66 | ```ruby 67 | Rover.read_parquet("file.parquet") 68 | # or 69 | Rover.parse_parquet("PAR1...") 70 | ``` 71 | 72 | ## Attributes 73 | 74 | Get number of rows 75 | 76 | ```ruby 77 | df.count 78 | ``` 79 | 80 | Get column names 81 | 82 | ```ruby 83 | df.keys 84 | ``` 85 | 86 | Check if a column exists 87 | 88 | ```ruby 89 | df.include?(name) 90 | ``` 91 | 92 | ## Selecting Data 93 | 94 | Select a column 95 | 96 | ```ruby 97 | df[:a] 98 | ``` 99 | 100 | Select multiple columns 101 | 102 | ```ruby 103 | df[[:a, :b]] 104 | ``` 105 | 106 | Select first rows 107 | 108 | ```ruby 109 | df.head 110 | # or 111 | df.first(5) 112 | ``` 113 | 114 | Select last rows 115 | 116 | ```ruby 117 | df.tail 118 | # or 119 | df.last(5) 120 | ``` 121 | 122 | Select rows by index 123 | 124 | ```ruby 125 | df[1] 126 | # or 127 | df[1..3] 128 | # or 129 | df[[1, 4, 5]] 130 | ``` 131 | 132 | Iterate over rows 133 | 134 | ```ruby 135 | df.each_row { |row| ... } 136 | ``` 137 | 138 | Iterate over a column 139 | 140 | ```ruby 141 | df[:a].each { |item| ... } 142 | # or 143 | df[:a].each_with_index { |item, index| ... 
} 144 | ``` 145 | 146 | ## Filtering 147 | 148 | Filter on a condition 149 | 150 | ```ruby 151 | df[df[:a] == 100] 152 | df[df[:a] != 100] 153 | df[df[:a] > 100] 154 | df[df[:a] >= 100] 155 | df[df[:a] < 100] 156 | df[df[:a] <= 100] 157 | ``` 158 | 159 | In 160 | 161 | ```ruby 162 | df[df[:a].in?([1, 2, 3])] 163 | df[df[:a].in?(1..3)] 164 | df[df[:a].in?(["a", "b", "c"])] 165 | ``` 166 | 167 | Not in 168 | 169 | ```ruby 170 | df[!df[:a].in?([1, 2, 3])] 171 | ``` 172 | 173 | And, or, and exclusive or 174 | 175 | ```ruby 176 | df[(df[:a] > 100) & (df[:b] == "one")] # and 177 | df[(df[:a] > 100) | (df[:b] == "one")] # or 178 | df[(df[:a] > 100) ^ (df[:b] == "one")] # xor 179 | ``` 180 | 181 | ## Operations 182 | 183 | Basic operations 184 | 185 | ```ruby 186 | df[:a] + 5 187 | df[:a] - 5 188 | df[:a] * 5 189 | df[:a] / 5 190 | df[:a] % 5 191 | df[:a] ** 2 192 | df[:a].sqrt 193 | df[:a].cbrt 194 | df[:a].abs 195 | ``` 196 | 197 | Rounding 198 | 199 | ```ruby 200 | df[:a].round 201 | df[:a].ceil 202 | df[:a].floor 203 | ``` 204 | 205 | Logarithm 206 | 207 | ```ruby 208 | df[:a].ln # or log 209 | df[:a].log(5) 210 | df[:a].log10 211 | df[:a].log2 212 | ``` 213 | 214 | Exponentiation 215 | 216 | ```ruby 217 | df[:a].exp 218 | df[:a].exp2 219 | ``` 220 | 221 | Trigonometric functions 222 | 223 | ```ruby 224 | df[:a].sin 225 | df[:a].cos 226 | df[:a].tan 227 | df[:a].asin 228 | df[:a].acos 229 | df[:a].atan 230 | ``` 231 | 232 | Hyperbolic functions 233 | 234 | ```ruby 235 | df[:a].sinh 236 | df[:a].cosh 237 | df[:a].tanh 238 | df[:a].asinh 239 | df[:a].acosh 240 | df[:a].atanh 241 | ``` 242 | 243 | Error function 244 | 245 | ```ruby 246 | df[:a].erf 247 | df[:a].erfc 248 | ``` 249 | 250 | Summary statistics 251 | 252 | ```ruby 253 | df[:a].count 254 | df[:a].sum 255 | df[:a].mean 256 | df[:a].median 257 | df[:a].percentile(90) 258 | df[:a].min 259 | df[:a].max 260 | df[:a].std 261 | df[:a].var 262 | ``` 263 | 264 | Count occurrences 265 | 266 | ```ruby 267 | df[:a].tally 
268 | ``` 269 | 270 | Cross tabulation 271 | 272 | ```ruby 273 | df[:a].crosstab(df[:b]) 274 | ``` 275 | 276 | ## Grouping 277 | 278 | Group 279 | 280 | ```ruby 281 | df.group(:a).count 282 | ``` 283 | 284 | Works with all summary statistics 285 | 286 | ```ruby 287 | df.group(:a).max(:b) 288 | ``` 289 | 290 | Multiple groups 291 | 292 | ```ruby 293 | df.group(:a, :b).count 294 | ``` 295 | 296 | ## Visualization 297 | 298 | Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile: 299 | 300 | ```ruby 301 | gem "vega" 302 | ``` 303 | 304 | And use: 305 | 306 | ```ruby 307 | df.plot(:a, :b) 308 | ``` 309 | 310 | Specify the chart type (`line`, `pie`, `column`, `bar`, `area`, or `scatter`) 311 | 312 | ```ruby 313 | df.plot(:a, :b, type: "pie") 314 | ``` 315 | 316 | Group data 317 | 318 | ```ruby 319 | df.plot(:a, :b, group: :c) 320 | ``` 321 | 322 | Stacked columns or bars 323 | 324 | ```ruby 325 | df.plot(:a, :b, group: :c, stacked: true) 326 | ``` 327 | 328 | ## Updating Data 329 | 330 | Add a new column 331 | 332 | ```ruby 333 | df[:a] = 1 334 | # or 335 | df[:a] = [1, 2, 3] 336 | ``` 337 | 338 | Update a single element 339 | 340 | ```ruby 341 | df[:a][0] = 100 342 | ``` 343 | 344 | Update multiple elements 345 | 346 | ```ruby 347 | df[:a][0..2] = 1 348 | # or 349 | df[:a][0..2] = [1, 2, 3] 350 | ``` 351 | 352 | Update all elements 353 | 354 | ```ruby 355 | df[:a] = df[:a].map { |v| v.gsub("a", "b") } 356 | # or 357 | df[:a].map! 
{ |v| v.gsub("a", "b") } 358 | ``` 359 | 360 | Update elements matching a condition 361 | 362 | ```ruby 363 | df[:a][df[:a] > 100] = 0 364 | ``` 365 | 366 | Clamp 367 | 368 | ```ruby 369 | df[:a].clamp!(0, 100) 370 | ``` 371 | 372 | Delete columns 373 | 374 | ```ruby 375 | df.delete(:a) 376 | # or 377 | df.except!(:a, :b) 378 | ``` 379 | 380 | Rename columns 381 | 382 | ```ruby 383 | df.rename(a: :new_a, b: :new_b) 384 | # or 385 | df[:new_a] = df.delete(:a) 386 | ``` 387 | 388 | Sort rows 389 | 390 | ```ruby 391 | df.sort_by! { |r| r[:a] } 392 | ``` 393 | 394 | Clear all data 395 | 396 | ```ruby 397 | df.clear 398 | ``` 399 | 400 | ## Combining Data Frames 401 | 402 | Add rows 403 | 404 | ```ruby 405 | df.concat(other_df) 406 | ``` 407 | 408 | Add columns 409 | 410 | ```ruby 411 | df.merge!(other_df) 412 | ``` 413 | 414 | Inner join 415 | 416 | ```ruby 417 | df.inner_join(other_df) 418 | # or 419 | df.inner_join(other_df, on: :a) 420 | # or 421 | df.inner_join(other_df, on: [:a, :b]) 422 | # or 423 | df.inner_join(other_df, on: {df_col: :other_df_col}) 424 | ``` 425 | 426 | Left join 427 | 428 | ```ruby 429 | df.left_join(other_df) 430 | ``` 431 | 432 | ## Encoding 433 | 434 | One-hot encoding 435 | 436 | ```ruby 437 | df.one_hot 438 | ``` 439 | 440 | Drop a variable in each category to avoid the dummy variable trap 441 | 442 | ```ruby 443 | df.one_hot(drop: true) 444 | ``` 445 | 446 | ## Conversion 447 | 448 | Array of hashes 449 | 450 | ```ruby 451 | df.to_a 452 | ``` 453 | 454 | Hash of arrays 455 | 456 | ```ruby 457 | df.to_h 458 | ``` 459 | 460 | Numo array 461 | 462 | ```ruby 463 | df.to_numo 464 | ``` 465 | 466 | CSV 467 | 468 | ```ruby 469 | df.to_csv 470 | ``` 471 | 472 | Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem) 473 | 474 | ```ruby 475 | df.to_parquet 476 | ``` 477 | 478 | ## Types 479 | 480 | You can specify column types when creating a data frame 481 | 482 | ```ruby 483 | 
module Rover
  # A one-dimensional, typed column of values backed by a Numo::NArray.
  class Vector
    # if a user never specifies types,
    # the defaults are bool, float64, int64, and object
    TYPE_CAST_MAPPING = {
      bool: Numo::Bit,
      float32: Numo::SFloat,
      float64: Numo::DFloat,
      int8: Numo::Int8,
      int16: Numo::Int16,
      int32: Numo::Int32,
      int64: Numo::Int64,
      object: Numo::RObject,
      uint8: Numo::UInt8,
      uint16: Numo::UInt16,
      uint32: Numo::UInt32,
      uint64: Numo::UInt64,
      # legacy - must come last
      # (#type returns the first key whose class matches)
      float: Numo::DFloat,
      int: Numo::Int64,
      uint: Numo::UInt64
    }

    # sentinel for "argument not passed" so that nil stays a usable value
    NOT_SET = Object.new

    # data - Vector, Numo::NArray, or anything responding to to_a
    # type - optional key of TYPE_CAST_MAPPING
    #
    # Raises ArgumentError when the data is not one-dimensional.
    def initialize(data, type: nil)
      @data = cast_data(data, type: type)
      raise ArgumentError, "Bad size: #{@data.shape}" unless @data.ndim == 1
    end

    # Symbol naming the storage type (first matching TYPE_CAST_MAPPING key).
    def type
      TYPE_CAST_MAPPING.find { |_, v| @data.is_a?(v) }[0]
    end

    # Returns a copy converted to +type+.
    def to(type)
      dup.to!(type)
    end

    # Converts this vector to +type+ in place and returns self.
    def to!(type)
      @data = cast_data(@data, type: type)
      self
    end

    # The underlying Numo array (not a copy).
    def to_numo
      @data
    end

    # Ruby array of the elements; bit vectors become true/false.
    def to_a
      a = @data.to_a
      a.map! { |v| !v.zero? } if @data.is_a?(Numo::Bit)
      a
    end

    # True unless the type is :object or :bool.
    def numeric?
      ![:object, :bool].include?(type)
    end

    # Number of elements.
    def size
      @data.size
    end
    alias_method :length, :size
    alias_method :count, :size

    # New vector with duplicate elements removed.
    def uniq
      Vector.new(to_a.uniq)
    end

    # Bool vector flagging missing elements: nil for object vectors,
    # NaN where the storage supports isnan, otherwise all false.
    def missing
      bit =
        if @data.is_a?(Numo::RObject)
          Numo::Bit.cast(@data.map(&:nil?))
        elsif @data.respond_to?(:isnan)
          @data.isnan
        else
          Numo::Bit.new(size).fill(0)
        end

      Vector.new(bit)
    end

    # keep same number of rows as original
    # to make it easy to add to original data frame
    def diff
      diff = @data.cast_to(Numo::DFloat).diff
      Vector.new(diff.insert(0, Float::NAN))
    end

    # Element access: a Vector argument acts as a mask, a Numeric returns
    # a single element, anything else is passed to Numo indexing.
    def [](v)
      if v.is_a?(Vector)
        Vector.new(v.to_numo.mask(@data))
      elsif v.is_a?(Numeric)
        @data[v]
      else
        Vector.new(@data[v])
      end
    end

    # Element assignment; +k+ may be a Vector (unwrapped to its Numo array).
    def []=(k, v)
      k = k.to_numo if k.is_a?(Vector)
      @data[k] = v
    end

    # Element-wise arithmetic and bitwise operators.
    %w(+ - * / % ** & | ^).each do |op|
      define_method(op) do |other|
        other = other.to_numo if other.is_a?(Vector)
        # TODO better logic
        if @data.is_a?(Numo::RObject) && !other.is_a?(Numo::RObject)
          map { |v| v.send(op, other) }
        else
          Vector.new(@data.send(op, other))
        end
      end
    end

    # Element-wise comparison operators, each returning a bool vector.
    {
      "==" => "eq",
      "!=" => "ne",
      ">" => "gt",
      ">=" => "ge",
      "<" => "lt",
      "<=" => "le"
    }.each do |op, meth|
      define_method(op) do |other|
        other = other.to_numo if other.is_a?(Vector)
        v =
          if other.is_a?(Numo::RObject)
            # compare pairwise with the operator being defined
            # (this branch previously used == for every operator)
            @data.to_a.zip(other.to_a).map { |v, ov| v.send(op, ov) }
          elsif other.is_a?(Numeric) || other.is_a?(Numo::NArray)
            @data.send(meth, other)
          else
            @data.map { |v| v.send(op, other) }
          end
        Vector.new(Numo::Bit.cast(v))
      end
    end

    # Bool vector that is true where the element equals any of +values+.
    def in?(values)
      ret = Numo::Bit.new(size).fill(false)
      values.each do |v|
        comp =
          if v.is_a?(Numeric) || v.is_a?(Numo::NArray)
            @data.eq(v)
          else
            Numo::Bit.cast(@data.map { |d| d == v })
          end
        ret |= comp
      end
      Vector.new(ret)
    end

    # Logical negation; only implemented for bool vectors.
    def !
      if @data.is_a?(Numo::Bit)
        Vector.new(@data.eq(0))
      else
        raise "Not implemented yet"
      end
    end

    # Unary minus.
    def -@
      self * -1
    end

    # Clips every element to min..max in place and returns self.
    def clamp!(min, max)
      @data = @data.clip(min, max)
      self
    end

    # Returns a clipped copy.
    def clamp(min, max)
      dup.clamp!(min, max)
    end

    # New vector built from the block's results.
    def map(&block)
      # convert to Ruby first to cast properly
      # https://github.com/ruby-numo/numo-narray/issues/181
      Vector.new(@data.to_a.map(&block))
    end

    # Like map, but replaces this vector's data and returns self.
    def map!(&block)
      @data = cast_data(@data.to_a.map(&block))
      self
    end

    # New vector of the elements for which the block is truthy.
    def select(&block)
      Vector.new(@data.to_a.select(&block))
    end

    # New vector of the elements for which the block is falsy.
    def reject(&block)
      Vector.new(@data.to_a.reject(&block))
    end

    # use Ruby tally for performance
    def tally
      @data.to_a.tally
    end

    # Sorted copy; falls back to Ruby's sort when the storage has none.
    def sort
      Vector.new(@data.respond_to?(:sort) ? @data.sort : @data.to_a.sort)
    end

    # Element-wise absolute value.
    def abs
      Vector.new(@data.abs)
    end

    # Element-wise rounding; a non-zero +ndigits+ is applied per element.
    def round(ndigits = 0)
      if ndigits == 0
        Vector.new(@data.round)
      else
        Vector.new(@data.map { |v| v.round(ndigits) })
      end
    end

    # Element-wise ceiling; a non-zero +ndigits+ is applied per element.
    def ceil(ndigits = 0)
      if ndigits == 0
        Vector.new(@data.ceil)
      else
        Vector.new(@data.map { |v| v.ceil(ndigits) })
      end
    end

    # Element-wise floor; a non-zero +ndigits+ is applied per element.
    def floor(ndigits = 0)
      if ndigits == 0
        Vector.new(@data.floor)
      else
        Vector.new(@data.map { |v| v.floor(ndigits) })
      end
    end

    # Element-wise math functions delegated to Numo::NMath.
    [:sqrt, :cbrt, :sin, :cos, :tan, :asin, :acos, :atan, :sinh, :cosh, :tanh, :asinh, :acosh, :atanh, :log2, :log10, :exp, :exp2, :erf, :erfc].each do |m|
      define_method(m) do
        Vector.new(Numo::NMath.send(m, @data))
      end
    end

    # Natural logarithm, or logarithm in +base+ when one is given.
    def log(base = NOT_SET)
      if base == NOT_SET
        Vector.new(Numo::NMath.log(@data))
      else
        # keep float32 vectors as float32, everything else becomes float64
        type = self.type == :float32 ? :float32 : :float64
        Vector.new(@data.to_a.map { |v| Math.log(v, base) }, type: type)
      end
    end

    # Natural logarithm.
    def ln
      log
    end

    # Element-wise hypot of this vector and +y+ via Numo::NMath.
    def hypot(y)
      y = y.to_numo if y.is_a?(Rover::Vector)
      Vector.new(Numo::NMath.hypot(@data, y))
    end

    # Returns [fraction, exponent] vectors from Numo::NMath.frexp.
    def frexp
      fraction, exponent = Numo::NMath.frexp(@data)
      [Vector.new(fraction), Vector.new(exponent)]
    end

    # Element-wise ldexp of this vector and +exponent+ via Numo::NMath.
    def ldexp(exponent)
      exponent = exponent.to_numo if exponent.is_a?(Rover::Vector)
      Vector.new(Numo::NMath.ldexp(@data, exponent))
    end

    # Yields each element.
    def each(&block)
      @data.each(&block)
    end

    # Yields each element together with its index.
    def each_with_index(&block)
      @data.each_with_index(&block)
    end

    def max
      @data.max
    end

    def min
      @data.min
    end

    def mean
      # currently only floats have mean in Numo
      # https://github.com/ruby-numo/numo-narray/issues/79
      @data.cast_to(Numo::DFloat).mean
    end

    def median
      # need to cast to get correct result
      # https://github.com/ruby-numo/numo-narray/issues/165
      @data.cast_to(Numo::DFloat).median
    end

    def percentile(q)
      @data.percentile(q)
    end

    def sum
      @data.sum
    end

    # uses Bessel's correction for now since that's all Numo supports
    def std
      @data.cast_to(Numo::DFloat).stddev
    end

    # uses Bessel's correction for now since that's all Numo supports
    def var
      @data.cast_to(Numo::DFloat).var
    end

    def all?(&block)
      to_a.all?(&block)
    end

    def any?(&block)
      to_a.any?(&block)
    end

    def empty?
      size == 0
    end

    def include?(value)
      to_a.include?(value)
    end

    # Pairs the elements with those of +other+ (see Array#zip).
    def zip(other, &block)
      to_a.zip(other.to_a, &block)
    end

    # Without an argument, the first element; with +n+, a vector of the
    # first +n+ elements (all of them when n >= size).
    def first(n = NOT_SET)
      if n == NOT_SET
        @data[0]
      elsif n >= size
        Vector.new(@data)
      else
        Vector.new(@data[0...n])
      end
    end

    # Without an argument, the last element; with +n+, a vector of the
    # last +n+ elements (all of them when n >= size).
    def last(n = NOT_SET)
      if n == NOT_SET
        @data[-1]
      elsif n >= size
        Vector.new(@data)
      elsif n == 0
        # -0..-1 is the same range as 0..-1 and would select everything,
        # so build the empty result directly, keeping the current type
        Vector.new([], type: type)
      else
        Vector.new(@data[-n..-1])
      end
    end

    # Like first(n), but rejects a negative +n+ with ArgumentError.
    def take(n)
      raise ArgumentError, "attempt to take negative size" if n < 0
      first(n)
    end

    # Data frame counting co-occurrences: one row per distinct value of
    # this vector (column "_"), one column per distinct value of +other+.
    def crosstab(other)
      index = uniq.sort
      index_pos = index.to_a.map.with_index.to_h
      df = DataFrame.new({"_" => index})
      other.uniq.sort.each do |k|
        df[k] = 0
      end
      to_a.zip(other.to_a) do |v1, v2|
        df[v2][index_pos[v1]] += 1
      end
      df
    end

    # First +n+ elements; a negative +n+ is offset by size first.
    def head(n = 5)
      n += size if n < 0
      first(n)
    end

    # Last +n+ elements; a negative +n+ is offset by size first.
    def tail(n = 5)
      n += size if n < 0
      last(n)
    end

    # Data frame with one 0/1 int64 column per distinct string value.
    #
    # drop   - omit the first distinct value
    # prefix - string prepended to each column name
    #
    # Raises ArgumentError unless every element is a String.
    def one_hot(drop: false, prefix: nil)
      raise ArgumentError, "All elements must be strings" unless all? { |vi| vi.is_a?(String) }

      new_vectors = {}
      # maybe sort values first
      values = uniq.to_a
      values.shift if drop
      values.each do |v2|
        # TODO use types
        new_vectors["#{prefix}#{v2}"] = (self == v2).to_numo.cast_to(Numo::Int64)
      end
      DataFrame.new(new_vectors)
    end

    # Short description showing at most the first five elements.
    # TODO add type and size?
    def inspect
      elements = first(5).to_a.map(&:inspect)
      elements << "..." if size > 5
      "#<Rover::Vector [#{elements.join(", ")}]>"
    end
    alias_method :to_s, :inspect # alias like hash

    # for IRuby
    def to_html
      require "iruby"

      if size > 7
        # pass 8 rows so maxrows is applied
        IRuby::HTML.table(first(4).to_a + last(4).to_a, maxrows: 7)
      else
        IRuby::HTML.table(to_a)
      end
    end

    private

    # for clone
    def initialize_clone(_)
      @data = @data.clone
      super
    end

    # for dup
    def initialize_dup(_)
      @data = @data.dup
      super
    end

    # Lets a Numeric appear on the left of an operator (e.g. 2 * vector)
    # by wrapping it in a one-element vector.
    def coerce(other)
      if other.is_a?(Numeric)
        [Vector.new([other]), self]
      else
        raise TypeError, "#{self.class} can't be coerced into #{other.class}"
      end
    end

    # Converts +data+ to a Numo array, cast to +type+ when given and
    # otherwise inferred from the values.
    #
    # Raises ArgumentError for complex input or an unknown type, and
    # RangeError when NaN/Inf would be cast to an integer type.
    def cast_data(data, type: nil)
      numo_type = numo_type(type) if type

      data = data.to_numo if data.is_a?(Vector)

      if data.is_a?(Numo::NArray)
        raise ArgumentError, "Complex types not supported yet" if data.is_a?(Numo::DComplex) || data.is_a?(Numo::SComplex)

        if type
          case type
          when /int/
            # Numo does not check these when casting
            raise RangeError, "float NaN out of range of integer" if data.respond_to?(:isnan) && data.isnan.any?
            raise RangeError, "float Inf out of range of integer" if data.respond_to?(:isinf) && data.isinf.any?

            data = data.to_a.map { |v| v.nil? ? nil : v.to_i } if data.is_a?(Numo::RObject)
          when /float/
            data = data.to_a.map { |v| v.nil? ? Float::NAN : v.to_f } if data.is_a?(Numo::RObject)
          end

          data = numo_type.cast(data)
        end
      else
        data = data.to_a

        if type
          # nil becomes NaN for float types
          data = data.map { |v| v || Float::NAN } if [:float, :float32, :float64].include?(type)
          data = numo_type.cast(data)
        else
          # infer: all integers => int64, numerics/nil => float64,
          # all booleans => bool, anything else => object
          data =
            if data.all? { |v| v.is_a?(Integer) }
              Numo::Int64.cast(data)
            elsif data.all? { |v| v.is_a?(Numeric) || v.nil? }
              Numo::DFloat.cast(data.map { |v| v || Float::NAN })
            elsif data.all? { |v| v == true || v == false }
              Numo::Bit.cast(data)
            else
              Numo::RObject.cast(data)
            end
        end
      end

      data
    end

    # Looks up the Numo class for +type+; raises ArgumentError if unknown.
    def numo_type(type)
      numo_type = TYPE_CAST_MAPPING[type]
      raise ArgumentError, "Invalid type: #{type}" unless numo_type
      numo_type
    end
  end
end
31 | assert_equal 2, df[:a][1] 32 | assert_equal 3, df[:a][2] 33 | assert_equal "one", df[:b][0] 34 | assert_equal "two", df[:b][1] 35 | assert_nil df[:b][2] 36 | end 37 | 38 | def test_array_invalid 39 | error = assert_raises(ArgumentError) do 40 | Rover::DataFrame.new([1, 2]) 41 | end 42 | assert_equal "Array elements must be hashes", error.message 43 | end 44 | 45 | def test_hash 46 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 47 | assert_vector [1, 2, 3], df["a"] 48 | assert_vector ["one", "two", "three"], df["b"] 49 | assert_equal 3, df.size 50 | assert_vector [1, 2], df.first(2)["a"] 51 | assert_equal ["a", "b"], df.vector_names 52 | assert df.include?("a") 53 | assert !df.include?("c") 54 | end 55 | 56 | def test_hash_key 57 | error = assert_raises(ArgumentError) do 58 | Rover::DataFrame.new(1 => 1..3) 59 | end 60 | assert_equal "Key must be a String or Symbol, given Integer", error.message 61 | end 62 | 63 | def test_invalid_data 64 | error = assert_raises(ArgumentError) do 65 | Rover::DataFrame.new(1) 66 | end 67 | assert_equal "Cannot cast to data frame: Integer", error.message 68 | end 69 | 70 | def test_different_sizes 71 | error = assert_raises(ArgumentError) do 72 | Rover::DataFrame.new({"a" => [1, 2, 3], "b" => [1, 2]}) 73 | end 74 | assert_equal "Different sizes: [3, 2]", error.message 75 | end 76 | 77 | def test_scalar 78 | df = Rover::DataFrame.new({"a" => 1, "b" => [1, 2, 3]}) 79 | assert_vector [1, 1, 1], df["a"] 80 | df["c"] = true 81 | assert_vector [true, true, true], df["c"] 82 | df["c"] = false 83 | assert_vector [false, false, false], df["c"] 84 | end 85 | 86 | # to methods 87 | 88 | def test_to_a 89 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 90 | assert_equal [{"a" => 1, "b" => "one"}, {"a" => 2, "b" => "two"}, {"a" => 3, "b" => "three"}], df.to_a 91 | end 92 | 93 | def test_to_h 94 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 95 | 
assert_equal ({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}), df.to_h 96 | end 97 | 98 | def test_to_numo 99 | df = Rover::DataFrame.new({"a" => 1..3, "b" => 4..6, "c" => 7..9}) 100 | assert_equal Numo::Int64.cast([[1, 4, 7], [2, 5, 8], [3, 6, 9]]), df.to_numo 101 | end 102 | 103 | # TODO use to_iruby when released 104 | def test_to_html 105 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 106 | assert_match "", df.to_html 107 | end 108 | 109 | # other 110 | 111 | def test_one_hot 112 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "three", "three"]}) 113 | expected = Rover::DataFrame.new({ 114 | "a" => [1, 2, 3], 115 | "b_one" => [1, 0, 0], 116 | "b_three" => [0, 1, 1] 117 | }) 118 | assert_equal expected, df.one_hot 119 | end 120 | 121 | def test_one_hot_drop 122 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "three", "three"]}) 123 | expected = Rover::DataFrame.new({ 124 | "a" => [1, 2, 3], 125 | "b_three" => [0, 1, 1] 126 | }) 127 | assert_equal expected, df.one_hot(drop: true) 128 | end 129 | 130 | def test_one_hot_non_string 131 | error = assert_raises(ArgumentError) do 132 | Rover::DataFrame.new({"a" => [Time.now]}).one_hot 133 | end 134 | assert_equal "All elements must be numeric or strings", error.message 135 | end 136 | 137 | def test_clear 138 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 139 | df.clear 140 | assert_equal 0, df.size 141 | assert_empty df.keys 142 | assert df.empty? 143 | assert !df.any? 144 | end 145 | 146 | def test_sort_by 147 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 148 | sorted = df.sort_by { |r| r["b"] } 149 | assert_vector [1, 3, 2], sorted["a"] 150 | assert_vector ["one", "three", "two"], sorted["b"] 151 | assert_vector [1, 2, 3], df["a"] 152 | assert_vector ["one", "two", "three"], df["b"] 153 | end 154 | 155 | def test_sort_by! 
156 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 157 | df.sort_by! { |r| r["b"] } 158 | assert_vector [1, 3, 2], df["a"] 159 | assert_vector ["one", "three", "two"], df["b"] 160 | end 161 | 162 | def test_max 163 | df = Rover::DataFrame.new({"a" => [1, 100, 3]}) 164 | assert_equal 100, df.max("a") 165 | end 166 | 167 | def test_max_missing 168 | error = assert_raises(KeyError) do 169 | Rover::DataFrame.new({"a" => [1, 100, 3]}).max("b") 170 | end 171 | assert_equal "Missing column: b", error.message 172 | end 173 | 174 | def test_min 175 | df = Rover::DataFrame.new({"a" => [1, 100, 3]}) 176 | assert_equal 1, df.min("a") 177 | end 178 | 179 | def test_mean 180 | df = Rover::DataFrame.new({"a" => [1, 2, 6]}) 181 | assert_equal 3, df.mean("a") 182 | end 183 | 184 | def test_median 185 | df = Rover::DataFrame.new({"a" => [1, 2, 6]}) 186 | assert_equal 2, df.median("a") 187 | end 188 | 189 | def test_percentile 190 | df = Rover::DataFrame.new({"a" => [1, 2, 3, 10]}) 191 | assert_equal 2.5, df.percentile("a", 50) 192 | end 193 | 194 | def test_sum 195 | df = Rover::DataFrame.new({"a" => [1, 2, 6]}) 196 | assert_equal 9, df.sum("a") 197 | end 198 | 199 | # TODO better test 200 | def test_sample 201 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 202 | assert_equal 1, df.sample.size 203 | assert_equal 2, df.sample(2).size 204 | assert_equal 2, df.sample(2, random: Random.new(123)).size 205 | end 206 | 207 | def test_empty_size 208 | assert_equal 0, Rover::DataFrame.new.size 209 | end 210 | 211 | def test_concat 212 | df = Rover::DataFrame.new({"a" => 1..3}) 213 | df2 = Rover::DataFrame.new({"b" => 4..6}) 214 | 215 | c1 = df + df 216 | assert_equal 6, c1.size 217 | assert_equal ["a"], c1.vector_names 218 | assert_vector [1, 2, 3, 1, 2, 3], c1["a"] 219 | 220 | df.concat(df2) 221 | assert_equal 6, df.size 222 | assert_equal ["a", "b"], df.vector_names 223 | end 224 | 225 | def test_merge 226 | df = 
Rover::DataFrame.new({"a" => 1..3, "b" => 4..6}) 227 | df2 = Rover::DataFrame.new({"b" => 7..9, "c" => 10..12}) 228 | merged = df.merge(df2) 229 | assert_equal ["a", "b"], df.keys 230 | assert_equal ["b", "c"], df2.keys 231 | assert_equal ["a", "b", "c"], merged.keys 232 | assert_vector 1..3, merged["a"] 233 | assert_vector 7..9, merged["b"] 234 | assert_vector 10..12, merged["c"] 235 | end 236 | 237 | def test_merge! 238 | df = Rover::DataFrame.new({"a" => 1..3, "b" => 4..6}) 239 | df2 = Rover::DataFrame.new({"b" => 7..9, "c" => 10..12}) 240 | df.merge!(df2) 241 | assert_equal ["a", "b", "c"], df.keys 242 | assert_vector 1..3, df["a"] 243 | assert_vector 7..9, df["b"] 244 | assert_vector 10..12, df["c"] 245 | end 246 | 247 | def test_merge_different_sizes 248 | df = Rover::DataFrame.new({"a" => 1..3}) 249 | df2 = Rover::DataFrame.new({"b" => [1]}) 250 | error = assert_raises(ArgumentError) do 251 | df.merge(df2) 252 | end 253 | assert_equal "Size mismatch (given 1, expected 3)", error.message 254 | end 255 | 256 | def test_rename 257 | df = Rover::DataFrame.new({"a" => 1..3, "b" => "a".."c", "c" => 1..3}) 258 | df.rename("a" => "b", "b" => "d") 259 | assert_equal ["b", "d", "c"], df.vector_names 260 | assert_vector [1, 2, 3], df["b"] 261 | assert_vector ["a", "b", "c"], df["d"] 262 | end 263 | 264 | def test_rename_symbols 265 | df = Rover::DataFrame.new({"a" => 1..3, "b" => "a".."c", "c" => 1..3}) 266 | df.rename(a: :b, b: :d) 267 | assert_equal ["b", "d", "c"], df.vector_names 268 | assert_vector [1, 2, 3], df["b"] 269 | assert_vector ["a", "b", "c"], df["d"] 270 | end 271 | 272 | def test_rename_missing 273 | df = Rover::DataFrame.new({"a" => 1..3}) 274 | error = assert_raises(KeyError) do 275 | df.rename("b" => "c") 276 | end 277 | assert_match "Missing column: b", error.message 278 | end 279 | 280 | def test_delete 281 | df = Rover::DataFrame.new({"a" => 1..3, "b" => "a".."c"}) 282 | assert_vector [1, 2, 3], df.delete("a") 283 | assert_equal ["b"], 
df.vector_names 284 | end 285 | 286 | def test_except 287 | df = Rover::DataFrame.new({"a" => 1..3, "b" => "a".."c", "c" => 1..3}) 288 | assert_equal ["a"], df.except("b", "c").vector_names 289 | assert_equal ["a", "b", "c"], df.vector_names 290 | end 291 | 292 | def test_except! 293 | df = Rover::DataFrame.new({"a" => 1..3, "b" => "a".."c", "c" => 1..3}) 294 | df.except!("b", "c") 295 | assert_equal ["a"], df.vector_names 296 | end 297 | 298 | def test_select 299 | df = Rover::DataFrame.new({"a" => 1..3, "b" => 1..3, "c" => 1..3}) 300 | assert_equal ["a", "b"], df[["a", "b"]].vector_names 301 | end 302 | 303 | def test_reader 304 | df = Rover::DataFrame.new({"a" => [1, 2, 3]}) 305 | assert_vector [2], df[1]["a"] 306 | assert_vector [1, 2], df[0..1]["a"] 307 | assert_vector [1, 3], df[[0, 2]]["a"] 308 | end 309 | 310 | def test_reader_where 311 | df = Rover::DataFrame.new({"a" => [1, 2, 3]}) 312 | where = Rover::Vector.new([true, false, true]) 313 | assert_vector [1, 3], df[where]["a"] 314 | end 315 | 316 | def test_reader_missing_column 317 | df = Rover::DataFrame.new({"hello" => [1, 2, 3], "hello2" => ["one", "two", "three"]}) 318 | error = assert_raises(KeyError) do 319 | df[["hello", "hello3"]] 320 | end 321 | message = error.detailed_message 322 | assert_match "Missing column: hello3", message 323 | assert_match %{Did you mean? 
"hello"}, message 324 | assert_match "hello2", message 325 | end 326 | 327 | def test_setter 328 | df = Rover::DataFrame.new({"a" => [1, 2, 3]}) 329 | df["b"] = 1 330 | assert_vector [1, 1, 1], df["b"] 331 | error = assert_raises(ArgumentError) do 332 | df["c"] = [1, 2] 333 | end 334 | assert_equal "Size mismatch (given 2, expected 3)", error.message 335 | end 336 | 337 | def test_setter_empty 338 | df = Rover::DataFrame.new 339 | df["a"] = [1, 2, 3] 340 | assert_vector [1, 2, 3], df["a"] 341 | end 342 | 343 | def test_filtering_and 344 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 345 | assert_vector [2], df[(df["a"] > 1) & (df["b"] == "two")]["a"] 346 | end 347 | 348 | def test_filtering_or 349 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 350 | assert_vector [2, 3], df[(df["a"] > 2) | (df["b"] == "two")]["a"] 351 | end 352 | 353 | def test_filtering_xor 354 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 355 | assert_vector [3], df[(df["a"] > 1) ^ (df["b"] == "two")]["a"] 356 | end 357 | 358 | def test_inspect 359 | df = Rover::DataFrame.new({"a" => 1..5, "b" => ["one", "two", "three", "four", "five"]}) 360 | assert_equal " a b\n 1 one\n 2 two\n 3 three\n 4 four\n 5 five", df.inspect 361 | end 362 | 363 | def test_inspect_summary 364 | df = Rover::DataFrame.new({"a" => 1..99}) 365 | assert_equal " a\n 1\n 2\n 3\n 4\n 5\n...\n 95\n 96\n 97\n 98\n 99", df.inspect 366 | end 367 | 368 | def test_inspect_empty 369 | df = Rover::DataFrame.new 370 | assert_equal "#", df.inspect 371 | end 372 | 373 | def test_equal 374 | df = Rover::DataFrame.new({a: 1..3}) 375 | assert_equal df, Rover::DataFrame.new({a: 1..3}) 376 | refute_equal df, Rover::DataFrame.new({b: 1..3}) 377 | refute_equal df, Rover::DataFrame.new({a: 1..3, b: 1..3}) 378 | refute_equal df, Rover::DataFrame.new({a: 2..4}) 379 | end 380 | 381 | def test_each_row 382 | df = Rover::DataFrame.new({a: 1..3}) 383 | 
rows = [] 384 | df.each_row do |row| 385 | rows << row 386 | end 387 | assert_equal [{"a" => 1}, {"a" => 2}, {"a" => 3}], rows 388 | end 389 | 390 | def test_each_row_enum 391 | df = Rover::DataFrame.new({a: 1..3}) 392 | rows = df.each_row.map { |r| r } 393 | assert_equal [{"a" => 1}, {"a" => 2}, {"a" => 3}], rows 394 | end 395 | 396 | def test_arguments 397 | error = assert_raises(ArgumentError) do 398 | Rover::DataFrame.new(1, 2, types: {}) 399 | end 400 | assert_equal "wrong number of arguments (given 2, expected 0..1)", error.message 401 | end 402 | 403 | def test_arguments_types_argument 404 | assert_equal ["types"], Rover::DataFrame.new({types: {}}).vector_names 405 | end 406 | 407 | # this shouldn't be the case, but we can't use keyword arguments 408 | def test_arguments_types_keyword 409 | assert_equal ["types"], Rover::DataFrame.new(types: {}).vector_names 410 | end 411 | 412 | def test_vector_map! 413 | df = Rover::DataFrame.new({"a" => [10, 20, 30]}) 414 | assert_equal :int64, df["a"].type 415 | assert_equal :int64, df.types["a"] 416 | 417 | df["a"].map! 
{ |v| v + 0.5 } 418 | 419 | assert_vector [10.5, 20.5, 30.5], df["a"] 420 | assert_equal :float64, df["a"].type 421 | assert_equal :float64, df.types["a"] 422 | end 423 | 424 | def test_first 425 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 426 | assert_equal Rover::DataFrame.new({"a" => [1], "b" => ["one"]}), df.first 427 | assert_equal Rover::DataFrame.new({"a" => [1, 2], "b" => ["one", "two"]}), df.first(2) 428 | end 429 | 430 | def test_last 431 | df = Rover::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]}) 432 | assert_equal Rover::DataFrame.new({"a" => [3], "b" => ["three"]}), df.last 433 | assert_equal Rover::DataFrame.new({"a" => [2, 3], "b" => ["two", "three"]}), df.last(2) 434 | end 435 | 436 | def test_head 437 | df = Rover::DataFrame.new({"a" => 1..10}) 438 | assert_equal Rover::DataFrame.new({"a" => 1..5}), df.head 439 | assert_equal Rover::DataFrame.new({"a" => 1..3}), df.head(3) 440 | end 441 | 442 | def test_tail 443 | df = Rover::DataFrame.new({"a" => 1..10}) 444 | assert_equal Rover::DataFrame.new({"a" => 6..10}), df.tail 445 | assert_equal Rover::DataFrame.new({"a" => 8..10}), df.tail(3) 446 | end 447 | 448 | def test_clone 449 | df = Rover::DataFrame.new({"a" => [1, 2, 3]}) 450 | df2 = df.clone 451 | df["a"][1] = 0 452 | assert_vector [1, 0, 3], df2["a"] 453 | end 454 | 455 | def test_dup 456 | df = Rover::DataFrame.new({"a" => [1, 2, 3]}) 457 | df2 = df.dup 458 | df["a"][1] = 0 459 | assert_vector [1, 0, 3], df2["a"] 460 | end 461 | 462 | def test_deep_dup 463 | df = Rover::DataFrame.new({"a" => [1, 2, 3]}) 464 | df2 = df.deep_dup 465 | df["a"][1] = 0 466 | assert_vector [1, 2, 3], df2["a"] 467 | end 468 | end 469 | -------------------------------------------------------------------------------- /lib/rover/data_frame.rb: -------------------------------------------------------------------------------- 1 | module Rover 2 | class DataFrame 3 | def initialize(*args) 4 | data, options = 
module Rover
  # A table of named columns. Columns live in @vectors, an ordered Hash that
  # maps String names to vectors; the constructor rejects columns of
  # differing sizes.
  class DataFrame
    # Accepts at most one positional data argument, optionally followed by an
    # options Hash (only :types is recognized; see process_args).
    #
    # data may be:
    # - another DataFrame (its vector objects are reused, not copied)
    # - a Hash of column name => values (scalars are repeated to the column size)
    # - an Array of Hashes, one per row (absent keys are filled with nil
    #   before conversion)
    # - an ActiveRecord relation, model class, or result (only when
    #   ActiveRecord is loaded)
    #
    # Raises ArgumentError for any other data, for keys that are neither
    # String nor Symbol, and for columns of different sizes.
    def initialize(*args)
      data, options = process_args(args)

      @vectors = {}
      types = (options[:types] || {}).transform_keys(&:to_s)

      if data.is_a?(DataFrame)
        data.vectors.each do |k, v|
          @vectors[k] = v
        end
      elsif data.is_a?(Hash)
        data.to_h.each do |k, v|
          @vectors[k] =
            if v.respond_to?(:to_a)
              Vector.new(v, type: types[k.to_s])
            else
              v
            end
        end

        # handle scalars: repeat them to the size of the first real column
        # (or 1 when every value is a scalar)
        size = @vectors.values.find { |v| v.is_a?(Vector) }&.size || 1
        @vectors.each_key do |k|
          @vectors[k] = to_vector(@vectors[k], size: size, type: types[k.to_s])
        end
      elsif data.is_a?(Array)
        vectors = {}
        raise ArgumentError, "Array elements must be hashes" unless data.all? { |d| d.is_a?(Hash) }
        keys = data.flat_map(&:keys).uniq
        keys.each do |k|
          vectors[k] = []
        end
        data.each do |d|
          keys.each do |k|
            vectors[k] << d[k]
          end
        end
        vectors.each do |k, v|
          @vectors[k] = to_vector(v, type: types[k.to_s])
        end
      elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base) || data.is_a?(ActiveRecord::Result))
        result = data.is_a?(ActiveRecord::Result) ? data : data.connection_pool.with_connection { |c| c.select_all(data.all.to_sql) }
        result.columns.each_with_index do |k, i|
          @vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k.to_s])
        end
      else
        raise ArgumentError, "Cannot cast to data frame: #{data.class.name}"
      end

      # check keys
      @vectors.each_key do |k|
        check_key(k)
      end

      # TODO check for duplicate keys
      @vectors.transform_keys!(&:to_s)

      # check sizes
      sizes = @vectors.values.map(&:size).uniq
      if sizes.size > 1
        raise ArgumentError, "Different sizes: #{sizes}"
      end
    end

    # Row or column lookup, depending on the argument:
    # - Numo::Bit-backed Vector, Numeric, Range, or Array of Integers:
    #   applies the selector to every column and returns a new DataFrame
    # - any other Array: returns a new DataFrame with the named columns
    #   (raises KeyError for an unknown name)
    # - anything else: returns the single named column, or nil if absent
    def [](where)
      if (where.is_a?(Vector) && where.to_numo.is_a?(Numo::Bit)) || where.is_a?(Numeric) || where.is_a?(Range) || (where.is_a?(Array) && where.all? { |v| v.is_a?(Integer) })
        DataFrame.new(@vectors.transform_values { |v| v[where] })
      elsif where.is_a?(Array)
        # multiple columns
        df = DataFrame.new
        where.each do |k|
          check_column(k)
          df[k] = @vectors[k.to_s]
        end
        df
      else
        # single column
        @vectors[where.to_s]
      end
    end

    # Yields each row as a Hash of column name => value.
    # Returns an Enumerator when no block is given.
    def each_row
      return enum_for(:each_row) unless block_given?

      size.times do |i|
        yield @vectors.transform_values { |v| v[i] }
      end
    end

    # dup to prevent direct modification of keys
    def vectors
      @vectors.dup
    end

    # Hash of column name => that column's type.
    def types
      @vectors.transform_values(&:type)
    end

    # Sets (or replaces) column k. Scalars are repeated to the current size.
    # Raises ArgumentError for an invalid key, or when the frame already has
    # columns and the new one has a different size.
    def []=(k, v)
      check_key(k)
      v = to_vector(v, size: size)
      raise ArgumentError, "Size mismatch (given #{v.size}, expected #{size})" if @vectors.any? && v.size != size
      @vectors[k.to_s] = v
    end

    # Number of rows (the size of the first column, 0 without columns).
    def size
      @vectors.values.first&.size || 0
    end
    alias_method :length, :size
    alias_method :count, :size

    # should this check for columns as well?
    def any?
      size > 0
    end

    # should this check for columns as well?
    def empty?
      size == 0
    end

    # Removes every column.
    def clear
      @vectors.clear
    end

    # [number of rows, number of columns]
    def shape
      [size, @vectors.size]
    end

    # Column names, in order.
    def keys
      @vectors.keys
    end
    alias_method :names, :keys
    alias_method :vector_names, :keys

    # Renames columns in place according to mapping (old name => new name)
    # and returns self. Raises KeyError if an old name is not a column.
    def rename(mapping)
      mapping.each_key do |k|
        check_column(k)
      end
      mapping = mapping.to_h { |k, v| [k.to_s, v.to_s] }
      # use transform_keys! to preserve order
      @vectors.transform_keys! do |k|
        mapping[k] || k
      end
      self
    end

    # Removes a column and returns its vector (nil if it was not present).
    def delete(key)
      @vectors.delete(key.to_s)
    end

    # Copy of the frame without the given columns.
    def except(*keys)
      dup.except!(*keys)
    end

    # Removes the given columns in place and returns self.
    def except!(*keys)
      keys.each do |key|
        delete(key)
      end
      self
    end

    # Whether a column with this name exists.
    def include?(key)
      @vectors.include?(key.to_s)
    end

    def head(n = 5)
      first(n)
    end

    def tail(n = 5)
      last(n)
    end

    # New frame built from the first n elements of every column.
    def first(n = 1)
      DataFrame.new(@vectors.transform_values { |v| v.first(n) })
    end

    # New frame built from the last n elements of every column.
    def last(n = 1)
      DataFrame.new(@vectors.transform_values { |v| v.last(n) })
    end

    # Random rows; arguments are forwarded to Array#sample on the row indexes.
    def sample(*args, **kwargs)
      # TODO make more efficient
      indexes = (0...size).to_a.sample(*args, **kwargs)
      self[indexes]
    end

    # Array of row Hashes.
    def to_a
      each_row.to_a
    end

    # Hash of column name => Array of values.
    def to_h
      @vectors.transform_values(&:to_a)
    end

    def to_numo
      Numo::NArray.column_stack(vectors.values.map(&:to_numo))
    end

    # Replaces each Numo::RObject-backed column with the columns produced by
    # that vector's one_hot (prefixed with "<name>_"); other columns are kept.
    # TODO raise error when collision
    def one_hot(drop: false)
      df = DataFrame.new
      vectors.each do |k, v|
        if v.to_numo.is_a?(Numo::RObject)
          df.merge!(v.one_hot(drop: drop, prefix: "#{k}_"))
        else
          df[k] = v
        end
      end
      df
    rescue ArgumentError => e
      if e.message == "All elements must be strings"
        # better error message
        raise ArgumentError, "All elements must be numeric or strings"
      end
      raise
    end

    # CSV text with a header row of column names.
    def to_csv
      require "csv"

      CSV.generate do |csv|
        csv << keys
        numo = vectors.values.map(&:to_numo)
        size.times do |i|
          csv << numo.map { |n| n[i] }
        end
      end
    end

    # Parquet bytes as a String. Object columns are only supported when every
    # element is a String.
    def to_parquet
      require "parquet"

      schema = {}
      types.each do |name, type|
        schema[name] =
          case type
          when :int64
            :int64
          when :uint64
            :uint64
          when :float64
            :double
          when :float32
            :float
          when :bool
            :boolean
          when :object
            if @vectors[name].all? { |v| v.is_a?(String) }
              :string
            else
              raise "Unknown type"
            end
          else
            type
          end
      end
      # TODO improve performance
      raw_records = []
      size.times do |i|
        raw_records << @vectors.map { |_, v| v[i] }
      end
      table = Arrow::Table.new(schema, raw_records)
      buffer = Arrow::ResizableBuffer.new(1024)
      table.save(buffer, format: :parquet)
      buffer.data.to_s
    end

    # for IRuby
    def to_html
      require "iruby"

      if size > 7
        # pass the first 5 and last 4 rows (9 rows, more than maxrows)
        # so maxrows is applied
        IRuby::HTML.table((self[0..4] + self[-4..-1]).to_h, maxrows: 7)
      else
        IRuby::HTML.table(to_h)
      end
    end

    # Right-aligned text table. Frames with 30 or more rows show only the
    # first and last five rows separated by "...". Columns that would push a
    # line past 120 characters start a new group of lines below.
    # TODO handle long text better
    def inspect
      return "#" if keys.empty?

      lines = []
      line_start = 0
      spaces = 2

      summarize = size >= 30

      @vectors.each do |k, v|
        v = summarize ? v.first(5).to_a + ["..."] + v.last(5).to_a : v.to_a
        width = ([k] + v).map(&:to_s).map(&:size).max
        width = 3 if width < 3

        # lines[-2] is the last data line of the current group
        # (lines[-1] is the empty separator line)
        if lines.empty? || lines[-2].map { |l| l.size + spaces }.sum + width > 120
          line_start = lines.size
          lines << []
          v.size.times do
            lines << []
          end
          lines << []
        end

        lines[line_start] << "%#{width}s" % k.to_s
        v.each_with_index do |v2, i|
          lines[line_start + 1 + i] << "%#{width}s" % v2.to_s
        end
      end

      # drop the trailing separator line
      lines.pop
      lines.map { |l| l.join(" " * spaces) }.join("\n")
    end
    alias_method :to_s, :inspect # alias like hash

    # Reorders the rows in place by the block's value for each row Hash
    # and returns self.
    def sort_by!
      indexes =
        size.times.sort_by do |i|
          yield @vectors.transform_values { |v| v[i] }
        end

      @vectors.each do |k, v|
        self[k] = v.to_numo.at(indexes)
      end
      self
    end

    def sort_by(&block)
      dup.sort_by!(&block)
    end

    def group(*columns)
      Group.new(self, columns.flatten.map(&:to_s))
    end

    # Column aggregates: df.max("a") calls max on column "a", and so on.
    # Raises KeyError when the column is missing.
    [:max, :min, :median, :mean, :percentile, :sum, :std, :var].each do |name|
      define_method(name) do |column, *args|
        check_column(column)
        self[column].send(name, *args)
      end
    end

    # Copy whose columns are dup'ed vectors.
    def deep_dup
      df = DataFrame.new
      @vectors.each do |k, v|
        df[k] = v.dup
      end
      df
    end

    def +(other)
      dup.concat(other)
    end

    # in-place, like Array#concat
    # columns missing on either side are padded with nil
    # TODO make more performant
    def concat(other)
      raise ArgumentError, "Must be a data frame" unless other.is_a?(DataFrame)

      # remember the row count before any column grows
      original_size = size
      vectors.each do |k, v|
        @vectors[k] = Vector.new(v.to_a + (other[k] ? other[k].to_a : [nil] * other.size))
      end
      (other.vector_names - vector_names).each do |k|
        @vectors[k] = Vector.new([nil] * original_size + other[k].to_a)
      end
      self
    end

    def merge(other)
      dup.merge!(other)
    end

    # Copies every column of other into this frame (replacing columns with
    # the same name) and returns self.
    def merge!(other)
      other.vectors.each do |k, v|
        self[k] = v
      end
      self
    end

    # see join for options
    def inner_join(other, on: nil)
      join(other, on: on, how: "inner")
    end

    # see join for options
    def left_join(other, on: nil)
      join(other, on: on, how: "left")
    end

    # Equal when other is a DataFrame with the same size, the same column
    # names in the same order, and equal column data.
    # Anything that is not a DataFrame is never equal (no error is raised).
    # don't check types
    def ==(other)
      other.is_a?(DataFrame) &&
        size == other.size &&
        keys == other.keys &&
        keys.all? { |k| self[k].to_numo == other[k].to_numo }
    end

    # Builds a Vega-Lite chart. x and y default to the two columns of a
    # two-column frame; type is inferred as "scatter" or "column" when omitted.
    # Raises ArgumentError for missing columns or an unsupported type.
    def plot(x = nil, y = nil, type: nil, group: nil, stacked: nil)
      require "vega"

      raise ArgumentError, "Must specify columns" if keys.size != 2 && (!x || !y)
      x ||= keys[0]
      y ||= keys[1]
      type ||= begin
        if self[x].numeric? && self[y].numeric?
          "scatter"
        elsif types[x] == :object && self[y].numeric?
          "column"
        else
          raise "Cannot determine type. Use the type option."
        end
      end
      data = self[group.nil? ? [x, y] : [x, y, group]]

      case type
      when "line", "area"
        x_type =
          if data[x].numeric?
            "quantitative"
          elsif data[x].all? { |v| v.is_a?(Date) || v.is_a?(Time) }
            "temporal"
          else
            "nominal"
          end

        scale = x_type == "temporal" ? {type: "utc"} : {}
        encoding = {
          x: {field: x, type: x_type, scale: scale},
          y: {field: y, type: "quantitative"}
        }
        encoding[:color] = {field: group} if group

        Vega.lite
          .data(data)
          .mark(type: type, tooltip: true, interpolate: "cardinal", point: {size: 60})
          .encoding(encoding)
          .config(axis: {labelFontSize: 12})
      when "pie"
        raise ArgumentError, "Cannot use group option with pie chart" unless group.nil?

        Vega.lite
          .data(data)
          .mark(type: "arc", tooltip: true)
          .encoding(
            color: {field: x, type: "nominal", sort: "none", axis: {title: nil}, legend: {labelFontSize: 12}},
            theta: {field: y, type: "quantitative"}
          )
          .view(stroke: nil)
      when "column"
        encoding = {
          x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
          y: {field: y, type: "quantitative"}
        }
        if group
          encoding[:color] = {field: group}
          encoding[:xOffset] = {field: group} unless stacked
        end

        Vega.lite
          .data(data)
          .mark(type: "bar", tooltip: true)
          .encoding(encoding)
          .config(axis: {labelFontSize: 12})
      when "bar"
        encoding = {
          # TODO determine label angle
          y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
          x: {field: y, type: "quantitative"}
        }
        if group
          encoding[:color] = {field: group}
          encoding[:yOffset] = {field: group} unless stacked
        end

        Vega.lite
          .data(data)
          .mark(type: "bar", tooltip: true)
          .encoding(encoding)
          .config(axis: {labelFontSize: 12})
      when "scatter"
        encoding = {
          x: {field: x, type: "quantitative", scale: {zero: false}},
          y: {field: y, type: "quantitative", scale: {zero: false}},
          size: {value: 60}
        }
        encoding[:color] = {field: group} if group

        Vega.lite
          .data(data)
          .mark(type: "circle", tooltip: true)
          .encoding(encoding)
          .config(axis: {labelFontSize: 12})
      else
        raise ArgumentError, "Invalid type: #{type}"
      end
    end

    private

    # for clone
    def initialize_clone(_)
      @vectors = @vectors.clone
      super
    end

    # for dup
    def initialize_dup(_)
      @vectors = @vectors.dup
      super
    end

    def check_key(key)
      raise ArgumentError, "Key must be a String or Symbol, given #{key.class.name}" unless key.is_a?(String) || key.is_a?(Symbol)
    end

    # TODO make more efficient
    # TODO add option to prefix/suffix keys?
    # Supports:
    # - on: :key
    # - on: [:key1, :key2]
    # - on: {key1a: :key1b, key2a: :key2b}
    #
    # how: "left" keeps rows of self that have no match in other;
    # any other value drops them.
    def join(other, how:, on: nil)
      self_on, other_on =
        if on.is_a?(Hash)
          [on.keys, on.values]
        else
          on ||= keys & other.keys
          on = [on] unless on.is_a?(Array)
          [on, on]
        end

      check_join_keys(self, self_on)
      check_join_keys(other, other_on)

      # map (not map!) so an Array passed by the caller is never modified
      self_on = self_on.map(&:to_s)
      other_on = other_on.map(&:to_s)

      indexed = other.to_a.group_by { |r| r.values_at(*other_on) }
      no_matches = [].freeze

      left = how == "left"

      types = {}
      vectors = {}
      keys = (self.keys + other.keys).uniq
      keys.each do |k|
        vectors[k] = []
        types[k] = join_type(self.types[k], other.types[k])
      end

      each_row do |r|
        matches = indexed.fetch(r.values_at(*self_on), no_matches)
        if matches.empty?
          if left
            keys.each do |k|
              vectors[k] << r[k]
            end
          end
        else
          matches.each do |r2|
            keys.each do |k|
              # prefer the value from other; fall back to self only when it
              # is nil, so that false is kept
              value = r2[k]
              vectors[k] << (value.nil? ? r[k] : value)
            end
          end
        end
      end

      DataFrame.new(vectors, types: types)
    end

    def check_join_keys(df, keys)
      raise ArgumentError, "No keys" if keys.empty?
      missing_keys = keys.reject { |k| df.include?(k) }
      raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
    end

    def check_column(key)
      unless include?(key)
        raise KeyError.new("Missing column: #{key}", receiver: self, key: key)
      end
    end

    # Type for a joined column: the known type when only one side has one or
    # both agree, otherwise nil.
    def join_type(a, b)
      if a.nil?
        b
      elsif b.nil?
        a
      elsif a == b
        a
      else
        # TODO specify
        nil
      end
    end

    # Converts v to a Vector. An existing Vector is returned as is (converted
    # with #to when a different type is requested). When size is given and v
    # does not respond to to_a, v is treated as a scalar and repeated size times.
    def to_vector(v, size: nil, type: nil)
      if v.is_a?(Vector)
        v = v.to(type) if type && v.type != type
        return v
      end

      if size && !v.respond_to?(:to_a)
        v =
          if v.is_a?(Integer)
            Numo::Int64.new(size).fill(v)
          elsif v.is_a?(Numeric)
            Numo::DFloat.new(size).fill(v)
          elsif v == true || v == false
            Numo::Bit.new(size).fill(v)
          else
            # TODO make more efficient
            [v] * size
          end
      end

      Vector.new(v, type: type)
    end

    # can't use data = {} and keyword arguments
    # as this causes an unknown keyword error when data is passed as
    # DataFrame.new({a: ..., b: ...})
    #
    # at the moment, there doesn't appear to be a way to distinguish between
    # DataFrame.new({types: ...}) which should set data, and
    # DataFrame.new(types: ...) which should set options
    # https://bugs.ruby-lang.org/issues/16891
    #
    # there aren't currently options that should be used without data
    # if this is ever the case, we should still require data
    # to prevent new options from breaking existing code
    def process_args(args)
      data = args[0] || {}
      options = args.size > 1 && args.last.is_a?(Hash) ? args.pop : {}
      raise ArgumentError, "wrong number of arguments (given #{args.size}, expected 0..1)" if args.size > 1

      known_keywords = [:types]
      unknown_keywords = options.keys - known_keywords
      raise ArgumentError, "unknown keywords: #{unknown_keywords.join(", ")}" if unknown_keywords.any?

      [data, options]
    end
  end
end
args.pop : {} 657 | raise ArgumentError, "wrong number of arguments (given #{args.size}, expected 0..1)" if args.size > 1 658 | 659 | known_keywords = [:types] 660 | unknown_keywords = options.keys - known_keywords 661 | raise ArgumentError, "unknown keywords: #{unknown_keywords.join(", ")}" if unknown_keywords.any? 662 | 663 | [data, options] 664 | end 665 | end 666 | end 667 | -------------------------------------------------------------------------------- /test/vector_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class VectorTest < Minitest::Test 4 | def test_works 5 | vector = Rover::Vector.new([1, 2, 3]) 6 | assert_vector [1, 2, 3], vector 7 | assert_equal 3, vector.size 8 | assert_equal 3, vector.count 9 | assert_equal 3, vector.length 10 | assert_vector [1, 2], vector.first(2) 11 | assert_vector [2, 3], vector.last(2) 12 | assert_equal 1, vector[0] 13 | assert_equal 2, vector[1.0] 14 | assert_vector [1, 2], vector[0..1] 15 | assert_vector [2, 3], vector[1..-1] 16 | assert_vector [2, 3], vector[1..] 17 | assert_vector [1, 3], vector[[0, 2]] 18 | end 19 | 20 | def test_array 21 | assert_vector [1, 2, 3], Rover::Vector.new([1, 2, 3]) 22 | end 23 | 24 | def test_range 25 | assert_vector [1, 2, 3], Rover::Vector.new(1..3) 26 | end 27 | 28 | def test_sort 29 | assert_vector [1, 2, 3], Rover::Vector.new([3, 1, 2]).sort 30 | assert_vector ["a", "b", "c"], Rover::Vector.new(["b", "c", "a"]).sort 31 | end 32 | 33 | def test_numeric 34 | assert Rover::Vector.new(1..3).numeric? 35 | assert !Rover::Vector.new(["b", "c", "a"]).numeric? 36 | assert !Rover::Vector.new([true, true, false]).numeric? 
37 | end 38 | 39 | def test_add_strings 40 | a = Rover::Vector.new(["a", "b", "c"]) 41 | b = Rover::Vector.new(["d", "e", "f"]) 42 | assert_vector ["ad", "be", "cf"], a + b 43 | end 44 | 45 | def test_missing 46 | assert_vector [false, true, false], Rover::Vector.new([1, nil, 3]).missing 47 | assert_vector [false, true, false], Rover::Vector.new(["one", nil, "three"]).missing 48 | end 49 | 50 | def test_one_hot 51 | vector = Rover::Vector.new(["one", "three", "three"]) 52 | expected = Rover::DataFrame.new({ 53 | "one" => [1, 0, 0], 54 | "three" => [0, 1, 1] 55 | }) 56 | assert_equal expected, vector.one_hot 57 | end 58 | 59 | def test_crosstab 60 | a = Rover::Vector.new([1, 2, 3, 1]) 61 | b = Rover::Vector.new(["a", "b", "c", "a"]) 62 | df = a.crosstab(b) 63 | assert_vector [1, 2, 3], df["_"] 64 | assert_vector [2, 0, 0], df["a"] 65 | assert_vector [0, 1, 0], df["b"] 66 | assert_vector [0, 0, 1], df["c"] 67 | end 68 | 69 | def test_head 70 | vector = Rover::Vector.new(1..6) 71 | assert_vector [1, 2, 3, 4, 5], vector.head 72 | assert_vector [1, 2, 3], vector.head(3) 73 | assert_vector [1, 2, 3, 4], vector.head(-2) 74 | end 75 | 76 | def test_tail 77 | vector = Rover::Vector.new(1..6) 78 | assert_vector [2, 3, 4, 5, 6], vector.tail 79 | assert_vector [4, 5, 6], vector.tail(3) 80 | assert_vector [3, 4, 5, 6], vector.tail(-2) 81 | end 82 | 83 | def test_operations 84 | vector = Rover::Vector.new([10, 20, 30]) 85 | assert_vector [15, 25, 35], vector + 5 86 | assert_vector [5, 15, 25], vector - 5 87 | assert_vector [50, 100, 150], vector * 5 88 | assert_vector [2, 4, 6], vector / 5 89 | assert_vector [1, 2, 0], vector % 3 90 | assert_vector [100, 400, 900], vector ** 2 91 | end 92 | 93 | def test_operations_scalar 94 | vector = Rover::Vector.new([10, 20, 30]) 95 | assert_vector [15, 25, 35], 5 + vector 96 | assert_vector [25, 15, 5], 35 - vector 97 | assert_vector [50, 100, 150], 5 * vector 98 | assert_vector [6, 3, 2], 60 / vector 99 | end 100 | 101 | def 
test_operations_vector 102 | a = Rover::Vector.new([10, 20, 30]) 103 | b = Rover::Vector.new([1, 2, 3]) 104 | assert_vector [11, 22, 33], a + b 105 | assert_vector [9, 18, 27], a - b 106 | assert_vector [10, 40, 90], a * b 107 | assert_vector [10, 10, 10], a / b 108 | assert_vector [0, 0, 0], a % b 109 | assert_vector [10, 400, 27000], a ** b 110 | end 111 | 112 | def test_operations_array 113 | a = Rover::Vector.new([10, 20, 30]) 114 | b = [1, 2, 3] 115 | assert_vector [11, 22, 33], a + b 116 | assert_vector [9, 18, 27], a - b 117 | assert_vector [10, 40, 90], a * b 118 | assert_vector [10, 10, 10], a / b 119 | assert_vector [0, 0, 0], a % b 120 | assert_vector [10, 400, 27000], a ** b 121 | end 122 | 123 | # TODO use true division in 0.5.0? 124 | def test_division_int 125 | a = Rover::Vector.new([1, 3, 5]) 126 | assert_vector [0, 1, 2], a / 2, type: :int64 127 | end 128 | 129 | # TODO use true division in 0.5.0? 130 | def test_division_int_vector 131 | a = Rover::Vector.new([1, 3, 5]) 132 | b = Rover::Vector.new([2, 2, 2]) 133 | assert_vector [0, 1, 2], a / b 134 | end 135 | 136 | def test_division_float 137 | a = Rover::Vector.new([1.0, 3, 5]) 138 | assert_vector_in_delta [0.5, 1.5, 2.5], a / 2, type: :float64 139 | end 140 | 141 | def test_division_float_vector 142 | a = Rover::Vector.new([1, 3, 5]) 143 | b = Rover::Vector.new([2.0, 2, 2]) 144 | assert_vector_in_delta [0.5, 1.5, 2.5], a / b, type: :float64 145 | end 146 | 147 | def test_inspect 148 | vector = Rover::Vector.new(1..10) 149 | assert_equal "#", vector.inspect 150 | end 151 | 152 | def test_inspect_string 153 | vector = Rover::Vector.new(["one", "two", "three", "four", "five", "six", "seven", "eight"]) 154 | assert_equal '#', vector.inspect 155 | end 156 | 157 | def test_min 158 | assert_equal 1, Rover::Vector.new(1..3).min 159 | assert_equal "a", Rover::Vector.new("a".."c").min 160 | end 161 | 162 | def test_max 163 | assert_equal 3, Rover::Vector.new(1..3).max 164 | assert_equal "c", 
Rover::Vector.new("a".."c").max 165 | end 166 | 167 | def test_mean 168 | assert_equal 2.5, Rover::Vector.new(1..4).mean 169 | end 170 | 171 | def test_median 172 | assert_equal 2.5, Rover::Vector.new([1, 2, 3, 10]).median 173 | end 174 | 175 | def test_percentile 176 | assert_equal 2.5, Rover::Vector.new([1, 2, 3, 10]).percentile(50) 177 | end 178 | 179 | def test_sum 180 | assert_equal 10, Rover::Vector.new(1..4).sum 181 | end 182 | 183 | # uses Bessel's correction for now since that's all Numo supports 184 | def test_std 185 | assert_equal 4, Rover::Vector.new([1, 5, 9]).std 186 | end 187 | 188 | # uses Bessel's correction for now since that's all Numo supports 189 | def test_var 190 | assert_equal 16, Rover::Vector.new([1, 5, 9]).var 191 | end 192 | 193 | def test_any 194 | vector = Rover::Vector.new(1..3) 195 | assert vector.any? 196 | assert vector.any? { |v| v == 2 } 197 | assert !vector.any? { |v| v == 4 } 198 | assert (vector == 2).any? 199 | assert !(vector == 4).any? 200 | end 201 | 202 | def test_all 203 | vector = Rover::Vector.new(1..3) 204 | assert vector.all? { |v| v < 4 } 205 | assert !vector.all? { |v| v < 3 } 206 | assert (vector < 4).all? 207 | assert !(vector < 3).all? 208 | end 209 | 210 | def test_empty 211 | assert_predicate Rover::Vector.new([]), :empty? 212 | refute_predicate Rover::Vector.new(1..3), :empty? 
213 | end 214 | 215 | def test_include 216 | vector = Rover::Vector.new(1..3) 217 | assert vector.include?(2) 218 | refute vector.include?(4) 219 | end 220 | 221 | def test_map 222 | vector = Rover::Vector.new([10, 20, 30]) 223 | assert_vector [20, 40, 60], vector.map { |v| v * 2 } 224 | assert_kind_of Numo::Int64, vector.map { |v| v * 2 }.to_numo 225 | end 226 | 227 | def test_map_string_to_int 228 | vector = Rover::Vector.new(["a", "b", "c"]) 229 | assert_vector [1, 1, 1], vector.map { |v| v.size } 230 | assert_kind_of Numo::Int64, vector.map { |v| v.size }.to_numo 231 | end 232 | 233 | def test_map_int_to_float 234 | vector = Rover::Vector.new([10, 20, 30]) 235 | assert_vector [10.5, 20.5, 30.5], vector.map { |v| v + 0.5 } 236 | end 237 | 238 | def test_map_int_to_string 239 | vector = Rover::Vector.new([10, 20, 30]) 240 | assert_vector ["10!", "20!", "30!"], vector.map { |v| "#{v}!" } 241 | end 242 | 243 | def test_map! 244 | vector = Rover::Vector.new([10, 20, 30]) 245 | assert_equal :int64, vector.type 246 | vector.map! { |v| "#{v}!" 
} 247 | assert_vector ["10!", "20!", "30!"], vector 248 | assert_equal :object, vector.type 249 | end 250 | 251 | def test_select 252 | vector = Rover::Vector.new([10, 20, 30]) 253 | assert_vector [10, 30], vector.select { |v| v != 20 } 254 | end 255 | 256 | def test_reject 257 | vector = Rover::Vector.new([10, 20, 30]) 258 | assert_vector [10, 30], vector.reject { |v| v == 20 } 259 | end 260 | 261 | def test_zip 262 | a = Rover::Vector.new([1, 2, 3]) 263 | b = Rover::Vector.new(["a", "b", "c"]) 264 | assert_equal [[1, "a"], [2, "b"], [3, "c"]], a.zip(b) 265 | end 266 | 267 | def test_abs 268 | assert_vector [2, 1, 0, 1, 2], Rover::Vector.new(-2..2).abs 269 | assert_raises(NoMethodError) do 270 | Rover::Vector.new("a".."c").abs 271 | end 272 | end 273 | 274 | def test_round 275 | assert_vector [2, 5, 7], Rover::Vector.new([2.3, 4.5, 6.7]).round, type: :float64 276 | assert_vector [2, 5, 7], Rover::Vector.new([2.3, 4.5, 6.7], type: :float32).round, type: :float32 277 | assert_vector [2, 5, 7], Rover::Vector.new([2, 5, 7]).round, type: :int64 278 | assert_vector [2, 5, 7], Rover::Vector.new([2, 5, 7], type: :int8).round, type: :int8 279 | assert_vector [1.2, 4.6, 7.9], Rover::Vector.new([1.23, 4.56, 7.89]).round(1), type: :float64 280 | # assert_vector [1.2, 4.6, 7.9], Rover::Vector.new([1.23, 4.56, 7.89], type: :float32).round(1), type: :float32 281 | assert_vector [20, 50, 70], Rover::Vector.new([23, 45, 67]).round(-1), type: :int64 282 | assert_vector [20, 50, 70], Rover::Vector.new([23, 45, 67], type: :int8).round(-1), type: :int8 283 | end 284 | 285 | def test_ceil 286 | assert_vector [1, 3, 5], Rover::Vector.new([0.1, 2.3, 4.5]).ceil, type: :float64 287 | assert_vector [1, 3, 5], Rover::Vector.new([0.1, 2.3, 4.5], type: :float32).ceil, type: :float32 288 | assert_vector [1, 3, 5], Rover::Vector.new([1, 3, 5]).ceil, type: :int64 289 | assert_vector [1, 3, 5], Rover::Vector.new([1, 3, 5], type: :int8).ceil, type: :int8 290 | assert_vector [1.3, 4.6, 7.9], 
Rover::Vector.new([1.23, 4.56, 7.89]).ceil(1), type: :float64 291 | # assert_vector [1.3, 4.6, 7.9], Rover::Vector.new([1.23, 4.56, 7.89], type: :float32).ceil(1), type: :float32 292 | assert_vector [30, 50, 70], Rover::Vector.new([23, 45, 67]).ceil(-1), type: :int64 293 | assert_vector [30, 50, 70], Rover::Vector.new([23, 45, 67], type: :int8).ceil(-1), type: :int8 294 | end 295 | 296 | def test_floor 297 | assert_vector [9, 7, 5], Rover::Vector.new([9.8, 7.6, 5.4]).floor, type: :float64 298 | assert_vector [9, 7, 5], Rover::Vector.new([9.8, 7.6, 5.4], type: :float32).floor, type: :float32 299 | assert_vector [9, 7, 5], Rover::Vector.new([9, 7, 5]).floor, type: :int64 300 | assert_vector [9, 7, 5], Rover::Vector.new([9, 7, 5], type: :int8).floor, type: :int8 301 | assert_vector [1.2, 4.5, 7.8], Rover::Vector.new([1.23, 4.56, 7.89]).floor(1), type: :float64 302 | # assert_vector [1.2, 4.5, 7.8], Rover::Vector.new([1.23, 4.56, 7.89], type: :float32).floor(1), type: :float32 303 | assert_vector [20, 40, 60], Rover::Vector.new([23, 45, 67]).floor(-1), type: :int64 304 | assert_vector [20, 40, 60], Rover::Vector.new([23, 45, 67], type: :int8).floor(-1), type: :int8 305 | end 306 | 307 | def test_sqrt 308 | assert_vector_in_delta [1, 2, 3], Rover::Vector.new([1, 4, 9]).sqrt, type: :float64 309 | assert_vector_in_delta [1, 2, 3], Rover::Vector.new([1, 4, 9], type: :float64).sqrt, type: :float64 310 | assert_vector_in_delta [1, 2, 3], Rover::Vector.new([1, 4, 9], type: :float32).sqrt, type: :float32 311 | end 312 | 313 | def test_cbrt 314 | assert_vector_in_delta [1, 2, 3], Rover::Vector.new([1, 8, 27]).cbrt, type: :float64 315 | assert_vector_in_delta [1, 2, 3], Rover::Vector.new([1, 8, 27], type: :float64).cbrt, type: :float64 316 | assert_vector_in_delta [1, 2, 3], Rover::Vector.new([1, 8, 27], type: :float32).cbrt, type: :float32 317 | end 318 | 319 | def test_sin 320 | assert_vector_in_delta [0, 1, 0], Rover::Vector.new([0, Math::PI / 2, Math::PI]).sin, type: 
:float64 321 | end 322 | 323 | def test_cos 324 | assert_vector_in_delta [1, 0, -1], Rover::Vector.new([0, Math::PI / 2, Math::PI]).cos, type: :float64 325 | end 326 | 327 | def test_tan 328 | assert_vector_in_delta [0, 1], Rover::Vector.new([0, Math::PI / 4]).tan, type: :float64 329 | end 330 | 331 | def test_asin 332 | assert_vector_in_delta [0, Math::PI / 2], Rover::Vector.new([0, 1]).asin, type: :float64 333 | end 334 | 335 | def test_acos 336 | assert_vector_in_delta [0, Math::PI / 2, Math::PI], Rover::Vector.new([1, 0, -1]).acos, type: :float64 337 | end 338 | 339 | def test_atan 340 | assert_vector_in_delta [0, Math::PI / 4], Rover::Vector.new([0, 1]).atan, type: :float64 341 | end 342 | 343 | def test_ln 344 | assert_vector_in_delta [0, 1], Rover::Vector.new([1, Math::E]).ln, type: :float64 345 | assert_vector_in_delta [0, 1], Rover::Vector.new([1, Math::E], type: :float32).ln, type: :float32 346 | end 347 | 348 | def test_log 349 | assert_vector_in_delta [0, 1], Rover::Vector.new([1, Math::E]).log, type: :float64 350 | assert_vector_in_delta [0, 1], Rover::Vector.new([1, Math::E], type: :float32).log, type: :float32 351 | assert_vector_in_delta [0, 1], Rover::Vector.new([1, 10]).log(10), type: :float64 352 | assert_vector_in_delta [0, 1], Rover::Vector.new([1, 10], type: :float32).log(10), type: :float32 353 | assert_vector_in_delta [0, 1, 2, 3], Rover::Vector.new([1, 2, 4, 8]).log(2), type: :float64 354 | assert_vector_in_delta [0, 1, 2, 3], Rover::Vector.new([1, 2, 4, 8], type: :float32).log(2), type: :float32 355 | end 356 | 357 | def test_log10 358 | assert_vector_in_delta [0, 1], Rover::Vector.new([1, 10]).log10, type: :float64 359 | end 360 | 361 | def test_log2 362 | assert_vector_in_delta [0, 1, 2, 3], Rover::Vector.new([1, 2, 4, 8]).log2, type: :float64 363 | end 364 | 365 | def test_exp 366 | assert_vector_in_delta [1, Math::E], Rover::Vector.new([0, 1]).exp, type: :float64 367 | end 368 | 369 | def test_exp2 370 | assert_vector_in_delta [1, 2, 
4, 8], Rover::Vector.new([0, 1, 2, 3]).exp2, type: :float64 371 | end 372 | 373 | def test_erf 374 | assert_vector_in_delta [0, 1], Rover::Vector.new([0, 3]).erf, type: :float64 375 | end 376 | 377 | def test_erfc 378 | assert_vector_in_delta [1, 0], Rover::Vector.new([0, 3]).erfc, type: :float64 379 | end 380 | 381 | def test_hypot 382 | a = Rover::Vector.new([3]) 383 | b = Rover::Vector.new([4]) 384 | assert_vector_in_delta [5], a.hypot(4), type: :float64 385 | assert_vector_in_delta [5], a.hypot(b), type: :float64 386 | end 387 | 388 | def test_frexp 389 | fraction, exponent = Rover::Vector.new([1, 2, 3]).frexp 390 | assert_vector_in_delta [0.5, 0.5, 0.75], fraction, type: :float64 391 | assert_vector [1, 2, 2], exponent, type: :int32 392 | end 393 | 394 | def test_ldexp 395 | fraction = Rover::Vector.new([0.5, 0.5, 0.75]) 396 | exponent = Rover::Vector.new([1, 2, 2]) 397 | assert_vector_in_delta [1, 2, 3], fraction.ldexp(exponent), type: :float64 398 | end 399 | 400 | def test_comparison 401 | vector = Rover::Vector.new(1..3) 402 | assert_vector [false, true, false], vector == 2 403 | assert_vector [true, false, true], vector != 2 404 | assert_vector [false, false, true], vector > 2 405 | assert_vector [false, true, true], vector >= 2 406 | assert_vector [true, false, false], vector < 2 407 | assert_vector [true, true, false], vector <= 2 408 | end 409 | 410 | def test_equal_big_decimal 411 | vector = Rover::Vector.new(1..3).map { |v| BigDecimal(v) } 412 | vector == vector 413 | end 414 | 415 | def test_string 416 | assert_vector ["one", "two", "three"], Rover::Vector.new(["one", "two", "three"]) 417 | end 418 | 419 | def test_not 420 | assert_vector [false, true, false], !Rover::Vector.new([true, false, true]) 421 | end 422 | 423 | def test_nan 424 | vector = Rover::Vector.new([1, nil, 3]) 425 | assert vector[1].nan? 426 | end 427 | 428 | def test_diff 429 | diff = Rover::Vector.new([1, 4, 9]).diff 430 | assert_equal 3, diff.size 431 | assert diff[0].nan? 
432 | assert_equal 3, diff[1] 433 | assert_equal 5, diff[2] 434 | end 435 | 436 | def test_in 437 | vector = Rover::Vector.new(1..3) 438 | assert_vector [false, false, false], vector.in?([]) 439 | assert_vector [true, false, true], vector.in?([1, 3]) 440 | end 441 | 442 | def test_in_string_nil 443 | vector = Rover::Vector.new(["one", "two", "three"]) 444 | assert_vector [false, false, false], vector.in?([]) 445 | assert_vector [true, false, true], vector.in?(["one", "three", nil]) 446 | end 447 | 448 | def test_in_range 449 | vector = Rover::Vector.new([1, 2, 3]) 450 | assert_vector [true, true, false], vector.in?(1..2) 451 | end 452 | 453 | def test_negation 454 | vector = Rover::Vector.new([-2, 0, 2]) 455 | assert_vector [2, 0, -2], -vector 456 | end 457 | 458 | def test_tally 459 | vector = Rover::Vector.new(["hi", "hi", "bye"]) 460 | assert_equal ({"hi" => 2, "bye" => 1}), vector.tally 461 | end 462 | 463 | def test_clamp! 464 | vector = Rover::Vector.new([-100, 0, 100]) 465 | vector.clamp!(-5, 5) 466 | assert_vector [-5, 0, 5], vector 467 | end 468 | 469 | def test_clamp 470 | vector = Rover::Vector.new([-100, 0, 100]) 471 | assert_vector [-5, 0, 5], vector.clamp(-5, 5) 472 | end 473 | 474 | def test_uniq 475 | assert_vector [1, 2], Rover::Vector.new([1, 1, 1, 2, 2]).uniq 476 | assert_vector [true, false], Rover::Vector.new([true, true, true, false, false]).uniq 477 | end 478 | 479 | def test_first 480 | vector = Rover::Vector.new(1..3) 481 | assert_equal 1, vector.first 482 | assert_vector 1..1, vector.first(1) 483 | assert_vector 1..2, vector.first(2) 484 | end 485 | 486 | def test_last 487 | vector = Rover::Vector.new(1..3) 488 | assert_equal 3, vector.last 489 | assert_vector 3..3, vector.last(1) 490 | assert_vector 2..3, vector.last(2) 491 | end 492 | 493 | def test_last1 494 | vector = Rover::Vector.new(1..1) 495 | assert_equal 1, vector.last 496 | end 497 | 498 | def test_short_last 499 | vector = Rover::Vector.new(1..3) 500 | assert_vector 1..3, 
vector.last(4) 501 | end 502 | 503 | def test_take 504 | vector = Rover::Vector.new(1..3) 505 | assert_vector 1..2, vector.take(2) 506 | end 507 | 508 | def test_take_negative 509 | error = assert_raises(ArgumentError) do 510 | Rover::Vector.new(1..3).take(-1) 511 | end 512 | assert_equal "attempt to take negative size", error.message 513 | end 514 | 515 | def test_bad_size 516 | error = assert_raises(ArgumentError) do 517 | Rover::Vector.new(Numo::DFloat.new(2, 3).rand) 518 | end 519 | assert_equal "Bad size: [2, 3]", error.message 520 | end 521 | 522 | def test_setter 523 | vector = Rover::Vector.new(1..3) 524 | vector[1] = 5 525 | assert_vector [1, 5, 3], vector 526 | vector[1..-1] = [7, 8] 527 | assert_vector [1, 7, 8], vector 528 | end 529 | 530 | def test_setter_where 531 | vector = Rover::Vector.new(1..3) 532 | where = Rover::Vector.new([true, false, true]) 533 | vector[where] = 0 534 | assert_vector [0, 2, 0], vector 535 | end 536 | 537 | def test_setter_where_nil 538 | vector = Rover::Vector.new([1, "bad", 3]) 539 | vector[vector == "bad"] = nil 540 | assert_vector [1, nil, 3], vector 541 | end 542 | 543 | def test_setter_where_index 544 | vector = Rover::Vector.new(1..3) 545 | vector[[0, 2]] = 5 546 | assert_vector [5, 2, 5], vector 547 | end 548 | 549 | def test_where 550 | vector = Rover::Vector.new(1..3) 551 | where = Rover::Vector.new([true, false, true]) 552 | assert_vector [1, 3], vector[where] 553 | end 554 | 555 | def test_each 556 | vector = Rover::Vector.new(1..3) 557 | values = [] 558 | vector.each do |value| 559 | values << value 560 | end 561 | assert_equal [1, 2, 3], values 562 | end 563 | 564 | def test_each_with_index 565 | array_data = [1, 2, 3, 4, 5] 566 | vector = Rover::Vector.new(array_data) 567 | vector.each_with_index do |int, index| 568 | assert_equal int, array_data[index] 569 | end 570 | end 571 | 572 | def test_to_invalid 573 | error = assert_raises(ArgumentError) do 574 | Rover::Vector.new(1..3).to(:bad) 575 | end 576 | 
assert_equal "Invalid type: bad", error.message 577 | end 578 | 579 | def test_to_a 580 | vector = Rover::Vector.new(1..3) 581 | assert_equal [1, 2, 3], vector.to_a 582 | end 583 | 584 | def test_to_html 585 | vector = Rover::Vector.new(1..3) 586 | assert_match "
", vector.to_html 587 | end 588 | 589 | def test_clone 590 | vector = Rover::Vector.new(1..3) 591 | vector2 = vector.clone 592 | vector[1] = 0 593 | assert_vector [1, 2, 3], vector2 594 | end 595 | 596 | def test_dup 597 | vector = Rover::Vector.new(1..3) 598 | vector2 = vector.dup 599 | vector[1] = 0 600 | assert_vector [1, 2, 3], vector2 601 | end 602 | end 603 | --------------------------------------------------------------------------------