├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── Gemfile ├── Guardfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── TODO.md ├── benchmark ├── aggregate.rb ├── bisect.rb ├── comparison │ ├── prepare.sh │ ├── python.py │ ├── r.r │ ├── ruby.rb │ ├── run-all.sh │ └── scala.scala ├── custom_marshal.rb ├── digest.rb ├── enumerator.rb ├── serializer.rb ├── sort.rb ├── sort2.rb └── take.rb ├── bin └── ruby-spark ├── example ├── pi.rb └── website_search.rb ├── ext ├── ruby_c │ ├── extconf.rb │ ├── murmur.c │ ├── murmur.h │ └── ruby-spark.c ├── ruby_java │ ├── Digest.java │ ├── Murmur2.java │ ├── RubySparkExtService.java │ └── extconf.rb └── spark │ ├── build.sbt │ ├── project │ └── plugins.sbt │ ├── sbt │ └── sbt │ └── src │ ├── main │ └── scala │ │ ├── Exec.scala │ │ ├── MLLibAPI.scala │ │ ├── Marshal.scala │ │ ├── MarshalDump.scala │ │ ├── MarshalLoad.scala │ │ ├── RubyAccumulatorParam.scala │ │ ├── RubyBroadcast.scala │ │ ├── RubyConstant.scala │ │ ├── RubyMLLibAPI.scala │ │ ├── RubyMLLibUtilAPI.scala │ │ ├── RubyPage.scala │ │ ├── RubyRDD.scala │ │ ├── RubySerializer.scala │ │ ├── RubyTab.scala │ │ ├── RubyUtils.scala │ │ └── RubyWorker.scala │ └── test │ └── scala │ └── MarshalSpec.scala ├── lib ├── ruby-spark.rb ├── spark.rb └── spark │ ├── accumulator.rb │ ├── broadcast.rb │ ├── build.rb │ ├── cli.rb │ ├── command.rb │ ├── command │ ├── base.rb │ ├── basic.rb │ ├── pair.rb │ ├── sort.rb │ └── statistic.rb │ ├── command_builder.rb │ ├── command_validator.rb │ ├── config.rb │ ├── constant.rb │ ├── context.rb │ ├── error.rb │ ├── ext │ ├── hash.rb │ ├── integer.rb │ ├── io.rb │ ├── ip_socket.rb │ ├── module.rb │ ├── object.rb │ └── string.rb │ ├── helper.rb │ ├── helper │ ├── logger.rb │ ├── parser.rb │ ├── serialize.rb │ ├── statistic.rb │ └── system.rb │ ├── java_bridge.rb │ ├── java_bridge │ ├── base.rb │ ├── jruby.rb │ └── rjb.rb │ ├── library.rb │ ├── logger.rb │ ├── mllib.rb │ ├── mllib │ ├── classification │ │ ├── common.rb │ │ ├── logistic_regression.rb │ │ ├── naive_bayes.rb │ │ └── svm.rb │ ├── clustering │ │ ├── gaussian_mixture.rb │ │ └── kmeans.rb │ ├── matrix.rb │ ├── regression │ │ ├── common.rb │ │ ├── labeled_point.rb │ │ ├── lasso.rb │ │ ├── linear.rb │ │ └── ridge.rb │ ├── ruby_matrix │ │ ├── matrix_adapter.rb │ │ └── vector_adapter.rb │ ├── stat │ │ └── distribution.rb │ └── vector.rb │ ├── rdd.rb │ ├── sampler.rb │ ├── serializer.rb │ ├── serializer │ ├── auto_batched.rb │ ├── base.rb │ ├── batched.rb │ ├── cartesian.rb │ ├── compressed.rb │ ├── marshal.rb │ ├── message_pack.rb │ ├── oj.rb │ ├── pair.rb │ └── text.rb │ ├── sort.rb │ ├── sql.rb │ ├── sql │ ├── column.rb │ ├── context.rb │ ├── data_frame.rb │ ├── data_frame_reader.rb │ ├── data_type.rb │ └── row.rb │ ├── stat_counter.rb │ ├── storage_level.rb │ ├── version.rb │ └── worker │ ├── master.rb │ ├── spark_files.rb │ └── worker.rb ├── ruby-spark.gemspec └── spec ├── generator.rb ├── inputs ├── lorem_300.txt ├── numbers │ ├── 1.txt │ ├── 10.txt │ ├── 11.txt │ ├── 12.txt │ ├── 13.txt │ ├── 14.txt │ ├── 15.txt │ ├── 16.txt │ ├── 17.txt │ ├── 18.txt │ ├── 19.txt │ ├── 2.txt │ ├── 20.txt │ ├── 3.txt │ ├── 4.txt │ ├── 5.txt │ ├── 6.txt │ ├── 7.txt │ ├── 8.txt │ └── 9.txt ├── numbers_0_100.txt ├── numbers_1_100.txt └── people.json ├── lib ├── collect_spec.rb ├── command_spec.rb ├── config_spec.rb ├── context_spec.rb ├── ext_spec.rb ├── external_apps_spec.rb ├── filter_spec.rb ├── flat_map_spec.rb ├── group_spec.rb ├── helper_spec.rb ├── key_spec.rb ├── manipulation_spec.rb ├── map_partitions_spec.rb ├── map_spec.rb 
├── mllib │ ├── classification_spec.rb │ ├── clustering_spec.rb │ ├── matrix_spec.rb │ ├── regression_spec.rb │ └── vector_spec.rb ├── reduce_by_key_spec.rb ├── reduce_spec.rb ├── sample_spec.rb ├── serializer_spec.rb ├── sort_spec.rb ├── sql │ ├── column_spec.rb │ └── data_frame_spec.rb ├── statistic_spec.rb └── whole_text_files_spec.rb └── spec_helper.rb /.gitignore: -------------------------------------------------------------------------------- 1 | /.gemtags 2 | /.tags 3 | /java/spark.jar 4 | .jbundler 5 | target/* 6 | *.class 7 | *.jar 8 | pom.xml 9 | vendor/* 10 | *.gem 11 | *.rbc 12 | .bundle 13 | .config 14 | .yardoc 15 | Gemfile.lock 16 | InstalledFiles 17 | _yardoc 18 | coverage 19 | doc/ 20 | lib/bundler/man 21 | pkg 22 | rdoc 23 | spec/reports 24 | test/tmp 25 | test/version_tmp 26 | tmp 27 | *.bundle 28 | *.so 29 | *.o 30 | *.a 31 | mkmf.log 32 | ext/spark/target/* 33 | ext/spark/project/target/* 34 | ext/spark/project/project/target/* 35 | wiki 36 | /benchmark/performance/spark/* 37 | /benchmark/performance/rspark/* 38 | /_* 39 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | 3 | rvm: 4 | - 2.2.0 5 | 6 | before_script: 7 | - bundle exec rake compile 8 | - bundle exec ruby bin/ruby-spark build 9 | 10 | cache: 11 | bundler: true 12 | directories: 13 | - $HOME/.m2 14 | - $HOME/.ivy2 15 | - $HOME/.sbt 16 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Unreleased 2 | 3 | ## 1.3.0 4 | 5 | - new method on RDD (lookup) 6 | - fix sbt url 7 | - Spark 1.5.0 8 | 9 | ## 1.2.0 (15.06.2015) 10 | 11 | - target folder is now located at HOME 12 | - better serializators 13 | - error when java class does not exist 14 | - default setting at ~/.ruby-spark.conf 15 | - compatible with Spark 1.4.0 16 | - added calling site to RDD 17 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gemspec 4 | 5 | gem 'sourcify', '0.6.0.rc4' 6 | gem 'method_source' 7 | gem 'commander' 8 | gem 'pry' 9 | gem 'nio4r' 10 | gem 'distribution' 11 | 12 | platform :mri do 13 | gem 'rjb' 14 | gem 'msgpack' 15 | gem 'oj' 16 | gem 'narray' 17 | end 18 | 19 | platform :jruby do 20 | gem 'msgpack-jruby', require: 'msgpack' 21 | 22 | # NameError: no constructorfor arguments (org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.joda.time.chrono.GJChronology) on Java::OrgJodaTime::DateTime 23 | # gem 'mdarray' 24 | end 25 | 26 | group :stats do 27 | # gem 'nmatrix' 28 | # gem 'statsample' 29 | # gem 'statsample-glm' 30 | # gem 'statsample-timeseries' 31 | # gem 'statistics2' 32 | # gem 'statsample-optimization' # libgsl0-dev 33 | # gem 'narray' 34 | # gem 'gsl-nmatrix' 35 | end 36 | 37 | group :development do 38 | gem 'benchmark-ips' 39 | gem 'rspec' 40 | gem 'rake-compiler' 41 | gem 'guard' 42 | gem 'guard-rspec' 43 | gem 'listen' 44 | end 45 | 46 | group :test do 47 | gem 'simplecov', require: false 48 | end 49 | -------------------------------------------------------------------------------- /Guardfile: -------------------------------------------------------------------------------- 1 | guard :rspec, cmd: 
'rspec' do 2 | watch(%r{^spec/.+_spec\.rb$}) 3 | watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" } 4 | watch('spec/spec_helper.rb') { "spec" } 5 | end 6 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 Ondřej Moravčík 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | #-*- mode: ruby -*- 2 | 3 | require "bundler/gem_tasks" 4 | require "rspec/core/rake_task" 5 | 6 | RSpec::Core::RakeTask.new 7 | 8 | task default: :spec 9 | task test: :spec 10 | 11 | def java? 12 | RUBY_PLATFORM =~ /java/ 13 | end 14 | 15 | if java? 16 | require "rake/javaextensiontask" 17 | Rake::JavaExtensionTask.new("ruby_java") do |ext| 18 | ext.name = "ruby_spark_ext" 19 | end 20 | else 21 | require "rake/extensiontask" 22 | Rake::ExtensionTask.new("ruby_c") do |ext| 23 | ext.name = "ruby_spark_ext" 24 | end 25 | end 26 | 27 | 28 | task :clean do 29 | Dir['lib/*.{jar,o,so}'].each do |path| 30 | puts "Deleting #{path} ..." 31 | File.delete(path) 32 | end 33 | FileUtils.rm_rf('./pkg') 34 | FileUtils.rm_rf('./tmp') 35 | end 36 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | - refactor JavaBridge 2 | - to_java, from_java 3 | - every type should have class 4 | - automatic registration 5 | - add Streaming 6 | - worker informations (time, memory, ...) 
7 | - killing zombie workers 8 | - add_rb, add_inline_rb to Spark::{Context, RDD} 9 | - fix broadcast for cluster 10 | - dump to disk if there is memory limit 11 | - Add Partitioner to RDD 12 | - add NonExist serializer 13 | -------------------------------------------------------------------------------- /benchmark/aggregate.rb: -------------------------------------------------------------------------------- 1 | require 'benchmark' 2 | require 'benchmark/ips' 3 | 4 | data = 0..1_000_000 5 | zero_value = rand(100_000) 6 | function = Proc.new{|sum, n| sum+n} 7 | 8 | Benchmark.ips do |r| 9 | r.report('each') do 10 | sum = zero_value 11 | data.each do |n| 12 | sum += n 13 | end 14 | end 15 | 16 | r.report('reduce') do 17 | data.reduce(zero_value){|sum, n| sum+n} 18 | end 19 | 20 | r.report('each with function') do 21 | sum = zero_value 22 | data.each do |n| 23 | sum = function.call(sum, n) 24 | end 25 | end 26 | 27 | r.report('reduce with function') do 28 | data.reduce(zero_value, &function) 29 | end 30 | 31 | r.compare! 32 | end 33 | 34 | -------------------------------------------------------------------------------- /benchmark/bisect.rb: -------------------------------------------------------------------------------- 1 | require "benchmark" 2 | 3 | def bisect_left1(a, x, opts={}) 4 | return nil if a.nil? 5 | return 0 if a.empty? 6 | 7 | lo = (opts[:lo] || opts[:low]).to_i 8 | hi = opts[:hi] || opts[:high] || a.length 9 | 10 | while lo < hi 11 | mid = (lo + hi) / 2 12 | v = a[mid] 13 | if v < x 14 | lo = mid + 1 15 | else 16 | hi = mid 17 | end 18 | end 19 | return lo 20 | end 21 | 22 | def bisect_left2(list, item) 23 | count = 0 24 | list.each{|i| 25 | return count if i >= item 26 | count += 1 27 | } 28 | nil 29 | end 30 | 31 | def bisect_left3(list, item, lo = 0, hi = list.size) 32 | while lo < hi 33 | i = (lo + hi - 1) >> 1 34 | 35 | if 0 <= (list[i] <=> item) 36 | hi = i 37 | else 38 | lo = i + 1 39 | end 40 | end 41 | return hi 42 | end 43 | 44 | array = Array.new(1000000) { rand(0..1000000) }; 45 | to_find = Array.new(500) { rand(0..10000) }; 46 | 47 | Benchmark.bm(20) do |x| 48 | x.report("bisect_left1") do 49 | to_find.each do |item| 50 | bisect_left1(array, item) 51 | end 52 | end 53 | 54 | x.report("bisect_left2") do 55 | to_find.each do |item| 56 | bisect_left2(array, item) 57 | end 58 | end 59 | 60 | x.report("bisect_left3") do 61 | to_find.each do |item| 62 | bisect_left3(array, item) 63 | end 64 | end 65 | end 66 | 67 | array = Array.new(100000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join }; 68 | to_find = Array.new(500) { (97+rand(26)).chr }; 69 | 70 | Benchmark.bm(20) do |x| 71 | x.report("bisect_left1") do 72 | to_find.each do |item| 73 | bisect_left1(array, item) 74 | end 75 | end 76 | 77 | x.report("bisect_left2") do 78 | to_find.each do |item| 79 | bisect_left2(array, item) 80 | end 81 | end 82 | 83 | x.report("bisect_left3") do 84 | to_find.each do |item| 85 | bisect_left3(array, item) 86 | end 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /benchmark/comparison/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current dir 4 | cd "$(dirname "$0")" 5 | 6 | # Exit immediately if a pipeline returns a non-zero status. 
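# (Strictly speaking, `set -e` aborts as soon as any command exits non-zero; add `set -o pipefail` as well if a failure in the middle of a pipeline should also abort the script.)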
7 | set -e 8 | 9 | # Spark 10 | wget "http://d3kbcqa49mib13.cloudfront.net/spark-1.3.0-bin-hadoop2.4.tgz" -O spark.tgz 11 | tar xvzf spark.tgz 12 | mv spark-1.3.0-bin-hadoop2.4 spark 13 | rm spark.tgz 14 | 15 | # RSpark (only for 1.3.0) 16 | git clone git@github.com:amplab-extras/SparkR-pkg.git rspark 17 | cd rspark 18 | SPARK_VERSION=1.3.0 ./install-dev.sh 19 | -------------------------------------------------------------------------------- /benchmark/comparison/r.r: -------------------------------------------------------------------------------- 1 | library(SparkR) 2 | sc <- sparkR.init(master="local[*]") 3 | 4 | logFile <- file(Sys.getenv("R_LOG"), "w") 5 | 6 | logInfo <- function(...){ 7 | args <- list(...) 8 | line <- paste(args, collapse = ";") 9 | writeLines(line, logFile) 10 | } 11 | 12 | workers <- as.integer(Sys.getenv('WORKERS')) 13 | numbersCount <- as.integer(Sys.getenv('NUMBERS_COUNT')) 14 | textFile <- Sys.getenv('TEXT_FILE') 15 | 16 | 17 | # ============================================================================= 18 | # Serialization 19 | # ============================================================================= 20 | 21 | time <- proc.time() 22 | rddNumbers <- parallelize(sc, as.numeric(seq(0, numbersCount)), workers) 23 | time <- as.double(proc.time()-time)[3] 24 | 25 | logInfo('NumbersSerialization', time) 26 | 27 | 28 | # ============================================================================= 29 | # Computing 30 | # ============================================================================= 31 | 32 | isPrime = function(x) { 33 | if(x < 2){ 34 | c(x, FALSE) 35 | } 36 | else if(x == 2){ 37 | c(x, TRUE) 38 | } 39 | else if(x %% 2 == 0){ 40 | c(x, FALSE) 41 | } 42 | else{ 43 | upper <- as.numeric(sqrt(as.double(x))) 44 | result <- TRUE 45 | 46 | i <- 3 47 | while(i <= upper){ 48 | if(x %% i == 0){ 49 | result = FALSE 50 | break 51 | } 52 | 53 | i <- i+2 54 | } 55 | 56 | c(x, result) 57 | } 58 | } 59 | 60 | time <- proc.time() 61 | rdd <- map(rddNumbers, isPrime) 62 | capture.output(collect(rdd), file='/dev/null') 63 | time <- as.double(proc.time()-time)[3] 64 | 65 | logInfo('IsPrime', time) 66 | 67 | 68 | close(logFile) 69 | sparkR.stop() 70 | -------------------------------------------------------------------------------- /benchmark/comparison/run-all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current dir 4 | cd "$(dirname "$0")" 5 | 6 | # Exit immediately if a pipeline returns a non-zero status. 
7 | set -e 8 | 9 | # Settings 10 | export WORKERS=2 11 | export MATRIX_SIZE=100 12 | export NUMBERS_COUNT=1000000 13 | export TEXT_FILE=$(mktemp) 14 | export PI_DIGIT=1000 15 | export RUBY_BATCH_SIZE=2048 16 | 17 | text_file_rows=10 18 | text_file_per_line=10 19 | text_file_duplicates=50 20 | 21 | mx="4096m" 22 | ms="4096m" 23 | 24 | 25 | # Parse arguments 26 | while (( "$#" )); do 27 | case $1 in 28 | --workers) 29 | WORKERS="$2" 30 | shift 31 | ;; 32 | --matrix-size) 33 | MATRIX_SIZE="$2" 34 | shift 35 | ;; 36 | --numbers-count) 37 | NUMBERS_COUNT="$2" 38 | shift 39 | ;; 40 | --random-file-rows) 41 | text_file_rows="$2" 42 | shift 43 | ;; 44 | --text-file-per-line) 45 | text_file_per_line="$2" 46 | shift 47 | ;; 48 | --text-file-duplicates) 49 | text_file_duplicates="$2" 50 | shift 51 | ;; 52 | --pi-digit) 53 | PI_DIGIT="$2" 54 | shift 55 | ;; 56 | --ruby-batch-size) 57 | RUBY_BATCH_SIZE="$2" 58 | shift 59 | ;; 60 | --mx) 61 | mx="$2" 62 | shift 63 | ;; 64 | --ms) 65 | ms="$2" 66 | shift 67 | ;; 68 | *) 69 | break 70 | ;; 71 | esac 72 | shift 73 | done 74 | 75 | 76 | # Generating 77 | file=$(mktemp) 78 | 79 | for (( i=0; i<$text_file_rows; i++ )) 80 | do 81 | shuf -n $text_file_per_line /usr/share/dict/words | tr '\n' ' ' >> $file 82 | echo >> $file 83 | done 84 | 85 | for (( i=0; i<$text_file_duplicates; i++ )) 86 | do 87 | cat $file >> $TEXT_FILE 88 | done 89 | 90 | 91 | # Before run 92 | if [[ -z "$SPARK_HOME" ]]; then 93 | export SPARK_HOME=$(pwd)/spark 94 | fi 95 | 96 | if [[ -z "$RSPARK_HOME" ]]; then 97 | export RSPARK_HOME=$(pwd)/rspark 98 | fi 99 | 100 | export SPARK_RUBY_BATCH_SIZE="$RUBY_BATCH_SIZE" 101 | SPARK_CLASSPATH=$($SPARK_HOME/bin/compute-classpath.sh 2>/dev/null) 102 | 103 | export _JAVA_OPTIONS="$_JAVA_OPTIONS -Xms$ms -Xmx$mx" 104 | 105 | 106 | # Log files 107 | export RUBY_MARSHAL_LOG=$(mktemp) 108 | export RUBY_OJ_LOG=$(mktemp) 109 | export PYTHON_LOG=$(mktemp) 110 | export SCALA_LOG=$(mktemp) 111 | export R_LOG=$(mktemp) 112 | 113 | 114 | # Run: 115 | echo "Workers: $WORKERS" 116 | echo "Matrix size: $MATRIX_SIZE" 117 | echo "Numbers count: $NUMBERS_COUNT" 118 | echo "Pi digits: $PI_DIGIT" 119 | echo "File: rows = $(($text_file_rows * $text_file_duplicates))" 120 | echo " per line = $text_file_per_line" 121 | 122 | # --- Ruby 123 | export SPARK_RUBY_SERIALIZER='marshal' 124 | export RUBY_LOG="$RUBY_MARSHAL_LOG" 125 | /usr/bin/env ruby ruby.rb &>/dev/null 126 | 127 | export SPARK_RUBY_SERIALIZER='oj' 128 | export RUBY_LOG="$RUBY_OJ_LOG" 129 | /usr/bin/env ruby ruby.rb &>/dev/null 130 | 131 | # # --- Python 132 | "$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/python.py &>/dev/null 133 | 134 | # # --- Scala 135 | /usr/bin/env scalac -cp $SPARK_CLASSPATH scala.scala -d scala.jar &>/dev/null 136 | "$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/scala.jar &>/dev/null 137 | 138 | # --- R 139 | # "$RSPARK_HOME"/sparkR r.r #&>/dev/null 140 | 141 | 142 | # Parse results 143 | echo "# Ruby (Marshal)" 144 | cat $RUBY_MARSHAL_LOG 145 | echo "" 146 | 147 | echo "# Ruby (Oj)" 148 | cat $RUBY_OJ_LOG 149 | echo "" 150 | 151 | echo "# Python" 152 | cat $PYTHON_LOG 153 | echo "" 154 | 155 | echo "# Scala" 156 | cat $SCALA_LOG 157 | echo "" 158 | 159 | echo "# R" 160 | cat $R_LOG 161 | -------------------------------------------------------------------------------- /benchmark/custom_marshal.rb: -------------------------------------------------------------------------------- 1 | require 'benchmark' 2 | require 'benchmark/ips' 3 | 4 | def pack_int(data) 5 | 
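# Array#pack directives used by these helpers: 'l>' = 32-bit signed big-endian, 'q>' = 64-bit signed big-endian, 'G' = double in network (big-endian) byte order; e.g. pack_int(1) returns the four bytes "\x00\x00\x00\x01".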
[data].pack('l>') 6 | end 7 | 8 | def pack_long(data) 9 | [data].pack('q>') 10 | end 11 | 12 | def pack_doubles(data) 13 | data.pack('G*') 14 | end 15 | 16 | module Standard 17 | class LabeledPoint 18 | def initialize(label, features) 19 | @label = label 20 | @features = Standard::Vector.new(features) 21 | end 22 | 23 | def marshal_dump 24 | [@label, @features] 25 | end 26 | 27 | def marshal_load(*) 28 | end 29 | end 30 | 31 | class Vector 32 | def initialize(array) 33 | @values = array 34 | end 35 | 36 | def marshal_dump 37 | [@values] 38 | end 39 | 40 | def marshal_load(*) 41 | end 42 | end 43 | end 44 | 45 | module Custom 46 | class LabeledPoint 47 | def initialize(label, features) 48 | @label = label 49 | @features = Custom::Vector.new(features) 50 | end 51 | 52 | def _dump(*) 53 | pack_long(@label) + @features._dump 54 | end 55 | 56 | def self._load(*) 57 | end 58 | end 59 | 60 | class Vector 61 | def initialize(array) 62 | @values = array 63 | end 64 | 65 | def _dump(*) 66 | result = 'v' 67 | result << pack_int(@values.size) 68 | result << pack_doubles(@values) 69 | result.encode(Encoding::ASCII_8BIT) 70 | end 71 | 72 | def self._load(*) 73 | end 74 | end 75 | end 76 | 77 | data_size = 10_000 78 | vector_size = 1_000 79 | values = Array.new(vector_size) { |x| rand(10_000..100_000) } 80 | 81 | @data1 = Array.new(data_size) {|i| Standard::LabeledPoint.new(i, values)} 82 | @data2 = Array.new(data_size) {|i| Custom::LabeledPoint.new(i, values)} 83 | 84 | Benchmark.ips do |r| 85 | r.report('standard') do 86 | Marshal.dump(@data1) 87 | end 88 | 89 | r.report('custom') do 90 | Marshal.dump(@data2) 91 | end 92 | 93 | r.compare! 94 | end 95 | -------------------------------------------------------------------------------- /benchmark/digest.rb: -------------------------------------------------------------------------------- 1 | lib = File.expand_path(File.dirname(__FILE__) + '/../lib') 2 | $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib) 3 | 4 | def java? 5 | RUBY_PLATFORM =~ /java/ 6 | end 7 | 8 | unless java? 9 | require 'murmurhash3' 10 | end 11 | 12 | require 'digest' 13 | require 'benchmark' 14 | require 'ruby-spark' 15 | 16 | TEST = 5_000_000 17 | WORDS = ["wefwefwef", "rgwefiwefwe", "a", "rujfwgrethrzjrhgawf", "irncrnuggo"] 18 | 19 | puts "TEST COUNT = #{TEST*WORDS.size}" 20 | 21 | # ================================================================================================= 22 | # Pure ruby mumrumur 23 | # funny-falcon/murmurhash3-ruby 24 | 25 | MASK32 = 0xffffffff 26 | 27 | def murmur3_32_rotl(x, r) 28 | ((x << r) | (x >> (32 - r))) & MASK32 29 | end 30 | 31 | def murmur3_32_fmix(h) 32 | h &= MASK32 33 | h ^= h >> 16 34 | h = (h * 0x85ebca6b) & MASK32 35 | h ^= h >> 13 36 | h = (h * 0xc2b2ae35) & MASK32 37 | h ^ (h >> 16) 38 | end 39 | 40 | def murmur3_32__mmix(k1) 41 | k1 = (k1 * 0xcc9e2d51) & MASK32 42 | k1 = murmur3_32_rotl(k1, 15) 43 | (k1 * 0x1b873593) & MASK32 44 | end 45 | 46 | def murmur3_32_str_hash(str, seed=0) 47 | h1 = seed 48 | numbers = str.unpack('V*C*') 49 | tailn = str.bytesize % 4 50 | tail = numbers.slice!(numbers.size - tailn, tailn) 51 | for k1 in numbers 52 | h1 ^= murmur3_32__mmix(k1) 53 | h1 = murmur3_32_rotl(h1, 13) 54 | h1 = (h1*5 + 0xe6546b64) & MASK32 55 | end 56 | 57 | unless tail.empty? 
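# Fold the 1-3 leftover tail bytes into a single 32-bit lane (little-endian) and mix it into h1 before finalization.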
58 | k1 = 0 59 | tail.reverse_each do |c1| 60 | k1 = (k1 << 8) | c1 61 | end 62 | h1 ^= murmur3_32__mmix(k1) 63 | end 64 | 65 | h1 ^= str.bytesize 66 | murmur3_32_fmix(h1) 67 | end 68 | 69 | 70 | # ================================================================================================= 71 | # Benchmark 72 | 73 | Benchmark.bm(18) do |x| 74 | 75 | x.report("ruby hash"){ 76 | TEST.times{ 77 | WORDS.each{ |word| 78 | word.hash 79 | } 80 | } 81 | } 82 | 83 | x.report("ext portable"){ 84 | TEST.times{ 85 | WORDS.each{ |word| 86 | Spark::Digest.portable_hash(word) 87 | } 88 | } 89 | } 90 | 91 | x.report("murmur3 32"){ 92 | TEST.times{ 93 | WORDS.each{ |word| 94 | # MurmurHash3::V128.str_hash(word) 95 | # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>") 96 | # MurmurHash3::V128.str_hash(word) 97 | # a = MurmurHash3::V32.str_hash(word).to_s 98 | # a.slice!(0,8) 99 | 100 | MurmurHash3::V32.str_hash(word) 101 | } 102 | } 103 | } unless java? 104 | 105 | # Too slow 106 | # x.report("murmur3 32 (ruby)"){ 107 | # TEST.times{ 108 | # WORDS.each{ |word| 109 | # # MurmurHash3::V128.str_hash(word) 110 | # # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>") 111 | # # MurmurHash3::V128.str_hash(word) 112 | # # a = murmur3_32_str_hash(word).to_s 113 | # # a.slice!(0,8) 114 | 115 | # murmur3_32_str_hash(word) 116 | # } 117 | # } 118 | # } 119 | 120 | x.report("murmur3 128"){ 121 | TEST.times{ 122 | WORDS.each{ |word| 123 | # MurmurHash3::V128.str_hash(word) 124 | # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>") 125 | # a = MurmurHash3::V128.str_hash(word).to_s 126 | # a.slice!(0,8) 127 | 128 | MurmurHash3::V128.str_hash(word) 129 | } 130 | } 131 | } unless java? 132 | 133 | # x.report("sha256"){ 134 | # TEST.times{ 135 | # WORDS.each{ |word| 136 | # a = Digest::SHA256.digest(word) 137 | # # a.slice!(0,8) 138 | # } 139 | # } 140 | # } 141 | 142 | # x.report("md5"){ 143 | # TEST.times{ 144 | # WORDS.each{ |word| 145 | # a = Digest::MD5.digest(word) 146 | # # a.slice!(0,8) 147 | # } 148 | # } 149 | # } 150 | end 151 | -------------------------------------------------------------------------------- /benchmark/enumerator.rb: -------------------------------------------------------------------------------- 1 | require "benchmark" 2 | 3 | class Enumerator 4 | def defer(&blk) 5 | self.class.new do |y| 6 | each do |*input| 7 | blk.call(y, *input) 8 | end 9 | end 10 | end 11 | end 12 | 13 | ARRAY_SIZE = 50_000_000 14 | 15 | def type_yield 16 | return to_enum(__callee__) unless block_given? 17 | 18 | ARRAY_SIZE.times { |i| 19 | yield i 20 | } 21 | end 22 | 23 | def yield_map_x2(enum) 24 | return to_enum(__callee__, enum) unless block_given? 
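# Standard idiom: with no block given, return a lazy Enumerator over this method, e.g. yield_map_x2([1, 2, 3].each).first(2) # => [2, 4]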
25 | 26 | enum.each do |item| 27 | yield item*2 28 | end 29 | end 30 | 31 | def type_enumerator_new 32 | Enumerator.new do |e| 33 | ARRAY_SIZE.times { |i| 34 | e << i 35 | } 36 | end 37 | end 38 | 39 | def enumerator_new_map_x2(enum) 40 | Enumerator.new do |e| 41 | enum.each do |item| 42 | e << item*2 43 | end 44 | end 45 | end 46 | 47 | def enumerator_defer_x2(enum) 48 | enum.defer do |out, inp| 49 | out << inp*2 50 | end 51 | end 52 | 53 | Benchmark.bm(26) do |x| 54 | x.report("yield max") do 55 | type_yield.max 56 | end 57 | 58 | x.report("yield sum") do 59 | type_yield.reduce(:+) 60 | end 61 | 62 | x.report("yield map x*2 sum") do 63 | yield_map_x2(type_yield).reduce(:+) 64 | end 65 | 66 | x.report("yield defer map x*2 sum") do 67 | enumerator_defer_x2(type_yield).reduce(:+) 68 | end 69 | 70 | x.report("-----"){} 71 | 72 | x.report("Enum.new max") do 73 | type_enumerator_new.max 74 | end 75 | 76 | x.report("Enum.new sum") do 77 | type_enumerator_new.reduce(:+) 78 | end 79 | 80 | x.report("Enum.new map x*2 sum") do 81 | enumerator_new_map_x2(type_enumerator_new).reduce(:+) 82 | end 83 | 84 | x.report("Enum.new defer map x*2 sum") do 85 | enumerator_defer_x2(type_enumerator_new).reduce(:+) 86 | end 87 | 88 | end 89 | -------------------------------------------------------------------------------- /benchmark/serializer.rb: -------------------------------------------------------------------------------- 1 | require "benchmark" 2 | require "yaml" 3 | require "msgpack" 4 | require "oj" 5 | # require "thrift" 6 | 7 | puts "Simple" 8 | 9 | data = (0..100000).to_a 10 | 11 | Benchmark.bmbm do |x| 12 | x.report("YAML") do 13 | serialized = YAML.dump(data) 14 | deserialized = YAML.load(serialized) 15 | puts "Size: #{serialized.size}, Equal: #{deserialized == data}" 16 | end 17 | 18 | x.report("Marshal") do 19 | serialized = Marshal.dump(data) 20 | deserialized = Marshal.load(serialized) 21 | puts "Size: #{serialized.size}, Equal: #{deserialized == data}" 22 | end 23 | 24 | x.report("MessagePack") do 25 | serialized = MessagePack.dump(data) 26 | deserialized = MessagePack.load(serialized) 27 | puts "Size: #{serialized.size}, Equal: #{deserialized == data}" 28 | end 29 | 30 | x.report("Oj") do 31 | serialized = Oj.dump(data) 32 | deserialized = Oj.load(serialized) 33 | puts "Size: #{serialized.size}, Equal: #{deserialized == data}" 34 | end 35 | 36 | # x.report("Thrift") do 37 | # serializer = Thrift::Serializer.new 38 | # deserializer = Thrift::Deserializer.new 39 | 40 | # serialized = serializer.serialize(data) 41 | # end 42 | end 43 | 44 | puts "" 45 | puts "More complex" 46 | 47 | data = Array.new(10000000) { 48 | [rand(97..122).chr, rand(10000000)] 49 | } 50 | 51 | Benchmark.bm do |x| 52 | # Take too long 53 | # x.report("YAML") do 54 | # serialized = YAML.dump(data) 55 | # YAML.load(serialized) 56 | # end 57 | 58 | x.report("Marshal") do 59 | serialized = Marshal.dump(data) 60 | deserialized = Marshal.load(serialized) 61 | puts " Size: #{serialized.size}, Equal: #{deserialized == data}" 62 | end 63 | 64 | x.report("MessagePack") do 65 | serialized = MessagePack.dump(data) 66 | deserialized = MessagePack.load(serialized) 67 | puts " Size: #{serialized.size}, Equal: #{deserialized == data}" 68 | end 69 | 70 | x.report("Oj") do 71 | serialized = Oj.dump(data) 72 | deserialized = Oj.load(serialized) 73 | puts " Size: #{serialized.size}, Equal: #{deserialized == data}" 74 | end 75 | 76 | # x.report("Thrift") do 77 | # serializer = Thrift::Serializer.new 78 | # deserializer = 
Thrift::Deserializer.new 79 | 80 | # serialized = serializer.serialize(data) 81 | # end 82 | end 83 | -------------------------------------------------------------------------------- /benchmark/sort.rb: -------------------------------------------------------------------------------- 1 | require "benchmark" 2 | 3 | array = [] 4 | 1000.times { 5 | array << {:bar => rand(1000)} 6 | } 7 | 8 | n = 500 9 | Benchmark.bm(20) do |x| 10 | x.report("sort") { n.times { array.sort{ |a,b| b[:bar] <=> a[:bar] } } } 11 | x.report("sort reverse") { n.times { array.sort{ |a,b| a[:bar] <=> b[:bar] }.reverse } } 12 | x.report("sort_by -a[:bar]") { n.times { array.sort_by{ |a| -a[:bar] } } } 13 | x.report("sort_by a[:bar]*-1") { n.times { array.sort_by{ |a| a[:bar]*-1 } } } 14 | x.report("sort_by.reverse!") { n.times { array.sort_by{ |a| a[:bar] }.reverse } } 15 | end 16 | 17 | 18 | array = Array.new(10000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join } 19 | 20 | Benchmark.bm(20) do |x| 21 | x.report("sort asc") { n.times { array.sort } } 22 | x.report("sort asc block") { n.times { array.sort{|a,b| a <=> b} } } 23 | x.report("sort desc") { n.times { array.sort{|a,b| b <=> a} } } 24 | x.report("sort asc reverse") { n.times { array.sort.reverse } } 25 | end 26 | 27 | 28 | key_value = Struct.new(:key, :value) do 29 | def <=>(other) 30 | key <=> other.key 31 | end 32 | end 33 | 34 | count = 10000 35 | item_range = 1000000 36 | array1 = Array.new(count) { [rand(item_range), rand(item_range)] } 37 | array2 = Array.new(count) { key_value.new rand(item_range), rand(item_range) } 38 | 39 | Benchmark.bm(20) do |x| 40 | x.report("sort_by") { n.times { array1.sort_by {|a| a[0]} } } 41 | x.report("sort struct") { n.times { array2.sort } } 42 | end 43 | 44 | -------------------------------------------------------------------------------- /benchmark/take.rb: -------------------------------------------------------------------------------- 1 | require "benchmark" 2 | 3 | SIZE = 100_000_000 4 | 5 | @array1 = (0..SIZE).to_a; 6 | @array2 = (0..SIZE).to_a; 7 | @array3 = (0..SIZE).to_a; 8 | 9 | TAKE = 100_000 10 | 11 | Benchmark.bm(15) do |x| 12 | # Fastest 13 | x.report("take"){ 14 | a=@array1.take(TAKE) 15 | } 16 | 17 | # Slowest and take most memory 18 | x.report("reverse drop"){ 19 | @array2.reverse! 20 | @array2.drop(@array2.size - TAKE) 21 | @array2.reverse! 
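# (both reverse! calls above mutate @array2 in place, which is why the slice! benchmark below uses the untouched @array3)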
22 | } 23 | 24 | # Least memory (uses the untouched @array3; @array2 was already mutated above) 25 | x.report("slice!"){ 26 | a=@array3.slice!(0, TAKE) 27 | } 28 | end 29 | -------------------------------------------------------------------------------- /bin/ruby-spark: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | lib = File.expand_path(File.dirname(__FILE__) + '/../lib') 4 | $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib) 5 | 6 | require 'ruby-spark' 7 | 8 | Spark::CLI.new.run 9 | -------------------------------------------------------------------------------- /example/pi.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | lib = File.expand_path(File.dirname(__FILE__) + '/../lib') 4 | $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib) 5 | 6 | require 'ruby-spark' 7 | 8 | Spark.logger.disable 9 | Spark.start 10 | 11 | slices = 3 12 | n = 100000 * slices 13 | 14 | def map(_) 15 | x = rand * 2 - 1 16 | y = rand * 2 - 1 17 | 18 | if x**2 + y**2 < 1 19 | return 1 20 | else 21 | return 0 22 | end 23 | end 24 | 25 | rdd = Spark.context.parallelize(1..n, slices) 26 | rdd = rdd.map(method(:map)) 27 | 28 | puts 'Pi is roughly %f' % (4.0 * rdd.sum / n) 29 | -------------------------------------------------------------------------------- /example/website_search.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # Parse a sitemap and search for a word on every page 4 | 5 | require 'optparse' 6 | require 'open-uri' 7 | require 'nokogiri' 8 | require 'ruby-spark' 9 | 10 | options = { 11 | sitemap: 'http://fit.cvut.cz/sitemap.xml', 12 | query: 'cvut', 13 | workers: 2 14 | } 15 | 16 | opt_parser = OptionParser.new do |opts| 17 | opts.banner = 'Usage: website_search.rb [options]' 18 | 19 | opts.separator '' 20 | opts.separator 'Specific options:' 21 | 22 | opts.on('-s', '--sitemap SITEMAP', 'Sitemap URL') do |sitemap| 23 | options[:sitemap] = sitemap 24 | end 25 | 26 | opts.on('-q', '--query QUERY', 'Query to search') do |query| 27 | options[:query] = query 28 | end 29 | 30 | opts.on('-w', '--workers WORKERS_NUM', Integer, 'Number of workers') do |workers| 31 | options[:workers] = workers 32 | end 33 | 34 | opts.on('--quiet', 'Run quietly') do |v| 35 | Spark.logger.disable 36 | end 37 | 38 | opts.on_tail('-h', '--help', 'Show this message') do 39 | puts opts 40 | exit 41 | end 42 | end 43 | 44 | opt_parser.parse!
45 | 46 | @links = [] 47 | 48 | def parse_sitemap(doc) 49 | doc.xpath('//sitemapindex/sitemap/loc').each do |loc| 50 | next_doc = Nokogiri::HTML(open(loc.text)) 51 | parse_sitemap(next_doc) 52 | end 53 | 54 | doc.xpath('//url/loc').each do |loc| 55 | @links << loc.text 56 | end 57 | end 58 | 59 | doc = Nokogiri::HTML(open(options[:sitemap])) 60 | parse_sitemap(doc) 61 | 62 | # Map function 63 | func = Proc.new do |url| 64 | begin 65 | open(url) {|f| 66 | [url, f.read.scan(query).count] 67 | } 68 | rescue 69 | [url, 0] 70 | end 71 | end 72 | 73 | Spark.start 74 | 75 | rdd = Spark.sc.parallelize(@links, options[:workers]) 76 | .add_library('open-uri') 77 | .bind(query: options[:query]) 78 | .map(func) 79 | .sort_by(lambda{|(_, value)| value}, false) 80 | 81 | rdd.collect.each do |(url, count)| 82 | puts "#{url} => #{count}" 83 | end 84 | -------------------------------------------------------------------------------- /ext/ruby_c/extconf.rb: -------------------------------------------------------------------------------- 1 | require 'mkmf' 2 | 3 | create_makefile("ruby_spark_ext") 4 | -------------------------------------------------------------------------------- /ext/ruby_c/murmur.h: -------------------------------------------------------------------------------- 1 | #ifndef MURMUR_INCLUDED 2 | #define MURMUR_INCLUDED 3 | 4 | #include "ruby.h" 5 | 6 | VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass); 7 | VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /ext/ruby_c/ruby-spark.c: -------------------------------------------------------------------------------- 1 | #include "ruby.h" 2 | #include "murmur.h" 3 | 4 | 5 | VALUE SparkModule; 6 | VALUE SparkDigestModule; 7 | VALUE SparkDigestMurmur2Class; 8 | 9 | 10 | void Init_ruby_spark_ext() 11 | { 12 | SparkModule = rb_define_module("Spark"); 13 | SparkDigestModule = rb_define_module_under(SparkModule, "Digest"); 14 | SparkDigestMurmur2Class = rb_define_class_under(SparkDigestModule, "Murmur2", rb_cObject); 15 | 16 | rb_define_singleton_method(SparkDigestModule, "portable_hash", method_portable_hash, -1); 17 | rb_define_singleton_method(SparkDigestMurmur2Class, "digest", method_murmur2_digest, -1); 18 | } 19 | -------------------------------------------------------------------------------- /ext/ruby_java/Digest.java: -------------------------------------------------------------------------------- 1 | import org.jruby.Ruby; 2 | import org.jruby.RubyModule; 3 | import org.jruby.RubyObject; 4 | import org.jruby.RubyClass; 5 | import org.jruby.RubyString; 6 | import org.jruby.RubyFixnum; 7 | import org.jruby.anno.JRubyModule; 8 | import org.jruby.anno.JRubyMethod; 9 | import org.jruby.runtime.ThreadContext; 10 | import org.jruby.runtime.builtin.IRubyObject; 11 | 12 | @JRubyModule(name="Spark::Digest") 13 | public class Digest extends RubyObject{ 14 | 15 | // Have to be the same as in C extension 16 | final static long PORTABLE_HASH_SEED = 16154832; 17 | 18 | public Digest(final Ruby ruby, RubyClass rubyClass) { 19 | super(ruby, rubyClass); 20 | } 21 | 22 | @JRubyMethod(module=true) 23 | public static IRubyObject portable_hash(ThreadContext context, IRubyObject self, IRubyObject arg) { 24 | Ruby ruby = self.getRuntime(); 25 | 26 | RubyString keyString = (RubyString)arg; 27 | 28 | long hash = Murmur2.hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), PORTABLE_HASH_SEED); 29 | 30 | RubyFixnum result = new 
RubyFixnum(ruby, hash); 31 | 32 | return result; 33 | } 34 | 35 | } 36 | 37 | -------------------------------------------------------------------------------- /ext/ruby_java/Murmur2.java: -------------------------------------------------------------------------------- 1 | import org.jruby.Ruby; 2 | import org.jruby.RubyClass; 3 | import org.jruby.RubyObject; 4 | import org.jruby.RubyString; 5 | import org.jruby.RubyFixnum; 6 | import org.jruby.anno.JRubyClass; 7 | import org.jruby.anno.JRubyMethod; 8 | import org.jruby.runtime.ThreadContext; 9 | import org.jruby.runtime.builtin.IRubyObject; 10 | 11 | /** Murmur hash 2.0. 12 | * 13 | * The murmur hash is a relative fast hash function from 14 | * http://murmurhash.googlepages.com/ for platforms with efficient 15 | * multiplication. 16 | * 17 | * http://d3s.mff.cuni.cz/~holub/sw/javamurmurhash/ 18 | * 19 | */ 20 | 21 | @JRubyClass(name="Spark::Digest::Murmur2") 22 | public class Murmur2 extends RubyObject { 23 | 24 | public Murmur2(final Ruby ruby, RubyClass rubyClass) { 25 | super(ruby, rubyClass); 26 | } 27 | 28 | @JRubyMethod(required=1, optional=1, module=true) 29 | public static IRubyObject digest(ThreadContext context, IRubyObject self, IRubyObject[] args) { 30 | Ruby ruby = context.getRuntime(); 31 | 32 | RubyString keyString = (RubyString)args[0]; 33 | long seed; 34 | 35 | if(args.length > 1){ 36 | RubyFixnum rb_seed = (RubyFixnum)args[1]; 37 | seed = rb_seed.getLongValue(); 38 | } 39 | else{ 40 | seed = 0; 41 | } 42 | 43 | long hash = hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), seed); 44 | 45 | RubyFixnum result = new RubyFixnum(ruby, hash); 46 | return result; 47 | } 48 | 49 | 50 | /** Generates 64 bit hash from byte array of the given length and seed. 51 | * 52 | * @param data byte array to hash 53 | * @param length length of the array to hash 54 | * @param seed initial seed value 55 | * @return 64 bit hash of the given array 56 | */ 57 | public static long hash64(final byte[] data, int length, long seed) { 58 | final long m = 0xc6a4a7935bd1e995L; 59 | final int r = 47; 60 | 61 | long h = (seed&0xffffffffl)^(length*m); 62 | 63 | int length8 = length/8; 64 | 65 | for (int i=0; i<length8; i++) { 66 | final int i8 = i*8; 67 | long k = ((long)data[i8+0]&0xff) +(((long)data[i8+1]&0xff)<<8) 68 | +(((long)data[i8+2]&0xff)<<16) +(((long)data[i8+3]&0xff)<<24) 69 | +(((long)data[i8+4]&0xff)<<32) +(((long)data[i8+5]&0xff)<<40) 70 | +(((long)data[i8+6]&0xff)<<48) +(((long)data[i8+7]&0xff)<<56); 71 | 72 | k *= m; 73 | k ^= k >>> r; 74 | k *= m; 75 | 76 | h ^= k; 77 | h *= m; 78 | } 79 | 80 | switch (length%8) { 81 | case 7: h ^= (long)(data[(length&~7)+6]&0xff) << 48; 82 | case 6: h ^= (long)(data[(length&~7)+5]&0xff) << 40; 83 | case 5: h ^= (long)(data[(length&~7)+4]&0xff) << 32; 84 | case 4: h ^= (long)(data[(length&~7)+3]&0xff) << 24; 85 | case 3: h ^= (long)(data[(length&~7)+2]&0xff) << 16; 86 | case 2: h ^= (long)(data[(length&~7)+1]&0xff) << 8; 87 | case 1: h ^= (long)(data[length&~7]&0xff); 88 | h *= m; 89 | }; 90 | 91 | h ^= h >>> r; 92 | h *= m; 93 | h ^= h >>> r; 94 | 95 | return h; 96 | } 97 | 98 | } 99 | -------------------------------------------------------------------------------- /ext/ruby_java/RubySparkExtService.java: -------------------------------------------------------------------------------- 1 | import org.jruby.Ruby; 2 | import org.jruby.RubyClass; 3 | import org.jruby.RubyModule; 4 | import org.jruby.runtime.ObjectAllocator; 5 | import org.jruby.runtime.builtin.IRubyObject; 6 | import org.jruby.runtime.load.BasicLibraryService; 7 | 8 | public class RubySparkExtService implements BasicLibraryService 9 | { 10 | public boolean basicLoad(final Ruby ruby) throws java.io.IOException { 11 | 12 | RubyModule sparkModule = ruby.defineModule("Spark"); 13 | RubyModule sparkDigestModule = sparkModule.defineModuleUnder("Digest"); 14 |
RubyClass sparkDigestMurmur2Class = sparkDigestModule.defineClassUnder("Murmur2", ruby.getObject(), sparkDigestMurmur2Allocator); 15 | 16 | sparkDigestModule.defineAnnotatedMethods(Digest.class); 17 | sparkDigestMurmur2Class.defineAnnotatedMethods(Murmur2.class); 18 | 19 | return true; 20 | } 21 | 22 | public static ObjectAllocator sparkDigestMurmur2Allocator = new ObjectAllocator() { 23 | public IRubyObject allocate(Ruby ruby, RubyClass rubyClass) { 24 | return new Murmur2(ruby, rubyClass); 25 | } 26 | }; 27 | 28 | } 29 | -------------------------------------------------------------------------------- /ext/ruby_java/extconf.rb: -------------------------------------------------------------------------------- 1 | require 'mkmf' 2 | 3 | create_makefile("ruby_spark_ext") 4 | -------------------------------------------------------------------------------- /ext/spark/build.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | // Default values 6 | val defaultScalaVersion = "2.10.4" 7 | val defaultSparkVersion = "1.6.0" 8 | val defaultSparkCoreVersion = "2.10" 9 | val defaultTargetDir = "target" 10 | val defaultHadoopVersion = "1.0.4" 11 | 12 | // Values 13 | val _hadoopVersion = scala.util.Properties.envOrElse("HADOOP_VERSION", defaultHadoopVersion) 14 | val _scalaVersion = scala.util.Properties.envOrElse("SCALA_VERSION", defaultScalaVersion) 15 | val _sparkVersion = scala.util.Properties.envOrElse("SPARK_VERSION", defaultSparkVersion) 16 | val _sparkCoreVersion = scala.util.Properties.envOrElse("SPARK_CORE_VERSION", defaultSparkCoreVersion) 17 | val _targetDir = scala.util.Properties.envOrElse("TARGET_DIR", defaultTargetDir) 18 | 19 | // Project settings 20 | name := "ruby-spark" 21 | 22 | version := "1.0.0" 23 | 24 | scalaVersion := _scalaVersion 25 | 26 | javacOptions ++= Seq("-source", "1.7", "-target", "1.7") 27 | 28 | // Jar target folder 29 | artifactPath in Compile in packageBin := file(s"${_targetDir}/ruby-spark.jar") 30 | outputPath in packageDependency := file(s"${_targetDir}/ruby-spark-deps.jar") 31 | 32 | // Protocol buffer support 33 | seq(sbtprotobuf.ProtobufPlugin.protobufSettings: _*) 34 | 35 | // Additional libraries 36 | libraryDependencies ++= Seq( 37 | "org.apache.spark" %% "spark-core" % _sparkVersion excludeAll(ExclusionRule(organization = "org.apache.hadoop")), 38 | "org.apache.spark" %% "spark-graphx" % _sparkVersion, 39 | "org.apache.spark" %% "spark-mllib" % _sparkVersion, 40 | "org.apache.spark" %% "spark-sql" % _sparkVersion, 41 | "org.apache.hadoop" % "hadoop-client" % _hadoopVersion, 42 | "com.github.fommil.netlib" % "all" % "1.1.2", 43 | "org.scalatest" % "scalatest_2.10" % "2.2.1" % "test" 44 | ) 45 | 46 | // Repositories 47 | resolvers ++= Seq( 48 | "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", 49 | "Spray Repository" at "http://repo.spray.io/", 50 | "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", 51 | "Akka Repository" at "http://repo.akka.io/releases/", 52 | "Twitter4J Repository" at "http://twitter4j.org/maven2/", 53 | "Apache HBase" at "https://repository.apache.org/content/repositories/releases", 54 | "Twitter Maven Repo" at "http://maven.twttr.com/", 55 | "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", 56 | "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", 57 | "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", 
58 | "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", 59 | Resolver.sonatypeRepo("public") 60 | ) 61 | 62 | // Merge strategy 63 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => 64 | { 65 | case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard 66 | case m if m.startsWith("META-INF") => MergeStrategy.discard 67 | case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first 68 | case PathList("org", "apache", xs @ _*) => MergeStrategy.first 69 | case PathList("org", "jboss", xs @ _*) => MergeStrategy.first 70 | case "about.html" => MergeStrategy.rename 71 | case "reference.conf" => MergeStrategy.concat 72 | case _ => MergeStrategy.first 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /ext/spark/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) 2 | 3 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" 4 | 5 | resolvers += "Spray Repository" at "http://repo.spray.io/" 6 | 7 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2") 8 | 9 | addSbtPlugin("com.github.gseitz" % "sbt-protobuf" % "0.3.3") 10 | -------------------------------------------------------------------------------- /ext/spark/sbt/sbt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script launches sbt for this project. If present it uses the system 4 | # version of sbt. If there is no system version of sbt it attempts to download 5 | # sbt locally. 6 | SBT_VERSION=0.13.9 7 | URL1=http://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 8 | URL2=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 9 | JAR=sbt/sbt-launch-${SBT_VERSION}.jar 10 | 11 | # Download sbt launch jar if it hasn't been downloaded yet 12 | if [ ! -f ${JAR} ]; then 13 | # Download 14 | printf "Attempting to fetch sbt\n" 15 | JAR_DL=${JAR}.part 16 | if hash wget 2>/dev/null; then 17 | (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} 18 | elif hash curl 2>/dev/null; then 19 | (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR} 20 | else 21 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" 22 | exit -1 23 | fi 24 | fi 25 | if [ ! -f ${JAR} ]; then 26 | # We failed to download 27 | printf "Our attempt to download sbt locally to ${JAR} failed. 
Please install sbt manually from http://www.scala-sbt.org/\n" 28 | exit -1 29 | fi 30 | printf "Launching sbt from ${JAR}\n" 31 | java \ 32 | -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \ 33 | -jar ${JAR} \ 34 | "$@" 35 | -------------------------------------------------------------------------------- /ext/spark/src/main/scala/Exec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.api.ruby 2 | 3 | import java.io.{File, FileOutputStream, InputStreamReader, BufferedReader} 4 | 5 | import scala.collection.JavaConversions._ 6 | 7 | import org.apache.spark.{SparkEnv, Logging} 8 | import org.apache.spark.util._ 9 | 10 | 11 | /* ================================================================================================= 12 | * class FileCommand 13 | * ================================================================================================= 14 | * 15 | * Saves the command to a file and then executes it, because from Scala you cannot simply run 16 | * something like "bash --norc -i -c 'source .zshrc; ruby master.rb'" 17 | */ 18 | 19 | class FileCommand(command: String) extends Logging { 20 | 21 | var pb: ProcessBuilder = null 22 | var file: File = null 23 | 24 | // The command is already complete. 25 | def this(command: String, env: SparkEnv) = { 26 | this(command) 27 | create(env) 28 | } 29 | 30 | // The template must contain %s, which will be replaced with the command 31 | def this(template: String, command: String, env: SparkEnv, envVars: Map[String, String]) = { 32 | this(template.format(command), env) 33 | setEnvVars(envVars) 34 | } 35 | 36 | private def create(env: SparkEnv) { 37 | val dir = new File(env.sparkFilesDir) 38 | val ext = if(Utils.isWindows) ".cmd" else ".sh" 39 | val shell = if(Utils.isWindows) "cmd" else "bash" 40 | 41 | file = File.createTempFile("command", ext, dir) 42 | 43 | val out = new FileOutputStream(file) 44 | out.write(command.getBytes) 45 | out.close 46 | 47 | logInfo(s"New FileCommand at ${file.getAbsolutePath}") 48 | 49 | pb = new ProcessBuilder(shell, file.getAbsolutePath) 50 | } 51 | 52 | def setEnvVars(vars: Map[String, String]) { 53 | pb.environment().putAll(vars) 54 | } 55 | 56 | def run = { 57 | new ExecutedFileCommand(pb.start) 58 | } 59 | } 60 | 61 | 62 | /* ================================================================================================= 63 | * class ExecutedFileCommand 64 | * ================================================================================================= 65 | * 66 | * Represents a process executed from the file.
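 * readLine lazily opens a BufferedReader over the process stdout; destroy, getInputStream and getErrorStream delegate to the underlying Process.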
67 | */ 68 | 69 | class ExecutedFileCommand(process: Process) { 70 | 71 | var reader: BufferedReader = null 72 | 73 | def readLine = { 74 | openInput 75 | reader.readLine.toString.trim 76 | } 77 | 78 | def openInput { 79 | if(reader != null){ 80 | return 81 | } 82 | 83 | val input = new InputStreamReader(process.getInputStream) 84 | reader = new BufferedReader(input) 85 | } 86 | 87 | // Delegation 88 | def destroy = process.destroy 89 | def getInputStream = process.getInputStream 90 | def getErrorStream = process.getErrorStream 91 | } 92 | -------------------------------------------------------------------------------- /ext/spark/src/main/scala/MLLibAPI.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.api.python 2 | 3 | // PythonMLLibAPI is private for python 4 | class MLLibAPI extends PythonMLLibAPI {} 5 | -------------------------------------------------------------------------------- /ext/spark/src/main/scala/Marshal.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.api.ruby.marshal 2 | 3 | import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream} 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | import scala.collection.JavaConverters._ 7 | 8 | 9 | /* ================================================================================================= 10 | * object Marshal 11 | * ================================================================================================= 12 | */ 13 | object Marshal { 14 | def load(bytes: Array[Byte]) = { 15 | val is = new DataInputStream(new ByteArrayInputStream(bytes)) 16 | 17 | val majorVersion = is.readUnsignedByte // 4 18 | val minorVersion = is.readUnsignedByte // 8 19 | 20 | (new MarshalLoad(is)).load 21 | } 22 | 23 | def dump(data: Any) = { 24 | val aos = new ByteArrayOutputStream 25 | val os = new DataOutputStream(aos) 26 | 27 | os.writeByte(4) 28 | os.writeByte(8) 29 | 30 | (new MarshalDump(os)).dump(data) 31 | aos.toByteArray 32 | } 33 | } 34 | 35 | 36 | /* ================================================================================================= 37 | * class IterableMarshaller 38 | * ================================================================================================= 39 | */ 40 | class IterableMarshaller(iter: Iterator[Any]) extends Iterator[Array[Byte]] { 41 | private val buffer = new ArrayBuffer[Any] 42 | 43 | override def hasNext: Boolean = iter.hasNext 44 | 45 | override def next(): Array[Byte] = { 46 | while (iter.hasNext) { 47 | buffer += iter.next() 48 | } 49 | 50 | Marshal.dump(buffer) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /ext/spark/src/main/scala/MarshalDump.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.api.ruby.marshal 2 | 3 | import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream} 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | import scala.collection.JavaConverters._ 7 | import scala.reflect.{ClassTag, classTag} 8 | 9 | import org.apache.spark.mllib.regression.LabeledPoint 10 | import org.apache.spark.mllib.linalg.{Vector, DenseVector, SparseVector} 11 | 12 | 13 | /* ================================================================================================= 14 | * class MarshalDump 15 | * 
================================================================================================= 16 | */ 17 | class MarshalDump(os: DataOutputStream) { 18 | 19 | val NAN_BYTELIST = "nan".getBytes 20 | val NEGATIVE_INFINITY_BYTELIST = "-inf".getBytes 21 | val INFINITY_BYTELIST = "inf".getBytes 22 | 23 | def dump(data: Any) { 24 | data match { 25 | case null => 26 | os.writeByte('0') 27 | 28 | case item: Boolean => 29 | val char = if(item) 'T' else 'F' 30 | os.writeByte(char) 31 | 32 | case item: Int => 33 | os.writeByte('i') 34 | dumpInt(item) 35 | 36 | case item: Array[_] => 37 | os.writeByte('[') 38 | dumpArray(item) 39 | 40 | case item: Double => 41 | os.writeByte('f') 42 | dumpFloat(item) 43 | 44 | case item: ArrayBuffer[Any] => dump(item.toArray) 45 | } 46 | } 47 | 48 | def dumpInt(data: Int) { 49 | if(data == 0){ 50 | os.writeByte(0) 51 | } 52 | else if (0 < data && data < 123) { 53 | os.writeByte(data + 5) 54 | } 55 | else if (-124 < data && data < 0) { 56 | os.writeByte((data - 5) & 0xff) 57 | } 58 | else { 59 | val buffer = new Array[Byte](4) 60 | var value = data 61 | 62 | var i = 0 63 | while(i != 4 && value != 0 && value != -1){ 64 | buffer(i) = (value & 0xff).toByte 65 | value = value >> 8 66 | 67 | i += 1 68 | } 69 | val length = i + 1 70 | if(value < 0){ 71 | os.writeByte(-length) 72 | } 73 | else{ 74 | os.writeByte(length) 75 | } 76 | os.write(buffer, 0, length) 77 | } 78 | } 79 | 80 | def dumpArray(array: Array[_]) { 81 | dumpInt(array.size) 82 | 83 | for(item <- array) { 84 | dump(item) 85 | } 86 | } 87 | 88 | def dumpFloat(value: Double) { 89 | if(value.isPosInfinity){ 90 | dumpString(INFINITY_BYTELIST) 91 | } 92 | else if(value.isNegInfinity){ 93 | dumpString(NEGATIVE_INFINITY_BYTELIST) 94 | } 95 | else if(value.isNaN){ 96 | dumpString(NAN_BYTELIST) 97 | } 98 | else{ 99 | // dumpString("%.17g".format(value)) 100 | dumpString(value.toString) 101 | } 102 | } 103 | 104 | def dumpString(data: String) { 105 | dumpString(data.getBytes) 106 | } 107 | 108 | def dumpString(data: Array[Byte]) { 109 | dumpInt(data.size) 110 | os.write(data) 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /ext/spark/src/main/scala/RubyAccumulatorParam.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.api.ruby 2 | 3 | import java.io._ 4 | import java.net._ 5 | import java.util.{List, ArrayList} 6 | 7 | import scala.collection.JavaConversions._ 8 | import scala.collection.immutable._ 9 | 10 | import org.apache.spark._ 11 | import org.apache.spark.util.Utils 12 | 13 | /** 14 | * Internal class that acts as an `AccumulatorParam` for Ruby accumulators. Inside, it 15 | * collects a list of pickled strings that we pass to Ruby through a socket.
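 * On the driver, addInPlace forwards the collected updates over that socket as an item count followed by length-prefixed byte arrays; on worker nodes it only merges the update lists locally.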
16 | */ 17 | private class RubyAccumulatorParam(serverHost: String, serverPort: Int) 18 | extends AccumulatorParam[List[Array[Byte]]] { 19 | 20 | // Utils.checkHost(serverHost, "Expected hostname") 21 | 22 | val bufferSize = SparkEnv.get.conf.getInt("spark.buffer.size", 65536) 23 | 24 | // Socket shoudl not be serialized 25 | // Otherwise: SparkException: Task not serializable 26 | @transient var socket: Socket = null 27 | @transient var socketOutputStream: DataOutputStream = null 28 | @transient var socketInputStream: DataInputStream = null 29 | 30 | def openSocket(){ 31 | synchronized { 32 | if (socket == null || socket.isClosed) { 33 | socket = new Socket(serverHost, serverPort) 34 | 35 | socketInputStream = new DataInputStream(new BufferedInputStream(socket.getInputStream, bufferSize)) 36 | socketOutputStream = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream, bufferSize)) 37 | } 38 | } 39 | } 40 | 41 | override def zero(value: List[Array[Byte]]): List[Array[Byte]] = new ArrayList 42 | 43 | override def addInPlace(val1: List[Array[Byte]], val2: List[Array[Byte]]) : List[Array[Byte]] = synchronized { 44 | if (serverHost == null) { 45 | // This happens on the worker node, where we just want to remember all the updates 46 | val1.addAll(val2) 47 | val1 48 | } else { 49 | // This happens on the master, where we pass the updates to Ruby through a socket 50 | openSocket() 51 | 52 | socketOutputStream.writeInt(val2.size) 53 | for (array <- val2) { 54 | socketOutputStream.writeInt(array.length) 55 | socketOutputStream.write(array) 56 | } 57 | socketOutputStream.flush() 58 | 59 | // Wait for acknowledgement 60 | // http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock 61 | // 62 | // if(in.readInt() != RubyConstant.ACCUMULATOR_ACK){ 63 | // throw new SparkException("Accumulator was not acknowledged") 64 | // } 65 | 66 | new ArrayList 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /ext/spark/src/main/scala/RubyBroadcast.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.api.ruby 2 | 3 | import org.apache.spark.api.python.PythonBroadcast 4 | 5 | /** 6 | * An Wrapper for Ruby Broadcast, which is written into disk by Ruby. It also will 7 | * write the data into disk after deserialization, then Ruby can read it from disks. 
8 | * 9 | * Class use Python logic - only for semantic 10 | */ 11 | class RubyBroadcast(@transient var _path: String, @transient var id: java.lang.Long) extends PythonBroadcast(_path) { 12 | 13 | } 14 | -------------------------------------------------------------------------------- /ext/spark/src/main/scala/RubyConstant.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.api.ruby 2 | 3 | object RubyConstant { 4 | val DATA_EOF = -2 5 | val WORKER_ERROR = -1 6 | val WORKER_DONE = 0 7 | val CREATE_WORKER = 1 8 | val KILL_WORKER = 2 9 | val KILL_WORKER_AND_WAIT = 3 10 | val SUCCESSFULLY_KILLED = 4 11 | val UNSUCCESSFUL_KILLING = 5 12 | val ACCUMULATOR_ACK = 6 13 | } 14 | -------------------------------------------------------------------------------- /ext/spark/src/main/scala/RubyMLLibAPI.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.api.ruby 2 | 3 | import java.util.ArrayList 4 | 5 | import scala.collection.JavaConverters._ 6 | 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.api.java.JavaRDD 9 | import org.apache.spark.mllib.linalg._ 10 | import org.apache.spark.mllib.regression.LabeledPoint 11 | import org.apache.spark.mllib.classification.NaiveBayes 12 | import org.apache.spark.mllib.clustering.GaussianMixtureModel 13 | import org.apache.spark.mllib.stat.distribution.MultivariateGaussian 14 | import org.apache.spark.mllib.api.python.MLLibAPI 15 | 16 | 17 | class RubyMLLibAPI extends MLLibAPI { 18 | // trainLinearRegressionModelWithSGD 19 | // trainLassoModelWithSGD 20 | // trainRidgeModelWithSGD 21 | // trainLogisticRegressionModelWithSGD 22 | // trainLogisticRegressionModelWithLBFGS 23 | // trainSVMModelWithSGD 24 | // trainKMeansModel 25 | // trainGaussianMixtureModel 26 | 27 | // Rjb have a problem with theta: Array[Array[Double]] 28 | override def trainNaiveBayesModel(data: JavaRDD[LabeledPoint], lambda: Double) = { 29 | val model = NaiveBayes.train(data.rdd, lambda) 30 | 31 | List( 32 | Vectors.dense(model.labels), 33 | Vectors.dense(model.pi), 34 | model.theta.toSeq 35 | ).map(_.asInstanceOf[Object]).asJava 36 | } 37 | 38 | // On python is wt just Object 39 | def predictSoftGMM( 40 | data: JavaRDD[Vector], 41 | wt: ArrayList[Object], 42 | mu: ArrayList[Object], 43 | si: ArrayList[Object]): RDD[Array[Double]] = { 44 | 45 | // val weight = wt.asInstanceOf[Array[Double]] 46 | val weight = wt.toArray.map(_.asInstanceOf[Double]) 47 | val mean = mu.toArray.map(_.asInstanceOf[DenseVector]) 48 | val sigma = si.toArray.map(_.asInstanceOf[DenseMatrix]) 49 | val gaussians = Array.tabulate(weight.length){ 50 | i => new MultivariateGaussian(mean(i), sigma(i)) 51 | } 52 | val model = new GaussianMixtureModel(weight, gaussians) 53 | model.predictSoft(data) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /ext/spark/src/main/scala/RubyMLLibUtilAPI.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.api.ruby 2 | 3 | import java.util.ArrayList 4 | 5 | import org.apache.spark.mllib.util.LinearDataGenerator 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | 8 | object RubyMLLibUtilAPI { 9 | 10 | // Ruby does have a problem with creating Array[Double] 11 | def generateLinearInput( 12 | intercept: Double, 13 | weights: ArrayList[String], 14 | nPoints: Int, 15 | seed: Int, 16 | eps: Double = 0.1): Seq[LabeledPoint] = { 
17 | 18 | LinearDataGenerator.generateLinearInput(intercept, weights.toArray.map(_.toString.toDouble), nPoints, seed, eps) 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /ext/spark/src/main/scala/RubyPage.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ui.ruby 2 | 3 | // import javax.servlet.http.HttpServletRequest 4 | 5 | // import scala.xml.Node 6 | 7 | // import org.apache.spark.ui.{WebUIPage, UIUtils} 8 | // import org.apache.spark.util.Utils 9 | 10 | // private[ui] class RubyPage(parent: RubyTab, rbConfig: Array[Tuple2[String, String]]) extends WebUIPage("") { 11 | 12 | // def render(request: HttpServletRequest): Seq[Node] = { 13 | // val content = UIUtils.listingTable(header, row, rbConfig) 14 | // UIUtils.headerSparkPage("Ruby Config", content, parent) 15 | // } 16 | 17 | // private def header = Seq( 18 | // "Number" 19 | // ) 20 | 21 | // private def row(keyValue: (String, String)): Seq[Node] = { 22 | // // scalastyle:off 23 | // keyValue match { 24 | // case (key, value) => 25 | // 26 | // {key} 27 | // {value} 28 | // 29 | // } 30 | // // scalastyle:on 31 | // } 32 | // } 33 | 34 | class RubyPage {} 35 | -------------------------------------------------------------------------------- /ext/spark/src/main/scala/RubySerializer.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.api.ruby 2 | 3 | import scala.collection.JavaConverters._ 4 | import scala.reflect.{ClassTag, classTag} 5 | 6 | import org.apache.spark.api.java.JavaRDD 7 | import org.apache.spark.api.ruby.marshal._ 8 | 9 | 10 | /* ================================================================================================= 11 | * object RubySerializer 12 | * ================================================================================================= 13 | */ 14 | object RubySerializer { } 15 | -------------------------------------------------------------------------------- /ext/spark/src/main/scala/RubyTab.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ui.ruby 2 | 3 | import scala.collection.mutable.HashMap 4 | 5 | import org.apache.spark.ui._ 6 | 7 | // class RubyTab(parent: SparkUI, rbConfig: HashMap[String, String]) extends SparkUITab(parent, "ruby"){ 8 | // attachPage(new RubyPage(this, rbConfig.toArray)) 9 | // } 10 | 11 | class RubyTab {} 12 | -------------------------------------------------------------------------------- /ext/spark/src/main/scala/RubyUtils.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.api.ruby 2 | 3 | import org.apache.spark.util._ 4 | import org.apache.spark.{SparkConf, Logging} 5 | 6 | object RubyUtils extends Logging { 7 | 8 | def loadPropertiesFile(conf: SparkConf, path: String): String = { 9 | Utils.getPropertiesFromFile(path).foreach { 10 | case (key, value) => conf.set(key, value) 11 | } 12 | path 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /ext/spark/src/test/scala/MarshalSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.api.ruby.marshal 2 | 3 | import org.scalatest._ 4 | 5 | 6 | import org.apache.spark.api.ruby.marshal._ 7 | 8 | class MarshalSpec extends FunSpec with Matchers { 9 | 10 | // 
==================================================================================== 11 | // Load 12 | 13 | describe("Marshal.load"){ 14 | describe("single value"){ 15 | it("int"){ 16 | val data = 1 17 | val serialized = Array[Byte](4, 8, 105, 6) 18 | 19 | Marshal.load(serialized) should equal(data) 20 | } 21 | 22 | it("double"){ 23 | val data = 1.2 24 | val serialized = Array[Byte](4, 8, 102, 8, 49, 46, 50) 25 | 26 | Marshal.load(serialized) should equal(data) 27 | } 28 | } 29 | 30 | describe("array"){ 31 | it("ints"){ 32 | val data = Array(1, 2, 3, 4, 5) 33 | val serialized = Array[Byte](4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10) 34 | 35 | Marshal.load(serialized) should equal(data) 36 | } 37 | 38 | it("doubles"){ 39 | val data = Array(1.1, 2.2, 3.3) 40 | val serialized = Array[Byte](4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51) 41 | 42 | Marshal.load(serialized) should equal(data) 43 | } 44 | } 45 | } 46 | 47 | // ==================================================================================== 48 | // Dump 49 | 50 | describe("Marshal.dump"){ 51 | describe("single value"){ 52 | it("int"){ 53 | val data = 1 54 | val serialized = Array(4, 8, 105, 6) 55 | 56 | Marshal.dump(data) should equal(serialized) 57 | } 58 | 59 | it("double"){ 60 | val data = 1.2 61 | val serialized = Array(4, 8, 102, 8, 49, 46, 50) 62 | 63 | Marshal.dump(data) should equal(serialized) 64 | } 65 | } 66 | 67 | describe("array"){ 68 | it("ints"){ 69 | val data = Array(1, 2, 3, 4, 5) 70 | val serialized = Array(4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10) 71 | 72 | Marshal.dump(data) should equal(serialized) 73 | } 74 | 75 | it("doubles"){ 76 | val data = Array(1.1, 2.2, 3.3) 77 | val serialized = Array(4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51) 78 | 79 | Marshal.dump(data) should equal(serialized) 80 | } 81 | } 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /lib/ruby-spark.rb: -------------------------------------------------------------------------------- 1 | require_relative 'spark' 2 | -------------------------------------------------------------------------------- /lib/spark/broadcast.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | ## 3 | # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast 4 | # object for reading it in distributed functions. The variable will 5 | # be sent to each cluster only once. 
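#
# Only the broadcast id is marshaled together with a command; a worker restores the object in
# the WITHOUT_PATH state and lazily re-reads the value from a file path registered for that id.
# A sketch of the round trip (`path` is assumed to have already been registered on the worker):
#
#   dumped   = Marshal.dump(broadcast)            # carries just the id
#   restored = Marshal.load(dumped)               # state: WITHOUT_PATH
#   Spark::Broadcast.register(restored.id, path)
#   restored.value                                # loads Marshal data from path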
6 | # 7 | # == Example: 8 | # 9 | # broadcast1 = $sc.broadcast('a') 10 | # broadcast2 = $sc.broadcast('b') 11 | # broadcast3 = $sc.broadcast([1,2,3]) 12 | # 13 | # func = Proc.new do |part, index| 14 | # [ 15 | # broadcast1.value * index, 16 | # broadcast2.value * index, 17 | # broadcast3.value.reduce(:+) 18 | # ] 19 | # end 20 | # 21 | # rdd = $sc.parallelize(0..5, 4) 22 | # rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2, broadcast3: broadcast3) 23 | # rdd = rdd.map_partitions_with_index(func) 24 | # rdd.collect 25 | # # => ["", "", 6, "a", "b", 6, "aa", "bb", 6, "aaa", "bbb", 6] 26 | # 27 | class Broadcast 28 | 29 | LOADED = 0 # id, value, path 30 | NOT_LOADED = 1 # id, path 31 | WITHOUT_PATH = 2 # id 32 | 33 | attr_reader :id, :state, :path, :jbroadcast 34 | 35 | @@registered = {} 36 | 37 | # ========================================================================= 38 | # Creating broadcast for SparkContext 39 | 40 | # Create new Broadcast and dump value to the disk 41 | # 42 | # b = $sc.broadcast('a') 43 | # 44 | # b.value # => 'a' 45 | # b.path 46 | # b.jbroadcast 47 | # 48 | def initialize(sc, value) 49 | @id = object_id 50 | @value = value 51 | @state = LOADED 52 | 53 | file = Tempfile.create('broadcast', sc.temp_dir) 54 | file.binmode 55 | file.write(Marshal.dump(value)) 56 | file.close 57 | 58 | @path = file.path 59 | @jbroadcast = RubyRDD.readBroadcastFromFile(sc.jcontext, @path, Spark.jb.to_long(@id)) 60 | 61 | ObjectSpace.define_finalizer(self, proc { File.unlink(@path) }) 62 | end 63 | 64 | def inspect 65 | result = %{#<#{self.class.name}:0x#{object_id}\n} 66 | result << %{ ID: #{@id}\n} 67 | result << %{Value: #{@value.to_s[0, 10]}>} 68 | result 69 | end 70 | 71 | def self.register(id, path) 72 | @@registered[id] = path 73 | end 74 | 75 | def value 76 | case state 77 | when LOADED 78 | @value 79 | when NOT_LOADED 80 | @value = Marshal.load(File.read(@path)) 81 | @state = LOADED 82 | @value 83 | when WITHOUT_PATH 84 | @path = @@registered[id] 85 | 86 | if @path 87 | @state = NOT_LOADED 88 | value 89 | else 90 | raise Spark::BroadcastError, "Broadcast #{@id} do not have registered path." 
91 | end 92 | end 93 | end 94 | 95 | def marshal_dump 96 | @id 97 | end 98 | 99 | def marshal_load(id) 100 | @id = id 101 | @state = WITHOUT_PATH 102 | end 103 | 104 | end 105 | end 106 | -------------------------------------------------------------------------------- /lib/spark/build.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Build 3 | 4 | DEFAULT_SCALA_VERSION = '2.10.4' 5 | DEFAULT_CORE_VERSION = '2.10' 6 | DEFAULT_SPARK_VERSION = '1.6.0' 7 | DEFAULT_HADOOP_VERSION = '1.0.4' 8 | 9 | SBT = 'sbt/sbt' 10 | SBT_DEPS = 'assemblyPackageDependency' 11 | SBT_EXT = 'package' 12 | SBT_CLEAN = 'clean' 13 | 14 | def self.build(options={}) 15 | scala_version = options[:scala_version] || DEFAULT_SCALA_VERSION 16 | spark_core_version = options[:spark_core_version] || DEFAULT_CORE_VERSION 17 | spark_version = options[:spark_version] || DEFAULT_SPARK_VERSION 18 | hadoop_version = options[:hadoop_version] || DEFAULT_HADOOP_VERSION 19 | target = options[:target] || Spark.target_dir 20 | only_ext = options[:only_ext] || false 21 | 22 | env = { 23 | 'SCALA_VERSION' => scala_version, 24 | 'SPARK_VERSION' => spark_version, 25 | 'SPARK_CORE_VERSION' => spark_core_version, 26 | 'HADOOP_VERSION' => hadoop_version, 27 | 'TARGET_DIR' => target 28 | } 29 | 30 | cmd = [SBT] 31 | cmd << SBT_EXT 32 | cmd << SBT_DEPS unless only_ext 33 | cmd << SBT_CLEAN unless $DEBUG 34 | 35 | Dir.chdir(Spark.spark_ext_dir) do 36 | unless Kernel.system(env, cmd.join(' ')) 37 | raise Spark::BuildError, 'Spark cannot be assembled.' 38 | end 39 | end 40 | end 41 | 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /lib/spark/command.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | ## 3 | # Container which includes all commands and other things for worker 4 | # Every RDD have own copy of Command 5 | # 6 | class Command 7 | 8 | attr_accessor :serializer, :deserializer, :commands, :libraries, :bound_objects 9 | 10 | def initialize 11 | @serializer = nil 12 | @deserializer = nil 13 | @commands = [] 14 | @libraries = [] 15 | @bound_objects = {} 16 | end 17 | 18 | def execute(iterator, split_index) 19 | # Require necessary libraries 20 | libraries.each{|lib| require lib} 21 | 22 | # Prepare bound objects 23 | @commands.each do |command| 24 | command.__objects__ = bound_objects 25 | end 26 | 27 | # Prepare for running 28 | @commands.each(&:prepare) 29 | 30 | # Run all task 31 | @commands.each do |command| 32 | iterator = command.execute(iterator, split_index) 33 | end 34 | 35 | # Return changed iterator. This is not be necessary for some tasks 36 | # because of using inplace changing but some task can return 37 | # only one value (for example reduce). 
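# For instance, a map-like command usually mutates and returns the same array, while reduce
# hands back a fresh one-element array (illustrative only; the per-command logic lives in the
# files required at the bottom of this file):
#
#   iterator = [1, 2, 3]
#   iterator.map!{|x| x * 2}          # same object, changed in place
#   iterator = [iterator.reduce(:+)]  # new single-element array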
38 | iterator 39 | end 40 | 41 | def last 42 | @commands.last 43 | end 44 | 45 | def bound_objects 46 | # Objects from users 47 | # Already initialized objects on worker 48 | return @bound_objects if @bound_objects 49 | 50 | if @serialized_bound_objects 51 | # Still serialized 52 | @bound_objects = Marshal.load(@serialized_bound_objects) 53 | else 54 | # Something else 55 | @bound_objects = {} 56 | end 57 | end 58 | 59 | # Bound objects can depend on library which is loaded during @execute 60 | # In that case worker raise "undefined class/module" 61 | def marshal_dump 62 | [@serializer, @deserializer, @commands, @libraries, serialized_bound_objects] 63 | end 64 | 65 | def marshal_load(array) 66 | @serializer = array.shift 67 | @deserializer = array.shift 68 | @commands = array.shift 69 | @libraries = array.shift 70 | @serialized_bound_objects = array.shift 71 | end 72 | 73 | private 74 | 75 | def serialized_bound_objects 76 | @serialized_bound_objects ||= Marshal.dump(@bound_objects) 77 | end 78 | 79 | end 80 | end 81 | 82 | require 'spark/command/base' 83 | require 'spark/command/basic' 84 | require 'spark/command/pair' 85 | require 'spark/command/statistic' 86 | require 'spark/command/sort' 87 | -------------------------------------------------------------------------------- /lib/spark/command/pair.rb: -------------------------------------------------------------------------------- 1 | _Base = Spark::Command::Base 2 | 3 | # ------------------------------------------------------------------------------------------------- 4 | # CombineByKey 5 | 6 | class Spark::Command::CombineByKey 7 | 8 | # --------------- 9 | 10 | class Base < Spark::Command::Base 11 | def run(iterator, *) 12 | _run(iterator).to_a 13 | end 14 | 15 | def lazy_run(iterator, *) 16 | _run(iterator).lazy 17 | end 18 | end 19 | 20 | # --------------- 21 | 22 | class Combine < Base 23 | variable :create_combiner 24 | variable :merge_value 25 | 26 | def _run(iterator) 27 | # Not use combiners[key] ||= .. 28 | # it tests nil and not has_key? 29 | combiners = {} 30 | iterator.each do |key, value| 31 | if combiners.has_key?(key) 32 | combiners[key] = @merge_value.call(combiners[key], value) 33 | else 34 | combiners[key] = @create_combiner.call(value) 35 | end 36 | end 37 | combiners 38 | end 39 | end 40 | 41 | # --------------- 42 | 43 | class Merge < Base 44 | variable :merge_combiners 45 | 46 | def _run(iterator, *) 47 | combiners = {} 48 | iterator.each do |key, value| 49 | if combiners.has_key?(key) 50 | combiners[key] = @merge_combiners.call(combiners[key], value) 51 | else 52 | combiners[key] = value 53 | end 54 | end 55 | combiners 56 | end 57 | end 58 | 59 | # --------------- 60 | 61 | class CombineWithZero < Base 62 | variable :zero_value, function: false, type: Object 63 | variable :merge_value 64 | 65 | def _run(iterator) 66 | # Not use combiners[key] ||= .. 67 | # it tests nil and not has_key? 68 | combiners = {} 69 | iterator.each do |key, value| 70 | unless combiners.has_key?(key) 71 | combiners[key] = @zero_value 72 | end 73 | 74 | combiners[key] = @merge_value.call(combiners[key], value) 75 | end 76 | combiners 77 | end 78 | end 79 | 80 | 81 | # --------------- 82 | 83 | end 84 | 85 | # ------------------------------------------------------------------------------------------------- 86 | # MapValues 87 | 88 | class Spark::Command::MapValues < _Base 89 | variable :map_function 90 | 91 | def run(iterator, *) 92 | iterator.map! 
do |item| 93 | item[1] = @map_function.call(item[1]) 94 | item 95 | end 96 | iterator 97 | end 98 | 99 | def lazy_run(iterator, *) 100 | iterator.map do |item| 101 | item[1] = @map_function.call(item[1]) 102 | item 103 | end 104 | end 105 | end 106 | 107 | # ------------------------------------------------------------------------------------------------- 108 | # FlatMapValues 109 | 110 | class Spark::Command::FlatMapValues < _Base 111 | variable :map_function 112 | 113 | def run(iterator, *) 114 | iterator.map! do |(key, values)| 115 | values = @map_function.call(values) 116 | values.flatten!(1) 117 | values.map! do |value| 118 | [key, value] 119 | end 120 | end 121 | iterator.flatten!(1) 122 | iterator 123 | end 124 | end 125 | -------------------------------------------------------------------------------- /lib/spark/command/sort.rb: -------------------------------------------------------------------------------- 1 | _Base = Spark::Command::Base 2 | 3 | # ------------------------------------------------------------------------------------------------- 4 | # Sort 5 | 6 | class Spark::Command::SortByKey < _Base 7 | variable :key_function 8 | variable :ascending, function: false, type: [TrueClass, FalseClass] 9 | variable :spilling, function: false, type: [TrueClass, FalseClass] 10 | variable :memory, function: false, type: [Numeric, NilClass] 11 | variable :serializer, function: false, type: Spark::Serializer::Base 12 | 13 | # Currently disabled 14 | def before_run 15 | @spilling = false 16 | end 17 | 18 | def run(iterator, _) 19 | if @spilling 20 | iterator = run_with_spilling(iterator.each) 21 | else 22 | run_without_spilling(iterator) 23 | end 24 | 25 | iterator 26 | end 27 | 28 | def run_with_enum(iterator, _) 29 | if @spilling 30 | iterator = run_with_spilling(iterator) 31 | else 32 | iterator = iterator.to_a 33 | run_without_spilling(iterator) 34 | end 35 | 36 | iterator 37 | end 38 | 39 | private 40 | 41 | def run_with_spilling(iterator) 42 | sorter = Spark::ExternalSorter.new(@memory, @serializer) 43 | sorter.sort_by(iterator, @ascending, @key_function) 44 | end 45 | 46 | def run_without_spilling(iterator) 47 | iterator.sort_by!(&@key_function) 48 | iterator.reverse! 
unless @ascending 49 | end 50 | 51 | end 52 | -------------------------------------------------------------------------------- /lib/spark/command/statistic.rb: -------------------------------------------------------------------------------- 1 | _Base = Spark::Command::Base 2 | 3 | # ------------------------------------------------------------------------------------------------- 4 | # Sample 5 | 6 | class Spark::Command::Sample < _Base 7 | variable :with_replacement, function: false, type: [TrueClass, FalseClass] 8 | variable :fraction, function: false, type: Numeric 9 | variable :seed, function: false, type: [NilClass, Numeric] 10 | 11 | def run(iterator, _) 12 | sampler.sample(iterator) 13 | end 14 | 15 | def lazy_run(iterator, _) 16 | sampler.lazy_sample(iterator) 17 | end 18 | 19 | def sampler 20 | @sampler ||= _sampler 21 | end 22 | 23 | def _sampler 24 | if @with_replacement 25 | sampler = Spark::Sampler::Poisson 26 | else 27 | sampler = Spark::Sampler::Uniform 28 | end 29 | 30 | sampler = sampler.new(@fraction, @seed) 31 | end 32 | end 33 | 34 | # ------------------------------------------------------------------------------------------------- 35 | # Stats 36 | 37 | class Spark::Command::Stats < _Base 38 | 39 | def run(iterator, *) 40 | [Spark::StatCounter.new(iterator)] 41 | end 42 | 43 | def lazy_run(iterator, *) 44 | run(iterator) 45 | end 46 | 47 | end 48 | 49 | # ------------------------------------------------------------------------------------------------- 50 | # Histogram 51 | 52 | class Spark::Command::Histogram < _Base 53 | include Spark::Helper::Statistic 54 | 55 | variable :even, function: false, type: [TrueClass, FalseClass] 56 | variable :buckets, function: false, type: Array 57 | 58 | def run(iterator, *) 59 | counters = Array.new(counter_size) { 0 } 60 | iterator.each do |item| 61 | if item.nil? || (item.is_a?(Float) && !item.finite?) || item > max || item < min 62 | next 63 | end 64 | 65 | x = bucket_function.call(item) 66 | if x.nil? 67 | # next 68 | else 69 | counters[x] += 1 70 | end 71 | end 72 | [counters] 73 | end 74 | 75 | def lazy_run(iterator, *) 76 | run(iterator) 77 | end 78 | 79 | private 80 | 81 | def min 82 | @buckets.first 83 | end 84 | 85 | def max 86 | @buckets.last 87 | end 88 | 89 | def counter_size 90 | @buckets.size-1 91 | end 92 | 93 | def increment 94 | @buckets[1]-@buckets[0] 95 | end 96 | 97 | # Decide which bucket function to pass. We decide here rather than having 98 | # a general function so that the decission need only be made once. 99 | def bucket_function 100 | @bucket_function ||= _bucket_function 101 | end 102 | 103 | def _bucket_function 104 | if @even 105 | fast_bucket_function 106 | else 107 | basic_bucket_function 108 | end 109 | end 110 | 111 | # Determine the bucket function in constant time. 112 | # Requires that buckets are evenly spaced 113 | def fast_bucket_function 114 | Proc.new do |item| 115 | if item.is_a?(Float) && item.nan? 116 | nil 117 | else 118 | bucket_number = (item - min)/increment 119 | if bucket_number > counter_size || bucket_number < 0 120 | nil 121 | else 122 | [bucket_number.to_i, counter_size-1].min 123 | end 124 | end 125 | end 126 | end 127 | 128 | # Basic bucket function. Same as right bisect. 
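#
# Worked example: with @buckets = [0, 10, 20, 30] there are three counters and
#
#   bisect_right([0, 10, 20, 30], 25) - 1   # => 2 (bucket 2)
#   bisect_right([0, 10, 20, 30], 30) - 1   # => 3, clamped below to counter_size - 1 = 2
#   bisect_right([0, 10, 20, 30], 0)  - 1   # => 0 (left edges are inclusive)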
129 | def basic_bucket_function 130 | Proc.new do |item| 131 | bucket_number = bisect_right(@buckets, item) - 1 132 | 133 | # Counters is @buckets.size - 1 134 | # [bucket_number, counter_size-1].min 135 | 136 | if bucket_number > counter_size-1 137 | counter_size-1 138 | else 139 | bucket_number 140 | end 141 | end 142 | end 143 | 144 | end 145 | -------------------------------------------------------------------------------- /lib/spark/command_validator.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module CommandValidator 3 | 4 | def validate(value, options) 5 | validate_type(value, options[:type]) 6 | end 7 | 8 | def valid?(value, options) 9 | begin 10 | validate(value, options) 11 | return true 12 | rescue 13 | return false 14 | end 15 | end 16 | 17 | def validate_type(value, types) 18 | types = [types] if !types.is_a?(Array) 19 | 20 | types.each do |type| 21 | return if value.is_a?(type) 22 | end 23 | 24 | error "Value: #{value} should be a #{types.join(' or ')} but is #{value.class}." 25 | end 26 | 27 | def validate_size(array1, array2) 28 | if array1.size != array2.size 29 | error "Wrong number of arguments (#{array1.size} for #{array2.size})" 30 | end 31 | end 32 | 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /lib/spark/constant.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | # Commond constant for Ruby and Spark 3 | module Constant 4 | DATA_EOF = -2 5 | WORKER_ERROR = -1 6 | WORKER_DONE = 0 7 | CREATE_WORKER = 1 8 | KILL_WORKER = 2 9 | KILL_WORKER_AND_WAIT = 3 10 | SUCCESSFULLY_KILLED = 4 11 | UNSUCCESSFUL_KILLING = 5 12 | ACCUMULATOR_ACK = 6 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /lib/spark/error.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | # Extension cannot be built 3 | class BuildError < StandardError 4 | end 5 | 6 | # Proc.to_source 7 | # Java object cannot be converted 8 | class SerializeError < StandardError 9 | end 10 | 11 | # Serializer method 12 | # Non-existing serializer 13 | class NotImplemented < StandardError 14 | end 15 | 16 | # Missison app_name or master 17 | class ConfigurationError < StandardError 18 | end 19 | 20 | # Wrong parameters 21 | class RDDError < StandardError 22 | end 23 | 24 | # Validations 25 | class CommandError < StandardError 26 | end 27 | 28 | # Parser helper 29 | # SQL DataType 30 | class ParseError < StandardError 31 | end 32 | 33 | # Validation in context 34 | class ContextError < StandardError 35 | end 36 | 37 | # Broadcasts 38 | # Missing path 39 | class BroadcastError < StandardError 40 | end 41 | 42 | # Accumulators 43 | # Existing keys 44 | # Wrong ID 45 | class AccumulatorError < StandardError 46 | end 47 | 48 | # Wrong instances 49 | class MllibError < StandardError 50 | end 51 | 52 | # Wrong datatype 53 | class SQLError < StandardError 54 | end 55 | 56 | # Missing Java class 57 | class JavaBridgeError < StandardError 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /lib/spark/ext/hash.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module CoreExtension 3 | module Hash 4 | module ClassMethods 5 | end 6 | 7 | module InstanceMethods 8 | # Destructively convert all keys to strings. 9 | def stringify_keys_with_spark! 
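# For example (once patched in): { a: 1, 'b' => 2 }.stringify_keys!  # => { 'a' => 1, 'b' => 2 }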
10 | transform_keys!{ |key| key.to_s } 11 | end 12 | 13 | # Destructively convert all keys to symbols, as long as they respond 14 | def symbolize_keys_with_spark! 15 | transform_keys!{ |key| key.to_sym rescue key } 16 | end 17 | 18 | # Destructively convert all keys using the block operations. 19 | # Same as transform_keys but modifies +self+. 20 | def transform_keys_with_spark! 21 | keys.each do |key| 22 | self[yield(key)] = delete(key) 23 | end 24 | self 25 | end 26 | end 27 | 28 | def self.included(base) 29 | base.extend(ClassMethods) 30 | base.send(:include, InstanceMethods) 31 | base.class_eval do 32 | patch_unless_exist :stringify_keys!, :spark 33 | patch_unless_exist :symbolize_keys!, :spark 34 | patch_unless_exist :transform_keys!, :spark 35 | end 36 | end 37 | end 38 | end 39 | end 40 | 41 | Hash.__send__(:include, Spark::CoreExtension::Hash) 42 | -------------------------------------------------------------------------------- /lib/spark/ext/integer.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module CoreExtension 3 | module Integer 4 | module ClassMethods 5 | end 6 | 7 | module InstanceMethods 8 | end 9 | 10 | def self.included(base) 11 | base.extend(ClassMethods) 12 | base.send(:include, InstanceMethods) 13 | base.class_eval do 14 | const_set :MAX_WITH_SPARK, 1 << (1.size * 8 - 2) - 1 15 | const_set :MIN_WITH_SPARK, -const_get(:MAX_WITH_SPARK) - 1 16 | 17 | path_const_unless_exist :MAX, :SPARK 18 | path_const_unless_exist :MIN, :SPARK 19 | end 20 | end 21 | end 22 | end 23 | end 24 | 25 | Integer.__send__(:include, Spark::CoreExtension::Integer) 26 | -------------------------------------------------------------------------------- /lib/spark/ext/io.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module CoreExtension 3 | module IO 4 | module ClassMethods 5 | end 6 | 7 | module InstanceMethods 8 | 9 | # Reading 10 | 11 | def read_int 12 | unpack_int(read(4)) 13 | end 14 | 15 | def read_int_or_eof 16 | bytes = read(4) 17 | return Spark::Constant::DATA_EOF if bytes.nil? 
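# Ints on the wire are 4-byte big-endian (see Spark::Helper::Serialize), e.g.
#   pack_int(5)                        # => "\x00\x00\x00\x05"
#   unpack_int("\x00\x00\x00\x05")     # => 5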
18 | unpack_int(bytes) 19 | end 20 | 21 | def read_long 22 | unpack_long(read(8)) 23 | end 24 | 25 | def read_string 26 | read(read_int) 27 | end 28 | 29 | def read_data 30 | Marshal.load(read_string) 31 | end 32 | 33 | 34 | # Writing 35 | 36 | def write_int(data) 37 | write(pack_int(data)) 38 | end 39 | 40 | def write_long(data) 41 | write(pack_long(data)) 42 | end 43 | 44 | # Size and data can have different encoding 45 | # Marshal: both ASCII 46 | # Oj: ASCII and UTF-8 47 | def write_string(data) 48 | write_int(data.bytesize) 49 | write(data) 50 | end 51 | 52 | def write_data(data) 53 | write_string(Marshal.dump(data)) 54 | end 55 | end 56 | 57 | def self.included(base) 58 | base.extend(ClassMethods) 59 | base.send(:include, Spark::Helper::Serialize) 60 | base.send(:include, InstanceMethods) 61 | end 62 | end 63 | end 64 | end 65 | 66 | IO.__send__(:include, Spark::CoreExtension::IO) 67 | StringIO.__send__(:include, Spark::CoreExtension::IO) 68 | -------------------------------------------------------------------------------- /lib/spark/ext/ip_socket.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module CoreExtension 3 | module IPSocket 4 | module ClassMethods 5 | end 6 | 7 | module InstanceMethods 8 | def port 9 | addr[1] 10 | end 11 | 12 | def hostname 13 | addr(true)[2] 14 | end 15 | 16 | def numeric_address 17 | addr[3] 18 | end 19 | end 20 | 21 | def self.included(base) 22 | base.extend(ClassMethods) 23 | base.send(:include, InstanceMethods) 24 | end 25 | end 26 | end 27 | end 28 | 29 | IPSocket.__send__(:include, Spark::CoreExtension::IPSocket) 30 | -------------------------------------------------------------------------------- /lib/spark/ext/module.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module CoreExtension 3 | module Module 4 | 5 | # Patch method to class unless already exist 6 | # 7 | # == Example: 8 | # 9 | # class Hash 10 | # def a 11 | # 1 12 | # end 13 | # end 14 | # 15 | # module HashExtension 16 | # module InstanceMethods 17 | # def a_with_spark 18 | # 2 19 | # end 20 | # 21 | # def b_with_spark 22 | # 1 23 | # end 24 | # end 25 | # 26 | # def self.included(base) 27 | # base.send(:include, InstanceMethods) 28 | # base.class_eval do 29 | # patch_unless_exist :a, :spark 30 | # patch_unless_exist :b, :spark 31 | # end 32 | # end 33 | # end 34 | # 35 | # Hash.include(HashExtension) 36 | # 37 | # Hash.new.a # => 1 38 | # Hash.new.b # => 1 39 | # 40 | def patch_unless_exist(target, suffix) 41 | unless method_defined?(target) 42 | aliased_target, punctuation = target.to_s.sub(/([?!=])$/, ''), $1 43 | 44 | alias_method target, "#{aliased_target}_with_#{suffix}#{punctuation}" 45 | end 46 | end 47 | 48 | def path_const_unless_exist(target, suffix) 49 | unless const_defined?(target) 50 | const_set(target, const_get("#{target}_WITH_#{suffix}")) 51 | end 52 | end 53 | 54 | end 55 | end 56 | end 57 | 58 | Module.__send__(:include, Spark::CoreExtension::Module) 59 | -------------------------------------------------------------------------------- /lib/spark/ext/object.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module CoreExtension 3 | module Object 4 | module ClassMethods 5 | end 6 | 7 | module InstanceMethods 8 | def deep_copy_with_spark 9 | Marshal.load(Marshal.dump(self)) 10 | end 11 | 12 | def silence_warnings 13 | old_verbose, $VERBOSE = $VERBOSE, nil 14 | yield 15 | ensure 16 | $VERBOSE = 
old_verbose 17 | end 18 | 19 | def cattr_reader_with_spark(*syms) 20 | syms.each do |sym| 21 | raise NameError.new("Invalid attribute name: #{sym}") unless sym =~ /^[_A-Za-z]\w*$/ 22 | 23 | class_eval(<<-EOS, __FILE__, __LINE__ + 1) 24 | @@#{sym} = nil unless defined? @@#{sym} 25 | def self.#{sym} 26 | @@#{sym} 27 | end 28 | EOS 29 | 30 | class_eval(<<-EOS, __FILE__, __LINE__ + 1) 31 | def #{sym} 32 | @@#{sym} 33 | end 34 | EOS 35 | end 36 | end 37 | 38 | def cattr_writer_with_spark(*syms) 39 | syms.each do |sym| 40 | raise NameError.new("Invalid attribute name: #{sym}") unless sym =~ /^[_A-Za-z]\w*$/ 41 | 42 | class_eval(<<-EOS, __FILE__, __LINE__ + 1) 43 | @@#{sym} = nil unless defined? @@#{sym} 44 | def self.#{sym}=(obj) 45 | @@#{sym} = obj 46 | end 47 | EOS 48 | 49 | class_eval(<<-EOS, __FILE__, __LINE__ + 1) 50 | def #{sym}=(obj) 51 | @@#{sym} = obj 52 | end 53 | EOS 54 | end 55 | end 56 | 57 | def cattr_accessor_with_spark(*syms) 58 | cattr_reader_with_spark(*syms) 59 | cattr_writer_with_spark(*syms) 60 | end 61 | end 62 | 63 | def self.included(base) 64 | base.extend(ClassMethods) 65 | base.send(:include, InstanceMethods) 66 | base.class_eval do 67 | patch_unless_exist :deep_copy, :spark 68 | patch_unless_exist :silence_warnings, :spark 69 | patch_unless_exist :cattr_accessor, :spark 70 | end 71 | end 72 | end 73 | end 74 | end 75 | 76 | Object.__send__(:include, Spark::CoreExtension::Object) 77 | -------------------------------------------------------------------------------- /lib/spark/ext/string.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module CoreExtension 3 | module String 4 | module ClassMethods 5 | end 6 | 7 | module InstanceMethods 8 | def camelize_with_spark 9 | self.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase } 10 | end 11 | end 12 | 13 | def self.included(base) 14 | base.extend(ClassMethods) 15 | base.send(:include, InstanceMethods) 16 | base.class_eval do 17 | patch_unless_exist :camelize, :spark 18 | end 19 | end 20 | end 21 | end 22 | end 23 | 24 | String.__send__(:include, Spark::CoreExtension::String) 25 | -------------------------------------------------------------------------------- /lib/spark/helper.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Helper 3 | autoload :System, "spark/helper/system" 4 | autoload :Logger, "spark/helper/logger" 5 | autoload :Statistic, "spark/helper/statistic" 6 | autoload :Serialize, "spark/helper/serialize" 7 | autoload :Partition, "spark/helper/partition" 8 | autoload :Parser, "spark/helper/parser" 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /lib/spark/helper/logger.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Helper 3 | module Logger 4 | 5 | def self.included(base) 6 | base.send :extend, Methods 7 | base.send :include, Methods 8 | end 9 | 10 | module Methods 11 | def log_info(message) 12 | Spark.logger.info(message) 13 | end 14 | 15 | def log_debug(message) 16 | Spark.logger.debug(message) 17 | end 18 | 19 | def log_trace(message) 20 | Spark.logger.trace(message) 21 | end 22 | 23 | def log_warning(message) 24 | Spark.logger.warning(message) 25 | end 26 | 27 | def log_error(message) 28 | Spark.logger.error(message) 29 | end 30 | 31 | alias_method :logInfo, :log_info 32 | alias_method :logDebug, :log_debug 33 | alias_method :logTrace, :log_trace 34 | 
alias_method :logWarning, :log_warning 35 | alias_method :logError, :log_error 36 | 37 | end # Methods 38 | end # Logger 39 | end # Helper 40 | end # Spark 41 | -------------------------------------------------------------------------------- /lib/spark/helper/parser.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Helper 3 | module Parser 4 | 5 | def self.included(base) 6 | base.send :extend, Methods 7 | base.send :include, Methods 8 | end 9 | 10 | module Methods 11 | def to_java_hash(hash) 12 | hash_map = HashMap.new 13 | hash.each_pair do |key, value| 14 | begin 15 | # RJB raise Object is NULL (but new record is put correctly) 16 | hash_map.put(key, value) 17 | rescue RuntimeError 18 | end 19 | end 20 | hash_map 21 | end 22 | 23 | def convert_to_java_int(data) 24 | if data.is_a?(Array) 25 | data.map{|x| JInteger.new(x)} 26 | else 27 | JInteger.new(data) 28 | end 29 | end 30 | 31 | def to_java_array_list(array) 32 | array_list = ArrayList.new 33 | array.each do |item| 34 | array_list.add(item) 35 | end 36 | array_list 37 | end 38 | 39 | # Parse and convert memory size. Shifting be better but Float doesn't support it. 40 | # 41 | # == Examples: 42 | # to_memory_size("512mb") 43 | # # => 524288 44 | # 45 | # to_memory_size("512 MB") 46 | # # => 524288 47 | # 48 | # to_memory_size("512mb", "GB") 49 | # # => 0.5 50 | # 51 | def to_memory_size(memory, result_unit="KB") 52 | match = memory.match(/([\d]+)[\s]*([\w]*)/) 53 | if match.nil? 54 | raise Spark::ParseError, "Memory has wrong format. Use: 'SIZE UNIT'" 55 | end 56 | 57 | size = match[1].to_f 58 | unit = match[2] 59 | 60 | size *= memory_multiplier_based_kb(unit) 61 | size /= memory_multiplier_based_kb(result_unit) 62 | size.round(2) 63 | end 64 | 65 | # Based to KB 66 | def memory_multiplier_based_kb(type) 67 | case type.to_s.upcase 68 | when "G", "GB" 69 | 1048576 70 | when "M", "MB" 71 | 1024 72 | when "K", "KB" 73 | 1 74 | else 75 | raise Spark::ParseError, "Unsupported type #{type}" 76 | end 77 | end 78 | 79 | end # Methods 80 | 81 | end # Parser 82 | end # Helper 83 | end # Spark 84 | 85 | 86 | -------------------------------------------------------------------------------- /lib/spark/helper/serialize.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Helper 3 | module Serialize 4 | 5 | DIRECTIVE_INTEGER_BIG_ENDIAN = 'l>' 6 | DIRECTIVE_INTEGERS_BIG_ENDIAN = 'l>*' 7 | DIRECTIVE_LONG_BIG_ENDIAN = 'q>' 8 | DIRECTIVE_LONGS_BIG_ENDIAN = 'q>*' 9 | DIRECTIVE_DOUBLE_BIG_ENDIAN = 'G' 10 | DIRECTIVE_DOUBLES_BIG_ENDIAN = 'G*' 11 | DIRECTIVE_UNSIGNED_CHARS = 'C*' 12 | DIRECTIVE_CHARS = 'c*' 13 | 14 | # Packing 15 | 16 | def pack_int(data) 17 | [data].pack(DIRECTIVE_INTEGER_BIG_ENDIAN) 18 | end 19 | 20 | def pack_long(data) 21 | [data].pack(DIRECTIVE_LONG_BIG_ENDIAN) 22 | end 23 | 24 | def pack_double(data) 25 | [data].pack(DIRECTIVE_DOUBLE_BIG_ENDIAN) 26 | end 27 | 28 | def pack_unsigned_chars(data) 29 | data.pack(DIRECTIVE_UNSIGNED_CHARS) 30 | end 31 | 32 | def pack_ints(data) 33 | __check_array(data) 34 | data.pack(DIRECTIVE_INTEGERS_BIG_ENDIAN) 35 | end 36 | 37 | def pack_longs(data) 38 | __check_array(data) 39 | data.pack(DIRECTIVE_LONGS_BIG_ENDIAN) 40 | end 41 | 42 | def pack_doubles(data) 43 | __check_array(data) 44 | data.pack(DIRECTIVE_DOUBLES_BIG_ENDIAN) 45 | end 46 | 47 | # Unpacking 48 | 49 | def unpack_int(data) 50 | data.unpack(DIRECTIVE_INTEGER_BIG_ENDIAN)[0] 51 | end 52 | 53 | def unpack_long(data) 54 | 
data.unpack(DIRECTIVE_LONG_BIG_ENDIAN)[0] 55 | end 56 | 57 | def unpack_chars(data) 58 | data.unpack(DIRECTIVE_CHARS) 59 | end 60 | 61 | private 62 | 63 | def __check_array(data) 64 | unless data.is_a?(Array) 65 | raise ArgumentError, 'Data must be an Array.' 66 | end 67 | end 68 | 69 | end 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /lib/spark/helper/statistic.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Helper 3 | module Statistic 4 | 5 | # Returns a sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time. 6 | # 7 | # == How the sampling rate is determined: 8 | # Let p = num / total, where num is the sample size and total is the total number of 9 | # datapoints in the RDD. We're trying to compute q > p such that 10 | # * when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q), 11 | # where we want to guarantee Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to total), 12 | # i.e. the failure rate of not having a sufficiently large sample < 0.0001. 13 | # Setting q = p + 5 * sqrt(p/total) is sufficient to guarantee 0.9999 success rate for 14 | # num > 12, but we need a slightly larger q (9 empirically determined). 15 | # * when sampling without replacement, we're drawing each datapoint with prob_i 16 | # ~ Binomial(total, fraction) and our choice of q guarantees 1-delta, or 0.9999 success 17 | # rate, where success rate is defined the same as in sampling with replacement. 18 | # 19 | def compute_fraction(lower_bound, total, with_replacement) 20 | lower_bound = lower_bound.to_f 21 | 22 | if with_replacement 23 | upper_poisson_bound(lower_bound) / total 24 | else 25 | fraction = lower_bound / total 26 | upper_binomial_bound(0.00001, total, fraction) 27 | end 28 | end 29 | 30 | def upper_poisson_bound(bound) 31 | num_std = if bound < 6 32 | 12 33 | elsif bound < 16 34 | 9 35 | else 36 | 6 37 | end.to_f 38 | 39 | [bound + num_std * Math.sqrt(bound), 1e-10].max 40 | end 41 | 42 | def upper_binomial_bound(delta, total, fraction) 43 | gamma = -Math.log(delta) / total 44 | [1, fraction + gamma + Math.sqrt(gamma*gamma + 2*gamma*fraction)].min 45 | end 46 | 47 | # Bisect right 48 | # 49 | # == Examples: 50 | # data = [1,5,6,8,96,120,133] 51 | # 52 | # bisect_right(data, 0) # => 0 53 | # bisect_right(data, 1) # => 1 54 | # bisect_right(data, 5) # => 2 55 | # bisect_right(data, 9) # => 4 56 | # bisect_right(data, 150) # => 7 57 | # 58 | def bisect_right(data, value, low=0, high=data.size) 59 | if low < 0 60 | raise ArgumentError, 'Low must be >= 0.' 
61 | end 62 | 63 | while low < high 64 | mid = (low + high) / 2 65 | if value < data[mid] 66 | high = mid 67 | else 68 | low = mid + 1 69 | end 70 | end 71 | 72 | low 73 | end 74 | 75 | # Determine bound of partitioning 76 | # 77 | # == Example: 78 | # data = [0,1,2,3,4,5,6,7,8,9,10] 79 | # determine_bounds(data, 3) 80 | # # => [3, 7] 81 | # 82 | def determine_bounds(data, num_partitions) 83 | if num_partitions > data.size 84 | return data 85 | end 86 | 87 | bounds = [] 88 | count = data.size 89 | (0...(num_partitions-1)).each do |index| 90 | bounds << data[count * (index+1) / num_partitions] 91 | end 92 | bounds 93 | end 94 | 95 | end 96 | end 97 | end 98 | -------------------------------------------------------------------------------- /lib/spark/helper/system.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Helper 3 | module System 4 | 5 | def self.included(base) 6 | base.send :extend, Methods 7 | base.send :include, Methods 8 | end 9 | 10 | module Methods 11 | def windows? 12 | RbConfig::CONFIG['host_os'] =~ /mswin|mingw/ 13 | end 14 | 15 | def mri? 16 | RbConfig::CONFIG['ruby_install_name'] == 'ruby' 17 | end 18 | 19 | def jruby? 20 | RbConfig::CONFIG['ruby_install_name'] == 'jruby' 21 | end 22 | 23 | def pry? 24 | !!Thread.current[:__pry__] 25 | end 26 | 27 | # Memory usage in kb 28 | def memory_usage 29 | if jruby? 30 | runtime = java.lang.Runtime.getRuntime 31 | (runtime.totalMemory - runtime.freeMemory) >> 10 32 | elsif windows? 33 | # not yet 34 | else 35 | `ps -o rss= -p #{Process.pid}`.to_i 36 | end 37 | end 38 | end # Methods 39 | 40 | end # System 41 | end # Helper 42 | end # Spark 43 | -------------------------------------------------------------------------------- /lib/spark/java_bridge.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module JavaBridge 3 | 4 | autoload :Base, 'spark/java_bridge/base' 5 | autoload :JRuby, 'spark/java_bridge/jruby' 6 | autoload :RJB, 'spark/java_bridge/rjb' 7 | 8 | include Spark::Helper::System 9 | 10 | def self.init(*args) 11 | if jruby? 
12 | klass = JRuby 13 | else 14 | klass = RJB 15 | end 16 | 17 | klass.new(*args) 18 | end 19 | 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/spark/java_bridge/jruby.rb: -------------------------------------------------------------------------------- 1 | require 'java' 2 | 3 | module Spark 4 | module JavaBridge 5 | class JRuby < Base 6 | 7 | def initialize(*args) 8 | super 9 | jars.each {|jar| require jar} 10 | end 11 | 12 | def import(name, klass) 13 | klass = "Java::#{klass}" 14 | Object.const_set(name, eval(klass)) 15 | rescue NameError 16 | raise_missing_class(klass) 17 | end 18 | 19 | def java_object?(object) 20 | object.is_a?(JavaProxy) 21 | end 22 | 23 | end 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /lib/spark/java_bridge/rjb.rb: -------------------------------------------------------------------------------- 1 | if !ENV.has_key?('JAVA_HOME') 2 | raise Spark::ConfigurationError, 'Environment variable JAVA_HOME is not set' 3 | end 4 | 5 | require 'rjb' 6 | 7 | module Spark 8 | module JavaBridge 9 | class RJB < Base 10 | 11 | def initialize(*args) 12 | super 13 | Rjb.load(jars) 14 | Rjb.primitive_conversion = true 15 | end 16 | 17 | def import(name, klass) 18 | Object.const_set(name, silence_warnings { Rjb.import(klass) }) 19 | rescue NoClassDefFoundError 20 | raise_missing_class(klass) 21 | end 22 | 23 | def java_object?(object) 24 | object.is_a?(Rjb::Rjb_JavaProxy) 25 | end 26 | 27 | private 28 | 29 | def jars 30 | separator = windows? ? ';' : ':' 31 | super.join(separator) 32 | end 33 | 34 | end 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /lib/spark/library.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Library 3 | 4 | def autoload(klass, location, import=true) 5 | if import 6 | @for_importing ||= [] 7 | @for_importing << klass 8 | end 9 | 10 | super(klass, location) 11 | end 12 | 13 | def autoload_without_import(klass, location) 14 | autoload(klass, location, false) 15 | end 16 | 17 | def import(to=Object) 18 | @for_importing.each do |klass| 19 | to.const_set(klass, const_get(klass)) 20 | end 21 | nil 22 | end 23 | 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /lib/spark/logger.rb: -------------------------------------------------------------------------------- 1 | # Necessary libraries 2 | Spark.load_lib 3 | 4 | module Spark 5 | class Logger 6 | 7 | attr_reader :jlogger 8 | 9 | def initialize 10 | @jlogger = JLogger.getLogger('Ruby') 11 | end 12 | 13 | def level_off 14 | JLevel.toLevel('OFF') 15 | end 16 | 17 | # Disable all Spark log 18 | def disable 19 | jlogger.setLevel(level_off) 20 | JLogger.getLogger('org').setLevel(level_off) 21 | JLogger.getLogger('akka').setLevel(level_off) 22 | JLogger.getRootLogger.setLevel(level_off) 23 | end 24 | 25 | def enabled? 26 | !disabled? 27 | end 28 | 29 | def info(message) 30 | jlogger.info(message) if info? 31 | end 32 | 33 | def debug(message) 34 | jlogger.debug(message) if debug? 35 | end 36 | 37 | def trace(message) 38 | jlogger.trace(message) if trace? 39 | end 40 | 41 | def warning(message) 42 | jlogger.warn(message) if warning? 43 | end 44 | 45 | def error(message) 46 | jlogger.error(message) if error? 47 | end 48 | 49 | def info? 50 | level_enabled?('info') 51 | end 52 | 53 | def debug? 
54 | level_enabled?('debug') 55 | end 56 | 57 | def trace? 58 | level_enabled?('trace') 59 | end 60 | 61 | def warning? 62 | level_enabled?('warn') 63 | end 64 | 65 | def error? 66 | level_enabled?('error') 67 | end 68 | 69 | def level_enabled?(type) 70 | jlogger.isEnabledFor(JPriority.toPriority(type.upcase)) 71 | end 72 | 73 | alias_method :warn, :warning 74 | 75 | end 76 | end 77 | -------------------------------------------------------------------------------- /lib/spark/mllib.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | # MLlib is Spark’s scalable machine learning library consisting of common learning algorithms and utilities, 3 | # including classification, regression, clustering, collaborative filtering, dimensionality reduction, 4 | # as well as underlying optimization primitives. 5 | module Mllib 6 | extend Spark::Library 7 | 8 | # Base classes 9 | autoload_without_import :VectorBase, 'spark/mllib/vector' 10 | autoload_without_import :MatrixBase, 'spark/mllib/matrix' 11 | autoload_without_import :RegressionMethodBase, 'spark/mllib/regression/common' 12 | autoload_without_import :ClassificationMethodBase, 'spark/mllib/classification/common' 13 | 14 | # Linear algebra 15 | autoload :Vectors, 'spark/mllib/vector' 16 | autoload :DenseVector, 'spark/mllib/vector' 17 | autoload :SparseVector, 'spark/mllib/vector' 18 | autoload :Matrices, 'spark/mllib/matrix' 19 | autoload :DenseMatrix, 'spark/mllib/matrix' 20 | autoload :SparseMatrix, 'spark/mllib/matrix' 21 | 22 | # Regression 23 | autoload :LabeledPoint, 'spark/mllib/regression/labeled_point' 24 | autoload :RegressionModel, 'spark/mllib/regression/common' 25 | autoload :LinearRegressionModel, 'spark/mllib/regression/linear' 26 | autoload :LinearRegressionWithSGD, 'spark/mllib/regression/linear' 27 | autoload :LassoModel, 'spark/mllib/regression/lasso' 28 | autoload :LassoWithSGD, 'spark/mllib/regression/lasso' 29 | autoload :RidgeRegressionModel, 'spark/mllib/regression/ridge' 30 | autoload :RidgeRegressionWithSGD, 'spark/mllib/regression/ridge' 31 | 32 | # Classification 33 | autoload :ClassificationModel, 'spark/mllib/classification/common' 34 | autoload :LogisticRegressionWithSGD, 'spark/mllib/classification/logistic_regression' 35 | autoload :LogisticRegressionWithLBFGS, 'spark/mllib/classification/logistic_regression' 36 | autoload :SVMModel, 'spark/mllib/classification/svm' 37 | autoload :SVMWithSGD, 'spark/mllib/classification/svm' 38 | autoload :NaiveBayesModel, 'spark/mllib/classification/naive_bayes' 39 | autoload :NaiveBayes, 'spark/mllib/classification/naive_bayes' 40 | 41 | # Clustering 42 | autoload :KMeans, 'spark/mllib/clustering/kmeans' 43 | autoload :KMeansModel, 'spark/mllib/clustering/kmeans' 44 | autoload :GaussianMixture, 'spark/mllib/clustering/gaussian_mixture' 45 | autoload :GaussianMixtureModel, 'spark/mllib/clustering/gaussian_mixture' 46 | 47 | # Stat 48 | autoload :MultivariateGaussian, 'spark/mllib/stat/distribution' 49 | 50 | def self.prepare 51 | return if @prepared 52 | 53 | # if narray? 54 | # require 'spark/mllib/narray/vector' 55 | # require 'spark/mllib/narray/matrix' 56 | # elsif mdarray? 
57 | # require 'spark/mllib/mdarray/vector' 58 | # require 'spark/mllib/mdarray/matrix' 59 | # else 60 | # require 'spark/mllib/matrix/vector' 61 | # require 'spark/mllib/matrix/matrix' 62 | # end 63 | 64 | require 'spark/mllib/ruby_matrix/vector_adapter' 65 | require 'spark/mllib/ruby_matrix/matrix_adapter' 66 | 67 | @prepared = true 68 | nil 69 | end 70 | 71 | def self.narray? 72 | Gem::Specification::find_all_by_name('narray').any? 73 | end 74 | 75 | def self.mdarray? 76 | Gem::Specification::find_all_by_name('mdarray').any? 77 | end 78 | end 79 | end 80 | 81 | Spark::Mllib.prepare 82 | -------------------------------------------------------------------------------- /lib/spark/mllib/classification/common.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Mllib 3 | class ClassificationModel 4 | 5 | attr_reader :weights, :intercept, :threshold 6 | 7 | def initialize(weights, intercept) 8 | @weights = Spark::Mllib::Vectors.to_vector(weights) 9 | @intercept = intercept.to_f 10 | @threshold = nil 11 | end 12 | 13 | def threshold=(value) 14 | @threshold = value.to_f 15 | end 16 | 17 | def clear_threshold 18 | @threshold = nil 19 | end 20 | 21 | end 22 | end 23 | end 24 | 25 | module Spark 26 | module Mllib 27 | class ClassificationMethodBase < RegressionMethodBase 28 | 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /lib/spark/mllib/classification/naive_bayes.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Mllib 3 | ## 4 | # NaiveBayesModel 5 | # 6 | # Model for Naive Bayes classifiers. 7 | # 8 | # Contains two parameters: 9 | # pi:: vector of logs of class priors (dimension C) 10 | # theta:: matrix of logs of class conditional probabilities (CxD) 11 | # 12 | # == Examples: 13 | # 14 | # Spark::Mllib.import 15 | # 16 | # # Dense vectors 17 | # data = [ 18 | # LabeledPoint.new(0.0, [0.0, 0.0]), 19 | # LabeledPoint.new(0.0, [0.0, 1.0]), 20 | # LabeledPoint.new(1.0, [1.0, 0.0]) 21 | # ] 22 | # model = NaiveBayes.train($sc.parallelize(data)) 23 | # 24 | # model.predict([0.0, 1.0]) 25 | # # => 0.0 26 | # model.predict([1.0, 0.0]) 27 | # # => 1.0 28 | # 29 | # 30 | # # Sparse vectors 31 | # data = [ 32 | # LabeledPoint.new(0.0, SparseVector.new(2, {1 => 0.0})), 33 | # LabeledPoint.new(0.0, SparseVector.new(2, {1 => 1.0})), 34 | # LabeledPoint.new(1.0, SparseVector.new(2, {0 => 1.0})) 35 | # ] 36 | # model = NaiveBayes.train($sc.parallelize(data)) 37 | # 38 | # model.predict(SparseVector.new(2, {1 => 1.0})) 39 | # # => 0.0 40 | # model.predict(SparseVector.new(2, {0 => 1.0})) 41 | # # => 1.0 42 | # 43 | class NaiveBayesModel 44 | 45 | attr_reader :labels, :pi, :theta 46 | 47 | def initialize(labels, pi, theta) 48 | @labels = labels 49 | @pi = pi 50 | @theta = theta 51 | end 52 | 53 | # Predict values for a single data point or an RDD of points using 54 | # the model trained. 55 | def predict(vector) 56 | vector = Spark::Mllib::Vectors.to_vector(vector) 57 | array = (vector.dot(theta) + pi).to_a 58 | index = array.index(array.max) 59 | labels[index] 60 | end 61 | 62 | end 63 | end 64 | end 65 | 66 | 67 | module Spark 68 | module Mllib 69 | class NaiveBayes 70 | 71 | # Trains a Naive Bayes model given an RDD of (label, features) pairs. 72 | # 73 | # This is the Multinomial NB (http://tinyurl.com/lsdw6p) which can handle all kinds of 74 | # discrete data. 
For example, by converting documents into TF-IDF vectors, it can be used for 75 | # document classification. By making every vector a 0-1 vector, it can also be used as 76 | # Bernoulli NB (http://tinyurl.com/p7c96j6). The input feature values must be nonnegative. 77 | # 78 | # == Arguments: 79 | # rdd:: RDD of LabeledPoint. 80 | # lambda:: The smoothing parameter. 81 | # 82 | def self.train(rdd, lambda=1.0) 83 | # Validation 84 | first = rdd.first 85 | unless first.is_a?(LabeledPoint) 86 | raise Spark::MllibError, "RDD should contains LabeledPoint, got #{first.class}" 87 | end 88 | 89 | labels, pi, theta = Spark.jb.call(RubyMLLibAPI.new, 'trainNaiveBayesModel', rdd, lambda) 90 | theta = Spark::Mllib::Matrices.dense(theta.size, theta.first.size, theta) 91 | 92 | NaiveBayesModel.new(labels, pi, theta) 93 | end 94 | 95 | end 96 | end 97 | end 98 | -------------------------------------------------------------------------------- /lib/spark/mllib/clustering/gaussian_mixture.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Mllib 3 | ## 4 | # GaussianMixtureModel 5 | # 6 | # A clustering model derived from the Gaussian Mixture Model method. 7 | # 8 | # == Examples: 9 | # 10 | # Spark::Mllib.import 11 | # 12 | # data = [ 13 | # DenseVector.new([-0.1, -0.05]), 14 | # DenseVector.new([-0.01, -0.1]), 15 | # DenseVector.new([0.9, 0.8]), 16 | # DenseVector.new([0.75, 0.935]), 17 | # DenseVector.new([-0.83, -0.68]), 18 | # DenseVector.new([-0.91, -0.76]) 19 | # ] 20 | # 21 | # model = GaussianMixture.train($sc.parallelize(data), 3, convergence_tol: 0.0001, max_iterations: 50, seed: 10) 22 | # 23 | # labels = model.predict($sc.parallelize(data)).collect 24 | # 25 | class GaussianMixtureModel 26 | 27 | attr_reader :weights, :gaussians, :k 28 | 29 | def initialize(weights, gaussians) 30 | @weights = weights 31 | @gaussians = gaussians 32 | @k = weights.size 33 | end 34 | 35 | # Find the cluster to which the points in 'x' has maximum membership 36 | # in this model. 37 | def predict(rdd) 38 | if rdd.is_a?(Spark::RDD) 39 | predict_soft(rdd).map('lambda{|x| x.index(x.max)}') 40 | else 41 | raise ArgumentError, 'Argument must be a RDD.' 42 | end 43 | end 44 | 45 | # Find the membership of each point in 'x' to all mixture components. 46 | def predict_soft(rdd) 47 | Spark.jb.call(RubyMLLibAPI.new, 'predictSoftGMM', rdd, weights, means, sigmas) 48 | end 49 | 50 | def means 51 | @means ||= @gaussians.map(&:mu) 52 | end 53 | 54 | def sigmas 55 | @sigmas ||= @gaussians.map(&:sigma) 56 | end 57 | 58 | end 59 | end 60 | end 61 | 62 | module Spark 63 | module Mllib 64 | class GaussianMixture 65 | 66 | def self.train(rdd, k, convergence_tol: 0.001, max_iterations: 100, seed: nil) 67 | weights, means, sigmas = Spark.jb.call(RubyMLLibAPI.new, 'trainGaussianMixtureModel', rdd, 68 | k, convergence_tol, max_iterations, Spark.jb.to_long(seed)) 69 | 70 | means.map! 
{|mu| Spark.jb.java_to_ruby(mu)} 71 | sigmas.map!{|sigma| Spark.jb.java_to_ruby(sigma)} 72 | 73 | mvgs = Array.new(k) do |i| 74 | MultivariateGaussian.new(means[i], sigmas[i]) 75 | end 76 | 77 | GaussianMixtureModel.new(weights, mvgs) 78 | end 79 | 80 | end 81 | end 82 | end 83 | -------------------------------------------------------------------------------- /lib/spark/mllib/matrix.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Mllib 3 | module Matrices 4 | 5 | def self.dense(*args) 6 | DenseMatrix.new(*args) 7 | end 8 | 9 | def self.sparse(*args) 10 | SparseMatrix.new(*args) 11 | end 12 | 13 | def self.to_matrix(data) 14 | if data.is_a?(SparseMatrix) || data.is_a?(DenseMatrix) 15 | data 16 | elsif data.is_a?(Array) 17 | DenseMatrix.new(data) 18 | end 19 | end 20 | 21 | end 22 | end 23 | end 24 | 25 | module Spark 26 | module Mllib 27 | # @abstract Parent for all type of matrices 28 | class MatrixBase < MatrixAdapter 29 | end 30 | end 31 | end 32 | 33 | module Spark 34 | module Mllib 35 | ## 36 | # DenseMatrix 37 | # 38 | # DenseMatrix.new(2, 3, [[1,2,3], [4,5,6]]).values 39 | # # => [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]] 40 | # 41 | class DenseMatrix < MatrixBase 42 | 43 | def initialize(rows, cols, values) 44 | super(:dense, rows, cols, values.to_a) 45 | end 46 | 47 | def to_java 48 | JDenseMatrix.new(shape[0], shape[1], values.flatten) 49 | end 50 | 51 | def self.from_java(object) 52 | rows = object.numRows 53 | cols = object.numCols 54 | values = object.values 55 | 56 | DenseMatrix.new(rows, cols, values) 57 | end 58 | 59 | end 60 | end 61 | end 62 | 63 | module Spark 64 | module Mllib 65 | ## 66 | # SparseMatrix 67 | # 68 | # == Arguments: 69 | # rows:: 70 | # Number of rows. 71 | # 72 | # cols:: 73 | # Number of columns. 74 | # 75 | # col_pointers:: 76 | # The index corresponding to the start of a new column. 77 | # 78 | # row_indices:: 79 | # The row index of the entry. They must be in strictly 80 | # increasing order for each column. 81 | # 82 | # values:: 83 | # Nonzero matrix entries in column major. 84 | # 85 | # == Examples: 86 | # 87 | # SparseMatrix.new(3, 3, [0, 2, 3, 6], [0, 2, 1, 0, 1, 2], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).values 88 | # 89 | # # => [ 90 | # # [1.0, 0.0, 4.0], 91 | # # [0.0, 3.0, 5.0], 92 | # # [2.0, 0.0, 6.0] 93 | # # ] 94 | # 95 | class SparseMatrix < MatrixBase 96 | 97 | attr_reader :col_pointers, :row_indices 98 | 99 | def initialize(rows, cols, col_pointers, row_indices, values) 100 | super(:sparse, rows, cols) 101 | 102 | @col_pointers = col_pointers 103 | @row_indices = row_indices 104 | @values = values 105 | 106 | j = 0 107 | while j < cols 108 | idx = col_pointers[j] 109 | idx_end = col_pointers[j+1] 110 | while idx < idx_end 111 | self[row_indices[idx], j] = values[idx] 112 | idx += 1 113 | end 114 | j += 1 115 | end 116 | end 117 | 118 | end 119 | end 120 | end 121 | -------------------------------------------------------------------------------- /lib/spark/mllib/regression/common.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Mllib 3 | ## 4 | # RegressionModel 5 | # 6 | # A linear model that has a vector of coefficients and an intercept. 
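# Referring back to SparseMatrix#initialize above: a plain-Ruby sketch of how
# the CSC triples (col_pointers, row_indices, values) expand into a dense grid,
# reusing the example values from the SparseMatrix docs; the class above writes
# the same entries into the underlying Matrix instead of a bare array.
rows, cols   = 3, 3
col_pointers = [0, 2, 3, 6]
row_indices  = [0, 2, 1, 0, 1, 2]
values       = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]

dense = Array.new(rows) { Array.new(cols, 0.0) }
cols.times do |j|
  (col_pointers[j]...col_pointers[j + 1]).each do |idx|
    dense[row_indices[idx]][j] = values[idx]
  end
end

dense # => [[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]]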
7 | # 8 | class RegressionModel 9 | 10 | attr_reader :weights, :intercept 11 | 12 | def initialize(weights, intercept) 13 | @weights = Spark::Mllib::Vectors.to_vector(weights) 14 | @intercept = intercept.to_f 15 | end 16 | 17 | # Predict the value of the dependent variable given a vector data 18 | # containing values for the independent variables. 19 | # 20 | # == Examples: 21 | # lm = RegressionModel.new([1.0, 2.0], 0.1) 22 | # 23 | # lm.predict([-1.03, 7.777]) - 14.624 < 1e-6 24 | # # => true 25 | # 26 | # lm.predict(SparseVector.new(2, {0 => -1.03, 1 => 7.777})) - 14.624 < 1e-6 27 | # # => true 28 | # 29 | def predict(data) 30 | data = Spark::Mllib::Vectors.to_vector(data) 31 | @weights.dot(data) + @intercept 32 | end 33 | 34 | end 35 | end 36 | end 37 | 38 | 39 | module Spark 40 | module Mllib 41 | ## 42 | # RegressionMethodBase 43 | # 44 | # Parent for regression methods 45 | # 46 | class RegressionMethodBase 47 | 48 | def self.train(rdd, options) 49 | # String keys to symbols 50 | options.symbolize_keys! 51 | 52 | # Reverse merge 53 | self::DEFAULT_OPTIONS.each do |key, value| 54 | if options.has_key?(key) 55 | # value from user 56 | else 57 | options[key] = value 58 | end 59 | end 60 | 61 | # Validation 62 | first = rdd.first 63 | unless first.is_a?(LabeledPoint) 64 | raise Spark::MllibError, "RDD should contains LabeledPoint, got #{first.class}" 65 | end 66 | 67 | # Initial weights is optional for user (not for Spark) 68 | options[:initial_weights] = Vectors.to_vector(options[:initial_weights] || [0.0] * first.features.size) 69 | end 70 | 71 | end 72 | end 73 | end 74 | -------------------------------------------------------------------------------- /lib/spark/mllib/regression/labeled_point.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Mllib 3 | ## 4 | # LabeledPoint 5 | # 6 | # The features and labels of a data point. 7 | # 8 | # == Parameters: 9 | # label:: 10 | # Label for this data point. 
11 | # 12 | # features:: 13 | # Vector of features for this point 14 | # 15 | class LabeledPoint 16 | 17 | attr_reader :label, :features 18 | 19 | def initialize(label, features) 20 | @label = label.to_f 21 | @features = Spark::Mllib::Vectors.to_vector(features) 22 | end 23 | 24 | def self.from_java(object) 25 | LabeledPoint.new( 26 | object.label, 27 | Spark.jb.java_to_ruby(object.features) 28 | ) 29 | end 30 | 31 | def marshal_dump 32 | [@label, @features] 33 | end 34 | 35 | def marshal_load(array) 36 | initialize(array[0], array[1]) 37 | end 38 | 39 | end 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /lib/spark/mllib/ruby_matrix/matrix_adapter.rb: -------------------------------------------------------------------------------- 1 | require 'matrix' 2 | 3 | module Spark 4 | module Mllib 5 | class MatrixAdapter < ::Matrix 6 | 7 | def self.new(*args) 8 | object = self.allocate 9 | 10 | if args.size == 2 11 | # Matrix is initialized from Matrix 12 | # Arguments: rows, column count 13 | object.__send__(:original_initialize, *args) 14 | else 15 | object.__send__(:initialize, *args) 16 | end 17 | 18 | object 19 | end 20 | 21 | alias_method :original_initialize, :initialize 22 | 23 | def initialize(type, rows, cols, values=nil) 24 | case type 25 | when :dense 26 | values = values.dup 27 | if rows * cols == values.size 28 | # Values are on one row 29 | # 2x2 => [1,2,3,4] 30 | values = values.each_slice(cols).to_a 31 | else 32 | # 2x2 => [[1,2], [3,4]] 33 | end 34 | when :sparse 35 | values = Array.new(rows) { Array.new(cols) { 0.0 } } 36 | else 37 | raise Spark::MllibError, 'Unknow vector type.' 38 | end 39 | 40 | super(values, cols) 41 | end 42 | 43 | def shape 44 | [row_count, column_count] 45 | end 46 | 47 | def values 48 | @values || to_a 49 | end 50 | 51 | end 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /lib/spark/mllib/ruby_matrix/vector_adapter.rb: -------------------------------------------------------------------------------- 1 | require 'matrix' 2 | 3 | # Based on ruby 2.1 4 | 5 | class Vector 6 | def self.elements(array, copy=true) 7 | DenseVector.new(convert_to_array(array, copy)) 8 | end 9 | end 10 | 11 | module Spark 12 | module Mllib 13 | class VectorAdapter < ::Vector 14 | 15 | def self.new(*args) 16 | object = self.allocate 17 | object.__send__(:initialize, *args) 18 | object 19 | end 20 | 21 | def initialize(*args) 22 | case args.shift 23 | when :dense 24 | values = args.shift.dup 25 | when :sparse 26 | values = [0.0] * args.shift.to_i 27 | else 28 | raise Spark::MllibError, 'Unknow vector type.' 29 | end 30 | 31 | super(values) 32 | end 33 | 34 | def []=(index, value) 35 | @elements[index] = value 36 | end 37 | 38 | def dot(other) 39 | if other.is_a?(Spark::Mllib::MatrixBase) 40 | other * self 41 | else 42 | inner_product(other) 43 | end 44 | end 45 | 46 | def squared_distance(other) 47 | diff = self - other 48 | diff.dot(diff) 49 | end 50 | 51 | def values 52 | @values || to_a 53 | end 54 | 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /lib/spark/mllib/stat/distribution.rb: -------------------------------------------------------------------------------- 1 | ## 2 | # MultivariateGaussian 3 | # 4 | # This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. 
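# Referring back to VectorAdapter above: a plain-Ruby check of the
# squared_distance definition (diff = self - other; diff.dot(diff)), written
# with bare arrays and illustrative numbers so it runs without the gem.
a = [1.0, 2.0, 3.0]
b = [4.0, 6.0, 3.0]

diff = a.zip(b).map { |x, y| x - y }                          # => [-3.0, -4.0, 0.0]
diff.zip(diff).reduce(0.0) { |acc, (x, y)| acc + x * y }      # => 25.0, i.e. 3**2 + 4**2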
In 5 | # the event that the covariance matrix is singular, the density will be computed in a 6 | # reduced dimensional subspace under which the distribution is supported. 7 | # 8 | # == Arguments: 9 | # mu:: The mean vector of the distribution 10 | # sigma:: The covariance matrix of the distribution 11 | # 12 | Spark::Mllib::MultivariateGaussian = Struct.new(:mu, :sigma) 13 | -------------------------------------------------------------------------------- /lib/spark/sampler.rb: -------------------------------------------------------------------------------- 1 | require 'distribution' 2 | 3 | # Random Generators 4 | module Spark 5 | module RandomGenerator 6 | class Poisson 7 | 8 | def initialize(mean, seed) 9 | generator = Random.new(seed) 10 | @exp_rng = Distribution::Exponential.rng(1.0/mean, random: generator) 11 | end 12 | 13 | def rand 14 | t = 0.0 15 | number = 0 16 | 17 | loop{ 18 | t += @exp_rng.call 19 | if t > 1 20 | return number 21 | end 22 | number += 1 23 | } 24 | end 25 | 26 | end 27 | end 28 | end 29 | 30 | # Samplers 31 | module Spark 32 | module Sampler 33 | 34 | class Base 35 | attr_reader :fraction, :seed 36 | 37 | def initialize(fraction, seed=nil) 38 | @fraction = fraction 39 | @seed = seed || Random.new_seed 40 | end 41 | end 42 | 43 | # Poisson Sampler 44 | # ------------------------------------------------------------------------- 45 | class Poisson < Base 46 | 47 | def sample(iterator) 48 | iterator.map! do |item| 49 | count = rng.rand 50 | Array.new(count) { item } 51 | end 52 | iterator.flatten! 53 | iterator.compact! 54 | iterator 55 | end 56 | 57 | def lazy_sample(iterator) 58 | Enumerator::Lazy.new(iterator) do |yielder, value| 59 | count = rng.rand 60 | count.times { yielder << value } 61 | end 62 | end 63 | 64 | def rng 65 | @rng ||= Spark::RandomGenerator::Poisson.new(fraction, seed) 66 | end 67 | 68 | end 69 | 70 | # Uniform Sampler 71 | # ------------------------------------------------------------------------- 72 | class Uniform < Base 73 | 74 | def sample(iterator) 75 | iterator.select!{|item| rng.rand <= fraction} 76 | iterator 77 | end 78 | 79 | def lazy_sample(iterator) 80 | iterator.select do |item| 81 | rng.rand <= fraction 82 | end 83 | end 84 | 85 | def rng 86 | @rng ||= Random.new(seed) 87 | end 88 | 89 | end 90 | 91 | end 92 | end 93 | -------------------------------------------------------------------------------- /lib/spark/serializer.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | ## 3 | # Serializer 4 | # 5 | module Serializer 6 | 7 | DEFAULT_COMPRESS = false 8 | DEFAULT_BATCH_SIZE = 1024 9 | DEFAULT_SERIALIZER_NAME = 'marshal' 10 | 11 | @@registered = {} 12 | 13 | # Register class and create method for quick access. 14 | # Class will be available also as __name__ for using 15 | # in build method (Proc binding problem). 
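# Referring back to Spark::RandomGenerator::Poisson in sampler.rb above: the
# generator counts how many exponential inter-arrival times fit into one unit
# of time, which yields a Poisson-distributed count. This sketch swaps the
# Distribution gem call for a hand-rolled inverse-CDF exponential draw (the
# gem's 1.0/mean parameterization is not reproduced here) and uses an
# illustrative rate of 3.0, so it runs on plain Ruby.
def poisson_count(rate, rng)
  t = 0.0
  count = 0
  loop do
    t += -Math.log(1.0 - rng.rand) / rate   # Exponential(rate) inter-arrival
    return count if t > 1.0
    count += 1
  end
end

rng = Random.new(42)
samples = Array.new(10_000) { poisson_count(3.0, rng) }
samples.reduce(:+) / samples.size.to_f      # ≈ 3.0, the requested rate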
16 | # 17 | # == Examples: 18 | # register('test1', 'test2', Class) 19 | # 20 | # Spark::Serializer.test1 21 | # Spark::Serializer.test2 22 | # 23 | # # Proc binding problem 24 | # build { marshal } # => Spark::Serializer::Marshal 25 | # 26 | # marshal = 1 27 | # build { marshal } # => 1 28 | # 29 | # build { __marshal__ } # => Spark::Serializer::Marshal 30 | # 31 | def self.register(*args) 32 | klass = args.pop 33 | args.each do |arg| 34 | @@registered[arg] = klass 35 | define_singleton_method(arg.to_sym){|*args| klass.new(*args) } 36 | define_singleton_method("__#{arg}__".to_sym){|*args| klass.new(*args) } 37 | end 38 | end 39 | 40 | def self.find(name) 41 | @@registered[name.to_s.downcase] 42 | end 43 | 44 | def self.find!(name) 45 | klass = find(name) 46 | 47 | if klass.nil? 48 | raise Spark::SerializeError, "Unknow serializer #{name}." 49 | end 50 | 51 | klass 52 | end 53 | 54 | def self.build(text=nil, &block) 55 | if block_given? 56 | class_eval(&block) 57 | else 58 | class_eval(text.to_s.downcase) 59 | end 60 | end 61 | 62 | end 63 | end 64 | 65 | # Parent 66 | require 'spark/serializer/base' 67 | 68 | # Basic 69 | require 'spark/serializer/oj' 70 | require 'spark/serializer/marshal' 71 | require 'spark/serializer/message_pack' 72 | require 'spark/serializer/text' 73 | 74 | # Others 75 | require 'spark/serializer/batched' 76 | require 'spark/serializer/auto_batched' 77 | require 'spark/serializer/compressed' 78 | require 'spark/serializer/pair' 79 | require 'spark/serializer/cartesian' 80 | -------------------------------------------------------------------------------- /lib/spark/serializer/auto_batched.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Serializer 3 | ## 4 | # AutoBatched serializator 5 | # 6 | # Batch size is computed automatically. Simillar to Python's AutoBatchedSerializer. 7 | # 8 | class AutoBatched < Batched 9 | 10 | MAX_RATIO = 10 11 | 12 | def initialize(serializer, best_size=65536) 13 | @serializer = serializer 14 | @best_size = best_size.to_i 15 | 16 | error('Batch size must be greater than 1') if @best_size < 2 17 | end 18 | 19 | def batched? 20 | true 21 | end 22 | 23 | def unbatch! 24 | end 25 | 26 | def name 27 | "AutoBatched(#{@best_size})" 28 | end 29 | 30 | def dump_to_io(data, io) 31 | check_each(data) 32 | 33 | # Only Array have .slice 34 | data = data.to_a 35 | 36 | index = 0 37 | batch = 2 38 | max = @best_size * MAX_RATIO 39 | 40 | loop do 41 | chunk = data.slice(index, batch) 42 | if chunk.nil? || chunk.empty? 43 | break 44 | end 45 | 46 | serialized = @serializer.dump(chunk) 47 | io.write_string(serialized) 48 | 49 | index += batch 50 | 51 | size = serialized.bytesize 52 | if size < @best_size 53 | batch *= 2 54 | elsif size > max && batch > 1 55 | batch /= 2 56 | end 57 | end 58 | 59 | io.flush 60 | end 61 | 62 | end 63 | end 64 | end 65 | 66 | Spark::Serializer.register('auto_batched', 'autobatched', Spark::Serializer::AutoBatched) 67 | -------------------------------------------------------------------------------- /lib/spark/serializer/base.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Serializer 3 | # @abstract Parent for all serializers 4 | class Base 5 | 6 | def load_from_io(io) 7 | return to_enum(__callee__, io) unless block_given? 
8 | 9 | loop do 10 | size = io.read_int_or_eof 11 | break if size == Spark::Constant::DATA_EOF 12 | 13 | yield load(io.read(size)) 14 | end 15 | end 16 | 17 | def load_from_file(file, *args) 18 | return to_enum(__callee__, file, *args) unless block_given? 19 | 20 | load_from_io(file, *args).each do |item| 21 | yield item 22 | end 23 | 24 | file.close 25 | file.unlink 26 | end 27 | 28 | def ==(other) 29 | self.to_s == other.to_s 30 | end 31 | 32 | def batched? 33 | false 34 | end 35 | 36 | def unbatch! 37 | end 38 | 39 | def check_each(data) 40 | unless data.respond_to?(:each) 41 | error('Data must be iterable.') 42 | end 43 | end 44 | 45 | def error(message) 46 | raise Spark::SerializeError, message 47 | end 48 | 49 | def name 50 | self.class.name.split('::').last 51 | end 52 | 53 | def to_s 54 | name 55 | end 56 | 57 | def inspect 58 | %{#} 59 | end 60 | 61 | end 62 | end 63 | end 64 | -------------------------------------------------------------------------------- /lib/spark/serializer/batched.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Serializer 3 | class Batched < Base 4 | 5 | attr_writer :serializer 6 | 7 | def initialize(serializer, batch_size=nil) 8 | batch_size ||= Spark::Serializer::DEFAULT_BATCH_SIZE 9 | 10 | @serializer = serializer 11 | @batch_size = batch_size.to_i 12 | 13 | error('Batch size must be greater than 0') if @batch_size < 1 14 | end 15 | 16 | # Really batched 17 | def batched? 18 | @batch_size > 1 19 | end 20 | 21 | def unbatch! 22 | @batch_size = 1 23 | end 24 | 25 | def load(data) 26 | @serializer.load(data) 27 | end 28 | 29 | def dump(data) 30 | @serializer.dump(data) 31 | end 32 | 33 | def name 34 | "Batched(#{@batch_size})" 35 | end 36 | 37 | def to_s 38 | "#{name} -> #{@serializer}" 39 | end 40 | 41 | 42 | # === Dump ============================================================== 43 | 44 | def dump_to_io(data, io) 45 | check_each(data) 46 | 47 | if batched? 48 | data = data.each_slice(@batch_size) 49 | end 50 | 51 | data.each do |item| 52 | serialized = dump(item) 53 | io.write_string(serialized) 54 | end 55 | 56 | io.flush 57 | end 58 | 59 | 60 | # === Load ============================================================== 61 | 62 | def load_from_io(io) 63 | return to_enum(__callee__, io) unless block_given? 64 | 65 | loop do 66 | size = io.read_int_or_eof 67 | break if size == Spark::Constant::DATA_EOF 68 | 69 | data = io.read(size) 70 | data = load(data) 71 | 72 | if batched? 
73 | data.each{|item| yield item } 74 | else 75 | yield data 76 | end 77 | end 78 | end 79 | 80 | end 81 | end 82 | end 83 | 84 | Spark::Serializer.register('batched', Spark::Serializer::Batched) 85 | -------------------------------------------------------------------------------- /lib/spark/serializer/cartesian.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Serializer 3 | class Cartesian < Pair 4 | 5 | def aggregate(item1, item2) 6 | item1.product(item2) 7 | end 8 | 9 | end 10 | end 11 | end 12 | 13 | Spark::Serializer.register('cartesian', Spark::Serializer::Cartesian) 14 | -------------------------------------------------------------------------------- /lib/spark/serializer/compressed.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Serializer 3 | class Compressed < Base 4 | 5 | def initialize(serializer) 6 | @serializer = serializer 7 | end 8 | 9 | def dump(data) 10 | Zlib::Deflate.deflate(@serializer.dump(data)) 11 | end 12 | 13 | def load(data) 14 | @serializer.load(Zlib::Inflate.inflate(data)) 15 | end 16 | 17 | end 18 | end 19 | end 20 | 21 | begin 22 | # TODO: require only if it is necessary 23 | require 'zlib' 24 | 25 | Spark::Serializer.register('compress', 'compressed', Spark::Serializer::Compressed) 26 | rescue LoadError 27 | end 28 | -------------------------------------------------------------------------------- /lib/spark/serializer/marshal.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Serializer 3 | class Marshal < Base 4 | 5 | def dump(data) 6 | ::Marshal.dump(data) 7 | end 8 | 9 | def load(data) 10 | ::Marshal.load(data) 11 | end 12 | 13 | end 14 | end 15 | end 16 | 17 | Spark::Serializer.register('marshal', Spark::Serializer::Marshal) 18 | -------------------------------------------------------------------------------- /lib/spark/serializer/message_pack.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Serializer 3 | class MessagePack < Base 4 | 5 | def dump(data) 6 | ::MessagePack.dump(data) 7 | end 8 | 9 | def load(data) 10 | ::MessagePack.load(data) 11 | end 12 | 13 | end 14 | end 15 | end 16 | 17 | begin 18 | # TODO: require only if it is necessary 19 | require 'msgpack' 20 | 21 | Spark::Serializer.register('messagepack', 'message_pack', 'msgpack', 'msg_pack', Spark::Serializer::MessagePack) 22 | rescue LoadError 23 | end 24 | -------------------------------------------------------------------------------- /lib/spark/serializer/oj.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Serializer 3 | class Oj < Base 4 | 5 | def dump(data) 6 | ::Oj.dump(data) 7 | end 8 | 9 | def load(data) 10 | ::Oj.load(data) 11 | end 12 | 13 | end 14 | end 15 | end 16 | 17 | begin 18 | # TODO: require only if it is necessary 19 | require 'oj' 20 | 21 | Spark::Serializer.register('oj', Spark::Serializer::Oj) 22 | rescue LoadError 23 | end 24 | -------------------------------------------------------------------------------- /lib/spark/serializer/pair.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Serializer 3 | class Pair < Base 4 | 5 | def initialize(serializer1, serializer2) 6 | @serializer1 = serializer1 7 | @serializer2 = serializer2 8 | end 9 | 10 | def to_s 11 | "#{name}(#{@serializer1}, 
#{@serializer2})" 12 | end 13 | 14 | def aggregate(item1, item2) 15 | item1.zip(item2) 16 | end 17 | 18 | def load_from_io(io) 19 | return to_enum(__callee__, io) unless block_given? 20 | 21 | loop do 22 | size = io.read_int_or_eof 23 | break if size == Spark::Constant::DATA_EOF 24 | 25 | item1 = @serializer1.load(io.read(size)) 26 | item2 = @serializer2.load(io.read_string) 27 | 28 | item1 = [item1] unless @serializer1.batched? 29 | item2 = [item2] unless @serializer2.batched? 30 | 31 | aggregate(item1, item2).each do |item| 32 | yield item 33 | end 34 | end 35 | end 36 | 37 | end 38 | end 39 | end 40 | 41 | Spark::Serializer.register('pair', Spark::Serializer::Pair) 42 | -------------------------------------------------------------------------------- /lib/spark/serializer/text.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module Serializer 3 | class Text < Base 4 | 5 | attr_reader :encoding 6 | 7 | def initialize(encoding=Encoding::UTF_8) 8 | error('Encoding must be an instance of Encoding') unless encoding.is_a?(Encoding) 9 | 10 | @encoding = encoding 11 | end 12 | 13 | def load(data) 14 | data.to_s.force_encoding(@encoding) 15 | end 16 | 17 | def to_s 18 | "Text(#{@encoding})" 19 | end 20 | 21 | end 22 | end 23 | end 24 | 25 | Spark::Serializer.register('string', 'text', Spark::Serializer::Text) 26 | -------------------------------------------------------------------------------- /lib/spark/sql.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module SQL 3 | extend Spark::Library 4 | 5 | autoload_without_import :Context, 'spark/sql/context' 6 | autoload_without_import :DataType, 'spark/sql/data_type' 7 | autoload_without_import :DataFrame, 'spark/sql/data_frame' 8 | autoload_without_import :DataFrameReader, 'spark/sql/data_frame_reader' 9 | 10 | autoload :Row, 'spark/sql/row' 11 | autoload :Column, 'spark/sql/column' 12 | 13 | # Types 14 | autoload :StructType, 'spark/sql/data_type' 15 | autoload :StructField, 'spark/sql/data_type' 16 | autoload :AtomicType, 'spark/sql/data_type' 17 | autoload :NumericType, 'spark/sql/data_type' 18 | autoload :IntegralType, 'spark/sql/data_type' 19 | autoload :StringType, 'spark/sql/data_type' 20 | autoload :LongType, 'spark/sql/data_type' 21 | end 22 | 23 | SQLContext = Spark::SQL::Context 24 | end 25 | -------------------------------------------------------------------------------- /lib/spark/sql/context.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module SQL 3 | class Context 4 | 5 | attr_reader :spark_context, :jsql_context 6 | 7 | def initialize(spark_context) 8 | @spark_context = spark_context 9 | @jsql_context = JSQLContext.new(spark_context.sc) 10 | end 11 | 12 | def read 13 | DataFrameReader.new(self) 14 | end 15 | 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /lib/spark/sql/data_frame_reader.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module SQL 3 | class DataFrameReader 4 | 5 | attr_reader :sql_context, :jreader 6 | 7 | def initialize(sql_context) 8 | @sql_context = sql_context 9 | @jreader = sql_context.jsql_context.read 10 | end 11 | 12 | def df(jdf) 13 | DataFrame.new(jdf, sql_context) 14 | end 15 | 16 | # Specifies the input data source format. 17 | # Parameter is name of the data source, e.g. 'json', 'parquet'. 
18 | def format(source) 19 | jreader.format(source) 20 | self 21 | end 22 | 23 | # Adds an input option for the underlying data source. 24 | def option(key, value) 25 | jreader.option(key, value.to_s) 26 | self 27 | end 28 | 29 | # Adds input options for the underlying data source. 30 | def options(options) 31 | options.each do |key, value| 32 | jreader.option(key, value.to_s) 33 | end 34 | self 35 | end 36 | 37 | # Loads data from a data source and returns it as a :class`DataFrame`. 38 | # 39 | # == Parameters: 40 | # path:: Optional string for file-system backed data sources. 41 | # format:: Optional string for format of the data source. Default to 'parquet'. 42 | # schema:: Optional {StructType} for the input schema. 43 | # options:: All other string options. 44 | # 45 | def load(path=nil, new_format=nil, new_schema=nil, new_options=nil) 46 | new_format && format(new_format) 47 | new_schema && schema(new_schema) 48 | new_options && options(new_options) 49 | 50 | if path.nil? 51 | df(jreader.load) 52 | else 53 | df(jreader.load(path)) 54 | end 55 | end 56 | 57 | # Specifies the input schema. 58 | # 59 | # Some data sources (e.g. JSON) can infer the input schema automatically from data. 60 | # By specifying the schema here, the underlying data source can skip the schema 61 | # inference step, and thus speed up data loading. 62 | # 63 | # Parameter schema must be StructType object. 64 | # 65 | def schema(new_schema) 66 | unless new_schema.is_a?(StructType) 67 | raise ArgumentError, 'Schema must be a StructType.' 68 | end 69 | 70 | jschema = sql_context.jsql_context.parseDataType(new_schema.json) 71 | jreader.schema(jschema) 72 | self 73 | end 74 | 75 | # Loads a JSON file (one object per line) and returns the result as {DataFrame} 76 | # 77 | # If the schema parameter is not specified, this function goes 78 | # through the input once to determine the input schema. 79 | # 80 | # == Parameters: 81 | # path:: string, path to the JSON dataset 82 | # schema:: an optional {StructType} for the input schema. 83 | # 84 | # == Example: 85 | # df = sql.read.json('people.json') 86 | # df.dtypes 87 | # # => [('age', 'bigint'), ('name', 'string')] 88 | # 89 | def json(path, new_schema=nil) 90 | # ClassNotFoundException: Failed to load class for data source: json 91 | # df(jreader.json(path)) 92 | 93 | load(path, 'org.apache.spark.sql.execution.datasources.json', new_schema) 94 | end 95 | 96 | end 97 | end 98 | end 99 | -------------------------------------------------------------------------------- /lib/spark/sql/row.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | module SQL 3 | ## 4 | # Spark::SQL::Row 5 | # 6 | class Row 7 | attr_reader :data 8 | 9 | def self.from_java(object, with_schema=true) 10 | if with_schema 11 | fields = object.schema.fieldNames 12 | else 13 | # Create virtual schema (t0, t1, t2, ...) 
14 | raise Spark::NotImplemented, 'Row must have a schema' 15 | end 16 | 17 | if object.anyNull 18 | data = {} 19 | object.size.times do |i| 20 | if object.isNullAt(i) 21 | value = nil 22 | else 23 | value = Spark.jb.to_ruby(object.get(i)) 24 | end 25 | 26 | data[ fields[i] ] = value 27 | end 28 | else 29 | data = fields.zip(Spark.jb.to_ruby(object.values)) 30 | end 31 | 32 | Row.new(data) 33 | end 34 | 35 | def initialize(data={}) 36 | @data = data.to_h 37 | end 38 | 39 | def [](item) 40 | @data[item] 41 | end 42 | 43 | def to_h 44 | @data 45 | end 46 | 47 | def inspect 48 | formated = data.map do |key, value| 49 | "#{key}: \"#{value}\"" 50 | end 51 | 52 | %{#} 53 | end 54 | 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /lib/spark/stat_counter.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | class StatCounter 3 | 4 | attr_reader :n # count of our values 5 | attr_reader :mu # mean of our values 6 | attr_reader :m2 # variance numerator (sum of (x - mean)^2) 7 | attr_reader :max # max of our values 8 | attr_reader :min # min of our values 9 | 10 | def initialize(iterator) 11 | @n = 0 12 | @mu = 0.0 13 | @m2 = 0.0 14 | @max = -Float::INFINITY 15 | @min = Float::INFINITY 16 | 17 | merge(iterator) 18 | end 19 | 20 | def merge(other) 21 | if other.is_a?(Spark::StatCounter) 22 | merge_stat_counter(other) 23 | elsif other.respond_to?(:each) 24 | merge_array(other) 25 | else 26 | merge_value(other) 27 | end 28 | 29 | self 30 | end 31 | 32 | def sum 33 | @n * @mu 34 | end 35 | 36 | # Return the variance of the values. 37 | def variance 38 | if @n == 0 39 | Float::NAN 40 | else 41 | @m2 / @n 42 | end 43 | end 44 | 45 | # Return the sample variance, which corrects for bias in estimating the variance by dividing 46 | # by N-1 instead of N. 47 | def sample_variance 48 | if @n <= 1 49 | Float::NAN 50 | else 51 | @m2 / (@n - 1) 52 | end 53 | end 54 | 55 | # Return the standard deviation of the values. 56 | def stdev 57 | Math.sqrt(variance) 58 | end 59 | 60 | # Return the sample standard deviation of the values, which corrects for bias in estimating the 61 | # variance by dividing by N-1 instead of N. 
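# A plain-Ruby check of the incremental update StatCounter relies on (see
# merge_value further below): mu and m2 are maintained online, and at the end
# m2 / n matches the population variance computed directly. The values array
# is illustrative.
values = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]

n, mu, m2 = 0, 0.0, 0.0
values.each do |x|
  delta = x - mu
  n    += 1
  mu   += delta / n
  m2   += delta * (x - mu)
end

mu            # ≈ 5.0
m2 / n        # ≈ 4.0  (population variance)
m2 / (n - 1)  # ≈ 4.57 (sample variance, the N-1 correction described above)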
62 | def sample_stdev 63 | Math.sqrt(sample_variance) 64 | end 65 | 66 | def to_s 67 | "(count: #{count}, mean: #{mean}, stdev: #{stdev}, max: #{max}, min: #{min})" 68 | end 69 | 70 | alias_method :count, :n 71 | alias_method :mean, :mu 72 | alias_method :max_value, :max 73 | alias_method :min_value, :min 74 | alias_method :sampleStdev, :sample_stdev 75 | alias_method :sampleVariance, :sample_variance 76 | 77 | private 78 | 79 | def merge_stat_counter(other) 80 | if other == self 81 | other = self.deep_copy 82 | end 83 | 84 | if @n == 0 85 | @n = other.n 86 | @mu = other.mu 87 | @m2 = other.m2 88 | @max = other.max 89 | @min = other.min 90 | elsif other.n != 0 91 | delta = other.mu - @mu 92 | 93 | if other.n * 10 < @n 94 | @mu = @mu + (delta * other.n) / (@n + other.n) 95 | elsif @n * 10 < other.n 96 | @mu = other.mu - (delta * @n) / (@n + other.n) 97 | else 98 | @mu = (@mu * @n + other.mu * other.n) / (@n + other.n) 99 | end 100 | 101 | @max = [@max, other.max].max 102 | @min = [@min, other.min].min 103 | 104 | @m2 += other.m2 + (delta * delta * @n * other.n) / (@n + other.n) 105 | @n += other.n 106 | end 107 | end 108 | 109 | def merge_array(array) 110 | array.each do |item| 111 | merge_value(item) 112 | end 113 | end 114 | 115 | def merge_value(value) 116 | delta = value - @mu 117 | @n += 1 118 | @mu += delta / @n 119 | @m2 += delta * (value - @mu) 120 | @max = [@max, value].max 121 | @min = [@min, value].min 122 | end 123 | 124 | end 125 | end 126 | -------------------------------------------------------------------------------- /lib/spark/storage_level.rb: -------------------------------------------------------------------------------- 1 | # Necessary libraries 2 | Spark.load_lib 3 | 4 | module Spark 5 | class StorageLevel 6 | 7 | def self.reload 8 | return if @reloaded 9 | reload! 10 | @reloaded = true 11 | end 12 | 13 | def self.reload! 
14 | self.const_set(:NONE, JStorageLevel.NONE) 15 | self.const_set(:DISK_ONLY, JStorageLevel.DISK_ONLY) 16 | self.const_set(:DISK_ONLY_2, JStorageLevel.DISK_ONLY_2) 17 | self.const_set(:MEMORY_ONLY, JStorageLevel.MEMORY_ONLY) 18 | self.const_set(:MEMORY_ONLY_SER, JStorageLevel.MEMORY_ONLY_SER) 19 | self.const_set(:MEMORY_ONLY_2, JStorageLevel.MEMORY_ONLY_2) 20 | self.const_set(:MEMORY_ONLY_SER_2, JStorageLevel.MEMORY_ONLY_SER_2) 21 | self.const_set(:MEMORY_AND_DISK, JStorageLevel.MEMORY_AND_DISK) 22 | self.const_set(:MEMORY_AND_DISK_2, JStorageLevel.MEMORY_AND_DISK_2) 23 | self.const_set(:MEMORY_AND_DISK_SER, JStorageLevel.MEMORY_AND_DISK_SER) 24 | self.const_set(:MEMORY_AND_DISK_SER_2, JStorageLevel.MEMORY_AND_DISK_SER_2) 25 | self.const_set(:OFF_HEAP, JStorageLevel.OFF_HEAP) 26 | end 27 | 28 | def self.java_get(arg) 29 | reload 30 | 31 | if arg.is_a?(String) 32 | const_get(arg.upcase) 33 | else 34 | arg 35 | end 36 | end 37 | 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /lib/spark/version.rb: -------------------------------------------------------------------------------- 1 | module Spark 2 | VERSION = '1.2.1' 3 | end 4 | -------------------------------------------------------------------------------- /lib/spark/worker/master.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | $PROGRAM_NAME = 'RubySparkMaster' 4 | 5 | require 'socket' 6 | require 'io/wait' 7 | require 'nio' 8 | 9 | require_relative 'worker' 10 | 11 | # New process group 12 | # Otherwise master can be killed from pry console 13 | Process.setsid 14 | 15 | # ================================================================================================= 16 | # Master 17 | # 18 | module Master 19 | 20 | def self.create 21 | case ARGV[0].to_s.strip 22 | when 'thread' 23 | Master::Thread.new 24 | else 25 | Master::Process.new 26 | end 27 | end 28 | 29 | class Base 30 | include Spark::Constant 31 | 32 | def initialize 33 | @port = ARGV[1].to_s.strip.to_i 34 | @socket = TCPSocket.open('localhost', @port) 35 | @worker_arguments = @socket.read_string 36 | end 37 | 38 | def run 39 | selector = NIO::Selector.new 40 | monitor = selector.register(@socket, :r) 41 | monitor.value = Proc.new { receive_message } 42 | loop { 43 | selector.select {|monitor| monitor.value.call} 44 | } 45 | end 46 | 47 | def receive_message 48 | command = @socket.read_int 49 | 50 | case command 51 | when CREATE_WORKER 52 | create_worker 53 | when KILL_WORKER 54 | kill_worker 55 | when KILL_WORKER_AND_WAIT 56 | kill_worker_and_wait 57 | end 58 | end 59 | 60 | def kill_worker_and_wait 61 | if kill_worker 62 | @socket.write_int(SUCCESSFULLY_KILLED) 63 | else 64 | @socket.write_int(UNSUCCESSFUL_KILLING) 65 | end 66 | end 67 | end 68 | 69 | # =============================================================================================== 70 | # Worker::Process 71 | # 72 | class Process < Base 73 | 74 | def create_worker 75 | if fork? 76 | pid = ::Process.fork do 77 | Worker::Process.new(@port).run 78 | end 79 | else 80 | pid = ::Process.spawn("ruby #{@worker_arguments} worker.rb #{@port}") 81 | end 82 | 83 | # Detach child from master to avoid zombie process 84 | ::Process.detach(pid) 85 | end 86 | 87 | def kill_worker 88 | worker_id = @socket.read_long 89 | ::Process.kill('TERM', worker_id) 90 | rescue 91 | nil 92 | end 93 | 94 | def fork? 95 | @can_fork ||= _fork? 96 | end 97 | 98 | def _fork? 
99 | return false if !::Process.respond_to?(:fork) 100 | 101 | pid = ::Process.fork 102 | exit unless pid # exit the child immediately 103 | true 104 | rescue NotImplementedError 105 | false 106 | end 107 | 108 | end 109 | 110 | # =============================================================================================== 111 | # Worker::Thread 112 | # 113 | class Thread < Base 114 | 115 | def initialize 116 | ::Thread.abort_on_exception = true 117 | 118 | # For synchronous access to socket IO 119 | $mutex_for_command = Mutex.new 120 | $mutex_for_iterator = Mutex.new 121 | 122 | super 123 | end 124 | 125 | def create_worker 126 | ::Thread.new do 127 | Worker::Thread.new(@port).run 128 | end 129 | end 130 | 131 | def kill_worker 132 | worker_id = @socket.read_long 133 | 134 | thread = ObjectSpace._id2ref(worker_id) 135 | thread.kill 136 | rescue 137 | nil 138 | end 139 | 140 | end 141 | end 142 | 143 | # Create proper master by worker_type 144 | Master.create.run 145 | -------------------------------------------------------------------------------- /lib/spark/worker/spark_files.rb: -------------------------------------------------------------------------------- 1 | class SparkFiles 2 | 3 | class << self 4 | attr_accessor :root_directory 5 | end 6 | 7 | def self.get(file_name) 8 | File.join(root_directory, file_name) 9 | end 10 | 11 | def self.get_content(file_name) 12 | File.read(get(file_name)) 13 | end 14 | 15 | end 16 | -------------------------------------------------------------------------------- /ruby-spark.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | lib = File.expand_path('../lib', __FILE__) 4 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 5 | 6 | require 'spark/version' 7 | 8 | Gem::Specification.new do |spec| 9 | spec.name = 'ruby-spark' 10 | spec.version = Spark::VERSION 11 | spec.authors = ['Ondřej Moravčík'] 12 | spec.email = ['moravcik.ondrej@gmail.com'] 13 | spec.summary = %q{Ruby wrapper for Apache Spark} 14 | spec.description = %q{} 15 | spec.homepage = '' 16 | spec.license = 'MIT' 17 | 18 | spec.files = `git ls-files -z`.split("\x0") 19 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 20 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 21 | spec.require_paths = ['lib'] 22 | 23 | if RUBY_PLATFORM =~ /java/ 24 | spec.platform = 'java' 25 | 26 | extensions = ['ext/ruby_java/extconf.rb'] 27 | else 28 | extensions = ['ext/ruby_c/extconf.rb'] 29 | 30 | spec.add_dependency 'rjb' 31 | end 32 | 33 | spec.extensions = extensions 34 | spec.required_ruby_version = '>= 2.0' 35 | 36 | spec.requirements << 'java, scala' 37 | 38 | spec.add_dependency 'sourcify', '0.6.0.rc4' 39 | spec.add_dependency 'method_source' 40 | spec.add_dependency 'commander' 41 | spec.add_dependency 'pry' 42 | spec.add_dependency 'nio4r' 43 | spec.add_dependency 'distribution' 44 | 45 | spec.add_development_dependency 'bundler', '~> 1.6' 46 | spec.add_development_dependency 'rake' 47 | end 48 | -------------------------------------------------------------------------------- /spec/generator.rb: -------------------------------------------------------------------------------- 1 | class Generator 2 | def self.numbers(size=1000) 3 | Array.new(size){ rand(1..1000) } 4 | end 5 | 6 | def self.numbers_with_zero(size=1000) 7 | Array.new(size){ rand(0..1000) } 8 | end 9 | 10 | def self.words(size=1000) 11 | Array.new(size) { word } 12 | end 13 | 14 | def self.word(size=10) 15 | 
Array.new(rand(1..size)){(97+rand(26)).chr}.join 16 | end 17 | 18 | def self.lines(size=1000, letters=3) 19 | Array.new(size) do 20 | Array.new(rand(50..100)){ 21 | (97+rand(letters)).chr + (' ' * (rand(10) == 0 ? 1 : 0)) 22 | }.join 23 | end 24 | end 25 | 26 | def self.hash(size=1000) 27 | Array.new(size) do 28 | [word(2), rand(1..10)] 29 | end 30 | end 31 | 32 | def self.hash_with_values(size=1000, values_count=10) 33 | Array.new(size) do 34 | [word(2), Array.new(values_count) { rand(1..10) }] 35 | end 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /spec/inputs/numbers/1.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | 11 12 | 12 13 | 13 14 | 14 15 | 15 16 | 16 17 | 17 18 | 18 19 | 19 20 | 20 21 | 21 22 | 22 23 | 23 24 | 24 25 | 25 26 | 26 27 | 27 28 | 28 29 | 29 30 | 30 31 | 31 32 | 32 33 | 33 34 | 34 35 | 35 36 | 36 37 | 37 38 | 38 39 | 39 40 | 40 41 | 41 42 | 42 43 | 43 44 | 44 45 | 45 46 | 46 47 | 47 48 | 48 49 | 49 50 | 50 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/10.txt: -------------------------------------------------------------------------------- 1 | 451 2 | 452 3 | 453 4 | 454 5 | 455 6 | 456 7 | 457 8 | 458 9 | 459 10 | 460 11 | 461 12 | 462 13 | 463 14 | 464 15 | 465 16 | 466 17 | 467 18 | 468 19 | 469 20 | 470 21 | 471 22 | 472 23 | 473 24 | 474 25 | 475 26 | 476 27 | 477 28 | 478 29 | 479 30 | 480 31 | 481 32 | 482 33 | 483 34 | 484 35 | 485 36 | 486 37 | 487 38 | 488 39 | 489 40 | 490 41 | 491 42 | 492 43 | 493 44 | 494 45 | 495 46 | 496 47 | 497 48 | 498 49 | 499 50 | 500 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/11.txt: -------------------------------------------------------------------------------- 1 | 501 2 | 502 3 | 503 4 | 504 5 | 505 6 | 506 7 | 507 8 | 508 9 | 509 10 | 510 11 | 511 12 | 512 13 | 513 14 | 514 15 | 515 16 | 516 17 | 517 18 | 518 19 | 519 20 | 520 21 | 521 22 | 522 23 | 523 24 | 524 25 | 525 26 | 526 27 | 527 28 | 528 29 | 529 30 | 530 31 | 531 32 | 532 33 | 533 34 | 534 35 | 535 36 | 536 37 | 537 38 | 538 39 | 539 40 | 540 41 | 541 42 | 542 43 | 543 44 | 544 45 | 545 46 | 546 47 | 547 48 | 548 49 | 549 50 | 550 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/12.txt: -------------------------------------------------------------------------------- 1 | 551 2 | 552 3 | 553 4 | 554 5 | 555 6 | 556 7 | 557 8 | 558 9 | 559 10 | 560 11 | 561 12 | 562 13 | 563 14 | 564 15 | 565 16 | 566 17 | 567 18 | 568 19 | 569 20 | 570 21 | 571 22 | 572 23 | 573 24 | 574 25 | 575 26 | 576 27 | 577 28 | 578 29 | 579 30 | 580 31 | 581 32 | 582 33 | 583 34 | 584 35 | 585 36 | 586 37 | 587 38 | 588 39 | 589 40 | 590 41 | 591 42 | 592 43 | 593 44 | 594 45 | 595 46 | 596 47 | 597 48 | 598 49 | 599 50 | 600 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/13.txt: -------------------------------------------------------------------------------- 1 | 601 2 | 602 3 | 603 4 | 604 5 | 605 6 | 606 7 | 607 8 | 608 9 | 609 10 | 610 11 | 611 12 | 612 13 | 613 14 | 614 15 | 615 16 | 616 17 | 617 18 | 618 19 | 619 20 | 620 21 | 621 22 | 622 23 | 623 24 | 624 25 | 625 26 | 626 27 | 627 28 | 628 29 | 629 30 | 630 31 | 631 32 | 632 33 | 633 34 | 634 35 | 635 36 | 636 37 | 637 38 | 638 39 | 639 
40 | 640 41 | 641 42 | 642 43 | 643 44 | 644 45 | 645 46 | 646 47 | 647 48 | 648 49 | 649 50 | 650 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/14.txt: -------------------------------------------------------------------------------- 1 | 651 2 | 652 3 | 653 4 | 654 5 | 655 6 | 656 7 | 657 8 | 658 9 | 659 10 | 660 11 | 661 12 | 662 13 | 663 14 | 664 15 | 665 16 | 666 17 | 667 18 | 668 19 | 669 20 | 670 21 | 671 22 | 672 23 | 673 24 | 674 25 | 675 26 | 676 27 | 677 28 | 678 29 | 679 30 | 680 31 | 681 32 | 682 33 | 683 34 | 684 35 | 685 36 | 686 37 | 687 38 | 688 39 | 689 40 | 690 41 | 691 42 | 692 43 | 693 44 | 694 45 | 695 46 | 696 47 | 697 48 | 698 49 | 699 50 | 700 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/15.txt: -------------------------------------------------------------------------------- 1 | 701 2 | 702 3 | 703 4 | 704 5 | 705 6 | 706 7 | 707 8 | 708 9 | 709 10 | 710 11 | 711 12 | 712 13 | 713 14 | 714 15 | 715 16 | 716 17 | 717 18 | 718 19 | 719 20 | 720 21 | 721 22 | 722 23 | 723 24 | 724 25 | 725 26 | 726 27 | 727 28 | 728 29 | 729 30 | 730 31 | 731 32 | 732 33 | 733 34 | 734 35 | 735 36 | 736 37 | 737 38 | 738 39 | 739 40 | 740 41 | 741 42 | 742 43 | 743 44 | 744 45 | 745 46 | 746 47 | 747 48 | 748 49 | 749 50 | 750 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/16.txt: -------------------------------------------------------------------------------- 1 | 751 2 | 752 3 | 753 4 | 754 5 | 755 6 | 756 7 | 757 8 | 758 9 | 759 10 | 760 11 | 761 12 | 762 13 | 763 14 | 764 15 | 765 16 | 766 17 | 767 18 | 768 19 | 769 20 | 770 21 | 771 22 | 772 23 | 773 24 | 774 25 | 775 26 | 776 27 | 777 28 | 778 29 | 779 30 | 780 31 | 781 32 | 782 33 | 783 34 | 784 35 | 785 36 | 786 37 | 787 38 | 788 39 | 789 40 | 790 41 | 791 42 | 792 43 | 793 44 | 794 45 | 795 46 | 796 47 | 797 48 | 798 49 | 799 50 | 800 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/17.txt: -------------------------------------------------------------------------------- 1 | 801 2 | 802 3 | 803 4 | 804 5 | 805 6 | 806 7 | 807 8 | 808 9 | 809 10 | 810 11 | 811 12 | 812 13 | 813 14 | 814 15 | 815 16 | 816 17 | 817 18 | 818 19 | 819 20 | 820 21 | 821 22 | 822 23 | 823 24 | 824 25 | 825 26 | 826 27 | 827 28 | 828 29 | 829 30 | 830 31 | 831 32 | 832 33 | 833 34 | 834 35 | 835 36 | 836 37 | 837 38 | 838 39 | 839 40 | 840 41 | 841 42 | 842 43 | 843 44 | 844 45 | 845 46 | 846 47 | 847 48 | 848 49 | 849 50 | 850 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/18.txt: -------------------------------------------------------------------------------- 1 | 851 2 | 852 3 | 853 4 | 854 5 | 855 6 | 856 7 | 857 8 | 858 9 | 859 10 | 860 11 | 861 12 | 862 13 | 863 14 | 864 15 | 865 16 | 866 17 | 867 18 | 868 19 | 869 20 | 870 21 | 871 22 | 872 23 | 873 24 | 874 25 | 875 26 | 876 27 | 877 28 | 878 29 | 879 30 | 880 31 | 881 32 | 882 33 | 883 34 | 884 35 | 885 36 | 886 37 | 887 38 | 888 39 | 889 40 | 890 41 | 891 42 | 892 43 | 893 44 | 894 45 | 895 46 | 896 47 | 897 48 | 898 49 | 899 50 | 900 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/19.txt: -------------------------------------------------------------------------------- 1 | 901 2 | 902 3 | 903 4 | 904 5 | 905 6 | 906 7 | 907 8 | 908 9 | 909 10 
| 910 11 | 911 12 | 912 13 | 913 14 | 914 15 | 915 16 | 916 17 | 917 18 | 918 19 | 919 20 | 920 21 | 921 22 | 922 23 | 923 24 | 924 25 | 925 26 | 926 27 | 927 28 | 928 29 | 929 30 | 930 31 | 931 32 | 932 33 | 933 34 | 934 35 | 935 36 | 936 37 | 937 38 | 938 39 | 939 40 | 940 41 | 941 42 | 942 43 | 943 44 | 944 45 | 945 46 | 946 47 | 947 48 | 948 49 | 949 50 | 950 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/2.txt: -------------------------------------------------------------------------------- 1 | 51 2 | 52 3 | 53 4 | 54 5 | 55 6 | 56 7 | 57 8 | 58 9 | 59 10 | 60 11 | 61 12 | 62 13 | 63 14 | 64 15 | 65 16 | 66 17 | 67 18 | 68 19 | 69 20 | 70 21 | 71 22 | 72 23 | 73 24 | 74 25 | 75 26 | 76 27 | 77 28 | 78 29 | 79 30 | 80 31 | 81 32 | 82 33 | 83 34 | 84 35 | 85 36 | 86 37 | 87 38 | 88 39 | 89 40 | 90 41 | 91 42 | 92 43 | 93 44 | 94 45 | 95 46 | 96 47 | 97 48 | 98 49 | 99 50 | 100 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/20.txt: -------------------------------------------------------------------------------- 1 | 951 2 | 952 3 | 953 4 | 954 5 | 955 6 | 956 7 | 957 8 | 958 9 | 959 10 | 960 11 | 961 12 | 962 13 | 963 14 | 964 15 | 965 16 | 966 17 | 967 18 | 968 19 | 969 20 | 970 21 | 971 22 | 972 23 | 973 24 | 974 25 | 975 26 | 976 27 | 977 28 | 978 29 | 979 30 | 980 31 | 981 32 | 982 33 | 983 34 | 984 35 | 985 36 | 986 37 | 987 38 | 988 39 | 989 40 | 990 41 | 991 42 | 992 43 | 993 44 | 994 45 | 995 46 | 996 47 | 997 48 | 998 49 | 999 50 | 1000 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/3.txt: -------------------------------------------------------------------------------- 1 | 101 2 | 102 3 | 103 4 | 104 5 | 105 6 | 106 7 | 107 8 | 108 9 | 109 10 | 110 11 | 111 12 | 112 13 | 113 14 | 114 15 | 115 16 | 116 17 | 117 18 | 118 19 | 119 20 | 120 21 | 121 22 | 122 23 | 123 24 | 124 25 | 125 26 | 126 27 | 127 28 | 128 29 | 129 30 | 130 31 | 131 32 | 132 33 | 133 34 | 134 35 | 135 36 | 136 37 | 137 38 | 138 39 | 139 40 | 140 41 | 141 42 | 142 43 | 143 44 | 144 45 | 145 46 | 146 47 | 147 48 | 148 49 | 149 50 | 150 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/4.txt: -------------------------------------------------------------------------------- 1 | 151 2 | 152 3 | 153 4 | 154 5 | 155 6 | 156 7 | 157 8 | 158 9 | 159 10 | 160 11 | 161 12 | 162 13 | 163 14 | 164 15 | 165 16 | 166 17 | 167 18 | 168 19 | 169 20 | 170 21 | 171 22 | 172 23 | 173 24 | 174 25 | 175 26 | 176 27 | 177 28 | 178 29 | 179 30 | 180 31 | 181 32 | 182 33 | 183 34 | 184 35 | 185 36 | 186 37 | 187 38 | 188 39 | 189 40 | 190 41 | 191 42 | 192 43 | 193 44 | 194 45 | 195 46 | 196 47 | 197 48 | 198 49 | 199 50 | 200 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/5.txt: -------------------------------------------------------------------------------- 1 | 201 2 | 202 3 | 203 4 | 204 5 | 205 6 | 206 7 | 207 8 | 208 9 | 209 10 | 210 11 | 211 12 | 212 13 | 213 14 | 214 15 | 215 16 | 216 17 | 217 18 | 218 19 | 219 20 | 220 21 | 221 22 | 222 23 | 223 24 | 224 25 | 225 26 | 226 27 | 227 28 | 228 29 | 229 30 | 230 31 | 231 32 | 232 33 | 233 34 | 234 35 | 235 36 | 236 37 | 237 38 | 238 39 | 239 40 | 240 41 | 241 42 | 242 43 | 243 44 | 244 45 | 245 46 | 246 47 | 247 48 | 248 49 | 249 50 | 250 51 | 
-------------------------------------------------------------------------------- /spec/inputs/numbers/6.txt: -------------------------------------------------------------------------------- 1 | 251 2 | 252 3 | 253 4 | 254 5 | 255 6 | 256 7 | 257 8 | 258 9 | 259 10 | 260 11 | 261 12 | 262 13 | 263 14 | 264 15 | 265 16 | 266 17 | 267 18 | 268 19 | 269 20 | 270 21 | 271 22 | 272 23 | 273 24 | 274 25 | 275 26 | 276 27 | 277 28 | 278 29 | 279 30 | 280 31 | 281 32 | 282 33 | 283 34 | 284 35 | 285 36 | 286 37 | 287 38 | 288 39 | 289 40 | 290 41 | 291 42 | 292 43 | 293 44 | 294 45 | 295 46 | 296 47 | 297 48 | 298 49 | 299 50 | 300 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/7.txt: -------------------------------------------------------------------------------- 1 | 301 2 | 302 3 | 303 4 | 304 5 | 305 6 | 306 7 | 307 8 | 308 9 | 309 10 | 310 11 | 311 12 | 312 13 | 313 14 | 314 15 | 315 16 | 316 17 | 317 18 | 318 19 | 319 20 | 320 21 | 321 22 | 322 23 | 323 24 | 324 25 | 325 26 | 326 27 | 327 28 | 328 29 | 329 30 | 330 31 | 331 32 | 332 33 | 333 34 | 334 35 | 335 36 | 336 37 | 337 38 | 338 39 | 339 40 | 340 41 | 341 42 | 342 43 | 343 44 | 344 45 | 345 46 | 346 47 | 347 48 | 348 49 | 349 50 | 350 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/8.txt: -------------------------------------------------------------------------------- 1 | 351 2 | 352 3 | 353 4 | 354 5 | 355 6 | 356 7 | 357 8 | 358 9 | 359 10 | 360 11 | 361 12 | 362 13 | 363 14 | 364 15 | 365 16 | 366 17 | 367 18 | 368 19 | 369 20 | 370 21 | 371 22 | 372 23 | 373 24 | 374 25 | 375 26 | 376 27 | 377 28 | 378 29 | 379 30 | 380 31 | 381 32 | 382 33 | 383 34 | 384 35 | 385 36 | 386 37 | 387 38 | 388 39 | 389 40 | 390 41 | 391 42 | 392 43 | 393 44 | 394 45 | 395 46 | 396 47 | 397 48 | 398 49 | 399 50 | 400 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers/9.txt: -------------------------------------------------------------------------------- 1 | 401 2 | 402 3 | 403 4 | 404 5 | 405 6 | 406 7 | 407 8 | 408 9 | 409 10 | 410 11 | 411 12 | 412 13 | 413 14 | 414 15 | 415 16 | 416 17 | 417 18 | 418 19 | 419 20 | 420 21 | 421 22 | 422 23 | 423 24 | 424 25 | 425 26 | 426 27 | 427 28 | 428 29 | 429 30 | 430 31 | 431 32 | 432 33 | 433 34 | 434 35 | 435 36 | 436 37 | 437 38 | 438 39 | 439 40 | 440 41 | 441 42 | 442 43 | 443 44 | 444 45 | 445 46 | 446 47 | 447 48 | 448 49 | 449 50 | 450 51 | -------------------------------------------------------------------------------- /spec/inputs/numbers_0_100.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | 10 12 | 11 13 | 12 14 | 13 15 | 14 16 | 15 17 | 16 18 | 17 19 | 18 20 | 19 21 | 20 22 | 21 23 | 22 24 | 23 25 | 24 26 | 25 27 | 26 28 | 27 29 | 28 30 | 29 31 | 30 32 | 31 33 | 32 34 | 33 35 | 34 36 | 35 37 | 36 38 | 37 39 | 38 40 | 39 41 | 40 42 | 41 43 | 42 44 | 43 45 | 44 46 | 45 47 | 46 48 | 47 49 | 48 50 | 49 51 | 50 52 | 51 53 | 52 54 | 53 55 | 54 56 | 55 57 | 56 58 | 57 59 | 58 60 | 59 61 | 60 62 | 61 63 | 62 64 | 63 65 | 64 66 | 65 67 | 66 68 | 67 69 | 68 70 | 69 71 | 70 72 | 71 73 | 72 74 | 73 75 | 74 76 | 75 77 | 76 78 | 77 79 | 78 80 | 79 81 | 80 82 | 81 83 | 82 84 | 83 85 | 84 86 | 85 87 | 86 88 | 87 89 | 88 90 | 89 91 | 90 92 | 91 93 | 92 94 | 93 95 | 94 96 | 95 97 | 96 98 | 97 99 | 98 100 | 99 101 | 100 
-------------------------------------------------------------------------------- /spec/inputs/numbers_1_100.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | 11 12 | 12 13 | 13 14 | 14 15 | 15 16 | 16 17 | 17 18 | 18 19 | 19 20 | 20 21 | 21 22 | 22 23 | 23 24 | 24 25 | 25 26 | 26 27 | 27 28 | 28 29 | 29 30 | 30 31 | 31 32 | 32 33 | 33 34 | 34 35 | 35 36 | 36 37 | 37 38 | 38 39 | 39 40 | 40 41 | 41 42 | 42 43 | 43 44 | 44 45 | 45 46 | 46 47 | 47 48 | 48 49 | 49 50 | 50 51 | 51 52 | 52 53 | 53 54 | 54 55 | 55 56 | 56 57 | 57 58 | 58 59 | 59 60 | 60 61 | 61 62 | 62 63 | 63 64 | 64 65 | 65 66 | 66 67 | 67 68 | 68 69 | 69 70 | 70 71 | 71 72 | 72 73 | 73 74 | 74 75 | 75 76 | 76 77 | 77 78 | 78 79 | 79 80 | 80 81 | 81 82 | 82 83 | 83 84 | 84 85 | 85 86 | 86 87 | 87 88 | 88 89 | 89 90 | 90 91 | 91 92 | 92 93 | 93 94 | 94 95 | 95 96 | 96 97 | 97 98 | 98 99 | 99 100 | 100 -------------------------------------------------------------------------------- /spec/lib/collect_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Spark::RDD do 4 | 5 | let(:mapping) { lambda{|x| [x, 1]} } 6 | let(:numbers) { Generator.numbers } 7 | 8 | it '.collect_as_hash' do 9 | rdd = $sc.parallelize(numbers) 10 | rdd = rdd.map(mapping) 11 | 12 | expect(rdd.collect_as_hash).to eql(Hash[numbers.map(&mapping)]) 13 | end 14 | 15 | context '.take' do 16 | let(:size) { 1000 } 17 | let(:numbers) { Generator.numbers(size) } 18 | let(:rdd) { $sc.parallelize(numbers) } 19 | 20 | it 'nothing' do 21 | expect(rdd.take(0)).to eql([]) 22 | end 23 | 24 | it 'first' do 25 | expect(rdd.first).to eql(numbers.first) 26 | end 27 | 28 | it 'less than limit' do 29 | _size = size / 2 30 | expect(rdd.take(_size)).to eql(numbers.take(_size)) 31 | end 32 | 33 | it 'all' do 34 | expect(rdd.take(size)).to eql(numbers) 35 | end 36 | 37 | it 'more than limit' do 38 | expect(rdd.take(size*2)).to eql(numbers) 39 | end 40 | end 41 | 42 | end 43 | -------------------------------------------------------------------------------- /spec/lib/command_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | def to_s_method(x) 4 | x.to_s 5 | end 6 | 7 | RSpec::describe Spark::CommandBuilder do 8 | let(:numbers) { Generator.numbers } 9 | let(:rdd) { $sc.parallelize(numbers, 1) } 10 | 11 | context '.serialize_function' do 12 | let(:result) { numbers.map(&:to_s) } 13 | 14 | it 'string' do 15 | expect(rdd.map('lambda{|x| x.to_s}').collect).to eql(result) 16 | end 17 | 18 | it 'symbol' do 19 | expect(rdd.map(:to_s).collect).to eql(result) 20 | end 21 | 22 | it 'lambda' do 23 | expect(rdd.map(lambda{|x| x.to_s}).collect).to eql(result) 24 | end 25 | 26 | it 'method' do 27 | expect(rdd.map(method(:to_s_method)).collect).to eql(result) 28 | end 29 | end 30 | 31 | context '.bind' do 32 | it 'number' do 33 | number = rand(0..10000000) 34 | rdd2 = rdd.map(lambda{|x| x * number}).bind(number: number) 35 | 36 | expect(rdd2.collect).to eq(numbers.map{|x| x * number}) 37 | end 38 | 39 | it 'open struct' do 40 | require 'ostruct' 41 | 42 | struct = OpenStruct.new 43 | struct.number = 3 44 | struct.string = '3' 45 | struct.array = [1, 2, 3] 46 | 47 | func = lambda{|item| 48 | item * struct.number + struct.string.to_i + struct.array[0] 49 | } 50 | 51 | rdd2 = rdd.add_library('ostruct') 52 | rdd2 = rdd2.map(func) 53 | rdd2 = 
rdd2.bind(struct: struct) 54 | 55 | expect(rdd2.collect).to eq(numbers.map(&func)) 56 | end 57 | 58 | it 'different naming' do 59 | array = [1, 2, 3] 60 | 61 | rdd2 = rdd.map(lambda{|_| my_array.size}) 62 | rdd2 = rdd2.bind(my_array: array) 63 | 64 | expect(rdd2.sum).to eq(numbers.size * array.size) 65 | end 66 | end 67 | 68 | end 69 | -------------------------------------------------------------------------------- /spec/lib/config_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Spark::Config do 4 | 5 | before(:context) do 6 | Spark.stop 7 | end 8 | 9 | after(:context) do 10 | spark_start 11 | end 12 | 13 | it 'should be stopped' do 14 | expect(Spark.started?).to be_falsy 15 | end 16 | 17 | context 'new config' do 18 | 19 | let(:configuration) do 20 | { 21 | 'test.test1' => 'test1', 22 | 'test.test2' => 'test2', 23 | 'test.test3' => 'test3' 24 | } 25 | end 26 | 27 | before(:each) do 28 | Spark.clear_config 29 | end 30 | 31 | it 'throught methods' do 32 | configuration.each do |key, value| 33 | Spark.config.set(key, value) 34 | end 35 | 36 | configuration.each do |key, value| 37 | expect(Spark.config.get(key)).to eql(value) 38 | end 39 | end 40 | 41 | it 'throught hash style' do 42 | configuration.each do |key, value| 43 | Spark.config[key] = value 44 | end 45 | 46 | configuration.each do |key, value| 47 | expect(Spark.config[key]).to eql(value) 48 | end 49 | end 50 | 51 | it 'throught dsl' do 52 | configuration.each do |key, value| 53 | Spark.config { 54 | set key, value 55 | } 56 | end 57 | 58 | configuration.each do |key, value| 59 | expect(Spark.config[key]).to eql(value) 60 | end 61 | end 62 | end 63 | 64 | end 65 | -------------------------------------------------------------------------------- /spec/lib/ext_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Array do 4 | 5 | it '.deep_copy' do 6 | data = ['a', 'b', 'c'] 7 | new_data = data.dup 8 | 9 | data[0] << 'a' 10 | 11 | expect(data).to eql(new_data) 12 | 13 | new_data = data.deep_copy 14 | 15 | data[1] << 'b' 16 | 17 | expect(data).to_not eql(new_data) 18 | end 19 | 20 | end 21 | 22 | RSpec.describe Hash do 23 | 24 | it '.stringify_keys!' do 25 | data = { 26 | a: 'a', 27 | b: 'b', 28 | c: 'c' 29 | } 30 | 31 | data.stringify_keys! 32 | 33 | expect(data).to eql({ 34 | 'a' => 'a', 35 | 'b' => 'b', 36 | 'c' => 'c' 37 | }) 38 | end 39 | 40 | end 41 | 42 | RSpec.describe String do 43 | 44 | it '.camelize' do 45 | data = 'aaa_bbb_ccc'.camelize 46 | expect(data).to eql('AaaBbbCcc') 47 | end 48 | 49 | end 50 | 51 | RSpec.describe IO do 52 | 53 | it 'serialize' do 54 | file = Tempfile.new('serialize') 55 | file.binmode 56 | 57 | file.write_int(1) 58 | file.write_long(2) 59 | file.write_string('3') 60 | file.write_data([4]) 61 | 62 | file.rewind 63 | 64 | expect(file.read_int).to eq(1) 65 | expect(file.read_long).to eq(2) 66 | expect(file.read_string).to eq('3') 67 | expect(file.read_data).to eq([4]) 68 | 69 | file.unlink 70 | end 71 | 72 | end 73 | -------------------------------------------------------------------------------- /spec/lib/external_apps_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Spark::RDD do 4 | 5 | context '.pipe' do 6 | let(:words) { Generator.words } 7 | let(:numbers) { Generator.numbers } 8 | 9 | it 'single program' do 10 | skip if windows? 
11 | 12 | rdd = $sc.parallelize(words, 1) 13 | rdd = rdd.pipe('tr a b') 14 | 15 | result = words.dup 16 | result.map! do |x| 17 | x.gsub('a', 'b') 18 | end 19 | 20 | expect(rdd.collect).to eql(result) 21 | end 22 | 23 | it 'multiple program' do 24 | skip if windows? 25 | 26 | rdd = $sc.parallelize(numbers, 1) 27 | rdd = rdd.pipe("tr 1 5", "awk '{print $1*10}'") 28 | rdd = rdd.map(lambda{|x| x.to_i * 100}) 29 | 30 | result = numbers.dup 31 | result.map! do |x| 32 | x.to_s.gsub('1', '5') 33 | end 34 | result.map! do |x| 35 | x.to_i * 10 36 | end 37 | result.map! do |x| 38 | x * 100 39 | end 40 | 41 | expect(rdd.collect).to eql(result) 42 | end 43 | end 44 | 45 | end 46 | -------------------------------------------------------------------------------- /spec/lib/filter_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | def func4(item) 4 | item.start_with?('a') && item.size > 3 && item[1].to_s.ord > 106 5 | end 6 | 7 | RSpec.shared_examples 'a filtering' do |workers| 8 | context "with #{workers || 'default'} worker" do 9 | it 'when numbers' do 10 | rdd2 = rdd_numbers(workers) 11 | rdd2 = rdd2.filter(func1) 12 | result = numbers.select(&func1) 13 | 14 | expect(rdd2.collect).to eql(result) 15 | 16 | rdd3 = rdd_numbers(workers) 17 | rdd3 = rdd3.filter(func1) 18 | rdd3 = rdd3.filter(func2) 19 | 20 | expect(rdd3.collect).to eql([]) 21 | end 22 | 23 | it 'when words' do 24 | rdd2 = rdd_words(workers) 25 | rdd2 = rdd2.filter(func3) 26 | result = words.select{|x| func3.call(x)} 27 | 28 | expect(rdd2.collect).to eql(result) 29 | 30 | rdd3 = rdd_words(workers) 31 | rdd3 = rdd3.filter(method(:func4)) 32 | result = words.select{|x| func4(x)} 33 | 34 | expect(rdd3.collect).to eql(result) 35 | end 36 | end 37 | end 38 | 39 | RSpec.describe 'Spark::RDD.filter' do 40 | let(:func1) { lambda{|x| x.to_i.even?} } 41 | let(:func2) { lambda{|x| x.to_i.odd?} } 42 | let(:func3) { lambda{|x| x.to_s.start_with?('b')} } 43 | 44 | context 'throught parallelize' do 45 | let(:numbers) { Generator.numbers_with_zero } 46 | let(:words) { Generator.words } 47 | 48 | def rdd_numbers(workers) 49 | $sc.parallelize(numbers, workers) 50 | end 51 | 52 | def rdd_words(workers) 53 | $sc.parallelize(words, workers) 54 | end 55 | 56 | it_behaves_like 'a filtering', 2 57 | # it_behaves_like 'a filtering', nil 58 | # it_behaves_like 'a filtering', rand(2..10) 59 | end 60 | 61 | context 'throught text_file' do 62 | let(:file_numbers) { File.join('spec', 'inputs', 'numbers_0_100.txt') } 63 | let(:file_words) { File.join('spec', 'inputs', 'lorem_300.txt') } 64 | 65 | let(:numbers) { File.readlines(file_numbers).map(&:strip) } 66 | let(:words) { File.readlines(file_words).map(&:strip) } 67 | 68 | def rdd_numbers(workers) 69 | $sc.text_file(file_numbers, workers) 70 | end 71 | 72 | def rdd_words(workers) 73 | $sc.text_file(file_words, workers) 74 | end 75 | 76 | it_behaves_like 'a filtering', 2 77 | # it_behaves_like 'a filtering', nil 78 | # it_behaves_like 'a filtering', rand(2..10) 79 | end 80 | end 81 | -------------------------------------------------------------------------------- /spec/lib/flat_map_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.shared_examples 'a flat mapping' do |workers| 4 | it "with #{workers || 'default'} worker" do 5 | rdd2 = rdd(workers).map(func1) 6 | result = numbers.flat_map(&func1) 7 | 8 | expect(rdd2.collect).to eql(result) 9 | 10 | rdd3 = rdd(workers) 11 | 
rdd3 = rdd3.flat_map(func1) 12 | rdd3 = rdd3.flat_map(func2) 13 | rdd3 = rdd3.flat_map(func3) 14 | result = numbers.flat_map(&func1).flat_map(&func2).flat_map(&func3) 15 | 16 | expect(rdd3.collect).to eql(result) 17 | 18 | rdd4 = rdd(workers) 19 | rdd4 = rdd4.flat_map(func1) 20 | rdd4 = rdd4.flat_map(func2) 21 | rdd4 = rdd4.flat_map(func3) 22 | 23 | expect(rdd4.collect).to eql(rdd3.collect) 24 | end 25 | end 26 | 27 | RSpec.shared_examples 'a flat mapping values' do |workers| 28 | it "with #{workers || 'default'} worker" do 29 | rdd2 = rdd(workers).flat_map_values(func1) 30 | result = [] 31 | hash_with_values.each do |(key, values)| 32 | values = func1.call(values).flatten 33 | values.each do |value| 34 | result << [key, value] 35 | end 36 | end 37 | 38 | expect(rdd2.collect).to eql(result) 39 | 40 | rdd2 = rdd(workers).flat_map_values(func2) 41 | result = [] 42 | hash_with_values.each do |(key, values)| 43 | values = func2.call(values).flatten 44 | values.each do |value| 45 | result << [key, value] 46 | end 47 | end 48 | 49 | expect(rdd2.collect).to eql(result) 50 | end 51 | end 52 | 53 | RSpec.describe 'Spark::RDD' do 54 | let(:func1) { lambda{|x| x*2} } 55 | let(:func2) { lambda{|x| [x*3, 1, 1]} } 56 | let(:func3) { lambda{|x| [x*4, 2, 2]} } 57 | 58 | context 'throught parallelize' do 59 | context '.flat_map' do 60 | let(:numbers) { Generator.numbers_with_zero } 61 | 62 | def rdd(workers) 63 | $sc.parallelize(numbers, workers) 64 | end 65 | 66 | it_behaves_like 'a flat mapping', 1 67 | it_behaves_like 'a flat mapping', 2 68 | # it_behaves_like 'a flat mapping', nil 69 | # it_behaves_like 'a flat mapping', rand(2..10) 70 | end 71 | 72 | context '.flat_map_values' do 73 | let(:func1) { lambda{|x| x*2} } 74 | let(:func2) { lambda{|x| [x.first]} } 75 | let(:hash_with_values) { Generator.hash_with_values } 76 | 77 | def rdd(workers) 78 | $sc.parallelize(hash_with_values, workers) 79 | end 80 | 81 | it_behaves_like 'a flat mapping values', 1 82 | it_behaves_like 'a flat mapping values', 2 83 | # it_behaves_like 'a flat mapping values', nil 84 | # it_behaves_like 'a flat mapping values', rand(2..10) 85 | end 86 | end 87 | 88 | context 'throught text_file' do 89 | context '.flat_map' do 90 | let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') } 91 | let(:numbers) { File.readlines(file).map(&:strip) } 92 | 93 | def rdd(workers) 94 | $sc.text_file(file, workers) 95 | end 96 | 97 | it_behaves_like 'a flat mapping', 1 98 | it_behaves_like 'a flat mapping', 2 99 | # it_behaves_like 'a flat mapping', nil 100 | # it_behaves_like 'a flat mapping', rand(2..10) 101 | end 102 | end 103 | end 104 | -------------------------------------------------------------------------------- /spec/lib/group_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.shared_examples 'a groupping by key' do |workers| 4 | it "with #{workers || 'default'} worker" do 5 | expect(rdd_result(workers)).to eql(result) 6 | end 7 | end 8 | 9 | RSpec.shared_examples 'a cogroupping by key' do |workers| 10 | context "with #{workers || 'default'} worker" do 11 | it '.group_with' do 12 | rdd = rdd_1(workers).group_with(rdd_2(workers)) 13 | expect(rdd.collect_as_hash).to eql(result_12) 14 | end 15 | 16 | it '.cogroup' do 17 | rdd = rdd_1(workers).cogroup(rdd_2(workers), rdd_3(workers)) 18 | expect(rdd.collect_as_hash).to eql(result_123) 19 | end 20 | end 21 | end 22 | 23 | RSpec.shared_examples 'a groupping by' do |workers| 24 | it "with #{workers || 
'default'} worker" do 25 | rdd = rdd_numbers(workers) 26 | rdd = rdd.group_by(key_function1) 27 | 28 | expect(rdd.collect_as_hash).to eql(numbers.group_by(&key_function1)) 29 | 30 | rdd = rdd_words(workers) 31 | rdd = rdd.group_by(key_function2) 32 | 33 | expect(rdd.collect_as_hash).to eql(words.group_by(&key_function2)) 34 | end 35 | end 36 | 37 | RSpec.describe 'Spark::RDD' do 38 | 39 | def make_result(*hashes) 40 | _result = {} 41 | hashes.each do |data| 42 | data.each do |key, value| 43 | _result[key] ||= [] 44 | _result[key] << value 45 | end 46 | end 47 | _result 48 | end 49 | 50 | context '.group_by_key' do 51 | let(:hash) { Generator.hash } 52 | let(:result) { make_result(hash) } 53 | 54 | def rdd_result(workers) 55 | rdd = $sc.parallelize(hash) 56 | rdd.group_by_key.collect_as_hash 57 | end 58 | 59 | it_behaves_like 'a groupping by key', 1 60 | it_behaves_like 'a groupping by key', 2 61 | # it_behaves_like 'a groupping by key', nil 62 | # it_behaves_like 'a groupping by key', rand(2..10) 63 | end 64 | 65 | context 'cogroup' do 66 | let(:hash1) { Generator.hash } 67 | let(:hash2) { Generator.hash } 68 | let(:hash3) { Generator.hash } 69 | 70 | let(:result_12) { make_result(hash1, hash2) } 71 | let(:result_123) { make_result(hash1, hash2, hash3) } 72 | 73 | def rdd_1(workers) 74 | $sc.parallelize(hash1) 75 | end 76 | 77 | def rdd_2(workers) 78 | $sc.parallelize(hash2) 79 | end 80 | 81 | def rdd_3(workers) 82 | $sc.parallelize(hash3) 83 | end 84 | 85 | it_behaves_like 'a cogroupping by key', 1 86 | it_behaves_like 'a cogroupping by key', 2 87 | # it_behaves_like 'a cogroupping by key', nil 88 | # it_behaves_like 'a cogroupping by key', rand(2..10) 89 | end 90 | 91 | context 'group_by' do 92 | let(:key_function1) { lambda{|x| x%2} } 93 | let(:key_function2) { lambda{|x| x.size} } 94 | 95 | let(:numbers) { Generator.numbers } 96 | let(:words) { Generator.words } 97 | 98 | def rdd_numbers(workers) 99 | $sc.parallelize(numbers) 100 | end 101 | 102 | def rdd_words(workers) 103 | $sc.parallelize(words) 104 | end 105 | 106 | it_behaves_like 'a groupping by', 1 107 | it_behaves_like 'a groupping by', 2 108 | # it_behaves_like 'a groupping by', nil 109 | # it_behaves_like 'a groupping by', rand(2..10) 110 | end 111 | 112 | end 113 | -------------------------------------------------------------------------------- /spec/lib/helper_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.configure do |c| 4 | c.include Spark::Helper::Parser 5 | c.include Spark::Helper::Statistic 6 | end 7 | 8 | RSpec.describe Spark::Helper do 9 | 10 | it 'memory size' do 11 | expect(to_memory_size('512mb')).to eql(524288.0) 12 | expect(to_memory_size('1586 mb')).to eql(1624064.0) 13 | expect(to_memory_size('3 MB')).to eql(3072.0) 14 | expect(to_memory_size('9gb')).to eql(9437184.0) 15 | expect(to_memory_size('9gb', 'mb')).to eql(9216.0) 16 | expect(to_memory_size('9mb', 'gb')).to eql(0.01) 17 | expect(to_memory_size('6652548796kb', 'mb')).to eql(6496629.68) 18 | end 19 | 20 | context 'statistic' do 21 | it 'compute_fraction' do 22 | expect(compute_fraction(1, 1000, true)).to be_within(0.001).of(0.013) 23 | expect(compute_fraction(2, 1000, true)).to be_within(0.001).of(0.018) 24 | expect(compute_fraction(3, 1000, true)).to be_within(0.001).of(0.023) 25 | expect(compute_fraction(4, 1000, true)).to be_within(0.001).of(0.028) 26 | expect(compute_fraction(5, 1000, true)).to be_within(0.001).of(0.031) 27 | 28 | expect(compute_fraction(1, 1000, 
false)).to be_within(0.001).of(0.0249) 29 | expect(compute_fraction(2, 1000, false)).to be_within(0.001).of(0.0268) 30 | expect(compute_fraction(3, 1000, false)).to be_within(0.001).of(0.0287) 31 | expect(compute_fraction(4, 1000, false)).to be_within(0.001).of(0.0305) 32 | expect(compute_fraction(5, 1000, false)).to be_within(0.001).of(0.0322) 33 | end 34 | 35 | it 'bisect_right' do 36 | data = [10, 20, 30, 40, 50, 60, 70, 80, 90] 37 | 38 | expect(bisect_right(data, 0)).to eq(0) 39 | expect(bisect_right(data, 1)).to eq(0) 40 | expect(bisect_right(data, 1, 2)).to eq(2) 41 | expect(bisect_right(data, 1, 3)).to eq(3) 42 | expect(bisect_right(data, 1, 4)).to eq(4) 43 | expect(bisect_right(data, 9)).to eq(0) 44 | expect(bisect_right(data, 10)).to eq(1) 45 | expect(bisect_right(data, 40)).to eq(4) 46 | expect(bisect_right(data, 42)).to eq(4) 47 | expect(bisect_right(data, 72)).to eq(7) 48 | expect(bisect_right(data, 80, 4)).to eq(8) 49 | expect(bisect_right(data, 80, 5)).to eq(8) 50 | expect(bisect_right(data, 80, 8)).to eq(8) 51 | expect(bisect_right(data, 80, 9)).to eq(9) 52 | expect(bisect_right(data, 200)).to eq(9) 53 | end 54 | 55 | it 'determine_bounds' do 56 | data = [10, 20, 30, 40, 50, 60, 70, 80, 90] 57 | 58 | expect(determine_bounds(data, 0)).to eq([]) 59 | expect(determine_bounds(data, 1)).to eq([]) 60 | expect(determine_bounds(data, 2)).to eq([50]) 61 | expect(determine_bounds(data, 3)).to eq([40, 70]) 62 | expect(determine_bounds(data, 4)).to eq([30, 50, 70]) 63 | expect(determine_bounds(data, 20)).to eq(data) 64 | end 65 | end 66 | 67 | end 68 | -------------------------------------------------------------------------------- /spec/lib/key_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.shared_examples 'a keying by' do |workers| 4 | it "with #{workers || 'default'} worker" do 5 | rdd = rdd_numbers(workers) 6 | rdd = rdd.key_by(key_function1) 7 | 8 | result = numbers.map{|item| [key_function1.call(item), item]} 9 | expect(rdd.collect).to eql(result) 10 | 11 | rdd = rdd_words(workers) 12 | rdd = rdd.key_by(key_function2) 13 | 14 | result = words.map{|item| [key_function2.call(item), item]} 15 | expect(rdd.collect).to eql(result) 16 | end 17 | end 18 | 19 | RSpec.describe 'Spark::RDD' do 20 | 21 | context 'key_by' do 22 | let(:key_function1) { lambda{|x| x.even?} } 23 | let(:key_function2) { lambda{|x| x.include?('a')} } 24 | 25 | let(:numbers) { Generator.numbers } 26 | let(:words) { Generator.words } 27 | 28 | def rdd_numbers(workers) 29 | $sc.parallelize(numbers) 30 | end 31 | 32 | def rdd_words(workers) 33 | $sc.parallelize(words) 34 | end 35 | 36 | it_behaves_like 'a keying by', 1 37 | it_behaves_like 'a keying by', 2 38 | # it_behaves_like 'a keying by', nil 39 | # it_behaves_like 'a keying by', rand(2..10) 40 | end 41 | 42 | it 'lookup' do 43 | numbers = Generator.numbers 44 | rdd_numbers = $sc.parallelize(numbers, 2) 45 | 46 | rdd = rdd_numbers.group_by(lambda {|x| x%3}) 47 | rdd.lookup(2) 48 | 49 | expect(rdd.lookup(2).first).to eq( 50 | numbers.group_by{|x| x%3}[2] 51 | ) 52 | 53 | rdd = rdd_numbers.key_by(lambda{|x| x.even?}) 54 | expect(rdd.lookup(true)).to eq( 55 | numbers.select(&:even?) 
56 | ) 57 | end 58 | 59 | end 60 | -------------------------------------------------------------------------------- /spec/lib/manipulation_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe 'Spark::RDD' do 4 | let(:numbers) { 1..100 } 5 | let(:rand_numbers) { Generator.numbers } 6 | 7 | it '.glom' do 8 | rdd = $sc.parallelize(numbers, 1).glom 9 | expect(rdd.collect).to eql([numbers.to_a]) 10 | 11 | ser = Spark::Serializer.build { __batched__(__marshal__, 1) } 12 | 13 | rdd = $sc.parallelize(numbers, 5, ser).glom 14 | expect(rdd.collect).to eql(numbers.each_slice(20).to_a) 15 | end 16 | 17 | it '.coalesce' do 18 | rdd = $sc.parallelize(numbers, 5) 19 | 20 | rdd2 = rdd.glom 21 | expect(rdd2.collect.size).to eql(5) 22 | 23 | rdd3 = rdd.coalesce(4).glom 24 | expect(rdd3.collect.size).to eql(4) 25 | end 26 | 27 | it '.distinct' do 28 | rdd = $sc.parallelize(rand_numbers, 5) 29 | rdd = rdd.distinct 30 | expect(rdd.collect.sort).to eql(rand_numbers.uniq.sort) 31 | 32 | rdd = $sc.parallelize(numbers, 5) 33 | rdd = rdd.map(lambda{|x| 1}) 34 | rdd = rdd.distinct 35 | expect(rdd.collect).to eql([1]) 36 | end 37 | 38 | context '.union' do 39 | it 'classic method' do 40 | rdd = $sc.parallelize(numbers, 5) 41 | rdd = rdd.union(rdd).collect 42 | 43 | expect(rdd.collect.sort).to eql((numbers.to_a+numbers.to_a).sort) 44 | end 45 | 46 | it 'with a different serializer' do 47 | rdd1 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__marshal__) }) 48 | rdd2 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__oj__) }) 49 | 50 | expect { rdd1.union(rdd2).collect }.to_not raise_error 51 | end 52 | 53 | it 'as operator' do 54 | rdd1 = $sc.parallelize(numbers) 55 | rdd2 = $sc.parallelize(rand_numbers) 56 | 57 | expect((rdd1+rdd2).sum).to eql((numbers.to_a+rand_numbers).reduce(:+)) 58 | end 59 | end 60 | 61 | it '.compact' do 62 | data = [nil, nil , 0, 0, 1, 2, nil, 6] 63 | result = data.compact 64 | ser = Spark::Serializer.build { __batched__(__marshal__, 1) } 65 | 66 | rdd = $sc.parallelize(data, 1).compact 67 | expect(rdd.collect).to eql(result) 68 | 69 | rdd = $sc.parallelize(data, 5, ser).compact 70 | expect(rdd.collect).to eql(result) 71 | 72 | rdd = $sc.parallelize(data, 1, ser).compact 73 | expect(rdd.collect).to eql(result) 74 | end 75 | 76 | it '.intersection' do 77 | data1 = [0,1,2,3,4,5,6,7,8,9,10] 78 | data2 = [5,6,7,8,9,10,11,12,13,14,15] 79 | 80 | rdd1 = $sc.parallelize(data1) 81 | rdd2 = $sc.parallelize(data2) 82 | 83 | expect(rdd1.intersection(rdd2).collect.sort).to eql(data1 & data2) 84 | end 85 | 86 | it '.shuffle' do 87 | data = Generator.numbers 88 | rdd = $sc.parallelize(data) 89 | 90 | expect(rdd.shuffle.collect).to_not eql(data) 91 | end 92 | 93 | context '.cartesian' do 94 | let(:data1) { Generator.numbers(100) } 95 | let(:data2) { Generator.numbers(100) } 96 | let(:result) { data1.product(data2).map(&:to_s).sort } 97 | 98 | it 'unbatched' do 99 | ser = Spark::Serializer.build { __batched__(__marshal__, 1) } 100 | 101 | rdd1 = $sc.parallelize(data1, 2, ser) 102 | rdd2 = $sc.parallelize(data2, 2, ser) 103 | 104 | rdd = rdd1.cartesian(rdd2).map(lambda{|x| x.to_s}) 105 | 106 | expect(rdd.collect.sort).to eql(result) 107 | end 108 | 109 | it 'batched' do 110 | ser1 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) } 111 | ser2 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) } 112 | 113 | rdd1 = $sc.parallelize(data1, 2, ser1) 114 | rdd2 = 
$sc.parallelize(data2, 2, ser2) 115 | 116 | rdd = rdd1.cartesian(rdd2).map(lambda{|x| x.to_s}) 117 | 118 | expect(rdd.collect.sort).to eql(result) 119 | end 120 | end 121 | 122 | end 123 | -------------------------------------------------------------------------------- /spec/lib/map_partitions_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | def func3(x) 4 | x.map(&:to_i).reduce(:+) 5 | end 6 | 7 | def func4_with_index(data, index) 8 | [{ 9 | index => data.map(&:to_i).reduce(:*) 10 | }] 11 | end 12 | 13 | RSpec.shared_examples 'a map partitions' do |workers| 14 | context "with #{workers || 'default'} worker" do 15 | it 'without index' do 16 | rdd2 = rdd(workers).map_partitions(func1) 17 | result = func1.call(numbers) 18 | 19 | expect(func1.call(rdd2.collect)).to eql(result) 20 | 21 | rdd3 = rdd(workers) 22 | rdd3 = rdd3.map_partitions(func1) 23 | rdd3 = rdd3.map_partitions(func2) 24 | rdd3 = rdd3.map_partitions(method(:func3)) 25 | result = func3(func2.call(func1.call(numbers))) 26 | 27 | # Not same number of workers 28 | expect(rdd3.collect.size).to be >= 1 29 | 30 | rdd4 = rdd(workers) 31 | rdd4 = rdd4.map_partitions(func1) 32 | rdd4 = rdd4.map_partitions(func2) 33 | rdd4 = rdd4.map_partitions(method(:func3)) 34 | 35 | expect(rdd4.collect).to eql(rdd3.collect) 36 | end 37 | 38 | it 'with index' do 39 | rdd2 = rdd(workers).map_partitions_with_index(method(:func4_with_index)) 40 | result = rdd2.collect 41 | 42 | expect(result).to be_a(Array) 43 | 44 | result.each do |x| 45 | expect(x).to be_a(Hash) 46 | end 47 | 48 | # Multiply by 0 49 | # Some values are 0 because of batched serialization 50 | expect(result.map(&:values).flatten.compact.uniq.first).to eql(0) 51 | end 52 | end 53 | end 54 | 55 | RSpec::describe 'Spark::RDD.map_partitions(_with_index)' do 56 | let(:func1) { lambda{|x| x.map(&:to_i)} } 57 | let(:func2) { 58 | lambda{|x| 59 | x.map{|y| y*2} 60 | } 61 | } 62 | 63 | context 'throught parallelize' do 64 | let(:numbers) { 0..1000 } 65 | 66 | def rdd(workers) 67 | $sc.parallelize(numbers, workers) 68 | end 69 | 70 | it_behaves_like 'a map partitions', 1 71 | it_behaves_like 'a map partitions', 2 72 | # it_behaves_like 'a map partitions', nil 73 | # it_behaves_like 'a map partitions', rand(2..10) 74 | end 75 | 76 | context 'throught text_file' do 77 | let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') } 78 | let(:numbers) { File.readlines(file).map(&:strip) } 79 | 80 | def rdd(workers) 81 | $sc.text_file(file, workers) 82 | end 83 | 84 | it_behaves_like 'a map partitions', 1 85 | it_behaves_like 'a map partitions', 2 86 | # it_behaves_like 'a map partitions', nil 87 | # it_behaves_like 'a map partitions', rand(2..10) 88 | end 89 | end 90 | -------------------------------------------------------------------------------- /spec/lib/map_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.shared_examples 'a mapping' do |workers| 4 | it "with #{workers || 'default'} worker" do 5 | rdd2 = rdd(workers).map(func1) 6 | result = numbers.map(&func1) 7 | 8 | expect(rdd2.collect).to eql(result) 9 | 10 | rdd3 = rdd(workers) 11 | rdd3 = rdd3.map(func1) 12 | rdd3 = rdd3.map(func2) 13 | rdd3 = rdd3.map(func3) 14 | result = numbers.map(&func1).map(&func2).map(&func3) 15 | 16 | expect(rdd3.collect).to eql(result) 17 | 18 | rdd4 = rdd(workers) 19 | rdd4 = rdd4.map(func3) 20 | rdd4 = rdd4.map(func2) 21 | rdd4 = rdd4.map(func1) 22 | 23 | 
expect(rdd4.collect).to eql(rdd3.collect) 24 | end 25 | end 26 | 27 | RSpec.shared_examples 'a mapping values' do |workers| 28 | it "with #{workers || 'default'} worker" do 29 | rdd2 = rdd(workers).map_values(func1) 30 | result = hash.map{|key, value| [key, func1.call(value)]} 31 | 32 | expect(rdd2.collect).to eql(result) 33 | 34 | rdd3 = rdd(workers) 35 | rdd3 = rdd3.map_values(func1) 36 | rdd3 = rdd3.map_values(func2) 37 | rdd3 = rdd3.map_values(func3) 38 | result = hash.map{|key, value| [key, func1.call(value)]} 39 | .map{|key, value| [key, func2.call(value)]} 40 | .map{|key, value| [key, func3.call(value)]} 41 | 42 | expect(rdd3.collect).to eql(result) 43 | end 44 | end 45 | 46 | RSpec.describe 'Spark::RDD' do 47 | let(:func1) { lambda{|x| x*2} } 48 | let(:func2) { lambda{|x| x*3} } 49 | let(:func3) { lambda{|x| x*4} } 50 | 51 | context 'throught parallelize' do 52 | context '.map' do 53 | let(:numbers) { Generator.numbers } 54 | 55 | def rdd(workers) 56 | $sc.parallelize(numbers, workers) 57 | end 58 | 59 | it_behaves_like 'a mapping', 1 60 | it_behaves_like 'a mapping', 2 61 | # it_behaves_like 'a mapping', nil 62 | # it_behaves_like 'a mapping', rand(2..10) 63 | end 64 | 65 | context '.map_values' do 66 | let!(:hash) { Generator.hash } 67 | 68 | def rdd(workers) 69 | $sc.parallelize(hash, workers) 70 | end 71 | 72 | it_behaves_like 'a mapping values', 1 73 | it_behaves_like 'a mapping values', 2 74 | # it_behaves_like 'a mapping values', nil 75 | # it_behaves_like 'a mapping values', rand(2..10) 76 | end 77 | end 78 | 79 | context 'throught text_file' do 80 | context '.map' do 81 | let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') } 82 | let(:numbers) { File.readlines(file).map(&:strip) } 83 | 84 | def rdd(workers) 85 | $sc.text_file(file, workers) 86 | end 87 | 88 | it_behaves_like 'a mapping', 1 89 | it_behaves_like 'a mapping', 2 90 | # it_behaves_like 'a mapping', nil 91 | # it_behaves_like 'a mapping', rand(2..10) 92 | end 93 | end 94 | end 95 | -------------------------------------------------------------------------------- /spec/lib/mllib/classification_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe 'Spark::Mllib classification' do 4 | 5 | let(:data1) do 6 | [ 7 | LabeledPoint.new(0.0, [1, 0, 0]), 8 | LabeledPoint.new(1.0, [0, 1, 1]), 9 | LabeledPoint.new(0.0, [2, 0, 0]), 10 | LabeledPoint.new(1.0, [0, 2, 1]) 11 | ] 12 | end 13 | 14 | let(:values1) do 15 | data1.map do |lp| 16 | lp.features.values 17 | end 18 | end 19 | 20 | let(:rdd1) { $sc.parallelize(data1) } 21 | 22 | context 'logistic regression' do 23 | it 'test' do 24 | lrm = LogisticRegressionWithSGD.train(rdd1) 25 | 26 | expect(lrm.predict(values1[0])).to be <= 0 27 | expect(lrm.predict(values1[1])).to be > 0 28 | expect(lrm.predict(values1[2])).to be <= 0 29 | expect(lrm.predict(values1[3])).to be > 0 30 | end 31 | end 32 | 33 | context 'svm' do 34 | it 'test' do 35 | lrm = SVMWithSGD.train(rdd1) 36 | 37 | expect(lrm.predict(values1[0])).to be <= 0 38 | expect(lrm.predict(values1[1])).to be > 0 39 | expect(lrm.predict(values1[2])).to be <= 0 40 | expect(lrm.predict(values1[3])).to be > 0 41 | end 42 | end 43 | 44 | context 'naive bayes' do 45 | it 'test' do 46 | lrm = NaiveBayes.train(rdd1) 47 | 48 | expect(lrm.predict(values1[0])).to be <= 0 49 | expect(lrm.predict(values1[1])).to be > 0 50 | expect(lrm.predict(values1[2])).to be <= 0 51 | expect(lrm.predict(values1[3])).to be > 0 52 | end 53 | end 54 | end 55 | 
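# The three classification specs above exercise one shared train/predict cycle.
# A minimal sketch of that cycle outside the RSpec harness, assuming $sc has been
# created as in spec/spec_helper.rb and using only calls that already appear in
# these specs:

Spark::Mllib.import

training = [
  LabeledPoint.new(0.0, [1, 0, 0]),
  LabeledPoint.new(1.0, [0, 1, 1])
]

# Train on an RDD of LabeledPoint and predict on a raw feature array,
# exactly as the specs do with rdd1 and values1.
model = LogisticRegressionWithSGD.train($sc.parallelize(training))
model.predict([0, 2, 1]) # expected > 0, matching the spec's expectation for the 1.0-labelled points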
-------------------------------------------------------------------------------- /spec/lib/mllib/clustering_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe 'Spark::Mllib clustering' do 4 | context 'kmeans' do 5 | it 'test' do 6 | data = [ 7 | DenseVector.new([0, 1.1]), 8 | DenseVector.new([0, 1.2]), 9 | DenseVector.new([1.1, 0]), 10 | DenseVector.new([1.2, 0]) 11 | ] 12 | model = KMeans.train($sc.parallelize(data), 2, initialization_mode: 'k-means||') 13 | 14 | expect(model.predict(data[0])).to eq(model.predict(data[1])) 15 | expect(model.predict(data[2])).to eq(model.predict(data[3])) 16 | end 17 | 18 | it 'deterministic' do 19 | data = Array.new(10) do |i| 20 | i *= 10 21 | DenseVector.new([i, i]) 22 | end 23 | 24 | clusters1 = KMeans.train($sc.parallelize(data), 3, initialization_mode: 'k-means||', seed: 42) 25 | clusters2 = KMeans.train($sc.parallelize(data), 3, initialization_mode: 'k-means||', seed: 42) 26 | 27 | centers1 = clusters1.centers.to_a 28 | centers2 = clusters2.centers.to_a 29 | 30 | centers1.zip(centers2).each do |c1, c2| 31 | expect(c1).to eq(c2) 32 | end 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /spec/lib/mllib/matrix_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe 'Spark::Mllib::Matrix' do 4 | context 'dense' do 5 | it 'construct' do 6 | values = [[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]] 7 | matrix = DenseMatrix.new(3, 3, [[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]]) 8 | 9 | expect(matrix.shape).to eq([3, 3]) 10 | expect(matrix.values).to eq([[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]]) 11 | end 12 | end 13 | 14 | context 'sparse' do 15 | it 'construct' do 16 | values = [1.0, 2.0, 4.0, 5.0] 17 | column_pointers = [0, 2, 2, 4, 4] 18 | row_indices = [1, 2, 1, 2] 19 | 20 | matrix = SparseMatrix.new(3, 4, column_pointers, row_indices, values) 21 | 22 | expect(matrix.shape).to eq([3, 4]) 23 | expect(matrix.to_a).to eq( 24 | [ 25 | [0.0, 0.0, 0.0, 0.0], 26 | [1.0, 0.0, 4.0, 0.0], 27 | [2.0, 0.0, 5.0, 0.0] 28 | ] 29 | ) 30 | end 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /spec/lib/mllib/regression_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | # Mllib functions are tested on Spark 4 | # This just test if ruby call proper methods 5 | 6 | RSpec.describe 'Spark::Mllib regression' do 7 | 8 | let(:data1) do 9 | [ 10 | LabeledPoint.new(-1.0, [0, -1]), 11 | LabeledPoint.new(1.0, [0, 1]), 12 | LabeledPoint.new(-1.0, [0, -2]), 13 | LabeledPoint.new(1.0, [0, 2]) 14 | ] 15 | end 16 | 17 | let(:values1) do 18 | data1.map do |lp| 19 | lp.features.values 20 | end 21 | end 22 | 23 | let(:rdd1) { $sc.parallelize(data1) } 24 | 25 | context 'labeled point' do 26 | let(:lp) { LabeledPoint.new(1, [1,2,3]) } 27 | 28 | it 'from array' do 29 | expect(lp.label).to eql(1.0) 30 | expect(lp.features).to be_a(DenseVector) 31 | end 32 | 33 | it 'serialize' do 34 | lp2 = Marshal.load(Marshal.dump(lp)) 35 | 36 | expect(lp2.label).to eql(lp.label) 37 | expect(lp2.features.values).to eql(lp.features.values) 38 | end 39 | end 40 | 41 | context 'linear regression' do 42 | context 'test' do 43 | let(:lrm) { LinearRegressionWithSGD.train(rdd1) } 44 | 45 | it 'test' do 46 | 
expect(lrm.predict(values1[0])).to be <= 0 47 | expect(lrm.predict(values1[1])).to be > 0 48 | expect(lrm.predict(values1[2])).to be <= 0 49 | expect(lrm.predict(values1[3])).to be > 0 50 | end 51 | 52 | it 'test via rdd' do 53 | rdd = $sc.parallelize(values1, 1) 54 | rdd = rdd.map(lambda{|value| model.predict(value)}) 55 | rdd = rdd.bind(model: lrm) 56 | 57 | result = rdd.collect 58 | 59 | expect(result[0]).to be <= 0 60 | expect(result[1]).to be > 0 61 | expect(result[2]).to be <= 0 62 | expect(result[3]).to be > 0 63 | end 64 | end 65 | 66 | # Y = 3 + 10*X1 + 10*X2 67 | it 'linear regression' do 68 | data = Spark.jb.call(RubyMLLibUtilAPI, 'generateLinearInput', 3.0, ['10.0', '10.0'], 100, 42, 0.1) 69 | rdd = $sc.parallelize(data) 70 | 71 | lrm = LinearRegressionWithSGD.train(rdd, iterations: 1000, intercept: true, step: 1.0) 72 | 73 | expect(lrm.intercept).to be_between(2.5, 3.5) 74 | expect(lrm.weights.size).to eq(2) 75 | expect(lrm.weights[0]).to be_between(9.0, 11.0) 76 | expect(lrm.weights[1]).to be_between(9.0, 11.0) 77 | end 78 | end 79 | 80 | context 'lasso' do 81 | it 'test' do 82 | lrm = LassoWithSGD.train(rdd1) 83 | 84 | expect(lrm.predict(values1[0])).to be <= 0 85 | expect(lrm.predict(values1[1])).to be > 0 86 | expect(lrm.predict(values1[2])).to be <= 0 87 | expect(lrm.predict(values1[3])).to be > 0 88 | end 89 | 90 | it 'local random SGD with initial weights' do 91 | data = Spark.jb.call(RubyMLLibUtilAPI, 'generateLinearInput', 2.0, ['-1.5', '0.01'], 1000, 42, 0.1) 92 | data.map! do |lp| 93 | LabeledPoint.new(lp.label, [1.0] + lp.features.values) 94 | end 95 | 96 | rdd = $sc.parallelize(data); 97 | 98 | lrm = LassoWithSGD.train(rdd, step: 1.0, reg_param: 0.01, iterations: 40, initial_weights: [-1.0, -1.0, -1.0]) 99 | 100 | expect(lrm.weights[0]).to be_between(1.9, 2.1) 101 | expect(lrm.weights[1]).to be_between(-1.60, -1.40) 102 | expect(lrm.weights[2]).to be_between(-1.0e-2, 1.0e-2) 103 | end 104 | end 105 | 106 | context 'ridge' do 107 | it 'test' do 108 | lrm = RidgeRegressionWithSGD.train(rdd1) 109 | 110 | expect(lrm.predict(values1[0])).to be <= 0 111 | expect(lrm.predict(values1[1])).to be > 0 112 | expect(lrm.predict(values1[2])).to be <= 0 113 | expect(lrm.predict(values1[3])).to be > 0 114 | end 115 | end 116 | end 117 | -------------------------------------------------------------------------------- /spec/lib/mllib/vector_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe 'Spark::Mllib::Vector' do 4 | 5 | context 'parsing' do 6 | it 'dense vector' do 7 | dv = DenseVector.new([1.0, 2.0, 3.0, 4.0, 5.0]) 8 | dv2 = DenseVector.parse(dv.to_s) 9 | dv3 = Vectors.parse(dv.to_s) 10 | 11 | expect(dv.to_s).to eq("[1.0,2.0,3.0,4.0,5.0]") 12 | expect(dv2.values).to eq(dv.values) 13 | expect(dv3.values).to eq(dv.values) 14 | end 15 | 16 | it 'sparse vector' do 17 | sv = SparseVector.new(5, {1 => 3, 4 => 5}) 18 | sv2 = SparseVector.parse(sv.to_s) 19 | sv3 = Vectors.parse(sv.to_s) 20 | 21 | expect(sv.to_s).to eq("(5,[1,4],[3,5])") 22 | expect(sv2.size).to eq(sv.size) 23 | expect(sv2.indices).to eq(sv.indices) 24 | expect(sv2.values).to eq(sv.values) 25 | expect(sv3.size).to eq(sv.size) 26 | expect(sv3.indices).to eq(sv.indices) 27 | expect(sv3.values).to eq(sv.values) 28 | end 29 | end 30 | 31 | it 'dot' do 32 | sv = SparseVector.new(4, {1 => 1, 3 => 2}) 33 | dv = DenseVector.new([1.0, 2.0, 3.0, 4.0]) 34 | lst = DenseVector.new([1, 2, 3, 4]) 35 | 36 | expect(sv.dot(dv)).to eq(10.0) 37 | 
expect(dv.dot(dv)).to eq(30.0) 38 | expect(lst.dot(dv)).to eq(30.0) 39 | end 40 | 41 | it 'squared distance' do 42 | sv = SparseVector.new(4, {1 => 1, 3 => 2}) 43 | dv = DenseVector.new([1.0, 2.0, 3.0, 4.0]) 44 | lst = DenseVector.new([4, 3, 2, 1]) 45 | 46 | expect(sv.squared_distance(dv)).to eq(15) 47 | expect(sv.squared_distance(lst)).to eq(25) 48 | expect(dv.squared_distance(lst)).to eq(20) 49 | expect(dv.squared_distance(sv)).to eq(15) 50 | expect(lst.squared_distance(sv)).to eq(25) 51 | expect(lst.squared_distance(dv)).to eq(20) 52 | expect(sv.squared_distance(sv)).to eq(0) 53 | expect(dv.squared_distance(dv)).to eq(0) 54 | expect(lst.squared_distance(lst)).to eq(0) 55 | end 56 | 57 | it 'sparse vector indexing' do 58 | sv1 = SparseVector.new(4, {1 => 1, 3 => 2}) 59 | sv2 = SparseVector.new(4, [1, 3], [1, 2]) 60 | 61 | expect(sv1[0]).to eq(0) 62 | expect(sv1[3]).to eq(2) 63 | expect(sv1[1]).to eq(1) 64 | expect(sv1[2]).to eq(0) 65 | expect(sv1[-1]).to eq(2) 66 | expect(sv1[-2]).to eq(0) 67 | expect(sv1[-4]).to eq(0) 68 | 69 | expect(sv2[0]).to eq(0) 70 | expect(sv2[3]).to eq(2) 71 | expect(sv2[1]).to eq(1) 72 | expect(sv2[2]).to eq(0) 73 | expect(sv2[-1]).to eq(2) 74 | expect(sv2[-2]).to eq(0) 75 | expect(sv2[-4]).to eq(0) 76 | end 77 | end 78 | -------------------------------------------------------------------------------- /spec/lib/reduce_by_key_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | def flat_map(line) 4 | line.split 5 | end 6 | 7 | def map(item) 8 | [item, 1] 9 | end 10 | 11 | def reduce(x,y) 12 | x+y 13 | end 14 | 15 | RSpec.shared_examples 'a words counting' do |workers| 16 | context "with #{workers || 'default'} worker" do 17 | let(:result) do 18 | keyyed = lines.flat_map{|x| x.split}.map{|x| [x,1]} 19 | result = keyyed.reduce({}){|memo, item| 20 | key = item[0] 21 | value = item[1] 22 | 23 | memo[key] ||= 0 24 | memo[key] += value 25 | memo 26 | } 27 | result 28 | end 29 | 30 | it 'when lambda' do 31 | rdd2 = rdd(workers) 32 | rdd2 = rdd2.flat_map(lambda{|line| line.split}) 33 | rdd2 = rdd2.map(lambda{|word| [word, 1]}) 34 | rdd2 = rdd2.reduce_by_key(lambda{|x,y| x+y}) 35 | 36 | expect(rdd2.collect_as_hash).to eql(result) 37 | end 38 | 39 | it 'when method' do 40 | rdd2 = rdd(workers) 41 | rdd2 = rdd2.flat_map(method(:flat_map)) 42 | rdd2 = rdd2.map(method(:map)) 43 | rdd2 = rdd2.reduce_by_key(method(:reduce)) 44 | 45 | expect(rdd2.collect_as_hash).to eql(result) 46 | end 47 | 48 | it 'keys, values' do 49 | rdd2 = rdd(workers) 50 | rdd2 = rdd2.flat_map(method(:flat_map)) 51 | rdd2 = rdd2.map(method(:map)) 52 | rdd2 = rdd2.reduce_by_key(method(:reduce)) 53 | 54 | expect(rdd2.keys.collect.sort).to eql(result.keys.sort) 55 | expect { rdd2.values.collect.reduce(:+) }.to_not raise_error 56 | end 57 | end 58 | end 59 | 60 | RSpec.describe 'Spark::RDD' do 61 | context '.reduce_by_key' do 62 | context 'throught parallelize' do 63 | let(:lines) { Generator.lines } 64 | 65 | def rdd(workers) 66 | $sc.parallelize(lines, workers) 67 | end 68 | 69 | it_behaves_like 'a words counting', 2 70 | # it_behaves_like 'a words counting', nil 71 | # it_behaves_like 'a words counting', rand(2..10) 72 | end 73 | 74 | context 'throught text_file' do 75 | let(:file) { File.join('spec', 'inputs', 'lorem_300.txt') } 76 | let(:lines) { File.readlines(file).map(&:strip) } 77 | 78 | def rdd(workers) 79 | $sc.text_file(file, workers) 80 | end 81 | 82 | it_behaves_like 'a words counting', 2 83 | # it_behaves_like 'a words 
counting', nil 84 | # it_behaves_like 'a words counting', rand(2..10) 85 | end 86 | end 87 | 88 | context '.fold_by_key' do 89 | let(:numbers) { Generator.numbers } 90 | let(:zero_value) { 0 } 91 | let(:rdd) { $sc.parallelize(numbers) } 92 | let(:map) { lambda{|x| [x, 1]} } 93 | let(:add) { lambda{|x,y| x+y} } 94 | 95 | let(:result) do 96 | _result = {} 97 | numbers.map(&map).each do |key, value| 98 | _result[key] ||= zero_value 99 | _result[key] = add.call(_result[key], value) 100 | end 101 | _result 102 | end 103 | 104 | def fold_by_key(num_partitions=nil) 105 | rdd.map(map).fold_by_key(zero_value, add, num_partitions).collect_as_hash 106 | end 107 | 108 | it 'default num_partitions' do 109 | expect(fold_by_key).to eq(result) 110 | end 111 | 112 | it 'default num_partitions' do 113 | expect( 114 | fold_by_key rand(1..10) 115 | ).to eq(result) 116 | end 117 | end 118 | end 119 | -------------------------------------------------------------------------------- /spec/lib/sample_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | # Sample method can not be tested because of random generator 4 | # Just test it for raising error 5 | 6 | RSpec.shared_examples 'a sampler' do |workers| 7 | context "with #{workers || 'default'} worker" do 8 | 9 | context '.sample' do 10 | it 'with replacement' do 11 | rdd2 = rdd(workers).sample(true, rand) 12 | expect { rdd2.collect }.to_not raise_error 13 | end 14 | 15 | it 'without replacement' do 16 | rdd2 = rdd(workers).sample(false, rand) 17 | expect { rdd2.collect }.to_not raise_error 18 | end 19 | end 20 | 21 | context '.take_sample' do 22 | it 'with replacement' do 23 | size = rand(10..999) 24 | expect(rdd(workers).take_sample(true, size).size).to eql(size) 25 | end 26 | 27 | it 'without replacement' do 28 | size = rand(10..999) 29 | expect(rdd(workers).take_sample(false, size).size).to eql(size) 30 | end 31 | end 32 | 33 | end 34 | end 35 | 36 | RSpec.describe 'Spark::RDD' do 37 | let(:numbers) { Generator.numbers(1000) } 38 | 39 | def rdd(workers) 40 | $sc.parallelize(numbers, workers) 41 | end 42 | 43 | it_behaves_like 'a sampler', 1 44 | it_behaves_like 'a sampler', 2 45 | # it_behaves_like 'a sampler', nil 46 | # it_behaves_like 'a sampler', rand(2..10) 47 | end 48 | -------------------------------------------------------------------------------- /spec/lib/sort_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.shared_examples 'a sorting' do |workers| 4 | it "with #{workers || 'default'} worker" do 5 | rdd2 = rdd(workers) 6 | 7 | rdd2 = rdd2.flat_map(split) 8 | result = lines.flat_map(&split) 9 | 10 | # Sort by self 11 | rdd3 = rdd2.map(map).sort_by_key 12 | result2 = result.map(&map).sort_by{|(key, _)| key} 13 | 14 | expect(rdd3.collect).to eql(result2) 15 | 16 | # Sort by len 17 | rdd3 = rdd2.map(len_map).sort_by_key 18 | result2 = result.map(&len_map).sort_by{|(key, _)| key} 19 | 20 | expect(rdd3.collect).to eql(result2) 21 | end 22 | end 23 | 24 | 25 | RSpec.describe 'Spark::RDD' do 26 | let(:split) { lambda{|x| x.split} } 27 | let(:map) { lambda{|x| [x.to_s, 1]} } 28 | let(:len_map) { lambda{|x| [x.size, x]} } 29 | 30 | context 'throught parallelize' do 31 | context '.map' do 32 | let(:lines) { Generator.lines } 33 | 34 | def rdd(workers) 35 | $sc.parallelize(lines, workers) 36 | end 37 | 38 | it_behaves_like 'a sorting', 1 39 | it_behaves_like 'a sorting', 2 40 | # it_behaves_like 'a sorting', nil 
41 | # it_behaves_like 'a sorting', rand(2..10) 42 | end 43 | end 44 | 45 | context 'throught text_file' do 46 | context '.map' do 47 | let(:file) { File.join('spec', 'inputs', 'lorem_300.txt') } 48 | let(:lines) { File.readlines(file).map(&:strip) } 49 | 50 | def rdd(workers) 51 | $sc.text_file(file, workers) 52 | end 53 | 54 | it_behaves_like 'a sorting', 1 55 | it_behaves_like 'a sorting', 2 56 | # it_behaves_like 'a sorting', nil 57 | # it_behaves_like 'a sorting', rand(2..10) 58 | end 59 | end 60 | end 61 | -------------------------------------------------------------------------------- /spec/lib/sql/data_frame_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Spark::SQL::DataFrame do 4 | 5 | let(:file) { File.join('spec', 'inputs', 'people.json') } 6 | let(:df) { $sql.read.json(file) } 7 | 8 | context '[]' do 9 | 10 | it 'String' do 11 | value = df['age'] 12 | expect(value).to be_a(Spark::SQL::Column) 13 | expect(value.to_s).to eq('Column("age")') 14 | end 15 | 16 | it 'Array' do 17 | value = df[ ['name', 'age'] ] 18 | expect(value).to be_a(Spark::SQL::DataFrame) 19 | expect(value.columns).to eq(['name', 'age']) 20 | end 21 | 22 | it 'Numeric' do 23 | value = df[0] 24 | expect(value).to be_a(Spark::SQL::Column) 25 | expect(value.to_s).to eq('Column("active")') 26 | end 27 | 28 | it 'Column' do 29 | value = df[ df[0] == true ] 30 | expect(value).to be_a(Spark::SQL::DataFrame) 31 | end 32 | 33 | end 34 | 35 | it 'columns' do 36 | expect(df.columns).to eq(['active', 'address', 'age', 'email', 'id', 'ip_address', 'name']) 37 | end 38 | 39 | it 'schema' do 40 | schema = df.schema 41 | expect(schema).to be_a(Spark::SQL::StructType) 42 | expect(schema.type_name).to eq('struct') 43 | end 44 | 45 | it 'show_string' do 46 | expect(df.show_string).to start_with('+--') 47 | end 48 | 49 | it 'dtypes' do 50 | expect(df.dtypes).to eq([['active', 'boolean'], ['address', 'string'], ['age', 'long'], ['email', 'string'], ['id', 'long'], ['ip_address', 'string'], ['name', 'string']]) 51 | end 52 | 53 | it 'take' do 54 | expect(df.take(10).size).to eq(10) 55 | end 56 | 57 | it 'count' do 58 | expect(df.count).to eq(100) 59 | end 60 | 61 | context 'select' do 62 | 63 | it '*' do 64 | row = df.select('*').first 65 | expect(row.data.keys).to eq(['active', 'address', 'age', 'email', 'id', 'ip_address', 'name']) 66 | end 67 | 68 | it 'with string' do 69 | row = df.select('name', 'age').first 70 | expect(row.data.keys).to eq(['name', 'age']) 71 | end 72 | 73 | it 'with column' do 74 | row = df.select(df.name, df.age).first 75 | expect(row.data.keys).to eq(['name', 'age']) 76 | end 77 | 78 | end 79 | 80 | context 'where' do 81 | 82 | it 'with string' do 83 | eq_20 = df.filter('age = 20').collect 84 | expect(eq_20.map{|c| c['age']}).to all(be == 20) 85 | end 86 | 87 | it 'with column' do 88 | nil_values = df.where(df.age.is_null).collect 89 | greater_or_eq_20 = df.where(df.age >= 20).collect 90 | lesser_than_20 = df.where(df.age < 20).collect 91 | 92 | expect(nil_values.size + greater_or_eq_20.size + lesser_than_20.size).to eq(df.count) 93 | 94 | expect(nil_values.map{|c| c['age']}).to all(be_nil) 95 | expect(greater_or_eq_20.map{|c| c['age']}).to all(be >= 20) 96 | expect(lesser_than_20.map{|c| c['age']}).to all(be < 20) 97 | end 98 | 99 | end 100 | 101 | end 102 | -------------------------------------------------------------------------------- /spec/lib/whole_text_files_spec.rb: 
-------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.shared_examples 'a whole_text_files' do |workers| 4 | it "with #{workers || 'default'} worker" do 5 | rdd2 = rdd(workers).map(get_numbers) 6 | result = files.size 7 | 8 | expect(rdd2.collect.size).to eql(result) 9 | 10 | rdd3 = rdd(workers) 11 | rdd3 = rdd3.flat_map(get_numbers) 12 | 13 | result = 0 14 | files.each{|f| result += File.read(f).split.map(&:to_i).reduce(:+)} 15 | 16 | expect(rdd3.sum).to eql(result) 17 | end 18 | end 19 | 20 | RSpec.describe 'Spark::Context' do 21 | let(:get_numbers) { lambda{|file, content| content.split.map(&:to_i)} } 22 | 23 | let(:dir) { File.join('spec', 'inputs', 'numbers') } 24 | let(:files) { Dir.glob(File.join(dir, '*')) } 25 | 26 | def rdd(workers) 27 | $sc.whole_text_files(dir, workers) 28 | end 29 | 30 | it_behaves_like 'a whole_text_files', 1 31 | it_behaves_like 'a whole_text_files', 2 32 | # it_behaves_like 'a whole_text_files', nil 33 | # it_behaves_like 'a whole_text_files', rand(2..10) 34 | end 35 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require 'simplecov' 2 | SimpleCov.start 3 | 4 | $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib' 5 | require 'ruby-spark' 6 | require 'generator' 7 | 8 | # Loading 9 | Spark.load_lib 10 | Spark.jb.import_all_test 11 | Spark::Mllib.import 12 | 13 | # Keep this in a method because it's called from the config test 14 | def spark_start 15 | Spark.logger.disable 16 | Spark.config do 17 | set 'spark.ruby.serializer.batch_size', 100 18 | end 19 | $sc = Spark.start 20 | $sql = Spark.start_sql 21 | end 22 | 23 | def windows? 24 | RbConfig::CONFIG['host_os'] =~ /mswin|mingw/ 25 | end 26 | 27 | RSpec.configure do |config| 28 | config.default_formatter = 'doc' 29 | config.color = true 30 | config.tty = true 31 | 32 | config.before(:suite) do 33 | spark_start 34 | end 35 | config.after(:suite) do 36 | Spark.stop 37 | end 38 | end 39 | --------------------------------------------------------------------------------
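# A minimal standalone session mirroring the boot sequence from spec/spec_helper.rb
# (minus SimpleCov and the test-only Java imports) and the word-count flow from
# spec/lib/reduce_by_key_spec.rb. Treat it as a sketch assembled from calls that
# already appear in this suite, not as a canonical setup:

require 'ruby-spark'

Spark.load_lib
Spark.config do
  set 'spark.ruby.serializer.batch_size', 100
end
sc = Spark.start

# Word count over one of the fixture files used by the specs.
rdd = sc.text_file(File.join('spec', 'inputs', 'lorem_300.txt'), 2)
counts = rdd.flat_map(lambda{|line| line.split})
            .map(lambda{|word| [word, 1]})
            .reduce_by_key(lambda{|x, y| x + y})
counts.collect_as_hash # => Hash mapping each word to its count

Spark.stop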