├── VERSION ├── .document ├── Gemfile ├── .gitignore ├── lib ├── mapredus │ ├── outputter.rb │ ├── mapper.rb │ ├── finalizer.rb │ ├── inputter.rb │ ├── reducer.rb │ ├── support.rb │ ├── filesystem.rb │ ├── keys.rb │ ├── default_classes.rb │ ├── master.rb │ └── process.rb └── mapredus.rb ├── LICENSE ├── spec ├── helper.rb ├── helper_classes.rb ├── redis-test.conf └── mapredus_spec.rb ├── Rakefile ├── mapredus.gemspec └── README.md /VERSION: -------------------------------------------------------------------------------- 1 | 0.0.6 2 | -------------------------------------------------------------------------------- /.document: -------------------------------------------------------------------------------- 1 | README.rdoc 2 | lib/**/*.rb 3 | bin/* 4 | features/**/*.feature 5 | LICENSE 6 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "http://rubygems.org" 2 | gem "jeweler" 3 | gem "rake" 4 | gem "rspec" 5 | gem "redis" 6 | gem "resque" 7 | gem "resque-scheduler" 8 | gem "redis_support", "0.0.12" 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## MAC OS 2 | .DS_Store 3 | 4 | ## TEXTMATE 5 | *.tmproj 6 | tmtags 7 | 8 | ## EMACS 9 | *~ 10 | \#* 11 | .\#* 12 | 13 | ## VIM 14 | *.swp 15 | 16 | ## PROJECT::GENERAL 17 | coverage 18 | rdoc 19 | pkg 20 | log 21 | 22 | ## PROJECT::SPECIFIC 23 | .bundle 24 | Gemfile.lock 25 | vendor -------------------------------------------------------------------------------- /lib/mapredus/outputter.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | # 3 | # Standard readers for the input and output of Files coming out 4 | # of the FileSystem. 
5 | # 6 | class Outputter < QueueProcess 7 | def self.decode(result_key) 8 | FileSystem.get(result_key) 9 | end 10 | 11 | def self.encode(result_key, o) 12 | FileSystem.set(result_key, o) 13 | end 14 | 15 | # 16 | # type should either be "decode" or "encode" 17 | # 18 | def self.perform(type, o) 19 | send(type, o) 20 | end 21 | end 22 | 23 | class JsonOutputter < Outputter 24 | def self.decode(result_key) 25 | Helper.decode(FileSystem.get(result_key)) 26 | end 27 | 28 | def self.encode(result_key, o) 29 | FileSystem.set(result_key, Helper.encode(o)) 30 | end 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /lib/mapredus/mapper.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | # Map is a function that takes a data chunk 3 | # where each data chunk is a list of pieces of your raw data 4 | # and emits a list of key, value pairs. 5 | # 6 | # The output of the map shall always be 7 | # [ [key, value], [key, value], ... ] 8 | # 9 | # Note: Values must be string, integers, booleans, or floats. 10 | # i.e., They must be primitive types since these are the only 11 | # types that redis supports and since anything inputted into 12 | # redis becomes a string. 13 | class Mapper < QueueProcess 14 | def self.map(data_chunk); raise InvalidMapper; end 15 | 16 | def self.perform(pid, data_key) 17 | process = Process.open(pid) 18 | data_chunk = FileSystem.hget(ProcessInfo.input(pid), data_key) 19 | map( data_chunk ) do |*key_value| 20 | process.emit_intermediate(*key_value) 21 | end 22 | ensure 23 | Master.free_slave(pid) 24 | process.next_state 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /lib/mapredus/finalizer.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | # Run the stuff you want to run at the end of the process. 
3 | # Define subclass which defines self.finalize and self.serialize 4 | # to do what is needed when you want to get the final output 5 | # out of redis and into ruby. 6 | # 7 | # This is basically the message back to the user program that a 8 | # process is completed storing the necessary info. 9 | # 10 | class Finalizer < QueueProcess 11 | 12 | # The default finalizer is to notify of process completion 13 | # 14 | # Example 15 | # Finalizer::finalize(pid) 16 | # # => "MapRedus Process : 111 : has completed" 17 | # 18 | # Returns a message notification 19 | def self.finalize(pid) 20 | "MapRedus Process : #{pid} : has completed" 21 | end 22 | 23 | def self.perform(pid) 24 | process = Process.open(pid) 25 | result = finalize(process) 26 | Master.finish_metrics(pid) 27 | result 28 | ensure 29 | Master.free_slave(pid) 30 | process.next_state 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/mapredus/inputter.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | class InputStream < QueueProcess 3 | # 4 | # An InputSteam needs to implement a way to scan through the 5 | # data_object (the object data that is sent to the MapRedus 6 | # process). The scan function implements how the data object is 7 | # broken sizable pieces for the mappers to operate on. 8 | # 9 | # It does this by yielding a pair. The key 10 | # specifies the location storage in redis. map_data is string 11 | # data that will be written to the redis. 12 | # 13 | # Example 14 | # scan(data_object) do |key, map_data| 15 | # ... 
16 | # end 17 | def self.scan(*data_object) 18 | raise InvalidInputStream 19 | end 20 | 21 | def self.perform(pid, data_object) 22 | process = Process.open(pid) 23 | scan(*data_object) do |key, map_data| 24 | FileSystem.hset(ProcessInfo.input(pid), key, map_data) 25 | Master.enslave_map(process, key) 26 | end 27 | ensure 28 | Master.free_slave(pid) 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010 Dolores Labs 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /spec/helper.rb: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'spec' 3 | 4 | dir = File.dirname(__FILE__) 5 | $LOAD_PATH.unshift(File.join(dir, '..', 'lib')) 6 | $LOAD_PATH.unshift(dir) 7 | require 'mapredus' 8 | 9 | # 10 | # make sure we can run redis 11 | # 12 | if !system("which redis-server") 13 | puts '', "** can't find `redis-server` in your path" 14 | abort '' 15 | end 16 | 17 | # 18 | # start our own redis when the tests start, 19 | # kill it when they end (redis is run as a daemon) 20 | # 21 | puts "Starting redis for testing at localhost:9736..." 22 | `redis-server #{dir}/redis-test.conf` 23 | 24 | at_exit do 25 | # 26 | # hope that no other processes have redis-test in the name... 27 | # TODO: fixme 28 | # 29 | pid = `ps -A -o pid,command | grep [r]edis-test`.split(" ")[0] 30 | puts "Killing test redis server..." 31 | `rm -f #{dir}/dump.rdb` 32 | Process.kill("KILL", pid.to_i) 33 | end 34 | 35 | # 36 | # Set the redis server 37 | # 38 | RedisSupport.redis = 'localhost:9736:0' 39 | MapRedus.redis = 'localhost:9736:0' 40 | Resque.redis = MapRedus.redis 41 | require 'resque/failure/redis' 42 | Resque::Failure.backend = Resque::Failure::Redis 43 | 44 | require 'helper_classes' 45 | 46 | def work_off 47 | Resque::Worker.new("*").work(0) 48 | end 49 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'bundler' 3 | Bundler.setup 4 | 5 | require 'rake' 6 | require 'spec/rake/spectask' 7 | 8 | begin 9 | require 'jeweler' 10 | Jeweler::Tasks.new do |gem| 11 | gem.name = "mapredus" 12 | gem.summary = %Q{mapredus initial} 13 | gem.description = %Q{simple mapreduce framework using redis and resque} 14 | gem.email = "john@doloreslabs.com" 15 | gem.homepage = 
"http://github.com/dolores/mapredus" 16 | gem.authors = ["John Le", "Brian O'Rourke"] 17 | gem.files = Dir['lib/**/*.rb'] 18 | gem.add_dependency "redis", ">= 1.0.4" 19 | gem.add_dependency "resque", ">= 1.8" 20 | gem.add_dependency "resque-scheduler" 21 | gem.add_dependency "redis_support", ">= 0" 22 | end 23 | Jeweler::GemcutterTasks.new 24 | rescue LoadError 25 | puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler" 26 | end 27 | 28 | require 'rake/rdoctask' 29 | Rake::RDocTask.new do |rdoc| 30 | version = File.exist?('VERSION') ? File.read('VERSION') : "" 31 | 32 | rdoc.rdoc_dir = 'rdoc' 33 | rdoc.title = "mapredus #{version}" 34 | rdoc.rdoc_files.include('README*') 35 | rdoc.rdoc_files.include('lib/**/*.rb') 36 | end 37 | 38 | Spec::Rake::SpecTask.new(:spec) do |t| 39 | t.spec_files = FileList['spec/helper.rb', 'spec/helper_classes.rb', 'spec/mapredus_spec.rb'] 40 | t.spec_opts = ["--color", "--format", "specdoc", 41 | "-f", "o:log/spec_profile.txt", 42 | "-f", "e:log/spec_failing.txt"] 43 | end 44 | 45 | task :spec => :check_dependencies 46 | 47 | task :default => :spec 48 | -------------------------------------------------------------------------------- /lib/mapredus/reducer.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | # Reduce is a function that takes in "all" the values for a single given key 3 | # and outputs a list of values or a single value that usually "reduces" 4 | # the initial given value set. 5 | # 6 | # The output of the reduce shall always be 7 | # reduce(values) = [ reduced value, reduced value, ... ] 8 | # and it will often only be a single element array 9 | # 10 | # The input values and the output values of the reduce will always 11 | # be a string. As described in the paper, it is up to the client 12 | # to define how to deal with this restriction. 
13 | # 14 | class Reducer < QueueProcess 15 | # 16 | # After a recoverable fail this describes how much time we shall wait before 17 | # readding the reducer back on to the queue. 18 | # 19 | DEFAULT_WAIT = 10 # seconds 20 | def self.wait; DEFAULT_WAIT; end 21 | 22 | def self.reduce(values); raise InvalidReducer; end 23 | 24 | # 25 | # The overridable portion of a reducer perform. In some default 26 | # classes like Identity and Counter we do not call self.reduce but 27 | # provide optimization for the reduction by overriding this 28 | # method. 29 | # 30 | def self.reduce_perform(process, key) 31 | reduce(process.map_values(key)) do |reduce_val| 32 | process.emit( key, reduce_val ) 33 | end 34 | end 35 | 36 | # Doesn't handle redundant workers and fault tolerance 37 | # 38 | # TODO: Resque::AutoRetry might mess this up. 39 | def self.perform(pid, key) 40 | process = Process.open(pid) 41 | reduce_perform(process, key) 42 | rescue MapRedus::RecoverableFail 43 | Master.enslave_later_reduce(process, key) 44 | ensure 45 | Master.free_slave(pid) 46 | process.next_state 47 | end 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /lib/mapredus/support.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | module Support 3 | class MapRedusRunnerError < StandardError; end 4 | class DuplicateProcessDefinitionError < MapRedusRunnerError ; end 5 | 6 | class Runner 7 | def initialize(class_name) 8 | @class = class_name 9 | end 10 | 11 | def method_missing(method, *args, &block) 12 | mr_process = "#{@class}_#{method.to_s}" 13 | if self.respond_to?(mr_process) 14 | self.send(mr_process, *args, &block) 15 | else 16 | super(method, *args, &block) 17 | end 18 | end 19 | end 20 | 21 | def mapreduce 22 | @mapreduce_runner ||= Runner.new(self.class.to_s.gsub(/\W/,"_")) 23 | end 24 | 25 | module ClassMethods 26 | def mapreduce_process( process_name, mapredus_process_class, 
result_store ) 27 | runner_self = Runner 28 | class_name = self.to_s.gsub(/\W/,"_") 29 | 30 | global_process_name = "#{class_name}_#{process_name.to_s}" 31 | 32 | if runner_self.methods.include?(global_process_name) 33 | raise DuplicateProcessDefintionError 34 | end 35 | 36 | mapredus_process_class.set_result_key( result_store ) 37 | 38 | runner_self.send( :define_method, global_process_name ) do |data, key_arguments| 39 | process = mapredus_process_class.create 40 | process.update(:key_args => key_arguments) 41 | process.run(data) 42 | process 43 | end 44 | 45 | runner_self.send( :define_method, "#{global_process_name}_result" ) do |key_arguments, *outputter_args| 46 | key = mapredus_process_class.result_key( *key_arguments ) 47 | mapredus_process_class.outputter.decode( key, *outputter_args) 48 | end 49 | end 50 | end 51 | 52 | def self.included(model) 53 | model.extend ClassMethods 54 | end 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /lib/mapredus/filesystem.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | # Manages the book keeping of redis keys and redis usage 3 | # provides the data storage for process information through redis 4 | # All interaction with redis should go through this class 5 | # 6 | class FileSystem 7 | def self.storage 8 | MapRedus.redis 9 | end 10 | 11 | # Save/Read functions to save/read values for a redis key 12 | # 13 | # Examples 14 | # FileSystem.save( key, value ) 15 | def self.save(key, value, time = nil) 16 | storage.set(key, value) 17 | storage.expire(key, time) if time 18 | end 19 | 20 | def self.method_missing(method, *args, &block) 21 | if storage.respond_to?(method) 22 | storage.send(method, *args) 23 | else 24 | super 25 | end 26 | end 27 | 28 | # Copy the values from one key to a second key 29 | # 30 | # NOTE TODO: currently only works for the redis list data 31 | # structure but will be extended for 
arbitrary data types. 32 | # 33 | # NOTE: this does not account for the key being changed during the 34 | # copy, so should not be used in situations where the first_key 35 | # value can change during the running of copy. 36 | # 37 | # Examples 38 | # FileSystem.copy("key_one", "key_two") 39 | # 40 | # returns true on success false otherwise 41 | def self.copy(first_key, second_key) 42 | list_length = storage.llen(first_key) 43 | list_length.times do |index| 44 | storage.rpush(second_key, storage.lindex(first_key, index)) 45 | end 46 | true 47 | end 48 | 49 | # Setup locks on results using RedisSupport lock functionality 50 | # 51 | # Examples 52 | # FileSystem::has_lock?(key) 53 | # # => true or false 54 | # 55 | # Returns true if there's a lock 56 | def self.has_lock?(key) 57 | MapRedus.has_redis_lock?( RedisKey.result_cache(key) ) 58 | end 59 | 60 | def self.acquire_lock(key) 61 | MapRedus.acquire_redis_lock_nonblock( RedisKey.result_cache(key), 60 * 60 ) 62 | end 63 | 64 | def self.release_lock(key) 65 | MapRedus.release_redis_lock( RedisKey.result_cache(key) ) 66 | end 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /mapredus.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command 4 | # -*- encoding: utf-8 -*- 5 | 6 | Gem::Specification.new do |s| 7 | s.name = %q{mapredus} 8 | s.version = "0.0.6" 9 | 10 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? 
:required_rubygems_version= 11 | s.authors = ["John Le", "Brian O'Rourke"] 12 | s.date = %q{2010-08-05} 13 | s.description = %q{simple mapreduce framework using redis and resque} 14 | s.email = %q{john@doloreslabs.com} 15 | s.extra_rdoc_files = [ 16 | "LICENSE", 17 | "README.md" 18 | ] 19 | s.files = [ 20 | "lib/mapredus.rb", 21 | "lib/mapredus/default_classes.rb", 22 | "lib/mapredus/filesystem.rb", 23 | "lib/mapredus/finalizer.rb", 24 | "lib/mapredus/inputter.rb", 25 | "lib/mapredus/keys.rb", 26 | "lib/mapredus/mapper.rb", 27 | "lib/mapredus/master.rb", 28 | "lib/mapredus/outputter.rb", 29 | "lib/mapredus/process.rb", 30 | "lib/mapredus/reducer.rb", 31 | "lib/mapredus/support.rb" 32 | ] 33 | s.homepage = %q{http://github.com/dolores/mapredus} 34 | s.rdoc_options = ["--charset=UTF-8"] 35 | s.require_paths = ["lib"] 36 | s.rubygems_version = %q{1.3.7} 37 | s.summary = %q{mapredus initial} 38 | s.test_files = [ 39 | "spec/helper_classes.rb", 40 | "spec/mapredus_spec.rb", 41 | "spec/helper.rb" 42 | ] 43 | 44 | if s.respond_to? 
:specification_version then 45 | current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION 46 | s.specification_version = 3 47 | 48 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then 49 | s.add_runtime_dependency(%q, [">= 1.0.4"]) 50 | s.add_runtime_dependency(%q, [">= 1.8"]) 51 | s.add_runtime_dependency(%q, [">= 0"]) 52 | s.add_runtime_dependency(%q, [">= 0"]) 53 | else 54 | s.add_dependency(%q, [">= 1.0.4"]) 55 | s.add_dependency(%q, [">= 1.8"]) 56 | s.add_dependency(%q, [">= 0"]) 57 | s.add_dependency(%q, [">= 0"]) 58 | end 59 | else 60 | s.add_dependency(%q, [">= 1.0.4"]) 61 | s.add_dependency(%q, [">= 1.8"]) 62 | s.add_dependency(%q, [">= 0"]) 63 | s.add_dependency(%q, [">= 0"]) 64 | end 65 | end 66 | 67 | -------------------------------------------------------------------------------- /spec/helper_classes.rb: -------------------------------------------------------------------------------- 1 | class CharStream < MapRedus::InputStream 2 | def self.scan(data_object) 3 | test_string = MapRedus::FileSystem.get(data_object) 4 | 5 | 0.step(test_string.size, 30) do |index| 6 | char_set = test_string[index...(index+30)] 7 | next if char_set.empty? 
8 | yield(index, char_set) 9 | end 10 | end 11 | end 12 | 13 | class CharCounter < MapRedus::Mapper 14 | def self.map(map_data) 15 | map_data.each_char do |char| 16 | yield(char, 1) 17 | end 18 | end 19 | end 20 | 21 | class ExtraResultKeyHash < MapRedus::Finalizer 22 | def self.finalize(process) 23 | process.each_key_reduced_value do |key, value| 24 | process.outputter.encode(process.result_key("extra"), key, value) 25 | end 26 | end 27 | end 28 | 29 | class GetCharCount < MapRedus::Process 30 | EXPECTED_ANSWER = {"k"=>2, "v"=>1, " "=>54, ","=>3, "w"=>7, "a"=>17, "l"=>12, "b"=>2, "m"=>4, "c"=>3, "."=>2, "y"=>3, "n"=>18, "D"=>1, "d"=>15, "o"=>13, "p"=>14, "e"=>34, "f"=>6, "r"=>13, "g"=>6, "S"=>1, "s"=>12, "h"=>19, "H"=>1, "t"=>20, "i"=>16, "u"=>5, "j"=>1} 31 | inputter CharStream 32 | mapper CharCounter 33 | end 34 | 35 | class CharCountTest < MapRedus::Process 36 | inputter CharStream 37 | mapper CharCounter 38 | end 39 | 40 | class GetWordCount < MapRedus::Process 41 | TEST = "He pointed his finger in friendly jest and went over to the parapet laughing to himself. Stephen Dedalus stepped up, followed him wearily halfway and sat down on the edge of the gunrest, watching him still as he propped his mirror on the parapet, dipped the brush in the bowl and lathered cheeks and neck." 
42 | EXPECTED_ANSWER = {"gunrest"=>1, "over"=>1, "still"=>1, "of"=>1, "him"=>2, "and"=>4, "bowl"=>1, "himself"=>1, "went"=>1, "friendly"=>1, "finger"=>1, "propped"=>1, "cheeks"=>1, "dipped"=>1, "down"=>1, "wearily"=>1, "up"=>1, "stepped"=>1, "dedalus"=>1, "to"=>2, "in"=>2, "sat"=>1, "the"=>6, "pointed"=>1, "as"=>1, "followed"=>1, "stephen"=>1, "laughing"=>1, "his"=>2, "he"=>2, "brush"=>1, "jest"=>1, "neck"=>1, "mirror"=>1, "edge"=>1, "on"=>2, "parapet"=>2, "lathered"=>1, "watching"=>1, "halfway"=>1} 43 | set_result_key "test:result" 44 | end 45 | 46 | class TestHash < MapRedus::Finalizer 47 | def self.finalize(process) 48 | process.each_key_reduced_value do |key, value| 49 | process.outputter.encode(process.result_key("extra_arg"), key, value) 50 | end 51 | end 52 | end 53 | 54 | class TestResultKeyArguments < MapRedus::Process 55 | # 56 | # EXTRA_KEY_ARG is not known at the time the process is run 57 | # but it is known by the time the finalizer is running 58 | # 59 | finalizer TestHash 60 | set_result_key "test:KEY_ARG:test:EXTRA_KEY_ARG" 61 | key_args ["key_argument"] 62 | end 63 | 64 | class Document 65 | include MapRedus::Support 66 | mapreduce_process :char_count, GetCharCount, "document:count:ID" 67 | 68 | attr_accessor :id 69 | def initialize(id) 70 | @id = id 71 | end 72 | 73 | def calculate_chars(data_reference) 74 | mapreduce.char_count(data_reference, id) 75 | end 76 | end 77 | -------------------------------------------------------------------------------- /lib/mapredus/keys.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | RedisKey = MapRedus::Keys 3 | ProcessInfo = RedisKey 4 | 5 | #### USED WITHIN process.rb #### 6 | 7 | # Holds the current map reduce processes that are either running or which still have data lying around 8 | # 9 | redis_key :processes, "mapredus:processes" 10 | redis_key :processes_count, "mapredus:processes:count" 11 | 12 | # Holds the information (mapper, reducer, etc.) 
in json format for a map reduce process with pid PID 13 | # 14 | redis_key :pid, "mapredus:process:PID" 15 | 16 | # The input blocks broken down by the InputStream 17 | redis_key :input, "mapredus:process:PID:input" 18 | 19 | # All the keys that the map produced 20 | # 21 | redis_key :keys, "mapredus:process:PID:keys" 22 | 23 | # The hashed key to actual string value of key 24 | # 25 | redis_key :hash_to_key, "mapredus:process:PID:keys:HASHED_KEY" # to ACTUAL KEY 26 | 27 | # The list of values for a given key generated by our map function. 28 | # When a reduce is run it takes elements from this key and pushes them to :reduce 29 | # 30 | # key - list of values 31 | # 32 | redis_key :map, "mapredus:process:PID:map_key:HASHED_KEY" 33 | redis_key :reduce, "mapredus:process:PID:map_key:HASHED_KEY:reduce" 34 | 35 | # Temporary redis space for reduce functions to use 36 | # 37 | redis_key :temp, "mapredus:process:PID:temp_reduce_key:HASHED_KEY:UNIQUE_REDUCE_HOSTNAME:UNIQUE_REDUCE_PROCESS_ID" 38 | 39 | # The default location for the result 40 | # 41 | DEFAULT_RESULT_KEY = "mapredus:process:PID:result" 42 | 43 | #### USED WITHIN master.rb #### 44 | 45 | # Keeps track of the current slaves (by appending "1" to a redis list) 46 | # 47 | # TODO: should append some sort of proper process id so we can explicitly keep track 48 | # of processes 49 | # 50 | redis_key :slaves, "mapredus:process:PID:master:slaves" 51 | 52 | # 53 | # Use these constants to keep track of the progress of a process 54 | # 55 | # Example 56 | # state => map_in_progress 57 | # reduce_in_progress 58 | # finalize_in_progress 59 | # complete 60 | # failed 61 | # not_started 62 | # 63 | # contained in the ProcessInfo hash (redis_key :state, "mapredus:process:PID:master:state") 64 | # 65 | NOT_STARTED = "not_started" 66 | INPUT_MAP_IN_PROGRESS = "mappers" 67 | REDUCE_IN_PROGRESS = "reducers" 68 | FINALIZER_IN_PROGRESS = "finalizer" 69 | COMPLETE = "complete" 70 | FAILED = "failed" 71 | STATE_MACHINE = { nil => 
NOT_STARTED, 72 | NOT_STARTED => INPUT_MAP_IN_PROGRESS, 73 | INPUT_MAP_IN_PROGRESS => REDUCE_IN_PROGRESS, 74 | REDUCE_IN_PROGRESS => FINALIZER_IN_PROGRESS, 75 | FINALIZER_IN_PROGRESS => COMPLETE} 76 | 77 | # These keep track of timing information for a map reduce process of pid PID 78 | # 79 | redis_key :requested_at, "mapredus:process:PID:request_at" 80 | redis_key :started_at, "mapredus:process:PID:started_at" 81 | redis_key :finished_at, "mapredus:process:PID:finished_at" 82 | redis_key :recent_time_to_complete, "mapredus:process:recent_time_to_complete" 83 | end 84 | -------------------------------------------------------------------------------- /lib/mapredus/default_classes.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | class WordStream < InputStream 3 | def self.scan(data_object) 4 | # 5 | # The data_object should be a reference to an object that is 6 | # stored on your system. The scanner is used to break up what you 7 | # need from the object into manageable pieces for the mapper. In 8 | # this example, the data object is a reference to a redis string. 9 | # 10 | test_string = FileSystem.get(data_object) 11 | 12 | test_string.split.each_slice(10).each_with_index do |word_set, i| 13 | yield(i, word_set.join(" ")) 14 | end 15 | end 16 | end 17 | 18 | ################################################################################ 19 | 20 | class WordCounter < Mapper 21 | def self.map(map_data) 22 | map_data.split(/\W/).each do |word| 23 | next if word.empty? 24 | yield(word.downcase, 1) 25 | end 26 | end 27 | end 28 | 29 | ####################################REDUCERS#################################### 30 | 31 | class Adder < Reducer 32 | def self.reduce(value_list) 33 | yield( value_list.reduce(0) { |r, v| r += v.to_i } ) 34 | end 35 | end 36 | 37 | # Emits the identity function on the map values. 
38 | # 39 | # The identity reducer should never actually have to reduce as a 40 | # special class in mapredus, the values should just be copied from 41 | # one key to a new key directly in redis. 42 | class Identity < Reducer 43 | def self.reduce_perform(process, key) 44 | FileSystem.copy( process.map_key(key), process.reduce_key(key) ) 45 | end 46 | 47 | def self.reduce(value_list) 48 | value_list.each do |v| 49 | yield v 50 | end 51 | end 52 | end 53 | 54 | # Emits the length of the mapped value list. 55 | # 56 | # The counter reducer tells how many values were emitted by the 57 | # mapper. In situations where an adder could used but only has to 58 | # sum up 1's, counter will be much faster. 59 | # 60 | # This works in MapRedus because all the values produced for one key 61 | # is processed (reduced) by a single worker. 62 | class Counter < Reducer 63 | def self.reduce_perform(process, key) 64 | process.emit(key, FileSystem.llen(process.map_key(key))) 65 | end 66 | 67 | def self.reduce(value_list) 68 | yield value_list.size 69 | end 70 | end 71 | 72 | ################################################################################ 73 | 74 | class ToRedisHash < Finalizer 75 | def self.finalize(process) 76 | process.each_key_reduced_value do |key, value| 77 | process.outputter.encode(process.result_key, key, value) 78 | end 79 | end 80 | end 81 | 82 | class RedisHasher < Outputter 83 | def self.to_hash(result_key) 84 | keys(result_key).inject({}) do |hash, key| 85 | hash[key] = decode(result_key, key) 86 | hash 87 | end 88 | end 89 | 90 | def self.values(result_key) 91 | FileSystem.hvals(result_key) 92 | end 93 | 94 | def self.keys(result_key) 95 | FileSystem.hkeys(result_key) 96 | end 97 | 98 | def self.encode(result_key, k, v) 99 | FileSystem.hset(result_key, k, v) 100 | end 101 | 102 | def self.decode(result_key, k) 103 | FileSystem.hget(result_key, k) 104 | end 105 | end 106 | end 107 | 
-------------------------------------------------------------------------------- /lib/mapredus.rb: -------------------------------------------------------------------------------- 1 | require 'redis' 2 | require 'redis_support' 3 | require 'resque' 4 | require 'resque_scheduler' 5 | 6 | module MapRedus 7 | include RedisSupport 8 | 9 | class InvalidProcess < NotImplementedError 10 | def initialize; super("MapRedus QueueProcess: need to have perform method defined");end 11 | end 12 | 13 | class ProcessSpecificationError < InvalidProcess 14 | def initialize; super("MapRedus Process: need to have the specification defined");end 15 | end 16 | 17 | class InvalidMapper < NotImplementedError 18 | def initialize; super("MapRedus Mapper: need to have map method defined");end 19 | end 20 | 21 | class InvalidReducer < NotImplementedError 22 | def initialize; super("MapRedus Reducer: need to have reduce method defined");end 23 | end 24 | 25 | class InvalidInputStream < NotImplementedError 26 | def initialize; super("MapRedus InputStream: need to have scan method defined");end 27 | end 28 | 29 | class InvalidProcess < NotImplementedError 30 | def initialize; super("MapRedus Process Creation Failed: Specifications were not specified");end 31 | end 32 | 33 | class RecoverableFail < StandardError 34 | def initialize; super("MapRedus Operation Failed: but it is recoverable") ;end 35 | end 36 | 37 | # All Queue Processes should have a function called perform 38 | # ensuring that when the class is put on the resque queue it can perform its work 39 | # 40 | # Caution: defines redis, which is also defined in RedisSupport 41 | # 42 | class QueueProcess 43 | def self.queue; :mapredus; end 44 | def self.perform(*args); raise InvalidProcess; end 45 | end 46 | 47 | # TODO: When you send work to a worker using a mapper you define, 48 | # the worker won't have that class name defined, unless it was started up 49 | # with the class loaded 50 | # 51 | def register_reducer(klass); end; 52 | def 
register_mapper(klass); end; 53 | 54 | class Helper 55 | # resque helpers defines 56 | # redis 57 | # encode 58 | # decode 59 | # classify 60 | # constantize 61 | # 62 | # This is extended here because we want to use the encode and decode function 63 | # when we interact with resque queues 64 | extend Resque::Helpers 65 | 66 | # Defines a hash by taking the absolute value of ruby's string 67 | # hash to rid the dashes since redis keys should not contain any. 68 | # 69 | # key - The key to be hashed. 70 | # 71 | # Examples 72 | # 73 | # Support::key_hash( key ) 74 | # # => '8dd8hflf8dhod8doh9hef' 75 | # 76 | # Returns the hash. 77 | def self.key_hash( key ) 78 | key.to_s.hash.abs.to_s(16) 79 | end 80 | 81 | # Returns the classname of the namespaced class. 82 | # 83 | # The full name of the class. 84 | # 85 | # Examples 86 | # 87 | # Support::class_get( Super::Long::Namespace::ClassName ) 88 | # # => 'ClassName' 89 | # 90 | # Returns the class name. 91 | def self.class_get(string) 92 | constantize(string) 93 | end 94 | end 95 | end 96 | 97 | require 'mapredus/keys' 98 | require 'mapredus/filesystem' 99 | require 'mapredus/master' 100 | require 'mapredus/mapper' 101 | require 'mapredus/reducer' 102 | require 'mapredus/finalizer' 103 | require 'mapredus/support' 104 | require 'mapredus/outputter' 105 | require 'mapredus/inputter' 106 | require 'mapredus/default_classes' 107 | require 'mapredus/process' 108 | -------------------------------------------------------------------------------- /spec/redis-test.conf: -------------------------------------------------------------------------------- 1 | # Redis configuration file example 2 | 3 | # By default Redis does not run as a daemon. Use 'yes' if you need it. 4 | # Note that Redis will write a pid file in /var/run/redis.pid when daemonized. 5 | daemonize yes 6 | # When run as a daemon, Redis write a pid file in /var/run/redis.pid by default. 7 | # You can specify a custom pid file location here. 
8 | pidfile ./spec/redis-test.pid 9 | 10 | # Accept connections on the specified port, default is 6379 11 | port 9736 12 | 13 | # If you want you can bind a single interface, if the bind option is not 14 | # specified all the interfaces will listen for connections. 15 | # 16 | # bind 127.0.0.1 17 | 18 | # Close the connection after a client is idle for N seconds (0 to disable) 19 | timeout 300 20 | 21 | # Save the DB on disk: 22 | # 23 | # save 24 | # 25 | # Will save the DB if both the given number of seconds and the given 26 | # number of write operations against the DB occurred. 27 | # 28 | # In the example below the behaviour will be to save: 29 | # after 900 sec (15 min) if at least 1 key changed 30 | # after 300 sec (5 min) if at least 10 keys changed 31 | # after 60 sec if at least 10000 keys changed 32 | save 900 1 33 | save 300 10 34 | save 60 10000 35 | 36 | # The filename where to dump the DB 37 | dbfilename dump.rdb 38 | 39 | # For default save/load DB in/from the working directory 40 | # Note that you must specify a directory not a file name. 41 | dir ./spec/ 42 | 43 | # Set server verbosity to 'debug' 44 | # it can be one of: 45 | # debug (a lot of information, useful for development/testing) 46 | # notice (moderately verbose, what you want in production probably) 47 | # warning (only very important / critical messages are logged) 48 | loglevel debug 49 | 50 | # Specify the log file name. Also 'stdout' can be used to force 51 | # the demon to log on the standard output. Note that if you use standard 52 | # output for logging but daemonize, logs will be sent to /dev/null 53 | logfile stdout 54 | 55 | # Set the number of databases. The default database is DB 0, you can select 56 | # a different one on a per-connection basis using SELECT where 57 | # dbid is a number between 0 and 'databases'-1 58 | databases 16 59 | 60 | ################################# REPLICATION ################################# 61 | 62 | # Master-Slave replication. 
Use slaveof to make a Redis instance a copy of
# another Redis server. Note that the configuration is local to the slave
# so for example it is possible to configure the slave to save the DB with a
# different interval, or to listen to another port, and so on.

# slaveof

################################## SECURITY ###################################

# Require clients to issue AUTH before processing any other
# commands. This might be useful in environments in which you do not trust
# others with access to the host running redis-server.
#
# This should stay commented out for backward compatibility and because most
# people do not need auth (e.g. they run their own servers).

# requirepass foobared

################################### LIMITS ####################################

# Set the max number of connected clients at the same time. By default there
# is no limit, and it's up to the number of file descriptors the Redis process
# is able to open. The special value '0' means no limits.
# Once the limit is reached Redis will close all the new connections sending
# an error 'max number of clients reached'.

# maxclients 128

# Don't use more memory than the specified amount of bytes.
# When the memory limit is reached Redis will try to remove keys with an
# EXPIRE set. It will try to start freeing keys that are going to expire
# in little time and preserve keys with a longer time to live.
# Redis will also try to remove objects from free lists if possible.
#
# If all this fails, Redis will start to reply with errors to commands
# that will use more memory, like SET, LPUSH, and so on, and will continue
# to reply to most read-only commands like GET.
#
# WARNING: maxmemory can be a good idea mainly if you want to use Redis as a
# 'state' server or cache, not as a real DB.
When Redis is used as a real 102 | # database the memory usage will grow over the weeks, it will be obvious if 103 | # it is going to use too much memory in the long run, and you'll have the time 104 | # to upgrade. With maxmemory after the limit is reached you'll start to get 105 | # errors for write operations, and this may even lead to DB inconsistency. 106 | 107 | # maxmemory 108 | 109 | ############################### ADVANCED CONFIG ############################### 110 | 111 | # Glue small output buffers together in order to send small replies in a 112 | # single TCP packet. Uses a bit more CPU but most of the times it is a win 113 | # in terms of number of queries per second. Use 'yes' if unsure. 114 | glueoutputbuf yes 115 | -------------------------------------------------------------------------------- /lib/mapredus/master.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | # Note: Instead of using Resque directly within the process, we implement 3 | # a master interface with Resque 4 | # 5 | # Does bookkeeping to keep track of how many slaves are doing work. If we have 6 | # no slaves doing work for a process then the process is donex. While there is work available 7 | # the slaves will always be doing work. 8 | # 9 | class Master < QueueProcess 10 | # Check whether there are still workers working on process PID's processes 11 | # 12 | # In synchronous condition, master is always working since nothing is going to 13 | # the queue. 
    def self.working?(pid)
      # A non-empty slave list means at least one enslaved worker has not yet
      # called free_slave for this pid.
      0 < FileSystem.llen(ProcessInfo.slaves(pid))
    end

    #
    # Master performs the work that it needs to do:
    #   it must free itself as a slave from Resque
    #   enslave mappers
    #
    def self.perform( pid, data_object )
      process = Process.open(pid)
      enslave_inputter(process, *data_object)
      process.update(:state => INPUT_MAP_IN_PROGRESS)
    end

    #
    # The order of operations that occur in the mapreduce process
    #
    # The inputter sets off the mapper processes
    #
    def self.mapreduce( process, *data_object )
      start_metrics(process.pid)
      if process.synchronous
        # Synchronous path: each phase runs inline, so the state is advanced
        # here rather than by the workers themselves.
        process.update(:state => INPUT_MAP_IN_PROGRESS)
        enslave_inputter(process, *data_object)
        process.update(:state => REDUCE_IN_PROGRESS)
        enslave_reducers(process)
        process.update(:state => FINALIZER_IN_PROGRESS)
        enslave_finalizer(process)
      else
        Resque.push(QueueProcess.queue, {:class => MapRedus::Master , :args => [process.pid, data_object]} )
      end
    end

    # Kick off the process's inputter, which will in turn enslave mappers.
    def self.enslave_inputter(process, *data_object)
      enslave( process, process.inputter, process.pid, *data_object )
    end

    # Enslave the reducers:
    #
    # For each key, enslave a reducer to process the values on that
    # key. If there were no keys produced during the map operation we
    # must set off the finalizer.
    #
    # TODO: inject optimizations here for special reducers like the
    # identity reduce
    #
    # returns nothing
    def self.enslave_reducers( process )
      if( process.num_keys > 0 )
        process.map_keys.each do |key|
          enslave_reduce( process, key )
        end
      else
        # No map output at all: skip the reduce phase entirely.
        process.next_state
      end
    end

    # Enqueue the finalizer for the process.
    def self.enslave_finalizer( process )
      enslave( process, process.finalizer, process.pid )
    end

    # Have these to match what the Mapper/Reducer perform function expects to see as arguments
    #
    # though instead of process the perform function will receive the pid
    def self.enslave_map(process, data_chunk)
      enslave( process, process.mapper, process.pid, data_chunk )
    end

    def self.enslave_reduce(process, key)
      enslave( process, process.reducer, process.pid, key )
    end

    # Re-enqueue a reduce after the reducer's wait interval (used when a
    # reduce fails recoverably).
    def self.enslave_later_reduce(process, key)
      enslave_later( process.reducer.wait, process, process.reducer, process.pid, key )
    end

    # The current default (QUEUE) that we push on to is
    #   :mapredus
    #
    def self.enslave( process, klass, *args )
      # Record the slave BEFORE doing/queueing the work so working? is never
      # falsely empty while a job is in flight.
      FileSystem.rpush(ProcessInfo.slaves(process.pid), 1)

      if( process.synchronous )
        klass.perform(*args)
      else
        Resque.push( klass.queue, { :class => klass.to_s, :args => args } )
      end
    end

    def self.enslave_later( delay_in_seconds, process, klass, *args)
      FileSystem.rpush(ProcessInfo.slaves(process.pid), 1)

      if( process.synchronous )
        klass.perform(*args)
      else
        #
        # TODO: I cannot get enqueue_in to work with my tests
        # there seems to be a silent failure somewhere
        # in the tests such that it never calls the function
        # and the queue gets emptied
        #
        # Resque.enqueue_in(delay_in_seconds, klass, *args)

        ##
        ## Temporary, immediately just push process back onto the resque queue
        ## NOTE(review): delay_in_seconds is currently ignored on this path.
        Resque.push( klass.queue, { :class => klass.to_s, :args => args } )
      end
    end

    # List the outstanding slave markers for pid (one "1" per busy worker).
    def self.slaves(pid)
      FileSystem.lrange(ProcessInfo.slaves(pid), 0, -1)
    end

    # Pop one slave marker; called by a worker when it finishes its job.
    def self.free_slave(pid)
      FileSystem.lpop(ProcessInfo.slaves(pid))
    end

    # Remove every queued Resque job belonging to process pid and clear its
    # slave bookkeeping.  Returns the number of queue entries destroyed.
    def self.emancipate(pid)
      process = Process.open(pid)
      return unless process

      # Working on resque directly seems dangerous
      #
      # Warning: this is supposed to be used as a debugging operation
      # and isn't intended for normal use. It is potentially very expensive.
      #
      destroyed = 0
      qs = [queue, process.mapper.queue, process.reducer.queue, process.finalizer.queue].uniq
      qs.each do |q|
        q_key = "queue:#{q}"
        Resque.redis.lrange(q_key, 0, -1).each do | string |
          json = Helper.decode(string)
          # A job belongs to us when its class is one of this process's
          # worker classes AND its first argument is this pid.
          match = json['class'] == "MapRedus::Master"
          match |= json['class'] == process.inputter.to_s
          match |= json['class'] == process.mapper.to_s
          match |= json['class'] == process.reducer.to_s
          match |= json['class'] == process.finalizer.to_s
          match &= json['args'].first.to_s == process.pid.to_s
          if match
            destroyed += Resque.redis.lrem(q_key, 0, string).to_i
          end
        end
      end

      #
      # our slave information is kept track of on file and not in Resque
      #
      FileSystem.del(ProcessInfo.slaves(pid))
      destroyed
    end

    # Time metrics for measuring how long it takes map reduce to do a process
    #
    def self.set_request_time(pid)
      FileSystem.set( ProcessInfo.requested_at(pid), Time.now.to_i )
    end

    # Stamp the process start time (epoch seconds).
    def self.start_metrics(pid)
      started = ProcessInfo.started_at( pid )
      FileSystem.set started, Time.now.to_i
    end

    # Stamp completion time, record the run's duration in a capped (30-entry)
    # recent-durations list, and expire the per-run timestamps after an hour.
    def self.finish_metrics(pid)
      started = ProcessInfo.started_at( pid )
      finished = ProcessInfo.finished_at( pid )
      requested = ProcessInfo.requested_at( pid )

      completion_time = Time.now.to_i

      FileSystem.set finished, completion_time
      time_to_complete = completion_time - FileSystem.get(started).to_i

      recent_ttcs = ProcessInfo.recent_time_to_complete
      FileSystem.lpush( recent_ttcs , time_to_complete )
      FileSystem.ltrim( recent_ttcs , 0, 30 - 1)

      FileSystem.expire finished, 60 * 60
      FileSystem.expire started, 60 * 60
      FileSystem.expire requested, 60 * 60
    end
  end
end
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
MapRedus
=========

Simple MapReduce type framework using redis and resque.

Overview
--------

This is an experimental implementation of MapReduce using Ruby for
process definition, Resque for work execution, and Redis for data
storage.

Goals:

* simple M/R-style programming for existing Ruby projects
* low cost of entry (no need for a dedicated cluster)

if you are looking for a high-performance MapReduce implementation
that can meet your big data needs, try Hadoop.


Using MapRedus
---------------

MapRedus uses Resque to handle the processes that it runs, and redis
to keep a store for the values/data produced.

Workers for a MapRedus process are Resque workers. Refer to the
Resque worker documentation to see how to load the necessary
environment for your worker to be able to run mapreduce processs. An
example is also located in the tests.

### Attaching a mapreduce process to a class

You will often want to define a mapreduce process that does some
operations on data within a class. The process should have an
inputter, mapper, reducer, finalizer, and outputter defined. By
default a process will have the specifications shown below.
There is 39 | also an example of how to do this in the tests. 40 | 41 | class GetWordCount < MapRedus::Process 42 | inputter MapRedus::WordStream 43 | mapper MapRedus::WordCounter 44 | reducer MapRedus::Adder 45 | finalizer MapRedus::ToRedisHash 46 | outputter MapRedus::RedisHasher 47 | ordered false 48 | end 49 | 50 | class GetCharCount < MapRedus::Process 51 | inputter MapRedus::CharStream 52 | mapper MapRedus::CharCounter 53 | end 54 | 55 | class Job 56 | mapreduce_process :word_count, GetWordCount, "job:store:result" 57 | end 58 | 59 | The mapreduce_process needs a name, mapper, reducer, finalizer, 60 | outputter, and key to store the result. The operation would then be 61 | run on a job calling the following. 62 | 63 | job = Job.new 64 | job.mapreduce.word_count( data ) 65 | 66 | The data specifies the data on which this operation is to run. We are 67 | currently working on a way to allow the result_store_key to change 68 | depending on class properties. For instance in the above example, if 69 | the Job class had an id attribute, we may want to store the final 70 | mapreduce result in "job:store:result:#{id}". 71 | 72 | ### Inputters, Mappers, Reducers, Finalizers 73 | 74 | MapRedus needs a input stream, mapper, reducer, finalizer to be 75 | defined to run. The input stream defines how a block of your data 76 | gets divided so that a mapper can work on a small portion to map. 
For 77 | example: 78 | 79 | class InputStream < MapRedus::InputStream 80 | def self.scan(data_object) 81 | # your data object is a reference to a block of text in redis 82 | text_block = MapRedus.redis.get(data_object) 83 | text_block.each_line.each_with_index do |line, i| 84 | yield(i, line) 85 | end 86 | end 87 | end 88 | 89 | class Mapper < MapRedus::Mapper 90 | def self.map(data_to_map) 91 | data_to_map.each do |data| 92 | key = data 93 | value = 1 94 | yield( key, value ) 95 | end 96 | end 97 | end 98 | 99 | In this example, the input stream calls yield to output a mapredus 100 | file number and a the value that is saved to file (in redis). The 101 | mapper's `map` function calls yield to emit the key value pair for 102 | storage in redis. The reducer's `reduce` function acts similarly. 103 | 104 | The finalizer runs whatever needs to be run when a process completes, 105 | an example: 106 | 107 | class Finalizer < MapRedus::Finalizer 108 | def self.finalize(process) 109 | process.each_key_reduced_value do |key, value| 110 | process.outputter.encode(process.result_key, key, value) 111 | end 112 | ... 113 | < set off a new mapredus process to use this stored data > 114 | end 115 | end 116 | 117 | The process.result_key refers the final result key that is stored in 118 | redis. The result_key may take arguments which define the output of 119 | the key. The process will also incorporate initially given key 120 | arguments into the result_key. result_key's are defined exactly as a 121 | redis_key in the redis_support gem. The outputter is needed to define 122 | how exactly that encoding is defined. We provided an outputter that 123 | encodes your data into a redis hash. 
124 | 125 | class RedisHasher < MapRedus::Outputter 126 | def encode(result_key, k, v) 127 | MapRedus::FileSystem.hset(result_key, k, v) 128 | end 129 | 130 | def decode(result_key, k) 131 | MapRedus::FileSystem.hget(result_key, k) 132 | end 133 | end 134 | 135 | The default Outputter makes no changes to original result, and tries 136 | to store that directly into redis as a string. 137 | 138 | Working Locally 139 | --------------- 140 | 141 | MapRedus uses Bundler to manage dependencies. With Bundler installed: 142 | 143 | bundle install 144 | 145 | You should now be able to run tests and do all other tasks with 146 | `rake`. 147 | 148 | Running Tests 149 | ------------- 150 | 151 | Run the tests which tests the word counter example and some other 152 | tests (you'll need to have bundler installed) 153 | rake 154 | 155 | Requirements 156 | ------------ 157 | * Bundler (this will install all the requirements below) 158 | * Redis 159 | * RedisSupport 160 | * Resque 161 | * Resque-scheduler 162 | 163 | ### Notes 164 | Instead of calling `emit_intermediate`/`emit` in your map/reduce 165 | to produce a key value pair/value you call `yield`, which will call 166 | emit_intermediate/emit for you. This gives flexibility in using 167 | Mapper/Reducer classes especially in testing. 168 | 169 | TODO 170 | ---- 171 | not necessarily in the given order 172 | * Ensure that the type that is inputted is the type that is outputted 173 | 174 | * if a process fails we do what we are supposed to do i.e. add a 175 | failure_hook which does something if your process fails 176 | 177 | * include functionality for a partitioner, input reader, combiner 178 | 179 | * implement this shit (registering of environment shit in resque) so 180 | that we can run mapreduce commands from the command line. Defining 181 | any arbitrary mapper and reducer. 
182 | 183 | * implement redundant workers (workers doing the same work in case one 184 | of them fails) 185 | 186 | * if a reducer runs a recoverable fail, then make sure that an attempt 187 | to reenslave the worker is delayed by some fixed interval 188 | 189 | * edit emit for when we have multiple workers doing the same reduce 190 | (redundant workers for fault tolerance might need to change the 191 | rpush to a lock and setting of just a value) even if other workers 192 | do work on the same answer, want to make sure that the final reduced 193 | thing is the same every time 194 | 195 | * Add fault tolerance, better tracking of which workers fail, 196 | especially when we have multiple workers doing the same work 197 | ... currently is handled by Resque failure auto retry 198 | 199 | * if a perform operation fails then we need to have worker recover 200 | 201 | * make use of finish_metrics somewhere so that we can have statistics 202 | on how long map reduce processs take 203 | 204 | * better tracking of work being assigned so we can know when a process is finished 205 | or in progress and have a trigger to do things when shit finishes 206 | 207 | in resque there is functionality for an after hook which performs 208 | something after your process does it's work 209 | 210 | might also check out the resque-status plugin for a cheap and easy 211 | way to plug status and completion-rate into existing resque jobs. 212 | 213 | * ensure reducers only do a fixed amount of work? See section 3.2 of 214 | paper. bookkeeping that tells the master when tasks are in-progress 215 | or completed. 
this will be important for better parallelization of
tasks

* think about the following logic

  + if a reducer starts working on a key after all maps have finished
    then when it is done the work on that key is finished forever

  + this would imply a process finishes when all map tasks have
    finished and all reduce tasks that start after the map tasks have
    finished

  + if a reducer started before all map tasks were finished, then load
    its reduced result back onto the value list

  + if the reducer started after all map tasks finished, then emit the
    result

Note on Patches/Pull Requests
-----------------------------

* Fork the project.
* Make your feature addition or bug fix.
* Add tests for it. This is important so I don't break it in a
  future version unintentionally.
* Commit, do not mess with rakefile, version, or history. (if you
  want to have your own version, that is fine but bump version in a
  commit by itself I can ignore when I pull)
* Send me a pull request. Bonus points for topic branches.

## Copyright
Copyright (c) 2010 Dolores Labs. See LICENSE for details.
247 | -------------------------------------------------------------------------------- /spec/mapredus_spec.rb: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/helper' 2 | 3 | describe "MapRedus" do 4 | # this is called before each test case 5 | before(:each) do 6 | MapRedus::FileSystem.flushall 7 | @process = GetWordCount.create 8 | MapRedus::FileSystem.setnx("wordstream:test", GetWordCount::TEST) 9 | end 10 | 11 | it "has sets up the correct default classes" do 12 | MapRedus::Process.inputter.should == MapRedus::WordStream 13 | MapRedus::Process.mapper.should == MapRedus::WordCounter 14 | MapRedus::Process.reducer.should == MapRedus::Adder 15 | MapRedus::Process.finalizer.should == MapRedus::ToRedisHash 16 | MapRedus::Process.outputter.should == MapRedus::RedisHasher 17 | 18 | GetWordCount.result_key.should == "test:result" 19 | GetWordCount.inputter.should == MapRedus::WordStream 20 | GetWordCount.mapper.should == MapRedus::WordCounter 21 | GetWordCount.reducer.should == MapRedus::Adder 22 | GetWordCount.finalizer.should == MapRedus::ToRedisHash 23 | GetWordCount.outputter.should == MapRedus::RedisHasher 24 | 25 | GetCharCount.inputter.should == CharStream 26 | GetCharCount.mapper.should == CharCounter 27 | GetCharCount.reducer.should == MapRedus::Adder 28 | GetCharCount.finalizer.should == MapRedus::ToRedisHash 29 | GetCharCount.outputter.should == MapRedus::RedisHasher 30 | end 31 | 32 | it "creates a process successfully" do 33 | process = GetWordCount.open(@process.pid) 34 | 35 | process.inputter.should == MapRedus::WordStream 36 | process.mapper.should == MapRedus::WordCounter 37 | process.reducer.should == MapRedus::Adder 38 | process.finalizer.should == MapRedus::ToRedisHash 39 | process.outputter.should == MapRedus::RedisHasher 40 | 41 | process = GetCharCount.create 42 | process.inputter.should == CharStream 43 | process.mapper.should == CharCounter 44 | process.reducer.should == 
MapRedus::Adder 45 | process.finalizer.should == MapRedus::ToRedisHash 46 | process.outputter.should == MapRedus::RedisHasher 47 | end 48 | 49 | it "runs a map reduce process synchronously" do 50 | ## 51 | ## In general map reduce shouldn't be running operations synchronously 52 | ## 53 | @process.class.should == GetWordCount 54 | @process.run("wordstream:test", synchronously = true) 55 | @process.map_keys.size.should == GetWordCount::EXPECTED_ANSWER.size 56 | 57 | @process.map_keys.each do |key| 58 | reduce_values = @process.reduce_values(key) 59 | reduce_values.size.should == 1 60 | end 61 | 62 | @process.each_key_reduced_value do |key, value| 63 | @process.outputter.decode(@process.result_key, key).to_i.should == GetWordCount::EXPECTED_ANSWER[key] 64 | end 65 | end 66 | 67 | it "runs a map reduce process asynchronously" do 68 | @process.run("wordstream:test", synchronously = false) 69 | work_off 70 | 71 | @process.map_keys.size.should == GetWordCount::EXPECTED_ANSWER.size 72 | @process.map_keys.each do |key| 73 | reduce_values = @process.reduce_values(key) 74 | reduce_values.size.should == 1 75 | end 76 | 77 | @process.each_key_reduced_value do |key, value| 78 | @process.outputter.decode(@process.result_key, key).to_i.should == GetWordCount::EXPECTED_ANSWER[key] 79 | end 80 | end 81 | 82 | it "runs the default process" do 83 | process = MapRedus::Process.create 84 | process.update(:key_args => [process.pid]) 85 | process.result_key.should == "mapredus:process:#{process.pid}:result" 86 | process.run("wordstream:test") 87 | work_off 88 | 89 | process.map_keys.size.should == GetWordCount::EXPECTED_ANSWER.size 90 | process.map_keys.each do |key| 91 | reduce_values = process.reduce_values(key) 92 | reduce_values.size.should == 1 93 | end 94 | 95 | process.each_key_reduced_value do |key, value| 96 | process.outputter.decode(process.result_key, key).to_i.should == GetWordCount::EXPECTED_ANSWER[key] 97 | end 98 | end 99 | 100 | it "runs a process without result_key 
being set (using the default key location)" do 101 | process = CharCountTest.create 102 | process.update(:key_args => [process.pid]) 103 | process.result_key.should == "mapredus:process:#{process.pid}:result" 104 | process.run("wordstream:test") 105 | work_off 106 | 107 | process.map_keys.size.should == GetCharCount::EXPECTED_ANSWER.size 108 | process.map_keys.each do |key| 109 | reduce_values = process.reduce_values(key) 110 | reduce_values.size.should == 1 111 | end 112 | 113 | process.each_key_reduced_value do |key, value| 114 | process.outputter.decode(process.result_key, key).to_i.should == GetCharCount::EXPECTED_ANSWER[key] 115 | end 116 | end 117 | 118 | it "runs a process where key arguments exist and extra arguments are used" do 119 | process = TestResultKeyArguments.create 120 | process.result_key("extra_arg").should == "test:key_argument:test:extra_arg" 121 | process.run("wordstream:test") 122 | work_off 123 | 124 | process.map_keys.size.should == GetWordCount::EXPECTED_ANSWER.size 125 | process.map_keys.each do |key| 126 | reduce_values = process.reduce_values(key) 127 | reduce_values.size.should == 1 128 | end 129 | 130 | process.each_key_reduced_value do |key, value| 131 | process.outputter.decode(process.result_key("extra_arg"), key).to_i.should == GetWordCount::EXPECTED_ANSWER[key] 132 | end 133 | end 134 | end 135 | 136 | describe "MapRedus Process" do 137 | before(:each) do 138 | MapRedus::FileSystem.flushall 139 | @process = GetWordCount.create 140 | end 141 | 142 | it "saves a process" do 143 | @process.mapper = CharCounter 144 | @process.synchronous = true 145 | @process.save 146 | 147 | @process = MapRedus::Process.open(@process.pid) 148 | 149 | @process.mapper.should == CharCounter 150 | @process.synchronous.should == true 151 | end 152 | 153 | it "updates a process" do 154 | @process.update(:mapper => CharCounter, :ordered => true) 155 | @process = MapRedus::Process.open(@process.pid) 156 | 157 | @process.mapper.should == CharCounter 158 | 
@process.ordered.should == true 159 | end 160 | 161 | it "deletes a process" do 162 | @process.delete 163 | 164 | proc = MapRedus::Process.open(@process.pid) 165 | proc.should == nil 166 | end 167 | 168 | it "kills a process" do 169 | @process.run(GetWordCount::TEST) 170 | MapRedus::Process.kill(@process.pid) 171 | Resque.size(:mapredus).should == 0 172 | end 173 | 174 | it "kills a process that is started" do 175 | @process.run(GetWordCount::TEST) 176 | 177 | worker = Resque::Worker.new("*") 178 | worker.perform(worker.reserve) # do some work 179 | 180 | MapRedus::Process.kill(@process.pid) 181 | Resque.size(:mapredus).should == 0 182 | end 183 | 184 | it "kills all process" do 185 | proc_1 = GetWordCount.create 186 | proc_2 = GetWordCount.create 187 | proc_3 = GetWordCount.create 188 | proc_4 = GetWordCount.create 189 | proc_5 = GetWordCount.create 190 | proc_6 = GetWordCount.create 191 | 192 | proc_1.run(GetWordCount::TEST) 193 | proc_2.run(GetWordCount::TEST) 194 | proc_3.run(GetWordCount::TEST) 195 | 196 | worker = Resque::Worker.new("*") 197 | 6.times do 198 | worker.perform(worker.reserve) 199 | end 200 | 201 | proc_4.run(GetWordCount::TEST) 202 | proc_5.run(GetWordCount::TEST) 203 | proc_6.run(GetWordCount::TEST) 204 | 205 | 6.times do 206 | worker.perform(worker.reserve) 207 | end 208 | 209 | MapRedus::Process.kill_all 210 | Resque.peek(:mapredus, 0, 100) == [] 211 | end 212 | 213 | it "responds to next state correctly" do 214 | @process.state.should == MapRedus::NOT_STARTED 215 | @process.next_state 216 | @process.state.should == MapRedus::INPUT_MAP_IN_PROGRESS 217 | work_off 218 | 219 | ## 220 | ## Since there are no map keys produced in this the next state 221 | ## should go directly to the finalizer 222 | ## 223 | 224 | @process.next_state 225 | @process.state.should == MapRedus::FINALIZER_IN_PROGRESS 226 | work_off 227 | 228 | @process.next_state 229 | @process.state.should == MapRedus::COMPLETE 230 | end 231 | 232 | it "responds to next state 
correcty when keys are produced" do 233 | @process.state.should == MapRedus::NOT_STARTED 234 | @process.next_state 235 | @process.state.should == MapRedus::INPUT_MAP_IN_PROGRESS 236 | work_off 237 | 238 | @process.emit_intermediate("hell", "yeah") 239 | 240 | @process.next_state 241 | @process.state.should == MapRedus::REDUCE_IN_PROGRESS 242 | work_off 243 | 244 | @process.next_state 245 | @process.state.should == MapRedus::FINALIZER_IN_PROGRESS 246 | work_off 247 | 248 | @process.next_state 249 | @process.state.should == MapRedus::COMPLETE 250 | end 251 | 252 | it "emit_intermediate unordered successfully" do 253 | @process.emit_intermediate("hell", "yeah") 254 | result = [] 255 | @process.each_key_nonreduced_value do |key, value| 256 | result << [key, value] 257 | end 258 | 259 | result.should == [["hell", "yeah"]] 260 | end 261 | 262 | it "emit_intermediate on an ordered process" do 263 | @process.update(:ordered => true) 264 | @process.emit_intermediate(2, "place", "two") 265 | @process.emit_intermediate(1, "number", "one") 266 | res = [] 267 | @process.each_key_nonreduced_value do |key, value| 268 | res << [key, value] 269 | end 270 | 271 | res.should == [["number", "one"], ["place", "two"]] 272 | end 273 | 274 | it "emit successfully" do 275 | @process.emit_intermediate("something", "non_reduced_value") 276 | @process.emit("something", "reduced") 277 | result = [] 278 | @process.each_key_reduced_value do |key, rv| 279 | result << [key, rv] 280 | end 281 | result.should == [["something", "reduced"]] 282 | end 283 | 284 | it "produces the correct map keys" do 285 | @process.emit_intermediate("map key 1", "value") 286 | @process.emit_intermediate("map key 1", "value") 287 | @process.emit_intermediate("map key 2", "value") 288 | 289 | @process.map_keys.sort.should == ["map key 1", "map key 2"] 290 | end 291 | 292 | it "produces the correct map/reduce values" do 293 | MapRedus::FileSystem.setnx("wordstream:test", GetWordCount::TEST) 294 | 
  @process.run("wordstream:test")
  work_off
  @process.map_keys.sort.should == GetWordCount::EXPECTED_ANSWER.keys.sort

  # every key's reduced count should equal the word count, and the raw
  # (non-reduced) map emissions for that key should be that many "1"s
  @process.each_key_reduced_value do |key, reduced_value|
    reduced_value.to_i.should == GetWordCount::EXPECTED_ANSWER[key]
    @process.map_values(key).should == ["1"] * reduced_value.to_i
  end
end
end

describe "MapRedus Master" do
  before(:each) do
    MapRedus::FileSystem.flushall
    MapRedus::FileSystem.setnx("test", "some data")
    @process = GetWordCount.create
  end

  it "handles slaves (enslaving) correctly" do
    MapRedus::Master.enslave(@process, MapRedus::WordCounter, @process.pid, "test")
    Resque.peek(:mapredus, 0, 1).should == {"args"=>[@process.pid, "test"], "class"=>"MapRedus::WordCounter"}
    MapRedus::Master.slaves(@process.pid).should == ["1"]
  end

  it "handles slaves (freeing) correctly" do
    MapRedus::Master.enslave(@process, MapRedus::WordCounter, @process.pid, "test")
    MapRedus::Master.enslave(@process, MapRedus::WordCounter, @process.pid, "test")

    MapRedus::Master.slaves(@process.pid).should == ["1", "1"]

    MapRedus::Master.free_slave(@process.pid)
    MapRedus::Master.free_slave(@process.pid)
    MapRedus::Master.slaves(@process.pid).should == []
  end

  # intentionally pending: no example body yet
  it "handles redundant multiple workers (same output regardless of how many workers complete)"
end

describe "MapRedus Mapper/Reducer/Finalizer" do
  before(:each) do
    MapRedus::FileSystem.flushall
    MapRedus::FileSystem.setnx("wordstream:test", "data")
    @process = GetWordCount.create
  end

  it "runs a map correctly proceeding to the next state" do
    @process.update(:state => MapRedus::INPUT_MAP_IN_PROGRESS)
    @process.state.should == MapRedus::INPUT_MAP_IN_PROGRESS
    @process.inputter.perform(@process.pid, "wordstream:test")
    Resque.peek(:mapredus, 0, 1).should == {"args"=>[@process.pid, 0], "class"=>"MapRedus::WordCounter"}
    Resque.pop(:mapredus)
    @process.mapper.perform(@process.pid, 0)
    @process.reload
    @process.state.should == MapRedus::REDUCE_IN_PROGRESS
    Resque.peek(:mapredus, 0, 1).should == {"args"=>[@process.pid, "data"], "class"=>"MapRedus::Adder"}

    MapRedus::Process.open(@process.pid).state.should == MapRedus::REDUCE_IN_PROGRESS
  end

  it "runs a reduce correctly proceeding to the correct next state" do
    @process.update(:state => MapRedus::REDUCE_IN_PROGRESS)
    @process.state.should == MapRedus::REDUCE_IN_PROGRESS
    @process.emit_intermediate("data", "1")
    @process.reducer.perform(@process.pid, "data")
    @process.reload
    @process.state.should == MapRedus::FINALIZER_IN_PROGRESS
    Resque.peek(:mapredus, 0, 1).should == {"args"=>[@process.pid], "class"=>"MapRedus::ToRedisHash"}

    MapRedus::Process.open(@process.pid).state.should == MapRedus::FINALIZER_IN_PROGRESS
  end

  it "should test that the finalizer correctly saves" do
    @process.update(:state => MapRedus::FINALIZER_IN_PROGRESS)
    @process.state.should == MapRedus::FINALIZER_IN_PROGRESS
    @process.emit_intermediate("data", "1")
    @process.emit("data", "1")
    @process.finalizer.perform(@process.pid)
    @process.reload
    @process.state.should == MapRedus::COMPLETE
    Resque.peek(:mapredus, 0, 100).should == []
    # NOTE(review): RedisHasher's decode appears to take (result_key, field),
    # unlike the base Outputter's single-argument decode — confirm.
    @process.outputter.decode("test:result", "data").should == "1"

    MapRedus::Process.open(@process.pid).state.should == MapRedus::COMPLETE
  end
end

describe "MapRedus Support" do
  before(:each) do
    MapRedus::FileSystem.flushall
    @doc = Document.new(10)
    @other_doc = Document.new(15)
  end

  it "should be simple to create a mapredus as a part of a job" do
    MapRedus::FileSystem.setnx("wordstream:test", GetWordCount::TEST)
  MapRedus::FileSystem.setnx("charstream:test", "simpler test")
  # expected character tally for the string "simpler test"
  other_answer = {" "=>1, "l"=>1, "m"=>1, "e"=>2, "p"=>1, "r"=>1, "s"=>2, "t"=>2, "i"=>1}

  @doc.calculate_chars("wordstream:test")
  @other_doc.calculate_chars("charstream:test")
  work_off

  GetCharCount::EXPECTED_ANSWER.keys.each do |char|
    @doc.mapreduce.char_count_result([@doc.id], char).should == GetCharCount::EXPECTED_ANSWER[char].to_s
  end

  other_answer.keys.each do |char|
    @other_doc.mapreduce.char_count_result([@other_doc.id], char).should == other_answer[char].to_s
  end
end
end

describe "MapRedus Default Classes" do
  before(:each) do
    MapRedus::FileSystem.flushall
    @process = GetWordCount.create
  end

  it "testing that the identity copy actually does a copy" do
    MapRedus::FileSystem.rpush(@process.map_key("test_key"), "whatever")
    MapRedus::FileSystem.rpush(@process.map_key("test_key"), "yeah")
    MapRedus::Identity.perform(@process.pid, "test_key")

    # identity reduce: reduce values mirror the map values unchanged
    @process.map_values("test_key").should == ["whatever", "yeah"]
    @process.reduce_values("test_key").should == ["whatever", "yeah"]
  end

  it "should properly do a count" do
    MapRedus::FileSystem.rpush(@process.map_key("test_key"), "whatever")
    MapRedus::FileSystem.rpush(@process.map_key("test_key"), "yeah")
    MapRedus::Counter.perform(@process.pid, "test_key")

    @process.reduce_values("test_key").should == ["2"]
  end
end
--------------------------------------------------------------------------------
/lib/mapredus/process.rb:
--------------------------------------------------------------------------------
module MapRedus

  # This is what keeps track of our map reduce processes
  #
  # We use a redis key to identify the id of map reduce process
  # the value of the redis object is a json object which contains:
  #
  # {
  #
#   inputter : inputstreamclass,
#   mapper : mapclass,
#   reducer : reduceclass,
#   finalizer : finalizerclass,
#   outputter : outputterclass,
#   partitioner : ,
#   combiner : ,
#   ordered : true_or_false ## ensures ordering keys from the map output --> [ order, key, value ],
#   synchronous : true_or_false ## runs the process synchronously or not (generally used for testing)
#   result_timeout : length of time a result is saved ## 3600 * 24
#   key_args : arguments to be added to the key location of the result save (cache location)
#   state : the current state of the process (shouldn't be set by the process and starts off as nil)
#   type : the original process class ( currently this is needed so we can have namespaces for the result_cache keys )
# }
#
# The user has the ability in subclassing this class to create extra features if needed
#
class Process
  # Public: Keep track of information that may show up as the redis json value
  # This is so we know exactly what might show up in the json hash
  READERS = [:pid]
  ATTRS = [:inputter, :mapper, :reducer, :finalizer, :outputter, :ordered, :synchronous, :result_timeout, :key_args, :state, :type]
  READERS.each { |r| attr_reader r }
  ATTRS.each { |a| attr_accessor a }

  # Default lifetime (seconds) for a saved result: one day.
  DEFAULT_TIME = 3600 * 24

  # pid       - process id this instance represents
  # json_info - decoded specification hash (see class comment above)
  def initialize(pid, json_info)
    @pid = pid
    read(json_info)
  end

  # Populate this instance's attributes from a decoded json hash.
  # Class-valued attributes arrive as class-name strings and are
  # resolved via Helper.class_get; absent values fall back to defaults
  # (DEFAULT_TIME, [], NOT_STARTED, MapRedus::Outputter, Process).
  def read(json_info)
    @inputter = Helper.class_get(json_helper(json_info, :inputter))
    @mapper = Helper.class_get(json_helper(json_info, :mapper))
    @reducer = Helper.class_get(json_helper(json_info, :reducer))
    @finalizer = Helper.class_get(json_helper(json_info, :finalizer))
    @ordered = json_helper(json_info, :ordered)
    @synchronous = json_helper(json_info, :synchronous)
    @result_timeout = json_helper(json_info, :result_timeout) || DEFAULT_TIME
    @key_args = json_helper(json_info, :key_args) || []
    @state = json_helper(json_info, :state) || NOT_STARTED
    @outputter = json_helper(json_info, :outputter)
    @outputter = @outputter ? Helper.class_get(@outputter) : MapRedus::Outputter
    @type = Helper.class_get(json_helper(json_info, :type) || Process)
  end

  # Fetch a value from the hash whether it was keyed by string or symbol
  # (a decoded-json hash has string keys; a freshly built spec has symbols).
  def json_helper(json_info, key)
    json_info[key.to_s] || json_info[key.to_sym]
  end

  def to_s; to_json; end

  # Hash of all attributes (ATTRS + READERS), used to serialize the process.
  def to_hash
    (ATTRS + READERS).inject({}) do |h, attr|
      h[attr] = send(attr)
      h
    end
  end

  def to_json
    Helper.encode(to_hash)
  end

  # Persist this process: register its pid in the process set and store
  # the encoded spec. Returns self.
  def save
    FileSystem.sadd( ProcessInfo.processes, @pid )
    FileSystem.save( ProcessInfo.pid(@pid), to_json )
    self
  end

  # Assign the given attribute => value pairs and persist. Returns self.
  def update(attrs = {})
    attrs.each do |attr, val|
      send("#{attr}=", val)
    end
    save
  end

  # Re-read this process's attributes from the FileSystem. Returns self.
  def reload
    read(Helper.decode(FileSystem.get(ProcessInfo.pid(@pid))))
    self
  end

  # This will not delete if the master is working
  # It can't get ahold of the files to shred while the master is working
  #
  # if safe is set to false, this will delete all the redis stores associated
  # with this process, but will not kill the process from the queue, if it is
  # on the queue. The process operations will fail to work when its data is deleted
  #
  # Examples
  #   delete(safe)
  #   # => true or false
  #
  # Returns true as long as the master is not working.
  def delete(safe = true)
    return false if (safe && Master.working?(@pid))
    FileSystem.keys("mapredus:process:#{@pid}*").each do |k|
      FileSystem.del(k)
    end
    FileSystem.srem(ProcessInfo.processes, @pid)
    # reset the pid counter once no processes remain
    FileSystem.set(ProcessInfo.processes_count, 0) if( 0 == FileSystem.scard(ProcessInfo.processes) )
    true
  end

  # Iterates through the key, values
  #
  # Example
  #   each_key_reduced_value(pid)
  #
  # Returns nothing.
def each_key_reduced_value
  map_keys.each do |map_key|
    reduce_values(map_key).each do |reduced|
      yield map_key, reduced
    end
  end
end

# Iterate over every (key, value) pair the map phase emitted, before
# any reduction has taken place.
#
# Example
#   each_key_nonreduced_value { |key, value| ... }
#
# Returns nothing.
def each_key_nonreduced_value
  map_keys.each do |map_key|
    map_values(map_key).each do |emitted|
      yield map_key, emitted
    end
  end
end

# Start the process on the given data object. When synchronous is
# true the whole pipeline runs inline (generally used for testing).
#
# Returns true.
def run( data_object, synchronous = false )
  update(:synchronous => synchronous)
  Master.mapreduce( self, data_object )
  true
end

# TODO:
# Should also have some notion of whether the process is completed or not
# since the master might not be working, but the process is not yet complete
# so it is still running
def running?
  Master.working?(@pid)
end

# Advance the process to the next state in STATE_MACHINE, provided it
# is neither currently running nor synchronous. If the Master knows
# how to enslave workers for the new state (enslave_<state>), do so.
#
# Examples
#   process.next_state
#
# Returns the state switched to, or nil when no transition happened.
def next_state
  return if running? || @synchronous
  upcoming = STATE_MACHINE[self.state]
  update(:state => upcoming)
  enslaver = "enslave_#{upcoming}".to_sym
  Master.send(enslaver, self) if Master.respond_to?(enslaver)
  upcoming
end

### The following functions deal with keys/values produced during the
### running of a process

# Emissions, when we get map/reduce results back we emit these
# to be stored in our file system (redis)
#
# key_value - The key, value
#
# Examples
#   emit_intermediate(key, value)
#   # => if an ordering is required
#   emit_intermediate(rank, key, value)
#
# Returns true on success.
def emit_intermediate(*key_value)
  if @ordered
    # an ordered process's map emits [rank, key, value]; the rank is
    # used as the score of a sorted set so keys come back in order
    rank, key, value = key_value
  else
    key, value = key_value
  end
  hashed_key = Helper.key_hash(key)

  # FIX: detect a hash collision *before* writing anything. Previously
  # the colliding value was already rpush'ed onto the other key's list
  # (and the key added to the key set) before the raise, corrupting
  # that key's data. key_collision? also registers hashed_key -> key
  # when unseen, which is safe to do up front.
  raise "Key Collision: key:#{key}, #{key.class} => hashed key:#{hashed_key}" if key_collision?(hashed_key, key)

  if @ordered
    FileSystem.zadd( ProcessInfo.keys(@pid), rank, key )
  else
    FileSystem.sadd( ProcessInfo.keys(@pid), key )
  end
  FileSystem.rpush( ProcessInfo.map(@pid, hashed_key), value )
  true
end

# The emission associated with a reduce. Currently all reduced
# values are pushed onto a redis list. It may be the case that we
# want to directly use a different redis type given the kind of
# reduce we are doing. Often a reduce only returns one value, so
# instead of a rpush, we should do a set.
#
# Examples
#   emit(key, reduced_value)
#
# Returns the result of the underlying rpush.
def emit(key, reduce_val)
  hashed_key = Helper.key_hash(key)
  FileSystem.rpush( ProcessInfo.reduce(@pid, hashed_key), reduce_val )
end

# True when hashed_key is already registered for a *different* key.
# Side effect: registers hashed_key -> key when it was unseen (setnx
# only writes when the mapping is absent).
def key_collision?(hashed_key, key)
  not ( FileSystem.setnx( ProcessInfo.hash_to_key(@pid, hashed_key), key ) ||
        FileSystem.get( ProcessInfo.hash_to_key(@pid, hashed_key) ) == key.to_s )
end

# Convenience methods to get the mapredus internal key string for a key
#
# Examples
#   reduce_key("document")
#   # => mapredus:process:PID:map_key:<hash>:reduce
#   map_key("document")
#   # => mapredus:process:PID:map_key:<hash>
#
# Returns the internal mapreduce string key for a given key.
[:map, :reduce].each do |kind|
  define_method("#{kind}_key") do |key|
    ProcessInfo.send(kind, @pid, Helper.key_hash(key))
  end
end

# Keys that the map operation produced
#
# Examples
#   map_keys
#
# Returns the keys (rank order for an ordered process).
def map_keys
  if @ordered
    FileSystem.zrange( ProcessInfo.keys(@pid), 0, -1 )
  else
    FileSystem.smembers( ProcessInfo.keys(@pid) )
  end
end

# Number of distinct keys the map operation produced.
def num_keys
  if @ordered
    FileSystem.zcard( ProcessInfo.keys(@pid) )
  else
    FileSystem.scard( ProcessInfo.keys(@pid) )
  end
end

# values that the map operation produced, for a key
#
# Examples
#   map_values(key)
#
# Returns the values.
def map_values(key)
  FileSystem.lrange( ProcessInfo.map(@pid, Helper.key_hash(key)), 0, -1 )
end

# Number of map values stored for the given key.
def num_values(key)
  FileSystem.llen( ProcessInfo.map(@pid, Helper.key_hash(key)) )
end

# values that the reduce operation produced, for a key
#
# Examples
#   reduce_values(key)
#
# Returns the values.
def reduce_values(key)
  FileSystem.lrange( ProcessInfo.reduce(@pid, Helper.key_hash(key)), 0, -1 )
end

# functions to manage the location of the result in the FileSystem
#
# Examples
#   process.result_key(extra, arguments)
#   Process.result_key(all, needed, arguments)
#   # => "something:that:uses:the:extra:arguments"
#
#   SomeProcessSubclass.set_result_key("something:ARG:something:VAR")
#   # sets the result key for (CAPITAL require arguments to fill in the values)
def result_key(*args)
  Helper.class_get(@type).result_key(*[@key_args, args].flatten)
end

def self.result_key(*args)
  # prefer this class's own result-cache key maker; fall back to the
  # base Process one when no subclass-specific maker is defined
  maker = "#{self.to_s.gsub(/\W/, "_")}_result_cache"
  unless ProcessInfo.respond_to?(maker)
    maker = "#{MapRedus::Process.to_s.gsub(/\W/, "_")}_result_cache"
  end
  ProcessInfo.send( maker, *args )
end

def self.set_result_key(key_struct)
  MapRedus.redefine_redis_key( "#{self.to_s.gsub(/\W/, "_")}_result_cache", key_struct )
end

# Create sets up a process to be run with the given specification.
# It saves the information in the FileSystem and returns an
# instance of the process that run should be called on when
# running is desired.
#
# Example
#   process = MapRedus::Process.create
#   process.run
#
# Returns an instance of the process
def self.create
  new_pid = get_available_pid
  # snapshot the class-level attribute DSL values into a spec hash
  specification = ATTRS.inject({}) do |ret, attr|
    ret[attr] = send(attr)
    ret
  end
  specification[:type] = self
  self.new(new_pid, specification).save
end

# This defines the attributes to be associated with a MapRedus process
# This will allow us to subclass a Process, creating a new specification
# by specifying what say the inputter should equal
#
# Example
#   class AnswerDistribution < MapRedus::Process
#     inputter JudgmentStream
#     mapper ResponseFrequencyMap
#     reducer Adder
#     finalizer AnswerCount
#     outputter MapRedus::RedisHasher
#   end
class << self; attr_reader *ATTRS; end

# Setter/Getter method definitions to set/get the attribute for
# the class. In the getter if it is not defined (nil) then return
# the default attribute defined in MapRedus::Process.
#
# Example
#   class AnswerDistribution < MapRedus::Process
#     inputter JudgmentStream
#     mapper ResponseFrequency
#   end
#   AnswerDistribution.reducer.should == Adder
ATTRS.each do |attr|
  (class << self; self; end).send(:define_method, attr) do |*one_arg|
    attribute = "@#{attr}"
    case one_arg.size
    when 0
      # getter: fall back to MapRedus::Process's default when this
      # subclass never set the attribute
      instance_variable_get(attribute) || MapRedus::Process.instance_variable_get(attribute)
    when 1
      # setter: invoked by the DSL form, e.g. `mapper WordCounter`
      instance_variable_set(attribute, one_arg.first)
    else
      raise ArgumentError.new("wrong number of arguments (#{one_arg.size}) when zero or one arguments were expected")
    end
  end
end

# Default attributes for the process class. All other attributes
# are nil by default.
inputter WordStream
mapper WordCounter
reducer Adder
finalizer ToRedisHash
outputter RedisHasher
type Process
set_result_key DEFAULT_RESULT_KEY

# This function returns all the redis keys produced associated
# with a process's process id.
#
# Example
#   Process.info(17)
#
# Returns an array of keys associated with the process id.
def self.info(pid)
  FileSystem.keys(ProcessInfo.pid(pid) + "*")
end

# Returns an instance of the process class given the process id.
# If no such process id exists returns nil.
#
# Example
#   process = Process.open(17)
def self.open(pid)
  spec = Helper.decode( FileSystem.get(ProcessInfo.pid(pid)) )
  spec && self.new( pid, spec )
end

# Find out what map reduce processes are out there
#
# Examples
#   Process.ps
#
# Returns a list of the map reduce process ids
def self.ps
  FileSystem.smembers(ProcessInfo.processes)
end

# Allocate a fresh process id. The counter is advanced by a random
# amount (1..20) rather than by 1.
#
# Examples
#   Process.get_available_pid
#
# Returns an available pid.
def self.get_available_pid
  FileSystem.incrby(ProcessInfo.processes_count, 1 + rand(20))
end

# Given a arguments for a result key, delete the result from the
# filesystem.
#
# Examples
#   Process.delete_saved_result(key)
def self.delete_saved_result(*key_args)
  FileSystem.del( result_key(*key_args) )
end

# Remove redis keys associated with this process if the Master isn't working.
#
# potentially is very expensive.
#
# Example
#   Process::kill(pid)
#
# Returns the number of jobs Master.emancipate removed from the queue.
441 | def self.kill(pid) 442 | num_killed = Master.emancipate(pid) 443 | proc = Process.open(pid) 444 | proc.delete if proc 445 | num_killed 446 | end 447 | 448 | def self.kill_all 449 | ps.each do |pid| 450 | kill(pid) 451 | end 452 | FileSystem.del(ProcessInfo.processes) 453 | FileSystem.del(ProcessInfo.processes_count) 454 | end 455 | end 456 | end 457 | --------------------------------------------------------------------------------