├── VERSION ├── .document ├── Gemfile ├── .gitignore ├── lib ├── mapredus │ ├── outputter.rb │ ├── mapper.rb │ ├── finalizer.rb │ ├── inputter.rb │ ├── reducer.rb │ ├── support.rb │ ├── filesystem.rb │ ├── keys.rb │ ├── default_classes.rb │ ├── master.rb │ └── process.rb └── mapredus.rb ├── LICENSE ├── spec ├── helper.rb ├── helper_classes.rb ├── redis-test.conf └── mapredus_spec.rb ├── Rakefile ├── mapredus.gemspec └── README.md /VERSION: -------------------------------------------------------------------------------- 1 | 0.0.6 2 | -------------------------------------------------------------------------------- /.document: -------------------------------------------------------------------------------- 1 | README.rdoc 2 | lib/**/*.rb 3 | bin/* 4 | features/**/*.feature 5 | LICENSE 6 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "http://rubygems.org" 2 | gem "jeweler" 3 | gem "rake" 4 | gem "rspec" 5 | gem "redis" 6 | gem "resque" 7 | gem "resque-scheduler" 8 | gem "redis_support", "0.0.12" 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## MAC OS 2 | .DS_Store 3 | 4 | ## TEXTMATE 5 | *.tmproj 6 | tmtags 7 | 8 | ## EMACS 9 | *~ 10 | \#* 11 | .\#* 12 | 13 | ## VIM 14 | *.swp 15 | 16 | ## PROJECT::GENERAL 17 | coverage 18 | rdoc 19 | pkg 20 | log 21 | 22 | ## PROJECT::SPECIFIC 23 | .bundle 24 | Gemfile.lock 25 | vendor -------------------------------------------------------------------------------- /lib/mapredus/outputter.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | # 3 | # Standard readers for the input and output of Files coming out 4 | # of the FileSystem. 
5 | # 6 | class Outputter < QueueProcess 7 | def self.decode(result_key) 8 | FileSystem.get(result_key) 9 | end 10 | 11 | def self.encode(result_key, o) 12 | FileSystem.set(result_key, o) 13 | end 14 | 15 | # 16 | # type should either be "decode" or "encode" 17 | # 18 | def self.perform(type, o) 19 | send(type, o) 20 | end 21 | end 22 | 23 | class JsonOutputter < Outputter 24 | def self.decode(result_key) 25 | Helper.decode(FileSystem.get(result_key)) 26 | end 27 | 28 | def self.encode(result_key, o) 29 | FileSystem.set(result_key, Helper.encode(o)) 30 | end 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /lib/mapredus/mapper.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | # Map is a function that takes a data chunk 3 | # where each data chunk is a list of pieces of your raw data 4 | # and emits a list of key, value pairs. 5 | # 6 | # The output of the map shall always be 7 | # [ [key, value], [key, value], ... ] 8 | # 9 | # Note: Values must be string, integers, booleans, or floats. 10 | # i.e., They must be primitive types since these are the only 11 | # types that redis supports and since anything inputted into 12 | # redis becomes a string. 13 | class Mapper < QueueProcess 14 | def self.map(data_chunk); raise InvalidMapper; end 15 | 16 | def self.perform(pid, data_key) 17 | process = Process.open(pid) 18 | data_chunk = FileSystem.hget(ProcessInfo.input(pid), data_key) 19 | map( data_chunk ) do |*key_value| 20 | process.emit_intermediate(*key_value) 21 | end 22 | ensure 23 | Master.free_slave(pid) 24 | process.next_state 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /lib/mapredus/finalizer.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | # Run the stuff you want to run at the end of the process. 
3 | # Define subclass which defines self.finalize and self.serialize 4 | # to do what is needed when you want to get the final output 5 | # out of redis and into ruby. 6 | # 7 | # This is basically the message back to the user program that a 8 | # process is completed storing the necessary info. 9 | # 10 | class Finalizer < QueueProcess 11 | 12 | # The default finalizer is to notify of process completion 13 | # 14 | # Example 15 | # Finalizer::finalize(pid) 16 | # # => "MapRedus Process : 111 : has completed" 17 | # 18 | # Returns a message notification 19 | def self.finalize(pid) 20 | "MapRedus Process : #{pid} : has completed" 21 | end 22 | 23 | def self.perform(pid) 24 | process = Process.open(pid) 25 | result = finalize(process) 26 | Master.finish_metrics(pid) 27 | result 28 | ensure 29 | Master.free_slave(pid) 30 | process.next_state 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/mapredus/inputter.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | class InputStream < QueueProcess 3 | # 4 | # An InputSteam needs to implement a way to scan through the 5 | # data_object (the object data that is sent to the MapRedus 6 | # process). The scan function implements how the data object is 7 | # broken sizable pieces for the mappers to operate on. 8 | # 9 | # It does this by yielding a pair. The key 10 | # specifies the location storage in redis. map_data is string 11 | # data that will be written to the redis. 12 | # 13 | # Example 14 | # scan(data_object) do |key, map_data| 15 | # ... 
16 | # end 17 | def self.scan(*data_object) 18 | raise InvalidInputStream 19 | end 20 | 21 | def self.perform(pid, data_object) 22 | process = Process.open(pid) 23 | scan(*data_object) do |key, map_data| 24 | FileSystem.hset(ProcessInfo.input(pid), key, map_data) 25 | Master.enslave_map(process, key) 26 | end 27 | ensure 28 | Master.free_slave(pid) 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010 Dolores Labs 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /spec/helper.rb: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'spec' 3 | 4 | dir = File.dirname(__FILE__) 5 | $LOAD_PATH.unshift(File.join(dir, '..', 'lib')) 6 | $LOAD_PATH.unshift(dir) 7 | require 'mapredus' 8 | 9 | # 10 | # make sure we can run redis 11 | # 12 | if !system("which redis-server") 13 | puts '', "** can't find `redis-server` in your path" 14 | abort '' 15 | end 16 | 17 | # 18 | # start our own redis when the tests start, 19 | # kill it when they end (redis is run as a daemon) 20 | # 21 | puts "Starting redis for testing at localhost:9736..." 22 | `redis-server #{dir}/redis-test.conf` 23 | 24 | at_exit do 25 | # 26 | # hope that no other processes have redis-test in the name... 27 | # TODO: fixme 28 | # 29 | pid = `ps -A -o pid,command | grep [r]edis-test`.split(" ")[0] 30 | puts "Killing test redis server..." 31 | `rm -f #{dir}/dump.rdb` 32 | Process.kill("KILL", pid.to_i) 33 | end 34 | 35 | # 36 | # Set the redis server 37 | # 38 | RedisSupport.redis = 'localhost:9736:0' 39 | MapRedus.redis = 'localhost:9736:0' 40 | Resque.redis = MapRedus.redis 41 | require 'resque/failure/redis' 42 | Resque::Failure.backend = Resque::Failure::Redis 43 | 44 | require 'helper_classes' 45 | 46 | def work_off 47 | Resque::Worker.new("*").work(0) 48 | end 49 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'bundler' 3 | Bundler.setup 4 | 5 | require 'rake' 6 | require 'spec/rake/spectask' 7 | 8 | begin 9 | require 'jeweler' 10 | Jeweler::Tasks.new do |gem| 11 | gem.name = "mapredus" 12 | gem.summary = %Q{mapredus initial} 13 | gem.description = %Q{simple mapreduce framework using redis and resque} 14 | gem.email = "john@doloreslabs.com" 15 | gem.homepage = 
"http://github.com/dolores/mapredus" 16 | gem.authors = ["John Le", "Brian O'Rourke"] 17 | gem.files = Dir['lib/**/*.rb'] 18 | gem.add_dependency "redis", ">= 1.0.4" 19 | gem.add_dependency "resque", ">= 1.8" 20 | gem.add_dependency "resque-scheduler" 21 | gem.add_dependency "redis_support", ">= 0" 22 | end 23 | Jeweler::GemcutterTasks.new 24 | rescue LoadError 25 | puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler" 26 | end 27 | 28 | require 'rake/rdoctask' 29 | Rake::RDocTask.new do |rdoc| 30 | version = File.exist?('VERSION') ? File.read('VERSION') : "" 31 | 32 | rdoc.rdoc_dir = 'rdoc' 33 | rdoc.title = "mapredus #{version}" 34 | rdoc.rdoc_files.include('README*') 35 | rdoc.rdoc_files.include('lib/**/*.rb') 36 | end 37 | 38 | Spec::Rake::SpecTask.new(:spec) do |t| 39 | t.spec_files = FileList['spec/helper.rb', 'spec/helper_classes.rb', 'spec/mapredus_spec.rb'] 40 | t.spec_opts = ["--color", "--format", "specdoc", 41 | "-f", "o:log/spec_profile.txt", 42 | "-f", "e:log/spec_failing.txt"] 43 | end 44 | 45 | task :spec => :check_dependencies 46 | 47 | task :default => :spec 48 | -------------------------------------------------------------------------------- /lib/mapredus/reducer.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | # Reduce is a function that takes in "all" the values for a single given key 3 | # and outputs a list of values or a single value that usually "reduces" 4 | # the initial given value set. 5 | # 6 | # The output of the reduce shall always be 7 | # reduce(values) = [ reduced value, reduced value, ... ] 8 | # and it will often only be a single element array 9 | # 10 | # The input values and the output values of the reduce will always 11 | # be a string. As described in the paper, it is up to the client 12 | # to define how to deal with this restriction. 
13 | # 14 | class Reducer < QueueProcess 15 | # 16 | # After a recoverable fail this describes how much time we shall wait before 17 | # readding the reducer back on to the queue. 18 | # 19 | DEFAULT_WAIT = 10 # seconds 20 | def self.wait; DEFAULT_WAIT; end 21 | 22 | def self.reduce(values); raise InvalidReducer; end 23 | 24 | # 25 | # The overridable portion of a reducer perform. In some default 26 | # classes like Identity and Counter we do not call self.reduce but 27 | # provide optimization for the reduction by overriding this 28 | # method. 29 | # 30 | def self.reduce_perform(process, key) 31 | reduce(process.map_values(key)) do |reduce_val| 32 | process.emit( key, reduce_val ) 33 | end 34 | end 35 | 36 | # Doesn't handle redundant workers and fault tolerance 37 | # 38 | # TODO: Resque::AutoRetry might mess this up. 39 | def self.perform(pid, key) 40 | process = Process.open(pid) 41 | reduce_perform(process, key) 42 | rescue MapRedus::RecoverableFail 43 | Master.enslave_later_reduce(process, key) 44 | ensure 45 | Master.free_slave(pid) 46 | process.next_state 47 | end 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /lib/mapredus/support.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | module Support 3 | class MapRedusRunnerError < StandardError; end 4 | class DuplicateProcessDefinitionError < MapRedusRunnerError ; end 5 | 6 | class Runner 7 | def initialize(class_name) 8 | @class = class_name 9 | end 10 | 11 | def method_missing(method, *args, &block) 12 | mr_process = "#{@class}_#{method.to_s}" 13 | if self.respond_to?(mr_process) 14 | self.send(mr_process, *args, &block) 15 | else 16 | super(method, *args, &block) 17 | end 18 | end 19 | end 20 | 21 | def mapreduce 22 | @mapreduce_runner ||= Runner.new(self.class.to_s.gsub(/\W/,"_")) 23 | end 24 | 25 | module ClassMethods 26 | def mapreduce_process( process_name, mapredus_process_class, 
result_store ) 27 | runner_self = Runner 28 | class_name = self.to_s.gsub(/\W/,"_") 29 | 30 | global_process_name = "#{class_name}_#{process_name.to_s}" 31 | 32 | if runner_self.methods.include?(global_process_name) 33 | raise DuplicateProcessDefintionError 34 | end 35 | 36 | mapredus_process_class.set_result_key( result_store ) 37 | 38 | runner_self.send( :define_method, global_process_name ) do |data, key_arguments| 39 | process = mapredus_process_class.create 40 | process.update(:key_args => key_arguments) 41 | process.run(data) 42 | process 43 | end 44 | 45 | runner_self.send( :define_method, "#{global_process_name}_result" ) do |key_arguments, *outputter_args| 46 | key = mapredus_process_class.result_key( *key_arguments ) 47 | mapredus_process_class.outputter.decode( key, *outputter_args) 48 | end 49 | end 50 | end 51 | 52 | def self.included(model) 53 | model.extend ClassMethods 54 | end 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /lib/mapredus/filesystem.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | # Manages the book keeping of redis keys and redis usage 3 | # provides the data storage for process information through redis 4 | # All interaction with redis should go through this class 5 | # 6 | class FileSystem 7 | def self.storage 8 | MapRedus.redis 9 | end 10 | 11 | # Save/Read functions to save/read values for a redis key 12 | # 13 | # Examples 14 | # FileSystem.save( key, value ) 15 | def self.save(key, value, time = nil) 16 | storage.set(key, value) 17 | storage.expire(key, time) if time 18 | end 19 | 20 | def self.method_missing(method, *args, &block) 21 | if storage.respond_to?(method) 22 | storage.send(method, *args) 23 | else 24 | super 25 | end 26 | end 27 | 28 | # Copy the values from one key to a second key 29 | # 30 | # NOTE TODO: currently only works for the redis list data 31 | # structure but will be extended for 
arbitrary data types. 32 | # 33 | # NOTE: this does not account for the key being changed during the 34 | # copy, so should not be used in situations where the first_key 35 | # value can change during the running of copy. 36 | # 37 | # Examples 38 | # FileSystem.copy("key_one", "key_two") 39 | # 40 | # returns true on success false otherwise 41 | def self.copy(first_key, second_key) 42 | list_length = storage.llen(first_key) 43 | list_length.times do |index| 44 | storage.rpush(second_key, storage.lindex(first_key, index)) 45 | end 46 | true 47 | end 48 | 49 | # Setup locks on results using RedisSupport lock functionality 50 | # 51 | # Examples 52 | # FileSystem::has_lock?(key) 53 | # # => true or false 54 | # 55 | # Returns true if there's a lock 56 | def self.has_lock?(key) 57 | MapRedus.has_redis_lock?( RedisKey.result_cache(key) ) 58 | end 59 | 60 | def self.acquire_lock(key) 61 | MapRedus.acquire_redis_lock_nonblock( RedisKey.result_cache(key), 60 * 60 ) 62 | end 63 | 64 | def self.release_lock(key) 65 | MapRedus.release_redis_lock( RedisKey.result_cache(key) ) 66 | end 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /mapredus.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command 4 | # -*- encoding: utf-8 -*- 5 | 6 | Gem::Specification.new do |s| 7 | s.name = %q{mapredus} 8 | s.version = "0.0.6" 9 | 10 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? 
:required_rubygems_version= 11 | s.authors = ["John Le", "Brian O'Rourke"] 12 | s.date = %q{2010-08-05} 13 | s.description = %q{simple mapreduce framework using redis and resque} 14 | s.email = %q{john@doloreslabs.com} 15 | s.extra_rdoc_files = [ 16 | "LICENSE", 17 | "README.md" 18 | ] 19 | s.files = [ 20 | "lib/mapredus.rb", 21 | "lib/mapredus/default_classes.rb", 22 | "lib/mapredus/filesystem.rb", 23 | "lib/mapredus/finalizer.rb", 24 | "lib/mapredus/inputter.rb", 25 | "lib/mapredus/keys.rb", 26 | "lib/mapredus/mapper.rb", 27 | "lib/mapredus/master.rb", 28 | "lib/mapredus/outputter.rb", 29 | "lib/mapredus/process.rb", 30 | "lib/mapredus/reducer.rb", 31 | "lib/mapredus/support.rb" 32 | ] 33 | s.homepage = %q{http://github.com/dolores/mapredus} 34 | s.rdoc_options = ["--charset=UTF-8"] 35 | s.require_paths = ["lib"] 36 | s.rubygems_version = %q{1.3.7} 37 | s.summary = %q{mapredus initial} 38 | s.test_files = [ 39 | "spec/helper_classes.rb", 40 | "spec/mapredus_spec.rb", 41 | "spec/helper.rb" 42 | ] 43 | 44 | if s.respond_to? 
:specification_version then 45 | current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION 46 | s.specification_version = 3 47 | 48 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then 49 | s.add_runtime_dependency(%q, [">= 1.0.4"]) 50 | s.add_runtime_dependency(%q, [">= 1.8"]) 51 | s.add_runtime_dependency(%q, [">= 0"]) 52 | s.add_runtime_dependency(%q, [">= 0"]) 53 | else 54 | s.add_dependency(%q, [">= 1.0.4"]) 55 | s.add_dependency(%q, [">= 1.8"]) 56 | s.add_dependency(%q, [">= 0"]) 57 | s.add_dependency(%q, [">= 0"]) 58 | end 59 | else 60 | s.add_dependency(%q, [">= 1.0.4"]) 61 | s.add_dependency(%q, [">= 1.8"]) 62 | s.add_dependency(%q, [">= 0"]) 63 | s.add_dependency(%q, [">= 0"]) 64 | end 65 | end 66 | 67 | -------------------------------------------------------------------------------- /spec/helper_classes.rb: -------------------------------------------------------------------------------- 1 | class CharStream < MapRedus::InputStream 2 | def self.scan(data_object) 3 | test_string = MapRedus::FileSystem.get(data_object) 4 | 5 | 0.step(test_string.size, 30) do |index| 6 | char_set = test_string[index...(index+30)] 7 | next if char_set.empty? 
8 | yield(index, char_set) 9 | end 10 | end 11 | end 12 | 13 | class CharCounter < MapRedus::Mapper 14 | def self.map(map_data) 15 | map_data.each_char do |char| 16 | yield(char, 1) 17 | end 18 | end 19 | end 20 | 21 | class ExtraResultKeyHash < MapRedus::Finalizer 22 | def self.finalize(process) 23 | process.each_key_reduced_value do |key, value| 24 | process.outputter.encode(process.result_key("extra"), key, value) 25 | end 26 | end 27 | end 28 | 29 | class GetCharCount < MapRedus::Process 30 | EXPECTED_ANSWER = {"k"=>2, "v"=>1, " "=>54, ","=>3, "w"=>7, "a"=>17, "l"=>12, "b"=>2, "m"=>4, "c"=>3, "."=>2, "y"=>3, "n"=>18, "D"=>1, "d"=>15, "o"=>13, "p"=>14, "e"=>34, "f"=>6, "r"=>13, "g"=>6, "S"=>1, "s"=>12, "h"=>19, "H"=>1, "t"=>20, "i"=>16, "u"=>5, "j"=>1} 31 | inputter CharStream 32 | mapper CharCounter 33 | end 34 | 35 | class CharCountTest < MapRedus::Process 36 | inputter CharStream 37 | mapper CharCounter 38 | end 39 | 40 | class GetWordCount < MapRedus::Process 41 | TEST = "He pointed his finger in friendly jest and went over to the parapet laughing to himself. Stephen Dedalus stepped up, followed him wearily halfway and sat down on the edge of the gunrest, watching him still as he propped his mirror on the parapet, dipped the brush in the bowl and lathered cheeks and neck." 
42 | EXPECTED_ANSWER = {"gunrest"=>1, "over"=>1, "still"=>1, "of"=>1, "him"=>2, "and"=>4, "bowl"=>1, "himself"=>1, "went"=>1, "friendly"=>1, "finger"=>1, "propped"=>1, "cheeks"=>1, "dipped"=>1, "down"=>1, "wearily"=>1, "up"=>1, "stepped"=>1, "dedalus"=>1, "to"=>2, "in"=>2, "sat"=>1, "the"=>6, "pointed"=>1, "as"=>1, "followed"=>1, "stephen"=>1, "laughing"=>1, "his"=>2, "he"=>2, "brush"=>1, "jest"=>1, "neck"=>1, "mirror"=>1, "edge"=>1, "on"=>2, "parapet"=>2, "lathered"=>1, "watching"=>1, "halfway"=>1} 43 | set_result_key "test:result" 44 | end 45 | 46 | class TestHash < MapRedus::Finalizer 47 | def self.finalize(process) 48 | process.each_key_reduced_value do |key, value| 49 | process.outputter.encode(process.result_key("extra_arg"), key, value) 50 | end 51 | end 52 | end 53 | 54 | class TestResultKeyArguments < MapRedus::Process 55 | # 56 | # EXTRA_KEY_ARG is not known at the time the process is run 57 | # but it is known by the time the finalizer is running 58 | # 59 | finalizer TestHash 60 | set_result_key "test:KEY_ARG:test:EXTRA_KEY_ARG" 61 | key_args ["key_argument"] 62 | end 63 | 64 | class Document 65 | include MapRedus::Support 66 | mapreduce_process :char_count, GetCharCount, "document:count:ID" 67 | 68 | attr_accessor :id 69 | def initialize(id) 70 | @id = id 71 | end 72 | 73 | def calculate_chars(data_reference) 74 | mapreduce.char_count(data_reference, id) 75 | end 76 | end 77 | -------------------------------------------------------------------------------- /lib/mapredus/keys.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | RedisKey = MapRedus::Keys 3 | ProcessInfo = RedisKey 4 | 5 | #### USED WITHIN process.rb #### 6 | 7 | # Holds the current map reduce processes that are either running or which still have data lying around 8 | # 9 | redis_key :processes, "mapredus:processes" 10 | redis_key :processes_count, "mapredus:processes:count" 11 | 12 | # Holds the information (mapper, reducer, etc.) 
in json format for a map reduce process with pid PID 13 | # 14 | redis_key :pid, "mapredus:process:PID" 15 | 16 | # The input blocks broken down by the InputStream 17 | redis_key :input, "mapredus:process:PID:input" 18 | 19 | # All the keys that the map produced 20 | # 21 | redis_key :keys, "mapredus:process:PID:keys" 22 | 23 | # The hashed key to actual string value of key 24 | # 25 | redis_key :hash_to_key, "mapredus:process:PID:keys:HASHED_KEY" # to ACTUAL KEY 26 | 27 | # The list of values for a given key generated by our map function. 28 | # When a reduce is run it takes elements from this key and pushes them to :reduce 29 | # 30 | # key - list of values 31 | # 32 | redis_key :map, "mapredus:process:PID:map_key:HASHED_KEY" 33 | redis_key :reduce, "mapredus:process:PID:map_key:HASHED_KEY:reduce" 34 | 35 | # Temporary redis space for reduce functions to use 36 | # 37 | redis_key :temp, "mapredus:process:PID:temp_reduce_key:HASHED_KEY:UNIQUE_REDUCE_HOSTNAME:UNIQUE_REDUCE_PROCESS_ID" 38 | 39 | # The default location for the result 40 | # 41 | DEFAULT_RESULT_KEY = "mapredus:process:PID:result" 42 | 43 | #### USED WITHIN master.rb #### 44 | 45 | # Keeps track of the current slaves (by appending "1" to a redis list) 46 | # 47 | # TODO: should append some sort of proper process id so we can explicitly keep track 48 | # of processes 49 | # 50 | redis_key :slaves, "mapredus:process:PID:master:slaves" 51 | 52 | # 53 | # Use these constants to keep track of the progress of a process 54 | # 55 | # Example 56 | # state => map_in_progress 57 | # reduce_in_progress 58 | # finalize_in_progress 59 | # complete 60 | # failed 61 | # not_started 62 | # 63 | # contained in the ProcessInfo hash (redis_key :state, "mapredus:process:PID:master:state") 64 | # 65 | NOT_STARTED = "not_started" 66 | INPUT_MAP_IN_PROGRESS = "mappers" 67 | REDUCE_IN_PROGRESS = "reducers" 68 | FINALIZER_IN_PROGRESS = "finalizer" 69 | COMPLETE = "complete" 70 | FAILED = "failed" 71 | STATE_MACHINE = { nil => 
NOT_STARTED, 72 | NOT_STARTED => INPUT_MAP_IN_PROGRESS, 73 | INPUT_MAP_IN_PROGRESS => REDUCE_IN_PROGRESS, 74 | REDUCE_IN_PROGRESS => FINALIZER_IN_PROGRESS, 75 | FINALIZER_IN_PROGRESS => COMPLETE} 76 | 77 | # These keep track of timing information for a map reduce process of pid PID 78 | # 79 | redis_key :requested_at, "mapredus:process:PID:request_at" 80 | redis_key :started_at, "mapredus:process:PID:started_at" 81 | redis_key :finished_at, "mapredus:process:PID:finished_at" 82 | redis_key :recent_time_to_complete, "mapredus:process:recent_time_to_complete" 83 | end 84 | -------------------------------------------------------------------------------- /lib/mapredus/default_classes.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | class WordStream < InputStream 3 | def self.scan(data_object) 4 | # 5 | # The data_object should be a reference to an object that is 6 | # stored on your system. The scanner is used to break up what you 7 | # need from the object into manageable pieces for the mapper. In 8 | # this example, the data object is a reference to a redis string. 9 | # 10 | test_string = FileSystem.get(data_object) 11 | 12 | test_string.split.each_slice(10).each_with_index do |word_set, i| 13 | yield(i, word_set.join(" ")) 14 | end 15 | end 16 | end 17 | 18 | ################################################################################ 19 | 20 | class WordCounter < Mapper 21 | def self.map(map_data) 22 | map_data.split(/\W/).each do |word| 23 | next if word.empty? 24 | yield(word.downcase, 1) 25 | end 26 | end 27 | end 28 | 29 | ####################################REDUCERS#################################### 30 | 31 | class Adder < Reducer 32 | def self.reduce(value_list) 33 | yield( value_list.reduce(0) { |r, v| r += v.to_i } ) 34 | end 35 | end 36 | 37 | # Emits the identity function on the map values. 
38 | # 39 | # The identity reducer should never actually have to reduce as a 40 | # special class in mapredus, the values should just be copied from 41 | # one key to a new key directly in redis. 42 | class Identity < Reducer 43 | def self.reduce_perform(process, key) 44 | FileSystem.copy( process.map_key(key), process.reduce_key(key) ) 45 | end 46 | 47 | def self.reduce(value_list) 48 | value_list.each do |v| 49 | yield v 50 | end 51 | end 52 | end 53 | 54 | # Emits the length of the mapped value list. 55 | # 56 | # The counter reducer tells how many values were emitted by the 57 | # mapper. In situations where an adder could used but only has to 58 | # sum up 1's, counter will be much faster. 59 | # 60 | # This works in MapRedus because all the values produced for one key 61 | # is processed (reduced) by a single worker. 62 | class Counter < Reducer 63 | def self.reduce_perform(process, key) 64 | process.emit(key, FileSystem.llen(process.map_key(key))) 65 | end 66 | 67 | def self.reduce(value_list) 68 | yield value_list.size 69 | end 70 | end 71 | 72 | ################################################################################ 73 | 74 | class ToRedisHash < Finalizer 75 | def self.finalize(process) 76 | process.each_key_reduced_value do |key, value| 77 | process.outputter.encode(process.result_key, key, value) 78 | end 79 | end 80 | end 81 | 82 | class RedisHasher < Outputter 83 | def self.to_hash(result_key) 84 | keys(result_key).inject({}) do |hash, key| 85 | hash[key] = decode(result_key, key) 86 | hash 87 | end 88 | end 89 | 90 | def self.values(result_key) 91 | FileSystem.hvals(result_key) 92 | end 93 | 94 | def self.keys(result_key) 95 | FileSystem.hkeys(result_key) 96 | end 97 | 98 | def self.encode(result_key, k, v) 99 | FileSystem.hset(result_key, k, v) 100 | end 101 | 102 | def self.decode(result_key, k) 103 | FileSystem.hget(result_key, k) 104 | end 105 | end 106 | end 107 | 
-------------------------------------------------------------------------------- /lib/mapredus.rb: -------------------------------------------------------------------------------- 1 | require 'redis' 2 | require 'redis_support' 3 | require 'resque' 4 | require 'resque_scheduler' 5 | 6 | module MapRedus 7 | include RedisSupport 8 | 9 | class InvalidProcess < NotImplementedError 10 | def initialize; super("MapRedus QueueProcess: need to have perform method defined");end 11 | end 12 | 13 | class ProcessSpecificationError < InvalidProcess 14 | def initialize; super("MapRedus Process: need to have the specification defined");end 15 | end 16 | 17 | class InvalidMapper < NotImplementedError 18 | def initialize; super("MapRedus Mapper: need to have map method defined");end 19 | end 20 | 21 | class InvalidReducer < NotImplementedError 22 | def initialize; super("MapRedus Reducer: need to have reduce method defined");end 23 | end 24 | 25 | class InvalidInputStream < NotImplementedError 26 | def initialize; super("MapRedus InputStream: need to have scan method defined");end 27 | end 28 | 29 | class InvalidProcess < NotImplementedError 30 | def initialize; super("MapRedus Process Creation Failed: Specifications were not specified");end 31 | end 32 | 33 | class RecoverableFail < StandardError 34 | def initialize; super("MapRedus Operation Failed: but it is recoverable") ;end 35 | end 36 | 37 | # All Queue Processes should have a function called perform 38 | # ensuring that when the class is put on the resque queue it can perform its work 39 | # 40 | # Caution: defines redis, which is also defined in RedisSupport 41 | # 42 | class QueueProcess 43 | def self.queue; :mapredus; end 44 | def self.perform(*args); raise InvalidProcess; end 45 | end 46 | 47 | # TODO: When you send work to a worker using a mapper you define, 48 | # the worker won't have that class name defined, unless it was started up 49 | # with the class loaded 50 | # 51 | def register_reducer(klass); end; 52 | def 
register_mapper(klass); end; 53 | 54 | class Helper 55 | # resque helpers defines 56 | # redis 57 | # encode 58 | # decode 59 | # classify 60 | # constantize 61 | # 62 | # This is extended here because we want to use the encode and decode function 63 | # when we interact with resque queues 64 | extend Resque::Helpers 65 | 66 | # Defines a hash by taking the absolute value of ruby's string 67 | # hash to rid the dashes since redis keys should not contain any. 68 | # 69 | # key - The key to be hashed. 70 | # 71 | # Examples 72 | # 73 | # Support::key_hash( key ) 74 | # # => '8dd8hflf8dhod8doh9hef' 75 | # 76 | # Returns the hash. 77 | def self.key_hash( key ) 78 | key.to_s.hash.abs.to_s(16) 79 | end 80 | 81 | # Returns the classname of the namespaced class. 82 | # 83 | # The full name of the class. 84 | # 85 | # Examples 86 | # 87 | # Support::class_get( Super::Long::Namespace::ClassName ) 88 | # # => 'ClassName' 89 | # 90 | # Returns the class name. 91 | def self.class_get(string) 92 | constantize(string) 93 | end 94 | end 95 | end 96 | 97 | require 'mapredus/keys' 98 | require 'mapredus/filesystem' 99 | require 'mapredus/master' 100 | require 'mapredus/mapper' 101 | require 'mapredus/reducer' 102 | require 'mapredus/finalizer' 103 | require 'mapredus/support' 104 | require 'mapredus/outputter' 105 | require 'mapredus/inputter' 106 | require 'mapredus/default_classes' 107 | require 'mapredus/process' 108 | -------------------------------------------------------------------------------- /spec/redis-test.conf: -------------------------------------------------------------------------------- 1 | # Redis configuration file example 2 | 3 | # By default Redis does not run as a daemon. Use 'yes' if you need it. 4 | # Note that Redis will write a pid file in /var/run/redis.pid when daemonized. 5 | daemonize yes 6 | # When run as a daemon, Redis write a pid file in /var/run/redis.pid by default. 7 | # You can specify a custom pid file location here. 
8 | pidfile ./spec/redis-test.pid 9 | 10 | # Accept connections on the specified port, default is 6379 11 | port 9736 12 | 13 | # If you want you can bind a single interface, if the bind option is not 14 | # specified all the interfaces will listen for connections. 15 | # 16 | # bind 127.0.0.1 17 | 18 | # Close the connection after a client is idle for N seconds (0 to disable) 19 | timeout 300 20 | 21 | # Save the DB on disk: 22 | # 23 | # save 24 | # 25 | # Will save the DB if both the given number of seconds and the given 26 | # number of write operations against the DB occurred. 27 | # 28 | # In the example below the behaviour will be to save: 29 | # after 900 sec (15 min) if at least 1 key changed 30 | # after 300 sec (5 min) if at least 10 keys changed 31 | # after 60 sec if at least 10000 keys changed 32 | save 900 1 33 | save 300 10 34 | save 60 10000 35 | 36 | # The filename where to dump the DB 37 | dbfilename dump.rdb 38 | 39 | # For default save/load DB in/from the working directory 40 | # Note that you must specify a directory not a file name. 41 | dir ./spec/ 42 | 43 | # Set server verbosity to 'debug' 44 | # it can be one of: 45 | # debug (a lot of information, useful for development/testing) 46 | # notice (moderately verbose, what you want in production probably) 47 | # warning (only very important / critical messages are logged) 48 | loglevel debug 49 | 50 | # Specify the log file name. Also 'stdout' can be used to force 51 | # the demon to log on the standard output. Note that if you use standard 52 | # output for logging but daemonize, logs will be sent to /dev/null 53 | logfile stdout 54 | 55 | # Set the number of databases. The default database is DB 0, you can select 56 | # a different one on a per-connection basis using SELECT where 57 | # dbid is a number between 0 and 'databases'-1 58 | databases 16 59 | 60 | ################################# REPLICATION ################################# 61 | 62 | # Master-Slave replication. 
Use slaveof to make a Redis instance a copy of
# another Redis server. Note that the configuration is local to the slave
# so for example it is possible to configure the slave to save the DB with a
# different interval, or to listen to another port, and so on.

# slaveof

################################## SECURITY ###################################

# Require clients to issue AUTH before processing any other
# commands. This might be useful in environments in which you do not trust
# others with access to the host running redis-server.
#
# This should stay commented out for backward compatibility and because most
# people do not need auth (e.g. they run their own servers).

# requirepass foobared

################################### LIMITS ####################################

# Set the max number of connected clients at the same time. By default there
# is no limit, and it's up to the number of file descriptors the Redis process
# is able to open. The special value '0' means no limits.
# Once the limit is reached Redis will close all the new connections sending
# an error 'max number of clients reached'.

# maxclients 128

# Don't use more memory than the specified amount of bytes.
# When the memory limit is reached Redis will try to remove keys with an
# EXPIRE set. It will try to start freeing keys that are going to expire
# in little time and preserve keys with a longer time to live.
# Redis will also try to remove objects from free lists if possible.
#
# If all this fails, Redis will start to reply with errors to commands
# that will use more memory, like SET, LPUSH, and so on, and will continue
# to reply to most read-only commands like GET.
#
# WARNING: maxmemory can be a good idea mainly if you want to use Redis as a
# 'state' server or cache, not as a real DB.
When Redis is used as a real 102 | # database the memory usage will grow over the weeks, it will be obvious if 103 | # it is going to use too much memory in the long run, and you'll have the time 104 | # to upgrade. With maxmemory after the limit is reached you'll start to get 105 | # errors for write operations, and this may even lead to DB inconsistency. 106 | 107 | # maxmemory 108 | 109 | ############################### ADVANCED CONFIG ############################### 110 | 111 | # Glue small output buffers together in order to send small replies in a 112 | # single TCP packet. Uses a bit more CPU but most of the times it is a win 113 | # in terms of number of queries per second. Use 'yes' if unsure. 114 | glueoutputbuf yes 115 | -------------------------------------------------------------------------------- /lib/mapredus/master.rb: -------------------------------------------------------------------------------- 1 | module MapRedus 2 | # Note: Instead of using Resque directly within the process, we implement 3 | # a master interface with Resque 4 | # 5 | # Does bookkeeping to keep track of how many slaves are doing work. If we have 6 | # no slaves doing work for a process then the process is donex. While there is work available 7 | # the slaves will always be doing work. 8 | # 9 | class Master < QueueProcess 10 | # Check whether there are still workers working on process PID's processes 11 | # 12 | # In synchronous condition, master is always working since nothing is going to 13 | # the queue. 
    def self.working?(pid)
      # A non-empty slave list means at least one enslaved worker has not yet
      # called free_slave for this pid.
      0 < FileSystem.llen(ProcessInfo.slaves(pid))
    end

    #
    # Master performs the work that it needs to do:
    #   it must free itself as a slave from Resque
    #   enslave mappers
    #
    def self.perform( pid, data_object )
      process = Process.open(pid)
      enslave_inputter(process, *data_object)
      process.update(:state => INPUT_MAP_IN_PROGRESS)
    end

    #
    # The order of operations that occur in the mapreduce process
    #
    # The inputter sets off the mapper processes
    #
    def self.mapreduce( process, *data_object )
      start_metrics(process.pid)
      if process.synchronous
        # Synchronous path: each phase runs inline, so the state is advanced
        # here rather than by the workers themselves.
        process.update(:state => INPUT_MAP_IN_PROGRESS)
        enslave_inputter(process, *data_object)
        process.update(:state => REDUCE_IN_PROGRESS)
        enslave_reducers(process)
        process.update(:state => FINALIZER_IN_PROGRESS)
        enslave_finalizer(process)
      else
        Resque.push(QueueProcess.queue, {:class => MapRedus::Master , :args => [process.pid, data_object]} )
      end
    end

    # Kick off the process's inputter, which will in turn enslave mappers.
    def self.enslave_inputter(process, *data_object)
      enslave( process, process.inputter, process.pid, *data_object )
    end

    # Enslave the reducers:
    #
    # For each key, enslave a reducer to process the values on that
    # key. If there were no keys produced during the map operation we
    # must set off the finalizer.
    #
    # TODO: inject optimizations here for special reducers like the
    # identity reduce
    #
    # returns nothing
    def self.enslave_reducers( process )
      if( process.num_keys > 0 )
        process.map_keys.each do |key|
          enslave_reduce( process, key )
        end
      else
        # No map output at all: skip the reduce phase entirely.
        process.next_state
      end
    end

    # Enqueue the finalizer for the process.
    def self.enslave_finalizer( process )
      enslave( process, process.finalizer, process.pid )
    end

    # Have these to match what the Mapper/Reducer perform function expects to see as arguments
    #
    # though instead of process the perform function will receive the pid
    def self.enslave_map(process, data_chunk)
      enslave( process, process.mapper, process.pid, data_chunk )
    end

    def self.enslave_reduce(process, key)
      enslave( process, process.reducer, process.pid, key )
    end

    # Re-enqueue a reduce after the reducer's wait interval (used when a
    # reduce fails recoverably).
    def self.enslave_later_reduce(process, key)
      enslave_later( process.reducer.wait, process, process.reducer, process.pid, key )
    end

    # The current default (QUEUE) that we push on to is
    #   :mapredus
    #
    def self.enslave( process, klass, *args )
      # Record the slave BEFORE doing/queueing the work so working? is never
      # falsely empty while a job is in flight.
      FileSystem.rpush(ProcessInfo.slaves(process.pid), 1)

      if( process.synchronous )
        klass.perform(*args)
      else
        Resque.push( klass.queue, { :class => klass.to_s, :args => args } )
      end
    end

    def self.enslave_later( delay_in_seconds, process, klass, *args)
      FileSystem.rpush(ProcessInfo.slaves(process.pid), 1)

      if( process.synchronous )
        klass.perform(*args)
      else
        #
        # TODO: I cannot get enqueue_in to work with my tests
        # there seems to be a silent failure somewhere
        # in the tests such that it never calls the function
        # and the queue gets emptied
        #
        # Resque.enqueue_in(delay_in_seconds, klass, *args)

        ##
        ## Temporary, immediately just push process back onto the resque queue
        ## NOTE(review): delay_in_seconds is currently ignored on this path.
        Resque.push( klass.queue, { :class => klass.to_s, :args => args } )
      end
    end

    # List the outstanding slave markers for pid (one "1" per busy worker).
    def self.slaves(pid)
      FileSystem.lrange(ProcessInfo.slaves(pid), 0, -1)
    end

    # Pop one slave marker; called by a worker when it finishes its job.
    def self.free_slave(pid)
      FileSystem.lpop(ProcessInfo.slaves(pid))
    end

    # Remove every queued Resque job belonging to process pid and clear its
    # slave bookkeeping.  Returns the number of queue entries destroyed.
    def self.emancipate(pid)
      process = Process.open(pid)
      return unless process

      # Working on resque directly seems dangerous
      #
      # Warning: this is supposed to be used as a debugging operation
      # and isn't intended for normal use. It is potentially very expensive.
      #
      destroyed = 0
      qs = [queue, process.mapper.queue, process.reducer.queue, process.finalizer.queue].uniq
      qs.each do |q|
        q_key = "queue:#{q}"
        Resque.redis.lrange(q_key, 0, -1).each do | string |
          json = Helper.decode(string)
          # A job belongs to us when its class is one of this process's
          # worker classes AND its first argument is this pid.
          match = json['class'] == "MapRedus::Master"
          match |= json['class'] == process.inputter.to_s
          match |= json['class'] == process.mapper.to_s
          match |= json['class'] == process.reducer.to_s
          match |= json['class'] == process.finalizer.to_s
          match &= json['args'].first.to_s == process.pid.to_s
          if match
            destroyed += Resque.redis.lrem(q_key, 0, string).to_i
          end
        end
      end

      #
      # our slave information is kept track of on file and not in Resque
      #
      FileSystem.del(ProcessInfo.slaves(pid))
      destroyed
    end

    # Time metrics for measuring how long it takes map reduce to do a process
    #
    def self.set_request_time(pid)
      FileSystem.set( ProcessInfo.requested_at(pid), Time.now.to_i )
    end

    # Stamp the process start time (epoch seconds).
    def self.start_metrics(pid)
      started = ProcessInfo.started_at( pid )
      FileSystem.set started, Time.now.to_i
    end

    # Stamp completion time, record the run's duration in a capped (30-entry)
    # recent-durations list, and expire the per-run timestamps after an hour.
    def self.finish_metrics(pid)
      started = ProcessInfo.started_at( pid )
      finished = ProcessInfo.finished_at( pid )
      requested = ProcessInfo.requested_at( pid )

      completion_time = Time.now.to_i

      FileSystem.set finished, completion_time
      time_to_complete = completion_time - FileSystem.get(started).to_i

      recent_ttcs = ProcessInfo.recent_time_to_complete
      FileSystem.lpush( recent_ttcs , time_to_complete )
      FileSystem.ltrim( recent_ttcs , 0, 30 - 1)

      FileSystem.expire finished, 60 * 60
      FileSystem.expire started, 60 * 60
      FileSystem.expire requested, 60 * 60
    end
  end
end
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
MapRedus
=========

Simple MapReduce type framework using redis and resque.

Overview
--------

This is an experimental implementation of MapReduce using Ruby for
process definition, Resque for work execution, and Redis for data
storage.

Goals:

* simple M/R-style programming for existing Ruby projects
* low cost of entry (no need for a dedicated cluster)

if you are looking for a high-performance MapReduce implementation
that can meet your big data needs, try Hadoop.


Using MapRedus
---------------

MapRedus uses Resque to handle the processes that it runs, and redis
to keep a store for the values/data produced.

Workers for a MapRedus process are Resque workers. Refer to the
Resque worker documentation to see how to load the necessary
environment for your worker to be able to run mapreduce processs. An
example is also located in the tests.

### Attaching a mapreduce process to a class

You will often want to define a mapreduce process that does some
operations on data within a class. The process should have an
inputter, mapper, reducer, finalizer, and outputter defined. By
default a process will have the specifications shown below.
There is 39 | also an example of how to do this in the tests. 40 | 41 | class GetWordCount < MapRedus::Process 42 | inputter MapRedus::WordStream 43 | mapper MapRedus::WordCounter 44 | reducer MapRedus::Adder 45 | finalizer MapRedus::ToRedisHash 46 | outputter MapRedus::RedisHasher 47 | ordered false 48 | end 49 | 50 | class GetCharCount < MapRedus::Process 51 | inputter MapRedus::CharStream 52 | mapper MapRedus::CharCounter 53 | end 54 | 55 | class Job 56 | mapreduce_process :word_count, GetWordCount, "job:store:result" 57 | end 58 | 59 | The mapreduce_process needs a name, mapper, reducer, finalizer, 60 | outputter, and key to store the result. The operation would then be 61 | run on a job calling the following. 62 | 63 | job = Job.new 64 | job.mapreduce.word_count( data ) 65 | 66 | The data specifies the data on which this operation is to run. We are 67 | currently working on a way to allow the result_store_key to change 68 | depending on class properties. For instance in the above example, if 69 | the Job class had an id attribute, we may want to store the final 70 | mapreduce result in "job:store:result:#{id}". 71 | 72 | ### Inputters, Mappers, Reducers, Finalizers 73 | 74 | MapRedus needs a input stream, mapper, reducer, finalizer to be 75 | defined to run. The input stream defines how a block of your data 76 | gets divided so that a mapper can work on a small portion to map. 
For 77 | example: 78 | 79 | class InputStream < MapRedus::InputStream 80 | def self.scan(data_object) 81 | # your data object is a reference to a block of text in redis 82 | text_block = MapRedus.redis.get(data_object) 83 | text_block.each_line.each_with_index do |line, i| 84 | yield(i, line) 85 | end 86 | end 87 | end 88 | 89 | class Mapper < MapRedus::Mapper 90 | def self.map(data_to_map) 91 | data_to_map.each do |data| 92 | key = data 93 | value = 1 94 | yield( key, value ) 95 | end 96 | end 97 | end 98 | 99 | In this example, the input stream calls yield to output a mapredus 100 | file number and a the value that is saved to file (in redis). The 101 | mapper's `map` function calls yield to emit the key value pair for 102 | storage in redis. The reducer's `reduce` function acts similarly. 103 | 104 | The finalizer runs whatever needs to be run when a process completes, 105 | an example: 106 | 107 | class Finalizer < MapRedus::Finalizer 108 | def self.finalize(process) 109 | process.each_key_reduced_value do |key, value| 110 | process.outputter.encode(process.result_key, key, value) 111 | end 112 | ... 113 | < set off a new mapredus process to use this stored data > 114 | end 115 | end 116 | 117 | The process.result_key refers the final result key that is stored in 118 | redis. The result_key may take arguments which define the output of 119 | the key. The process will also incorporate initially given key 120 | arguments into the result_key. result_key's are defined exactly as a 121 | redis_key in the redis_support gem. The outputter is needed to define 122 | how exactly that encoding is defined. We provided an outputter that 123 | encodes your data into a redis hash. 
124 | 125 | class RedisHasher < MapRedus::Outputter 126 | def encode(result_key, k, v) 127 | MapRedus::FileSystem.hset(result_key, k, v) 128 | end 129 | 130 | def decode(result_key, k) 131 | MapRedus::FileSystem.hget(result_key, k) 132 | end 133 | end 134 | 135 | The default Outputter makes no changes to original result, and tries 136 | to store that directly into redis as a string. 137 | 138 | Working Locally 139 | --------------- 140 | 141 | MapRedus uses Bundler to manage dependencies. With Bundler installed: 142 | 143 | bundle install 144 | 145 | You should now be able to run tests and do all other tasks with 146 | `rake`. 147 | 148 | Running Tests 149 | ------------- 150 | 151 | Run the tests which tests the word counter example and some other 152 | tests (you'll need to have bundler installed) 153 | rake 154 | 155 | Requirements 156 | ------------ 157 | * Bundler (this will install all the requirements below) 158 | * Redis 159 | * RedisSupport 160 | * Resque 161 | * Resque-scheduler 162 | 163 | ### Notes 164 | Instead of calling `emit_intermediate`/`emit` in your map/reduce 165 | to produce a key value pair/value you call `yield`, which will call 166 | emit_intermediate/emit for you. This gives flexibility in using 167 | Mapper/Reducer classes especially in testing. 168 | 169 | TODO 170 | ---- 171 | not necessarily in the given order 172 | * Ensure that the type that is inputted is the type that is outputted 173 | 174 | * if a process fails we do what we are supposed to do i.e. add a 175 | failure_hook which does something if your process fails 176 | 177 | * include functionality for a partitioner, input reader, combiner 178 | 179 | * implement this shit (registering of environment shit in resque) so 180 | that we can run mapreduce commands from the command line. Defining 181 | any arbitrary mapper and reducer. 
182 | 183 | * implement redundant workers (workers doing the same work in case one 184 | of them fails) 185 | 186 | * if a reducer runs a recoverable fail, then make sure that an attempt 187 | to reenslave the worker is delayed by some fixed interval 188 | 189 | * edit emit for when we have multiple workers doing the same reduce 190 | (redundant workers for fault tolerance might need to change the 191 | rpush to a lock and setting of just a value) even if other workers 192 | do work on the same answer, want to make sure that the final reduced 193 | thing is the same every time 194 | 195 | * Add fault tolerance, better tracking of which workers fail, 196 | especially when we have multiple workers doing the same work 197 | ... currently is handled by Resque failure auto retry 198 | 199 | * if a perform operation fails then we need to have worker recover 200 | 201 | * make use of finish_metrics somewhere so that we can have statistics 202 | on how long map reduce processs take 203 | 204 | * better tracking of work being assigned so we can know when a process is finished 205 | or in progress and have a trigger to do things when shit finishes 206 | 207 | in resque there is functionality for an after hook which performs 208 | something after your process does it's work 209 | 210 | might also check out the resque-status plugin for a cheap and easy 211 | way to plug status and completion-rate into existing resque jobs. 212 | 213 | * ensure reducers only do a fixed amount of work? See section 3.2 of 214 | paper. bookkeeping that tells the master when tasks are in-progress 215 | or completed. 
this will be important for better parallelization of
tasks

* think about the following logic

  + if a reducer starts working on a key after all maps have finished
    then when it is done the work on that key is finished forever

  + this would imply a process finishes when all map tasks have
    finished and all reduce tasks that start after the map tasks have
    finished

  + if a reducer started before all map tasks were finished, then load
    its reduced result back onto the value list

  + if the reducer started after all map tasks finished, then emit the
    result

Note on Patches/Pull Requests
-----------------------------

* Fork the project.
* Make your feature addition or bug fix.
* Add tests for it. This is important so I don't break it in a
  future version unintentionally.
* Commit, do not mess with rakefile, version, or history. (if you
  want to have your own version, that is fine but bump version in a
  commit by itself I can ignore when I pull)
* Send me a pull request. Bonus points for topic branches.

## Copyright
Copyright (c) 2010 Dolores Labs. See LICENSE for details.
247 | -------------------------------------------------------------------------------- /spec/mapredus_spec.rb: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/helper' 2 | 3 | describe "MapRedus" do 4 | # this is called before each test case 5 | before(:each) do 6 | MapRedus::FileSystem.flushall 7 | @process = GetWordCount.create 8 | MapRedus::FileSystem.setnx("wordstream:test", GetWordCount::TEST) 9 | end 10 | 11 | it "has sets up the correct default classes" do 12 | MapRedus::Process.inputter.should == MapRedus::WordStream 13 | MapRedus::Process.mapper.should == MapRedus::WordCounter 14 | MapRedus::Process.reducer.should == MapRedus::Adder 15 | MapRedus::Process.finalizer.should == MapRedus::ToRedisHash 16 | MapRedus::Process.outputter.should == MapRedus::RedisHasher 17 | 18 | GetWordCount.result_key.should == "test:result" 19 | GetWordCount.inputter.should == MapRedus::WordStream 20 | GetWordCount.mapper.should == MapRedus::WordCounter 21 | GetWordCount.reducer.should == MapRedus::Adder 22 | GetWordCount.finalizer.should == MapRedus::ToRedisHash 23 | GetWordCount.outputter.should == MapRedus::RedisHasher 24 | 25 | GetCharCount.inputter.should == CharStream 26 | GetCharCount.mapper.should == CharCounter 27 | GetCharCount.reducer.should == MapRedus::Adder 28 | GetCharCount.finalizer.should == MapRedus::ToRedisHash 29 | GetCharCount.outputter.should == MapRedus::RedisHasher 30 | end 31 | 32 | it "creates a process successfully" do 33 | process = GetWordCount.open(@process.pid) 34 | 35 | process.inputter.should == MapRedus::WordStream 36 | process.mapper.should == MapRedus::WordCounter 37 | process.reducer.should == MapRedus::Adder 38 | process.finalizer.should == MapRedus::ToRedisHash 39 | process.outputter.should == MapRedus::RedisHasher 40 | 41 | process = GetCharCount.create 42 | process.inputter.should == CharStream 43 | process.mapper.should == CharCounter 44 | process.reducer.should == 
MapRedus::Adder 45 | process.finalizer.should == MapRedus::ToRedisHash 46 | process.outputter.should == MapRedus::RedisHasher 47 | end 48 | 49 | it "runs a map reduce process synchronously" do 50 | ## 51 | ## In general map reduce shouldn't be running operations synchronously 52 | ## 53 | @process.class.should == GetWordCount 54 | @process.run("wordstream:test", synchronously = true) 55 | @process.map_keys.size.should == GetWordCount::EXPECTED_ANSWER.size 56 | 57 | @process.map_keys.each do |key| 58 | reduce_values = @process.reduce_values(key) 59 | reduce_values.size.should == 1 60 | end 61 | 62 | @process.each_key_reduced_value do |key, value| 63 | @process.outputter.decode(@process.result_key, key).to_i.should == GetWordCount::EXPECTED_ANSWER[key] 64 | end 65 | end 66 | 67 | it "runs a map reduce process asynchronously" do 68 | @process.run("wordstream:test", synchronously = false) 69 | work_off 70 | 71 | @process.map_keys.size.should == GetWordCount::EXPECTED_ANSWER.size 72 | @process.map_keys.each do |key| 73 | reduce_values = @process.reduce_values(key) 74 | reduce_values.size.should == 1 75 | end 76 | 77 | @process.each_key_reduced_value do |key, value| 78 | @process.outputter.decode(@process.result_key, key).to_i.should == GetWordCount::EXPECTED_ANSWER[key] 79 | end 80 | end 81 | 82 | it "runs the default process" do 83 | process = MapRedus::Process.create 84 | process.update(:key_args => [process.pid]) 85 | process.result_key.should == "mapredus:process:#{process.pid}:result" 86 | process.run("wordstream:test") 87 | work_off 88 | 89 | process.map_keys.size.should == GetWordCount::EXPECTED_ANSWER.size 90 | process.map_keys.each do |key| 91 | reduce_values = process.reduce_values(key) 92 | reduce_values.size.should == 1 93 | end 94 | 95 | process.each_key_reduced_value do |key, value| 96 | process.outputter.decode(process.result_key, key).to_i.should == GetWordCount::EXPECTED_ANSWER[key] 97 | end 98 | end 99 | 100 | it "runs a process without result_key 
being set (using the default key location)" do 101 | process = CharCountTest.create 102 | process.update(:key_args => [process.pid]) 103 | process.result_key.should == "mapredus:process:#{process.pid}:result" 104 | process.run("wordstream:test") 105 | work_off 106 | 107 | process.map_keys.size.should == GetCharCount::EXPECTED_ANSWER.size 108 | process.map_keys.each do |key| 109 | reduce_values = process.reduce_values(key) 110 | reduce_values.size.should == 1 111 | end 112 | 113 | process.each_key_reduced_value do |key, value| 114 | process.outputter.decode(process.result_key, key).to_i.should == GetCharCount::EXPECTED_ANSWER[key] 115 | end 116 | end 117 | 118 | it "runs a process where key arguments exist and extra arguments are used" do 119 | process = TestResultKeyArguments.create 120 | process.result_key("extra_arg").should == "test:key_argument:test:extra_arg" 121 | process.run("wordstream:test") 122 | work_off 123 | 124 | process.map_keys.size.should == GetWordCount::EXPECTED_ANSWER.size 125 | process.map_keys.each do |key| 126 | reduce_values = process.reduce_values(key) 127 | reduce_values.size.should == 1 128 | end 129 | 130 | process.each_key_reduced_value do |key, value| 131 | process.outputter.decode(process.result_key("extra_arg"), key).to_i.should == GetWordCount::EXPECTED_ANSWER[key] 132 | end 133 | end 134 | end 135 | 136 | describe "MapRedus Process" do 137 | before(:each) do 138 | MapRedus::FileSystem.flushall 139 | @process = GetWordCount.create 140 | end 141 | 142 | it "saves a process" do 143 | @process.mapper = CharCounter 144 | @process.synchronous = true 145 | @process.save 146 | 147 | @process = MapRedus::Process.open(@process.pid) 148 | 149 | @process.mapper.should == CharCounter 150 | @process.synchronous.should == true 151 | end 152 | 153 | it "updates a process" do 154 | @process.update(:mapper => CharCounter, :ordered => true) 155 | @process = MapRedus::Process.open(@process.pid) 156 | 157 | @process.mapper.should == CharCounter 158 | 
@process.ordered.should == true 159 | end 160 | 161 | it "deletes a process" do 162 | @process.delete 163 | 164 | proc = MapRedus::Process.open(@process.pid) 165 | proc.should == nil 166 | end 167 | 168 | it "kills a process" do 169 | @process.run(GetWordCount::TEST) 170 | MapRedus::Process.kill(@process.pid) 171 | Resque.size(:mapredus).should == 0 172 | end 173 | 174 | it "kills a process that is started" do 175 | @process.run(GetWordCount::TEST) 176 | 177 | worker = Resque::Worker.new("*") 178 | worker.perform(worker.reserve) # do some work 179 | 180 | MapRedus::Process.kill(@process.pid) 181 | Resque.size(:mapredus).should == 0 182 | end 183 | 184 | it "kills all process" do 185 | proc_1 = GetWordCount.create 186 | proc_2 = GetWordCount.create 187 | proc_3 = GetWordCount.create 188 | proc_4 = GetWordCount.create 189 | proc_5 = GetWordCount.create 190 | proc_6 = GetWordCount.create 191 | 192 | proc_1.run(GetWordCount::TEST) 193 | proc_2.run(GetWordCount::TEST) 194 | proc_3.run(GetWordCount::TEST) 195 | 196 | worker = Resque::Worker.new("*") 197 | 6.times do 198 | worker.perform(worker.reserve) 199 | end 200 | 201 | proc_4.run(GetWordCount::TEST) 202 | proc_5.run(GetWordCount::TEST) 203 | proc_6.run(GetWordCount::TEST) 204 | 205 | 6.times do 206 | worker.perform(worker.reserve) 207 | end 208 | 209 | MapRedus::Process.kill_all 210 | Resque.peek(:mapredus, 0, 100) == [] 211 | end 212 | 213 | it "responds to next state correctly" do 214 | @process.state.should == MapRedus::NOT_STARTED 215 | @process.next_state 216 | @process.state.should == MapRedus::INPUT_MAP_IN_PROGRESS 217 | work_off 218 | 219 | ## 220 | ## Since there are no map keys produced in this the next state 221 | ## should go directly to the finalizer 222 | ## 223 | 224 | @process.next_state 225 | @process.state.should == MapRedus::FINALIZER_IN_PROGRESS 226 | work_off 227 | 228 | @process.next_state 229 | @process.state.should == MapRedus::COMPLETE 230 | end 231 | 232 | it "responds to next state 
correcty when keys are produced" do 233 | @process.state.should == MapRedus::NOT_STARTED 234 | @process.next_state 235 | @process.state.should == MapRedus::INPUT_MAP_IN_PROGRESS 236 | work_off 237 | 238 | @process.emit_intermediate("hell", "yeah") 239 | 240 | @process.next_state 241 | @process.state.should == MapRedus::REDUCE_IN_PROGRESS 242 | work_off 243 | 244 | @process.next_state 245 | @process.state.should == MapRedus::FINALIZER_IN_PROGRESS 246 | work_off 247 | 248 | @process.next_state 249 | @process.state.should == MapRedus::COMPLETE 250 | end 251 | 252 | it "emit_intermediate unordered successfully" do 253 | @process.emit_intermediate("hell", "yeah") 254 | result = [] 255 | @process.each_key_nonreduced_value do |key, value| 256 | result << [key, value] 257 | end 258 | 259 | result.should == [["hell", "yeah"]] 260 | end 261 | 262 | it "emit_intermediate on an ordered process" do 263 | @process.update(:ordered => true) 264 | @process.emit_intermediate(2, "place", "two") 265 | @process.emit_intermediate(1, "number", "one") 266 | res = [] 267 | @process.each_key_nonreduced_value do |key, value| 268 | res << [key, value] 269 | end 270 | 271 | res.should == [["number", "one"], ["place", "two"]] 272 | end 273 | 274 | it "emit successfully" do 275 | @process.emit_intermediate("something", "non_reduced_value") 276 | @process.emit("something", "reduced") 277 | result = [] 278 | @process.each_key_reduced_value do |key, rv| 279 | result << [key, rv] 280 | end 281 | result.should == [["something", "reduced"]] 282 | end 283 | 284 | it "produces the correct map keys" do 285 | @process.emit_intermediate("map key 1", "value") 286 | @process.emit_intermediate("map key 1", "value") 287 | @process.emit_intermediate("map key 2", "value") 288 | 289 | @process.map_keys.sort.should == ["map key 1", "map key 2"] 290 | end 291 | 292 | it "produces the correct map/reduce values" do 293 | MapRedus::FileSystem.setnx("wordstream:test", GetWordCount::TEST) 294 | 
  @process.run("wordstream:test")
  work_off
  @process.map_keys.sort.should == GetWordCount::EXPECTED_ANSWER.keys.sort

  # every key's reduced count should equal the word count, and the raw
  # (non-reduced) map emissions for that key should be that many "1"s
  @process.each_key_reduced_value do |key, reduced_value|
    reduced_value.to_i.should == GetWordCount::EXPECTED_ANSWER[key]
    @process.map_values(key).should == ["1"] * reduced_value.to_i
  end
end
end

describe "MapRedus Master" do
  before(:each) do
    MapRedus::FileSystem.flushall
    MapRedus::FileSystem.setnx("test", "some data")
    @process = GetWordCount.create
  end

  it "handles slaves (enslaving) correctly" do
    MapRedus::Master.enslave(@process, MapRedus::WordCounter, @process.pid, "test")
    Resque.peek(:mapredus, 0, 1).should == {"args"=>[@process.pid, "test"], "class"=>"MapRedus::WordCounter"}
    MapRedus::Master.slaves(@process.pid).should == ["1"]
  end

  it "handles slaves (freeing) correctly" do
    MapRedus::Master.enslave(@process, MapRedus::WordCounter, @process.pid, "test")
    MapRedus::Master.enslave(@process, MapRedus::WordCounter, @process.pid, "test")

    MapRedus::Master.slaves(@process.pid).should == ["1", "1"]

    MapRedus::Master.free_slave(@process.pid)
    MapRedus::Master.free_slave(@process.pid)
    MapRedus::Master.slaves(@process.pid).should == []
  end

  # intentionally pending: no example body yet
  it "handles redundant multiple workers (same output regardless of how many workers complete)"
end

describe "MapRedus Mapper/Reducer/Finalizer" do
  before(:each) do
    MapRedus::FileSystem.flushall
    MapRedus::FileSystem.setnx("wordstream:test", "data")
    @process = GetWordCount.create
  end

  it "runs a map correctly proceeding to the next state" do
    @process.update(:state => MapRedus::INPUT_MAP_IN_PROGRESS)
    @process.state.should == MapRedus::INPUT_MAP_IN_PROGRESS
    @process.inputter.perform(@process.pid, "wordstream:test")
    Resque.peek(:mapredus, 0, 1).should == {"args"=>[@process.pid, 0], "class"=>"MapRedus::WordCounter"}
    Resque.pop(:mapredus)
    @process.mapper.perform(@process.pid, 0)
    @process.reload
    @process.state.should == MapRedus::REDUCE_IN_PROGRESS
    Resque.peek(:mapredus, 0, 1).should == {"args"=>[@process.pid, "data"], "class"=>"MapRedus::Adder"}

    MapRedus::Process.open(@process.pid).state.should == MapRedus::REDUCE_IN_PROGRESS
  end

  it "runs a reduce correctly proceeding to the correct next state" do
    @process.update(:state => MapRedus::REDUCE_IN_PROGRESS)
    @process.state.should == MapRedus::REDUCE_IN_PROGRESS
    @process.emit_intermediate("data", "1")
    @process.reducer.perform(@process.pid, "data")
    @process.reload
    @process.state.should == MapRedus::FINALIZER_IN_PROGRESS
    Resque.peek(:mapredus, 0, 1).should == {"args"=>[@process.pid], "class"=>"MapRedus::ToRedisHash"}

    MapRedus::Process.open(@process.pid).state.should == MapRedus::FINALIZER_IN_PROGRESS
  end

  it "should test that the finalizer correctly saves" do
    @process.update(:state => MapRedus::FINALIZER_IN_PROGRESS)
    @process.state.should == MapRedus::FINALIZER_IN_PROGRESS
    @process.emit_intermediate("data", "1")
    @process.emit("data", "1")
    @process.finalizer.perform(@process.pid)
    @process.reload
    @process.state.should == MapRedus::COMPLETE
    Resque.peek(:mapredus, 0, 100).should == []
    # NOTE(review): RedisHasher's decode appears to take (result_key, field),
    # unlike the base Outputter's single-argument decode — confirm.
    @process.outputter.decode("test:result", "data").should == "1"

    MapRedus::Process.open(@process.pid).state.should == MapRedus::COMPLETE
  end
end

describe "MapRedus Support" do
  before(:each) do
    MapRedus::FileSystem.flushall
    @doc = Document.new(10)
    @other_doc = Document.new(15)
  end

  it "should be simple to create a mapredus as a part of a job" do
    MapRedus::FileSystem.setnx("wordstream:test", GetWordCount::TEST)
  MapRedus::FileSystem.setnx("charstream:test", "simpler test")
  # expected character tally for the string "simpler test"
  other_answer = {" "=>1, "l"=>1, "m"=>1, "e"=>2, "p"=>1, "r"=>1, "s"=>2, "t"=>2, "i"=>1}

  @doc.calculate_chars("wordstream:test")
  @other_doc.calculate_chars("charstream:test")
  work_off

  GetCharCount::EXPECTED_ANSWER.keys.each do |char|
    @doc.mapreduce.char_count_result([@doc.id], char).should == GetCharCount::EXPECTED_ANSWER[char].to_s
  end

  other_answer.keys.each do |char|
    @other_doc.mapreduce.char_count_result([@other_doc.id], char).should == other_answer[char].to_s
  end
end
end

describe "MapRedus Default Classes" do
  before(:each) do
    MapRedus::FileSystem.flushall
    @process = GetWordCount.create
  end

  it "testing that the identity copy actually does a copy" do
    MapRedus::FileSystem.rpush(@process.map_key("test_key"), "whatever")
    MapRedus::FileSystem.rpush(@process.map_key("test_key"), "yeah")
    MapRedus::Identity.perform(@process.pid, "test_key")

    # identity reduce: reduce values mirror the map values unchanged
    @process.map_values("test_key").should == ["whatever", "yeah"]
    @process.reduce_values("test_key").should == ["whatever", "yeah"]
  end

  it "should properly do a count" do
    MapRedus::FileSystem.rpush(@process.map_key("test_key"), "whatever")
    MapRedus::FileSystem.rpush(@process.map_key("test_key"), "yeah")
    MapRedus::Counter.perform(@process.pid, "test_key")

    @process.reduce_values("test_key").should == ["2"]
  end
end
--------------------------------------------------------------------------------
/lib/mapredus/process.rb:
--------------------------------------------------------------------------------
module MapRedus

  # This is what keeps track of our map reduce processes
  #
  # We use a redis key to identify the id of map reduce process
  # the value of the redis object is a json object which contains:
  #
  # {
  #
#   inputter : inputstreamclass,
#   mapper : mapclass,
#   reducer : reduceclass,
#   finalizer : finalizerclass,
#   outputter : outputterclass,
#   partitioner : ,
#   combiner : ,
#   ordered : true_or_false ## ensures ordering keys from the map output --> [ order, key, value ],
#   synchronous : true_or_false ## runs the process synchronously or not (generally used for testing)
#   result_timeout : length of time a result is saved ## 3600 * 24
#   key_args : arguments to be added to the key location of the result save (cache location)
#   state : the current state of the process (shouldn't be set by the process and starts off as nil)
#   type : the original process class ( currently this is needed so we can have namespaces for the result_cache keys )
# }
#
# The user has the ability in subclassing this class to create extra features if needed
#
class Process
  # Public: Keep track of information that may show up as the redis json value
  # This is so we know exactly what might show up in the json hash
  READERS = [:pid]
  ATTRS = [:inputter, :mapper, :reducer, :finalizer, :outputter, :ordered, :synchronous, :result_timeout, :key_args, :state, :type]
  READERS.each { |r| attr_reader r }
  ATTRS.each { |a| attr_accessor a }

  # Default lifetime (seconds) for a saved result: one day.
  DEFAULT_TIME = 3600 * 24

  # pid       - process id this instance represents
  # json_info - decoded specification hash (see class comment above)
  def initialize(pid, json_info)
    @pid = pid
    read(json_info)
  end

  # Populate this instance's attributes from a decoded json hash.
  # Class-valued attributes arrive as class-name strings and are
  # resolved via Helper.class_get; absent values fall back to defaults
  # (DEFAULT_TIME, [], NOT_STARTED, MapRedus::Outputter, Process).
  def read(json_info)
    @inputter = Helper.class_get(json_helper(json_info, :inputter))
    @mapper = Helper.class_get(json_helper(json_info, :mapper))
    @reducer = Helper.class_get(json_helper(json_info, :reducer))
    @finalizer = Helper.class_get(json_helper(json_info, :finalizer))
    @ordered = json_helper(json_info, :ordered)
    @synchronous = json_helper(json_info, :synchronous)
    @result_timeout = json_helper(json_info, :result_timeout) || DEFAULT_TIME
    @key_args = json_helper(json_info, :key_args) || []
    @state = json_helper(json_info, :state) || NOT_STARTED
    @outputter = json_helper(json_info, :outputter)
    @outputter = @outputter ? Helper.class_get(@outputter) : MapRedus::Outputter
    @type = Helper.class_get(json_helper(json_info, :type) || Process)
  end

  # Fetch a value from the hash whether it was keyed by string or symbol
  # (a decoded-json hash has string keys; a freshly built spec has symbols).
  def json_helper(json_info, key)
    json_info[key.to_s] || json_info[key.to_sym]
  end

  def to_s; to_json; end

  # Hash of all attributes (ATTRS + READERS), used to serialize the process.
  def to_hash
    (ATTRS + READERS).inject({}) do |h, attr|
      h[attr] = send(attr)
      h
    end
  end

  def to_json
    Helper.encode(to_hash)
  end

  # Persist this process: register its pid in the process set and store
  # the encoded spec. Returns self.
  def save
    FileSystem.sadd( ProcessInfo.processes, @pid )
    FileSystem.save( ProcessInfo.pid(@pid), to_json )
    self
  end

  # Assign the given attribute => value pairs and persist. Returns self.
  def update(attrs = {})
    attrs.each do |attr, val|
      send("#{attr}=", val)
    end
    save
  end

  # Re-read this process's attributes from the FileSystem. Returns self.
  def reload
    read(Helper.decode(FileSystem.get(ProcessInfo.pid(@pid))))
    self
  end

  # This will not delete if the master is working
  # It can't get ahold of the files to shred while the master is working
  #
  # if safe is set to false, this will delete all the redis stores associated
  # with this process, but will not kill the process from the queue, if it is
  # on the queue. The process operations will fail to work when its data is deleted
  #
  # Examples
  #   delete(safe)
  #   # => true or false
  #
  # Returns true as long as the master is not working.
  def delete(safe = true)
    return false if (safe && Master.working?(@pid))
    FileSystem.keys("mapredus:process:#{@pid}*").each do |k|
      FileSystem.del(k)
    end
    FileSystem.srem(ProcessInfo.processes, @pid)
    # reset the pid counter once no processes remain
    FileSystem.set(ProcessInfo.processes_count, 0) if( 0 == FileSystem.scard(ProcessInfo.processes) )
    true
  end

  # Iterates through the key, values
  #
  # Example
  #   each_key_reduced_value(pid)
  #
  # Returns nothing.
def each_key_reduced_value
  map_keys.each do |map_key|
    reduce_values(map_key).each do |reduced|
      yield map_key, reduced
    end
  end
end

# Iterate over every (key, value) pair the map phase emitted, before
# any reduction has taken place.
#
# Example
#   each_key_nonreduced_value { |key, value| ... }
#
# Returns nothing.
def each_key_nonreduced_value
  map_keys.each do |map_key|
    map_values(map_key).each do |emitted|
      yield map_key, emitted
    end
  end
end

# Start the process on the given data object. When synchronous is
# true the whole pipeline runs inline (generally used for testing).
#
# Returns true.
def run( data_object, synchronous = false )
  update(:synchronous => synchronous)
  Master.mapreduce( self, data_object )
  true
end

# TODO:
# Should also have some notion of whether the process is completed or not
# since the master might not be working, but the process is not yet complete
# so it is still running
def running?
  Master.working?(@pid)
end

# Advance the process to the next state in STATE_MACHINE, provided it
# is neither currently running nor synchronous. If the Master knows
# how to enslave workers for the new state (enslave_<state>), do so.
#
# Examples
#   process.next_state
#
# Returns the state switched to, or nil when no transition happened.
def next_state
  return if running? || @synchronous
  upcoming = STATE_MACHINE[self.state]
  update(:state => upcoming)
  enslaver = "enslave_#{upcoming}".to_sym
  Master.send(enslaver, self) if Master.respond_to?(enslaver)
  upcoming
end

### The following functions deal with keys/values produced during the
### running of a process

# Emissions, when we get map/reduce results back we emit these
# to be stored in our file system (redis)
#
# key_value - The key, value
#
# Examples
#   emit_intermediate(key, value)
#   # => if an ordering is required
#   emit_intermediate(rank, key, value)
#
# Returns true on success.
def emit_intermediate(*key_value)
  if @ordered
    # an ordered process's map emits [rank, key, value]; the rank is
    # used as the score of a sorted set so keys come back in order
    rank, key, value = key_value
  else
    key, value = key_value
  end
  hashed_key = Helper.key_hash(key)

  # FIX: detect a hash collision *before* writing anything. Previously
  # the colliding value was already rpush'ed onto the other key's list
  # (and the key added to the key set) before the raise, corrupting
  # that key's data. key_collision? also registers hashed_key -> key
  # when unseen, which is safe to do up front.
  raise "Key Collision: key:#{key}, #{key.class} => hashed key:#{hashed_key}" if key_collision?(hashed_key, key)

  if @ordered
    FileSystem.zadd( ProcessInfo.keys(@pid), rank, key )
  else
    FileSystem.sadd( ProcessInfo.keys(@pid), key )
  end
  FileSystem.rpush( ProcessInfo.map(@pid, hashed_key), value )
  true
end

# The emission associated with a reduce. Currently all reduced
# values are pushed onto a redis list. It may be the case that we
# want to directly use a different redis type given the kind of
# reduce we are doing. Often a reduce only returns one value, so
# instead of a rpush, we should do a set.
#
# Examples
#   emit(key, reduced_value)
#
# Returns the result of the underlying rpush.
def emit(key, reduce_val)
  hashed_key = Helper.key_hash(key)
  FileSystem.rpush( ProcessInfo.reduce(@pid, hashed_key), reduce_val )
end

# True when hashed_key is already registered for a *different* key.
# Side effect: registers hashed_key -> key when it was unseen (setnx
# only writes when the mapping is absent).
def key_collision?(hashed_key, key)
  not ( FileSystem.setnx( ProcessInfo.hash_to_key(@pid, hashed_key), key ) ||
        FileSystem.get( ProcessInfo.hash_to_key(@pid, hashed_key) ) == key.to_s )
end

# Convenience methods to get the mapredus internal key string for a key
#
# Examples
#   reduce_key("document")
#   # => mapredus:process:PID:map_key:<hash>:reduce
#   map_key("document")
#   # => mapredus:process:PID:map_key:<hash>
#
# Returns the internal mapreduce string key for a given key.
[:map, :reduce].each do |kind|
  define_method("#{kind}_key") do |key|
    ProcessInfo.send(kind, @pid, Helper.key_hash(key))
  end
end

# Keys that the map operation produced
#
# Examples
#   map_keys
#
# Returns the keys (rank order for an ordered process).
def map_keys
  if @ordered
    FileSystem.zrange( ProcessInfo.keys(@pid), 0, -1 )
  else
    FileSystem.smembers( ProcessInfo.keys(@pid) )
  end
end

# Number of distinct keys the map operation produced.
def num_keys
  if @ordered
    FileSystem.zcard( ProcessInfo.keys(@pid) )
  else
    FileSystem.scard( ProcessInfo.keys(@pid) )
  end
end

# values that the map operation produced, for a key
#
# Examples
#   map_values(key)
#
# Returns the values.
def map_values(key)
  FileSystem.lrange( ProcessInfo.map(@pid, Helper.key_hash(key)), 0, -1 )
end

# Number of map values stored for the given key.
def num_values(key)
  FileSystem.llen( ProcessInfo.map(@pid, Helper.key_hash(key)) )
end

# values that the reduce operation produced, for a key
#
# Examples
#   reduce_values(key)
#
# Returns the values.
def reduce_values(key)
  FileSystem.lrange( ProcessInfo.reduce(@pid, Helper.key_hash(key)), 0, -1 )
end

# functions to manage the location of the result in the FileSystem
#
# Examples
#   process.result_key(extra, arguments)
#   Process.result_key(all, needed, arguments)
#   # => "something:that:uses:the:extra:arguments"
#
#   SomeProcessSubclass.set_result_key("something:ARG:something:VAR")
#   # sets the result key for (CAPITAL require arguments to fill in the values)
def result_key(*args)
  Helper.class_get(@type).result_key(*[@key_args, args].flatten)
end

def self.result_key(*args)
  # prefer this class's own result-cache key maker; fall back to the
  # base Process one when no subclass-specific maker is defined
  maker = "#{self.to_s.gsub(/\W/, "_")}_result_cache"
  unless ProcessInfo.respond_to?(maker)
    maker = "#{MapRedus::Process.to_s.gsub(/\W/, "_")}_result_cache"
  end
  ProcessInfo.send( maker, *args )
end

def self.set_result_key(key_struct)
  MapRedus.redefine_redis_key( "#{self.to_s.gsub(/\W/, "_")}_result_cache", key_struct )
end

# Create sets up a process to be run with the given specification.
# It saves the information in the FileSystem and returns an
# instance of the process that run should be called on when
# running is desired.
#
# Example
#   process = MapRedus::Process.create
#   process.run
#
# Returns an instance of the process
def self.create
  new_pid = get_available_pid
  # snapshot the class-level attribute DSL values into a spec hash
  specification = ATTRS.inject({}) do |ret, attr|
    ret[attr] = send(attr)
    ret
  end
  specification[:type] = self
  self.new(new_pid, specification).save
end

# This defines the attributes to be associated with a MapRedus process
# This will allow us to subclass a Process, creating a new specification
# by specifying what say the inputter should equal
#
# Example
#   class AnswerDistribution < MapRedus::Process
#     inputter JudgmentStream
#     mapper ResponseFrequencyMap
#     reducer Adder
#     finalizer AnswerCount
#     outputter MapRedus::RedisHasher
#   end
class << self; attr_reader *ATTRS; end

# Setter/Getter method definitions to set/get the attribute for
# the class. In the getter if it is not defined (nil) then return
# the default attribute defined in MapRedus::Process.
#
# Example
#   class AnswerDistribution < MapRedus::Process
#     inputter JudgmentStream
#     mapper ResponseFrequency
#   end
#   AnswerDistribution.reducer.should == Adder
ATTRS.each do |attr|
  (class << self; self; end).send(:define_method, attr) do |*one_arg|
    attribute = "@#{attr}"
    case one_arg.size
    when 0
      # getter: fall back to MapRedus::Process's default when this
      # subclass never set the attribute
      instance_variable_get(attribute) || MapRedus::Process.instance_variable_get(attribute)
    when 1
      # setter: invoked by the DSL form, e.g. `mapper WordCounter`
      instance_variable_set(attribute, one_arg.first)
    else
      raise ArgumentError.new("wrong number of arguments (#{one_arg.size}) when zero or one arguments were expected")
    end
  end
end

# Default attributes for the process class. All other attributes
# are nil by default.
inputter WordStream
mapper WordCounter
reducer Adder
finalizer ToRedisHash
outputter RedisHasher
type Process
set_result_key DEFAULT_RESULT_KEY

# This function returns all the redis keys produced associated
# with a process's process id.
#
# Example
#   Process.info(17)
#
# Returns an array of keys associated with the process id.
def self.info(pid)
  FileSystem.keys(ProcessInfo.pid(pid) + "*")
end

# Returns an instance of the process class given the process id.
# If no such process id exists returns nil.
#
# Example
#   process = Process.open(17)
def self.open(pid)
  spec = Helper.decode( FileSystem.get(ProcessInfo.pid(pid)) )
  spec && self.new( pid, spec )
end

# Find out what map reduce processes are out there
#
# Examples
#   Process.ps
#
# Returns a list of the map reduce process ids
def self.ps
  FileSystem.smembers(ProcessInfo.processes)
end

# Allocate a fresh process id. The counter is advanced by a random
# amount (1..20) rather than by 1.
#
# Examples
#   Process.get_available_pid
#
# Returns an available pid.
def self.get_available_pid
  FileSystem.incrby(ProcessInfo.processes_count, 1 + rand(20))
end

# Given a arguments for a result key, delete the result from the
# filesystem.
#
# Examples
#   Process.delete_saved_result(key)
def self.delete_saved_result(*key_args)
  FileSystem.del( result_key(*key_args) )
end

# Remove redis keys associated with this process if the Master isn't working.
#
# potentially is very expensive.
#
# Example
#   Process::kill(pid)
#
# Returns the number of jobs Master.emancipate removed from the queue.
441 | def self.kill(pid) 442 | num_killed = Master.emancipate(pid) 443 | proc = Process.open(pid) 444 | proc.delete if proc 445 | num_killed 446 | end 447 | 448 | def self.kill_all 449 | ps.each do |pid| 450 | kill(pid) 451 | end 452 | FileSystem.del(ProcessInfo.processes) 453 | FileSystem.del(ProcessInfo.processes_count) 454 | end 455 | end 456 | end 457 | --------------------------------------------------------------------------------