├── .gitignore
├── VERSION
├── contrib
│   └── hudson
│       ├── plugins
│       │   └── hadoop-ruby
│       │       ├── .gitignore
│       │       ├── src
│       │       │   └── main
│       │       │       ├── resources
│       │       │       │   ├── index.jelly
│       │       │       │   └── hudson
│       │       │       │       └── plugins
│       │       │       │           └── hadoop
│       │       │       │               └── ruby
│       │       │       │                   └── HadoopRuby
│       │       │       │                       └── config.jelly
│       │       │       ├── webapp
│       │       │       │   └── help.html
│       │       │       └── java
│       │       │           └── hudson
│       │       │               └── plugins
│       │       │                   └── hadoop
│       │       │                       └── ruby
│       │       │                           ├── HadoopRuby.java
│       │       │                           └── ItemListenerImpl.java
│       │       └── pom.xml
│       ├── conf
│       │   └── hadoop-site.xml
│       └── bin
│           ├── hadoop-papyrus.sh
│           └── hadoop
├── bin
│   └── papyrus
├── examples
│   ├── word_count_test.rb
│   ├── hive_like_test.rb
│   └── log_analysis_test.rb
├── spec
│   ├── spec_helper.rb
│   ├── client_spec.rb
│   ├── dsl_init_spec.rb
│   ├── util_spec.rb
│   ├── hive_like_spec.rb
│   ├── core_spec.rb
│   ├── mapred_factory_spec.rb
│   ├── example_spec.rb
│   ├── word_count_spec.rb
│   └── log_analysis_spec.rb
├── lib
│   ├── hadoop_dsl.rb
│   ├── dsl_init.rb
│   ├── util.rb
│   ├── hadoop_dsl_client.rb
│   ├── word_count.rb
│   ├── mapred_factory.rb
│   ├── core.rb
│   ├── hive_like.rb
│   └── log_analysis.rb
├── conf
│   └── hadoop-site.xml
├── Rakefile
├── README.rdoc
└── hadoop-papyrus.gemspec
/.gitignore:
--------------------------------------------------------------------------------
1 | pkg
2 |
--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.0.6
2 |
--------------------------------------------------------------------------------
/contrib/hudson/plugins/hadoop-ruby/.gitignore:
--------------------------------------------------------------------------------
1 | .*
2 | target
3 | work
4 | rubygems
5 | tmp
6 |
--------------------------------------------------------------------------------
/bin/papyrus:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'hadoop_dsl_client'
4 |
5 | HadoopDsl::Client.new(ARGV).run
6 |
--------------------------------------------------------------------------------
/examples/word_count_test.rb:
--------------------------------------------------------------------------------
1 | dsl 'WordCount'
2 |
3 | from 'wc/inputs'
4 | to 'wc/outputs'
5 |
6 | count_uniq
7 | total :bytes, :words, :lines
8 |
--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
1 | # spec helper
2 | require 'rubygems'
3 | gem 'jruby-on-hadoop'
4 |
5 | require 'tempfile'
6 |
7 | def create_tmp_script(body)
8 | tmp = Tempfile.new('test.rb')
9 | tmp.print body
10 | tmp.close
11 | tmp.path
12 | end
13 |
14 |
--------------------------------------------------------------------------------
/examples/hive_like_test.rb:
--------------------------------------------------------------------------------
1 | dsl 'HiveLike'
2 |
3 | # hive-like/items.txt
4 | # apple, 3, 100
5 | # banana, 1, 50
6 |
7 | create_table items(item STRING, quantity INT, price INT);
8 | load_data "hive-like/items.txt" items;
9 |
10 | select quantity, price, item from items;
11 |
12 | # expect
13 | # 0 apple 3 300
14 | # 1 banana 1 50
15 |
--------------------------------------------------------------------------------
/contrib/hudson/plugins/hadoop-ruby/src/main/resources/index.jelly:
--------------------------------------------------------------------------------
1 |
6 |
7 | This is the Hadoop Ruby plugin. Build scripts written in Hadoop Ruby will be executed by this plugin.
8 |
9 |
--------------------------------------------------------------------------------
/lib/hadoop_dsl.rb:
--------------------------------------------------------------------------------
1 | require 'util'
2 | require 'mapred_factory'
3 | require 'core'
4 |
5 | # for jruby
6 | if defined? JRUBY_VERSION
7 | require 'java'
8 | import 'org.apache.hadoop.io.IntWritable'
9 | import 'org.apache.hadoop.io.Text'
10 |
11 | # Hadoop IO types
12 | HadoopDsl::Text = Text
13 | HadoopDsl::IntWritable = IntWritable
14 | end
15 |
--------------------------------------------------------------------------------
/contrib/hudson/conf/hadoop-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 |
3 | <configuration>
4 |   <property>
5 |     <name>fs.default.name</name>
6 |     <value>hdfs://localhost:9000/</value>
7 |   </property>
8 | </configuration>
--------------------------------------------------------------------------------
/contrib/hudson/plugins/hadoop-ruby/src/main/resources/hudson/plugins/hadoop/ruby/HadoopRuby/config.jelly:
--------------------------------------------------------------------------------
1 |
3 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/contrib/hudson/plugins/hadoop-ruby/src/main/webapp/help.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Runs a Hadoop Ruby script (using the ruby interpreter by default) for building the project.
4 | The script will be run with the workspace as the current directory.
5 |
6 |
7 |
8 | The shell will be invoked with the "-v" option, so all of the commands are printed before being executed,
9 | and the build is considered a failure if any of the commands exits with a non-zero exit code.
10 |
11 |
12 |
--------------------------------------------------------------------------------
/conf/hadoop-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 |
3 | <configuration>
4 |   <property>
5 |     <name>fs.default.name</name>
6 |     <value>hdfs://localhost:9000/</value>
7 |   </property>
8 |   <property>
9 |     <name>mapred.job.tracker</name>
10 |     <value>localhost:50040</value>
11 |   </property>
12 |   <property>
13 |     <name>mapred.child.java.opts</name>
14 |     <value>-Xmx512m</value>
15 |   </property>
16 | </configuration>
--------------------------------------------------------------------------------
/contrib/hudson/bin/hadoop-papyrus.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CURRENT_DIR=$(cd $(dirname $0); pwd)
4 | PATH=$CURRENT_DIR:$PATH
5 |
6 | GEM_HOME=$CURRENT_DIR/..
7 | HADOOP_HOME=$HUDSON_HOME/hadoop/dist
8 | HADOOP_CONF_DIR=$CURRENT_DIR/../conf
9 | JRUBY_JAR_DIR=$GEM_HOME/gems/jruby-jars-1.4.0/lib/
10 |
11 | export PATH GEM_HOME HADOOP_HOME HADOOP_CONF_DIR
12 |
13 | #echo java -classpath $JRUBY_JAR_DIR/jruby-core-1.4.0.jar:$JRUBY_JAR_DIR/jruby-stdlib-1.4.0.jar org.jruby.Main $CURRENT_DIR/papyrus $1
14 | java -classpath $JRUBY_JAR_DIR/jruby-core-1.4.0.jar:$JRUBY_JAR_DIR/jruby-stdlib-1.4.0.jar org.jruby.Main $CURRENT_DIR/papyrus $1
15 |
--------------------------------------------------------------------------------
/lib/dsl_init.rb:
--------------------------------------------------------------------------------
1 | require 'hadoop_dsl'
2 |
3 | include HadoopDsl
4 |
5 | def map(key, value, output, reporter, script)
6 | mapper = MapperFactory.create(script, key, value)
7 | mapper.run
8 |
9 | write(output, mapper)
10 | end
11 |
12 | def reduce(key, values, output, reporter, script)
13 | reducer = ReducerFactory.create(script, key, values)
14 | reducer.run
15 |
16 | write(output, reducer)
17 | end
18 |
19 | def setup(conf, script)
20 | setup = SetupFactory.create(script, conf)
21 | setup.run
22 | setup.paths
23 | end
24 |
25 | private
26 |
27 | def write(output, controller)
28 | controller.emitted.each do |e|
29 | e.each do |k, v|
30 | output.collect(k, v)
31 | end
32 | end
33 | end
34 |
--------------------------------------------------------------------------------
/contrib/hudson/plugins/hadoop-ruby/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0">
2 |   <modelVersion>4.0.0</modelVersion>
3 |   <parent>
4 |     <groupId>org.jvnet.hudson.plugins</groupId>
5 |     <artifactId>plugin</artifactId>
6 |     <version>1.318</version>
7 |     <relativePath>../pom.xml</relativePath>
8 |   </parent>
9 |
10 |   <artifactId>hadoop-ruby</artifactId>
11 |   <packaging>hpi</packaging>
12 |   <version>1.1-SNAPSHOT</version>
13 |   <name>Hudson Hadoop Ruby Plugin</name>
14 |   <url>http://wiki.hudson-ci.org/display/HUDSON/Hadoop+Ruby+Plugin</url>
15 | </project>
--------------------------------------------------------------------------------
/lib/util.rb:
--------------------------------------------------------------------------------
1 | # utility functions
2 | require 'hadoop_dsl'
3 |
4 | module HadoopDsl
5 | # file body cache
6 | # reading files inside map/reduce causes critical issues!
7 | @@file_bodies = {}
8 |
9 | def self.snake_case(str)
10 | str.gsub(/\B[A-Z]/, '_\&').downcase
11 | end
12 |
13 | def self.read_file(file_name)
14 | # use if cached
15 | body = @@file_bodies[file_name] if @@file_bodies[file_name]
16 |
17 | # read as usual
18 | body = File.open(file_name).read rescue nil unless body
19 |
20 | # read from loadpath
21 | unless body
22 | $:.each do |path|
23 | body = File.open(File.join(path, file_name)).read rescue next
24 | break
25 | end
26 | end
27 |
28 | raise "cannot find file - #{file_name}" unless body
29 |
30 | # for cache
31 | @@file_bodies[file_name] = body
32 | body
33 | end
34 |
35 | def self.reset_dsl_file
36 | @@file_bodies = {}
37 | end
38 | end
39 |
--------------------------------------------------------------------------------
/spec/client_spec.rb:
--------------------------------------------------------------------------------
1 | require File.join(File.dirname(__FILE__), 'spec_helper')
2 | require 'hadoop_dsl_client'
3 |
4 | describe HadoopDsl::Client do
5 | before do
6 | @client = HadoopDsl::Client.new(["examples/wordcount.rb", "in", "out"])
7 | end
8 |
9 | it 'can parse args' do
10 | @client.files.join.should match /ruby_wrapper\.rb/
11 | @client.files.join.should match /dsl_init\.rb/
12 | @client.files.should include 'examples/wordcount.rb'
13 | @client.inputs.should == 'in'
14 | @client.outputs.should == 'out'
15 | end
16 |
17 | it 'can add dsl file into mapred args' do
18 | @client.mapred_args.should ==
19 | "--script dsl_init.rb in out --dslfile wordcount.rb"
20 | end
21 |
22 | it 'can add dsl lib files' do
23 | lib_path = HadoopDsl.lib_path
24 | @client.files.should include File.join(lib_path, 'core.rb')
25 | @client.files.should include File.join(lib_path, 'log_analysis.rb')
26 | end
27 | end
28 |
--------------------------------------------------------------------------------
/lib/hadoop_dsl_client.rb:
--------------------------------------------------------------------------------
1 | require 'jruby-on-hadoop'
2 |
3 | module HadoopDsl
4 | def self.lib_path
5 | File.expand_path(File.dirname(__FILE__))
6 | end
7 |
8 | def self.dsl_init_script
9 | File.join(lib_path, "dsl_init.rb")
10 | end
11 |
12 | class Client < JRubyOnHadoop::Client
13 | def parse_args
14 | super
15 | @script_path = HadoopDsl.dsl_init_script
16 | @script = File.basename(@script_path)
17 | @dsl_file_path = @args[0]
18 | @dsl_file = File.basename(@dsl_file_path)
19 | @files << @script_path << @dsl_file_path
20 |
21 | # TODO move properly, with jruby-on-hadoop
22 | add_dsl_lib_files
23 | ENV['RUBYLIB'] = File.dirname(@dsl_file_path)
24 | end
25 |
26 | def mapred_args
27 | args = super
28 | args += " --dslfile #{@dsl_file}"
29 | args
30 | end
31 |
32 | def add_dsl_lib_files
33 | lib_path = HadoopDsl.lib_path
34 | @files += Dir.glob(File.join(lib_path, "*.rb"))
35 | end
36 | end
37 | end
38 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | # spec
2 | require 'rubygems'
3 | require 'spec/rake/spectask'
4 |
5 | Spec::Rake::SpecTask.new do |t|
6 | def hadoop_core_jar
7 | hadoop_home = ENV['HADOOP_HOME']
8 | Dir.glob("#{hadoop_home}/hadoop-*-core.jar").first
9 | end
10 |
11 | t.libs = ['lib']
12 | t.spec_opts = ['-c', '-fs', "-r #{hadoop_core_jar}"]
13 | t.spec_files = FileList['spec/**/*_spec.rb']
14 | end
15 |
16 | # jeweler
17 | begin
18 | require 'jeweler'
19 | Jeweler::Tasks.new do |gemspec|
20 | gemspec.name = "hadoop-papyrus"
21 | gemspec.summary = "Hadoop papyrus"
22 | gemspec.description = "Hadoop papyrus - Ruby DSL for Hadoop"
23 | gemspec.email = "fujibee@gmail.com"
24 | gemspec.homepage = "http://github.com/fujibee/hadoop-papyrus"
25 | gemspec.authors = ["Koichi Fujikawa"]
26 |
27 | gemspec.add_dependency 'jruby-on-hadoop'
28 | gemspec.files.exclude "spec/**/*"
29 | end
30 | Jeweler::GemcutterTasks.new
31 | rescue LoadError
32 | puts "Jeweler not available. Install it with: gem install jeweler"
33 | end
34 |
35 |
--------------------------------------------------------------------------------
/spec/dsl_init_spec.rb:
--------------------------------------------------------------------------------
1 | require 'dsl_init'
2 |
3 | describe 'mapreduce init' do
4 |
5 | before(:each) do
6 | @script = create_tmp_script(<<-EOF)
7 | dsl 'LogAnalysis'
8 | data 'test' do
9 | from 'test/inputs'
10 | to 'test/outputs'
11 |
12 | separate(" ")
13 | column_name 'c0', 'c1', 'c2', 'c3'
14 | topic 't1' do
15 | count_uniq columns(:c1)
16 | end
17 | end
18 | EOF
19 | end
20 |
21 | before do
22 | @one = 1
23 | @output = mock('output')
24 | end
25 |
26 | it 'can map successfully' do
27 | key = 'key'
28 | value = 'it should be fine'
29 | @output.should_receive(:collect).once #.with(@text, @one)
30 |
31 | map(key, value, @output, nil, @script)
32 | end
33 |
34 | it 'can reduce successfully' do
35 | key = "t1\tkey"
36 | values = [@one, @one, @one]
37 | @output.should_receive(:collect).once #.with(@text, @one)
38 |
39 | reduce(key, values, @output, nil, @script)
40 | end
41 |
42 | it 'can set job conf' do
43 | conf = mock('jobconf')
44 | paths = setup(conf, @script)
45 |
46 | paths[0].should == 'test/inputs'
47 | paths[1].should == 'test/outputs'
48 | end
49 | end
50 |
--------------------------------------------------------------------------------
/spec/util_spec.rb:
--------------------------------------------------------------------------------
1 | require File.join(File.dirname(__FILE__) , 'spec_helper')
2 | require 'util'
3 |
4 | describe 'utilities' do
5 | before do
6 | HadoopDsl.reset_dsl_file
7 | @script_body = 'This is a script body.'
8 | @script = create_tmp_script(@script_body)
9 | end
10 |
11 | it 'can change camelcase str to snakecase' do
12 | HadoopDsl.snake_case('CamelCaseStr').should == 'camel_case_str'
13 | end
14 |
15 | it 'can read file and get file data to string' do
16 | HadoopDsl.read_file(@script).should == @script_body
17 | end
18 |
19 | it 'raise error if no file in loadpath' do
20 | lambda { HadoopDsl.read_file('not_exists_on_loadpath') }.should raise_error
21 | end
22 |
23 | it 'can load from cache if script is loaded' do
24 | HadoopDsl.read_file(@script).should == @script_body
25 | File.delete(@script)
26 | HadoopDsl.read_file(@script).should == @script_body
27 | end
28 |
29 | it 'can load from each cache even if one script is loaded' do
30 | HadoopDsl.read_file(@script).should == @script_body
31 | another_script = create_tmp_script("another")
32 | HadoopDsl.read_file(another_script).should == "another"
33 | end
34 | end
35 |
--------------------------------------------------------------------------------
/examples/log_analysis_test.rb:
--------------------------------------------------------------------------------
1 | dsl 'LogAnalysis'
2 |
3 | data 'apache log on test2' do
4 | from 'apachelog/inputs'
5 | to 'apachelog/outputs'
6 |
7 | # 119.63.199.8 - - [15/Nov/2009:01:18:16 +0900] "GET /ranking/game?page=31 HTTP/1.1" 200 10077 "-" "Baiduspider+(+http://www.baidu.jp/spider/)"
8 | # 203.83.243.81 - - [15/Nov/2009:01:18:33 +0900] "GET /dns_zones.txt HTTP/1.1" 404 294 "-" "libwww-perl/5.65"
9 |
10 | each_line do
11 | pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/
12 | column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua'
13 |
14 | topic 'ua counts', :label => 'ua' do
15 | count_uniq column[:ua]
16 | end
17 |
18 | topic 'count bot', :label => 'bot' do
19 | ua = column[:ua].value
20 | bot = ua if ua =~ /bot/i
21 | count_uniq bot
22 | end
23 |
24 | topic 'ua counts group by path' do
25 | request = column[:request].value
26 | if request
27 | path = request.split(/\s+/)[1]
28 | group_by path
29 | end
30 | count_uniq column[:ua]
31 | end
32 |
33 | topic 'ua counts by daily' do
34 | group_date_by column[:access_date], :daily
35 | count_uniq column[:ua]
36 | end
37 |
38 | # topic 'total bytes' do
39 | # select_date column[:access_date], BY_MONTHLY
40 | # sum column[:bytes].to_kilobytes # / 1024
41 | # end
42 | end
43 | end
44 |
--------------------------------------------------------------------------------
/README.rdoc:
--------------------------------------------------------------------------------
1 | = hadoop-papyrus
2 |
3 | Enables you to run Ruby DSL scripts on your Hadoop cluster.
4 |
5 | == Description
6 |
7 | You can write a Ruby DSL script and run it on Hadoop as a Mapper / Reducer.
8 | This gem depends on the 'jruby-on-hadoop' project.
9 |
10 | == Install
11 |
12 | Required gems are all on GemCutter.
13 |
14 | 1. Upgrade your RubyGems to 1.3.5
15 | 2. Install gems
16 | $ gem install hadoop-papyrus
17 |
18 | == Usage
19 |
20 | 1. Run a Hadoop cluster on your machines and put the 'hadoop' executable on your PATH, or set the HADOOP_HOME environment variable.
21 | 2. Put your input files into HDFS, e.g. wc/inputs/file1
22 | 3. Now you can run 'papyrus' as below:
23 | $ papyrus examples/word_count_test.rb
24 | You can get the Hadoop job results in HDFS at wc/outputs/part-*
25 |
26 | == Examples
27 |
28 | Word Count DSL script
29 | dsl 'WordCount'
30 |
31 | from 'wc/inputs'
32 | to 'wc/outputs'
33 |
34 | count_uniq
35 | total :bytes, :words, :lines
36 |
37 | Log Analysis DSL script
38 | dsl 'LogAnalysis'
39 |
40 | data 'apache log on test2' do
41 | from 'apachelog/inputs'
42 | to 'apachelog/outputs'
43 |
44 | each_line do
45 | pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/
46 | column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua'
47 |
48 | topic 'ua counts', :label => 'ua' do
49 | count_uniq column[:ua]
50 | end
51 | end
52 | end
53 |
54 | == Run spec
55 | Set HADOOP_HOME in your environment and run 'jruby -S rake spec'
56 |
57 | == Author
58 | Koichi Fujikawa
59 |
60 | == Copyright
61 | License: Apache License
62 |
--------------------------------------------------------------------------------
/lib/word_count.rb:
--------------------------------------------------------------------------------
1 | require 'hadoop_dsl'
2 | require 'enumerator'
3 |
4 | module HadoopDsl::WordCount
5 | MODEL_METHODS = []
6 | TOTAL_PREFIX = "\t"
7 |
8 | # controller
9 | class WordCountMapper < HadoopDsl::BaseMapper
10 | def initialize(script, key, value)
11 | super(script, WordCountMapperModel.new(key, value))
12 | end
13 |
14 | # model methods
15 | def_delegators :@model, *MODEL_METHODS
16 |
17 | # emitters
18 | def count_uniq
19 | @model.value.split.each {|word| emit(word => 1)}
20 | end
21 |
22 | def total(*types)
23 | types.each do |type|
24 | case type
25 | when :bytes
26 | emit("#{TOTAL_PREFIX}total bytes" => @model.value.gsub(/\s/, '').length)
27 | when :words
28 | emit("#{TOTAL_PREFIX}total words" => @model.value.split.size)
29 | when :lines
30 | emit("#{TOTAL_PREFIX}total lines" => 1)
31 | end
32 | end
33 | end
34 | end
35 |
36 | class WordCountReducer < HadoopDsl::BaseReducer
37 | def initialize(script, key, values)
38 | super(script, WordCountReducerModel.new(key, values))
39 | end
40 |
41 | # model methods
42 | def_delegators :@model, *MODEL_METHODS
43 |
44 | # emitters
45 | def count_uniq; aggregate unless @model.total_value? end
46 | def total(*types); aggregate if @model.total_value? end
47 | end
48 |
49 | # model
50 | class WordCountMapperModel < HadoopDsl::BaseMapperModel
51 | end
52 |
53 | class WordCountReducerModel < HadoopDsl::BaseReducerModel
54 | def total_value?; @key =~ /^#{TOTAL_PREFIX}/ end
55 | end
56 | end
57 |
--------------------------------------------------------------------------------
/lib/mapred_factory.rb:
--------------------------------------------------------------------------------
1 | require 'hadoop_dsl'
2 |
3 | module HadoopDsl
4 | class MapRedFactory
5 | def self.dsl_name(script)
6 | HadoopDsl.read_file(script).each_line do |line|
7 | dsl_name = $1 if line =~ /\s*dsl\s*\(?["'](\w*)["']\)?/
8 | return dsl_name if dsl_name
9 | end
10 | end
11 |
12 | def self.require_dsl_lib(dsl_name)
13 | require HadoopDsl.snake_case(dsl_name)
14 | end
15 | end
16 |
17 | class MapperFactory < MapRedFactory
18 | # for cache in map loop
19 | @@mapper_class = nil
20 | def self.create(script, key, value)
21 | # once decide in map loop
22 | unless @@mapper_class
23 | dsl_name = self.dsl_name(script)
24 | require_dsl_lib(dsl_name)
25 | @@mapper_class = eval("HadoopDsl::#{dsl_name}::#{dsl_name}Mapper")
26 | end
27 |
28 | @@mapper_class.new(script, key, value)
29 | end
30 | end
31 |
32 | class ReducerFactory < MapRedFactory
33 | @@reducer_class = nil
34 | def self.create(script, key, values)
35 | # once decide in reduce loop
36 | unless @@reducer_class
37 | dsl_name = self.dsl_name(script)
38 | require_dsl_lib(dsl_name)
39 | @@reducer_class = eval("HadoopDsl::#{dsl_name}::#{dsl_name}Reducer")
40 | end
41 |
42 | @@reducer_class.new(script, key, values)
43 | end
44 | end
45 |
46 | class SetupFactory < MapRedFactory
47 | def self.create(script, conf)
48 | dsl_name = self.dsl_name(script)
49 | require_dsl_lib(dsl_name)
50 | setup_class = "HadoopDsl::#{dsl_name}::#{dsl_name}Setup"
51 | eval(setup_class).new(script, conf) rescue HadoopDsl::BaseSetup.new(script, conf)
52 | end
53 | end
54 | end
55 |
--------------------------------------------------------------------------------
/spec/hive_like_spec.rb:
--------------------------------------------------------------------------------
1 | require File.join(File.dirname(__FILE__), 'spec_helper')
2 | require 'hive_like'
3 |
4 | include HadoopDsl::HiveLike
5 |
6 | describe HiveLikeSetup do
7 | it 'should load data' do
8 | script = create_tmp_script(%Q!load_data "hive-like/inputs", items;!)
9 | conf = mock('conf')
10 | conf.should_receive(:output_key_class=).once
11 | conf.should_receive(:output_value_class=).once
12 |
13 | setup = HiveLikeSetup.new(script, conf)
14 | setup.run
15 | setup.paths[0].should == 'hive-like/inputs'
16 | setup.paths[1].should == 'hive-like/outputs'
17 | end
18 | end
19 |
20 | describe HiveLikeMapper do
21 | before do
22 | @value = 'apple, 3, 100'
23 | end
24 |
25 | it 'should create table' do
26 | mapper = HiveLikeMapper.new(nil, nil, @value)
27 | mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT');
28 | mapper.table.name.should == 'items'
29 | mapper.table.column(0).should == 'item'
30 | mapper.table.column(1).should == 'quantity'
31 | end
32 |
33 | it 'should select' do
34 | mapper = HiveLikeMapper.new(nil, nil, @value)
35 | mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT');
36 | mapper.select("item", "quantity", "price", "from", "items")
37 | mapper.emitted.first.should == {'items' => 'apple, 3, 100'}
38 | end
39 |
40 | it 'should pre process script body' do
41 | body = "select foo, bar from table;\n"
42 | mapper = HiveLikeMapper.new(nil, nil, @value)
43 | processed = mapper.pre_process(body)
44 | processed.should == %Q!select("foo", "bar", "from", "table")\n!
45 | end
46 | end
47 |
48 | describe HiveLikeReducer do
49 | it 'should select as identity' do
50 | key = 'Lorem'
51 | values = [1, 1, 1]
52 | reducer = HiveLikeReducer.new(nil, key, values)
53 |
54 | reducer.select
55 | reducer.emitted[0].should == {'Lorem' => 1}
56 | end
57 | end
58 |
--------------------------------------------------------------------------------
/spec/core_spec.rb:
--------------------------------------------------------------------------------
1 | require File.join(File.dirname(__FILE__), 'spec_helper')
2 | require 'core'
3 |
4 | include HadoopDsl
5 |
6 | describe 'BaseMapRed' do
7 | before(:all) do
8 | @script = create_tmp_script(<<-EOF)
9 | from 'test/inputs'
10 | to 'test/outputs'
11 | EOF
12 | end
13 |
14 | it 'emit key value' do
15 | mapper = BaseMapper.new(@script, BaseMapperModel.new(nil, nil))
16 | mapper.emit('key' => 'value')
17 | mapper.emitted.should == [{'key' => 'value'}]
18 | end
19 |
20 | it 'can run BaseMapper in minimum' do
21 | model = BaseMapperModel.new('key', 'value')
22 | mapper = BaseMapper.new(@script, model)
23 | mapper.run
24 | end
25 |
26 | it 'can run BaseReducer in minimum' do
27 | model = BaseReducerModel.new('key', 'values')
28 | reducer = BaseReducer.new(@script, model)
29 | reducer.run
30 | end
31 |
32 | it 'can run BaseSetup in minimum' do
33 | setup = BaseSetup.new(@script, nil)
34 | setup.run
35 | end
36 |
37 | describe BaseMapper do
38 | it 'can emit as identity' do
39 | model = BaseMapperModel.new('key', 'value')
40 | mapper = BaseMapper.new(@script, model)
41 | mapper.identity
42 |
43 | mapper.emitted.should == [{'key' => 'value'}]
44 | end
45 | end
46 |
47 | describe BaseReducer do
48 | it 'can emit as aggregate' do
49 | model = BaseReducerModel.new('key', [1, 2, 3])
50 | reducer = BaseReducer.new(@script, model)
51 | reducer.aggregate
52 |
53 | reducer.emitted.should == [{'key' => 6}]
54 | end
55 |
56 | it 'can emit as identity' do
57 | model = BaseReducerModel.new('key', [1, 2, 3])
58 | reducer = BaseReducer.new(@script, model)
59 | reducer.identity
60 |
61 | reducer.emitted.should == [{'key' => 1}, {'key' => 2}, {'key' => 3}]
62 | end
63 | end
64 |
65 | describe BaseSetup do
66 | it 'can get paths' do
67 | setup = BaseSetup.new(@script, nil)
68 | setup.run
69 | setup.paths[0].should == 'test/inputs'
70 | setup.paths[1].should == 'test/outputs'
71 | end
72 | end
73 | end
74 |
--------------------------------------------------------------------------------
/contrib/hudson/plugins/hadoop-ruby/src/main/java/hudson/plugins/hadoop/ruby/HadoopRuby.java:
--------------------------------------------------------------------------------
1 | package hudson.plugins.hadoop.ruby;
2 |
3 | import hudson.Extension;
4 | import hudson.FilePath;
5 | import hudson.model.AbstractProject;
6 | import hudson.model.Descriptor;
7 | import hudson.model.Hudson;
8 | import hudson.tasks.BuildStepDescriptor;
9 | import hudson.tasks.Builder;
10 | import hudson.tasks.CommandInterpreter;
11 |
12 | import java.io.File;
13 |
14 | import net.sf.json.JSONObject;
15 |
16 | import org.kohsuke.stapler.StaplerRequest;
17 |
18 | /**
19 | * Invokes the hadoop ruby interpreter and invokes the Hadoop Ruby script
20 | * entered on the hudson build configuration.
21 | *
22 | * It is expected that the hadoop ruby interpreter is available on the system
23 | * PATH.
24 | *
25 | * @author Koichi Fujikawa
26 | */
27 | public class HadoopRuby extends CommandInterpreter {
28 |
29 | private HadoopRuby(String command) {
30 | super(command);
31 | }
32 |
33 | protected String[] buildCommandLine(FilePath script) {
34 | File rootDir = Hudson.getInstance().getRootDir();
35 | String cmd = rootDir.toString()
36 | + "/hadoop-ruby/bin/hadoop-papyrus.sh";
37 | return new String[] { cmd, script.getRemote() };
38 | }
39 |
40 | protected String getContents() {
41 | return command;
42 | }
43 |
44 | protected String getFileExtension() {
45 | return ".rb";
46 | }
47 |
48 | @Override
49 | public Descriptor<Builder> getDescriptor() {
50 | return DESCRIPTOR;
51 | }
52 |
53 | @Extension
54 | public static final DescriptorImpl DESCRIPTOR = new DescriptorImpl();
55 |
56 | public static final class DescriptorImpl extends
57 | BuildStepDescriptor<Builder> {
58 | private DescriptorImpl() {
59 | super(HadoopRuby.class);
60 | }
61 |
62 | @Override
63 | public Builder newInstance(StaplerRequest req, JSONObject formData) {
64 | return new HadoopRuby(formData.getString("hadoop-ruby"));
65 | }
66 |
67 | public String getDisplayName() {
68 | return "Execute Hadoop Ruby script";
69 | }
70 |
71 | @Override
72 | public String getHelpFile() {
73 | return "/plugin/hadoop-ruby/help.html";
74 | }
75 |
76 | @Override
77 | public boolean isApplicable(Class<? extends AbstractProject> jobType) {
78 | return true;
79 | }
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/lib/core.rb:
--------------------------------------------------------------------------------
1 | require 'hadoop_dsl'
2 | require 'forwardable'
3 |
4 | module HadoopDsl
5 | # common
6 | module DslElement
7 | # all DSL statements without def is processed here
8 | def method_missing(name, *args)
9 | # if block given, labeled for non-local exit
10 | catch name do; yield end if block_given?
11 | self
12 | end
13 | end
14 |
15 | # controller
16 | module DslController
17 | include DslElement
18 |
19 | def run
20 | body = pre_process(HadoopDsl.read_file(@script))
21 | eval(body, binding, @script)
22 | end
23 |
24 | def pre_process(body)
25 | body # do nothing
26 | end
27 | end
28 |
29 | class BaseMapRed
30 | extend Forwardable
31 | include DslController
32 |
33 | attr_reader :emitted
34 |
35 | def initialize(script, model)
36 | @script, @model = script, model
37 | @model.controller = self
38 | @emitted = []
39 | end
40 |
41 | def emit(hash) @emitted << hash end
42 |
43 | private
44 | def key; @model.key end
45 | end
46 |
47 | class BaseSetup
48 | include DslController
49 |
50 | def initialize(script, conf)
51 | @script, @conf = script, conf
52 | output_format
53 | end
54 |
55 | def output_format; end # do nothing
56 | def paths; [@from, @to] end
57 | def from(path) @from = path end
58 | def to(path) @to = path end
59 | end
60 |
61 | class BaseMapper < BaseMapRed
62 | # common functions
63 | def identity
64 | emit(@model.key => @model.value)
65 | end
66 |
67 | private
68 | def value; @model.value end
69 | end
70 |
71 | class BaseReducer < BaseMapRed
72 | # common functions
73 | def aggregate
74 | emit(@model.key => @model.values.inject {|ret, i| ret + i})
75 | end
76 |
77 | def identity
78 | @model.values.each {|v| emit(@model.key => v)}
79 | end
80 |
81 | private
82 | def values; @model.values end
83 | end
84 |
85 | # model
86 | class BaseModel
87 | include DslElement
88 | attr_accessor :controller
89 | end
90 |
91 | class BaseMapperModel < BaseModel
92 | attr_reader :key, :value
93 |
94 | def initialize(key, value)
95 | @key, @value = key, value
96 | end
97 | end
98 |
99 | class BaseReducerModel < BaseModel
100 | attr_reader :key, :values
101 |
102 | def initialize(key, values)
103 | @key, @values = key, values
104 | end
105 | end
106 | end
107 |
--------------------------------------------------------------------------------
/contrib/hudson/plugins/hadoop-ruby/src/main/java/hudson/plugins/hadoop/ruby/ItemListenerImpl.java:
--------------------------------------------------------------------------------
1 | /*
2 | * The MIT License
3 | *
4 | * Copyright (c) 2004-2009, Sun Microsystems, Inc.
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | * THE SOFTWARE.
23 | */
24 | package hudson.plugins.hadoop.ruby;
25 |
26 | import hudson.Extension;
27 | import hudson.FilePath;
28 | import hudson.model.Hudson;
29 | import hudson.model.listeners.ItemListener;
30 | import hudson.util.StreamTaskListener;
31 |
32 | import java.io.File;
33 | import java.util.logging.Level;
34 | import java.util.logging.Logger;
35 |
36 | /**
37 | * Install Hadoop Ruby DSL
38 | *
39 | * @author Koichi Fujikawa
40 | */
41 | @Extension
42 | public class ItemListenerImpl extends ItemListener {
43 |
44 | @Override
45 | public void onLoaded() {
46 | try {
47 | LOGGER.log(Level.INFO, "install start for Hadoop Ruby");
48 | StreamTaskListener listener = new StreamTaskListener(System.out);
49 | File rootDir = Hudson.getInstance().getRootDir();
50 | rootDir = new File(rootDir, "hadoop-ruby");
51 | FilePath distDir = new FilePath(rootDir);
52 | distDir.installIfNecessaryFrom(ItemListenerImpl.class
53 | .getResource("hadoop-ruby.tgz"), listener, "Hadoop Ruby");
54 | LOGGER.log(Level.INFO, "install finished for Hadoop Ruby");
55 |
56 | } catch (Exception e) {
57 | LOGGER.log(Level.WARNING, "Failed to install Hadoop Ruby", e);
58 | }
59 | }
60 |
61 | private static final Logger LOGGER = Logger
62 | .getLogger(ItemListenerImpl.class.getName());
63 | }
64 |
--------------------------------------------------------------------------------
/spec/mapred_factory_spec.rb:
--------------------------------------------------------------------------------
1 | require File.join(File.dirname(__FILE__) , 'spec_helper')
2 | require 'mapred_factory'
3 |
4 | include HadoopDsl
5 |
6 | describe 'MapRed Factory' do
7 | before(:each) do
8 | @script = create_tmp_script("dsl 'LogAnalysis'")
9 | end
10 |
11 | it 'can create mapper' do
12 | mapper = MapperFactory.create(@script, nil, nil)
13 | mapper.class.should == LogAnalysis::LogAnalysisMapper
14 | end
15 |
16 | it 'can create reducer' do
17 | reducer = ReducerFactory.create(@script, nil, nil)
18 | reducer.class.should == LogAnalysis::LogAnalysisReducer
19 | end
20 |
21 | it 'can create setup' do
22 | conf = mock('conf')
23 | conf.should_receive(:output_key_class=).once
24 | conf.should_receive(:output_value_class=).once
25 | s = SetupFactory.create(create_tmp_script("dsl 'HiveLike'"), conf)
26 | s.class.should == HiveLike::HiveLikeSetup
27 | end
28 |
29 | it 'can create base if not exists in specific DSL' do
30 | s = SetupFactory.create(create_tmp_script("dsl 'WordCount'"), nil)
31 | s.class.should == BaseSetup
32 | end
33 |
34 | it 'specify dsl name from script' do
35 | dsl_name = MapRedFactory.dsl_name(@script)
36 | dsl_name.should == 'LogAnalysis'
37 | end
38 |
39 | it 'can convert dsl name to dsl lib file and require' do
40 | dsl_name = MapRedFactory.dsl_name(@script)
41 | MapRedFactory.require_dsl_lib(dsl_name).should_not be_nil
42 | LogAnalysis::LogAnalysisMapper
43 | end
44 |
45 | it 'can create mapper if statement has double quote' do
46 | script = create_tmp_script(%Q!dsl "LogAnalysis"!)
47 | mapper = MapperFactory.create(script, nil, nil)
48 | mapper.class.should == LogAnalysis::LogAnalysisMapper
49 | end
50 |
51 | it 'can create mapper if exists more space' do
52 | script = create_tmp_script(%Q! dsl "LogAnalysis" !)
53 | mapper = MapperFactory.create(script, nil, nil)
54 | mapper.class.should == LogAnalysis::LogAnalysisMapper
55 | end
56 |
57 | it 'can create mapper if exists bracket' do
58 | script = create_tmp_script(%Q! dsl ("LogAnalysis") !)
59 | mapper = MapperFactory.create(script, nil, nil)
60 | mapper.class.should == LogAnalysis::LogAnalysisMapper
61 | end
62 |
63 | it 'can create mapper from class name cache' do
64 | mapper = MapperFactory.create(@script, nil, nil)
65 | mapper2 = MapperFactory.create(@script, nil, nil)
66 | mapper.class.should == mapper2.class
67 | end
68 |
69 | it 'can create reducer from class name cache' do
70 | reducer = ReducerFactory.create(@script, nil, nil)
71 | reducer2 = ReducerFactory.create(@script, nil, nil)
72 | reducer.class.should == reducer2.class
73 | end
74 | end
75 |
--------------------------------------------------------------------------------
/spec/example_spec.rb:
--------------------------------------------------------------------------------
1 | require 'log_analysis'
2 | require 'word_count'
3 | require 'hive_like'
4 |
5 | include HadoopDsl::LogAnalysis
6 | describe 'Apache Log Example' do
7 | before(:all) do
8 | @script = File.join(File.dirname(__FILE__), '..', 'examples', 'log_analysis_test.rb')
9 | @bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
10 | @value = %Q!127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "-" "#{@bot_ua}"!
11 | end
12 |
13 | it 'can run example by mapper' do
14 | mapper = LogAnalysisMapper.new(@script, nil, @value)
15 | mapper.run
16 | mapper.emitted.first.should == {"ua\t#{@bot_ua}" => 1}
17 | end
18 |
19 | it 'can run example by reducer' do
20 | reducer = LogAnalysisReducer.new(@script, "ua\tChrome", [1, 1, 1])
21 | reducer.run
22 | reducer.emitted.first["ua\tChrome"].should == 3
23 | end
24 | end
25 |
26 | include HadoopDsl::WordCount
27 | describe 'Word Count Example' do
28 | before(:all) do
29 | @script = File.join(File.dirname(__FILE__), '..', 'examples', 'word_count_test.rb')
30 | @value = 'Lorem ipsum ipsum Lorem sit amet,'
31 | end
32 |
33 | it 'can run example by mapper' do
34 | mapper = WordCountMapper.new(@script, nil, @value)
35 | mapper.run
36 | mapper.emitted.size.should == 9
37 | mapper.emitted.each do |e|
38 | case e.keys.first
39 | when 'Lorem'
40 | e.values.first.should == 1
41 | when 'total words'
42 | e.values.first.should == 6
43 | end
44 | end
45 | end
46 |
47 | it 'can run example by reducer' do
48 | reducer = WordCountReducer.new(@script, "Lorem", [1, 1, 1])
49 | reducer.run
50 | reducer.emitted.first["Lorem"].should == 3
51 | end
52 | end
53 |
54 | include HadoopDsl::HiveLike
55 | describe 'Hive Like Example' do
56 | before(:all) do
57 | @script = File.join(File.dirname(__FILE__), '..', 'examples', 'hive_like_test.rb')
58 | @value = 'apple, 3, 100'
59 | end
60 |
61 | it 'can run setup' do
62 | conf = mock('conf')
63 | conf.should_receive(:output_key_class=).once
64 | conf.should_receive(:output_value_class=).once
65 |
66 | setup = HiveLikeSetup.new(@script, conf)
67 | setup.run
68 | setup.paths[0].should == 'hive-like/items.txt'
69 | end
70 |
71 | it 'can run example by mapper' do
72 | mapper = HiveLikeMapper.new(@script, nil, @value)
73 | mapper.run
74 | mapper.emitted.size.should == 1
75 | mapper.emitted.first['items'].should == '3, 100, apple'
76 | end
77 |
78 | it 'can run example by reducer' do
79 | values = ['v1', 'v2', 'v3']
80 | reducer = HiveLikeReducer.new(@script, "items", values)
81 | reducer.run
82 | reducer.emitted.first["items"].should == 'v1'
83 | end
84 | end
85 |
--------------------------------------------------------------------------------
/spec/word_count_spec.rb:
--------------------------------------------------------------------------------
1 | require File.join(File.dirname(__FILE__), 'spec_helper')
2 | require 'word_count'
3 |
4 | include HadoopDsl::WordCount
5 |
6 | describe WordCountMapper do
7 | it 'should count uniq' do
8 | value = 'Lorem ipsum Lorem sit amet,'
9 | mapper = WordCountMapper.new(nil, nil, value)
10 |
11 | mapper.count_uniq
12 | mapper.emitted[0].should == {'Lorem' => 1}
13 | mapper.emitted[1].should == {'ipsum' => 1}
14 | mapper.emitted[2].should == {'Lorem' => 1}
15 | end
16 |
17 | it 'should count total bytes' do
18 | value = 'Lorem ipsum Lorem sit amet,'
19 | mapper = WordCountMapper.new(nil, nil, value)
20 |
21 | mapper.total :bytes
22 | mapper.emitted[0].should == {"#{TOTAL_PREFIX}total bytes" => 23}
23 | end
24 |
25 | it 'should count total words' do
26 | value = 'Lorem ipsum Lorem sit amet,'
27 | mapper = WordCountMapper.new(nil, nil, value)
28 |
29 | mapper.total :words
30 | mapper.emitted[0].should == {"#{TOTAL_PREFIX}total words" => 5}
31 | end
32 |
33 | it 'should count total lines' do
34 | value = 'Lorem ipsum Lorem sit amet,'
35 | mapper = WordCountMapper.new(nil, nil, value)
36 |
37 | mapper.total :lines
38 | mapper.emitted[0].should == {"#{TOTAL_PREFIX}total lines" => 1}
39 | end
40 |
41 | it 'should count total bytes, words, lines' do
42 | value = 'Lorem ipsum Lorem sit amet,'
43 | mapper = WordCountMapper.new(nil, nil, value)
44 |
45 | mapper.total :bytes, :words, :lines
46 | mapper.emitted[0].should == {"#{TOTAL_PREFIX}total bytes" => 23}
47 | mapper.emitted[1].should == {"#{TOTAL_PREFIX}total words" => 5}
48 | mapper.emitted[2].should == {"#{TOTAL_PREFIX}total lines" => 1}
49 | end
50 | end
51 |
52 | describe WordCountReducer do
53 | it 'should count uniq' do
54 | key = 'Lorem'
55 | values = [1, 1, 1]
56 | reducer = WordCountReducer.new(nil, key, values)
57 |
58 | reducer.count_uniq
59 | reducer.emitted[0].should == {'Lorem' => 3}
60 | end
61 |
62 | it 'should count total bytes' do
63 | key = "#{TOTAL_PREFIX}total bytes"
64 | values = [12, 23, 45]
65 | reducer = WordCountReducer.new(nil, key, values)
66 |
67 | reducer.total :bytes
68 | reducer.emitted[0].should == {"#{TOTAL_PREFIX}total bytes" => 12 + 23 + 45}
69 | end
70 |
71 | it 'should count total words' do
72 | key = "#{TOTAL_PREFIX}total words"
73 | values = [3, 4, 5]
74 | reducer = WordCountReducer.new(nil, key, values)
75 |
76 | reducer.total :words
77 | reducer.emitted[0].should == {"#{TOTAL_PREFIX}total words" => 3 + 4 + 5}
78 | end
79 |
80 | it 'should count total lines' do
81 | key = "#{TOTAL_PREFIX}total lines"
82 | values = [1, 2, 3]
83 | reducer = WordCountReducer.new(nil, key, values)
84 |
85 | reducer.total :lines
86 | reducer.emitted[0].should == {"#{TOTAL_PREFIX}total lines" => 6}
87 | end
88 | end
89 |
--------------------------------------------------------------------------------
/lib/hive_like.rb:
--------------------------------------------------------------------------------
1 | require 'hadoop_dsl'
2 |
3 | module HadoopDsl::HiveLike
4 | # common
5 | module HiveLikeMapRed
6 | def pre_process(body)
7 | processed = ""
8 | body.each_line do |line|
9 | next if line =~ /^#/
10 | if line =~ /^(\w*)\s+(.*);$/
11 | method = $1
12 | args = sprit_and_marge_args($2)
13 | processed << "#{method}(#{args})\n"
14 | else
15 | processed << line + "\n" if line
16 | end
17 | end
18 | processed
19 | end
20 |
21 | def sprit_and_marge_args(raw)
22 | raw.gsub(/[\(\)]/, ' ').split.map do |s|
23 | stripped = s.gsub(/[\s,"']/, '')
24 | %Q!"#{stripped}"!
25 | end.join(", ")
26 | end
27 | end
28 |
29 | # controller
30 | class HiveLikeSetup < HadoopDsl::BaseSetup
31 | def load_data(inputs, table)
32 | @from = inputs
33 | @to = inputs.gsub(/#{File.basename(inputs)}$/, 'outputs')
34 | end
35 |
36 | def output_format
37 | @conf.output_key_class = HadoopDsl::Text
38 | @conf.output_value_class = HadoopDsl::Text
39 | end
40 |
41 | # might not need but occur error if not exists
42 | def select(*args) end
43 |
44 | include HiveLikeMapRed
45 | end
46 |
47 | class HiveLikeMapper < HadoopDsl::BaseMapper
48 | def initialize(script, key, value)
49 | super(script, HiveLikeMapperModel.new(key, value))
50 | end
51 |
52 | include HiveLikeMapRed
53 |
54 | def_delegators :@model, :create_table, :table
55 |
56 | # emitters
57 | def select(*args)
58 | from_index = args.index('from')
59 | if from_index
60 | values = args[0...from_index].map do |column|
61 | splitted = @model.value.split(/[,\s]+/)
62 | splitted[@model.table.columns.index(column)]
63 | end
64 | emit(args[from_index + 1] => values.join(", "))
65 | end
66 | end
67 | end
68 |
69 | class HiveLikeReducer < HadoopDsl::BaseReducer
70 | def initialize(script, key, values)
71 | super(script, HiveLikeReducerModel.new(key, values))
72 | end
73 |
74 | include HiveLikeMapRed
75 |
76 | # emitters
77 | def select(*args) identity end
78 | end
79 |
80 | # model
81 | class HiveLikeMapperModel < HadoopDsl::BaseMapperModel
82 | attr_reader :table
83 |
84 | def create_table(name, *column_and_type)
85 | @table = Table.new(name)
86 | column_and_type.each_with_index do |column, index|
87 | next if index % 2 != 0 # type
88 | @table.columns << column_and_type[index]
89 | end
90 | end
91 |
92 | class Table
93 | attr_reader :name, :columns
94 |
95 | def initialize(name)
96 | @name = name
97 | @columns = []
98 | end
99 |
100 | def column(index) @columns[index] end
101 | end
102 | end
103 |
104 | class HiveLikeReducerModel < HadoopDsl::BaseReducerModel
105 | end
106 | end
107 |
--------------------------------------------------------------------------------
/hadoop-papyrus.gemspec:
--------------------------------------------------------------------------------
1 | # Generated by jeweler
2 | # DO NOT EDIT THIS FILE DIRECTLY
3 | # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4 | # -*- encoding: utf-8 -*-
5 |
6 | Gem::Specification.new do |s|
7 | s.name = %q{hadoop-papyrus}
8 | s.version = "0.0.6"
9 |
10 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11 | s.authors = ["Koichi Fujikawa"]
12 | s.date = %q{2010-02-09}
13 | s.default_executable = %q{papyrus}
14 | s.description = %q{Hadoop papyrus - Ruby DSL for Hadoop}
15 | s.email = %q{fujibee@gmail.com}
16 | s.executables = ["papyrus"]
17 | s.extra_rdoc_files = [
18 | "README.rdoc"
19 | ]
20 | s.files = [
21 | ".gitignore",
22 | "README.rdoc",
23 | "Rakefile",
24 | "VERSION",
25 | "bin/papyrus",
26 | "conf/hadoop-site.xml",
27 | "contrib/hudson/bin/hadoop",
28 | "contrib/hudson/bin/hadoop-papyrus.sh",
29 | "contrib/hudson/conf/hadoop-site.xml",
30 | "contrib/hudson/plugins/hadoop-ruby/.gitignore",
31 | "contrib/hudson/plugins/hadoop-ruby/pom.xml",
32 | "contrib/hudson/plugins/hadoop-ruby/src/main/java/hudson/plugins/hadoop/ruby/HadoopRuby.java",
33 | "contrib/hudson/plugins/hadoop-ruby/src/main/java/hudson/plugins/hadoop/ruby/ItemListenerImpl.java",
34 | "contrib/hudson/plugins/hadoop-ruby/src/main/resources/hudson/plugins/hadoop/ruby/HadoopRuby/config.jelly",
35 | "contrib/hudson/plugins/hadoop-ruby/src/main/resources/index.jelly",
36 | "contrib/hudson/plugins/hadoop-ruby/src/main/webapp/help.html",
37 | "examples/hive_like_test.rb",
38 | "examples/log_analysis_test.rb",
39 | "examples/word_count_test.rb",
40 | "hadoop-papyrus.gemspec",
41 | "lib/core.rb",
42 | "lib/dsl_init.rb",
43 | "lib/hadoop_dsl.rb",
44 | "lib/hadoop_dsl_client.rb",
45 | "lib/hive_like.rb",
46 | "lib/log_analysis.rb",
47 | "lib/mapred_factory.rb",
48 | "lib/util.rb",
49 | "lib/word_count.rb"
50 | ]
51 | s.homepage = %q{http://github.com/fujibee/hadoop-papyrus}
52 | s.rdoc_options = ["--charset=UTF-8"]
53 | s.require_paths = ["lib"]
54 | s.rubygems_version = %q{1.3.5}
55 | s.summary = %q{Hadoop papyrus}
56 | s.test_files = [
57 | "spec/spec_helper.rb",
58 | "spec/dsl_init_spec.rb",
59 | "spec/core_spec.rb",
60 | "spec/client_spec.rb",
61 | "spec/util_spec.rb",
62 | "spec/mapred_factory_spec.rb",
63 | "spec/word_count_spec.rb",
64 | "spec/hive_like_spec.rb",
65 | "spec/log_analysis_spec.rb",
66 | "spec/example_spec.rb",
67 | "examples/hive_like_test.rb",
68 | "examples/log_analysis_test.rb",
69 | "examples/word_count_test.rb"
70 | ]
71 |
72 | if s.respond_to? :specification_version then
73 | current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
74 | s.specification_version = 3
75 |
76 | if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
77 | s.add_runtime_dependency(%q<jruby-on-hadoop>, [">= 0"])
78 | else
79 | s.add_dependency(%q<jruby-on-hadoop>, [">= 0"])
80 | end
81 | else
82 | s.add_dependency(%q<jruby-on-hadoop>, [">= 0"])
83 | end
84 | end
85 |
86 |
--------------------------------------------------------------------------------
/lib/log_analysis.rb:
--------------------------------------------------------------------------------
1 | require 'hadoop_dsl'
2 | require 'enumerator'
3 |
4 | module HadoopDsl::LogAnalysis
5 | KEY_SEP = "\t"
6 | PREFIX = 'col'
7 | PASS = nil
8 | MODEL_METHODS = [:column, :value]
9 |
10 | # controller
11 | class LogAnalysisMapper < HadoopDsl::BaseMapper
12 | @@reg_cache = {}
13 |
14 | def initialize(script, key, value)
15 | super(script, LogAnalysisMapperModel.new(key, value))
16 | end
17 |
18 | # model methods
19 | def_delegators :@model, *MODEL_METHODS
20 |
21 | def topic(desc, options = {}, &block)
22 | @model.create_topic(desc, options)
23 | yield if block_given?
24 | current_topic
25 | end
26 |
27 | def separate(sep)
28 | parts = case sep
29 | when Symbol
30 | case sep
31 | when :csv
32 | require 'csv'
33 | CSV.parse(value).flatten
34 | when :tsv then value.split("\t")
35 | else raise "no supported separator #{sep}"
36 | end
37 | when String then value.split(sep)
38 | end
39 | @model.create_or_replace_columns_with(parts) {|column, value| column.value = value}
40 | end
41 |
42 | def pattern(reg_str)
43 | # try to get RE from cache
44 | cached = @@reg_cache[reg_str]
45 | re = cached ? @@reg_cache[reg_str] : Regexp.new(reg_str)
46 | @@reg_cache[reg_str] ||= re # new cache
47 |
48 | if value =~ re
49 | md = Regexp.last_match
50 | @model.create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
51 | else throw :each_line # non-local exit
52 | end
53 | end
54 |
55 | # column names by String converted to Symbol
56 | def column_name(*names)
57 | sym_names = names.map {|name| name.is_a?(String) ? name.to_sym : name }
58 | @model.create_or_replace_columns_with(sym_names) {|column, name| column.name = name}
59 | end
60 |
61 | def group_by(column_or_value)
62 | case column_or_value
63 | when LogAnalysisMapperModel::Column
64 | column = column_or_value
65 | current_topic.key_elements << column.value
66 | else
67 | value = column_or_value
68 | current_topic.key_elements << value
69 | end
70 | end
71 |
72 | def group_date_by(column, term)
73 | require 'time'
74 | time = parse_time(column.value)
75 | time_key = case term
76 | when :hour_of_day then time.strftime('%H')
77 | when :daily then time.strftime('%Y%m%d')
78 | when :monthly then time.strftime('%Y%m')
79 | when :yearly then time.strftime('%Y')
80 | end
81 | current_topic.key_elements << time_key
82 | end
83 |
84 | # emitters
85 | def count_uniq(column_or_value)
86 | uniq_key =
87 | case column_or_value
88 | when LogAnalysisMapperModel::Column
89 | column = column_or_value
90 | column.value
91 | else column_or_value # value
92 | end
93 | current_topic.key_elements << uniq_key
94 | emit(current_topic.key => 1)
95 | end
96 |
97 | def count
98 | emit(current_topic.key => 1)
99 | end
100 |
101 | def sum(column)
102 | emit(current_topic.key => column.value.to_i)
103 | end
104 |
105 | private
106 | def current_topic; @model.current_topic end
107 |
108 | def parse_time(str)
109 | begin Time.parse(str)
110 | rescue
111 | # apachelog pattern ex) "10/Oct/2000:13:55:36 -0700"
112 | Time.parse($1) if str =~ /^(\d*\/\w*\/\d*):/
113 | end
114 | end
115 | end
116 |
117 | class LogAnalysisReducer < HadoopDsl::BaseReducer
118 | def initialize(script, key, values)
119 | super(script, LogAnalysisReducerModel.new(key, values))
120 | end
121 |
122 | # model methods
123 | def_delegators :@model, *MODEL_METHODS
124 |
125 | def topic(desc, options = {}, &block)
126 | @model.create_topic(desc, options)
127 | yield if block_given?
128 | @model.current_topic
129 | end
130 |
131 | def count_uniq(column)
132 | aggregate_on_topic
133 | end
134 |
135 | def count
136 | aggregate_on_topic
137 | end
138 |
139 | def sum(column)
140 | aggregate_on_topic
141 | end
142 |
143 | private
144 | def aggregate_on_topic
145 | aggregate if @model.topic == @model.current_topic
146 | end
147 |
148 | end
149 |
150 | # model
151 | class LogAnalysisMapperModel < HadoopDsl::BaseMapperModel
152 | attr_reader :current_topic
153 |
154 | def initialize(key, value)
155 | super(key, value)
156 | @columns = ColumnArray.new
157 | @topics = []
158 | end
159 |
160 | def column; @columns end
161 |
162 | def create_topic(desc, options)
163 | @topics << @current_topic = Topic.new(desc, options[:label])
164 | end
165 |
166 | def create_or_replace_columns_with(array, &block)
167 | columns = array.enum_for(:each_with_index).map do |p, i|
168 | c = @columns[i] ? @columns[i] : Column.new(i)
169 | yield c, p
170 | c
171 | end
172 | @columns = ColumnArray.new(columns)
173 | end
174 |
175 | class ColumnArray < Array
176 | def [](key)
177 | case key
178 | when Integer then at(key)
179 | when Symbol then (select {|c| c.name == key}).first
180 | when String then (select {|c| c.name == key.to_sym}).first
181 | end
182 | end
183 | end
184 |
185 | class Column
186 | attr_reader :index
187 | attr_accessor :value, :name
188 |
189 | def initialize(index, value = nil)
190 | @index, @value = index, value
191 | end
192 | end
193 |
194 | class Topic
195 | attr_reader :key_elements
196 |
197 | def initialize(desc, label = nil)
198 | @desc, @label = desc, label
199 | @key_elements = []
200 | end
201 |
202 | def label
203 | @label || @desc.gsub(/\s/, '_')
204 | end
205 |
206 | def key
207 | without_label =
208 | @key_elements.size > 0 ? @key_elements.join(KEY_SEP) : nil
209 | [label, without_label].compact.join(KEY_SEP)
210 | end
211 | end
212 | end
213 |
214 | class LogAnalysisReducerModel < HadoopDsl::BaseReducerModel
215 | attr_reader :topic, :current_topic
216 |
217 | def initialize(key, values)
218 | super(key, values)
219 | if key =~ /(\w*)#{KEY_SEP}?(.*)/
220 | @topic = Topic.new($1, values)
221 | end
222 | end
223 |
224 | def create_topic(desc, options)
225 | @current_topic = Topic.new(options[:label] || desc.gsub(/\s/, '_'), nil)
226 | end
227 |
228 | class Topic
229 | attr_reader :label, :values
230 |
231 | def initialize(label, values)
232 | @label, @values = label, values
233 | end
234 |
235 | def ==(rh) self.label == rh.label end
236 | end
237 | end
238 | end
239 |
--------------------------------------------------------------------------------
/spec/log_analysis_spec.rb:
--------------------------------------------------------------------------------
1 | require File.join(File.dirname(__FILE__), 'spec_helper')
2 | require 'log_analysis'
3 |
4 | include HadoopDsl::LogAnalysis
5 |
6 | describe LogAnalysisMapper do
7 | before do
8 | @apache_log = '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326'
9 | end
10 |
11 | it 'should separate data by space' do
12 | value = 'Lorem ipsum dolor sit amet,'
13 | mapper = LogAnalysisMapper.new(nil, nil, value)
14 | mapper.separate(' ')
15 |
16 | mapper.column[1].value.should == 'ipsum'
17 | end
18 |
19 | it 'should separate by pattern' do
20 | mapper = LogAnalysisMapper.new(nil, nil, @apache_log)
21 | mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
22 |
23 | mapper.column[2].value.should == 'frank'
24 | end
25 |
26 | it 'should separate by comma (CSV) with csv library' do
27 | value = '"Lorem","ip,sum","dolor","sit","amet"'
28 | mapper = LogAnalysisMapper.new(nil, nil, value)
29 | mapper.separate(:csv)
30 |
31 | require('csv').should be_false # already required
32 | mapper.column[1].value.should == 'ip,sum'
33 | end
34 |
35 | it 'should separate by tab char (TSV)' do
36 | value = "Lorem\tipsum\tdolor\tsit\tamet,"
37 | mapper = LogAnalysisMapper.new(nil, nil, value)
38 | mapper.separate(:tsv)
39 |
40 | mapper.column[4].value.should == 'amet,'
41 | end
42 |
43 | it 'should not separate by non support separator' do
44 | value = 'Lorem ipsum dolor sit amet,'
45 | mapper = LogAnalysisMapper.new(nil, nil, value)
46 | lambda { mapper.separate(:nonsupport) }.should raise_error
47 | end
48 |
49 | it 'should non-local exit if cannot separate by pattern' do
50 | mapper = LogAnalysisMapper.new(nil, nil, @apache_log + " a")
51 | mapper.each_line do
52 | mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)$/
53 | fail 'should not be reached'
54 | end
55 | mapper.column[0].should be_nil
56 | end
57 |
58 | it 'should label column name by string' do
59 | mapper = LogAnalysisMapper.new(nil, nil, @apache_log)
60 | mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
61 | mapper.column_name 'remote_host', PASS, 'user', 'access_date', 'request', 'status', 'bytes'
62 |
63 | mapper.column['user'].value.should == 'frank'
64 | end
65 |
66 | it 'should label column name by symbol' do
67 | mapper = LogAnalysisMapper.new(nil, nil, @apache_log)
68 | mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
69 | mapper.column_name :remote_host, PASS, :user, :access_date, :request, :status, :bytes
70 |
71 | mapper.column[:user].value.should == 'frank'
72 | end
73 |
74 | it 'should count uniq by column' do
75 | value = 'count uniq'
76 | mapper = LogAnalysisMapper.new(nil, nil, value)
77 | mapper.separate(' ')
78 | mapper.topic('t1') { mapper.count_uniq mapper.column[1] }
79 |
80 | mapper.emitted.should == [{"t1\tuniq" => 1}]
81 | end
82 |
83 | it 'should count uniq by value' do
84 | value = 'count uniq'
85 | mapper = LogAnalysisMapper.new(nil, nil, value)
86 | mapper.separate(' ')
87 | mapper.topic('t1') { mapper.count_uniq 'orig value' }
88 |
89 | mapper.emitted.should == [{"t1\torig value" => 1}]
90 | end
91 |
92 | it 'should just count' do
93 | value = 'count only'
94 | mapper = LogAnalysisMapper.new(nil, nil, value)
95 | mapper.separate(' ')
96 | mapper.topic('t1') { mapper.count }
97 |
98 | mapper.emitted.should == [{"t1" => 1}]
99 | end
100 |
101 | it 'should sum column value' do
102 | value = 'sum 123'
103 | mapper = LogAnalysisMapper.new(nil, nil, value)
104 | mapper.separate(' ')
105 | mapper.topic('t1') { mapper.sum mapper.column[1] }
106 |
107 | mapper.emitted.first["t1"].should == 123
108 | end
109 |
110 | it 'has a topic which returns the given label' do
111 | value = 'Lorem ipsum dolor sit amet,'
112 | mapper = LogAnalysisMapper.new(nil, nil, value)
113 | mapper.separate(' ')
114 |
115 | topic = mapper.topic('desc', :label => 'label')
116 | topic.label.should == 'label'
117 | end
118 |
119 | it 'has a topic whose label defaults to the description' do
120 | value = 'Lorem ipsum dolor sit amet,'
121 | mapper = LogAnalysisMapper.new(nil, nil, value)
122 | mapper.separate(' ')
123 |
124 | topic = mapper.topic('desc')
125 | topic.label.should == 'desc'
126 | end
127 |
128 | it 'has a topic whose default label replaces spaces with underscores' do
129 | value = 'Lorem ipsum dolor sit amet,'
130 | mapper = LogAnalysisMapper.new(nil, nil, value)
131 | mapper.separate(' ')
132 |
133 | topic = mapper.topic('desc with space')
134 | topic.label.should == 'desc_with_space'
135 | end
136 |
137 | it 'can group dates by yearly, monthly, daily and hour-of-day terms' do
138 | value = "2010/1/1 21:23:10\tnewyearday"
139 | mapper = LogAnalysisMapper.new(nil, nil, value)
140 | mapper.separate("\t")
141 | mapper.column_name 'date', 'holiday'
142 |
143 | ['yearly', 'monthly', 'daily', 'hour_of_day'].each do |term|
144 | mapper.topic(term) do
145 | mapper.group_date_by mapper.column[:date], term.to_sym
146 | mapper.count_uniq mapper.column[:holiday]
147 | end
148 | end
149 | mapper.emitted.should ==
150 | [
151 | {"yearly\t2010\tnewyearday" => 1},
152 | {"monthly\t201001\tnewyearday" => 1},
153 | {"daily\t20100101\tnewyearday" => 1},
154 | {"hour_of_day\t21\tnewyearday" => 1}
155 | ]
156 | end
157 |
158 | it 'can group by a column' do
159 | value = '1 sub_2 bingo!'
160 | mapper = LogAnalysisMapper.new(nil, nil, value)
161 | mapper.separate(' ')
162 | mapper.column_name 'id', 'sub_id', 'data'
163 |
164 | mapper.topic('test') do
165 | mapper.group_by mapper.column[:sub_id]
166 | mapper.count_uniq mapper.column[:data]
167 | end
168 | mapper.emitted.should == [{"test\tsub_2\tbingo!" => 1}]
169 | end
170 | end
171 |
172 | Topic = LogAnalysisMapperModel::Topic
173 | describe Topic do
174 | it 'can get key with label' do
175 | t = Topic.new('label')
176 | t.key.should == 'label'
177 | end
178 |
179 | it 'can get key with label and elements' do
180 | t = Topic.new('label')
181 | t.key_elements << 'e1'
182 | t.key_elements << 'e2'
183 | t.key.should == "label\te1\te2"
184 | end
185 | end
186 |
187 | describe LogAnalysisReducer do
188 | it 'should count uniq in the topic' do
189 | key = "t1\tuniq"
190 | values = [1, 1, 1]
191 | reducer = LogAnalysisReducer.new(nil, key, values)
192 | reducer.separate(' ')
193 | reducer.topic('t1') { reducer.count_uniq(nil) }
194 |
195 | reducer.emitted.first["t1\tuniq"].should == 3
196 | end
197 |
198 | it 'should not count uniq for other topics' do
199 | key = "t2\tuniq"
200 | values = [1, 1, 1]
201 | reducer = LogAnalysisReducer.new(nil, key, values)
202 | reducer.separate(' ')
203 | reducer.topic('t1') { reducer.count_uniq(nil) }
204 |
205 | reducer.emitted.first.should be_nil
206 | end
207 |
208 | it 'should sum column value' do
209 | key = "t1"
210 | values = [123, 456, 789]
211 | reducer = LogAnalysisReducer.new(nil, key, values)
212 | reducer.separate(' ')
213 | reducer.topic('t1') { reducer.sum(nil) }
214 |
215 | reducer.emitted.first["t1"].should == 123+456+789
216 | end
217 | end
218 |
--------------------------------------------------------------------------------
/contrib/hudson/bin/hadoop:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Licensed to the Apache Software Foundation (ASF) under one or more
4 | # contributor license agreements. See the NOTICE file distributed with
5 | # this work for additional information regarding copyright ownership.
6 | # The ASF licenses this file to You under the Apache License, Version 2.0
7 | # (the "License"); you may not use this file except in compliance with
8 | # the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 |
19 | # The Hadoop command script
20 | #
21 | # Environment Variables
22 | #
23 | # JAVA_HOME The java implementation to use. Required.
24 | #
25 | # HADOOP_CLASSPATH Extra Java CLASSPATH entries.
26 | #
27 | # HADOOP_HEAPSIZE The maximum amount of heap to use, in MB.
28 | # Default is 1000.
29 | #
30 | # HADOOP_OPTS Extra Java runtime options.
31 | #
32 | # HADOOP_NAMENODE_OPTS, HADOOP_CLIENT_OPTS,
33 | # HADOOP_{COMMAND}_OPTS etc. These options are added to
34 | #   HADOOP_OPTS when the respective command is run.
35 | #   HADOOP_JT_OPTS applies to the JobTracker, while e.g.
36 | #   HADOOP_CLIENT_OPTS applies to more than one command
37 | #   (fs, dfs, fsck, dfsadmin etc.)
38 | #
39 | # HADOOP_CONF_DIR Alternate conf dir. Default is ${HADOOP_HOME}/conf.
40 | #
41 | # HADOOP_ROOT_LOGGER The root appender. Default is INFO,console
42 | #
43 |
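#
# A quick usage sketch (hypothetical values; adjust paths and sizes for your install):
#   export HADOOP_CONF_DIR=/etc/hadoop/conf   # alternate conf dir (see above)
#   export HADOOP_HEAPSIZE=2000               # max heap in MB, default 1000
#   bin/hadoop fs -ls /                       # run a filesystem client command
#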
44 | bin=`dirname "$0"`
45 | bin=`cd "$bin"; pwd`
46 |
47 | if [ -f "$bin"/hadoop-config.sh ]; then
48 | . "$bin"/hadoop-config.sh
49 | fi
50 |
51 | cygwin=false
52 | case "`uname`" in
53 | CYGWIN*) cygwin=true;;
54 | esac
55 |
56 | # if no args specified, show usage
57 | if [ $# = 0 ]; then
58 | echo "Usage: hadoop [--config confdir] COMMAND"
59 | echo "where COMMAND is one of:"
60 | echo " namenode -format format the DFS filesystem"
61 | echo " secondarynamenode run the DFS secondary namenode"
62 | echo " namenode run the DFS namenode"
63 | echo " datanode run a DFS datanode"
64 | echo " dfsadmin run a DFS admin client"
65 | echo " fsck run a DFS filesystem checking utility"
66 | echo " fs run a generic filesystem user client"
67 | echo " balancer run a cluster balancing utility"
68 | echo " jobtracker run the MapReduce job Tracker node"
69 | echo " pipes run a Pipes job"
70 | echo " tasktracker run a MapReduce task Tracker node"
71 | echo " job manipulate MapReduce jobs"
72 | echo " queue get information regarding JobQueues"
73 | echo " version print the version"
74 | echo " jar run a jar file"
75 | echo " distcp copy file or directories recursively"
76 | echo " archive -archiveName NAME * create a hadoop archive"
77 | echo " daemonlog get/set the log level for each daemon"
78 | echo " or"
79 | echo " CLASSNAME run the class named CLASSNAME"
80 | echo "Most commands print help when invoked w/o parameters."
81 | exit 1
82 | fi
83 |
84 | # get arguments
85 | COMMAND=$1
86 | shift
87 |
88 | if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then
89 | . "${HADOOP_CONF_DIR}/hadoop-env.sh"
90 | fi
91 |
92 | # some Java parameters
93 | if [ "$JAVA_HOME" != "" ]; then
94 | #echo "run java in $JAVA_HOME"
95 | JAVA_HOME=$JAVA_HOME
96 | fi
97 |
98 | if [ "$JAVA_HOME" = "" ]; then
99 | echo "Error: JAVA_HOME is not set."
100 | exit 1
101 | fi
102 |
103 | JAVA=$JAVA_HOME/bin/java
104 | JAVA_HEAP_MAX=-Xmx1000m
105 |
106 | # check envvars which might override default args
107 | if [ "$HADOOP_HEAPSIZE" != "" ]; then
108 | #echo "run with heapsize $HADOOP_HEAPSIZE"
109 | JAVA_HEAP_MAX="-Xmx""$HADOOP_HEAPSIZE""m"
110 | #echo $JAVA_HEAP_MAX
111 | fi
112 |
113 | # CLASSPATH initially contains $HADOOP_CONF_DIR
114 | CLASSPATH="${HADOOP_CONF_DIR}"
115 | CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
116 |
117 | # for developers, add Hadoop classes to CLASSPATH
118 | if [ -d "$HADOOP_HOME/build/classes" ]; then
119 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/classes
120 | fi
121 | if [ -d "$HADOOP_HOME/build/webapps" ]; then
122 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build
123 | fi
124 | if [ -d "$HADOOP_HOME/build/test/classes" ]; then
125 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/test/classes
126 | fi
127 | if [ -d "$HADOOP_HOME/build/tools" ]; then
128 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/tools
129 | fi
130 |
131 | # so that filenames w/ spaces are handled correctly in loops below
132 | IFS=
133 |
134 | # for releases, add core hadoop jar & webapps to CLASSPATH
135 | if [ -d "$HADOOP_HOME/webapps" ]; then
136 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME
137 | fi
138 | for f in $HADOOP_HOME/hadoop-*-core.jar; do
139 | CLASSPATH=${CLASSPATH}:$f;
140 | done
141 |
142 | # add libs to CLASSPATH
143 | for f in $HADOOP_HOME/lib/*.jar; do
144 | CLASSPATH=${CLASSPATH}:$f;
145 | done
146 |
147 | for f in $HADOOP_HOME/lib/jetty-ext/*.jar; do
148 | CLASSPATH=${CLASSPATH}:$f;
149 | done
150 |
151 | for f in $HADOOP_HOME/hadoop-*-tools.jar; do
152 | TOOL_PATH=${TOOL_PATH}:$f;
153 | done
154 | for f in $HADOOP_HOME/build/hadoop-*-tools.jar; do
155 | TOOL_PATH=${TOOL_PATH}:$f;
156 | done
157 |
158 | # add user-specified CLASSPATH last
159 | if [ "$HADOOP_CLASSPATH" != "" ]; then
160 | CLASSPATH=${CLASSPATH}:${HADOOP_CLASSPATH}
161 | fi
162 |
163 | # default log directory & file
164 | if [ "$HADOOP_LOG_DIR" = "" ]; then
165 | HADOOP_LOG_DIR="$HADOOP_HOME/logs"
166 | fi
167 | if [ "$HADOOP_LOGFILE" = "" ]; then
168 | HADOOP_LOGFILE='hadoop.log'
169 | fi
170 |
171 | # restore ordinary behaviour
172 | unset IFS
173 |
174 | # figure out which class to run
175 | if [ "$COMMAND" = "namenode" ] ; then
176 | CLASS='org.apache.hadoop.hdfs.server.namenode.NameNode'
177 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_NAMENODE_OPTS"
178 | elif [ "$COMMAND" = "secondarynamenode" ] ; then
179 | CLASS='org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode'
180 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_SECONDARYNAMENODE_OPTS"
181 | elif [ "$COMMAND" = "datanode" ] ; then
182 | CLASS='org.apache.hadoop.hdfs.server.datanode.DataNode'
183 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_DATANODE_OPTS"
184 | elif [ "$COMMAND" = "fs" ] ; then
185 | CLASS=org.apache.hadoop.fs.FsShell
186 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
187 | elif [ "$COMMAND" = "dfs" ] ; then
188 | CLASS=org.apache.hadoop.fs.FsShell
189 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
190 | elif [ "$COMMAND" = "dfsadmin" ] ; then
191 | CLASS=org.apache.hadoop.hdfs.tools.DFSAdmin
192 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
193 | elif [ "$COMMAND" = "fsck" ] ; then
194 | CLASS=org.apache.hadoop.hdfs.tools.DFSck
195 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
196 | elif [ "$COMMAND" = "balancer" ] ; then
197 | CLASS=org.apache.hadoop.hdfs.server.balancer.Balancer
198 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_BALANCER_OPTS"
199 | elif [ "$COMMAND" = "jobtracker" ] ; then
200 | CLASS=org.apache.hadoop.mapred.JobTracker
201 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_JOBTRACKER_OPTS"
202 | elif [ "$COMMAND" = "tasktracker" ] ; then
203 | CLASS=org.apache.hadoop.mapred.TaskTracker
204 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_TASKTRACKER_OPTS"
205 | elif [ "$COMMAND" = "job" ] ; then
206 | CLASS=org.apache.hadoop.mapred.JobClient
207 | elif [ "$COMMAND" = "queue" ] ; then
208 | CLASS=org.apache.hadoop.mapred.JobQueueClient
209 | elif [ "$COMMAND" = "pipes" ] ; then
210 | CLASS=org.apache.hadoop.mapred.pipes.Submitter
211 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
212 | elif [ "$COMMAND" = "version" ] ; then
213 | CLASS=org.apache.hadoop.util.VersionInfo
214 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
215 | elif [ "$COMMAND" = "jar" ] ; then
216 | CLASS=org.apache.hadoop.mapred.JobShell
217 | elif [ "$COMMAND" = "distcp" ] ; then
218 | CLASS=org.apache.hadoop.tools.DistCp
219 | CLASSPATH=${CLASSPATH}:${TOOL_PATH}
220 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
221 | elif [ "$COMMAND" = "daemonlog" ] ; then
222 | CLASS=org.apache.hadoop.log.LogLevel
223 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
224 | elif [ "$COMMAND" = "archive" ] ; then
225 | CLASS=org.apache.hadoop.tools.HadoopArchives
226 | CLASSPATH=${CLASSPATH}:${TOOL_PATH}
227 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
228 | elif [ "$COMMAND" = "sampler" ] ; then
229 | CLASS=org.apache.hadoop.mapred.lib.InputSampler
230 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
231 | else
232 | CLASS=$COMMAND
233 | fi
234 |
235 | # cygwin path translation
236 | if $cygwin; then
237 | CLASSPATH=`cygpath -p -w "$CLASSPATH"`
238 | HADOOP_HOME=`cygpath -d "$HADOOP_HOME"`
239 | HADOOP_LOG_DIR=`cygpath -d "$HADOOP_LOG_DIR"`
240 | TOOL_PATH=`cygpath -p -w "$TOOL_PATH"`
241 | fi
242 | # setup 'java.library.path' for native-hadoop code if necessary
243 | JAVA_LIBRARY_PATH=''
244 | if [ -d "${HADOOP_HOME}/build/native" -o -d "${HADOOP_HOME}/lib/native" ]; then
245 | JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} org.apache.hadoop.util.PlatformName | sed -e "s/ /_/g"`
246 |
247 | if [ -d "$HADOOP_HOME/build/native" ]; then
248 | JAVA_LIBRARY_PATH=${HADOOP_HOME}/build/native/${JAVA_PLATFORM}/lib
249 | fi
250 |
251 | if [ -d "${HADOOP_HOME}/lib/native" ]; then
252 | if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
253 | JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${HADOOP_HOME}/lib/native/${JAVA_PLATFORM}
254 | else
255 | JAVA_LIBRARY_PATH=${HADOOP_HOME}/lib/native/${JAVA_PLATFORM}
256 | fi
257 | fi
258 | fi
259 |
260 | # cygwin path translation
261 | if $cygwin; then
262 | JAVA_LIBRARY_PATH=`cygpath -p "$JAVA_LIBRARY_PATH"`
263 | fi
264 |
265 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.dir=$HADOOP_LOG_DIR"
266 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.file=$HADOOP_LOGFILE"
267 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.home.dir=$HADOOP_HOME"
268 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.id.str=$HADOOP_IDENT_STRING"
269 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.root.logger=${HADOOP_ROOT_LOGGER:-INFO,console}"
270 | if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
271 | HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
272 | fi
273 |
274 | # run it
275 | #echo exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
276 | exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
277 |
--------------------------------------------------------------------------------