├── .gitignore ├── VERSION ├── contrib └── hudson │ ├── plugins │ └── hadoop-ruby │ │ ├── .gitignore │ │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ ├── index.jelly │ │ │ └── hudson │ │ │ │ └── plugins │ │ │ │ └── hadoop │ │ │ │ └── ruby │ │ │ │ └── HadoopRuby │ │ │ │ └── config.jelly │ │ │ ├── webapp │ │ │ └── help.html │ │ │ └── java │ │ │ └── hudson │ │ │ └── plugins │ │ │ └── hadoop │ │ │ └── ruby │ │ │ ├── HadoopRuby.java │ │ │ └── ItemListenerImpl.java │ │ └── pom.xml │ ├── conf │ └── hadoop-site.xml │ └── bin │ ├── hadoop-papyrus.sh │ └── hadoop ├── bin └── papyrus ├── examples ├── word_count_test.rb ├── hive_like_test.rb └── log_analysis_test.rb ├── spec ├── spec_helper.rb ├── client_spec.rb ├── dsl_init_spec.rb ├── util_spec.rb ├── hive_like_spec.rb ├── core_spec.rb ├── mapred_factory_spec.rb ├── example_spec.rb ├── word_count_spec.rb └── log_analysis_spec.rb ├── lib ├── hadoop_dsl.rb ├── dsl_init.rb ├── util.rb ├── hadoop_dsl_client.rb ├── word_count.rb ├── mapred_factory.rb ├── core.rb ├── hive_like.rb └── log_analysis.rb ├── conf └── hadoop-site.xml ├── Rakefile ├── README.rdoc └── hadoop-papyrus.gemspec /.gitignore: -------------------------------------------------------------------------------- 1 | pkg 2 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.0.6 2 | -------------------------------------------------------------------------------- /contrib/hudson/plugins/hadoop-ruby/.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | target 3 | work 4 | rubygems 5 | tmp 6 | -------------------------------------------------------------------------------- /bin/papyrus: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'hadoop_dsl_client' 4 | 5 | HadoopDsl::Client.new(ARGV).run 6 | -------------------------------------------------------------------------------- /examples/word_count_test.rb: -------------------------------------------------------------------------------- 1 | dsl 'WordCount' 2 | 3 | from 'wc/inputs' 4 | to 'wc/outputs' 5 | 6 | count_uniq 7 | total :bytes, :words, :lines 8 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # spec helper 2 | require 'rubygems' 3 | gem 'jruby-on-hadoop' 4 | 5 | require 'tempfile' 6 | 7 | def create_tmp_script(body) 8 | tmp = Tempfile.new('test.rb') 9 | tmp.print body 10 | tmp.close 11 | tmp.path 12 | end 13 | 14 | -------------------------------------------------------------------------------- /examples/hive_like_test.rb: -------------------------------------------------------------------------------- 1 | dsl 'HiveLike' 2 | 3 | # hive-like/items.txt 4 | # apple, 3, 100 5 | # banana, 1, 50 6 | 7 | create_table items(item STRING, quantity INT, price INT); 8 | load_data "hive-like/items.txt" items; 9 | 10 | select quantity, price, item from items; 11 | 12 | # expect 13 | # 0 apple 3 300 14 | # 1 banana 1 50 15 | -------------------------------------------------------------------------------- /contrib/hudson/plugins/hadoop-ruby/src/main/resources/index.jelly: -------------------------------------------------------------------------------- 1 | 6 |
7 | This is the Hadoop Ruby plugin. Build scripts written in Hadoop Ruby are executed by this plugin.
8 |
9 | -------------------------------------------------------------------------------- /lib/hadoop_dsl.rb: -------------------------------------------------------------------------------- 1 | require 'util' 2 | require 'mapred_factory' 3 | require 'core' 4 | 5 | # for jruby 6 | if defined? JRUBY_VERSION 7 | require 'java' 8 | import 'org.apache.hadoop.io.IntWritable' 9 | import 'org.apache.hadoop.io.Text' 10 | 11 | # Hadoop IO types 12 | HadoopDsl::Text = Text 13 | HadoopDsl::IntWritable = IntWritable 14 | end 15 | -------------------------------------------------------------------------------- /contrib/hudson/conf/hadoop-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | fs.default.name 10 | hdfs://localhost:9000/ 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /contrib/hudson/plugins/hadoop-ruby/src/main/resources/hudson/plugins/hadoop/ruby/HadoopRuby/config.jelly: -------------------------------------------------------------------------------- 1 | 3 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /contrib/hudson/plugins/hadoop-ruby/src/main/webapp/help.html: -------------------------------------------------------------------------------- 1 |
2 |
3 | Runs a Hadoop Ruby script (using the ruby interpreter by default) to build the project.
4 | The script will be run with the workspace as the current directory.
5 |
6 |
7 |
8 | The shell will be invoked with the "-v" option, so all of the commands are printed before being executed,
9 | and the build is considered a failure if any of the commands exits with a non-zero exit code.
10 |
11 |
12 | -------------------------------------------------------------------------------- /conf/hadoop-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | fs.default.name 9 | hdfs://localhost:9000/ 10 | 11 | 12 | mapred.job.tracker 13 | localhost:50040 14 | 15 | 16 | mapred.child.java.opts 17 | -Xmx512m 18 | 19 | 20 | -------------------------------------------------------------------------------- /contrib/hudson/bin/hadoop-papyrus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CURRENT_DIR=$(cd $(dirname $0); pwd) 4 | PATH=$CURRENT_DIR:$PATH 5 | 6 | GEM_HOME=$CURRENT_DIR/.. 7 | HADOOP_HOME=$HUDSON_HOME/hadoop/dist 8 | HADOOP_CONF_DIR=$CURRENT_DIR/../conf 9 | JRUBY_JAR_DIR=$GEM_HOME/gems/jruby-jars-1.4.0/lib/ 10 | 11 | export PATH GEM_HOME HADOOP_HOME HADOOP_CONF_DIR 12 | 13 | #echo java -classpath $JRUBY_JAR_DIR/jruby-core-1.4.0.jar:$JRUBY_JAR_DIR/jruby-stdlib-1.4.0.jar org.jruby.Main $CURRENT_DIR/papyrus $1 14 | java -classpath $JRUBY_JAR_DIR/jruby-core-1.4.0.jar:$JRUBY_JAR_DIR/jruby-stdlib-1.4.0.jar org.jruby.Main $CURRENT_DIR/papyrus $1 15 | -------------------------------------------------------------------------------- /lib/dsl_init.rb: -------------------------------------------------------------------------------- 1 | require 'hadoop_dsl' 2 | 3 | include HadoopDsl 4 | 5 | def map(key, value, output, reporter, script) 6 | mapper = MapperFactory.create(script, key, value) 7 | mapper.run 8 | 9 | write(output, mapper) 10 | end 11 | 12 | def reduce(key, values, output, reporter, script) 13 | reducer = ReducerFactory.create(script, key, values) 14 | reducer.run 15 | 16 | write(output, reducer) 17 | end 18 | 19 | def setup(conf, script) 20 | setup = SetupFactory.create(script, conf) 21 | setup.run 22 | setup.paths 23 | end 24 | 25 | private 26 | 27 | def write(output, controller) 28 | controller.emitted.each do |e| 29 | e.each do |k, v| 30 | output.collect(k, v) 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /contrib/hudson/plugins/hadoop-ruby/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | org.jvnet.hudson.plugins 5 | plugin 6 | 1.318 7 | ../pom.xml 8 | 9 | 10 | hadoop-ruby 11 | hpi 12 | 1.1-SNAPSHOT 13 | Hudson Hadoop Ruby Plugin 14 | http://wiki.hudson-ci.org/display/HUDSON/Hadoop+Ruby+Plugin 15 | 16 | -------------------------------------------------------------------------------- /lib/util.rb: -------------------------------------------------------------------------------- 1 | # utility functions 2 | require 'hadoop_dsl' 3 | 4 | module HadoopDsl 5 | # file body cache 6 | # reading file in map/reduce cause critical issues! 
7 | @@file_bodies = {} 8 | 9 | def self.snake_case(str) 10 | str.gsub(/\B[A-Z]/, '_\&').downcase 11 | end 12 | 13 | def self.read_file(file_name) 14 | # use if cached 15 | body = @@file_bodies[file_name] if @@file_bodies[file_name] 16 | 17 | # read as usual 18 | body = File.open(file_name).read rescue nil unless body 19 | 20 | # read from loadpath 21 | unless body 22 | $:.each do |path| 23 | body = File.open(File.join(path, file_name)).read rescue next 24 | break 25 | end 26 | end 27 | 28 | raise "cannot find file - #{file_name}" unless body 29 | 30 | # for cache 31 | @@file_bodies[file_name] = body 32 | body 33 | end 34 | 35 | def self.reset_dsl_file 36 | @@file_bodies = {} 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /spec/client_spec.rb: -------------------------------------------------------------------------------- 1 | require File.join(File.dirname(__FILE__), 'spec_helper') 2 | require 'hadoop_dsl_client' 3 | 4 | describe HadoopDsl::Client do 5 | before do 6 | @client = HadoopDsl::Client.new(["examples/wordcount.rb", "in", "out"]) 7 | end 8 | 9 | it 'can parse args' do 10 | @client.files.join.should match /ruby_wrapper\.rb/ 11 | @client.files.join.should match /dsl_init\.rb/ 12 | @client.files.should include 'examples/wordcount.rb' 13 | @client.inputs.should == 'in' 14 | @client.outputs.should == 'out' 15 | end 16 | 17 | it 'can add dsl file into mapred args' do 18 | @client.mapred_args.should == 19 | "--script dsl_init.rb in out --dslfile wordcount.rb" 20 | end 21 | 22 | it 'can add dsl lib files' do 23 | lib_path = HadoopDsl.lib_path 24 | @client.files.should include File.join(lib_path, 'core.rb') 25 | @client.files.should include File.join(lib_path, 'log_analysis.rb') 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /lib/hadoop_dsl_client.rb: -------------------------------------------------------------------------------- 1 | require 'jruby-on-hadoop' 2 | 3 | module HadoopDsl 4 | def self.lib_path 5 | File.expand_path(File.dirname(__FILE__)) 6 | end 7 | 8 | def self.dsl_init_script 9 | File.join(lib_path, "dsl_init.rb") 10 | end 11 | 12 | class Client < JRubyOnHadoop::Client 13 | def parse_args 14 | super 15 | @script_path = HadoopDsl.dsl_init_script 16 | @script = File.basename(@script_path) 17 | @dsl_file_path = @args[0] 18 | @dsl_file = File.basename(@dsl_file_path) 19 | @files << @script_path << @dsl_file_path 20 | 21 | # TODO move properly, with jruby-on-hadoop 22 | add_dsl_lib_files 23 | ENV['RUBYLIB'] = File.dirname(@dsl_file_path) 24 | end 25 | 26 | def mapred_args 27 | args = super 28 | args += " --dslfile #{@dsl_file}" 29 | args 30 | end 31 | 32 | def add_dsl_lib_files 33 | lib_path = HadoopDsl.lib_path 34 | @files += Dir.glob(File.join(lib_path, "*.rb")) 35 | end 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # spec 2 | require 'rubygems' 3 | require 'spec/rake/spectask' 4 | 5 | Spec::Rake::SpecTask.new do |t| 6 | def hadoop_core_jar 7 | hadoop_home = ENV['HADOOP_HOME'] 8 | Dir.glob("#{hadoop_home}/hadoop-*-core.jar").first 9 | end 10 | 11 | t.libs = ['lib'] 12 | t.spec_opts = ['-c', '-fs', "-r #{hadoop_core_jar}"] 13 | t.spec_files = FileList['spec/**/*_spec.rb'] 14 | end 15 | 16 | # jeweler 17 | begin 18 | require 'jeweler' 19 | Jeweler::Tasks.new do |gemspec| 20 | gemspec.name = "hadoop-papyrus" 21 | 
gemspec.summary = "Hadoop papyrus" 22 | gemspec.description = "Hadoop papyrus - Ruby DSL for Hadoop" 23 | gemspec.email = "fujibee@gmail.com" 24 | gemspec.homepage = "http://github.com/fujibee/hadoop-papyrus" 25 | gemspec.authors = ["Koichi Fujikawa"] 26 | 27 | gemspec.add_dependency 'jruby-on-hadoop' 28 | gemspec.files.exclude "spec/**/*" 29 | end 30 | Jeweler::GemcutterTasks.new 31 | rescue LoadError 32 | puts "Jeweler not available. Install it with: gem install jeweler" 33 | end 34 | 35 | -------------------------------------------------------------------------------- /spec/dsl_init_spec.rb: -------------------------------------------------------------------------------- 1 | require 'dsl_init' 2 | 3 | describe 'mapreduce init' do 4 | 5 | before(:each) do 6 | @script = create_tmp_script(<<-EOF) 7 | dsl 'LogAnalysis' 8 | data 'test' do 9 | from 'test/inputs' 10 | to 'test/outputs' 11 | 12 | separate(" ") 13 | column_name 'c0', 'c1', 'c2', 'c3' 14 | topic 't1' do 15 | count_uniq columns(:c1) 16 | end 17 | end 18 | EOF 19 | end 20 | 21 | before do 22 | @one = 1 23 | @output = mock('output') 24 | end 25 | 26 | it 'can map sucessfully' do 27 | key = 'key' 28 | value = 'it should be fine' 29 | @output.should_receive(:collect).once #.with(@text, @one) 30 | 31 | map(key, value, @output, nil, @script) 32 | end 33 | 34 | it 'can reduce sucessfully' do 35 | key = "t1\tkey" 36 | values = [@one, @one, @one] 37 | @output.should_receive(:collect).once #.with(@text, @one) 38 | 39 | reduce(key, values, @output, nil, @script) 40 | end 41 | 42 | it 'can set job conf' do 43 | conf = mock('jobconf') 44 | paths = setup(conf, @script) 45 | 46 | paths[0].should == 'test/inputs' 47 | paths[1].should == 'test/outputs' 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /spec/util_spec.rb: -------------------------------------------------------------------------------- 1 | require File.join(File.dirname(__FILE__) , 'spec_helper') 2 | require 'util' 3 | 4 | describe 'utilities' do 5 | before do 6 | HadoopDsl.reset_dsl_file 7 | @script_body = 'This is a script body.' 
8 | @script = create_tmp_script(@script_body) 9 | end 10 | 11 | it 'can change camelcase str to snakecase' do 12 | HadoopDsl.snake_case('CamelCaseStr').should == 'camel_case_str' 13 | end 14 | 15 | it 'can read file and get file data to string' do 16 | HadoopDsl.read_file(@script).should == @script_body 17 | end 18 | 19 | it 'raise error if no file in loadpath' do 20 | lambda { HadoopDsl.read_file('not_exists_on_loadpath') }.should raise_error 21 | end 22 | 23 | it 'can load from cache if script is loaded' do 24 | HadoopDsl.read_file(@script).should == @script_body 25 | File.delete(@script) 26 | HadoopDsl.read_file(@script).should == @script_body 27 | end 28 | 29 | it 'can load from each cache even if one script is loaded' do 30 | HadoopDsl.read_file(@script).should == @script_body 31 | another_script = create_tmp_script("another") 32 | HadoopDsl.read_file(another_script).should == "another" 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /examples/log_analysis_test.rb: -------------------------------------------------------------------------------- 1 | dsl 'LogAnalysis' 2 | 3 | data 'apache log on test2' do 4 | from 'apachelog/inputs' 5 | to 'apachelog/outputs' 6 | 7 | # 119.63.199.8 - - [15/Nov/2009:01:18:16 +0900] "GET /ranking/game?page=31 HTTP/1.1" 200 10077 "-" "Baiduspider+(+http://www.baidu.jp/spider/)" 8 | # 203.83.243.81 - - [15/Nov/2009:01:18:33 +0900] "GET /dns_zones.txt HTTP/1.1" 404 294 "-" "libwww-perl/5.65" 9 | 10 | each_line do 11 | pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/ 12 | column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua' 13 | 14 | topic 'ua counts', :label => 'ua' do 15 | count_uniq column[:ua] 16 | end 17 | 18 | topic 'count bot', :label => 'bot' do 19 | ua = column[:ua].value 20 | bot = ua if ua =~ /bot/i 21 | count_uniq bot 22 | end 23 | 24 | topic 'ua counts group by path' do 25 | request = column[:request].value 26 | if request 27 | path = request.split(/\s+/)[1] 28 | group_by path 29 | end 30 | count_uniq column[:ua] 31 | end 32 | 33 | topic 'ua counts by daily' do 34 | group_date_by column[:access_date], :daily 35 | count_uniq column[:ua] 36 | end 37 | 38 | # topic 'total bytes' do 39 | # select_date column[:access_date], BY_MONTHLY 40 | # sum column[:bytes].to_kilobytes # / 1024 41 | # end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /README.rdoc: -------------------------------------------------------------------------------- 1 | = hadoop-papyrus 2 | 3 | Enable to run Ruby DSL script on your Hadoop. 4 | 5 | == Description 6 | 7 | You can write DSL by Ruby to run Hadoop as Mapper / Reducer. 8 | This gem depends on 'jruby-on-hadoop' project. 9 | 10 | == Install 11 | 12 | Required gems are all on GemCutter. 13 | 14 | 1. Upgrade your rubygem to 1.3.5 15 | 2. Install gems 16 | $ gem install hadoop-papyrus 17 | 18 | == Usage 19 | 20 | 1. Run Hadoop cluster on your machines and put your 'hadoop' executable to your PATH or set HADOOP_HOME env variable. 21 | 2. put files into your hdfs. ex) wc/inputs/file1 22 | 3. 
Now you can run 'papyrus' like below: 23 | $ papyrus examples/word_count_test.rb 24 | You can get Hadoop job results in your hdfs wc/outputs/part-* 25 | 26 | == Examples 27 | 28 | Word Count DSL script 29 | dsl 'WordCount' 30 | 31 | from 'wc/inputs' 32 | to 'wc/outputs' 33 | 34 | count_uniq 35 | total :bytes, :words, :lines 36 | 37 | Log Analysis DSL script 38 | dsl 'LogAnalysis' 39 | 40 | data 'apache log on test2' do 41 | from 'apachelog/inputs' 42 | to 'apachelog/outputs' 43 | 44 | each_line do 45 | pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/ 46 | column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua' 47 | 48 | topic 'ua counts', :label => 'ua' do 49 | count_uniq column[:ua] 50 | end 51 | end 52 | end 53 | 54 | == Run spec 55 | Set HADOOP_HOME on your env and run 'jruby -S rake spec' 56 | 57 | == Author 58 | Koichi Fujikawa 59 | 60 | == Copyright 61 | License: Apache License 62 | -------------------------------------------------------------------------------- /lib/word_count.rb: -------------------------------------------------------------------------------- 1 | require 'hadoop_dsl' 2 | require 'enumerator' 3 | 4 | module HadoopDsl::WordCount 5 | MODEL_METHODS = [] 6 | TOTAL_PREFIX = "\t" 7 | 8 | # controller 9 | class WordCountMapper < HadoopDsl::BaseMapper 10 | def initialize(script, key, value) 11 | super(script, WordCountMapperModel.new(key, value)) 12 | end 13 | 14 | # model methods 15 | def_delegators :@model, *MODEL_METHODS 16 | 17 | # emitters 18 | def count_uniq 19 | @model.value.split.each {|word| emit(word => 1)} 20 | end 21 | 22 | def total(*types) 23 | types.each do |type| 24 | case type 25 | when :bytes 26 | emit("#{TOTAL_PREFIX}total bytes" => @model.value.gsub(/\s/, '').length) 27 | when :words 28 | emit("#{TOTAL_PREFIX}total words" => @model.value.split.size) 29 | when :lines 30 | emit("#{TOTAL_PREFIX}total lines" => 1) 31 | end 32 | end 33 | end 34 | end 35 | 36 | class WordCountReducer < HadoopDsl::BaseReducer 37 | def initialize(script, key, values) 38 | super(script, WordCountReducerModel.new(key, values)) 39 | end 40 | 41 | # model methods 42 | def_delegators :@model, *MODEL_METHODS 43 | 44 | # emitters 45 | def count_uniq; aggregate unless @model.total_value? end 46 | def total(*types); aggregate if @model.total_value? 
end 47 | end 48 | 49 | # model 50 | class WordCountMapperModel < HadoopDsl::BaseMapperModel 51 | end 52 | 53 | class WordCountReducerModel < HadoopDsl::BaseReducerModel 54 | def total_value?; @key =~ /^#{TOTAL_PREFIX}/ end 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /lib/mapred_factory.rb: -------------------------------------------------------------------------------- 1 | require 'hadoop_dsl' 2 | 3 | module HadoopDsl 4 | class MapRedFactory 5 | def self.dsl_name(script) 6 | HadoopDsl.read_file(script).each_line do |line| 7 | dsl_name = $1 if line =~ /\s*dsl\s*\(?["'](\w*)["']\)?/ 8 | return dsl_name if dsl_name 9 | end 10 | end 11 | 12 | def self.require_dsl_lib(dsl_name) 13 | require HadoopDsl.snake_case(dsl_name) 14 | end 15 | end 16 | 17 | class MapperFactory < MapRedFactory 18 | # for cache in map loop 19 | @@mapper_class = nil 20 | def self.create(script, key, value) 21 | # once decide in map loop 22 | unless @@mapper_class 23 | dsl_name = self.dsl_name(script) 24 | require_dsl_lib(dsl_name) 25 | @@mapper_class = eval("HadoopDsl::#{dsl_name}::#{dsl_name}Mapper") 26 | end 27 | 28 | @@mapper_class.new(script, key, value) 29 | end 30 | end 31 | 32 | class ReducerFactory < MapRedFactory 33 | @@reducer_class = nil 34 | def self.create(script, key, values) 35 | # once decide in reduce loop 36 | unless @@reducer_class 37 | dsl_name = self.dsl_name(script) 38 | require_dsl_lib(dsl_name) 39 | @@reducer_class = eval("HadoopDsl::#{dsl_name}::#{dsl_name}Reducer") 40 | end 41 | 42 | @@reducer_class.new(script, key, values) 43 | end 44 | end 45 | 46 | class SetupFactory < MapRedFactory 47 | def self.create(script, conf) 48 | dsl_name = self.dsl_name(script) 49 | require_dsl_lib(dsl_name) 50 | setup_class = "HadoopDsl::#{dsl_name}::#{dsl_name}Setup" 51 | eval(setup_class).new(script, conf) rescue HadoopDsl::BaseSetup.new(script, conf) 52 | end 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /spec/hive_like_spec.rb: -------------------------------------------------------------------------------- 1 | require File.join(File.dirname(__FILE__), 'spec_helper') 2 | require 'hive_like' 3 | 4 | include HadoopDsl::HiveLike 5 | 6 | describe HiveLikeSetup do 7 | it 'should load data' do 8 | script = create_tmp_script(%Q!load_data "hive-like/inputs", items;!) 
9 | conf = mock('conf') 10 | conf.should_receive(:output_key_class=).once 11 | conf.should_receive(:output_value_class=).once 12 | 13 | setup = HiveLikeSetup.new(script, conf) 14 | setup.run 15 | setup.paths[0].should == 'hive-like/inputs' 16 | setup.paths[1].should == 'hive-like/outputs' 17 | end 18 | end 19 | 20 | describe HiveLikeMapper do 21 | before do 22 | @value = 'apple, 3, 100' 23 | end 24 | 25 | it 'should create table' do 26 | mapper = HiveLikeMapper.new(nil, nil, @value) 27 | mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT'); 28 | mapper.table.name.should == 'items' 29 | mapper.table.column(0).should == 'item' 30 | mapper.table.column(1).should == 'quantity' 31 | end 32 | 33 | it 'should select' do 34 | mapper = HiveLikeMapper.new(nil, nil, @value) 35 | mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT'); 36 | mapper.select("item", "quantity", "price", "from", "items") 37 | mapper.emitted.first.should == {'items' => 'apple, 3, 100'} 38 | end 39 | 40 | it 'should pre process script body' do 41 | body = "select foo, bar from table;\n" 42 | mapper = HiveLikeMapper.new(nil, nil, @value) 43 | processed = mapper.pre_process(body) 44 | processed.should == %Q!select("foo", "bar", "from", "table")\n! 45 | end 46 | end 47 | 48 | describe HiveLikeReducer do 49 | it 'should select as identity' do 50 | key = 'Lorem' 51 | values = [1, 1, 1] 52 | reducer = HiveLikeReducer.new(nil, key, values) 53 | 54 | reducer.select 55 | reducer.emitted[0].should == {'Lorem' => 1} 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /spec/core_spec.rb: -------------------------------------------------------------------------------- 1 | require File.join(File.dirname(__FILE__), 'spec_helper') 2 | require 'core' 3 | 4 | include HadoopDsl 5 | 6 | describe 'BaseMapRed' do 7 | before(:all) do 8 | @script = create_tmp_script(<<-EOF) 9 | from 'test/inputs' 10 | to 'test/outputs' 11 | EOF 12 | end 13 | 14 | it 'emit key value' do 15 | mapper = BaseMapper.new(@script, BaseMapperModel.new(nil, nil)) 16 | mapper.emit('key' => 'value') 17 | mapper.emitted.should == [{'key' => 'value'}] 18 | end 19 | 20 | it 'can run BaseMapper in minimum' do 21 | model = BaseMapperModel.new('key', 'value') 22 | mapper = BaseMapper.new(@script, model) 23 | mapper.run 24 | end 25 | 26 | it 'can run BaseReducer in minimum' do 27 | model = BaseReducerModel.new('key', 'values') 28 | reducer = BaseReducer.new(@script, model) 29 | reducer.run 30 | end 31 | 32 | it 'can run BaseSetup in minimum' do 33 | setup = BaseSetup.new(@script, nil) 34 | setup.run 35 | end 36 | 37 | describe BaseMapper do 38 | it 'can emit as identity' do 39 | model = BaseMapperModel.new('key', 'value') 40 | mapper = BaseMapper.new(@script, model) 41 | mapper.identity 42 | 43 | mapper.emitted.should == [{'key' => 'value'}] 44 | end 45 | end 46 | 47 | describe BaseReducer do 48 | it 'can emit as aggregate' do 49 | model = BaseReducerModel.new('key', [1, 2, 3]) 50 | reducer = BaseReducer.new(@script, model) 51 | reducer.aggregate 52 | 53 | reducer.emitted.should == [{'key' => 6}] 54 | end 55 | 56 | it 'can emit as identity' do 57 | model = BaseReducerModel.new('key', [1, 2, 3]) 58 | reducer = BaseReducer.new(@script, model) 59 | reducer.identity 60 | 61 | reducer.emitted.should == [{'key' => 1}, {'key' => 2}, {'key' => 3}] 62 | end 63 | end 64 | 65 | describe BaseSetup do 66 | it 'can get paths' do 67 | setup = BaseSetup.new(@script, nil) 68 | setup.run 69 | 
setup.paths[0].should == 'test/inputs' 70 | setup.paths[1].should == 'test/outputs' 71 | end 72 | end 73 | end 74 | -------------------------------------------------------------------------------- /contrib/hudson/plugins/hadoop-ruby/src/main/java/hudson/plugins/hadoop/ruby/HadoopRuby.java: -------------------------------------------------------------------------------- 1 | package hudson.plugins.hadoop.ruby; 2 | 3 | import hudson.Extension; 4 | import hudson.FilePath; 5 | import hudson.model.AbstractProject; 6 | import hudson.model.Descriptor; 7 | import hudson.model.Hudson; 8 | import hudson.tasks.BuildStepDescriptor; 9 | import hudson.tasks.Builder; 10 | import hudson.tasks.CommandInterpreter; 11 | 12 | import java.io.File; 13 | 14 | import net.sf.json.JSONObject; 15 | 16 | import org.kohsuke.stapler.StaplerRequest; 17 | 18 | /** 19 | * Invokes the hadoop ruby interpreter and invokes the Hadoop Ruby script 20 | * entered on the hudson build configuration. 21 | *

22 | * It is expected that the hadoop ruby interpreter is available on the system 23 | * PATH. 24 | * 25 | * @author Koichi Fujikawa 26 | */ 27 | public class HadoopRuby extends CommandInterpreter { 28 | 29 | private HadoopRuby(String command) { 30 | super(command); 31 | } 32 | 33 | protected String[] buildCommandLine(FilePath script) { 34 | File rootDir = Hudson.getInstance().getRootDir(); 35 | String cmd = rootDir.toString() 36 | + "/hadoop-ruby/bin/hadoop-papyrus.sh"; 37 | return new String[] { cmd, script.getRemote() }; 38 | } 39 | 40 | protected String getContents() { 41 | return command; 42 | } 43 | 44 | protected String getFileExtension() { 45 | return ".rb"; 46 | } 47 | 48 | @Override 49 | public Descriptor getDescriptor() { 50 | return DESCRIPTOR; 51 | } 52 | 53 | @Extension 54 | public static final DescriptorImpl DESCRIPTOR = new DescriptorImpl(); 55 | 56 | public static final class DescriptorImpl extends 57 | BuildStepDescriptor { 58 | private DescriptorImpl() { 59 | super(HadoopRuby.class); 60 | } 61 | 62 | @Override 63 | public Builder newInstance(StaplerRequest req, JSONObject formData) { 64 | return new HadoopRuby(formData.getString("hadoop-ruby")); 65 | } 66 | 67 | public String getDisplayName() { 68 | return "Execute Hadoop Ruby script"; 69 | } 70 | 71 | @Override 72 | public String getHelpFile() { 73 | return "/plugin/hadoop-ruby/help.html"; 74 | } 75 | 76 | @Override 77 | public boolean isApplicable(Class jobType) { 78 | return true; 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /lib/core.rb: -------------------------------------------------------------------------------- 1 | require 'hadoop_dsl' 2 | require 'forwardable' 3 | 4 | module HadoopDsl 5 | # common 6 | module DslElement 7 | # all DSL statements without def is processed here 8 | def method_missing(name, *args) 9 | # if block given, labeled for non-local exit 10 | catch name do; yield end if block_given? 
11 | self 12 | end 13 | end 14 | 15 | # controller 16 | module DslController 17 | include DslElement 18 | 19 | def run 20 | body = pre_process(HadoopDsl.read_file(@script)) 21 | eval(body, binding, @script) 22 | end 23 | 24 | def pre_process(body) 25 | body # do nothing 26 | end 27 | end 28 | 29 | class BaseMapRed 30 | extend Forwardable 31 | include DslController 32 | 33 | attr_reader :emitted 34 | 35 | def initialize(script, model) 36 | @script, @model = script, model 37 | @model.controller = self 38 | @emitted = [] 39 | end 40 | 41 | def emit(hash) @emitted << hash end 42 | 43 | private 44 | def key; @model.key end 45 | end 46 | 47 | class BaseSetup 48 | include DslController 49 | 50 | def initialize(script, conf) 51 | @script, @conf = script, conf 52 | output_format 53 | end 54 | 55 | def output_format; end # do nothing 56 | def paths; [@from, @to] end 57 | def from(path) @from = path end 58 | def to(path) @to = path end 59 | end 60 | 61 | class BaseMapper < BaseMapRed 62 | # common functions 63 | def identity 64 | emit(@model.key => @model.value) 65 | end 66 | 67 | private 68 | def value; @model.values end 69 | end 70 | 71 | class BaseReducer < BaseMapRed 72 | # common functions 73 | def aggregate 74 | emit(@model.key => @model.values.inject {|ret, i| ret + i}) 75 | end 76 | 77 | def identity 78 | @model.values.each {|v| emit(@model.key => v)} 79 | end 80 | 81 | private 82 | def values; @model.values end 83 | end 84 | 85 | # model 86 | class BaseModel 87 | include DslElement 88 | attr_accessor :controller 89 | end 90 | 91 | class BaseMapperModel < BaseModel 92 | attr_reader :key, :value 93 | 94 | def initialize(key, value) 95 | @key, @value = key, value 96 | end 97 | end 98 | 99 | class BaseReducerModel < BaseModel 100 | attr_reader :key, :values 101 | 102 | def initialize(key, values) 103 | @key, @values = key, values 104 | end 105 | end 106 | end 107 | -------------------------------------------------------------------------------- /contrib/hudson/plugins/hadoop-ruby/src/main/java/hudson/plugins/hadoop/ruby/ItemListenerImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | * The MIT License 3 | * 4 | * Copyright (c) 2004-2009, Sun Microsystems, Inc. 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 
23 | */ 24 | package hudson.plugins.hadoop.ruby; 25 | 26 | import hudson.Extension; 27 | import hudson.FilePath; 28 | import hudson.model.Hudson; 29 | import hudson.model.listeners.ItemListener; 30 | import hudson.util.StreamTaskListener; 31 | 32 | import java.io.File; 33 | import java.util.logging.Level; 34 | import java.util.logging.Logger; 35 | 36 | /** 37 | * Install Hadoop Ruby DSL 38 | * 39 | * @author Koichi Fujikawa 40 | */ 41 | @Extension 42 | public class ItemListenerImpl extends ItemListener { 43 | 44 | @Override 45 | public void onLoaded() { 46 | try { 47 | LOGGER.log(Level.INFO, "install start for Hadoop Ruby"); 48 | StreamTaskListener listener = new StreamTaskListener(System.out); 49 | File rootDir = Hudson.getInstance().getRootDir(); 50 | rootDir = new File(rootDir, "hadoop-ruby"); 51 | FilePath distDir = new FilePath(rootDir); 52 | distDir.installIfNecessaryFrom(ItemListenerImpl.class 53 | .getResource("hadoop-ruby.tgz"), listener, "Hadoop Ruby"); 54 | LOGGER.log(Level.INFO, "install finished for Hadoop Ruby"); 55 | 56 | } catch (Exception e) { 57 | LOGGER.log(Level.WARNING, "Failed to install Hadoop Ruby", e); 58 | } 59 | } 60 | 61 | private static final Logger LOGGER = Logger 62 | .getLogger(ItemListenerImpl.class.getName()); 63 | } 64 | -------------------------------------------------------------------------------- /spec/mapred_factory_spec.rb: -------------------------------------------------------------------------------- 1 | require File.join(File.dirname(__FILE__) , 'spec_helper') 2 | require 'mapred_factory' 3 | 4 | include HadoopDsl 5 | 6 | describe 'MapRed Factory' do 7 | before(:each) do 8 | @script = create_tmp_script("dsl 'LogAnalysis'") 9 | end 10 | 11 | it 'can create mapper' do 12 | mapper = MapperFactory.create(@script, nil, nil) 13 | mapper.class.should == LogAnalysis::LogAnalysisMapper 14 | end 15 | 16 | it 'can create reducer' do 17 | reducer = ReducerFactory.create(@script, nil, nil) 18 | reducer.class.should == LogAnalysis::LogAnalysisReducer 19 | end 20 | 21 | it 'can create setup' do 22 | conf = mock('conf') 23 | conf.should_receive(:output_key_class=).once 24 | conf.should_receive(:output_value_class=).once 25 | s = SetupFactory.create(create_tmp_script("dsl 'HiveLike'"), conf) 26 | s.class.should == HiveLike::HiveLikeSetup 27 | end 28 | 29 | it 'can create base if not exists in specific DSL' do 30 | s = SetupFactory.create(create_tmp_script("dsl 'WordCount'"), nil) 31 | s.class.should == BaseSetup 32 | end 33 | 34 | it 'specify dsl name from script' do 35 | dsl_name = MapRedFactory.dsl_name(@script) 36 | dsl_name.should == 'LogAnalysis' 37 | end 38 | 39 | it 'can convert dsl name to dsl lib file and require' do 40 | dsl_name = MapRedFactory.dsl_name(@script) 41 | MapRedFactory.require_dsl_lib(dsl_name).should_not be_nil 42 | LogAnalysis::LogAnalysisMapper 43 | end 44 | 45 | it 'can create mapper if statement has double quote' do 46 | script = create_tmp_script(%Q!dsl "LogAnalysis"!) 47 | mapper = MapperFactory.create(script, nil, nil) 48 | mapper.class.should == LogAnalysis::LogAnalysisMapper 49 | end 50 | 51 | it 'can create mapper if exists more space' do 52 | script = create_tmp_script(%Q! dsl "LogAnalysis" !) 53 | mapper = MapperFactory.create(script, nil, nil) 54 | mapper.class.should == LogAnalysis::LogAnalysisMapper 55 | end 56 | 57 | it 'can create mapper if exists bracket' do 58 | script = create_tmp_script(%Q! dsl ("LogAnalysis") !) 
59 | mapper = MapperFactory.create(script, nil, nil) 60 | mapper.class.should == LogAnalysis::LogAnalysisMapper 61 | end 62 | 63 | it 'can create mapper from class name cache' do 64 | mapper = MapperFactory.create(@script, nil, nil) 65 | mapper2 = MapperFactory.create(@script, nil, nil) 66 | mapper.class.should == mapper2.class 67 | end 68 | 69 | it 'can create reducer from class name cache' do 70 | reducer = ReducerFactory.create(@script, nil, nil) 71 | reducer2 = ReducerFactory.create(@script, nil, nil) 72 | reducer.class.should == reducer2.class 73 | end 74 | end 75 | -------------------------------------------------------------------------------- /spec/example_spec.rb: -------------------------------------------------------------------------------- 1 | require 'log_analysis' 2 | require 'word_count' 3 | require 'hive_like' 4 | 5 | include HadoopDsl::LogAnalysis 6 | describe 'Aapach Log Example' do 7 | before(:all) do 8 | @script = File.join(File.dirname(__FILE__), '..', 'examples', 'log_analysis_test.rb') 9 | @bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" 10 | @value = %Q!127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "-" "#{@bot_ua}"! 11 | end 12 | 13 | it 'can run example by mapper' do 14 | mapper = LogAnalysisMapper.new(@script, nil, @value) 15 | mapper.run 16 | mapper.emitted.first.should == {"ua\t#{@bot_ua}" => 1} 17 | end 18 | 19 | it 'can run example by reducer' do 20 | reducer = LogAnalysisReducer.new(@script, "ua\tChrome", [1, 1, 1]) 21 | reducer.run 22 | reducer.emitted.first["ua\tChrome"].should == 3 23 | end 24 | end 25 | 26 | include HadoopDsl::WordCount 27 | describe 'Word Count Example' do 28 | before(:all) do 29 | @script = File.join(File.dirname(__FILE__), '..', 'examples', 'word_count_test.rb') 30 | @value = 'Lorem ipsum ipsum Lorem sit amet,' 31 | end 32 | 33 | it 'can run example by mapper' do 34 | mapper = WordCountMapper.new(@script, nil, @value) 35 | mapper.run 36 | mapper.emitted.size.should == 9 37 | mapper.emitted.each do |e| 38 | case e.keys.first 39 | when 'Lorem' 40 | e.values.first.should == 1 41 | when 'total words' 42 | e.values.first.should == 6 43 | end 44 | end 45 | end 46 | 47 | it 'can run example by reducer' do 48 | reducer = WordCountReducer.new(@script, "Lorem", [1, 1, 1]) 49 | reducer.run 50 | reducer.emitted.first["Lorem"].should == 3 51 | end 52 | end 53 | 54 | include HadoopDsl::HiveLike 55 | describe 'Hive Like Example' do 56 | before(:all) do 57 | @script = File.join(File.dirname(__FILE__), '..', 'examples', 'hive_like_test.rb') 58 | @value = 'apple, 3, 100' 59 | end 60 | 61 | it 'can run setup' do 62 | conf = mock('conf') 63 | conf.should_receive(:output_key_class=).once 64 | conf.should_receive(:output_value_class=).once 65 | 66 | setup = HiveLikeSetup.new(@script, conf) 67 | setup.run 68 | setup.paths[0].should == 'hive-like/items.txt' 69 | end 70 | 71 | it 'can run example by mapper' do 72 | mapper = HiveLikeMapper.new(@script, nil, @value) 73 | mapper.run 74 | mapper.emitted.size.should == 1 75 | mapper.emitted.first['items'].should == '3, 100, apple' 76 | end 77 | 78 | it 'can run example by reducer' do 79 | values = ['v1', 'v2', 'v3'] 80 | reducer = HiveLikeReducer.new(@script, "items", values) 81 | reducer.run 82 | reducer.emitted.first["items"].should == 'v1' 83 | end 84 | end 85 | -------------------------------------------------------------------------------- /spec/word_count_spec.rb: 
-------------------------------------------------------------------------------- 1 | require File.join(File.dirname(__FILE__), 'spec_helper') 2 | require 'word_count' 3 | 4 | include HadoopDsl::WordCount 5 | 6 | describe WordCountMapper do 7 | it 'should count uniq' do 8 | value = 'Lorem ipsum Lorem sit amet,' 9 | mapper = WordCountMapper.new(nil, nil, value) 10 | 11 | mapper.count_uniq 12 | mapper.emitted[0].should == {'Lorem' => 1} 13 | mapper.emitted[1].should == {'ipsum' => 1} 14 | mapper.emitted[2].should == {'Lorem' => 1} 15 | end 16 | 17 | it 'should count total bytes' do 18 | value = 'Lorem ipsum Lorem sit amet,' 19 | mapper = WordCountMapper.new(nil, nil, value) 20 | 21 | mapper.total :bytes 22 | mapper.emitted[0].should == {"#{TOTAL_PREFIX}total bytes" => 23} 23 | end 24 | 25 | it 'should count total words' do 26 | value = 'Lorem ipsum Lorem sit amet,' 27 | mapper = WordCountMapper.new(nil, nil, value) 28 | 29 | mapper.total :words 30 | mapper.emitted[0].should == {"#{TOTAL_PREFIX}total words" => 5} 31 | end 32 | 33 | it 'should count total lines' do 34 | value = 'Lorem ipsum Lorem sit amet,' 35 | mapper = WordCountMapper.new(nil, nil, value) 36 | 37 | mapper.total :lines 38 | mapper.emitted[0].should == {"#{TOTAL_PREFIX}total lines" => 1} 39 | end 40 | 41 | it 'should count total bytes, words, lines' do 42 | value = 'Lorem ipsum Lorem sit amet,' 43 | mapper = WordCountMapper.new(nil, nil, value) 44 | 45 | mapper.total :bytes, :words, :lines 46 | mapper.emitted[0].should == {"#{TOTAL_PREFIX}total bytes" => 23} 47 | mapper.emitted[1].should == {"#{TOTAL_PREFIX}total words" => 5} 48 | mapper.emitted[2].should == {"#{TOTAL_PREFIX}total lines" => 1} 49 | end 50 | end 51 | 52 | describe WordCountReducer do 53 | it 'should count uniq' do 54 | key = 'Lorem' 55 | values = [1, 1, 1] 56 | reducer = WordCountReducer.new(nil, key, values) 57 | 58 | reducer.count_uniq 59 | reducer.emitted[0].should == {'Lorem' => 3} 60 | end 61 | 62 | it 'should count total bytes' do 63 | key = "#{TOTAL_PREFIX}total bytes" 64 | values = [12, 23, 45] 65 | reducer = WordCountReducer.new(nil, key, values) 66 | 67 | reducer.total :bytes 68 | reducer.emitted[0].should == {"#{TOTAL_PREFIX}total bytes" => 12 + 23 + 45} 69 | end 70 | 71 | it 'should count total words' do 72 | key = "#{TOTAL_PREFIX}total words" 73 | values = [3, 4, 5] 74 | reducer = WordCountReducer.new(nil, key, values) 75 | 76 | reducer.total :words 77 | reducer.emitted[0].should == {"#{TOTAL_PREFIX}total words" => 3 + 4 + 5} 78 | end 79 | 80 | it 'should count total lines' do 81 | key = "#{TOTAL_PREFIX}total lines" 82 | values = [1, 2, 3] 83 | reducer = WordCountReducer.new(nil, key, values) 84 | 85 | reducer.total :lines 86 | reducer.emitted[0].should == {"#{TOTAL_PREFIX}total lines" => 6} 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /lib/hive_like.rb: -------------------------------------------------------------------------------- 1 | require 'hadoop_dsl' 2 | 3 | module HadoopDsl::HiveLike 4 | # common 5 | module HiveLikeMapRed 6 | def pre_process(body) 7 | processed = "" 8 | body.each do |line| 9 | next if line =~ /^#/ 10 | if line =~ /^(\w*)\s+(.*);$/ 11 | method = $1 12 | args = sprit_and_marge_args($2) 13 | processed << "#{method}(#{args})\n" 14 | else 15 | processed << line + "\n" if line 16 | end 17 | end 18 | processed 19 | end 20 | 21 | def sprit_and_marge_args(raw) 22 | raw.gsub(/[\(\)]/, ' ').split.map do |s| 23 | stripped = s.gsub(/[\s,"']/, '') 24 | %Q!"#{stripped}"! 
25 | end.join(", ") 26 | end 27 | end 28 | 29 | # controller 30 | class HiveLikeSetup < HadoopDsl::BaseSetup 31 | def load_data(inputs, table) 32 | @from = inputs 33 | @to = inputs.gsub(/#{File.basename(inputs)}$/, 'outputs') 34 | end 35 | 36 | def output_format 37 | @conf.output_key_class = HadoopDsl::Text 38 | @conf.output_value_class = HadoopDsl::Text 39 | end 40 | 41 | # might not need but occur error if not exists 42 | def select(*args) end 43 | 44 | include HiveLikeMapRed 45 | end 46 | 47 | class HiveLikeMapper < HadoopDsl::BaseMapper 48 | def initialize(script, key, value) 49 | super(script, HiveLikeMapperModel.new(key, value)) 50 | end 51 | 52 | include HiveLikeMapRed 53 | 54 | def_delegators :@model, :create_table, :table 55 | 56 | # emitters 57 | def select(*args) 58 | from_index = args.index('from') 59 | if from_index 60 | values = args[0...from_index].map do |column| 61 | splitted = @model.value.split(/[,\s]+/) 62 | splitted[@model.table.columns.index(column)] 63 | end 64 | emit(args[from_index + 1] => values.join(", ")) 65 | end 66 | end 67 | end 68 | 69 | class HiveLikeReducer < HadoopDsl::BaseReducer 70 | def initialize(script, key, values) 71 | super(script, HiveLikeReducerModel.new(key, values)) 72 | end 73 | 74 | include HiveLikeMapRed 75 | 76 | # emitters 77 | def select(*args) identity end 78 | end 79 | 80 | # model 81 | class HiveLikeMapperModel < HadoopDsl::BaseMapperModel 82 | attr_reader :table 83 | 84 | def create_table(name, *column_and_type) 85 | @table = Table.new(name) 86 | column_and_type.each_with_index do |column, index| 87 | next if index % 2 != 0 # type 88 | @table.columns << column_and_type[index] 89 | end 90 | end 91 | 92 | class Table 93 | attr_reader :name, :columns 94 | 95 | def initialize(name) 96 | @name = name 97 | @columns = [] 98 | end 99 | 100 | def column(index) @columns[index] end 101 | end 102 | end 103 | 104 | class HiveLikeReducerModel < HadoopDsl::BaseReducerModel 105 | end 106 | end 107 | -------------------------------------------------------------------------------- /hadoop-papyrus.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command 4 | # -*- encoding: utf-8 -*- 5 | 6 | Gem::Specification.new do |s| 7 | s.name = %q{hadoop-papyrus} 8 | s.version = "0.0.6" 9 | 10 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? 
:required_rubygems_version= 11 | s.authors = ["Koichi Fujikawa"] 12 | s.date = %q{2010-02-09} 13 | s.default_executable = %q{papyrus} 14 | s.description = %q{Hadoop papyrus - Ruby DSL for Hadoop} 15 | s.email = %q{fujibee@gmail.com} 16 | s.executables = ["papyrus"] 17 | s.extra_rdoc_files = [ 18 | "README.rdoc" 19 | ] 20 | s.files = [ 21 | ".gitignore", 22 | "README.rdoc", 23 | "Rakefile", 24 | "VERSION", 25 | "bin/papyrus", 26 | "conf/hadoop-site.xml", 27 | "contrib/hudson/bin/hadoop", 28 | "contrib/hudson/bin/hadoop-papyrus.sh", 29 | "contrib/hudson/conf/hadoop-site.xml", 30 | "contrib/hudson/plugins/hadoop-ruby/.gitignore", 31 | "contrib/hudson/plugins/hadoop-ruby/pom.xml", 32 | "contrib/hudson/plugins/hadoop-ruby/src/main/java/hudson/plugins/hadoop/ruby/HadoopRuby.java", 33 | "contrib/hudson/plugins/hadoop-ruby/src/main/java/hudson/plugins/hadoop/ruby/ItemListenerImpl.java", 34 | "contrib/hudson/plugins/hadoop-ruby/src/main/resources/hudson/plugins/hadoop/ruby/HadoopRuby/config.jelly", 35 | "contrib/hudson/plugins/hadoop-ruby/src/main/resources/index.jelly", 36 | "contrib/hudson/plugins/hadoop-ruby/src/main/webapp/help.html", 37 | "examples/hive_like_test.rb", 38 | "examples/log_analysis_test.rb", 39 | "examples/word_count_test.rb", 40 | "hadoop-papyrus.gemspec", 41 | "lib/core.rb", 42 | "lib/dsl_init.rb", 43 | "lib/hadoop_dsl.rb", 44 | "lib/hadoop_dsl_client.rb", 45 | "lib/hive_like.rb", 46 | "lib/log_analysis.rb", 47 | "lib/mapred_factory.rb", 48 | "lib/util.rb", 49 | "lib/word_count.rb" 50 | ] 51 | s.homepage = %q{http://github.com/fujibee/hadoop-papyrus} 52 | s.rdoc_options = ["--charset=UTF-8"] 53 | s.require_paths = ["lib"] 54 | s.rubygems_version = %q{1.3.5} 55 | s.summary = %q{Hadoop papyrus} 56 | s.test_files = [ 57 | "spec/spec_helper.rb", 58 | "spec/dsl_init_spec.rb", 59 | "spec/core_spec.rb", 60 | "spec/client_spec.rb", 61 | "spec/util_spec.rb", 62 | "spec/mapred_factory_spec.rb", 63 | "spec/word_count_spec.rb", 64 | "spec/hive_like_spec.rb", 65 | "spec/log_analysis_spec.rb", 66 | "spec/example_spec.rb", 67 | "examples/hive_like_test.rb", 68 | "examples/log_analysis_test.rb", 69 | "examples/word_count_test.rb" 70 | ] 71 | 72 | if s.respond_to? :specification_version then 73 | current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION 74 | s.specification_version = 3 75 | 76 | if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then 77 | s.add_runtime_dependency(%q, [">= 0"]) 78 | else 79 | s.add_dependency(%q, [">= 0"]) 80 | end 81 | else 82 | s.add_dependency(%q, [">= 0"]) 83 | end 84 | end 85 | 86 | -------------------------------------------------------------------------------- /lib/log_analysis.rb: -------------------------------------------------------------------------------- 1 | require 'hadoop_dsl' 2 | require 'enumerator' 3 | 4 | module HadoopDsl::LogAnalysis 5 | KEY_SEP = "\t" 6 | PREFIX = 'col' 7 | PASS = nil 8 | MODEL_METHODS = [:column, :value] 9 | 10 | # controller 11 | class LogAnalysisMapper < HadoopDsl::BaseMapper 12 | @@reg_cache = {} 13 | 14 | def initialize(script, key, value) 15 | super(script, LogAnalysisMapperModel.new(key, value)) 16 | end 17 | 18 | # model methods 19 | def_delegators :@model, *MODEL_METHODS 20 | 21 | def topic(desc, options = {}, &block) 22 | @model.create_topic(desc, options) 23 | yield if block_given? 
24 | current_topic 25 | end 26 | 27 | def separate(sep) 28 | parts = case sep 29 | when Symbol 30 | case sep 31 | when :csv 32 | require 'csv' 33 | CSV.parse(value).flatten 34 | when :tsv then value.split("\t") 35 | else raise "no supported separator #{sep}" 36 | end 37 | when String then value.split(sep) 38 | end 39 | @model.create_or_replace_columns_with(parts) {|column, value| column.value = value} 40 | end 41 | 42 | def pattern(reg_str) 43 | # try to get RE from cache 44 | cached = @@reg_cache[reg_str] 45 | re = cached ? @@reg_cache[reg_str] : Regexp.new(reg_str) 46 | @@reg_cache[reg_str] ||= re # new cache 47 | 48 | if value =~ re 49 | md = Regexp.last_match 50 | @model.create_or_replace_columns_with(md.captures) {|column, value| column.value = value} 51 | else throw :each_line # non-local exit 52 | end 53 | end 54 | 55 | # column names by String converted to Symbol 56 | def column_name(*names) 57 | sym_names = names.map {|name| name.is_a?(String) ? name.to_sym : name } 58 | @model.create_or_replace_columns_with(sym_names) {|column, name| column.name = name} 59 | end 60 | 61 | def group_by(column_or_value) 62 | case column_or_value 63 | when LogAnalysisMapperModel::Column 64 | column = column_or_value 65 | current_topic.key_elements << column.value 66 | else 67 | value = column_or_value 68 | current_topic.key_elements << value 69 | end 70 | end 71 | 72 | def group_date_by(column, term) 73 | require 'time' 74 | time = parse_time(column.value) 75 | time_key = case term 76 | when :hour_of_day then time.strftime('%H') 77 | when :daily then time.strftime('%Y%m%d') 78 | when :monthly then time.strftime('%Y%m') 79 | when :yearly then time.strftime('%Y') 80 | end 81 | current_topic.key_elements << time_key 82 | end 83 | 84 | # emitters 85 | def count_uniq(column_or_value) 86 | uniq_key = 87 | case column_or_value 88 | when LogAnalysisMapperModel::Column 89 | column = column_or_value 90 | column.value 91 | else column_or_value # value 92 | end 93 | current_topic.key_elements << uniq_key 94 | emit(current_topic.key => 1) 95 | end 96 | 97 | def count 98 | emit(current_topic.key => 1) 99 | end 100 | 101 | def sum(column) 102 | emit(current_topic.key => column.value.to_i) 103 | end 104 | 105 | private 106 | def current_topic; @model.current_topic end 107 | 108 | def parse_time(str) 109 | begin Time.parse(str) 110 | rescue 111 | # apachelog pattern ex) "10/Oct/2000:13:55:36 -0700" 112 | Time.parse($1) if str =~ /^(\d*\/\w*\/\d*):/ 113 | end 114 | end 115 | end 116 | 117 | class LogAnalysisReducer < HadoopDsl::BaseReducer 118 | def initialize(script, key, values) 119 | super(script, LogAnalysisReducerModel.new(key, values)) 120 | end 121 | 122 | # model methods 123 | def_delegators :@model, *MODEL_METHODS 124 | 125 | def topic(desc, options = {}, &block) 126 | @model.create_topic(desc, options) 127 | yield if block_given? 
128 | @model.current_topic 129 | end 130 | 131 | def count_uniq(column) 132 | aggregate_on_topic 133 | end 134 | 135 | def count 136 | aggregate_on_topic 137 | end 138 | 139 | def sum(column) 140 | aggregate_on_topic 141 | end 142 | 143 | private 144 | def aggregate_on_topic 145 | aggregate if @model.topic == @model.current_topic 146 | end 147 | 148 | end 149 | 150 | # model 151 | class LogAnalysisMapperModel < HadoopDsl::BaseMapperModel 152 | attr_reader :current_topic 153 | 154 | def initialize(key, value) 155 | super(key, value) 156 | @columns = ColumnArray.new 157 | @topics = [] 158 | end 159 | 160 | def column; @columns end 161 | 162 | def create_topic(desc, options) 163 | @topics << @current_topic = Topic.new(desc, options[:label]) 164 | end 165 | 166 | def create_or_replace_columns_with(array, &block) 167 | columns = array.enum_for(:each_with_index).map do |p, i| 168 | c = @columns[i] ? @columns[i] : Column.new(i) 169 | yield c, p 170 | c 171 | end 172 | @columns = ColumnArray.new(columns) 173 | end 174 | 175 | class ColumnArray < Array 176 | def [](key) 177 | case key 178 | when Integer then at(key) 179 | when Symbol then (select {|c| c.name == key}).first 180 | when String then (select {|c| c.name == key.to_sym}).first 181 | end 182 | end 183 | end 184 | 185 | class Column 186 | attr_reader :index 187 | attr_accessor :value, :name 188 | 189 | def initialize(index, value = nil) 190 | @index, @value = index, value 191 | end 192 | end 193 | 194 | class Topic 195 | attr_reader :key_elements 196 | 197 | def initialize(desc, label = nil) 198 | @desc, @label = desc, label 199 | @key_elements = [] 200 | end 201 | 202 | def label 203 | @label || @desc.gsub(/\s/, '_') 204 | end 205 | 206 | def key 207 | without_label = 208 | @key_elements.size > 0 ? 
@key_elements.join(KEY_SEP) : nil 209 | [label, without_label].compact.join(KEY_SEP) 210 | end 211 | end 212 | end 213 | 214 | class LogAnalysisReducerModel < HadoopDsl::BaseReducerModel 215 | attr_reader :topic, :current_topic 216 | 217 | def initialize(key, values) 218 | super(key, values) 219 | if key =~ /(\w*)#{KEY_SEP}?(.*)/ 220 | @topic = Topic.new($1, values) 221 | end 222 | end 223 | 224 | def create_topic(desc, options) 225 | @current_topic = Topic.new(options[:label] || desc.gsub(/\s/, '_'), nil) 226 | end 227 | 228 | class Topic 229 | attr_reader :label, :values 230 | 231 | def initialize(label, values) 232 | @label, @values = label, values 233 | end 234 | 235 | def ==(rh) self.label == rh.label end 236 | end 237 | end 238 | end 239 | -------------------------------------------------------------------------------- /spec/log_analysis_spec.rb: -------------------------------------------------------------------------------- 1 | require File.join(File.dirname(__FILE__), 'spec_helper') 2 | require 'log_analysis' 3 | 4 | include HadoopDsl::LogAnalysis 5 | 6 | describe LogAnalysisMapper do 7 | before do 8 | @apache_log = '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326' 9 | end 10 | 11 | it 'should separate data by space' do 12 | value = 'Lorem ipsum dolor sit amet,' 13 | mapper = LogAnalysisMapper.new(nil, nil, value) 14 | mapper.separate(' ') 15 | 16 | mapper.column[1].value.should == 'ipsum' 17 | end 18 | 19 | it 'should separate by pattern' do 20 | mapper = LogAnalysisMapper.new(nil, nil, @apache_log) 21 | mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/ 22 | 23 | mapper.column[2].value.should == 'frank' 24 | end 25 | 26 | it 'should separate by comma (CSV) with csv library' do 27 | value = '"Lorem","ip,sum","dolor","sit","amet"' 28 | mapper = LogAnalysisMapper.new(nil, nil, value) 29 | mapper.separate(:csv) 30 | 31 | require('csv').should be_false # already required 32 | mapper.column[1].value.should == 'ip,sum' 33 | end 34 | 35 | it 'should separate by tab char (TSV)' do 36 | value = "Lorem\tipsum\tdolor\tsit\tamet," 37 | mapper = LogAnalysisMapper.new(nil, nil, value) 38 | mapper.separate(:tsv) 39 | 40 | mapper.column[4].value.should == 'amet,' 41 | end 42 | 43 | it 'should not separate by non support separator' do 44 | value = 'Lorem ipsum dolor sit amet,' 45 | mapper = LogAnalysisMapper.new(nil, nil, value) 46 | lambda { mapper.separate(:nonsupport) }.should raise_error 47 | end 48 | 49 | it 'should non-local exit if cannot separate by pattern' do 50 | mapper = LogAnalysisMapper.new(nil, nil, @apache_log + " a") 51 | mapper.each_line do 52 | mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)$/ 53 | fail 'should not be reached' 54 | end 55 | mapper.column[0].should be_nil 56 | end 57 | 58 | it 'should label column name by string' do 59 | mapper = LogAnalysisMapper.new(nil, nil, @apache_log) 60 | mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/ 61 | mapper.column_name 'remote_host', PASS, 'user', 'access_date', 'request', 'status', 'bytes' 62 | 63 | mapper.column['user'].value.should == 'frank' 64 | end 65 | 66 | it 'should label column name by symbol' do 67 | mapper = LogAnalysisMapper.new(nil, nil, @apache_log) 68 | mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/ 69 | mapper.column_name :remote_host, PASS, :user, :access_date, :request, :status, :bytes 70 | 71 | mapper.column[:user].value.should == 'frank' 72 | end 73 | 74 | it 'should count uniq by column' do 75 | value = 'count uniq' 76 | mapper 
= LogAnalysisMapper.new(nil, nil, value) 77 | mapper.separate(' ') 78 | mapper.topic('t1') { mapper.count_uniq mapper.column[1] } 79 | 80 | mapper.emitted.should == [{"t1\tuniq" => 1}] 81 | end 82 | 83 | it 'should count uniq by value' do 84 | value = 'count uniq' 85 | mapper = LogAnalysisMapper.new(nil, nil, value) 86 | mapper.separate(' ') 87 | mapper.topic('t1') { mapper.count_uniq 'orig value' } 88 | 89 | mapper.emitted.should == [{"t1\torig value" => 1}] 90 | end 91 | 92 | it 'should just count' do 93 | value = 'count only' 94 | mapper = LogAnalysisMapper.new(nil, nil, value) 95 | mapper.separate(' ') 96 | mapper.topic('t1') { mapper.count } 97 | 98 | mapper.emitted.should == [{"t1" => 1}] 99 | end 100 | 101 | it 'should sum column value' do 102 | value = 'sum 123' 103 | mapper = LogAnalysisMapper.new(nil, nil, value) 104 | mapper.separate(' ') 105 | mapper.topic('t1') { mapper.sum mapper.column[1] } 106 | 107 | mapper.emitted.first["t1"].should == 123 108 | end 109 | 110 | it 'has topic which returns label' do 111 | value = 'Lorem ipsum dolor sit amet,' 112 | mapper = LogAnalysisMapper.new(nil, nil, value) 113 | mapper.separate(' ') 114 | 115 | topic = mapper.topic('desc', :label => 'label') 116 | topic.label.should == 'label' 117 | end 118 | 119 | it 'has topic which returns label as desc' do 120 | value = 'Lorem ipsum dolor sit amet,' 121 | mapper = LogAnalysisMapper.new(nil, nil, value) 122 | mapper.separate(' ') 123 | 124 | topic = mapper.topic('desc') 125 | topic.label.should == 'desc' 126 | end 127 | 128 | it 'has topic which returns label as desc with space' do 129 | value = 'Lorem ipsum dolor sit amet,' 130 | mapper = LogAnalysisMapper.new(nil, nil, value) 131 | mapper.separate(' ') 132 | 133 | topic = mapper.topic('desc with space') 134 | topic.label.should == 'desc_with_space' 135 | end 136 | 137 | it 'can group date monthly' do 138 | value = "2010/1/1 21:23:10\tnewyearday" 139 | mapper = LogAnalysisMapper.new(nil, nil, value) 140 | mapper.separate("\t") 141 | mapper.column_name 'date', 'holiday' 142 | 143 | ['yearly', 'monthly', 'daily', 'hour_of_day'].each do |term| 144 | mapper.topic(term) do 145 | mapper.group_date_by mapper.column[:date], term.to_sym 146 | mapper.count_uniq mapper.column[:holiday] 147 | end 148 | end 149 | mapper.emitted.should == 150 | [ 151 | {"yearly\t2010\tnewyearday" => 1}, 152 | {"monthly\t201001\tnewyearday" => 1}, 153 | {"daily\t20100101\tnewyearday" => 1}, 154 | {"hour_of_day\t21\tnewyearday" => 1} 155 | ] 156 | end 157 | 158 | it 'can group by' do 159 | value = '1 sub_2 bingo!' 160 | mapper = LogAnalysisMapper.new(nil, nil, value) 161 | mapper.separate(' ') 162 | mapper.column_name 'id', 'sub_id', 'data' 163 | 164 | mapper.topic('test') do 165 | mapper.group_by mapper.column[:sub_id] 166 | mapper.count_uniq mapper.column[:data] 167 | end 168 | mapper.emitted.should == [{"test\tsub_2\tbingo!" 
=> 1}] 169 | end 170 | end 171 | 172 | Topic = LogAnalysisMapperModel::Topic 173 | describe Topic do 174 | it 'can get key with label' do 175 | t = Topic.new('label') 176 | t.key.should == 'label' 177 | end 178 | 179 | it 'can get key with label and elements' do 180 | t = Topic.new('label') 181 | t.key_elements << 'e1' 182 | t.key_elements << 'e2' 183 | t.key.should == "label\te1\te2" 184 | end 185 | end 186 | 187 | describe LogAnalysisReducer do 188 | it 'should count uniq in the topic' do 189 | key = "t1\tuniq" 190 | values = [1, 1, 1] 191 | reducer = LogAnalysisReducer.new(nil, key, values) 192 | reducer.separate(' ') 193 | reducer.topic('t1') { reducer.count_uniq(nil) } 194 | 195 | reducer.emitted.first["t1\tuniq"].should == 3 196 | end 197 | 198 | it 'should not count uniq of other topic' do 199 | key = "t2\tuniq" 200 | values = [1, 1, 1] 201 | reducer = LogAnalysisReducer.new(nil, key, values) 202 | reducer.separate(' ') 203 | reducer.topic('t1') { reducer.count_uniq(nil) } 204 | 205 | reducer.emitted.first.should be_nil 206 | end 207 | 208 | it 'should sum column value' do 209 | key = "t1" 210 | values = [123, 456, 789] 211 | reducer = LogAnalysisReducer.new(nil, key, values) 212 | reducer.separate(' ') 213 | reducer.topic('t1') { reducer.sum(nil) } 214 | 215 | reducer.emitted.first["t1"].should == 123+456+789 216 | end 217 | end 218 | -------------------------------------------------------------------------------- /contrib/hudson/bin/hadoop: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | # The Hadoop command script 20 | # 21 | # Environment Variables 22 | # 23 | # JAVA_HOME The java implementation to use. Overrides JAVA_HOME. 24 | # 25 | # HADOOP_CLASSPATH Extra Java CLASSPATH entries. 26 | # 27 | # HADOOP_HEAPSIZE The maximum amount of heap to use, in MB. 28 | # Default is 1000. 29 | # 30 | # HADOOP_OPTS Extra Java runtime options. 31 | # 32 | # HADOOP_NAMENODE_OPTS These options are added to HADOOP_OPTS 33 | # HADOOP_CLIENT_OPTS when the respective command is run. 34 | # HADOOP_{COMMAND}_OPTS etc HADOOP_JT_OPTS applies to JobTracker 35 | # for e.g. HADOOP_CLIENT_OPTS applies to 36 | # more than one command (fs, dfs, fsck, 37 | # dfsadmin etc) 38 | # 39 | # HADOOP_CONF_DIR Alternate conf dir. Default is ${HADOOP_HOME}/conf. 40 | # 41 | # HADOOP_ROOT_LOGGER The root appender. Default is INFO,console 42 | # 43 | 44 | bin=`dirname "$0"` 45 | bin=`cd "$bin"; pwd` 46 | 47 | if [ -f "$bin"/hadoop-config.sh ]; then 48 | . 
"$bin"/hadoop-config.sh 49 | fi 50 | 51 | cygwin=false 52 | case "`uname`" in 53 | CYGWIN*) cygwin=true;; 54 | esac 55 | 56 | # if no args specified, show usage 57 | if [ $# = 0 ]; then 58 | echo "Usage: hadoop [--config confdir] COMMAND" 59 | echo "where COMMAND is one of:" 60 | echo " namenode -format format the DFS filesystem" 61 | echo " secondarynamenode run the DFS secondary namenode" 62 | echo " namenode run the DFS namenode" 63 | echo " datanode run a DFS datanode" 64 | echo " dfsadmin run a DFS admin client" 65 | echo " fsck run a DFS filesystem checking utility" 66 | echo " fs run a generic filesystem user client" 67 | echo " balancer run a cluster balancing utility" 68 | echo " jobtracker run the MapReduce job Tracker node" 69 | echo " pipes run a Pipes job" 70 | echo " tasktracker run a MapReduce task Tracker node" 71 | echo " job manipulate MapReduce jobs" 72 | echo " queue get information regarding JobQueues" 73 | echo " version print the version" 74 | echo " jar run a jar file" 75 | echo " distcp copy file or directories recursively" 76 | echo " archive -archiveName NAME * create a hadoop archive" 77 | echo " daemonlog get/set the log level for each daemon" 78 | echo " or" 79 | echo " CLASSNAME run the class named CLASSNAME" 80 | echo "Most commands print help when invoked w/o parameters." 81 | exit 1 82 | fi 83 | 84 | # get arguments 85 | COMMAND=$1 86 | shift 87 | 88 | if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then 89 | . "${HADOOP_CONF_DIR}/hadoop-env.sh" 90 | fi 91 | 92 | # some Java parameters 93 | if [ "$JAVA_HOME" != "" ]; then 94 | #echo "run java in $JAVA_HOME" 95 | JAVA_HOME=$JAVA_HOME 96 | fi 97 | 98 | if [ "$JAVA_HOME" = "" ]; then 99 | echo "Error: JAVA_HOME is not set." 100 | exit 1 101 | fi 102 | 103 | JAVA=$JAVA_HOME/bin/java 104 | JAVA_HEAP_MAX=-Xmx1000m 105 | 106 | # check envvars which might override default args 107 | if [ "$HADOOP_HEAPSIZE" != "" ]; then 108 | #echo "run with heapsize $HADOOP_HEAPSIZE" 109 | JAVA_HEAP_MAX="-Xmx""$HADOOP_HEAPSIZE""m" 110 | #echo $JAVA_HEAP_MAX 111 | fi 112 | 113 | # CLASSPATH initially contains $HADOOP_CONF_DIR 114 | CLASSPATH="${HADOOP_CONF_DIR}" 115 | CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar 116 | 117 | # for developers, add Hadoop classes to CLASSPATH 118 | if [ -d "$HADOOP_HOME/build/classes" ]; then 119 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/classes 120 | fi 121 | if [ -d "$HADOOP_HOME/build/webapps" ]; then 122 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build 123 | fi 124 | if [ -d "$HADOOP_HOME/build/test/classes" ]; then 125 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/test/classes 126 | fi 127 | if [ -d "$HADOOP_HOME/build/tools" ]; then 128 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/tools 129 | fi 130 | 131 | # so that filenames w/ spaces are handled correctly in loops below 132 | IFS= 133 | 134 | # for releases, add core hadoop jar & webapps to CLASSPATH 135 | if [ -d "$HADOOP_HOME/webapps" ]; then 136 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME 137 | fi 138 | for f in $HADOOP_HOME/hadoop-*-core.jar; do 139 | CLASSPATH=${CLASSPATH}:$f; 140 | done 141 | 142 | # add libs to CLASSPATH 143 | for f in $HADOOP_HOME/lib/*.jar; do 144 | CLASSPATH=${CLASSPATH}:$f; 145 | done 146 | 147 | for f in $HADOOP_HOME/lib/jetty-ext/*.jar; do 148 | CLASSPATH=${CLASSPATH}:$f; 149 | done 150 | 151 | for f in $HADOOP_HOME/hadoop-*-tools.jar; do 152 | TOOL_PATH=${TOOL_PATH}:$f; 153 | done 154 | for f in $HADOOP_HOME/build/hadoop-*-tools.jar; do 155 | TOOL_PATH=${TOOL_PATH}:$f; 156 | done 157 | 158 | # add user-specified CLASSPATH 
last 159 | if [ "$HADOOP_CLASSPATH" != "" ]; then 160 | CLASSPATH=${CLASSPATH}:${HADOOP_CLASSPATH} 161 | fi 162 | 163 | # default log directory & file 164 | if [ "$HADOOP_LOG_DIR" = "" ]; then 165 | HADOOP_LOG_DIR="$HADOOP_HOME/logs" 166 | fi 167 | if [ "$HADOOP_LOGFILE" = "" ]; then 168 | HADOOP_LOGFILE='hadoop.log' 169 | fi 170 | 171 | # restore ordinary behaviour 172 | unset IFS 173 | 174 | # figure out which class to run 175 | if [ "$COMMAND" = "namenode" ] ; then 176 | CLASS='org.apache.hadoop.hdfs.server.namenode.NameNode' 177 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_NAMENODE_OPTS" 178 | elif [ "$COMMAND" = "secondarynamenode" ] ; then 179 | CLASS='org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode' 180 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_SECONDARYNAMENODE_OPTS" 181 | elif [ "$COMMAND" = "datanode" ] ; then 182 | CLASS='org.apache.hadoop.hdfs.server.datanode.DataNode' 183 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_DATANODE_OPTS" 184 | elif [ "$COMMAND" = "fs" ] ; then 185 | CLASS=org.apache.hadoop.fs.FsShell 186 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 187 | elif [ "$COMMAND" = "dfs" ] ; then 188 | CLASS=org.apache.hadoop.fs.FsShell 189 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 190 | elif [ "$COMMAND" = "dfsadmin" ] ; then 191 | CLASS=org.apache.hadoop.hdfs.tools.DFSAdmin 192 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 193 | elif [ "$COMMAND" = "fsck" ] ; then 194 | CLASS=org.apache.hadoop.hdfs.tools.DFSck 195 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 196 | elif [ "$COMMAND" = "balancer" ] ; then 197 | CLASS=org.apache.hadoop.hdfs.server.balancer.Balancer 198 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_BALANCER_OPTS" 199 | elif [ "$COMMAND" = "jobtracker" ] ; then 200 | CLASS=org.apache.hadoop.mapred.JobTracker 201 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_JOBTRACKER_OPTS" 202 | elif [ "$COMMAND" = "tasktracker" ] ; then 203 | CLASS=org.apache.hadoop.mapred.TaskTracker 204 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_TASKTRACKER_OPTS" 205 | elif [ "$COMMAND" = "job" ] ; then 206 | CLASS=org.apache.hadoop.mapred.JobClient 207 | elif [ "$COMMAND" = "queue" ] ; then 208 | CLASS=org.apache.hadoop.mapred.JobQueueClient 209 | elif [ "$COMMAND" = "pipes" ] ; then 210 | CLASS=org.apache.hadoop.mapred.pipes.Submitter 211 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 212 | elif [ "$COMMAND" = "version" ] ; then 213 | CLASS=org.apache.hadoop.util.VersionInfo 214 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 215 | elif [ "$COMMAND" = "jar" ] ; then 216 | CLASS=org.apache.hadoop.mapred.JobShell 217 | elif [ "$COMMAND" = "distcp" ] ; then 218 | CLASS=org.apache.hadoop.tools.DistCp 219 | CLASSPATH=${CLASSPATH}:${TOOL_PATH} 220 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 221 | elif [ "$COMMAND" = "daemonlog" ] ; then 222 | CLASS=org.apache.hadoop.log.LogLevel 223 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 224 | elif [ "$COMMAND" = "archive" ] ; then 225 | CLASS=org.apache.hadoop.tools.HadoopArchives 226 | CLASSPATH=${CLASSPATH}:${TOOL_PATH} 227 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 228 | elif [ "$COMMAND" = "sampler" ] ; then 229 | CLASS=org.apache.hadoop.mapred.lib.InputSampler 230 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 231 | else 232 | CLASS=$COMMAND 233 | fi 234 | 235 | # cygwin path translation 236 | if $cygwin; then 237 | CLASSPATH=`cygpath -p -w "$CLASSPATH"` 238 | HADOOP_HOME=`cygpath -d "$HADOOP_HOME"` 239 | HADOOP_LOG_DIR=`cygpath -d "$HADOOP_LOG_DIR"` 240 | TOOL_PATH=`cygpath -p -w "$TOOL_PATH"` 241 | fi 242 | # setup 'java.library.path' for 
native-hadoop code if necessary 243 | JAVA_LIBRARY_PATH='' 244 | if [ -d "${HADOOP_HOME}/build/native" -o -d "${HADOOP_HOME}/lib/native" ]; then 245 | JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} org.apache.hadoop.util.PlatformName | sed -e "s/ /_/g"` 246 | 247 | if [ -d "$HADOOP_HOME/build/native" ]; then 248 | JAVA_LIBRARY_PATH=${HADOOP_HOME}/build/native/${JAVA_PLATFORM}/lib 249 | fi 250 | 251 | if [ -d "${HADOOP_HOME}/lib/native" ]; then 252 | if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then 253 | JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${HADOOP_HOME}/lib/native/${JAVA_PLATFORM} 254 | else 255 | JAVA_LIBRARY_PATH=${HADOOP_HOME}/lib/native/${JAVA_PLATFORM} 256 | fi 257 | fi 258 | fi 259 | 260 | # cygwin path translation 261 | if $cygwin; then 262 | JAVA_LIBRARY_PATH=`cygpath -p "$JAVA_LIBRARY_PATH"` 263 | fi 264 | 265 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.dir=$HADOOP_LOG_DIR" 266 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.file=$HADOOP_LOGFILE" 267 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.home.dir=$HADOOP_HOME" 268 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.id.str=$HADOOP_IDENT_STRING" 269 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.root.logger=${HADOOP_ROOT_LOGGER:-INFO,console}" 270 | if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then 271 | HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH" 272 | fi 273 | 274 | # run it 275 | #echo exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@" 276 | exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@" 277 | --------------------------------------------------------------------------------
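
The spec above exercises the LogAnalysis mapper and reducer primitives one at a time (separate/pattern, column_name, topic, count_uniq, count, sum, group_by, group_date_by). As a rough illustration only, the sketch below shows how those primitives might be combined in a single DSL script, assuming the same top-level dsl/from/to script conventions used by the other example scripts in this repository. It is not a copy of the shipped examples/log_analysis_test.rb; the input/output paths, topic labels, and column names are placeholders, and whether group_date_by accepts the Apache access-log date format is an assumption.

dsl 'LogAnalysis'

from 'apache/inputs'   # placeholder HDFS paths
to 'apache/outputs'

# one Apache common-log record per input line, split by the regexp groups
pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
column_name 'remote_host', PASS, 'user', 'access_date', 'request', 'status', 'bytes'

# count occurrences of each status code under the "status" topic
topic 'requests by status', :label => 'status' do
  count_uniq column[:status]
end

# group a second topic by the requesting host and count its requests
topic 'requests by host' do
  group_by column[:remote_host]
  count
end

Each topic block emits tab-separated keys beginning with the topic label followed by any group keys and counted values, which is why the reducer model shown earlier splits its key on the label in front of KEY_SEP before dispatching to the matching topic block.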