├── .gitignore ├── VERSION ├── contrib └── hudson │ ├── plugins │ └── hadoop-ruby │ │ ├── .gitignore │ │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ ├── index.jelly │ │ │ └── hudson │ │ │ │ └── plugins │ │ │ │ └── hadoop │ │ │ │ └── ruby │ │ │ │ └── HadoopRuby │ │ │ │ └── config.jelly │ │ │ ├── webapp │ │ │ └── help.html │ │ │ └── java │ │ │ └── hudson │ │ │ └── plugins │ │ │ └── hadoop │ │ │ └── ruby │ │ │ ├── HadoopRuby.java │ │ │ └── ItemListenerImpl.java │ │ └── pom.xml │ ├── conf │ └── hadoop-site.xml │ └── bin │ ├── hadoop-papyrus.sh │ └── hadoop ├── bin └── papyrus ├── examples ├── word_count_test.rb ├── hive_like_test.rb └── log_analysis_test.rb ├── spec ├── spec_helper.rb ├── client_spec.rb ├── dsl_init_spec.rb ├── util_spec.rb ├── hive_like_spec.rb ├── core_spec.rb ├── mapred_factory_spec.rb ├── example_spec.rb ├── word_count_spec.rb └── log_analysis_spec.rb ├── lib ├── hadoop_dsl.rb ├── dsl_init.rb ├── util.rb ├── hadoop_dsl_client.rb ├── word_count.rb ├── mapred_factory.rb ├── core.rb ├── hive_like.rb └── log_analysis.rb ├── conf └── hadoop-site.xml ├── Rakefile ├── README.rdoc └── hadoop-papyrus.gemspec /.gitignore: -------------------------------------------------------------------------------- 1 | pkg 2 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.0.6 2 | -------------------------------------------------------------------------------- /contrib/hudson/plugins/hadoop-ruby/.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | target 3 | work 4 | rubygems 5 | tmp 6 | -------------------------------------------------------------------------------- /bin/papyrus: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'hadoop_dsl_client' 4 | 5 | HadoopDsl::Client.new(ARGV).run 6 | -------------------------------------------------------------------------------- /examples/word_count_test.rb: -------------------------------------------------------------------------------- 1 | dsl 'WordCount' 2 | 3 | from 'wc/inputs' 4 | to 'wc/outputs' 5 | 6 | count_uniq 7 | total :bytes, :words, :lines 8 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # spec helper 2 | require 'rubygems' 3 | gem 'jruby-on-hadoop' 4 | 5 | require 'tempfile' 6 | 7 | def create_tmp_script(body) 8 | tmp = Tempfile.new('test.rb') 9 | tmp.print body 10 | tmp.close 11 | tmp.path 12 | end 13 | 14 | -------------------------------------------------------------------------------- /examples/hive_like_test.rb: -------------------------------------------------------------------------------- 1 | dsl 'HiveLike' 2 | 3 | # hive-like/items.txt 4 | # apple, 3, 100 5 | # banana, 1, 50 6 | 7 | create_table items(item STRING, quantity INT, price INT); 8 | load_data "hive-like/items.txt" items; 9 | 10 | select quantity, price, item from items; 11 | 12 | # expect 13 | # 0 apple 3 300 14 | # 1 banana 1 50 15 | -------------------------------------------------------------------------------- /contrib/hudson/plugins/hadoop-ruby/src/main/resources/index.jelly: -------------------------------------------------------------------------------- 1 | 6 |
7 | This is the Hadoop Ruby plugin. Build scripts written in Hadoop Ruby are executed by this plugin.
8 |
9 | -------------------------------------------------------------------------------- /lib/hadoop_dsl.rb: -------------------------------------------------------------------------------- 1 | require 'util' 2 | require 'mapred_factory' 3 | require 'core' 4 | 5 | # for jruby 6 | if defined? JRUBY_VERSION 7 | require 'java' 8 | import 'org.apache.hadoop.io.IntWritable' 9 | import 'org.apache.hadoop.io.Text' 10 | 11 | # Hadoop IO types 12 | HadoopDsl::Text = Text 13 | HadoopDsl::IntWritable = IntWritable 14 | end 15 | -------------------------------------------------------------------------------- /contrib/hudson/conf/hadoop-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | fs.default.name 10 | hdfs://localhost:9000/ 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /contrib/hudson/plugins/hadoop-ruby/src/main/resources/hudson/plugins/hadoop/ruby/HadoopRuby/config.jelly: -------------------------------------------------------------------------------- 1 | 3 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /contrib/hudson/plugins/hadoop-ruby/src/main/webapp/help.html: -------------------------------------------------------------------------------- 1 |
2 |
3 | Runs a Hadoop Ruby script (using the ruby interpreter by default) to build the project.
4 | The script will be run with the workspace as the current directory.
5 |
6 |
7 |
8 | The shell will be invoked with the "-v" option, so all of the commands are printed before being executed,
9 | and the build is considered a failure if any of the commands exits with a non-zero exit code.
10 |
11 |
12 | -------------------------------------------------------------------------------- /conf/hadoop-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | fs.default.name 9 | hdfs://localhost:9000/ 10 | 11 | 12 | mapred.job.tracker 13 | localhost:50040 14 | 15 | 16 | mapred.child.java.opts 17 | -Xmx512m 18 | 19 | 20 | -------------------------------------------------------------------------------- /contrib/hudson/bin/hadoop-papyrus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CURRENT_DIR=$(cd $(dirname $0); pwd) 4 | PATH=$CURRENT_DIR:$PATH 5 | 6 | GEM_HOME=$CURRENT_DIR/.. 7 | HADOOP_HOME=$HUDSON_HOME/hadoop/dist 8 | HADOOP_CONF_DIR=$CURRENT_DIR/../conf 9 | JRUBY_JAR_DIR=$GEM_HOME/gems/jruby-jars-1.4.0/lib/ 10 | 11 | export PATH GEM_HOME HADOOP_HOME HADOOP_CONF_DIR 12 | 13 | #echo java -classpath $JRUBY_JAR_DIR/jruby-core-1.4.0.jar:$JRUBY_JAR_DIR/jruby-stdlib-1.4.0.jar org.jruby.Main $CURRENT_DIR/papyrus $1 14 | java -classpath $JRUBY_JAR_DIR/jruby-core-1.4.0.jar:$JRUBY_JAR_DIR/jruby-stdlib-1.4.0.jar org.jruby.Main $CURRENT_DIR/papyrus $1 15 | -------------------------------------------------------------------------------- /lib/dsl_init.rb: -------------------------------------------------------------------------------- 1 | require 'hadoop_dsl' 2 | 3 | include HadoopDsl 4 | 5 | def map(key, value, output, reporter, script) 6 | mapper = MapperFactory.create(script, key, value) 7 | mapper.run 8 | 9 | write(output, mapper) 10 | end 11 | 12 | def reduce(key, values, output, reporter, script) 13 | reducer = ReducerFactory.create(script, key, values) 14 | reducer.run 15 | 16 | write(output, reducer) 17 | end 18 | 19 | def setup(conf, script) 20 | setup = SetupFactory.create(script, conf) 21 | setup.run 22 | setup.paths 23 | end 24 | 25 | private 26 | 27 | def write(output, controller) 28 | controller.emitted.each do |e| 29 | e.each do |k, v| 30 | output.collect(k, v) 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /contrib/hudson/plugins/hadoop-ruby/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | org.jvnet.hudson.plugins 5 | plugin 6 | 1.318 7 | ../pom.xml 8 | 9 | 10 | hadoop-ruby 11 | hpi 12 | 1.1-SNAPSHOT 13 | Hudson Hadoop Ruby Plugin 14 | http://wiki.hudson-ci.org/display/HUDSON/Hadoop+Ruby+Plugin 15 | 16 | -------------------------------------------------------------------------------- /lib/util.rb: -------------------------------------------------------------------------------- 1 | # utility functions 2 | require 'hadoop_dsl' 3 | 4 | module HadoopDsl 5 | # file body cache 6 | # reading file in map/reduce cause critical issues! 
7 | @@file_bodies = {} 8 | 9 | def self.snake_case(str) 10 | str.gsub(/\B[A-Z]/, '_\&').downcase 11 | end 12 | 13 | def self.read_file(file_name) 14 | # use if cached 15 | body = @@file_bodies[file_name] if @@file_bodies[file_name] 16 | 17 | # read as usual 18 | body = File.open(file_name).read rescue nil unless body 19 | 20 | # read from loadpath 21 | unless body 22 | $:.each do |path| 23 | body = File.open(File.join(path, file_name)).read rescue next 24 | break 25 | end 26 | end 27 | 28 | raise "cannot find file - #{file_name}" unless body 29 | 30 | # for cache 31 | @@file_bodies[file_name] = body 32 | body 33 | end 34 | 35 | def self.reset_dsl_file 36 | @@file_bodies = {} 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /spec/client_spec.rb: -------------------------------------------------------------------------------- 1 | require File.join(File.dirname(__FILE__), 'spec_helper') 2 | require 'hadoop_dsl_client' 3 | 4 | describe HadoopDsl::Client do 5 | before do 6 | @client = HadoopDsl::Client.new(["examples/wordcount.rb", "in", "out"]) 7 | end 8 | 9 | it 'can parse args' do 10 | @client.files.join.should match /ruby_wrapper\.rb/ 11 | @client.files.join.should match /dsl_init\.rb/ 12 | @client.files.should include 'examples/wordcount.rb' 13 | @client.inputs.should == 'in' 14 | @client.outputs.should == 'out' 15 | end 16 | 17 | it 'can add dsl file into mapred args' do 18 | @client.mapred_args.should == 19 | "--script dsl_init.rb in out --dslfile wordcount.rb" 20 | end 21 | 22 | it 'can add dsl lib files' do 23 | lib_path = HadoopDsl.lib_path 24 | @client.files.should include File.join(lib_path, 'core.rb') 25 | @client.files.should include File.join(lib_path, 'log_analysis.rb') 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /lib/hadoop_dsl_client.rb: -------------------------------------------------------------------------------- 1 | require 'jruby-on-hadoop' 2 | 3 | module HadoopDsl 4 | def self.lib_path 5 | File.expand_path(File.dirname(__FILE__)) 6 | end 7 | 8 | def self.dsl_init_script 9 | File.join(lib_path, "dsl_init.rb") 10 | end 11 | 12 | class Client < JRubyOnHadoop::Client 13 | def parse_args 14 | super 15 | @script_path = HadoopDsl.dsl_init_script 16 | @script = File.basename(@script_path) 17 | @dsl_file_path = @args[0] 18 | @dsl_file = File.basename(@dsl_file_path) 19 | @files << @script_path << @dsl_file_path 20 | 21 | # TODO move properly, with jruby-on-hadoop 22 | add_dsl_lib_files 23 | ENV['RUBYLIB'] = File.dirname(@dsl_file_path) 24 | end 25 | 26 | def mapred_args 27 | args = super 28 | args += " --dslfile #{@dsl_file}" 29 | args 30 | end 31 | 32 | def add_dsl_lib_files 33 | lib_path = HadoopDsl.lib_path 34 | @files += Dir.glob(File.join(lib_path, "*.rb")) 35 | end 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # spec 2 | require 'rubygems' 3 | require 'spec/rake/spectask' 4 | 5 | Spec::Rake::SpecTask.new do |t| 6 | def hadoop_core_jar 7 | hadoop_home = ENV['HADOOP_HOME'] 8 | Dir.glob("#{hadoop_home}/hadoop-*-core.jar").first 9 | end 10 | 11 | t.libs = ['lib'] 12 | t.spec_opts = ['-c', '-fs', "-r #{hadoop_core_jar}"] 13 | t.spec_files = FileList['spec/**/*_spec.rb'] 14 | end 15 | 16 | # jeweler 17 | begin 18 | require 'jeweler' 19 | Jeweler::Tasks.new do |gemspec| 20 | gemspec.name = "hadoop-papyrus" 21 | 
gemspec.summary = "Hadoop papyrus" 22 | gemspec.description = "Hadoop papyrus - Ruby DSL for Hadoop" 23 | gemspec.email = "fujibee@gmail.com" 24 | gemspec.homepage = "http://github.com/fujibee/hadoop-papyrus" 25 | gemspec.authors = ["Koichi Fujikawa"] 26 | 27 | gemspec.add_dependency 'jruby-on-hadoop' 28 | gemspec.files.exclude "spec/**/*" 29 | end 30 | Jeweler::GemcutterTasks.new 31 | rescue LoadError 32 | puts "Jeweler not available. Install it with: gem install jeweler" 33 | end 34 | 35 | -------------------------------------------------------------------------------- /spec/dsl_init_spec.rb: -------------------------------------------------------------------------------- 1 | require 'dsl_init' 2 | 3 | describe 'mapreduce init' do 4 | 5 | before(:each) do 6 | @script = create_tmp_script(<<-EOF) 7 | dsl 'LogAnalysis' 8 | data 'test' do 9 | from 'test/inputs' 10 | to 'test/outputs' 11 | 12 | separate(" ") 13 | column_name 'c0', 'c1', 'c2', 'c3' 14 | topic 't1' do 15 | count_uniq columns(:c1) 16 | end 17 | end 18 | EOF 19 | end 20 | 21 | before do 22 | @one = 1 23 | @output = mock('output') 24 | end 25 | 26 | it 'can map sucessfully' do 27 | key = 'key' 28 | value = 'it should be fine' 29 | @output.should_receive(:collect).once #.with(@text, @one) 30 | 31 | map(key, value, @output, nil, @script) 32 | end 33 | 34 | it 'can reduce sucessfully' do 35 | key = "t1\tkey" 36 | values = [@one, @one, @one] 37 | @output.should_receive(:collect).once #.with(@text, @one) 38 | 39 | reduce(key, values, @output, nil, @script) 40 | end 41 | 42 | it 'can set job conf' do 43 | conf = mock('jobconf') 44 | paths = setup(conf, @script) 45 | 46 | paths[0].should == 'test/inputs' 47 | paths[1].should == 'test/outputs' 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /spec/util_spec.rb: -------------------------------------------------------------------------------- 1 | require File.join(File.dirname(__FILE__) , 'spec_helper') 2 | require 'util' 3 | 4 | describe 'utilities' do 5 | before do 6 | HadoopDsl.reset_dsl_file 7 | @script_body = 'This is a script body.' 
8 | @script = create_tmp_script(@script_body) 9 | end 10 | 11 | it 'can change camelcase str to snakecase' do 12 | HadoopDsl.snake_case('CamelCaseStr').should == 'camel_case_str' 13 | end 14 | 15 | it 'can read file and get file data to string' do 16 | HadoopDsl.read_file(@script).should == @script_body 17 | end 18 | 19 | it 'raise error if no file in loadpath' do 20 | lambda { HadoopDsl.read_file('not_exists_on_loadpath') }.should raise_error 21 | end 22 | 23 | it 'can load from cache if script is loaded' do 24 | HadoopDsl.read_file(@script).should == @script_body 25 | File.delete(@script) 26 | HadoopDsl.read_file(@script).should == @script_body 27 | end 28 | 29 | it 'can load from each cache even if one script is loaded' do 30 | HadoopDsl.read_file(@script).should == @script_body 31 | another_script = create_tmp_script("another") 32 | HadoopDsl.read_file(another_script).should == "another" 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /examples/log_analysis_test.rb: -------------------------------------------------------------------------------- 1 | dsl 'LogAnalysis' 2 | 3 | data 'apache log on test2' do 4 | from 'apachelog/inputs' 5 | to 'apachelog/outputs' 6 | 7 | # 119.63.199.8 - - [15/Nov/2009:01:18:16 +0900] "GET /ranking/game?page=31 HTTP/1.1" 200 10077 "-" "Baiduspider+(+http://www.baidu.jp/spider/)" 8 | # 203.83.243.81 - - [15/Nov/2009:01:18:33 +0900] "GET /dns_zones.txt HTTP/1.1" 404 294 "-" "libwww-perl/5.65" 9 | 10 | each_line do 11 | pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/ 12 | column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua' 13 | 14 | topic 'ua counts', :label => 'ua' do 15 | count_uniq column[:ua] 16 | end 17 | 18 | topic 'count bot', :label => 'bot' do 19 | ua = column[:ua].value 20 | bot = ua if ua =~ /bot/i 21 | count_uniq bot 22 | end 23 | 24 | topic 'ua counts group by path' do 25 | request = column[:request].value 26 | if request 27 | path = request.split(/\s+/)[1] 28 | group_by path 29 | end 30 | count_uniq column[:ua] 31 | end 32 | 33 | topic 'ua counts by daily' do 34 | group_date_by column[:access_date], :daily 35 | count_uniq column[:ua] 36 | end 37 | 38 | # topic 'total bytes' do 39 | # select_date column[:access_date], BY_MONTHLY 40 | # sum column[:bytes].to_kilobytes # / 1024 41 | # end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /README.rdoc: -------------------------------------------------------------------------------- 1 | = hadoop-papyrus 2 | 3 | Enable to run Ruby DSL script on your Hadoop. 4 | 5 | == Description 6 | 7 | You can write DSL by Ruby to run Hadoop as Mapper / Reducer. 8 | This gem depends on 'jruby-on-hadoop' project. 9 | 10 | == Install 11 | 12 | Required gems are all on GemCutter. 13 | 14 | 1. Upgrade your rubygem to 1.3.5 15 | 2. Install gems 16 | $ gem install hadoop-papyrus 17 | 18 | == Usage 19 | 20 | 1. Run Hadoop cluster on your machines and put your 'hadoop' executable to your PATH or set HADOOP_HOME env variable. 21 | 2. put files into your hdfs. ex) wc/inputs/file1 22 | 3. 
Now you can run 'papyrus' like below: 23 | $ papyrus examples/word_count_test.rb 24 | You can get Hadoop job results in your hdfs wc/outputs/part-* 25 | 26 | == Examples 27 | 28 | Word Count DSL script 29 | dsl 'WordCount' 30 | 31 | from 'wc/inputs' 32 | to 'wc/outputs' 33 | 34 | count_uniq 35 | total :bytes, :words, :lines 36 | 37 | Log Analysis DSL script 38 | dsl 'LogAnalysis' 39 | 40 | data 'apache log on test2' do 41 | from 'apachelog/inputs' 42 | to 'apachelog/outputs' 43 | 44 | each_line do 45 | pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/ 46 | column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua' 47 | 48 | topic 'ua counts', :label => 'ua' do 49 | count_uniq column[:ua] 50 | end 51 | end 52 | end 53 | 54 | == Run spec 55 | Set HADOOP_HOME on your env and run 'jruby -S rake spec' 56 | 57 | == Author 58 | Koichi Fujikawa 59 | 60 | == Copyright 61 | License: Apache License 62 | -------------------------------------------------------------------------------- /lib/word_count.rb: -------------------------------------------------------------------------------- 1 | require 'hadoop_dsl' 2 | require 'enumerator' 3 | 4 | module HadoopDsl::WordCount 5 | MODEL_METHODS = [] 6 | TOTAL_PREFIX = "\t" 7 | 8 | # controller 9 | class WordCountMapper < HadoopDsl::BaseMapper 10 | def initialize(script, key, value) 11 | super(script, WordCountMapperModel.new(key, value)) 12 | end 13 | 14 | # model methods 15 | def_delegators :@model, *MODEL_METHODS 16 | 17 | # emitters 18 | def count_uniq 19 | @model.value.split.each {|word| emit(word => 1)} 20 | end 21 | 22 | def total(*types) 23 | types.each do |type| 24 | case type 25 | when :bytes 26 | emit("#{TOTAL_PREFIX}total bytes" => @model.value.gsub(/\s/, '').length) 27 | when :words 28 | emit("#{TOTAL_PREFIX}total words" => @model.value.split.size) 29 | when :lines 30 | emit("#{TOTAL_PREFIX}total lines" => 1) 31 | end 32 | end 33 | end 34 | end 35 | 36 | class WordCountReducer < HadoopDsl::BaseReducer 37 | def initialize(script, key, values) 38 | super(script, WordCountReducerModel.new(key, values)) 39 | end 40 | 41 | # model methods 42 | def_delegators :@model, *MODEL_METHODS 43 | 44 | # emitters 45 | def count_uniq; aggregate unless @model.total_value? end 46 | def total(*types); aggregate if @model.total_value? 
end 47 | end 48 | 49 | # model 50 | class WordCountMapperModel < HadoopDsl::BaseMapperModel 51 | end 52 | 53 | class WordCountReducerModel < HadoopDsl::BaseReducerModel 54 | def total_value?; @key =~ /^#{TOTAL_PREFIX}/ end 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /lib/mapred_factory.rb: -------------------------------------------------------------------------------- 1 | require 'hadoop_dsl' 2 | 3 | module HadoopDsl 4 | class MapRedFactory 5 | def self.dsl_name(script) 6 | HadoopDsl.read_file(script).each_line do |line| 7 | dsl_name = $1 if line =~ /\s*dsl\s*\(?["'](\w*)["']\)?/ 8 | return dsl_name if dsl_name 9 | end 10 | end 11 | 12 | def self.require_dsl_lib(dsl_name) 13 | require HadoopDsl.snake_case(dsl_name) 14 | end 15 | end 16 | 17 | class MapperFactory < MapRedFactory 18 | # for cache in map loop 19 | @@mapper_class = nil 20 | def self.create(script, key, value) 21 | # once decide in map loop 22 | unless @@mapper_class 23 | dsl_name = self.dsl_name(script) 24 | require_dsl_lib(dsl_name) 25 | @@mapper_class = eval("HadoopDsl::#{dsl_name}::#{dsl_name}Mapper") 26 | end 27 | 28 | @@mapper_class.new(script, key, value) 29 | end 30 | end 31 | 32 | class ReducerFactory < MapRedFactory 33 | @@reducer_class = nil 34 | def self.create(script, key, values) 35 | # once decide in reduce loop 36 | unless @@reducer_class 37 | dsl_name = self.dsl_name(script) 38 | require_dsl_lib(dsl_name) 39 | @@reducer_class = eval("HadoopDsl::#{dsl_name}::#{dsl_name}Reducer") 40 | end 41 | 42 | @@reducer_class.new(script, key, values) 43 | end 44 | end 45 | 46 | class SetupFactory < MapRedFactory 47 | def self.create(script, conf) 48 | dsl_name = self.dsl_name(script) 49 | require_dsl_lib(dsl_name) 50 | setup_class = "HadoopDsl::#{dsl_name}::#{dsl_name}Setup" 51 | eval(setup_class).new(script, conf) rescue HadoopDsl::BaseSetup.new(script, conf) 52 | end 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /spec/hive_like_spec.rb: -------------------------------------------------------------------------------- 1 | require File.join(File.dirname(__FILE__), 'spec_helper') 2 | require 'hive_like' 3 | 4 | include HadoopDsl::HiveLike 5 | 6 | describe HiveLikeSetup do 7 | it 'should load data' do 8 | script = create_tmp_script(%Q!load_data "hive-like/inputs", items;!) 
9 | conf = mock('conf') 10 | conf.should_receive(:output_key_class=).once 11 | conf.should_receive(:output_value_class=).once 12 | 13 | setup = HiveLikeSetup.new(script, conf) 14 | setup.run 15 | setup.paths[0].should == 'hive-like/inputs' 16 | setup.paths[1].should == 'hive-like/outputs' 17 | end 18 | end 19 | 20 | describe HiveLikeMapper do 21 | before do 22 | @value = 'apple, 3, 100' 23 | end 24 | 25 | it 'should create table' do 26 | mapper = HiveLikeMapper.new(nil, nil, @value) 27 | mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT'); 28 | mapper.table.name.should == 'items' 29 | mapper.table.column(0).should == 'item' 30 | mapper.table.column(1).should == 'quantity' 31 | end 32 | 33 | it 'should select' do 34 | mapper = HiveLikeMapper.new(nil, nil, @value) 35 | mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT'); 36 | mapper.select("item", "quantity", "price", "from", "items") 37 | mapper.emitted.first.should == {'items' => 'apple, 3, 100'} 38 | end 39 | 40 | it 'should pre process script body' do 41 | body = "select foo, bar from table;\n" 42 | mapper = HiveLikeMapper.new(nil, nil, @value) 43 | processed = mapper.pre_process(body) 44 | processed.should == %Q!select("foo", "bar", "from", "table")\n! 45 | end 46 | end 47 | 48 | describe HiveLikeReducer do 49 | it 'should select as identity' do 50 | key = 'Lorem' 51 | values = [1, 1, 1] 52 | reducer = HiveLikeReducer.new(nil, key, values) 53 | 54 | reducer.select 55 | reducer.emitted[0].should == {'Lorem' => 1} 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /spec/core_spec.rb: -------------------------------------------------------------------------------- 1 | require File.join(File.dirname(__FILE__), 'spec_helper') 2 | require 'core' 3 | 4 | include HadoopDsl 5 | 6 | describe 'BaseMapRed' do 7 | before(:all) do 8 | @script = create_tmp_script(<<-EOF) 9 | from 'test/inputs' 10 | to 'test/outputs' 11 | EOF 12 | end 13 | 14 | it 'emit key value' do 15 | mapper = BaseMapper.new(@script, BaseMapperModel.new(nil, nil)) 16 | mapper.emit('key' => 'value') 17 | mapper.emitted.should == [{'key' => 'value'}] 18 | end 19 | 20 | it 'can run BaseMapper in minimum' do 21 | model = BaseMapperModel.new('key', 'value') 22 | mapper = BaseMapper.new(@script, model) 23 | mapper.run 24 | end 25 | 26 | it 'can run BaseReducer in minimum' do 27 | model = BaseReducerModel.new('key', 'values') 28 | reducer = BaseReducer.new(@script, model) 29 | reducer.run 30 | end 31 | 32 | it 'can run BaseSetup in minimum' do 33 | setup = BaseSetup.new(@script, nil) 34 | setup.run 35 | end 36 | 37 | describe BaseMapper do 38 | it 'can emit as identity' do 39 | model = BaseMapperModel.new('key', 'value') 40 | mapper = BaseMapper.new(@script, model) 41 | mapper.identity 42 | 43 | mapper.emitted.should == [{'key' => 'value'}] 44 | end 45 | end 46 | 47 | describe BaseReducer do 48 | it 'can emit as aggregate' do 49 | model = BaseReducerModel.new('key', [1, 2, 3]) 50 | reducer = BaseReducer.new(@script, model) 51 | reducer.aggregate 52 | 53 | reducer.emitted.should == [{'key' => 6}] 54 | end 55 | 56 | it 'can emit as identity' do 57 | model = BaseReducerModel.new('key', [1, 2, 3]) 58 | reducer = BaseReducer.new(@script, model) 59 | reducer.identity 60 | 61 | reducer.emitted.should == [{'key' => 1}, {'key' => 2}, {'key' => 3}] 62 | end 63 | end 64 | 65 | describe BaseSetup do 66 | it 'can get paths' do 67 | setup = BaseSetup.new(@script, nil) 68 | setup.run 69 | 
setup.paths[0].should == 'test/inputs' 70 | setup.paths[1].should == 'test/outputs' 71 | end 72 | end 73 | end 74 | -------------------------------------------------------------------------------- /contrib/hudson/plugins/hadoop-ruby/src/main/java/hudson/plugins/hadoop/ruby/HadoopRuby.java: -------------------------------------------------------------------------------- 1 | package hudson.plugins.hadoop.ruby; 2 | 3 | import hudson.Extension; 4 | import hudson.FilePath; 5 | import hudson.model.AbstractProject; 6 | import hudson.model.Descriptor; 7 | import hudson.model.Hudson; 8 | import hudson.tasks.BuildStepDescriptor; 9 | import hudson.tasks.Builder; 10 | import hudson.tasks.CommandInterpreter; 11 | 12 | import java.io.File; 13 | 14 | import net.sf.json.JSONObject; 15 | 16 | import org.kohsuke.stapler.StaplerRequest; 17 | 18 | /** 19 | * Invokes the hadoop ruby interpreter and invokes the Hadoop Ruby script 20 | * entered on the hudson build configuration. 21 | *

22 | * It is expected that the hadoop ruby interpreter is available on the system 23 | * PATH. 24 | * 25 | * @author Koichi Fujikawa 26 | */ 27 | public class HadoopRuby extends CommandInterpreter { 28 | 29 | private HadoopRuby(String command) { 30 | super(command); 31 | } 32 | 33 | protected String[] buildCommandLine(FilePath script) { 34 | File rootDir = Hudson.getInstance().getRootDir(); 35 | String cmd = rootDir.toString() 36 | + "/hadoop-ruby/bin/hadoop-papyrus.sh"; 37 | return new String[] { cmd, script.getRemote() }; 38 | } 39 | 40 | protected String getContents() { 41 | return command; 42 | } 43 | 44 | protected String getFileExtension() { 45 | return ".rb"; 46 | } 47 | 48 | @Override 49 | public Descriptor getDescriptor() { 50 | return DESCRIPTOR; 51 | } 52 | 53 | @Extension 54 | public static final DescriptorImpl DESCRIPTOR = new DescriptorImpl(); 55 | 56 | public static final class DescriptorImpl extends 57 | BuildStepDescriptor { 58 | private DescriptorImpl() { 59 | super(HadoopRuby.class); 60 | } 61 | 62 | @Override 63 | public Builder newInstance(StaplerRequest req, JSONObject formData) { 64 | return new HadoopRuby(formData.getString("hadoop-ruby")); 65 | } 66 | 67 | public String getDisplayName() { 68 | return "Execute Hadoop Ruby script"; 69 | } 70 | 71 | @Override 72 | public String getHelpFile() { 73 | return "/plugin/hadoop-ruby/help.html"; 74 | } 75 | 76 | @Override 77 | public boolean isApplicable(Class jobType) { 78 | return true; 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /lib/core.rb: -------------------------------------------------------------------------------- 1 | require 'hadoop_dsl' 2 | require 'forwardable' 3 | 4 | module HadoopDsl 5 | # common 6 | module DslElement 7 | # all DSL statements without def is processed here 8 | def method_missing(name, *args) 9 | # if block given, labeled for non-local exit 10 | catch name do; yield end if block_given? 
11 | self 12 | end 13 | end 14 | 15 | # controller 16 | module DslController 17 | include DslElement 18 | 19 | def run 20 | body = pre_process(HadoopDsl.read_file(@script)) 21 | eval(body, binding, @script) 22 | end 23 | 24 | def pre_process(body) 25 | body # do nothing 26 | end 27 | end 28 | 29 | class BaseMapRed 30 | extend Forwardable 31 | include DslController 32 | 33 | attr_reader :emitted 34 | 35 | def initialize(script, model) 36 | @script, @model = script, model 37 | @model.controller = self 38 | @emitted = [] 39 | end 40 | 41 | def emit(hash) @emitted << hash end 42 | 43 | private 44 | def key; @model.key end 45 | end 46 | 47 | class BaseSetup 48 | include DslController 49 | 50 | def initialize(script, conf) 51 | @script, @conf = script, conf 52 | output_format 53 | end 54 | 55 | def output_format; end # do nothing 56 | def paths; [@from, @to] end 57 | def from(path) @from = path end 58 | def to(path) @to = path end 59 | end 60 | 61 | class BaseMapper < BaseMapRed 62 | # common functions 63 | def identity 64 | emit(@model.key => @model.value) 65 | end 66 | 67 | private 68 | def value; @model.values end 69 | end 70 | 71 | class BaseReducer < BaseMapRed 72 | # common functions 73 | def aggregate 74 | emit(@model.key => @model.values.inject {|ret, i| ret + i}) 75 | end 76 | 77 | def identity 78 | @model.values.each {|v| emit(@model.key => v)} 79 | end 80 | 81 | private 82 | def values; @model.values end 83 | end 84 | 85 | # model 86 | class BaseModel 87 | include DslElement 88 | attr_accessor :controller 89 | end 90 | 91 | class BaseMapperModel < BaseModel 92 | attr_reader :key, :value 93 | 94 | def initialize(key, value) 95 | @key, @value = key, value 96 | end 97 | end 98 | 99 | class BaseReducerModel < BaseModel 100 | attr_reader :key, :values 101 | 102 | def initialize(key, values) 103 | @key, @values = key, values 104 | end 105 | end 106 | end 107 | -------------------------------------------------------------------------------- /contrib/hudson/plugins/hadoop-ruby/src/main/java/hudson/plugins/hadoop/ruby/ItemListenerImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | * The MIT License 3 | * 4 | * Copyright (c) 2004-2009, Sun Microsystems, Inc. 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 
23 | */ 24 | package hudson.plugins.hadoop.ruby; 25 | 26 | import hudson.Extension; 27 | import hudson.FilePath; 28 | import hudson.model.Hudson; 29 | import hudson.model.listeners.ItemListener; 30 | import hudson.util.StreamTaskListener; 31 | 32 | import java.io.File; 33 | import java.util.logging.Level; 34 | import java.util.logging.Logger; 35 | 36 | /** 37 | * Install Hadoop Ruby DSL 38 | * 39 | * @author Koichi Fujikawa 40 | */ 41 | @Extension 42 | public class ItemListenerImpl extends ItemListener { 43 | 44 | @Override 45 | public void onLoaded() { 46 | try { 47 | LOGGER.log(Level.INFO, "install start for Hadoop Ruby"); 48 | StreamTaskListener listener = new StreamTaskListener(System.out); 49 | File rootDir = Hudson.getInstance().getRootDir(); 50 | rootDir = new File(rootDir, "hadoop-ruby"); 51 | FilePath distDir = new FilePath(rootDir); 52 | distDir.installIfNecessaryFrom(ItemListenerImpl.class 53 | .getResource("hadoop-ruby.tgz"), listener, "Hadoop Ruby"); 54 | LOGGER.log(Level.INFO, "install finished for Hadoop Ruby"); 55 | 56 | } catch (Exception e) { 57 | LOGGER.log(Level.WARNING, "Failed to install Hadoop Ruby", e); 58 | } 59 | } 60 | 61 | private static final Logger LOGGER = Logger 62 | .getLogger(ItemListenerImpl.class.getName()); 63 | } 64 | -------------------------------------------------------------------------------- /spec/mapred_factory_spec.rb: -------------------------------------------------------------------------------- 1 | require File.join(File.dirname(__FILE__) , 'spec_helper') 2 | require 'mapred_factory' 3 | 4 | include HadoopDsl 5 | 6 | describe 'MapRed Factory' do 7 | before(:each) do 8 | @script = create_tmp_script("dsl 'LogAnalysis'") 9 | end 10 | 11 | it 'can create mapper' do 12 | mapper = MapperFactory.create(@script, nil, nil) 13 | mapper.class.should == LogAnalysis::LogAnalysisMapper 14 | end 15 | 16 | it 'can create reducer' do 17 | reducer = ReducerFactory.create(@script, nil, nil) 18 | reducer.class.should == LogAnalysis::LogAnalysisReducer 19 | end 20 | 21 | it 'can create setup' do 22 | conf = mock('conf') 23 | conf.should_receive(:output_key_class=).once 24 | conf.should_receive(:output_value_class=).once 25 | s = SetupFactory.create(create_tmp_script("dsl 'HiveLike'"), conf) 26 | s.class.should == HiveLike::HiveLikeSetup 27 | end 28 | 29 | it 'can create base if not exists in specific DSL' do 30 | s = SetupFactory.create(create_tmp_script("dsl 'WordCount'"), nil) 31 | s.class.should == BaseSetup 32 | end 33 | 34 | it 'specify dsl name from script' do 35 | dsl_name = MapRedFactory.dsl_name(@script) 36 | dsl_name.should == 'LogAnalysis' 37 | end 38 | 39 | it 'can convert dsl name to dsl lib file and require' do 40 | dsl_name = MapRedFactory.dsl_name(@script) 41 | MapRedFactory.require_dsl_lib(dsl_name).should_not be_nil 42 | LogAnalysis::LogAnalysisMapper 43 | end 44 | 45 | it 'can create mapper if statement has double quote' do 46 | script = create_tmp_script(%Q!dsl "LogAnalysis"!) 47 | mapper = MapperFactory.create(script, nil, nil) 48 | mapper.class.should == LogAnalysis::LogAnalysisMapper 49 | end 50 | 51 | it 'can create mapper if exists more space' do 52 | script = create_tmp_script(%Q! dsl "LogAnalysis" !) 53 | mapper = MapperFactory.create(script, nil, nil) 54 | mapper.class.should == LogAnalysis::LogAnalysisMapper 55 | end 56 | 57 | it 'can create mapper if exists bracket' do 58 | script = create_tmp_script(%Q! dsl ("LogAnalysis") !) 
59 | mapper = MapperFactory.create(script, nil, nil) 60 | mapper.class.should == LogAnalysis::LogAnalysisMapper 61 | end 62 | 63 | it 'can create mapper from class name cache' do 64 | mapper = MapperFactory.create(@script, nil, nil) 65 | mapper2 = MapperFactory.create(@script, nil, nil) 66 | mapper.class.should == mapper2.class 67 | end 68 | 69 | it 'can create reducer from class name cache' do 70 | reducer = ReducerFactory.create(@script, nil, nil) 71 | reducer2 = ReducerFactory.create(@script, nil, nil) 72 | reducer.class.should == reducer2.class 73 | end 74 | end 75 | -------------------------------------------------------------------------------- /spec/example_spec.rb: -------------------------------------------------------------------------------- 1 | require 'log_analysis' 2 | require 'word_count' 3 | require 'hive_like' 4 | 5 | include HadoopDsl::LogAnalysis 6 | describe 'Aapach Log Example' do 7 | before(:all) do 8 | @script = File.join(File.dirname(__FILE__), '..', 'examples', 'log_analysis_test.rb') 9 | @bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" 10 | @value = %Q!127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "-" "#{@bot_ua}"! 11 | end 12 | 13 | it 'can run example by mapper' do 14 | mapper = LogAnalysisMapper.new(@script, nil, @value) 15 | mapper.run 16 | mapper.emitted.first.should == {"ua\t#{@bot_ua}" => 1} 17 | end 18 | 19 | it 'can run example by reducer' do 20 | reducer = LogAnalysisReducer.new(@script, "ua\tChrome", [1, 1, 1]) 21 | reducer.run 22 | reducer.emitted.first["ua\tChrome"].should == 3 23 | end 24 | end 25 | 26 | include HadoopDsl::WordCount 27 | describe 'Word Count Example' do 28 | before(:all) do 29 | @script = File.join(File.dirname(__FILE__), '..', 'examples', 'word_count_test.rb') 30 | @value = 'Lorem ipsum ipsum Lorem sit amet,' 31 | end 32 | 33 | it 'can run example by mapper' do 34 | mapper = WordCountMapper.new(@script, nil, @value) 35 | mapper.run 36 | mapper.emitted.size.should == 9 37 | mapper.emitted.each do |e| 38 | case e.keys.first 39 | when 'Lorem' 40 | e.values.first.should == 1 41 | when 'total words' 42 | e.values.first.should == 6 43 | end 44 | end 45 | end 46 | 47 | it 'can run example by reducer' do 48 | reducer = WordCountReducer.new(@script, "Lorem", [1, 1, 1]) 49 | reducer.run 50 | reducer.emitted.first["Lorem"].should == 3 51 | end 52 | end 53 | 54 | include HadoopDsl::HiveLike 55 | describe 'Hive Like Example' do 56 | before(:all) do 57 | @script = File.join(File.dirname(__FILE__), '..', 'examples', 'hive_like_test.rb') 58 | @value = 'apple, 3, 100' 59 | end 60 | 61 | it 'can run setup' do 62 | conf = mock('conf') 63 | conf.should_receive(:output_key_class=).once 64 | conf.should_receive(:output_value_class=).once 65 | 66 | setup = HiveLikeSetup.new(@script, conf) 67 | setup.run 68 | setup.paths[0].should == 'hive-like/items.txt' 69 | end 70 | 71 | it 'can run example by mapper' do 72 | mapper = HiveLikeMapper.new(@script, nil, @value) 73 | mapper.run 74 | mapper.emitted.size.should == 1 75 | mapper.emitted.first['items'].should == '3, 100, apple' 76 | end 77 | 78 | it 'can run example by reducer' do 79 | values = ['v1', 'v2', 'v3'] 80 | reducer = HiveLikeReducer.new(@script, "items", values) 81 | reducer.run 82 | reducer.emitted.first["items"].should == 'v1' 83 | end 84 | end 85 | -------------------------------------------------------------------------------- /spec/word_count_spec.rb: 
-------------------------------------------------------------------------------- 1 | require File.join(File.dirname(__FILE__), 'spec_helper') 2 | require 'word_count' 3 | 4 | include HadoopDsl::WordCount 5 | 6 | describe WordCountMapper do 7 | it 'should count uniq' do 8 | value = 'Lorem ipsum Lorem sit amet,' 9 | mapper = WordCountMapper.new(nil, nil, value) 10 | 11 | mapper.count_uniq 12 | mapper.emitted[0].should == {'Lorem' => 1} 13 | mapper.emitted[1].should == {'ipsum' => 1} 14 | mapper.emitted[2].should == {'Lorem' => 1} 15 | end 16 | 17 | it 'should count total bytes' do 18 | value = 'Lorem ipsum Lorem sit amet,' 19 | mapper = WordCountMapper.new(nil, nil, value) 20 | 21 | mapper.total :bytes 22 | mapper.emitted[0].should == {"#{TOTAL_PREFIX}total bytes" => 23} 23 | end 24 | 25 | it 'should count total words' do 26 | value = 'Lorem ipsum Lorem sit amet,' 27 | mapper = WordCountMapper.new(nil, nil, value) 28 | 29 | mapper.total :words 30 | mapper.emitted[0].should == {"#{TOTAL_PREFIX}total words" => 5} 31 | end 32 | 33 | it 'should count total lines' do 34 | value = 'Lorem ipsum Lorem sit amet,' 35 | mapper = WordCountMapper.new(nil, nil, value) 36 | 37 | mapper.total :lines 38 | mapper.emitted[0].should == {"#{TOTAL_PREFIX}total lines" => 1} 39 | end 40 | 41 | it 'should count total bytes, words, lines' do 42 | value = 'Lorem ipsum Lorem sit amet,' 43 | mapper = WordCountMapper.new(nil, nil, value) 44 | 45 | mapper.total :bytes, :words, :lines 46 | mapper.emitted[0].should == {"#{TOTAL_PREFIX}total bytes" => 23} 47 | mapper.emitted[1].should == {"#{TOTAL_PREFIX}total words" => 5} 48 | mapper.emitted[2].should == {"#{TOTAL_PREFIX}total lines" => 1} 49 | end 50 | end 51 | 52 | describe WordCountReducer do 53 | it 'should count uniq' do 54 | key = 'Lorem' 55 | values = [1, 1, 1] 56 | reducer = WordCountReducer.new(nil, key, values) 57 | 58 | reducer.count_uniq 59 | reducer.emitted[0].should == {'Lorem' => 3} 60 | end 61 | 62 | it 'should count total bytes' do 63 | key = "#{TOTAL_PREFIX}total bytes" 64 | values = [12, 23, 45] 65 | reducer = WordCountReducer.new(nil, key, values) 66 | 67 | reducer.total :bytes 68 | reducer.emitted[0].should == {"#{TOTAL_PREFIX}total bytes" => 12 + 23 + 45} 69 | end 70 | 71 | it 'should count total words' do 72 | key = "#{TOTAL_PREFIX}total words" 73 | values = [3, 4, 5] 74 | reducer = WordCountReducer.new(nil, key, values) 75 | 76 | reducer.total :words 77 | reducer.emitted[0].should == {"#{TOTAL_PREFIX}total words" => 3 + 4 + 5} 78 | end 79 | 80 | it 'should count total lines' do 81 | key = "#{TOTAL_PREFIX}total lines" 82 | values = [1, 2, 3] 83 | reducer = WordCountReducer.new(nil, key, values) 84 | 85 | reducer.total :lines 86 | reducer.emitted[0].should == {"#{TOTAL_PREFIX}total lines" => 6} 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /lib/hive_like.rb: -------------------------------------------------------------------------------- 1 | require 'hadoop_dsl' 2 | 3 | module HadoopDsl::HiveLike 4 | # common 5 | module HiveLikeMapRed 6 | def pre_process(body) 7 | processed = "" 8 | body.each do |line| 9 | next if line =~ /^#/ 10 | if line =~ /^(\w*)\s+(.*);$/ 11 | method = $1 12 | args = sprit_and_marge_args($2) 13 | processed << "#{method}(#{args})\n" 14 | else 15 | processed << line + "\n" if line 16 | end 17 | end 18 | processed 19 | end 20 | 21 | def sprit_and_marge_args(raw) 22 | raw.gsub(/[\(\)]/, ' ').split.map do |s| 23 | stripped = s.gsub(/[\s,"']/, '') 24 | %Q!"#{stripped}"! 
25 | end.join(", ") 26 | end 27 | end 28 | 29 | # controller 30 | class HiveLikeSetup < HadoopDsl::BaseSetup 31 | def load_data(inputs, table) 32 | @from = inputs 33 | @to = inputs.gsub(/#{File.basename(inputs)}$/, 'outputs') 34 | end 35 | 36 | def output_format 37 | @conf.output_key_class = HadoopDsl::Text 38 | @conf.output_value_class = HadoopDsl::Text 39 | end 40 | 41 | # might not need but occur error if not exists 42 | def select(*args) end 43 | 44 | include HiveLikeMapRed 45 | end 46 | 47 | class HiveLikeMapper < HadoopDsl::BaseMapper 48 | def initialize(script, key, value) 49 | super(script, HiveLikeMapperModel.new(key, value)) 50 | end 51 | 52 | include HiveLikeMapRed 53 | 54 | def_delegators :@model, :create_table, :table 55 | 56 | # emitters 57 | def select(*args) 58 | from_index = args.index('from') 59 | if from_index 60 | values = args[0...from_index].map do |column| 61 | splitted = @model.value.split(/[,\s]+/) 62 | splitted[@model.table.columns.index(column)] 63 | end 64 | emit(args[from_index + 1] => values.join(", ")) 65 | end 66 | end 67 | end 68 | 69 | class HiveLikeReducer < HadoopDsl::BaseReducer 70 | def initialize(script, key, values) 71 | super(script, HiveLikeReducerModel.new(key, values)) 72 | end 73 | 74 | include HiveLikeMapRed 75 | 76 | # emitters 77 | def select(*args) identity end 78 | end 79 | 80 | # model 81 | class HiveLikeMapperModel < HadoopDsl::BaseMapperModel 82 | attr_reader :table 83 | 84 | def create_table(name, *column_and_type) 85 | @table = Table.new(name) 86 | column_and_type.each_with_index do |column, index| 87 | next if index % 2 != 0 # type 88 | @table.columns << column_and_type[index] 89 | end 90 | end 91 | 92 | class Table 93 | attr_reader :name, :columns 94 | 95 | def initialize(name) 96 | @name = name 97 | @columns = [] 98 | end 99 | 100 | def column(index) @columns[index] end 101 | end 102 | end 103 | 104 | class HiveLikeReducerModel < HadoopDsl::BaseReducerModel 105 | end 106 | end 107 | -------------------------------------------------------------------------------- /hadoop-papyrus.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command 4 | # -*- encoding: utf-8 -*- 5 | 6 | Gem::Specification.new do |s| 7 | s.name = %q{hadoop-papyrus} 8 | s.version = "0.0.6" 9 | 10 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? 
:required_rubygems_version= 11 | s.authors = ["Koichi Fujikawa"] 12 | s.date = %q{2010-02-09} 13 | s.default_executable = %q{papyrus} 14 | s.description = %q{Hadoop papyrus - Ruby DSL for Hadoop} 15 | s.email = %q{fujibee@gmail.com} 16 | s.executables = ["papyrus"] 17 | s.extra_rdoc_files = [ 18 | "README.rdoc" 19 | ] 20 | s.files = [ 21 | ".gitignore", 22 | "README.rdoc", 23 | "Rakefile", 24 | "VERSION", 25 | "bin/papyrus", 26 | "conf/hadoop-site.xml", 27 | "contrib/hudson/bin/hadoop", 28 | "contrib/hudson/bin/hadoop-papyrus.sh", 29 | "contrib/hudson/conf/hadoop-site.xml", 30 | "contrib/hudson/plugins/hadoop-ruby/.gitignore", 31 | "contrib/hudson/plugins/hadoop-ruby/pom.xml", 32 | "contrib/hudson/plugins/hadoop-ruby/src/main/java/hudson/plugins/hadoop/ruby/HadoopRuby.java", 33 | "contrib/hudson/plugins/hadoop-ruby/src/main/java/hudson/plugins/hadoop/ruby/ItemListenerImpl.java", 34 | "contrib/hudson/plugins/hadoop-ruby/src/main/resources/hudson/plugins/hadoop/ruby/HadoopRuby/config.jelly", 35 | "contrib/hudson/plugins/hadoop-ruby/src/main/resources/index.jelly", 36 | "contrib/hudson/plugins/hadoop-ruby/src/main/webapp/help.html", 37 | "examples/hive_like_test.rb", 38 | "examples/log_analysis_test.rb", 39 | "examples/word_count_test.rb", 40 | "hadoop-papyrus.gemspec", 41 | "lib/core.rb", 42 | "lib/dsl_init.rb", 43 | "lib/hadoop_dsl.rb", 44 | "lib/hadoop_dsl_client.rb", 45 | "lib/hive_like.rb", 46 | "lib/log_analysis.rb", 47 | "lib/mapred_factory.rb", 48 | "lib/util.rb", 49 | "lib/word_count.rb" 50 | ] 51 | s.homepage = %q{http://github.com/fujibee/hadoop-papyrus} 52 | s.rdoc_options = ["--charset=UTF-8"] 53 | s.require_paths = ["lib"] 54 | s.rubygems_version = %q{1.3.5} 55 | s.summary = %q{Hadoop papyrus} 56 | s.test_files = [ 57 | "spec/spec_helper.rb", 58 | "spec/dsl_init_spec.rb", 59 | "spec/core_spec.rb", 60 | "spec/client_spec.rb", 61 | "spec/util_spec.rb", 62 | "spec/mapred_factory_spec.rb", 63 | "spec/word_count_spec.rb", 64 | "spec/hive_like_spec.rb", 65 | "spec/log_analysis_spec.rb", 66 | "spec/example_spec.rb", 67 | "examples/hive_like_test.rb", 68 | "examples/log_analysis_test.rb", 69 | "examples/word_count_test.rb" 70 | ] 71 | 72 | if s.respond_to? :specification_version then 73 | current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION 74 | s.specification_version = 3 75 | 76 | if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then 77 | s.add_runtime_dependency(%q, [">= 0"]) 78 | else 79 | s.add_dependency(%q, [">= 0"]) 80 | end 81 | else 82 | s.add_dependency(%q, [">= 0"]) 83 | end 84 | end 85 | 86 | -------------------------------------------------------------------------------- /lib/log_analysis.rb: -------------------------------------------------------------------------------- 1 | require 'hadoop_dsl' 2 | require 'enumerator' 3 | 4 | module HadoopDsl::LogAnalysis 5 | KEY_SEP = "\t" 6 | PREFIX = 'col' 7 | PASS = nil 8 | MODEL_METHODS = [:column, :value] 9 | 10 | # controller 11 | class LogAnalysisMapper < HadoopDsl::BaseMapper 12 | @@reg_cache = {} 13 | 14 | def initialize(script, key, value) 15 | super(script, LogAnalysisMapperModel.new(key, value)) 16 | end 17 | 18 | # model methods 19 | def_delegators :@model, *MODEL_METHODS 20 | 21 | def topic(desc, options = {}, &block) 22 | @model.create_topic(desc, options) 23 | yield if block_given? 
24 | current_topic 25 | end 26 | 27 | def separate(sep) 28 | parts = case sep 29 | when Symbol 30 | case sep 31 | when :csv 32 | require 'csv' 33 | CSV.parse(value).flatten 34 | when :tsv then value.split("\t") 35 | else raise "no supported separator #{sep}" 36 | end 37 | when String then value.split(sep) 38 | end 39 | @model.create_or_replace_columns_with(parts) {|column, value| column.value = value} 40 | end 41 | 42 | def pattern(reg_str) 43 | # try to get RE from cache 44 | cached = @@reg_cache[reg_str] 45 | re = cached ? @@reg_cache[reg_str] : Regexp.new(reg_str) 46 | @@reg_cache[reg_str] ||= re # new cache 47 | 48 | if value =~ re 49 | md = Regexp.last_match 50 | @model.create_or_replace_columns_with(md.captures) {|column, value| column.value = value} 51 | else throw :each_line # non-local exit 52 | end 53 | end 54 | 55 | # column names by String converted to Symbol 56 | def column_name(*names) 57 | sym_names = names.map {|name| name.is_a?(String) ? name.to_sym : name } 58 | @model.create_or_replace_columns_with(sym_names) {|column, name| column.name = name} 59 | end 60 | 61 | def group_by(column_or_value) 62 | case column_or_value 63 | when LogAnalysisMapperModel::Column 64 | column = column_or_value 65 | current_topic.key_elements << column.value 66 | else 67 | value = column_or_value 68 | current_topic.key_elements << value 69 | end 70 | end 71 | 72 | def group_date_by(column, term) 73 | require 'time' 74 | time = parse_time(column.value) 75 | time_key = case term 76 | when :hour_of_day then time.strftime('%H') 77 | when :daily then time.strftime('%Y%m%d') 78 | when :monthly then time.strftime('%Y%m') 79 | when :yearly then time.strftime('%Y') 80 | end 81 | current_topic.key_elements << time_key 82 | end 83 | 84 | # emitters 85 | def count_uniq(column_or_value) 86 | uniq_key = 87 | case column_or_value 88 | when LogAnalysisMapperModel::Column 89 | column = column_or_value 90 | column.value 91 | else column_or_value # value 92 | end 93 | current_topic.key_elements << uniq_key 94 | emit(current_topic.key => 1) 95 | end 96 | 97 | def count 98 | emit(current_topic.key => 1) 99 | end 100 | 101 | def sum(column) 102 | emit(current_topic.key => column.value.to_i) 103 | end 104 | 105 | private 106 | def current_topic; @model.current_topic end 107 | 108 | def parse_time(str) 109 | begin Time.parse(str) 110 | rescue 111 | # apachelog pattern ex) "10/Oct/2000:13:55:36 -0700" 112 | Time.parse($1) if str =~ /^(\d*\/\w*\/\d*):/ 113 | end 114 | end 115 | end 116 | 117 | class LogAnalysisReducer < HadoopDsl::BaseReducer 118 | def initialize(script, key, values) 119 | super(script, LogAnalysisReducerModel.new(key, values)) 120 | end 121 | 122 | # model methods 123 | def_delegators :@model, *MODEL_METHODS 124 | 125 | def topic(desc, options = {}, &block) 126 | @model.create_topic(desc, options) 127 | yield if block_given? 
128 | @model.current_topic 129 | end 130 | 131 | def count_uniq(column) 132 | aggregate_on_topic 133 | end 134 | 135 | def count 136 | aggregate_on_topic 137 | end 138 | 139 | def sum(column) 140 | aggregate_on_topic 141 | end 142 | 143 | private 144 | def aggregate_on_topic 145 | aggregate if @model.topic == @model.current_topic 146 | end 147 | 148 | end 149 | 150 | # model 151 | class LogAnalysisMapperModel < HadoopDsl::BaseMapperModel 152 | attr_reader :current_topic 153 | 154 | def initialize(key, value) 155 | super(key, value) 156 | @columns = ColumnArray.new 157 | @topics = [] 158 | end 159 | 160 | def column; @columns end 161 | 162 | def create_topic(desc, options) 163 | @topics << @current_topic = Topic.new(desc, options[:label]) 164 | end 165 | 166 | def create_or_replace_columns_with(array, &block) 167 | columns = array.enum_for(:each_with_index).map do |p, i| 168 | c = @columns[i] ? @columns[i] : Column.new(i) 169 | yield c, p 170 | c 171 | end 172 | @columns = ColumnArray.new(columns) 173 | end 174 | 175 | class ColumnArray < Array 176 | def [](key) 177 | case key 178 | when Integer then at(key) 179 | when Symbol then (select {|c| c.name == key}).first 180 | when String then (select {|c| c.name == key.to_sym}).first 181 | end 182 | end 183 | end 184 | 185 | class Column 186 | attr_reader :index 187 | attr_accessor :value, :name 188 | 189 | def initialize(index, value = nil) 190 | @index, @value = index, value 191 | end 192 | end 193 | 194 | class Topic 195 | attr_reader :key_elements 196 | 197 | def initialize(desc, label = nil) 198 | @desc, @label = desc, label 199 | @key_elements = [] 200 | end 201 | 202 | def label 203 | @label || @desc.gsub(/\s/, '_') 204 | end 205 | 206 | def key 207 | without_label = 208 | @key_elements.size > 0 ? 
@key_elements.join(KEY_SEP) : nil 209 | [label, without_label].compact.join(KEY_SEP) 210 | end 211 | end 212 | end 213 | 214 | class LogAnalysisReducerModel < HadoopDsl::BaseReducerModel 215 | attr_reader :topic, :current_topic 216 | 217 | def initialize(key, values) 218 | super(key, values) 219 | if key =~ /(\w*)#{KEY_SEP}?(.*)/ 220 | @topic = Topic.new($1, values) 221 | end 222 | end 223 | 224 | def create_topic(desc, options) 225 | @current_topic = Topic.new(options[:label] || desc.gsub(/\s/, '_'), nil) 226 | end 227 | 228 | class Topic 229 | attr_reader :label, :values 230 | 231 | def initialize(label, values) 232 | @label, @values = label, values 233 | end 234 | 235 | def ==(rh) self.label == rh.label end 236 | end 237 | end 238 | end 239 | -------------------------------------------------------------------------------- /spec/log_analysis_spec.rb: -------------------------------------------------------------------------------- 1 | require File.join(File.dirname(__FILE__), 'spec_helper') 2 | require 'log_analysis' 3 | 4 | include HadoopDsl::LogAnalysis 5 | 6 | describe LogAnalysisMapper do 7 | before do 8 | @apache_log = '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326' 9 | end 10 | 11 | it 'should separate data by space' do 12 | value = 'Lorem ipsum dolor sit amet,' 13 | mapper = LogAnalysisMapper.new(nil, nil, value) 14 | mapper.separate(' ') 15 | 16 | mapper.column[1].value.should == 'ipsum' 17 | end 18 | 19 | it 'should separate by pattern' do 20 | mapper = LogAnalysisMapper.new(nil, nil, @apache_log) 21 | mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/ 22 | 23 | mapper.column[2].value.should == 'frank' 24 | end 25 | 26 | it 'should separate by comma (CSV) with csv library' do 27 | value = '"Lorem","ip,sum","dolor","sit","amet"' 28 | mapper = LogAnalysisMapper.new(nil, nil, value) 29 | mapper.separate(:csv) 30 | 31 | require('csv').should be_false # already required 32 | mapper.column[1].value.should == 'ip,sum' 33 | end 34 | 35 | it 'should separate by tab char (TSV)' do 36 | value = "Lorem\tipsum\tdolor\tsit\tamet," 37 | mapper = LogAnalysisMapper.new(nil, nil, value) 38 | mapper.separate(:tsv) 39 | 40 | mapper.column[4].value.should == 'amet,' 41 | end 42 | 43 | it 'should not separate by non support separator' do 44 | value = 'Lorem ipsum dolor sit amet,' 45 | mapper = LogAnalysisMapper.new(nil, nil, value) 46 | lambda { mapper.separate(:nonsupport) }.should raise_error 47 | end 48 | 49 | it 'should non-local exit if cannot separate by pattern' do 50 | mapper = LogAnalysisMapper.new(nil, nil, @apache_log + " a") 51 | mapper.each_line do 52 | mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)$/ 53 | fail 'should not be reached' 54 | end 55 | mapper.column[0].should be_nil 56 | end 57 | 58 | it 'should label column name by string' do 59 | mapper = LogAnalysisMapper.new(nil, nil, @apache_log) 60 | mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/ 61 | mapper.column_name 'remote_host', PASS, 'user', 'access_date', 'request', 'status', 'bytes' 62 | 63 | mapper.column['user'].value.should == 'frank' 64 | end 65 | 66 | it 'should label column name by symbol' do 67 | mapper = LogAnalysisMapper.new(nil, nil, @apache_log) 68 | mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/ 69 | mapper.column_name :remote_host, PASS, :user, :access_date, :request, :status, :bytes 70 | 71 | mapper.column[:user].value.should == 'frank' 72 | end 73 | 74 | it 'should count uniq by column' do 75 | value = 'count uniq' 76 | mapper 
= LogAnalysisMapper.new(nil, nil, value) 77 | mapper.separate(' ') 78 | mapper.topic('t1') { mapper.count_uniq mapper.column[1] } 79 | 80 | mapper.emitted.should == [{"t1\tuniq" => 1}] 81 | end 82 | 83 | it 'should count uniq by value' do 84 | value = 'count uniq' 85 | mapper = LogAnalysisMapper.new(nil, nil, value) 86 | mapper.separate(' ') 87 | mapper.topic('t1') { mapper.count_uniq 'orig value' } 88 | 89 | mapper.emitted.should == [{"t1\torig value" => 1}] 90 | end 91 | 92 | it 'should just count' do 93 | value = 'count only' 94 | mapper = LogAnalysisMapper.new(nil, nil, value) 95 | mapper.separate(' ') 96 | mapper.topic('t1') { mapper.count } 97 | 98 | mapper.emitted.should == [{"t1" => 1}] 99 | end 100 | 101 | it 'should sum column value' do 102 | value = 'sum 123' 103 | mapper = LogAnalysisMapper.new(nil, nil, value) 104 | mapper.separate(' ') 105 | mapper.topic('t1') { mapper.sum mapper.column[1] } 106 | 107 | mapper.emitted.first["t1"].should == 123 108 | end 109 | 110 | it 'has topic which returns label' do 111 | value = 'Lorem ipsum dolor sit amet,' 112 | mapper = LogAnalysisMapper.new(nil, nil, value) 113 | mapper.separate(' ') 114 | 115 | topic = mapper.topic('desc', :label => 'label') 116 | topic.label.should == 'label' 117 | end 118 | 119 | it 'has topic which returns label as desc' do 120 | value = 'Lorem ipsum dolor sit amet,' 121 | mapper = LogAnalysisMapper.new(nil, nil, value) 122 | mapper.separate(' ') 123 | 124 | topic = mapper.topic('desc') 125 | topic.label.should == 'desc' 126 | end 127 | 128 | it 'has topic which returns label as desc with space' do 129 | value = 'Lorem ipsum dolor sit amet,' 130 | mapper = LogAnalysisMapper.new(nil, nil, value) 131 | mapper.separate(' ') 132 | 133 | topic = mapper.topic('desc with space') 134 | topic.label.should == 'desc_with_space' 135 | end 136 | 137 | it 'can group date monthly' do 138 | value = "2010/1/1 21:23:10\tnewyearday" 139 | mapper = LogAnalysisMapper.new(nil, nil, value) 140 | mapper.separate("\t") 141 | mapper.column_name 'date', 'holiday' 142 | 143 | ['yearly', 'monthly', 'daily', 'hour_of_day'].each do |term| 144 | mapper.topic(term) do 145 | mapper.group_date_by mapper.column[:date], term.to_sym 146 | mapper.count_uniq mapper.column[:holiday] 147 | end 148 | end 149 | mapper.emitted.should == 150 | [ 151 | {"yearly\t2010\tnewyearday" => 1}, 152 | {"monthly\t201001\tnewyearday" => 1}, 153 | {"daily\t20100101\tnewyearday" => 1}, 154 | {"hour_of_day\t21\tnewyearday" => 1} 155 | ] 156 | end 157 | 158 | it 'can group by' do 159 | value = '1 sub_2 bingo!' 160 | mapper = LogAnalysisMapper.new(nil, nil, value) 161 | mapper.separate(' ') 162 | mapper.column_name 'id', 'sub_id', 'data' 163 | 164 | mapper.topic('test') do 165 | mapper.group_by mapper.column[:sub_id] 166 | mapper.count_uniq mapper.column[:data] 167 | end 168 | mapper.emitted.should == [{"test\tsub_2\tbingo!" 
=> 1}] 169 | end 170 | end 171 | 172 | Topic = LogAnalysisMapperModel::Topic 173 | describe Topic do 174 | it 'can get key with label' do 175 | t = Topic.new('label') 176 | t.key.should == 'label' 177 | end 178 | 179 | it 'can get key with label and elements' do 180 | t = Topic.new('label') 181 | t.key_elements << 'e1' 182 | t.key_elements << 'e2' 183 | t.key.should == "label\te1\te2" 184 | end 185 | end 186 | 187 | describe LogAnalysisReducer do 188 | it 'should count uniq in the topic' do 189 | key = "t1\tuniq" 190 | values = [1, 1, 1] 191 | reducer = LogAnalysisReducer.new(nil, key, values) 192 | reducer.separate(' ') 193 | reducer.topic('t1') { reducer.count_uniq(nil) } 194 | 195 | reducer.emitted.first["t1\tuniq"].should == 3 196 | end 197 | 198 | it 'should not count uniq of other topic' do 199 | key = "t2\tuniq" 200 | values = [1, 1, 1] 201 | reducer = LogAnalysisReducer.new(nil, key, values) 202 | reducer.separate(' ') 203 | reducer.topic('t1') { reducer.count_uniq(nil) } 204 | 205 | reducer.emitted.first.should be_nil 206 | end 207 | 208 | it 'should sum column value' do 209 | key = "t1" 210 | values = [123, 456, 789] 211 | reducer = LogAnalysisReducer.new(nil, key, values) 212 | reducer.separate(' ') 213 | reducer.topic('t1') { reducer.sum(nil) } 214 | 215 | reducer.emitted.first["t1"].should == 123+456+789 216 | end 217 | end 218 | -------------------------------------------------------------------------------- /contrib/hudson/bin/hadoop: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | # The Hadoop command script 20 | # 21 | # Environment Variables 22 | # 23 | # JAVA_HOME The java implementation to use. Overrides JAVA_HOME. 24 | # 25 | # HADOOP_CLASSPATH Extra Java CLASSPATH entries. 26 | # 27 | # HADOOP_HEAPSIZE The maximum amount of heap to use, in MB. 28 | # Default is 1000. 29 | # 30 | # HADOOP_OPTS Extra Java runtime options. 31 | # 32 | # HADOOP_NAMENODE_OPTS These options are added to HADOOP_OPTS 33 | # HADOOP_CLIENT_OPTS when the respective command is run. 34 | # HADOOP_{COMMAND}_OPTS etc HADOOP_JT_OPTS applies to JobTracker 35 | # for e.g. HADOOP_CLIENT_OPTS applies to 36 | # more than one command (fs, dfs, fsck, 37 | # dfsadmin etc) 38 | # 39 | # HADOOP_CONF_DIR Alternate conf dir. Default is ${HADOOP_HOME}/conf. 40 | # 41 | # HADOOP_ROOT_LOGGER The root appender. Default is INFO,console 42 | # 43 | 44 | bin=`dirname "$0"` 45 | bin=`cd "$bin"; pwd` 46 | 47 | if [ -f "$bin"/hadoop-config.sh ]; then 48 | . 
"$bin"/hadoop-config.sh 49 | fi 50 | 51 | cygwin=false 52 | case "`uname`" in 53 | CYGWIN*) cygwin=true;; 54 | esac 55 | 56 | # if no args specified, show usage 57 | if [ $# = 0 ]; then 58 | echo "Usage: hadoop [--config confdir] COMMAND" 59 | echo "where COMMAND is one of:" 60 | echo " namenode -format format the DFS filesystem" 61 | echo " secondarynamenode run the DFS secondary namenode" 62 | echo " namenode run the DFS namenode" 63 | echo " datanode run a DFS datanode" 64 | echo " dfsadmin run a DFS admin client" 65 | echo " fsck run a DFS filesystem checking utility" 66 | echo " fs run a generic filesystem user client" 67 | echo " balancer run a cluster balancing utility" 68 | echo " jobtracker run the MapReduce job Tracker node" 69 | echo " pipes run a Pipes job" 70 | echo " tasktracker run a MapReduce task Tracker node" 71 | echo " job manipulate MapReduce jobs" 72 | echo " queue get information regarding JobQueues" 73 | echo " version print the version" 74 | echo " jar run a jar file" 75 | echo " distcp copy file or directories recursively" 76 | echo " archive -archiveName NAME * create a hadoop archive" 77 | echo " daemonlog get/set the log level for each daemon" 78 | echo " or" 79 | echo " CLASSNAME run the class named CLASSNAME" 80 | echo "Most commands print help when invoked w/o parameters." 81 | exit 1 82 | fi 83 | 84 | # get arguments 85 | COMMAND=$1 86 | shift 87 | 88 | if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then 89 | . "${HADOOP_CONF_DIR}/hadoop-env.sh" 90 | fi 91 | 92 | # some Java parameters 93 | if [ "$JAVA_HOME" != "" ]; then 94 | #echo "run java in $JAVA_HOME" 95 | JAVA_HOME=$JAVA_HOME 96 | fi 97 | 98 | if [ "$JAVA_HOME" = "" ]; then 99 | echo "Error: JAVA_HOME is not set." 100 | exit 1 101 | fi 102 | 103 | JAVA=$JAVA_HOME/bin/java 104 | JAVA_HEAP_MAX=-Xmx1000m 105 | 106 | # check envvars which might override default args 107 | if [ "$HADOOP_HEAPSIZE" != "" ]; then 108 | #echo "run with heapsize $HADOOP_HEAPSIZE" 109 | JAVA_HEAP_MAX="-Xmx""$HADOOP_HEAPSIZE""m" 110 | #echo $JAVA_HEAP_MAX 111 | fi 112 | 113 | # CLASSPATH initially contains $HADOOP_CONF_DIR 114 | CLASSPATH="${HADOOP_CONF_DIR}" 115 | CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar 116 | 117 | # for developers, add Hadoop classes to CLASSPATH 118 | if [ -d "$HADOOP_HOME/build/classes" ]; then 119 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/classes 120 | fi 121 | if [ -d "$HADOOP_HOME/build/webapps" ]; then 122 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build 123 | fi 124 | if [ -d "$HADOOP_HOME/build/test/classes" ]; then 125 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/test/classes 126 | fi 127 | if [ -d "$HADOOP_HOME/build/tools" ]; then 128 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/tools 129 | fi 130 | 131 | # so that filenames w/ spaces are handled correctly in loops below 132 | IFS= 133 | 134 | # for releases, add core hadoop jar & webapps to CLASSPATH 135 | if [ -d "$HADOOP_HOME/webapps" ]; then 136 | CLASSPATH=${CLASSPATH}:$HADOOP_HOME 137 | fi 138 | for f in $HADOOP_HOME/hadoop-*-core.jar; do 139 | CLASSPATH=${CLASSPATH}:$f; 140 | done 141 | 142 | # add libs to CLASSPATH 143 | for f in $HADOOP_HOME/lib/*.jar; do 144 | CLASSPATH=${CLASSPATH}:$f; 145 | done 146 | 147 | for f in $HADOOP_HOME/lib/jetty-ext/*.jar; do 148 | CLASSPATH=${CLASSPATH}:$f; 149 | done 150 | 151 | for f in $HADOOP_HOME/hadoop-*-tools.jar; do 152 | TOOL_PATH=${TOOL_PATH}:$f; 153 | done 154 | for f in $HADOOP_HOME/build/hadoop-*-tools.jar; do 155 | TOOL_PATH=${TOOL_PATH}:$f; 156 | done 157 | 158 | # add user-specified CLASSPATH 
last 159 | if [ "$HADOOP_CLASSPATH" != "" ]; then 160 | CLASSPATH=${CLASSPATH}:${HADOOP_CLASSPATH} 161 | fi 162 | 163 | # default log directory & file 164 | if [ "$HADOOP_LOG_DIR" = "" ]; then 165 | HADOOP_LOG_DIR="$HADOOP_HOME/logs" 166 | fi 167 | if [ "$HADOOP_LOGFILE" = "" ]; then 168 | HADOOP_LOGFILE='hadoop.log' 169 | fi 170 | 171 | # restore ordinary behaviour 172 | unset IFS 173 | 174 | # figure out which class to run 175 | if [ "$COMMAND" = "namenode" ] ; then 176 | CLASS='org.apache.hadoop.hdfs.server.namenode.NameNode' 177 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_NAMENODE_OPTS" 178 | elif [ "$COMMAND" = "secondarynamenode" ] ; then 179 | CLASS='org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode' 180 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_SECONDARYNAMENODE_OPTS" 181 | elif [ "$COMMAND" = "datanode" ] ; then 182 | CLASS='org.apache.hadoop.hdfs.server.datanode.DataNode' 183 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_DATANODE_OPTS" 184 | elif [ "$COMMAND" = "fs" ] ; then 185 | CLASS=org.apache.hadoop.fs.FsShell 186 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 187 | elif [ "$COMMAND" = "dfs" ] ; then 188 | CLASS=org.apache.hadoop.fs.FsShell 189 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 190 | elif [ "$COMMAND" = "dfsadmin" ] ; then 191 | CLASS=org.apache.hadoop.hdfs.tools.DFSAdmin 192 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 193 | elif [ "$COMMAND" = "fsck" ] ; then 194 | CLASS=org.apache.hadoop.hdfs.tools.DFSck 195 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 196 | elif [ "$COMMAND" = "balancer" ] ; then 197 | CLASS=org.apache.hadoop.hdfs.server.balancer.Balancer 198 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_BALANCER_OPTS" 199 | elif [ "$COMMAND" = "jobtracker" ] ; then 200 | CLASS=org.apache.hadoop.mapred.JobTracker 201 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_JOBTRACKER_OPTS" 202 | elif [ "$COMMAND" = "tasktracker" ] ; then 203 | CLASS=org.apache.hadoop.mapred.TaskTracker 204 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_TASKTRACKER_OPTS" 205 | elif [ "$COMMAND" = "job" ] ; then 206 | CLASS=org.apache.hadoop.mapred.JobClient 207 | elif [ "$COMMAND" = "queue" ] ; then 208 | CLASS=org.apache.hadoop.mapred.JobQueueClient 209 | elif [ "$COMMAND" = "pipes" ] ; then 210 | CLASS=org.apache.hadoop.mapred.pipes.Submitter 211 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 212 | elif [ "$COMMAND" = "version" ] ; then 213 | CLASS=org.apache.hadoop.util.VersionInfo 214 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 215 | elif [ "$COMMAND" = "jar" ] ; then 216 | CLASS=org.apache.hadoop.mapred.JobShell 217 | elif [ "$COMMAND" = "distcp" ] ; then 218 | CLASS=org.apache.hadoop.tools.DistCp 219 | CLASSPATH=${CLASSPATH}:${TOOL_PATH} 220 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 221 | elif [ "$COMMAND" = "daemonlog" ] ; then 222 | CLASS=org.apache.hadoop.log.LogLevel 223 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 224 | elif [ "$COMMAND" = "archive" ] ; then 225 | CLASS=org.apache.hadoop.tools.HadoopArchives 226 | CLASSPATH=${CLASSPATH}:${TOOL_PATH} 227 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 228 | elif [ "$COMMAND" = "sampler" ] ; then 229 | CLASS=org.apache.hadoop.mapred.lib.InputSampler 230 | HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" 231 | else 232 | CLASS=$COMMAND 233 | fi 234 | 235 | # cygwin path translation 236 | if $cygwin; then 237 | CLASSPATH=`cygpath -p -w "$CLASSPATH"` 238 | HADOOP_HOME=`cygpath -d "$HADOOP_HOME"` 239 | HADOOP_LOG_DIR=`cygpath -d "$HADOOP_LOG_DIR"` 240 | TOOL_PATH=`cygpath -p -w "$TOOL_PATH"` 241 | fi 242 | # setup 'java.library.path' for 
native-hadoop code if necessary 243 | JAVA_LIBRARY_PATH='' 244 | if [ -d "${HADOOP_HOME}/build/native" -o -d "${HADOOP_HOME}/lib/native" ]; then 245 | JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} org.apache.hadoop.util.PlatformName | sed -e "s/ /_/g"` 246 | 247 | if [ -d "$HADOOP_HOME/build/native" ]; then 248 | JAVA_LIBRARY_PATH=${HADOOP_HOME}/build/native/${JAVA_PLATFORM}/lib 249 | fi 250 | 251 | if [ -d "${HADOOP_HOME}/lib/native" ]; then 252 | if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then 253 | JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${HADOOP_HOME}/lib/native/${JAVA_PLATFORM} 254 | else 255 | JAVA_LIBRARY_PATH=${HADOOP_HOME}/lib/native/${JAVA_PLATFORM} 256 | fi 257 | fi 258 | fi 259 | 260 | # cygwin path translation 261 | if $cygwin; then 262 | JAVA_LIBRARY_PATH=`cygpath -p "$JAVA_LIBRARY_PATH"` 263 | fi 264 | 265 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.dir=$HADOOP_LOG_DIR" 266 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.file=$HADOOP_LOGFILE" 267 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.home.dir=$HADOOP_HOME" 268 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.id.str=$HADOOP_IDENT_STRING" 269 | HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.root.logger=${HADOOP_ROOT_LOGGER:-INFO,console}" 270 | if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then 271 | HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH" 272 | fi 273 | 274 | # run it 275 | #echo exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@" 276 | exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@" 277 | --------------------------------------------------------------------------------
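
The spec above exercises the LogAnalysis mapper and reducer primitives one at a time (separate/pattern, column_name, topic, count_uniq, count, sum, group_by, group_date_by). As a rough illustration only, the sketch below shows how those primitives might be combined in a single DSL script, assuming the same top-level dsl/from/to script conventions used by the other example scripts in this repository. It is not a copy of the shipped examples/log_analysis_test.rb; the input/output paths, topic labels, and column names are placeholders, and whether group_date_by accepts the Apache access-log date format is an assumption.

dsl 'LogAnalysis'

from 'apache/inputs'   # placeholder HDFS paths
to 'apache/outputs'

# one Apache common-log record per input line, split by the regexp groups
pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
column_name 'remote_host', PASS, 'user', 'access_date', 'request', 'status', 'bytes'

# count occurrences of each status code under the "status" topic
topic 'requests by status', :label => 'status' do
  count_uniq column[:status]
end

# group a second topic by the requesting host and count its requests
topic 'requests by host' do
  group_by column[:remote_host]
  count
end

Each topic block emits tab-separated keys beginning with the topic label followed by any group keys and counted values, which is why the reducer model shown earlier splits its key on the label in front of KEY_SEP before dispatching to the matching topic block.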