├── .gitignore
├── LICENSE
├── README.textile
├── Rakefile
├── VERSION
├── bin
│   ├── hadoop-stream
│   └── hdp-tree
├── examples
│   └── pagerank
│       ├── data
│       │   └── seinfeld_network.tsv
│       ├── pagerank.rb
│       └── scripts
│           ├── cut_off_list.rb
│           ├── histogram.R
│           ├── pagerank.pig
│           └── pagerank_initialize.pig
├── lib
│   ├── swineherd.rb
│   └── swineherd
│       ├── filesystem.rb
│       ├── filesystem
│       │   ├── README_filesystem.textile
│       │   ├── basefilesystem.rb
│       │   ├── filesystems.rb
│       │   ├── hadoopfilesystem.rb
│       │   ├── localfilesystem.rb
│       │   ├── localfs.rb
│       │   └── s3filesystem.rb
│       ├── script.rb
│       ├── script
│       │   ├── hadoop_script.rb
│       │   ├── pig_script.rb
│       │   ├── r_script.rb
│       │   └── wukong_script.rb
│       ├── template.rb
│       ├── workflow.rb
│       └── workflow
│           └── job.rb
├── notes.txt
├── swineherd.gemspec
└── tests
    ├── test_filesystem.rb
    ├── test_s3_filesystem.rb
    └── testcfg.yaml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## OS
2 | .DS_Store
3 | Icon?
4 | nohup.out
5 | .bak
6 | 
7 | ## EDITORS
8 | \#*
9 | .\#*
10 | *~
11 | *.swp
12 | REVISION
13 | TAGS*
14 | tmtags
15 | *_flymake.*
16 | *_flymake
17 | *.tmproj
18 | .project
19 | .settings
20 | 
21 | ## COMPILED
22 | a.out
23 | *.o
24 | *.pyc
25 | *.so
26 | 
27 | ## OTHER SCM
28 | .bzr
29 | .hg
30 | .svn
31 | 
32 | ## PROJECT::GENERAL
33 | coverage
34 | rdoc
35 | doc
36 | pkg
37 | .yardoc
38 | *private*
39 | 
40 | ## PROJECT::SPECIFIC
41 | 
42 | *.rdb
43 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 | 
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 | 
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 | 
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.textile: -------------------------------------------------------------------------------- 1 | h1. SwineHerd 2 | 3 | Swineherd is for running scripts and workflows on filesystems. 4 | 5 | h3. Outline 6 | 7 | A @workflow@ is built with @script@ objects and ran on a @filesystem@. 8 | 9 | h4. Script: 10 | 11 | A script has the following 12 | 13 | * @source@ - The source file used. 
These can be "Apache Pig":http://pig.apache.org/ scripts, "Wukong":http://github.com/infochimps/wukong scripts, even "R":http://www.r-project.org/ scripts. You can add your own scripts by subclassing the @script@ class. 14 | * @input@ - An array of input paths. 15 | * @output@ - An array of output paths. 16 | * @options@ - A ruby hash of options used as command line args. Eg. {:foo => 'bar'}. How these options are mapped to command line arguments is up to the particular script class. 17 | * @attributes@ - A ruby hash of parameters used for variable substitution. Every script is assumed to be (but not required to be) an eruby template. 18 | 19 | h4. Workflow: 20 | 21 | A workflow is built using rake @task@ objects that doing nothing more than run scripts. A workflow 22 | 23 | * can be described with a directed dependency graph 24 | * has an @id@ which is used to run its tasks idempotently. At the moment it is the responsibility of the running process (or human being) to choose a suitable id. 25 | * manages intermediate outputs by using the @next_output@ and @latest_output@ methods. See the examples dir for usage. 26 | * A workflow has a working directory in which all intermediate outputs go 27 | ** These are named according to the rake task that created them 28 | 29 | h4. FileSystem 30 | 31 | Workflows are intended to run on filesystems. At the moment, implemented filesystems are 32 | 33 | * @file@ - Local file system. Only thoroughly tested on unbuntu linux. 34 | * @hdfs@ - Hadoop distributed file system. Uses jruby and the Apache Hadoop 0.20 api. 35 | * @s3@ - Uses the right_aws gem for interacting with Amazon Simple Storage System (s3). 36 | 37 | Using the filesystem: 38 | 39 | Paths should be absolute. 40 | 41 |
42 | # get a new instance of local filesystem and write to it
43 | localfs = FileSystem.get(:file)
44 | localfs.open("mylocalfile", 'w') do |f|
45 | f.write("Writing a string to a local file")
46 | end
47 |
48 | # get a new instance of hadoop filesystem and write to it
49 | hadoopfs = FileSystem.get(:hdfs)
50 | hadoopfs.open("myhadoopfile", 'w') do |f|
51 | f.write("Writing a string to an hdfs file")
52 | end
53 |
54 | # get a new instance of s3 filesystem and write to it
55 | access_key_id = '1234abcd'
56 | secret_access_key = 'foobar1234'
57 | s3fs = FileSystem.get(:s3, access_key_id, secret_access_key)
58 | s3fs.mkpath 'mys3bucket' # bucket must exist
59 | s3fs.open("mys3bucket/mys3file", 'w') do |f|
60 | f.write("Writing a string to an s3 file")
61 | end
62 |
63 |
64 | h3. Working Example
65 |
66 | For the most up to date working example see the examples directory. Here's a simple example for running pagerank:
67 |
68 |
69 | #!/usr/bin/env ruby
70 |
71 | $LOAD_PATH << '../../lib'
72 | require 'swineherd' ; include Swineherd
73 | require 'swineherd/script' ; include Swineherd::Script
74 | require 'swineherd/filesystem'
75 |
76 | Settings.define :flow_id, :required => true, :description => "Flow id required to make run of workflow unique"
77 | Settings.define :iterations, :type => Integer, :default => 10, :description => "Number of pagerank iterations to run"
78 | Settings.define :hadoop_home, :default => '/usr/local/share/hadoop', :description => "Path to hadoop config"
79 | Settings.resolve!
80 |
81 | flow = Workflow.new(Settings.flow_id) do
82 |
83 | # The filesystems we're going to be working with
84 | hdfs = Swineherd::FileSystem.get(:hdfs)
85 | localfs = Swineherd::FileSystem.get(:file)
86 |
87 | # The scripts we're going to use
88 | initializer = PigScript.new('scripts/pagerank_initialize.pig')
89 | iterator = PigScript.new('scripts/pagerank.pig')
90 | finisher = WukongScript.new('scripts/cut_off_list.rb')
91 | plotter = RScript.new('scripts/histogram.R')
92 |
93 | #
94 | # Runs simple pig script to initialize pagerank. We must specify the input
95 | # here as this is the first step in the workflow. The output attribute is to
96 | # ensure idempotency and the options attribute is the hash that will be
97 | # converted into command-line args for the pig interpreter.
98 | #
99 | task :pagerank_initialize do
100 | initializer.options = {:adjlist => "/tmp/pagerank_example/seinfeld_network.tsv", :initgrph => next_output(:pagerank_initialize)}
101 | initializer.run(:hadoop) unless hdfs.exists? latest_output(:pagerank_initialize)
102 | end
103 |
104 | #
105 | # Runs multiple iterations of pagerank with another pig script and manages all
106 | # the intermediate outputs.
107 | #
108 | task :pagerank_iterate => [:pagerank_initialize] do
109 | iterator.options[:damp] = '0.85f'
110 | iterator.options[:curr_iter_file] = latest_output(:pagerank_initialize)
111 | Settings.iterations.times do
112 | iterator.options[:next_iter_file] = next_output(:pagerank_iterate)
113 | iterator.run(:hadoop) unless hdfs.exists? latest_output(:pagerank_iterate)
114 | iterator.refresh!
115 | iterator.options[:curr_iter_file] = latest_output(:pagerank_iterate)
116 | end
117 | end
118 |
119 | #
120 | # Here we use a wukong script to cut off the last field (a big pig bag of
121 | # links). Notice how every wukong script MUST have an input but pig scripts do
122 | # not.
123 | #
124 | task :cut_off_adjacency_list => [:pagerank_iterate] do
125 | finisher.input << latest_output(:pagerank_iterate)
126 | finisher.output << next_output(:cut_off_adjacency_list)
127 | finisher.run :hadoop unless hdfs.exists? latest_output(:cut_off_adjacency_list)
128 | end
129 |
130 | #
131 | # We want to pull down one result file, merge the part-000.. files into one file
132 | #
133 | task :merge_results => [:cut_off_adjacency_list] do
134 | merged_results = next_output(:merge_results)
135 | hdfs.merge(latest_output(:cut_off_adjacency_list), merged_results) unless hdfs.exists? merged_results
136 | end
137 |
138 | #
139 | # Cat results into a local directory with the same structure
140 | # eg. #{work_dir}/#{flow_id}/pull_down_results-0.
141 | #
142 | # FIXME: Bridging filesystems is cludgey.
143 | #
144 | task :pull_down_results => [:merge_results] do
145 | local_results = next_output(:pull_down_results)
146 | hdfs.copy_to_local(latest_output(:merge_results), local_results) unless localfs.exists? local_results
147 | end
148 |
149 | #
150 | # Plot 2nd column of the result as a histogram (requires R and
151 | # ggplot2). Note that the output here is a png file but doesn't have that
152 | # extension. Ensmarten me as to the right way to handle that?
153 | #
154 | task :plot_results => [:pull_down_results] do
155 | plotter.attributes = {
156 | :pagerank_data => latest_output(:pull_down_results),
157 | :plot_file => next_output(:plot_results), # <-- this will be a png...
158 | :raw_rank => "aes(x=d$V2)"
159 | }
160 | plotter.run(:local) unless localfs.exists? latest_output(:plot_results)
161 | end
162 |
163 | end
164 |
165 | flow.workdir = "/tmp/pagerank_example"
166 | flow.describe
167 | flow.run(:plot_results)
168 |
169 |
170 | h3. Utils
171 |
172 | There's a fun little program called 'hdp-tree' that shows off the ease of using the filesystem abstraction:
173 |
174 |
175 | $: bin/hdp-tree /tmp/my_hdfs_directory
176 | ---
177 | /tmp/my_hdfs_directory:
178 | - my_hdfs_directory:
179 | - sub_dir_a: leaf_file_1
180 | - sub_dir_a: leaf_file_2
181 | - sub_dir_a: leaf_file_3
182 | - my_hdfs_directory:
183 | - sub_dir_b: leaf_file_1
184 | - sub_dir_b: leaf_file_2
185 | - sub_dir_b: leaf_file_3
186 | - my_hdfs_directory:
187 | - sub_dir_c: leaf_file_1
188 | - sub_dir_c: leaf_file_2
189 | - sub_dir_c: leaf_file_3
190 | - sub_dir_c:
191 | - sub_sub_dir_a: yet_another_leaf_file
192 | - sub_dir_c: sub_sub_dir_b
193 | - sub_dir_c: sub_sub_dir_c
194 |
195 |
196 | I know, it's not as pretty as unix tree, but this IS github...
197 |
198 | h3. TODO
199 |
200 | * next task in a workflow should NOT run if the previous step failed
201 | ** this is made difficult by the fact that a pig script sometimes returns a 0 exit status even when it fails
202 | ** same for wukong scripts
203 | * add a @job@ object that implements a @not_if@ function. this way a @workflow@ will be constructed of @job@ objects
204 | ** a @job@ will do nothing more than execute the ruby code in its (run?) block, unless @not_if@ is true
205 | ** this way we can put @script@ objects inside a @job@ and only run them under certain conditions that the user specifies when
206 | they create the @job@ (see the sketch below)
207 | * implement ftp filesystem interfaces
208 |
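Here is a very rough sketch of what such a @job@ object might look like. This is purely illustrative; neither @Job@ nor @not_if@ exist in the code base yet:

<pre>
# Hypothetical sketch only; not part of swineherd (yet).
class Job
  def initialize name, &blk
    @name   = name
    @runner = blk    # the ruby code this job wraps
    @not_if = nil
  end

  # Register a guard; the job is skipped whenever the guard returns true.
  def not_if &blk
    @not_if = blk
  end

  # Execute the wrapped block unless the guard says to skip it.
  def run
    return if @not_if && @not_if.call
    @runner.call
  end
end

# Usage, reusing the objects from the pagerank example above:
job = Job.new('pagerank_initialize') { initializer.run(:hadoop) }
job.not_if { hdfs.exists? latest_output(:pagerank_initialize) }
job.run
</pre>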
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require 'rubygems'
2 | require 'rake'
3 |
4 | require 'jeweler'
5 | Jeweler::Tasks.new do |gem|
6 | # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
7 | gem.name = "swineherd"
8 | gem.homepage = "http://github.com/Ganglion/swineherd"
9 | gem.license = "MIT"
10 | gem.summary = %Q{Flexible data workflow glue.}
11 | gem.description = %Q{Swineherd is for running scripts and workflows on filesystems.}
12 | gem.email = "jacob.a.perkins@gmail.com"
13 | gem.authors = ["Jacob Perkins"]
14 | # Include your dependencies below. Runtime dependencies are required when using your gem,
15 | # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
16 | # gem.add_runtime_dependency 'jabber4r', '> 0.1'
17 | # gem.add_development_dependency 'rspec', '> 1.2.3'
18 | gem.add_development_dependency "yard", "~> 0.6.0"
19 | gem.add_development_dependency "jeweler", "~> 1.5.2"
20 | gem.add_development_dependency "rcov", ">= 0"
21 | gem.add_dependency 'configliere'
22 | gem.add_dependency 'gorillib'
23 | gem.add_dependency 'erubis'
24 | gem.add_dependency 'right_aws'
25 | end
26 | Jeweler::RubygemsDotOrgTasks.new
27 |
28 |
29 | require 'yard'
30 | YARD::Rake::YardocTask.new
31 |
--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.0.4
--------------------------------------------------------------------------------
/bin/hadoop-stream:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'rubygems'
4 | require 'rake'
5 | require 'swineherd' ; include Swineherd
6 |
7 | #
8 | # Uses hadoop and rake's multitask capability to stream many source
9 | # files in parallel into a single destination directory.
10 | #
11 |
12 | Settings.define :input, :type => Array, :required => true, :description => "Comma separated list of directories (hdfs paths, s3 paths, etc) to stream"
13 | Settings.define :output, :required => true, :description => "Destination directory (s3 or hdfs)"
14 | Settings.resolve!
15 |
16 | #
17 | # Takes a hash of paths eg: {'filename' => 'full path'} and defines
18 | # a new streaming task for each one
19 | #
20 | def define_tasks list_of_tasks
21 | list_of_tasks.each do |basename, source|
22 | task basename do
23 | destination = File.join(Settings.output, basename) # each file gets its own output
24 | HDFS.stream(source, destination)
25 | end
26 | end
27 | end
28 |
29 | # Create a list of tasks, one per file
30 | list_of_tasks = Settings.input.inject({}){|list, path| list[File.basename(path)] = path; list}
31 | define_tasks list_of_tasks
32 |
33 | multitask :stream_all => list_of_tasks.keys
34 |
35 | Rake::MultiTask["stream_all"].invoke
36 |
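#
# Usage sketch (paths are hypothetical; the --input and --output flags come
# from the Settings defined above, with --input taking a comma separated list):
#
#   bin/hadoop-stream --input=hdfs:///tmp/logs_a,hdfs:///tmp/logs_b \
#                     --output=hdfs:///tmp/all_logs
#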
--------------------------------------------------------------------------------
/bin/hdp-tree:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env jruby
2 |
3 | require 'swineherd'
4 |
5 | #
6 | # Creates a 'tree' view of an hdfs path. It's not as pretty as the
7 | # unix tree command but that's only because I'm not smart enough to
8 | # print the hierarchy properly.
9 | #
10 |
11 | FS = Swineherd::FileSystem.get(:hdfs)
12 | path = ARGV[0]
13 |
14 | # Recursively list paths
15 | def lr path
16 | paths = FS.entries(path)
17 | if (paths && !paths.empty?)
18 | paths.map{|e| {File.basename(path) => lr(e)}}.flatten
19 | else
20 | File.basename(path)
21 | end
22 | end
23 |
24 |
25 | tree = {File.dirname(path) => lr(path)}.to_yaml
26 | puts tree
27 |
--------------------------------------------------------------------------------
/examples/pagerank/data/seinfeld_network.tsv:
--------------------------------------------------------------------------------
1 | jerry costanza81
2 | jerry ElaineBenes
3 | jerry kramer
4 | jerry NewmanUSPS
5 | jerry THE_REAL_PUDDY
6 | jerry JPeterman
7 | jerry FRANKCOSTANZA
8 | costanza81 jerry
9 | costanza81 ElaineBenes
10 | costanza81 kramer
11 | costanza81 NewmanUSPS
12 | costanza81 THE_REAL_PUDDY
13 | costanza81 JPeterman
14 | costanza81 FRANKCOSTANZA
15 | ElaineBenes jerry
16 | ElaineBenes costanza81
17 | ElaineBenes kramer
18 | ElaineBenes THE_REAL_PUDDY
19 | ElaineBenes JPeterman
20 | kramer jerry
21 | kramer costanza81
22 | kramer ElaineBenes
23 | kramer NewmanUSPS
24 | kramer THE_REAL_PUDDY
25 | kramer JPeterman
26 | kramer FRANKCOSTANZA
27 | NewmanUSPS jerry
28 | NewmanUSPS costanza81
29 | NewmanUSPS ElaineBenes
30 | NewmanUSPS kramer
31 | NewmanUSPS THE_REAL_PUDDY
32 | NewmanUSPS JPeterman
33 | NewmanUSPS FRANKCOSTANZA
34 | THE_REAL_PUDDY jerry
35 | THE_REAL_PUDDY costanza81
36 | THE_REAL_PUDDY ElaineBenes
37 | THE_REAL_PUDDY kramer
38 | THE_REAL_PUDDY NewmanUSPS
39 | THE_REAL_PUDDY JPeterman
40 | THE_REAL_PUDDY FRANKCOSTANZA
41 | THE_REAL_PUDDY Vegetable_Lasagna
42 | FRANKCOSTANZA jerry
43 | FRANKCOSTANZA costanza81
44 | FRANKCOSTANZA kramer
45 | jerry MortySeinfeld
46 | jerry HelenSeinfeld
47 | jerry Izzy_Mandelbaum
48 | jerry UncleLEO
49 | jerry Artie_Levine
50 | MortySeinfeld UncleLEO
51 | MortySeinfeld HelenSeinfeld
52 | MortySeinfeld Cousin_Jeffrey
53 | Izzy_Mandelbaum jerry
54 | UncleLEO jerry
55 | UncleLEO MortySeinfeld
56 | UncleLEO HelenSeinfeld
57 | UncleLEO Cousin_Jeffrey
58 | UncleLEO Babs_Kramer
59 | Babs_Kramer UncleLEO
60 | Cousin_Jeffrey jerry
61 | Cousin_Jeffrey MortySeinfeld
62 | Cousin_Jeffrey UncleLEO
63 | jerry Nana
64 | Nana MortySeinfeld
65 | MortySeinfeld Nana
66 | Cousin_Jeffrey Nana
67 | UncleLEO Nana
68 | Nana UncleLEO
69 | JackKlompus MortySeinfeld
70 | MortySeinfeld JackKlompus
71 | Dolores jerry
72 | MarlatheVirgin jerry
73 | TiaVanCamp jerry
74 | Rachel_Goldstein jerry
75 | meryl jerry
76 | MissRhodeIsland jerry
77 | Pam jerry
78 | Sheila jerry
79 | MelissaFlyingFree jerry
80 | jerry Dolores
81 | jerry MarlatheVirgin
82 | jerry TiaVanCamp
83 | jerry Rachel_Goldstein
84 | jerry meryl
85 | jerry MissRhodeIsland
86 | jerry Pam
87 | jerry Sheila
88 | jerry MelissaFlyingFree
89 | jerry Laura
90 | jerry Sandy
91 | Laura Sandy
92 | Laura jerry
93 | Sandy jerry
94 | kramer MissRhodeIsland
95 | kramer Pam
96 | jerry Jenna
97 | Jenna jerry
98 | jerry bania
99 | costanza81 bania
100 | bania jerry
101 | bania The_Soup_Nazi
102 | bania Poppie
103 | bania Jenna
104 | jerry Noreen
105 | jerry JackKlompus
106 | jerry Milos
107 | jerry JeanPaul_JeanPaul
108 | jerry FusilliJerry
109 | FusilliJerry jerry
110 | FusilliJerry kramer
111 | kramer FusilliJerry
112 | FusilliJerry FRANKCOSTANZA
113 | jerry pez
114 | jerry superman
115 | jerry bigstein
116 | Milos jerry
117 | jerry Roy_the_Dentist
118 | jerry BabuBhatt
119 | kramer BabuBhatt
120 | BabuBhatt kramer
121 | kramer Poppie
122 | Poppie kramer
123 | Poppie ElaineBenes
124 | ElaineBenes Poppie
125 | jerry Poppie
126 | jerry Shaky_the_Mohel
127 | jerry bubble_boy
128 | bubble_boy jerry
129 | MatthewSeinfeldFan jerry
130 | FragileFrankieMerman jerry
131 | jerry FragileFrankieMerman
132 | costanza81 FragileFrankieMerman
133 | costanza81 EstelleC
134 | FRANKCOSTANZA EstelleC
135 | EstelleC costanza81
136 | EstelleC FRANKCOSTANZA
137 | FRANKCOSTANZA Lloyd_Braun
138 | EstelleC Lloyd_Braun
139 | Lloyd_Braun FRANKCOSTANZA
140 | Lloyd_Braun costanza81
141 | Lloyd_Braun EstelleC
142 | kramer MrWilhelm
143 | costanza81 MrWilhelm
144 | costanza81 Allison
145 | Allison costanza81
146 | costanza81 LindsayEnright
147 | LindsayEnright costanza81
148 | costanza81 marisa_tomei
149 | costanza81 SusanRoss
150 | SusanRoss MrandMrsRoss
151 | MrandMrsRoss SusanRoss
152 | SusanRoss jerry
153 | jerry SusanRoss
154 | SusanRoss ElaineBenes
155 | kramer SusanRoss
156 | SusanRoss Russell_Dalrymple
157 | Russell_Dalrymple SusanRoss
158 | Russell_Dalrymple ElaineBenes
159 | SallyWeaver SusanRoss
160 | SusanRoss SallyWeaver
161 | SallyWeaver MrandMrsRoss
162 | SallyWeaver jerry
163 | WyckThayer MrandMrsRoss
164 | MrandMrsRoss WyckThayer
165 | SusanRoss WyckThayer
166 | WyckThayer SusanRoss
167 | costanza81 MrKruger
168 | MrKruger costanza81
169 | ElaineBenes MrKruger
170 | costanza81 guitarbern
171 | costanza81 intangibles
172 | costanza81 cushman
173 | cushman bigstein
174 | bigstein cushman
175 | costanza81 JonVoight
176 | costanza81 bubble_boy
177 | costanza81 Pastrami
178 | costanza81 bigstein
179 | Victoria bigstein
180 | bigstein Victoria
181 | cushman Victoria
182 | Victoria cushman
183 | bigstein intangibles
184 | bigstein guitarbern
185 | guitarbern intangibles
186 | guitarbern bigstein
187 | intangibles Victoria
188 | intangibles bigstein
189 | bubble_boy trivial_pursuit
190 | costanza81 StankyHanke
191 | ElaineBenes MrLippman
192 | ElaineBenes MrPitt
193 | ElaineBenes Jack_The_Wiz
194 | Jack_The_Wiz ElaineBenes
195 | TheSuzie ElaineBenes
196 | jerry TheSuzie
197 | TheSuzie Peggy
198 | Peggy TheSuzie
199 | Peggy JPeterman
200 | JPeterman kramer
201 | JPeterman TheSuzie
202 | ElaineBenes RobertKennedyJr
203 | MarlatheVirgin RobertKennedyJr
204 | RobertKennedyJr MarlatheVirgin
205 | Jackie_Chiles MarlatheVirgin
206 | Sue_Ellen_Mischke RobertKennedyJr
207 | JPeterman RobertKennedyJr
208 | RobertKennedyJr JPeterman
209 | MrPitt RobertKennedyJr
210 | RobertKennedyJr MrPitt
211 | kramer TinaRobbins
212 | TinaRobbins kramer
213 | ElaineBenes TinaRobbins
214 | TinaRobbins ElaineBenes
215 | Jake_Jarmel ElaineBenes
216 | Noreen
217 | ElaineBenes HalKitzmiller
218 | HalKitzmiller ElaineBenes
219 | kramer HalKitzmiller
220 | HalKitzmiller kramer
221 | Joel_Rifkin ElaineBenes
222 | Darryl ElaineBenes
223 | NedIsakoff ElaineBenes
224 | ElaineBenes NedIsakoff
225 | ElaineBenes Carl_Farbman
226 | Carl_Farbman JPeterman
227 | JPeterman Carl_Farbman
228 | CrazyJoeDavola jerry
229 | CrazyJoeDavola costanza81
230 | CrazyJoeDavola ElaineBenes
231 | CrazyJoeDavola kramer
232 | CrazyJoeDavola NewmanUSPS
233 | costanza81 CrazyJoeDavola
234 | ElaineBenes CrazyJoeDavola
235 | kramer CrazyJoeDavola
236 | NewmanUSPS CrazyJoeDavola
237 | jerry DrTimWhatley
238 | costanza81 DrTimWhatley
239 | ElaineBenes DrTimWhatley
240 | kramer DrTimWhatley
241 | NewmanUSPS DrTimWhatley
242 | DrTimWhatley jerry
243 | DrTimWhatley costanza81
244 | DrTimWhatley ElaineBenes
245 | DrTimWhatley kramer
246 | jerry TheDrake
247 | costanza81 TheDrake
248 | ElaineBenes TheDrake
249 | kramer TheDrake
250 | NewmanUSPS TheDrake
251 | TheDrake jerry
252 | TheDrake costanza81
253 | TheDrake ElaineBenes
254 | TheDrake kramer
255 | jerry JoeMayo
256 | ElaineBenes JoeMayo
257 | costanza81 JoeMayo
258 | JoeMayo jerry
259 | JoeMayo ElaineBenes
260 | JoeMayo costanza81
261 | jerry Alec_Berg
262 | Alec_Berg jerry
263 | ElaineBenes Sue_Ellen_Mischke
264 | Sue_Ellen_Mischke ElaineBenes
265 | Sue_Ellen_Mischke jerry
266 | jerry Sue_Ellen_Mischke
267 | kramer Sue_Ellen_Mischke
268 | kramer Mickey_Abbott
269 | jerry Mickey_Abbott
270 | costanza81 Mickey_Abbott
271 | NewmanUSPS Mickey_Abbott
272 | Mickey_Abbott kramer
273 | Mickey_Abbott jerry
274 | Mickey_Abbott costanza81
275 | Mickey_Abbott NewmanUSPS
276 | Babs_Kramer NewmanUSPS
277 | NewmanUSPS Babs_Kramer
278 | kramer Bob_Sacamano
279 | kramer Lomez
280 | kramer JayRiemenschneider
281 | kramer CorkyRamirez
282 | kramer LenNicademo
283 | kramer Specter
284 | kramer Brody
285 | Bob_Sacamano kramer
286 | Lomez kramer
287 | JayRiemenschneider kramer
288 | CorkyRamirez kramer
289 | LenNicademo kramer
290 | Specter kramer
291 | Brody kramer
292 | Bob_Sacamano Lomez
293 | Bob_Sacamano JayRiemenschneider
294 | Bob_Sacamano CorkyRamirez
295 | Bob_Sacamano LenNicademo
296 | Bob_Sacamano Specter
297 | Bob_Sacamano Brody
298 | Bob_Sacamano jerry
299 | Brody Bob_Sacamano
300 | Brody Lomez
301 | Brody JayRiemenschneider
302 | Brody CorkyRamirez
303 | Brody LenNicademo
304 | Brody Specter
305 | CorkyRamirez Bob_Sacamano
306 | CorkyRamirez Lomez
307 | CorkyRamirez JayRiemenschneider
308 | CorkyRamirez Specter
309 | CorkyRamirez Brody
310 | JayRiemenschneider Bob_Sacamano
311 | JayRiemenschneider Lomez
312 | JayRiemenschneider CorkyRamirez
313 | JayRiemenschneider LenNicademo
314 | JayRiemenschneider Brody
315 | LenNicademo Bob_Sacamano
316 | LenNicademo Lomez
317 | LenNicademo JayRiemenschneider
318 | LenNicademo CorkyRamirez
319 | LenNicademo Brody
320 | Lomez Bob_Sacamano
321 | Lomez JayRiemenschneider
322 | Lomez CorkyRamirez
323 | Lomez LenNicademo
324 | Lomez Brody
325 | Specter Bob_Sacamano
326 | Specter Lomez
327 | Specter CorkyRamirez
328 | kramer FranklinDelanoRomanowski
329 | kramer SalBass
330 | kramer EstelleC
331 | kramer Vegetable_Lasagna
332 | kramer MortySeinfeld
333 | kramer Noreen
334 | kramer Babs_Kramer
335 | kramer Shaky_the_Mohel
336 | kramer assman
337 | assman DrTimWhatley
338 | DrTimWhatley assman
339 | kramer Stan_the_Caddy
340 | Stan_the_Caddy Jackie_Chiles
341 | NewmanUSPS Jackie_Chiles
342 | Jackie_Chiles NewmanUSPS
343 | kramer Jackie_Chiles
344 | Jackie_Chiles kramer
345 | EstelleC kramer
346 | Vegetable_Lasagna kramer
347 | MortySeinfeld kramer
348 | Noreen kramer
349 | Babs_Kramer kramer
350 | Shaky_the_Mohel kramer
351 | Bob_Cobb kramer
352 | kramer Bob_Cobb
353 | Bob_Cobb FRANKCOSTANZA
354 | FRANKCOSTANZA Bob_Cobb
355 | kramer Earl_Haffler
356 | Earl_Haffler kramer
357 | kramer MikeMoffit
358 | MikeMoffit kramer
359 | MikeMoffit jerry
360 | NewmanUSPS Henry_Atkins
361 | AvisRental jerry
362 | jerry The_Soup_Nazi
363 | costanza81 The_Soup_Nazi
364 | ElaineBenes The_Soup_Nazi
365 | kramer The_Soup_Nazi
366 | NewmanUSPS The_Soup_Nazi
367 | FRANKCOSTANZA The_Soup_Nazi
368 | THE_REAL_PUDDY The_Soup_Nazi
369 | jerry ArtVandelay
370 | costanza81 ArtVandelay
371 | ElaineBenes ArtVandelay
372 | kramer ArtVandelay
373 | jerry Kel_Varnsen
374 | costanza81 Kel_Varnsen
375 | ElaineBenes Kel_Varnsen
376 | kramer Kel_Varnsen
377 | jerry HEPennypacker
378 | costanza81 HEPennypacker
379 | ElaineBenes HEPennypacker
380 | kramer HEPennypacker
381 | jerry MartinvanNostrand
382 | costanza81 MartinvanNostrand
383 | ElaineBenes MartinvanNostrand
384 | kramer MartinvanNostrand
385 | jerry WandaPepper
386 | costanza81 WandaPepper
387 | ElaineBenes WandaPepper
388 | kramer WandaPepper
389 | ArtVandelay ArtVandelay
390 | ArtVandelay Kel_Varnsen
391 | ArtVandelay HEPennypacker
392 | ArtVandelay MartinvanNostrand
393 | ArtVandelay WandaPepper
394 | Kel_Varnsen ArtVandelay
395 | Kel_Varnsen HEPennypacker
396 | Kel_Varnsen MartinvanNostrand
397 | Kel_Varnsen WandaPepper
398 | HEPennypacker ArtVandelay
399 | HEPennypacker Kel_Varnsen
400 | HEPennypacker MartinvanNostrand
401 | HEPennypacker WandaPepper
402 | MartinvanNostrand ArtVandelay
403 | MartinvanNostrand Kel_Varnsen
404 | MartinvanNostrand HEPennypacker
405 | MartinvanNostrand WandaPepper
406 | WandaPepper ArtVandelay
407 | WandaPepper Kel_Varnsen
408 | WandaPepper HEPennypacker
409 | WandaPepper MartinvanNostrand
410 | Kevin Gene
411 | Kevin Feldman
412 | Kevin Vargas
413 | Kevin ElaineBenes
414 | Gene Kevin
415 | Gene Feldman
416 | Gene Vargas
417 | Gene ElaineBenes
418 | Feldman Kevin
419 | Feldman Gene
420 | Feldman Vargas
421 | Feldman ElaineBenes
422 | Vargas Kevin
423 | Vargas Gene
424 | Vargas Feldman
425 | Vargas ElaineBenes
426 | ElaineBenes Kevin
427 | ElaineBenes Gene
428 | ElaineBenes Feldman
429 | ElaineBenes Vargas
430 |
--------------------------------------------------------------------------------
/examples/pagerank/pagerank.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | $LOAD_PATH << '../../lib'
4 | require 'swineherd' ; include Swineherd
5 | require 'swineherd/script' ; include Swineherd::Script
6 | require 'swineherd/filesystem'
7 |
8 | Settings.define :flow_id, :required => true, :description => "Flow id required to make run of workflow unique"
9 | Settings.define :iterations, :type => Integer, :default => 10, :description => "Number of pagerank iterations to run"
10 | Settings.define :hadoop_home, :default => '/usr/local/share/hadoop', :description => "Path to hadoop config"
11 | Settings.resolve!
12 |
13 | flow = Workflow.new(Settings.flow_id) do
14 |
15 | # The filesystems we're going to be working with
16 | hdfs = Swineherd::FileSystem.get(:hdfs)
17 | localfs = Swineherd::FileSystem.get(:file)
18 |
19 | # The scripts we're going to use
20 | initializer = PigScript.new('scripts/pagerank_initialize.pig')
21 | iterator = PigScript.new('scripts/pagerank.pig')
22 | finisher = WukongScript.new('scripts/cut_off_list.rb')
23 | plotter = RScript.new('scripts/histogram.R')
24 |
25 | #
26 | # Runs simple pig script to initialize pagerank. We must specify the input
27 | # here as this is the first step in the workflow. The output attribute is to
28 | # ensure idempotency and the options attribute is the hash that will be
29 | # converted into command-line args for the pig interpreter.
30 | #
31 | task :pagerank_initialize do
32 | initializer.options = {:adjlist => "/tmp/pagerank_example/seinfeld_network.tsv", :initgrph => next_output(:pagerank_initialize)}
33 | initializer.run(:hadoop) unless hdfs.exists? latest_output(:pagerank_initialize)
34 | end
35 |
36 | #
37 | # Runs multiple iterations of pagerank with another pig script and manages all
38 | # the intermediate outputs.
39 | #
40 | task :pagerank_iterate => [:pagerank_initialize] do
41 | iterator.options[:damp] = '0.85f'
42 | iterator.options[:curr_iter_file] = latest_output(:pagerank_initialize)
43 | Settings.iterations.times do
44 | iterator.options[:next_iter_file] = next_output(:pagerank_iterate)
45 | iterator.run(:hadoop) unless hdfs.exists? latest_output(:pagerank_iterate)
46 | iterator.refresh!
47 | iterator.options[:curr_iter_file] = latest_output(:pagerank_iterate)
48 | end
49 | end
50 |
51 | #
52 | # Here we use a wukong script to cut off the last field (a big pig bag of
53 | # links). Notice how every wukong script MUST have an input but pig scripts do
54 | # not.
55 | #
56 | task :cut_off_adjacency_list => [:pagerank_iterate] do
57 | finisher.input << latest_output(:pagerank_iterate)
58 | finisher.output << next_output(:cut_off_adjacency_list)
59 | finisher.run :hadoop unless hdfs.exists? latest_output(:cut_off_adjacency_list)
60 | end
61 |
62 | #
63 | # We want to pull down one result file, merge the part-000.. files into one file
64 | #
65 | task :merge_results => [:cut_off_adjacency_list] do
66 | merged_results = next_output(:merge_results)
67 | hdfs.merge(latest_output(:cut_off_adjacency_list), merged_results) unless hdfs.exists? merged_results
68 | end
69 |
70 | #
71 | # Cat results into a local directory with the same structure
72 | # eg. #{work_dir}/#{flow_id}/pull_down_results-0.
73 | #
74 | # FIXME: Bridging filesystems is cludgey.
75 | #
76 | task :pull_down_results => [:merge_results] do
77 | local_results = next_output(:pull_down_results)
78 | hdfs.copy_to_local(latest_output(:merge_results), local_results) unless localfs.exists? local_results
79 | end
80 |
81 | #
82 | # Plot 2nd column of the result as a histogram (requires R and
83 | # ggplot2). Note that the output here is a png file but doesn't have that
84 | # extension. Ensmarten me as to the right way to handle that?
85 | #
86 | task :plot_results => [:pull_down_results] do
87 | plotter.attributes = {
88 | :pagerank_data => latest_output(:pull_down_results),
89 | :plot_file => next_output(:plot_results), # <-- this will be a png...
90 | :raw_rank => "aes(x=d$V2)"
91 | }
92 | plotter.run(:local) unless localfs.exists? latest_output(:plot_results)
93 | end
94 |
95 | end
96 |
97 | flow.workdir = "/tmp/pagerank_example"
98 | flow.describe
99 | flow.run(:plot_results)
100 |
--------------------------------------------------------------------------------
/examples/pagerank/scripts/cut_off_list.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'rubygems'
4 | require 'wukong'
5 |
6 | #
7 | # Does the very simple job of cutting off the giant adjacency list
8 | #
9 | class CutMapper < Wukong::Streamer::RecordStreamer
10 | def process *args
11 | node_a, node_b, list = args
12 | yield [node_a, node_b]
13 | end
14 | end
15 |
16 | Wukong::Script.new(CutMapper, nil).run
17 |
--------------------------------------------------------------------------------
/examples/pagerank/scripts/histogram.R:
--------------------------------------------------------------------------------
1 | library(ggplot2);
2 | png('<%= plot_file %>', width=900, res=132);
3 | d <- read.table('<%= pagerank_data %>', header=FALSE, sep='\t');
4 | p <- ggplot(d, <%= raw_rank %>) + geom_histogram() + xlab("") + ylab("");
5 | p;
6 |
--------------------------------------------------------------------------------
/examples/pagerank/scripts/pagerank.pig:
--------------------------------------------------------------------------------
1 | --
2 | -- Runs exactly one pagerank iteration
3 | --
4 | network = LOAD '$CURR_ITER_FILE' AS (node_a:chararray, rank:float, out_links:bag { link:tuple (node_b:chararray) });
5 | sent_shares = FOREACH network GENERATE FLATTEN(out_links) AS node_b, (float)(rank / (float)SIZE(out_links)) AS share:float;
6 | sent_links = FOREACH network GENERATE node_a, out_links;
7 | rcvd_shares = COGROUP sent_links BY node_a INNER, sent_shares BY node_b;
8 | next_iter = FOREACH rcvd_shares
9 | {
10 | raw_rank = (float)SUM(sent_shares.share);
11 | -- treat the case that a node has no in links
12 | damped_rank = ((raw_rank IS NOT NULL AND raw_rank > 1.0e-12f) ? raw_rank*$DAMP + 1.0f - $DAMP : 0.0f);
13 | GENERATE
14 | group AS node_a,
15 | damped_rank AS rank,
16 | FLATTEN(sent_links.out_links) -- hack, should only be one bag, unbag it
17 | ;
18 | };
19 |
20 | STORE next_iter INTO '$NEXT_ITER_FILE';
21 |
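--
-- Usage sketch (paths are hypothetical). In the workflow these parameters are
-- filled in from the PigScript options hash; to run one iteration by hand:
--
--   pig -p CURR_ITER_FILE=/tmp/pagerank_example/pagerank_initialize-0 \
--       -p NEXT_ITER_FILE=/tmp/pagerank_example/pagerank_iterate-0 \
--       -p DAMP=0.85f \
--       pagerank.pig
--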
--------------------------------------------------------------------------------
/examples/pagerank/scripts/pagerank_initialize.pig:
--------------------------------------------------------------------------------
1 | --
2 | -- Create initial graph on which to iterate the pagerank algorithm.
3 | --
4 |
5 | --
6 | -- Generate a unique list of nodes with in links to cogroup on. This allows
7 | -- us to treat the case where nodes have in links but no out links.
8 | --
9 | network = LOAD '$ADJLIST' AS (node_a:chararray, node_b:chararray);
10 | cut_rhs = FOREACH network GENERATE node_b;
11 | uniq_rhs = DISTINCT cut_rhs;
12 | list_links = COGROUP network BY node_a, uniq_rhs BY node_b;
13 | count_links = FOREACH list_links
14 | {
15 | -- if network.node_b is empty there are no out links, set to dummy value
16 | out_links = (IsEmpty(network.node_b) ? {('dummy')} : network.node_b);
17 | GENERATE
18 | group AS node_a,
19 | 1.0f AS rank,
20 | out_links AS out_links
21 | ;
22 | };
23 |
24 | STORE count_links INTO '$INITGRPH';
25 |
--------------------------------------------------------------------------------
/lib/swineherd.rb:
--------------------------------------------------------------------------------
1 | require 'rubygems'
2 | require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
3 | require 'rake'
4 | require 'gorillib/logger/log'
5 |
6 | module Swineherd
7 | autoload :Template, 'swineherd/template'
8 | autoload :FileSystem, 'swineherd/filesystem'
9 | autoload :Script, 'swineherd/script'
10 | autoload :Workflow, 'swineherd/workflow'
11 |
12 | # For rake 0.9 compatibility
13 | include Rake::DSL if defined?(Rake::DSL)
14 | end
15 |
--------------------------------------------------------------------------------
/lib/swineherd/filesystem.rb:
--------------------------------------------------------------------------------
1 | module Swineherd
2 | autoload :BaseFileSystem, 'swineherd/filesystem/basefilesystem'
3 | autoload :LocalFileSystem, 'swineherd/filesystem/localfilesystem'
4 | autoload :HadoopFileSystem, 'swineherd/filesystem/hadoopfilesystem'
5 | autoload :S3FileSystem, 'swineherd/filesystem/s3filesystem'
6 |
7 | class FileSystem
8 |
9 | FILESYSTEMS = {
10 | 'file' => Swineherd::LocalFileSystem,
11 | 'hdfs' => Swineherd::HadoopFileSystem,
12 | 's3' => Swineherd::S3FileSystem
13 | }
14 |
15 | # A factory function that returns an instance of the requested class
16 | def self.get scheme, *args
17 | begin
18 | FILESYSTEMS[scheme.to_s].new *args
19 | rescue NoMethodError => e
20 | raise "Filesystem with scheme #{scheme} does not exist.\n #{e.message}"
21 | end
22 | end
23 |
24 | end
25 |
26 | end
27 |
--------------------------------------------------------------------------------
/lib/swineherd/filesystem/README_filesystem.textile:
--------------------------------------------------------------------------------
1 | h1. File System Abstraction
2 |
3 | Hackboxen need to access files and directories in order to do their
4 | stuff. We currently expect them to use at least the following types
5 | of filesystems:
6 |
7 | * Local File System
8 | * Ephemeral Hadoop cluster HDFS
9 | * s3/HDFS
10 |
11 | Each of these filesystem types has different methods to accomplish the same operations. In order to make this diversity more easily used by hackboxen, an abstraction layer has been created.
12 |
13 | h2. Interface
14 |
15 | A new @FileSystem@ class has a single class method @get@ taking two arguments:
16 | 
17 | * @scheme@: A token which specifies the filesystem scheme (@:file@, @:hdfs@, or @:s3@).
18 | * @*args@: Optional arguments (e.g. credentials)
19 | 
20 | The returned (abstracted) filesystem instance has the following methods:
21 |
22 | * @open(path,mode,blk)@: Return a @File@ like file handle object. @mode@ and @blk@ arguments are optional and work like the standard ruby @File.open@ arguments.
23 | * @rm(path)@: Works like UNIX @rm -r@.
24 | * @exists?(path)@: Returns @true@ if the file/directory exists
25 | * @mv(srcpath,dstpath)@: Renames/moves the file/directory.
26 | * @cp(srcpath,dstpath)@: Works like UNIX @cp -r@.
27 | * @mkpath(dirpath)@: Creates a directory and all required parent directories.
28 | * @type(path)@: Returns one of "dir", "file", or "symlink".
29 | * @entries(dirpath)@: Returns the files/subdirectories in this directory
30 |
31 | The @File@ object returned by the @open@ methods has the following methods:
32 |
33 | * @read@: Return the contents of the entire file as a string.
34 | * @readline@: Return the next line in the file, or nil if there are no more lines.
35 | * @write(string)@: Write @string@ to the file.
36 | * @close@: Close the file
37 |
38 | h2. Creating an abstraction
39 |
40 | Each abstraction is not expected to catch and rethrow exceptions of the abstracted subsystems. Rather, exceptions should pass through. However, each method should try to be built to behave similarly to the corresponding native ruby @File@ and @FileUtils@ methods.
41 |
42 | h2. Current State
43 |
44 | The currently implemented filesystem abstractions are @:file@ (local file system), @:hdfs@ (Hadoop), and @:s3@ (Amazon S3).
45 |
46 |
47 |
48 |
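h2. Example skeleton

As an illustration only (none of this exists in the library), a new abstraction, say the FTP filesystem mentioned in the project TODO list, might start out like this. Only the method names come from the interface above; the class and everything inside it is hypothetical:

<pre>
module Swineherd
  # Hypothetical sketch of a new filesystem abstraction.
  class FtpFileSystem
    include Swineherd::BaseFileSystem

    def initialize host, user, password
      # open the connection here
    end

    def exists? path
      # ask the remote server; return true or false
    end

    def mkpath path
      # create the directory and any missing parents
    end

    def open path, mode="r", &blk
      # return a File-like object with read, readline, write, and close
    end

    # rm, mv, cp, type, entries, and close follow the same pattern
  end
end
</pre>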
--------------------------------------------------------------------------------
/lib/swineherd/filesystem/basefilesystem.rb:
--------------------------------------------------------------------------------
1 | module Swineherd
2 |
3 | #
4 | # All methods a filesystem should have
5 | #
6 | module BaseFileSystem
7 |
8 | #
9 | # Return a new instance of 'this' filesystem. Classes that include this
10 | # module are expected to know how to pull their particular set of arguments
11 | # from *args and initialize themselves by opening any required connections, &c.
12 | #
13 | def initialize *args
14 | end
15 |
16 | #
17 | # Open a file in this filesystem. Should return a usable file handle for
18 | # the mode (read 'r' or write 'w') given. File classes should, at minimum, have
19 | # the methods defined in BaseFile
20 | #
21 | def open path, mode="r", &blk
22 | end
23 |
24 | #
25 | # Recursively measure the size of path. Results in bytes.
26 | #
27 | def size path
28 | end
29 |
30 | #
31 | # Recursively delete the path and all paths below it.
32 | #
33 | def rm path
34 | end
35 |
36 | #
37 | # Returns true if the file or path exists and false otherwise.
38 | #
39 | def exists? path
40 | end
41 |
42 | #
43 | # Moves the source path to the destination path
44 | #
45 | def mv srcpath, dstpath
46 | end
47 |
48 | #
49 | # Recursively copies all files and directories under srcpath to dstpath
50 | #
51 | def cp srcpath, dstpath
52 | end
53 |
54 | #
55 | # Make directory path if it does not (partly) exist
56 | #
57 | def mkpath path
58 | end
59 |
60 | #
61 | # Return file type ("directory" or "file" or "symlink")
62 | #
63 | def type path
64 | end
65 |
66 | #
67 | # Give contained files/dirs
68 | #
69 | def entries dirpath
70 | end
71 |
72 | #
73 | # For running tasks idempotently. Returns true if no paths exist, false if all paths exist,
74 | # and raises an error otherwise.
75 | #
76 | def check_paths paths
77 | exist_count = paths.inject(0){|cnt, path| cnt += 1 if exists?(path); cnt}
78 | raise "Indeterminate output state" if (exist_count > 0) && (exist_count < paths.size)
79 | return true if exist_count == 0
80 | false
81 | end
82 |
83 | #
84 | # Needs to close the filesystem by cleaning up any open connections, &c.
85 | #
86 | def close *args
87 | end
88 |
89 | class BaseFile
90 | attr_accessor :path, :scheme, :mode
91 |
92 |
93 | def initialize *args, &blk
94 | end
95 |
96 | #
97 | # A new file in the filesystem needs to be instantiated with a
98 | # path, a mode (read 'r' or write 'w').
99 | #
100 | def open path, mode="r", &blk
101 | end
102 |
103 | #
104 | # Return the whole file as a string
105 | #
106 | def read
107 | end
108 |
109 | #
110 | # Return a line from stream
111 | #
112 | def readline
113 | end
114 |
115 | #
116 | # Writes a string to the file
117 | #
118 | def write string
119 | end
120 |
121 | #
122 | # Close the file
123 | #
124 | def close *args
125 | end
126 |
127 | end
128 |
129 | end
130 |
131 | end
132 |
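# Usage sketch for the idempotency helper above (paths are hypothetical):
#
#   fs = Swineherd::FileSystem.get(:hdfs)
#   outputs = ['/tmp/my_flow-0/step_one-0', '/tmp/my_flow-0/step_two-0']
#   run_the_step if fs.check_paths(outputs)  # true only when none of the outputs exist yet
#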
--------------------------------------------------------------------------------
/lib/swineherd/filesystem/filesystems.rb:
--------------------------------------------------------------------------------
1 | require 'fileutils'
2 |
3 | class FileSystem
4 |
5 | # A factory function that returns an instance of the requested class
6 | def self.get(scheme, *args)
7 | if scheme == :file
8 | LocalFileSystem.new()
9 | else
10 | nil
11 | end
12 | end
13 |
14 | class LocalFileSystem
15 |
16 | # Open a file in this filesystem
17 | def open(path,mode="r",&blk)
18 | return LocalFile.new(path,mode,&blk)
19 | end
20 |
21 | # Works like rm -r
22 | def rm(path)
23 | FileUtils.rm_r(path)
24 | end
25 |
26 | # Does this exist?
27 | def exists?(path)
28 | File.exists?(path)
29 | end
30 |
31 | # Works like UNIX mv
32 | def mv(srcpath,dstpath)
33 | FileUtils.mv(srcpath,dstpath)
34 | end
35 |
36 | # Works like UNIX cp -r
37 | def cp(srcpath,dstpath)
38 | FileUtils.cp_r(srcpath,dstpath)
39 | end
40 |
41 | # Make directory path if it does not (partly) exist
42 | def mkpath(path)
43 | FileUtils.mkpath(path)
44 | end
45 |
46 | # Return file type ("directory", "file", or "symlink")
47 | def type(path)
48 | if File.symlink?(path)
49 | return "symlink"
50 | end
51 | if File.directory?(path)
52 | return "directory"
53 | end
54 | if File.file?(path)
55 | return "file"
56 | end
57 | "unknown"
58 | end
59 |
60 | # Give contained files/dirs
61 | def entries(dirpath)
62 | if type(dirpath) != "directory"
63 | return nil
64 | end
65 | Dir.entries(dirpath)
66 | end
67 |
68 | class LocalFile
69 | attr_accessor :path, :scheme, :mode
70 |
71 | def initialize(path,mode="r",&blk)
72 | @path=path
73 | @mode=mode
74 | @handle=File.open(path,mode,&blk)
75 | end
76 |
77 | def open(path,mode="r")
78 | # Only "r" and "w" modes are supported.
79 | initialize(path,mode)
80 | end
81 |
82 | # Return whole file and as a string
83 | def read
84 | @handle.read
85 | end
86 |
87 | # Return a line from stream
88 | def readline
89 | @handle.gets
90 | end
91 |
92 | # Writes to the file
93 | def write(string)
94 | @handle.write(string)
95 | end
96 |
97 | # Close file
98 | def close
99 | @handle.close
100 | end
101 | end
102 | end
103 | end
104 |
--------------------------------------------------------------------------------
/lib/swineherd/filesystem/hadoopfilesystem.rb:
--------------------------------------------------------------------------------
1 | module Swineherd
2 |
3 | #
4 | # Methods for dealing with hadoop distributed file system (hdfs). This class
5 | # requires that you run with JRuby as it makes use of the native java hadoop
6 | # libraries.
7 | #
8 | class HadoopFileSystem
9 |
10 | include Swineherd::BaseFileSystem
11 |
12 | attr_accessor :conf, :hdfs
13 |
14 | #
15 | # Initialize a new hadoop file system, needs path to hadoop configuration
16 | #
17 | def initialize *args
18 | check_and_set_environment
19 | @conf = Java::org.apache.hadoop.conf.Configuration.new
20 | @hdfs = Java::org.apache.hadoop.fs.FileSystem.get(@conf)
21 | end
22 |
23 | #
24 | # Make sure environment is sane then set up environment for use
25 | #
26 | def check_and_set_environment
27 | check_env
28 | set_env
29 | end
30 |
31 | def open path, mode="r", &blk
32 | HadoopFile.new(path,mode,self,&blk)
33 | end
34 |
35 | def size path
36 | lr(path).inject(0){|sz, f| sz += @hdfs.get_file_status(Path.new(f)).get_len}
37 | end
38 |
39 | #
40 | # Recursively list paths
41 | #
42 | def lr path
43 | paths = entries(path)
44 | if (paths && !paths.empty?)
45 | paths.map{|e| lr(e)}.flatten
46 | else
47 | path
48 | end
49 | end
50 |
51 | def rm path
52 | @hdfs.delete(Path.new(path), true)
53 | [path]
54 | end
55 |
56 | def exists? path
57 | @hdfs.exists(Path.new(path))
58 | end
59 |
60 | def mv srcpath, dstpath
61 | @hdfs.rename(Path.new(srcpath), Path.new(dstpath))
62 | end
63 |
64 | def cp srcpath, dstpath
65 | FileUtil.copy(@hdfs, Path.new(srcpath), @hdfs, Path.new(dstpath), false, @conf)
66 | end
67 |
68 | def mkpath path
69 | @hdfs.mkdirs(Path.new(path))
70 | path
71 | end
72 |
73 | def type path
74 | return "unknown" unless exists? path
75 | status = @hdfs.get_file_status(Path.new(path))
76 | return "directory" if status.is_dir?
77 | "file"
78 | # case
79 | # when status.isFile then
80 | # return "file"
81 | # when status.is_directory? then
82 | # return "directory"
83 | # when status.is_symlink? then
84 | # return "symlink"
85 | # end
86 | end
87 |
88 | def entries dirpath
89 | return unless type(dirpath) == "directory"
90 | list = @hdfs.list_status(Path.new(dirpath))
91 | list.map{|path| path.get_path.to_s} rescue []
92 | end
93 |
94 | #
95 | # Merge all part files in a directory into one file.
96 | #
97 | def merge srcdir, dstfile
98 | FileUtil.copy_merge(@hdfs, Path.new(srcdir), @hdfs, Path.new(dstfile), false, @conf, "")
99 | end
100 |
101 | #
102 | # This is hackety. Use with caution.
103 | #
104 | def stream input, output
105 | require 'uri'
106 | input_fs_scheme = URI.parse(input).scheme
107 | output_fs_scheme = URI.parse(output).scheme
108 | system("#{@hadoop_home}/bin/hadoop \\
109 | jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
110 | -D mapred.job.name=\"Stream { #{input_fs_scheme}(#{File.basename(input)}) -> #{output_fs_scheme}(#{File.basename(output)}) }\" \\
111 | -D mapred.min.split.size=1000000000 \\
112 | -D mapred.reduce.tasks=0 \\
113 | -mapper \"/bin/cat\" \\
114 | -input \"#{input}\" \\
115 | -output \"#{output}\"")
116 | end
117 |
118 | #
119 | # BZIP
120 | #
121 | def bzip input, output
122 | system("#{@hadoop_home}/bin/hadoop \\
123 | jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
124 | -D mapred.output.compress=true \\
125 | -D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \\
126 | -D mapred.reduce.tasks=1 \\
127 | -mapper \"/bin/cat\" \\
128 | -reducer \"/bin/cat\" \\
129 | -input \"#{input}\" \\
130 | -output \"#{output}\"")
131 | end
132 |
133 | #
134 |     # Merges many input files into :reduce_tasks output files
135 | #
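  |     # For example (hypothetical paths):
  |     #
  |     #   fs.dist_merge ["logs/2011/01", "logs/2011/02"], "logs/merged",
  |     #                 :reduce_tasks => 10, :partition_fields => 2
  |     #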
136 | def dist_merge inputs, output, options = {}
137 | options[:reduce_tasks] ||= 25
138 | options[:partition_fields] ||= 2
139 | options[:sort_fields] ||= 2
140 |       options[:field_separator] ||= '\t'
141 | names = inputs.map{|inp| File.basename(inp)}.join(',')
142 | cmd = "#{@hadoop_home}/bin/hadoop \\
143 | jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
144 | -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
145 | -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
146 | -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
147 | -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
148 | -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
149 | -D mapred.min.split.size=1000000000 \\
150 | -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
151 | -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
152 | -mapper \"/bin/cat\" \\
153 | -reducer \"/usr/bin/uniq\" \\
154 | -input \"#{inputs.join(',')}\" \\
155 | -output \"#{output}\""
156 | puts cmd
157 | system cmd
158 | end
159 |
160 | #
161 | # Copy hdfs file to local filesystem
162 | #
163 | def copy_to_local srcfile, dstfile
164 | @hdfs.copy_to_local_file(Path.new(srcfile), Path.new(dstfile))
165 | end
166 |
167 | #
168 | # Copy local file to hdfs filesystem
169 | #
170 | def copy_from_local srcfile, dstfile
171 | @hdfs.copy_from_local_file(Path.new(srcfile), Path.new(dstfile))
172 | end
173 |
174 | def close *args
175 | @hdfs.close
176 | end
177 |
178 | class HadoopFile
179 | attr_accessor :path, :handle, :hdfs
180 |
181 | #
182 | # In order to open input and output streams we must pass around the hadoop fs object itself
183 | #
184 | def initialize path, mode, fs, &blk
185 | @fs = fs
186 | @path = Path.new(path)
187 | case mode
188 | when "r" then
189 | raise "#{@fs.type(path)} is not a readable file - #{path}" unless @fs.type(path) == "file"
190 | @handle = @fs.hdfs.open(@path).to_io(&blk)
191 | when "w" then
192 | # Open path for writing
193 | raise "Path #{path} is a directory." unless (@fs.type(path) == "file") || (@fs.type(path) == "unknown")
194 | @handle = @fs.hdfs.create(@path).to_io.to_outputstream
195 | if block_given?
196 | yield self
197 | self.close # muy muy importante
198 | end
199 | end
200 | end
201 |
202 | def read
203 | @handle.read
204 | end
205 |
206 | def readline
207 | @handle.readline
208 | end
209 |
210 | def write string
211 | @handle.write(string.to_java_string.get_bytes)
212 | end
213 |
214 | def puts string
215 | write(string+"\n")
216 | end
217 |
218 | def close
219 | @handle.close
220 | end
221 |
222 | end
223 |
224 | # #
225 | # # Distributed streaming from input to output
226 | # #
227 | #
228 | # #
229 | # # Given an array of input dirs, stream all into output dir and remove duplicate records.
230 | # # Reasonable default hadoop streaming options are chosen.
231 | # #
232 | # def self.merge inputs, output, options = {}
233 | # options[:reduce_tasks] ||= 25
234 | # options[:partition_fields] ||= 2
235 | # options[:sort_fields] ||= 2
236 | # options[:field_separator] ||= '/t'
237 | # names = inputs.map{|inp| File.basename(inp)}.join(',')
238 | # cmd = "${HADOOP_HOME}/bin/hadoop \\
239 | # jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \\
240 | # -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
241 | # -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
242 | # -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
243 | # -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
244 | # -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
245 | # -D mapred.min.split.size=1000000000 \\
246 | # -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
247 | # -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
248 | # -mapper \"/bin/cat\" \\
249 | # -reducer \"/usr/bin/uniq\" \\
250 | # -input \"#{inputs.join(',')}\" \\
251 | # -output \"#{output}\""
252 | # puts cmd
253 | # system cmd
254 | # end
255 | #
256 | # #
257 | # # Concatenates a hadoop dir or file into a local file
258 | # #
259 | # def self.cat_to_local src, dest
260 | # system %Q{hadoop fs -cat #{src}/[^_]* > #{dest}} unless File.exist?(dest)
261 | # end
262 | #
263 |
264 | #
265 | # Check that we are running with jruby, check for hadoop home. hadoop_home
266 | # is preferentially set to the HADOOP_HOME environment variable if it's set,
267 | # '/usr/local/share/hadoop' if HADOOP_HOME isn't defined, and
268 | # '/usr/lib/hadoop' if '/usr/local/share/hadoop' doesn't exist. If all else
269 |     # fails, inform the user that HADOOP_HOME really should be set.
270 | #
271 | def check_env
272 | begin
273 | require 'java'
274 | rescue LoadError => e
275 | raise "\nJava not found, are you sure you're running with JRuby?\n" + e.message
276 | end
277 | @hadoop_home = (ENV['HADOOP_HOME'] || '/usr/local/share/hadoop')
278 | @hadoop_home = '/usr/lib/hadoop' unless File.exist? @hadoop_home
279 | raise "\nHadoop installation not found, try setting HADOOP_HOME\n" unless File.exist? @hadoop_home
280 | end
281 |
282 | #
283 | # Place hadoop jars in class path, require appropriate jars, set hadoop conf
284 | #
285 | def set_env
286 | require 'java'
287 | @hadoop_conf = (ENV['HADOOP_CONF_DIR'] || File.join(@hadoop_home, 'conf'))
288 | @hadoop_conf += "/" unless @hadoop_conf.end_with? "/"
289 | $CLASSPATH << @hadoop_conf
290 | Dir["#{@hadoop_home}/hadoop*.jar", "#{@hadoop_home}/lib/*.jar"].each{|jar| require jar}
291 |
292 | java_import 'org.apache.hadoop.conf.Configuration'
293 | java_import 'org.apache.hadoop.fs.Path'
294 | java_import 'org.apache.hadoop.fs.FileSystem'
295 | java_import 'org.apache.hadoop.fs.FileUtil'
296 | java_import 'org.apache.hadoop.mapreduce.lib.input.FileInputFormat'
297 | java_import 'org.apache.hadoop.mapreduce.lib.output.FileOutputFormat'
298 | java_import 'org.apache.hadoop.fs.FSDataOutputStream'
299 | java_import 'org.apache.hadoop.fs.FSDataInputStream'
300 |
301 | end
302 |
303 | end
304 |
305 | end
306 |
--------------------------------------------------------------------------------
/lib/swineherd/filesystem/localfilesystem.rb:
--------------------------------------------------------------------------------
1 | require 'fileutils'
2 | require 'find'
3 | module Swineherd
4 |
5 | class LocalFileSystem
6 |
7 | include Swineherd::BaseFileSystem
8 |
9 | def initialize *args
10 | end
11 |
12 | def open path, mode="r", &blk
13 | return LocalFile.new path, mode, &blk
14 | end
15 |
16 | def size path
17 | sz = 0
18 | Find.find(path){|f| sz += File.size(f)}
19 | sz
20 | end
21 |
22 | def rm path
23 | FileUtils.rm_r path
24 | end
25 |
26 | def exists? path
27 |       File.exist?(path)
28 | end
29 |
30 | def mv srcpath, dstpath
31 | FileUtils.mv(srcpath,dstpath)
32 | end
33 |
34 | def cp srcpath, dstpath
35 | FileUtils.cp_r(srcpath,dstpath)
36 | end
37 |
38 | def mkpath path
39 | FileUtils.mkpath path
40 | end
41 |
42 | def type path
43 | case
44 | when File.symlink?(path) then
45 | return "symlink"
46 | when File.directory?(path) then
47 | return "directory"
48 | when File.file?(path) then
49 | return "file"
50 | end
51 | "unknown"
52 | end
53 |
54 | def entries dirpath
55 | return unless (type(dirpath) == "directory")
56 | Dir.entries(dirpath)
57 | end
58 |
59 | class LocalFile
60 | attr_accessor :path, :scheme, :handle, :mode
61 |
62 | def initialize path, mode="r", &blk
63 | @path = path
64 | @mode = mode
65 | @handle = File.open(path,mode,&blk)
66 | end
67 |
68 | def open path, mode="r", &blk
69 | initialize(path,mode,&blk)
70 | end
71 |
72 | def read
73 | @handle.read
74 | end
75 |
76 | def readline
77 | @handle.gets
78 | end
79 |
80 | def write string
81 | @handle.write(string)
82 | end
83 |
84 | def close
85 | @handle.close
86 | end
87 | end
88 |
89 | end
90 | end
91 |
--------------------------------------------------------------------------------
/lib/swineherd/filesystem/localfs.rb:
--------------------------------------------------------------------------------
1 | module Swineherd
2 | class LocalFS
3 | def self.check_paths paths
4 | exist_count = 0 # no outputs exist
5 | paths.each{|path| exist_count += 1 if File.exist?(path) }
6 | raise "Indeterminate output state" if (exist_count > 0) && (exist_count < paths.size)
7 | return true if exist_count == 0
8 | false
9 | end
10 | end
11 | end
12 |
--------------------------------------------------------------------------------
/lib/swineherd/filesystem/s3filesystem.rb:
--------------------------------------------------------------------------------
1 | require 'tempfile'
2 | module Swineherd
3 |
4 | #
5 | # Methods for interacting with Amazon's Simple Store Service (s3).
6 | #
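  |   # Paths are given in "bucket/key" form (see the bucket and key_path helpers
  |   # below). A minimal sketch, with hypothetical bucket and credentials:
  |   #
  |   #   fs = Swineherd::S3FileSystem.new('my_access_key_id', 'my_secret_access_key')
  |   #   fs.mkpath "my-bucket"
  |   #   fs.open("my-bucket/example/foo.tsv", "w") { |f| f.puts "hello\ts3" }
  |   #   fs.exists? "my-bucket/example/foo.tsv" #=> true
  |   #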
7 | class S3FileSystem
8 |
9 | include Swineherd::BaseFileSystem
10 |
11 | attr_accessor :s3
12 |
13 | #
14 | # Initialize a new s3 file system, needs path to aws keys
15 | #
16 | def initialize aws_access_key_id, aws_secret_access_key
17 | require 'right_aws'
18 | @s3 = RightAws::S3.new(aws_access_key_id, aws_secret_access_key)
19 | end
20 |
21 | def open path, mode="r", &blk
22 | S3File.new(path,mode,self,&blk)
23 | end
24 |
25 | def size path
26 | sz = 0
27 | if type(path) == "directory"
28 | lr(path).each do |f|
29 | sz += file_size(f)
30 | end
31 | else
32 | sz += file_size(path)
33 | end
34 | sz
35 | end
36 |
37 | def file_size path
38 | containing_bucket = bucket(path)
39 | header = @s3.interface.head(containing_bucket, key_path(path))
40 | header['content-length'].to_i
41 | end
42 |
43 | def rm path
44 | bkt = bucket(path)
45 | key = key_path(path)
46 | if key.empty? # only the bucket was passed in, delete it
47 | @s3.interface.force_delete_bucket(bkt)
48 | else
49 | case type(path)
50 | when "directory" then
51 | keys_to_delete = lr(path)
52 | keys_to_delete.each do |k|
53 | key_to_delete = key_path(k)
54 | @s3.interface.delete(bkt, key_to_delete)
55 | end
56 | keys_to_delete
57 | when "file" then
58 | @s3.interface.delete(bkt, key)
59 | [path]
60 | end
61 | end
62 | end
63 |
64 | def bucket path
65 | uri = URI.parse(path)
66 | uri.path.split('/').reject{|x| x.empty?}.first
67 | end
68 |
69 | def key_path path
70 | uri = URI.parse(path)
71 | File.join(uri.path.split('/').reject{|x| x.empty?}[1..-1])
72 | end
73 |
74 | def needs_trailing_slash pre
75 | has_trailing_slash = pre.end_with? '/'
76 | is_empty_prefix = pre.empty?
77 | !(has_trailing_slash || is_empty_prefix)
78 | end
79 |
80 | def full_contents path
81 | bkt = bucket(path)
82 | pre = key_path(path)
83 | pre += '/' if needs_trailing_slash(pre)
84 | contents = []
85 | s3.interface.incrementally_list_bucket(bkt, {'prefix' => pre, 'delimiter' => '/'}) do |res|
86 | contents += res[:common_prefixes].map{|c| File.join(bkt,c)}
87 | contents += res[:contents].map{|c| File.join(bkt, c[:key])}
88 | end
89 | contents
90 | end
91 |
92 | def exists? path
93 | object = File.basename(path)
94 | search_dir = File.dirname(path)
95 | case search_dir
96 | when '.' then # only a bucket was passed in
97 | begin
98 | (full_contents(object).size > 0)
99 | rescue RightAws::AwsError => e
100 | if e.message =~ /nosuchbucket/i
101 | false
102 | else
103 | raise e
104 | end
105 | end
106 | else
107 | search_dir_contents = full_contents(search_dir).map{|c| File.basename(c).gsub(/\//, '')}
108 | search_dir_contents.include?(object)
109 | end
110 | end
111 |
112 | def mv srcpath, dstpath
113 | src_bucket = bucket(srcpath)
114 | dst_bucket = bucket(dstpath)
115 | dst_key_path = key_path(dstpath)
116 | mkpath(dstpath)
117 | case type(srcpath)
118 | when "directory" then
119 | paths_to_copy = lr(srcpath)
120 | common_dir = common_directory(paths_to_copy)
121 | paths_to_copy.each do |path|
122 | src_key = key_path(path)
123 | dst_key = File.join(dst_key_path, path.gsub(common_dir, ''))
124 | @s3.interface.move(src_bucket, src_key, dst_bucket, dst_key)
125 | end
126 | when "file" then
127 | @s3.interface.move(src_bucket, key_path(srcpath), dst_bucket, dst_key_path)
128 | end
129 | end
130 |
131 | def cp srcpath, dstpath
132 | src_bucket = bucket(srcpath)
133 | dst_bucket = bucket(dstpath)
134 | dst_key_path = key_path(dstpath)
135 | mkpath(dstpath)
136 | case type(srcpath)
137 | when "directory" then
138 | paths_to_copy = lr(srcpath)
139 | common_dir = common_directory(paths_to_copy)
140 | paths_to_copy.each do |path|
141 | src_key = key_path(path)
142 | dst_key = File.join(dst_key_path, path.gsub(common_dir, ''))
143 | @s3.interface.copy(src_bucket, src_key, dst_bucket, dst_key)
144 | end
145 | when "file" then
146 | @s3.interface.copy(src_bucket, key_path(srcpath), dst_bucket, dst_key_path)
147 | end
148 | end
149 |
150 | # right now this only works on single files
151 | def copy_to_local srcpath, dstpath
152 | src_bucket = bucket(srcpath)
153 | src_key_path = key_path(srcpath)
154 | dstfile = File.new(dstpath, 'w')
155 | @s3.interface.get(src_bucket, src_key_path) do |chunk|
156 | dstfile.write(chunk)
157 | end
158 | dstfile.close
159 | end
160 |
161 |     # This is a bit funny: there's actually no need to create a 'path' since
162 | # s3 is nothing more than a glorified key-value store. When you create a
163 | # 'file' (key) the 'path' will be created for you. All we do here is create
164 | # the bucket unless it already exists.
165 | #
166 | def mkpath path
167 | bkt = bucket(path)
168 | key = key_path(path)
169 | if key.empty?
170 | @s3.interface.create_bucket(bkt)
171 | else
172 | @s3.interface.create_bucket(bkt) unless exists? bkt
173 | end
174 | path
175 | end
176 |
177 | def type path
178 | return "unknown" unless exists? path
179 | return "directory" if full_contents(path).size > 0
180 | "file"
181 | end
182 |
183 | def entries dirpath
184 | return unless type(dirpath) == "directory"
185 | full_contents(dirpath)
186 | end
187 |
188 | # Recursively list paths
189 | def lr path
190 | paths = entries(path)
191 | if paths
192 | paths.map{|e| lr(e)}.flatten
193 | else
194 | path
195 | end
196 | end
197 |
198 | #
199 | # Ick.
200 | #
201 | def common_directory paths
202 | dirs = paths.map{|path| path.split('/')}
203 | min_size = dirs.map{|splits| splits.size}.min
204 | dirs.map!{|splits| splits[0...min_size]}
205 |       uncommon = dirs.transpose.each_with_index.find{|dirnames, idx| dirnames.uniq.length > 1}
206 |       dirs[0][0...(uncommon ? uncommon.last : min_size)].join('/') # nil find => paths share all components
207 | end
208 |
209 | def put srcpath, destpath
210 | dest_bucket = bucket(destpath)
211 | if File.directory? srcpath
212 | # handle Dir later
213 | else
214 | key = srcpath
215 | end
216 | @s3.interface.put(dest_bucket, key, File.open(srcpath))
217 | end
218 |
219 | def close *args
220 | end
221 |
222 | class S3File
223 | attr_accessor :path, :handle, :fs
224 |
225 | #
226 | # In order to open input and output streams we must pass around the s3 fs object itself
227 | #
228 | def initialize path, mode, fs, &blk
229 | @fs = fs
230 | @path = path
231 | case mode
232 | when "r" then
233 | raise "#{fs.type(path)} is not a readable file - #{path}" unless fs.type(path) == "file"
234 | when "w" then
235 | raise "Path #{path} is a directory." unless (fs.type(path) == "file") || (fs.type(path) == "unknown")
236 | @handle = Tempfile.new('s3filestream')
237 | if block_given?
238 | yield self
239 | close
240 | end
241 | end
242 | end
243 |
244 | #
245 | # Faster than iterating
246 | #
247 | def read
248 | resp = fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path))
249 | resp
250 | end
251 |
252 | #
253 | # This is a little hackety. That is, once you call (.each) on the object the full object starts
254 | # downloading...
255 | #
256 | def readline
257 | @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
258 | begin
259 | @handle.next
260 | rescue StopIteration, NoMethodError
261 | @handle = nil
262 | raise EOFError.new("end of file reached")
263 | end
264 | end
265 |
266 | def write string
267 | @handle.write(string)
268 | end
269 |
270 | def puts string
271 | write(string+"\n")
272 | end
273 |
274 | def close
275 | if @handle
276 | @handle.read
277 | fs.s3.interface.put(fs.bucket(path), fs.key_path(path), File.open(@handle.path, 'r'))
278 | @handle.close
279 | end
280 | @handle = nil
281 | end
282 |
283 | end
284 |
285 | end
286 |
287 | end
288 |
--------------------------------------------------------------------------------
/lib/swineherd/script.rb:
--------------------------------------------------------------------------------
1 | module Swineherd
2 | module Script
3 |
4 | autoload :WukongScript, 'swineherd/script/wukong_script'
5 | autoload :PigScript, 'swineherd/script/pig_script'
6 | autoload :RScript, 'swineherd/script/r_script'
7 |
8 | module Common
9 |
10 | attr_accessor :input, :output, :options, :attributes
11 |       def initialize(source, input = [], output = [], options = {}, attributes = {})
12 | @source = source
13 | @input = input
14 | @output = output
15 | @options = options
16 | @attributes = attributes
17 | end
18 |
19 | #
20 |       # Allows for setting the environment the script will be run in
21 | #
22 | def env
23 | ENV
24 | end
25 |
26 | def script
27 | @script ||= Template.new(@source, @attributes).substitute!
28 | end
29 |
30 | #
31 | # So we can reuse ourselves
32 | #
33 | def refresh!
34 | @script = nil
35 | @output = []
36 | @input = []
37 | end
38 |
39 | #
40 | # This depends on the type of script
41 | #
42 | def cmd
43 | raise "Override this in subclass!"
44 | end
45 |
46 | #
47 | # Override this in subclass to decide how script runs in 'local' mode
48 | # Best practice is that it needs to be able to run on a laptop w/o
49 | # hadoop.
50 | #
51 | def local_cmd
52 | raise "Override this in subclass!"
53 | end
54 |
55 | #
56 | # Default is to run with hadoop
57 | #
58 | def run mode=:hadoop
59 | case mode
60 | when :local then
61 | sh local_cmd do |res, ok|
62 | Log.info("Exit status was #{ok}")
63 | raise "Local mode script failed with exit status #{ok}" if ok != 0
64 | end
65 | when :hadoop then
66 | sh cmd do |res, ok|
67 | Log.info("Exit status was #{ok}")
68 | raise "Hadoop mode script failed with exit status #{ok}" if ok != 0
69 | end
70 | end
71 | end
72 |
73 | end
74 | end
75 | end
76 |
--------------------------------------------------------------------------------
/lib/swineherd/script/hadoop_script.rb:
--------------------------------------------------------------------------------
1 | module Swineherd::Script
2 |
3 | #
4 | # native Java map-reduce
5 | #
6 | class HadoopScript
7 | include Common
8 |     attr_accessor :main_class, :run_jar, :java_options, :hadoop_classpath, :libjars, :hadoop_home
9 |
10 | def initialize *args
11 | super(*args)
12 | @options = Hash.new{|h,k| h[k] = {}} # need to support nested options for this
13 | end
14 |
15 | #
16 | # Converts an arbitrarily nested hash to flattened arguments
17 | # for passing to java program. For example:
18 | #
19 | # {:mapred => {:reduce => {:tasks => 0}}}
20 | #
21 | # will transform to:
22 | #
23 | # '-Dmapred.reduce.tasks=0'
24 | #
25 | def java_args args
26 | to_dotted_args(args).map{|arg| "-D#{arg}"}
27 | end
28 |
29 | #
30 | # Uses recursion to take an arbitrarily nested hash and
31 |     # flatten it into dotted args. See 'java_args'. Can
32 | # you do it any better?
33 | #
34 | def to_dotted_args args
35 | args.map do |k,v|
36 | if v.is_a?(Hash)
37 | to_dotted_args(v).map do |s|
38 | [k,s].join(".")
39 | end
40 | else
41 | "#{k}=#{v}"
42 | end
43 | end.flatten
44 | end
45 |
46 | def cmd
47 | [
48 | "HADOOP_CLASSPATH=#{hadoop_classpath}",
49 | "#{hadoop_home}/bin/hadoop jar #{run_jar}",
50 | main_class,
51 | java_args(options),
52 | "-libjars #{libjars}",
53 | "#{input.join(',')}",
54 | "#{output.join(',')}"
55 | ].flatten.compact.join(" \t\\\n ")
56 | end
57 |
58 | end
59 | end
60 |
--------------------------------------------------------------------------------
/lib/swineherd/script/pig_script.rb:
--------------------------------------------------------------------------------
1 | module Swineherd::Script
2 | class PigScript
3 | include Common
4 |
5 | #
6 | # Not guaranteeing anything.
7 | #
8 | AVRO_PIG_MAPPING = {
9 | 'string' => 'chararray',
10 | 'int' => 'int',
11 | 'long' => 'long',
12 | 'float' => 'float',
13 | 'double' => 'double',
14 | 'bytes' => 'bytearray',
15 | 'fixed' => 'bytearray'
16 | }
17 |
18 | #
19 | # Simple utility function for mapping avro types to pig types
20 | #
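  |     # For example:
  |     #
  |     #   PigScript.avro_to_pig('string') #=> 'chararray'
  |     #   PigScript.avro_to_pig('bytes')  #=> 'bytearray'
  |     #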
21 | def self.avro_to_pig avro_type
22 | AVRO_PIG_MAPPING[avro_type]
23 | end
24 |
25 | #
26 | # Convert a generic hash of options {:foo => 'bar'} into
27 | # command line options for pig '-p FOO=bar'
28 | #
29 | def pig_args options
30 | options.map{|opt,val| "-p #{opt.to_s.upcase}=#{val}" }.join(' ')
31 | end
32 |
33 |
34 |
35 | def local_cmd
36 | Log.info("Launching Pig script in local mode")
37 | "pig -x local #{pig_args(@options)} #{script}"
38 | end
39 |
40 | def cmd
41 | Log.info("Launching Pig script in hadoop mode")
42 | "pig #{pig_args(@options)} #{script}"
43 | end
44 |
45 | end
46 | end
47 |
--------------------------------------------------------------------------------
/lib/swineherd/script/r_script.rb:
--------------------------------------------------------------------------------
1 | module Swineherd::Script
2 | class RScript
3 | include Common
4 |
5 | def local_cmd
6 | "/usr/bin/Rscript --vanilla #{script}"
7 | end
8 |
9 | def cmd
10 | local_cmd
11 | end
12 |
13 | end
14 | end
15 |
--------------------------------------------------------------------------------
/lib/swineherd/script/wukong_script.rb:
--------------------------------------------------------------------------------
1 | require 'pathname'
2 |
3 | module Swineherd::Script
4 | class WukongScript
5 | include Common
6 |
7 | def wukong_args options
8 | options.map{|param,val| "--#{param}=#{val}" }.join(' ')
9 | end
10 |
11 | #
12 | # Don't treat wukong scripts as templates
13 | #
14 | def script
15 | @source
16 | end
17 |
18 | def cmd
19 | raise "No wukong input specified" if input.empty?
20 | Log.info("Launching Wukong script in hadoop mode")
21 | "ruby #{script} #{wukong_args(@options)} --run #{input.join(',')} #{output.join(',')}"
22 | end
23 |
24 | def local_cmd
25 |       inputs = input.map{|path| File.directory?(path) ? File.join(path, "*") : path}.join(',')
26 | Log.info("Launching Wukong script in local mode")
27 | "ruby #{script} #{wukong_args(@options)} --run=local #{inputs} #{output.join(',')}"
28 | end
29 |
30 | end
31 | end
32 |
--------------------------------------------------------------------------------
/lib/swineherd/template.rb:
--------------------------------------------------------------------------------
1 | require 'erubis'
2 | require 'tempfile'
3 |
4 |
5 | # Template.new(script_path, attributes).substitute!
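  | #
  | # For example, given a (hypothetical) Erubis template 'pagerank.pig.erb' that
  | # contains <%= input %>, substitute! writes a filled-in copy to a tempfile and
  | # returns that tempfile's path:
  | #
  | #   path = Swineherd::Template.new('pagerank.pig.erb',
  | #                                  :input => 'data/seinfeld_network.tsv').substitute!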
6 |
7 | module Swineherd
8 |
9 | class Template
10 | attr_accessor :source_template, :attributes
11 |
12 | def initialize source_template, attributes
13 | @source_template = source_template
14 | @attributes = attributes
15 | end
16 |
17 | def compile!
18 | dest << Erubis::Eruby.new(source).result(attributes)
19 | dest << "\n"
20 | dest
21 | end
22 |
23 | def substitute!
24 | compile!
25 |       dest.flush # make sure the rendered script is on disk before handing off the path
26 | dest.path
27 | end
28 |
29 | protected
30 |
31 | def source
32 | File.open(source_template).read
33 | end
34 |
35 | def dest
36 | return @dest if @dest
37 | @dest ||= Tempfile.new(basename)
38 | end
39 |
40 | def basename
41 | File.basename(source_template)
42 | end
43 |
44 | end
45 | end
46 |
--------------------------------------------------------------------------------
/lib/swineherd/workflow.rb:
--------------------------------------------------------------------------------
1 | module Swineherd
2 | class Workflow
3 | attr_accessor :workdir, :outputs, :output_counts
4 |
5 | #
6 | # Create a new workflow and new namespace for this workflow
7 | #
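  |     # A rough sketch of assembling and running a flow (names below are
  |     # hypothetical; see examples/pagerank/pagerank.rb for a full example):
  |     #
  |     #   flow = Swineherd::Workflow.new(:my_flow) do
  |     #     self.workdir = '/tmp/my_flow'
  |     #     out = next_output('first_task')  # => "/tmp/my_flow/my_flow/first_task-0"
  |     #     Swineherd::Job.new(:my_flow) do
  |     #       name   'first_task'
  |     #       script Swineherd::Script::PigScript.new('scripts/first_task.pig', [], [out])
  |     #     end
  |     #   end
  |     #   flow.run 'first_task'
  |     #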
8 | def initialize flow_id, &blk
9 | @flow_id = flow_id
10 | @output_counts = Hash.new{|h,k| h[k] = 0}
11 | @outputs = Hash.new{|h,k| h[k] = []}
12 | namespace @flow_id do
13 | self.instance_eval(&blk)
14 | end
15 | end
16 |
17 | #
18 | # Get next logical output of taskname by incrementing internal counter
19 | #
20 | def next_output taskname
21 | raise "No working directory specified." unless @workdir
22 | @outputs[taskname] << "#{@workdir}/#{@flow_id}/#{taskname}-#{@output_counts[taskname]}"
23 | @output_counts[taskname] += 1
24 | latest_output(taskname)
25 | end
26 |
27 | #
28 | # Get latest output of taskname
29 | #
30 | def latest_output taskname
31 | @outputs[taskname].last
32 | end
33 |
34 | #
35 | # Runs workflow starting with taskname
36 | #
37 | def run taskname
38 | Log.info "Launching workflow task #{@flow_id}:#{taskname} ..."
39 | Rake::Task["#{@flow_id}:#{taskname}"].invoke
40 | Log.info "Workflow task #{@flow_id}:#{taskname} finished"
41 | end
42 |
43 | #
44 | # Describes the dependency tree of all tasks belonging to self
45 | #
46 | def describe
47 | Rake::Task.tasks.each do |t|
48 | Log.info("Task: #{t.name} [#{t.inspect}]") if t.name =~ /#{@flow_id}/
49 | end
50 | end
51 |
52 | end
53 | end
54 |
--------------------------------------------------------------------------------
/lib/swineherd/workflow/job.rb:
--------------------------------------------------------------------------------
1 | module Swineherd
2 |
3 | #
4 | # Job class is at its core a rake task
5 | #
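  |   # For example (names hypothetical), the following defines a rake task
  |   # 'histogram' that depends on 'pagerank' and runs its script when invoked:
  |   #
  |   #   Swineherd::Job.new(:my_flow) do
  |   #     name         'histogram'
  |   #     dependencies ['pagerank']
  |   #     script       Swineherd::Script::RScript.new('scripts/histogram.R')
  |   #   end
  |   #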
6 | class Job
7 |
8 | #
9 | # Initialize job, fill variables, and create rake task
10 | #
11 | def initialize job_id, &blk
12 | @job_id = job_id
13 | @name = ''
14 | @dependencies = []
15 | @script = ''
16 | self.instance_eval(&blk)
17 | raketask
18 | handle_dependencies
19 | end
20 |
21 | #
22 | # Will be the name of the rake task
23 | #
24 | def name name = nil
25 | return @name unless name
26 | @name = name
27 | end
28 |
29 | def script script = nil
30 | return @script unless script
31 | @script = script
32 | end
33 |
34 | #
35 | # An array of job names as dependencies
36 | #
37 | def dependencies dependencies = nil
38 | return @dependencies unless dependencies
39 | @dependencies = dependencies
40 | end
41 |
42 | def handle_dependencies
43 | return if dependencies.empty?
44 | task name => dependencies
45 | end
46 |
47 | def cmd
48 | @script.cmd
49 | end
50 |
51 | #
52 | # Every job is compiled into a rake task
53 | #
54 | def raketask
55 | task name do
56 | @script.run
57 | end
58 | end
59 | end
60 | end
61 |
--------------------------------------------------------------------------------
/notes.txt:
--------------------------------------------------------------------------------
1 | Logging:
2 |
3 | 1. All output from the launched workflow should go to a workflow log file
4 | 2. Hadoop output is special and should be pulled down from the jobtracker
5 | - jobconf.xml
6 | - job details page
7 |
8 | Workflow should specify a logdir, which defaults to workdir + '/logs'
9 |
10 | Fetching hadoop job stats:
11 |
12 | 1. Get job id
13 | 2. Use curl to fetch the latest logs listing: "http://jobtracker:50030/logs/history/"
14 | 3. Parse the logs listing and pull out the two urls we want (something-jobid.xml, something-jobid....)
15 | 4. Fetch the two urls we care about and dump into the workflow's log dir.
16 | 5. Possibly parse the results into an ongoing workflow-statistics.tsv file (steps 2-4 are sketched below)
17 |
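  | A rough Ruby sketch of steps 2-4 (the jobtracker host, job id, and the
  | listing's HTML layout are assumptions here):
  |
  |     require 'open-uri'
  |     require 'uri'
  |     require 'fileutils'
  |
  |     jobtracker = 'http://jobtracker:50030'
  |     job_id     = 'job_201106010000_0001'            # placeholder
  |     logdir     = 'workdir/logs'
  |
  |     FileUtils.mkdir_p logdir
  |     listing = open("#{jobtracker}/logs/history/").read
  |     # assumes a plain HTML directory index with href links to the job's files
  |     files = listing.scan(/href="([^"]*#{job_id}[^"]*)"/).flatten.uniq
  |     files.each do |f|
  |       File.open(File.join(logdir, File.basename(f)), 'w') do |out|
  |         out << open(URI.join("#{jobtracker}/logs/history/", f)).read
  |       end
  |     end
  |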
18 | Other output:
19 |
20 | Output that would otherwise go to the terminal (nohup.out or some such) should be collected and dumped into the logdir as well.
21 |
--------------------------------------------------------------------------------
/swineherd.gemspec:
--------------------------------------------------------------------------------
1 | # Generated by jeweler
2 | # DO NOT EDIT THIS FILE DIRECTLY
3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4 | # -*- encoding: utf-8 -*-
5 |
6 | Gem::Specification.new do |s|
7 | s.name = %q{swineherd}
8 | s.version = "0.0.4"
9 |
10 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11 | s.authors = ["Jacob Perkins"]
12 | s.date = %q{2011-06-22}
13 | s.description = %q{Swineherd is for running scripts and workflows on filesystems.}
14 | s.email = %q{jacob.a.perkins@gmail.com}
15 | s.executables = ["hdp-tree", "hadoop-stream"]
16 | s.extra_rdoc_files = [
17 | "LICENSE",
18 | "README.textile"
19 | ]
20 | s.files = [
21 | "LICENSE",
22 | "README.textile",
23 | "Rakefile",
24 | "VERSION",
25 | "bin/hadoop-stream",
26 | "bin/hdp-tree",
27 | "examples/pagerank/data/seinfeld_network.tsv",
28 | "examples/pagerank/pagerank.rb",
29 | "examples/pagerank/scripts/cut_off_list.rb",
30 | "examples/pagerank/scripts/histogram.R",
31 | "examples/pagerank/scripts/pagerank.pig",
32 | "examples/pagerank/scripts/pagerank_initialize.pig",
33 | "lib/swineherd.rb",
34 | "lib/swineherd/filesystem.rb",
35 | "lib/swineherd/filesystem/README_filesystem.textile",
36 | "lib/swineherd/filesystem/basefilesystem.rb",
37 | "lib/swineherd/filesystem/filesystems.rb",
38 | "lib/swineherd/filesystem/hadoopfilesystem.rb",
39 | "lib/swineherd/filesystem/localfilesystem.rb",
40 | "lib/swineherd/filesystem/localfs.rb",
41 | "lib/swineherd/filesystem/s3filesystem.rb",
42 | "lib/swineherd/script.rb",
43 | "lib/swineherd/script/hadoop_script.rb",
44 | "lib/swineherd/script/pig_script.rb",
45 | "lib/swineherd/script/r_script.rb",
46 | "lib/swineherd/script/wukong_script.rb",
47 | "lib/swineherd/template.rb",
48 | "lib/swineherd/workflow.rb",
49 | "lib/swineherd/workflow/job.rb",
50 | "notes.txt",
51 | "swineherd.gemspec",
52 | "tests/test_filesystem.rb",
53 | "tests/test_s3_filesystem.rb",
54 | "tests/testcfg.yaml"
55 | ]
56 | s.homepage = %q{http://github.com/Ganglion/swineherd}
57 | s.licenses = ["MIT"]
58 | s.require_paths = ["lib"]
59 | s.rubygems_version = %q{1.3.7}
60 | s.summary = %q{Flexible data workflow glue.}
61 |
62 | if s.respond_to? :specification_version then
63 | current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
64 | s.specification_version = 3
65 |
66 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
67 | s.add_development_dependency(%q