├── .gitignore
├── LICENSE
├── README.textile
├── Rakefile
├── VERSION
├── bin
│   ├── hadoop-stream
│   └── hdp-tree
├── examples
│   └── pagerank
│       ├── data
│       │   └── seinfeld_network.tsv
│       ├── pagerank.rb
│       └── scripts
│           ├── cut_off_list.rb
│           ├── histogram.R
│           ├── pagerank.pig
│           └── pagerank_initialize.pig
├── lib
│   ├── swineherd.rb
│   └── swineherd
│       ├── filesystem.rb
│       ├── filesystem
│       │   ├── README_filesystem.textile
│       │   ├── basefilesystem.rb
│       │   ├── filesystems.rb
│       │   ├── hadoopfilesystem.rb
│       │   ├── localfilesystem.rb
│       │   ├── localfs.rb
│       │   └── s3filesystem.rb
│       ├── script.rb
│       ├── script
│       │   ├── hadoop_script.rb
│       │   ├── pig_script.rb
│       │   ├── r_script.rb
│       │   └── wukong_script.rb
│       ├── template.rb
│       ├── workflow.rb
│       └── workflow
│           └── job.rb
├── notes.txt
├── swineherd.gemspec
└── tests
    ├── test_filesystem.rb
    ├── test_s3_filesystem.rb
    └── testcfg.yaml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## OS
2 | .DS_Store
3 | Icon?
4 | nohup.out
5 | .bak
6 | 
7 | ## EDITORS
8 | \#*
9 | .\#*
10 | *~
11 | *.swp
12 | REVISION
13 | TAGS*
14 | tmtags
15 | *_flymake.*
16 | *_flymake
17 | *.tmproj
18 | .project
19 | .settings
20 | 
21 | ## COMPILED
22 | a.out
23 | *.o
24 | *.pyc
25 | *.so
26 | 
27 | ## OTHER SCM
28 | .bzr
29 | .hg
30 | .svn
31 | 
32 | ## PROJECT::GENERAL
33 | coverage
34 | rdoc
35 | doc
36 | pkg
37 | .yardoc
38 | *private*
39 | 
40 | ## PROJECT::SPECIFIC
41 | 
42 | *.rdb
43 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 | 
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 | 
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 | 
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.textile: -------------------------------------------------------------------------------- 1 | h1. SwineHerd 2 | 3 | Swineherd is for running scripts and workflows on filesystems. 4 | 5 | h3. Outline 6 | 7 | A @workflow@ is built with @script@ objects and ran on a @filesystem@. 8 | 9 | h4. Script: 10 | 11 | A script has the following 12 | 13 | * @source@ - The source file used. 
These can be "Apache Pig":http://pig.apache.org/ scripts, "Wukong":http://github.com/infochimps/wukong scripts, even "R":http://www.r-project.org/ scripts. You can add your own scripts by subclassing the @script@ class. 14 | * @input@ - An array of input paths. 15 | * @output@ - An array of output paths. 16 | * @options@ - A ruby hash of options used as command line args. Eg. {:foo => 'bar'}. How these options are mapped to command line arguments is up to the particular script class. 17 | * @attributes@ - A ruby hash of parameters used for variable substitution. Every script is assumed to be (but not required to be) an eruby template. 18 | 19 | h4. Workflow: 20 | 21 | A workflow is built using rake @task@ objects that doing nothing more than run scripts. A workflow 22 | 23 | * can be described with a directed dependency graph 24 | * has an @id@ which is used to run its tasks idempotently. At the moment it is the responsibility of the running process (or human being) to choose a suitable id. 25 | * manages intermediate outputs by using the @next_output@ and @latest_output@ methods. See the examples dir for usage. 26 | * A workflow has a working directory in which all intermediate outputs go 27 | ** These are named according to the rake task that created them 28 | 29 | h4. FileSystem 30 | 31 | Workflows are intended to run on filesystems. At the moment, implemented filesystems are 32 | 33 | * @file@ - Local file system. Only thoroughly tested on unbuntu linux. 34 | * @hdfs@ - Hadoop distributed file system. Uses jruby and the Apache Hadoop 0.20 api. 35 | * @s3@ - Uses the right_aws gem for interacting with Amazon Simple Storage System (s3). 36 | 37 | Using the filesystem: 38 | 39 | Paths should be absolute. 40 | 41 |
42 | # get a new instance of local filesystem and write to it
43 | localfs = FileSystem.get(:file)
44 | localfs.open("mylocalfile", 'w') do |f|
45 | f.write("Writing a string to a local file")
46 | end
47 |
48 | # get a new instance of hadoop filesystem and write to it
49 | hadoopfs = FileSystem.get(:hdfs)
50 | hadoopfs.open("myhadoopfile", 'w') do |f|
51 | f.write("Writing a string to an hdfs file")
52 | end
53 |
54 | # get a new instance of s3 filesystem and write to it
55 | access_key_id = '1234abcd'
56 | secret_access_key = 'foobar1234'
57 | s3fs = FileSystem.get(:s3, access_key_id, secret_access_key)
58 | s3fs.mkpath 'mys3bucket' # bucket must exist
59 | s3fs.open("mys3bucket/mys3file", 'w') do |f|
60 | f.write("Writing a string to an s3 file")
61 | end
62 |
63 |
64 | h3. Working Example
65 |
66 | For the most up to date working example see the examples directory. Here's a simple example for running pagerank:
67 |
68 |
69 | #!/usr/bin/env ruby
70 |
71 | $LOAD_PATH << '../../lib'
72 | require 'swineherd' ; include Swineherd
73 | require 'swineherd/script' ; include Swineherd::Script
74 | require 'swineherd/filesystem'
75 |
76 | Settings.define :flow_id, :required => true, :description => "Flow id required to make run of workflow unique"
77 | Settings.define :iterations, :type => Integer, :default => 10, :description => "Number of pagerank iterations to run"
78 | Settings.define :hadoop_home, :default => '/usr/local/share/hadoop', :description => "Path to hadoop config"
79 | Settings.resolve!
80 |
81 | flow = Workflow.new(Settings.flow_id) do
82 |
83 | # The filesystems we're going to be working with
84 | hdfs = Swineherd::FileSystem.get(:hdfs)
85 | localfs = Swineherd::FileSystem.get(:file)
86 |
87 | # The scripts we're going to use
88 | initializer = PigScript.new('scripts/pagerank_initialize.pig')
89 | iterator = PigScript.new('scripts/pagerank.pig')
90 | finisher = WukongScript.new('scripts/cut_off_list.rb')
91 | plotter = RScript.new('scripts/histogram.R')
92 |
93 | #
94 | # Runs simple pig script to initialize pagerank. We must specify the input
95 | # here as this is the first step in the workflow. The output attribute is to
96 | # ensure idempotency and the options attribute is the hash that will be
97 | # converted into command-line args for the pig interpreter.
98 | #
99 | task :pagerank_initialize do
100 | initializer.options = {:adjlist => "/tmp/pagerank_example/seinfeld_network.tsv", :initgrph => next_output(:pagerank_initialize)}
101 | initializer.run(:hadoop) unless hdfs.exists? latest_output(:pagerank_initialize)
102 | end
103 |
104 | #
105 | # Runs multiple iterations of pagerank with another pig script and manages all
106 | # the intermediate outputs.
107 | #
108 | task :pagerank_iterate => [:pagerank_initialize] do
109 | iterator.options[:damp] = '0.85f'
110 | iterator.options[:curr_iter_file] = latest_output(:pagerank_initialize)
111 | Settings.iterations.times do
112 | iterator.options[:next_iter_file] = next_output(:pagerank_iterate)
113 | iterator.run(:hadoop) unless hdfs.exists? latest_output(:pagerank_iterate)
114 | iterator.refresh!
115 | iterator.options[:curr_iter_file] = latest_output(:pagerank_iterate)
116 | end
117 | end
118 |
119 | #
120 | # Here we use a wukong script to cut off the last field (a big pig bag of
121 | # links). Notice how every wukong script MUST have an input but pig scripts do
122 | # not.
123 | #
124 | task :cut_off_adjacency_list => [:pagerank_iterate] do
125 | finisher.input << latest_output(:pagerank_iterate)
126 | finisher.output << next_output(:cut_off_adjacency_list)
127 | finisher.run :hadoop unless hdfs.exists? latest_output(:cut_off_adjacency_list)
128 | end
129 |
130 | #
131 | # We want to pull down one result file, merge the part-000.. files into one file
132 | #
133 | task :merge_results => [:cut_off_adjacency_list] do
134 | merged_results = next_output(:merge_results)
135 | hdfs.merge(latest_output(:cut_off_adjacency_list), merged_results) unless hdfs.exists? merged_results
136 | end
137 |
138 | #
139 | # Cat results into a local directory with the same structure
140 | # eg. #{work_dir}/#{flow_id}/pull_down_results-0.
141 | #
142 | # FIXME: Bridging filesystems is cludgey.
143 | #
144 | task :pull_down_results => [:merge_results] do
145 | local_results = next_output(:pull_down_results)
146 | hdfs.copy_to_local(latest_output(:merge_results), local_results) unless localfs.exists? local_results
147 | end
148 |
149 | #
150 | # Plot 2nd column of the result as a histogram (requires R and
151 | # ggplot2). Note that the output here is a png file but doesn't have that
152 | # extension. Ensmarten me as to the right way to handle that?
153 | #
154 | task :plot_results => [:pull_down_results] do
155 | plotter.attributes = {
156 | :pagerank_data => latest_output(:pull_down_results),
157 | :plot_file => next_output(:plot_results), # <-- this will be a png...
158 | :raw_rank => "aes(x=d$V2)"
159 | }
160 | plotter.run(:local) unless localfs.exists? latest_output(:plot_results)
161 | end
162 |
163 | end
164 |
165 | flow.workdir = "/tmp/pagerank_example"
166 | flow.describe
167 | flow.run(:plot_results)
168 |
169 |
170 | h3. Utils
171 |
172 | There's a fun little program called 'hdp-tree' that shows off the ease of using the filesystem abstraction:
173 |
174 |
175 | $: bin/hdp-tree /tmp/my_hdfs_directory
176 | ---
177 | /tmp/my_hdfs_directory:
178 | - my_hdfs_directory:
179 | - sub_dir_a: leaf_file_1
180 | - sub_dir_a: leaf_file_2
181 | - sub_dir_a: leaf_file_3
182 | - my_hdfs_directory:
183 | - sub_dir_b: leaf_file_1
184 | - sub_dir_b: leaf_file_2
185 | - sub_dir_b: leaf_file_3
186 | - my_hdfs_directory:
187 | - sub_dir_c: leaf_file_1
188 | - sub_dir_c: leaf_file_2
189 | - sub_dir_c: leaf_file_3
190 | - sub_dir_c:
191 | - sub_sub_dir_a: yet_another_leaf_file
192 | - sub_dir_c: sub_sub_dir_b
193 | - sub_dir_c: sub_sub_dir_c
194 |
195 |
196 | I know, it's not as pretty as unix tree, but this IS github...
197 |
198 | h3. TODO
199 |
200 | * next task in a workflow should NOT run if the previous step failed
201 | ** this is made difficult by the fact that a pig script sometimes returns a 0 exit status even when it fails
202 | ** same for wukong scripts
203 | * add a @job@ object that implements a @not_if@ function. this way a @workflow@ will be constructed of @job@ objects
204 | ** a @job@ will do nothing more than execute the ruby code in its (run?) block, unless @not_if@ is true
205 | ** this way we can put @script@ objects inside a @job@ and only run them under certain conditions that the user specifies when
206 | they create the @job@ (see the sketch below)
207 | * implement ftp filesystem interfaces
208 |
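Here is a very rough sketch of what such a @job@ object might look like. This is purely illustrative; neither @Job@ nor @not_if@ exist in the code base yet:

<pre>
# Hypothetical sketch only; not part of swineherd (yet).
class Job
  def initialize name, &blk
    @name   = name
    @runner = blk    # the ruby code this job wraps
    @not_if = nil
  end

  # Register a guard; the job is skipped whenever the guard returns true.
  def not_if &blk
    @not_if = blk
  end

  # Execute the wrapped block unless the guard says to skip it.
  def run
    return if @not_if && @not_if.call
    @runner.call
  end
end

# Usage, reusing the objects from the pagerank example above:
job = Job.new('pagerank_initialize') { initializer.run(:hadoop) }
job.not_if { hdfs.exists? latest_output(:pagerank_initialize) }
job.run
</pre>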
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require 'rubygems'
2 | require 'rake'
3 |
4 | require 'jeweler'
5 | Jeweler::Tasks.new do |gem|
6 | # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
7 | gem.name = "swineherd"
8 | gem.homepage = "http://github.com/Ganglion/swineherd"
9 | gem.license = "MIT"
10 | gem.summary = %Q{Flexible data workflow glue.}
11 | gem.description = %Q{Swineherd is for running scripts and workflows on filesystems.}
12 | gem.email = "jacob.a.perkins@gmail.com"
13 | gem.authors = ["Jacob Perkins"]
14 | # Include your dependencies below. Runtime dependencies are required when using your gem,
15 | # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
16 | # gem.add_runtime_dependency 'jabber4r', '> 0.1'
17 | # gem.add_development_dependency 'rspec', '> 1.2.3'
18 | gem.add_development_dependency "yard", "~> 0.6.0"
19 | gem.add_development_dependency "jeweler", "~> 1.5.2"
20 | gem.add_development_dependency "rcov", ">= 0"
21 | gem.add_dependency 'configliere'
22 | gem.add_dependency 'gorillib'
23 | gem.add_dependency 'erubis'
24 | gem.add_dependency 'right_aws'
25 | end
26 | Jeweler::RubygemsDotOrgTasks.new
27 |
28 |
29 | require 'yard'
30 | YARD::Rake::YardocTask.new
31 |
--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.0.4
--------------------------------------------------------------------------------
/bin/hadoop-stream:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'rubygems'
4 | require 'rake'
5 | require 'swineherd' ; include Swineherd
6 |
7 | #
8 | # Uses hadoop and rake's multitask capability to stream many source
9 | # files in parallel into a single destination directory.
10 | #
11 |
12 | Settings.define :input, :type => Array, :required => true, :description => "Comma separated list of directories (hdfs paths, s3 paths, etc) to stream"
13 | Settings.define :output, :required => true, :description => "Destination directory (s3 or hdfs)"
14 | Settings.resolve!
15 |
16 | #
17 | # Takes a hash of paths eg: {'filename' => 'full path'} and defines
18 | # a new streaming task for each one
19 | #
20 | def define_tasks list_of_tasks
21 | list_of_tasks.each do |basename, source|
22 | task basename do
23 | destination = File.join(Settings.output, basename) # each file gets its own output
24 | HDFS.stream(source, destination)
25 | end
26 | end
27 | end
28 |
29 | # Create a list of tasks, one per file
30 | list_of_tasks = Settings.input.inject({}){|list, path| list[File.basename(path)] = path; list}
31 | define_tasks list_of_tasks
32 |
33 | multitask :stream_all => list_of_tasks.keys
34 |
35 | Rake::MultiTask["stream_all"].invoke
36 |
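#
# Usage sketch (paths are hypothetical; the --input and --output flags come
# from the Settings defined above, with --input taking a comma separated list):
#
#   bin/hadoop-stream --input=hdfs:///tmp/logs_a,hdfs:///tmp/logs_b \
#                     --output=hdfs:///tmp/all_logs
#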
--------------------------------------------------------------------------------
/bin/hdp-tree:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env jruby
2 |
3 | require 'swineherd'
4 |
5 | #
6 | # Creates a 'tree' view of an hdfs path. It's not as pretty as the
7 | # unix tree command but that's only because I'm not smart enough to
8 | # print the hierarchy properly.
9 | #
10 |
11 | FS = Swineherd::FileSystem.get(:hdfs)
12 | path = ARGV[0]
13 |
14 | # Recursively list paths
15 | def lr path
16 | paths = FS.entries(path)
17 | if (paths && !paths.empty?)
18 | paths.map{|e| {File.basename(path) => lr(e)}}.flatten
19 | else
20 | File.basename(path)
21 | end
22 | end
23 |
24 |
25 | tree = {File.dirname(path) => lr(path)}.to_yaml
26 | puts tree
27 |
--------------------------------------------------------------------------------
/examples/pagerank/data/seinfeld_network.tsv:
--------------------------------------------------------------------------------
1 | jerry costanza81
2 | jerry ElaineBenes
3 | jerry kramer
4 | jerry NewmanUSPS
5 | jerry THE_REAL_PUDDY
6 | jerry JPeterman
7 | jerry FRANKCOSTANZA
8 | costanza81 jerry
9 | costanza81 ElaineBenes
10 | costanza81 kramer
11 | costanza81 NewmanUSPS
12 | costanza81 THE_REAL_PUDDY
13 | costanza81 JPeterman
14 | costanza81 FRANKCOSTANZA
15 | ElaineBenes jerry
16 | ElaineBenes costanza81
17 | ElaineBenes kramer
18 | ElaineBenes THE_REAL_PUDDY
19 | ElaineBenes JPeterman
20 | kramer jerry
21 | kramer costanza81
22 | kramer ElaineBenes
23 | kramer NewmanUSPS
24 | kramer THE_REAL_PUDDY
25 | kramer JPeterman
26 | kramer FRANKCOSTANZA
27 | NewmanUSPS jerry
28 | NewmanUSPS costanza81
29 | NewmanUSPS ElaineBenes
30 | NewmanUSPS kramer
31 | NewmanUSPS THE_REAL_PUDDY
32 | NewmanUSPS JPeterman
33 | NewmanUSPS FRANKCOSTANZA
34 | THE_REAL_PUDDY jerry
35 | THE_REAL_PUDDY costanza81
36 | THE_REAL_PUDDY ElaineBenes
37 | THE_REAL_PUDDY kramer
38 | THE_REAL_PUDDY NewmanUSPS
39 | THE_REAL_PUDDY JPeterman
40 | THE_REAL_PUDDY FRANKCOSTANZA
41 | THE_REAL_PUDDY Vegetable_Lasagna
42 | FRANKCOSTANZA jerry
43 | FRANKCOSTANZA costanza81
44 | FRANKCOSTANZA kramer
45 | jerry MortySeinfeld
46 | jerry HelenSeinfeld
47 | jerry Izzy_Mandelbaum
48 | jerry UncleLEO
49 | jerry Artie_Levine
50 | MortySeinfeld UncleLEO
51 | MortySeinfeld HelenSeinfeld
52 | MortySeinfeld Cousin_Jeffrey
53 | Izzy_Mandelbaum jerry
54 | UncleLEO jerry
55 | UncleLEO MortySeinfeld
56 | UncleLEO HelenSeinfeld
57 | UncleLEO Cousin_Jeffrey
58 | UncleLEO Babs_Kramer
59 | Babs_Kramer UncleLEO
60 | Cousin_Jeffrey jerry
61 | Cousin_Jeffrey MortySeinfeld
62 | Cousin_Jeffrey UncleLEO
63 | jerry Nana
64 | Nana MortySeinfeld
65 | MortySeinfeld Nana
66 | Cousin_Jeffrey Nana
67 | UncleLEO Nana
68 | Nana UncleLEO
69 | JackKlompus MortySeinfeld
70 | MortySeinfeld JackKlompus
71 | Dolores jerry
72 | MarlatheVirgin jerry
73 | TiaVanCamp jerry
74 | Rachel_Goldstein jerry
75 | meryl jerry
76 | MissRhodeIsland jerry
77 | Pam jerry
78 | Sheila jerry
79 | MelissaFlyingFree jerry
80 | jerry Dolores
81 | jerry MarlatheVirgin
82 | jerry TiaVanCamp
83 | jerry Rachel_Goldstein
84 | jerry meryl
85 | jerry MissRhodeIsland
86 | jerry Pam
87 | jerry Sheila
88 | jerry MelissaFlyingFree
89 | jerry Laura
90 | jerry Sandy
91 | Laura Sandy
92 | Laura jerry
93 | Sandy jerry
94 | kramer MissRhodeIsland
95 | kramer Pam
96 | jerry Jenna
97 | Jenna jerry
98 | jerry bania
99 | costanza81 bania
100 | bania jerry
101 | bania The_Soup_Nazi
102 | bania Poppie
103 | bania Jenna
104 | jerry Noreen
105 | jerry JackKlompus
106 | jerry Milos
107 | jerry JeanPaul_JeanPaul
108 | jerry FusilliJerry
109 | FusilliJerry jerry
110 | FusilliJerry kramer
111 | kramer FusilliJerry
112 | FusilliJerry FRANKCOSTANZA
113 | jerry pez
114 | jerry superman
115 | jerry bigstein
116 | Milos jerry
117 | jerry Roy_the_Dentist
118 | jerry BabuBhatt
119 | kramer BabuBhatt
120 | BabuBhatt kramer
121 | kramer Poppie
122 | Poppie kramer
123 | Poppie ElaineBenes
124 | ElaineBenes Poppie
125 | jerry Poppie
126 | jerry Shaky_the_Mohel
127 | jerry bubble_boy
128 | bubble_boy jerry
129 | MatthewSeinfeldFan jerry
130 | FragileFrankieMerman jerry
131 | jerry FragileFrankieMerman
132 | costanza81 FragileFrankieMerman
133 | costanza81 EstelleC
134 | FRANKCOSTANZA EstelleC
135 | EstelleC costanza81
136 | EstelleC FRANKCOSTANZA
137 | FRANKCOSTANZA Lloyd_Braun
138 | EstelleC Lloyd_Braun
139 | Lloyd_Braun FRANKCOSTANZA
140 | Lloyd_Braun costanza81
141 | Lloyd_Braun EstelleC
142 | kramer MrWilhelm
143 | costanza81 MrWilhelm
144 | costanza81 Allison
145 | Allison costanza81
146 | costanza81 LindsayEnright
147 | LindsayEnright costanza81
148 | costanza81 marisa_tomei
149 | costanza81 SusanRoss
150 | SusanRoss MrandMrsRoss
151 | MrandMrsRoss SusanRoss
152 | SusanRoss jerry
153 | jerry SusanRoss
154 | SusanRoss ElaineBenes
155 | kramer SusanRoss
156 | SusanRoss Russell_Dalrymple
157 | Russell_Dalrymple SusanRoss
158 | Russell_Dalrymple ElaineBenes
159 | SallyWeaver SusanRoss
160 | SusanRoss SallyWeaver
161 | SallyWeaver MrandMrsRoss
162 | SallyWeaver jerry
163 | WyckThayer MrandMrsRoss
164 | MrandMrsRoss WyckThayer
165 | SusanRoss WyckThayer
166 | WyckThayer SusanRoss
167 | costanza81 MrKruger
168 | MrKruger costanza81
169 | ElaineBenes MrKruger
170 | costanza81 guitarbern
171 | costanza81 intangibles
172 | costanza81 cushman
173 | cushman bigstein
174 | bigstein cushman
175 | costanza81 JonVoight
176 | costanza81 bubble_boy
177 | costanza81 Pastrami
178 | costanza81 bigstein
179 | Victoria bigstein
180 | bigstein Victoria
181 | cushman Victoria
182 | Victoria cushman
183 | bigstein intangibles
184 | bigstein guitarbern
185 | guitarbern intangibles
186 | guitarbern bigstein
187 | intangibles Victoria
188 | intangibles bigstein
189 | bubble_boy trivial_pursuit
190 | costanza81 StankyHanke
191 | ElaineBenes MrLippman
192 | ElaineBenes MrPitt
193 | ElaineBenes Jack_The_Wiz
194 | Jack_The_Wiz ElaineBenes
195 | TheSuzie ElaineBenes
196 | jerry TheSuzie
197 | TheSuzie Peggy
198 | Peggy TheSuzie
199 | Peggy JPeterman
200 | JPeterman kramer
201 | JPeterman TheSuzie
202 | ElaineBenes RobertKennedyJr
203 | MarlatheVirgin RobertKennedyJr
204 | RobertKennedyJr MarlatheVirgin
205 | Jackie_Chiles MarlatheVirgin
206 | Sue_Ellen_Mischke RobertKennedyJr
207 | JPeterman RobertKennedyJr
208 | RobertKennedyJr JPeterman
209 | MrPitt RobertKennedyJr
210 | RobertKennedyJr MrPitt
211 | kramer TinaRobbins
212 | TinaRobbins kramer
213 | ElaineBenes TinaRobbins
214 | TinaRobbins ElaineBenes
215 | Jake_Jarmel ElaineBenes
216 | Noreen
217 | ElaineBenes HalKitzmiller
218 | HalKitzmiller ElaineBenes
219 | kramer HalKitzmiller
220 | HalKitzmiller kramer
221 | Joel_Rifkin ElaineBenes
222 | Darryl ElaineBenes
223 | NedIsakoff ElaineBenes
224 | ElaineBenes NedIsakoff
225 | ElaineBenes Carl_Farbman
226 | Carl_Farbman JPeterman
227 | JPeterman Carl_Farbman
228 | CrazyJoeDavola jerry
229 | CrazyJoeDavola costanza81
230 | CrazyJoeDavola ElaineBenes
231 | CrazyJoeDavola kramer
232 | CrazyJoeDavola NewmanUSPS
233 | costanza81 CrazyJoeDavola
234 | ElaineBenes CrazyJoeDavola
235 | kramer CrazyJoeDavola
236 | NewmanUSPS CrazyJoeDavola
237 | jerry DrTimWhatley
238 | costanza81 DrTimWhatley
239 | ElaineBenes DrTimWhatley
240 | kramer DrTimWhatley
241 | NewmanUSPS DrTimWhatley
242 | DrTimWhatley jerry
243 | DrTimWhatley costanza81
244 | DrTimWhatley ElaineBenes
245 | DrTimWhatley kramer
246 | jerry TheDrake
247 | costanza81 TheDrake
248 | ElaineBenes TheDrake
249 | kramer TheDrake
250 | NewmanUSPS TheDrake
251 | TheDrake jerry
252 | TheDrake costanza81
253 | TheDrake ElaineBenes
254 | TheDrake kramer
255 | jerry JoeMayo
256 | ElaineBenes JoeMayo
257 | costanza81 JoeMayo
258 | JoeMayo jerry
259 | JoeMayo ElaineBenes
260 | JoeMayo costanza81
261 | jerry Alec_Berg
262 | Alec_Berg jerry
263 | ElaineBenes Sue_Ellen_Mischke
264 | Sue_Ellen_Mischke ElaineBenes
265 | Sue_Ellen_Mischke jerry
266 | jerry Sue_Ellen_Mischke
267 | kramer Sue_Ellen_Mischke
268 | kramer Mickey_Abbott
269 | jerry Mickey_Abbott
270 | costanza81 Mickey_Abbott
271 | NewmanUSPS Mickey_Abbott
272 | Mickey_Abbott kramer
273 | Mickey_Abbott jerry
274 | Mickey_Abbott costanza81
275 | Mickey_Abbott NewmanUSPS
276 | Babs_Kramer NewmanUSPS
277 | NewmanUSPS Babs_Kramer
278 | kramer Bob_Sacamano
279 | kramer Lomez
280 | kramer JayRiemenschneider
281 | kramer CorkyRamirez
282 | kramer LenNicademo
283 | kramer Specter
284 | kramer Brody
285 | Bob_Sacamano kramer
286 | Lomez kramer
287 | JayRiemenschneider kramer
288 | CorkyRamirez kramer
289 | LenNicademo kramer
290 | Specter kramer
291 | Brody kramer
292 | Bob_Sacamano Lomez
293 | Bob_Sacamano JayRiemenschneider
294 | Bob_Sacamano CorkyRamirez
295 | Bob_Sacamano LenNicademo
296 | Bob_Sacamano Specter
297 | Bob_Sacamano Brody
298 | Bob_Sacamano jerry
299 | Brody Bob_Sacamano
300 | Brody Lomez
301 | Brody JayRiemenschneider
302 | Brody CorkyRamirez
303 | Brody LenNicademo
304 | Brody Specter
305 | CorkyRamirez Bob_Sacamano
306 | CorkyRamirez Lomez
307 | CorkyRamirez JayRiemenschneider
308 | CorkyRamirez Specter
309 | CorkyRamirez Brody
310 | JayRiemenschneider Bob_Sacamano
311 | JayRiemenschneider Lomez
312 | JayRiemenschneider CorkyRamirez
313 | JayRiemenschneider LenNicademo
314 | JayRiemenschneider Brody
315 | LenNicademo Bob_Sacamano
316 | LenNicademo Lomez
317 | LenNicademo JayRiemenschneider
318 | LenNicademo CorkyRamirez
319 | LenNicademo Brody
320 | Lomez Bob_Sacamano
321 | Lomez JayRiemenschneider
322 | Lomez CorkyRamirez
323 | Lomez LenNicademo
324 | Lomez Brody
325 | Specter Bob_Sacamano
326 | Specter Lomez
327 | Specter CorkyRamirez
328 | kramer FranklinDelanoRomanowski
329 | kramer SalBass
330 | kramer EstelleC
331 | kramer Vegetable_Lasagna
332 | kramer MortySeinfeld
333 | kramer Noreen
334 | kramer Babs_Kramer
335 | kramer Shaky_the_Mohel
336 | kramer assman
337 | assman DrTimWhatley
338 | DrTimWhatley assman
339 | kramer Stan_the_Caddy
340 | Stan_the_Caddy Jackie_Chiles
341 | NewmanUSPS Jackie_Chiles
342 | Jackie_Chiles NewmanUSPS
343 | kramer Jackie_Chiles
344 | Jackie_Chiles kramer
345 | EstelleC kramer
346 | Vegetable_Lasagna kramer
347 | MortySeinfeld kramer
348 | Noreen kramer
349 | Babs_Kramer kramer
350 | Shaky_the_Mohel kramer
351 | Bob_Cobb kramer
352 | kramer Bob_Cobb
353 | Bob_Cobb FRANKCOSTANZA
354 | FRANKCOSTANZA Bob_Cobb
355 | kramer Earl_Haffler
356 | Earl_Haffler kramer
357 | kramer MikeMoffit
358 | MikeMoffit kramer
359 | MikeMoffit jerry
360 | NewmanUSPS Henry_Atkins
361 | AvisRental jerry
362 | jerry The_Soup_Nazi
363 | costanza81 The_Soup_Nazi
364 | ElaineBenes The_Soup_Nazi
365 | kramer The_Soup_Nazi
366 | NewmanUSPS The_Soup_Nazi
367 | FRANKCOSTANZA The_Soup_Nazi
368 | THE_REAL_PUDDY The_Soup_Nazi
369 | jerry ArtVandelay
370 | costanza81 ArtVandelay
371 | ElaineBenes ArtVandelay
372 | kramer ArtVandelay
373 | jerry Kel_Varnsen
374 | costanza81 Kel_Varnsen
375 | ElaineBenes Kel_Varnsen
376 | kramer Kel_Varnsen
377 | jerry HEPennypacker
378 | costanza81 HEPennypacker
379 | ElaineBenes HEPennypacker
380 | kramer HEPennypacker
381 | jerry MartinvanNostrand
382 | costanza81 MartinvanNostrand
383 | ElaineBenes MartinvanNostrand
384 | kramer MartinvanNostrand
385 | jerry WandaPepper
386 | costanza81 WandaPepper
387 | ElaineBenes WandaPepper
388 | kramer WandaPepper
389 | ArtVandelay ArtVandelay
390 | ArtVandelay Kel_Varnsen
391 | ArtVandelay HEPennypacker
392 | ArtVandelay MartinvanNostrand
393 | ArtVandelay WandaPepper
394 | Kel_Varnsen ArtVandelay
395 | Kel_Varnsen HEPennypacker
396 | Kel_Varnsen MartinvanNostrand
397 | Kel_Varnsen WandaPepper
398 | HEPennypacker ArtVandelay
399 | HEPennypacker Kel_Varnsen
400 | HEPennypacker MartinvanNostrand
401 | HEPennypacker WandaPepper
402 | MartinvanNostrand ArtVandelay
403 | MartinvanNostrand Kel_Varnsen
404 | MartinvanNostrand HEPennypacker
405 | MartinvanNostrand WandaPepper
406 | WandaPepper ArtVandelay
407 | WandaPepper Kel_Varnsen
408 | WandaPepper HEPennypacker
409 | WandaPepper MartinvanNostrand
410 | Kevin Gene
411 | Kevin Feldman
412 | Kevin Vargas
413 | Kevin ElaineBenes
414 | Gene Kevin
415 | Gene Feldman
416 | Gene Vargas
417 | Gene ElaineBenes
418 | Feldman Kevin
419 | Feldman Gene
420 | Feldman Vargas
421 | Feldman ElaineBenes
422 | Vargas Kevin
423 | Vargas Gene
424 | Vargas Feldman
425 | Vargas ElaineBenes
426 | ElaineBenes Kevin
427 | ElaineBenes Gene
428 | ElaineBenes Feldman
429 | ElaineBenes Vargas
430 |
--------------------------------------------------------------------------------
/examples/pagerank/pagerank.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | $LOAD_PATH << '../../lib'
4 | require 'swineherd' ; include Swineherd
5 | require 'swineherd/script' ; include Swineherd::Script
6 | require 'swineherd/filesystem'
7 |
8 | Settings.define :flow_id, :required => true, :description => "Flow id required to make run of workflow unique"
9 | Settings.define :iterations, :type => Integer, :default => 10, :description => "Number of pagerank iterations to run"
10 | Settings.define :hadoop_home, :default => '/usr/local/share/hadoop', :description => "Path to hadoop config"
11 | Settings.resolve!
12 |
13 | flow = Workflow.new(Settings.flow_id) do
14 |
15 | # The filesystems we're going to be working with
16 | hdfs = Swineherd::FileSystem.get(:hdfs)
17 | localfs = Swineherd::FileSystem.get(:file)
18 |
19 | # The scripts we're going to use
20 | initializer = PigScript.new('scripts/pagerank_initialize.pig')
21 | iterator = PigScript.new('scripts/pagerank.pig')
22 | finisher = WukongScript.new('scripts/cut_off_list.rb')
23 | plotter = RScript.new('scripts/histogram.R')
24 |
25 | #
26 | # Runs simple pig script to initialize pagerank. We must specify the input
27 | # here as this is the first step in the workflow. The output attribute is to
28 | # ensure idempotency and the options attribute is the hash that will be
29 | # converted into command-line args for the pig interpreter.
30 | #
31 | task :pagerank_initialize do
32 | initializer.options = {:adjlist => "/tmp/pagerank_example/seinfeld_network.tsv", :initgrph => next_output(:pagerank_initialize)}
33 | initializer.run(:hadoop) unless hdfs.exists? latest_output(:pagerank_initialize)
34 | end
35 |
36 | #
37 | # Runs multiple iterations of pagerank with another pig script and manages all
38 | # the intermediate outputs.
39 | #
40 | task :pagerank_iterate => [:pagerank_initialize] do
41 | iterator.options[:damp] = '0.85f'
42 | iterator.options[:curr_iter_file] = latest_output(:pagerank_initialize)
43 | Settings.iterations.times do
44 | iterator.options[:next_iter_file] = next_output(:pagerank_iterate)
45 | iterator.run(:hadoop) unless hdfs.exists? latest_output(:pagerank_iterate)
46 | iterator.refresh!
47 | iterator.options[:curr_iter_file] = latest_output(:pagerank_iterate)
48 | end
49 | end
50 |
51 | #
52 | # Here we use a wukong script to cut off the last field (a big pig bag of
53 | # links). Notice how every wukong script MUST have an input but pig scripts do
54 | # not.
55 | #
56 | task :cut_off_adjacency_list => [:pagerank_iterate] do
57 | finisher.input << latest_output(:pagerank_iterate)
58 | finisher.output << next_output(:cut_off_adjacency_list)
59 | finisher.run :hadoop unless hdfs.exists? latest_output(:cut_off_adjacency_list)
60 | end
61 |
62 | #
63 | # We want to pull down one result file, merge the part-000.. files into one file
64 | #
65 | task :merge_results => [:cut_off_adjacency_list] do
66 | merged_results = next_output(:merge_results)
67 | hdfs.merge(latest_output(:cut_off_adjacency_list), merged_results) unless hdfs.exists? merged_results
68 | end
69 |
70 | #
71 | # Cat results into a local directory with the same structure
72 | # eg. #{work_dir}/#{flow_id}/pull_down_results-0.
73 | #
74 | # FIXME: Bridging filesystems is cludgey.
75 | #
76 | task :pull_down_results => [:merge_results] do
77 | local_results = next_output(:pull_down_results)
78 | hdfs.copy_to_local(latest_output(:merge_results), local_results) unless localfs.exists? local_results
79 | end
80 |
81 | #
82 | # Plot 2nd column of the result as a histogram (requires R and
83 | # ggplot2). Note that the output here is a png file but doesn't have that
84 | # extension. Ensmarten me as to the right way to handle that?
85 | #
86 | task :plot_results => [:pull_down_results] do
87 | plotter.attributes = {
88 | :pagerank_data => latest_output(:pull_down_results),
89 | :plot_file => next_output(:plot_results), # <-- this will be a png...
90 | :raw_rank => "aes(x=d$V2)"
91 | }
92 | plotter.run(:local) unless localfs.exists? latest_output(:plot_results)
93 | end
94 |
95 | end
96 |
97 | flow.workdir = "/tmp/pagerank_example"
98 | flow.describe
99 | flow.run(:plot_results)
100 |
--------------------------------------------------------------------------------
/examples/pagerank/scripts/cut_off_list.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'rubygems'
4 | require 'wukong'
5 |
6 | #
7 | # Does the very simple job of cutting off the giant adjacency list
8 | #
9 | class CutMapper < Wukong::Streamer::RecordStreamer
10 | def process *args
11 | node_a, node_b, list = args
12 | yield [node_a, node_b]
13 | end
14 | end
15 |
16 | Wukong::Script.new(CutMapper, nil).run
17 |
--------------------------------------------------------------------------------
/examples/pagerank/scripts/histogram.R:
--------------------------------------------------------------------------------
1 | library(ggplot2);
2 | png('<%= plot_file %>', width=900, res=132);
3 | d <- read.table('<%= pagerank_data %>', header=FALSE, sep='\t');
4 | p <- ggplot(d, <%= raw_rank %>) + geom_histogram() + xlab("") + ylab("");
5 | p;
6 |
--------------------------------------------------------------------------------
/examples/pagerank/scripts/pagerank.pig:
--------------------------------------------------------------------------------
1 | --
2 | -- Runs exactly one pagerank iteration
3 | --
4 | network = LOAD '$CURR_ITER_FILE' AS (node_a:chararray, rank:float, out_links:bag { link:tuple (node_b:chararray) });
5 | sent_shares = FOREACH network GENERATE FLATTEN(out_links) AS node_b, (float)(rank / (float)SIZE(out_links)) AS share:float;
6 | sent_links = FOREACH network GENERATE node_a, out_links;
7 | rcvd_shares = COGROUP sent_links BY node_a INNER, sent_shares BY node_b;
8 | next_iter = FOREACH rcvd_shares
9 | {
10 | raw_rank = (float)SUM(sent_shares.share);
11 | -- treat the case that a node has no in links
12 | damped_rank = ((raw_rank IS NOT NULL AND raw_rank > 1.0e-12f) ? raw_rank*$DAMP + 1.0f - $DAMP : 0.0f);
13 | GENERATE
14 | group AS node_a,
15 | damped_rank AS rank,
16 | FLATTEN(sent_links.out_links) -- hack, should only be one bag, unbag it
17 | ;
18 | };
19 |
20 | STORE next_iter INTO '$NEXT_ITER_FILE';
21 |
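--
-- Usage sketch (paths are hypothetical). In the workflow these parameters are
-- filled in from the PigScript options hash; to run one iteration by hand:
--
--   pig -p CURR_ITER_FILE=/tmp/pagerank_example/pagerank_initialize-0 \
--       -p NEXT_ITER_FILE=/tmp/pagerank_example/pagerank_iterate-0 \
--       -p DAMP=0.85f \
--       pagerank.pig
--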
--------------------------------------------------------------------------------
/examples/pagerank/scripts/pagerank_initialize.pig:
--------------------------------------------------------------------------------
1 | --
2 | -- Create initial graph on which to iterate the pagerank algorithm.
3 | --
4 |
5 | --
6 | -- Generate a unique list of nodes with in links to cogroup on. This allows
7 | -- us to treat the case where nodes have in links but no out links.
8 | --
9 | network = LOAD '$ADJLIST' AS (node_a:chararray, node_b:chararray);
10 | cut_rhs = FOREACH network GENERATE node_b;
11 | uniq_rhs = DISTINCT cut_rhs;
12 | list_links = COGROUP network BY node_a, uniq_rhs BY node_b;
13 | count_links = FOREACH list_links
14 | {
15 | -- if network.node_b is empty there are no out links, set to dummy value
16 | out_links = (IsEmpty(network.node_b) ? {('dummy')} : network.node_b);
17 | GENERATE
18 | group AS node_a,
19 | 1.0f AS rank,
20 | out_links AS out_links
21 | ;
22 | };
23 |
24 | STORE count_links INTO '$INITGRPH';
25 |
--------------------------------------------------------------------------------
/lib/swineherd.rb:
--------------------------------------------------------------------------------
1 | require 'rubygems'
2 | require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
3 | require 'rake'
4 | require 'gorillib/logger/log'
5 |
6 | module Swineherd
7 | autoload :Template, 'swineherd/template'
8 | autoload :FileSystem, 'swineherd/filesystem'
9 | autoload :Script, 'swineherd/script'
10 | autoload :Workflow, 'swineherd/workflow'
11 |
12 | # For rake 0.9 compatibility
13 | include Rake::DSL if defined?(Rake::DSL)
14 | end
15 |
--------------------------------------------------------------------------------
/lib/swineherd/filesystem.rb:
--------------------------------------------------------------------------------
1 | module Swineherd
2 | autoload :BaseFileSystem, 'swineherd/filesystem/basefilesystem'
3 | autoload :LocalFileSystem, 'swineherd/filesystem/localfilesystem'
4 | autoload :HadoopFileSystem, 'swineherd/filesystem/hadoopfilesystem'
5 | autoload :S3FileSystem, 'swineherd/filesystem/s3filesystem'
6 |
7 | class FileSystem
8 |
9 | FILESYSTEMS = {
10 | 'file' => Swineherd::LocalFileSystem,
11 | 'hdfs' => Swineherd::HadoopFileSystem,
12 | 's3' => Swineherd::S3FileSystem
13 | }
14 |
15 | # A factory function that returns an instance of the requested class
16 | def self.get scheme, *args
17 | begin
18 | FILESYSTEMS[scheme.to_s].new *args
19 | rescue NoMethodError => e
20 | raise "Filesystem with scheme #{scheme} does not exist.\n #{e.message}"
21 | end
22 | end
23 |
24 | end
25 |
26 | end
27 |
--------------------------------------------------------------------------------
/lib/swineherd/filesystem/README_filesystem.textile:
--------------------------------------------------------------------------------
1 | h1. File System Abstraction
2 |
3 | Hackboxen need to access files and directories in order to do their
4 | stuff. We currently expect them to use at least the following types
5 | of filesystems:
6 |
7 | * Local File System
8 | * Ephemeral Hadoop cluster HDFS
9 | * s3/HDFS
10 |
11 | Each of these filesystem types has different methods to accomplish the same operations. In order to make this diversity more easily used by hackboxen, an abstraction layer has been created.
12 |
13 | h2. Interface
14 |
15 | A new @FileSystem@ class has a single class method @get@ taking two arguments:
16 | 
17 | * @scheme@: A token which specifies the filesystem scheme (@:file@, @:hdfs@, or @:s3@).
18 | * @*args@: Optional arguments (e.g. credentials)
19 | 
20 | The returned (abstracted) filesystem instance has the following methods:
21 |
22 | * @open(path,mode,blk)@: Return a @File@ like file handle object. @mode@ and @blk@ arguments are optional and work like the standard ruby @File.open@ arguments.
23 | * @rm(path)@: Works like UNIX @rm -r@.
24 | * @exists?(path)@: Returns @true@ if the file/directory exists
25 | * @mv(srcpath,dstpath)@: Renames/moves the file/directory.
26 | * @cp(srcpath,dstpath)@: Works like UNIX @cp -r@.
27 | * @mkpath(dirpath)@: Creates a directory and all required parent directories.
28 | * @type(path)@: Returns one of "dir", "file", or "symlink".
29 | * @entries(dirpath)@: Returns the files/subdirectories in this directory
30 |
31 | The @File@ object returned by the @open@ methods has the following methods:
32 |
33 | * @read@: Return the contents of the entire file as a string.
34 | * @readline@: Return the next line in the file, or nil if there are no more lines.
35 | * @write(string)@: Write @string@ to the file.
36 | * @close@: Close the file
37 |
38 | h2. Creating an abstraction
39 |
40 | Each abstraction is not expected to catch and rethrow exceptions of the abstracted subsystems. Rather, exceptions should pass through. However, each method should try to be built to behave similarly to the corresponding native ruby @File@ and @FileUtils@ methods.
41 |
42 | h2. Current State
43 |
44 | The currently implemented filesystem abstractions are @:file@ (local file system), @:hdfs@ (Hadoop), and @:s3@ (Amazon S3).
45 |
46 |
47 |
48 |
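h2. Example skeleton

As an illustration only (none of this exists in the library), a new abstraction, say the FTP filesystem mentioned in the project TODO list, might start out like this. Only the method names come from the interface above; the class and everything inside it is hypothetical:

<pre>
module Swineherd
  # Hypothetical sketch of a new filesystem abstraction.
  class FtpFileSystem
    include Swineherd::BaseFileSystem

    def initialize host, user, password
      # open the connection here
    end

    def exists? path
      # ask the remote server; return true or false
    end

    def mkpath path
      # create the directory and any missing parents
    end

    def open path, mode="r", &blk
      # return a File-like object with read, readline, write, and close
    end

    # rm, mv, cp, type, entries, and close follow the same pattern
  end
end
</pre>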
--------------------------------------------------------------------------------
/lib/swineherd/filesystem/basefilesystem.rb:
--------------------------------------------------------------------------------
1 | module Swineherd
2 |
3 | #
4 | # All methods a filesystem should have
5 | #
6 | module BaseFileSystem
7 |
8 | #
9 | # Return a new instance of 'this' filesystem. Classes that include this
10 | # module are expected to know how to pull their particular set of arguments
11 | # from *args and initialize themselves by opening any required connections, &c.
12 | #
13 | def initialize *args
14 | end
15 |
16 | #
17 | # Open a file in this filesystem. Should return a usable file handle for
18 | # the mode (read 'r' or write 'w') given. File classes should, at minimum, have
19 | # the methods defined in BaseFile
20 | #
21 | def open path, mode="r", &blk
22 | end
23 |
24 | #
25 | # Recursively measure the size of path. Results in bytes.
26 | #
27 | def size path
28 | end
29 |
30 | #
31 | # Recursively delete the path and all paths below it.
32 | #
33 | def rm path
34 | end
35 |
36 | #
37 | # Returns true if the file or path exists and false otherwise.
38 | #
39 | def exists? path
40 | end
41 |
42 | #
43 | # Moves the source path to the destination path
44 | #
45 | def mv srcpath, dstpath
46 | end
47 |
48 | #
49 | # Recursively copies all files and directories under srcpath to dstpath
50 | #
51 | def cp srcpath, dstpath
52 | end
53 |
54 | #
55 | # Make directory path if it does not (partly) exist
56 | #
57 | def mkpath path
58 | end
59 |
60 | #
61 | # Return file type ("directory" or "file" or "symlink")
62 | #
63 | def type path
64 | end
65 |
66 | #
67 | # Give contained files/dirs
68 | #
69 | def entries dirpath
70 | end
71 |
72 | #
73 | # For running tasks idempotently. Returns true if no paths exist, false if all paths exist,
74 | # and raises an error otherwise.
75 | #
76 | def check_paths paths
77 | exist_count = paths.inject(0){|cnt, path| cnt += 1 if exists?(path); cnt}
78 | raise "Indeterminate output state" if (exist_count > 0) && (exist_count < paths.size)
79 | return true if exist_count == 0
80 | false
81 | end
82 |
83 | #
84 | # Needs to close the filesystem by cleaning up any open connections, &c.
85 | #
86 | def close *args
87 | end
88 |
89 | class BaseFile
90 | attr_accessor :path, :scheme, :mode
91 |
92 |
93 | def initialize *args, &blk
94 | end
95 |
96 | #
97 | # A new file in the filesystem needs to be instantiated with a
98 | # path, a mode (read 'r' or write 'w').
99 | #
100 | def open path, mode="r", &blk
101 | end
102 |
103 | #
104 | # Return the whole file as a string
105 | #
106 | def read
107 | end
108 |
109 | #
110 | # Return a line from stream
111 | #
112 | def readline
113 | end
114 |
115 | #
116 | # Writes a string to the file
117 | #
118 | def write string
119 | end
120 |
121 | #
122 | # Close the file
123 | #
124 | def close *args
125 | end
126 |
127 | end
128 |
129 | end
130 |
131 | end
132 |
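# Usage sketch for the idempotency helper above (paths are hypothetical):
#
#   fs = Swineherd::FileSystem.get(:hdfs)
#   outputs = ['/tmp/my_flow-0/step_one-0', '/tmp/my_flow-0/step_two-0']
#   run_the_step if fs.check_paths(outputs)  # true only when none of the outputs exist yet
#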
--------------------------------------------------------------------------------
/lib/swineherd/filesystem/filesystems.rb:
--------------------------------------------------------------------------------
1 | require 'fileutils'
2 |
3 | class FileSystem
4 |
5 | # A factory function that returns an instance of the requested class
6 | def self.get(scheme, *args)
7 | if scheme == :file
8 | LocalFileSystem.new()
9 | else
10 | nil
11 | end
12 | end
13 |
14 | class LocalFileSystem
15 |
16 | # Open a file in this filesystem
17 | def open(path,mode="r",&blk)
18 | return LocalFile.new(path,mode,&blk)
19 | end
20 |
21 | # Works like rm -r
22 | def rm(path)
23 | FileUtils.rm_r(path)
24 | end
25 |
26 | # Does this exist?
27 | def exists?(path)
28 | File.exists?(path)
29 | end
30 |
31 | # Works like UNIX mv
32 | def mv(srcpath,dstpath)
33 | FileUtils.mv(srcpath,dstpath)
34 | end
35 |
36 | # Works like UNIX cp -r
37 | def cp(srcpath,dstpath)
38 | FileUtils.cp_r(srcpath,dstpath)
39 | end
40 |
41 | # Make directory path if it does not (partly) exist
42 | def mkpath(path)
43 | FileUtils.mkpath(path)
44 | end
45 |
46 | # Return file type ("directory", "file", or "symlink")
47 | def type(path)
48 | if File.symlink?(path)
49 | return "symlink"
50 | end
51 | if File.directory?(path)
52 | return "directory"
53 | end
54 | if File.file?(path)
55 | return "file"
56 | end
57 | "unknown"
58 | end
59 |
60 | # Give contained files/dirs
61 | def entries(dirpath)
62 | if type(dirpath) != "directory"
63 | return nil
64 | end
65 | Dir.entries(dirpath)
66 | end
67 |
68 | class LocalFile
69 | attr_accessor :path, :scheme, :mode
70 |
71 | def initialize(path,mode="r",&blk)
72 | @path=path
73 | @mode=mode
74 | @handle=File.open(path,mode,&blk)
75 | end
76 |
77 | def open(path,mode="r")
78 | # Only "r" and "w" modes are supported.
79 | initialize(path,mode)
80 | end
81 |
82 | # Return whole file and as a string
83 | def read
84 | @handle.read
85 | end
86 |
87 | # Return a line from stream
88 | def readline
89 | @handle.gets
90 | end
91 |
92 | # Writes to the file
93 | def write(string)
94 | @handle.write(string)
95 | end
96 |
97 | # Close file
98 | def close
99 | @handle.close
100 | end
101 | end
102 | end
103 | end
104 |
--------------------------------------------------------------------------------
/lib/swineherd/filesystem/hadoopfilesystem.rb:
--------------------------------------------------------------------------------
1 | module Swineherd
2 |
3 | #
4 | # Methods for dealing with hadoop distributed file system (hdfs). This class
5 | # requires that you run with JRuby as it makes use of the native java hadoop
6 | # libraries.
7 | #
8 | class HadoopFileSystem
9 |
10 | include Swineherd::BaseFileSystem
11 |
12 | attr_accessor :conf, :hdfs
13 |
14 | #
15 | # Initialize a new hadoop file system, needs path to hadoop configuration
16 | #
17 | def initialize *args
18 | check_and_set_environment
19 | @conf = Java::org.apache.hadoop.conf.Configuration.new
20 | @hdfs = Java::org.apache.hadoop.fs.FileSystem.get(@conf)
21 | end
22 |
23 | #
24 | # Make sure environment is sane then set up environment for use
25 | #
26 | def check_and_set_environment
27 | check_env
28 | set_env
29 | end
30 |
31 | def open path, mode="r", &blk
32 | HadoopFile.new(path,mode,self,&blk)
33 | end
34 |
35 | def size path
36 | lr(path).inject(0){|sz, f| sz += @hdfs.get_file_status(Path.new(f)).get_len}
37 | end
38 |
39 | #
40 | # Recursively list paths
41 | #
42 | def lr path
43 | paths = entries(path)
44 | if (paths && !paths.empty?)
45 | paths.map{|e| lr(e)}.flatten
46 | else
47 | path
48 | end
49 | end
50 |
51 | def rm path
52 | @hdfs.delete(Path.new(path), true)
53 | [path]
54 | end
55 |
56 | def exists? path
57 | @hdfs.exists(Path.new(path))
58 | end
59 |
60 | def mv srcpath, dstpath
61 | @hdfs.rename(Path.new(srcpath), Path.new(dstpath))
62 | end
63 |
64 | def cp srcpath, dstpath
65 | FileUtil.copy(@hdfs, Path.new(srcpath), @hdfs, Path.new(dstpath), false, @conf)
66 | end
67 |
68 | def mkpath path
69 | @hdfs.mkdirs(Path.new(path))
70 | path
71 | end
72 |
73 | def type path
74 | return "unknown" unless exists? path
75 | status = @hdfs.get_file_status(Path.new(path))
76 | return "directory" if status.is_dir?
77 | "file"
78 | # case
79 | # when status.isFile then
80 | # return "file"
81 | # when status.is_directory? then
82 | # return "directory"
83 | # when status.is_symlink? then
84 | # return "symlink"
85 | # end
86 | end
87 |
88 | def entries dirpath
89 | return unless type(dirpath) == "directory"
90 | list = @hdfs.list_status(Path.new(dirpath))
91 | list.map{|path| path.get_path.to_s} rescue []
92 | end
93 |
94 | #
95 | # Merge all part files in a directory into one file.
96 | #
97 | def merge srcdir, dstfile
98 | FileUtil.copy_merge(@hdfs, Path.new(srcdir), @hdfs, Path.new(dstfile), false, @conf, "")
99 | end
100 |
101 | #
102 | # This is hackety. Use with caution.
103 | #
104 | def stream input, output
105 | require 'uri'
106 | input_fs_scheme = URI.parse(input).scheme
107 | output_fs_scheme = URI.parse(output).scheme
108 | system("#{@hadoop_home}/bin/hadoop \\
109 | jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
110 | -D mapred.job.name=\"Stream { #{input_fs_scheme}(#{File.basename(input)}) -> #{output_fs_scheme}(#{File.basename(output)}) }\" \\
111 | -D mapred.min.split.size=1000000000 \\
112 | -D mapred.reduce.tasks=0 \\
113 | -mapper \"/bin/cat\" \\
114 | -input \"#{input}\" \\
115 | -output \"#{output}\"")
116 | end
117 |
118 | #
119 | # BZIP
120 | #
121 | def bzip input, output
122 | system("#{@hadoop_home}/bin/hadoop \\
123 | jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
124 | -D mapred.output.compress=true \\
125 | -D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \\
126 | -D mapred.reduce.tasks=1 \\
127 | -mapper \"/bin/cat\" \\
128 | -reducer \"/bin/cat\" \\
129 | -input \"#{input}\" \\
130 | -output \"#{output}\"")
131 | end
132 |
133 | #
134 |     # Merges many input files into :reduce_tasks output files
135 | #
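  |     # For example (hypothetical paths):
  |     #
  |     #   fs.dist_merge ["logs/2011/01", "logs/2011/02"], "logs/merged",
  |     #                 :reduce_tasks => 10, :partition_fields => 2
  |     #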
136 | def dist_merge inputs, output, options = {}
137 | options[:reduce_tasks] ||= 25
138 | options[:partition_fields] ||= 2
139 | options[:sort_fields] ||= 2
140 |       options[:field_separator] ||= '\t'
141 | names = inputs.map{|inp| File.basename(inp)}.join(',')
142 | cmd = "#{@hadoop_home}/bin/hadoop \\
143 | jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
144 | -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
145 | -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
146 | -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
147 | -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
148 | -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
149 | -D mapred.min.split.size=1000000000 \\
150 | -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
151 | -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
152 | -mapper \"/bin/cat\" \\
153 | -reducer \"/usr/bin/uniq\" \\
154 | -input \"#{inputs.join(',')}\" \\
155 | -output \"#{output}\""
156 | puts cmd
157 | system cmd
158 | end
159 |
160 | #
161 | # Copy hdfs file to local filesystem
162 | #
163 | def copy_to_local srcfile, dstfile
164 | @hdfs.copy_to_local_file(Path.new(srcfile), Path.new(dstfile))
165 | end
166 |
167 | #
168 | # Copy local file to hdfs filesystem
169 | #
170 | def copy_from_local srcfile, dstfile
171 | @hdfs.copy_from_local_file(Path.new(srcfile), Path.new(dstfile))
172 | end
173 |
174 | def close *args
175 | @hdfs.close
176 | end
177 |
178 | class HadoopFile
179 | attr_accessor :path, :handle, :hdfs
180 |
181 | #
182 | # In order to open input and output streams we must pass around the hadoop fs object itself
183 | #
184 | def initialize path, mode, fs, &blk
185 | @fs = fs
186 | @path = Path.new(path)
187 | case mode
188 | when "r" then
189 | raise "#{@fs.type(path)} is not a readable file - #{path}" unless @fs.type(path) == "file"
190 | @handle = @fs.hdfs.open(@path).to_io(&blk)
191 | when "w" then
192 | # Open path for writing
193 | raise "Path #{path} is a directory." unless (@fs.type(path) == "file") || (@fs.type(path) == "unknown")
194 | @handle = @fs.hdfs.create(@path).to_io.to_outputstream
195 | if block_given?
196 | yield self
197 | self.close # muy muy importante
198 | end
199 | end
200 | end
201 |
202 | def read
203 | @handle.read
204 | end
205 |
206 | def readline
207 | @handle.readline
208 | end
209 |
210 | def write string
211 | @handle.write(string.to_java_string.get_bytes)
212 | end
213 |
214 | def puts string
215 | write(string+"\n")
216 | end
217 |
218 | def close
219 | @handle.close
220 | end
221 |
222 | end
223 |
224 | # #
225 | # # Distributed streaming from input to output
226 | # #
227 | #
228 | # #
229 | # # Given an array of input dirs, stream all into output dir and remove duplicate records.
230 | # # Reasonable default hadoop streaming options are chosen.
231 | # #
232 | # def self.merge inputs, output, options = {}
233 | # options[:reduce_tasks] ||= 25
234 | # options[:partition_fields] ||= 2
235 | # options[:sort_fields] ||= 2
236 | # options[:field_separator] ||= '/t'
237 | # names = inputs.map{|inp| File.basename(inp)}.join(',')
238 | # cmd = "${HADOOP_HOME}/bin/hadoop \\
239 | # jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \\
240 | # -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
241 | # -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
242 | # -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
243 | # -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
244 | # -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
245 | # -D mapred.min.split.size=1000000000 \\
246 | # -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
247 | # -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
248 | # -mapper \"/bin/cat\" \\
249 | # -reducer \"/usr/bin/uniq\" \\
250 | # -input \"#{inputs.join(',')}\" \\
251 | # -output \"#{output}\""
252 | # puts cmd
253 | # system cmd
254 | # end
255 | #
256 | # #
257 | # # Concatenates a hadoop dir or file into a local file
258 | # #
259 | # def self.cat_to_local src, dest
260 | # system %Q{hadoop fs -cat #{src}/[^_]* > #{dest}} unless File.exist?(dest)
261 | # end
262 | #
263 |
264 | #
265 | # Check that we are running with jruby, check for hadoop home. hadoop_home
266 | # is preferentially set to the HADOOP_HOME environment variable if it's set,
267 | # '/usr/local/share/hadoop' if HADOOP_HOME isn't defined, and
268 | # '/usr/lib/hadoop' if '/usr/local/share/hadoop' doesn't exist. If all else
269 |     # fails, inform the user that HADOOP_HOME really should be set.
270 | #
271 | def check_env
272 | begin
273 | require 'java'
274 | rescue LoadError => e
275 | raise "\nJava not found, are you sure you're running with JRuby?\n" + e.message
276 | end
277 | @hadoop_home = (ENV['HADOOP_HOME'] || '/usr/local/share/hadoop')
278 | @hadoop_home = '/usr/lib/hadoop' unless File.exist? @hadoop_home
279 | raise "\nHadoop installation not found, try setting HADOOP_HOME\n" unless File.exist? @hadoop_home
280 | end
281 |
282 | #
283 | # Place hadoop jars in class path, require appropriate jars, set hadoop conf
284 | #
285 | def set_env
286 | require 'java'
287 | @hadoop_conf = (ENV['HADOOP_CONF_DIR'] || File.join(@hadoop_home, 'conf'))
288 | @hadoop_conf += "/" unless @hadoop_conf.end_with? "/"
289 | $CLASSPATH << @hadoop_conf
290 | Dir["#{@hadoop_home}/hadoop*.jar", "#{@hadoop_home}/lib/*.jar"].each{|jar| require jar}
291 |
292 | java_import 'org.apache.hadoop.conf.Configuration'
293 | java_import 'org.apache.hadoop.fs.Path'
294 | java_import 'org.apache.hadoop.fs.FileSystem'
295 | java_import 'org.apache.hadoop.fs.FileUtil'
296 | java_import 'org.apache.hadoop.mapreduce.lib.input.FileInputFormat'
297 | java_import 'org.apache.hadoop.mapreduce.lib.output.FileOutputFormat'
298 | java_import 'org.apache.hadoop.fs.FSDataOutputStream'
299 | java_import 'org.apache.hadoop.fs.FSDataInputStream'
300 |
301 | end
302 |
303 | end
304 |
305 | end
306 |
--------------------------------------------------------------------------------
/lib/swineherd/filesystem/localfilesystem.rb:
--------------------------------------------------------------------------------
1 | require 'fileutils'
2 | require 'find'
3 | module Swineherd
4 |
5 | class LocalFileSystem
6 |
7 | include Swineherd::BaseFileSystem
8 |
9 | def initialize *args
10 | end
11 |
12 | def open path, mode="r", &blk
13 | return LocalFile.new path, mode, &blk
14 | end
15 |
16 | def size path
17 | sz = 0
18 | Find.find(path){|f| sz += File.size(f)}
19 | sz
20 | end
21 |
22 | def rm path
23 | FileUtils.rm_r path
24 | end
25 |
26 | def exists? path
27 |       File.exist?(path)
28 | end
29 |
30 | def mv srcpath, dstpath
31 | FileUtils.mv(srcpath,dstpath)
32 | end
33 |
34 | def cp srcpath, dstpath
35 | FileUtils.cp_r(srcpath,dstpath)
36 | end
37 |
38 | def mkpath path
39 | FileUtils.mkpath path
40 | end
41 |
42 | def type path
43 | case
44 | when File.symlink?(path) then
45 | return "symlink"
46 | when File.directory?(path) then
47 | return "directory"
48 | when File.file?(path) then
49 | return "file"
50 | end
51 | "unknown"
52 | end
53 |
54 | def entries dirpath
55 | return unless (type(dirpath) == "directory")
56 | Dir.entries(dirpath)
57 | end
58 |
59 | class LocalFile
60 | attr_accessor :path, :scheme, :handle, :mode
61 |
62 | def initialize path, mode="r", &blk
63 | @path = path
64 | @mode = mode
65 | @handle = File.open(path,mode,&blk)
66 | end
67 |
68 | def open path, mode="r", &blk
69 | initialize(path,mode,&blk)
70 | end
71 |
72 | def read
73 | @handle.read
74 | end
75 |
76 | def readline
77 | @handle.gets
78 | end
79 |
80 | def write string
81 | @handle.write(string)
82 | end
83 |
84 | def close
85 | @handle.close
86 | end
87 | end
88 |
89 | end
90 | end
91 |
--------------------------------------------------------------------------------
/lib/swineherd/filesystem/localfs.rb:
--------------------------------------------------------------------------------
1 | module Swineherd
2 | class LocalFS
3 | def self.check_paths paths
4 | exist_count = 0 # no outputs exist
5 | paths.each{|path| exist_count += 1 if File.exist?(path) }
6 | raise "Indeterminate output state" if (exist_count > 0) && (exist_count < paths.size)
7 | return true if exist_count == 0
8 | false
9 | end
10 | end
11 | end
12 |
--------------------------------------------------------------------------------
/lib/swineherd/filesystem/s3filesystem.rb:
--------------------------------------------------------------------------------
1 | require 'tempfile'
2 | module Swineherd
3 |
4 | #
5 | # Methods for interacting with Amazon's Simple Store Service (s3).
6 | #
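  |   # Paths are given in "bucket/key" form (see the bucket and key_path helpers
  |   # below). A minimal sketch, with hypothetical bucket and credentials:
  |   #
  |   #   fs = Swineherd::S3FileSystem.new('my_access_key_id', 'my_secret_access_key')
  |   #   fs.mkpath "my-bucket"
  |   #   fs.open("my-bucket/example/foo.tsv", "w") { |f| f.puts "hello\ts3" }
  |   #   fs.exists? "my-bucket/example/foo.tsv" #=> true
  |   #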
7 | class S3FileSystem
8 |
9 | include Swineherd::BaseFileSystem
10 |
11 | attr_accessor :s3
12 |
13 | #
14 | # Initialize a new s3 file system, needs path to aws keys
15 | #
16 | def initialize aws_access_key_id, aws_secret_access_key
17 | require 'right_aws'
18 | @s3 = RightAws::S3.new(aws_access_key_id, aws_secret_access_key)
19 | end
20 |
21 | def open path, mode="r", &blk
22 | S3File.new(path,mode,self,&blk)
23 | end
24 |
25 | def size path
26 | sz = 0
27 | if type(path) == "directory"
28 | lr(path).each do |f|
29 | sz += file_size(f)
30 | end
31 | else
32 | sz += file_size(path)
33 | end
34 | sz
35 | end
36 |
37 | def file_size path
38 | containing_bucket = bucket(path)
39 | header = @s3.interface.head(containing_bucket, key_path(path))
40 | header['content-length'].to_i
41 | end
42 |
43 | def rm path
44 | bkt = bucket(path)
45 | key = key_path(path)
46 | if key.empty? # only the bucket was passed in, delete it
47 | @s3.interface.force_delete_bucket(bkt)
48 | else
49 | case type(path)
50 | when "directory" then
51 | keys_to_delete = lr(path)
52 | keys_to_delete.each do |k|
53 | key_to_delete = key_path(k)
54 | @s3.interface.delete(bkt, key_to_delete)
55 | end
56 | keys_to_delete
57 | when "file" then
58 | @s3.interface.delete(bkt, key)
59 | [path]
60 | end
61 | end
62 | end
63 |
64 | def bucket path
65 | uri = URI.parse(path)
66 | uri.path.split('/').reject{|x| x.empty?}.first
67 | end
68 |
69 | def key_path path
70 | uri = URI.parse(path)
71 | File.join(uri.path.split('/').reject{|x| x.empty?}[1..-1])
72 | end
73 |
74 | def needs_trailing_slash pre
75 | has_trailing_slash = pre.end_with? '/'
76 | is_empty_prefix = pre.empty?
77 | !(has_trailing_slash || is_empty_prefix)
78 | end
79 |
80 | def full_contents path
81 | bkt = bucket(path)
82 | pre = key_path(path)
83 | pre += '/' if needs_trailing_slash(pre)
84 | contents = []
85 | s3.interface.incrementally_list_bucket(bkt, {'prefix' => pre, 'delimiter' => '/'}) do |res|
86 | contents += res[:common_prefixes].map{|c| File.join(bkt,c)}
87 | contents += res[:contents].map{|c| File.join(bkt, c[:key])}
88 | end
89 | contents
90 | end
91 |
92 | def exists? path
93 | object = File.basename(path)
94 | search_dir = File.dirname(path)
95 | case search_dir
96 | when '.' then # only a bucket was passed in
97 | begin
98 | (full_contents(object).size > 0)
99 | rescue RightAws::AwsError => e
100 | if e.message =~ /nosuchbucket/i
101 | false
102 | else
103 | raise e
104 | end
105 | end
106 | else
107 | search_dir_contents = full_contents(search_dir).map{|c| File.basename(c).gsub(/\//, '')}
108 | search_dir_contents.include?(object)
109 | end
110 | end
111 |
112 | def mv srcpath, dstpath
113 | src_bucket = bucket(srcpath)
114 | dst_bucket = bucket(dstpath)
115 | dst_key_path = key_path(dstpath)
116 | mkpath(dstpath)
117 | case type(srcpath)
118 | when "directory" then
119 | paths_to_copy = lr(srcpath)
120 | common_dir = common_directory(paths_to_copy)
121 | paths_to_copy.each do |path|
122 | src_key = key_path(path)
123 | dst_key = File.join(dst_key_path, path.gsub(common_dir, ''))
124 | @s3.interface.move(src_bucket, src_key, dst_bucket, dst_key)
125 | end
126 | when "file" then
127 | @s3.interface.move(src_bucket, key_path(srcpath), dst_bucket, dst_key_path)
128 | end
129 | end
130 |
131 | def cp srcpath, dstpath
132 | src_bucket = bucket(srcpath)
133 | dst_bucket = bucket(dstpath)
134 | dst_key_path = key_path(dstpath)
135 | mkpath(dstpath)
136 | case type(srcpath)
137 | when "directory" then
138 | paths_to_copy = lr(srcpath)
139 | common_dir = common_directory(paths_to_copy)
140 | paths_to_copy.each do |path|
141 | src_key = key_path(path)
142 | dst_key = File.join(dst_key_path, path.gsub(common_dir, ''))
143 | @s3.interface.copy(src_bucket, src_key, dst_bucket, dst_key)
144 | end
145 | when "file" then
146 | @s3.interface.copy(src_bucket, key_path(srcpath), dst_bucket, dst_key_path)
147 | end
148 | end
149 |
150 | # right now this only works on single files
151 | def copy_to_local srcpath, dstpath
152 | src_bucket = bucket(srcpath)
153 | src_key_path = key_path(srcpath)
154 | dstfile = File.new(dstpath, 'w')
155 | @s3.interface.get(src_bucket, src_key_path) do |chunk|
156 | dstfile.write(chunk)
157 | end
158 | dstfile.close
159 | end
160 |
161 |     # This is a bit funny: there's actually no need to create a 'path' since
162 | # s3 is nothing more than a glorified key-value store. When you create a
163 | # 'file' (key) the 'path' will be created for you. All we do here is create
164 | # the bucket unless it already exists.
165 | #
166 | def mkpath path
167 | bkt = bucket(path)
168 | key = key_path(path)
169 | if key.empty?
170 | @s3.interface.create_bucket(bkt)
171 | else
172 | @s3.interface.create_bucket(bkt) unless exists? bkt
173 | end
174 | path
175 | end
176 |
177 | def type path
178 | return "unknown" unless exists? path
179 | return "directory" if full_contents(path).size > 0
180 | "file"
181 | end
182 |
183 | def entries dirpath
184 | return unless type(dirpath) == "directory"
185 | full_contents(dirpath)
186 | end
187 |
188 | # Recursively list paths
189 | def lr path
190 | paths = entries(path)
191 | if paths
192 | paths.map{|e| lr(e)}.flatten
193 | else
194 | path
195 | end
196 | end
197 |
198 | #
199 | # Ick.
200 | #
201 | def common_directory paths
202 | dirs = paths.map{|path| path.split('/')}
203 | min_size = dirs.map{|splits| splits.size}.min
204 | dirs.map!{|splits| splits[0...min_size]}
205 |       uncommon = dirs.transpose.each_with_index.find{|dirnames, idx| dirnames.uniq.length > 1}
206 |       dirs[0][0...(uncommon ? uncommon.last : min_size)].join('/') # nil find => paths share all components
207 | end
208 |
209 | def put srcpath, destpath
210 | dest_bucket = bucket(destpath)
211 | if File.directory? srcpath
212 | # handle Dir later
213 | else
214 | key = srcpath
215 | end
216 | @s3.interface.put(dest_bucket, key, File.open(srcpath))
217 | end
218 |
219 | def close *args
220 | end
221 |
222 | class S3File
223 | attr_accessor :path, :handle, :fs
224 |
225 | #
226 | # In order to open input and output streams we must pass around the s3 fs object itself
227 | #
228 | def initialize path, mode, fs, &blk
229 | @fs = fs
230 | @path = path
231 | case mode
232 | when "r" then
233 | raise "#{fs.type(path)} is not a readable file - #{path}" unless fs.type(path) == "file"
234 | when "w" then
235 | raise "Path #{path} is a directory." unless (fs.type(path) == "file") || (fs.type(path) == "unknown")
236 | @handle = Tempfile.new('s3filestream')
237 | if block_given?
238 | yield self
239 | close
240 | end
241 | end
242 | end
243 |
244 | #
245 | # Faster than iterating
246 | #
247 | def read
248 | resp = fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path))
249 | resp
250 | end
251 |
252 | #
253 | # This is a little hackety. That is, once you call (.each) on the object the full object starts
254 | # downloading...
255 | #
256 | def readline
257 | @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
258 | begin
259 | @handle.next
260 | rescue StopIteration, NoMethodError
261 | @handle = nil
262 | raise EOFError.new("end of file reached")
263 | end
264 | end
265 |
266 | def write string
267 | @handle.write(string)
268 | end
269 |
270 | def puts string
271 | write(string+"\n")
272 | end
273 |
274 | def close
275 | if @handle
276 | @handle.read
277 | fs.s3.interface.put(fs.bucket(path), fs.key_path(path), File.open(@handle.path, 'r'))
278 | @handle.close
279 | end
280 | @handle = nil
281 | end
282 |
283 | end
284 |
285 | end
286 |
287 | end
288 |
--------------------------------------------------------------------------------
/lib/swineherd/script.rb:
--------------------------------------------------------------------------------
1 | module Swineherd
2 | module Script
3 |
4 | autoload :WukongScript, 'swineherd/script/wukong_script'
5 | autoload :PigScript, 'swineherd/script/pig_script'
6 | autoload :RScript, 'swineherd/script/r_script'
7 |
8 | module Common
9 |
10 | attr_accessor :input, :output, :options, :attributes
11 |       def initialize(source, input = [], output = [], options = {}, attributes = {})
12 | @source = source
13 | @input = input
14 | @output = output
15 | @options = options
16 | @attributes = attributes
17 | end
18 |
19 | #
20 |       # Allows for setting the environment the script will be run in
21 | #
22 | def env
23 | ENV
24 | end
25 |
26 | def script
27 | @script ||= Template.new(@source, @attributes).substitute!
28 | end
29 |
30 | #
31 | # So we can reuse ourselves
32 | #
33 | def refresh!
34 | @script = nil
35 | @output = []
36 | @input = []
37 | end
38 |
39 | #
40 | # This depends on the type of script
41 | #
42 | def cmd
43 | raise "Override this in subclass!"
44 | end
45 |
46 | #
47 | # Override this in subclass to decide how script runs in 'local' mode
48 | # Best practice is that it needs to be able to run on a laptop w/o
49 | # hadoop.
50 | #
51 | def local_cmd
52 | raise "Override this in subclass!"
53 | end
54 |
55 | #
56 | # Default is to run with hadoop
57 | #
58 | def run mode=:hadoop
59 | case mode
60 | when :local then
61 | sh local_cmd do |res, ok|
62 | Log.info("Exit status was #{ok}")
63 | raise "Local mode script failed with exit status #{ok}" if ok != 0
64 | end
65 | when :hadoop then
66 | sh cmd do |res, ok|
67 | Log.info("Exit status was #{ok}")
68 | raise "Hadoop mode script failed with exit status #{ok}" if ok != 0
69 | end
70 | end
71 | end
72 |
73 | end
74 | end
75 | end
76 |
--------------------------------------------------------------------------------
/lib/swineherd/script/hadoop_script.rb:
--------------------------------------------------------------------------------
1 | module Swineherd::Script
2 |
3 | #
4 | # native Java map-reduce
5 | #
6 | class HadoopScript
7 | include Common
8 |     attr_accessor :main_class, :run_jar, :java_options, :hadoop_classpath, :libjars, :hadoop_home
9 |
10 | def initialize *args
11 | super(*args)
12 | @options = Hash.new{|h,k| h[k] = {}} # need to support nested options for this
13 | end
14 |
15 | #
16 | # Converts an arbitrarily nested hash to flattened arguments
17 | # for passing to java program. For example:
18 | #
19 | # {:mapred => {:reduce => {:tasks => 0}}}
20 | #
21 | # will transform to:
22 | #
23 | # '-Dmapred.reduce.tasks=0'
24 | #
25 | def java_args args
26 | to_dotted_args(args).map{|arg| "-D#{arg}"}
27 | end
28 |
29 | #
30 | # Uses recursion to take an arbitrarily nested hash and
31 |     # flatten it into dotted args. See 'java_args'. Can
32 | # you do it any better?
33 | #
34 | def to_dotted_args args
35 | args.map do |k,v|
36 | if v.is_a?(Hash)
37 | to_dotted_args(v).map do |s|
38 | [k,s].join(".")
39 | end
40 | else
41 | "#{k}=#{v}"
42 | end
43 | end.flatten
44 | end
45 |
46 | def cmd
47 | [
48 | "HADOOP_CLASSPATH=#{hadoop_classpath}",
49 | "#{hadoop_home}/bin/hadoop jar #{run_jar}",
50 | main_class,
51 | java_args(options),
52 | "-libjars #{libjars}",
53 | "#{input.join(',')}",
54 | "#{output.join(',')}"
55 | ].flatten.compact.join(" \t\\\n ")
56 | end
57 |
58 | end
59 | end
60 |
--------------------------------------------------------------------------------
/lib/swineherd/script/pig_script.rb:
--------------------------------------------------------------------------------
1 | module Swineherd::Script
2 | class PigScript
3 | include Common
4 |
5 | #
6 | # Not guaranteeing anything.
7 | #
8 | AVRO_PIG_MAPPING = {
9 | 'string' => 'chararray',
10 | 'int' => 'int',
11 | 'long' => 'long',
12 | 'float' => 'float',
13 | 'double' => 'double',
14 | 'bytes' => 'bytearray',
15 | 'fixed' => 'bytearray'
16 | }
17 |
18 | #
19 | # Simple utility function for mapping avro types to pig types
20 | #
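  |     # For example:
  |     #
  |     #   PigScript.avro_to_pig('string') #=> 'chararray'
  |     #   PigScript.avro_to_pig('bytes')  #=> 'bytearray'
  |     #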
21 | def self.avro_to_pig avro_type
22 | AVRO_PIG_MAPPING[avro_type]
23 | end
24 |
25 | #
26 | # Convert a generic hash of options {:foo => 'bar'} into
27 | # command line options for pig '-p FOO=bar'
28 | #
29 | def pig_args options
30 | options.map{|opt,val| "-p #{opt.to_s.upcase}=#{val}" }.join(' ')
31 | end
32 |
33 |
34 |
35 | def local_cmd
36 | Log.info("Launching Pig script in local mode")
37 | "pig -x local #{pig_args(@options)} #{script}"
38 | end
39 |
40 | def cmd
41 | Log.info("Launching Pig script in hadoop mode")
42 | "pig #{pig_args(@options)} #{script}"
43 | end
44 |
45 | end
46 | end
47 |
--------------------------------------------------------------------------------
/lib/swineherd/script/r_script.rb:
--------------------------------------------------------------------------------
1 | module Swineherd::Script
2 | class RScript
3 | include Common
4 |
5 | def local_cmd
6 | "/usr/bin/Rscript --vanilla #{script}"
7 | end
8 |
9 | def cmd
10 | local_cmd
11 | end
12 |
13 | end
14 | end
15 |
--------------------------------------------------------------------------------
/lib/swineherd/script/wukong_script.rb:
--------------------------------------------------------------------------------
1 | require 'pathname'
2 |
3 | module Swineherd::Script
4 | class WukongScript
5 | include Common
6 |
7 | def wukong_args options
8 | options.map{|param,val| "--#{param}=#{val}" }.join(' ')
9 | end
10 |
11 | #
12 | # Don't treat wukong scripts as templates
13 | #
14 | def script
15 | @source
16 | end
17 |
18 | def cmd
19 | raise "No wukong input specified" if input.empty?
20 | Log.info("Launching Wukong script in hadoop mode")
21 | "ruby #{script} #{wukong_args(@options)} --run #{input.join(',')} #{output.join(',')}"
22 | end
23 |
24 | def local_cmd
25 |       inputs = input.map{|path| File.directory?(path) ? File.join(path, "*") : path}.join(',')
26 | Log.info("Launching Wukong script in local mode")
27 | "ruby #{script} #{wukong_args(@options)} --run=local #{inputs} #{output.join(',')}"
28 | end
29 |
30 | end
31 | end
32 |
--------------------------------------------------------------------------------
/lib/swineherd/template.rb:
--------------------------------------------------------------------------------
1 | require 'erubis'
2 | require 'tempfile'
3 |
4 |
5 | # Template.new(script_path, attributes).substitute!
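  | #
  | # For example, given a (hypothetical) Erubis template 'pagerank.pig.erb' that
  | # contains <%= input %>, substitute! writes a filled-in copy to a tempfile and
  | # returns that tempfile's path:
  | #
  | #   path = Swineherd::Template.new('pagerank.pig.erb',
  | #                                  :input => 'data/seinfeld_network.tsv').substitute!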
6 |
7 | module Swineherd
8 |
9 | class Template
10 | attr_accessor :source_template, :attributes
11 |
12 | def initialize source_template, attributes
13 | @source_template = source_template
14 | @attributes = attributes
15 | end
16 |
17 | def compile!
18 | dest << Erubis::Eruby.new(source).result(attributes)
19 | dest << "\n"
20 | dest
21 | end
22 |
23 | def substitute!
24 | compile!
25 |       dest.flush # make sure the rendered script is on disk before handing off the path
26 | dest.path
27 | end
28 |
29 | protected
30 |
31 | def source
32 | File.open(source_template).read
33 | end
34 |
35 | def dest
36 | return @dest if @dest
37 | @dest ||= Tempfile.new(basename)
38 | end
39 |
40 | def basename
41 | File.basename(source_template)
42 | end
43 |
44 | end
45 | end
46 |
--------------------------------------------------------------------------------
/lib/swineherd/workflow.rb:
--------------------------------------------------------------------------------
1 | module Swineherd
2 | class Workflow
3 | attr_accessor :workdir, :outputs, :output_counts
4 |
5 | #
6 | # Create a new workflow and new namespace for this workflow
7 | #
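  |     # A rough sketch of assembling and running a flow (names below are
  |     # hypothetical; see examples/pagerank/pagerank.rb for a full example):
  |     #
  |     #   flow = Swineherd::Workflow.new(:my_flow) do
  |     #     self.workdir = '/tmp/my_flow'
  |     #     out = next_output('first_task')  # => "/tmp/my_flow/my_flow/first_task-0"
  |     #     Swineherd::Job.new(:my_flow) do
  |     #       name   'first_task'
  |     #       script Swineherd::Script::PigScript.new('scripts/first_task.pig', [], [out])
  |     #     end
  |     #   end
  |     #   flow.run 'first_task'
  |     #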
8 | def initialize flow_id, &blk
9 | @flow_id = flow_id
10 | @output_counts = Hash.new{|h,k| h[k] = 0}
11 | @outputs = Hash.new{|h,k| h[k] = []}
12 | namespace @flow_id do
13 | self.instance_eval(&blk)
14 | end
15 | end
16 |
17 | #
18 | # Get next logical output of taskname by incrementing internal counter
19 | #
20 | def next_output taskname
21 | raise "No working directory specified." unless @workdir
22 | @outputs[taskname] << "#{@workdir}/#{@flow_id}/#{taskname}-#{@output_counts[taskname]}"
23 | @output_counts[taskname] += 1
24 | latest_output(taskname)
25 | end
26 |
27 | #
28 | # Get latest output of taskname
29 | #
30 | def latest_output taskname
31 | @outputs[taskname].last
32 | end
33 |
34 | #
35 | # Runs workflow starting with taskname
36 | #
37 | def run taskname
38 | Log.info "Launching workflow task #{@flow_id}:#{taskname} ..."
39 | Rake::Task["#{@flow_id}:#{taskname}"].invoke
40 | Log.info "Workflow task #{@flow_id}:#{taskname} finished"
41 | end
42 |
43 | #
44 | # Describes the dependency tree of all tasks belonging to self
45 | #
46 | def describe
47 | Rake::Task.tasks.each do |t|
48 | Log.info("Task: #{t.name} [#{t.inspect}]") if t.name =~ /#{@flow_id}/
49 | end
50 | end
51 |
52 | end
53 | end
54 |
--------------------------------------------------------------------------------
/lib/swineherd/workflow/job.rb:
--------------------------------------------------------------------------------
1 | module Swineherd
2 |
3 | #
4 | # Job class is at its core a rake task
5 | #
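  |   # For example (names hypothetical), the following defines a rake task
  |   # 'histogram' that depends on 'pagerank' and runs its script when invoked:
  |   #
  |   #   Swineherd::Job.new(:my_flow) do
  |   #     name         'histogram'
  |   #     dependencies ['pagerank']
  |   #     script       Swineherd::Script::RScript.new('scripts/histogram.R')
  |   #   end
  |   #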
6 | class Job
7 |
8 | #
9 | # Initialize job, fill variables, and create rake task
10 | #
11 | def initialize job_id, &blk
12 | @job_id = job_id
13 | @name = ''
14 | @dependencies = []
15 | @script = ''
16 | self.instance_eval(&blk)
17 | raketask
18 | handle_dependencies
19 | end
20 |
21 | #
22 | # Will be the name of the rake task
23 | #
24 | def name name = nil
25 | return @name unless name
26 | @name = name
27 | end
28 |
29 | def script script = nil
30 | return @script unless script
31 | @script = script
32 | end
33 |
34 | #
35 | # An array of job names as dependencies
36 | #
37 | def dependencies dependencies = nil
38 | return @dependencies unless dependencies
39 | @dependencies = dependencies
40 | end
41 |
42 | def handle_dependencies
43 | return if dependencies.empty?
44 | task name => dependencies
45 | end
46 |
47 | def cmd
48 | @script.cmd
49 | end
50 |
51 | #
52 | # Every job is compiled into a rake task
53 | #
54 | def raketask
55 | task name do
56 | @script.run
57 | end
58 | end
59 | end
60 | end
61 |
--------------------------------------------------------------------------------
/notes.txt:
--------------------------------------------------------------------------------
1 | Logging:
2 |
3 | 1. All output from the launched workflow should go to a workflow log file
4 | 2. Hadoop output is special and should be pulled down from the jobtracker
5 | - jobconf.xml
6 | - job details page
7 |
8 | Workflow should specify a logdir, which defaults to workdir + '/logs'
9 |
10 | Fetching hadoop job stats:
11 |
12 | 1. Get job id
13 | 2. Use curl to fetch the latest logs listing: "http://jobtracker:50030/logs/history/"
14 | 3. Parse the logs listing and pull out the two urls we want (something-jobid.xml, something-jobid....)
15 | 4. Fetch the two urls we care about and dump into the workflow's log dir.
16 | 5. Possibly parse the results into an ongoing workflow-statistics.tsv file (steps 2-4 are sketched below)
17 |
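  | A rough Ruby sketch of steps 2-4 (the jobtracker host, job id, and the
  | listing's HTML layout are assumptions here):
  |
  |     require 'open-uri'
  |     require 'uri'
  |     require 'fileutils'
  |
  |     jobtracker = 'http://jobtracker:50030'
  |     job_id     = 'job_201106010000_0001'            # placeholder
  |     logdir     = 'workdir/logs'
  |
  |     FileUtils.mkdir_p logdir
  |     listing = open("#{jobtracker}/logs/history/").read
  |     # assumes a plain HTML directory index with href links to the job's files
  |     files = listing.scan(/href="([^"]*#{job_id}[^"]*)"/).flatten.uniq
  |     files.each do |f|
  |       File.open(File.join(logdir, File.basename(f)), 'w') do |out|
  |         out << open(URI.join("#{jobtracker}/logs/history/", f)).read
  |       end
  |     end
  |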
18 | Other output:
19 |
20 | Output that would otherwise go to the terminal (nohup.out or some such) should be collected and dumped into the logdir as well.
21 |
--------------------------------------------------------------------------------
/swineherd.gemspec:
--------------------------------------------------------------------------------
1 | # Generated by jeweler
2 | # DO NOT EDIT THIS FILE DIRECTLY
3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4 | # -*- encoding: utf-8 -*-
5 |
6 | Gem::Specification.new do |s|
7 | s.name = %q{swineherd}
8 | s.version = "0.0.4"
9 |
10 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11 | s.authors = ["Jacob Perkins"]
12 | s.date = %q{2011-06-22}
13 | s.description = %q{Swineherd is for running scripts and workflows on filesystems.}
14 | s.email = %q{jacob.a.perkins@gmail.com}
15 | s.executables = ["hdp-tree", "hadoop-stream"]
16 | s.extra_rdoc_files = [
17 | "LICENSE",
18 | "README.textile"
19 | ]
20 | s.files = [
21 | "LICENSE",
22 | "README.textile",
23 | "Rakefile",
24 | "VERSION",
25 | "bin/hadoop-stream",
26 | "bin/hdp-tree",
27 | "examples/pagerank/data/seinfeld_network.tsv",
28 | "examples/pagerank/pagerank.rb",
29 | "examples/pagerank/scripts/cut_off_list.rb",
30 | "examples/pagerank/scripts/histogram.R",
31 | "examples/pagerank/scripts/pagerank.pig",
32 | "examples/pagerank/scripts/pagerank_initialize.pig",
33 | "lib/swineherd.rb",
34 | "lib/swineherd/filesystem.rb",
35 | "lib/swineherd/filesystem/README_filesystem.textile",
36 | "lib/swineherd/filesystem/basefilesystem.rb",
37 | "lib/swineherd/filesystem/filesystems.rb",
38 | "lib/swineherd/filesystem/hadoopfilesystem.rb",
39 | "lib/swineherd/filesystem/localfilesystem.rb",
40 | "lib/swineherd/filesystem/localfs.rb",
41 | "lib/swineherd/filesystem/s3filesystem.rb",
42 | "lib/swineherd/script.rb",
43 | "lib/swineherd/script/hadoop_script.rb",
44 | "lib/swineherd/script/pig_script.rb",
45 | "lib/swineherd/script/r_script.rb",
46 | "lib/swineherd/script/wukong_script.rb",
47 | "lib/swineherd/template.rb",
48 | "lib/swineherd/workflow.rb",
49 | "lib/swineherd/workflow/job.rb",
50 | "notes.txt",
51 | "swineherd.gemspec",
52 | "tests/test_filesystem.rb",
53 | "tests/test_s3_filesystem.rb",
54 | "tests/testcfg.yaml"
55 | ]
56 | s.homepage = %q{http://github.com/Ganglion/swineherd}
57 | s.licenses = ["MIT"]
58 | s.require_paths = ["lib"]
59 | s.rubygems_version = %q{1.3.7}
60 | s.summary = %q{Flexible data workflow glue.}
61 |
62 | if s.respond_to? :specification_version then
63 | current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
64 | s.specification_version = 3
65 |
66 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
67 | s.add_development_dependency(%q