├── .gitignore ├── LICENSE ├── README.textile ├── Rakefile ├── VERSION ├── bin ├── hadoop-stream └── hdp-tree ├── examples └── pagerank │ ├── data │ └── seinfeld_network.tsv │ ├── pagerank.rb │ └── scripts │ ├── cut_off_list.rb │ ├── histogram.R │ ├── pagerank.pig │ └── pagerank_initialize.pig ├── lib ├── swineherd.rb └── swineherd │ ├── filesystem.rb │ ├── filesystem │ ├── README_filesystem.textile │ ├── basefilesystem.rb │ ├── filesystems.rb │ ├── hadoopfilesystem.rb │ ├── localfilesystem.rb │ ├── localfs.rb │ └── s3filesystem.rb │ ├── script.rb │ ├── script │ ├── hadoop_script.rb │ ├── pig_script.rb │ ├── r_script.rb │ └── wukong_script.rb │ ├── template.rb │ ├── workflow.rb │ └── workflow │ └── job.rb ├── notes.txt ├── swineherd.gemspec └── tests ├── test_filesystem.rb ├── test_s3_filesystem.rb └── testcfg.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | ## OS 2 | .DS_Store 3 | Icon? 4 | nohup.out 5 | .bak 6 | 7 | ## EDITORS 8 | \#* 9 | .\#* 10 | *~ 11 | *.swp 12 | REVISION 13 | TAGS* 14 | tmtags 15 | *_flymake.* 16 | *_flymake 17 | *.tmproj 18 | .project 19 | .settings 20 | 21 | ## COMPILED 22 | a.out 23 | *.o 24 | *.pyc 25 | *.so 26 | 27 | ## OTHER SCM 28 | .bzr 29 | .hg 30 | .svn 31 | 32 | ## PROJECT::GENERAL 33 | coverage 34 | rdoc 35 | doc 36 | pkg 37 | .yardoc 38 | *private* 39 | 40 | ## PROJECT::SPECIFIC 41 | 42 | *.rdb 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.textile: -------------------------------------------------------------------------------- 1 | h1. SwineHerd 2 | 3 | Swineherd is for running scripts and workflows on filesystems. 4 | 5 | h3. Outline 6 | 7 | A @workflow@ is built with @script@ objects and ran on a @filesystem@. 8 | 9 | h4. Script: 10 | 11 | A script has the following 12 | 13 | * @source@ - The source file used. 
These can be "Apache Pig":http://pig.apache.org/ scripts, "Wukong":http://github.com/infochimps/wukong scripts, even "R":http://www.r-project.org/ scripts. You can add your own scripts by subclassing the @script@ class. 14 | * @input@ - An array of input paths. 15 | * @output@ - An array of output paths. 16 | * @options@ - A ruby hash of options used as command line args. E.g. {:foo => 'bar'}. How these options are mapped to command line arguments is up to the particular script class. 17 | * @attributes@ - A ruby hash of parameters used for variable substitution. Every script is assumed to be (but not required to be) an eruby template. 18 | 19 | h4. Workflow: 20 | 21 | A workflow is built using rake @task@ objects that do nothing more than run scripts. A workflow 22 | 23 | * can be described with a directed dependency graph 24 | * has an @id@ which is used to run its tasks idempotently. At the moment it is the responsibility of the running process (or human being) to choose a suitable id. 25 | * manages intermediate outputs by using the @next_output@ and @latest_output@ methods. See the examples dir for usage. 26 | * A workflow has a working directory in which all intermediate outputs go 27 | ** These are named according to the rake task that created them 28 | 29 | h4. FileSystem 30 | 31 | Workflows are intended to run on filesystems. At the moment, the implemented filesystems are 32 | 33 | * @file@ - Local file system. Only thoroughly tested on Ubuntu Linux. 34 | * @hdfs@ - Hadoop distributed file system. Uses JRuby and the Apache Hadoop 0.20 API. 35 | * @s3@ - Uses the right_aws gem for interacting with Amazon Simple Storage Service (S3). 36 | 37 | Using the filesystem: 38 | 39 | Paths should be absolute. 40 | 41 | <pre>

 42 | # get a new instance of local filesystem and write to it
 43 | localfs = FileSystem.get(:file)
 44 | localfs.open("mylocalfile", 'w') do |f|
 45 |   f.write("Writing a string to a local file")
 46 | end
 47 | 
 48 | # get a new instance of hadoop filesystem and write to it
 49 | hadoopfs = FileSystem.get(:hdfs)
 50 | hadoopfs.open("myhadoopfile", 'w') do |f|
 51 |   f.write("Writing a string to an hdfs file")
 52 | end
 53 | 
 54 | # get a new instance of s3 filesystem and write to it
 55 | access_key_id     = '1234abcd'
 56 | secret_access_key = 'foobar1234'
 57 | s3fs = FileSystem.get(:s3, access_key_id, secret_access_key)
 58 | s3fs.mkpath 'mys3bucket' # bucket must exist before you can write to it
 59 | s3fs.open("mys3bucket/mys3file", 'w') do |f|
 60 |   f.write("Writing a string to an s3 file")
 61 | end
 62 | 
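Reading back and checking for paths works the same way on every filesystem. A minimal sketch, using only the methods shown above (@mylocalfile@ is the file written in the previous example):

<pre>
# check that the local file exists, then read it back
localfs = FileSystem.get(:file)
if localfs.exists?("mylocalfile")
  contents = localfs.open("mylocalfile").read
  puts contents
end
</pre>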
63 | 64 | h3. Working Example 65 | 66 | For the most up to date working example see the examples directory. Here's a simple example for running pagerank: 67 | 68 |

 69 | #!/usr/bin/env ruby
 70 | 
 71 | $LOAD_PATH << '../../lib'
 72 | require 'swineherd'        ; include Swineherd
 73 | require 'swineherd/script' ; include Swineherd::Script
 74 | require 'swineherd/filesystem'
 75 | 
 76 | Settings.define :flow_id,     :required => true,                     :description => "Flow id required to make run of workflow unique"
 77 | Settings.define :iterations,  :type => Integer,  :default => 10,     :description => "Number of pagerank iterations to run"
 78 | Settings.define :hadoop_home, :default => '/usr/local/share/hadoop', :description => "Path to hadoop config"
 79 | Settings.resolve!
 80 | 
 81 | flow = Workflow.new(Settings.flow_id) do
 82 | 
 83 |   # The filesystems we're going to be working with
 84 |   hdfs    = Swineherd::FileSystem.get(:hdfs)
 85 |   localfs = Swineherd::FileSystem.get(:file)
 86 | 
 87 |   # The scripts we're going to use
 88 |   initializer = PigScript.new('scripts/pagerank_initialize.pig')
 89 |   iterator    = PigScript.new('scripts/pagerank.pig')
 90 |   finisher    = WukongScript.new('scripts/cut_off_list.rb')
 91 |   plotter     = RScript.new('scripts/histogram.R')
 92 | 
 93 |   #
 94 |   # Runs simple pig script to initialize pagerank. We must specify the input
 95 |   # here as this is the first step in the workflow. The output attribute is to
 96 |   # ensure idempotency and the options attribute is the hash that will be
 97 |   # converted into command-line args for the pig interpreter.
 98 |   #
 99 |   task :pagerank_initialize do
100 |     initializer.options = {:adjlist => "/tmp/pagerank_example/seinfeld_network.tsv", :initgrph => next_output(:pagerank_initialize)}
101 |     initializer.run(:hadoop) unless hdfs.exists? latest_output(:pagerank_initialize)
102 |   end
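  # (As noted in the Workflow section above, next_output(:some_task) hands back a
  #  fresh numbered path under the workflow's working directory for that task, and
  #  latest_output(:some_task) returns the most recently created one; that is what
  #  lets each task be skipped idempotently when the flow is re-run.)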
103 | 
104 |   #
105 |   # Runs multiple iterations of pagerank with another pig script and manages all
106 |   # the intermediate outputs.
107 |   #
108 |   task :pagerank_iterate => [:pagerank_initialize] do
109 |     iterator.options[:damp]           = '0.85f'
110 |     iterator.options[:curr_iter_file] = latest_output(:pagerank_initialize)
111 |     Settings.iterations.times do
112 |       iterator.options[:next_iter_file] = next_output(:pagerank_iterate)
113 |       iterator.run(:hadoop) unless hdfs.exists? latest_output(:pagerank_iterate)
114 |       iterator.refresh!
115 |       iterator.options[:curr_iter_file] = latest_output(:pagerank_iterate)
116 |     end
117 |   end
118 | 
119 |   #
120 |   # Here we use a wukong script to cut off the last field (a big pig bag of
121 |   # links). Notice how every wukong script MUST have an input but pig scripts do
122 |   # not.
123 |   #
124 |   task :cut_off_adjacency_list => [:pagerank_iterate] do
125 |     finisher.input  << latest_output(:pagerank_iterate)
126 |     finisher.output << next_output(:cut_off_adjacency_list)
127 |     finisher.run :hadoop unless hdfs.exists? latest_output(:cut_off_adjacency_list)
128 |   end
129 | 
130 |   #
131 |   # We want to pull down one result file, merge the part-000.. files into one file
132 |   #
133 |   task :merge_results => [:cut_off_adjacency_list] do
134 |     merged_results = next_output(:merge_results)
135 |     hdfs.merge(latest_output(:cut_off_adjacency_list), merged_results) unless hdfs.exists? merged_results
136 |   end
137 | 
138 |   #
139 |   # Cat results into a local directory with the same structure
140 |   # eg. #{work_dir}/#{flow_id}/pull_down_results-0.
141 |   #
142 |   # FIXME: Bridging filesystems is cludgey.
143 |   #
144 |   task :pull_down_results => [:merge_results] do
145 |     local_results = next_output(:pull_down_results)
146 |     hdfs.copy_to_local(latest_output(:merge_results), local_results) unless localfs.exists? local_results
147 |   end
148 | 
149 |   #
150 |   # Plot 2nd column of the result as a histogram (requires R and
151 |   # ggplot2). Note that the output here is a png file but doesn't have that
152 |   # extension. Ensmarten me as to the right way to handle that?
153 |   #
154 |   task :plot_results =>  [:pull_down_results] do
155 |     plotter.attributes = {
156 |       :pagerank_data => latest_output(:pull_down_results),
157 |       :plot_file     => next_output(:plot_results), # <-- this will be a png...
158 |       :raw_rank      => "aes(x=d$V2)"
159 |     }
160 |     plotter.run(:local) unless localfs.exists? latest_output(:plot_results)
161 |   end
162 | 
163 | end
164 | 
165 | flow.workdir = "/tmp/pagerank_example"
166 | flow.describe
167 | flow.run(:plot_results)
168 | 
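Assuming Hadoop, Pig, Wukong, and R/ggplot2 are installed, and the Seinfeld adjacency list has been copied up to @/tmp/pagerank_example/seinfeld_network.tsv@ on the HDFS, the flow can be kicked off from the @examples/pagerank@ directory with something like the following (the flow id is just an example value):

<pre>
ruby pagerank.rb --flow_id=pagerank_test_1 --iterations=10
</pre>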
169 | 170 | h3. Utils 171 | 172 | There's a fun little program to emphasize the ease of using the filesystem abstraction called 'hdp-tree': 173 | 174 |

175 | $: bin/hdp-tree /tmp/my_hdfs_directory
176 | --- 
177 | /tmp/my_hdfs_directory: 
178 |   - my_hdfs_directory: 
179 |       - sub_dir_a: leaf_file_1
180 |       - sub_dir_a: leaf_file_2
181 |       - sub_dir_a: leaf_file_3
182 |   - my_hdfs_directory: 
183 |       - sub_dir_b: leaf_file_1
184 |       - sub_dir_b: leaf_file_2
185 |       - sub_dir_b: leaf_file_3
186 |   - my_hdfs_directory: 
187 |       - sub_dir_c: leaf_file_1
188 |       - sub_dir_c: leaf_file_2
189 |       - sub_dir_c: leaf_file_3
190 |       - sub_dir_c: 
191 |           - sub_sub_dir_a: yet_another_leaf_file
192 |       - sub_dir_c: sub_sub_dir_b
193 |       - sub_dir_c: sub_sub_dir_c
194 | 
195 | 196 | I know, it's not as pretty as unix tree, but this IS GitHub... 197 | 198 | h3. TODO 199 | 200 | * next task in a workflow should NOT run if the previous step failed 201 | ** this is made difficult by the fact that, sometimes?, when a pig script fails it still returns a 0 exit status 202 | ** same for wukong scripts 203 | * add a @job@ object that implements a @not_if@ function. this way a @workflow@ will be constructed of @job@ objects 204 | ** a @job@ will do nothing more than execute the ruby code in its (run?) block, unless @not_if@ is true 205 | ** this way we can put @script@ objects inside a @job@ and only run under certain conditions that the user specifies when 206 | they create the @job@ 207 | * implement FTP filesystem interfaces 208 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'rake' 3 | 4 | require 'jeweler' 5 | Jeweler::Tasks.new do |gem| 6 | # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options 7 | gem.name = "swineherd" 8 | gem.homepage = "http://github.com/Ganglion/swineherd" 9 | gem.license = "MIT" 10 | gem.summary = %Q{Flexible data workflow glue.} 11 | gem.description = %Q{Swineherd is for running scripts and workflows on filesystems.} 12 | gem.email = "jacob.a.perkins@gmail.com" 13 | gem.authors = ["Jacob Perkins"] 14 | # Include your dependencies below. Runtime dependencies are required when using your gem, 15 | # and development dependencies are only needed for development (ie running rake tasks, tests, etc) 16 | # gem.add_runtime_dependency 'jabber4r', '> 0.1' 17 | # gem.add_development_dependency 'rspec', '> 1.2.3' 18 | gem.add_development_dependency "yard", "~> 0.6.0" 19 | gem.add_development_dependency "jeweler", "~> 1.5.2" 20 | gem.add_development_dependency "rcov", ">= 0" 21 | gem.add_dependency 'configliere' 22 | gem.add_dependency 'gorillib' 23 | gem.add_dependency 'erubis' 24 | gem.add_dependency 'right_aws' 25 | end 26 | Jeweler::RubygemsDotOrgTasks.new 27 | 28 | 29 | require 'yard' 30 | YARD::Rake::YardocTask.new 31 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.0.4 -------------------------------------------------------------------------------- /bin/hadoop-stream: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'rubygems' 4 | require 'rake' 5 | require 'swineherd' ; include Swineherd 6 | 7 | # 8 | # Uses hadoop and rake's multitask capability to stream many source 9 | # files in parallel into a single destination directory. 10 | # 11 | 12 | Settings.define :input, :type => Array, :required => true, :description => "Comma separated list of directories (hdfs paths, s3 paths, etc) to stream" 13 | Settings.define :output, :required => true, :description => "Destination directory (s3 or hdfs)" 14 | Settings.resolve!
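#
# Example invocation (paths here are hypothetical; --input takes the comma
# separated list described above):
#
#   bin/hadoop-stream --input=hdfs:///data/part_a,hdfs:///data/part_b --output=hdfs:///data/streamed
#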
15 | 16 | # 17 | # Takes a hash of paths eg: {'filename' => 'full path'} and defines 18 | # a new streaming task for each one 19 | # 20 | def define_tasks list_of_tasks 21 | list_of_tasks.each do |basename, source| 22 | task basename do 23 | destination = File.join(Settings.output, basename) # each file gets its own output 24 | HDFS.stream(source, destination) 25 | end 26 | end 27 | end 28 | 29 | # Create a list of tasks, one per file 30 | list_of_tasks = Settings.input.inject({}){|list, path| list[File.basename(path)] = path; list} 31 | define_tasks list_of_tasks 32 | 33 | multitask :stream_all => list_of_tasks.keys 34 | 35 | Rake::MultiTask["stream_all"].invoke 36 | -------------------------------------------------------------------------------- /bin/hdp-tree: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env jruby 2 | 3 | require 'swineherd' 4 | 5 | # 6 | # Creates a 'tree' view of an hdfs path. It's not as pretty as the 7 | # unix tree command but that's only because I'm not smart enough to 8 | # print the hierarchy properly. 9 | # 10 | 11 | FS = Swineherd::FileSystem.get(:hdfs) 12 | path = ARGV[0] 13 | 14 | # Recursively list paths 15 | def lr path 16 | paths = FS.entries(path) 17 | if (paths && !paths.empty?) 18 | paths.map{|e| {File.basename(path) => lr(e)}}.flatten 19 | else 20 | File.basename(path) 21 | end 22 | end 23 | 24 | 25 | tree = {File.dirname(path) => lr(path)}.to_yaml 26 | puts tree 27 | -------------------------------------------------------------------------------- /examples/pagerank/data/seinfeld_network.tsv: -------------------------------------------------------------------------------- 1 | jerry costanza81 2 | jerry ElaineBenes 3 | jerry kramer 4 | jerry NewmanUSPS 5 | jerry THE_REAL_PUDDY 6 | jerry JPeterman 7 | jerry FRANKCOSTANZA 8 | costanza81 jerry 9 | costanza81 ElaineBenes 10 | costanza81 kramer 11 | costanza81 NewmanUSPS 12 | costanza81 THE_REAL_PUDDY 13 | costanza81 JPeterman 14 | costanza81 FRANKCOSTANZA 15 | ElaineBenes jerry 16 | ElaineBenes costanza81 17 | ElaineBenes kramer 18 | ElaineBenes THE_REAL_PUDDY 19 | ElaineBenes JPeterman 20 | kramer jerry 21 | kramer costanza81 22 | kramer ElaineBenes 23 | kramer NewmanUSPS 24 | kramer THE_REAL_PUDDY 25 | kramer JPeterman 26 | kramer FRANKCOSTANZA 27 | NewmanUSPS jerry 28 | NewmanUSPS costanza81 29 | NewmanUSPS ElaineBenes 30 | NewmanUSPS kramer 31 | NewmanUSPS THE_REAL_PUDDY 32 | NewmanUSPS JPeterman 33 | NewmanUSPS FRANKCOSTANZA 34 | THE_REAL_PUDDY jerry 35 | THE_REAL_PUDDY costanza81 36 | THE_REAL_PUDDY ElaineBenes 37 | THE_REAL_PUDDY kramer 38 | THE_REAL_PUDDY NewmanUSPS 39 | THE_REAL_PUDDY JPeterman 40 | THE_REAL_PUDDY FRANKCOSTANZA 41 | THE_REAL_PUDDY Vegetable_Lasagna 42 | FRANKCOSTANZA jerry 43 | FRANKCOSTANZA costanza81 44 | FRANKCOSTANZA kramer 45 | jerry MortySeinfeld 46 | jerry HelenSeinfeld 47 | jerry Izzy_Mandelbaum 48 | jerry UncleLEO 49 | jerry Artie_Levine 50 | MortySeinfeld UncleLEO 51 | MortySeinfeld HelenSeinfeld 52 | MortySeinfeld Cousin_Jeffrey 53 | Izzy_Mandelbaum jerry 54 | UncleLEO jerry 55 | UncleLEO MortySeinfeld 56 | UncleLEO HelenSeinfeld 57 | UncleLEO Cousin_Jeffrey 58 | UncleLEO Babs_Kramer 59 | Babs_Kramer UncleLEO 60 | Cousin_Jeffrey jerry 61 | Cousin_Jeffrey MortySeinfeld 62 | Cousin_Jeffrey UncleLEO 63 | jerry Nana 64 | Nana MortySeinfeld 65 | MortySeinfeld Nana 66 | Cousin_Jeffrey Nana 67 | UncleLEO Nana 68 | Nana UncleLEO 69 | JackKlompus MortySeinfeld 70 | MortySeinfeld JackKlompus 71 | Dolores jerry 72 | MarlatheVirgin 
jerry 73 | TiaVanCamp jerry 74 | Rachel_Goldstein jerry 75 | meryl jerry 76 | MissRhodeIsland jerry 77 | Pam jerry 78 | Sheila jerry 79 | MelissaFlyingFree jerry 80 | jerry Dolores 81 | jerry MarlatheVirgin 82 | jerry TiaVanCamp 83 | jerry Rachel_Goldstein 84 | jerry meryl 85 | jerry MissRhodeIsland 86 | jerry Pam 87 | jerry Sheila 88 | jerry MelissaFlyingFree 89 | jerry Laura 90 | jerry Sandy 91 | Laura Sandy 92 | Laura jerry 93 | Sandy jerry 94 | kramer MissRhodeIsland 95 | kramer Pam 96 | jerry Jenna 97 | Jenna jerry 98 | jerry bania 99 | costanza81 bania 100 | bania jerry 101 | bania The_Soup_Nazi 102 | bania Poppie 103 | bania Jenna 104 | jerry Noreen 105 | jerry JackKlompus 106 | jerry Milos 107 | jerry JeanPaul_JeanPaul 108 | jerry FusilliJerry 109 | FusilliJerry jerry 110 | FusilliJerry kramer 111 | kramer FusilliJerry 112 | FusilliJerry FRANKCOSTANZA 113 | jerry pez 114 | jerry superman 115 | jerry bigstein 116 | Milos jerry 117 | jerry Roy_the_Dentist 118 | jerry BabuBhatt 119 | kramer BabuBhatt 120 | BabuBhatt kramer 121 | kramer Poppie 122 | Poppie kramer 123 | Poppie ElaineBenes 124 | ElaineBenes Poppie 125 | jerry Poppie 126 | jerry Shaky_the_Mohel 127 | jerry bubble_boy 128 | bubble_boy jerry 129 | MatthewSeinfeldFan jerry 130 | FragileFrankieMerman jerry 131 | jerry FragileFrankieMerman 132 | costanza81 FragileFrankieMerman 133 | costanza81 EstelleC 134 | FRANKCOSTANZA EstelleC 135 | EstelleC costanza81 136 | EstelleC FRANKCOSTANZA 137 | FRANKCOSTANZA Lloyd_Braun 138 | EstelleC Lloyd_Braun 139 | Lloyd_Braun FRANKCOSTANZA 140 | Lloyd_Braun costanza81 141 | Lloyd_Braun EstelleC 142 | kramer MrWilhelm 143 | costanza81 MrWilhelm 144 | costanza81 Allison 145 | Allison costanza81 146 | costanza81 LindsayEnright 147 | LindsayEnright costanza81 148 | costanza81 marisa_tomei 149 | costanza81 SusanRoss 150 | SusanRoss MrandMrsRoss 151 | MrandMrsRoss SusanRoss 152 | SusanRoss jerry 153 | jerry SusanRoss 154 | SusanRoss ElaineBenes 155 | kramer SusanRoss 156 | SusanRoss Russell_Dalrymple 157 | Russell_Dalrymple SusanRoss 158 | Russell_Dalrymple ElaineBenes 159 | SallyWeaver SusanRoss 160 | SusanRoss SallyWeaver 161 | SallyWeaver MrandMrsRoss 162 | SallyWeaver jerry 163 | WyckThayer MrandMrsRoss 164 | MrandMrsRoss WyckThayer 165 | SusanRoss WyckThayer 166 | WyckThayer SusanRoss 167 | costanza81 MrKruger 168 | MrKruger costanza81 169 | ElaineBenes MrKruger 170 | costanza81 guitarbern 171 | costanza81 intangibles 172 | costanza81 cushman 173 | cushman bigstein 174 | bigstein cushman 175 | costanza81 JonVoight 176 | costanza81 bubble_boy 177 | costanza81 Pastrami 178 | costanza81 bigstein 179 | Victoria bigstein 180 | bigstein Victoria 181 | cushman Victoria 182 | Victoria cushman 183 | bigstein intangibles 184 | bigstein guitarbern 185 | guitarbern intangibles 186 | guitarbern bigstein 187 | intangibles Victoria 188 | intangibles bigstein 189 | bubble_boy trivial_pursuit 190 | costanza81 StankyHanke 191 | ElaineBenes MrLippman 192 | ElaineBenes MrPitt 193 | ElaineBenes Jack_The_Wiz 194 | Jack_The_Wiz ElaineBenes 195 | TheSuzie ElaineBenes 196 | jerry TheSuzie 197 | TheSuzie Peggy 198 | Peggy TheSuzie 199 | Peggy JPeterman 200 | JPeterman kramer 201 | JPeterman TheSuzie 202 | ElaineBenes RobertKennedyJr 203 | MarlatheVirgin RobertKennedyJr 204 | RobertKennedyJr MarlatheVirgin 205 | Jackie_Chiles MarlatheVirgin 206 | Sue_Ellen_Mischke RobertKennedyJr 207 | JPeterman RobertKennedyJr 208 | RobertKennedyJr JPeterman 209 | MrPitt RobertKennedyJr 210 | RobertKennedyJr MrPitt 211 | kramer 
TinaRobbins 212 | TinaRobbins kramer 213 | ElaineBenes TinaRobbins 214 | TinaRobbins ElaineBenes 215 | Jake_Jarmel ElaineBenes 216 | Noreen 217 | ElaineBenes HalKitzmiller 218 | HalKitzmiller ElaineBenes 219 | kramer HalKitzmiller 220 | HalKitzmiller kramer 221 | Joel_Rifkin ElaineBenes 222 | Darryl ElaineBenes 223 | NedIsakoff ElaineBenes 224 | ElaineBenes NedIsakoff 225 | ElaineBenes Carl_Farbman 226 | Carl_Farbman JPeterman 227 | JPeterman Carl_Farbman 228 | CrazyJoeDavola jerry 229 | CrazyJoeDavola costanza81 230 | CrazyJoeDavola ElaineBenes 231 | CrazyJoeDavola kramer 232 | CrazyJoeDavola NewmanUSPS 233 | costanza81 CrazyJoeDavola 234 | ElaineBenes CrazyJoeDavola 235 | kramer CrazyJoeDavola 236 | NewmanUSPS CrazyJoeDavola 237 | jerry DrTimWhatley 238 | costanza81 DrTimWhatley 239 | ElaineBenes DrTimWhatley 240 | kramer DrTimWhatley 241 | NewmanUSPS DrTimWhatley 242 | DrTimWhatley jerry 243 | DrTimWhatley costanza81 244 | DrTimWhatley ElaineBenes 245 | DrTimWhatley kramer 246 | jerry TheDrake 247 | costanza81 TheDrake 248 | ElaineBenes TheDrake 249 | kramer TheDrake 250 | NewmanUSPS TheDrake 251 | TheDrake jerry 252 | TheDrake costanza81 253 | TheDrake ElaineBenes 254 | TheDrake kramer 255 | jerry JoeMayo 256 | ElaineBenes JoeMayo 257 | costanza81 JoeMayo 258 | JoeMayo jerry 259 | JoeMayo ElaineBenes 260 | JoeMayo costanza81 261 | jerry Alec_Berg 262 | Alec_Berg jerry 263 | ElaineBenes Sue_Ellen_Mischke 264 | Sue_Ellen_Mischke ElaineBenes 265 | Sue_Ellen_Mischke jerry 266 | jerry Sue_Ellen_Mischke 267 | kramer Sue_Ellen_Mischke 268 | kramer Mickey_Abbott 269 | jerry Mickey_Abbott 270 | costanza81 Mickey_Abbott 271 | NewmanUSPS Mickey_Abbott 272 | Mickey_Abbott kramer 273 | Mickey_Abbott jerry 274 | Mickey_Abbott costanza81 275 | Mickey_Abbott NewmanUSPS 276 | Babs_Kramer NewmanUSPS 277 | NewmanUSPS Babs_Kramer 278 | kramer Bob_Sacamano 279 | kramer Lomez 280 | kramer JayRiemenschneider 281 | kramer CorkyRamirez 282 | kramer LenNicademo 283 | kramer Specter 284 | kramer Brody 285 | Bob_Sacamano kramer 286 | Lomez kramer 287 | JayRiemenschneider kramer 288 | CorkyRamirez kramer 289 | LenNicademo kramer 290 | Specter kramer 291 | Brody kramer 292 | Bob_Sacamano Lomez 293 | Bob_Sacamano JayRiemenschneider 294 | Bob_Sacamano CorkyRamirez 295 | Bob_Sacamano LenNicademo 296 | Bob_Sacamano Specter 297 | Bob_Sacamano Brody 298 | Bob_Sacamano jerry 299 | Brody Bob_Sacamano 300 | Brody Lomez 301 | Brody JayRiemenschneider 302 | Brody CorkyRamirez 303 | Brody LenNicademo 304 | Brody Specter 305 | CorkyRamirez Bob_Sacamano 306 | CorkyRamirez Lomez 307 | CorkyRamirez JayRiemenschneider 308 | CorkyRamirez Specter 309 | CorkyRamirez Brody 310 | JayRiemenschneider Bob_Sacamano 311 | JayRiemenschneider Lomez 312 | JayRiemenschneider CorkyRamirez 313 | JayRiemenschneider LenNicademo 314 | JayRiemenschneider Brody 315 | LenNicademo Bob_Sacamano 316 | LenNicademo Lomez 317 | LenNicademo JayRiemenschneider 318 | LenNicademo CorkyRamirez 319 | LenNicademo Brody 320 | Lomez Bob_Sacamano 321 | Lomez JayRiemenschneider 322 | Lomez CorkyRamirez 323 | Lomez LenNicademo 324 | Lomez Brody 325 | Specter Bob_Sacamano 326 | Specter Lomez 327 | Specter CorkyRamirez 328 | kramer FranklinDelanoRomanowski 329 | kramer SalBass 330 | kramer EstelleC 331 | kramer Vegetable_Lasagna 332 | kramer MortySeinfeld 333 | kramer Noreen 334 | kramer Babs_Kramer 335 | kramer Shaky_the_Mohel 336 | kramer assman 337 | assman DrTimWhatley 338 | DrTimWhatley assman 339 | kramer Stan_the_Caddy 340 | Stan_the_Caddy Jackie_Chiles 341 | 
NewmanUSPS Jackie_Chiles 342 | Jackie_Chiles NewmanUSPS 343 | kramer Jackie_Chiles 344 | Jackie_Chiles kramer 345 | EstelleC kramer 346 | Vegetable_Lasagna kramer 347 | MortySeinfeld kramer 348 | Noreen kramer 349 | Babs_Kramer kramer 350 | Shaky_the_Mohel kramer 351 | Bob_Cobb kramer 352 | kramer Bob_Cobb 353 | Bob_Cobb FRANKCOSTANZA 354 | FRANKCOSTANZA Bob_Cobb 355 | kramer Earl_Haffler 356 | Earl_Haffler kramer 357 | kramer MikeMoffit 358 | MikeMoffit kramer 359 | MikeMoffit jerry 360 | NewmanUSPS Henry_Atkins 361 | AvisRental jerry 362 | jerry The_Soup_Nazi 363 | costanza81 The_Soup_Nazi 364 | ElaineBenes The_Soup_Nazi 365 | kramer The_Soup_Nazi 366 | NewmanUSPS The_Soup_Nazi 367 | FRANKCOSTANZA The_Soup_Nazi 368 | THE_REAL_PUDDY The_Soup_Nazi 369 | jerry ArtVandelay 370 | costanza81 ArtVandelay 371 | ElaineBenes ArtVandelay 372 | kramer ArtVandelay 373 | jerry Kel_Varnsen 374 | costanza81 Kel_Varnsen 375 | ElaineBenes Kel_Varnsen 376 | kramer Kel_Varnsen 377 | jerry HEPennypacker 378 | costanza81 HEPennypacker 379 | ElaineBenes HEPennypacker 380 | kramer HEPennypacker 381 | jerry MartinvanNostrand 382 | costanza81 MartinvanNostrand 383 | ElaineBenes MartinvanNostrand 384 | kramer MartinvanNostrand 385 | jerry WandaPepper 386 | costanza81 WandaPepper 387 | ElaineBenes WandaPepper 388 | kramer WandaPepper 389 | ArtVandelay ArtVandelay 390 | ArtVandelay Kel_Varnsen 391 | ArtVandelay HEPennypacker 392 | ArtVandelay MartinvanNostrand 393 | ArtVandelay WandaPepper 394 | Kel_Varnsen ArtVandelay 395 | Kel_Varnsen HEPennypacker 396 | Kel_Varnsen MartinvanNostrand 397 | Kel_Varnsen WandaPepper 398 | HEPennypacker ArtVandelay 399 | HEPennypacker Kel_Varnsen 400 | HEPennypacker MartinvanNostrand 401 | HEPennypacker WandaPepper 402 | MartinvanNostrand ArtVandelay 403 | MartinvanNostrand Kel_Varnsen 404 | MartinvanNostrand HEPennypacker 405 | MartinvanNostrand WandaPepper 406 | WandaPepper ArtVandelay 407 | WandaPepper Kel_Varnsen 408 | WandaPepper HEPennypacker 409 | WandaPepper MartinvanNostrand 410 | Kevin Gene 411 | Kevin Feldman 412 | Kevin Vargas 413 | Kevin ElaineBenes 414 | Gene Kevin 415 | Gene Feldman 416 | Gene Vargas 417 | Gene ElaineBenes 418 | Feldman Kevin 419 | Feldman Gene 420 | Feldman Vargas 421 | Feldman ElaineBenes 422 | Vargas Kevin 423 | Vargas Gene 424 | Vargas Feldman 425 | Vargas ElaineBenes 426 | ElaineBenes Kevin 427 | ElaineBenes Gene 428 | ElaineBenes Feldman 429 | ElaineBenes Vargas 430 | -------------------------------------------------------------------------------- /examples/pagerank/pagerank.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | $LOAD_PATH << '../../lib' 4 | require 'swineherd' ; include Swineherd 5 | require 'swineherd/script' ; include Swineherd::Script 6 | require 'swineherd/filesystem' 7 | 8 | Settings.define :flow_id, :required => true, :description => "Flow id required to make run of workflow unique" 9 | Settings.define :iterations, :type => Integer, :default => 10, :description => "Number of pagerank iterations to run" 10 | Settings.define :hadoop_home, :default => '/usr/local/share/hadoop', :description => "Path to hadoop config" 11 | Settings.resolve! 
12 | 13 | flow = Workflow.new(Settings.flow_id) do 14 | 15 | # The filesystems we're going to be working with 16 | hdfs = Swineherd::FileSystem.get(:hdfs) 17 | localfs = Swineherd::FileSystem.get(:file) 18 | 19 | # The scripts we're going to use 20 | initializer = PigScript.new('scripts/pagerank_initialize.pig') 21 | iterator = PigScript.new('scripts/pagerank.pig') 22 | finisher = WukongScript.new('scripts/cut_off_list.rb') 23 | plotter = RScript.new('scripts/histogram.R') 24 | 25 | # 26 | # Runs simple pig script to initialize pagerank. We must specify the input 27 | # here as this is the first step in the workflow. The output attribute is to 28 | # ensure idempotency and the options attribute is the hash that will be 29 | # converted into command-line args for the pig interpreter. 30 | # 31 | task :pagerank_initialize do 32 | initializer.options = {:adjlist => "/tmp/pagerank_example/seinfeld_network.tsv", :initgrph => next_output(:pagerank_initialize)} 33 | initializer.run(:hadoop) unless hdfs.exists? latest_output(:pagerank_initialize) 34 | end 35 | 36 | # 37 | # Runs multiple iterations of pagerank with another pig script and manages all 38 | # the intermediate outputs. 39 | # 40 | task :pagerank_iterate => [:pagerank_initialize] do 41 | iterator.options[:damp] = '0.85f' 42 | iterator.options[:curr_iter_file] = latest_output(:pagerank_initialize) 43 | Settings.iterations.times do 44 | iterator.options[:next_iter_file] = next_output(:pagerank_iterate) 45 | iterator.run(:hadoop) unless hdfs.exists? latest_output(:pagerank_iterate) 46 | iterator.refresh! 47 | iterator.options[:curr_iter_file] = latest_output(:pagerank_iterate) 48 | end 49 | end 50 | 51 | # 52 | # Here we use a wukong script to cut off the last field (a big pig bag of 53 | # links). Notice how every wukong script MUST have an input but pig scripts do 54 | # not. 55 | # 56 | task :cut_off_adjacency_list => [:pagerank_iterate] do 57 | finisher.input << latest_output(:pagerank_iterate) 58 | finisher.output << next_output(:cut_off_adjacency_list) 59 | finisher.run :hadoop unless hdfs.exists? latest_output(:cut_off_adjacency_list) 60 | end 61 | 62 | # 63 | # We want to pull down one result file, merge the part-000.. files into one file 64 | # 65 | task :merge_results => [:cut_off_adjacency_list] do 66 | merged_results = next_output(:merge_results) 67 | hdfs.merge(latest_output(:cut_off_adjacency_list), merged_results) unless hdfs.exists? merged_results 68 | end 69 | 70 | # 71 | # Cat results into a local directory with the same structure 72 | # eg. #{work_dir}/#{flow_id}/pull_down_results-0. 73 | # 74 | # FIXME: Bridging filesystems is cludgey. 75 | # 76 | task :pull_down_results => [:merge_results] do 77 | local_results = next_output(:pull_down_results) 78 | hdfs.copy_to_local(latest_output(:merge_results), local_results) unless localfs.exists? local_results 79 | end 80 | 81 | # 82 | # Plot 2nd column of the result as a histogram (requires R and 83 | # ggplot2). Note that the output here is a png file but doesn't have that 84 | # extension. Ensmarten me as to the right way to handle that? 85 | # 86 | task :plot_results => [:pull_down_results] do 87 | plotter.attributes = { 88 | :pagerank_data => latest_output(:pull_down_results), 89 | :plot_file => next_output(:plot_results), # <-- this will be a png... 90 | :raw_rank => "aes(x=d$V2)" 91 | } 92 | plotter.run(:hadoop) unless localfs.exists? 
latest_output(:plot_results) 93 | end 94 | 95 | end 96 | 97 | flow.workdir = "/tmp/pagerank_example" 98 | flow.describe 99 | flow.run(:plot_results) 100 | -------------------------------------------------------------------------------- /examples/pagerank/scripts/cut_off_list.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'rubygems' 4 | require 'wukong' 5 | 6 | # 7 | # Does the very simple job of cutting of the giant adjacency list 8 | # 9 | class CutMapper < Wukong::Streamer::RecordStreamer 10 | def process *args 11 | node_a, node_b, list = args 12 | yield [node_a, node_b] 13 | end 14 | end 15 | 16 | Wukong::Script.new(CutMapper, nil).run 17 | -------------------------------------------------------------------------------- /examples/pagerank/scripts/histogram.R: -------------------------------------------------------------------------------- 1 | library(ggplot2); 2 | png('<%= plot_file %>', width=900, res=132); 3 | d <- read.table('<%= pagerank_data %>', header=FALSE, sep='\t'); 4 | p <- ggplot(d, <%= raw_rank %>) + geom_histogram() + xlab("") + ylab(""); 5 | p; 6 | -------------------------------------------------------------------------------- /examples/pagerank/scripts/pagerank.pig: -------------------------------------------------------------------------------- 1 | -- 2 | -- Runs exactly one pagerank iteration 3 | -- 4 | network = LOAD '$CURR_ITER_FILE' AS (node_a:chararray, rank:float, out_links:bag { link:tuple (node_b:chararray) }); 5 | sent_shares = FOREACH network GENERATE FLATTEN(out_links) AS node_b, (float)(rank / (float)SIZE(out_links)) AS share:float; 6 | sent_links = FOREACH network GENERATE node_a, out_links; 7 | rcvd_shares = COGROUP sent_links BY node_a INNER, sent_shares BY node_b; 8 | next_iter = FOREACH rcvd_shares 9 | { 10 | raw_rank = (float)SUM(sent_shares.share); 11 | -- treat the case that a node has no in links 12 | damped_rank = ((raw_rank IS NOT NULL AND raw_rank > 1.0e-12f) ? raw_rank*$DAMP + 1.0f - $DAMP : 0.0f); 13 | GENERATE 14 | group AS node_a, 15 | damped_rank AS rank, 16 | FLATTEN(sent_links.out_links) -- hack, should only be one bag, unbag it 17 | ; 18 | }; 19 | 20 | STORE next_iter INTO '$NEXT_ITER_FILE'; 21 | -------------------------------------------------------------------------------- /examples/pagerank/scripts/pagerank_initialize.pig: -------------------------------------------------------------------------------- 1 | -- 2 | -- Create initial graph on which to iterate the pagerank algorithm. 3 | -- 4 | 5 | -- 6 | -- Generate a unique list of nodes with in links to cogroup on. This allows 7 | -- us to treat the case where nodes have in links but no out links. 8 | -- 9 | network = LOAD '$ADJLIST' AS (node_a:chararray, node_b:chararray); 10 | cut_rhs = FOREACH network GENERATE node_b; 11 | uniq_rhs = DISTINCT cut_rhs; 12 | list_links = COGROUP network BY node_a, uniq_rhs BY node_b; 13 | count_links = FOREACH list_links 14 | { 15 | -- if network.node_b is empty there are no out links, set to dummy value 16 | out_links = (IsEmpty(network.node_b) ? 
{('dummy')} : network.node_b); 17 | GENERATE 18 | group AS node_a, 19 | 1.0f AS rank, 20 | out_links AS out_links 21 | ; 22 | }; 23 | 24 | STORE count_links INTO '$INITGRPH'; 25 | -------------------------------------------------------------------------------- /lib/swineherd.rb: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'configliere' ; Configliere.use(:commandline, :env_var, :define) 3 | require 'rake' 4 | require 'gorillib/logger/log' 5 | 6 | module Swineherd 7 | autoload :Template, 'swineherd/template' 8 | autoload :FileSystem, 'swineherd/filesystem' 9 | autoload :Script, 'swineherd/script' 10 | autoload :Workflow, 'swineherd/workflow' 11 | 12 | # For rake 0.9 compatibility 13 | include Rake::DSL if defined?(Rake::DSL) 14 | end 15 | -------------------------------------------------------------------------------- /lib/swineherd/filesystem.rb: -------------------------------------------------------------------------------- 1 | module Swineherd 2 | autoload :BaseFileSystem, 'swineherd/filesystem/basefilesystem' 3 | autoload :LocalFileSystem, 'swineherd/filesystem/localfilesystem' 4 | autoload :HadoopFileSystem, 'swineherd/filesystem/hadoopfilesystem' 5 | autoload :S3FileSystem, 'swineherd/filesystem/s3filesystem' 6 | 7 | class FileSystem 8 | 9 | FILESYSTEMS = { 10 | 'file' => Swineherd::LocalFileSystem, 11 | 'hdfs' => Swineherd::HadoopFileSystem, 12 | 's3' => Swineherd::S3FileSystem 13 | } 14 | 15 | # A factory function that returns an instance of the requested class 16 | def self.get scheme, *args 17 | begin 18 | FILESYSTEMS[scheme.to_s].new *args 19 | rescue NoMethodError => e 20 | raise "Filesystem with scheme #{scheme} does not exist.\n #{e.message}" 21 | end 22 | end 23 | 24 | end 25 | 26 | end 27 | -------------------------------------------------------------------------------- /lib/swineherd/filesystem/README_filesystem.textile: -------------------------------------------------------------------------------- 1 | h1. File System Abstraction 2 | 3 | Hackboxen need to access files and directories in order to do their 4 | stuff. We currently expect them to use at least the following types 5 | of filesystems: 6 | 7 | * Local File System 8 | * Ephemeral Hadoop cluster HDFS 9 | * s3/HDFS 10 | 11 | Each of these filesystem types has different methods to accomplish the same operations. In order to make this diversity more easily used by hackboxen, an abstraction layer has been created. 12 | 13 | h2. Interface 14 | 15 | A new @FileSystem@ class has a single class method @get@ taking two argugments: 16 | 17 | * @scheme@: A token which specifies the filesystem scheme. Currently, only @:file@ is supported. 18 | * @*args@: Optional arguments (e.g. credentitals) 19 | 20 | The returned (abstracted) filesystem instnace has the following methods: 21 | 22 | * @open(path,mode,blk)@: Return a @File@ like file handle object. @mode@ and @blk@ arguments are optional and work like the standard ruby @File.open@ arguments. 23 | * @rm(path)@: Works like UNIX @rm -r@. 24 | * @exists?(path)@: Returns @true@ if the file/directory exists 25 | * @mv(srcpath,dstpath)@: Renames/moves the file/directory. 26 | * @cp(srcpath,dstpath)@: Works like UNIX @cp -r@. 27 | * @mkpath(dirpath)@: Creates a directory and all required parent directories. 28 | * @type(path)@: Returns one of "dir", "file", or "symlink". 
29 | * @entries(dirpath)@: Returns the the files/subdirectories in this directory 30 | 31 | The @File@ object returned by the @open@ methods has the following methods: 32 | 33 | * @read@: Return the contents of the entire file as a string. 34 | * @readline@: Return the next line in the file, or nil if there no more lines. 35 | * @write(string)@: Write @string@ to the file. 36 | * @close@: Close the file 37 | 38 | h2. Creating an abstraction 39 | 40 | Each abstraction is not expected to catch and rethrow exceptions of the abstracted subsystems. Rather, exceptions should pass through. However, each method should try to be built to behave similarly to the corresponding native ruby @File@ and @FileUtils@ methods. 41 | 42 | h2. Current State 43 | 44 | The only currently implemented filesystem abstraction is @:file@ (local file system). 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /lib/swineherd/filesystem/basefilesystem.rb: -------------------------------------------------------------------------------- 1 | module Swineherd 2 | 3 | # 4 | # All methods a filesystem should have 5 | # 6 | module BaseFileSystem 7 | 8 | # 9 | # Return a new instance of 'this' filesystem. Classes that include this 10 | # module are expected to know how to pull their particular set of arguments 11 | # from *args and initialize themselves by opening any required connections, &c. 12 | # 13 | def initialize *args 14 | end 15 | 16 | # 17 | # Open a file in this filesystem. Should return a usable file handle for in 18 | # the mode (read 'r' or 'w') given. File classes should, at minimum, have 19 | # the methods defined in BaseFile 20 | # 21 | def open path, mode="r", &blk 22 | end 23 | 24 | # 25 | # Recursively measure the size of path. Results in bytes. 26 | # 27 | def size path 28 | end 29 | 30 | # 31 | # Recursively delete the path and all paths below it. 32 | # 33 | def rm path 34 | end 35 | 36 | # 37 | # Returns true if the file or path exists and false otherwise. 38 | # 39 | def exists? path 40 | end 41 | 42 | # 43 | # Moves the source path to the destination path 44 | # 45 | def mv srcpath, dstpath 46 | end 47 | 48 | # 49 | # Recursively copies all files and directories under srcpath to dstpath 50 | # 51 | def cp srcpath, dstpath 52 | end 53 | 54 | # 55 | # Make directory path if it does not (partly) exist 56 | # 57 | def mkpath path 58 | end 59 | 60 | # 61 | # Return file type ("directory" or "file" or "symlink") 62 | # 63 | def type path 64 | end 65 | 66 | # 67 | # Give contained files/dirs 68 | # 69 | def entries dirpath 70 | end 71 | 72 | # 73 | # For running tasks idempotently. Returns true if no paths exist, false if all paths exist, 74 | # and raises an error otherwise. 75 | # 76 | def check_paths paths 77 | exist_count = paths.inject(0){|cnt, path| cnt += 1 if exists?(path); cnt} 78 | raise "Indeterminate output state" if (exist_count > 0) && (exist_count < paths.size) 79 | return true if exist_count == 0 80 | false 81 | end 82 | 83 | # 84 | # Needs to close the filesystem by cleaning up any open connections, &c. 85 | # 86 | def close *args 87 | end 88 | 89 | class BaseFile 90 | attr_accessor :path, :scheme, :mode 91 | 92 | 93 | def initialize *args, &blk 94 | end 95 | 96 | # 97 | # A new file in the filesystem needs to be instantiated with a 98 | # path, a mode (read 'r' or write 'w'). 
99 | # 100 | def open path, mode="r", &blk 101 | end 102 | 103 | # 104 | # Return whole file and as a string 105 | # 106 | def read 107 | end 108 | 109 | # 110 | # Return a line from stream 111 | # 112 | def readline 113 | end 114 | 115 | # 116 | # Writes a string to the file 117 | # 118 | def write string 119 | end 120 | 121 | # 122 | # Close the file 123 | # 124 | def close *args 125 | end 126 | 127 | end 128 | 129 | end 130 | 131 | end 132 | -------------------------------------------------------------------------------- /lib/swineherd/filesystem/filesystems.rb: -------------------------------------------------------------------------------- 1 | require 'fileutils' 2 | 3 | class FileSystem 4 | 5 | # A factory function that returns an instance of the requested class 6 | def self.get(scheme, *args) 7 | if scheme == :file 8 | LocalFileSystem.new() 9 | else 10 | nil 11 | end 12 | end 13 | 14 | class LocalFileSystem 15 | 16 | # Open a file in this filesystem 17 | def open(path,mode="r",&blk) 18 | return LocalFile.new(path,mode,&blk) 19 | end 20 | 21 | # Works like rm -r 22 | def rm(path) 23 | FileUtils.rm_r(path) 24 | end 25 | 26 | # Does this exist? 27 | def exists?(path) 28 | File.exists?(path) 29 | end 30 | 31 | # Works like UNIX mv 32 | def mv(srcpath,dstpath) 33 | FileUtils.mv(srcpath,dstpath) 34 | end 35 | 36 | # Works like UNIX cp -r 37 | def cp(srcpath,dstpath) 38 | FileUtils.cp_r(srcpath,dstpath) 39 | end 40 | 41 | # Make directory path if it does not (partly) exist 42 | def mkpath(path) 43 | FileUtils.mkpath 44 | end 45 | 46 | # Return file type ("dir" or "file" or "symlink") 47 | def type(path) 48 | if File.symlink?(path) 49 | return "symlink" 50 | end 51 | if File.directory?(path) 52 | return "directory" 53 | end 54 | if File.file?(path) 55 | return "file" 56 | end 57 | "unknown" 58 | end 59 | 60 | # Give contained files/dirs 61 | def entries(dirpath) 62 | if type(dirpath) != "directory" 63 | return nil 64 | end 65 | Dir.entries(dirpath) 66 | end 67 | 68 | class LocalFile 69 | attr_accessor :path, :scheme, :mode 70 | 71 | def initialize(path,mode="r",&blk) 72 | @path=path 73 | @mode=mode 74 | @handle=File.open(path,mode,&blk) 75 | end 76 | 77 | def open(path,mode="r") 78 | # Only "r" and "w" modes are supported. 79 | initialize(path,mode) 80 | end 81 | 82 | # Return whole file and as a string 83 | def read 84 | @handle.read 85 | end 86 | 87 | # Return a line from stream 88 | def readline 89 | @handle.gets 90 | end 91 | 92 | # Writes to the file 93 | def write(string) 94 | @handle.write(string) 95 | end 96 | 97 | # Close file 98 | def close 99 | @handle.close 100 | end 101 | end 102 | end 103 | end 104 | -------------------------------------------------------------------------------- /lib/swineherd/filesystem/hadoopfilesystem.rb: -------------------------------------------------------------------------------- 1 | module Swineherd 2 | 3 | # 4 | # Methods for dealing with hadoop distributed file system (hdfs). This class 5 | # requires that you run with JRuby as it makes use of the native java hadoop 6 | # libraries. 
7 | # 8 | class HadoopFileSystem 9 | 10 | include Swineherd::BaseFileSystem 11 | 12 | attr_accessor :conf, :hdfs 13 | 14 | # 15 | # Initialize a new hadoop file system, needs path to hadoop configuration 16 | # 17 | def initialize *args 18 | check_and_set_environment 19 | @conf = Java::org.apache.hadoop.conf.Configuration.new 20 | @hdfs = Java::org.apache.hadoop.fs.FileSystem.get(@conf) 21 | end 22 | 23 | # 24 | # Make sure environment is sane then set up environment for use 25 | # 26 | def check_and_set_environment 27 | check_env 28 | set_env 29 | end 30 | 31 | def open path, mode="r", &blk 32 | HadoopFile.new(path,mode,self,&blk) 33 | end 34 | 35 | def size path 36 | lr(path).inject(0){|sz, f| sz += @hdfs.get_file_status(Path.new(f)).get_len} 37 | end 38 | 39 | # 40 | # Recursively list paths 41 | # 42 | def lr path 43 | paths = entries(path) 44 | if (paths && !paths.empty?) 45 | paths.map{|e| lr(e)}.flatten 46 | else 47 | path 48 | end 49 | end 50 | 51 | def rm path 52 | @hdfs.delete(Path.new(path), true) 53 | [path] 54 | end 55 | 56 | def exists? path 57 | @hdfs.exists(Path.new(path)) 58 | end 59 | 60 | def mv srcpath, dstpath 61 | @hdfs.rename(Path.new(srcpath), Path.new(dstpath)) 62 | end 63 | 64 | def cp srcpath, dstpath 65 | FileUtil.copy(@hdfs, Path.new(srcpath), @hdfs, Path.new(dstpath), false, @conf) 66 | end 67 | 68 | def mkpath path 69 | @hdfs.mkdirs(Path.new(path)) 70 | path 71 | end 72 | 73 | def type path 74 | return "unknown" unless exists? path 75 | status = @hdfs.get_file_status(Path.new(path)) 76 | return "directory" if status.is_dir? 77 | "file" 78 | # case 79 | # when status.isFile then 80 | # return "file" 81 | # when status.is_directory? then 82 | # return "directory" 83 | # when status.is_symlink? then 84 | # return "symlink" 85 | # end 86 | end 87 | 88 | def entries dirpath 89 | return unless type(dirpath) == "directory" 90 | list = @hdfs.list_status(Path.new(dirpath)) 91 | list.map{|path| path.get_path.to_s} rescue [] 92 | end 93 | 94 | # 95 | # Merge all part files in a directory into one file. 96 | # 97 | def merge srcdir, dstfile 98 | FileUtil.copy_merge(@hdfs, Path.new(srcdir), @hdfs, Path.new(dstfile), false, @conf, "") 99 | end 100 | 101 | # 102 | # This is hackety. Use with caution. 
103 | # 104 | def stream input, output 105 | require 'uri' 106 | input_fs_scheme = URI.parse(input).scheme 107 | output_fs_scheme = URI.parse(output).scheme 108 | system("#{@hadoop_home}/bin/hadoop \\ 109 | jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\ 110 | -D mapred.job.name=\"Stream { #{input_fs_scheme}(#{File.basename(input)}) -> #{output_fs_scheme}(#{File.basename(output)}) }\" \\ 111 | -D mapred.min.split.size=1000000000 \\ 112 | -D mapred.reduce.tasks=0 \\ 113 | -mapper \"/bin/cat\" \\ 114 | -input \"#{input}\" \\ 115 | -output \"#{output}\"") 116 | end 117 | 118 | # 119 | # BZIP 120 | # 121 | def bzip input, output 122 | system("#{@hadoop_home}/bin/hadoop \\ 123 | jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\ 124 | -D mapred.output.compress=true \\ 125 | -D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \\ 126 | -D mapred.reduce.tasks=1 \\ 127 | -mapper \"/bin/cat\" \\ 128 | -reducer \"/bin/cat\" \\ 129 | -input \"#{input}\" \\ 130 | -output \"#{output}\"") 131 | end 132 | 133 | # 134 | # Merges many input files into :reduce_tasks output files 135 | # 136 | def dist_merge inputs, output, options = {} 137 | options[:reduce_tasks] ||= 25 138 | options[:partition_fields] ||= 2 139 | options[:sort_fields] ||= 2 140 | options[:field_separator] ||= '\t' 141 | names = inputs.map{|inp| File.basename(inp)}.join(',') 142 | cmd = "#{@hadoop_home}/bin/hadoop \\ 143 | jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\ 144 | -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\ 145 | -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\ 146 | -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\ 147 | -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\ 148 | -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\ 149 | -D mapred.min.split.size=1000000000 \\ 150 | -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\ 151 | -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\ 152 | -mapper \"/bin/cat\" \\ 153 | -reducer \"/usr/bin/uniq\" \\ 154 | -input \"#{inputs.join(',')}\" \\ 155 | -output \"#{output}\"" 156 | puts cmd 157 | system cmd 158 | end 159 | 160 | # 161 | # Copy hdfs file to local filesystem 162 | # 163 | def copy_to_local srcfile, dstfile 164 | @hdfs.copy_to_local_file(Path.new(srcfile), Path.new(dstfile)) 165 | end 166 | 167 | # 168 | # Copy local file to hdfs filesystem 169 | # 170 | def copy_from_local srcfile, dstfile 171 | @hdfs.copy_from_local_file(Path.new(srcfile), Path.new(dstfile)) 172 | end 173 | 174 | def close *args 175 | @hdfs.close 176 | end 177 | 178 | class HadoopFile 179 | attr_accessor :path, :handle, :hdfs 180 | 181 | # 182 | # In order to open input and output streams we must pass around the hadoop fs object itself 183 | # 184 | def initialize path, mode, fs, &blk 185 | @fs = fs 186 | @path = Path.new(path) 187 | case mode 188 | when "r" then 189 | raise "#{@fs.type(path)} is not a readable file - #{path}" unless @fs.type(path) == "file" 190 | @handle = @fs.hdfs.open(@path).to_io(&blk) 191 | when "w" then 192 | # Open path for writing 193 | raise "Path #{path} is a directory." unless (@fs.type(path) == "file") || (@fs.type(path) == "unknown") 194 | @handle = @fs.hdfs.create(@path).to_io.to_outputstream 195 | if block_given?
196 | yield self 197 | self.close # muy muy importante 198 | end 199 | end 200 | end 201 | 202 | def read 203 | @handle.read 204 | end 205 | 206 | def readline 207 | @handle.readline 208 | end 209 | 210 | def write string 211 | @handle.write(string.to_java_string.get_bytes) 212 | end 213 | 214 | def puts string 215 | write(string+"\n") 216 | end 217 | 218 | def close 219 | @handle.close 220 | end 221 | 222 | end 223 | 224 | # # 225 | # # Distributed streaming from input to output 226 | # # 227 | # 228 | # # 229 | # # Given an array of input dirs, stream all into output dir and remove duplicate records. 230 | # # Reasonable default hadoop streaming options are chosen. 231 | # # 232 | # def self.merge inputs, output, options = {} 233 | # options[:reduce_tasks] ||= 25 234 | # options[:partition_fields] ||= 2 235 | # options[:sort_fields] ||= 2 236 | # options[:field_separator] ||= '/t' 237 | # names = inputs.map{|inp| File.basename(inp)}.join(',') 238 | # cmd = "${HADOOP_HOME}/bin/hadoop \\ 239 | # jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \\ 240 | # -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\ 241 | # -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\ 242 | # -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\ 243 | # -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\ 244 | # -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\ 245 | # -D mapred.min.split.size=1000000000 \\ 246 | # -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\ 247 | # -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\ 248 | # -mapper \"/bin/cat\" \\ 249 | # -reducer \"/usr/bin/uniq\" \\ 250 | # -input \"#{inputs.join(',')}\" \\ 251 | # -output \"#{output}\"" 252 | # puts cmd 253 | # system cmd 254 | # end 255 | # 256 | # # 257 | # # Concatenates a hadoop dir or file into a local file 258 | # # 259 | # def self.cat_to_local src, dest 260 | # system %Q{hadoop fs -cat #{src}/[^_]* > #{dest}} unless File.exist?(dest) 261 | # end 262 | # 263 | 264 | # 265 | # Check that we are running with jruby, check for hadoop home. hadoop_home 266 | # is preferentially set to the HADOOP_HOME environment variable if it's set, 267 | # '/usr/local/share/hadoop' if HADOOP_HOME isn't defined, and 268 | # '/usr/lib/hadoop' if '/usr/local/share/hadoop' doesn't exist. If all else 269 | # fails inform the user that HADOOP_HOME really should be set. 270 | # 271 | def check_env 272 | begin 273 | require 'java' 274 | rescue LoadError => e 275 | raise "\nJava not found, are you sure you're running with JRuby?\n" + e.message 276 | end 277 | @hadoop_home = (ENV['HADOOP_HOME'] || '/usr/local/share/hadoop') 278 | @hadoop_home = '/usr/lib/hadoop' unless File.exist? @hadoop_home 279 | raise "\nHadoop installation not found, try setting HADOOP_HOME\n" unless File.exist? @hadoop_home 280 | end 281 | 282 | # 283 | # Place hadoop jars in class path, require appropriate jars, set hadoop conf 284 | # 285 | def set_env 286 | require 'java' 287 | @hadoop_conf = (ENV['HADOOP_CONF_DIR'] || File.join(@hadoop_home, 'conf')) 288 | @hadoop_conf += "/" unless @hadoop_conf.end_with? 
"/" 289 | $CLASSPATH << @hadoop_conf 290 | Dir["#{@hadoop_home}/hadoop*.jar", "#{@hadoop_home}/lib/*.jar"].each{|jar| require jar} 291 | 292 | java_import 'org.apache.hadoop.conf.Configuration' 293 | java_import 'org.apache.hadoop.fs.Path' 294 | java_import 'org.apache.hadoop.fs.FileSystem' 295 | java_import 'org.apache.hadoop.fs.FileUtil' 296 | java_import 'org.apache.hadoop.mapreduce.lib.input.FileInputFormat' 297 | java_import 'org.apache.hadoop.mapreduce.lib.output.FileOutputFormat' 298 | java_import 'org.apache.hadoop.fs.FSDataOutputStream' 299 | java_import 'org.apache.hadoop.fs.FSDataInputStream' 300 | 301 | end 302 | 303 | end 304 | 305 | end 306 | -------------------------------------------------------------------------------- /lib/swineherd/filesystem/localfilesystem.rb: -------------------------------------------------------------------------------- 1 | require 'fileutils' 2 | require 'find' 3 | module Swineherd 4 | 5 | class LocalFileSystem 6 | 7 | include Swineherd::BaseFileSystem 8 | 9 | def initialize *args 10 | end 11 | 12 | def open path, mode="r", &blk 13 | return LocalFile.new path, mode, &blk 14 | end 15 | 16 | def size path 17 | sz = 0 18 | Find.find(path){|f| sz += File.size(f)} 19 | sz 20 | end 21 | 22 | def rm path 23 | FileUtils.rm_r path 24 | end 25 | 26 | def exists? path 27 | File.exists?(path) 28 | end 29 | 30 | def mv srcpath, dstpath 31 | FileUtils.mv(srcpath,dstpath) 32 | end 33 | 34 | def cp srcpath, dstpath 35 | FileUtils.cp_r(srcpath,dstpath) 36 | end 37 | 38 | def mkpath path 39 | FileUtils.mkpath path 40 | end 41 | 42 | def type path 43 | case 44 | when File.symlink?(path) then 45 | return "symlink" 46 | when File.directory?(path) then 47 | return "directory" 48 | when File.file?(path) then 49 | return "file" 50 | end 51 | "unknown" 52 | end 53 | 54 | def entries dirpath 55 | return unless (type(dirpath) == "directory") 56 | Dir.entries(dirpath) 57 | end 58 | 59 | class LocalFile 60 | attr_accessor :path, :scheme, :handle, :mode 61 | 62 | def initialize path, mode="r", &blk 63 | @path = path 64 | @mode = mode 65 | @handle = File.open(path,mode,&blk) 66 | end 67 | 68 | def open path, mode="r", &blk 69 | initialize(path,mode,&blk) 70 | end 71 | 72 | def read 73 | @handle.read 74 | end 75 | 76 | def readline 77 | @handle.gets 78 | end 79 | 80 | def write string 81 | @handle.write(string) 82 | end 83 | 84 | def close 85 | @handle.close 86 | end 87 | end 88 | 89 | end 90 | end 91 | -------------------------------------------------------------------------------- /lib/swineherd/filesystem/localfs.rb: -------------------------------------------------------------------------------- 1 | module Swineherd 2 | class LocalFS 3 | def self.check_paths paths 4 | exist_count = 0 # no outputs exist 5 | paths.each{|path| exist_count += 1 if File.exist?(path) } 6 | raise "Indeterminate output state" if (exist_count > 0) && (exist_count < paths.size) 7 | return true if exist_count == 0 8 | false 9 | end 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /lib/swineherd/filesystem/s3filesystem.rb: -------------------------------------------------------------------------------- 1 | require 'tempfile' 2 | module Swineherd 3 | 4 | # 5 | # Methods for interacting with Amazon's Simple Store Service (s3). 
6 | # 7 | class S3FileSystem 8 | 9 | include Swineherd::BaseFileSystem 10 | 11 | attr_accessor :s3 12 | 13 | # 14 | # Initialize a new s3 file system, needs path to aws keys 15 | # 16 | def initialize aws_access_key_id, aws_secret_access_key 17 | require 'right_aws' 18 | @s3 = RightAws::S3.new(aws_access_key_id, aws_secret_access_key) 19 | end 20 | 21 | def open path, mode="r", &blk 22 | S3File.new(path,mode,self,&blk) 23 | end 24 | 25 | def size path 26 | sz = 0 27 | if type(path) == "directory" 28 | lr(path).each do |f| 29 | sz += file_size(f) 30 | end 31 | else 32 | sz += file_size(path) 33 | end 34 | sz 35 | end 36 | 37 | def file_size path 38 | containing_bucket = bucket(path) 39 | header = @s3.interface.head(containing_bucket, key_path(path)) 40 | header['content-length'].to_i 41 | end 42 | 43 | def rm path 44 | bkt = bucket(path) 45 | key = key_path(path) 46 | if key.empty? # only the bucket was passed in, delete it 47 | @s3.interface.force_delete_bucket(bkt) 48 | else 49 | case type(path) 50 | when "directory" then 51 | keys_to_delete = lr(path) 52 | keys_to_delete.each do |k| 53 | key_to_delete = key_path(k) 54 | @s3.interface.delete(bkt, key_to_delete) 55 | end 56 | keys_to_delete 57 | when "file" then 58 | @s3.interface.delete(bkt, key) 59 | [path] 60 | end 61 | end 62 | end 63 | 64 | def bucket path 65 | uri = URI.parse(path) 66 | uri.path.split('/').reject{|x| x.empty?}.first 67 | end 68 | 69 | def key_path path 70 | uri = URI.parse(path) 71 | File.join(uri.path.split('/').reject{|x| x.empty?}[1..-1]) 72 | end 73 | 74 | def needs_trailing_slash pre 75 | has_trailing_slash = pre.end_with? '/' 76 | is_empty_prefix = pre.empty? 77 | !(has_trailing_slash || is_empty_prefix) 78 | end 79 | 80 | def full_contents path 81 | bkt = bucket(path) 82 | pre = key_path(path) 83 | pre += '/' if needs_trailing_slash(pre) 84 | contents = [] 85 | s3.interface.incrementally_list_bucket(bkt, {'prefix' => pre, 'delimiter' => '/'}) do |res| 86 | contents += res[:common_prefixes].map{|c| File.join(bkt,c)} 87 | contents += res[:contents].map{|c| File.join(bkt, c[:key])} 88 | end 89 | contents 90 | end 91 | 92 | def exists? path 93 | object = File.basename(path) 94 | search_dir = File.dirname(path) 95 | case search_dir 96 | when '.' 
then # only a bucket was passed in 97 | begin 98 | (full_contents(object).size > 0) 99 | rescue RightAws::AwsError => e 100 | if e.message =~ /nosuchbucket/i 101 | false 102 | else 103 | raise e 104 | end 105 | end 106 | else 107 | search_dir_contents = full_contents(search_dir).map{|c| File.basename(c).gsub(/\//, '')} 108 | search_dir_contents.include?(object) 109 | end 110 | end 111 | 112 | def mv srcpath, dstpath 113 | src_bucket = bucket(srcpath) 114 | dst_bucket = bucket(dstpath) 115 | dst_key_path = key_path(dstpath) 116 | mkpath(dstpath) 117 | case type(srcpath) 118 | when "directory" then 119 | paths_to_copy = lr(srcpath) 120 | common_dir = common_directory(paths_to_copy) 121 | paths_to_copy.each do |path| 122 | src_key = key_path(path) 123 | dst_key = File.join(dst_key_path, path.gsub(common_dir, '')) 124 | @s3.interface.move(src_bucket, src_key, dst_bucket, dst_key) 125 | end 126 | when "file" then 127 | @s3.interface.move(src_bucket, key_path(srcpath), dst_bucket, dst_key_path) 128 | end 129 | end 130 | 131 | def cp srcpath, dstpath 132 | src_bucket = bucket(srcpath) 133 | dst_bucket = bucket(dstpath) 134 | dst_key_path = key_path(dstpath) 135 | mkpath(dstpath) 136 | case type(srcpath) 137 | when "directory" then 138 | paths_to_copy = lr(srcpath) 139 | common_dir = common_directory(paths_to_copy) 140 | paths_to_copy.each do |path| 141 | src_key = key_path(path) 142 | dst_key = File.join(dst_key_path, path.gsub(common_dir, '')) 143 | @s3.interface.copy(src_bucket, src_key, dst_bucket, dst_key) 144 | end 145 | when "file" then 146 | @s3.interface.copy(src_bucket, key_path(srcpath), dst_bucket, dst_key_path) 147 | end 148 | end 149 | 150 | # right now this only works on single files 151 | def copy_to_local srcpath, dstpath 152 | src_bucket = bucket(srcpath) 153 | src_key_path = key_path(srcpath) 154 | dstfile = File.new(dstpath, 'w') 155 | @s3.interface.get(src_bucket, src_key_path) do |chunk| 156 | dstfile.write(chunk) 157 | end 158 | dstfile.close 159 | end 160 | 161 | # This is a bit funny, there's actually no need to create a 'path' since 162 | # s3 is nothing more than a glorified key-value store. When you create a 163 | # 'file' (key) the 'path' will be created for you. All we do here is create 164 | # the bucket unless it already exists. 165 | # 166 | def mkpath path 167 | bkt = bucket(path) 168 | key = key_path(path) 169 | if key.empty? 170 | @s3.interface.create_bucket(bkt) 171 | else 172 | @s3.interface.create_bucket(bkt) unless exists? bkt 173 | end 174 | path 175 | end 176 | 177 | def type path 178 | return "unknown" unless exists? path 179 | return "directory" if full_contents(path).size > 0 180 | "file" 181 | end 182 | 183 | def entries dirpath 184 | return unless type(dirpath) == "directory" 185 | full_contents(dirpath) 186 | end 187 | 188 | # Recursively list paths 189 | def lr path 190 | paths = entries(path) 191 | if paths 192 | paths.map{|e| lr(e)}.flatten 193 | else 194 | path 195 | end 196 | end 197 | 198 | # 199 | # Ick. 200 | # 201 | def common_directory paths 202 | dirs = paths.map{|path| path.split('/')} 203 | min_size = dirs.map{|splits| splits.size}.min 204 | dirs.map!{|splits| splits[0...min_size]} 205 | uncommon_idx = dirs.transpose.each_with_index.find{|dirnames, idx| dirnames.uniq.length > 1}.last 206 | dirs[0][0...uncommon_idx].join('/') 207 | end 208 | 209 | def put srcpath, destpath 210 | dest_bucket = bucket(destpath) 211 | if File.directory? 
srcpath 212 | # handle Dir later 213 | else 214 | key = srcpath 215 | end 216 | @s3.interface.put(dest_bucket, key, File.open(srcpath)) 217 | end 218 | 219 | def close *args 220 | end 221 | 222 | class S3File 223 | attr_accessor :path, :handle, :fs 224 | 225 | # 226 | # In order to open input and output streams we must pass around the s3 fs object itself 227 | # 228 | def initialize path, mode, fs, &blk 229 | @fs = fs 230 | @path = path 231 | case mode 232 | when "r" then 233 | raise "#{fs.type(path)} is not a readable file - #{path}" unless fs.type(path) == "file" 234 | when "w" then 235 | raise "Path #{path} is a directory." unless (fs.type(path) == "file") || (fs.type(path) == "unknown") 236 | @handle = Tempfile.new('s3filestream') 237 | if block_given? 238 | yield self 239 | close 240 | end 241 | end 242 | end 243 | 244 | # 245 | # Faster than iterating 246 | # 247 | def read 248 | resp = fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)) 249 | resp 250 | end 251 | 252 | # 253 | # This is a little hackety. That is, once you call (.each) on the object the full object starts 254 | # downloading... 255 | # 256 | def readline 257 | @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each 258 | begin 259 | @handle.next 260 | rescue StopIteration, NoMethodError 261 | @handle = nil 262 | raise EOFError.new("end of file reached") 263 | end 264 | end 265 | 266 | def write string 267 | @handle.write(string) 268 | end 269 | 270 | def puts string 271 | write(string+"\n") 272 | end 273 | 274 | def close 275 | if @handle 276 | @handle.read 277 | fs.s3.interface.put(fs.bucket(path), fs.key_path(path), File.open(@handle.path, 'r')) 278 | @handle.close 279 | end 280 | @handle = nil 281 | end 282 | 283 | end 284 | 285 | end 286 | 287 | end 288 | -------------------------------------------------------------------------------- /lib/swineherd/script.rb: -------------------------------------------------------------------------------- 1 | module Swineherd 2 | module Script 3 | 4 | autoload :WukongScript, 'swineherd/script/wukong_script' 5 | autoload :PigScript, 'swineherd/script/pig_script' 6 | autoload :RScript, 'swineherd/script/r_script' 7 | 8 | module Common 9 | 10 | attr_accessor :input, :output, :options, :attributes 11 | def initialize(source, input = [], output = [], options = {}, attributes = {}) 12 | @source = source 13 | @input = input 14 | @output = output 15 | @options = options 16 | @attributes = attributes 17 | end 18 | 19 | # 20 | # Allows for setting the environment the script will be run in 21 | # 22 | def env 23 | ENV 24 | end 25 | 26 | def script 27 | @script ||= Template.new(@source, @attributes).substitute! 28 | end 29 | 30 | # 31 | # So we can reuse ourselves 32 | # 33 | def refresh! 34 | @script = nil 35 | @output = [] 36 | @input = [] 37 | end 38 | 39 | # 40 | # This depends on the type of script 41 | # 42 | def cmd 43 | raise "Override this in subclass!" 44 | end 45 | 46 | # 47 | # Override this in subclass to decide how script runs in 'local' mode 48 | # Best practice is that it needs to be able to run on a laptop w/o 49 | # hadoop. 50 | # 51 | def local_cmd 52 | raise "Override this in subclass!"
53 | end 54 | 55 | # 56 | # Default is to run with hadoop 57 | # 58 | def run mode=:hadoop 59 | case mode 60 | when :local then 61 | sh local_cmd do |res, ok| 62 | Log.info("Exit status was #{ok}") 63 | raise "Local mode script failed with exit status #{ok}" if ok != 0 64 | end 65 | when :hadoop then 66 | sh cmd do |res, ok| 67 | Log.info("Exit status was #{ok}") 68 | raise "Hadoop mode script failed with exit status #{ok}" if ok != 0 69 | end 70 | end 71 | end 72 | 73 | end 74 | end 75 | end 76 | -------------------------------------------------------------------------------- /lib/swineherd/script/hadoop_script.rb: -------------------------------------------------------------------------------- 1 | module Swineherd::Script 2 | 3 | # 4 | # native Java map-reduce 5 | # 6 | class HadoopScript 7 | include Common 8 | attr_accessor :main_class, :run_jar, :java_options, :hadoop_classpath, :libjars 9 | 10 | def initialize *args 11 | super(*args) 12 | @options = Hash.new{|h,k| h[k] = {}} # need to support nested options for this 13 | end 14 | 15 | # 16 | # Converts an arbitrarily nested hash to flattened arguments 17 | # for passing to java program. For example: 18 | # 19 | # {:mapred => {:reduce => {:tasks => 0}}} 20 | # 21 | # will transform to: 22 | # 23 | # '-Dmapred.reduce.tasks=0' 24 | # 25 | def java_args args 26 | to_dotted_args(args).map{|arg| "-D#{arg}"} 27 | end 28 | 29 | # 30 | # Uses recursion to take an arbitrarily nested hash and 31 | # flatten it into dotted args. See 'to_java_args'. Can 32 | # you do it any better? 33 | # 34 | def to_dotted_args args 35 | args.map do |k,v| 36 | if v.is_a?(Hash) 37 | to_dotted_args(v).map do |s| 38 | [k,s].join(".") 39 | end 40 | else 41 | "#{k}=#{v}" 42 | end 43 | end.flatten 44 | end 45 | 46 | def cmd 47 | [ 48 | "HADOOP_CLASSPATH=#{hadoop_classpath}", 49 | "#{hadoop_home}/bin/hadoop jar #{run_jar}", 50 | main_class, 51 | java_args(options), 52 | "-libjars #{libjars}", 53 | "#{input.join(',')}", 54 | "#{output.join(',')}" 55 | ].flatten.compact.join(" \t\\\n ") 56 | end 57 | 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /lib/swineherd/script/pig_script.rb: -------------------------------------------------------------------------------- 1 | module Swineherd::Script 2 | class PigScript 3 | include Common 4 | 5 | # 6 | # Not guaranteeing anything. 
7 | # 8 | AVRO_PIG_MAPPING = { 9 | 'string' => 'chararray', 10 | 'int' => 'int', 11 | 'long' => 'long', 12 | 'float' => 'float', 13 | 'double' => 'double', 14 | 'bytes' => 'bytearray', 15 | 'fixed' => 'bytearray' 16 | } 17 | 18 | # 19 | # Simple utility function for mapping avro types to pig types 20 | # 21 | def self.avro_to_pig avro_type 22 | AVRO_PIG_MAPPING[avro_type] 23 | end 24 | 25 | # 26 | # Convert a generic hash of options {:foo => 'bar'} into 27 | # command line options for pig '-p FOO=bar' 28 | # 29 | def pig_args options 30 | options.map{|opt,val| "-p #{opt.to_s.upcase}=#{val}" }.join(' ') 31 | end 32 | 33 | 34 | 35 | def local_cmd 36 | Log.info("Launching Pig script in local mode") 37 | "pig -x local #{pig_args(@options)} #{script}" 38 | end 39 | 40 | def cmd 41 | Log.info("Launching Pig script in hadoop mode") 42 | "pig #{pig_args(@options)} #{script}" 43 | end 44 | 45 | end 46 | end 47 | -------------------------------------------------------------------------------- /lib/swineherd/script/r_script.rb: -------------------------------------------------------------------------------- 1 | module Swineherd::Script 2 | class RScript 3 | include Common 4 | 5 | def local_cmd 6 | "/usr/bin/Rscript --vanilla #{script}" 7 | end 8 | 9 | def cmd 10 | local_cmd 11 | end 12 | 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /lib/swineherd/script/wukong_script.rb: -------------------------------------------------------------------------------- 1 | require 'pathname' 2 | 3 | module Swineherd::Script 4 | class WukongScript 5 | include Common 6 | 7 | def wukong_args options 8 | options.map{|param,val| "--#{param}=#{val}" }.join(' ') 9 | end 10 | 11 | # 12 | # Don't treat wukong scripts as templates 13 | # 14 | def script 15 | @source 16 | end 17 | 18 | def cmd 19 | raise "No wukong input specified" if input.empty? 20 | Log.info("Launching Wukong script in hadoop mode") 21 | "ruby #{script} #{wukong_args(@options)} --run #{input.join(',')} #{output.join(',')}" 22 | end 23 | 24 | def local_cmd 25 | inputs = input.map{|path| path += File.directory?(path) ? "/*" : ""}.join(',') 26 | Log.info("Launching Wukong script in local mode") 27 | "ruby #{script} #{wukong_args(@options)} --run=local #{inputs} #{output.join(',')}" 28 | end 29 | 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /lib/swineherd/template.rb: -------------------------------------------------------------------------------- 1 | require 'erubis' 2 | require 'tempfile' 3 | 4 | 5 | # Template.new(script_path, attributes).substitute! 6 | 7 | module Swineherd 8 | 9 | class Template 10 | attr_accessor :source_template, :attributes 11 | 12 | def initialize source_template, attributes 13 | @source_template = source_template 14 | @attributes = attributes 15 | end 16 | 17 | def compile! 18 | dest << Erubis::Eruby.new(source).result(attributes) 19 | dest << "\n" 20 | dest 21 | end 22 | 23 | def substitute! 24 | compile! 
25 | dest.read 26 | dest.path 27 | end 28 | 29 | protected 30 | 31 | def source 32 | File.open(source_template).read 33 | end 34 | 35 | def dest 36 | return @dest if @dest 37 | @dest ||= Tempfile.new(basename) 38 | end 39 | 40 | def basename 41 | File.basename(source_template) 42 | end 43 | 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /lib/swineherd/workflow.rb: -------------------------------------------------------------------------------- 1 | module Swineherd 2 | class Workflow 3 | attr_accessor :workdir, :outputs, :output_counts 4 | 5 | # 6 | # Create a new workflow and new namespace for this workflow 7 | # 8 | def initialize flow_id, &blk 9 | @flow_id = flow_id 10 | @output_counts = Hash.new{|h,k| h[k] = 0} 11 | @outputs = Hash.new{|h,k| h[k] = []} 12 | namespace @flow_id do 13 | self.instance_eval(&blk) 14 | end 15 | end 16 | 17 | # 18 | # Get next logical output of taskname by incrementing internal counter 19 | # 20 | def next_output taskname 21 | raise "No working directory specified." unless @workdir 22 | @outputs[taskname] << "#{@workdir}/#{@flow_id}/#{taskname}-#{@output_counts[taskname]}" 23 | @output_counts[taskname] += 1 24 | latest_output(taskname) 25 | end 26 | 27 | # 28 | # Get latest output of taskname 29 | # 30 | def latest_output taskname 31 | @outputs[taskname].last 32 | end 33 | 34 | # 35 | # Runs workflow starting with taskname 36 | # 37 | def run taskname 38 | Log.info "Launching workflow task #{@flow_id}:#{taskname} ..." 39 | Rake::Task["#{@flow_id}:#{taskname}"].invoke 40 | Log.info "Workflow task #{@flow_id}:#{taskname} finished" 41 | end 42 | 43 | # 44 | # Describes the dependency tree of all tasks belonging to self 45 | # 46 | def describe 47 | Rake::Task.tasks.each do |t| 48 | Log.info("Task: #{t.name} [#{t.inspect}]") if t.name =~ /#{@flow_id}/ 49 | end 50 | end 51 | 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /lib/swineherd/workflow/job.rb: -------------------------------------------------------------------------------- 1 | module Swineherd 2 | 3 | # 4 | # Job class is at its core a rake task 5 | # 6 | class Job 7 | 8 | # 9 | # Initialize job, fill variables, and create rake task 10 | # 11 | def initialize job_id, &blk 12 | @job_id = job_id 13 | @name = '' 14 | @dependencies = [] 15 | @script = '' 16 | self.instance_eval(&blk) 17 | raketask 18 | handle_dependencies 19 | end 20 | 21 | # 22 | # Will be the name of the rake task 23 | # 24 | def name name = nil 25 | return @name unless name 26 | @name = name 27 | end 28 | 29 | def script script = nil 30 | return @script unless script 31 | @script = script 32 | end 33 | 34 | # 35 | # An array of job names as dependencies 36 | # 37 | def dependencies dependencies = nil 38 | return @dependencies unless dependencies 39 | @dependencies = dependencies 40 | end 41 | 42 | def handle_dependencies 43 | return if dependencies.empty? 44 | task name => dependencies 45 | end 46 | 47 | def cmd 48 | @script.cmd 49 | end 50 | 51 | # 52 | # Every job is compiled into a rake task 53 | # 54 | def raketask 55 | task name do 56 | @script.run 57 | end 58 | end 59 | end 60 | end 61 | -------------------------------------------------------------------------------- /notes.txt: -------------------------------------------------------------------------------- 1 | Logging: 2 | 3 | 1. All output from the launched workflow should go to a workflow log file 4 | 2. 
Hadoop output is special and should be pulled down from the jobtracker 5 | - jobconf.xml 6 | - job details page 7 | 8 | Workflow should specify a logdir, defualts to workdir + '/logs' 9 | 10 | Fetching hadoop job stats: 11 | 12 | 1. Get job id 13 | 2. Use curl to fetch the latest logs listing: "http://jobtracker:50030/logs/history/" 14 | 3. Parse the logs listing and pull out the two urls we want (something-jobid.xml, something-jobid....) 15 | 4. Fetch the two urls we care about and dump into the workflow's log dir. 16 | 5. Possibly parse the results into an ongoing workflow-statistics.tsv file 17 | 18 | Other output: 19 | 20 | Output that would otherwise go to the terminal (nohup.out or some such) should be collected and dumped into the logdir as well. 21 | -------------------------------------------------------------------------------- /swineherd.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec' 4 | # -*- encoding: utf-8 -*- 5 | 6 | Gem::Specification.new do |s| 7 | s.name = %q{swineherd} 8 | s.version = "0.0.4" 9 | 10 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= 11 | s.authors = ["Jacob Perkins"] 12 | s.date = %q{2011-06-22} 13 | s.description = %q{Swineherd is for running scripts and workflows on filesystems.} 14 | s.email = %q{jacob.a.perkins@gmail.com} 15 | s.executables = ["hdp-tree", "hadoop-stream"] 16 | s.extra_rdoc_files = [ 17 | "LICENSE", 18 | "README.textile" 19 | ] 20 | s.files = [ 21 | "LICENSE", 22 | "README.textile", 23 | "Rakefile", 24 | "VERSION", 25 | "bin/hadoop-stream", 26 | "bin/hdp-tree", 27 | "examples/pagerank/data/seinfeld_network.tsv", 28 | "examples/pagerank/pagerank.rb", 29 | "examples/pagerank/scripts/cut_off_list.rb", 30 | "examples/pagerank/scripts/histogram.R", 31 | "examples/pagerank/scripts/pagerank.pig", 32 | "examples/pagerank/scripts/pagerank_initialize.pig", 33 | "lib/swineherd.rb", 34 | "lib/swineherd/filesystem.rb", 35 | "lib/swineherd/filesystem/README_filesystem.textile", 36 | "lib/swineherd/filesystem/basefilesystem.rb", 37 | "lib/swineherd/filesystem/filesystems.rb", 38 | "lib/swineherd/filesystem/hadoopfilesystem.rb", 39 | "lib/swineherd/filesystem/localfilesystem.rb", 40 | "lib/swineherd/filesystem/localfs.rb", 41 | "lib/swineherd/filesystem/s3filesystem.rb", 42 | "lib/swineherd/script.rb", 43 | "lib/swineherd/script/hadoop_script.rb", 44 | "lib/swineherd/script/pig_script.rb", 45 | "lib/swineherd/script/r_script.rb", 46 | "lib/swineherd/script/wukong_script.rb", 47 | "lib/swineherd/template.rb", 48 | "lib/swineherd/workflow.rb", 49 | "lib/swineherd/workflow/job.rb", 50 | "notes.txt", 51 | "swineherd.gemspec", 52 | "tests/test_filesystem.rb", 53 | "tests/test_s3_filesystem.rb", 54 | "tests/testcfg.yaml" 55 | ] 56 | s.homepage = %q{http://github.com/Ganglion/swineherd} 57 | s.licenses = ["MIT"] 58 | s.require_paths = ["lib"] 59 | s.rubygems_version = %q{1.3.7} 60 | s.summary = %q{Flexible data workflow glue.} 61 | 62 | if s.respond_to? 
:specification_version then 63 | current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION 64 | s.specification_version = 3 65 | 66 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then 67 | s.add_development_dependency(%q, ["~> 0.6.0"]) 68 | s.add_development_dependency(%q, ["~> 1.5.2"]) 69 | s.add_development_dependency(%q, [">= 0"]) 70 | s.add_runtime_dependency(%q, [">= 0"]) 71 | s.add_runtime_dependency(%q, [">= 0"]) 72 | s.add_runtime_dependency(%q, [">= 0"]) 73 | s.add_runtime_dependency(%q, [">= 0"]) 74 | else 75 | s.add_dependency(%q, ["~> 0.6.0"]) 76 | s.add_dependency(%q, ["~> 1.5.2"]) 77 | s.add_dependency(%q, [">= 0"]) 78 | s.add_dependency(%q, [">= 0"]) 79 | s.add_dependency(%q, [">= 0"]) 80 | s.add_dependency(%q, [">= 0"]) 81 | s.add_dependency(%q, [">= 0"]) 82 | end 83 | else 84 | s.add_dependency(%q, ["~> 0.6.0"]) 85 | s.add_dependency(%q, ["~> 1.5.2"]) 86 | s.add_dependency(%q, [">= 0"]) 87 | s.add_dependency(%q, [">= 0"]) 88 | s.add_dependency(%q, [">= 0"]) 89 | s.add_dependency(%q, [">= 0"]) 90 | s.add_dependency(%q, [">= 0"]) 91 | end 92 | end 93 | 94 | -------------------------------------------------------------------------------- /tests/test_filesystem.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | $LOAD_PATH << 'lib' 4 | require 'swineherd/filesystem' ; include Swineherd 5 | require 'rubygems' 6 | require 'yaml' 7 | require 'rspec' 8 | 9 | options = YAML.load(File.read(File.dirname(__FILE__)+'/testcfg.yaml')) 10 | current_test = options['filesystem_to_test'] 11 | describe "A new filesystem" do 12 | 13 | before do 14 | @test_path = "/tmp/rspec/test_path" 15 | @test_path2 = "/tmp/rspec/test_path2" 16 | @test_string = "@('_')@" 17 | @fs = Swineherd::FileSystem.get(current_test) 18 | end 19 | 20 | it "should implement exists?" 
do 21 | [true, false].should include(@fs.exists?(@test_path)) 22 | end 23 | 24 | it "should be able to create a path" do 25 | @fs.mkpath(@test_path) 26 | @fs.exists?(@test_path).should eql(true) 27 | end 28 | 29 | it "should be able to remove a path" do 30 | @fs.mkpath(@test_path) 31 | @fs.rm(@test_path) 32 | @fs.exists?(@test_path).should eql(false) 33 | end 34 | 35 | it "should implement size" do 36 | @fs.mkpath(File.dirname(@test_path)) 37 | fileobj = @fs.open(@test_path, 'w') 38 | fileobj.write(@test_string) 39 | fileobj.close 40 | 7.should eql(@fs.size(@test_path)) 41 | @fs.rm(@test_path) 42 | @fs.rm(File.dirname(@test_path)) 43 | end 44 | 45 | it "should be able to copy paths" do 46 | @fs.mkpath(@test_path) 47 | @fs.cp(@test_path, @test_path2) 48 | @fs.exists?(@test_path2).should eql(true) 49 | @fs.rm(@test_path) 50 | @fs.rm(@test_path2) 51 | end 52 | 53 | it "should be able to move paths" do 54 | @fs.mkpath(@test_path) 55 | @fs.mv(@test_path, @test_path2) 56 | @fs.exists?(@test_path).should eql(false) 57 | @fs.exists?(@test_path2).should eql(true) 58 | @fs.rm(@test_path2) 59 | end 60 | 61 | it "should return a sane path type" do 62 | @fs.mkpath(@test_path) 63 | ["file", "directory", "symlink", "unknown"].should include(@fs.type(@test_path)) 64 | @fs.rm(@test_path) 65 | end 66 | 67 | it "can return an array of directory entries" do 68 | sub_paths = ["a", "b", "c"] 69 | sub_paths.each do |sub_path| 70 | @fs.mkpath(File.join(@test_path, sub_path)) 71 | end 72 | @fs.entries(@test_path).class.should eql(Array) 73 | @fs.entries(@test_path).map{|path| File.basename(path)}.reject{|x| x =~ /\./}.sort.should eql(sub_paths.sort) 74 | @fs.rm(@test_path) 75 | end 76 | 77 | it "can answer to open with a writable file object" do 78 | fileobj = @fs.open(@test_path, 'w') 79 | fileobj.should respond_to :write 80 | @fs.rm(@test_path) 81 | end 82 | 83 | end 84 | 85 | describe "A new file" do 86 | before do 87 | @test_path = "/tmp/rspec/test_path" 88 | @test_path2 = "/tmp/rspec/test_path2" 89 | @test_string = "@('_')@" 90 | @fs = Swineherd::FileSystem.get(current_test) 91 | end 92 | 93 | it "should be closeable" do 94 | @fs.open(@test_path, 'w').close 95 | end 96 | 97 | it "should be writeable" do 98 | fileobj = @fs.open(@test_path, 'w') 99 | fileobj.write(@test_string) 100 | fileobj.close 101 | @fs.rm(@test_path) 102 | end 103 | 104 | it "should be readable" do 105 | 106 | fileobjw = @fs.open(@test_path, 'w') 107 | fileobjw.write(@test_string) 108 | fileobjw.close 109 | 110 | fileobjr = @fs.open(@test_path, 'r') 111 | fileobjr.read.should eql(@test_string) 112 | 113 | @fs.rm(@test_path) 114 | end 115 | 116 | end 117 | -------------------------------------------------------------------------------- /tests/test_s3_filesystem.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # 4 | # These tests cannot possibly pass unless you have an amazon account with proper 5 | # credentials. Furthermore, you definitely want a test bucket to play with. In 6 | # this set of mock tests I've called it 'test-bucket' which will certainly get 7 | # you and 'access-denied' error. Also, despite all that, 4 tests (see below) 8 | # will fail outright. 9 | # 10 | # This one has to break the rules slightly because amazon-s3 is not actually a 11 | # filesystem implementation. There's no such thing as a 'path' and so the following 12 | # tests will fail: 13 | # 14 | # 1. 
it "should be able to create a path" (path wont exist but it's ok, thats what 15 | # we expect) 16 | # 17 | # 2. it "should be able to copy paths" (it can't create paths that aren't files 18 | # and so we expect this to fail, again it's ok.) 19 | # 20 | # 3. it "should be able to move paths" (it can't create paths that aren't files 21 | # and so we expect this to fail, again it's ok.) 22 | # 23 | # 4. it "can return an array of directory entries" (ditto) 24 | # 25 | # Note: If one were to rewrite the above tests to use existing paths on s3 then the 26 | # tests will succeed. Try it. 27 | # 28 | 29 | 30 | $LOAD_PATH << 'lib' 31 | require 'swineherd/filesystem' ; include Swineherd 32 | require 'rubygems' 33 | require 'yaml' 34 | require 'rspec' 35 | 36 | options = YAML.load(File.read(File.dirname(__FILE__)+'/testcfg.yaml')) 37 | current_test = 's3' 38 | describe "A new filesystem" do 39 | 40 | before do 41 | @test_path = "#{options['s3_test_bucket']}/tmp/rspec/test_path" 42 | @test_path2 = "#{options['s3_test_bucket']}/tmp/rspec/test_path2" 43 | @test_string = "@('_')@" 44 | @fs = Swineherd::FileSystem.get(current_test, options['aws_access_key_id'], options['aws_secret_access_key']) 45 | end 46 | 47 | it "should implement exists?" do 48 | [true, false].should include(@fs.exists?(@test_path)) 49 | end 50 | 51 | it "should be able to create a path" do 52 | @fs.mkpath(@test_path) 53 | @fs.exists?(@test_path).should eql(true) 54 | end 55 | 56 | it "should be able to remove a path" do 57 | @fs.mkpath(@test_path) 58 | @fs.rm(@test_path) 59 | @fs.exists?(@test_path).should eql(false) 60 | end 61 | 62 | it "should implement size" do 63 | @fs.mkpath(File.dirname(@test_path)) 64 | fileobj = @fs.open(@test_path, 'w') 65 | fileobj.write(@test_string) 66 | fileobj.close 67 | 7.should eql(@fs.size(@test_path)) 68 | @fs.rm(@test_path) 69 | @fs.rm(File.dirname(@test_path)) 70 | end 71 | 72 | it "should be able to copy paths" do 73 | @fs.mkpath(@test_path) 74 | @fs.cp(@test_path, @test_path2) 75 | @fs.exists?(@test_path2).should eql(true) 76 | @fs.rm(@test_path) 77 | @fs.rm(@test_path2) 78 | end 79 | 80 | it "should be able to move paths" do 81 | @fs.mkpath(@test_path) 82 | @fs.mv(@test_path, @test_path2) 83 | @fs.exists?(@test_path).should eql(false) 84 | @fs.exists?(@test_path2).should eql(true) 85 | @fs.rm(@test_path2) 86 | end 87 | 88 | it "should return a sane path type" do 89 | @fs.mkpath(@test_path) 90 | ["file", "directory", "symlink", "unknown"].should include(@fs.type(@test_path)) 91 | @fs.rm(@test_path) 92 | end 93 | 94 | it "can return an array of directory entries" do 95 | sub_paths = ["a", "b", "c"] 96 | sub_paths.each do |sub_path| 97 | @fs.mkpath(File.join(@test_path, sub_path)) 98 | end 99 | @fs.entries(@test_path).class.should eql(Array) 100 | @fs.entries(@test_path).map{|path| File.basename(path)}.reject{|x| x =~ /\./}.sort.should eql(sub_paths.sort) 101 | @fs.rm(@test_path) 102 | end 103 | 104 | it "can answer to open with a writable file object" do 105 | fileobj = @fs.open(@test_path, 'w') 106 | fileobj.should respond_to :write 107 | @fs.rm(@test_path) 108 | end 109 | 110 | end 111 | 112 | describe "A new file" do 113 | before do 114 | @test_path = "#{options['s3_test_bucket']}/tmp/rspec/test_path" 115 | @test_path2 = "#{options['s3_test_bucket']}/test_path2" 116 | @test_string = "@('_')@" 117 | @fs = Swineherd::FileSystem.get(current_test, options['aws_access_key_id'], options['aws_secret_access_key']) 118 | end 119 | 120 | it "should be closeable" do 121 | @fs.open(@test_path, 
'w').close 122 | end 123 | 124 | it "should be writeable" do 125 | fileobj = @fs.open(@test_path, 'w') 126 | fileobj.write(@test_string) 127 | fileobj.close 128 | @fs.rm(@test_path) 129 | end 130 | 131 | it "should be readable" do 132 | 133 | fileobjw = @fs.open(@test_path, 'w') 134 | fileobjw.write(@test_string) 135 | fileobjw.close 136 | 137 | fileobjr = @fs.open(@test_path, 'r') 138 | fileobjr.read.should eql(@test_string) 139 | 140 | @fs.rm(@test_path) 141 | end 142 | 143 | end 144 | -------------------------------------------------------------------------------- /tests/testcfg.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | filesystem_to_test: file 3 | s3_test_bucket: infochimps-test 4 | 5 | # :) you'll probably want to change these 6 | aws_access_key_id: myaccessid 7 | aws_secret_access_key: 1234mysecretaccesskey8q7fh 8 | --------------------------------------------------------------------------------
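
A minimal, hypothetical sketch (plain Ruby stdlib, not part of the gem) of the job-stats fetching procedure outlined in notes.txt: fetch the jobtracker's history listing, pick out the files that mention the job id, and dump them into the workflow's log directory. The jobtracker host/port and the history log filename layout are assumptions taken from notes.txt; adjust them for your cluster.

require 'net/http'
require 'uri'
require 'fileutils'

# Hypothetical helper: download a hadoop job's conf xml and details page
# from the jobtracker's history listing into the workflow's logdir.
def fetch_job_logs job_id, logdir, jobtracker = "http://jobtracker:50030"
  FileUtils.mkdir_p(logdir)
  listing_url = "#{jobtracker}/logs/history/"
  # Step 2: fetch the latest logs listing from the jobtracker
  listing = Net::HTTP.get(URI(listing_url))
  # Step 3: pull out the urls we want (the ones that mention this job id)
  files = listing.scan(/href="([^"]+)"/).flatten.select{|f| f.include?(job_id)}
  # Step 4: fetch each one and dump it into the workflow's log dir
  files.each do |f|
    body = Net::HTTP.get(URI(URI.join(listing_url, f).to_s))
    File.open(File.join(logdir, File.basename(f)), 'w'){|out| out.write(body)}
  end
  files
end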